diff --git a/WORKSPACE b/WORKSPACE
index 957b8d8528dc9b5e2ea134921b28601aa6fed2d1..9f07b9fd47136d058cc4039ed6948db539485039 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -29,7 +29,7 @@ load(
 bazel_toolchains_repositories()
 
 load(
-    "@io_bazel_rules_docker//container:container.bzl",
+    "@io_bazel_rules_docker//repositories:repositories.bzl",
     container_repositories = "repositories",
 )
 
@@ -43,29 +43,17 @@ remote_config_workspace()
 # Apple and Swift rules.
 http_archive(
     name = "build_bazel_rules_apple",
-    sha256 = "4fe4ee824200b48821730f89ff260984332dc3551db587c24691235d1d96a8a7",
-    strip_prefix = "rules_apple-0.10.0",
-    urls = ["https://github.com/bazelbuild/rules_apple/archive/0.10.0.tar.gz"],
-)
-http_archive(
-    name = "build_bazel_rules_swift",
-    sha256 = "6544ff5615febec0342de1127144d2f3e43ea80fb7f9b1ade65e6a184e39e618",
-    strip_prefix = "rules_swift-0.5.0",
-    urls = ["https://github.com/bazelbuild/rules_swift/archive/0.5.0.tar.gz"],
-)
-http_archive(
-    name = "bazel_skylib",
-    sha256 = "eb5c57e4c12e68c0c20bc774bfbc60a568e800d025557bc4ea022c6479acc867",
-    strip_prefix = "bazel-skylib-0.6.0",
-    urls = ["https://github.com/bazelbuild/bazel-skylib/archive/0.6.0.tar.gz"],
+    sha256 = "73b4980a318d203d3307f850e27e66ec5cc8d223147a3475a6f11597eb6438a5",
+    strip_prefix = "rules_apple-0.13.0",
+    urls = ["https://github.com/bazelbuild/rules_apple/archive/0.13.0.tar.gz"],
 )
 http_file(
     name = "xctestrunner",
     executable = 1,
-    urls = ["https://github.com/google/xctestrunner/releases/download/0.2.5/ios_test_runner.par"],
+    urls = ["https://github.com/google/xctestrunner/releases/download/0.2.6/ios_test_runner.par"],
 )
 load("@build_bazel_rules_apple//apple:repositories.bzl", "apple_rules_dependencies")
-apple_rules_dependencies(ignore_version_differences = True)
+apple_rules_dependencies()
 load("@build_bazel_rules_swift//swift:repositories.bzl", "swift_rules_dependencies")
 swift_rules_dependencies()
 
@@ -134,4 +122,3 @@ http_archive(
         "http://download.tensorflow.org/models/speech_commands_v0.01.zip",
     ],
 )
-
diff --git a/configure.py b/configure.py
index 8dcd31822000820df12c7e96f5c57c68ed605f41..3eb09a1ae905b70dc5d02fab7c316f73c79633dd 100644
--- a/configure.py
+++ b/configure.py
@@ -55,6 +55,12 @@ NCCL_LIB_PATHS = [
     'lib64/', 'lib/powerpc64le-linux-gnu/', 'lib/x86_64-linux-gnu/', ''
 ]
 
+# List of files to be configured for using Bazel on Apple platforms.
+APPLE_BAZEL_FILES = [
+    'tensorflow/lite/experimental/objc/BUILD',
+    'tensorflow/lite/experimental/swift/BUILD'
+]
+
 if platform.machine() == 'ppc64le':
   _DEFAULT_TENSORRT_PATH_LINUX = '/usr/lib/powerpc64le-linux-gnu/'
 else:
@@ -256,6 +262,7 @@ def reset_tf_configure_bazelrc():
   """Reset file that contains customized config settings."""
   open(_TF_BAZELRC, 'w').close()
 
+
 def cleanup_makefile():
   """Delete any leftover BUILD files from the Makefile build.
 
@@ -785,8 +792,7 @@ def set_gcc_host_compiler_path(environ_cp):
       environ_cp,
       var_name='GCC_HOST_COMPILER_PATH',
       var_default=default_gcc_host_compiler_path,
-      ask_for_var=
-      'Please specify which gcc should be used by nvcc as the host compiler.',
+      ask_for_var='Please specify which gcc should be used by nvcc as the host compiler.',
       check_success=os.path.exists,
       error_msg='Invalid gcc path. %s cannot be found.',
   )
@@ -1237,6 +1243,7 @@ def set_tf_nccl_install_path(environ_cp):
   environ_cp['TF_NCCL_VERSION'] = tf_nccl_version
   write_action_env_to_bazelrc('TF_NCCL_VERSION', tf_nccl_version)
 
+
 def get_native_cuda_compute_capabilities(environ_cp):
   """Get native cuda compute capabilities.
 
@@ -1273,13 +1280,15 @@ def set_tf_cuda_compute_capabilities(environ_cp):
 
     ask_cuda_compute_capabilities = (
         'Please specify a list of comma-separated '
-        'Cuda compute capabilities you want to '
+        'CUDA compute capabilities you want to '
         'build with.\nYou can find the compute '
         'capability of your device at: '
         'https://developer.nvidia.com/cuda-gpus.\nPlease'
         ' note that each additional compute '
         'capability significantly increases your '
-        'build time and binary size. [Default is: %s]: ' %
+        'build time and binary size, and that '
+        'TensorFlow only supports compute '
+        'capabilities >= 3.5 [Default is: %s]: ' %
         default_cuda_compute_capabilities)
     tf_cuda_compute_capabilities = get_from_env_or_user_or_default(
         environ_cp, 'TF_CUDA_COMPUTE_CAPABILITIES',
@@ -1292,12 +1301,14 @@ def set_tf_cuda_compute_capabilities(environ_cp):
     for compute_capability in tf_cuda_compute_capabilities.split(','):
       m = re.match('[0-9]+.[0-9]+', compute_capability)
       if not m:
-        print('Invalid compute capability: ' % compute_capability)
+        print('Invalid compute capability: %s' % compute_capability)
         all_valid = False
       else:
-        ver = int(m.group(0).split('.')[0])
-        if ver < 3:
-          print('Only compute capabilities 3.0 or higher are supported.')
+        ver = float(m.group(0))
+        if ver < 3.5:
+          print('ERROR: TensorFlow only supports CUDA compute capabilities 3.5 '
+                'and higher. Please re-specify the list of compute '
+                'capabilities excluding version %s.' % ver)
           all_valid = False
 
     if all_valid:
@@ -1484,6 +1495,34 @@ def set_other_mpi_vars(environ_cp):
         'Cannot find the MPI library file in %s/lib or %s/lib64 or %s/lib32' %
         (mpi_home, mpi_home, mpi_home))
 
+def system_specific_test_config(env):
+  """Add default test flags required for TF tests to bazelrc."""
+  write_to_bazelrc('test --flaky_test_attempts=3')
+  write_to_bazelrc('test --test_size_filters=small,medium')
+  write_to_bazelrc(
+      'test --test_tag_filters=-benchmark-test,-no_oss,-oss_serial')
+  write_to_bazelrc('test --build_tag_filters=-benchmark-test,-no_oss')
+  if is_windows():
+    if env.get('TF_NEED_CUDA', None) == 1:
+      write_to_bazelrc(
+          'test --test_tag_filters=-no_windows,-no_windows_gpu,-no_gpu')
+      write_to_bazelrc(
+          'test --build_tag_filters=-no_windows,-no_windows_gpu,-no_gpu')
+    else:
+      write_to_bazelrc('test --test_tag_filters=-no_windows,-gpu')
+      write_to_bazelrc('test --build_tag_filters=-no_windows,-gpu')
+  elif is_macos():
+    write_to_bazelrc('test --test_tag_filters=-gpu,-nomac,-no_mac')
+    write_to_bazelrc('test --build_tag_filters=-gpu,-nomac,-no_mac')
+  elif is_linux():
+    if env.get('TF_NEED_CUDA', None) == 1:
+      write_to_bazelrc('test --test_tag_filters=-no_gpu')
+      write_to_bazelrc('test --build_tag_filters=-no_gpu')
+      write_to_bazelrc('test --test_env=LD_LIBRARY_PATH')
+    else:
+      write_to_bazelrc('test --test_tag_filters=-gpu')
+      write_to_bazelrc('test --build_tag_filters=-gpu')
+
 
 def set_system_libs_flag(environ_cp):
   syslibs = environ_cp.get('TF_SYSTEM_LIBS', '')
@@ -1513,10 +1552,6 @@ def set_windows_build_flags(environ_cp):
   # The host and target platforms are the same in Windows build. So we don't
   # have to distinct them. This avoids building the same targets twice.
   write_to_bazelrc('build --distinct_host_configuration=false')
-  # Enable short object file path to avoid long path issue on Windows.
-  # TODO(pcloudy): Remove this flag when upgrading Bazel to 0.16.0
-  # Short object file path will be enabled by default.
-  write_to_bazelrc('build --experimental_shortened_obj_file_path=true')
 
   if get_var(
       environ_cp, 'TF_OVERRIDE_EIGEN_STRONG_INLINE', 'Eigen strong inline',
@@ -1537,6 +1572,23 @@ def config_info_line(name, help_text):
   print('\t--config=%-12s\t# %s' % (name, help_text))
 
 
+def configure_apple_bazel_rules():
+  """Configures Bazel rules for building on Apple platforms.
+
+  Enables analyzing and building Apple Bazel rules on Apple platforms. This
+  function will only be executed if `is_macos()` is true.
+  """
+  if not is_macos():
+    return
+  for filepath in APPLE_BAZEL_FILES:
+    print(
+        'Configuring %s file to analyze and build Bazel rules on Apple platforms.'
+        % filepath)
+    existing_filepath = os.path.join(_TF_WORKSPACE_ROOT, filepath + '.apple')
+    renamed_filepath = os.path.join(_TF_WORKSPACE_ROOT, filepath)
+    os.rename(existing_filepath, renamed_filepath)
+
+
 def main():
   global _TF_WORKSPACE_ROOT
   global _TF_BAZELRC
@@ -1556,7 +1608,7 @@ def main():
   # environment variables.
   environ_cp = dict(os.environ)
 
-  check_bazel_version('0.19.0', '0.21.0')
+  check_bazel_version('0.19.0', '0.22.0')
 
   reset_tf_configure_bazelrc()
 
@@ -1577,6 +1629,8 @@ def main():
 
   if is_macos():
     environ_cp['TF_NEED_TENSORRT'] = '0'
+  else:
+    environ_cp['TF_CONFIGURE_APPLE_BAZEL_RULES'] = '0'
 
   # The numpy package on ppc64le uses OpenBLAS which has multi-threading
   # issues that lead to incorrect answers.  Set OMP_NUM_THREADS=1 at
@@ -1679,6 +1733,16 @@ def main():
     create_android_ndk_rule(environ_cp)
     create_android_sdk_rule(environ_cp)
 
+  system_specific_test_config(os.environ)
+
+  if get_var(
+      environ_cp, 'TF_CONFIGURE_APPLE_BAZEL_RULES',
+      'Configure Bazel rules for Apple platforms', False,
+      ('Would you like to configure Bazel rules for building on Apple platforms?'
+      ), 'Configuring Bazel rules for Apple platforms.',
+      'Not configuring Bazel rules for Apple platforms.'):
+    configure_apple_bazel_rules()
+
   print('Preconfigured Bazel build configs. You can use any of the below by '
         'adding "--config=<>" to your build command. See .bazelrc for more '
         'details.')
@@ -1687,8 +1751,9 @@ def main():
   config_info_line('gdr', 'Build with GDR support.')
   config_info_line('verbs', 'Build with libverbs support.')
   config_info_line('ngraph', 'Build with Intel nGraph support.')
-  config_info_line('dynamic_kernels',
-                   '(Experimental) Build kernels into separate shared objects.')
+  config_info_line(
+      'dynamic_kernels',
+      '(Experimental) Build kernels into separate shared objects.')
 
   print('Preconfigured Bazel build configs to DISABLE default on features:')
   config_info_line('noaws', 'Disable AWS S3 filesystem support.')
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 6bc8403d126a58c1eb6499ab7f224e12c6bc5aa4..f53982f1efc9885cc12dcc672ad819c762aca378 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -94,6 +94,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "emscripten",
+    values = {"crosstool_top": "//external:android/emscripten"},
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "raspberry_pi_armeabi",
     values = {
@@ -456,8 +462,7 @@ tf_cc_shared_object(
         "//tensorflow:darwin": [],
         "//tensorflow:windows": [],
         "//conditions:default": [
-            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "$(location //tensorflow:tf_framework_version_script.lds)",
+            "-Wl,--version-script,$(location //tensorflow:tf_framework_version_script.lds)",
         ],
     }),
     linkstatic = 1,
@@ -491,15 +496,13 @@ tf_cc_shared_object(
     name = "libtensorflow.so",
     linkopts = select({
         "//tensorflow:darwin": [
-            "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
-            "$(location //tensorflow/c:exported_symbols.lds)",
+            "-Wl,-exported_symbols_list,$(location //tensorflow/c:exported_symbols.lds)",
             "-Wl,-install_name,@rpath/libtensorflow.so",
         ],
         "//tensorflow:windows": [],
         "//conditions:default": [
             "-z defs",
-            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "$(location //tensorflow/c:version_script.lds)",
+            "-Wl,--version-script,$(location //tensorflow/c:version_script.lds)",
         ],
     }),
     visibility = ["//visibility:public"],
@@ -517,14 +520,12 @@ tf_cc_shared_object(
     name = "libtensorflow_cc.so",
     linkopts = select({
         "//tensorflow:darwin": [
-            "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
-            "$(location //tensorflow:tf_exported_symbols.lds)",
+            "-Wl,-exported_symbols_list,$(location //tensorflow:tf_exported_symbols.lds)",
         ],
         "//tensorflow:windows": [],
         "//conditions:default": [
             "-z defs",
-            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "$(location //tensorflow:tf_version_script.lds)",
+            "-Wl,--version-script,$(location //tensorflow:tf_version_script.lds)",
         ],
     }),
     visibility = ["//visibility:public"],
diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index a93799bfe84b0f9c4743e1ad0effd6e69ad7f3f2..ddcacfcbe2d4d8b089f10f1a771384dc8c4fd199 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -26,14 +26,28 @@ import sys as _sys
 
 # API IMPORTS PLACEHOLDER
 
+# Make sure directory containing top level submodules is in
+# the __path__ so that "from tensorflow.foo import bar" works.
+# We're using bitwise, but there's nothing special about that.
+_API_MODULE = bitwise  # pylint: disable=undefined-variable
+_current_module = _sys.modules[__name__]
+_tf_api_dir = _os.path.dirname(_os.path.dirname(_API_MODULE.__file__))
+if not hasattr(_current_module, '__path__'):
+  __path__ = [_tf_api_dir]
+elif _tf_api_dir not in __path__:
+  __path__.append(_tf_api_dir)
+
 # pylint: disable=g-bad-import-order
 from tensorflow.python.tools import component_api_helper as _component_api_helper
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=('tensorboard.summary._tf.summary'),
+    error_msg="Limited tf.summary API due to missing TensorBoard installation")
 _component_api_helper.package_hook(
     parent_package_str=__name__,
     child_package_str=(
         'tensorflow_estimator.python.estimator.api._v2.estimator'))
 
-_current_module = _sys.modules[__name__]
 if not hasattr(_current_module, 'estimator'):
   _component_api_helper.package_hook(
       parent_package_str=__name__,
@@ -42,14 +56,6 @@ if not hasattr(_current_module, 'estimator'):
 _component_api_helper.package_hook(
     parent_package_str=__name__,
     child_package_str=('tensorflow.python.keras.api._v2.keras'))
-# Make sure directory containing top level submodules is in
-# the __path__ so that "from tensorflow.foo import bar" works.
-# We're using bitwise, but there's nothing special about that.
-_tf_api_dir = _os.path.dirname(_os.path.dirname(bitwise.__file__))  # pylint: disable=undefined-variable
-if not hasattr(_current_module, '__path__'):
-  __path__ = [_tf_api_dir]
-elif _tf_api_dir not in __path__:
-  __path__.append(_tf_api_dir)
 
 # Enable TF2 behaviors
 from tensorflow.python.compat import v2_compat as _compat  # pylint: disable=g-import-not-at-top
@@ -111,5 +117,10 @@ try:
 except NameError:
   pass
 
+# Add module aliases
+if hasattr(_current_module, 'keras'):
+  losses = keras.losses
+  metrics = keras.metrics
+  optimizers = keras.optimizers
 
 # pylint: enable=undefined-variable
diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py
index eeca8f0d566a6401cb64e4fe3f0ee3c5aeb4ece2..5eb25a81b7f765f551bc4f1b7ba99b35dbc6b7bb 100644
--- a/tensorflow/api_template_v1.__init__.py
+++ b/tensorflow/api_template_v1.__init__.py
@@ -70,7 +70,7 @@ _API_MODULE = app  # pylint: disable=undefined-variable
 
 # Make sure directory containing top level submodules is in
 # the __path__ so that "from tensorflow.foo import bar" works.
-_tf_api_dir = _os.path.dirname(_os.path.dirname(_API_MODULE.__file__))  # pylint: disable=undefined-variable
+_tf_api_dir = _os.path.dirname(_os.path.dirname(_API_MODULE.__file__))
 if not hasattr(_current_module, '__path__'):
   __path__ = [_tf_api_dir]
 elif _tf_api_dir not in __path__:
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index ef52a28460062b57317b4027ab83479e5e075b5f..ef7863dc0d5cbd57da30baa6e04278c2a0354b25 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -67,6 +67,23 @@ tf_cuda_library(
 
 tf_cuda_library(
     name = "c_api",
+    hdrs = ["c_api.h"],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":c_api_no_xla",
+        ":c_api_internal",
+    ] + select({
+        "//tensorflow:with_xla_support": [
+            "//tensorflow/compiler/tf2xla:xla_compiler",
+            "//tensorflow/compiler/jit",
+        ],
+        "//conditions:default": [],
+    }),
+)
+
+tf_cuda_library(
+    name = "c_api_no_xla",
     srcs = [
         "c_api.cc",
         "c_api_function.cc",
@@ -75,14 +92,12 @@ tf_cuda_library(
         "c_api.h",
     ],
     copts = tf_copts(),
-    visibility = ["//visibility:public"],
-    deps = select({
+    visibility = ["//tensorflow/c:__subpackages__"],
+    deps = [":c_api_internal"] + select({
         "//tensorflow:android": [
-            ":c_api_internal",
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
-            ":c_api_internal",
             "//tensorflow/cc/saved_model:loader_lite",
             "//tensorflow/cc:gradients",
             "//tensorflow/cc:ops",
@@ -97,13 +112,8 @@ tf_cuda_library(
             "//tensorflow/core:lib",
             "//tensorflow/core:lib_internal",
             "//tensorflow/core/distributed_runtime:server_lib",
+            "//tensorflow/core/kernels:logging_ops",
         ],
-    }) + select({
-        "//tensorflow:with_xla_support": [
-            "//tensorflow/compiler/tf2xla:xla_compiler",
-            "//tensorflow/compiler/jit",
-        ],
-        "//conditions:default": [],
     }),
 )
 
@@ -156,8 +166,8 @@ tf_cuda_library(
     hdrs = ["tf_status_helper.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":c_api",
         ":c_api_internal",
+        ":c_api_no_xla",
         "//tensorflow/core:lib",
     ],
 )
@@ -213,13 +223,13 @@ tf_cuda_library(
     visibility = ["//visibility:public"],
     deps = select({
         "//tensorflow:android": [
-            ":c_api",
+            ":c_api_no_xla",
             ":c_api_internal",
             ":tf_status_helper",
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
-            ":c_api",
+            ":c_api_no_xla",
             ":c_api_internal",
             ":tf_status_helper",
             "//tensorflow/core:framework",
@@ -346,6 +356,7 @@ tf_cc_test(
     srcs = ["c_api_function_test.cc"],
     deps = [
         ":c_api",
+        ":c_api_internal",
         ":c_test_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 94d9f4a6fa2f14cb3343bdd51b7e4d61944444d0..245d7ba2b186895532953aa61ebfc3fc6bf635a7 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/while_loop.h"
 #include "tensorflow/cc/saved_model/loader.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/kernels/logging_ops.h"
 #endif
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -640,7 +641,7 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
                       dimvec.size(), base, size, DeleteArray, base);
 }
 
-Status MessageToBuffer(const tensorflow::protobuf::Message& in,
+Status MessageToBuffer(const tensorflow::protobuf::MessageLite& in,
                        TF_Buffer* out) {
   if (out->data != nullptr) {
     return InvalidArgument("Passing non-empty TF_Buffer is invalid.");
@@ -1310,6 +1311,13 @@ void TF_SetAttrTypeList(TF_OperationDescription* desc, const char* attr_name,
                      reinterpret_cast<const DataType*>(values), num_values));
 }
 
+void TF_SetAttrPlaceholder(TF_OperationDescription* desc, const char* attr_name,
+                           const char* placeholder) {
+  tensorflow::AttrValue attr_value;
+  attr_value.set_placeholder(placeholder);
+  desc->node_builder.Attr(attr_name, attr_value);
+}
+
 void TF_SetAttrFuncName(TF_OperationDescription* desc, const char* attr_name,
                         const char* value, size_t length) {
   tensorflow::NameAttrList func_name;
@@ -2954,4 +2962,11 @@ void TF_DeleteServer(TF_Server* server) {
   delete server;
 #endif
 }
+
+void TF_RegisterLogListener(void (*listener)(const char*)) {
+#ifndef __ANDROID__
+  tensorflow::logging::RegisterListener(listener);
+#endif
+}
+
 }  // end extern "C"
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index 8031928dac4de2391f0aec46e69d61a137606e4d..051de3a7dc0f8c630b6c81d2cfa960e5279c93c0 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -549,6 +549,10 @@ TF_CAPI_EXPORT extern void TF_SetAttrTypeList(TF_OperationDescription* desc,
                                               const char* attr_name,
                                               const TF_DataType* values,
                                               int num_values);
+TF_CAPI_EXPORT extern void TF_SetAttrPlaceholder(TF_OperationDescription* desc,
+                                                 const char* attr_name,
+                                                 const char* placeholder);
+
 // Set a 'func' attribute to the specified name.
 // `value` must point to a string of length `length` bytes.
 TF_CAPI_EXPORT extern void TF_SetAttrFuncName(TF_OperationDescription* desc,
@@ -1310,6 +1314,28 @@ TF_CAPI_EXPORT extern TF_Function* TF_GraphToFunction(
     int noutputs, const TF_Output* outputs, const char* const* output_names,
     const TF_FunctionOptions* opts, const char* description, TF_Status* status);
 
+// Similar to TF_GraphToFunction but allows specifying control outputs of the
+// function.
+//
+//  The arguments of TF_GraphToFunction have the same meaning, but the new
+//  arguments are as follows:
+//
+//    ncontrol_outputs: Number of control outputs of the function.
+//    control_outputs: vector of TF_Operation objects to be marked as control
+//      outputs of the function. Operations marked as control outputs are
+//      guaranteed to execute.
+//    control_output_names: Optional. If not nullptr, vector of strings, one
+//      per control output, with their names to be added to the function's
+//      OpDef.
+TF_CAPI_EXPORT extern TF_Function* TF_GraphToFunctionWithControlOutputs(
+    const TF_Graph* fn_body, const char* fn_name,
+    unsigned char append_hash_to_fn_name, int num_opers,
+    const TF_Operation* const* opers, int ninputs, const TF_Output* inputs,
+    int noutputs, const TF_Output* outputs, const char* const* output_names,
+    int ncontrol_outputs, const TF_Operation* const* control_outputs,
+    const char* const* control_output_names, const TF_FunctionOptions* opts,
+    const char* description, TF_Status* status);
+
 // Returns the name of the graph function.
 // The return value points to memory that is only usable until the next
 // mutation to *func.
@@ -1743,6 +1769,14 @@ TF_CAPI_EXPORT extern const char* TF_ServerTarget(TF_Server* server);
 // it will be stopped and joined.
 TF_CAPI_EXPORT extern void TF_DeleteServer(TF_Server* server);
 
+// Register a listener method that processes printed messages.
+//
+// If any listeners are registered, the print operator will call all listeners
+// with the printed messages and immediately return without writing to the
+// logs.
+TF_CAPI_EXPORT extern void TF_RegisterLogListener(
+    void (*listener)(const char*));
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index a8325ce494c4f57fcd7e64b2d233ee4e6666bc4e..7ff4084decc686b067226ecaecf2af29d51d42f2 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -9064,11 +9064,6 @@ TF_Operation* TFE_AddEagerOpToGraph(TFE_Op* op, TFE_TraceContext* trace_ctx,
       tensorflow::strings::StrCat(op_type, "_", trace_ctx->node_counter++);
   auto* desc =
       TF_NewOperation(trace_ctx->graph, op_type.c_str(), op_name.c_str());
-  for (auto* input : op->operation.Inputs()) {
-    auto symbolic_input = getOrCreateSymbolicTensor(trace_ctx, input, status);
-    if (!status->status.ok()) return nullptr;
-    TF_AddInput(desc, symbolic_input);
-  }
 
   VLOG(1) << "Adding attrs.";
   tensorflow::AttrValueMap attrs;
@@ -9077,6 +9072,34 @@ TF_Operation* TFE_AddEagerOpToGraph(TFE_Op* op, TFE_TraceContext* trace_ctx,
     desc->node_builder.Attr(attr.first, attr.second);
   }
 
+  VLOG(1) << "Adding inputs.";
+  const auto& inputs = op->operation.Inputs();
+  size_t inputIndex = 0;
+  const tensorflow::OpDef& op_def = desc->node_builder.op_def();
+  for (const tensorflow::OpDef::ArgDef& input_arg : op_def.input_arg()) {
+    // TODO(bgogul): Add support for number attributes.
+    DCHECK(input_arg.number_attr().empty())
+        << "Number attributes is not implemented yet.";
+    if (input_arg.type_list_attr().empty()) {
+      auto symbolic_input =
+          getOrCreateSymbolicTensor(trace_ctx, inputs[inputIndex++], status);
+      if (!status->status.ok()) return nullptr;
+      TF_AddInput(desc, symbolic_input);
+      continue;
+    }
+    const std::string& type_list_attr = input_arg.type_list_attr();
+    const auto& attr_value = attrs[type_list_attr];
+    DCHECK(attr_value.value_case() == tensorflow::AttrValue::kList)
+        << "Type list attribute should be a list!";
+    std::vector<TF_Output> list_inputs(attr_value.list().type_size());
+    for (TF_Output& list_input : list_inputs) {
+      list_input =
+          getOrCreateSymbolicTensor(trace_ctx, inputs[inputIndex++], status);
+      if (!status->status.ok()) return nullptr;
+    }
+    TF_AddInputList(desc, list_inputs.data(), list_inputs.size());
+  }
+
   auto* graph_op = TF_FinishOperation(desc, status);
   if (!status->status.ok()) return nullptr;
 
diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc
index 354ee5f49f373edbc10e7706aa8776f3cc2a17cd..c54021a7517ebbdd00405cbfa9cee8f3f6616cca 100644
--- a/tensorflow/c/c_api_experimental_test.cc
+++ b/tensorflow/c/c_api_experimental_test.cc
@@ -446,5 +446,29 @@ TEST_F(AddEagerOpToGraphTest, ListAttributesArePreserved) {
   TFE_DeleteOp(squeeze);
 }
 
+TEST_F(AddEagerOpToGraphTest, ListInputsAreAddedCorrectly) {
+  TFE_TensorHandle* scalar = TestScalarTensorHandle();
+  TFE_Op* identityn = TFE_NewOp(eager_ctx_, "IdentityN", status_);
+  CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+  constexpr size_t kNumInputs = 3;
+  for (size_t i = 0; i < kNumInputs; ++i) {
+    TFE_OpAddInput(identityn, scalar, status_);
+  }
+  TF_DataType types[kNumInputs] = {TF_FLOAT, TF_FLOAT, TF_FLOAT};
+  TFE_OpSetAttrTypeList(identityn, "T", types, kNumInputs);
+  AddEagerOpToGraphAndCheck(
+      identityn, [this, kNumInputs](TF_Operation* graph_op) {
+        EXPECT_EQ(TF_OperationNumInputs(graph_op), kNumInputs);
+        EXPECT_EQ(TF_OperationInputListLength(graph_op, "input", status_),
+                  kNumInputs);
+        CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+        EXPECT_EQ(TF_OperationOutputListLength(graph_op, "output", status_),
+                  kNumInputs);
+        CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+      });
+  TFE_DeleteTensorHandle(scalar);
+  TFE_DeleteOp(identityn);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc
index 28b9f8df9c873ee394eb6a241dd9ac06ba6c8796..03d65ecefd4a9ba5a23a94ed902dfba6dd4fbda9 100644
--- a/tensorflow/c/c_api_function.cc
+++ b/tensorflow/c/c_api_function.cc
@@ -162,6 +162,11 @@ Status FillFunctionBody(
     const std::vector<const Node*>& body_nodes,
     const std::unordered_map<string, string>& tensor_renaming,
     FunctionDef* fdef) {
+  std::unordered_set<string> func_attr_names;
+  for (const auto& func_attr : fdef->signature().attr()) {
+    func_attr_names.insert(func_attr.name());
+  }
+
   std::vector<const Edge*> in_edges;
   std::vector<const Edge*> control_edges;
   for (const Node* node : body_nodes) {
@@ -243,6 +248,48 @@ Status FillFunctionBody(
     if (node->op_def().is_stateful()) {
       fdef->mutable_signature()->set_is_stateful(true);
     }
+
+    // If this node has any attributes with placeholder value, add the
+    // attribute to FunctionDef signature.
+    for (const auto& iter : node->attrs()) {
+      if (iter.second.placeholder().empty()) {
+        continue;
+      }
+
+      // If we already added the attribute, skip it.
+      string func_attr_name = iter.second.placeholder();
+      if (func_attr_names.find(func_attr_name) != func_attr_names.end()) {
+        continue;
+      }
+
+      // This node's attribute is a placeholder value, so it does not have type
+      // information. We check node's OpDef for attribute type.
+      string node_attr_name = iter.first;
+      const OpDef::AttrDef* node_attr_def = nullptr;
+      for (const auto& node_attr : node->op_def().attr()) {
+        if (node_attr.name() == node_attr_name) {
+          node_attr_def = &node_attr;
+        }
+      }
+      if (!node_attr_def) {
+#ifdef TENSORFLOW_LITE_PROTOS
+        return errors::Unimplemented(
+            "Placeholder value is not supported for attributes not in OpDef. "
+            "Attribute: ",
+            node_attr_name);
+#else
+        return errors::Unimplemented(
+            "Placeholder value is not supported for attributes not in OpDef. "
+            "Attribute: ",
+            node_attr_name, ", OpDef: ", node->op_def().DebugString());
+#endif
+      }
+      OpDef::AttrDef* attr_def = fdef->mutable_signature()->add_attr();
+      attr_def->set_name(func_attr_name);
+      attr_def->set_type(node_attr_def->type());
+
+      func_attr_names.insert(func_attr_name);
+    }
   }
   return Status::OK();
 }
@@ -255,6 +302,8 @@ Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
                           const std::vector<OutputTensor>& inputs,
                           const std::vector<OutputTensor>& outputs,
                           const std::vector<string>& output_names,
+                          const std::vector<const Node*>& control_outputs,
+                          const std::vector<string>& control_output_names,
                           const char* description, FunctionDef* fdef) {
   if (!output_names.empty()) {
     DCHECK_EQ(output_names.size(), outputs.size());
@@ -378,6 +427,29 @@ Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
     fdef->mutable_signature()->set_name(fn_name);
   }
 
+  if (!control_output_names.empty() &&
+      (control_outputs.size() != control_output_names.size())) {
+    return InvalidArgument(
+        "Expected number of control outputs (", control_outputs.size(),
+        ") and the number of control output names (",
+        control_output_names.size(), ") to match but they do not.");
+  }
+  std::unordered_set<string> control_output_names_set;
+  for (int i = 0; i < control_outputs.size(); ++i) {
+    string signature_name;
+    if (!control_output_names.empty()) {
+      signature_name = control_output_names[i];
+    } else {
+      signature_name = control_outputs[i]->name();
+    }
+    if (!control_output_names_set.insert(signature_name).second) {
+      return errors::InvalidArgument("Repeated control output name: ",
+                                     signature_name);
+    }
+    fdef->mutable_signature()->add_control_output(signature_name);
+    (*fdef->mutable_control_ret())[signature_name] = control_outputs[i]->name();
+  }
+
   return Status::OK();
 }
 
@@ -485,14 +557,14 @@ Status ComputeBodyNodes(
 using tensorflow::Node;
 using tensorflow::string;
 
-TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name,
-                                unsigned char append_hash_to_fn_name,
-                                int num_opers, const TF_Operation* const* opers,
-                                int ninputs, const TF_Output* inputs,
-                                int noutputs, const TF_Output* outputs,
-                                const char* const* output_names,
-                                const TF_FunctionOptions* opts,
-                                const char* description, TF_Status* status) {
+TF_Function* TF_GraphToFunctionWithControlOutputs(
+    const TF_Graph* fn_body, const char* fn_name,
+    unsigned char append_hash_to_fn_name, int num_opers,
+    const TF_Operation* const* opers, int ninputs, const TF_Output* inputs,
+    int noutputs, const TF_Output* outputs, const char* const* output_names,
+    int ncontrol_outputs, const TF_Operation* const* control_outputs,
+    const char* const* control_output_names, const TF_FunctionOptions* opts,
+    const char* description, TF_Status* status) {
   tensorflow::mutex_lock l(*const_cast<tensorflow::mutex*>(&fn_body->mu));
 
   // Process inputs.
@@ -517,19 +589,34 @@ TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name,
     }
   }
 
+  // Process control output names.
+  std::vector<string> control_output_names_vec;
+  if (control_output_names) {
+    control_output_names_vec.reserve(ncontrol_outputs);
+    for (int i = 0; i < ncontrol_outputs; ++i) {
+      control_output_names_vec.push_back(string(output_names[i]));
+    }
+  }
+
   // Compute body nodes.
   std::vector<const Node*> body_nodes;
   status->status = tensorflow::ComputeBodyNodes(
       fn_body, fn_name, num_opers, opers, input_nodes, &body_nodes);
   if (!status->status.ok()) return nullptr;
 
+  // Compute body nodes.
+  std::vector<const Node*> control_output_nodes;
+  for (int i = 0; i < ncontrol_outputs; ++i) {
+    control_output_nodes.push_back(&control_outputs[i]->node);
+  }
+
   // Do the actual function creation.
   TF_Function* tf_function = new TF_Function();
   DCHECK(append_hash_to_fn_name <= 1);
   status->status = tensorflow::GraphToFunctionDef(
       fn_body->graph, fn_name, append_hash_to_fn_name != 0, body_nodes,
-      input_tensors, output_tensors, output_names_vec, description,
-      &tf_function->fdef);
+      input_tensors, output_tensors, output_names_vec, control_output_nodes,
+      control_output_names_vec, description, &tf_function->fdef);
   if (!status->status.ok()) {
     TF_DeleteFunction(tf_function);
     return nullptr;
@@ -537,6 +624,20 @@ TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name,
   return tf_function;
 }
 
+TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name,
+                                unsigned char append_hash_to_fn_name,
+                                int num_opers, const TF_Operation* const* opers,
+                                int ninputs, const TF_Output* inputs,
+                                int noutputs, const TF_Output* outputs,
+                                const char* const* output_names,
+                                const TF_FunctionOptions* opts,
+                                const char* description, TF_Status* status) {
+  return TF_GraphToFunctionWithControlOutputs(
+      fn_body, fn_name, append_hash_to_fn_name, num_opers, opers, ninputs,
+      inputs, noutputs, outputs, output_names, 0, nullptr, nullptr, opts,
+      description, status);
+}
+
 const char* TF_FunctionName(TF_Function* func) {
   return func->fdef.signature().name().c_str();
 }
diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc
index 73fe73769bc1219ce865149d67d333c53371ccc5..946f8c4a2c3fb25f908d809e00bf579b40a8668b 100644
--- a/tensorflow/c/c_api_function_test.cc
+++ b/tensorflow/c/c_api_function_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/c/c_api.h"
 
+#include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/c_test_util.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
@@ -1230,6 +1231,53 @@ void DefineFunction(const char* name, TF_Function** func,
   ASSERT_NE(*func, nullptr);
 }
 
+REGISTER_OP("CustomOp")
+    .Output("output: float32")
+    .Attr("index: int")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape);
+
+void NodeWithPlaceholderAttrHelper(TF_Graph* graph, TF_Status* s,
+                                   const char* name, const char* placeholder,
+                                   TF_Operation** op) {
+  TF_OperationDescription* desc = TF_NewOperation(graph, "CustomOp", name);
+  TF_SetAttrPlaceholder(desc, "index", placeholder);
+  *op = TF_FinishOperation(desc, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  ASSERT_NE(*op, nullptr);
+}
+
+TEST_F(CApiFunctionTest, GraphToFunctionDefWithPlaceholderAttr) {
+  std::unique_ptr<TF_Graph, decltype(&TF_DeleteGraph)> func_graph(
+      TF_NewGraph(), TF_DeleteGraph);
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> s(TF_NewStatus(),
+                                                           TF_DeleteStatus);
+
+  TF_Operation *node1, *node2, *node3;
+  NodeWithPlaceholderAttrHelper(func_graph.get(), s.get(), "node1", "v1",
+                                &node1);
+  NodeWithPlaceholderAttrHelper(func_graph.get(), s.get(), "node2", "v1",
+                                &node2);
+  NodeWithPlaceholderAttrHelper(func_graph.get(), s.get(), "node3", "v2",
+                                &node3);
+
+  TF_Output inputs[] = {};
+  TF_Output outputs[] = {{node1, 0}, {node2, 0}, {node3, 0}};
+  func_ = TF_GraphToFunction(
+      func_graph.get(), "func", /*append_hash_to_fn_name=*/false, -1,
+      /*opers=*/nullptr, 0, inputs, 3, outputs,
+      /*output_names=*/nullptr,
+      /*opts=*/nullptr, /*description=*/nullptr, s.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(s.get())) << TF_Message(s.get());
+  ASSERT_NE(func_, nullptr);
+
+  // Verify that FunctionDef has 2 attributes, "v1" and "v2".
+  ASSERT_EQ(func_->fdef.signature().attr().size(), 2);
+  EXPECT_EQ(func_->fdef.signature().attr(0).name(), "v1");
+  EXPECT_EQ(func_->fdef.signature().attr(0).type(), "int");
+  EXPECT_EQ(func_->fdef.signature().attr(1).name(), "v2");
+  EXPECT_EQ(func_->fdef.signature().attr(1).type(), "int");
+}
+
 TEST_F(CApiFunctionTest, SetGradientAndRun) {
   // Define the function and its grad
   DefineFunction(func_name_, &func_);
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index 73283d775639b297857b2a50007dc7c28b1f39a3..d520b6b76849e562def6abd8be0510d3b4797e8c 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -204,7 +204,8 @@ Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst);
 
 TF_Tensor* TF_TensorFromTensor(const Tensor& src, TF_Status* status);
 
-Status MessageToBuffer(const tensorflow::protobuf::Message& in, TF_Buffer* out);
+Status MessageToBuffer(const tensorflow::protobuf::MessageLite& in,
+                       TF_Buffer* out);
 
 // Set the shapes and types of the output's handle.
 //
diff --git a/tensorflow/c/c_test.c b/tensorflow/c/c_test.c
index b86d8eb8e300e02a3871ecd5f424a82c521b18fc..7468122cd567270c8454f886e478be34c2c15cbf 100644
--- a/tensorflow/c/c_test.c
+++ b/tensorflow/c/c_test.c
@@ -25,6 +25,16 @@ limitations under the License.
 #include "tensorflow/c/env.h"
 #include "tensorflow/c/kernels.h"
 
+// A create function. This will never actually get called in this test, it's
+// just nice to know that it compiles.
+void* create(TF_OpKernelConstruction* ctx) {
+  TF_DataType type;
+  TF_Status* s = TF_NewStatus();
+  TF_OpKernelConstruction_GetAttrType(ctx, "foobar", &type, s);
+  TF_DeleteStatus(s);
+  return NULL;
+}
+
 // A compute function. This will never actually get called in this test, it's
 // just nice to know that it compiles.
 void compute(void* kernel, TF_OpKernelContext* ctx) {
@@ -32,12 +42,7 @@ void compute(void* kernel, TF_OpKernelContext* ctx) {
   TF_Status* s = TF_NewStatus();
   TF_GetInput(ctx, 0, &input, s);
   TF_DeleteTensor(input);
-
-  TF_DataType type;
-  TF_OpKernelContext_GetAttrType(ctx, "foobar", &type, s);
-
   TF_DeleteStatus(s);
-
 }
 
 // Exercises tensorflow's C API.
@@ -80,7 +85,7 @@ int main(int argc, char** argv) {
   TF_StringStreamDone(s);
 
   TF_KernelBuilder* b =
-      TF_NewKernelBuilder("SomeOp", "SomeDevice", NULL, &compute, NULL);
+      TF_NewKernelBuilder("SomeOp", "SomeDevice", &create, &compute, NULL);
   TF_RegisterKernelBuilder("someKernel", b, status);
 
   TF_DeleteStatus(status);
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 257be6379c09841d1427813a0aa25b10a205016d..282f0da302fac89c6fae9f8b5aa4b3c33ab93532 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -211,6 +211,7 @@ tf_cuda_library(
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/distributed_runtime:worker_env",
         "//tensorflow/core/profiler/rpc:profiler_server",
+        "//tensorflow/core/profiler/rpc/client:capture_profile",
         "//tensorflow/core:gpu_runtime",
     ],
 )
@@ -230,7 +231,6 @@ tf_cuda_cc_test(
         ":c_api_test_util",
         "//tensorflow/c:c_test_util",
         "//tensorflow/cc/profiler",
-        "//tensorflow/contrib/tpu/profiler:trace_events_proto_cc",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index af13f487af91594fedd4d5f77592682a6f98c34f..45701c7fcf02d5e6ec464ae10d4d20f20ba1d9f0 100755
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -762,11 +762,13 @@ unsigned char TFE_ContextHasFunction(TFE_Context* ctx, const char* name) {
 }
 
 void TFE_ContextEnableRunMetadata(TFE_Context* ctx) {
-  ctx->context.SetShouldStoreMetadata(true);
+  ctx->context.SetShouldStoreGraphs(true);
+  ctx->context.SetShouldStoreStepStats(true);
 }
 
 void TFE_ContextDisableRunMetadata(TFE_Context* ctx) {
-  ctx->context.SetShouldStoreMetadata(false);
+  ctx->context.SetShouldStoreGraphs(false);
+  ctx->context.SetShouldStoreStepStats(false);
 }
 
 }  // extern "C"
diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc
index dab17505643e791e6294a64247898ae23769a055..ff798593b5f2f77339b668668ff6dafb9f44a2b3 100644
--- a/tensorflow/c/eager/c_api_experimental.cc
+++ b/tensorflow/c/eager/c_api_experimental.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/core/profiler/rpc/client/capture_profile.h"
 #include "tensorflow/core/profiler/rpc/profiler_server.h"
 
 using tensorflow::string;
@@ -25,10 +26,14 @@ void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
   op->operation.ConsumeInput(h->handle);
 }
 
-TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx) {
+TFE_Profiler* TFE_NewProfiler(TFE_ProfilerContext* ctx) {
   return new TFE_Profiler(ctx);
 }
 
+bool TFE_ProfilerIsOk(TFE_Profiler* profiler) {
+  return profiler->profiler->Status().ok();
+}
+
 void TFE_DeleteProfiler(TFE_Profiler* profiler) { delete profiler; }
 
 void TFE_ProfilerSerializeToString(TFE_Context* ctx, TFE_Profiler* profiler,
@@ -46,7 +51,43 @@ void TFE_ProfilerSerializeToString(TFE_Context* ctx, TFE_Profiler* profiler,
   };
 }
 
-void TFE_StartProfilerServer(TFE_Context* ctx, int port) {
-  auto server_thread = tensorflow::StartProfilerServer(&ctx->context, port);
-  ctx->context.AddChildThread(std::move(server_thread));
+TFE_ProfilerContext* TFE_NewProfilerContext() {
+  return new TFE_ProfilerContext;
+}
+
+void TFE_ProfilerContextSetEagerContext(TFE_ProfilerContext* profiler_context,
+                                        TFE_Context* eager_context) {
+  profiler_context->profiler_context.eager_context = &eager_context->context;
+}
+
+void TFE_DeleteProfilerContext(TFE_ProfilerContext* profiler_context) {
+  delete profiler_context;
+}
+
+void TFE_StartProfilerServer(TFE_ProfilerContext* context, int port) {
+  // Release child thread intentionally. The child thread can be terminate by
+  // terminating the main thread.
+  tensorflow::StartProfilerServer(&context->profiler_context, port).release();
+}
+
+void TFE_ContextEnableGraphCollection(TFE_Context* ctx) {
+  ctx->context.SetShouldStoreGraphs(true);
+}
+
+void TFE_ContextDisableGraphCollection(TFE_Context* ctx) {
+  ctx->context.SetShouldStoreGraphs(false);
+}
+
+bool TFE_ProfilerClientStartTracing(char* service_addr, char* logdir,
+                                    char* worker_list, bool include_dataset_ops,
+                                    int duration_ms, int num_tracing_attempts) {
+  tensorflow::Status s =
+      tensorflow::profiler::client::ValidateHostPortPair(service_addr);
+  if (!s.ok()) {
+    return false;
+  }
+  s = tensorflow::profiler::client::StartTracing(
+      service_addr, logdir, worker_list, include_dataset_ops, duration_ms,
+      num_tracing_attempts);
+  return s.ok();
 }
diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h
index 8c85d0e51695fde09cf0e2bb3930f9173e6cfb54..89523793d37b89ee49c4db844a85f019381ff730 100644
--- a/tensorflow/c/eager/c_api_experimental.h
+++ b/tensorflow/c/eager/c_api_experimental.h
@@ -25,6 +25,8 @@ extern "C" {
 TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h,
                                               TF_Status* status);
 
+typedef struct TFE_ProfilerContext TFE_ProfilerContext;
+
 // A profiler which will start profiling when creating the object and will stop
 // when the object is destroyed. It will profile all operations run under the
 // given TFE_Context. Multiple instance of it can be created, but at most one
@@ -32,7 +34,8 @@ TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h,
 // Thread-safety: TFE_Profiler is thread-safe.
 typedef struct TFE_Profiler TFE_Profiler;
 
-TF_CAPI_EXPORT extern TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx);
+TF_CAPI_EXPORT extern TFE_Profiler* TFE_NewProfiler(TFE_ProfilerContext* ctx);
+TF_CAPI_EXPORT extern bool TFE_ProfilerIsOk(TFE_Profiler* profiler);
 TF_CAPI_EXPORT extern void TFE_DeleteProfiler(TFE_Profiler* profiler);
 
 // The output string is a binary string of tensorflow.tpu.Trace. User can write
@@ -42,14 +45,47 @@ TF_CAPI_EXPORT extern void TFE_ProfilerSerializeToString(TFE_Context* ctx,
                                                          TF_Buffer* buf,
                                                          TF_Status* status);
 
+// Return a new profiler context object.
+TF_CAPI_EXPORT extern TFE_ProfilerContext* TFE_NewProfilerContext(void);
+
+// Set the eager context in TFE_ProfilerServerOptions
+TF_CAPI_EXPORT extern void TFE_ProfilerContextSetEagerContext(
+    TFE_ProfilerContext* profiler_context, TFE_Context* eager_context);
+
+// Destroy a profiler context object.
+TF_CAPI_EXPORT extern void TFE_DeleteProfilerContext(
+    TFE_ProfilerContext* profiler_context);
+
 // Start a profiler grpc server which listens to specified port. It will start
-// the server on its own thread. It can be shutdown by destructing TFE_Context.
-// Creating multiple profiler server is allowed. The service defined in
+// the server on its own thread. It can be shutdown by terminating tensorflow.
+// It can be used in both Eager mode and graph mode. Creating multiple profiler
+// server is allowed. The service defined in
 // tensorflow/contrib/tpu/profiler/tpu_profiler.proto. Please use
 // tensorflow/contrib/tpu/profiler/capture_tpu_profile to capture tracable
 // file following
 // https://cloud.google.com/tpu/docs/cloud-tpu-tools#capture_trace.
-TF_CAPI_EXPORT extern void TFE_StartProfilerServer(TFE_Context* ctx, int port);
+TF_CAPI_EXPORT extern void TFE_StartProfilerServer(TFE_ProfilerContext* context,
+                                                   int port);
+
+// Enables only graph collection in RunMetadata on the functions executed from
+// this context.
+TF_CAPI_EXPORT extern void TFE_ContextEnableGraphCollection(TFE_Context* ctx);
+
+// Disables only graph collection in RunMetadata on the functions executed from
+// this context.
+TF_CAPI_EXPORT extern void TFE_ContextDisableGraphCollection(TFE_Context* ctx);
+
+// Send a grpc request to profiler server (service_addr) to perform on-demand
+// profiling and save the result into logdir which can be visualized by
+// TensorBoard. worker_list is the list of worker TPUs separated by ','. Set
+// include_dataset_opts to false to profile longer traces. It will block the
+// caller thread until receives tracing result.
+// This API is designed for TensorBoard, for end user, please use
+// tensorflow/contrib/tpu/profiler/capture_tpu_profile instead following
+// https://cloud.google.com/tpu/docs/cloud-tpu-tools#capture_trace.
+TF_CAPI_EXPORT extern bool TFE_ProfilerClientStartTracing(
+    char* service_addr, char* logdir, char* worker_list,
+    bool include_dataset_ops, int duration_ms, int num_tracing_attempts);
 
 #ifdef __cplusplus
 } /* end extern "C" */
diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc
index af55fee66e8708e39626da3b10b6dd2f73af92bb..d85048caa7c7f727271352883cb834a2575bd251 100644
--- a/tensorflow/c/eager/c_api_experimental_test.cc
+++ b/tensorflow/c/eager/c_api_experimental_test.cc
@@ -18,12 +18,12 @@ limitations under the License.
 #include <string.h>
 #include "tensorflow/c/eager/c_api_test_util.h"
 #include "tensorflow/cc/profiler/profiler.h"
-#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 
 using tensorflow::string;
 
@@ -41,9 +41,12 @@ void ExecuteWithProfiling(bool async) {
   TFE_ContextOptions* opts = TFE_NewContextOptions();
   TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
   TFE_Context* ctx = TFE_NewContext(opts, status);
-  TFE_Profiler* profiler = TFE_NewProfiler(ctx);
+  TFE_ProfilerContext* profiler_context = TFE_NewProfilerContext();
+  TFE_ProfilerContextSetEagerContext(profiler_context, ctx);
+  TFE_Profiler* profiler = TFE_NewProfiler(profiler_context);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteContextOptions(opts);
+  TFE_DeleteProfilerContext(profiler_context);
 
   TFE_TensorHandle* m = TestMatrixTensorHandle();
   TFE_Op* matmul = MatMulOp(ctx, m, m);
@@ -70,7 +73,7 @@ void ExecuteWithProfiling(bool async) {
   TFE_ProfilerSerializeToString(ctx, profiler, profiler_result, status);
   TFE_DeleteProfiler(profiler);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
-  tensorflow::tpu::Trace profile_proto;
+  profiler::Trace profile_proto;
   EXPECT_TRUE(profile_proto.ParseFromString(
       {reinterpret_cast<const char*>(profiler_result->data),
        profiler_result->length}));
@@ -100,5 +103,27 @@ void ExecuteWithProfiling(bool async) {
 TEST(CAPI, ExecuteWithTracing) { ExecuteWithProfiling(false); }
 TEST(CAPI, ExecuteWithTracingAsync) { ExecuteWithProfiling(true); }
 
+TEST(CAPI, MultipleProfilerSession) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(false));
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_ProfilerContext* profiler_context = TFE_NewProfilerContext();
+  TFE_ProfilerContextSetEagerContext(profiler_context, ctx);
+
+  TFE_Profiler* profiler1 = TFE_NewProfiler(profiler_context);
+  EXPECT_TRUE(TFE_ProfilerIsOk(profiler1));
+
+  TFE_Profiler* profiler2 = TFE_NewProfiler(profiler_context);
+  EXPECT_FALSE(TFE_ProfilerIsOk(profiler2));
+
+  TFE_DeleteProfiler(profiler1);
+  TFE_DeleteProfiler(profiler2);
+  TFE_DeleteProfilerContext(profiler_context);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index b70c0f1c112c675641a023d6c7bf4fa847ee4610..a563e4b8f50f2a90497736f4cb9ca234400bfa04 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -107,9 +107,14 @@ struct TFE_Op {
   tensorflow::EagerOperation operation;
 };
 
+struct TFE_ProfilerContext {
+  tensorflow::ProfilerContext profiler_context;
+};
+
 struct TFE_Profiler {
-  TFE_Profiler(TFE_Context* ctx)
-      : profiler(tensorflow::ProfilerSession::Create(&ctx->context)) {}
+  TFE_Profiler(TFE_ProfilerContext* ctx) {
+    profiler = tensorflow::ProfilerSession::Create(&ctx->profiler_context);
+  }
 
   std::unique_ptr<tensorflow::ProfilerSession> profiler;
 };
diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc
index 9505bf9dda32b9a338b574f1d31ec555a5628c6a..71181ae430ab64106e2a75937bd54fbf2efc61ac 100644
--- a/tensorflow/c/kernels.cc
+++ b/tensorflow/c/kernels.cc
@@ -173,9 +173,10 @@ void TF_OpKernelContext_Failure(TF_OpKernelContext* ctx, TF_Status* status) {
   cc_ctx->CtxFailure(s);
 }
 
-#define DEFINE_TF_GETATTR_(struct_name, func, c_type, cc_type)                 \
-  void struct_name##_GetAttr##func(struct_name* ctx, const char* attr_name,    \
-                                   c_type* val, TF_Status* status) {           \
+#define DEFINE_TF_GETATTR(func, c_type, cc_type)                               \
+  void TF_OpKernelConstruction_GetAttr##func(TF_OpKernelConstruction* ctx,     \
+                                             const char* attr_name,            \
+                                             c_type* val, TF_Status* status) { \
     TF_SetStatus(status, TF_OK, "");                                           \
     cc_type v;                                                                 \
     auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelConstruction*>(ctx); \
@@ -186,10 +187,6 @@ void TF_OpKernelContext_Failure(TF_OpKernelContext* ctx, TF_Status* status) {
     }                                                                          \
   }
 
-#define DEFINE_TF_GETATTR(func, c_type, cc_type)                     \
-  DEFINE_TF_GETATTR_(TF_OpKernelConstruction, func, c_type, cc_type) \
-  DEFINE_TF_GETATTR_(TF_OpKernelContext, func, c_type, cc_type)
-
 DEFINE_TF_GETATTR(Type, TF_DataType, tensorflow::DataType)
 
 TF_DataType TF_ExpectedOutputDataType(TF_OpKernelContext* ctx, int i) {
diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h
index b015d0103969355e8566242bfcc007f697c6ae18..c47bfa8aa3a721d422a0a1536b924f3e53793193 100644
--- a/tensorflow/c/kernels.h
+++ b/tensorflow/c/kernels.h
@@ -137,15 +137,6 @@ TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrType(
     TF_OpKernelConstruction* ctx, const char* attr_name, TF_DataType* val,
     TF_Status* status);
 
-// Interprets the named kernel context attribute as a TF_DataType and places it
-// into *val. *status is set to TF_OK.
-//
-// If the attribute could not be found or could not be interpreted as
-// TF_DataType, *status is populated with an error.
-TF_CAPI_EXPORT extern void TF_OpKernelContext_GetAttrType(
-    TF_OpKernelContext* ctx, const char* attr_name, TF_DataType* val,
-    TF_Status* status);
-
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc
index 0d2954717e7a83c102a35815809a554e3a917e07..608887722f7bca44c884a3426d5e378e9387a530 100644
--- a/tensorflow/c/kernels_test.cc
+++ b/tensorflow/c/kernels_test.cc
@@ -36,6 +36,15 @@ static void* MyCreateFunc(TF_OpKernelConstruction* ctx) {
   struct MyCustomKernel* s = new struct MyCustomKernel;
   s->created = true;
   s->compute_called = false;
+
+  // Exercise attribute reads.
+  TF_DataType type;
+  TF_Status* status = TF_NewStatus();
+  TF_OpKernelConstruction_GetAttrType(ctx, "SomeDataTypeAttr", &type, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status));
+  EXPECT_EQ(TF_FLOAT, type);
+  TF_DeleteStatus(status);
+
   return s;
 }
 
@@ -43,17 +52,7 @@ static void MyComputeFunc(void* kernel, TF_OpKernelContext* ctx) {
   struct MyCustomKernel* s = static_cast<struct MyCustomKernel*>(kernel);
   s->compute_called = true;
   if (ctx != nullptr) {
-    TF_Status* status = TF_NewStatus();
-
     EXPECT_EQ(43, TF_StepId(ctx));
-
-    // Exercise attribute reads.
-    TF_DataType type;
-    TF_OpKernelContext_GetAttrType(ctx, "SomeDataTypeAttr", &type, status);
-    EXPECT_EQ(TF_OK, TF_GetCode(status));
-    EXPECT_EQ(TF_FLOAT, type);
-
-    TF_DeleteStatus(status);
   }
 }
 
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index a09becc49b10d2c58f98fbcc11df5190f794c1d4..4c4d587fce04d101b3cc8faebcc3ba04f2f1d0cf 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -150,6 +150,7 @@ cc_library_with_android_deps(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -586,6 +587,25 @@ tf_gen_op_wrappers_cc(
     pkg = "//tensorflow/core",
 )
 
+tf_gen_op_wrappers_cc(
+    name = "tpu_ops",
+    include_internal_ops = 1,
+    op_lib_names = [
+        "tpu_configuration_ops",
+        "tpu_cross_replica_ops",
+        "tpu_embedding_ops",
+        "tpu_functional_ops",
+        "tpu_heartbeat_ops",
+        "tpu_host_compute_ops",
+        "tpu_infeed_ops",
+        "tpu_outfeed_ops",
+        "tpu_ordinal_selector_ops",
+        "tpu_replication_ops",
+    ],
+    pkg = "//tensorflow/core",
+    visibility = ["//tensorflow:internal"],
+)
+
 cc_library_with_android_deps(
     name = "cc_op_gen_main",
     srcs = [
diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index 39593370d1c243e84dc5b6091724d1d404c102b0..43a33cbea6e1e4a50f61cc7d6d8d70cac6a603d2 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -321,6 +321,7 @@ std::pair<const char*, bool> AttrTypeName(StringPiece attr_type) {
           {"tensor", {"TensorProto", true}},
           {"list(tensor)", {"gtl::ArraySlice<TensorProto>", true}},
           {"func", {"NameAttrList", true}},
+          {"list(func)", {"gtl::ArraySlice<NameAttrList>", true}},
       };
 
   auto entry = attr_type_map->find(attr_type);
diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc
index affd90b1bcc7cb4a8b3ffed6aeeb4bd480f5e314..a7e645e8b556f14f0c7a51d2eba6ab1e2256b837 100644
--- a/tensorflow/cc/framework/gradients.cc
+++ b/tensorflow/cc/framework/gradients.cc
@@ -96,7 +96,7 @@ class SymbolicGradientBuilder {
   // Used to identify nodes at which to stop backprop.
   std::unordered_set<int> GetStopBackpropNodes(
       const std::vector<bool>& reachable_nodes,
-      std::unordered_set<int> output_nodes);
+      const std::unordered_set<int>& output_nodes);
 
   const Scope& scope_;
   const ops::GradOpRegistry* registry_;
@@ -167,7 +167,6 @@ Status SymbolicGradientBuilder::BackpropAlongEdge(const Output& dst_grad,
 std::vector<bool> SymbolicGradientBuilder::GetReachableNodes() {
   std::vector<bool> reachable_nodes(scope_.graph()->num_node_ids(), false);
   std::deque<Node*> queue;
-  std::vector<bool> visited(scope_.graph()->num_node_ids(), false);
   for (const Output& out : outputs_) {
     if (!reachable_nodes[out.node()->id()]) {
       queue.push_back(out.node());
@@ -180,10 +179,10 @@ std::vector<bool> SymbolicGradientBuilder::GetReachableNodes() {
     queue.pop_front();
     for (const Edge* e : n->in_edges()) {
       if (e->IsControlEdge()) continue;
-      if (visited[e->src()->id()]) continue;
-      queue.push_back(e->src());
-      reachable_nodes[e->src()->id()] = true;
-      visited[e->src()->id()] = true;
+      if (!reachable_nodes[e->src()->id()]) {
+        queue.push_back(e->src());
+        reachable_nodes[e->src()->id()] = true;
+      }
     }
   }
   return reachable_nodes;
@@ -191,7 +190,7 @@ std::vector<bool> SymbolicGradientBuilder::GetReachableNodes() {
 
 std::unordered_set<int> SymbolicGradientBuilder::GetStopBackpropNodes(
     const std::vector<bool>& reachable_nodes,
-    std::unordered_set<int> output_nodes) {
+    const std::unordered_set<int>& output_nodes) {
   // Output nodes that get transitively consumed by other `outputs_` are stored
   // in `internal_outputs`.
   std::unordered_set<int> internal_outputs;
diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD
index 52345a376cc29ee47ccb9888c9bb26292468b5a9..dedd55f16afb879ea966dc89d14d88ee15d9e83e 100644
--- a/tensorflow/cc/saved_model/BUILD
+++ b/tensorflow/cc/saved_model/BUILD
@@ -81,6 +81,7 @@ cc_library(
     ] + if_not_mobile([
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:lib",
+        "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
     ]) + if_android([
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index 10f7abf09e925c0c31cfd595ecee4605f189476f..66260fcf4a9b24f78d45010c6e86d4ee398b6d3d 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/protobuf_internal.h"
-#include "tensorflow/core/protobuf/saved_model.pb.h"
 #include "tensorflow/core/protobuf/saver.pb.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/public/session_options.h"
diff --git a/tensorflow/compat_template.__init__.py b/tensorflow/compat_template.__init__.py
index 1833b6a65eef9baa2e92a13d9c4d44b79620de2f..2cf68c9cd8396987899b4f34f21b994b4722ead4 100644
--- a/tensorflow/compat_template.__init__.py
+++ b/tensorflow/compat_template.__init__.py
@@ -19,13 +19,19 @@ from __future__ import division as _division
 from __future__ import print_function as _print_function
 
 import os as _os
+import sys as _sys
 
 # pylint: disable=g-bad-import-order
-from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 
 # API IMPORTS PLACEHOLDER
 
 from tensorflow.python.tools import component_api_helper as _component_api_helper
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=('tensorboard.summary._tf.summary'),
+    error_msg=(
+        "Limited tf.compat.v2.summary API due to missing TensorBoard "
+        "installation"))
 _component_api_helper.package_hook(
     parent_package_str=__name__,
     child_package_str=(
@@ -41,3 +47,10 @@ _component_api_helper.package_hook(
 #
 # This make this one symbol available directly.
 from tensorflow.python.compat.v2_compat import enable_v2_behavior  # pylint: disable=g-import-not-at-top
+
+# Add module aliases
+_current_module = _sys.modules[__name__]
+if hasattr(_current_module, 'keras'):
+  losses = keras.losses
+  metrics = keras.metrics
+  optimizers = keras.optimizers
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index 16151e77737429f4fbf690fc34b12a70bacebdc4..af016bf80e7a10d8729a1eb385466af48b5810cd 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -30,6 +30,7 @@ cc_library(
         "flags.h",
     ],
     deps = [
+        ":aot_only_var_handle_op",
         ":embedded_protocol_buffers",
         "//tensorflow/compiler/tf2xla",
         "//tensorflow/compiler/tf2xla:cpu_function_runtime",
@@ -71,6 +72,7 @@ tf_cc_test(
         ":tfcompile_lib",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "@com_google_absl//absl/strings",
@@ -205,6 +207,15 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "aot_only_var_handle_op",
+    srcs = ["aot_only_var_handle_op.cc"],
+    deps = [
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+    ],
+    alwayslink = 1,
+)
+
 tf_cc_test(
     name = "benchmark_test",
     srcs = ["benchmark_test.cc"],
diff --git a/tensorflow/compiler/aot/aot_only_var_handle_op.cc b/tensorflow/compiler/aot/aot_only_var_handle_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0ce36a979f424610a5aa952afa8db2245ed971a9
--- /dev/null
+++ b/tensorflow/compiler/aot/aot_only_var_handle_op.cc
@@ -0,0 +1,56 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_context.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+namespace {
+
+// Implementation of varhandle that binds a VarHandleOp to an XlaResource of the
+// same name. It is not safe to use this op in a JIT context.
+class XlaAotOnlyVarHandleOp : public XlaOpKernel {
+ public:
+  explicit XlaAotOnlyVarHandleOp(OpKernelConstruction* c);
+  void Compile(XlaOpKernelContext* context) override;
+
+ private:
+  string name_;
+};
+
+XlaAotOnlyVarHandleOp::XlaAotOnlyVarHandleOp(OpKernelConstruction* c)
+    : XlaOpKernel(c) {
+  OP_REQUIRES_OK(c, c->GetAttr("shared_name", &name_));
+}
+
+void XlaAotOnlyVarHandleOp::Compile(XlaOpKernelContext* context) {
+  // Look for a resource of the same name. TF also keys that on the container
+  // and type attributes, but that doesn't seem necessary.
+  for (const auto& resource : context->xla_context()->resources()) {
+    if (resource->kind() == XlaResource::kVariable &&
+        resource->name() == name_) {
+      context->SetResourceOutput(0, resource.get());
+      return;
+    }
+  }
+  context->SetStatus(
+      errors::InvalidArgument("Variable: ", name_, " not configured"));
+}
+}  // namespace
+
+REGISTER_XLA_OP(Name("VarHandleOp").CompilationOnly(), XlaAotOnlyVarHandleOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index d016632da2a9d7c2c2f81c02dd573787a0502923..da0598736a7d6b7f55458d76ca30fa6ad46a74f9 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -168,12 +168,12 @@ Status GenArgMethods(const tf2xla::Config& config,
                      const xla::ProgramShapeProto& ps,
                      const CompileResult& compile_result, string* methods) {
   size_t num_args = ps.parameters_size();
-  if (config.feed_size() != num_args) {
-    return errors::InvalidArgument("mismatch between feed_size(",
-                                   config.feed_size(), ") and num_args(",
-                                   num_args, ")");
+  if (config.feed_size() + config.variable_size() != num_args) {
+    return errors::InvalidArgument(
+        "mismatch between feed_size(", config.feed_size(), ")+variable_size(",
+        config.variable_size(), ") and num_args(", num_args, ")");
   }
-  for (int i = 0; i < num_args; ++i) {
+  for (int i = 0; i < config.feed_size(); ++i) {
     std::vector<std::pair<string, string>> rewrites;
     TF_RETURN_IF_ERROR(
         AddRewritesForShape(i, xla::Shape(ps.parameters(i)), &rewrites));
@@ -212,12 +212,14 @@ Status GenResultMethods(const tf2xla::Config& config,
     // tuple result, and we rely on this to simplify code generation.
     return errors::Internal("codegen requires the XLA result to be a tuple");
   }
-  if (config.fetch_size() != ps.result().tuple_shapes_size()) {
+  size_t num_results = ps.result().tuple_shapes_size();
+  if (config.fetch_size() + config.variable_size() != num_results) {
     return errors::InvalidArgument("mismatch between fetch_size(",
-                                   config.feed_size(), ") and tuple_size(",
+                                   config.fetch_size(), ")+variable_size(",
+                                   config.variable_size(), ") and tuple_size(",
                                    ps.result().tuple_shapes_size(), ")");
   }
-  for (int i = 0; i < ps.result().tuple_shapes_size(); ++i) {
+  for (int i = 0; i < config.fetch_size(); ++i) {
     std::vector<std::pair<string, string>> rewrites;
     TF_RETURN_IF_ERROR(AddRewritesForShape(
         i, xla::Shape(ps.result().tuple_shapes(i)), &rewrites));
@@ -245,6 +247,51 @@ Status GenResultMethods(const tf2xla::Config& config,
   return Status::OK();
 }
 
+// Generate methods for variables.
+Status GenVariableMethods(const tf2xla::Config& config,
+                          const xla::ProgramShapeProto& ps, string* methods) {
+  size_t num_args = ps.parameters_size();
+  for (int i = config.feed_size(); i < num_args; ++i) {
+    std::vector<std::pair<string, string>> rewrites;
+    TF_RETURN_IF_ERROR(
+        AddRewritesForShape(i, xla::Shape(ps.parameters(i)), &rewrites));
+    const string code = R"(
+  void set_var_{{NAME}}_input_data({{TYPE}}* data) {
+    set_arg_data({{I}}, data);
+  }
+)";
+    const tf2xla::Variable& var = config.variable(i - config.feed_size());
+    *methods += RewriteWithName(
+        var.name().empty() ? var.node_name() : var.name(), code, rewrites);
+  }
+  size_t num_results = ps.result().tuple_shapes_size();
+  for (int i = config.fetch_size(); i < num_results; ++i) {
+    std::vector<std::pair<string, string>> rewrites;
+    TF_RETURN_IF_ERROR(AddRewritesForShape(
+        i, xla::Shape(ps.result().tuple_shapes(i)), &rewrites));
+    string code = R"(
+  {{TYPE}}* var_{{NAME}}_result_data() {
+    return static_cast<{{TYPE}}*>(result_data({{I}}));
+  }
+  {{TYPE}}& var_{{NAME}}_result({{DIM_VARS}}) {
+    return (*static_cast<{{TYPE}}(*){{DIM_SIZES}}>(
+        result_data({{I}}))){{INDICES}};
+  }
+  const {{TYPE}}* var_{{NAME}}_result_data() const {
+    return static_cast<const {{TYPE}}*>(result_data({{I}}));
+  }
+  const {{TYPE}}& var_{{NAME}}_result({{DIM_VARS}}) const {
+    return (*static_cast<const {{TYPE}}(*){{DIM_SIZES}}>(
+        result_data({{I}}))){{INDICES}};
+  }
+)";
+    const tf2xla::Variable& var = config.variable(i - config.fetch_size());
+    *methods += RewriteWithName(
+        var.name().empty() ? var.node_name() : var.name(), code, rewrites);
+  }
+  return Status::OK();
+}
+
 // Generates code implementing {Arg,Result}Names(), where T is one of
 // tf2xla::{Feed,Fetch}. Each feed or fetch name results in a C-style string
 // literal in the array, with nullptr terminating the array.
@@ -291,6 +338,14 @@ Status ValidateFeedFetchCppNames(const tf2xla::Config& config) {
       TF_RETURN_IF_ERROR(ValidateCppIdent(fetch.name(), "fetch name"));
     }
   }
+  for (const tf2xla::Variable& variable : config.variable()) {
+    if (!variable.name().empty()) {
+      TF_RETURN_IF_ERROR(ValidateCppIdent(variable.name(), "variable name"));
+    } else {
+      TF_RETURN_IF_ERROR(
+          ValidateCppIdent(variable.node_name(), "variable name"));
+    }
+  }
   return Status::OK();
 }
 
@@ -339,9 +394,10 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config,
   std::vector<BufferInfo> buffer_infos_for_temps =
       ExtractTempBufferInfos(buffer_infos);
   const xla::ProgramShapeProto& ps = compile_result.program_shape;
-  string methods_arg, methods_result;
+  string methods_arg, methods_result, methods_variable;
   TF_RETURN_IF_ERROR(GenArgMethods(config, ps, compile_result, &methods_arg));
   TF_RETURN_IF_ERROR(GenResultMethods(config, ps, &methods_result));
+  TF_RETURN_IF_ERROR(GenVariableMethods(config, ps, &methods_variable));
   const size_t arg_bytes_aligned = cpu_function_runtime::AlignedBufferBytes(
       buffer_infos_for_args.data(), buffer_infos_for_args.size(),
       /*allocate_entry_params=*/true);
@@ -523,6 +579,21 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction {
   // buffers are managed internally, and may change after each call to Run.
 {{METHODS_RESULT}}
 
+  // Methods for managing variable buffers. Buffers are in row-major order. The
+  // input and output buffers may or may not be identical.
+  //
+  // void set_var_X_data(T* data)
+  //   Sets the buffer for variable X.
+  //
+  // T* var_X_data()
+  //   Returns the buffer of type T for variable X.
+  //
+  // T& var_X(...dim indices...)
+  //   Returns a reference to the value of type T for variable X,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+{{METHODS_VARIABLE}}
+
  private:
   // Number of buffers for the compiled computation.
   static constexpr size_t kNumBuffers = {{NUM_BUFFERS}};
@@ -589,6 +660,7 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction {
        include_hlo_profile_printer_data_proto},
       {"{{METHODS_ARG}}\n", methods_arg},
       {"{{METHODS_RESULT}}\n", methods_result},
+      {"{{METHODS_VARIABLE}}\n", methods_variable},
       {"{{NS_END}}\n", ns_end},
       {"{{NS_START}}\n", ns_start},
       {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(xla::ProgramShape(ps))},
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc
index c1788ca32a1d099284eeb870f9513891051fd29e..5580e55b691bd10698b63d86bc0194b25da743b9 100644
--- a/tensorflow/compiler/aot/codegen_test.cc
+++ b/tensorflow/compiler/aot/codegen_test.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "llvm/Support/TargetSelect.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -172,6 +174,15 @@ TEST(CodegenTest, Golden) {
   tf2xla::Fetch* fetch = config.add_fetch();
   fetch->mutable_id()->set_node_name("fetch0");
   fetch->set_name("myfetch");
+  tf2xla::Variable* variable = config.add_variable();
+  variable->set_node_name("myvar");
+  variable->mutable_shape()->add_dim()->set_size(1);
+  variable->set_type(DT_FLOAT);
+  tf2xla::Variable* variable2 = config.add_variable();
+  variable2->set_node_name("my/var");
+  variable2->set_name("myvar2");
+  variable2->mutable_shape()->add_dim()->set_size(5);
+  variable2->set_type(DT_INT32);
   CompileResult compile_result;
   compile_result.aot.reset(new xla::cpu::CpuAotCompilationResult(
       {},
@@ -186,9 +197,14 @@ TEST(CodegenTest, Golden) {
           {
               xla::ShapeUtil::MakeShape(xla::F32, {1, 2}),
               xla::ShapeUtil::MakeShape(xla::S64, {3, 4}),
+              xla::ShapeUtil::MakeShape(xla::F32, {1}),
+              xla::ShapeUtil::MakeShape(xla::S32, {5}),
           },
-          xla::ShapeUtil::MakeTupleShape(
-              {xla::ShapeUtil::MakeShape(xla::U32, {5, 6})}))
+          xla::ShapeUtil::MakeTupleShape({
+              xla::ShapeUtil::MakeShape(xla::U32, {5, 6}),
+              xla::ShapeUtil::MakeShape(xla::F32, {1}),
+              xla::ShapeUtil::MakeShape(xla::S32, {5}),
+          }))
           .ToProto();
   compile_result.entry_point = "entry_point";
   compile_result.pointer_size = 8;
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index 35994fc785d3e1d5e883c49bec96de315e189d2e..b5f33d690d492489e9090786cd341e035ae7ca15 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -52,7 +52,7 @@ namespace bar {
 //   is guaranteed that no thread may call a non-const method.
 //
 // The logical function signature is:
-//   ((unknown): f32[1,2], (unknown): s64[3,4]) -> (u32[5,6])
+//   ((unknown): f32[1,2], (unknown): s64[3,4], (unknown): f32[1], (unknown): s32[5]) -> (u32[5,6], f32[1], s32[5])
 //
 // Memory stats:
 //   arg bytes total:    104
@@ -214,6 +214,58 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction {
         result_data(0)))[dim0][dim1];
   }
 
+  // Methods for managing variable buffers. Buffers are in row-major order. The
+  // input and output buffers may or may not be identical.
+  //
+  // void set_var_X_data(T* data)
+  //   Sets the buffer for variable X.
+  //
+  // T* var_X_data()
+  //   Returns the buffer of type T for variable X.
+  //
+  // T& var_X(...dim indices...)
+  //   Returns a reference to the value of type T for variable X,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+
+  void set_var_myvar_input_data(float* data) {
+    set_arg_data(2, data);
+  }
+
+  void set_var_myvar2_input_data(tensorflow::int32* data) {
+    set_arg_data(3, data);
+  }
+
+  float* var_myvar_result_data() {
+    return static_cast<float*>(result_data(1));
+  }
+  float& var_myvar_result() {
+    return (*static_cast<float(*)[1]>(
+        result_data(1)))[0];
+  }
+  const float* var_myvar_result_data() const {
+    return static_cast<const float*>(result_data(1));
+  }
+  const float& var_myvar_result() const {
+    return (*static_cast<const float(*)[1]>(
+        result_data(1)))[0];
+  }
+
+  tensorflow::int32* var_myvar2_result_data() {
+    return static_cast<tensorflow::int32*>(result_data(2));
+  }
+  tensorflow::int32& var_myvar2_result(size_t dim0) {
+    return (*static_cast<tensorflow::int32(*)[5]>(
+        result_data(2)))[dim0];
+  }
+  const tensorflow::int32* var_myvar2_result_data() const {
+    return static_cast<const tensorflow::int32*>(result_data(2));
+  }
+  const tensorflow::int32& var_myvar2_result(size_t dim0) const {
+    return (*static_cast<const tensorflow::int32(*)[5]>(
+        result_data(2)))[dim0];
+  }
+
  private:
   // Number of buffers for the compiled computation.
   static constexpr size_t kNumBuffers = 6;
@@ -257,7 +309,7 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction {
   static const xla::ProgramShapeProto* StaticProgramShape() {
     static const xla::ProgramShapeProto* kShape = []() {
     xla::ProgramShapeProto* proto = new xla::ProgramShapeProto;
-    proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[0], 64);
+    proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[0], 132);
     return proto;
   }();
     return kShape;
diff --git a/tensorflow/compiler/aot/codegen_test_o.golden b/tensorflow/compiler/aot/codegen_test_o.golden
index 7f7b96428572705f30144e6c95cd4cf9c44ce2a3..2884597abcf29583e6192296b0e4ce6825d7c01a 100644
Binary files a/tensorflow/compiler/aot/codegen_test_o.golden and b/tensorflow/compiler/aot/codegen_test_o.golden differ
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index 9fc223bdc7c0e207ce2005cb86250aa77e709df8..0e46a9f5e9d68fa2174f7bd9b9fa7c3a82dfb715 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -108,10 +108,13 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config,
                         computation.Snapshot());
     // Serialize the HloSnapshot deterministically so that all the outputs of a
     // tf_library genrule are deterministic.
-    string proto;
-    TF_RET_CHECK(SerializeToStringDeterministic(*module, &proto));
+    const size_t size = module->ByteSizeLong();
+    auto serialized = absl::make_unique<char[]>(size);
+    TF_RET_CHECK(
+        SerializeToBufferDeterministic(*module, serialized.get(), size));
     TF_RETURN_IF_ERROR(
-        WriteStringToFile(Env::Default(), flags.out_session_module, proto));
+        WriteStringToFile(Env::Default(), flags.out_session_module,
+                          absl::string_view(serialized.get(), size)));
   }
   xla::cpu::CpuAotCompilationOptions aot_opts(
       flags.target_triple, flags.target_cpu, flags.target_features,
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index 10fa33ab5e84dcbc1629bee6214e8969046f19c2..444264ba6e1f59c33551796025ba845c62c02d43 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -69,6 +69,7 @@ genrule(
         "test_graph_tfmatmulandadd.pb",
         "test_graph_tfsplits.pb",
         "test_graph_tftop_k.pb",
+        "test_graph_tfvariable.pb",
     ],
     # Set CUDA_VISIBLE_DEVICES='' to prevent the code we launch from using any
     # GPUs which might be present.  This is important because builds may run
@@ -222,6 +223,17 @@ tf_library(
     ],
 )
 
+tf_library(
+    name = "test_graph_tfvariable",
+    testonly = 1,
+    config = "test_graph_tfvariable.config.pbtxt",
+    cpp_class = "VariableComp",
+    graph = "test_graph_tfvariable.pb",
+    tags = [
+        "manual",
+    ],
+)
+
 tf_cc_test(
     name = "tfcompile_test",
     srcs = ["tfcompile_test.cc"],
@@ -241,6 +253,7 @@ tf_cc_test(
         ":test_graph_tfmatmulandadd_with_profiling",
         ":test_graph_tfsplits",
         ":test_graph_tftop_k",
+        ":test_graph_tfvariable",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:xla_data_proto",
diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py
index 7bac79ec062af7e790134286e34eda4e123e138a..42f8812def0503824416d92daa2db71a64c3db88 100644
--- a/tensorflow/compiler/aot/tests/make_test_graphs.py
+++ b/tensorflow/compiler/aot/tests/make_test_graphs.py
@@ -149,6 +149,14 @@ def tftop_k(_):
   array_ops.identity(output[1], name='indices')
 
 
+def tfvariable(_):
+  x = variables.Variable(1000.0, name='x')
+  old_x = x.value()
+  with ops.control_dependencies([old_x]):
+    new_x = x.assign_add(42.0)
+  array_ops.stack([old_x, new_x], name='result')
+
+
 def write_graph(build_graph, out_dir):
   """Build a graph using build_graph and write it out."""
   g = ops.Graph()
@@ -171,6 +179,7 @@ def main(_):
   write_graph(tfmatmulandadd, FLAGS.out_dir)
   write_graph(tfsplits, FLAGS.out_dir)
   write_graph(tftop_k, FLAGS.out_dir)
+  write_graph(tfvariable, FLAGS.out_dir)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/compiler/aot/tests/test_graph_tfvariable.config.pbtxt b/tensorflow/compiler/aot/tests/test_graph_tfvariable.config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b4c4215a330b014f595edde001aba73ad7d8263
--- /dev/null
+++ b/tensorflow/compiler/aot/tests/test_graph_tfvariable.config.pbtxt
@@ -0,0 +1,12 @@
+# Text form of tensorflow.tf2xla.Config proto.
+fetch {
+  id { node_name: "result" }
+}
+
+variable {
+  node_name: "x"
+  shape {
+    dim { size: 1 }
+  }
+  type: DT_FLOAT
+}
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index 4dd79e5882d7da61be029735ef2b165908c599f9..5f9316f3933713e12fc5960b9adfecc6e9bd99b5 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/aot/tests/test_graph_tfmatmulandadd_with_profiling.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfsplits.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tftop_k.h"
+#include "tensorflow/compiler/aot/tests/test_graph_tfvariable.h"
 #include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -473,6 +474,28 @@ TEST(TFCompileTest, TopK) {
   EXPECT_EQ(expected_indices[1], fn.result1(1));
 }
 
+TEST(TFCompileTest, Variable) {
+  Eigen::ThreadPool tp(1);
+  Eigen::ThreadPoolDevice device(&tp, tp.NumThreads());
+
+  VariableComp fn;
+  float x = 23;
+  fn.set_var_x_input_data(&x);
+
+  fn.set_thread_pool(&device);
+  fn.Run();
+  EXPECT_EQ(fn.result0(0, 0), 23);
+  EXPECT_EQ(fn.result0(1, 0), 65);
+  EXPECT_EQ(fn.var_x_result(), 65);
+
+  EXPECT_EQ(x, 23);
+  x = fn.var_x_result();
+  fn.Run();
+  EXPECT_EQ(fn.result0(0, 0), 65);
+  EXPECT_EQ(fn.result0(1, 0), 107);
+  EXPECT_EQ(fn.var_x_result(), 107);
+}
+
 TEST(TFCompileTest, AssertEqAndReturnDiff) {
   // Assert is converted into a no-op in XLA, so there is no failure even if the
   // two args are different.
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 3cae081ce7c78226390a82d222d57ac653c14321..121de401cefb2b56b984944dde769f226590dc67 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -208,6 +208,7 @@ cc_library(
         "//tensorflow/core/kernels:variable_ops",
         "//tensorflow/core/kernels/data:generator_dataset_op",
         "//tensorflow/core/kernels/data:iterator_ops",
+        "//tensorflow/core/kernels/data:optional_ops",
         "//tensorflow/core/kernels/data:prefetch_dataset_op",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/synchronization",
@@ -282,7 +283,6 @@ cc_library(
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
@@ -465,7 +465,6 @@ cc_library(
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
-        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc
index 9f4042630edaec1b9519b6434d859a48372e8b15..285b1efa53d91922c9fa161cfd2de34e1434d0c4 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc
@@ -115,6 +115,13 @@ void MergeOutgoingControlEdges(const Scope& s, Node* old_node, Node* new_node) {
     return;
   }
 
+  if (ctrl_edges.size() == 1 && ctrl_edges.front()->dst()->IsSink()) {
+    // Avoid creating a Merge node if we can just add an edge to _SINK
+    // instead.
+    s.graph()->AddControlEdge(new_node, s.graph()->sink_node());
+    return;
+  }
+
   // We can't merge control edges directly so we instead first "convert" them to
   // normal values that can be merged, merge the values and then "convert" the
   // merged value back into control.
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
index 390ffa694b6f127544d92f3024a02d877556aacd..c14c7465c55b7d350d6b3a6853cef6692140ce78 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
@@ -68,6 +68,8 @@ Status BuildXlaOps(const Scope& s, std::unique_ptr<Graph>* result) {
     }
   }
 
+  FixupSourceAndSinkEdges(graph.get());
+
   GraphOptimizationPassOptions opt_options;
   opt_options.graph = &graph;
   BuildXlaOpsPass pass(/*enable_lazy_compilation=*/true);
@@ -223,5 +225,23 @@ TEST_F(BuildXlaOpsTest, OnXlaDevice) {
   ASSERT_NE(write_op_new, nullptr);
   EXPECT_THAT(write_op_new, assign_var);
 }
+
+TEST_F(BuildXlaOpsTest, NoExtraMergeForEdgeToSink) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  FunctionDefLibrary flib_def =
+      CreateFunctionDefLibWithConstFunction("cluster_0");
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+  Node* call;
+  TF_ASSERT_OK(MakeXlaCompiledKernel(root.graph(), "cluster_0", "C", &call));
+
+  std::unique_ptr<Graph> graph;
+  TF_ASSERT_OK(BuildXlaOps(root, &graph));
+
+  Node* sink_node = graph->sink_node();
+  EXPECT_THAT(sink_node, NodeWith(CtrlDeps(NodeWith(Op("_XlaRun")),
+                                           NodeWith(Op("cluster_0")),
+                                           NodeWith(Op("NoOp")))));
+}
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc
index 0ef0d3db8c16e4b3f78d29aad5a2ae75a81d96f6..4397eea9af266cbd0392f08323e59077c9395150 100644
--- a/tensorflow/compiler/jit/deadness_analysis.cc
+++ b/tensorflow/compiler/jit/deadness_analysis.cc
@@ -113,7 +113,11 @@ class Predicate {
   enum class Kind { kAnd, kOr, kNot, kAndRecurrence, kSymbol };
 
   virtual string ToString() const = 0;
-  int64 hash() const { return hash_; }
+
+  // An ID assigned to the Predicate at construction time.  Conceptually like a
+  // pointer, except that it is stable across runs.
+  int64 id() const { return id_; }
+
   virtual absl::Span<Predicate* const> GetOperands() const = 0;
 
   virtual Kind kind() const = 0;
@@ -126,29 +130,19 @@ class Predicate {
   static void Visit(Predicate* p, const FunctionTy& func);
 
  protected:
-  explicit Predicate(int64 hash) : hash_(hash) {}
+  explicit Predicate(int64 id) : id_(id) {}
 
  private:
-  const int64 hash_;
+  const int64 id_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Predicate);
 };
 
-int64 HashPredicateSequence(Predicate::Kind kind,
-                            absl::Span<Predicate* const> preds) {
-  int64 hash = ::tensorflow::hash<Predicate::Kind>()(kind);
-  for (Predicate* pred : preds) {
-    hash = Hash64Combine(hash, pred->hash());
-  }
-  return hash;
-}
-
 // Represents a logical conjunction of a set of predicates.
 class AndPredicate : public Predicate {
  public:
-  explicit AndPredicate(std::vector<Predicate*> operands)
-      : Predicate(HashPredicateSequence(Kind::kAnd, operands)),
-        operands_(std::move(operands)) {}
+  explicit AndPredicate(int64 id, std::vector<Predicate*> operands)
+      : Predicate(id), operands_(std::move(operands)) {}
 
   string ToString() const override {
     if (operands().empty()) {
@@ -177,9 +171,8 @@ class AndPredicate : public Predicate {
 // Represents a logical disjunction of a set of predicates.
 class OrPredicate : public Predicate {
  public:
-  explicit OrPredicate(std::vector<Predicate*> operands)
-      : Predicate(HashPredicateSequence(Kind::kOr, operands)),
-        operands_(std::move(operands)) {}
+  explicit OrPredicate(int64 id, std::vector<Predicate*> operands)
+      : Predicate(id), operands_(std::move(operands)) {}
 
   string ToString() const override {
     if (operands().empty()) {
@@ -207,9 +200,8 @@ class OrPredicate : public Predicate {
 // Represents a logical negation of a set of predicates.
 class NotPredicate : public Predicate {
  public:
-  explicit NotPredicate(Predicate* operand)
-      : Predicate(HashPredicateSequence(Kind::kNot, {operand})),
-        operands_({operand}) {}
+  explicit NotPredicate(int64 id, Predicate* operand)
+      : Predicate(id), operands_({operand}) {}
 
   string ToString() const override {
     return absl::StrCat("~", operand()->ToString());
@@ -246,11 +238,9 @@ class NotPredicate : public Predicate {
 // iterations).
 class AndRecurrencePredicate : public Predicate {
  public:
-  explicit AndRecurrencePredicate(Predicate* start, Predicate* step,
+  explicit AndRecurrencePredicate(int64 id, Predicate* start, Predicate* step,
                                   std::vector<string> frame)
-      : Predicate(Hash(start, step, frame)),
-        operands_({start, step}),
-        frame_(std::move(frame)) {}
+      : Predicate(id), operands_({start, step}), frame_(std::move(frame)) {}
 
   Predicate* start() const { return operands_[0]; }
   Predicate* step() const { return operands_[1]; }
@@ -270,16 +260,6 @@ class AndRecurrencePredicate : public Predicate {
  private:
   std::array<Predicate*, 2> operands_;
   std::vector<string> frame_;
-
-  static int64 Hash(Predicate* start, Predicate* step,
-                    const std::vector<string>& frame) {
-    uint64 frame_hash = 0;
-    for (const string& sub_frame : frame) {
-      frame_hash = Hash64Combine(Hash64(sub_frame), frame_hash);
-    }
-    return Hash64Combine(
-        HashPredicateSequence(Kind::kAndRecurrence, {start, step}), frame_hash);
-  }
 };
 
 // Represents an uninterpreted symbol in a logical predicate.
@@ -289,8 +269,8 @@ class AndRecurrencePredicate : public Predicate {
 // symbols.
 class SymbolPredicate : public Predicate {
  public:
-  explicit SymbolPredicate(TensorId tensor_id, bool must_be_true)
-      : Predicate(Hash(tensor_id, must_be_true)),
+  explicit SymbolPredicate(int64 id, TensorId tensor_id, bool must_be_true)
+      : Predicate(id),
         tensor_id_(std::move(tensor_id)),
         must_be_true_(must_be_true) {}
 
@@ -313,13 +293,6 @@ class SymbolPredicate : public Predicate {
  private:
   TensorId tensor_id_;
   bool must_be_true_;
-
-  static int64 Hash(const TensorId tensor_id, bool must_be_true) {
-    return Hash64Combine(
-        ::tensorflow::hash<bool>()(must_be_true),
-        Hash64Combine(::tensorflow::hash<Predicate::Kind>()(Kind::kSymbol),
-                      TensorId::Hasher{}(tensor_id)));
-  }
 };
 
 template <typename FunctionTy>
@@ -477,8 +450,11 @@ class PredicateFactory {
 
   template <typename PredicateT, typename... Args>
   std::unique_ptr<Predicate> Make(Args&&... args) {
+    // If we ever expose the Predicate class outside this .cc file then we may
+    // want to make this hard to misuse (by accidentally passing in an arbitrary
+    // integer to the Predicate constructor for instance).
     return std::unique_ptr<PredicateT>(
-        new PredicateT(std::forward<Args>(args)...));
+        new PredicateT(id_counter_++, std::forward<Args>(args)...));
   }
 
   Predicate* MakeAndOrImpl(absl::Span<Predicate* const> operands, bool is_and);
@@ -559,6 +535,7 @@ class PredicateFactory {
   absl::flat_hash_map<SignatureForSymbol, std::unique_ptr<Predicate>,
                       HashSignatureForSymbol>
       interned_symbol_instances_;
+  int64 id_counter_ = 0;
   int stack_depth_ = 0;
 };
 
@@ -566,7 +543,7 @@ Predicate* PredicateFactory::MakeInternedAndOr(
     std::vector<Predicate*> simplified_ops, Predicate::Kind pred_kind) {
   std::stable_sort(
       simplified_ops.begin(), simplified_ops.end(),
-      [](Predicate* a, Predicate* b) { return a->hash() < b->hash(); });
+      [](Predicate* a, Predicate* b) { return a->id() < b->id(); });
 
   auto it = interned_and_or_instances_.find({pred_kind, simplified_ops});
   if (it != interned_and_or_instances_.end()) {
diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc
index 16ee8f86d55c72785368ac2fd67635eba2fa7cd7..38a5118d9a721b814e1b52ce4202d4fb783e3ac3 100644
--- a/tensorflow/compiler/jit/deadness_analysis_test.cc
+++ b/tensorflow/compiler/jit/deadness_analysis_test.cc
@@ -521,7 +521,7 @@ TEST(DeadnessAnalysisTest, Loop) {
     EXPECT_EQ(predicate_map[ControlOutputFor(iv2)],
               "{#true,&,*iv2/cond:0}<fr0>");
     EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
-              "({#true,&,*iv1/cond:0}<fr0> & {#true,&,*iv0/cond:0}<fr0>)");
+              "({#true,&,*iv0/cond:0}<fr0> & {#true,&,*iv1/cond:0}<fr0>)");
     EXPECT_EQ(predicate_map[ControlOutputFor(add1)],
               "({#true,&,*iv1/cond:0}<fr0> & {#true,&,*iv2/cond:0}<fr0>)");
   }
@@ -553,11 +553,11 @@ TEST(DeadnessAnalysisTest, ControlEquivalentLoopBodies) {
     EXPECT_EQ(predicate_map[ControlOutputFor(iv.induction_var)],
               "{#true,&,*iv0/cond:0}<loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv0)],
-              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}<loop>");
+              "{#true,&,(iv0/iv:0 & *iv0/cond:0)}<loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv1)],
-              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}<loop>");
+              "{#true,&,(iv0/iv:0 & *iv0/cond:0)}<loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
-              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}<loop>");
+              "{#true,&,(iv0/iv:0 & *iv0/cond:0)}<loop>");
   }
 }
 
@@ -643,22 +643,23 @@ TEST(DeadnessAnalysisTest, ControlEquivalentNestedLoopBodies) {
     EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer.induction_var)],
               "{#true,&,*iv_outer/cond:0}<outer_loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner.induction_var)],
-              "{({#true,&,*iv_outer/cond:0}<outer_loop> & "
-              "*iv_outer/cond:0),&,*iv_inner/cond:0}<inner_loop;outer_loop>");
+              "{(*iv_outer/cond:0 & "
+              "{#true,&,*iv_outer/cond:0}<outer_loop>),&,*iv_inner/"
+              "cond:0}<inner_loop;outer_loop>");
 
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv0)],
               "{{#true,&,(iv_outer/iv:0 & "
-              "*iv_outer/cond:0)}<outer_loop>,&,(*iv_inner/cond:0 & "
-              "iv_inner/iv:0)}<inner_loop;outer_loop>");
+              "*iv_outer/cond:0)}<outer_loop>,&,(iv_inner/iv:0 & "
+              "*iv_inner/cond:0)}<inner_loop;outer_loop>");
 
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv1)],
               "{{#true,&,(iv_outer/iv:0 & "
-              "*iv_outer/cond:0)}<outer_loop>,&,(*iv_inner/cond:0 & "
-              "iv_inner/iv:0)}<inner_loop;outer_loop>");
+              "*iv_outer/cond:0)}<outer_loop>,&,(iv_inner/iv:0 & "
+              "*iv_inner/cond:0)}<inner_loop;outer_loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
               "{{#true,&,(iv_outer/iv:0 & "
-              "*iv_outer/cond:0)}<outer_loop>,&,(*iv_inner/cond:0 & "
-              "iv_inner/iv:0)}<inner_loop;outer_loop>");
+              "*iv_outer/cond:0)}<outer_loop>,&,(iv_inner/iv:0 & "
+              "*iv_inner/cond:0)}<inner_loop;outer_loop>");
   }
 }
 
@@ -702,20 +703,21 @@ TEST(DeadnessAnalysisTest, ControlNonEquivalentNestedLoopBodies) {
     EXPECT_EQ(predicate_map[ControlOutputFor(outer_iv[0])],
               "{#true,&,*iv_outer/cond:0}<outer_loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(inner_iv[0])],
-              "{({#true,&,*iv_outer/cond:0}<outer_loop> & "
-              "*iv_outer/cond:0),&,*iv_inner/cond:0}<inner_loop;outer_loop>");
+              "{(*iv_outer/cond:0 & "
+              "{#true,&,*iv_outer/cond:0}<outer_loop>),&,*iv_inner/"
+              "cond:0}<inner_loop;outer_loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(outer_iv[1])],
               "{#true,&,*iv_outer/cond_1:0}<outer_loop>");
-    EXPECT_EQ(
-        predicate_map[ControlOutputFor(inner_iv[1])],
-        "{({#true,&,*iv_outer/cond_1:0}<outer_loop> & "
-        "*iv_outer/cond_1:0),&,*iv_inner/cond_1:0}<inner_loop;outer_loop>");
-    EXPECT_EQ(
-        predicate_map[ControlOutputFor(add0)],
-        "({({#true,&,*iv_outer/cond:0}<outer_loop> & "
-        "*iv_outer/cond:0),&,*iv_inner/cond:0}<inner_loop;outer_loop> & "
-        "{({#true,&,*iv_outer/cond_1:0}<outer_loop> & "
-        "*iv_outer/cond_1:0),&,*iv_inner/cond_1:0}<inner_loop;outer_loop>)");
+    EXPECT_EQ(predicate_map[ControlOutputFor(inner_iv[1])],
+              "{(*iv_outer/cond_1:0 & "
+              "{#true,&,*iv_outer/cond_1:0}<outer_loop>),&,*iv_inner/"
+              "cond_1:0}<inner_loop;outer_loop>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
+              "({(*iv_outer/cond:0 & "
+              "{#true,&,*iv_outer/cond:0}<outer_loop>),&,*iv_inner/"
+              "cond:0}<inner_loop;outer_loop> & {(*iv_outer/cond_1:0 & "
+              "{#true,&,*iv_outer/cond_1:0}<outer_loop>),&,*iv_inner/"
+              "cond_1:0}<inner_loop;outer_loop>)");
   }
 }
 
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 1f8ec09e19c01d0a8b2a3761135ed53dfb2ad3b0..261519de3478c8b3e30d206a15944b5a686598e2 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -307,22 +307,6 @@ REGISTER_OP("XlaHostCompute")
     .Attr("shapes: list(shape) >= 0")
     .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
 
-REGISTER_OP("_XlaSendFromHost")
-    .Input("inputs: Tinputs")
-    .Input("dynamic_key: string")
-    .Attr("Tinputs: list(type) >= 0")
-    .Attr("key: string")
-    .Attr("device_ordinal: int")
-    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
-
-REGISTER_OP("_XlaRecvAtHost")
-    .Input("dynamic_key: string")
-    .Output("outputs: Toutputs")
-    .Attr("Toutputs: list(type) >= 0")
-    .Attr("key: string")
-    .Attr("device_ordinal: int")
-    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
-
 REGISTER_OP("InputTest")
     .Output("o: float")
     .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
diff --git a/tensorflow/compiler/jit/encapsulate_util_test.cc b/tensorflow/compiler/jit/encapsulate_util_test.cc
index 3bb979e0698d2d6be42ed5bae66c25267928192c..6d1661222e3eaf9df4f9f91f2b426c80b55245b2 100644
--- a/tensorflow/compiler/jit/encapsulate_util_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_util_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
-#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
index ec745cdbb7e237f8b4935dd41e9791fc75f5355d..f0c9d573451952a398dce190e102a33270a4d739 100644
--- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
@@ -15,13 +15,17 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/encapsulate_xla_computations_pass.h"
 
+#include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -36,6 +40,25 @@ namespace {
 
 const char* const kXlaClusterOutput = "XlaClusterOutput";
 
+bool IsCpuGpuCompile(const Graph* graph) {
+  for (Node* n : graph->nodes()) {
+    string name;
+    // Only consider nodes being compiled.
+    if (!GetNodeAttr(n->attrs(),
+                     EncapsulateXlaComputationsPass::kXlaClusterAttr, &name)
+             .ok())
+      continue;
+    // Early return for any node with a device that is not a CPU or GPU.
+    DeviceNameUtils::ParsedName parsed;
+    if (DeviceNameUtils::ParseFullName(n->requested_device(), &parsed)) {
+      if (parsed.type != DEVICE_CPU && parsed.type != DEVICE_GPU) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 // Checks if a graph node is marked to be a guaranteed constant.
 bool is_guaranteed_constant(const Node& n) {
   bool guaranteed_constant = false;
@@ -173,10 +196,11 @@ Status RewriteSubgraph(const std::vector<OutputTensor>& arg_source_tensors,
   // Nondeterminism in serialization would not lead to incorrect results, but
   // may cause spurious cache misses. DeterministicSerialization is a
   // best-effort deterministic serialization.
-  string serialized;
-  TF_RET_CHECK(SerializeToStringDeterministic(gdef, &serialized));
-  uint64 fingerprint = Fingerprint64(serialized);
-  LOG(INFO) << "Subgraph fingerprint:" << fingerprint;
+  const size_t size = gdef.ByteSizeLong();
+  auto serialized = absl::make_unique<char[]>(size);
+  TF_RET_CHECK(SerializeToBufferDeterministic(gdef, serialized.get(), size));
+  uint64 fingerprint = Fingerprint64(absl::string_view(serialized.get(), size));
+  VLOG(1) << "Subgraph fingerprint:" << fingerprint;
   call_def->set_op(absl::StrCat(call_def->op(), "_", fingerprint));
   return Status::OK();
 }
@@ -351,12 +375,19 @@ Status EncapsulateXlaComputationsPass::Run(
           << dump_graph::DumpGraphToFile("encapsulate_xla_computations_before",
                                          **options.graph, options.flib_def);
 
-  TF_RETURN_IF_ERROR(Encapsulate(options.graph, options.flib_def));
+  const char* additional_help =
+      IsCpuGpuCompile(options.graph->get())
+          ? xla::status_macros::kPossibleAutoJitAlternative
+          : "";
+
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(Encapsulate(options.graph, options.flib_def),
+                                  additional_help);
   VLOG(1) << "EncapsulateXlaComputations() half-way: "
           << dump_graph::DumpGraphToFile("encapsulate_xla_computations_halfway",
                                          **options.graph, options.flib_def);
 
-  TF_RETURN_IF_ERROR(BuildXlaLaunchOps(options.graph->get()));
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(BuildXlaLaunchOps(options.graph->get()),
+                                  additional_help);
   VLOG(1) << "EncapsulateXlaComputations() finished: "
           << dump_graph::DumpGraphToFile("encapsulate_xla_computations_after",
                                          **options.graph, options.flib_def);
diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
index ebfffc3267e5acdf593bea2517c447083133e39c..5287fd175df206970b9fa73bc6b0176eddcdcaa9 100644
--- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
+++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
@@ -247,6 +247,7 @@ Status ConvertTensorFlowSliceToStaticShapedSlice(
           .NewSubScope(absl::StrCat(slice->name(), "/static_shaped_slice"));
   Scope host_scope = main_scope.WithAssignedDevice(host_name);
 
+  // In the future we may want to be clever here and avoid the extra Cast ops.
   SliceInputs slice_inputs_int64 =
       MakeSliceIndexAndSizeInt64(host_scope, slice_inputs);
 
@@ -312,9 +313,9 @@ Status RewriteSlice(Graph* g, Node* slice, const SliceInputs& slice_inputs,
   return Status::OK();
 }
 
-// Return true if `n` is a slice we can rewrite to have a static shape
+// Return true if `n` is a slice we should rewrite to have a static shape
 // (i.e. have the output shape only depend on the "size" input).
-xla::StatusOr<bool> IsRewritableSlice(Node* n) {
+xla::StatusOr<bool> ShouldRewriteSlice(Node* n) {
   if (n->type_string() != "Slice") {
     return false;
   }
@@ -332,14 +333,20 @@ xla::StatusOr<bool> IsRewritableSlice(Node* n) {
 
   // If slice_size[i] < -1 for any i then executing the slice will throw an
   // error, and we don't do anything here.
-  return absl::c_all_of(slice_inputs->size_as_vector,
-                        [](int64 size_i) { return size_i >= -1; });
+  bool slice_size_has_error = absl::c_all_of(
+      slice_inputs->size_as_vector, [](int64 size_i) { return size_i >= -1; });
+  if (!slice_size_has_error) {
+    return false;
+  }
+
+  // No point in rewriting slices that have both size and begin as constants.
+  return !slice_inputs->begin.node()->IsConstant();
 }
 
 Status FindAndRewriteSlices(Graph* g, bool* changed) {
   std::vector<Node*> slices_to_rewrite;
   for (Node* n : g->nodes()) {
-    TF_ASSIGN_OR_RETURN(bool is_rewritable, IsRewritableSlice(n));
+    TF_ASSIGN_OR_RETURN(bool is_rewritable, ShouldRewriteSlice(n));
     if (is_rewritable) {
       slices_to_rewrite.push_back(n);
     }
diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
index 32e30216de565b4c1918903bf6c70c321c38cbb3..2add2c13f92f561904163012ee16cc17ce5badce 100644
--- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
+++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
@@ -432,5 +432,26 @@ TEST(SliceToDynamicSliceRewriteTest, WithControlDepsToConstant) {
                                                       Name("dependency")))));
 }
 
+TEST(SliceToDynamicSliceRewriteTest, DontRewriteSliceWithConstBegin) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Const(root.WithOpName("begin"), {10, 10});
+  Output size = ops::Const(root.WithOpName("size"), {-1, 500});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* slice_node = testing::FindNodeByName(result.get(), "slice");
+  EXPECT_THAT(slice_node,
+              NodeWith(Op("Slice"), Inputs(Out(NodeWith(Op("Placeholder"))),
+                                           Out(NodeWith(Op("Const"))),
+                                           Out(NodeWith(Op("Const"))))));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index bab824c15f8f27f5325e79cd92d50cdaad850233..d0fa2c40be9d6b13ec736a9d6483dae0b4f0f45e 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -19,6 +19,7 @@ cc_library(
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc
index ad71df5a694a5f8da94675049df1062a7edb6253..997ef6e14bb9bd16ddac13eaf67368966818b29e 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.cc
+++ b/tensorflow/compiler/jit/kernels/xla_ops.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -35,6 +36,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/variable_ops.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/util/stream_executor_util.h"
@@ -304,10 +307,19 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) {
   xla::LocalExecutable* executable;
   std::map<int, OptionalTensor> variables;
 
-  OP_REQUIRES_OK(
-      ctx, CompileToLocalExecutable(ctx, function_, platform_info_, resources_,
-                                    constants_, /*lazy=*/false, &client,
-                                    &variables, &kernel, &executable));
+  {
+    Status s = CompileToLocalExecutable(
+        ctx, function_, platform_info_, resources_, constants_, /*lazy=*/false,
+        &client, &variables, &kernel, &executable);
+    if (!s.ok() && (platform_info_.device_type().type_string() == DEVICE_CPU ||
+                    platform_info_.device_type().type_string() == DEVICE_GPU)) {
+      // Suggest auto jit if the failure was with GPU or CPU.
+      errors::AppendToMessage(&s,
+                              xla::status_macros::kPossibleAutoJitAlternative);
+    }
+
+    OP_REQUIRES_OK(ctx, s);
+  }
 
   se::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 20c2cd7e0561f92a01486102c4d2c572fd80c957..d9a83049d6352f04f9237f21b44bdb5ea18e518a 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -1176,6 +1176,8 @@ Status MarkForCompilationPass::RunImpl(
       if (absl::optional<absl::string_view> cluster_name =
               GetXlaClusterForNode(*n)) {
         n->set_name(absl::StrCat(*cluster_name, "/", n->name()));
+      } else if (n->type_string() == "VarHandleOp") {
+        n->set_name(absl::StrCat("varhandle/", n->name()));
       } else {
         // There is room for improvement here.  In particular, it may help to
         // split these unclustered nodes into classes where every node in a
diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc
index 80993861abba050fa3d6a133023d3c99f41f73e3..3adcfef4dacecb343812cefc3a893a65c74ca101 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.cc
+++ b/tensorflow/compiler/jit/xla_cluster_util.cc
@@ -43,7 +43,7 @@ string DescribeCycle(const GraphCycles* cycles, const Graph& graph, int src,
     return "";
   }
 
-  auto node_name = [cycles, &graph](int node_id) {
+  auto node_name = [&graph](int node_id) {
     if (!FastBoundsCheck(node_id, graph.num_node_ids())) {
       return string("(null)");
     }
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index c67b4f11b030f22c123336327ff9fa67b1211d7a..56c4220f12b54be09821eca4590df52e8e71850b 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -102,7 +102,8 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
   }
 
   std::unique_ptr<XlaDeviceAllocator> alloc =
-      absl::make_unique<XlaDeviceAllocator>();
+      absl::make_unique<XlaDeviceAllocator>(
+          backend->stream_executors()[device_ordinal]);
   XlaDeviceAllocator* alloc_ptr = alloc.get();
   state.allocators_[{backend, device_ordinal}] = std::move(alloc);
   return alloc_ptr;
@@ -428,7 +429,7 @@ void XlaDevice::Sync(const DoneCallback& done) {
   // moment--when ThenEnqueueOnBackgroundThread is called--will have finished.
   // This achieves a device-wide sync.
   stream->ThenEnqueueOnBackgroundThread(
-      [this, stream, done](se::StreamExecutor*) {
+      [stream, done](se::StreamExecutor*) {
         tracing::ScopedActivity activity("XlaDevice::Sync::Callback",
                                          /*is_expensive=*/true);
         done(stream->ok() ? Status::OK()
@@ -479,6 +480,23 @@ bool XlaDevice::AllowsSyncOnCompletion() const {
   return sync_on_completion_;
 }
 
+void XlaDevice::SetHandleDeviceErrorCallback(std::function<Status()> callback) {
+  mutex_lock lock(mu_);
+  device_error_callback_ = callback;
+}
+
+Status XlaDevice::HandleDeviceError() {
+  std::function<Status()> local_device_error_callback;
+  {
+    mutex_lock lock(mu_);
+    local_device_error_callback = device_error_callback_;
+  }
+  if (local_device_error_callback != nullptr) {
+    return local_device_error_callback();
+  }
+  return Status::OK();
+}
+
 Status XlaDevice::RefreshStatus() {
   std::shared_ptr<se::Stream> stream;
   {
@@ -488,8 +506,14 @@ Status XlaDevice::RefreshStatus() {
   if (!stream) {
     return Status::OK();
   }
-  // Stream status is XlaDevice status, no extra operations needed.
-  return stream->RefreshStatus();
+  Status status = stream->RefreshStatus();
+  if (!status.ok()) {
+    // Ignore errors from HandleDeviceError, since by definition the status is
+    // already non-ok, so there's nothing extra to report if HandleDeviceError
+    // itself returns an error.
+    HandleDeviceError().IgnoreError();
+  }
+  return status;
 }
 
 XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device,
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index 5fe1290fa03f2b1f9d90e36dbc5769b3c2728c8d..977f5f5cf151d979d025c2966012445af04fc502 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -171,6 +171,9 @@ class XlaDevice : public LocalDevice {
   void SetAllowsSyncOnCompletion(bool sync_on_completion) LOCKS_EXCLUDED(mu_);
   bool AllowsSyncOnCompletion() const override LOCKS_EXCLUDED(mu_);
 
+  // Installs an error handling callback when RefreshStatus sees !status.ok().
+  void SetHandleDeviceErrorCallback(std::function<Status()> callback);
+
   Status RefreshStatus() override LOCKS_EXCLUDED(mu_);
 
  private:
@@ -187,6 +190,9 @@ class XlaDevice : public LocalDevice {
   static Status GetMetadataFromDevice(DeviceBase* device,
                                       const XlaDevice::Metadata** metadata);
 
+  // Handles error when RefreshStatus sees !status.ok().
+  Status HandleDeviceError();
+
   mutable mutex mu_;
   // The metadata of this XlaDevice.
   const Metadata xla_metadata_;
@@ -237,6 +243,9 @@ class XlaDevice : public LocalDevice {
   // regardless of status.
   bool sync_on_completion_ GUARDED_BY(mu_) = true;
 
+  // A callback that will be invoked when RefreshStatus sees a status error.
+  std::function<Status()> device_error_callback_ GUARDED_BY(mu_);
+
   // Set of devices to use. This controls which of the devices on the given
   // platform will have resources allocated. For GPUs this will be
   // filled from visible_gpu_devices list from session configuration.
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 28681bb8b03dbf97e8145972f9a04b5855fafdae..05b9c511866d3ca48ec3519bee8a4dbf6086f6ac 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -29,7 +29,10 @@ limitations under the License.
 namespace tensorflow {
 
 // The allocator used for Tensors assigned to the XLA device.
-XlaDeviceAllocator::XlaDeviceAllocator() {}
+XlaDeviceAllocator::XlaDeviceAllocator(
+    stream_executor::StreamExecutor* stream_executor)
+    : stream_executor_(stream_executor) {}
+
 XlaDeviceAllocator::~XlaDeviceAllocator() = default;
 
 string XlaDeviceAllocator::Name() { return "xla"; }
@@ -48,7 +51,21 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
   delete XlaTensor::FromOpaquePointer(ptr);
 }
 
-void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
+absl::optional<AllocatorStats> XlaDeviceAllocator::GetStats() {
+  absl::optional<stream_executor::AllocatorStats> se_stats =
+      stream_executor_->GetAllocatorStats();
+  if (!se_stats) {
+    return absl::nullopt;
+  }
+
+  tensorflow::AllocatorStats tf_stats;
+  tf_stats.num_allocs = se_stats->num_allocs;
+  tf_stats.bytes_in_use = se_stats->bytes_in_use;
+  tf_stats.peak_bytes_in_use = se_stats->peak_bytes_in_use;
+  tf_stats.largest_alloc_size = se_stats->largest_alloc_size;
+  tf_stats.bytes_limit = se_stats->bytes_limit;
+  return tf_stats;
+}
 
 XlaDeviceContext::XlaDeviceContext(
     std::shared_ptr<se::Stream> compute_stream,
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index e45db989fac720df6c3458c93a6b8dbb0919f930..1ce64ad323b4827adc2f4d48841315fbde43e532 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -34,14 +34,18 @@ namespace tensorflow {
 // empty, XlaTensor.
 class XlaDeviceAllocator : public Allocator {
  public:
-  XlaDeviceAllocator();
+  XlaDeviceAllocator(se::StreamExecutor* stream_executor);
   ~XlaDeviceAllocator() override;
 
   string Name() override;
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
+
+ private:
+  // The stream executor of the device.
+  se::StreamExecutor* stream_executor_;
 };
 
 // Helper class for managing data transfers between host and XLA devices.
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index 927f983ba9ef23c8509523f42366c0c89c29db9f..09e04d22def9c39f45c2737c1d4a5e7787e3fdc0 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/control_flow_ops.h"
 #include "tensorflow/core/kernels/data/generator_dataset_op.h"
 #include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/data/optional_ops.h"
 #include "tensorflow/core/kernels/data/prefetch_dataset_op.h"
 #include "tensorflow/core/kernels/fifo_queue.h"
 #include "tensorflow/core/kernels/function_ops.h"
@@ -241,6 +242,8 @@ class XlaAssignVariableOp : public OpKernel {
                           data::AnonymousIteratorHandleOp);                    \
   REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE),              \
                           data::IteratorGetNextOp);                            \
+  REGISTER_KERNEL_BUILDER(Name("IteratorGetNextAsOptional").Device(DEVICE),    \
+                          data::IteratorGetNextAsOptionalOp);                  \
   REGISTER_KERNEL_BUILDER(Name("IteratorGetNextSync").Device(DEVICE),          \
                           data::IteratorGetNextSyncOp);                        \
   REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle")                       \
@@ -251,6 +254,15 @@ class XlaAssignVariableOp : public OpKernel {
                               .Device(DEVICE)                                  \
                               .HostMemory("string_handle"),                    \
                           data::IteratorFromStringHandleOp);                   \
+  REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE),                 \
+                          data::OptionalNoneOp);                               \
+  REGISTER_KERNEL_BUILDER(Name("OptionalFromValue").Device(DEVICE),            \
+                          data::OptionalFromValueOp);                          \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("OptionalHasValue").Device(DEVICE).HostMemory("has_value"),         \
+      data::OptionalHasValueOp);                                               \
+  REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE),             \
+                          data::OptionalGetValueOp);                           \
   REGISTER_KERNEL_BUILDER(Name(FunctionLibraryDefinition::kArgOp)              \
                               .Device(DEVICE)                                  \
                               .HostMemory("output")                            \
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 9b6ca4092c3177ac26503add13bce25d2c0bb820..7c1e0daf0b7b418530367cb80fbd18b93e8e5f5e 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -250,6 +250,29 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "self_adjoint_eig_op_test",
+    size = "medium",
+    srcs = ["self_adjoint_eig_op_test.py"],
+    # TODO(kuny): remove it after b/124377352 is fixed.
+    disabled_backends = [
+        "cpu",
+        "gpu",
+        "cpu_ondemand",
+    ],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:map_fn",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 tf_xla_py_test(
     name = "matrix_triangular_solve_op_test",
     size = "small",
diff --git a/tensorflow/compiler/tests/categorical_op_test.py b/tensorflow/compiler/tests/categorical_op_test.py
index 5d5e486f616937601214aa169a4c329ab78932c8..eec69ea7d2d9af9ff570f927fb25b668ccce2b97 100644
--- a/tensorflow/compiler/tests/categorical_op_test.py
+++ b/tensorflow/compiler/tests/categorical_op_test.py
@@ -119,7 +119,7 @@ class CategoricalTest(xla_test.XLATestCase):
 
   def testSamplingCorrectness(self):
     np.random.seed(1618)  # Make it reproducible.
-    num_samples = 21000
+    num_samples = 40000
 
     rand_probs = np.random.dirichlet([1., 1., 2., 3.])
     rand_probs2 = np.random.dirichlet([1., 4., 5.], size=3)  # batched
diff --git a/tensorflow/compiler/tests/plugin.bzl b/tensorflow/compiler/tests/plugin.bzl
index fbc8781a3e59faecf985cde5114bf56a041c4be0..46a854d1459b7ea9d9fe3cf7689faee557c2cf84 100644
--- a/tensorflow/compiler/tests/plugin.bzl
+++ b/tensorflow/compiler/tests/plugin.bzl
@@ -18,13 +18,12 @@
 #   git update-index --assume-unchanged tensorflow/compiler/tests/plugin.bzl
 
 plugins = {
-  #"example": {
-  #  "device":"XLA_MY_DEVICE",
-  #  "types":"DT_FLOAT,DT_HALF,DT_INT32",
-  #   "tags":[],
-  #   "args":["--disabled_manifest=tensorflow/compiler/plugin/example/disabled_manifest.txt"],
-  #   "data":["//tensorflow/compiler/plugin/example:disabled_manifest.txt"],
-  #   "deps":[],
-  #},
+    #"example": {
+    #  "device":"XLA_MY_DEVICE",
+    #  "types":"DT_FLOAT,DT_HALF,DT_INT32",
+    #   "tags":[],
+    #   "args":["--disabled_manifest=tensorflow/compiler/plugin/example/disabled_manifest.txt"],
+    #   "data":["//tensorflow/compiler/plugin/example:disabled_manifest.txt"],
+    #   "deps":[],
+    #},
 }
-
diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py
index 97ffad34c00b8ec16eb1ec109ba5d980e0ce673d..34f2465ba63f235f893db9dd6930ac252c3e7226 100644
--- a/tensorflow/compiler/tests/random_ops_test.py
+++ b/tensorflow/compiler/tests/random_ops_test.py
@@ -122,8 +122,8 @@ class RandomOpsTest(xla_test.XLATestCase):
         beta = (b - mu) / sigma
         z = normal_cdf(beta) - normal_cdf(alpha)
 
-        self.assertTrue((y >= a).sum() == count)
-        self.assertTrue((y <= b).sum() == count)
+        self.assertEqual((y >= a).sum(), count)
+        self.assertEqual((y <= b).sum(), count)
 
         # For more information on these calculations, see:
         # Burkardt, John. "The Truncated Normal Distribution".
diff --git a/tensorflow/compiler/tests/scatter_nd_op_test.py b/tensorflow/compiler/tests/scatter_nd_op_test.py
index 693f8513bc54e30060a2e963abd504768535a50a..a9a87b8fb3104f8b9870c41e2aa28b0c48c12921 100644
--- a/tensorflow/compiler/tests/scatter_nd_op_test.py
+++ b/tensorflow/compiler/tests/scatter_nd_op_test.py
@@ -134,6 +134,12 @@ class ScatterNdTest(xla_test.XLATestCase):
     expected = np.array([0, 11, 0, 10, 9, 0, 0, 12], dtype=np.int32)
     self.assertAllEqual(expected, self._runScatterNd(indices, updates, [8]))
 
+  def testRepeatedIndices(self):
+    indices = np.array([[0], [1], [0], [1]], dtype=np.int32)
+    updates = np.array([9, 10, 11, 12], dtype=np.float32)
+    expected = np.array([20, 22], dtype=np.int32)
+    self.assertAllEqual(expected, self._runScatterNd(indices, updates, [2]))
+
   def testSimple2(self):
     indices = np.array([[1, 0], [1, 1]], dtype=np.int32)
     updates = np.array([11., 12.], dtype=np.float32)
diff --git a/tensorflow/compiler/tests/self_adjoint_eig_op_test.py b/tensorflow/compiler/tests/self_adjoint_eig_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfb5c82b22ea1d7400b54045edee0ca0782ce979
--- /dev/null
+++ b/tensorflow/compiler/tests/self_adjoint_eig_op_test.py
@@ -0,0 +1,62 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.self_adjoint_eig."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.platform import test
+
+
+class SelfAdjointEigOpTest(xla_test.XLATestCase, parameterized.TestCase):
+
+  def _test(self, dtype, shape):
+    np.random.seed(1)
+    x_np = np.random.uniform(
+        low=-1.0, high=1.0, size=np.prod(shape)).reshape(shape).astype(dtype)
+    x_np = x_np + np.swapaxes(x_np, -1, -2)
+    n = shape[-1]
+
+    e_np, _ = np.linalg.eigh(x_np)
+    with self.cached_session() as sess:
+      x_tf = array_ops.placeholder(dtype)
+      with self.test_scope():
+        e, v = linalg_ops.self_adjoint_eig(x_tf)
+      e_val, v_val = sess.run([e, v], feed_dict={x_tf: x_np})
+
+      v_diff = np.matmul(v_val, np.swapaxes(v_val, -1, -2)) - np.eye(n)
+      self.assertAlmostEqual(np.mean(v_diff**2), 0.0, delta=1e-6)
+      self.assertAlmostEqual(np.mean((e_val - e_np)**2), 0.0, delta=1e-6)
+
+  SIZES = [1, 2, 5, 10, 32]
+  DTYPES = [np.float32]
+  PARAMS = itertools.product(SIZES, DTYPES)
+
+  @parameterized.parameters(*PARAMS)
+  def testSelfAdjointEig(self, n, dtype):
+    for batch_dims in [(), (3,)] + [(3, 2)] * (n < 10):
+      self._test(dtype, batch_dims + (n, n))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py
index ee7ca7e6f196e114ff18e2597145e5c198980b08..df5914a518e06e4190c623a14287de8daefebd40 100644
--- a/tensorflow/compiler/tests/stateless_random_ops_test.py
+++ b/tensorflow/compiler/tests/stateless_random_ops_test.py
@@ -167,8 +167,8 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
         beta = (b - mu) / sigma
         z = normal_cdf(beta) - normal_cdf(alpha)
 
-        self.assertTrue((y >= a).sum() == n)
-        self.assertTrue((y <= b).sum() == n)
+        self.assertEqual((y >= a).sum(), n)
+        self.assertEqual((y <= b).sum(), n)
 
         # For more information on these calculations, see:
         # Burkardt, John. "The Truncated Normal Distribution".
diff --git a/tensorflow/compiler/tests/tensor_list_ops_test.py b/tensorflow/compiler/tests/tensor_list_ops_test.py
index 47e0f384a4f1e46ccc35584aaff3a0aceff8a985..a380715301b08ce2186c97b678b7235b9121d178 100644
--- a/tensorflow/compiler/tests/tensor_list_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_list_ops_test.py
@@ -102,7 +102,7 @@ class ListOpsTest(xla_test.XLATestCase):
       _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                    "Set the max number of elements"):
-        self.assertEqual(sess.run(e), 1.0 * np.ones((7, 15)))
+        self.assertAllEqual(sess.run(e), 1.0 * np.ones((7, 15)))
 
   def testEmptyTensorListMax(self):
     with self.cached_session() as sess, self.test_scope():
@@ -136,6 +136,17 @@ class ListOpsTest(xla_test.XLATestCase):
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.assertAllEqual(t, [3.0, 2.0])
 
+  def testSetDoesNotUpdatePushIndex(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.empty_tensor_list(
+          element_shape=[], element_dtype=dtypes.float32, max_num_elements=2)
+      # SetItem should not change the push index.
+      l = list_ops.tensor_list_set_item(l, 1, 3.)
+      l = list_ops.tensor_list_push_back(l, 5.)
+      l = list_ops.tensor_list_push_back(l, 7.)
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [5., 7.])
+
   def testGetSetReserved(self):
     with self.cached_session(), self.test_scope():
       l = list_ops.tensor_list_reserve(
@@ -146,6 +157,25 @@ class ListOpsTest(xla_test.XLATestCase):
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.assertAllEqual(t, [3.0, 0.0])
 
+  def testSetStackReservedUnknownElementShape(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.tensor_list_reserve(
+          element_dtype=dtypes.float32, element_shape=None, num_elements=2)
+      l = list_ops.tensor_list_set_item(l, 0, [3.0, 4.0])
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [[3.0, 4.0], [0., 0.]])
+
+  def testPushInEmptyListWithUnknownElementShape(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32, element_shape=None, max_num_elements=2)
+      l = list_ops.tensor_list_push_back(l, [3.0, 4.0])
+      # Pushing an element with a different shape should raise an error.
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "Shape"):
+        l = list_ops.tensor_list_push_back(l, 5.)
+        self.evaluate(
+            list_ops.tensor_list_stack(l, element_dtype=dtypes.float32))
+
   def testGetSetReservedNonScalar(self):
     with self.cached_session() as sess, self.test_scope():
       l = list_ops.tensor_list_reserve(
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 083e2e58ae02b1aa383da76aebfca60fac59b84b..f2e0eac2d99fe3b71ecabd4b9977817c5f9c372c 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -72,6 +72,7 @@ class UnaryOpsTest(xla_test.XLATestCase):
         output = op(pinp)
       result = session.run(output, {pinp: inp})
       if equality_test is None:
+        self.assertEqual(output.dtype, expected.dtype)
         self.assertAllCloseAccordingToType(
             result, expected, rtol=rtol, atol=atol, bfloat16_rtol=0.03)
       else:
@@ -260,7 +261,8 @@ class UnaryOpsTest(xla_test.XLATestCase):
       self._assertOpOutputMatchesExpected(
           math_ops.log1p,
           np.array([[1e-14, 1e-15, 0.6]], dtype=dtype),
-          expected=np.log1p(np.array([[1e-14, 1e-15, 0.6]], dtype=dtype)),
+          expected=np.log1p(np.array([[1e-14, 1e-15, 0.6]],
+                                     dtype=dtype)).astype(dtype),
           rtol=1e-4,
           atol=1e-6)
 
@@ -391,6 +393,11 @@ class UnaryOpsTest(xla_test.XLATestCase):
           expected=np.array(
               [[-0.66666669, -0.5, 0, 0.5, 0.66666669]], dtype=dtype))
 
+      self._assertOpOutputMatchesExpected(
+          math_ops.sign,
+          np.array([[-2.0, -1.0, -0.0, +0.0, 1.0, 2.0]], dtype=dtype),
+          expected=np.array([[-1.0, -1.0, -0.0, +0.0, 1.0, 1.0]], dtype=dtype))
+
       self._assertOpOutputMatchesExpected(
           math_ops.is_finite,
           np.array(
@@ -705,7 +712,7 @@ class UnaryOpsTest(xla_test.XLATestCase):
       self._assertOpOutputMatchesExpected(
           math_ops.abs,
           np.array([[2, -1]], dtype=dtype),
-          expected=np.array([[2, 1]], dtype=dtype))
+          expected=np.array([[2, 1]], dtype=np.real(dtype(0)).dtype))
 
       self._assertOpOutputMatchesExpected(
           math_ops.negative,
@@ -743,6 +750,10 @@ class UnaryOpsTest(xla_test.XLATestCase):
           np.array(
               [[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], dtype=dtype),
           expected=np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=np.bool))
+      self._assertOpOutputMatchesExpected(
+          math_ops.sign,
+          np.array([[np.nan]], dtype=dtype),
+          expected=np.array([[0.0]], dtype=dtype))
 
   def testLogicalOps(self):
     self._assertOpOutputMatchesExpected(
@@ -811,6 +822,12 @@ class UnaryOpsTest(xla_test.XLATestCase):
         np.array([1, 2, 0], np.int32),
         expected=np.array([2, 0, 1], dtype=np.int32))
 
+  def testInvertPermutationTwiceIsNoop(self):
+    self._assertOpOutputMatchesExpected(
+        lambda x: array_ops.invert_permutation(array_ops.invert_permutation(x)),
+        np.array([1, 2, 0], np.int32),
+        expected=np.array([1, 2, 0], dtype=np.int32))
+
   def testRank(self):
     rank_op = lambda x: array_ops.rank_internal(x, optimize=False)
     for dtype in self.numeric_types:
@@ -865,6 +882,17 @@ class UnaryOpsTest(xla_test.XLATestCase):
           np.array([[-1], [1], [4]], dtype=dtype),
           expected=np.int32(3))
 
+  def testSizeWithInt64OutType(self):
+
+    def size_op(x):
+      return array_ops.size_internal(x, optimize=False, out_type=np.int64)
+
+    for dtype in self.numeric_types:
+      self._assertOpOutputMatchesExpected(
+          size_op,
+          np.array([[-1], [1], [4]], dtype=dtype),
+          expected=np.int64(3))
+
   def testUnpack(self):
     self._assertOpOutputMatchesExpected(
         array_ops.unstack,
@@ -974,7 +1002,7 @@ class UnaryOpsTest(xla_test.XLATestCase):
   def _assertSoftplusMatchesExpected(self, features, dtype):
     features = np.array(features, dtype=dtype)
     zero = np.asarray(0).astype(dtype)
-    expected = np.logaddexp(zero, features)
+    expected = np.logaddexp(zero, features).astype(dtype)
     self._assertOpOutputMatchesExpected(
         nn_ops.softplus, features, expected=expected, rtol=1e-6, atol=9.1e-6)
 
diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD
index 00d3c8cc5f610ea5c308fa7df49d963c78919d63..63cad6a159c3a9b0da9e3bb86ff250dd29e45729 100644
--- a/tensorflow/compiler/tf2tensorrt/BUILD
+++ b/tensorflow/compiler/tf2tensorrt/BUILD
@@ -160,6 +160,7 @@ tf_custom_op_py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
         "//tensorflow/python:resources",
@@ -170,13 +171,11 @@ tf_cuda_library(
     name = "trt_resources",
     srcs = [
         "utils/trt_int8_calibrator.cc",
-        "utils/trt_resource_manager.cc",
         "utils/trt_resources.cc",
     ],
     hdrs = [
         "utils/trt_int8_calibrator.h",
         "utils/trt_lru_cache.h",
-        "utils/trt_resource_manager.h",
         "utils/trt_resources.h",
     ],
     deps = [
@@ -265,7 +264,6 @@ tf_cuda_library(
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:gpu_runtime",
         "//tensorflow/core:graph",
-        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:devices",
@@ -351,6 +349,7 @@ cc_library(
         "segment/segment.h",
         "segment/union_find.h",
     ],
+    copts = tf_copts(),
     deps = [
         "//tensorflow/core:graph",
         "//tensorflow/core:lib_proto_parsing",
@@ -360,11 +359,12 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "segment_test",
     size = "small",
     srcs = ["segment/segment_test.cc"],
     tags = [
+        "no_cuda_on_cpu_tap",
         "no_windows",
         "nomac",
     ],
@@ -430,7 +430,7 @@ cc_library(
     copts = tf_copts(),
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_proto_parsing",
     ],
 )
 
@@ -439,7 +439,7 @@ cc_library(
     srcs = ["utils/test_utils.cc"],
     hdrs = ["utils/test_utils.h"],
     deps = [
-        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_proto_parsing",
         "@com_googlesource_code_re2//:re2",
     ],
 )
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
index f3db42509ecf1d5176c8f56ef13d2c76d038ee7a..1f3cae3fda0cd7be296882b7b17ea47554edace8 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
 #include "tensorflow/compiler/tf2tensorrt/segment/segment.h"
 #include "tensorflow/compiler/tf2tensorrt/utils/test_utils.h"
-#include "tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.h"
 #include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
@@ -90,29 +89,40 @@ TrtCandidateSelector::TrtCandidateSelector(
 Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) {
   // TODO(laigd): move this set to TrtNodeValidator where it should belong.
   // LINT.IfChange
-  static const std::set<string> candidate_ops = {
+  static const auto* candidate_ops = new std::set<string>{
       "Abs",
+      "Acos",
+      "Acosh",
       "Add",
+      "Asin",
+      "Asinh",
+      "Atan",
+      "Atanh",
       "AvgPool",
       "BatchMatMul",
       "BiasAdd",
+      "Ceil",
       "ConcatV2",
       "Const",
       "Conv2D",
       "Conv2DBackpropInput",
+      "Cos",
+      "Cosh",
       "DepthwiseConv2dNative",
       "Div",
       "Exp",
       "ExpandDims",
+      "Floor",
       "FusedBatchNorm",
       "FusedBatchNormV2",
+      "GatherV2",
       "Identity",
       "LeakyRelu",
       "Log",
       "MatMul",
       "Max",
-      "MaxPool",
       "Maximum",
+      "MaxPool",
       "Mean",
       "Min",
       "Minimum",
@@ -126,8 +136,10 @@ Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) {
       "Relu6",
       "Reshape",
       "Rsqrt",
-      "Rsqrt",
       "Sigmoid",
+      "Sin",
+      "Sinh",
+      "Slice",
       "Snapshot",
       "Softmax",
       "Sqrt",
@@ -136,14 +148,15 @@ Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) {
       "StridedSlice",
       "Sub",
       "Sum",
+      "Tan",
       "Tanh",
       "TopKV2",
       "Transpose",
   };
   bool is_supported_op_type =
-      (candidate_ops.count(node->type_string()) ||
+      (candidate_ops->count(node->type_string()) ||
        PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
-  static const std::set<string> quantize_ops = {
+  static const auto* quantize_ops = new std::set<string>{
       "QuantizeAndDequantizeV2",
       "QuantizeAndDequantizeV3",
       "FakeQuantWithMinMaxVars",
@@ -153,7 +166,7 @@ Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) {
   // these ops to the relevant tensors. This happens regardless of the value of
   // use_calibration.
   if (precision_mode_ == TrtPrecisionMode::INT8 &&
-      quantize_ops.count(node->type_string())) {
+      quantize_ops->count(node->type_string())) {
     is_supported_op_type = true;
   }
   // LINT.ThenChange(//tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc)
@@ -190,55 +203,6 @@ tensorflow::Status BuildNodeMap(
 
 }  // namespace
 
-// Function to get calibration from ResourceMgr and put them into nodedef.
-tensorflow::Status ConvertCalibGraphToInferGraph(
-    const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph,
-    bool is_dyn_op) {
-  LOG(INFO) << "Starting Calib Conversion";
-  infer_graph->CopyFrom(graph_def);
-  auto trt_rm = TRTResourceManager::instance();
-  auto calib_rm = trt_rm->getManager("TRTCalibration");
-  int num_nodes = infer_graph->node_size();
-  if (!is_dyn_op) {
-    LOG(WARNING) << "Construction of static int8 engine is not implemented "
-                    "yet!. Dynamic engine will be constructed";
-  }
-  for (int i = 0; i < num_nodes; ++i) {
-    auto n = infer_graph->mutable_node(i);
-    if (n->op() == "TRTEngineOp") {
-      VLOG(1) << "Processing " << n->name();
-      const string& container_name = n->attr().at("segment_funcdef_name").s();
-      TRTCalibrationResource* cres = nullptr;
-      auto status = calib_rm->Lookup(container_name, "Calibrator", &cres);
-      if (!status.ok()) {
-        LOG(ERROR) << "Could not get Calibration information. Did you run with "
-                      "calibration data?";
-        return tensorflow::errors::FailedPrecondition(
-            "Need to run graph with calibration data first!");
-      }
-      tensorflow::core::ScopedUnref calib_sc(cres);
-      if (cres->calibrator_) {
-        cres->calibrator_->waitAndSetDone();
-        cres->thr_->join();
-        const auto& calibration_table =
-            cres->calibrator_->getCalibrationTableAsString();
-        if (!calibration_table.size()) {
-          LOG(ERROR) << "Calibration table is empty";
-          return tensorflow::errors::Unknown(
-              "Calibration table is missing. This shouldn't have happened!");
-        }
-        n->mutable_attr()->at("calibration_data").set_s(calibration_table);
-      } else {
-        LOG(ERROR) << "Can't get TRTCalibrator from resource manager!";
-        return tensorflow::errors::Unknown(
-            "Can't get TRTCalibrator from resource manager!");
-      }
-      TF_RETURN_IF_ERROR(calib_rm->Cleanup(container_name));
-    }
-  }
-  return tensorflow::Status::OK();
-}
-
 tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
     const std::vector<string>& output_names, size_t max_batch_size,
@@ -662,8 +626,8 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
         info.use_calibration,
         /*convert_successfully=*/nullptr));
     TrtUniquePtrType<nvinfer1::IHostMemory> engine_data(engine->serialize());
-    segment_string =
-        string((const char*)engine_data->data(), engine_data->size());
+    segment_string = string(static_cast<const char*>(engine_data->data()),
+                            engine_data->size());
     if (calibrate_int8) {
       // See above comment about why not putting this inside the 'else' branch.
       segment_string = info.segment_graph_def.SerializeAsString();
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
index 95cf0227dcf84396b9de52194ae3a750f4acca66..80f68d36a3ab894e97586687ee9ab93dddc73c50 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
@@ -85,12 +85,6 @@ struct ConversionParams {
   std::vector<int> cached_engine_batches;  // list of cached engines
 };
 
-// This method extracts calibration information from the resource managers
-// and puts them in to engine nodedefs.
-tensorflow::Status ConvertCalibGraphToInferGraph(
-    const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* new_graph_def,
-    bool is_dyn_op);
-
 // - max_batch_size: maximum batch size which can be used for inference for
 //   optimization targets inference run with max batch size.
 // - max_workspace_size_bytes: The upper bound of memory allowance for engine
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
index cabc6ccfa13df77b3bd26f51f35284816141423a..1a754181debf41865190aa7f9ca6a76efea98181 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
@@ -75,7 +75,7 @@ TEST(TrtCandidateSelector, Basics) {
                                          feed, const_1, matmul_attrs);
 
   // Unsupported op.
-  auto unsupported_op = ops::Sin(s.WithOpName("sin"), feed);
+  auto unsupported_op = ops::Erf(s.WithOpName("sin"), feed);
 
   // Incompatible input.
   auto incompatible_feed = ops::Placeholder(s.WithOpName("feed"), DT_DOUBLE);
@@ -108,7 +108,7 @@ TEST(TrtCandidateSelector, Basics) {
         "transpose_a is not supported for TensorRT FullyConnected "
         "(op: MatMul), at: incompatible_matmul");
     ExpectStatus(selector.IsTensorRTCandidate(unsupported_op.operation.node()),
-                 error::UNIMPLEMENTED, "Op type Sin is not supported");
+                 error::UNIMPLEMENTED, "Op type Erf is not supported");
     ExpectStatus(
         selector.IsTensorRTCandidate(
             matmul_with_incompatible_input.operation.node()),
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
index 79b1cba32909c119a9127c3d254a6f14a16cb660..9a2ac8c3e5f1d149baf5de25c940e24a8acc9125 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
 #include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
 #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
-#include "tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.h"
 #include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/framework/node_def.pb.h"  // NOLINT
 #include "tensorflow/core/framework/node_def_builder.h"
@@ -94,7 +93,6 @@ bool IsEngineOutput(absl::string_view name) {
 namespace convert {
 using absl::StrAppend;
 using absl::StrCat;
-using ::tensorflow::str_util::Split;
 
 inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
                                        nvinfer1::DataType* trt_dtype) {
@@ -194,6 +192,15 @@ Status ValidateTensorProperties(const string& producer_node_type,
   *trt_dims = TensorShapeToTrtDims(shape, /*ignore_first_dim=*/true);
   *batch_size = shape.dim_size(0);
 
+  // Don't convert empty tensors (dim value of 0).
+  for (int d = 1; d < shape.dims(); ++d) {
+    if (shape.dim_size(d) == 0) {
+      return errors::Unimplemented(
+          "Input tensor with shape ", shape.DebugString(),
+          " is an empty tensor, which is not supported by TRT");
+    }
+  }
+
   if (validation_only) return Status::OK();
   // Following are validations at runtime.
 
@@ -297,8 +304,8 @@ Status Converter::GetTrtBroadcastShape(
 
   const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1;
   auto compute_output_dims =
-      [max_nb_dims](const TRT_TensorOrWeights& input, int broadcast_num_dims,
-                    int* output_dims_array, nvinfer1::Dims* output_dims) {
+      [](const TRT_TensorOrWeights& input, int broadcast_num_dims,
+         int* output_dims_array, nvinfer1::Dims* output_dims) {
         const nvinfer1::Dims input_dims = input.GetTrtDims();
         std::fill(output_dims_array, output_dims_array + max_nb_dims, 1);
         std::copy(input_dims.d, input_dims.d + input_dims.nbDims,
@@ -380,6 +387,32 @@ tensorflow::Status CreateBroadcastableScalarConstant(
   return Status::OK();
 }
 
+// Convert an axis from TF format to TRT format while validating. TF format
+// includes the batch dimension, while TRT does not. TF can also use negative
+// indices.
+// TODO(tmorris): Use this method in more ops.
+tensorflow::Status ConvertAxis(int tf_axis, int trt_nb_dims,
+                               absl::string_view node_name, int* trt_axis) {
+  const int tf_nb_dims = trt_nb_dims + 1;
+  // Check bounds.
+  if (tf_axis < -tf_nb_dims || tf_axis >= tf_nb_dims) {
+    return tensorflow::errors::InvalidArgument(
+        "Axis value of ", tf_axis, " is out of bounds, must be in range [",
+        -tf_nb_dims, ", ", tf_nb_dims, "), at ", node_name);
+  }
+  // Make negative axis positive.
+  if (tf_axis < 0) tf_axis += tf_nb_dims;
+  // Don't allow axis to be the batch dimension.
+  if (tf_axis == 0) {
+    return tensorflow::errors::Unimplemented(
+        "TensorRT does not allow manipulation of the batch dimension, at ",
+        node_name);
+  }
+  // Remove batch dimension.
+  *trt_axis = tf_axis - 1;
+  return Status::OK();
+}
+
 inline bool DimsEqual(const nvinfer1::Dims& dim_l,
                       const nvinfer1::Dims& dim_r) {
   if (dim_l.nbDims != dim_r.nbDims) {
@@ -393,6 +426,15 @@ inline bool DimsEqual(const nvinfer1::Dims& dim_l,
   return true;
 }
 
+bool AllLengthsEqual(const std::vector<std::vector<int>>& inputs) {
+  if (inputs.size() == 0) return true;
+  int length = inputs.at(0).size();
+  for (int i = 1; i < inputs.size(); i++) {
+    if (inputs.at(i).size() != length) return false;
+  }
+  return true;
+}
+
 inline nvinfer1::Dims GetTrtDimsForTensor(const tensorflow::Tensor& tensor) {
   nvinfer1::Dims dims;
   dims.nbDims = tensor.dims();
@@ -530,6 +572,16 @@ class TRT_TensorOrWeights::SimpleITensor : public nvinfer1::ITensor {
   float getDynamicRange() const override { return 0; }
 #endif
 
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+  bool dynamicRangeIsSet() const override { return true; }
+
+  void resetDynamicRange() override {}
+
+  float getDynamicRangeMin() const override { return 0.f; }
+
+  float getDynamicRangeMax() const override { return 0.f; }
+#endif
+
  private:
   nvinfer1::DataType trt_dtype_;
   nvinfer1::Dims trt_dims_;
@@ -921,7 +973,7 @@ Status Converter::ConvertNode(const NodeDef& node_def) {
   for (size_t i = 0; i < outputs.size(); ++i) {
     TRT_TensorOrWeights& output = outputs[i];
     string output_name = node_def.name();
-    if (i != 0) output_name = StrCat(output_name, ":", i);
+    if (i != 0) absl::StrAppend(&output_name, ":", i);
     // We need to check the name before setting it. If the input is one of the
     // engine input, setting the name here will overwrite engine input
     // bindings which will cause runtime error.
@@ -2107,7 +2159,7 @@ tensorflow::Status ConvertSqueeze(OpConverterParams* params) {
   // Mark axes to remove by setting them to 0.
   TFAttrs attrs(node_def);
   auto squeeze_dims = attrs.get<std::vector<int>>("squeeze_dims");
-  if (squeeze_dims.size() == 0) {
+  if (squeeze_dims.empty()) {
     return tensorflow::errors::Unimplemented(
         "Squeeze is only implemented for explicit dims, at ", node_def.name());
   }
@@ -2152,100 +2204,73 @@ tensorflow::Status ConvertSqueeze(OpConverterParams* params) {
   return tensorflow::Status::OK();
 }
 
-// Gets the bounds (start or end) from the weights of a StridedSlice op.
-tensorflow::Status GetStridedSliceBound(const std::vector<int>& input_dims,
-                                        const TRT_ShapedWeights& bound_weights,
-                                        int mask, bool begin, string node_name,
-                                        std::vector<int>* output_bound) {
-  const string bound_name = (begin) ? "begin" : "end";
-  const int* weights_ptr = static_cast<int*>(bound_weights.GetValues());
-  *output_bound =
-      std::vector<int>(weights_ptr, weights_ptr + bound_weights.count());
-  if (output_bound->size() != input_dims.size()) {
-    return tensorflow::errors::InvalidArgument(
-        "StridedSlice \"", bound_name, "\" specified ",
-        std::to_string(output_bound->size()), " dimensions, but input rank is ",
-        std::to_string(input_dims.size()), ", at ", node_name);
-  }
-  for (int i = 0; i < output_bound->size(); i++) {
-    if ((1 << i) & mask) {
-      // Apply mask.
-      (*output_bound)[i] = (begin) ? 0 : input_dims[i];
-      // Masked bound will always result in a valid, non-negative bound, so we
-      // don't need the following checks. For the common case of using masks on
-      // a undefined batch dim (-1), we specifically don't want to do the
-      // following checks because they will erroneously detect an out of range
-      // bound or try to correct the negative value.
-      continue;
-    }
-    // Make sure bound is valid.
-    if (((*output_bound)[i] < -input_dims[i]) ||
-        ((*output_bound)[i] > input_dims[i])) {
+tensorflow::Status ConvertStridedSliceHelper(OpConverterParams* params,
+                                             const TRT_TensorOrWeights& input,
+                                             std::vector<int> begin,
+                                             std::vector<int> size,
+                                             const std::vector<int>& stride) {
+  const auto& node_def = params->node_def;
+  // Get input dims.
+  nvinfer1::Dims dims = input.GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Temporarily add batch dimension so that indexes line up properly.
+  input_dims.insert(input_dims.begin(), -1);
+  // Check bounds.
+  for (int i = 1; i < input_dims.size(); i++) {
+    if (begin[i] < 0 || begin[i] > input_dims[i]) {
+      return tensorflow::errors::InvalidArgument(
+          "\"begin\" for dimension ", std::to_string(i), " in ", node_def.op(),
+          " is out of range, at ", node_def.name());
+    }
+    const int end = begin[i] + size[i];
+    if (end < 0 || end > input_dims[i]) {
       return tensorflow::errors::InvalidArgument(
-          bound_name, " value of ", std::to_string((*output_bound)[i]),
-          " for StridedSlice is invalid, must be in the range "
-          "[-dim_size(i), dim_size(i)], at ",
-          node_name);
+          "\"begin\" + \"size\" for dimension ", std::to_string(i), " in ",
+          node_def.op(), " is out of range, at ", node_def.name());
     }
-    // Convert negative values to their positive equivalent.
-    if ((*output_bound)[i] < 0) {
-      (*output_bound)[i] += input_dims[i];
+    if (size[i] <= 0) {
+      return tensorflow::errors::InvalidArgument(
+          "\"size\" cannot be negative or zero for ", node_def.op(), ", at ",
+          node_def.name());
     }
   }
-  return tensorflow::Status::OK();
-}
+// TRT 5.1 adds a slice layer. For older versions, we attempt to use the
+// padding layer with negative padding.
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+  // Use ISliceLayer.
+  nvinfer1::Dims begin_dims, size_dims, stride_dims;
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(begin, &begin_dims,
+                                               /*ignore_first_dim=*/true));
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(size, &size_dims,
+                                               /*ignore_first_dim=*/true));
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(stride, &stride_dims,
+                                               /*ignore_first_dim=*/true));
+  if (params->validation_only) return Status::OK();
 
-tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
-  const auto& inputs = params->inputs;
-  const auto& node_def = params->node_def;
-  TF_RETURN_IF_ERROR(CheckInputsWeights(
-      *params,
-      {{"input", false}, {"begin", true}, {"end", true}, {"strides", true}}));
-  // Get input dims.
-  nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
-  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
-  if (inputs.at(0).is_tensor()) {
-    // Temporarily add batch dimension so that indexes line up properly.
-    input_dims.insert(input_dims.begin(), inputs.at(0).batch_size());
-  }
-  if (input_dims.size() > 4) {
-    return tensorflow::errors::Unimplemented(
-        "StridedSlice is not implemented for tensors with rank > 4, at ",
-        node_def.name());
-  }
-  TFAttrs attrs(node_def);
-  // Get begin and end bounds per axis.
-  std::vector<int> begin, end;
-  TF_RETURN_IF_ERROR(GetStridedSliceBound(input_dims, inputs.at(1).weights(),
-                                          attrs.get<int>("begin_mask"), true,
-                                          node_def.name(), &begin));
-  TF_RETURN_IF_ERROR(GetStridedSliceBound(input_dims, inputs.at(2).weights(),
-                                          attrs.get<int>("end_mask"), false,
-                                          node_def.name(), &end));
-  // Get strides per axis (must all be 1).
-  TRT_ShapedWeights stride_weights = inputs.at(3).weights();
-  const int* stride_weights_ptr = static_cast<int*>(stride_weights.GetValues());
-  std::vector<int> strides(stride_weights_ptr,
-                           stride_weights_ptr + stride_weights.count());
-  for (int x : strides) {
+  nvinfer1::ISliceLayer* layer = params->converter->network()->addSlice(
+      *const_cast<nvinfer1::ITensor*>(input.tensor()), begin_dims, size_dims,
+      stride_dims);
+  params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
+  return tensorflow::Status::OK();
+#else
+  // Use IPaddingLayer.
+  // Strides must be 1 in this case.
+  for (int x : stride) {
     if (x != 1) {
       return tensorflow::errors::Unimplemented(
-          "StridedSlice is only implemented for stride of 1, at ",
+          "Strides other than 1 are not supported with this version of TRT, "
+          "at ",
           node_def.name());
     }
   }
-  // Unsupported mask options.
-  for (const string& attr :
-       {"ellipsis_mask", "new_axis_mask", "shrink_axis_mask"}) {
-    int attr_val = attrs.get<int>(attr);
-    if (attr_val != 0) {
-      return tensorflow::errors::Unimplemented(
-          attr, " is not supported for StridedSlice, at ", node_def.name());
-    }
+  // Rank must be 2, 3 or 4.
+  if (input_dims.size() > 4) {
+    return tensorflow::errors::Unimplemented(node_def.op(),
+                                             " for tensors with rank > 4 is "
+                                             "not supported in this version of "
+                                             "TRT, at ",
+                                             node_def.name());
   }
-
-  nvinfer1::ITensor* tensor =
-      const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor());
   // Reshape if necessary to 4-D, since IPaddingLayer requires a 4-D input.
   const bool need_reshape = (input_dims.size() != 4);
   int reshape_dims_added = 0;
@@ -2255,7 +2280,7 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
     while (input_dims.size() < 4) {
       input_dims.insert(input_dims.begin() + 1, 1);
       begin.insert(begin.begin() + 1, 0);
-      end.insert(end.begin() + 1, 1);
+      size.insert(size.begin() + 1, 1);
       reshape_dims_added++;
     }
     TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &reshape_dims,
@@ -2263,23 +2288,22 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
   }
   // Find dimensions which need to be sliced.
   std::vector<int> pad_dims;
-  for (int i = 0; i < input_dims.size(); i++) {
-    if ((begin[i] != 0) || (end[i] != input_dims[i])) {
-      if (i == 0) {
-        return tensorflow::errors::Unimplemented(
-            "StridedSlice can't modify batch dim, at ", node_def.name());
-      } else if ((end[i] - begin[i]) < 0) {
-        return tensorflow::errors::InvalidArgument(
-            "New size of sliced dimension is negative, at ", node_def.name());
-      }
+  for (int i = 1; i < input_dims.size(); i++) {
+    if ((begin[i] != 0) || (begin[i] + size[i] != input_dims[i])) {
       pad_dims.push_back(i);
     }
   }
-  if (pad_dims.size() == 0) {
-    // No dimensions are changed. We could create a padding layer anyway with
-    // values of 0.
+  if (pad_dims.empty()) {
+    // No dimensions are changed, so this is a no-op. We could just return the
+    // input without creating a new layer. TRT will crash if an empty engine
+    // with no layers is attempted to be created, so we add a no-op shuffle to
+    // prevent our unit tests from breaking.
+    // TODO(tmorris): Allow empty engines in the unit tests and return the input
+    // as output here.
     if (params->validation_only) return Status::OK();
-    params->outputs->push_back(inputs.at(0));
+    nvinfer1::IShuffleLayer* layer = params->converter->network()->addShuffle(
+        *const_cast<nvinfer1::ITensor*>(input.tensor()));
+    params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
     return tensorflow::Status::OK();
   } else if (pad_dims.size() == 1) {
     // Only one dim is modified but we have to have 2, mark a second dim which
@@ -2292,16 +2316,19 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
     }
   } else if (pad_dims.size() > 2) {
     return tensorflow::errors::Unimplemented(
-        "StridedSlice can only modify 2 dimensions, at ", node_def.name());
+        node_def.op(),
+        " can only modify up to 2 dimensions in this version of TRT, at ",
+        node_def.name());
   }
   std::sort(pad_dims.begin(), pad_dims.end());
   // Convert to pre/post padding values. Since TRT does not have a StridedSlice
-  // or Slice layer, we instead create an IPaddingLayer with negative padding.
+  // or Slice layer prior to 5.1, we instead create an IPaddingLayer with
+  // negative padding.
   nvinfer1::DimsHW pre_padding, post_padding;
   for (int i = 0; i < pad_dims.size(); i++) {
     const int axis = pad_dims[i];
     pre_padding.d[i] = -begin[axis];
-    post_padding.d[i] = end[axis] - input_dims[axis];
+    post_padding.d[i] = (begin[axis] + size[axis]) - input_dims[axis];
   }
 
   // IPaddingLayer will always apply the padding to dims 2,3 (input format is
@@ -2321,10 +2348,11 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
   if (params->validation_only) return Status::OK();
 
   // Start conversion.
+  nvinfer1::ITensor* tensor = const_cast<nvinfer1::ITensor*>(input.tensor());
   if (need_reshape) {
     const nvinfer1::ITensor* output_tensor = nullptr;
     TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
-        inputs.at(0), reshape_dims, &output_tensor));
+        input, reshape_dims, &output_tensor));
     tensor = const_cast<nvinfer1::ITensor*>(output_tensor);
   }
   if (need_transpose) {
@@ -2333,7 +2361,6 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
         tensor, transpose_order, &output_tensor));
     tensor = const_cast<nvinfer1::ITensor*>(output_tensor);
   }
-
   // Add padding layer
   nvinfer1::IPaddingLayer* layer = params->converter->network()->addPadding(
       *const_cast<nvinfer1::ITensor*>(tensor), pre_padding, post_padding);
@@ -2341,7 +2368,6 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
   params->converter->MarkQuantizationRangesAsInferrable(tensor,
                                                         layer->getOutput(0));
   tensor = layer->getOutput(0);
-
   // Restore transpose
   if (need_transpose) {
     const nvinfer1::ITensor* output_tensor = nullptr;
@@ -2354,7 +2380,7 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
     // Calculate output dimensions
     for (int i = 0; i < pad_dims.size(); i++) {
       const int axis = pad_dims[i];
-      input_dims[axis] = end[axis] - begin[axis];
+      input_dims[axis] = size[axis];
     }
     // Remove added 1 dimensions
     for (int i = 0; i < reshape_dims_added; i++) {
@@ -2378,6 +2404,135 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
   params->outputs->push_back(
       TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(tensor)));
   return tensorflow::Status::OK();
+#endif
+}
+
+tensorflow::Status ConvertSlice(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(
+      *params, {{"input", false}, {"begin", true}, {"size", true}}));
+  std::vector<int> begin = inputs.at(1).weights().ToVector<int>();
+  std::vector<int> size = inputs.at(2).weights().ToVector<int>();
+  // Get input dims.
+  nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Add batch dimension so that indexes line up properly.
+  input_dims.insert(input_dims.begin(), inputs.at(0).batch_size());
+  if (!AllLengthsEqual({input_dims, begin, size})) {
+    return tensorflow::errors::InvalidArgument(
+        "Length of begin and size arguments must equal rank of input for "
+        "Slice, at ",
+        node_def.name());
+  }
+  // Check that batch dimension is unmodified.
+  const bool begin_is_modified = begin[0] != 0;
+  // If size[0]s is not -1, we can only know if the batch dimension is
+  // unmodified when the batch size is defined. When the batch size is
+  // undefined, we don't convert to be safe.
+  const bool batch_size_is_defined = input_dims[0] > 0;
+  const bool size_is_modified =
+      size[0] != -1 && (!batch_size_is_defined ||
+                        (batch_size_is_defined && size[0] != input_dims[0]));
+  if (begin_is_modified || size_is_modified) {
+    return tensorflow::errors::Unimplemented(
+        "TensorRT does not allow modifications to the batch dimension, at ",
+        node_def.name());
+  }
+  // Size of -1 signifies to take all remaining elements.
+  for (int i = 1; i < input_dims.size(); i++) {
+    if (size[i] == -1) {
+      size[i] = input_dims[i] - begin[i];
+    }
+  }
+  // Stride is 1 for all dims.
+  std::vector<int> stride(begin.size(), 1);
+  return ConvertStridedSliceHelper(params, inputs.at(0), begin, size, stride);
+}
+
+tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(
+      *params,
+      {{"input", false}, {"begin", true}, {"end", true}, {"strides", true}}));
+  // Get input dims.
+  nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Add batch dimension so that indexes line up properly.
+  input_dims.insert(input_dims.begin(), inputs.at(0).batch_size());
+  // Get begin and end bounds per axis.
+  std::vector<int> begin = inputs.at(1).weights().ToVector<int>();
+  std::vector<int> end = inputs.at(2).weights().ToVector<int>();
+  std::vector<int> stride = inputs.at(3).weights().ToVector<int>();
+  if (!AllLengthsEqual({input_dims, begin, end, stride})) {
+    return tensorflow::errors::InvalidArgument(
+        "Length of begin, end, and stride arguments must equal rank of input "
+        "for StridedSlice, at ",
+        node_def.name());
+  }
+  // Unsupported mask options.
+  TFAttrs attrs(node_def);
+  for (const string& attr :
+       {"ellipsis_mask", "new_axis_mask", "shrink_axis_mask"}) {
+    int attr_val = attrs.get<int>(attr);
+    if (attr_val != 0) {
+      return tensorflow::errors::Unimplemented(
+          attr, " is not supported for StridedSlice, at ", node_def.name());
+    }
+  }
+  const int begin_mask = attrs.get<int>("begin_mask");
+  const int end_mask = attrs.get<int>("end_mask");
+  // Check that batch dimension is unmodified.
+  const bool begin_is_modified = !(begin_mask & 1) && begin[0] != 0;
+  const bool stride_is_modified = stride[0] != 1;
+  // If the batch size is -1 and the end mask is not set, we can only know if
+  // the batch dimension is unmodified when the batch size is defined. When the
+  // batch size is undefined, we don't convert to be safe.
+  const bool batch_size_is_defined = input_dims[0] > 0;
+  const bool end_is_modified =
+      !(end_mask & 1) && (!batch_size_is_defined ||
+                          (batch_size_is_defined && end[0] != input_dims[0]));
+  if (begin_is_modified || stride_is_modified || end_is_modified) {
+    return tensorflow::errors::Unimplemented(
+        "TensorRT does not allow modifications to the batch dimension, at ",
+        node_def.name());
+  }
+  // Standarize begin and end bounds by applying masks, making negative values
+  // positive, and correcting out of bounds ranges (StridedSlice does this
+  // silently).
+  for (int i = 1; i < input_dims.size(); i++) {
+    // Begin
+    if ((1 << i) & begin_mask) {
+      begin[i] = 0;
+    } else if (begin[i] < 0) {
+      begin[i] += input_dims[i];
+    }
+    begin[i] = std::max(0, std::min(begin[i], input_dims[i]));
+    // End
+    if ((1 << i) & end_mask) {
+      end[i] = input_dims[i];
+    } else if (end[i] < 0) {
+      end[i] += input_dims[i];
+    }
+    end[i] = std::max(0, std::min(end[i], input_dims[i]));
+  }
+  // Negative or zero strides currently not supported.
+  for (int i = 0; i < input_dims.size(); i++) {
+    if (stride[i] <= 0) {
+      return tensorflow::errors::Unimplemented(
+          "Negative or zero stride values are not supported for StridedSlice, "
+          "at ",
+          node_def.name());
+    }
+  }
+  // TRT Slice layer uses (begin, size) instead of (begin, end)
+  std::vector<int> size(input_dims.size());
+  for (int i = 0; i < input_dims.size(); i++) {
+    // Divide by stride (round up)
+    size[i] = (end[i] - begin[i] + stride[i] - 1) / stride[i];
+  }
+  return ConvertStridedSliceHelper(params, inputs.at(0), begin, size, stride);
 }
 
 tensorflow::Status ConvertConv2D(OpConverterParams* params) {
@@ -2947,58 +3102,104 @@ Status ConvertBinary(OpConverterParams* params) {
   return status;
 }
 
-tensorflow::Status ConvertUnary(OpConverterParams* params) {
+tensorflow::Status ConvertRsqrt(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  static const std::unordered_map<string, nvinfer1::UnaryOperation> ops{
-      {"Neg", nvinfer1::UnaryOperation::kNEG},
-      {"Exp", nvinfer1::UnaryOperation::kEXP},
-      {"Log", nvinfer1::UnaryOperation::kLOG},
-      {"Sqrt", nvinfer1::UnaryOperation::kSQRT},
-      {"Abs", nvinfer1::UnaryOperation::kABS},
-      {"Reciprocal", nvinfer1::UnaryOperation::kRECIP},
-  };
   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
+  if (params->validation_only) return tensorflow::Status::OK();
 
-  // TODO(jie): check type
-  const nvinfer1::ITensor* tensor = nullptr;
-  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
-      inputs.at(0), inputs.at(0).GetTrtDims(), &tensor));
+  // TODO(tmorris): params->converter is null during validation. Allow
+  // precision_mode and use_calibration to be accessed during validation and
+  // include this check in validation.
+  // We will need a quantization range for intermediate tensor if not using
+  // calibration.
+  //
+  //   x -> [Sqrt] -> sqrt(x) -> [Recip] -> 1/sqrt(x)
+  //                     ^
+  //               need range here
+  if (params->converter->precision_mode() == TrtPrecisionMode::INT8 &&
+      !params->converter->use_calibration()) {
+    return errors::Unimplemented(
+        "Intermediate quantization range cannot be determined without"
+        " calibration for Rsqrt, consider replacing with "
+        "Sqrt -> FakeQuant -> Reciprocal ops, at ",
+        node_def.name());
+  }
+  // Start conversion.
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  // Sqrt
+  nvinfer1::IUnaryLayer* sqrt_layer = params->converter->network()->addUnary(
+      *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::UnaryOperation::kSQRT);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(sqrt_layer, node_def.name());
+  // Recip
+  nvinfer1::IUnaryLayer* recip_layer = params->converter->network()->addUnary(
+      *sqrt_layer->getOutput(0), nvinfer1::UnaryOperation::kRECIP);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(recip_layer, node_def.name());
+  params->outputs->push_back(TRT_TensorOrWeights(recip_layer->getOutput(0)));
+  return tensorflow::Status::OK();
+}
 
-  nvinfer1::IUnaryLayer* layer;
-  if (node_def.op() == "Rsqrt") {
-    // We will need a quantization range for intermediate tensor if not using
-    // calibration.
-    //
-    //   x -> [Sqrt] -> sqrt(x) -> [Recip] -> 1/sqrt(x)
-    //                     ^
-    //               need range here
-    if (params->converter->precision_mode() == TrtPrecisionMode::INT8 &&
-        !params->converter->use_calibration()) {
-      return errors::Unimplemented(
-          "Intermediate quantization range cannot be determined without"
-          " calibration for Rsqrt, consider replacing with "
-          "Sqrt -> FakeQuant -> Reciprocal ops, at ",
-          node_def.name());
-    }
-    layer = params->converter->network()->addUnary(
-        *const_cast<nvinfer1::ITensor*>(tensor),
-        nvinfer1::UnaryOperation::kSQRT);
-    TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
-    tensor = layer->getOutput(0);
-    layer = params->converter->network()->addUnary(
-        *const_cast<nvinfer1::ITensor*>(tensor),
-        nvinfer1::UnaryOperation::kRECIP);
-  } else if (ops.count(node_def.op()) != 0) {
-    layer = params->converter->network()->addUnary(
-        *const_cast<nvinfer1::ITensor*>(tensor), ops.at(node_def.op()));
-  } else {
-    return tensorflow::errors::InvalidArgument(
-        "Binary op: ", node_def.op(), " not supported, at ", node_def.name());
+const std::unordered_map<string, nvinfer1::UnaryOperation>*
+UnaryOperationMap() {
+  static auto* const m =
+      new std::unordered_map<string, nvinfer1::UnaryOperation>({
+        {"Neg", nvinfer1::UnaryOperation::kNEG},
+            {"Exp", nvinfer1::UnaryOperation::kEXP},
+            {"Log", nvinfer1::UnaryOperation::kLOG},
+            {"Sqrt", nvinfer1::UnaryOperation::kSQRT},
+            {"Abs", nvinfer1::UnaryOperation::kABS},
+            {"Reciprocal", nvinfer1::UnaryOperation::kRECIP},
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+            {"Sin", nvinfer1::UnaryOperation::kSIN},
+            {"Cos", nvinfer1::UnaryOperation::kCOS},
+            {"Tan", nvinfer1::UnaryOperation::kTAN},
+            {"Sinh", nvinfer1::UnaryOperation::kSINH},
+            {"Cosh", nvinfer1::UnaryOperation::kCOSH},
+            {"Asin", nvinfer1::UnaryOperation::kASIN},
+            {"Acos", nvinfer1::UnaryOperation::kACOS},
+            {"Atan", nvinfer1::UnaryOperation::kATAN},
+            {"Asinh", nvinfer1::UnaryOperation::kASINH},
+            {"Acosh", nvinfer1::UnaryOperation::kACOSH},
+            {"Atanh", nvinfer1::UnaryOperation::kATANH},
+            {"Ceil", nvinfer1::UnaryOperation::kCEIL},
+            {"Floor", nvinfer1::UnaryOperation::kFLOOR},
+#endif
+      });
+  return m;
+}
+
+tensorflow::Status ConvertUnary(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
+  auto op_pair = UnaryOperationMap()->find(node_def.op());
+  if (op_pair == UnaryOperationMap()->end()) {
+    return tensorflow::errors::Unimplemented(
+        "Unary op: ", node_def.op(), " not supported at: ", node_def.name());
   }
+  if (params->validation_only) return tensorflow::Status::OK();
 
+  // Start conversion.
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary(
+      *const_cast<nvinfer1::ITensor*>(tensor), op_pair->second);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  // Set quantization ranges.
+  if (node_def.op() == "Sin" || node_def.op() == "Cos") {
+    params->converter->ProvideQuantizationRange(output_tensor, -1.0f, 1.0f);
+  } else if (node_def.op() == "Asin" || node_def.op() == "Atan") {
+    params->converter->ProvideQuantizationRange(output_tensor, -M_PI_2, M_PI_2);
+  } else if (node_def.op() == "Acos") {
+    params->converter->ProvideQuantizationRange(output_tensor, 0.0f, M_PI);
+  } else if (node_def.op() == "Neg" || node_def.op() == "Abs") {
+    // Neg and Abs will have same range as input since TRT uses symmetric
+    // quantization.
+    // TODO(tmorris): Should we infer ranges for Ceil and Floor as well?
+    params->converter->MarkQuantizationRangesAsInferrable(
+        const_cast<nvinfer1::ITensor*>(tensor), output_tensor);
+  }
   params->outputs->push_back(
       TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
   return tensorflow::Status::OK();
@@ -3139,7 +3340,7 @@ tensorflow::Status ConvertPad(OpConverterParams* params) {
   }
 
   // No padding at all, we should exit
-  if (pad_index.size() == 0) {
+  if (pad_index.empty()) {
     params->outputs->push_back(inputs.at(0));
     return tensorflow::Status::OK();
   }
@@ -3329,7 +3530,7 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
   TRT_ShapedWeights dummy_power_weights(parameter_type);
   size_t nweight = 0;
   for (int i = 1; i < 5; i++) {
-    nweight = std::max(nweight, (size_t)inputs.at(i).weights().count());
+    nweight = std::max<size_t>(nweight, inputs.at(i).weights().count());
   }
   TRT_ShapedWeights* ptr_shape_weights = nullptr;
   for (int i = 1; i < 5; i++) {
@@ -3414,6 +3615,29 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertGather(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(
+      *params, {{"params", false}, {"indices", false}, {"axis", true}}));
+  absl::Span<const int> axis = inputs.at(2).weights().GetSpan<int>();
+  if (axis.size() != 1) {
+    return tensorflow::errors::InvalidArgument(
+        "Axis for GatherV2 must be a scalar, at ", node_def.name());
+  }
+  int trt_axis = 0;
+  TF_RETURN_IF_ERROR(ConvertAxis(axis[0], inputs.at(0).GetTrtDims().nbDims,
+                                 node_def.name(), &trt_axis));
+  if (params->validation_only) return Status::OK();
+
+  nvinfer1::IGatherLayer* layer = params->converter->network()->addGather(
+      *const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor()),
+      *const_cast<nvinfer1::ITensor*>(inputs.at(1).tensor()), trt_axis);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+  params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
+  return Status::OK();
+}
+
 tensorflow::Status ConvertMatMulHelper(OpConverterParams* params,
                                        TRT_TensorOrWeights tensor_input,
                                        TRT_ShapedWeights weights_raw,
@@ -3644,11 +3868,14 @@ static void RegisterValidatableOpConverters(
   (*registration)["Conv2DBackpropInput"] = ConvertConv2DBackpropInput;
   (*registration)["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
   (*registration)["ExpandDims"] = ConvertExpandDims;
+  (*registration)["GatherV2"] = ConvertGather;
   (*registration)["LeakyRelu"] = ConvertLeakyRelu;
   (*registration)["MatMul"] = ConvertMatMul;
   (*registration)["Pad"] = ConvertPad;
   (*registration)["Relu6"] = ConvertRelu6;
   (*registration)["Reshape"] = ConvertReshape;
+  (*registration)["Rsqrt"] = ConvertRsqrt;
+  (*registration)["Slice"] = ConvertSlice;
   (*registration)["Square"] = ConvertSquare;
   (*registration)["Squeeze"] = ConvertSqueeze;
   (*registration)["StridedSlice"] = ConvertStridedSlice;
@@ -3673,6 +3900,9 @@ static void RegisterValidatableOpConverters(
   for (auto normalization_op_type : {"FusedBatchNorm", "FusedBatchNormV2"}) {
     (*registration)[normalization_op_type] = ConvertFusedBatchNorm;
   }
+  for (auto unary_op_pair : *UnaryOperationMap()) {
+    (*registration)[unary_op_pair.first] = ConvertUnary;
+  }
 }
 
 void TrtNodeValidator::RegisterOpValidators() {
@@ -3685,14 +3915,6 @@ void Converter::RegisterOpConverters() {
   op_registry_["Identity"] = ConvertIdentity;  // Identity should be removed
   op_registry_["Snapshot"] = ConvertIdentity;  // Snapshot should be removed
 
-  op_registry_["Rsqrt"] = ConvertUnary;
-  op_registry_["Reciprocal"] = ConvertUnary;
-  op_registry_["Exp"] = ConvertUnary;
-  op_registry_["Log"] = ConvertUnary;
-  op_registry_["Sqrt"] = ConvertUnary;
-  op_registry_["Abs"] = ConvertUnary;
-  op_registry_["Neg"] = ConvertUnary;
-
   op_registry_["Sum"] = ConvertReduce;
   op_registry_["Prod"] = ConvertReduce;
   op_registry_["Max"] = ConvertReduce;
@@ -3722,8 +3944,12 @@ tensorflow::Status ConvertGraphDefToEngine(
   builder->setMaxWorkspaceSize(max_workspace_size_bytes);
   builder->setGpuAllocator(allocator);
   if (precision_mode == TrtPrecisionMode::FP16) {
-    builder->setHalf2Mode(true);
+    builder->setFp16Mode(true);
   } else if (precision_mode == TrtPrecisionMode::INT8) {
+    // Setting FP16 mode as well allows TRT to also consider FP16 kernels and
+    // use them in situations where they are faster than INT8 or where INT8 is
+    // not supported for a given layer.
+    builder->setFp16Mode(true);
     builder->setInt8Mode(true);
     if (use_calibration) {
       builder->setInt8Calibrator(calibrator);
@@ -3899,7 +4125,7 @@ tensorflow::Status ConvertSegmentToGraphDef(
     local_scope = GetCommonNameScope(local_scope, node->name());
     old_to_new_id_map[node->id()] = segment_def->node_size();
     auto snode = segment_def->add_node();
-    snode->CopyFrom(node->def());
+    *snode = node->def();
     VLOG(2) << "Copying " << snode->name() << " to subgraph";
   }
   // Update the inputs of the new input nodes to point to placeholder nodes.
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
index d1e30eb848bd6ab62719ca6da561d14b05d8537d..7b37173090519ff6fadd956942d7ea12a0644981 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
@@ -190,6 +190,17 @@ class TRT_ShapedWeights {
 
   string DebugString() const;
 
+  template <typename T>
+  absl::Span<const T> GetSpan() const {
+    return absl::Span<const T>(tensor_.flat<T>().data(), count());
+  }
+
+  template <typename T>
+  std::vector<T> ToVector() const {
+    auto span = GetSpan<T>();
+    return std::vector<T>(span.data(), span.data() + span.size());
+  }
+
   // TODO(aaroey): make these private.
   nvinfer1::Dims shape_;  // Note: shape.type[] is not used.
   tensorflow::DataType type_;
@@ -560,6 +571,9 @@ class Converter {
   friend class OpConverterTest;
 };
 
+// Map of all supported UnaryOperations
+const std::unordered_map<string, nvinfer1::UnaryOperation>* UnaryOperationMap();
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
index 77221f6d9a42a165e8f9e322e1f876b02f4db59f..0aa48913f463b55a4252b01281c3ed3feed35539 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
@@ -60,6 +60,7 @@ namespace convert {
 using absl::StrCat;
 using ::testing::ElementsAre;
 using ::testing::ElementsAreArray;
+using ::testing::NanSensitiveFloatNear;
 
 // TODO(laigd): put this into some test utils file.
 void ExpectStatus(Status status, error::Code code = error::OK,
@@ -241,6 +242,16 @@ class FakeITensor : public nvinfer1::ITensor {
   float getDynamicRange() const override { return dynamic_range_; }
 #endif
 
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+  bool dynamicRangeIsSet() const override { return true; }
+
+  void resetDynamicRange() override {}
+
+  float getDynamicRangeMin() const override { return 0.f; }
+
+  float getDynamicRangeMax() const override { return 0.f; }
+#endif
+
  private:
   string name_;
   nvinfer1::Dims dims_;
@@ -1439,11 +1450,6 @@ TEST_F(OpConverterTest, ConvertReshape) {
   }
 
   struct TestParams {
-    TestParams(int input_batch_size, const std::vector<int>& input_tensor_dims,
-               const std::vector<int>& input_shape)
-        : batch_size(input_batch_size),
-          tensor_dims(input_tensor_dims),
-          shape(input_shape) {}
     int batch_size;
     std::vector<int> tensor_dims;
     std::vector<int> shape;
@@ -2381,11 +2387,6 @@ TEST_F(OpConverterTest, ConvertExpandDims) {
   }
 
   struct TestParams {
-    TestParams(const std::vector<int>& input_dims, int axis,
-               const std::vector<int>& expected_output_dims)
-        : input_dims(input_dims),
-          axis(axis),
-          expected_output_dims(expected_output_dims) {}
     std::vector<int> input_dims;
     int axis;
     std::vector<int> expected_output_dims;
@@ -2498,11 +2499,6 @@ TEST_F(OpConverterTest, ConvertSqueeze) {
   }
 
   struct TestParams {
-    TestParams(const std::vector<int>& input_dims, const std::vector<int>& axis,
-               const std::vector<int>& expected_output_dims)
-        : input_dims(input_dims),
-          axis(axis),
-          expected_output_dims(expected_output_dims) {}
     std::vector<int> input_dims;
     std::vector<int> axis;
     std::vector<int> expected_output_dims;
@@ -2621,46 +2617,62 @@ TEST_F(OpConverterTest, ConvertStridedSlice) {
     AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
     RunValidationAndConversion(
         node_def, error::UNIMPLEMENTED,
-        "StridedSlice can't modify batch dim, at my_strided_slice");
+        "TensorRT does not allow modifications to the batch dimension, at "
+        "my_strided_slice");
   }
   {
-    // Stride is not 1, should fail.
+    // Dynamic batch size without end_mask, should fail.
     Reset();
     NodeDef node_def = get_strided_slice_nodedef();
-    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1);
     AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
     AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
-    AddTestWeights<int32>("strides", {4}, {1, 2, -1, 3});
-    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
-                               "StridedSlice is only implemented for stride of "
-                               "1, at my_strided_slice");
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "TensorRT does not allow modifications to the batch dimension, at "
+        "my_strided_slice");
   }
   {
-    // Begin out of bounds, should fail.
+    // Dynamic batch size but using end_mask, ok.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef(/*begin_mask=*/0,
+                                                 /*end_mask=*/1);
+    AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1);
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {0, 1, 2, 2});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(node_def);
+  }
+// TRT 5.1+ supports strides
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+  {
+    // Negative strides, should fail.
     Reset();
     NodeDef node_def = get_strided_slice_nodedef();
     AddTestTensor("input", {1, 2, 3});
-    AddTestWeights<int32>("begin", {4}, {1, 2, 3, 4});
-    AddTestWeights<int32>("end", {4}, {0, 1, 2, 3});
-    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
-    RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "begin value of 2 for StridedSlice is invalid, must be in the range "
-        "[-dim_size(i), dim_size(i)], at my_strided_slice");
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, -1});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Negative or zero stride values are not "
+                               "supported for StridedSlice, at "
+                               "my_strided_slice");
   }
+#else
   {
-    // End out of bounds, should fail.
+    // Stride is not 1, should fail.
     Reset();
     NodeDef node_def = get_strided_slice_nodedef();
     AddTestTensor("input", {1, 2, 3});
     AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
-    AddTestWeights<int32>("end", {4}, {1, 2, 3, 4});
-    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
-    RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "end value of 2 for StridedSlice is invalid, must be in the range "
-        "[-dim_size(i), dim_size(i)], at my_strided_slice");
+    AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 2, 1, 3});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Strides other than 1 are not supported with "
+                               "this version of TRT, at my_strided_slice");
   }
+#endif
   {
     // Size of sliced dim is negative, should fail.
     Reset();
@@ -2669,126 +2681,183 @@ TEST_F(OpConverterTest, ConvertStridedSlice) {
     AddTestWeights<int32>("begin", {4}, {0, 0, 2, 0});
     AddTestWeights<int32>("end", {4}, {1, 1, 0, 3});
     AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
-    RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "New size of sliced dimension is negative, at my_strided_slice");
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "\"size\" cannot be negative or zero for "
+                               "StridedSlice, at my_strided_slice");
   }
 
   struct TestParams {
-    TestParams(const std::vector<int>& input_dims,
-               const std::vector<int>& expected_output_dims,
-               const std::vector<int>& begin, const std::vector<int>& end,
-               const std::vector<int>& begin_mask,
-               const std::vector<int>& end_mask,
-               const std::vector<int>& expected_output)
-        : input_dims(input_dims),
-          expected_output_dims(expected_output_dims),
-          begin(begin),
-          end(end),
-          expected_output(expected_output) {
-      // Masks are provided in terms of vectors for readability. Convert them to
-      // binary here.
-      this->begin_mask = 0;
-      for (int i = 0; i < begin_mask.size(); i++) {
-        if (begin_mask[i]) this->begin_mask |= (1 << i);
-      }
-      this->end_mask = 0;
-      for (int i = 0; i < end_mask.size(); i++) {
-        if (end_mask[i]) this->end_mask |= (1 << i);
-      }
-    }
-
     std::vector<int> input_dims;
-    std::vector<int> expected_output_dims;
     std::vector<int> begin;
     std::vector<int> end;
+    std::vector<int> strides;
     int begin_mask;
     int end_mask;
-    std::vector<int> expected_output;
+    std::vector<int> expected_output_dims;
+    std::vector<float> expected_output;
+  };
+
+  auto get_mask = [](const std::vector<int>& mask) {
+    int result = 0;
+    for (int i = 0; i < mask.size(); i++) {
+      if (mask[i]) result += (1 << i);
+    }
+    return result;
   };
 
+  // Same input is used for all tests.
+  const std::vector<float> ok_input = {1, 2, 3, 4, 5, 6};
+
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+  const int kStridedSliceOKCases = 23;
+#else
+  const int kStridedSliceOKCases = 19;
+#endif
   // Ok.
-  const int kStridedSliceOKCases = 18;
   TestParams ok_params[kStridedSliceOKCases] = {
-      // 2D Crop.
-      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 1, 2},
-                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 0, 1, 2},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 0, 0},
-                 /*expected_output=*/{1, 2}},
-      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 1, 2},
-                 /*begin=*/{0, 0, 1, 1}, /*end=*/{0, 0, 0, 0},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 1, 1},
-                 /*expected_output=*/{5, 6}},
-      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 1, 2},
-                 /*begin=*/{0, 0, 1, 1}, /*end=*/{0, 1, 2, 3},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 0, 0},
-                 /*expected_output=*/{5, 6}},
-      // 2D Crop, with transpose.
-      TestParams{/*input_dims=*/{2, 3, 1}, /*expected_output_dims=*/{1, 2, 1},
-                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 2, 1},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 0, 0},
-                 /*expected_output=*/{1, 2}},
-      TestParams{/*input_dims=*/{2, 3, 1}, /*expected_output_dims=*/{1, 2, 1},
-                 /*begin=*/{0, 1, 1, 0}, /*end=*/{0, 2, 3, 1},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 0, 0},
-                 /*expected_output=*/{5, 6}},
-      TestParams{/*input_dims=*/{2, 1, 3}, /*expected_output_dims=*/{1, 1, 2},
-                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 1, 2},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 0, 0},
-                 /*expected_output=*/{1, 2}},
-      TestParams{/*input_dims=*/{2, 1, 3}, /*expected_output_dims=*/{1, 1, 2},
-                 /*begin=*/{0, 1, 0, 1}, /*end=*/{0, 2, 1, 3},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 0, 0},
-                 /*expected_output=*/{5, 6}},
-      // 2D Crop, with reshape.
-      TestParams{/*input_dims=*/{2, 3}, /*expected_output_dims=*/{1, 2},
-                 /*begin=*/{0, 0, 0}, /*end=*/{0, 1, 2},
-                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 0, 0},
-                 /*expected_output=*/{1, 2}},
-      TestParams{/*input_dims=*/{2, 3}, /*expected_output_dims=*/{1, 2},
-                 /*begin=*/{0, 1, 1}, /*end=*/{0, 0, 0},
-                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 1, 1},
-                 /*expected_output=*/{5, 6}},
-      // 1D Crop.
-      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 2, 2},
-                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 0, 0, 2},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 1, 0},
-                 /*expected_output=*/{1, 2, 4, 5}},
-      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 1, 3},
-                 /*begin=*/{0, 0, 1, 0}, /*end=*/{0, 0, 0, 0},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 1, 1},
-                 /*expected_output=*/{4, 5, 6}},
-      // 1D Crop, with transpose.
-      TestParams{/*input_dims=*/{2, 3, 1}, /*expected_output_dims=*/{1, 3, 1},
-                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 0, 0},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 1, 1},
-                 /*expected_output=*/{1, 2, 3}},
-      TestParams{/*input_dims=*/{2, 3, 1}, /*expected_output_dims=*/{1, 3, 1},
-                 /*begin=*/{0, 1, 0, 0}, /*end=*/{0, 0, 0, 0},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 1, 1},
-                 /*expected_output=*/{4, 5, 6}},
-      // 1D Crop, with reshape.
-      TestParams{/*input_dims=*/{6}, /*expected_output_dims=*/{3},
-                 /*begin=*/{0, 0}, /*end=*/{0, 3},
-                 /*begin_mask=*/{0, 0}, /*end_mask=*/{1, 0},
-                 /*expected_output=*/{1, 2, 3}},
-      TestParams{/*input_dims=*/{1, 6}, /*expected_output_dims=*/{1, 3},
-                 /*begin=*/{0, 0, 2}, /*end=*/{0, 0, 5},
-                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 1, 0},
-                 /*expected_output=*/{3, 4, 5}},
-      TestParams{/*input_dims=*/{6, 1}, /*expected_output_dims=*/{3, 1},
-                 /*begin=*/{0, 2, 0}, /*end=*/{0, 5, 0},
-                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 0, 1},
-                 /*expected_output=*/{3, 4, 5}},
-      // Negative axis.
-      TestParams{/*input_dims=*/{6, 1}, /*expected_output_dims=*/{3, 1},
-                 /*begin=*/{0, -6, 0}, /*end=*/{0, -3, 0},
-                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 0, 1},
-                 /*expected_output=*/{1, 2, 3}},
-      TestParams{/*input_dims=*/{6, 1}, /*expected_output_dims=*/{5, 1},
-                 /*begin=*/{0, 0, 0}, /*end=*/{0, -1, 0},
-                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 0, 1},
-                 /*expected_output=*/{1, 2, 3, 4, 5}},
+    // 2D Crop.
+    TestParams{/*input_dims=*/{1, 2, 3}, /*begin=*/{0, 0, 0, 0},
+               /*end=*/{0, 0, 1, 2}, /*strides=*/{1, 1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0, 0}),
+               /*end_mask=*/get_mask({1, 1, 0, 0}),
+               /*expected_output_dims=*/{1, 1, 2}, /*expected_output=*/{1, 2}},
+    TestParams{
+        /*input_dims=*/{1, 2, 3},
+        /*begin=*/{0, 0, 1, 1}, /*end=*/{0, 0, 0, 0}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 1, 1}), /*expected_output_dims=*/{1, 1, 2},
+        /*expected_output=*/{5, 6}},
+    TestParams{
+        /*input_dims=*/{1, 2, 3},
+        /*begin=*/{0, 0, 1, 1}, /*end=*/{0, 1, 2, 3}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 0, 0}), /*expected_output_dims=*/{1, 1, 2},
+        /*expected_output=*/{5, 6}},
+    // 2D Crop, with transpose.
+    TestParams{
+        /*input_dims=*/{2, 3, 1},
+        /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 2, 1}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 0, 0}), /*expected_output_dims=*/{1, 2, 1},
+        /*expected_output=*/{1, 2}},
+    TestParams{
+        /*input_dims=*/{2, 3, 1},
+        /*begin=*/{0, 1, 1, 0}, /*end=*/{0, 2, 3, 1}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 0, 0}), /*expected_output_dims=*/{1, 2, 1},
+        /*expected_output=*/{5, 6}},
+    TestParams{
+        /*input_dims=*/{2, 1, 3},
+        /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 1, 2}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 0, 0}), /*expected_output_dims=*/{1, 1, 2},
+        /*expected_output=*/{1, 2}},
+    TestParams{
+        /*input_dims=*/{2, 1, 3},
+        /*begin=*/{0, 1, 0, 1}, /*end=*/{0, 2, 1, 3}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 0, 0}), /*expected_output_dims=*/{1, 1, 2},
+        /*expected_output=*/{5, 6}},
+    // 2D Crop, with reshape.
+    TestParams{/*input_dims=*/{2, 3},
+               /*begin=*/{0, 0, 0}, /*end=*/{0, 1, 2}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 0}),
+               /*expected_output_dims=*/{1, 2},
+               /*expected_output=*/{1, 2}},
+    TestParams{/*input_dims=*/{2, 3},
+               /*begin=*/{0, 1, 1}, /*end=*/{0, 0, 0}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 1, 1}),
+               /*expected_output_dims=*/{1, 2},
+               /*expected_output=*/{5, 6}},
+    // 1D Crop.
+    TestParams{
+        /*input_dims=*/{1, 2, 3},
+        /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 0, 0, 2}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 1, 0}), /*expected_output_dims=*/{1, 2, 2},
+        /*expected_output=*/{1, 2, 4, 5}},
+    TestParams{
+        /*input_dims=*/{1, 2, 3},
+        /*begin=*/{0, 0, 1, 0}, /*end=*/{0, 0, 0, 0}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 1, 1}), /*expected_output_dims=*/{1, 1, 3},
+        /*expected_output=*/{4, 5, 6}},
+    // 1D Crop, with transpose.
+    TestParams{
+        /*input_dims=*/{2, 3, 1},
+        /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 0, 0}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 1, 1}), /*expected_output_dims=*/{1, 3, 1},
+        /*expected_output=*/{1, 2, 3}},
+    TestParams{
+        /*input_dims=*/{2, 3, 1},
+        /*begin=*/{0, 1, 0, 0}, /*end=*/{0, 0, 0, 0}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 1, 1}), /*expected_output_dims=*/{1, 3, 1},
+        /*expected_output=*/{4, 5, 6}},
+    // 1D Crop, with reshape.
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 0}, /*end=*/{0, 3}, /*strides=*/{1, 1},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{3},
+               /*expected_output=*/{1, 2, 3}},
+    TestParams{/*input_dims=*/{1, 6},
+               /*begin=*/{0, 0, 2}, /*end=*/{0, 0, 5}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 1, 0}),
+               /*expected_output_dims=*/{1, 3},
+               /*expected_output=*/{3, 4, 5}},
+    TestParams{/*input_dims=*/{6, 1},
+               /*begin=*/{0, 2, 0}, /*end=*/{0, 5, 0}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 1}),
+               /*expected_output_dims=*/{3, 1},
+               /*expected_output=*/{3, 4, 5}},
+    // Negative axis.
+    TestParams{/*input_dims=*/{6, 1},
+               /*begin=*/{0, -6, 0}, /*end=*/{0, -3, 0}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 1}),
+               /*expected_output_dims=*/{3, 1},
+               /*expected_output=*/{1, 2, 3}},
+    TestParams{/*input_dims=*/{6, 1},
+               /*begin=*/{0, 0, 0}, /*end=*/{0, -1, 0}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 1}),
+               /*expected_output_dims=*/{5, 1},
+               /*expected_output=*/{1, 2, 3, 4, 5}},
+    // Clamp out of bounds begin and end.
+    TestParams{/*input_dims=*/{1, 2, 3}, /*begin=*/{0, 0, -9999, -9},
+               /*end=*/{0, 1, 1000, 4}, /*strides=*/{1, 1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 0, 0}),
+               /*expected_output_dims=*/{1, 2, 3},
+               /*expected_output=*/{1, 2, 3, 4, 5, 6}},
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+    // Strides
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 0}, /*end=*/{0, 5}, /*strides=*/{1, 2},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{3},
+               /*expected_output=*/{1, 3, 5}},
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 0}, /*end=*/{0, 6}, /*strides=*/{1, 2},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{3},
+               /*expected_output=*/{1, 3, 5}},
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 1}, /*end=*/{0, 6}, /*strides=*/{1, 2},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{3},
+               /*expected_output=*/{2, 4, 6}},
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 2}, /*end=*/{0, 6}, /*strides=*/{1, 3},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{2},
+               /*expected_output=*/{3, 6}},
+#endif
   };
 
   for (int i = 0; i < kStridedSliceOKCases; i++) {
@@ -2801,16 +2870,18 @@ TEST_F(OpConverterTest, ConvertStridedSlice) {
                           ok_params[i].begin);
     AddTestWeights<int32>("end", {static_cast<int>(ok_params[i].end.size())},
                           ok_params[i].end);
-    std::vector<int> strides(ok_params[i].input_dims.size(), 1);
-    AddTestWeights<int32>("strides", {static_cast<int>(strides.size())},
-                          strides);
+    AddTestWeights<int32>("strides",
+                          {static_cast<int>(ok_params[i].strides.size())},
+                          ok_params[i].strides);
     RunValidationAndConversion(node_def);
 
     TRT_TensorOrWeights output;
     TF_EXPECT_OK(GetTensorOrWeights("my_strided_slice", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
 
-    const DataVec input_data{
-        {"input", test::AsTensor<float>({1, 2, 3, 4, 5, 6})}};
+    const DataVec input_data{{"input", test::AsTensor<float>(ok_input)}};
     DataVec output_data{
         {"my_strided_slice",
          ConstructTensor<float>(ok_params[i].expected_output.size())}};
@@ -2820,6 +2891,148 @@ TEST_F(OpConverterTest, ConvertStridedSlice) {
   }
 }
 
+TEST_F(OpConverterTest, ConvertSlice) {
+  // Get nodedef for Slice layer.
+  auto get_slice_nodedef = []() -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto begin = ops::Placeholder(s.WithOpName("begin"), DT_INT32);
+    auto size = ops::Placeholder(s.WithOpName("size"), DT_INT32);
+    auto slice = ops::Slice(s.WithOpName("my_slice"), input, begin, size);
+    return slice.operation.node()->def();
+  };
+
+  {
+    // Begin is below bounds, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, -1, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "\"begin\" for dimension 2 in Slice is out of range, at my_slice");
+  }
+  {
+    // Begin is above bounds, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 3, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "\"begin\" for dimension 2 in Slice is out of range, at my_slice");
+  }
+  {
+    // Size is below bounds, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 2, -2});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "\"begin\" + \"size\" for dimension 3 in Slice is out of range, at "
+        "my_slice");
+  }
+  {
+    // Size is above bounds, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 3, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "\"begin\" + \"size\" for dimension 2 in Slice is out of range, at "
+        "my_slice");
+  }
+  {
+    // Modify batch dim, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {0, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "TensorRT does not allow modifications to the batch dimension, at "
+        "my_slice");
+  }
+  {
+    // Dynamic batch size with size[0] not -1, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1);
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "TensorRT does not allow modifications to the batch dimension, at "
+        "my_slice");
+  }
+  {
+    // Dynamic batch size but using size[0] of -1, ok.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1);
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {-1, 1, 2, 2});
+    RunValidationAndConversion(node_def);
+  }
+
+  struct TestParams {
+    std::vector<int> input_dims;
+    std::vector<int> begin;
+    std::vector<int> size;
+    std::vector<int> expected_output_dims;
+    std::vector<int> expected_output;
+  };
+
+  // Ok.
+  const int kSliceOKCases = 5;
+  TestParams ok_params[kSliceOKCases] = {
+      TestParams{{1, 2, 3},
+                 {0, 0, 0, 0},
+                 {-1, -1, -1, -1},
+                 {1, 2, 3},
+                 {1, 2, 3, 4, 5, 6}},
+      TestParams{
+          {1, 2, 3}, {0, 0, 0, 0}, {1, 1, 2, 3}, {1, 2, 3}, {1, 2, 3, 4, 5, 6}},
+      TestParams{
+          {1, 2, 3}, {0, 0, 0, 0}, {1, -1, 2, 2}, {1, 2, 2}, {1, 2, 4, 5}},
+      TestParams{{6}, {0, 1}, {1, 5}, {5}, {2, 3, 4, 5, 6}},
+      TestParams{{6}, {0, 1}, {-1, 3}, {3}, {2, 3, 4}},
+  };
+
+  for (int i = 0; i < kSliceOKCases; i++) {
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", ok_params[i].input_dims);
+    AddTestWeights<int32>("begin",
+                          {static_cast<int>(ok_params[i].begin.size())},
+                          ok_params[i].begin);
+    AddTestWeights<int32>("size", {static_cast<int>(ok_params[i].size.size())},
+                          ok_params[i].size);
+    RunValidationAndConversion(node_def);
+
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_slice", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    const DataVec input_data{
+        {"input", test::AsTensor<float>({1, 2, 3, 4, 5, 6})}};
+    DataVec output_data{{"my_slice", ConstructTensor<float>(
+                                         ok_params[i].expected_output.size())}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAreArray(ok_params[i].expected_output));
+  }
+}
+
 TEST_F(OpConverterTest, ConvertConv2D) {
   {
     // Input list is empty, should fail.
@@ -2956,27 +3169,6 @@ TEST_F(OpConverterTest, ConvertConv2D) {
   }
 
   struct TestParams {
-    TestParams(const std::vector<int>& input_dims,
-               const std::vector<float>& input,
-               const std::vector<int>& filter_dims,
-               const std::vector<float>& filter,
-               const std::vector<int>& strides, const string& padding,
-               const string& data_format, const std::vector<int>& dilations,
-               bool is_conv2d_backprop_input,
-               const std::vector<int>& expected_output_dims,
-               const std::vector<float>& expected_output)
-        : input_dims(input_dims),
-          input(input),
-          filter_dims(filter_dims),
-          filter(filter),
-          strides(strides),
-          padding(padding),
-          data_format(data_format),
-          dilations(dilations),
-          is_conv2d_backprop_input(is_conv2d_backprop_input),
-          expected_output_dims(expected_output_dims),
-          expected_output(expected_output) {}
-
     std::vector<int> input_dims;
     std::vector<float> input;
     std::vector<int> filter_dims;
@@ -3163,6 +3355,294 @@ TEST_F(OpConverterTest, ConvertTopK) {
   }
 }
 
+template <DataType dtype>
+void TestConvertGather(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+
+  // Get the NodeDef for GatherV2.
+  Scope s = Scope::NewRootScope();
+  auto params = ops::Placeholder(s.WithOpName("params"), dtype);
+  auto indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32);
+  auto axis = ops::Placeholder(s.WithOpName("axis"), DT_INT32);
+  auto gather = ops::GatherV2(s.WithOpName("my_gather"), params, indices, axis);
+  const NodeDef& node_def = gather.operation.node()->def();
+
+  struct TestParams {
+    std::vector<int> params_dims;
+    std::vector<int> indices_dims;
+    std::vector<int> indices;
+    int axis;
+    std::vector<int> expected_output_dims;
+    std::vector<int> expected_output;
+  };
+
+  // Input is the same {1, 2, 3, 4, 5, 6} for all cases.
+  const int kGatherOKCases = 5;
+  TestParams ok_params[kGatherOKCases] = {
+      // Vector indices (output is rank(params)).
+      TestParams{{1, 2, 3}, {1}, {0}, 3, {1, 2, 1}, {1, 4}},
+      TestParams{{1, 2, 3}, {1}, {1}, 3, {1, 2, 1}, {2, 5}},
+      TestParams{{1, 2, 3}, {1}, {2}, -1, {1, 2, 1}, {3, 6}},
+      TestParams{{1, 2, 3}, {3}, {2, 0, 1}, 3, {1, 2, 3}, {3, 1, 2, 6, 4, 5}},
+      // Higher rank indices (output is rank(params) + rank(indices) - 1).
+      TestParams{{1, 2, 3}, {1, 1}, {0}, 2, {1, 1, 1, 3}, {1, 2, 3}},
+  };
+
+  // Ok.
+  for (int i = 0; i < kGatherOKCases; i++) {
+    test->Reset();
+    test->AddTestTensor("params", ok_params[i].params_dims, 1,
+                        TfDataTypeToTrt(dtype));
+    test->AddTestTensor("indices", ok_params[i].indices_dims, 1,
+                        nvinfer1::DataType::kINT32);
+    test->AddTestWeights<int32>("axis", {1}, {ok_params[i].axis});
+    test->RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(test->GetTensorOrWeights("my_gather", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    // Create input in CType and convert expected output to CType.
+    std::vector<CType> inputs = {CType(1), CType(2), CType(3),
+                                 CType(4), CType(5), CType(6)};
+    std::vector<CType> converted_expected_output(
+        ok_params[i].expected_output.begin(),
+        ok_params[i].expected_output.end());
+
+    const DataVec input_data{
+        {"params", test::AsTensor<CType>(inputs)},
+        {"indices", test::AsTensor<int32>(ok_params[i].indices)}};
+    DataVec output_data{
+        {"my_gather",
+         ConstructTensor<CType>(ok_params[i].expected_output.size())}};
+    test->BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                ElementsAreArray(converted_expected_output));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertGather) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_gather", "GatherV2", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "GatherV2 got 0 inputs but expected 3, at my_gather");
+  }
+
+  // Get the NodeDef for GatherV2.
+  Scope s = Scope::NewRootScope();
+  auto params = ops::Placeholder(s.WithOpName("params"), DT_FLOAT);
+  auto indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32);
+  auto axis = ops::Placeholder(s.WithOpName("axis"), DT_INT32);
+  auto gather = ops::GatherV2(s.WithOpName("my_gather"), params, indices, axis);
+  const NodeDef& node_def = gather.operation.node()->def();
+  {
+    // Axis is a tensor, should fail.
+    Reset();
+    AddTestTensor("params", {1, 2, 3});
+    AddTestTensor("indices", {2});
+    AddTestTensor("axis", {1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "The input \"axis\" for GatherV2 must be a constant, at my_gather");
+  }
+  {
+    // Axis is out of bounds, should fail.
+    Reset();
+    AddTestTensor("params", {1, 2, 3});
+    AddTestTensor("indices", {2});
+    AddTestWeights<int32>("axis", {1}, {4});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Axis value of 4 is out of bounds, must be in "
+                               "range [-4, 4), at my_gather");
+  }
+  {
+    // Axis is batch dimension, should fail.
+    Reset();
+    AddTestTensor("params", {1, 2, 3});
+    AddTestTensor("indices", {2});
+    AddTestWeights<int32>("axis", {1}, {0});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "TensorRT does not allow manipulation of the "
+                               "batch dimension, at my_gather");
+  }
+
+  Reset();
+  TestConvertGather<DT_FLOAT>(this);
+  TestConvertGather<DT_HALF>(this);
+  TestConvertGather<DT_INT32>(this);
+}
+
+TEST_F(OpConverterTest, ConvertUnary) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_unary", "Neg", {});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Neg got 0 inputs but expected 1, at my_unary");
+  }
+  {
+    // Input is weights, should fail.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto neg = ops::Neg(s.WithOpName("my_unary"), input);
+    const NodeDef& node_def = neg.operation.node()->def();
+    AddTestWeights<float>("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "The input \"x\" for Neg must be a tensor, at my_unary");
+  }
+
+  // Get nodedef for unary layer.
+  auto get_unary_nodedef = [](string op_name) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    if (op_name == "Abs") {
+      auto unary = ops::Abs(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Acos") {
+      auto unary = ops::Acos(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Acosh") {
+      auto unary = ops::Acosh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Asin") {
+      auto unary = ops::Asin(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Asinh") {
+      auto unary = ops::Asinh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Atan") {
+      auto unary = ops::Atan(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Atanh") {
+      auto unary = ops::Atanh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Ceil") {
+      auto unary = ops::Ceil(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Cos") {
+      auto unary = ops::Cos(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Cosh") {
+      auto unary = ops::Cosh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Exp") {
+      auto unary = ops::Exp(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Floor") {
+      auto unary = ops::Floor(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Log") {
+      auto unary = ops::Log(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Neg") {
+      auto unary = ops::Neg(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Reciprocal") {
+      auto unary = ops::Reciprocal(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Rsqrt") {
+      auto unary = ops::Rsqrt(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Sin") {
+      auto unary = ops::Sin(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Sinh") {
+      auto unary = ops::Sinh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Sqrt") {
+      auto unary = ops::Sqrt(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Tan") {
+      auto unary = ops::Tan(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    }
+    EXPECT_TRUE(false);
+    return NodeDef();
+  };
+  // Get expected output for unary layer.
+  auto get_unary_output = [](string op_name, float input) -> float {
+    if (op_name == "Abs") {
+      return std::abs(input);
+    } else if (op_name == "Acos") {
+      return std::acos(input);
+    } else if (op_name == "Acosh") {
+      return std::acosh(input);
+    } else if (op_name == "Asin") {
+      return std::asin(input);
+    } else if (op_name == "Asinh") {
+      return std::asinh(input);
+    } else if (op_name == "Atan") {
+      return std::atan(input);
+    } else if (op_name == "Atanh") {
+      return std::atanh(input);
+    } else if (op_name == "Ceil") {
+      return std::ceil(input);
+    } else if (op_name == "Cos") {
+      return std::cos(input);
+    } else if (op_name == "Cosh") {
+      return std::cosh(input);
+    } else if (op_name == "Exp") {
+      return std::exp(input);
+    } else if (op_name == "Floor") {
+      return std::floor(input);
+    } else if (op_name == "Log") {
+      return std::log(input);
+    } else if (op_name == "Neg") {
+      return -input;
+    } else if (op_name == "Reciprocal") {
+      return 1.0 / input;
+    } else if (op_name == "Rsqrt") {
+      return 1.0 / std::sqrt(input);
+    } else if (op_name == "Sin") {
+      return std::sin(input);
+    } else if (op_name == "Sinh") {
+      return std::sinh(input);
+    } else if (op_name == "Sqrt") {
+      return std::sqrt(input);
+    } else if (op_name == "Tan") {
+      return std::tan(input);
+    }
+    EXPECT_TRUE(false);
+    return 0;
+  };
+
+  // Get list of ops to test.
+  std::vector<string> ops_to_test;
+  // Add all ops supported by ConvertUnary.
+  auto* map = UnaryOperationMap();
+  ops_to_test.reserve(map->size());
+  for (auto& pair : *map) {
+    ops_to_test.push_back(pair.first);
+  }
+  // Add other unary ops to test.
+  ops_to_test.push_back("Rsqrt");
+  // Ok.
+  for (string op_name : ops_to_test) {
+    Reset();
+    NodeDef node_def = get_unary_nodedef(op_name);
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_unary", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions());
+
+    const std::vector<float> input = {-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f};
+    const DataVec input_data{{"input", test::AsTensor<float>(input)}};
+    DataVec output_data{{"my_unary", ConstructTensor<float>(6)}};
+    BuildAndRun(input_data, &output_data);
+    for (int i = 0; i < input.size(); ++i) {
+      const float expected_output = get_unary_output(op_name, input[i]);
+      EXPECT_THAT(GetSpanForData<float>(output_data[0])[i],
+                  NanSensitiveFloatNear(expected_output, 0.0001));
+    }
+  }
+}
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
index f36aa558ea2ea463983caf163e17f83ae1c38f40..0eedfcacb4c11c8dc63fcfc13f044586b99b3c76 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
@@ -87,7 +87,7 @@ void TRTOptimizationPass::PrintDebugInfo(
     LOG(INFO) << offset << "type             = " << cluster->type();
     LOG(INFO) << offset << "num warmup steps = " << cluster->NumWarmupSteps();
     const auto dev_names = cluster->GetDeviceNames();
-    if (dev_names.size()) {
+    if (!dev_names.empty()) {
       LOG(INFO) << offset << " Device names:";
       for (const auto s : dev_names) {
         LOG(INFO) << offset2 << s;
@@ -103,7 +103,7 @@ void TRTOptimizationPass::PrintDebugInfo(
     }
 
     const auto dev_props = cluster->GetDevices();
-    if (dev_props.size()) {
+    if (!dev_props.empty()) {
       LOG(INFO) << offset << "Device properties:";
       for (auto k : dev_props) {
         LOG(INFO) << offset2 << k.first;
@@ -131,7 +131,7 @@ void TRTOptimizationPass::PrintDebugInfo(
     }
   }
   LOG(INFO) << "item: " << item.id;
-  if (item.feed.size()) {
+  if (!item.feed.empty()) {
     LOG(INFO) << offset << "Feeds  :";
     for (const auto& f : item.feed) {
       const auto& shape = f.second.shape();
@@ -140,7 +140,7 @@ void TRTOptimizationPass::PrintDebugInfo(
   } else {
     LOG(INFO) << offset << "No Feeds";
   }
-  if (item.fetch.size()) {
+  if (!item.fetch.empty()) {
     LOG(INFO) << offset << "Fetches  :";
     for (const auto& f : item.fetch) {
       LOG(INFO) << offset2 << f;
@@ -149,7 +149,7 @@ void TRTOptimizationPass::PrintDebugInfo(
     LOG(INFO) << offset << "No Fetches";
   }
 
-  if (item.init_ops.size()) {
+  if (!item.init_ops.empty()) {
     LOG(INFO) << offset << "init ops  :";
     for (const auto& f : item.init_ops) {
       LOG(INFO) << offset2 << f;
@@ -160,7 +160,7 @@ void TRTOptimizationPass::PrintDebugInfo(
   LOG(INFO) << "Save Op = " << item.save_op;
   LOG(INFO) << "Restore Op = " << item.restore_op;
   LOG(INFO) << "save_restore_loc_tensor = " << item.save_restore_loc_tensor;
-  if (item.keep_ops.size()) {
+  if (!item.keep_ops.empty()) {
     LOG(INFO) << offset << "keep ops  :";
     for (const auto& f : item.keep_ops) {
       LOG(INFO) << offset2 << f;
@@ -197,7 +197,7 @@ tensorflow::Status TRTOptimizationPass::Optimize(
     PrintDebugInfo(cluster, item);
   }
   int max_dim = -1;
-  if (item.feed.size()) {
+  if (!item.feed.empty()) {
     for (const auto& f : item.feed) {
       const auto& shape = f.second.shape();
       if (shape.dims() > 0) {
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op.cc
index eae1f8e7525f1816d1c50072ebe4ba6713c96e47..81406b6e301ca350a3e52c97f5fcb575e88c3a90 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op.cc
@@ -13,9 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_KERNELS_GET_SERIALIZED_RESOURCE_OP_H_
-#define TENSORFLOW_COMPILER_TF2TENSORRT_KERNELS_GET_SERIALIZED_RESOURCE_OP_H_
-
 #include <memory>
 #include <vector>
 
@@ -70,4 +67,3 @@ REGISTER_KERNEL_BUILDER(Name("GetSerializedResourceOp").Device(DEVICE_GPU),
 
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
-#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_KERNELS_GET_SERIALIZED_RESOURCE_OP_H_
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
index bc5335ef5aa35633a68e69f7de7903b4f498531a..f6d387c59cd04aa5c7ccad610290b7b1f1d2b11f 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
 #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
 #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h"
-#include "tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.h"
 #include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
@@ -216,8 +215,8 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)
                  context->GetAttr("use_calibration", &use_calibration_));
   calibration_mode_ =
       (use_calibration_ && precision_mode_ == TrtPrecisionMode::INT8 &&
-       calibration_data.size() == 0);
-  if (calibration_data.size()) {
+       calibration_data.empty());
+  if (!calibration_data.empty()) {
     calibrator_.reset(new TRTInt8Calibrator(calibration_data));
     calibration_data.resize(0);
   }
@@ -254,6 +253,7 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
   opts.rendezvous = ctx->rendezvous();
   opts.cancellation_manager = ctx->cancellation_manager();
   opts.runner = ctx->runner();
+  inputs.reserve(ctx->num_inputs());
   for (int i = 0; i < ctx->num_inputs(); i++) {
     inputs.push_back(ctx->input(i));
   }
@@ -294,27 +294,6 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
             return this->AllocateCalibrationResources(ctx, cr);
           }}));
   tensorflow::core::ScopedUnref calib_sc(calib_res);
-  // TODO(aaroey): here we also add the resource to the ResourceMgr singleton.
-  // This is needed before we migrate all uses of calib_graph_to_infer_graph()
-  // to the new calibration workflow. After that we'll remove this block.
-  {
-    auto deprecated_rm =
-        TRTResourceManager::instance()->getManager("TRTCalibration");
-    TRTCalibrationResource* copied_resource = nullptr;
-    // Check whether the resource exists, and create it if not.
-    if (deprecated_rm->Lookup(funcdef_name_, "Calibrator", &copied_resource)
-            .ok()) {
-      // Do nothing if the resource exists.
-      copied_resource->Unref();
-    } else {
-      copied_resource = calib_res;
-      // Increase the refcount by 1 then transfer the ownership of that refcount
-      // to the ResourceMgr singleton.
-      copied_resource->Ref();
-      OP_REQUIRES_OK(ctx, deprecated_rm->Create(funcdef_name_, "Calibrator",
-                                                copied_resource));
-    }
-  }
   int num_inputs = ctx->num_inputs();
   // Pass input data to calibrator
   std::unordered_map<string, void*> input_data;
@@ -385,8 +364,9 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
   }
   // Get shapes of inputs to engine.
   std::vector<tensorflow::TensorShape> input_shapes;
+  input_shapes.reserve(ctx->num_inputs());
   for (int i = 0; i < ctx->num_inputs(); ++i) {
-    input_shapes.emplace_back(ctx->input(i).shape());
+    input_shapes.push_back(ctx->input(i).shape());
   }
   EngineContext* engine_context = GetEngine(input_shapes, ctx);
   if (!engine_context->cuda_engine) {
@@ -433,7 +413,8 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
     auto dtype = cuda_engine->getBindingDataType(binding_index);
     switch (dtype) {
       case nvinfer1::DataType::kFLOAT:
-        buffers[binding_index] = (void*)(input_tensor.flat<float>().data());
+        buffers[binding_index] =
+            const_cast<float*>(input_tensor.flat<float>().data());
         break;
       case nvinfer1::DataType::kHALF:
         LOG(ERROR) << "FP16 inputs are not supported yet!";
@@ -442,10 +423,11 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
         LOG(ERROR) << "INT8 inputs are not supported yet!";
         return kRetry;
       case nvinfer1::DataType::kINT32:
-        buffers[binding_index] = (void*)(input_tensor.flat<int32>().data());
+        buffers[binding_index] =
+            const_cast<int32*>(input_tensor.flat<int32>().data());
         break;
       default:
-        LOG(ERROR) << "Unknown TRT data type: " << int(dtype);
+        LOG(ERROR) << "Unknown TRT data type: " << static_cast<int>(dtype);
         return kRetry;
     }
   }
@@ -484,7 +466,7 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
     switch (dtype) {
       case nvinfer1::DataType::kFLOAT:
         buffers[binding_index] =
-            reinterpret_cast<void*>(output_tensor->flat<float>().data());
+            const_cast<float*>(output_tensor->flat<float>().data());
         break;
       case nvinfer1::DataType::kHALF:
         LOG(WARNING) << "half size is not supported yet!";
@@ -494,7 +476,7 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
         return kRetry;
       case nvinfer1::DataType::kINT32:
         buffers[binding_index] =
-            reinterpret_cast<void*>(output_tensor->flat<int32>().data());
+            const_cast<int32*>(output_tensor->flat<int32>().data());
         break;
       default:
         LOG(WARNING) << "Unknown TRT data type: " << static_cast<int>(dtype);
@@ -616,11 +598,11 @@ EngineContext* TRTEngineOp::GetEngine(
     LOG(INFO) << "Building a new TensorRT engine for " << name()
               << " input shapes: "
               << TensorShapeUtils::ShapeListString(engine_input_shapes);
+
     // Convert to partial shapes
-    std::vector<PartialTensorShape> partial_shapes;
-    for (int i = 0; i < engine_input_shapes.size(); i++) {
-      partial_shapes.emplace_back(engine_input_shapes[i]);
-    }
+    std::vector<PartialTensorShape> partial_shapes(engine_input_shapes.begin(),
+                                                   engine_input_shapes.end());
+
     // Up to this point, calibrator_ can never be empty, since otherwise it
     // means calibration_mode_ is true and this path won't get executed.
     auto status = convert::ConvertGraphDefToEngine(
diff --git a/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py b/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py
index 86bfabf99e08a8e447a28504c72eebca4d3a582c..25fb3a13db9911673bac04652b8ed8ba842be93c 100644
--- a/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py
+++ b/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py
@@ -18,17 +18,52 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import threading
+
 import platform
+from tensorflow.python.framework import errors
+
+_trt_ops_so = None
+_module_lock = threading.Lock()
+
+
+def load_trt_ops():
+  """Load TF-TRT op libraries so if it hasn't been loaded already."""
+  global _trt_ops_so
+
+  if platform.system() == "Windows":
+    raise RuntimeError("Windows platforms are not supported")
+
+  with _module_lock:
+    if _trt_ops_so:
+      return
 
-if platform.system() != "Windows":
-  # pylint: disable=wildcard-import,unused-import,g-import-not-at-top
-  from tensorflow.compiler.tf2tensorrt.ops.gen_trt_ops import *
+    try:
+      # pylint: disable=g-import-not-at-top,unused-variable
+      # This registers the TRT ops, it doesn't require loading TRT library.
+      from tensorflow.compiler.tf2tensorrt.ops.gen_trt_ops import trt_engine_op
+      # pylint: enable=g-import-not-at-top,unused-variable
+    except ImportError as e:
+      print("**** Failed to import TF-TRT ops. This is because the binary was "
+            "not built with CUDA or TensorRT enabled. ****")
+      raise e
 
-  from tensorflow.python.framework import load_library
-  from tensorflow.python.platform import resource_loader
-  # pylint: enable=wildcard-import,unused-import,g-import-not-at-top
+    # TODO(laigd): we should load TF-TRT kernels here as well after removing the
+    # swig binding.
+    try:
+      # pylint: disable=g-import-not-at-top
+      from tensorflow.python.framework import load_library
+      from tensorflow.python.platform import resource_loader
+      # pylint: enable=g-import-not-at-top
 
-  _trt_ops = load_library.load_op_library(
-      resource_loader.get_path_to_datafile("_trt_ops.so"))
-else:
-  raise RuntimeError("Windows platforms are not supported")
+      _trt_ops_so = load_library.load_op_library(
+          resource_loader.get_path_to_datafile("_trt_ops.so"))
+    except errors.NotFoundError as e:
+      no_trt_message = (
+          "**** Failed to initialize TensorRT. This is either because the "
+          "TensorRT installation path is not in LD_LIBRARY_PATH, or because "
+          "you do not have it installed. If not installed, please go to "
+          "https://developer.nvidia.com/tensorrt to download and install "
+          "TensorRT ****")
+      print(no_trt_message)
+      raise e
diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
index 4a8a4ac7589a4b68b129e8e88ee999e8a2495728..3794929b1df3fa999de6ab218dc2ddfb96e4ac81 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
@@ -30,6 +30,9 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/types.h"
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
 namespace tensorflow {
 namespace tensorrt {
 namespace segment {
@@ -725,3 +728,6 @@ tensorflow::Status SegmentGraph(
 }  // namespace segment
 }  // namespace tensorrt
 }  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.h b/tensorflow/compiler/tf2tensorrt/segment/segment.h
index 9a0ccc9aef475edfb0ffb83a2be21d4d4ca0e028..9622ddd593990e93ba1b54e9dfd0052006e20ced 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/segment.h
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.h
@@ -24,8 +24,10 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace tensorflow {
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
+namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 
@@ -60,4 +62,7 @@ tensorflow::Status SegmentGraph(
 }  // namespace tensorrt
 }  // namespace tensorflow
 
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_
diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
index 58512d3b09d7c6f523710bc09843c628a5838b53..e11ad2719740d908f93ef580a6b308469365f402 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
@@ -26,6 +26,9 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
 namespace tensorflow {
 namespace tensorrt {
 namespace segment {
@@ -265,3 +268,6 @@ TEST_F(SegmentTest, BigIfElse) {
 }  // namespace segment
 }  // namespace tensorrt
 }  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/utils/test_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/test_utils.cc
index 3bcca99afbff8b84d2dd628ae9211ee94e86af2a..dd3c09d7e42358a1f9e6cc13be6198de58e38963 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/test_utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/test_utils.cc
@@ -19,7 +19,9 @@ limitations under the License.
 #include <vector>
 
 #include "re2/re2.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/compiler/tf2tensorrt/utils/test_utils.h b/tensorflow/compiler/tf2tensorrt/utils/test_utils.h
index bcd628b62f0320f7ce9dfe6240316d876f1d5a20..d85875991b79014c4f173d3157ed02e6c96f045c 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/test_utils.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/test_utils.h
@@ -16,8 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TEST_UTILS_H_
 #define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TEST_UTILS_H_
 
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
index bf111d3a2ee2fbec9151d12bbb6ff7181761c2aa..5213fced1ea9220422245172f5b4a3f584a2a566 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
@@ -135,7 +135,7 @@ void TRTInt8Calibrator::setDone() {
 
 void TRTInt8Calibrator::writeCalibrationCache(const void* ptr,
                                               std::size_t length) {
-  calibration_table_ = string((const char*)ptr, length);
+  calibration_table_ = string(static_cast<const char*>(ptr), length);
   VLOG(1) << "Got calibration data for " << engine_name_ << " @" << ptr
           << " length=" << length;
 }
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
index 10587e99624acfb97730bbbd9dfbcde020ffc669..aa70b07f8d79848c362275815004db32cca128be 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
@@ -34,7 +34,12 @@ namespace tensorrt {
 // TRTs pull model for calibration. When TRT implements a means for
 // a push calibration This class should be updated accordingly
 
+// IInt8EntropyCalibrator2 is prefferred for TRT 5.1+.
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 {
+#else
 struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
+#endif
  public:
   // Construct a calibrator for future calibration.
   TRTInt8Calibrator(
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
index f454f55f2cb4ee65b97891ae8dd58d809d36f099..6bc842ed5ca7e03018157060a332338cdc926f14 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
@@ -26,7 +26,7 @@ namespace tensorrt {
 void Logger::log(Severity severity, const char* msg) {
   // Suppress info-level messages
   switch (severity) {
-#if NV_TENSORRT_MAJOR >= 5 && NV_TENSORRT_MINOR >= 1
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
     case Severity::kVERBOSE:
 #endif
     case Severity::kINFO: {  // Mark TRT info messages as debug!
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.cc
deleted file mode 100644
index 0a72a88bc740101bcbadb40bfe106a5b8d284bbf..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace tensorflow {
-namespace tensorrt {
-
-std::shared_ptr<TRTResourceManager>
-tensorflow::tensorrt::TRTResourceManager::instance() {
-  static std::shared_ptr<TRTResourceManager> instance_(new TRTResourceManager);
-  return instance_;
-}
-
-std::shared_ptr<tensorflow::ResourceMgr>
-tensorflow::tensorrt::TRTResourceManager::getManager(const string& op_name) {
-  // mutex is held for lookup only. Most instantiations where mutex will be held
-  // longer will be during op creation and should be ok.
-  tensorflow::mutex_lock lock(map_mutex_);
-  auto s = managers_.find(op_name);
-  if (s == managers_.end()) {
-    auto it = managers_.emplace(
-        op_name, std::make_shared<tensorflow::ResourceMgr>(op_name));
-    VLOG(1) << "Returning a new manager " << op_name;
-    return it.first->second;
-  }
-  VLOG(1) << "Returning old manager " << op_name;
-  return s->second;
-}
-
-}  // namespace tensorrt
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.h b/tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.h
deleted file mode 100644
index 03879ffff2fa724b05cb1919753e4aaa99e2e702..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCE_MANAGER_H_
-#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCE_MANAGER_H_
-#include <memory>
-
-#include <string>
-#include <unordered_map>
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/platform/mutex.h"
-
-namespace tensorflow {
-namespace tensorrt {
-
-class TRTResourceManager {
-  TRTResourceManager() = default;
-
- public:
-  static std::shared_ptr<TRTResourceManager> instance();
-  // returns a manager for given op, if it doesn't exists it creates one
-  std::shared_ptr<tensorflow::ResourceMgr> getManager(const string& op_name);
-
- private:
-  std::unordered_map<string, std::shared_ptr<tensorflow::ResourceMgr>>
-      managers_;
-  tensorflow::mutex map_mutex_;
-};
-
-}  // namespace tensorrt
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCE_MANAGER_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_resources.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_resources.cc
index 37f7fe99fbb2b9e121953fc0de211db1bbf34b7a..2e553079b19a3e5d0739cc6ac79a84f3b6a1fc4e 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_resources.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_resources.cc
@@ -48,7 +48,7 @@ Status TRTCalibrationResource::SerializeToString(string* serialized) {
   calibrator_->waitAndSetDone();
   thr_->join();
   *serialized = calibrator_->getCalibrationTableAsString();
-  if (!serialized->size()) {
+  if (serialized->empty()) {
     return tensorflow::errors::Unknown("Calibration table is empty.");
   }
   return Status::OK();
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 02de95141da1a28e59d3155742217efdf163e8dd..7d9e7b9fc1f7ea83d6aa982afb5df097b0bdbf77 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -24,7 +24,7 @@ package(
 )
 
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
-load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library", "xla_py_proto_library")
 
 cc_library(
     name = "tf2xla_supported_ops_lib",
@@ -60,6 +60,14 @@ xla_proto_library(
     ],
 )
 
+xla_py_proto_library(
+    name = "tf2xla_py",
+    has_services = False,
+    api_version = 2,
+    visibility = ["//visibility:public"],
+    deps = [":tf2xla_proto"],
+)
+
 xla_proto_library(
     name = "host_compute_metadata_proto",
     srcs = ["host_compute_metadata.proto"],
@@ -283,6 +291,7 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
     ],
 )
 
@@ -448,6 +457,7 @@ cc_library(
     hdrs = [
         "dump_graph.h",
     ],
+    visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/compiler/jit:flags",
         "//tensorflow/core:framework",
diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc
index c8341a2c6bb66e43fb00cb660726cf5a1979c992..8aa162be47c9181e215de6a2eb660215135ff6eb 100644
--- a/tensorflow/compiler/tf2xla/functionalize_cond.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc
@@ -63,6 +63,23 @@ size_t AncestorNode::Hash::operator()(const AncestorNode& ancestor) const {
   return Hash64Combine(h, std::hash<int>()(static_cast<int>(ancestor.type)));
 }
 
+typedef std::tuple<StateMap::CondId, StateMap::AncestorId, OutputTensor>
+    ClusterTuple;
+
+struct ClusterTupleLessThan {
+  bool operator()(const ClusterTuple& a, const ClusterTuple& b) const {
+    if (std::tie(std::get<0>(a), std::get<1>(a)) <
+        std::tie(std::get<0>(b), std::get<1>(b))) {
+      return true;
+    } else if (std::tie(std::get<0>(a), std::get<1>(a)) ==
+               std::tie(std::get<0>(b), std::get<1>(b))) {
+      return StateMap::OutputTensorLess()(std::get<2>(a), std::get<2>(b));
+    } else {
+      return false;
+    }
+  }
+};
+
 // TODO(jpienaar): Move to OutputTensor.
 string DebugString(const OutputTensor& tensor) {
   return absl::StrCat(tensor.node->name(), ":", tensor.index);
@@ -744,9 +761,9 @@ Status Conditional::BuildIfNode(Graph* graph,
   }
   builder.Device(predicate_.node->assigned_device_name());
   // Conditional should be the first input ...
-  builder.Input(NodeDefBuilder::NodeOut(predicate_.node->name(),
-                                        predicate_.index,
-                                        predicate_.node->output_type(0)));
+  builder.Input(
+      NodeDefBuilder::NodeOut(predicate_.node->name(), predicate_.index,
+                              predicate_.node->output_type(predicate_.index)));
   // ... followed by the other inputs.
   builder.Input(inputs);
 
@@ -1393,16 +1410,30 @@ Status FunctionalizeCond::FunctionalizeInternal() {
   // Sort the merge nodes from innermost outwards.
   SortMergeNodes(&merge_order);
 
-  // Cluster merge nodes by CondId and AncestorId in order of nesting.
-  using ClusterPair = std::pair<StateMap::CondId, StateMap::AncestorId>;
+  // Cluster merge nodes by (CondId, AncestorId, predicate) in order of
+  // nesting. (CondId, AncestorId) is not enough, e.g.
+  //   pred1 = array_ops.placeholder(dtypes.bool, name='pred1')
+  //   pred2 = array_ops.placeholder(dtypes.bool, name='pred2')
+  //   cond1 = control_flow_ops.cond(pred1, ...)
+  //   cond2 = control_flow_ops.cond(pred2, ...)
+  //   cond3 = control_flow_ops.cond(pred1, use cond1 and cond2)
+  //   cond4 = control_flow_ops.cond(pred2, use cond1 and cond2)
+  // cond3 and cond4 have the same (CondId, AncestorId), but they should not
+  // be merged into one "If" node (because they have different predicates).
   std::deque<std::vector<Node*>> merge_clusters;
-  std::map<ClusterPair, int> merge_cluster_index;
+  std::map<ClusterTuple, int, ClusterTupleLessThan> merge_cluster_index;
   for (Node* merge : merge_order) {
     auto cond_id = state_map_.LookupCondId(merge);
     if (state_map_.IsDead(cond_id)) continue;
 
-    ClusterPair key =
-        std::make_pair(cond_id, state_map_.LookupAncestorId(merge));
+    auto predicate = merge_to_predicate_.find(merge);
+    if (predicate == merge_to_predicate_.end()) {
+      return errors::Internal("Cannot find predicate for Merge node ",
+                              merge->name());
+    }
+
+    ClusterTuple key = std::make_tuple(
+        cond_id, state_map_.LookupAncestorId(merge), predicate->second);
     auto idx = merge_cluster_index.find(key);
     if (idx == merge_cluster_index.end()) {
       merge_cluster_index[key] = merge_clusters.size();
@@ -1422,7 +1453,7 @@ Status FunctionalizeCond::FunctionalizeInternal() {
                      &state_map_);
     for (Node* merge : cluster) TF_RETURN_IF_ERROR(cond.AddMerge(merge));
     TF_RETURN_IF_ERROR(
-        cond.BuildAndReplace(graph_, library_, &merge_to_predicate_));
+        cond.BuildAndReplace(graph_, library_, &merge_to_replacement_));
 
     if (VLOG_IS_ON(4)) DumpGraphWithCondState("after_extract");
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 69353fe87d833fba2c8766ed185481f2238a190d..343568b2392595a2347bde41f0a2e2559fb1de19 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -107,11 +107,13 @@ tf_kernel_library(
         "xla_pad_op.cc",
         "xla_reduce_op.cc",
         "xla_select_and_scatter_op.cc",
+        "xla_self_adjoint_eig_op.cc",
     ],
     hdrs = [
         "index_ops.h",
         "shape_util.h",
     ],
+    tags = ["optonly"],
     deps = [
         ":conv_op_helpers",
         ":if_op",
@@ -143,8 +145,8 @@ tf_kernel_library(
         "//tensorflow/compiler/xla/client/lib:prng",
         "//tensorflow/compiler/xla/client/lib:qr",
         "//tensorflow/compiler/xla/client/lib:quantize",
+        "//tensorflow/compiler/xla/client/lib:self_adjoint_eig",
         "//tensorflow/compiler/xla/client/lib:sorting",
-        "//tensorflow/compiler/xla/client/lib:triangular_solve",
         "//tensorflow/core:bitwise_ops_op_lib",
         "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:data_flow_ops_op_lib",
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
index c2b4c28d1566f5429c5d8109db94af0c3762b131..a99c6ee4431852166eec0a71bb7ad74fd5c135d9 100644
--- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/prng.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -112,9 +113,12 @@ class CategoricalOp : public XlaOpKernel {
                                     xla::PrimitiveType type,
                                     XlaOpKernelContext* ctx) {
     xla::XlaBuilder* builder = ctx->builder();
-    auto uniforms =
-        xla::RngUniform(XlaHelpers::Zero(builder, input_type(0)),
-                        XlaHelpers::One(builder, input_type(0)), uniform_shape);
+    // We want a number in (0, 1) rather than [0, 1) or (0, 1]:
+    // * log(-log(0)) is ∞.
+    // * log(-log(1)) is -∞.
+    auto uniforms = xla::RngUniform(
+        xla::MinPositiveNormalValue(builder, type),
+        xla::One(builder, uniform_shape.element_type()), uniform_shape);
     return xla::Log(-xla::Log(uniforms));
   }
 
@@ -143,9 +147,13 @@ class StatelessCategoricalOp : public CategoricalOp {
     if (uniform_shape.element_type() == xla::BF16) {
       uniform_shape.set_element_type(xla::F32);
     }
+    // We want a number in (0, 1) rather than [0, 1) or (0, 1]:
+    // * log(-log(0)) is ∞.
+    // * log(-log(1)) is -∞.
     auto uniforms = xla::StatelessRngUniform(
-        {seed0, seed1}, uniform_shape, XlaHelpers::Zero(builder, DT_FLOAT),
-        XlaHelpers::One(builder, DT_FLOAT));
+        {seed0, seed1}, uniform_shape,
+        xla::MinPositiveNormalValue(builder, uniform_shape.element_type()),
+        xla::One(builder, uniform_shape.element_type()));
     return xla::ConvertElementType(xla::Log(-xla::Log(uniforms)), type);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
index 5f99b24e221ba6c926032ef7a1b4bf1e92df7a68..e8b270c67a23b876612ab1dba92a8ae7a46a392d 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
@@ -203,7 +203,8 @@ Status ConvBackpropComputeDimensionsV2XlaShapes(
     StringPiece label, int num_spatial_dims, const xla::Shape& input_shape,
     const xla::Shape& filter_shape, const xla::Shape& out_backprop_shape,
     absl::Span<const int32> dilations, const std::vector<int32>& strides,
-    Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims) {
+    Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims,
+    absl::Span<const int64> explicit_paddings) {
   TensorShape input_tensor_shape, filter_tensor_shape,
       out_backprop_tensor_shape;
   TF_RETURN_IF_ERROR(XLAShapeToTensorShape(input_shape, &input_tensor_shape));
@@ -212,8 +213,8 @@ Status ConvBackpropComputeDimensionsV2XlaShapes(
       XLAShapeToTensorShape(out_backprop_shape, &out_backprop_tensor_shape));
   return ConvBackpropComputeDimensionsV2(
       label, num_spatial_dims, input_tensor_shape, filter_tensor_shape,
-      out_backprop_tensor_shape, dilations, strides, padding,
-      /*explicit_paddings=*/{}, data_format, dims);
+      out_backprop_tensor_shape, dilations, strides, padding, explicit_paddings,
+      data_format, dims);
 }
 
 }  // anonymous namespace
@@ -227,10 +228,9 @@ xla::StatusOr<ConvOpAttrs> ConvOpAttrs::Create(int num_spatial_dims,
   TF_RETURN_IF_ERROR(ctx->GetAttr("dilations", &attrs.dilations));
   TF_RETURN_IF_ERROR(ctx->GetAttr("strides", &attrs.strides));
   TF_RETURN_IF_ERROR(ctx->GetAttr("padding", &attrs.padding));
-  // TODO(reedwm): Support explicit padding.
   if (attrs.padding == EXPLICIT) {
-    return errors::Unimplemented(
-        "XLA does not yet support Conv2D with explicit padding.");
+    TF_RETURN_IF_ERROR(
+        ctx->GetAttr("explicit_paddings", &attrs.explicit_paddings));
   }
 
   string data_format;
@@ -303,6 +303,11 @@ xla::StatusOr<xla::XlaOp> MakeXlaForwardConvOp(StringPiece /*type_string*/,
     window_strides[i] = attrs.strides.at(dim);
     rhs_dilation[i] = attrs.dilations.at(dim);
 
+    if (attrs.padding == EXPLICIT) {
+      padding[i] = {attrs.explicit_paddings.at(dim * 2),
+                    attrs.explicit_paddings.at(dim * 2 + 1)};
+    }
+
     int64 unused_output_size;
     TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2(
         input_shape.dimensions(dim), filter_shape.dimensions(i),
@@ -337,7 +342,7 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropInputConvOp(
   TF_RETURN_IF_ERROR(ConvBackpropComputeDimensionsV2XlaShapes(
       type_string, attrs.num_spatial_dims, input_shape, expanded_filter_shape,
       out_backprop_shape, attrs.dilations, attrs.strides, attrs.padding,
-      attrs.data_format, &dims));
+      attrs.data_format, &dims, attrs.explicit_paddings));
 
   // The input gradients are computed by a convolution of the output
   // gradients and the filter, with some appropriate padding. See the
@@ -420,7 +425,7 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
   TF_RETURN_IF_ERROR(ConvBackpropComputeDimensionsV2XlaShapes(
       type_string, attrs.num_spatial_dims, activations_shape,
       expanded_filter_shape, out_backprop_shape, attrs.dilations, attrs.strides,
-      attrs.padding, attrs.data_format, &dims));
+      attrs.padding, attrs.data_format, &dims, attrs.explicit_paddings));
 
   // The activations (inputs) form the LHS of the convolution.
   // Activations have shape: [batch, in_rows, in_cols, ..., in_depth]
@@ -469,6 +474,8 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
     int64 dim = GetTensorSpatialDimIndex(num_dims, attrs.data_format, i);
     dnums.add_input_spatial_dimensions(dim);
     dnums.add_kernel_spatial_dimensions(dim);
+    rhs_dilation[i] = dims.spatial_dims[i].stride;
+    window_strides[i] = attrs.dilations[dim];
 
     // We will also need to pad the input with zeros such that after the
     // convolution, we get the right size for the filter.
@@ -495,6 +502,8 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
     // We apply negative padding in this case.
     const int64 pad_total = padded_in_size - dims.spatial_dims[i].input_size;
 
+    // + For the EXPLICIT padding, we pad the top/left side with the explicit
+    //   padding and pad the bottom/right side with the remaining space.
     // + For the VALID padding, we don't pad anything on the top/left side
     //   and pad the bottom/right side with the remaining space.
     // + For the SAME padding, we pad top/left side the same as bottom/right
@@ -503,12 +512,12 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
     // In addition, if the padded input size is smaller than the input size,
     // we need to ignore some training elements of the input. We do this by
     // applying negative padding on the right/bottom.
-    const int64 pad_before =
-        attrs.padding == Padding::SAME ? std::max<int64>(pad_total / 2, 0) : 0;
-
+    const int64 pad_before = attrs.padding == Padding::EXPLICIT
+                                 ? attrs.explicit_paddings[2 * dim]
+                                 : attrs.padding == Padding::SAME
+                                       ? std::max<int64>(pad_total / 2, 0)
+                                       : 0;
     padding[i] = {pad_before, pad_total - pad_before};
-    rhs_dilation[i] = dims.spatial_dims[i].stride;
-    window_strides[i] = attrs.dilations[dim];
   }
 
   // Besides padding the input, we will also expand output_rows to
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h
index 6e1b70a47850ae5c05939f8dfb7ec129c031df21..d893eca7f9ba07dded76eb215af4779080fa66b9 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h
+++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h
@@ -47,6 +47,7 @@ struct ConvOpAttrs {
   std::vector<int32> dilations;
   std::vector<int32> strides;
   Padding padding;
+  std::vector<int64> explicit_paddings;
   TensorFormat data_format;
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
index b96d45316f626e678a64392a4315979eeeb6e83c..d19d48e5dd95962fe4a4e4026eaf6b06b7898564 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -134,14 +135,15 @@ int64 CalculateUpperPadding(int64 in_size, int64 out_size, int64 kernel_size,
 // If the 2D kernel would be very large, the 1D kernel can be applied once in
 // each dimension due to the symmetry of the kernel along all axis to reduce the
 // computational intensity.
-xla::XlaOp MakeBilinear1DKernel(xla::XlaBuilder* builder, int64 n) {
+xla::XlaOp MakeBilinear1DKernel(xla::XlaBuilder* builder,
+                                xla::PrimitiveType type, int64 n) {
   std::vector<float> kernel(n * 2 - 1);
   for (int64 i = 0; i < n; ++i) {
     float v = (i + 1.0f) / n;
     kernel[i] = v;
     kernel[n * 2 - 2 - i] = v;
   }
-  return xla::ConstantR1<float>(builder, kernel);
+  return xla::ConvertElementType(xla::ConstantR1<float>(builder, kernel), type);
 }
 
 // Unlike the bilinear kernel, which is triangular, the nearest neighbor
@@ -153,11 +155,12 @@ xla::XlaOp MakeBilinear1DKernel(xla::XlaBuilder* builder, int64 n) {
 // to the right (because an existing non TPU kernel
 // for nearest neighbor resize already chose to default to the right,
 // so we want to be consistent).
-xla::XlaOp MakeNearestNeighbor1DKernel(xla::XlaBuilder* builder, int64 n) {
+xla::XlaOp MakeNearestNeighbor1DKernel(xla::XlaBuilder* builder,
+                                       xla::PrimitiveType type, int64 n) {
   std::vector<float> kernel(n * 2 - 1, 0.0f);
   std::fill(&kernel[n / 2], &kernel[(3 * n) / 2], 1.0f);
 
-  return xla::ConstantR1<float>(builder, kernel);
+  return xla::ConvertElementType(xla::ConstantR1<float>(builder, kernel), type);
 }
 
 // Kernels with more than 16 spatial elements are considered intense and the
@@ -165,42 +168,66 @@ xla::XlaOp MakeNearestNeighbor1DKernel(xla::XlaBuilder* builder, int64 n) {
 const int64 kMax2DKernelSize = 16;
 
 xla::XlaOp MakeGeneralResizeKernel(xla::XlaBuilder* builder,
+                                   xla::PrimitiveType type,
                                    absl::Span<const int64> kernel_size,
                                    int64 channels, bool is_kernel_bilinear) {
   auto make_kernel_func =
       is_kernel_bilinear ? MakeBilinear1DKernel : MakeNearestNeighbor1DKernel;
 
-  auto depthwise_kernel = xla::Broadcast(
-      xla::Zero(builder, xla::F32),
-      {(2 * kernel_size[0] - 1), (2 * kernel_size[1] - 1), channels, 1});
+  std::vector<int64> depthwise_kernel_sizes = {
+      (2 * kernel_size[0] - 1), (2 * kernel_size[1] - 1), channels, 1};
+  auto depthwise_kernel =
+      xla::BroadcastInDim(make_kernel_func(builder, type, kernel_size[1]),
+                          depthwise_kernel_sizes, /*broadcast_dimensions=*/{1});
 
-  return xla::Mul(
-      xla::Add(depthwise_kernel, make_kernel_func(builder, kernel_size[1]),
-               /*broadcast_dimensions=*/{1}),
-      make_kernel_func(builder, kernel_size[0]),
-      /*broadcast_dimensions=*/{0});
+  return xla::Mul(depthwise_kernel,
+                  make_kernel_func(builder, type, kernel_size[0]),
+                  /*broadcast_dimensions=*/{0});
 }
 
 xla::XlaOp MakeGeneralResizeKernelInDim(xla::XlaBuilder* builder,
+                                        xla::PrimitiveType type,
                                         absl::Span<const int64> kernel_size,
                                         int64 channels, int64 dim,
                                         bool is_kernel_bilinear) {
   auto make_kernel_func =
       is_kernel_bilinear ? MakeBilinear1DKernel : MakeNearestNeighbor1DKernel;
 
-  auto depthwise_kernel =
-      xla::Broadcast(xla::Zero(builder, xla::F32),
-                     {dim == 0 ? (2 * kernel_size[0] - 1) : 1,
-                      dim == 1 ? (2 * kernel_size[1] - 1) : 1, channels, 1});
-  return xla::Add(depthwise_kernel, make_kernel_func(builder, kernel_size[dim]),
-                  /*broadcast_dimensions=*/{dim});
+  std::vector<int64> depthwise_kernel_sizes = {
+      dim == 0 ? (2 * kernel_size[0] - 1) : 1,
+      dim == 1 ? (2 * kernel_size[1] - 1) : 1, channels, 1};
+  return xla::BroadcastInDim(make_kernel_func(builder, type, kernel_size[dim]),
+                             depthwise_kernel_sizes,
+                             /*broadcast_dimensions=*/{dim});
+}
+
+xla::XlaOp BroadcastSpatialDimensions(xla::XlaBuilder* builder,
+                                      const xla::XlaOp& input,
+                                      int32 spatial_dimensions_offset,
+                                      absl::Span<const int64> in_size,
+                                      absl::Span<const int64> out_size) {
+  // Add broadcasts to handle expanding from a size == 1 dimension to a
+  // size > 1 dimension.
+  auto broadcast_shape_or_status = builder->GetShape(input);
+  if (!broadcast_shape_or_status.ok()) {
+    return builder->ReportError(broadcast_shape_or_status.status());
+  }
+  xla::Shape broadcast_shape = broadcast_shape_or_status.ValueOrDie();
+  for (int32 i = 0; i < in_size.size(); ++i) {
+    if (in_size[i] == 1 && out_size[i] > 1) {
+      broadcast_shape.set_dimensions(spatial_dimensions_offset + i,
+                                     out_size[i]);
+    }
+  }
+  return xla::BroadcastInDim(input, broadcast_shape.dimensions(),
+                             /*broadcast_dimensions=*/{0, 1, 2, 3});
 }
 
 xla::XlaOp ResizeUsingDilationAndConvolution(
-    xla::XlaBuilder* builder, const xla::XlaOp& input,
-    const int num_spatial_dims, std::vector<int64> in_size,
-    std::vector<int64> out_size, const int64 channels, const bool align_corners,
-    bool is_kernel_bilinear) {
+    xla::XlaBuilder* builder, const xla::XlaOp& input, xla::PrimitiveType type,
+    const int num_spatial_dims, absl::Span<const int64> in_size,
+    absl::Span<const int64> out_size, const int64 channels,
+    const bool align_corners, bool is_kernel_bilinear) {
   // Picture for a 1x3 to 1x4 bilinear resize:
   // stride = 2, kernel size = 3
   // Input:
@@ -287,7 +314,7 @@ xla::XlaOp ResizeUsingDilationAndConvolution(
   // Split convolutions into independent dimensions if they would be a very
   // large kernel.
   if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) {
-    xla::XlaOp kernel = MakeGeneralResizeKernel(builder, dims.kernel_size,
+    xla::XlaOp kernel = MakeGeneralResizeKernel(builder, type, dims.kernel_size,
                                                 channels, is_kernel_bilinear);
     output =
         xla::ConvGeneralDilated(input_data, kernel, dims.stride,
@@ -299,7 +326,7 @@ xla::XlaOp ResizeUsingDilationAndConvolution(
                                 /*feature_group_count=*/channels);
   } else {
     xla::XlaOp kernel0 = MakeGeneralResizeKernelInDim(
-        builder, dims.kernel_size, channels, 0, is_kernel_bilinear);
+        builder, type, dims.kernel_size, channels, 0, is_kernel_bilinear);
     output = xla::ConvGeneralDilated(
         input_data, kernel0, {dims.stride[0], 1},
         /*padding=*/
@@ -308,7 +335,7 @@ xla::XlaOp ResizeUsingDilationAndConvolution(
         /*rhs_dilation=*/{1, 1}, dimension_numbers,
         /*feature_group_count=*/channels);
     xla::XlaOp kernel1 = MakeGeneralResizeKernelInDim(
-        builder, dims.kernel_size, channels, 1, is_kernel_bilinear);
+        builder, type, dims.kernel_size, channels, 1, is_kernel_bilinear);
     output = xla::ConvGeneralDilated(
         output, kernel1, {1, dims.stride[1]},
         /*padding=*/
@@ -320,19 +347,14 @@ xla::XlaOp ResizeUsingDilationAndConvolution(
 
   // Add broadcasts to handle expanding from a size == 1 dimension to a
   // size > 1 dimension.
-  for (int i = 0; i < num_spatial_dims; ++i) {
-    if (in_size[i] == 1 && out_size[i] > 1) {
-      output = xla::Add(output, xla::ConstantR1<float>(builder, out_size[i], 0),
-                        /*broadcast_dimensions=*/{1 + i});
-    }
-  }
-  return output;
+  return BroadcastSpatialDimensions(
+      builder, output, /*spatial_dimensions_offset=*/1, in_size, out_size);
 }
 
 xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(
-    xla::XlaBuilder* builder, const xla::XlaOp& grad,
-    const int num_spatial_dims, std::vector<int64> in_size,
-    std::vector<int64> grad_size, const int64 channels,
+    xla::XlaBuilder* builder, const xla::XlaOp& grad, xla::PrimitiveType type,
+    const int num_spatial_dims, absl::Span<const int64> in_size,
+    absl::Span<const int64> grad_size, const int64 channels,
     const bool align_corners, bool is_kernel_bilinear) {
   ResizeConvolutionDims dims =
       ComputeResizeConvolutionParameters(in_size, grad_size, align_corners);
@@ -353,19 +375,14 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(
   dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims);
   xla::XlaOp output;
   if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) {
-    xla::XlaOp kernel = MakeGeneralResizeKernel(builder, dims.kernel_size,
+    xla::XlaOp kernel = MakeGeneralResizeKernel(builder, type, dims.kernel_size,
                                                 channels, is_kernel_bilinear);
 
     // Broadcast the input kernel where the forward op expanded from a size == 1
     // dimension to a size > 1 dimension. This has the effect of summing the
     // gradient contributions in that dimension.
-    for (int i = 0; i < num_spatial_dims; ++i) {
-      if (in_size[i] == 1 && grad_size[i] > 1) {
-        kernel =
-            xla::Add(kernel, xla::ConstantR1<float>(builder, grad_size[i], 0),
-                     /*broadcast_dimensions=*/{i});
-      }
-    }
+    kernel = BroadcastSpatialDimensions(
+        builder, kernel, /*spatial_dimensions_offset=*/0, in_size, grad_size);
 
     output = xla::ConvGeneralDilated(
         grad, kernel, /*window_strides=*/dims.kernel_size,
@@ -377,22 +394,22 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(
         /*feature_group_count=*/channels);
   } else {
     xla::XlaOp kernel0 = MakeGeneralResizeKernelInDim(
-        builder, dims.kernel_size, channels, 0, is_kernel_bilinear);
+        builder, type, dims.kernel_size, channels, 0, is_kernel_bilinear);
     xla::XlaOp kernel1 = MakeGeneralResizeKernelInDim(
-        builder, dims.kernel_size, channels, 1, is_kernel_bilinear);
+        builder, type, dims.kernel_size, channels, 1, is_kernel_bilinear);
 
     // Broadcast the input kernel where the forward op expanded from a
     // size == 1 dimension to a size > 1 dimension. This has the effect of
     // summing the gradient contributions in that dimension.
     if (in_size[0] == 1 && grad_size[0] > 1) {
-      kernel0 =
-          xla::Add(kernel0, xla::ConstantR1<float>(builder, grad_size[0], 0),
-                   /*broadcast_dimensions=*/{0});
+      kernel0 = BroadcastSpatialDimensions(builder, kernel0,
+                                           /*spatial_dimensions_offset=*/0, {1},
+                                           {grad_size[0]});
     }
     if (in_size[1] == 1 && grad_size[1] > 1) {
-      kernel1 =
-          xla::Add(kernel0, xla::ConstantR1<float>(builder, grad_size[1], 0),
-                   /*broadcast_dimensions=*/{1});
+      kernel1 = BroadcastSpatialDimensions(builder, kernel0,
+                                           /*spatial_dimensions_offset=*/0,
+                                           in_size, grad_size);
     }
 
     output = xla::ConvGeneralDilated(
@@ -423,7 +440,7 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(
     }
   }
   if (pad_output) {
-    output = xla::Pad(output, xla::ConstantR0<float>(builder, 0.0f), padding);
+    output = xla::Pad(output, xla::Zero(builder, type), padding);
   }
   return output;
 }
@@ -458,6 +475,7 @@ void GeneralCompile(XlaOpKernelContext* ctx, bool align_corners_,
   const int num_spatial_dims = 2;
 
   xla::XlaOp input = ctx->Input(0);
+  xla::PrimitiveType input_type = ctx->input_xla_type(0);
 
   // If in_size[i] > 1 and out_size[i] == 1, slice out the first input in
   // dimension i.
@@ -475,8 +493,11 @@ void GeneralCompile(XlaOpKernelContext* ctx, bool align_corners_,
                        {batch, in_size[0], in_size[1], channels}, {1, 1, 1, 1});
   }
 
-  // Output is always type float.
-  input = xla::ConvertElementType(input, xla::F32);
+  // Output is always type float if 'is_kernel_bilinear' is true.
+  if (is_kernel_bilinear) {
+    input = xla::ConvertElementType(input, xla::F32);
+    input_type = xla::F32;
+  }
 
   // Special Case:
   // Instead of doing a ResizeUsingDilationAndConvolution directly,
@@ -504,19 +525,19 @@ void GeneralCompile(XlaOpKernelContext* ctx, bool align_corners_,
         std::vector<int64> next_out_size = {(in_size[0] - 1) * 2 + 1,
                                             (in_size[1] - 1) * 2 + 1};
         output = ResizeUsingDilationAndConvolution(
-            b, input, num_spatial_dims, in_size, next_out_size, channels,
-            align_corners_, is_kernel_bilinear);
+            b, input, input_type, num_spatial_dims, in_size, next_out_size,
+            channels, align_corners_, is_kernel_bilinear);
         input = output;
         in_size = next_out_size;
       } else {
         output = ResizeUsingDilationAndConvolution(
-            b, input, num_spatial_dims, in_size, out_size, channels,
+            b, input, input_type, num_spatial_dims, in_size, out_size, channels,
             align_corners_, is_kernel_bilinear);
         in_size = out_size;
       }
     } else {
       output = ResizeUsingDilationAndConvolution(
-          b, input, num_spatial_dims, in_size, out_size, channels,
+          b, input, input_type, num_spatial_dims, in_size, out_size, channels,
           align_corners_, is_kernel_bilinear);
       in_size = out_size;
     }
@@ -631,19 +652,19 @@ class ResizeBilinearGradOp : public XlaOpKernel {
           std::vector<int64> next_grad_size = {(in_size[0] - 1) * 2 + 1,
                                                (in_size[1] - 1) * 2 + 1};
           output = ResizeUsingDilationAndConvolutionGradOp(
-              b, grad, num_spatial_dims, in_size, next_grad_size, channels,
-              align_corners_, true);
+              b, grad, xla::F32, num_spatial_dims, in_size, next_grad_size,
+              channels, align_corners_, true);
           grad = output;
           in_size = next_grad_size;
         } else {
           output = ResizeUsingDilationAndConvolutionGradOp(
-              b, grad, num_spatial_dims, in_size, grad_size, channels,
+              b, grad, xla::F32, num_spatial_dims, in_size, grad_size, channels,
               align_corners_, true);
           in_size = grad_size;
         }
       } else {
         output = ResizeUsingDilationAndConvolutionGradOp(
-            b, grad, num_spatial_dims, in_size, grad_size, channels,
+            b, grad, xla::F32, num_spatial_dims, in_size, grad_size, channels,
             align_corners_, true);
         in_size = grad_size;
       }
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
index 47cf8c6675bc120653c2a5ab6d4b07376dc382ee..39d96e748b3a2a852c03c0dd53ec175f0c66a43a 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
@@ -25,9 +25,6 @@ limitations under the License.
 namespace tensorflow {
 
 EIGEN_STRONG_INLINE void argmax_float_1d_xla_impl(void* out, void** data) {
-  // data is managed by the JIT code so msan can't tell it's initialized.
-  TF_ANNOTATE_MEMORY_IS_INITIALIZED(data, 2 * sizeof(void*));
-
   float* input = static_cast<float*>(data[0]);
   int64 input_size = *static_cast<int64*>(data[1]);
 
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
index 90c0ebefb24ec2c4378782e9b15d3f57c33032a4..5a6569c8954d1686dc9d7577a66feb720241ea13 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
@@ -15,7 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace tensorflow {
 namespace {
@@ -31,7 +32,10 @@ class MatrixTriangularSolveOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     auto result = xla::TriangularSolve(
         ctx->Input(0), ctx->Input(1), /*left_side=*/true,
-        /*lower=*/lower_, /*transpose_a=*/adjoint_, /*conjugate_a=*/adjoint_);
+        /*lower=*/lower_, /*unit_diagonal=*/false,
+        /*transpose_a=*/
+        adjoint_ ? xla::TriangularSolveOptions::ADJOINT
+                 : xla::TriangularSolveOptions::NO_TRANSPOSE);
     ctx->SetOutput(0, result);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index 01b047f732f0e9fb3b45b272e7886e2f8cf4fff4..d6c70d4af1c2e921b70b0869f0163c8481017c7d 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/loops.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -279,9 +280,9 @@ class TruncatedNormalOp : public XlaOpKernel {
 
     xla::XlaBuilder* b = ctx->builder();
 
-    xla::XlaOp one = XlaHelpers::FloatLiteral(b, dtype, 1.0);
+    xla::XlaOp one = xla::One(b, xla_shape.element_type());
     xla::XlaOp min_positive =
-        XlaHelpers::FloatLiteral(b, dtype, std::numeric_limits<float>::min());
+        xla::MinPositiveNormalValue(b, xla_shape.element_type());
     auto uniform = xla::RngUniform(min_positive, one, xla_shape);
     ctx->SetOutput(0, TruncatedNormal(uniform));
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
index a95e7adacf194ba6eb33cbeb56abe1a5a2479337..a1c18bed3f94008af8038f32324c79aa5b2abded 100644
--- a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
@@ -110,10 +110,16 @@ class ScatterNdOp : public XlaOpKernel {
     auto updates = context->Input(1);
     auto result =
         XlaScatter(buffer, updates, indices,
-                   /*indices_are_vectors=*/true, /*combiner=*/{}, builder);
+                   /*indices_are_vectors=*/true, /*combiner=*/Combine, builder);
     OP_REQUIRES_OK(context, result.status());
     context->SetOutput(0, result.ValueOrDie());
   }
+
+ private:
+  static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y,
+                            xla::XlaBuilder* builder) {
+    return xla::Add(x, y);
+  }
 };
 
 REGISTER_XLA_OP(Name("ScatterNd").CompileTimeConstantInput("shape"),
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
index 31d4cc131600f360c764ffa02831046c85d846e5..280b68383c28d1b9d88f7b2ac0f8fab47244c05d 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
@@ -104,7 +104,7 @@ class SizeOp : public XlaOpKernel {
     for (int64 i = 0; i < rank; ++i) {
       size = xla::Mul(size, xla::GetDimensionSize(ctx->Input(0), i));
     }
-    size = xla::ConvertElementType(size, xla::S32);
+    size = xla::ConvertElementType(size, ctx->output_xla_type(0));
     ctx->SetOutput(0, size);
   }
 };
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
index 50653d7b3973b73d580cdeec5d71943b575d7cc9..17f067e0dfcf4f8b360ee6db934df3e373d5fdd1 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -218,8 +218,8 @@ class StatelessTruncatedNormalOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(DT_FLOAT, shape, &xla_shape));
     auto uniform = xla::StatelessRngUniform(
         {seed0, seed1}, xla_shape,
-        xla::ConstantR0<float>(builder, std::numeric_limits<float>::min()),
-        xla::ConstantR0<float>(builder, 1.0));
+        xla::MinPositiveNormalValue(builder, xla_shape.element_type()),
+        xla::One(builder, xla_shape.element_type()));
     auto output = TruncatedNormal(uniform);
     output = MaybeConvertF32ToBF16(output, dtype_);
     ctx->SetOutput(0, output);
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
index 65020012283d9c5f62e5e2fd11fc2bf1110e019a..8958a48bc79dce91c41ab7d0a5fc0fbb401112ba 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -35,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -69,6 +71,43 @@ class TensorListLengthOp : public XlaOpKernel {
 
 REGISTER_XLA_OP(Name("TensorListLength"), TensorListLengthOp);
 
+// Creates an empty list with size (leading_dim, *element_shape) if
+// element_shape is known at compile time. Otherwise creates one with size
+// (leading_dim, 0) which gets initialized later in `GetInitializedList`.
+Status CreateZerosList(XlaOpKernelContext* ctx, int element_shape_index,
+                       int64 leading_dim, DataType dtype, xla::XlaOp* list) {
+  TensorShape list_shape;
+  list_shape.AddDim(leading_dim);
+  xla::XlaOp element_shape_handle = ctx->Input(element_shape_index);
+  TF_ASSIGN_OR_RETURN(
+      bool is_element_shape_compile_time_const,
+      element_shape_handle.builder()->IsConstant(element_shape_handle));
+  PartialTensorShape partial_element_shape;
+  if (is_element_shape_compile_time_const) {
+    TF_RETURN_IF_ERROR(ctx->ConstantInputAsPartialShape(
+        element_shape_index, &partial_element_shape));
+  }
+  if (is_element_shape_compile_time_const &&
+      partial_element_shape.IsFullyDefined()) {
+    TensorShape element_shape;
+    partial_element_shape.AsTensorShape(&element_shape);
+    list_shape.AppendShape(element_shape);
+  } else {
+    // If element_shape is not a compile time constant or if it is not fully
+    // defined we will have to wait for the first write call to fully allocate
+    // the array.
+    // TODO(srbs): We are using element_shape of [0] as a proxy to denote an
+    // uninitialized list. A better implementation may be to represent the
+    // list as a 3-tuple containining an explicit "initialized" flag. However,
+    // we would still need to create a dummy tensor for the first tuple
+    // element.
+    list_shape.AddDim(0);
+  }
+  *list = xla::Broadcast(XlaHelpers::Zero(ctx->builder(), dtype),
+                         list_shape.dim_sizes());
+  return Status::OK();
+}
+
 class TensorListReserveOp : public XlaOpKernel {
  public:
   explicit TensorListReserveOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
@@ -76,20 +115,15 @@ class TensorListReserveOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    TensorShape element_shape;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &element_shape));
     int64 num_elements;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &num_elements));
 
-    TensorShape tensor_shape;
-    tensor_shape.AddDim(num_elements);
-    tensor_shape.AppendShape(element_shape);
+    xla::XlaOp list;
+    OP_REQUIRES_OK(ctx, CreateZerosList(ctx, 0, num_elements, dtype_, &list));
 
     xla::XlaBuilder* b = ctx->builder();
     ctx->SetTensorListOutput(
-        0, xla::Tuple(b, {xla::Broadcast(XlaHelpers::Zero(b, dtype_),
-                                         tensor_shape.dim_sizes()),
-                          xla::ConstantR0<int32>(b, num_elements)}));
+        0, xla::Tuple(b, {list, xla::ConstantR0<int32>(b, num_elements)}));
   }
 
  private:
@@ -110,8 +144,6 @@ class EmptyTensorListOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    TensorShape element_shape;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &element_shape));
     int64 max_num_elements;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &max_num_elements));
     OP_REQUIRES(
@@ -119,15 +151,13 @@ class EmptyTensorListOp : public XlaOpKernel {
         errors::InvalidArgument("XLA compilation requires a fixed tensor list "
                                 "size. Set the max number of elements."));
 
-    TensorShape tensor_shape;
-    tensor_shape.AddDim(max_num_elements);
-    tensor_shape.AppendShape(element_shape);
+    xla::XlaOp list;
+    OP_REQUIRES_OK(ctx,
+                   CreateZerosList(ctx, 0, max_num_elements, dtype_, &list));
 
     xla::XlaBuilder* b = ctx->builder();
     ctx->SetTensorListOutput(
-        0, xla::Tuple(b, {xla::Broadcast(XlaHelpers::Zero(b, dtype_),
-                                         tensor_shape.dim_sizes()),
-                          xla::ConstantR0<int32>(b, 0)}));
+        0, xla::Tuple(b, {list, xla::ConstantR0<int32>(b, 0)}));
   }
 
  private:
@@ -274,6 +304,36 @@ REGISTER_XLA_OP(
     Name("TensorListFromTensor").CompileTimeConstantInput("element_shape"),
     TensorListFromTensorOp);
 
+// Returns the 0'th element of `tuple` containing the list tensor if it has been
+// initialized already else creates one lazily. This allows lazy initialization
+// of the list on the first call to SetItem or PushBack.
+Status GetInitializedList(XlaOpKernelContext* ctx, const xla::XlaOp& tuple,
+                          const TensorShape& element_shape, DataType dtype,
+                          xla::XlaOp* list) {
+  *list = xla::GetTupleElement(tuple, 0);
+  TensorShape list_shape;
+  TF_RETURN_IF_ERROR(GetTensorListShape(ctx->builder(), tuple, &list_shape));
+  int64 leading_dim = list_shape.dim_size(0);
+  TensorShape list_element_shape = list_shape;
+  list_element_shape.RemoveDim(0);
+  // This checks for the lazy initialization contract set by CreateEmptyList.
+  // In TensorListReserve if the element_shape is not known at compile time,
+  // it creates a list with shape [leading_dim, 0].
+  if (element_shape != list_element_shape) {
+    if (list_element_shape.num_elements() != 0) {
+      return errors::InvalidArgument(
+          "Invalid shape of value in TensorListSetItem. Expected: ",
+          list_element_shape.DebugString(),
+          " Actual: ", element_shape.DebugString());
+    }
+    list_shape = element_shape;
+    list_shape.InsertDim(0, leading_dim);
+    *list = xla::Broadcast(XlaHelpers::Zero(ctx->builder(), dtype),
+                           list_shape.dim_sizes());
+  }
+  return Status::OK();
+}
+
 class TensorListSetItemOp : public XlaOpKernel {
  public:
   explicit TensorListSetItemOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
@@ -285,7 +345,9 @@ class TensorListSetItemOp : public XlaOpKernel {
     xla::XlaOp tl = ctx->Input(0);
     TensorShape elem_shape = ctx->InputShape(2);
 
-    xla::XlaOp ta = xla::GetTupleElement(tl, 0);
+    xla::XlaOp list;
+    OP_REQUIRES_OK(ctx, GetInitializedList(ctx, tl, elem_shape, dtype_, &list));
+
     xla::XlaOp index = ctx->Input(1);
     xla::XlaOp value = ctx->Input(2);
 
@@ -299,8 +361,8 @@ class TensorListSetItemOp : public XlaOpKernel {
     auto update = xla::Reshape(value, slice_shape.dim_sizes());
 
     ctx->SetTensorListOutput(
-        0, xla::Tuple(b, {xla::DynamicUpdateSlice(ta, update, start_indices),
-                          index + xla::ConstantR0<int32>(b, 1)}));
+        0, xla::Tuple(b, {xla::DynamicUpdateSlice(list, update, start_indices),
+                          xla::GetTupleElement(tl, 1)}));
   }
 
  private:
@@ -319,11 +381,14 @@ class TensorListPushBackOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     xla::XlaBuilder* b = ctx->builder();
-    xla::XlaOp tl = ctx->Input(0);
+    xla::XlaOp list_tuple = ctx->Input(0);
     TensorShape elem_shape = ctx->InputShape(1);
 
-    xla::XlaOp ta = xla::GetTupleElement(tl, 0);
-    xla::XlaOp index = xla::GetTupleElement(tl, 1);
+    xla::XlaOp list;
+    OP_REQUIRES_OK(
+        ctx, GetInitializedList(ctx, list_tuple, elem_shape, dtype_, &list));
+
+    xla::XlaOp index = xla::GetTupleElement(list_tuple, 1);
     xla::XlaOp value = ctx->Input(1);
 
     // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
@@ -336,7 +401,7 @@ class TensorListPushBackOp : public XlaOpKernel {
     auto update = xla::Reshape(value, slice_shape.dim_sizes());
 
     ctx->SetTensorListOutput(
-        0, xla::Tuple(b, {xla::DynamicUpdateSlice(ta, update, start_indices),
+        0, xla::Tuple(b, {xla::DynamicUpdateSlice(list, update, start_indices),
                           index + xla::ConstantR0<int32>(b, 1)}));
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
index 26d4214099d1d07c1b2e275d783654d9cd948e28..ceb762038009f7a3ff80d9ad4066af43d54a9e34 100644
--- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
@@ -856,15 +856,12 @@ class ResourceApplyAdadelta : public XlaOpKernel {
     xla::XlaOp grad = ctx->Input(6);
 
     xla::XlaBuilder* b = ctx->builder();
-    xla::XlaOp neg_half = XlaHelpers::FloatLiteral(b, dtype_, -0.5);
-    xla::XlaOp half = XlaHelpers::FloatLiteral(b, dtype_, 0.5);
     xla::XlaOp one = XlaHelpers::FloatLiteral(b, dtype_, 1.0);
-    xla::XlaOp two = XlaHelpers::FloatLiteral(b, dtype_, 2.0);
 
-    accum = rho * accum + (one - rho) * xla::Pow(grad, two);
-    xla::XlaOp update = xla::Pow(accum_update + epsilon, half) *
-                        xla::Pow(accum + epsilon, neg_half) * grad;
-    accum_update = rho * accum_update + (one - rho) * xla::Pow(update, two);
+    accum = rho * accum + (one - rho) * xla::Square(grad);
+    xla::XlaOp update =
+        xla::Sqrt(accum_update + epsilon) * xla::Rsqrt(accum + epsilon) * grad;
+    accum_update = rho * accum_update + (one - rho) * xla::Square(update);
     var = var - update * lr;
     OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var));
     OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype_, accum));
diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
index 76793d677ba45f8e863e684a149da684c8ce8787..4ac714306248302242902f20d45d2609ef2c7cd3 100644
--- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 // helper.
 
 #include "tensorflow/core/kernels/transpose_op.h"
+#include "tensorflow/compiler/tf2xla/lib/scatter.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -128,29 +129,46 @@ class InvertPermutationOp : public XlaOpKernel {
                 errors::InvalidArgument("permutation of nonnegative int32s "
                                         "must have <= int32 max elements"));
 
-    std::vector<int64> perm;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(0, &perm));
-
-    int size = perm.size();
+    auto e = ctx->InputExpression(0);
+    auto tensor_or_status = e.ResolveConstant(ctx->compiler()->client());
+    OP_REQUIRES_OK(ctx, tensor_or_status.status());
+    // If the input is a constant, we also want the output to be a constant.
+    // Some models rely on the result of InvertPermutation being a constant.
+    // TODO(b/32495713): Remove this when we can check whether Scatter is
+    // constant. Right now, we always assume it is non-constant because we don't
+    // check the embedded computation.
+    if (tensor_or_status.ValueOrDie().has_value()) {
+      std::vector<int64> perm;
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(0, &perm));
+
+      int size = perm.size();
+
+      std::vector<int32> output(size);
+      std::fill_n(output.data(), size, -1);
+      for (int i = 0; i < size; ++i) {
+        const int64 d = perm[i];
+        OP_REQUIRES(ctx, FastBoundsCheck(d, size),
+                    errors::InvalidArgument(d, " is not between 0 and ", size));
+        OP_REQUIRES(ctx, output[d] == -1,
+                    errors::InvalidArgument(d, " is duplicated in the input."));
+        output[d] = i;
+      }
 
-    std::vector<int32> output(size);
-    std::fill_n(output.data(), size, -1);
-    for (int i = 0; i < size; ++i) {
-      const int64 d = perm[i];
-      OP_REQUIRES(ctx, FastBoundsCheck(d, size),
-                  errors::InvalidArgument(d, " is not between 0 and ", size));
-      OP_REQUIRES(ctx, output[d] == -1,
-                  errors::InvalidArgument(d, " is duplicated in the input."));
-      output[d] = i;
+      ctx->SetOutput(0, xla::ConstantR1<int32>(ctx->builder(), output));
+    } else {
+      auto indices = ctx->Input(0);
+      int size = ctx->InputShape(0).num_elements();
+      auto iota = xla::Iota(ctx->builder(), xla::S32, size);
+      auto result = XlaScatter(iota, iota, indices,
+                               /*indices_are_vectors=*/false, /*combiner=*/{},
+                               ctx->builder());
+      OP_REQUIRES_OK(ctx, result.status());
+      ctx->SetOutput(0, result.ValueOrDie());
     }
-
-    ctx->SetOutput(0, xla::ConstantR1<int32>(ctx->builder(), output));
   }
 };
 
-REGISTER_XLA_OP(Name("InvertPermutation")
-                    .TypeConstraint("T", DT_INT32)
-                    .CompileTimeConstantInput("x"),
+REGISTER_XLA_OP(Name("InvertPermutation").TypeConstraint("T", DT_INT32),
                 InvertPermutationOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index 4544e03491438d5f21cf986bc952572bd19d548c..62b5cd32da59063f8ce07119fd085f91ec3a1bc4 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -89,8 +89,9 @@ xla::XlaOp Sigmoid(xla::XlaOp x) {
 }
 XLAJIT_MAKE_UNARY(Sigmoid, Sigmoid(x));
 
-// Returns 0 if x is 0, -1 if x < 0 and 1 if x > 0.
-XLAJIT_MAKE_UNARY(Sign, xla::Sign(x));
+// Returns 0 if x is NaN, 0 if x is 0, -1 if x < 0 and 1 if x > 0.
+XLAJIT_MAKE_UNARY(Sign,
+                  xla::Select(xla::Ne(x, x), xla::ZerosLike(x), xla::Sign(x)));
 XLAJIT_MAKE_UNARY(Sinh, xla::Sinh(x));
 
 // softplus(x) = log(1 + exp(x))
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..233ac8e7b455403f8ee65b95b1403ecefdb92c6b
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc
@@ -0,0 +1,66 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h"
+#include "tensorflow/core/lib/core/bits.h"
+
+namespace tensorflow {
+namespace {
+
+class XlaSelfAdjointEigOp : public XlaOpKernel {
+ public:
+  explicit XlaSelfAdjointEigOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("lower", &lower_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("max_iter", &max_iter_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
+  }
+  void Compile(XlaOpKernelContext* ctx) override {
+    auto result =
+        xla::SelfAdjointEig(ctx->Input(0), lower_, max_iter_, epsilon_);
+    ctx->SetOutput(0, result.w);
+    ctx->SetOutput(1, result.v);
+  }
+
+ private:
+  bool lower_;
+  int32 max_iter_;
+  float epsilon_;
+};
+
+class SelfAdjointEigV2Op : public XlaOpKernel {
+ public:
+  explicit SelfAdjointEigV2Op(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorShape input_shape = ctx->InputShape("input");
+    int n = input_shape.dim_size(input_shape.dims() - 1);
+    // This is based on heuristics that approx log(n) sweep updates are needed.
+    // Note: the heuristics provides no theoretical guarantee, max_iter=100 and
+    // epsilon should be used to determine exit condition.
+    int max_iter = 2 * tensorflow::Log2Ceiling(n);
+    auto result = xla::SelfAdjointEig(ctx->Input(0), true, max_iter, 1e-6);
+    ctx->SetOutput(0, result.w);
+    ctx->SetOutput(1, result.v);
+  }
+};
+
+REGISTER_XLA_OP(Name("XlaSelfAdjointEig").TypeConstraint("T", kFloatTypes),
+                XlaSelfAdjointEigOp);
+REGISTER_XLA_OP(Name("SelfAdjointEigV2").TypeConstraint("T", kFloatTypes),
+                SelfAdjointEigV2Op);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
index af641131ed76a8d6a7291c360302fa17c94af014..ccd58071d350e605e0e1f0c2b43643a400e32c2c 100644
--- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
@@ -56,6 +56,41 @@ lhs_output: the broadcasted LHS tensor
 rhs_output: the broadcasted RHS tensor
 )doc");
 
+REGISTER_OP("XlaSelfAdjointEig")
+    .Input("a: T")
+    .Attr("lower: bool")
+    .Attr("max_iter: int")
+    .Attr("epsilon: float")
+    .Output("w: T")
+    .Output("v: T")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Attr("T: numbertype")
+    .Doc(R"doc(
+Computes the eigen decomposition of a batch of self-adjoint matrices
+(Note: Only real inputs are supported).
+
+Computes the eigenvalues and eigenvectors of the innermost N-by-N matrices in
+tensor such that tensor[...,:,:] * v[..., :,i] = e[..., i] * v[...,:,i], for
+i=0...N-1.
+
+a: the input tensor.
+
+lower: a boolean specifies whether the calculation is done with the lower
+  triangular part or the upper triangular part.
+
+max_iter: maximum number of sweep update, i.e., the whole lower triangular
+  part or upper triangular part based on parameter lower. Heuristically, it has
+  been argued that approximatly logN sweeps are needed in practice (Ref: Golub &
+  van Loan "Matrix Computation").
+
+epsilon: the tolerance ratio.
+
+w: The eigenvalues in ascending order, each repeated according to its
+  multiplicity.
+v: The column v[..., :, i] is the normalized eigenvector corresponding to the
+  eigenvalue w[..., i].
+)doc");
+
 REGISTER_OP("XlaConv")
     .Input("lhs: T")
     .Input("rhs: T")
diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py
index 345193c936a885e5a9e468979c4b73b5b0c9e5c2..de4710d03a3e69afb04aa68e37961698f0e3a300 100644
--- a/tensorflow/compiler/tf2xla/python/xla.py
+++ b/tensorflow/compiler/tf2xla/python/xla.py
@@ -291,6 +291,10 @@ def dot_general(lhs, rhs, dimension_numbers, precision_config=None, name=None):
       name=name)
 
 
+def self_adjoint_eig(a, lower, max_iter, epsilon):
+  return gen_xla_ops.xla_self_adjoint_eig(a, lower, max_iter, epsilon)
+
+
 dynamic_slice = gen_xla_ops.xla_dynamic_slice
 dynamic_update_slice = gen_xla_ops.xla_dynamic_update_slice
 
diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc
index cf48576ec2746fb29779633275eac4c638b91e45..28a4566c9d284fb8410a2d618f368c4dd2c1d893 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla.cc
@@ -254,7 +254,8 @@ Status CreateXlaArgs(const Graph& graph,
 
 // Converts the TensorFlow graph into an XLA computation, by executing the
 // graph symbolically, with each op building up the XLA HLO.
-Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
+Status ConvertGraphToXla(std::unique_ptr<Graph> graph,
+                         const tf2xla::Config& config, xla::Client* client,
                          xla::XlaComputation* computation) {
   XlaOpRegistry::RegisterCompilationKernels();
   for (Node* node : graph->nodes()) {
@@ -264,6 +265,19 @@ Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
   std::vector<XlaCompiler::Argument> xla_args;
   TF_RETURN_IF_ERROR(CreateXlaArgs(*graph, &xla_args));
 
+  // Populate arguments with resource variables from the config. The variables
+  // get turned into inputs and outputs.
+  for (const tf2xla::Variable& variable : config.variable()) {
+    XlaCompiler::Argument arg;
+    arg.type = variable.type();
+    arg.kind = XlaCompiler::Argument::kResource;
+    arg.shape = variable.shape();
+    arg.name = variable.node_name();
+    arg.resource_kind = XlaResource::kVariable;
+    arg.initialized = true;
+    xla_args.push_back(std::move(arg));
+  }
+
   // Compile the graph into an XLA computation.
   XlaCompiler::Options compiler_options;
   compiler_options.client = client;
@@ -361,7 +375,8 @@ Status ConvertGraphDefToXla(const GraphDef& graph_def,
                             xla::XlaComputation* computation) {
   std::unique_ptr<Graph> graph;
   TF_RETURN_IF_ERROR(InitGraph(graph_def, config, &graph));
-  TF_RETURN_IF_ERROR(ConvertGraphToXla(std::move(graph), client, computation));
+  TF_RETURN_IF_ERROR(
+      ConvertGraphToXla(std::move(graph), config, client, computation));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/tf2xla.proto b/tensorflow/compiler/tf2xla/tf2xla.proto
index 18c9089f5fa0e9792a4763d9bfac4c4e826eb5b2..5627af7452b99da594c1c214d0b556d8d70544d5 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.proto
+++ b/tensorflow/compiler/tf2xla/tf2xla.proto
@@ -39,6 +39,15 @@ message Fetch {
   string name = 2;  // Optional name for generated code.
 };
 
+// Variable represents a resource variable with the given name, shape and type.
+message Variable {
+  string node_name = 1;
+  string name =
+      2;  // Optional name for generated code. If empty, node_name will be used.
+  TensorShapeProto shape = 3;
+  DataType type = 4;
+}
+
 // Config represents configuration information for tf2xla conversion.
 message Config {
   // Each feed is a positional input argument for the generated computation.
@@ -47,4 +56,6 @@ message Config {
   // Each fetch is a positional output argument for the generated computation.
   // The order of each entry matches the order of each output argument.
   repeated Fetch fetch = 2;
+  // Each variable is a named input and output of the generated computation.
+  repeated Variable variable = 3;
 };
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index c64f78e1a1bcdd40b1c885889ec5fa491cfa1f66..88c03a6056ac6484013c3fd32c9889899b5c15c5 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -122,7 +122,12 @@ Status ReplaceArgUsageWithConstNode(
 
   for (const auto& iter : const_input_index_to_node) {
     int arg_index = iter.first;
-    Node* const_node = g->CopyNode(iter.second);
+    NodeDef const_def = iter.second->def();
+    const_def.set_name(g->NewName(const_def.name()));
+    Status s;
+    Node* const_node = g->AddNode(const_def, &s);
+    TF_RETURN_IF_ERROR(s);
+
     Node* arg_node = arg_nodes[arg_index];
 
     // Collect all usages of the _Arg node.
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
index 9e9c3cecee68aee856141a620f7292f771978acb..28b4744470e7d28863b5f7275f829b9bd59641e1 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
@@ -363,5 +363,58 @@ TEST(PropagateConstIntoFunctionalNodes, WhileLoopWithResourceInput) {
   TF_EXPECT_OK(PropagateConstIntoFunctionalNodes(&graph, &fld, &fld));
 }
 
+TEST(PropagateConstIntoFunctionalNodes, CopiedConstNodeHasUniqueName) {
+  FunctionLibraryDefinition fld(OpRegistry::Global(), {});
+  {
+    // Cond graph & body graph.
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto pred = ops::_Arg(scope.WithOpName("arg0"), DT_BOOL, 0);
+    auto input = ops::_Arg(scope.WithOpName("arg1"), DT_BOOL, 1);
+    auto duplicate_name = ops::NoOp(scope.WithOpName("duplicate_name"));
+    auto ret = ops::_Retval(scope.WithOpName("ret"), pred, 0);
+    Graph graph(OpRegistry::Global());
+    TF_ASSERT_OK(scope.ToGraph(&graph));
+    FunctionDef cond_fdef;
+    TF_ASSERT_OK(GraphToFunctionDef(graph, "cond", &cond_fdef));
+    TF_ASSERT_OK(fld.AddFunctionDef(cond_fdef));
+    FunctionDef body_fdef;
+    TF_ASSERT_OK(GraphToFunctionDef(graph, "body", &body_fdef));
+    TF_ASSERT_OK(fld.AddFunctionDef(body_fdef));
+  }
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto pred =
+      ops::Const(scope.WithOpName("duplicate_name"), false, TensorShape({}));
+  auto input = ops::Const(scope.WithOpName("input"), false, TensorShape({}));
+  NameAttrList cond_fn, body_fn;
+  cond_fn.set_name("cond");
+  body_fn.set_name("body");
+  auto while_op =
+      ops::While(scope.WithOpName("while"),
+                 std::initializer_list<Input>{pred, input}, cond_fn, body_fn);
+  Graph graph(OpRegistry::Global());
+  TF_ASSERT_OK(scope.ToGraph(&graph));
+
+  TF_EXPECT_OK(PropagateConstIntoFunctionalNodes(&graph, &fld, &fld));
+
+  // Check that in rewritten body function, the NoOp node still has name
+  // "duplicate_name", and the copied Const node has name "duplicate_name/_0".
+  auto node_name_index = graph.BuildNodeNameIndex();
+  Node* while_node = node_name_index["while"];
+  ASSERT_NE(while_node, nullptr);
+  TF_ASSERT_OK(GetNodeAttr(while_node->def(), "body", &body_fn));
+  const FunctionDef* rewritten_body_fn = fld.Find(body_fn.name());
+  ASSERT_NE(rewritten_body_fn, nullptr);
+  std::unordered_map<string, NodeDef> nodes;
+  for (const NodeDef& node_def : rewritten_body_fn->node_def()) {
+    nodes[node_def.name()] = node_def;
+  }
+  auto noop_def = nodes.find("duplicate_name");
+  ASSERT_NE(noop_def, nodes.end());
+  EXPECT_EQ(noop_def->second.op(), "NoOp");
+  auto const_def = nodes.find("duplicate_name/_0");
+  ASSERT_NE(const_def, nodes.end());
+  EXPECT_EQ(const_def->second.op(), "Const");
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index ddb284966eeb97cc7c9d3ed77fb313e567975e59..5bd0277c051711f2677b90a2679662899521e94a 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -60,8 +60,6 @@ class XlaCompilationAllocator : public Allocator {
   // buffers, so they get ids to track.
   bool ShouldAllocateEmptyTensors() override { return true; }
 
-  void GetStats(AllocatorStats* stats) override { stats->Clear(); }
-
  private:
   // Don't run any constructors or destructors for complex objects,
   // since there is no backing store for the tensor to run them
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 514b156deb9f350813237c31b7657a5b09c800dd..3221ec5b727de1f792cd61b792ee917588d56cf9 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -43,6 +43,8 @@ limitations under the License.
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
@@ -183,9 +185,10 @@ Status BuildComputation(
   std::vector<xla::XlaOp> elems;
   elems.reserve(retvals.size());
 
-  // Keeps track of which retvals have layout to update. The first element is
-  // the output index, second element is the new layout.
-  std::vector<std::pair<int64, xla::Layout>> retval_to_update_layout;
+  // Keeps track of the layout of each retval. If a retval is not in this list,
+  // a descending layout is used. The first element is the output index, second
+  // element is the new layout.
+  std::vector<std::pair<int64, xla::Layout>> retval_index_and_layout;
   for (int i = 0; i < retvals.size(); ++i) {
     XlaCompiler::OutputDescription& output = (*outputs)[i];
     const XlaExpression& retval = retvals[i];
@@ -214,7 +217,7 @@ Status BuildComputation(
           TF_ASSIGN_OR_RETURN(xla::Shape shape, shape_representation_fn(
                                                     output.shape, output.type));
           value = xla::Reshape(value, xla::AsInt64Slice(shape.dimensions()));
-          retval_to_update_layout.emplace_back(elems.size(), shape.layout());
+          retval_index_and_layout.emplace_back(elems.size(), shape.layout());
         } else if (it != retval_cores.end()) {
           // Apply the sharding to the output, if there is a core assignment.
           value = identity_op(value);
@@ -287,6 +290,11 @@ Status BuildComputation(
       // Ensures the correct sharding is applied to the output.
       handle = identity_op(handle);
 
+      // Set layout of the retval to device representation layout.
+      if (resource->representation_shape().has_value()) {
+        retval_index_and_layout.emplace_back(
+            elems.size(), resource->representation_shape()->layout());
+      }
       elems.push_back(handle);
     }
   }
@@ -316,15 +324,15 @@ Status BuildComputation(
                       computation->GetProgramShape());
   *output_shape = program_shape.result();
   // Update the output layout to the layout of retval.
-  for (auto& update : retval_to_update_layout) {
+  for (auto& index_and_layout : retval_index_and_layout) {
     if (!always_return_tuple && elems.size() == 1) {
-      *output_shape->mutable_layout() = update.second;
+      *output_shape->mutable_layout() = index_and_layout.second;
       continue;
     }
 
-    xla::Shape* output_sub_shape =
-        xla::ShapeUtil::GetMutableSubshape(output_shape, {update.first});
-    *output_sub_shape->mutable_layout() = update.second;
+    xla::Shape* output_sub_shape = xla::ShapeUtil::GetMutableSubshape(
+        output_shape, {index_and_layout.first});
+    *output_sub_shape->mutable_layout() = index_and_layout.second;
   }
   return Status::OK();
 }
@@ -1108,8 +1116,17 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
   result->outputs.resize(context->retvals().size());
   std::vector<XlaExpression> retvals = context->retvals();
   if (options.resolve_compile_time_constants) {
-    TF_RETURN_IF_ERROR(ResolveConstantExpressionsToConstants(
-        client(), absl::Span<XlaExpression>(retvals)));
+    Status status = ResolveConstantExpressionsToConstants(
+        client(), absl::Span<XlaExpression>(retvals));
+
+    // If the HloEvaluator has not implemented an expression, just evaluate it
+    // at runtime.
+    if (status.code() == error::UNIMPLEMENTED) {
+      ConvertConstantsToExpressions(&builder,
+                                    absl::Span<XlaExpression>(retvals));
+    } else {
+      TF_RETURN_IF_ERROR(status);
+    }
   } else {
     ConvertConstantsToExpressions(&builder, absl::Span<XlaExpression>(retvals));
   }
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 492010f7317d32a8a620147cd2cd9356d4f13fde..b31137867d738944eaaa73e142ad8538ec6b854a 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -277,6 +277,97 @@ TEST_F(XlaCompilerTest, OutOfOrderGraph) {
   EXPECT_TRUE(xla::LiteralTestUtil::Equal(param0_literal, actual_literal));
 }
 
+// Tests that the compiler can correctly propagate the layout assigned by
+// shape_representation_fn_ to return types.
+TEST_F(XlaCompilerTest, HonorShapeRepresentationFnForRetVal) {
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1);
+  // Adds an identity op around the resource to make sure identity ops propagate
+  // resources correctly.
+  auto identity = ops::Identity(scope.WithOpName("VIdentity"), var);
+  auto write = ops::AssignAddVariableOp(scope, identity, a);
+  auto read = ops::ReadVariableOp(
+      scope.WithControlDependencies(std::vector<Operation>{write}), var,
+      DT_INT32);
+  auto read_plus_one = ops::Add(scope, read, ops::Const<int32>(scope, 1));
+  auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0);
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(2);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2, 3});
+  args[1].kind = XlaCompiler::Argument::kResource;
+  args[1].resource_kind = XlaResource::kVariable;
+  args[1].initialized = true;
+  args[1].type = DT_INT32;
+  args[1].shape = TensorShape({2, 3});
+
+  auto options = DefaultOptions();
+  options.shape_representation_fn =
+      [](const TensorShape& shape, DataType dt) -> xla::StatusOr<xla::Shape> {
+    xla::Shape xla_shape;
+    TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dt, shape, &xla_shape));
+    *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1});
+    return xla_shape;
+  };
+  // Compiles the graph.
+  XlaCompiler compiler(options);
+
+  XlaCompiler::CompilationResult result;
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
+                                     std::move(graph), args, &result));
+  xla::Shape transposed =
+      xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {0, 1});
+  // Check that the return shapes are correctly tranposed.
+  EXPECT_EQ(result.xla_output_shape,
+            xla::ShapeUtil::MakeTupleShape({transposed, transposed}));
+}
+
+// The layout of resource variable shouldn't change after transpose
+TEST_F(XlaCompilerTest, TransposeVariables) {
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1);
+  // Adds an identity op around the resource to make sure identity ops propagate
+  // resources correctly.
+  auto identity = ops::Identity(scope.WithOpName("VIdentity"), var);
+  auto write = ops::AssignAddVariableOp(scope, identity, a);
+  auto read = ops::ReadVariableOp(
+      scope.WithControlDependencies(std::vector<Operation>{write}), var,
+      DT_INT32);
+  auto transposed_read = ops::Transpose(scope, read, {1, 0});
+  auto reshape = ops::Reshape(scope, transposed_read, {2, 3});
+  auto d = ops::_Retval(scope.WithOpName("D"), reshape, 0);
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(2);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2, 3});
+  args[1].kind = XlaCompiler::Argument::kResource;
+  args[1].resource_kind = XlaResource::kVariable;
+  args[1].initialized = true;
+  args[1].type = DT_INT32;
+  args[1].shape = TensorShape({2, 3});
+  // Compiles the graph.
+  XlaCompiler compiler(DefaultOptions());
+
+  XlaCompiler::CompilationResult result;
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "transpose",
+                                     std::move(graph), args, &result));
+  xla::Shape transposed =
+      xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {1, 0});
+  // Check that the return shapes are correctly tranposed.
+  EXPECT_EQ(result.xla_output_shape,
+            xla::ShapeUtil::MakeTupleShape({transposed, transposed}));
+}
+
 // Tests that the compiler doesn't reorder the parameters.
 TEST_F(XlaCompilerTest, MixedOrderArguments) {
   for (bool swap_order : {false, true}) {
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index 6139bf3cea0790c2697130a993e92be96c81848b..3f787fd86c9f7366a7728dcf146a3797ba672bc3 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -76,7 +76,7 @@ XlaResource* XlaContext::AddResource(std::unique_ptr<XlaResource> resource) {
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) {
-  return LookupOrCreate(type, &max_func_, [this, type] {
+  return LookupOrCreate(type, &max_func_, [type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Max() for " << type_string;
     xla::XlaBuilder b("max<" + type_string + ">");
@@ -92,7 +92,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) {
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateMin(const DataType type) {
-  return LookupOrCreate(type, &min_func_, [this, type] {
+  return LookupOrCreate(type, &min_func_, [type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Min() for " << type_string;
     xla::XlaBuilder b("min<" + type_string + ">");
@@ -108,7 +108,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMin(const DataType type) {
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateAdd(const DataType type) {
-  return LookupOrCreate(type, &add_func_, [this, type] {
+  return LookupOrCreate(type, &add_func_, [type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Add() for " << type_string;
     xla::XlaBuilder b("add<" + type_string + ">");
@@ -124,7 +124,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateAdd(const DataType type) {
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateMul(const DataType type) {
-  return LookupOrCreate(type, &mul_func_, [this, type] {
+  return LookupOrCreate(type, &mul_func_, [type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Mul() for " << type_string;
     xla::XlaBuilder b("mul<" + type_string + ">");
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 04a5d934064a9083a41cc210b48df65bbc862fff..7bb1ad27467a5b281626de4203169e575288f9ee 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -81,61 +81,27 @@ xla::XlaOp XlaHelpers::FloatLiteral(xla::XlaBuilder* b, DataType data_type,
   return Status::OK();
 }
 
-template <typename T>
-static Tensor MakeLinspaceTensor(const TensorShape& shape, int64 depth) {
-  Tensor linspace(DataTypeToEnum<T>::v(), shape);
-  auto linspace_flat = linspace.flat<T>();
-  for (int64 i = 0; i < depth; ++i) {
-    linspace_flat(i) = i;
-  }
-  return linspace;
-}
-
 Status XlaHelpers::OneHot(xla::XlaBuilder* builder, int64 depth, int axis,
                           DataType index_type, const TensorShape& indices_shape,
                           const xla::XlaOp& indices, const xla::XlaOp& on_value,
                           const xla::XlaOp& off_value, xla::XlaOp* one_hot) {
-  const int indices_dims = indices_shape.dims();
-  const int output_dims = indices_dims + 1;
-
-  TensorShape output_shape = indices_shape;
-  output_shape.InsertDim(axis, depth);
-
-  // Build a Tensor populated with values 0, 1, 2, ... depth.
-  std::vector<int64> linspace_dims(output_dims, 1);
-  linspace_dims[axis] = depth;
-  TensorShape linspace_shape(linspace_dims);
-  Tensor linspace;
-  switch (index_type) {
-    case DT_UINT8:
-      linspace = MakeLinspaceTensor<uint8>(linspace_shape, depth);
-      break;
-    case DT_INT32:
-      linspace = MakeLinspaceTensor<int32>(linspace_shape, depth);
-      break;
-    case DT_INT64:
-      linspace = MakeLinspaceTensor<int64>(linspace_shape, depth);
-      break;
-    default:
-      return errors::InvalidArgument("Invalid argument type ",
-                                     DataTypeString(index_type));
-  }
-
-  xla::BorrowingLiteral linspace_literal;
-  TF_RETURN_IF_ERROR(HostTensorToBorrowingLiteral(linspace, &linspace_literal));
-
   // Broadcast the linspace constant across the indices along the new axis,
   // and test equality at each position.
   std::vector<int64> broadcast_dims(indices_shape.dims());
   std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0);
   std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1);
-  xla::XlaOp one_hot_bool = xla::Eq(
-      indices, xla::ConstantLiteral(builder, linspace_literal), broadcast_dims);
+
+  TensorShape output_shape = indices_shape;
+  output_shape.InsertDim(axis, depth);
+  xla::Shape iota_shape;
+  TF_RETURN_IF_ERROR(
+      TensorShapeToXLAShape(index_type, output_shape, &iota_shape));
 
   // Selects the user-provided off_value and on_value values.
-  *one_hot = xla::Select(one_hot_bool,
-                         xla::Broadcast(on_value, output_shape.dim_sizes()),
-                         xla::Broadcast(off_value, output_shape.dim_sizes()));
+  *one_hot = xla::Select(
+      xla::Eq(indices, xla::Iota(builder, iota_shape, axis), broadcast_dims),
+      xla::Broadcast(on_value, output_shape.dim_sizes()),
+      xla::Broadcast(off_value, output_shape.dim_sizes()));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index 78bc2c94425e00c2b26058daf609d71f1853664e..ee11f3a3de658c7e5108605122b84fbc3e1cd963 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -319,6 +319,27 @@ Status XlaOpKernelContext::ConstantInputAsShape(int index, TensorShape* shape) {
   return Status::OK();
 }
 
+Status XlaOpKernelContext::ConstantInputAsPartialShape(
+    int index, PartialTensorShape* shape) {
+  xla::Literal literal;
+  TF_RETURN_IF_ERROR(ConstantInput(index, &literal));
+  // If `literal` is a scalar it's value must be -1.
+  if (literal.shape().rank() == 0) {
+    int64 shape_val;
+    TF_RETURN_IF_ERROR(LiteralToInt64Scalar(literal, &shape_val));
+    if (shape_val != -1) {
+      return errors::InvalidArgument(
+          "Cannot convert value to PartialTensorShape: ", shape_val);
+    }
+    *shape = PartialTensorShape();  // Shape with unknown rank.
+    return Status::OK();
+  }
+  std::vector<int64> dims;
+  TF_RETURN_IF_ERROR(LiteralToInt64Vector(literal, &dims));
+  *shape = PartialTensorShape(dims);
+  return Status::OK();
+}
+
 Status XlaOpKernelContext::InputList(absl::string_view name,
                                      std::vector<xla::XlaOp>* handles,
                                      std::vector<TensorShape>* shapes) {
@@ -447,6 +468,16 @@ void XlaOpKernelContext::SetOutputExpression(int index,
   }
 }
 
+xla::PrimitiveType XlaOpKernelContext::output_xla_type(int index) {
+  xla::PrimitiveType type;
+  Status status = DataTypeToPrimitiveType(expected_output_dtype(index), &type);
+  if (!status.ok()) {
+    SetStatus(status);
+    return xla::PRIMITIVE_TYPE_INVALID;
+  }
+  return type;
+}
+
 void XlaOpKernelContext::SetOutput(int index, const xla::XlaOp& handle) {
   SetOutputExpression(
       index,
@@ -503,6 +534,7 @@ Status AssignVariableTensor(const Tensor& tensor, DataType type,
     handle = xla::Reshape(handle,
                           xla::AsInt64Slice(representation_shape.dimensions()));
   }
+  variable->SetRepresentationShape(representation_shape);
   return variable->SetValue(handle);
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index e44415f60bff82fb92d0cf4ec81935564a2f083a..cc2d5e8de3eb020ba41dfed7d730b48cd0534b4c 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -138,6 +138,10 @@ class XlaOpKernelContext {
   // Converts a constant 1D int32 or int64 tensor into a TensorShape.
   Status ConstantInputAsShape(int index, TensorShape* shape);
 
+  // Converts a constant 1D int32 or int64 tensor, or a scalar with value -1
+  // into a PartialTensorShape.
+  Status ConstantInputAsPartialShape(int index, PartialTensorShape* shape);
+
   // Returns the named list-valued immutable input in "list", as
   // defined in the OpDef.  If the named output is not list-valued,
   // returns a one-element list.
@@ -155,6 +159,11 @@ class XlaOpKernelContext {
     return context_->expected_output_dtype(index);
   }
 
+  // Returns the type of output `index` as an xla::PrimitiveType. If the type
+  // is not representable as an XLA type, sets an error status and returns
+  // xla::PRIMITIVE_TYPE_INVALID.
+  xla::PrimitiveType output_xla_type(int index);
+
   // Sets output `index` to the XlaOp `handle`.
   // All outputs should be set using SetOutput and SetConstantOutput, not
   // via the underlying OpKernelContext.
diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h
index 736588bb8b89ba756cdce77eeebff8d1fcf4774c..ab3a5bdd9bc580c16d65d35c3be3ba8204511f83 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.h
+++ b/tensorflow/compiler/tf2xla/xla_resource.h
@@ -86,6 +86,12 @@ class XlaResource {
   // variables have new values that need to be written back.
   const xla::XlaOp& initial_value() const { return initial_value_; }
 
+  // An xla shape that indicates how this resource variable is represented on
+  // device.
+  const absl::optional<xla::Shape>& representation_shape() const {
+    return representation_shape_;
+  }
+
   // A variable is initialized if it has a value.
   bool initialized() const { return value_.valid(); }
 
@@ -100,6 +106,11 @@ class XlaResource {
   // Sets the current value of the resource to an all-zero value.
   Status SetZeroValue(xla::XlaBuilder* builder);
 
+  // Sets the representational shape of the resource on device.
+  void SetRepresentationShape(const xla::Shape& shape) {
+    representation_shape_ = absl::make_optional(shape);
+  }
+
   // Looks up the gradient for `source`, or creates it if it does not already
   // exist. The call target must be an initialized TensorArray resource. A
   // TensorArray can have multiple named gradients; see the operator
@@ -160,6 +171,10 @@ class XlaResource {
   xla::XlaOp value_;
   xla::XlaOp initial_value_;
 
+  // An xla shape that indicates how this resource variable is represented on
+  // device.
+  absl::optional<xla::Shape> representation_shape_;
+
   int64 max_array_size_ = -1;
   bool tensor_array_multiple_writes_aggregate_ = false;
 
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 636e5ef721f58c009566c10a653d09a7667619c0..ee6f7d5956ede4af99498ca0df5de47150cc5e4d 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -150,8 +150,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":status",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
         "//tensorflow/stream_executor/lib",
     ],
 )
@@ -194,7 +192,7 @@ cc_library(
         ":types",
         ":util",
         "//tensorflow/core:lib",
-        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
     ],
 )
 
@@ -833,7 +831,6 @@ cc_library(
             "//tensorflow/compiler/xla:xla_proto",
             "//tensorflow/compiler/xla/service:hlo",
             "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
             "@com_google_absl//absl/strings",
         ],
 )
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index b30ab84240286fe4eb145fc893ba3f3f7ab26d00..c5dea5f18030f2d226c86e3408ea85b2b5989728 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -65,7 +65,6 @@ cc_library(
         "//tensorflow/compiler/xla/client/lib:loops",
         "//tensorflow/compiler/xla/client/lib:matrix",
         "//tensorflow/compiler/xla/client/lib:slicing",
-        "//tensorflow/compiler/xla/client/lib:triangular_solve",
         "//tensorflow/core:lib",
     ],
 )
@@ -231,6 +230,7 @@ cc_library(
     deps = [
         ":arithmetic",
         ":constants",
+        ":slicing",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:status_macros",
@@ -376,6 +376,7 @@ cc_library(
     srcs = ["sorting.cc"],
     hdrs = ["sorting.h"],
     deps = [
+        ":comparators",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
@@ -451,42 +452,48 @@ cc_library(
 )
 
 cc_library(
-    name = "triangular_solve",
-    srcs = ["triangular_solve.cc"],
-    hdrs = ["triangular_solve.h"],
+    name = "self_adjoint_eig",
+    srcs = ["self_adjoint_eig.cc"],
+    hdrs = ["self_adjoint_eig.h"],
     deps = [
-        "//tensorflow/compiler/xla:literal",
+        ":arithmetic",
+        ":comparators",
+        ":constants",
+        ":loops",
+        ":math",
+        ":matrix",
+        ":slicing",
+        "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:math",
-        "//tensorflow/compiler/xla/client/lib:matrix",
-        "//tensorflow/compiler/xla/client/lib:slicing",
         "//tensorflow/core:lib",
     ],
 )
 
 xla_test(
-    name = "triangular_solve_test",
-    srcs = ["triangular_solve_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-        "noasan",  # sometimes times out, http://b/78650012
+    name = "self_adjoint_eig_test",
+    srcs = ["self_adjoint_eig_test.cc"],
+    blacklisted_backends = [
+        "cpu",
+        "gpu",
     ],
+    real_hardware_only = True,
+    shard_count = 10,
+    tags = ["optonly"],
     deps = [
-        ":math",
+        ":arithmetic",
+        ":constants",
         ":matrix",
-        ":triangular_solve",
+        ":self_adjoint_eig",
         "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
diff --git a/tensorflow/compiler/xla/client/lib/cholesky.cc b/tensorflow/compiler/xla/client/lib/cholesky.cc
index 414bd1494cd32f32a5c37e84119de930678a776b..bb41f9932d1cc62b62d37fea2c10fbfeaa0bd15e 100644
--- a/tensorflow/compiler/xla/client/lib/cholesky.cc
+++ b/tensorflow/compiler/xla/client/lib/cholesky.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/lib/slicing.h"
-#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
@@ -194,12 +193,12 @@ XlaOp Cholesky(XlaOp a, int64 block_size,
         // l[i+k:, i:i+k] =
         //     trsm_right_transpose(l[i:i+k, i:i+k], a[i+k:, i:i+k])
         auto panel = SliceInMinorDims(a, {i + k, i}, {n, i + k});
-        auto update = TriangularSolve(factorized, panel,
-                                      /*left_side=*/false,
-                                      /*lower=*/true,
-                                      /*transpose_a=*/true,
-                                      /*conjugate_a=*/false,
-                                      /*block_size=*/block_size);
+        auto update =
+            TriangularSolve(factorized, panel,
+                            /*left_side=*/false,
+                            /*lower=*/true,
+                            /*unit_diagonal=*/false,
+                            /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
         l = UpdateSliceInMinorDims(l, update, {i + k, i});
       }
     }
diff --git a/tensorflow/compiler/xla/client/lib/constants.cc b/tensorflow/compiler/xla/client/lib/constants.cc
index 1ada7b4a964ccf7ca400b937abbe425bef083468..6bd56a8df0a5d0417f747a158664ed0daa8a7b40 100644
--- a/tensorflow/compiler/xla/client/lib/constants.cc
+++ b/tensorflow/compiler/xla/client/lib/constants.cc
@@ -80,6 +80,24 @@ XlaOp MinFiniteValue(XlaBuilder* builder, PrimitiveType type) {
   }
 }
 
+XlaOp MinPositiveNormalValue(XlaBuilder* builder, PrimitiveType type) {
+  switch (type) {
+    case F16:
+      return ConstantR0<Eigen::half>(builder,
+                                     std::numeric_limits<Eigen::half>::min());
+    case BF16:
+      return ConstantR0<bfloat16>(builder, bfloat16::min_positive_normal());
+    case F32:
+      return ConstantR0<float>(builder, std::numeric_limits<float>::min());
+    case F64:
+      return ConstantR0<double>(builder, std::numeric_limits<double>::min());
+    default:
+      return builder->ReportError(
+          InvalidArgument("Invalid type for MinPositiveNormalValue (%s).",
+                          PrimitiveType_Name(type)));
+  }
+}
+
 XlaOp MaxValue(XlaBuilder* builder, PrimitiveType type) {
   return ConstantLiteral(builder, LiteralUtil::MaxValue(type));
 }
@@ -100,4 +118,28 @@ XlaOp MaxFiniteValue(XlaBuilder* builder, PrimitiveType type) {
   }
 }
 
+XlaOp NanValue(XlaBuilder* builder, PrimitiveType type) {
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    switch (type) {
+      case F16:
+        return ConstantR0<Eigen::half>(
+            builder, Eigen::NumTraits<Eigen::half>::quiet_NaN());
+      case BF16:
+        return ConstantR0<bfloat16>(
+            builder, bfloat16(std::numeric_limits<float>::quiet_NaN()));
+      case F32:
+        return ConstantR0<float>(builder,
+                                 std::numeric_limits<float>::quiet_NaN());
+      case F64:
+        return ConstantR0<double>(builder,
+                                  std::numeric_limits<double>::quiet_NaN());
+      default:
+        return InvalidArgument(
+            "Operand to NanValue was %s, but must be a real-valued "
+            "floating-point type.",
+            PrimitiveType_Name(type));
+    }
+  });
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/constants.h b/tensorflow/compiler/xla/client/lib/constants.h
index 4e5310a380e8bda15348dae2cbb0ea9e2c381bcb..47b8f1b44ffa12b2b15be0e865d693a709962e6e 100644
--- a/tensorflow/compiler/xla/client/lib/constants.h
+++ b/tensorflow/compiler/xla/client/lib/constants.h
@@ -135,6 +135,9 @@ XlaOp MinValue(XlaBuilder* builder, PrimitiveType type);
 // point type, this is equal to -MaxFiniteValue().
 XlaOp MinFiniteValue(XlaBuilder* builder, PrimitiveType type);
 
+// Returns the minimum positive normal value for floating-point type `type`.
+XlaOp MinPositiveNormalValue(XlaBuilder* builder, PrimitiveType type);
+
 // Returns the maximum representable finite or infinite value for 'type'.
 // Returns 'inf' for floating-point types.
 XlaOp MaxValue(XlaBuilder* builder, PrimitiveType type);
@@ -142,6 +145,9 @@ XlaOp MaxValue(XlaBuilder* builder, PrimitiveType type);
 // Returns the maximum representable finite value for 'type'.
 XlaOp MaxFiniteValue(XlaBuilder* builder, PrimitiveType type);
 
+// Returns a nan for the given type.  Only valid for real-valued fp types.
+XlaOp NanValue(XlaBuilder* builder, PrimitiveType type);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CONSTANTS_H_
diff --git a/tensorflow/compiler/xla/client/lib/constants_test.cc b/tensorflow/compiler/xla/client/lib/constants_test.cc
index f4320f65c1f76d4d4c384110b39d6606773aaf01..180175b7495b32250af8ae77c8c7fba804703885 100644
--- a/tensorflow/compiler/xla/client/lib/constants_test.cc
+++ b/tensorflow/compiler/xla/client/lib/constants_test.cc
@@ -155,5 +155,12 @@ XLA_TEST_F(ConstantsTest, MaxValueF32) {
                              {});
 }
 
+XLA_TEST_F(ConstantsTest, NanValueF32) {
+  XlaBuilder builder(TestName());
+  NanValue(&builder, F32);
+  ComputeAndCompareR0<float>(&builder, std::numeric_limits<float>::quiet_NaN(),
+                             {});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc
index 14891206855725f1ba71bda9f92134d7c7eb9217..19d98d100191fcba590d64c643d76b2bc5d5e5c5 100644
--- a/tensorflow/compiler/xla/client/lib/math.cc
+++ b/tensorflow/compiler/xla/client/lib/math.cc
@@ -79,6 +79,34 @@ XlaOp IsNan(XlaOp operand) {
   });
 }
 
+XlaOp IsNegZero(XlaOp operand) {
+  auto& b = *operand.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("IsNegZero", operand));
+    TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(operand));
+
+    // The bitwise representation of -0 in bfloat16 and IEEE 754 is 0x80...0
+    // (sign bit on, all other bits off).
+    switch (shape.element_type()) {
+      case F64:
+        return Eq(BitcastConvertType(operand, U64),
+                  ConstantR0WithType(&b, U64, uint64{1} << 63));
+      case F32:
+        return Eq(BitcastConvertType(operand, U32),
+                  ConstantR0WithType(&b, U32, uint32{1} << 31));
+      case F16:
+      case BF16:
+        // Not all XLA backends handle U16 well, so we convert to F32/U32.
+        // TODO(jlebar): It would be nice if we could stay in (B)F16/U16 for
+        // backends that *do* support it.
+        return Eq(BitcastConvertType(ConvertElementType(operand, F32), U32),
+                  ConstantR0WithType(&b, U32, uint32{1} << 31));
+      default:
+        LOG(FATAL) << "Expected real fp type.";
+    }
+  });
+}
+
 XlaOp Sqrt(XlaOp operand) { return Pow(operand, ScalarLike(operand, 0.5)); }
 
 XlaOp Rsqrt(XlaOp operand) { return Pow(operand, ScalarLike(operand, -0.5)); }
diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h
index 907571c9a3ec65b0be0087ad4837c842a0bdcc79..b036fa299d92988439dfecbb5415865071d5577d 100644
--- a/tensorflow/compiler/xla/client/lib/math.h
+++ b/tensorflow/compiler/xla/client/lib/math.h
@@ -28,6 +28,11 @@ XlaOp IsNegInf(XlaOp operand);
 XlaOp IsInf(XlaOp operand);
 XlaOp IsNan(XlaOp operand);
 
+// Determines whether operand is equal to -0.
+//
+// Raises an error for integral or complex values.
+XlaOp IsNegZero(XlaOp operand);
+
 // Returns the next number after 'from' in the direction of 'to' the same way
 // std::nextafter(from, to) would.
 XlaOp NextAfter(XlaOp from, XlaOp to);
diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc
index 364ac5876abbec825834081518a6dfda84356048..bdfb0575f573716b54cf9116d155d8a3a55056e8 100644
--- a/tensorflow/compiler/xla/client/lib/math_test.cc
+++ b/tensorflow/compiler/xla/client/lib/math_test.cc
@@ -88,6 +88,22 @@ class MathTypedTest : public MathTest {
             {false, false, false, false, false, false, false, true, true}));
     ComputeAndCompareLiteral(&b, expected, {});
   }
+
+  void TestIsNegZero() {
+    SetFastMathDisabled(true);
+    XlaBuilder b(TestName());
+    T inf(std::numeric_limits<float>::infinity());
+    T nan(std::numeric_limits<float>::quiet_NaN());
+    IsNegZero(AddParam(
+        LiteralUtil::CreateR1<T>({T{-0.0}, T{0}, T{1}, T{-1}, inf, -inf, nan}),
+        &b));
+
+    ComputeAndCompareLiteral(
+        &b,
+        LiteralUtil::CreateR1<bool>(
+            {true, false, false, false, false, false, false}),
+        {}, error_spec_);
+  }
 };
 
 // TODO(b/123355973): Add bfloat16 to TestTypes once it's working.
@@ -102,6 +118,7 @@ TYPED_TEST_CASE(MathTypedTest, TestTypes);
 XLA_TYPED_TEST(MathTypedTest, LogEdgeCases) { this->TestLogEdgeCases(); }
 XLA_TYPED_TEST(MathTypedTest, Log1pEdgeCases) { this->TestLog1pEdgeCases(); }
 XLA_TYPED_TEST(MathTypedTest, IsInfOrNan) { this->TestIsInfOrNan(); }
+XLA_TYPED_TEST(MathTypedTest, IsNegZero) { this->TestIsNegZero(); }
 
 // Check that certain ops only support real, floating-point inputs.
 //
diff --git a/tensorflow/compiler/xla/client/lib/matrix.cc b/tensorflow/compiler/xla/client/lib/matrix.cc
index a5aea96090c59c78d20cfc10a4bd6b312be592c1..a055a8e625c680cf5232896c95cd35b78cb172bc 100644
--- a/tensorflow/compiler/xla/client/lib/matrix.cc
+++ b/tensorflow/compiler/xla/client/lib/matrix.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status.h"
@@ -45,7 +46,7 @@ XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m,
   return ConvertElementType(indicator, type);
 }
 
-XlaOp GetMatrixDiagonal(XlaOp x) {
+XlaOp GetMatrixDiagonal(XlaOp x, int k) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
@@ -53,10 +54,13 @@ XlaOp GetMatrixDiagonal(XlaOp x) {
     TF_RET_CHECK(n_dims >= 2);
     const int64 m = shape.dimensions(n_dims - 2);
     const int64 n = shape.dimensions(n_dims - 1);
+
+    auto offset = ConstantR0WithType(builder, S32, k);
+
     absl::Span<const int64> major_dims =
         AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
-    auto a = Iota(builder, U32, n);
-    auto b = Iota(builder, U32, m);
+    auto a = Iota(builder, S32, n);
+    auto b = Iota(builder, S32, m) + offset;
     auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
     auto mask = Broadcast(indicator, major_dims);
 
@@ -66,9 +70,21 @@ XlaOp GetMatrixDiagonal(XlaOp x) {
         primitive_util::IsIntegralType(shape.element_type())
             ? CreateScalarOrComputation(shape.element_type(), builder)
             : CreateScalarAddComputation(shape.element_type(), builder);
-
-    return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
-                  reducer, {m >= n ? n_dims - 2 : n_dims - 1});
+    // k == 0, we can save one slice op.
+    if (k == 0) {
+      return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
+                    reducer, {m >= n ? n_dims - 2 : n_dims - 1});
+    } else if (k > 0) {
+      auto result = Reduce(Select(mask, x, Zeros(builder, shape)),
+                           ScalarLike(x, 0), reducer, {n_dims - 2});
+      return SliceInMinorDims(result, {std::min<int64>(k, n)},
+                              {std::min(m + k, n)});
+    } else {
+      auto result = Reduce(Select(mask, x, Zeros(builder, shape)),
+                           ScalarLike(x, 0), reducer, {n_dims - 1});
+      return SliceInMinorDims(result, {std::min<int64>(-k, m)},
+                              {std::min(m, n - k)});
+    }
   });
 }
 
@@ -336,4 +352,5 @@ XlaOp TransposeInMinorDims(XlaOp x) {
 XlaOp MaybeTransposeInMinorDims(XlaOp x, bool transpose) {
   return transpose ? TransposeInMinorDims(x) : x;
 }
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/matrix.h b/tensorflow/compiler/xla/client/lib/matrix.h
index 491f1eab4cbffbbf9df70d4c35a61351df3e98aa..60c41ec45a086726086dac7227fc432a9c62d0c8 100644
--- a/tensorflow/compiler/xla/client/lib/matrix.h
+++ b/tensorflow/compiler/xla/client/lib/matrix.h
@@ -30,10 +30,15 @@ namespace xla {
 // else.
 XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, int64 n);
 
-// Get the diagonals of the last two dimensions. If 'x' has shape
-// [..., M, N], then the output has shape [..., min(M, N)], containing the
-// diagonal elements (i.e., with indices [..., i, i]).
-XlaOp GetMatrixDiagonal(XlaOp x);
+// Get the diagonals of the last two dimensions. Use k>0 for diagonals above the
+// main diagonal, and k<0 for diagonals below the main diagonal.
+//
+// If 'x' has shape [..., M, N]
+//  If k >= 0: then the output has shape [..., min(M, N - k)], containing the
+//            diagonal elements (i.e., with indices [..., i, i + k]).
+//  If k < 0: then the output has shape [..., min(M + k, N)], containing the
+//            diagonal elements (i.e., with indices [..., i - k, i]).
+XlaOp GetMatrixDiagonal(XlaOp x, int k = 0);
 
 // Returns a lower-triangular mask, i.e., true below the `diagonal`-th diagonal
 // and false above that diagonal.
diff --git a/tensorflow/compiler/xla/client/lib/matrix_test.cc b/tensorflow/compiler/xla/client/lib/matrix_test.cc
index 79cf529ee94b044ee0af788522200cd28c778997..a93fc2ccb92912a10b9b6c2192b81cd73566f2a0 100644
--- a/tensorflow/compiler/xla/client/lib/matrix_test.cc
+++ b/tensorflow/compiler/xla/client/lib/matrix_test.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 namespace {
@@ -54,13 +53,24 @@ void MatrixTest::TestMatrixDiagonal() {
   XlaBuilder builder("GetMatrixDiagonal");
   Array3D<T> input(2, 3, 4);
   input.FillIota(0);
+  std::map<int, Array2D<T>> k_and_expected = {
+      {0, {{0, 5, 10}, {12, 17, 22}}},
+      {1, {{1, 6, 11}, {13, 18, 23}}},
+      {2, {{2, 7}, {14, 19}}},
+      {3, {{3}, {15}}},
+      {4, {{}, {}}},
+      {-1, {{4, 9}, {16, 21}}},
+      {-2, {{8}, {20}}},
+      {-3, {{}, {}}},
+      {-4, {{}, {}}},
+  };
+  for (const auto& kv : k_and_expected) {
+    XlaOp a;
+    auto a_data = CreateR3Parameter<T>(input, 0, "a", &builder, &a);
+    GetMatrixDiagonal(a, kv.first);
 
-  XlaOp a;
-  auto a_data = CreateR3Parameter<T>(input, 0, "a", &builder, &a);
-  GetMatrixDiagonal(a);
-  Array2D<T> expected({{0, 5, 10}, {12, 17, 22}});
-
-  ComputeAndCompareR2<T>(&builder, expected, {a_data.get()});
+    ComputeAndCompareR2<T>(&builder, kv.second, {a_data.get()});
+  }
 }
 
 XLA_TEST_F(MatrixTest, GetMatrixDiagonal_S32) { TestMatrixDiagonal<int32>(); }
diff --git a/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..546127e4627f1717913d1039be13fd0c655be1a3
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc
@@ -0,0 +1,471 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h"
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/loops.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+
+namespace {
+
+// Jacobi rotation (also known as Givens rotation):
+// G = [[ c, s],
+//      [-s, c]]
+// matmul(G_T, G) = I
+struct SymmetricSchurDecomposition {
+  XlaOp c;          // cosine.
+  XlaOp s;          // sine.
+};
+
+// JacobiUpdate holds the intermediate orthogonal matrix, Jacobi-rotated matrix
+// and the off-diagonal norm of the rotated matrix. After each Jacobi iteration,
+// off-diagonal norm is reduced.
+struct JacobiUpdate {
+  XlaOp v;
+  XlaOp w;
+};
+
+struct FrobeniusNorms {
+  XlaOp off_diagonal_norm;
+  XlaOp total_norm;
+};
+
+// Given an n-by-n symmetric A and integers p and q that satisfy 0 <= p < q < n,
+// it computes a rotation matrix G = [[c, s], [-s, c]], such that
+//                        G_T * A[[p, q], [p, q]] * G
+// is diagonalized.
+//
+//  def sym_schur2x2(A, p, q):
+//      if np.abs(A[p, q]) > 1e-6:
+//          tau = (A[q, q] - A[p, p]) / (2 * A[p, q])
+//          if tau >= 0:
+//              t = 1.0 / (tau + np.sqrt(1 + tau ** 2))
+//          else:
+//              t = -1.0 / (-tau + np.sqrt(1 + tau ** 2))
+//          c = 1.0 / np.sqrt(1.0 + t ** 2)
+//          s = t * c
+//      else:
+//          c = 1.0
+//          s = 0.0
+//      return c, s
+StatusOr<SymmetricSchurDecomposition> SymmetricShurDecomposition2x2(XlaOp a,
+                                                                    XlaOp p,
+                                                                    XlaOp q,
+                                                                    XlaOp tol) {
+  XlaBuilder* builder = a.builder();
+  TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+
+  auto zero = ScalarLike(a, 0.0);
+  auto one = ScalarLike(a, 1.0);
+  auto two = ScalarLike(a, 2.0);
+
+  auto pqs = DynamicSliceInMinorDims(a, {p, q}, {1, 1});
+
+  auto ps = DynamicSliceInMinorDims(a, {p, p}, {1, 1});
+  auto qs = DynamicSliceInMinorDims(a, {q, q}, {1, 1});
+
+  auto tau = (qs - ps) / (pqs * two);
+  auto t_pos = one / (tau + Sqrt(one + Square(tau)));
+  auto t_neg = -one / (-tau + Sqrt(one + Square(tau)));
+  auto t = Select(Ge(tau, zero), t_pos, t_neg);
+
+  auto c_temp = Rsqrt(one + Square(t));
+  auto s_temp = t * c_temp;
+
+  auto c = Select(Ge(Abs(pqs), tol), c_temp, ZerosLike(c_temp) + one);
+  auto s = Select(Ge(Abs(pqs), tol), s_temp, ZerosLike(s_temp));
+  // Renormalize c and s to compensate for low precision arithmetic, this step
+  // is redundant if high precision float is used, like float64.
+  auto rnorm = Rsqrt(Square(c) + Square(s));
+
+  SymmetricSchurDecomposition schur;
+
+  schur.c = c * rnorm;
+  schur.s = s * rnorm;
+
+  return schur;
+}
+
+StatusOr<JacobiUpdate> Update(JacobiUpdate jacobi_update, XlaOp p, XlaOp q,
+                              XlaOp tol, int64 n) {
+  XlaBuilder* builder = jacobi_update.w.builder();
+  TF_ASSIGN_OR_RETURN(
+      SymmetricSchurDecomposition schur,
+      SymmetricShurDecomposition2x2(jacobi_update.w, p, q, tol));
+
+  TF_ASSIGN_OR_RETURN(Shape w_shape, builder->GetShape(jacobi_update.w));
+  const std::vector<int64> batch_dims(w_shape.dimensions().begin(),
+                                      w_shape.dimensions().end() - 2);
+  const int64 num_dims = w_shape.rank();
+
+  auto zero = ScalarLike(p, 0);
+
+  XlaOp c = schur.c;
+  XlaOp s = schur.s;
+
+  auto slice_p = DynamicSliceInMinorDims(jacobi_update.w, {p, zero}, {1, n});
+  auto slice_q = DynamicSliceInMinorDims(jacobi_update.w, {q, zero}, {1, n});
+
+  auto slice_p_new = c * slice_p - s * slice_q;
+  auto slice_q_new = s * slice_p + c * slice_q;
+
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, slice_p_new, {p, zero});
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, slice_q_new, {q, zero});
+
+  slice_p = DynamicSliceInMinorDims(jacobi_update.w, {zero, p}, {n, 1});
+  slice_q = DynamicSliceInMinorDims(jacobi_update.w, {zero, q}, {n, 1});
+
+  slice_p_new = c * slice_p - s * slice_q;
+  slice_q_new = s * slice_p + c * slice_q;
+
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, slice_p_new, {zero, p});
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, slice_q_new, {zero, q});
+
+  // Zero out a_{pq} explicitly.
+  std::vector<int64> pq_dims(batch_dims.begin(), batch_dims.end());
+  pq_dims.push_back(1);
+  pq_dims.push_back(1);
+  auto pq_zero = ScalarLike(jacobi_update.w, 0.0);
+  auto pq_zeros = Broadcast(pq_zero, pq_dims);
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, pq_zeros, {p, q});
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, pq_zeros, {q, p});
+
+  slice_p = DynamicSliceInMinorDims(jacobi_update.v, {zero, p}, {n, 1});
+  slice_q = DynamicSliceInMinorDims(jacobi_update.v, {zero, q}, {n, 1});
+
+  std::vector<int64> broadcast_dims(batch_dims.size());
+  std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+  broadcast_dims.push_back(num_dims - 1);
+
+  // Renormalize the p-th and q-th columns. This step is redundant if high
+  // precision floats are used, like 64-bit float. But for 32-bit float, it
+  // becomes necessary. This step will not increase the overall complexity.
+  slice_p_new = c * slice_p - s * slice_q;
+  slice_p_new = Mul(
+      slice_p_new,
+      Rsqrt(Reduce(Square(slice_p_new), pq_zero,
+                   CreateScalarAddComputation(w_shape.element_type(), builder),
+                   {num_dims - 2})),
+      broadcast_dims);
+  slice_q_new = s * slice_p + c * slice_q;
+  slice_q_new = Mul(
+      slice_q_new,
+      Rsqrt(Reduce(Square(slice_q_new), pq_zero,
+                   CreateScalarAddComputation(w_shape.element_type(), builder),
+                   {num_dims - 2})),
+      broadcast_dims);
+
+  jacobi_update.v =
+      DynamicUpdateSliceInMinorDims(jacobi_update.v, slice_p_new, {zero, p});
+  jacobi_update.v =
+      DynamicUpdateSliceInMinorDims(jacobi_update.v, slice_q_new, {zero, q});
+
+  return jacobi_update;
+}
+
+StatusOr<FrobeniusNorms> ComputeFrobeniusNorms(XlaOp w) {
+  XlaBuilder* builder = w.builder();
+  TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(w));
+  const int64 num_dims = shape.rank();
+  auto frobenius_norm =
+      Sqrt(Reduce(Square(w), ScalarLike(w, 0.0),
+                  CreateScalarAddComputation(shape.element_type(), builder),
+                  {num_dims - 2, num_dims - 1}));
+  auto diag = GetMatrixDiagonal(w);
+  auto diag_square =
+      Reduce(Square(diag), ScalarLike(w, 0.0),
+             CreateScalarAddComputation(shape.element_type(), builder),
+             {num_dims - 2});
+
+  FrobeniusNorms frobenius_norms;
+
+  frobenius_norms.off_diagonal_norm =
+      Sqrt(Max(Square(frobenius_norm) - diag_square, ScalarLike(w, 0.0)));
+  frobenius_norms.total_norm = frobenius_norm;
+
+  return frobenius_norms;
+}
+
+StatusOr<std::vector<XlaOp>> WhileLoopFn(
+    absl::Span<const XlaOp> initial_values,  //
+    int matrix_dimension,                    //
+    int max_sweep_updates,                   //
+    PrimitiveType index_type,                //
+    absl::string_view name,                  //
+    XlaBuilder* builder) {
+  auto while_cond_fn = [&](absl::Span<const XlaOp> values,
+                           XlaBuilder* cond_builder) -> StatusOr<XlaOp> {
+    auto k = values[0];
+    auto max_sweeps = ScalarLike(k, max_sweep_updates);
+    auto sweep_update_cond = Gt(max_sweeps, k);
+
+    auto norms = ComputeFrobeniusNorms(values[2]).ValueOrDie();
+    auto tol = norms.total_norm * values[3];
+    auto tol_cond = ReduceAll(Lt(tol, norms.off_diagonal_norm),
+                              xla::ConstantR0<bool>(cond_builder, false),
+                              CreateScalarOrComputation(PRED, cond_builder));
+
+    return And(sweep_update_cond, tol_cond);
+  };
+
+  auto while_body_fn =
+      [&](absl::Span<const XlaOp> values,
+          XlaBuilder* body_builder) -> StatusOr<std::vector<XlaOp>> {
+    auto while_cond_fn_inner =
+        [&](absl::Span<const XlaOp> values_inner,
+            XlaBuilder* inner_cond_builder) -> StatusOr<XlaOp> {
+      auto p = values_inner[0];
+      return Lt(p, ScalarLike(p, matrix_dimension - 1));
+    };
+
+    auto while_body_fn_inner =
+        [&](absl::Span<const XlaOp> values_inner,
+            XlaBuilder* inner_body_builder) -> StatusOr<std::vector<XlaOp>> {
+      auto while_cond_fn_innermost =
+          [&](absl::Span<const XlaOp> values_innermost,
+              XlaBuilder* innermost_cond_builder) -> StatusOr<XlaOp> {
+        auto q = values_innermost[1];
+        return Lt(q, ScalarLike(q, matrix_dimension));
+      };
+      auto while_body_fn_innermost =
+          [&](absl::Span<const XlaOp> values_innermost,
+              XlaBuilder* innermost_body_builder)
+          -> StatusOr<std::vector<XlaOp>> {
+        auto p = values_innermost[0];
+        auto q = values_innermost[1];
+
+        JacobiUpdate jacobi_update;
+        jacobi_update.v = values_innermost[2];
+        jacobi_update.w = values_innermost[3];
+
+        auto tol = values_innermost[4];
+
+        TF_ASSIGN_OR_RETURN(jacobi_update,
+                            Update(jacobi_update, p, q, tol, matrix_dimension));
+
+        std::vector<XlaOp> updated_values_innermost;
+        updated_values_innermost.reserve(values_innermost.size());
+
+        updated_values_innermost.push_back(p);
+        updated_values_innermost.push_back(q + ScalarLike(q, 1));
+        updated_values_innermost.push_back(jacobi_update.v);
+        updated_values_innermost.push_back(jacobi_update.w);
+        updated_values_innermost.push_back(tol);
+
+        return updated_values_innermost;
+      };
+
+      std::vector<XlaOp> values_innermost(5);
+      auto p = values_inner[0];
+      auto q = p + ScalarLike(p, 1);
+      values_innermost[0] = p;                // index p.
+      values_innermost[1] = q;                // index q.
+      values_innermost[2] = values_inner[1];  // v.
+      values_innermost[3] = values_inner[2];  // w.
+      values_innermost[4] = values_inner[3];  // tol.
+      TF_ASSIGN_OR_RETURN(
+          values_innermost,
+          WhileLoopHelper(while_cond_fn_innermost, while_body_fn_innermost,
+                          values_innermost, absl::StrCat(name, "-Innermost"),
+                          inner_body_builder));
+
+      std::vector<XlaOp> updated_values_inner;
+      updated_values_inner.reserve(values_inner.size());
+
+      updated_values_inner.push_back(p + ScalarLike(p, 1));
+      updated_values_inner.push_back(values_innermost[2]);
+      updated_values_inner.push_back(values_innermost[3]);
+      updated_values_inner.push_back(values_innermost[4]);
+      return updated_values_inner;
+    };
+    // Indexes.
+    XlaOp k = values[0];
+
+    std::vector<XlaOp> values_inner(4);
+    values_inner[0] = ScalarLike(k, 0);  // index p.
+    values_inner[1] = values[1];         // v.
+    values_inner[2] = values[2];         // w.
+    values_inner[3] = values[3];         // tol.
+    TF_ASSIGN_OR_RETURN(
+        values_inner,
+        WhileLoopHelper(while_cond_fn_inner, while_body_fn_inner, values_inner,
+                        absl::StrCat(name, "-Inner"), body_builder));
+
+    std::vector<XlaOp> updated_values;
+    updated_values.reserve(values_inner.size());
+
+    updated_values.push_back(k + ScalarLike(k, 1));
+    updated_values.push_back(values_inner[1]);
+    updated_values.push_back(values_inner[2]);
+    updated_values.push_back(values_inner[3]);
+
+    return updated_values;
+  };
+  std::vector<XlaOp> values;
+  TF_ASSIGN_OR_RETURN(values, WhileLoopHelper(while_cond_fn, while_body_fn,
+                                              initial_values, name, builder));
+
+  return values;
+}
+
+StatusOr<SelfAdjointEigResult> SortByEigenvalues(SelfAdjointEigResult result) {
+  XlaBuilder* builder = result.v.builder();
+  TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(result.v));
+  const int64 num_dims = shape.rank();
+  auto dimensions = shape.dimensions();
+
+  std::vector<int64> broadcast_dims(num_dims - 1);
+  std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+  broadcast_dims[num_dims - 2] = num_dims - 1;
+  result.w = BroadcastInDim(result.w, dimensions, broadcast_dims);
+
+  XlaOp sort_result =
+      Sort({result.w, result.v},
+           CreateScalarLtComputation(
+               {shape.element_type(), shape.element_type()}, builder),
+           num_dims - 1);
+  result.w = GetMatrixDiagonal(GetTupleElement(sort_result, 0));
+  result.v = GetTupleElement(sort_result, 1);
+  return result;
+}
+
+}  // namespace
+
+// This is the cyclic Jacobi iteration. Please note that the eigenvalues are
+// possibly not ordered.
+//
+//  def jacobi(A):
+//      n, _ = A.shape
+//      V = np.eye(n)
+//      frobenius_norm = np.linalg.norm(A)
+//      diag_norm = np.linalg.norm(np.diag(A))
+//      off_diag_norm = np.sqrt(
+//          frobenius_norm - diag_norm) * np.sqrt(frobenius_norm + diag_norm)
+//      while off_diag_norm > 1e-6 * frobenius_norm:
+//          for p in range(n - 1):
+//              for q in range(p + 1, n):
+//                  c, s = sym_schur2x2(A, p, q)
+//                  A[[p, q], :] = np.matmul(np.array([[c, -s], [s, c]]),
+//                                           A[[p, q], :])
+//                  A[:, [p, q]] = np.matmul(A[:, [p, q]],
+//                                           np.array([[c, s], [-s, c]]))
+//                  V[:, [p, q]] = np.matmul(V[:, [p, q]],
+//                                               np.array([[c, s], [-s, c]]))
+//          frobenius_norm_sq = np.linalg.norm(A)
+//          diag_square_sum = np.linalg.norm(np.diag(A))
+//          off_diag_norm = np.sqrt(
+//              frobenius_norm - diag_norm) * np.sqrt(
+//                  frobenius_norm + diag_norm)
+//
+//      return A, V
+//
+// TODO(kuny): Implement parallel order Jacobi.
+//
+SelfAdjointEigResult SelfAdjointEig(XlaOp a, bool lower, int64 max_iter,
+                                    float epsilon) {
+  XlaBuilder* builder = a.builder();
+  auto return_error = [&](const Status& status) {
+    SelfAdjointEigResult result;
+    result.v = builder->ReportError(status);
+    result.w = builder->ReportError(status);
+    return result;
+  };
+  auto shape_with_status = builder->GetShape(a);
+  if (!shape_with_status.status().ok()) {
+    return return_error(shape_with_status.status());
+  }
+  Shape a_shape = shape_with_status.ValueOrDie();
+  const int64 num_dims = a_shape.rank();
+  if (num_dims < 2) {
+    return return_error(InvalidArgument(
+        "Arguments to Eigen decomposition must have rank >= 2: got shape %s.",
+        a_shape.ToString()));
+  }
+  PrimitiveType type = a_shape.element_type();
+  if (!primitive_util::IsFloatingPointType(type)) {
+    return return_error(InvalidArgument(
+        "Type of the input matrix must be float: got %s.", a_shape.ToString()));
+  }
+
+  const int64 m = ShapeUtil::GetDimension(a_shape, -2);
+  const int64 n = ShapeUtil::GetDimension(a_shape, -1);
+
+  if (m != n) {
+    return return_error(InvalidArgument(
+        "Arguments to Eigen decomposition must be square matrices: got shape "
+        "(%d, %d).",
+        m, n));
+  }
+
+  const int64 num_batch_dims = num_dims - 2;
+  std::vector<int64> batch_dims(num_batch_dims);
+  for (int i = 0; i < num_batch_dims; ++i) {
+    batch_dims[i] = ShapeUtil::GetDimension(a_shape, i);
+  }
+
+  auto tol = ScalarLike(a, epsilon);
+
+  auto v_init = Broadcast(IdentityMatrix(builder, type, m, m), batch_dims);
+  auto w_init = Triangle(a, lower);
+  w_init = w_init + TransposeInMinorDims(w_init) - w_init * v_init;
+
+  auto output_with_status = WhileLoopFn(
+      {
+          Zero(builder, S32),  // k
+          v_init,              // v
+          w_init,              // w
+          tol,                 //
+      },                       //
+      n,                       //
+      max_iter,                //
+      S32,                     //
+      "CyclicJacobi",          //
+      builder);
+  if (!output_with_status.status().ok()) {
+    return return_error(output_with_status.status());
+  }
+
+  auto output = output_with_status.ValueOrDie();
+
+  SelfAdjointEigResult result;
+  result.v = output[1];
+  result.w = GetMatrixDiagonal(output[2]);
+
+  return SortByEigenvalues(result).ValueOrDie();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/self_adjoint_eig.h b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a089891d6a2d80c0c265a3310539b4f1c5db4d5
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.h
@@ -0,0 +1,40 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SELF_ADJOINT_EIG_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SELF_ADJOINT_EIG_H_
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+// The eigenvalue decomposition of a symmetric matrix, the original matrix is
+// recovered by v * w * v_t.
+struct SelfAdjointEigResult {
+  // The i-th column is the normalized eigenvector corresponding to the
+  // eigenvalue w[i]. Will return a matrix object if a is a matrix object.
+  XlaOp v;
+  // The eigenvalues in ascending order, each repeated according to its
+  // multiplicity.
+  XlaOp w;
+};
+
+SelfAdjointEigResult SelfAdjointEig(XlaOp a, bool lower = true,
+                                    int64 max_iter = 100, float epsilon = 1e-6);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SELF_ADJOINT_EIG_H_
diff --git a/tensorflow/compiler/xla/client/lib/self_adjoint_eig_test.cc b/tensorflow/compiler/xla/client/lib/self_adjoint_eig_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c8875dff7bfdbd4e133297cef0a6686bfcd9bb6f
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/self_adjoint_eig_test.cc
@@ -0,0 +1,313 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h"
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/array3d.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+
+class SelfAdjointEigTest : public ClientLibraryTestBase {
+ protected:
+  void SetUp() override {
+    ClientLibraryTestBase::SetUp();
+    batch_3d_4x4_ = Array3D<float>{
+        {
+            {4, 6, 8, 10},
+            {6, 45, 54, 63},
+            {8, 54, 146, 166},
+            {10, 63, 166, 310},
+        },
+        {
+            {16, 24, 8, 12},
+            {24, 61, 82, 48},
+            {8, 82, 100, 6},
+            {12, 48, 6, 62},
+        },
+    };
+    matrix2d_8x8_ = Array2D<float>{
+        {14., 123., 49., 112., 115., 173., 182., 125.},
+        {123., 14., 60., 118., 150., 130., 91., 72.},
+        {49., 60., 138., 111., 106., 101., 115., 142.},
+        {112., 118., 111., 142., 91., 130., 25., 61.},
+        {115., 150., 106., 91., 116., 121., 128., 85.},
+        {173., 130., 101., 130., 121., 70., 151., 132.},
+        {182., 91., 115., 25., 128., 151., 66., 92.},
+        {125., 72., 142., 61., 85., 132., 92., 156.},
+    };
+    low_rank_4x4_ = Array2D<float>{
+        // x = [[1, 2, 3, 4], [1, -1, 1, -1]]
+        // matmul(x.T, x)
+        {2, 1, 4, 3},
+        {1, 5, 5, 9},
+        {4, 5, 10, 11},
+        {3, 9, 11, 17},
+    };
+  }
+  void TearDown() override { ClientLibraryTestBase::TearDown(); }
+
+  Array3D<float> GetUnitMatrix3D(const Array3D<float>& matrix) {
+    Array3D<float> result(matrix.n1(), matrix.n2(), matrix.n3(), 0.0);
+    for (int i = 0; i < matrix.n1(); ++i) {
+      for (int j = 0; j < matrix.n2(); ++j) {
+        result({i, j, j}) = 1.0;
+      }
+    }
+    return result;
+  }
+
+  Array3D<float> ExtractTriangularMatrix(const Array3D<float>& matrix,
+                                         bool lower) {
+    Array3D<float> result(matrix);
+    for (int i = 0; i < result.n1(); ++i) {
+      for (int j = 0; j < result.n2(); ++j) {
+        if (lower) {
+          for (int k = j + 1; k < result.n3(); ++k) {
+            result({i, j, k}) = 0.0;
+          }
+        } else {
+          for (int k = 0; k < j; ++k) {
+            result({i, j, k}) = 0.0;
+          }
+        }
+      }
+    }
+    return result;
+  }
+
+  XlaOp ComputeMatmulVWVt(SelfAdjointEigResult result, XlaBuilder* builder) {
+    Shape shape = builder->GetShape(result.v).ValueOrDie();
+    std::vector<int64> out_dims = shape.dimensions();
+    std::vector<int64> broadcast_dims(shape.rank() - 1);
+    std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+
+    broadcast_dims[shape.rank() - 2] = shape.rank() - 1;
+    auto vw = Mul(result.v, BroadcastInDim(result.w, out_dims, broadcast_dims));
+    return BatchDot(vw, TransposeInMinorDims(result.v),
+                    PrecisionConfig::HIGHEST);
+  }
+
+  XlaOp GetAverageAbsoluteError(XlaOp m1, XlaOp m2, XlaBuilder* builder) {
+    Shape shape = builder->GetShape(m1).ValueOrDie();
+    int64 size = 1;
+    for (auto d : shape.dimensions()) {
+      size *= d;
+    }
+    return ReduceAll(Abs(m1 - m2), ConstantR0WithType(builder, F32, 0),
+                     CreateScalarAddComputation(F32, builder)) /
+           ConstantR0WithType(builder, F32, size);
+  }
+
+  Array2D<float> GenerateRandomSymmetricMatrix(int size) {
+    Array2D<float> result{size, size, 0.0};
+    result.FillRandom(10 /* stddev */, 2 /* mean */);
+    for (int i = 0; i < size; ++i) {
+      for (int j = 0; j < i; ++j) {
+        result({j, i}) = result({i, j});
+      }
+    }
+    return result;
+  }
+
+  Array3D<float> batch_3d_4x4_;
+  Array2D<float> matrix2d_8x8_;
+  Array2D<float> low_rank_4x4_;
+  Array2D<int> wrong_type_4x4_;
+};
+
+XLA_TEST_F(SelfAdjointEigTest, Test_VWVt_EQ_A_2x4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(batch_3d_4x4_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  ComputeMatmulVWVt(result, &builder);
+
+  ComputeAndCompareR3<float>(&builder, batch_3d_4x4_, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_VWVt_EQ_A_Lower_2x4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(
+      ExtractTriangularMatrix(batch_3d_4x4_, true), 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  ComputeMatmulVWVt(result, &builder);
+
+  ComputeAndCompareR3<float>(&builder, batch_3d_4x4_, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_VWVt_EQ_A_Upper_2x4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(
+      ExtractTriangularMatrix(batch_3d_4x4_, false), 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a, false);
+  ComputeMatmulVWVt(result, &builder);
+
+  ComputeAndCompareR3<float>(&builder, batch_3d_4x4_, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_Orthogonality_2x4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(batch_3d_4x4_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  BatchDot(result.v, TransposeInMinorDims(result.v), PrecisionConfig::HIGHEST);
+
+  ComputeAndCompareR3<float>(&builder, GetUnitMatrix3D(batch_3d_4x4_),
+                             {a_data.get()}, ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_VtWV_EQ_A_Rank_Deficient_4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(low_rank_4x4_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  ComputeMatmulVWVt(result, &builder);
+
+  ComputeAndCompareR2<float>(&builder, low_rank_4x4_, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_Eigen_8x8) {
+  XlaBuilder builder(TestName());
+
+  // This is computed by numpy.linalg.eigh with float32.
+  std::vector<float> expected{-182.69205, -116.86245, -105.74489, -9.545369,
+                              37.81711,   104.732285, 120.29153,  868.00385};
+
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(matrix2d_8x8_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  Add(result.w, ZerosLike(result.w));
+
+  ComputeAndCompareR1<float>(&builder, expected, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_Orthogonality_8x8) {
+  XlaBuilder builder(TestName());
+
+  float expected_vals = 1e-3;
+
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(matrix2d_8x8_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  // np.sum(norm(eye(n) - matmul(conj(T(v)), v)) / n**2
+  GetAverageAbsoluteError(IdentityMatrix(&builder, F32, 8, 8),
+                          BatchDot(TransposeInMinorDims(result.v), result.v),
+                          &builder);
+
+  ComputeAndCompareR0<float>(&builder, expected_vals, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Wrong_Type_Int) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR2Parameter<int>(wrong_type_4x4_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  EXPECT_FALSE(result.v.valid());
+  EXPECT_FALSE(result.w.valid());
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_8x8) {
+  XlaBuilder builder(TestName());
+  int size = 8;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_16x16) {
+  XlaBuilder builder(TestName());
+  int size = 16;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_32x32) {
+  XlaBuilder builder(TestName());
+  int size = 32;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_256x256) {
+  XlaBuilder builder(TestName());
+  int size = 256;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_512x512) {
+  XlaBuilder builder(TestName());
+  int size = 512;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/slicing.cc b/tensorflow/compiler/xla/client/lib/slicing.cc
index 77145ba7d4c72435450d3e33d57b2507eb84d2fc..d7b33c5af25606c4e7e443027b913f7ca13a013c 100644
--- a/tensorflow/compiler/xla/client/lib/slicing.cc
+++ b/tensorflow/compiler/xla/client/lib/slicing.cc
@@ -134,4 +134,31 @@ XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update,
   });
 }
 
+XlaOp TorchGather(XlaOp input, XlaOp index, int64 dim) {
+  XlaBuilder* builder = input.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape index_shape, builder->GetShape(index));
+    ShapeUtil::AppendMajorDimension(1, &index_shape);
+    std::vector<XlaOp> to_concat;
+    TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input));
+    to_concat.reserve(input_shape.rank());
+    for (int64 i = 0; i < input_shape.rank(); ++i) {
+      if (i == dim) {
+        to_concat.push_back(Reshape(index, index_shape.dimensions()));
+      } else {
+        to_concat.push_back(Iota(builder, index_shape, i));
+      }
+    }
+    XlaOp gather_indices = ConcatInDim(builder, to_concat, input_shape.rank());
+    std::vector<int64> slice_sizes(input_shape.rank(), 1);
+    GatherDimensionNumbers gather_dnums;
+    gather_dnums.set_index_vector_dim(input_shape.rank());
+    for (int64 i = 0; i < input_shape.rank(); ++i) {
+      gather_dnums.add_collapsed_slice_dims(i);
+      gather_dnums.add_start_index_map(i);
+    }
+    return Gather(input, gather_indices, gather_dnums, slice_sizes);
+  });
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/slicing.h b/tensorflow/compiler/xla/client/lib/slicing.h
index 6c482a38b5489c9fb17c3dca9ee3d2a1b8fd1890..69f98a6f43fa167adf6f77b28645a3460b292633 100644
--- a/tensorflow/compiler/xla/client/lib/slicing.h
+++ b/tensorflow/compiler/xla/client/lib/slicing.h
@@ -43,6 +43,20 @@ XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
 XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update,
                                     absl::Span<const XlaOp> starts);
 
+// Gathers values along an axis specified by dim.
+//
+// For a 3-D tensor the output is specified by:
+//
+// out[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
+// out[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
+// out[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2
+//
+// If `input` is an n-dimensional tensor with size
+// [X0,X1,X2,..XN] and dim = i `index` must be an n-dimensional tensor with size
+// [X0,X1,...Y,Xi+1,...,X[N] where y >= 1 and `out` will have the same sizes as
+// `index`.
+XlaOp TorchGather(XlaOp input, XlaOp index, int64 dim);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_
diff --git a/tensorflow/compiler/xla/client/lib/slicing_test.cc b/tensorflow/compiler/xla/client/lib/slicing_test.cc
index 8d362119e01006555db0f82d02626175936e1d05..db6ebb9df18372260a64a3e9fd17b0c30b35667d 100644
--- a/tensorflow/compiler/xla/client/lib/slicing_test.cc
+++ b/tensorflow/compiler/xla/client/lib/slicing_test.cc
@@ -102,5 +102,18 @@ XLA_TEST_F(SlicingTest, SimpleSliceUpdate) {
       {a_data.get(), b_data.get(), x_data.get(), y_data.get()});
 }
 
+XLA_TEST_F(SlicingTest, TorchGather) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::XlaOp input, index;
+  auto input_data =
+      CreateR2Parameter<int>({{1, 2}, {3, 4}}, 0, "input", &builder, &input);
+  auto index_data =
+      CreateR2Parameter<int>({{0, 0}, {1, 0}}, 1, "index", &builder, &index);
+  TorchGather(input, index, 1);
+
+  ComputeAndCompareR2<int>(&builder, {{1, 1}, {4, 3}},
+                           {input_data.get(), index_data.get()});
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/sorting.cc b/tensorflow/compiler/xla/client/lib/sorting.cc
index e8553a08bb014e790822a14e128686b60b8d6b7c..ddc39f4d874cd3613a763b969091e7e65ff1c783 100644
--- a/tensorflow/compiler/xla/client/lib/sorting.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -30,7 +31,13 @@ XlaOp TopK(XlaOp input, int64 k) {
         ShapeUtil::MakeShape(S32, AsInt64Slice(input_shape.dimensions()));
     XlaOp iota_s32 = Iota(builder, iota_shape, last_dim);
     auto input_dims = input_shape.dimensions();
-    XlaOp sort_result = Sort(Neg(input), {iota_s32});
+    // TODO(b/122298745): Get rid of Neg() and use CreateScalarGtComputation
+    // once the TPU backend supports the comparison computations.
+    XlaOp sort_result =
+        Sort({Neg(input), iota_s32},
+             CreateScalarLtComputation({input_shape.element_type(), S32},
+                                       iota_s32.builder()),
+             last_dim, /*is_stable=*/true);
     std::vector<int64> start_indices(input_shape.dimensions_size(), 0);
     std::vector<int64> limit_indices(input_dims.begin(), input_dims.end());
     limit_indices[last_dim] = k;
diff --git a/tensorflow/compiler/xla/client/lib/triangular_solve.h b/tensorflow/compiler/xla/client/lib/triangular_solve.h
deleted file mode 100644
index 50a3b30ebd1c15eb6d2ace4e351cb41f21db7093..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/client/lib/triangular_solve.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
-#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
-
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-
-namespace xla {
-
-// Solves systems of linear equations with lower or upper triangular coefficient
-// matrices by forward- or back-substitution. Broadcasting along leading
-// dimensions, this routine solves one of the matrix systems
-//   `op(a) * x = b`,  or `x * op(a) = b`,
-// for the variable `x` given `a` and `b`, where `op(a)` is either
-//   `op(a) = a`,  or `op(a) = transpose(a)`,  or `op(a) = conj(transpose(a))`.
-// That is, the innermost matrices in the output satisfy a scalar system
-// depending on the value of the value of (left_side, transpose_a, conjugate_a)
-// according to:
-//   (F, F, F) => `output[..., i, k]  a[..., k, j] = b[..., i, j]`,
-//   (F, F, T) => `output[..., i, k] a*[..., k, j] = b[..., i, j]`,
-//   (F, T, F) => `output[..., i, k]  a[..., j, k] = b[..., i, j]`,
-//   (F, T, T) => `output[..., i, k] a*[..., j, k] = b[..., i, j]`,
-//   (T, F, F) => ` a[..., i, k] output[..., k, j] = b[..., i, j]`,
-//   (T, F, T) => `a*[..., i, k] output[..., k, j] = b[..., i, j]`,
-//   (T, T, F) => ` a[..., i, k] output[..., j, k] = b[..., i, j]`,
-//   (T, T, T) => `a*[..., i, k] output[..., j, k] = b[..., i, j]`,
-// where * denotes complex conjugation and where the index `k` is summed over.
-//
-// `a` is a tensor of shape `[..., M, M]` whose innermost 2 dimensions form
-// square matrices. If lower is true (false), then the strictly upper (lower)
-// triangular part of each innermost matrix in `a` is assumed to be zero and is
-// not accessed.
-// `b` is a tensor of shape `[..., M, K]` if left_side is true, otherwise a
-// tensor of shape `[..., K, M]`.
-// `left_side` is a boolean, indicating whether to solve a system of the form
-// op(a) * x = b (true) or x * op(a) = b (false).
-// `lower` is a boolean, indicating whether the argument `a` is lower-triangular
-// (true) or upper-triangular (false).
-// `transpose_a` is a boolean indicating whether the matrix `a` is transposed.
-// `conjugate_a` is a boolean indicating whether the entries of `a` are complex
-// conjugated (independently of whether they are transposed), so that when both
-// transpose_a and conjugate_a are true the effect is a Hermitian adjoint.
-//
-// Uses a blocked algorithm if `block_size` is > 1; if block_size == 1 then no
-// blocking is used.
-XlaOp TriangularSolve(
-    XlaOp a, XlaOp b, bool left_side, bool lower, bool transpose_a,
-    bool conjugate_a, int64 block_size = 128,
-    PrecisionConfig::Precision precision = PrecisionConfig::HIGHEST);
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index c00ba26295a30c192fedae48f5aabf78cbd7d831..9b7c01a727a5aa0aebf600584b59aec4d47aa0b4 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -299,12 +299,17 @@ XlaComputation XlaBuilder::BuildAndNoteError() {
   return build_status.ConsumeValueOrDie();
 }
 
-StatusOr<XlaComputation> XlaBuilder::Build(bool remove_dynamic_dimensions) {
+Status XlaBuilder::GetCurrentStatus() const {
   if (!first_error_.ok()) {
     string backtrace;
     first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace);
     return AppendStatus(first_error_, backtrace);
   }
+  return Status::OK();
+}
+
+StatusOr<XlaComputation> XlaBuilder::Build(bool remove_dynamic_dimensions) {
+  TF_RETURN_IF_ERROR(GetCurrentStatus());
   return Build(instructions_.back().id(), remove_dynamic_dimensions);
 }
 
@@ -318,11 +323,7 @@ StatusOr<XlaComputation> XlaBuilder::Build(XlaOp root,
 
 StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id,
                                            bool remove_dynamic_dimensions) {
-  if (!first_error_.ok()) {
-    string backtrace;
-    first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace);
-    return AppendStatus(first_error_, backtrace);
-  }
+  TF_RETURN_IF_ERROR(GetCurrentStatus());
 
   // TODO(b/121223198): XLA backend cannot handle dynamic dimensions yet, remove
   // all dynamic dimensions before building xla program until we have support in
@@ -573,16 +574,6 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
   });
 }
 
-XlaOp XlaBuilder::Add(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kAdd, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Mul(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kMultiply, lhs, rhs, broadcast_dimensions);
-}
-
 XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
@@ -1004,36 +995,6 @@ XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) {
   });
 }
 
-XlaOp XlaBuilder::Eq(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kEq, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Ne(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kNe, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Ge(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kGe, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Gt(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kGt, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Le(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kLe, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Lt(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kLt, lhs, rhs, broadcast_dimensions);
-}
-
 XlaOp XlaBuilder::Dot(const XlaOp& lhs, const XlaOp& rhs,
                       const PrecisionConfig* precision_config) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
@@ -1054,6 +1015,18 @@ XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
     TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+    // If one operand is a scalar, just multiply the two operands.
+    if (ShapeUtil::IsScalar(lhs_shape) || ShapeUtil::IsScalar(rhs_shape)) {
+      if (dimension_numbers.rhs_batch_dimensions_size() != 0 ||
+          dimension_numbers.lhs_batch_dimensions_size() != 0 ||
+          dimension_numbers.rhs_contracting_dimensions_size() != 0 ||
+          dimension_numbers.lhs_contracting_dimensions_size() != 0) {
+        return InvalidArgument(
+            "Dots with scalar operands must have no contracting or batch "
+            "dimensions");
+      }
+      return xla::Mul(lhs, rhs);
+    }
     TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDotOpShape(lhs_shape, rhs_shape,
                                                         dimension_numbers));
@@ -1549,147 +1522,6 @@ XlaOp XlaBuilder::CustomCall(
   });
 }
 
-XlaOp XlaBuilder::Complex(const XlaOp& real, const XlaOp& imag,
-                          absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kComplex, real, imag, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Conj(const XlaOp& operand) {
-  return Complex(Real(operand), Neg(Imag(operand)));
-}
-
-XlaOp XlaBuilder::Sub(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kSubtract, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Div(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kDivide, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Rem(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kRemainder, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Max(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kMaximum, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Min(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kMinimum, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::And(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kAnd, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Or(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kOr, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Xor(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kXor, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Not(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kNot, operand);
-}
-
-XlaOp XlaBuilder::ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
-                            absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kShiftLeft, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::ShiftRightArithmetic(
-    const XlaOp& lhs, const XlaOp& rhs,
-    absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kShiftRightArithmetic, lhs, rhs,
-                  broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::ShiftRightLogical(
-    const XlaOp& lhs, const XlaOp& rhs,
-    absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kShiftRightLogical, lhs, rhs,
-                  broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Abs(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kAbs, operand);
-}
-
-XlaOp XlaBuilder::Atan2(const XlaOp& y, const XlaOp& x,
-                        absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kAtan2, y, x, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Exp(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kExp, operand);
-}
-
-XlaOp XlaBuilder::Expm1(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kExpm1, operand);
-}
-
-XlaOp XlaBuilder::Floor(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kFloor, operand);
-}
-
-XlaOp XlaBuilder::Ceil(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kCeil, operand);
-}
-
-XlaOp XlaBuilder::Round(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kRoundNearestAfz, operand);
-}
-
-XlaOp XlaBuilder::Log(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kLog, operand);
-}
-
-XlaOp XlaBuilder::Log1p(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kLog1p, operand);
-}
-
-XlaOp XlaBuilder::Sign(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kSign, operand);
-}
-
-XlaOp XlaBuilder::Clz(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kClz, operand);
-}
-
-XlaOp XlaBuilder::Cos(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kCos, operand);
-}
-
-XlaOp XlaBuilder::Sin(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kSin, operand);
-}
-
-XlaOp XlaBuilder::Tanh(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kTanh, operand);
-}
-
-XlaOp XlaBuilder::Real(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kReal, operand);
-}
-
-XlaOp XlaBuilder::Imag(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kImag, operand);
-}
-
-XlaOp XlaBuilder::IsFinite(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kIsFinite, operand);
-}
-
 XlaOp XlaBuilder::Transpose(const XlaOp& operand,
                             absl::Span<const int64> permutation) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
@@ -1720,36 +1552,146 @@ XlaOp XlaBuilder::Rev(const XlaOp& operand,
   });
 }
 
+namespace {
+// Switch from a floating point value to a integer value in such a way that when
+// using the integer value to compare, we get the same result for normal values,
+// and -Nan is treated as the smallest value, and Nan is treated as the largest
+// value.
+// If f is a float, and
+// x = bit_cast<int32>(f);
+// y = x < 0 ? numeric_limits<int32>::max() - x : x;
+// then y is ordered as an int32 such that finite values have the obvious order,
+// -0 is ordered before 0, and -NaN and NaN appear at the beginning and end of
+// the ordering.
+// Note that in order to avoid -x to overflow, we calculate
+// numeric_limits<int32>::max() - x as unsigned, and then convert back to
+// signed.
+XlaOp BitcastConvertFloatingPointToIntegral(const XlaOp& value,
+                                            int64 bit_width) {
+  PrimitiveType signed_type;
+  PrimitiveType unsigned_type;
+  XlaOp max_value;
+  switch (bit_width) {
+    case 16:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint16>(std::numeric_limits<int16>::max()));
+      signed_type = S16;
+      unsigned_type = U16;
+      break;
+    case 32:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint32>(std::numeric_limits<int32>::max()));
+      signed_type = S32;
+      unsigned_type = U32;
+      break;
+    case 64:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint64>(std::numeric_limits<int64>::max()));
+      signed_type = S64;
+      unsigned_type = U64;
+      break;
+    default:
+      return value.builder()->ReportError(
+          InvalidArgument("Invalid bit width %lld for Comparator floating "
+                          "point parameter.",
+                          bit_width));
+  }
+  auto signed_value = BitcastConvertType(value, signed_type);
+  auto unsigned_value = BitcastConvertType(value, unsigned_type);
+  auto flipped_value =
+      BitcastConvertType(Sub(max_value, unsigned_value), signed_type);
+  auto is_negative =
+      Lt(signed_value,
+         ConstantLiteral(value.builder(), LiteralUtil::Zero(signed_type)));
+  return Select(is_negative, flipped_value, signed_value);
+}
+}  // namespace
+
 XlaOp XlaBuilder::Sort(const XlaOp& keys, absl::Span<const XlaOp> values,
                        int64 dimension) {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    std::vector<XlaOp> operands{keys};
+    for (const XlaOp& value : values) {
+      operands.push_back(value);
+    }
+    // Build the default less-than comparator (copied from lib/comparators.cc).
+    // TODO(b/122298745): Remove the deprecated API method so that this code
+    // duplication can be deleted.
+    auto b = this->CreateSubBuilder("comparator");
+    std::vector<PrimitiveType> operand_types;
+    for (const XlaOp& operand : operands) {
+      TF_ASSIGN_OR_RETURN(auto operand_shape, GetShape(operand));
+      operand_types.push_back(operand_shape.element_type());
+    }
+
+    int64 parameter_count = 0;
+    XlaOp first_lhs_param;
+    XlaOp first_rhs_param;
+
+    for (auto operand_type : operand_types) {
+      auto scalar_shape = ShapeUtil::MakeShape(operand_type, {});
+      auto lhs_param =
+          b->Parameter(parameter_count * 2, scalar_shape,
+                       absl::StrCat("p.", parameter_count, ".lhs"));
+      auto rhs_param =
+          b->Parameter(parameter_count * 2 + 1, scalar_shape,
+                       absl::StrCat("p.", parameter_count, ".rhs"));
+      if (parameter_count == 0) {
+        first_lhs_param = lhs_param;
+        first_rhs_param = rhs_param;
+      }
+      ++parameter_count;
+    }
+    if (primitive_util::IsFloatingPointType(operand_types[0])) {
+      PrimitiveType compare_type = operand_types[0];
+      // Special-case handling for BF16. We currently do not support direct
+      // comparisons with BF16, so we convert to F32 and then use the F32
+      // comparison logic.
+      if (compare_type == BF16) {
+        compare_type = F32;
+        first_lhs_param = b->ConvertElementType(first_lhs_param, F32);
+        first_rhs_param = b->ConvertElementType(first_rhs_param, F32);
+      }
+      int64 bit_width = primitive_util::BitWidth(compare_type);
+      first_lhs_param =
+          BitcastConvertFloatingPointToIntegral(first_lhs_param, bit_width);
+      first_rhs_param =
+          BitcastConvertFloatingPointToIntegral(first_rhs_param, bit_width);
+    }
+    Lt(first_lhs_param, first_rhs_param);
+
+    TF_ASSIGN_OR_RETURN(auto comparator, b->Build());
+    return Sort(operands, comparator, dimension, /*is_stable=*/false);
+  });
+}
+
+XlaOp XlaBuilder::Sort(absl::Span<const XlaOp> operands,
+                       const XlaComputation& comparator, int64 dimension,
+                       bool is_stable) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
+    instr.set_is_stable(is_stable);
     std::vector<const Shape*> operand_shape_ptrs;
-    TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(keys));
-    operand_shape_ptrs.push_back(&keys_shape);
-    TF_ASSIGN_OR_RETURN(std::vector<Shape> values_shapes,
-                        GetOperandShapes(values));
-    absl::c_transform(values_shapes, std::back_inserter(operand_shape_ptrs),
+    TF_ASSIGN_OR_RETURN(std::vector<Shape> operand_shapes,
+                        GetOperandShapes(operands));
+    absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
                       [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferVariadicOpShape(
                                          HloOpcode::kSort, operand_shape_ptrs));
     *instr.mutable_shape() = shape.ToProto();
     if (dimension == -1) {
-      TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(keys));
+      TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(operands[0]));
       dimension = keys_shape.rank() - 1;
     }
     instr.add_dimensions(dimension);
-    std::vector<XlaOp> operands{keys};
-    operands.insert(operands.end(), values.begin(), values.end());
+    AddCalledComputation(comparator, &instr);
     return AddInstruction(std::move(instr), HloOpcode::kSort, operands);
   });
 }
 
-XlaOp XlaBuilder::Pow(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kPower, lhs, rhs, broadcast_dimensions);
-}
-
 XlaOp XlaBuilder::ConvertElementType(const XlaOp& operand,
                                      PrimitiveType new_element_type) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
@@ -1775,10 +1717,6 @@ XlaOp XlaBuilder::BitcastConvertType(const XlaOp& operand,
   });
 }
 
-XlaOp XlaBuilder::Neg(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kNegate, operand);
-}
-
 XlaOp XlaBuilder::Clamp(const XlaOp& min, const XlaOp& operand,
                         const XlaOp& max) {
   return TernaryOp(HloOpcode::kClamp, min, operand, max);
@@ -2159,8 +2097,8 @@ XlaOp XlaBuilder::CrossReplicaSum(
     TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
     const Shape& scalar_shape = ShapeUtil::MakeShape(shape.element_type(), {});
     auto b = CreateSubBuilder("sum");
-    b->Add(b->Parameter(/*parameter_number=*/0, scalar_shape, "x"),
-           b->Parameter(/*parameter_number=*/1, scalar_shape, "y"));
+    Add(b->Parameter(/*parameter_number=*/0, scalar_shape, "x"),
+        b->Parameter(/*parameter_number=*/1, scalar_shape, "y"));
     TF_ASSIGN_OR_RETURN(auto computation, b->Build());
     return CrossReplicaSum(operand, computation, replica_groups,
                            /*channel_id=*/absl::nullopt);
@@ -2956,32 +2894,38 @@ XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index) {
 
 XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Eq(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kEq, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Ne(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kNe, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Ge(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kGe, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Gt(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kGt, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
-XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Lt(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kLe, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
-XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Le(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kLt, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
@@ -3055,6 +2999,29 @@ XlaOp Fft(const XlaOp& operand, FftType fft_type,
   return operand.builder()->Fft(operand, fft_type, fft_length);
 }
 
+XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                      bool unit_diagonal,
+                      TriangularSolveOptions::Transpose transpose_a) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& a_shape, builder->GetShape(a));
+    TF_ASSIGN_OR_RETURN(const Shape& b_shape, builder->GetShape(b));
+    xla::TriangularSolveOptions& options =
+        *instr.mutable_triangular_solve_options();
+    options.set_left_side(left_side);
+    options.set_lower(lower);
+    options.set_unit_diagonal(unit_diagonal);
+    options.set_transpose_a(transpose_a);
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferTriangularSolveShape(
+                                         a_shape, b_shape, options));
+    *instr.mutable_shape() = shape.ToProto();
+
+    return builder->AddInstruction(std::move(instr),
+                                   HloOpcode::kTriangularSolve, {a, b});
+  });
+}
+
 XlaOp Infeed(XlaBuilder* builder, const Shape& shape, const string& config) {
   return builder->Infeed(shape, config);
 }
@@ -3084,78 +3051,96 @@ XlaOp CustomCallWithLayout(XlaBuilder* builder, const string& call_target_name,
                              operand_shapes_with_layout);
 }
 
-XlaOp Complex(const XlaOp& real, const XlaOp& imag,
+XlaOp Complex(const XlaOp& lhs, const XlaOp& rhs,
               absl::Span<const int64> broadcast_dimensions) {
-  return real.builder()->Complex(real, imag, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kComplex, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
-XlaOp Conj(const XlaOp& operand) { return operand.builder()->Conj(operand); }
+XlaOp Conj(const XlaOp& operand) {
+  return Complex(Real(operand), Neg(Imag(operand)));
+}
 
 XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Add(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kAdd, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Sub(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kSubtract, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Mul(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kMultiply, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Div(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kDivide, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Rem(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kRemainder, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Max(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kMaximum, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Min(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kMinimum, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->And(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kAnd, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Or(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kOr, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Xor(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kXor, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
-XlaOp Not(const XlaOp& operand) { return operand.builder()->Not(operand); }
+XlaOp Not(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kNot, operand);
+}
 
 XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
                 absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->ShiftLeft(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kShiftLeft, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp ShiftRightArithmetic(const XlaOp& lhs, const XlaOp& rhs,
                            absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->ShiftRightArithmetic(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kShiftRightArithmetic, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs,
                         absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->ShiftRightLogical(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kShiftRightLogical, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
@@ -3250,48 +3235,67 @@ XlaOp SelectAndScatterWithGeneralPadding(
       init_value, scatter);
 }
 
-XlaOp Abs(const XlaOp& operand) { return operand.builder()->Abs(operand); }
+XlaOp Abs(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kAbs, operand);
+}
 
-XlaOp Atan2(const XlaOp& y, const XlaOp& x,
+XlaOp Atan2(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions) {
-  return y.builder()->Atan2(y, x, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kAtan2, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
-XlaOp Exp(const XlaOp& operand) { return operand.builder()->Exp(operand); }
-
-XlaOp Expm1(const XlaOp& operand) { return operand.builder()->Expm1(operand); }
-
-XlaOp Floor(const XlaOp& operand) { return operand.builder()->Floor(operand); }
-
-XlaOp Ceil(const XlaOp& operand) { return operand.builder()->Ceil(operand); }
-
-XlaOp Round(const XlaOp& operand) { return operand.builder()->Round(operand); }
-
-XlaOp Log(const XlaOp& operand) { return operand.builder()->Log(operand); }
-
-XlaOp Log1p(const XlaOp& operand) { return operand.builder()->Log1p(operand); }
-
-XlaOp Sign(const XlaOp& operand) { return operand.builder()->Sign(operand); }
-
-XlaOp Clz(const XlaOp& operand) { return operand.builder()->Clz(operand); }
-
-XlaOp Cos(const XlaOp& operand) { return operand.builder()->Cos(operand); }
-
-XlaOp Sin(const XlaOp& operand) { return operand.builder()->Sin(operand); }
-
-XlaOp Tanh(const XlaOp& operand) { return operand.builder()->Tanh(operand); }
-
-XlaOp Real(const XlaOp& operand) { return operand.builder()->Real(operand); }
-
-XlaOp Imag(const XlaOp& operand) { return operand.builder()->Imag(operand); }
+XlaOp Exp(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kExp, operand);
+}
+XlaOp Expm1(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kExpm1, operand);
+}
+XlaOp Floor(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kFloor, operand);
+}
+XlaOp Ceil(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kCeil, operand);
+}
+XlaOp Round(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kRoundNearestAfz, operand);
+}
+XlaOp Log(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kLog, operand);
+}
+XlaOp Log1p(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kLog1p, operand);
+}
+XlaOp Sign(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kSign, operand);
+}
+XlaOp Clz(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kClz, operand);
+}
+XlaOp Cos(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kCos, operand);
+}
+XlaOp Sin(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kSin, operand);
+}
+XlaOp Tanh(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kTanh, operand);
+}
+XlaOp Real(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kReal, operand);
+}
+XlaOp Imag(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kImag, operand);
+}
 
 XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Pow(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kPower, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp IsFinite(const XlaOp& operand) {
-  return operand.builder()->IsFinite(operand);
+  return operand.builder()->UnaryOp(HloOpcode::kIsFinite, operand);
 }
 
 XlaOp ConvertElementType(const XlaOp& operand, PrimitiveType new_element_type) {
@@ -3302,7 +3306,9 @@ XlaOp BitcastConvertType(const XlaOp& operand, PrimitiveType new_element_type) {
   return operand.builder()->BitcastConvertType(operand, new_element_type);
 }
 
-XlaOp Neg(const XlaOp& operand) { return operand.builder()->Neg(operand); }
+XlaOp Neg(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kNegate, operand);
+}
 
 XlaOp Transpose(const XlaOp& operand, absl::Span<const int64> permutation) {
   return operand.builder()->Transpose(operand, permutation);
@@ -3316,6 +3322,12 @@ XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values, int64 dimension) {
   return keys.builder()->Sort(keys, values, dimension);
 }
 
+XlaOp Sort(absl::Span<const XlaOp> operands, const XlaComputation& comparator,
+           int64 dimension, bool is_stable) {
+  return operands[0].builder()->Sort(operands, comparator, dimension,
+                                     is_stable);
+}
+
 XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max) {
   return min.builder()->Clamp(min, operand, max);
 }
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index c429035ad0f96928525219a5506df81d64ffef95..b5bdb75998a4a0bfce3607715963c1609bc45ef8 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -56,6 +56,9 @@ class XlaOp {
   }
   ~XlaOp() = default;
 
+  XlaOp(const XlaOp& other) = default;
+  XlaOp& operator=(const XlaOp& other) = default;
+
   // Precondition: !IsUninitialized().
   //
   // It's very common to do foo.builder()->bar().  Without this precondition, if
@@ -235,6 +238,10 @@ class XlaBuilder {
   // See also set_die_immediately_on_error().
   Status first_error() const { return first_error_; }
 
+  // Returns the current status of the builder, complete with the stack trace
+  // information.
+  Status GetCurrentStatus() const;
+
   // Returns the shape of the given op.
   StatusOr<Shape> GetShape(const XlaOp& op) const;
 
@@ -315,38 +322,6 @@ class XlaBuilder {
 
   XlaOp ConstantLiteral(const LiteralSlice& literal);
 
-  template <typename NativeT>
-  XlaOp ConstantR0(NativeT value);
-  template <typename NativeT>
-  XlaOp ConstantR1(absl::Span<const NativeT> values);
-  XlaOp ConstantR1(const tensorflow::core::Bitmap& values);
-  template <typename NativeT>
-  XlaOp ConstantR2(
-      std::initializer_list<std::initializer_list<NativeT>> values);
-  template <typename NativeT>
-  XlaOp ConstantFromArrayWithLayout(const Array<NativeT>& values,
-                                    const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantFromArray(const Array<NativeT>& values);
-  template <typename NativeT>
-  XlaOp ConstantR2FromArray2DWithLayout(const Array2D<NativeT>& values,
-                                        const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantR2FromArray2D(const Array2D<NativeT>& values);
-  template <typename NativeT>
-  XlaOp ConstantR3FromArray3DWithLayout(const Array3D<NativeT>& values,
-                                        const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantR3FromArray3D(const Array3D<NativeT>& values);
-  template <typename NativeT>
-  XlaOp ConstantR4FromArray4DWithLayout(const Array4D<NativeT>& values,
-                                        const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantR4FromArray4D(const Array4D<NativeT>& values);
-
-  template <typename NativeT>
-  XlaOp ConstantR1(int64 length, NativeT value);
-
   XlaOp Broadcast(const XlaOp& operand,
                   absl::Span<const int64> broadcast_sizes);
 
@@ -394,24 +369,6 @@ class XlaBuilder {
 
   XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
 
-  XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
   XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
             const PrecisionConfig* precision_config = nullptr);
 
@@ -476,50 +433,6 @@ class XlaBuilder {
       const Shape& shape_with_layout, const string& opaque,
       absl::optional<absl::Span<const Shape>> operand_shapes_with_layout);
 
-  XlaOp Complex(const XlaOp& real, const XlaOp& imag,
-                absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Conj(const XlaOp& operand);
-
-  XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Not(const XlaOp& operand);
-
-  XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
-                  absl::Span<const int64> broadcast_dimensions = {});
-  XlaOp ShiftRightArithmetic(const XlaOp& lhs, const XlaOp& rhs,
-                             absl::Span<const int64> broadcast_dimensions = {});
-  XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs,
-                          absl::Span<const int64> broadcast_dimensions = {});
-
   XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
                const XlaComputation& computation,
                absl::Span<const int64> dimensions_to_reduce);
@@ -578,44 +491,6 @@ class XlaBuilder {
       absl::Span<const std::pair<int64, int64>> padding, const XlaOp& source,
       const XlaOp& init_value, const XlaComputation& scatter);
 
-  XlaOp Abs(const XlaOp& operand);
-
-  XlaOp Atan2(const XlaOp& y, const XlaOp& x,
-              absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Exp(const XlaOp& operand);
-
-  XlaOp Expm1(const XlaOp& operand);
-
-  XlaOp Floor(const XlaOp& operand);
-
-  XlaOp Ceil(const XlaOp& operand);
-
-  XlaOp Round(const XlaOp& operand);
-
-  XlaOp Log(const XlaOp& operand);
-
-  XlaOp Log1p(const XlaOp& operand);
-
-  XlaOp Sign(const XlaOp& operand);
-
-  XlaOp Clz(const XlaOp& operand);
-
-  XlaOp Cos(const XlaOp& operand);
-
-  XlaOp Sin(const XlaOp& operand);
-
-  XlaOp Tanh(const XlaOp& operand);
-
-  XlaOp Real(const XlaOp& operand);
-
-  XlaOp Imag(const XlaOp& operand);
-
-  XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp IsFinite(const XlaOp& operand);
-
   XlaOp Iota(const Shape& shape, int64 iota_dimension);
 
   XlaOp Iota(PrimitiveType type, int64 size);
@@ -626,14 +501,15 @@ class XlaBuilder {
   XlaOp BitcastConvertType(const XlaOp& operand,
                            PrimitiveType new_element_type);
 
-  XlaOp Neg(const XlaOp& operand);
-
   XlaOp Transpose(const XlaOp& operand, absl::Span<const int64> permutation);
 
   XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
 
+  ABSL_DEPRECATED("Use form with comparator computation instead")
   XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values = {},
              int64 dimension = -1);
+  XlaOp Sort(absl::Span<const XlaOp> operands, const XlaComputation& comparator,
+             int64 dimension = -1, bool is_stable = false);
 
   XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
 
@@ -825,48 +701,6 @@ class XlaBuilder {
                          const Shape& shape, const string& name);
   friend XlaOp ConstantLiteral(XlaBuilder* builder,
                                const LiteralSlice& literal);
-  template <typename NativeT>
-  friend XlaOp ConstantR0(XlaBuilder* builder, NativeT value);
-  template <typename NativeT>
-  friend XlaOp ConstantR1(XlaBuilder* builder,
-                          absl::Span<const NativeT> values);
-  friend XlaOp ConstantR1(XlaBuilder* builder,
-                          const tensorflow::core::Bitmap& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR2(
-      XlaBuilder* builder,
-      std::initializer_list<std::initializer_list<NativeT>> values);
-  template <typename NativeT>
-  friend XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
-                                           const Array<NativeT>& values,
-                                           const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantFromArray(XlaBuilder* builder,
-                                 const Array<NativeT>& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
-                                               const Array2D<NativeT>& values,
-                                               const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
-                                     const Array2D<NativeT>& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
-                                               const Array3D<NativeT>& values,
-                                               const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
-                                     const Array3D<NativeT>& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
-                                               const Array4D<NativeT>& values,
-                                               const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
-                                     const Array4D<NativeT>& values);
-
-  template <typename NativeT>
-  friend XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value);
 
   friend XlaOp Broadcast(const XlaOp& operand,
                          absl::Span<const int64> broadcast_sizes);
@@ -964,6 +798,9 @@ class XlaBuilder {
       const PrecisionConfig* precision_config);
   friend XlaOp Fft(const XlaOp& operand, FftType fft_type,
                    absl::Span<const int64> fft_length);
+  friend XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                               bool unit_diagonal,
+                               TriangularSolveOptions::Transpose transpose_a);
   friend XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
                       const string& config);
   friend void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
@@ -1089,6 +926,9 @@ class XlaBuilder {
   friend XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
   friend XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values,
                     int64 dimension);
+  friend XlaOp Sort(absl::Span<const XlaOp> operands,
+                    const XlaComputation& comparator, int64 dimension,
+                    bool is_stable);
   friend XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
   friend XlaOp Map(XlaBuilder* builder, absl::Span<const XlaOp> operands,
                    const XlaComputation& computation,
@@ -1484,6 +1324,32 @@ XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs,
 XlaOp Fft(const XlaOp& operand, FftType fft_type,
           absl::Span<const int64> fft_length);
 
+// Solves systems of linear equations with lower or upper triangular coefficient
+// matrices by forward- or back-substitution. Broadcasting along leading
+// dimensions, this routine solves for x in one of the matrix systems
+//   `op(a) * x = b`,  or `x * op(a) = b`,
+// for the variable `x` given `a` and `b`, where `op(a)` is either
+//   `op(a) = a`,  or `op(a) = transpose(a)`,  or `op(a) = conj(transpose(a))`.
+//
+// * `a` is a tensor of shape `[..., M, M]` whose innermost 2 dimensions form
+//   square matrices. If `lower` is true (false), then the strictly upper
+//   (lower) triangular part of each innermost matrix in `a` is assumed to be
+//   zero and is not accessed.
+// * `b` is a tensor of shape `[..., M, K]` if `left_side` is true, otherwise a
+//   tensor of shape `[..., K, M]`.
+// * `left_side` is a boolean, indicating whether to solve a system of the form
+//   op(a) * x = b (true) or x * op(a) = b (false).
+// * `lower` is a boolean, indicating whether the argument `a` is
+// lower-triangular
+//   (true) or upper-triangular (false).
+// * If `unit_diagonal` is true, the diagonal elements of `a` are assumed to be
+//   1 and not accessed.
+// * `transpose_a` indicates which function `op` we use to transform the tensor
+//   `a`: the identity function, transpose(a), or conjugate(transpose(a))
+XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                      bool unit_diagonal,
+                      TriangularSolveOptions::Transpose transpose_a);
+
 // Enqueues an infeed instruction onto the computation, which writes data of
 // the given shape to the infeed buffer of the device.
 XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
@@ -1819,7 +1685,7 @@ XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
 // of keys, in ascending order.
 // * If the keys have higher rank, the keys are sorted along the provided
 // dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
-// value of 0 will indepenently sort every column, and a dimension value of 1
+// value of 0 will independently sort every column, and a dimension value of 1
 // will independently sort each row. If no dimension number is provided, then
 // the last dimension is chosen by default.
 //
@@ -1829,9 +1695,39 @@ XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
 // * The result is a tuple that consists of a sorted tensor of keys (along the
 // provided dimension, as above) as the first element, and tensors with their
 // corresponding values as the other elements.
+ABSL_DEPRECATED("Use form with comparator computation instead")
 XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values = {},
            int64 dimension = -1);
 
+// Enqueues a sort instruction onto the computation, using 'comparator' for
+// comparisons. 'comparator' needs to define a strict weak order. 'is_stable'
+// determines whether the stable sorting should be used.
+// If only one operand is provided:
+// * If the operand is a rank-1 tensor (an array), the result is a sorted array.
+//   The resulting sorting order has the property that for all index positions
+//   i, j with i < j, either
+//   comparator(value[i], value[j]) = comparator(value[j], value[i]) = false or
+//   comparator(value[i], value[j]) = true.
+// * If the operand has higher rank, the operand is sorted along the provided
+//   dimension. For example, for a rank-2 tensor (a matrix), a dimension value
+//   of 0 will independently sort every column, and a dimension value of 1 will
+//   independently sort each row. If no dimension number is provided, then the
+//   last dimension is chosen by default. For the dimension which is sorted, the
+//   same sorting order applies as in the rank-1 case.
+//
+// If more than one operand is provided:
+// * All operands must be tensors with the same dimensions. The element types of
+//   the tensors may be different.
+// * The result is a tuple that consists of the operands in sorted order (along
+//   the provided dimension, as above). The same permutation as implied by the
+//   comparison computation is applied to all operand tensors. When comparing
+//   two index positions, 'comparator' is called with 2 * n scalar parameters,
+//   where parameter 2 * i and 2 * i + 1 correspond to the value of operand i at
+//   two index positions.
+// Default comparator computations can be found in lib/comparators.h
+XlaOp Sort(absl::Span<const XlaOp> operands, const XlaComputation& comparator,
+           int64 dimension = -1, bool is_stable = false);
+
 // Enqueues a clamp instruction onto the computation.
 XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
 
@@ -1970,81 +1866,6 @@ XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension);
 // Implementation details below this point.
 //
 
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR0(NativeT value) {
-  return ConstantLiteral(LiteralUtil::CreateR0<NativeT>(value));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR1(absl::Span<const NativeT> values) {
-  return ConstantLiteral(LiteralUtil::CreateR1<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR1(int64 length, NativeT value) {
-  Literal literal(ShapeUtil::MakeShape(
-      primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
-  literal.PopulateWithValue(value);
-  return ConstantLiteral(literal);
-}
-
-inline XlaOp XlaBuilder::ConstantR1(const tensorflow::core::Bitmap& values) {
-  return ConstantLiteral(LiteralUtil::CreateR1(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2(
-    std::initializer_list<std::initializer_list<NativeT>> values) {
-  return ConstantLiteral(LiteralUtil::CreateR2<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantFromArrayWithLayout(const Array<NativeT>& values,
-                                              const Layout& layout) {
-  return ConstantLiteral(
-      LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantFromArray(const Array<NativeT>& values) {
-  return ConstantLiteral(LiteralUtil::CreateFromArray<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2FromArray2DWithLayout(
-    const Array2D<NativeT>& values, const Layout& layout) {
-  return ConstantLiteral(
-      LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2FromArray2D(const Array2D<NativeT>& values) {
-  return ConstantLiteral(LiteralUtil::CreateR2FromArray2D<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR3FromArray3DWithLayout(
-    const Array3D<NativeT>& values, const Layout& layout) {
-  return ConstantLiteral(
-      LiteralUtil::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR3FromArray3D(const Array3D<NativeT>& values) {
-  return ConstantFromArray(values);
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR4FromArray4DWithLayout(
-    const Array4D<NativeT>& values, const Layout& layout) {
-  return ConstantFromArrayWithLayout(values, layout);
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D<NativeT>& values) {
-  return ConstantFromArray(values);
-}
-
 // Free function template implementations.
 
 template <typename NativeT>
diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc
index a9a91648ac377987e7f226116e11c9c697ace103..43d9ee0d9a5e689676b00e59d7c59bb0f4e37461 100644
--- a/tensorflow/compiler/xla/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/debug_options_flags.cc
@@ -128,11 +128,6 @@ static void AllocateFlags() {
       tensorflow::Flag(
           "xla_hlo_graph_path", flag_values->mutable_xla_hlo_graph_path(),
           "With xla_generate_hlo_graph, dump the graphs into this path."),
-      tensorflow::Flag(
-          "xla_hlo_dump_as_graphdef",
-          bool_setter_for(&DebugOptions::set_xla_hlo_dump_as_graphdef),
-          flag_values->xla_hlo_dump_as_graphdef(),
-          "Dump HLO graphs as TensorFlow GraphDefs."),
       tensorflow::Flag("xla_hlo_dump_as_html",
                        bool_setter_for(&DebugOptions::set_xla_hlo_dump_as_html),
                        flag_values->xla_hlo_dump_as_html(),
@@ -144,13 +139,6 @@ static void AllocateFlags() {
           flag_values->xla_hlo_graph_sharding_color(),
           "Assign colors based on sharding assignments when generating the "
           "HLO graphs."),
-      tensorflow::Flag(
-          "xla_hlo_tfgraph_device_scopes",
-          bool_setter_for(&DebugOptions::set_xla_hlo_tfgraph_device_scopes),
-          flag_values->xla_hlo_tfgraph_device_scopes(),
-          "When generating TensorFlow HLO graphs, if the HLO instructions "
-          "are assigned to a specific device, prefix the name scope with "
-          "\"devX\" with X being the device ordinal."),
       tensorflow::Flag(
           "xla_log_hlo_text", flag_values->mutable_xla_log_hlo_text(),
           "HLO modules matching this regex will be dumped to LOG(INFO)."),
diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index 0f9b591c70d4fd96147958d18bd5fb7dd78a7f3f..230f3b202a4b531c381665471c3856c3feba5a3a 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -77,7 +77,7 @@ ExecutionProfile* ExecutableRunOptions::execution_profile() const {
 }
 
 ExecutableRunOptions& ExecutableRunOptions::set_device_assignment(
-    DeviceAssignment* device_assignment) {
+    const DeviceAssignment* device_assignment) {
   device_assignment_ = device_assignment;
   return *this;
 }
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index 6f36d11dfb34eb27e79ea4ff797d35f80fb44b27..1e744953bd3be58afba5b81c0e2a8ba26665f9c4 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -74,7 +74,7 @@ class ExecutableRunOptions {
   ExecutableRunOptions& set_execution_profile(ExecutionProfile* profile);
 
   ExecutableRunOptions& set_device_assignment(
-      DeviceAssignment* device_assignment);
+      const DeviceAssignment* device_assignment);
   const DeviceAssignment* device_assignment() const;
 
   ExecutableRunOptions& set_rng_seed(int rng_seed);
@@ -83,7 +83,7 @@ class ExecutableRunOptions {
  private:
   DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
-  DeviceAssignment* device_assignment_ = nullptr;
+  const DeviceAssignment* device_assignment_ = nullptr;
   stream_executor::Stream* stream_ = nullptr;
   const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr;
   ExecutionProfile* execution_profile_ = nullptr;
diff --git a/tensorflow/compiler/xla/g3doc/_book.yaml b/tensorflow/compiler/xla/g3doc/_book.yaml
index 267701e9c0e42a21d2cda6238520f6a9692e7e76..d756cd74c98b98a6fda099690d966562bd694e2c 100644
--- a/tensorflow/compiler/xla/g3doc/_book.yaml
+++ b/tensorflow/compiler/xla/g3doc/_book.yaml
@@ -25,6 +25,8 @@ upper_tabs:
         path: /xla/operation_semantics
       - title: Shapes and layout
         path: /xla/shapes
+      - title: Tiled layout
+        path: /xla/tiled_layout
       - title: Using AOT compilation
         path: /xla/tfcompile
       - heading: Tutorials
diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md
index 363fd17b69bfbe54d486e367d9bf5cc0eee4205e..db90d184b5218614ac49363ebf2a7e25fffe44de 100644
--- a/tensorflow/compiler/xla/g3doc/operation_semantics.md
+++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md
@@ -1186,7 +1186,7 @@ if and only if the corresponding input element is finite.
 
 <b>`Sign(operand)`</b> Element-wise sign operation `x -> sgn(x)` where
 
-$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ 0 & x = 0\\ 1 & x > 0 \end{cases}$$
+$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ -0 & x = -0\\ NaN & x = NaN\\ +0 & x = +0\\ 1 & x > 0 \end{cases}$$
 
 using the comparison operator of the element type of `operand`.
 
diff --git a/tensorflow/compiler/xla/g3doc/layout_with_tiling.md b/tensorflow/compiler/xla/g3doc/tiled_layout.md
similarity index 96%
rename from tensorflow/compiler/xla/g3doc/layout_with_tiling.md
rename to tensorflow/compiler/xla/g3doc/tiled_layout.md
index 5e990851af7495ebd4417e44f1d955fcc14dadf1..21e88ceab6208cdf940826d769fd93713044d5a0 100644
--- a/tensorflow/compiler/xla/g3doc/layout_with_tiling.md
+++ b/tensorflow/compiler/xla/g3doc/tiled_layout.md
@@ -1,9 +1,7 @@
 # Tiled layout
 
-*Note: This doc describes how tiled layout is intended to work. Tiling is being
-implemented, but this is an early effort and it is currently not even guaranteed
-to get an Unimplemented error if one tries to use tiling - it may be just
-silently ignored.*
+Caution: Tiled layout is *pre-release* and this describes how it's intended to
+work. Errors may be silently ignored.
 
 <center> ![](images/xla_array_layout_figure1.png)
 
diff --git a/tensorflow/compiler/xla/index_util.cc b/tensorflow/compiler/xla/index_util.cc
index 7e22a32e545e4155545ffcfb9582187eadec3a82..eebd8245abe759b71b3fe732943761325ea04b81 100644
--- a/tensorflow/compiler/xla/index_util.cc
+++ b/tensorflow/compiler/xla/index_util.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/layout.cc b/tensorflow/compiler/xla/layout.cc
index e3b5fcd5274881cec31ecf906e3461685f82a1f4..000c4fdc40519214fa9fa721a8987b77b534442b 100644
--- a/tensorflow/compiler/xla/layout.cc
+++ b/tensorflow/compiler/xla/layout.cc
@@ -30,7 +30,19 @@ TileProto Tile::ToProto() const {
 }
 
 string Tile::ToString() const {
-  return absl::StrCat("(", absl::StrJoin(dimensions(), ","), ")");
+  std::vector<string> elements;
+  for (auto dim : dimensions()) {
+    if (dim >= 0) {
+      elements.push_back(std::to_string(dim));
+    } else {
+      if (dim == kCombineDimension) {
+        elements.push_back("*");
+      } else {
+        elements.push_back(absl::StrCat("Invalid value ", dim));
+      }
+    }
+  }
+  return absl::StrCat("(", absl::StrJoin(elements, ","), ")");
 }
 
 /* static */ Layout Layout::CreateFromProto(const LayoutProto& proto) {
@@ -64,23 +76,43 @@ LayoutProto Layout::ToProto() const {
 }
 
 string Layout::ToString() const {
-  // TODO(b/119839262): Emit tiles in string.
   if (format() == SPARSE) {
+    CHECK_EQ(tiles_size(), 0) << "Sparse layout should not be tiled.";
     return absl::StrCat("sparse{", max_sparse_elements(), "}");
   } else if (format() == DENSE) {
-    return absl::StrCat("{", absl::StrJoin(minor_to_major(), ","), "}");
+    string colon_string = tiles().empty() ? "" : "T";
+    for (Tile tile : tiles()) {
+      absl::StrAppend(&colon_string, tile.ToString());
+    }
+    if (element_size_in_bits() != 0) {
+      absl::StrAppend(&colon_string, "E(", element_size_in_bits(), ")");
+    }
+    return absl::StrCat("{", absl::StrJoin(minor_to_major(), ","),
+                        colon_string.empty() ? "" : ":", colon_string, "}");
   } else {
     CHECK_EQ(format(), INVALID_FORMAT);
     return "invalid{}";
   }
 }
 
+bool Layout::Equal::operator()(const Layout& lhs, const Layout& rhs) {
+  if (lhs.format() != rhs.format() ||
+      lhs.minor_to_major() != rhs.minor_to_major() ||
+      lhs.max_sparse_elements() != rhs.max_sparse_elements()) {
+    return false;
+  }
+  if (!ignore_tiles_ && lhs.tiles() != rhs.tiles()) {
+    return false;
+  }
+  if (!ignore_element_size_ &&
+      lhs.element_size_in_bits() != rhs.element_size_in_bits()) {
+    return false;
+  }
+  return true;
+}
+
 bool Layout::operator==(const Layout& other) const {
-  return (other.format() == format() &&
-          other.minor_to_major() == minor_to_major() &&
-          other.element_size_in_bits() == element_size_in_bits() &&
-          other.max_sparse_elements() == max_sparse_elements() &&
-          other.tiles() == tiles());
+  return Equal()(*this, other);
 }
 
 std::ostream& operator<<(std::ostream& out, const Tile& tile) {
diff --git a/tensorflow/compiler/xla/layout.h b/tensorflow/compiler/xla/layout.h
index 313368c39e4c976fc481941eb17325101f2ba69a..acc449b781b503142b24ed7229e3559230bb1599 100644
--- a/tensorflow/compiler/xla/layout.h
+++ b/tensorflow/compiler/xla/layout.h
@@ -55,6 +55,20 @@ class Tile {
   // Returns the dimensions of the tile.
   const std::vector<int64>& dimensions() const { return dimensions_; }
 
+  Tile& add_dimensions(int64 value) {
+    dimensions_.push_back(value);
+    return *this;
+  }
+
+  Tile& clear_dimensions() {
+    dimensions_.clear();
+    return *this;
+  }
+
+  // This dimension size means the corresponding dimension in the shape is
+  // combined with the next minor dimension before tiling is applied.
+  static constexpr int64 kCombineDimension = std::numeric_limits<int64>::min();
+
  private:
   // The bounds of the tile.
   std::vector<int64> dimensions_;
@@ -71,10 +85,12 @@ class Layout {
 
   // Constructs a dense tiled layout with the given minor-to-major order and
   // tiles.
-  Layout(absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles)
+  Layout(absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles,
+         int64 element_size_in_bits = 0)
       : format_(DENSE),
         minor_to_major_(minor_to_major.begin(), minor_to_major.end()),
-        tiles_(tiles.begin(), tiles.end()) {}
+        tiles_(tiles.begin(), tiles.end()),
+        element_size_in_bits_(element_size_in_bits) {}
 
   // Construct a shape from a LayoutProto.
   static Layout CreateFromProto(const LayoutProto& proto);
@@ -85,6 +101,37 @@ class Layout {
   // Returns a human-readable string that represents this layout.
   string ToString() const;
 
+  // Equal is a configurable functor to check the equality of two layouts.
+  //
+  // Examples:
+  //
+  // - Comparing two layouts ignoring their difference in tiles:
+  //   Equal().IgnoreTiles()(layout1, layout2);
+  //
+  // - Comparing two layouts ignoring their difference in tiles and element
+  //   size:
+  //   Equal().IgnoreTiles().IgnoreElementSize()(layout1, layout2);
+  class Equal {
+   public:
+    Equal() = default;
+
+    bool operator()(const Layout& lhs, const Layout& rhs);
+
+    Equal& IgnoreTiles() {
+      ignore_tiles_ = true;
+      return *this;
+    }
+
+    Equal& IgnoreElementSize() {
+      ignore_element_size_ = true;
+      return *this;
+    }
+
+   private:
+    bool ignore_tiles_ = false;
+    bool ignore_element_size_ = false;
+  };
+
   bool operator==(const Layout& other) const;
   bool operator!=(const Layout& other) const { return !(*this == other); }
 
@@ -159,7 +206,7 @@ class Layout {
     element_size_in_bits_ = 0;
   }
 
- public:
+ private:
   // The format of this layout.
   Format format_ = INVALID_FORMAT;
 
@@ -172,11 +219,11 @@ class Layout {
   // memory.  This field must be zero unless the format is SPARSE.
   int64 max_sparse_elements_ = 0;
 
-  // The number of bits used to store an individual array element.
-  int64 element_size_in_bits_ = 0;
-
   // The tiles used in tiling-based layout.
   std::vector<Tile> tiles_;
+
+  // The number of bits used to store an individual array element.
+  int64 element_size_in_bits_ = 0;
 };
 
 std::ostream& operator<<(std::ostream& out, const Tile& Tile);
diff --git a/tensorflow/compiler/xla/layout_test.cc b/tensorflow/compiler/xla/layout_test.cc
index fb6abd3f6523b978e72b21ec082ae06973e86243..f5d71c553ed2e0cfd5d5945144dd476557582b5f 100644
--- a/tensorflow/compiler/xla/layout_test.cc
+++ b/tensorflow/compiler/xla/layout_test.cc
@@ -38,10 +38,13 @@ TEST_F(LayoutTest, ToString) {
             "sparse{123}");
   EXPECT_EQ(Layout({4, 5, 6}).ToString(), "{4,5,6}");
   EXPECT_EQ(Layout({3, 2, 1, 0}, {Tile({42, 123}), Tile({4, 5})}).ToString(),
-            "{3,2,1,0}");
+            "{3,2,1,0:T(42,123)(4,5)}");
   EXPECT_EQ(
       Layout({1, 0}, {Tile({2, 55})}).set_element_size_in_bits(42).ToString(),
-      "{1,0}");
+      "{1,0:T(2,55)E(42)}");
+  EXPECT_EQ(
+      Layout({1, 0}, {Tile({-2, 55})}).set_element_size_in_bits(42).ToString(),
+      "{1,0:T(Invalid value -2,55)E(42)}");
 }
 
 TEST_F(LayoutTest, StreamOut) {
@@ -84,6 +87,15 @@ TEST_F(LayoutTest, Equality) {
             Layout().set_format(SPARSE).set_max_sparse_elements(42));
   EXPECT_NE(Layout().set_format(SPARSE).set_max_sparse_elements(42),
             Layout().set_format(SPARSE).set_max_sparse_elements(24));
+
+  EXPECT_FALSE(
+      Layout::Equal()(Layout({0, 1, 2}, {Tile({42, 44})}), Layout({0, 1, 2})));
+  EXPECT_TRUE(Layout::Equal().IgnoreTiles()(Layout({0, 1, 2}, {Tile({42, 44})}),
+                                            Layout({0, 1, 2})));
+  EXPECT_FALSE(
+      Layout::Equal()(Layout({0, 1, 2}, {}, 32), Layout({0, 1, 2}, {}, 1)));
+  EXPECT_TRUE(Layout::Equal().IgnoreElementSize()(Layout({0, 1, 2}, {}, 32),
+                                                  Layout({0, 1, 2}, {}, 1)));
 }
 
 TEST_F(LayoutTest, LayoutToFromProto) {
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index 2fe9b56c6bdffb931726f60ab75081361b43ebb4..62314118ca9713a04cb4e3cf6ad261b966d85f15 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -54,12 +54,24 @@ void SetDefaultLayoutToContainer(std::vector<int64>* minor_to_major) {
 }  // namespace
 
 /* static */ Layout LayoutUtil::MakeLayout(
-    absl::Span<const int64> minor_to_major) {
+    absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles,
+    int64 element_size_in_bits) {
   Layout layout;
   layout.set_format(DENSE);
   for (int64 dimension_number : minor_to_major) {
     layout.add_minor_to_major(dimension_number);
   }
+  for (Tile tile : tiles) {
+    for (int64 dim : tile.dimensions()) {
+      if (dim < 0 && dim != Tile::kCombineDimension) {
+        LOG(FATAL) << "Tile dimension size needs to be mininum int64 value if "
+                      "it's negative. Value is "
+                   << dim;
+      }
+    }
+    *layout.add_tiles() = tile;
+  }
+  layout.set_element_size_in_bits(element_size_in_bits);
   return layout;
 }
 
@@ -235,6 +247,10 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
       }
       dimensions_in_layout[dim] = true;
     }
+  } else {
+    if (layout.tiles_size() != 0) {
+      return InvalidArgument("Only dense layouts can be tiled.");
+    }
   }
 
   return Status::OK();
diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h
index 609dba67bcdbcb11be0906b7d87a52a17ba0dfbd..9997aef465daa48ee77050e03d97cde0ea2425cc 100644
--- a/tensorflow/compiler/xla/layout_util.h
+++ b/tensorflow/compiler/xla/layout_util.h
@@ -36,7 +36,9 @@ class LayoutUtil {
  public:
   // Creates a layout with the given minor-to-major dimension order. (This is a
   // convenience function for protobuf construction.)
-  static Layout MakeLayout(absl::Span<const int64> minor_to_major);
+  static Layout MakeLayout(absl::Span<const int64> minor_to_major,
+                           absl::Span<const Tile> tiles = {},
+                           int64 element_size_in_bits = 0);
 
   // Similar to MakeLayout, but take indices in reverse order.
   static Layout MakeLayoutFromMajorToMinor(
diff --git a/tensorflow/compiler/xla/layout_util_test.cc b/tensorflow/compiler/xla/layout_util_test.cc
index 4cc94c270cd64eb19761cc1044861c7d185b7888..12da214063676717aa075e66aa54974f4cc2b31b 100644
--- a/tensorflow/compiler/xla/layout_util_test.cc
+++ b/tensorflow/compiler/xla/layout_util_test.cc
@@ -317,6 +317,81 @@ TEST_F(LayoutUtilTest, DefaultLayoutGettersMajorToMinor) {
                             ShapeUtil::MakeShape(F32, {10, 20, 30, 15, 25}))));
 }
 
+TEST_F(LayoutUtilTest, HumanStringWithTiling) {
+  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {2, 3, 4}, {0, 1, 2});
+  Tile* tile;
+
+  // No tiling.
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape), "f32[2,3,4]{0,1,2}");
+
+  // 2D tile.
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(512);
+  tile->add_dimensions(1024);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "f32[2,3,4]{0,1,2:T(512,1024)}");
+
+  // 1D tile.
+  shape.mutable_layout()->clear_tiles();
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(512);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "f32[2,3,4]{0,1,2:T(512)}");
+
+  // 2 tiles.
+  shape = ShapeUtil::MakeShapeWithLayout(BF16, {2, 3, 4}, {1, 2, 0});
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(16);
+  tile->add_dimensions(256);
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(2);
+  tile->add_dimensions(1);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "bf16[2,3,4]{1,2,0:T(16,256)(2,1)}");
+
+  // PRED with element size of 8 bits.
+  shape = ShapeUtil::MakeShapeWithLayout(PRED, {8, 8, 8}, {0, 2, 1});
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(8);
+  tile->add_dimensions(128);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "pred[8,8,8]{0,2,1:T(8,128)}");
+
+  // PRED with element size of 32 bits.
+  shape.mutable_layout()->clear_tiles();
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(8);
+  tile->add_dimensions(128);
+  shape.mutable_layout()->set_element_size_in_bits(32);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "pred[8,8,8]{0,2,1:T(8,128)E(32)}");
+
+  // No tile. PRED with element size of 32 bits.
+  shape.mutable_layout()->clear_tiles();
+  shape.mutable_layout()->set_element_size_in_bits(32);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "pred[8,8,8]{0,2,1:E(32)}");
+
+  // Tile with negative dimension size for combining dimensions.
+  shape = ShapeUtil::MakeShapeWithLayout(BF16, {2, 3, 1004}, {2, 1, 0});
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(2);
+  tile->add_dimensions(Tile::kCombineDimension);
+  tile->add_dimensions(128);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "bf16[2,3,1004]{2,1,0:T(2,*,128)}");
+
+  // Tile with two negative dimensions.
+  shape = ShapeUtil::MakeShapeWithLayout(BF16, {8, 2, 3, 1004}, {3, 2, 1, 0});
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(2);
+  tile->add_dimensions(Tile::kCombineDimension);
+  tile->add_dimensions(Tile::kCombineDimension);
+  tile->add_dimensions(128);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "bf16[8,2,3,1004]{3,2,1,0:T(2,*,*,128)}");
+}
+
 TEST_F(LayoutUtilTest, ValidateLayout_ValidArrayLayout) {
   Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {0, 1});
   auto status =
diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index 8600e8752cfbe072407391559d210d0b49bea511..5cd738d0f7769ceac7eb3bdbc5abd3196d9cf99c 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -44,7 +44,6 @@ namespace xla {
 namespace {
 
 using absl::StrCat;
-using absl::StrFormat;
 
 constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
 
@@ -1628,26 +1627,20 @@ bool LiteralBase::IsAllFloat(float value) const {
           return true;
         }
 
-        auto piece_is_all = [&]() {
-          switch (shape().element_type()) {
-            case F32:
-              return AllElementsEqualValue<float>(piece.data<float>(), value);
-            case F64:
-              return AllElementsEqualValue<double>(piece.data<double>(), value);
-            case F16:
-              return AllElementsEqualValue<half>(piece.data<half>(),
-                                                 static_cast<half>(value));
-            case BF16:
-              return AllElementsEqualValue<bfloat16>(
-                  piece.data<bfloat16>(), static_cast<bfloat16>(value));
-            default:
-              return false;
-          }
-        };
-        if (!piece_is_all()) {
-          return false;
+        switch (shape().element_type()) {
+          case F32:
+            return AllElementsEqualValue<float>(piece.data<float>(), value);
+          case F64:
+            return AllElementsEqualValue<double>(piece.data<double>(), value);
+          case F16:
+            return AllElementsEqualValue<half>(piece.data<half>(),
+                                               static_cast<half>(value));
+          case BF16:
+            return AllElementsEqualValue<bfloat16>(
+                piece.data<bfloat16>(), static_cast<bfloat16>(value));
+          default:
+            return false;
         }
-        return true;
       });
 }
 
diff --git a/tensorflow/compiler/xla/parse_flags_from_env.cc b/tensorflow/compiler/xla/parse_flags_from_env.cc
index 91e71f5d1d02d135158d0dffc140c21cf8ea5e3a..e1e22f784172b5f3850f0bc510322dfad9e7f1bb 100644
--- a/tensorflow/compiler/xla/parse_flags_from_env.cc
+++ b/tensorflow/compiler/xla/parse_flags_from_env.cc
@@ -186,6 +186,14 @@ bool ParseFlagsFromEnvAndDieIfUnknown(
   tensorflow::mutex_lock lock(env_argv_mu);
   auto* env_argv = &EnvArgvs()[string(envvar)];
   SetArgvFromEnv(envvar, env_argv);  // a no-op if already initialized
+
+  if (VLOG_IS_ON(1)) {
+    VLOG(1) << "For env var " << envvar << " found arguments:";
+    for (int i = 0; i < env_argv->argc; i++) {
+      VLOG(1) << "  argv[" << i << "] = " << env_argv->argv[i];
+    }
+  }
+
   bool result =
       tensorflow::Flags::Parse(&env_argv->argc, &env_argv->argv[0], flag_list);
 
diff --git a/tensorflow/compiler/xla/protobuf_util.h b/tensorflow/compiler/xla/protobuf_util.h
index f22fc8b8499dd4a5329276040331a2ed9e89bea9..4a88a48f2857a327aba3600ca72191e5c7b28585 100644
--- a/tensorflow/compiler/xla/protobuf_util.h
+++ b/tensorflow/compiler/xla/protobuf_util.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_
 
+#include "google/protobuf/duration.pb.h"
+#include "absl/time/time.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -43,6 +45,20 @@ Status DumpProtoToDirectory(const tensorflow::protobuf::Message& message,
 // dirpath along as-is.
 void RegisterDirectoryExpander(const std::function<string(string)>& expander);
 
+// Converts an absl::Duration to a google::protobuf::Duration.
+inline google::protobuf::Duration ToDurationProto(absl::Duration duration) {
+  google::protobuf::Duration proto;
+  proto.set_seconds(absl::IDivDuration(duration, absl::Seconds(1), &duration));
+  proto.set_nanos(
+      absl::IDivDuration(duration, absl::Nanoseconds(1), &duration));
+  return proto;
+}
+
+// Converts a google::protobuf::Duration to an absl::Duration.
+inline absl::Duration FromDurationProto(google::protobuf::Duration proto) {
+  return absl::Seconds(proto.seconds()) + absl::Nanoseconds(proto.nanos());
+}
+
 }  // namespace protobuf_util
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index 4afb21d5c8864c2974114af2de08df4106a13a8c..55eacc1c16a76522215d27ac7cf4e801e69c9740 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -59,10 +59,6 @@ cc_library(
     srcs = ["local_computation_builder.cc"],
     hdrs = ["local_computation_builder.h"],
     deps = [
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:client_session",
-        "//tensorflow/cc:ops",
-        "//tensorflow/cc:scope",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
@@ -77,15 +73,38 @@ cc_library(
         "//tensorflow/compiler/xla/client/lib:cholesky",
         "//tensorflow/compiler/xla/client/lib:math",
         "//tensorflow/compiler/xla/client/lib:qr",
-        "//tensorflow/compiler/xla/client/lib:triangular_solve",
+        "//tensorflow/compiler/xla/service:computation_placer",
+        "//tensorflow/compiler/xla/service:hlo_graph_dumper",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
+        "//tensorflow/core:lib",
+        "//third_party/python_runtime:headers",  # buildcleaner: keep
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "xrt",
+    srcs = ["xrt.cc"],
+    hdrs = ["xrt.h"],
+    deps = [
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:client_session",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_proto",
+        "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xrt:xrt_proto",
         "//tensorflow/compiler/xrt/cc:xrt_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//third_party/python_runtime:headers",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
     ],
@@ -93,9 +112,12 @@ cc_library(
 
 tf_py_wrap_cc(
     name = "pywrap_xla",
-    srcs = ["xla.i"],
+    srcs = [
+        "xla.i",
+    ],
     swig_includes = [
         "local_computation_builder.i",
+        "xla_data.i",
         "//tensorflow/python:platform/base.i",
     ],
     version_script = select({
@@ -112,3 +134,27 @@ tf_py_wrap_cc(
         "//tensorflow/compiler/xla/service:cpu_plugin",
     ] + xla_python_default_plugins(),
 )
+
+tf_py_wrap_cc(
+    name = "pywrap_xrt",
+    srcs = [
+        "xrt.i",
+    ],
+    swig_includes = [
+        "xla_data.i",
+        "//tensorflow/python:platform/base.i",
+    ],
+    version_script = select({
+        "//tensorflow:darwin": "pywrap_xla_exported_symbols.lds",
+        "//tensorflow:windows": None,
+        "//conditions:default": "pywrap_xla_version_script.lds",
+    }),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":numpy_bridge",
+        ":xrt",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+    ],
+)
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index ce4bd6f681b80a0c52579f62e3422be81d06076f..c14a01a858af414fc78a5f727372e8fa64cad4b8 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -20,29 +20,22 @@ limitations under the License.
 #include <vector>
 
 #include "absl/memory/memory.h"
-#include "tensorflow/cc/client/client_session.h"
-#include "tensorflow/cc/framework/ops.h"
-#include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/lib/cholesky.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/lib/qr.h"
-#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/computation_placer.h"
 #include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/compiler/xrt/cc/ops/xrt_compile_ops.h"
-#include "tensorflow/compiler/xrt/cc/ops/xrt_execute_op.h"
-#include "tensorflow/compiler/xrt/cc/ops/xrt_state_ops.h"
-#include "tensorflow/compiler/xrt/xrt.pb.h"
-#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/thread_annotations.h"
@@ -51,72 +44,6 @@ limitations under the License.
 namespace xla {
 namespace swig {
 
-// TODO(b/118641336): Factor out XRT parts into a small c++ library of their
-// own.
-
-// TODO(b/34473877) Ideally XLA would support AllReduce among arbitrary sets of
-// device handles instead of needing to set the number of replicas at XLA
-// service initialization time.
-tensorflow::mutex g_local_client_mutex(tensorflow::LINKER_INITIALIZED);
-int g_replica_count GUARDED_BY(g_local_client_mutex) = 1;
-LocalClient* g_local_client GUARDED_BY(g_local_client_mutex) = nullptr;
-
-string* GetPlatformNameString() {
-  static string* platform_name_string PT_GUARDED_BY(g_local_client_mutex) =
-      new string("Host");
-  return platform_name_string;
-}
-
-Status InitializeReplicaCount(int replica_count) {
-  if (replica_count < 1) {
-    return InvalidArgument("Replica count must be >= 1; got %d.",
-                           replica_count);
-  }
-  tensorflow::mutex_lock lock(g_local_client_mutex);
-  if (g_local_client != nullptr) {
-    return FailedPrecondition(
-        "Attempted to set the replica count to %d, but a local XLA service was "
-        "previously created with a replica count of %d.",
-        replica_count, g_replica_count);
-  }
-  g_replica_count = replica_count;
-  return Status::OK();
-}
-
-Status InitializePlatformName(const string& platform_name) {
-  string* g_platform_name = GetPlatformNameString();
-  tensorflow::mutex_lock lock(g_local_client_mutex);
-  if (g_local_client != nullptr) {
-    return FailedPrecondition(
-        "Attempted to set the platform name to %s, but a local XLA service was "
-        "previously created with a platform name of %s.",
-        platform_name, *g_platform_name);
-  }
-  TF_RETURN_IF_ERROR(PlatformUtil::GetPlatform(platform_name).status());
-  *g_platform_name = platform_name;
-  return Status::OK();
-}
-
-int GetReplicaCount() {
-  tensorflow::mutex_lock lock(g_local_client_mutex);
-  return g_replica_count;
-}
-
-StatusOr<LocalClient*> GetOrCreateLocalClient() {
-  string* platform_name = GetPlatformNameString();
-  tensorflow::mutex_lock lock(g_local_client_mutex);
-  if (g_local_client != nullptr) {
-    return g_local_client;
-  }
-  LocalClientOptions options;
-  options.set_platform(PlatformUtil::GetPlatform(*platform_name).ValueOrDie());
-  options.set_number_of_replicas(g_replica_count);
-  TF_ASSIGN_OR_RETURN(g_local_client,
-                      ClientLibrary::GetOrCreateLocalClient(options));
-  CHECK(g_local_client != nullptr);
-  return g_local_client;
-}
-
 Status RegisterCpuCustomCallTarget(const string& fn_name, PyObject* capsule) {
   const char* name = "xla._CPU_CUSTOM_CALL_TARGET";
   if (!PyCapsule_IsValid(capsule, name)) {
@@ -131,62 +58,66 @@ Status RegisterCpuCustomCallTarget(const string& fn_name, PyObject* capsule) {
   return Status::OK();
 }
 
-Status TransferToInfeedLocal(const Literal& literal) {
-  VLOG(1) << "Infeeding literal without replica number; shape: "
-          << literal.shape();
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
-  return client->TransferToInfeedLocal(literal, /*device_ordinal=*/0);
-}
+LocalClient::LocalClient(xla::LocalClient* client) : client_(client) {}
 
-Status TransferToInfeedLocalReplica(const Literal& literal,
-                                    int replica_number) {
-  VLOG(1) << "Infeeding shape " << literal.shape()
-          << " to replica number: " << replica_number;
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
-  TF_ASSIGN_OR_RETURN(int device_ordinal,
-                      client->ReplicaNumberToDeviceOrdinal(replica_number));
-  return client->TransferToInfeedLocal(literal, device_ordinal);
+/* static */ StatusOr<LocalClient> LocalClient::Get(
+    const string& platform_name) {
+  TF_ASSIGN_OR_RETURN(se::Platform * platform,
+                      PlatformUtil::GetPlatform(platform_name));
+  if (platform->VisibleDeviceCount() <= 0) {
+    return InvalidArgument("Platform %s has no visible devices.",
+                           platform_name);
+  }
+  LocalClientOptions options;
+  options.set_platform(platform);
+  TF_ASSIGN_OR_RETURN(xla::LocalClient * client,
+                      ClientLibrary::GetOrCreateLocalClient(options));
+  CHECK(client != nullptr);
+  return LocalClient(client);
 }
 
-StatusOr<Literal> TransferFromOutfeedLocalReplica(const Shape& shape,
-                                                  int replica_number) {
-  VLOG(1) << "Outfeeding literal from replica number: " << replica_number
-          << " shape: " << shape;
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
-  TF_ASSIGN_OR_RETURN(int device_ordinal,
-                      client->ReplicaNumberToDeviceOrdinal(replica_number));
-  return client->TransferFromOutfeedLocal(shape, device_ordinal);
+// Returns the number of devices known to the XLA client.
+int LocalClient::DeviceCount() const { return client_->device_count(); }
+
+Status LocalClient::TransferToInfeed(const Literal& literal,
+                                     int device_ordinal) {
+  VLOG(1) << "Infeeding literal to device " << device_ordinal
+          << "; shape: " << literal.shape();
+  return client_->TransferToInfeed(literal, device_ordinal);
 }
 
-static StatusOr<ScopedShapedBuffer> ToBuffer(LocalClient* client,
-                                             int device_ordinal,
-                                             const Literal& arg) {
-  return client->LiteralToShapedBuffer(arg, device_ordinal,
-                                       client->backend().memory_allocator());
+StatusOr<Literal> LocalClient::TransferFromOutfeed(const Shape& shape,
+                                                   int device_ordinal) {
+  VLOG(1) << "Outfeeding literal from device " << device_ordinal
+          << "; shape: " << shape;
+  return client_->TransferFromOutfeed(&shape, device_ordinal);
 }
 
 /* static */
 StatusOr<LocalShapedBuffer*> LocalShapedBuffer::FromLiteral(
     const Literal& argument, const absl::optional<Shape>& shape_with_layout,
-    int replica_number) {
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
-  TF_ASSIGN_OR_RETURN(int device_ordinal,
-                      client->ReplicaNumberToDeviceOrdinal(replica_number));
-  VLOG(1) << "Creating shaped buffer from literal on replica/ordinal: "
-          << replica_number << "/" << device_ordinal;
+    const LocalClient& client, int device_ordinal) {
+  VLOG(1) << "Creating shaped buffer from literal on device ordinal: "
+          << device_ordinal;
+  auto literal_to_buffer = [&](const Literal& arg) {
+    return client.client()->LiteralToShapedBuffer(
+        arg, device_ordinal, client.client()->backend().memory_allocator());
+  };
+
   StatusOr<ScopedShapedBuffer> buf = [&] {
     if (shape_with_layout) {
       Literal relaid = argument.Relayout(shape_with_layout.value());
-      return ToBuffer(client, device_ordinal, relaid);
+      return literal_to_buffer(relaid);
     }
-    return ToBuffer(client, device_ordinal, argument);
+    return literal_to_buffer(argument);
   }();
   TF_RETURN_IF_ERROR(buf.status());
-  return new LocalShapedBuffer(std::move(buf).ValueOrDie());
+  return new LocalShapedBuffer(std::move(buf).ValueOrDie(), client.client());
 }
 
-LocalShapedBuffer::LocalShapedBuffer(ScopedShapedBuffer shaped_buffer)
-    : shaped_buffer_(std::move(shaped_buffer)) {}
+LocalShapedBuffer::LocalShapedBuffer(ScopedShapedBuffer shaped_buffer,
+                                     xla::LocalClient* client)
+    : shaped_buffer_(std::move(shaped_buffer)), client_(client) {}
 
 const ScopedShapedBuffer* LocalShapedBuffer::shaped_buffer() const {
   return &shaped_buffer_;
@@ -199,8 +130,7 @@ const Shape& LocalShapedBuffer::shape() const {
 }
 
 StatusOr<Literal> LocalShapedBuffer::ToLiteral() const {
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
-  return client->ShapedBufferToLiteral(*shaped_buffer());
+  return client_->ShapedBufferToLiteral(*shaped_buffer());
 }
 
 LocalShapedBufferTuple::LocalShapedBufferTuple(
@@ -231,120 +161,77 @@ StatusOr<LocalShapedBuffer*> LocalShapedBufferTuple::Release(int i) {
 
 int64 LocalShapedBufferTuple::size() const { return elements_.size(); }
 
-XrtAllocation::XrtAllocation(int64 handle, Shape shape,
-                             const string& session_target)
-    : handle_(handle), shape_(shape), session_target_(session_target) {}
-
-XrtAllocation::~XrtAllocation() {
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto allocation_handle =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto release =
-      tensorflow::ops::XRTReleaseAllocationHandle(root, allocation_handle);
-  if (!root.status().ok()) {
-    LOG(ERROR) << root.status();
-    return;
-  }
+StatusOr<LocalShapedBufferTuple*> LocalShapedBuffer::DestructureTuple() {
+  const Shape tuple_shape = shape();
 
-  tensorflow::ClientSession session(root, session_target_);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({allocation_handle, handle()});
-  std::vector<tensorflow::Tensor> outputs;
-  auto status = session.Run(inputs, {}, {release}, &outputs);
-  if (!status.ok()) {
-    LOG(ERROR) << status;
-    return;
+  if (!tuple_shape.IsTuple()) {
+    return InvalidArgument(
+        "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
+        "shape; shape: %s",
+        ShapeUtil::HumanString(tuple_shape));
   }
-}
-
-/* static */
-StatusOr<XrtAllocation*> XrtAllocation::FromLiteral(
-    const Literal& argument, const string& session_target) {
-  xrt::XLAAllocation alloc;
-  *alloc.mutable_value() = argument.ToProto();
-
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto literal_string =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
-  auto literal_handle = tensorflow::ops::XRTAllocate(root, literal_string);
-  TF_RETURN_IF_ERROR(root.status());
 
-  tensorflow::ClientSession session(root, session_target);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({literal_string, alloc.SerializeAsString()});
-  std::vector<tensorflow::Tensor> outputs;
-  TF_RETURN_IF_ERROR(session.Run(inputs, {literal_handle}, &outputs));
+  DeviceMemoryAllocator* allocator = shaped_buffer()->memory_allocator();
+  ShapedBuffer tuple_buffer = Release();
 
-  int64 handle = outputs[0].scalar<int64>()();
-  return new XrtAllocation(handle, argument.shape(), session_target);
-}
-
-const int64 XrtAllocation::handle() const { return handle_; }
-
-const Shape& XrtAllocation::shape() const { return shape_; }
-
-StatusOr<Literal> XrtAllocation::ToLiteral() const {
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto allocation_handle =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto read_literal = tensorflow::ops::XRTReadLiteral(root, allocation_handle);
-  TF_RETURN_IF_ERROR(root.status());
+  // Extract some metadata we use to construct scoped buffers.
+  const se::Platform* platform = tuple_buffer.platform();
+  int device_ordinal = tuple_buffer.device_ordinal();
 
-  tensorflow::ClientSession session(root, session_target_);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({allocation_handle, handle()});
-  std::vector<tensorflow::Tensor> outputs;
-  TF_RETURN_IF_ERROR(session.Run(inputs, {read_literal}, &outputs));
+  ShapeTree<se::DeviceMemoryBase>& shape_tree = tuple_buffer.buffers();
+  std::vector<LocalShapedBuffer*> results;
+  for (int64 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
+    // Create a shaped buffer for this destructured tuple element.
+    const Shape& subshape = ShapeUtil::GetSubshape(tuple_shape, {i});
+    VLOG(3) << "Starting tuple element " << i << " subshape: " << subshape;
+    ShapedBuffer shaped_buffer(subshape, subshape, platform, device_ordinal);
 
-  xla::LiteralProto response;
-  TF_RET_CHECK(response.ParseFromString(outputs[0].scalar<string>()()));
-  return Literal::CreateFromProto(response);
-}
+    ShapeUtil::ForEachSubshape(
+        subshape, [&](const Shape& s, const ShapeIndex& index) {
+          ShapeIndex original(index);
+          original.push_front(i);
+          se::DeviceMemoryBase* device_memory =
+              shape_tree.mutable_element(original);
+          shaped_buffer.set_buffer(*device_memory, index);
+          *device_memory = se::DeviceMemoryBase();
+        });
 
-XrtAllocationTuple::XrtAllocationTuple(std::vector<XrtAllocation*> elements)
-    : elements_(std::move(elements)) {
-  for (auto* element : elements_) {
-    CHECK(element != nullptr);
+    VLOG(3) << "Completed tuple element: " << i;
+    results.push_back(new LocalShapedBuffer(
+        ScopedShapedBuffer(std::move(shaped_buffer), allocator), client_));
   }
+  // Deallocate the root buffer.
+  se::DeviceMemoryBase root_buffer = tuple_buffer.root_buffer();
+  TF_RETURN_IF_ERROR(allocator->Deallocate(device_ordinal, root_buffer));
+  return new LocalShapedBufferTuple(std::move(results));
 }
 
-XrtAllocationTuple::~XrtAllocationTuple() {
-  for (XrtAllocation* element : elements_) {
-    if (element != nullptr) {
-      delete element;
-    }
-  }
-}
+LocalExecutable::LocalExecutable(
+    std::unique_ptr<xla::LocalExecutable> executable,
+    xla::DeviceAssignment device_assignment, xla::LocalClient* client)
+    : executable_(std::move(executable)),
+      device_assignment_(std::move(device_assignment)),
+      client_(client) {}
 
-StatusOr<XrtAllocation*> XrtAllocationTuple::Release(int i) {
-  XrtAllocation* element = elements_[i];
-  if (element == nullptr) {
-    return InvalidArgument("Attempted to release already-released element %d.",
-                           i);
+std::vector<int> LocalExecutable::DeviceOrdinals() const {
+  int num_replicas = device_assignment_.replica_count();
+  std::vector<int> device_ordinals;
+  device_ordinals.reserve(num_replicas);
+  for (int i = 0; i < num_replicas; ++i) {
+    device_ordinals.push_back(device_assignment_(i, 0));
   }
-  elements_[i] = nullptr;
-  return element;
+  return device_ordinals;
 }
 
-int64 XrtAllocationTuple::size() const { return elements_.size(); }
-
-CompiledLocalComputation::CompiledLocalComputation(
-    std::unique_ptr<LocalExecutable> executable)
-    : executable_(std::move(executable)) {}
-
-StatusOr<LocalShapedBuffer*> CompiledLocalComputation::Execute(
+StatusOr<LocalShapedBuffer*> LocalExecutable::Execute(
     absl::Span<LocalShapedBuffer* const> argument_handles) {
   if (num_replicas() != 1) {
     return InvalidArgument(
         "Attempted to execute computation with %d replicas using Execute()",
         num_replicas());
   }
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
-  TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment,
-                      client->backend().computation_placer()->AssignDevices(
-                          1, /*computation_count=*/1));
   StatusOr<ScopedShapedBuffer> result_buffer_status;
-  const int device_ordinal = device_assignment(0, 0);
+  const int device_ordinal = device_assignment_(0, 0);
   VLOG(3) << "Replica 0 mapped to device ordinal for execution: "
           << device_ordinal;
 
@@ -356,10 +243,10 @@ StatusOr<LocalShapedBuffer*> CompiledLocalComputation::Execute(
 
   ExecutableRunOptions options;
   options.set_device_ordinal(device_ordinal);
-  options.set_allocator(client->backend().memory_allocator());
+  options.set_allocator(client_->backend().memory_allocator());
   options.set_intra_op_thread_pool(
-      client->backend().eigen_intra_op_thread_pool_device());
-  options.set_device_assignment(&device_assignment);
+      client_->backend().eigen_intra_op_thread_pool_device());
+  options.set_device_assignment(&device_assignment_);
 
   result_buffer_status = executable_->Run(argument_buffers, options);
 
@@ -369,13 +256,13 @@ StatusOr<LocalShapedBuffer*> CompiledLocalComputation::Execute(
         "%s.",
         result_buffer_status.status().ToString());
   }
-  return new LocalShapedBuffer(std::move(result_buffer_status).ValueOrDie());
+  return new LocalShapedBuffer(std::move(result_buffer_status).ValueOrDie(),
+                               client_);
 }
 
-StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
+StatusOr<LocalShapedBufferTuple*> LocalExecutable::ExecutePerReplica(
     absl::Span<const std::vector<LocalShapedBuffer*>> argument_handles) {
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
-  const int num_devices = client->device_count();
+  const int num_devices = client_->device_count();
 
   if (argument_handles.size() != num_replicas()) {
     return InvalidArgument(
@@ -390,14 +277,9 @@ StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
 
   VLOG(1) << "Executing with " << num_replicas() << " replicas.";
 
-  TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment,
-                      client->backend().computation_placer()->AssignDevices(
-                          num_replicas(), /*computation_count=*/1));
-
   std::vector<StatusOr<ScopedShapedBuffer>> results(num_replicas());
-  auto execute = [this, client, &device_assignment, &argument_handles,
-                  &results](int replica) {
-    const int device_ordinal = device_assignment(replica, 0);
+  auto execute = [this, &argument_handles, &results](int replica) {
+    const int device_ordinal = device_assignment_(replica, 0);
     VLOG(3) << "Replica " << replica
             << " mapped to device ordinal for execution: " << device_ordinal;
 
@@ -409,10 +291,10 @@ StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
 
     ExecutableRunOptions options;
     options.set_device_ordinal(device_ordinal);
-    options.set_allocator(client->backend().memory_allocator());
+    options.set_allocator(client_->backend().memory_allocator());
     options.set_intra_op_thread_pool(
-        client->backend().eigen_intra_op_thread_pool_device());
-    options.set_device_assignment(&device_assignment);
+        client_->backend().eigen_intra_op_thread_pool_device());
+    options.set_device_assignment(&device_assignment_);
     StatusOr<ScopedShapedBuffer> result_buffer_status =
         executable_->Run(argument_buffers, options);
 
@@ -444,151 +326,43 @@ StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
           replica, statusor.status().ToString());
     }
     wrapped_results[replica] =
-        new LocalShapedBuffer(std::move(statusor).ValueOrDie());
+        new LocalShapedBuffer(std::move(statusor).ValueOrDie(), client_);
   }
 
   return new LocalShapedBufferTuple(std::move(wrapped_results));
 }
 
-static StatusOr<Shape> GetReturnValueShape(const XlaComputation& computation) {
-  TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
-                      computation.GetProgramShape());
-  return std::move(*program_shape.mutable_result());
-}
-
-CompiledXrtComputation::CompiledXrtComputation(
-    const ProgramShape& program_shape, int64 handle,
-    const string& session_target)
-    : program_shape_(program_shape),
-      handle_(handle),
-      session_target_(session_target) {}
-
-CompiledXrtComputation::~CompiledXrtComputation() {
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto computation_handle =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto release =
-      tensorflow::ops::XRTReleaseCompilationHandle(root, computation_handle);
-  if (!root.status().ok()) {
-    LOG(ERROR) << root.status();
-    return;
-  }
-
-  tensorflow::ClientSession session(root, session_target_);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({computation_handle, handle()});
-  std::vector<tensorflow::Tensor> outputs;
-  auto status = session.Run(inputs, {}, {release}, &outputs);
-  if (!status.ok()) {
-    LOG(ERROR) << status;
-    return;
-  }
-}
-
-StatusOr<XrtAllocation*> CompiledXrtComputation::Execute(
-    absl::Span<XrtAllocation* const> argument_handles) {
-  const int num_expected_arguments = program_shape().parameters().size();
-
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  std::vector<tensorflow::Output> arguments;
-  arguments.reserve(num_expected_arguments);
-  for (int i = 0; i < num_expected_arguments; ++i) {
-    arguments.push_back(
-        tensorflow::ops::Placeholder(root, tensorflow::DT_INT64));
-  }
-  auto computation_handle =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto execution_config =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
-  auto execute = tensorflow::ops::XRTExecute(root, computation_handle,
-                                             execution_config, arguments);
-  TF_RETURN_IF_ERROR(root.status());
-
-  TF_RET_CHECK(argument_handles.size() == arguments.size());
-
-  xrt::XRTExecutionConfig e;
-  e.set_release_input_handles(false);
-  e.set_release_compilation_handle(false);
-
-  tensorflow::ClientSession session(root, session_target_);
-  tensorflow::ClientSession::FeedType inputs;
-  for (int i = 0; i < arguments.size(); ++i) {
-    inputs.insert({arguments[i], argument_handles[i]->handle()});
-  }
-  inputs.insert({computation_handle, handle()});
-  inputs.insert({execution_config, e.SerializeAsString()});
-  std::vector<tensorflow::Tensor> outputs;
-  TF_RETURN_IF_ERROR(session.Run(inputs, {execute}, &outputs));
-
-  int64 output = outputs[0].scalar<int64>()();
-  return new XrtAllocation(output, program_shape().result(), session_target_);
-}
-
-const ProgramShape& CompiledXrtComputation::program_shape() const {
-  return program_shape_;
-}
-
-int64 CompiledXrtComputation::handle() const { return handle_; }
-
-LocalComputation::LocalComputation(XlaComputation computation)
+Computation::Computation(XlaComputation computation)
     : computation_(std::move(computation)) {}
 
-StatusOr<CompiledLocalComputation*> LocalComputation::Compile(
+StatusOr<LocalExecutable*> Computation::Compile(
     const std::vector<Shape>& argument_shapes,
-    const ExecutableBuildOptions* build_options) {
+    const ExecutableBuildOptions* build_options, const LocalClient& client) {
   std::vector<const Shape*> argument_shape_pointers;
   argument_shape_pointers.reserve(argument_shapes.size());
   for (auto& argument_shape : argument_shapes) {
     argument_shape_pointers.push_back(&argument_shape);
   }
 
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
   ExecutableBuildOptions options;
   if (build_options != nullptr) {
     options = *build_options;
   }
   TF_ASSIGN_OR_RETURN(
       auto local_executable,
-      client->Compile(computation_, argument_shape_pointers, options));
-  return new CompiledLocalComputation(std::move(local_executable));
-}
-
-StatusOr<CompiledXrtComputation*> LocalComputation::CompileForXrt(
-    const std::vector<Shape>& argument_shapes, const string& session_target) {
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto program = tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
-  auto compile = tensorflow::ops::XRTCompile(root, program);
-  TF_RETURN_IF_ERROR(root.status());
-
-  xrt::XLAComputation c;
-  auto config = c.mutable_config();
-  ProgramShape shapes;
-  for (auto& shape : argument_shapes) {
-    *shapes.add_parameters() = shape;
-  }
-  TF_ASSIGN_OR_RETURN(*shapes.mutable_result(), GetReturnValueShape());
-  LayoutUtil::SetToDefaultLayout(&shapes);
-  *config->mutable_program_shape() = shapes.ToProto();
-  auto snapshot = computation().Snapshot().ValueOrDie();
-  *c.mutable_hlo_snapshot() = *snapshot;
-
-  tensorflow::ClientSession session(root, session_target);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({program, c.SerializeAsString()});
-  std::vector<tensorflow::Tensor> outputs;
-  TF_RETURN_IF_ERROR(session.Run(inputs, {compile.handle}, &outputs));
+      client.client()->Compile(computation_, argument_shape_pointers, options));
+  TF_ASSIGN_OR_RETURN(
+      DeviceAssignment device_assignment,
+      client.client()->backend().computation_placer()->AssignDevices(
+          options.num_replicas(), /*computation_count=*/1));
 
-  TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
-                      computation().GetProgramShape());
-  int64 handle = outputs[0].scalar<int64>()();
-  return new CompiledXrtComputation(program_shape, handle, session_target);
+  return new LocalExecutable(std::move(local_executable),
+                             std::move(device_assignment), client.client());
 }
 
-const XlaComputation& LocalComputation::computation() const {
-  return computation_;
-}
+const XlaComputation& Computation::computation() const { return computation_; }
 
-string LocalComputation::GetSerializedProto() const {
+string Computation::GetSerializedProto() const {
   string result;
   if (!computation_.proto().SerializeToString(&result)) {
     LOG(ERROR) << "Failed to serialize the HloModuleProto.";
@@ -597,101 +371,129 @@ string LocalComputation::GetSerializedProto() const {
   return result;
 }
 
-StatusOr<Shape> LocalComputation::GetReturnValueShape() const {
-  return swig::GetReturnValueShape(computation_);
+StatusOr<string> Computation::GetHloText() const {
+  TF_ASSIGN_OR_RETURN(const HloModuleConfig module_config,
+                      HloModule::CreateModuleConfigFromProto(
+                          computation_.proto(), GetDebugOptionsFromFlags()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModule> hlo_module,
+      HloModule::CreateFromProto(computation_.proto(), module_config));
+  HloPrintOptions options;
+  options = HloPrintOptions::ShortParsable();
+  options.set_print_large_constants(false);
+  return hlo_module->ToString(options);
+}
+
+StatusOr<string> Computation::GetHloDotGraph() const {
+  TF_ASSIGN_OR_RETURN(const HloModuleConfig module_config,
+                      HloModule::CreateModuleConfigFromProto(
+                          computation_.proto(), GetDebugOptionsFromFlags()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModule> hlo_module,
+      HloModule::CreateFromProto(computation_.proto(), module_config));
+  hlo_graph_dumper::DotGraphOptions options;
+  options.debug_options = &hlo_module->config().debug_options();
+  return hlo_graph_dumper::HloComputationToDotGraph(
+      *hlo_module->entry_computation(), options);
+}
+
+StatusOr<ProgramShape> Computation::GetProgramShape() const {
+  return computation_.GetProgramShape();
+}
+
+StatusOr<Shape> Computation::GetReturnValueShape() const {
+  TF_ASSIGN_OR_RETURN(ProgramShape shape, computation_.GetProgramShape());
+  return std::move(*shape.mutable_result());
 }
 
 LocalOp::LocalOp(const XlaOp& op) : op_(op) {}
 
 const XlaOp& LocalOp::op() const { return op_; }
 
-LocalComputationBuilder::LocalComputationBuilder(const string& computation_name)
+ComputationBuilder::ComputationBuilder(const string& computation_name)
     : builder_(computation_name) {}
 
-void LocalComputationBuilder::SetOpMetadata(const OpMetadata& metadata) {
+void ComputationBuilder::SetOpMetadata(const OpMetadata& metadata) {
   builder_.SetOpMetadata(metadata);
 }
 
-void LocalComputationBuilder::ClearOpMetadata() { builder_.ClearOpMetadata(); }
+void ComputationBuilder::ClearOpMetadata() { builder_.ClearOpMetadata(); }
 
-StatusOr<LocalComputation*> LocalComputationBuilder::Build() {
+StatusOr<Computation*> ComputationBuilder::Build() {
   TF_ASSIGN_OR_RETURN(XlaComputation computation, builder_.Build());
-  return new LocalComputation(std::move(computation));
+  return new Computation(std::move(computation));
 }
 
-LocalOp LocalComputationBuilder::Parameter(int64 parameter_number,
-                                           const Shape& shape,
-                                           const string& name) {
+LocalOp ComputationBuilder::Parameter(int64 parameter_number,
+                                      const Shape& shape, const string& name) {
   return xla::Parameter(&builder_, parameter_number, shape, name);
 }
 
-StatusOr<LocalComputation*> LocalComputationBuilder::BuildWithRoot(
-    const LocalOp& root) {
+StatusOr<Computation*> ComputationBuilder::BuildWithRoot(const LocalOp& root) {
   TF_ASSIGN_OR_RETURN(XlaComputation computation, builder_.Build(root.op()));
-  return new LocalComputation(std::move(computation));
+  return new Computation(std::move(computation));
 }
 
-StatusOr<Shape> LocalComputationBuilder::GetShape(const LocalOp& operand) {
+StatusOr<Shape> ComputationBuilder::GetShape(const LocalOp& operand) {
   return builder_.GetShape(operand.op());
 }
 
-StatusOr<Shape> LocalComputationBuilder::GetReturnValueShape() {
+StatusOr<Shape> ComputationBuilder::GetReturnValueShape() {
   TF_ASSIGN_OR_RETURN(ProgramShape program_shape, builder_.GetProgramShape());
   return program_shape.result();
 }
 
-LocalOp LocalComputationBuilder::Infeed(const Shape& shape) {
+LocalOp ComputationBuilder::Infeed(const Shape& shape) {
   return xla::Infeed(&builder_, shape);
 }
 
-void LocalComputationBuilder::Outfeed(const LocalOp& operand,
-                                      const Shape& shape,
-                                      const string& outfeed_config) {
+void ComputationBuilder::Outfeed(const LocalOp& operand, const Shape& shape,
+                                 const string& outfeed_config) {
   xla::Outfeed(operand.op(), shape, outfeed_config);
 }
 
-LocalOp LocalComputationBuilder::ConstantLiteral(const Literal& literal) {
+LocalOp ComputationBuilder::ConstantLiteral(const Literal& literal) {
   return xla::ConstantLiteral(&builder_, literal);
 }
 
-LocalOp LocalComputationBuilder::Iota(PrimitiveType element_type, int64 size) {
+LocalOp ComputationBuilder::Iota(PrimitiveType element_type, int64 size) {
   return xla::Iota(&builder_, element_type, size);
 }
 
-LocalOp LocalComputationBuilder::BroadcastedIota(const Shape& shape,
-                                                 int64 dimension) {
+LocalOp ComputationBuilder::BroadcastedIota(const Shape& shape,
+                                            int64 dimension) {
   return xla::Iota(&builder_, shape, dimension);
 }
 
-LocalOp LocalComputationBuilder::Broadcast(
-    const LocalOp& operand, absl::Span<const int64> broadcast_sizes) {
+LocalOp ComputationBuilder::Broadcast(const LocalOp& operand,
+                                      absl::Span<const int64> broadcast_sizes) {
   return xla::Broadcast(operand.op(), broadcast_sizes);
 }
 
-LocalOp LocalComputationBuilder::BroadcastInDim(
+LocalOp ComputationBuilder::BroadcastInDim(
     const LocalOp& operand, absl::Span<const int64> out_dim_sizes,
     absl::Span<const int64> broadcast_dimensions) {
   return xla::BroadcastInDim(operand.op(), out_dim_sizes, broadcast_dimensions);
 }
 
-LocalOp LocalComputationBuilder::Pad(const LocalOp& operand,
-                                     const LocalOp& padding_value,
-                                     const PaddingConfig& padding_config) {
+LocalOp ComputationBuilder::Pad(const LocalOp& operand,
+                                const LocalOp& padding_value,
+                                const PaddingConfig& padding_config) {
   return xla::Pad(operand.op(), padding_value.op(), padding_config);
 }
 
-LocalOp LocalComputationBuilder::Reshape(const LocalOp& operand,
-                                         absl::Span<const int64> dimensions,
-                                         absl::Span<const int64> new_sizes) {
+LocalOp ComputationBuilder::Reshape(const LocalOp& operand,
+                                    absl::Span<const int64> dimensions,
+                                    absl::Span<const int64> new_sizes) {
   return xla::Reshape(operand.op(), dimensions, new_sizes);
 }
 
-LocalOp LocalComputationBuilder::Collapse(const LocalOp& operand,
-                                          absl::Span<const int64> dimensions) {
+LocalOp ComputationBuilder::Collapse(const LocalOp& operand,
+                                     absl::Span<const int64> dimensions) {
   return xla::Collapse(operand.op(), dimensions);
 }
 
-LocalOp LocalComputationBuilder::AllToAll(
+LocalOp ComputationBuilder::AllToAll(
     const LocalOp& operand, int64 split_dimension, int64 concat_dimension,
     int64 split_count, absl::Span<const ReplicaGroup> replica_groups) {
   std::vector<ReplicaGroup> rg(replica_groups.size());
@@ -702,39 +504,38 @@ LocalOp LocalComputationBuilder::AllToAll(
                        split_count, rg);
 }
 
-LocalOp LocalComputationBuilder::CrossReplicaSum(
+LocalOp ComputationBuilder::CrossReplicaSum(
     const LocalOp& operand, absl::Span<const ReplicaGroup> replica_groups) {
   return xla::CrossReplicaSum(operand.op(), replica_groups);
 }
 
-LocalOp LocalComputationBuilder::Slice(const LocalOp& operand,
-                                       absl::Span<const int64> start_indices,
-                                       absl::Span<const int64> limit_indices,
-                                       absl::Span<const int64> strides) {
+LocalOp ComputationBuilder::Slice(const LocalOp& operand,
+                                  absl::Span<const int64> start_indices,
+                                  absl::Span<const int64> limit_indices,
+                                  absl::Span<const int64> strides) {
   return xla::Slice(operand.op(), start_indices, limit_indices, strides);
 }
 
-LocalOp LocalComputationBuilder::SliceInDim(const LocalOp& operand,
-                                            int64 start_index,
-                                            int64 limit_index, int64 stride,
-                                            int64 dimno) {
+LocalOp ComputationBuilder::SliceInDim(const LocalOp& operand,
+                                       int64 start_index, int64 limit_index,
+                                       int64 stride, int64 dimno) {
   return xla::SliceInDim(operand.op(), start_index, limit_index, stride, dimno);
 }
 
-LocalOp LocalComputationBuilder::DynamicSlice(
-    const LocalOp& operand, const LocalOp& start_indices,
-    absl::Span<const int64> slice_sizes) {
+LocalOp ComputationBuilder::DynamicSlice(const LocalOp& operand,
+                                         const LocalOp& start_indices,
+                                         absl::Span<const int64> slice_sizes) {
   return xla::DynamicSlice(operand.op(), start_indices.op(), slice_sizes);
 }
 
-LocalOp LocalComputationBuilder::DynamicUpdateSlice(
-    const LocalOp& operand, const LocalOp& update,
-    const LocalOp& start_indices) {
+LocalOp ComputationBuilder::DynamicUpdateSlice(const LocalOp& operand,
+                                               const LocalOp& update,
+                                               const LocalOp& start_indices) {
   return xla::DynamicUpdateSlice(operand.op(), update.op(), start_indices.op());
 }
 
-LocalOp LocalComputationBuilder::ConcatInDim(absl::Span<const LocalOp> operands,
-                                             int64 dimension) {
+LocalOp ComputationBuilder::ConcatInDim(absl::Span<const LocalOp> operands,
+                                        int64 dimension) {
   std::vector<XlaOp> xla_ops;
   xla_ops.reserve(operands.size());
   for (const auto& op : operands) {
@@ -743,18 +544,18 @@ LocalOp LocalComputationBuilder::ConcatInDim(absl::Span<const LocalOp> operands,
   return xla::ConcatInDim(&builder_, xla_ops, dimension);
 }
 
-LocalOp LocalComputationBuilder::SelectAndScatterWithGeneralPadding(
-    const LocalOp& operand, const LocalComputation& select,
+LocalOp ComputationBuilder::SelectAndScatterWithGeneralPadding(
+    const LocalOp& operand, const Computation& select,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
     absl::Span<const std::pair<int64, int64>> padding, const LocalOp& source,
-    const LocalOp& init_value, const LocalComputation& scatter) {
+    const LocalOp& init_value, const Computation& scatter) {
   return xla::SelectAndScatterWithGeneralPadding(
       operand.op(), select.computation(), window_dimensions, window_strides,
       padding, source.op(), init_value.op(), scatter.computation());
 }
 
-LocalOp LocalComputationBuilder::Tuple(absl::Span<const LocalOp> elements) {
+LocalOp ComputationBuilder::Tuple(absl::Span<const LocalOp> elements) {
   std::vector<XlaOp> xla_ops;
   xla_ops.reserve(elements.size());
   for (const auto& op : elements) {
@@ -764,22 +565,22 @@ LocalOp LocalComputationBuilder::Tuple(absl::Span<const LocalOp> elements) {
   return xla::Tuple(&builder_, xla_ops);
 }
 
-LocalOp LocalComputationBuilder::GetTupleElement(const LocalOp& tuple_data,
-                                                 int64 index) {
+LocalOp ComputationBuilder::GetTupleElement(const LocalOp& tuple_data,
+                                            int64 index) {
   return xla::GetTupleElement(tuple_data.op(), index);
 }
 
-LocalOp LocalComputationBuilder::Dot(const LocalOp& lhs, const LocalOp& rhs) {
+LocalOp ComputationBuilder::Dot(const LocalOp& lhs, const LocalOp& rhs) {
   return xla::Dot(lhs.op(), rhs.op());
 }
 
-LocalOp LocalComputationBuilder::DotGeneral(
+LocalOp ComputationBuilder::DotGeneral(
     const LocalOp& lhs, const LocalOp& rhs,
     const DotDimensionNumbers& dimension_numbers) {
   return xla::DotGeneral(lhs.op(), rhs.op(), dimension_numbers);
 }
 
-LocalOp LocalComputationBuilder::ConvGeneralDilated(
+LocalOp ComputationBuilder::ConvGeneralDilated(
     const LocalOp& lhs, const LocalOp& rhs,
     absl::Span<const int64> window_strides,
     absl::Span<const std::pair<int64, int64>> padding,
@@ -791,18 +592,18 @@ LocalOp LocalComputationBuilder::ConvGeneralDilated(
                                  feature_group_count);
 }
 
-LocalOp LocalComputationBuilder::ConvertElementType(
-    const LocalOp& operand, PrimitiveType new_element_type) {
+LocalOp ComputationBuilder::ConvertElementType(const LocalOp& operand,
+                                               PrimitiveType new_element_type) {
   return xla::ConvertElementType(operand.op(), new_element_type);
 }
 
-LocalOp LocalComputationBuilder::BitcastConvertType(
-    const LocalOp& operand, PrimitiveType new_element_type) {
+LocalOp ComputationBuilder::BitcastConvertType(const LocalOp& operand,
+                                               PrimitiveType new_element_type) {
   return xla::BitcastConvertType(operand.op(), new_element_type);
 }
 
-LocalOp LocalComputationBuilder::Call(const LocalComputation& local_computation,
-                                      absl::Span<const LocalOp> operands) {
+LocalOp ComputationBuilder::Call(const Computation& local_computation,
+                                 absl::Span<const LocalOp> operands) {
   std::vector<XlaOp> xla_ops;
   xla_ops.reserve(operands.size());
   for (const auto& op : operands) {
@@ -811,7 +612,7 @@ LocalOp LocalComputationBuilder::Call(const LocalComputation& local_computation,
   return xla::Call(&builder_, local_computation.computation(), xla_ops);
 }
 
-LocalOp LocalComputationBuilder::CustomCall(
+LocalOp ComputationBuilder::CustomCall(
     const string& call_target_name, absl::Span<const LocalOp> operands,
     const Shape& shape_with_layout,
     const std::vector<Shape>& operand_shapes_with_layout,
@@ -826,19 +627,19 @@ LocalOp LocalComputationBuilder::CustomCall(
                                    operand_shapes_with_layout, opaque);
 }
 
-LocalOp LocalComputationBuilder::Transpose(
-    const LocalOp& operand, absl::Span<const int64> permutation) {
+LocalOp ComputationBuilder::Transpose(const LocalOp& operand,
+                                      absl::Span<const int64> permutation) {
   return xla::Transpose(operand.op(), permutation);
 }
 
-LocalOp LocalComputationBuilder::Rev(const LocalOp& operand,
-                                     absl::Span<const int64> dimensions) {
+LocalOp ComputationBuilder::Rev(const LocalOp& operand,
+                                absl::Span<const int64> dimensions) {
   return xla::Rev(operand.op(), dimensions);
 }
 
-LocalOp LocalComputationBuilder::Map(absl::Span<const LocalOp> operands,
-                                     const LocalComputation& local_computation,
-                                     absl::Span<const int64> dimensions) {
+LocalOp ComputationBuilder::Map(absl::Span<const LocalOp> operands,
+                                const Computation& local_computation,
+                                absl::Span<const int64> dimensions) {
   std::vector<XlaOp> xla_ops;
   xla_ops.reserve(operands.size());
   for (const auto& op : operands) {
@@ -849,17 +650,17 @@ LocalOp LocalComputationBuilder::Map(absl::Span<const LocalOp> operands,
                   dimensions);
 }
 
-LocalOp LocalComputationBuilder::Reduce(
+LocalOp ComputationBuilder::Reduce(
     const LocalOp& operand, const LocalOp& init_value,
-    const LocalComputation& local_computation,
+    const Computation& local_computation,
     absl::Span<const int64> dimensions_to_reduce) {
   return xla::Reduce(operand.op(), init_value.op(),
                      local_computation.computation(), dimensions_to_reduce);
 }
 
-LocalOp LocalComputationBuilder::ReduceWindowWithGeneralPadding(
+LocalOp ComputationBuilder::ReduceWindowWithGeneralPadding(
     const LocalOp& operand, const LocalOp& init_value,
-    const LocalComputation& local_computation,
+    const Computation& local_computation,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
     absl::Span<const int64> base_dilations,
@@ -871,51 +672,50 @@ LocalOp LocalComputationBuilder::ReduceWindowWithGeneralPadding(
       padding);
 }
 
-LocalOp LocalComputationBuilder::RngNormal(const LocalOp& mu,
-                                           const LocalOp& sigma,
-                                           const Shape& shape) {
+LocalOp ComputationBuilder::RngNormal(const LocalOp& mu, const LocalOp& sigma,
+                                      const Shape& shape) {
   return xla::RngNormal(mu.op(), sigma.op(), shape);
 }
 
-LocalOp LocalComputationBuilder::RngUniform(const LocalOp& a, const LocalOp& b,
-                                            const Shape& shape) {
+LocalOp ComputationBuilder::RngUniform(const LocalOp& a, const LocalOp& b,
+                                       const Shape& shape) {
   return xla::RngUniform(a.op(), b.op(), shape);
 }
 
-LocalOp LocalComputationBuilder::While(const LocalComputation& condition,
-                                       const LocalComputation& body,
-                                       const LocalOp& init) {
+LocalOp ComputationBuilder::While(const Computation& condition,
+                                  const Computation& body,
+                                  const LocalOp& init) {
   return xla::While(condition.computation(), body.computation(), init.op());
 }
 
-LocalOp LocalComputationBuilder::Conditional(
-    const LocalOp& predicate, const LocalOp& true_operand,
-    const LocalComputation& true_computation, const LocalOp& false_operand,
-    const LocalComputation& false_computation) {
+LocalOp ComputationBuilder::Conditional(const LocalOp& predicate,
+                                        const LocalOp& true_operand,
+                                        const Computation& true_computation,
+                                        const LocalOp& false_operand,
+                                        const Computation& false_computation) {
   return xla::Conditional(predicate.op(), true_operand.op(),
                           true_computation.computation(), false_operand.op(),
                           false_computation.computation());
 }
 
-StatusOr<bool> LocalComputationBuilder::IsConstant(const LocalOp& operand) {
+StatusOr<bool> ComputationBuilder::IsConstant(const LocalOp& operand) {
   return builder_.IsConstant(operand.op());
 }
 
-LocalOp LocalComputationBuilder::Sort(const LocalOp& operand, int64 dimension) {
+LocalOp ComputationBuilder::Sort(const LocalOp& operand, int64 dimension) {
   return xla::Sort(operand.op(), {}, dimension);
 }
 
-LocalOp LocalComputationBuilder::SortKeyVal(const LocalOp& keys,
-                                            const LocalOp& values,
-                                            int64 dimension) {
+LocalOp ComputationBuilder::SortKeyVal(const LocalOp& keys,
+                                       const LocalOp& values, int64 dimension) {
   return xla::Sort(keys.op(), {values.op()}, dimension);
 }
 
-LocalOp LocalComputationBuilder::Cholesky(const LocalOp& a) {
+LocalOp ComputationBuilder::Cholesky(const LocalOp& a) {
   return xla::Cholesky(a.op());
 }
 
-LocalOp LocalComputationBuilder::QR(const LocalOp& a, bool full_matrices) {
+LocalOp ComputationBuilder::QR(const LocalOp& a, bool full_matrices) {
   XlaBuilder* builder = a.op().builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(auto qr, xla::QRDecomposition(a.op(), full_matrices));
@@ -923,16 +723,16 @@ LocalOp LocalComputationBuilder::QR(const LocalOp& a, bool full_matrices) {
   });
 }
 
-LocalOp LocalComputationBuilder::TriangularSolve(const LocalOp& a,
-                                                 const LocalOp& b,
-                                                 bool left_side, bool lower,
-                                                 bool transpose_a,
-                                                 bool conjugate_a) {
-  return xla::TriangularSolve(a.op(), b.op(), left_side, lower, transpose_a,
-                              conjugate_a);
+LocalOp ComputationBuilder::TriangularSolve(const LocalOp& a, const LocalOp& b,
+                                            bool left_side, bool lower,
+                                            bool unit_diagonal,
+                                            int transpose_a) {
+  return xla::TriangularSolve(
+      a.op(), b.op(), left_side, lower, unit_diagonal,
+      xla::TriangularSolveOptions::Transpose(transpose_a));
 }
 
-LocalOp LocalComputationBuilder::Gather(
+LocalOp ComputationBuilder::Gather(
     const LocalOp& input, const LocalOp& start_indices,
     const GatherDimensionNumbers& dimension_numbers,
     absl::Span<const int64> slice_sizes) {
@@ -940,24 +740,24 @@ LocalOp LocalComputationBuilder::Gather(
                      slice_sizes);
 }
 
-LocalOp LocalComputationBuilder::Scatter(
+LocalOp ComputationBuilder::Scatter(
     const LocalOp& input, const LocalOp& scatter_indices,
-    const LocalOp& updates, const LocalComputation& update_computation,
+    const LocalOp& updates, const Computation& update_computation,
     const ScatterDimensionNumbers& dimension_numbers) {
   return xla::Scatter(input.op(), scatter_indices.op(), updates.op(),
                       update_computation.computation(), dimension_numbers);
 }
 
-StatusOr<LocalComputation*> LocalComputationBuilder::BuildConstantSubGraph(
+StatusOr<Computation*> ComputationBuilder::BuildConstantSubGraph(
     const LocalOp& operand) {
   TF_ASSIGN_OR_RETURN(XlaComputation computation,
                       builder_.BuildConstantSubGraph(operand.op()));
-  return new LocalComputation(std::move(computation));
+  return new Computation(std::move(computation));
 }
 
-#define _FORWARD(method_name, return_sig, args_sig, args)    \
-  return_sig LocalComputationBuilder::method_name args_sig { \
-    return xla::method_name args;                            \
+#define _FORWARD(method_name, return_sig, args_sig, args) \
+  return_sig ComputationBuilder::method_name args_sig {   \
+    return xla::method_name args;                         \
   }
 
 #define _FORWARD_UNOP(method_name) \
@@ -999,6 +799,7 @@ _FORWARD_BINOP(Atan2)
 _FORWARD_BINOP(Pow)
 _FORWARD_BINOP(Complex)
 _FORWARD_UNOP(Not)
+_FORWARD_UNOP(Clz)
 _FORWARD_UNOP(Abs)
 _FORWARD_UNOP(Exp)
 _FORWARD_UNOP(Expm1)
@@ -1044,108 +845,9 @@ void DeleteLocalShapedBuffer(LocalShapedBuffer* local_shaped_buffer) {
   delete local_shaped_buffer;
 }
 
-void DeleteXrtAllocation(XrtAllocation* allocation) { delete allocation; }
-
-void DeleteCompiledLocalComputation(CompiledLocalComputation* computation) {
-  delete computation;
-}
-
-void DeleteCompiledXrtComputation(CompiledXrtComputation* computation) {
-  delete computation;
-}
-
-void DeleteLocalComputation(LocalComputation* computation) {
-  delete computation;
-}
-
-StatusOr<LocalShapedBufferTuple*> DestructureLocalShapedBufferTuple(
-    LocalShapedBuffer* local_shaped_buffer) {
-  const Shape tuple_shape = local_shaped_buffer->shape();
-
-  if (!tuple_shape.IsTuple()) {
-    return InvalidArgument(
-        "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
-        "shape; shape: %s",
-        ShapeUtil::HumanString(tuple_shape));
-  }
-
-  DeviceMemoryAllocator* allocator =
-      local_shaped_buffer->shaped_buffer()->memory_allocator();
-  ShapedBuffer tuple_buffer = local_shaped_buffer->Release();
+void DeleteLocalExecutable(LocalExecutable* computation) { delete computation; }
 
-  // Extract some metadata we use to construct scoped buffers.
-  const se::Platform* platform = tuple_buffer.platform();
-  int device_ordinal = tuple_buffer.device_ordinal();
-
-  ShapeTree<se::DeviceMemoryBase>& shape_tree = tuple_buffer.buffers();
-  std::vector<LocalShapedBuffer*> results;
-  for (int64 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
-    // Create a shaped buffer for this destructured tuple element.
-    const Shape& subshape = ShapeUtil::GetSubshape(tuple_shape, {i});
-    VLOG(3) << "Starting tuple element " << i << " subshape: " << subshape;
-    ShapedBuffer shaped_buffer(subshape, subshape, platform, device_ordinal);
-
-    ShapeUtil::ForEachSubshape(
-        subshape, [&](const Shape& s, const ShapeIndex& index) {
-          ShapeIndex original(index);
-          original.push_front(i);
-          se::DeviceMemoryBase* device_memory =
-              shape_tree.mutable_element(original);
-          shaped_buffer.set_buffer(*device_memory, index);
-          *device_memory = se::DeviceMemoryBase();
-        });
-
-    VLOG(3) << "Completed tuple element: " << i;
-    results.push_back(new LocalShapedBuffer(
-        ScopedShapedBuffer(std::move(shaped_buffer), allocator)));
-  }
-  // Deallocate the root buffer.
-  se::DeviceMemoryBase root_buffer = tuple_buffer.root_buffer();
-  TF_RETURN_IF_ERROR(allocator->Deallocate(device_ordinal, root_buffer));
-  return new LocalShapedBufferTuple(std::move(results));
-}
-
-StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
-    XrtAllocation* allocation, const string& session_target) {
-  const Shape& tuple_shape = allocation->shape();
-
-  if (!tuple_shape.IsTuple()) {
-    return InvalidArgument(
-        "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
-        "shape; shape: %s",
-        ShapeUtil::HumanString(tuple_shape));
-  }
-
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto base_handle = tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto shape_index = tensorflow::ops::Placeholder(root, tensorflow::DT_INT32);
-  auto subtuple = tensorflow::ops::XRTSubTuple(root, base_handle, shape_index);
-  TF_RETURN_IF_ERROR(root.status());
-
-  tensorflow::ClientSession session(root, session_target);
-  tensorflow::ClientSession::FeedType inputs;
-  std::vector<XrtAllocation*> results;
-  for (int32 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
-    inputs.clear();
-    inputs.insert({base_handle, allocation->handle()});
-    inputs.insert({shape_index, {i}});
-    std::vector<tensorflow::Tensor> outputs;
-    auto status = session.Run(inputs, {subtuple}, &outputs);
-    if (!status.ok()) {
-      // Clean up before returning non-ok status.
-      for (int j = 0; j < results.size(); ++j) {
-        delete results[j];
-      }
-      return status;
-    }
-    const int64 subtuple_handle = outputs[0].scalar<int64>()();
-    const Shape& subtuple_shape =
-        ShapeUtil::GetTupleElementShape(tuple_shape, i);
-    results.push_back(
-        new XrtAllocation(subtuple_handle, subtuple_shape, session_target));
-  }
-  return new XrtAllocationTuple(std::move(results));
-}
+void DeleteComputation(Computation* computation) { delete computation; }
 
 }  // namespace swig
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index e3af88f82559c32a7267a56c87d3bafda01b934d..66b1cce7fb598388af40940ea2ed52ac2f8ee8e1 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -22,9 +22,6 @@ limitations under the License.
 #include <Python.h>
 
 #include "absl/types/span.h"
-#include "tensorflow/cc/framework/ops.h"
-#include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
@@ -35,42 +32,42 @@ limitations under the License.
 namespace xla {
 namespace swig {
 
-// Initializes the number of replicas that XLA will be initialized with (when
-// first obtaining a handle to the local XLA service). If this is called after
-// the handle to the local XLA service has been established, then an error is
-// returned.
-Status InitializeReplicaCount(int replica_count);
-
-// Initializes the platform name that XLA will be initialized with (when
-// first obtaining a handle to the local XLA service). If this is called after
-// the handle to the local XLA service has been established, then an error is
-// returned.
-Status InitializePlatformName(const string& platform_name);
-
-// Returns the replica count that is currently set, regardless of whether the
-// local XLA service has been instantiated yet or not.
-int GetReplicaCount();
-
 // Registers a 'fn_capsule' as a CPU custom call target.
 // 'fn_capsule' is a void* pointer encapsulated in a PyCapsule object, with name
 // "xla._CPU_CUSTOM_CALL_TARGET".
 Status RegisterCpuCustomCallTarget(const string& name, PyObject* fn_capsule);
 
-// Wraps the local client's infeed-transfer function.
-//
-// The default device ordinal (0) is used.
-Status TransferToInfeedLocal(const Literal& literal);
+// Wrapper around an xla::LocalClient.
+class LocalClient {
+ public:
+  // Initializes a local XLA client for `platform_name`. Returns an error if no
+  /// such platform exists, or if the platform has no visible devices.
+  static StatusOr<LocalClient> Get(const string& platform_name);
+
+  // Copyable and moveable; the class is just a wrapper around a
+  // xla::LocalClient pointer for convenient SWIG wrapping.
+
+  // Returns the number of devices known to the XLA client.
+  int DeviceCount() const;
+
+  // Wraps the local client's infeed-transfer function.
+  //
+  // The default device ordinal (0) is used.
+  Status TransferToInfeed(const Literal& literal, int device_ordinal);
 
-// Transfers the given literal to the infeed of the given replica.
-//
-// The replica number is resolved to an appropriate device ordinal.
-Status TransferToInfeedLocalReplica(const Literal& literal, int replica_number);
+  // Transfers a literal of the given shape from the outfeed of the given
+  // replica.
+  StatusOr<Literal> TransferFromOutfeed(const Shape& shape, int device_ordinal);
 
-// Transfers a literal of the given shape from the outfeed of the given replica.
-//
-// The replica number is resolved to an appropriate device ordinal.
-StatusOr<Literal> TransferFromOutfeedLocalReplica(const Shape& shape,
-                                                  int replica_number);
+  xla::LocalClient* client() const { return client_; }
+
+ private:
+  LocalClient(xla::LocalClient* client);
+
+  xla::LocalClient* client_;
+};
+
+class LocalShapedBufferTuple;
 
 // Represents a reference to literals that live in a device-allocated buffer via
 // XLA. Specifically, wraps a ScopedShapedBuffer produced by transferring a
@@ -79,9 +76,9 @@ class LocalShapedBuffer {
  public:
   static StatusOr<LocalShapedBuffer*> FromLiteral(
       const Literal& argument, const absl::optional<Shape>& shape_with_layout,
-      int replica_number);
+      const LocalClient& client, int device_ordinal);
 
-  LocalShapedBuffer(ScopedShapedBuffer shaped_buffer);
+  LocalShapedBuffer(ScopedShapedBuffer shaped_buffer, xla::LocalClient* client);
   StatusOr<Literal> ToLiteral() const;
   const Shape& shape() const;
   const ScopedShapedBuffer* shaped_buffer() const;
@@ -90,8 +87,13 @@ class LocalShapedBuffer {
   // analogous to std::unique_ptr::release().
   ShapedBuffer Release();
 
+  // Destructures a tuple-valued LocalShapedBuffer into its constitutent
+  // elements in LocalShapedBufferTuple form.
+  StatusOr<LocalShapedBufferTuple*> DestructureTuple();
+
  private:
   ScopedShapedBuffer shaped_buffer_;
+  xla::LocalClient* client_;
 };
 
 // Result of a tuple destructuring operation on a LocalShapedBuffer -- this
@@ -117,73 +119,21 @@ class LocalShapedBufferTuple {
   std::vector<LocalShapedBuffer*> elements_;
 };
 
-// Destructures a tuple-valued LocalShapedBuffer into its constitutent elements
-// in LocalShapedBufferTuple form.
-StatusOr<LocalShapedBufferTuple*> DestructureLocalShapedBufferTuple(
-    LocalShapedBuffer* local_shaped_buffer);
-
-// Represents a reference to literals that live in a device-allocated buffer via
-// XRT. Specifically, wraps an int64 handle produced by running the allocation
-// graph, and an XLA shape to track the referent's shape.
-class XrtAllocation {
- public:
-  // Accepts a `session_target` argument, used in constructing the
-  // `tensorflow::ClientSession` instance in which allocation and deallocation
-  // graphs are run.
-  static StatusOr<XrtAllocation*> FromLiteral(const Literal& argument,
-                                              const string& session_target);
-
-  XrtAllocation(int64 handle, Shape shape, const string& session_target);
-  ~XrtAllocation();
-  StatusOr<Literal> ToLiteral() const;
-  const Shape& shape() const;
-  const int64 handle() const;
-
- private:
-  const int64 handle_;
-  const Shape shape_;
-  const string session_target_;
-};
-
-// Result of a tuple destructuring operation on an XrtAllocation.
-class XrtAllocationTuple {
- public:
-  // Note: any XrtAllocation elements that are not Release()'d will be
-  // deallocated in the destructor.
-  explicit XrtAllocationTuple(std::vector<XrtAllocation*> elements);
-
-  ~XrtAllocationTuple();
-
-  // Releases the ith element to the caller. Further attempts to release the ith
-  // element will return an invalid argument error.
-  StatusOr<XrtAllocation*> Release(int i);
-
-  // Returns the number of elements in the destructured tuple.
-  int64 size() const;
-
- private:
-  std::vector<XrtAllocation*> elements_;
-};
-
-// Destructures a tuple-valued XrtAllocation into its constitutent elements
-// in XrtAllocationTuple form.
-//
-// Accepts a `session_target` argument, used in constructing the
-// `tensorflow::ClientSession` instance in which the sub-tupling graph is run,
-// and passed along in constructing each constituent XrtAllocation.
-StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
-    XrtAllocation* allocation, const string& session_target);
-
 // Represents a compiled computation that can be executed given handles to
 // device-allocated literals. Specifically, wraps an XLA LocalExecutable.
-class CompiledLocalComputation {
+class LocalExecutable {
  public:
-  CompiledLocalComputation(std::unique_ptr<LocalExecutable> executable);
+  LocalExecutable(std::unique_ptr<xla::LocalExecutable> executable,
+                  xla::DeviceAssignment device_assignment,
+                  xla::LocalClient* client);
 
   int num_replicas() const {
     return executable_->build_options().num_replicas();
   }
 
+  // Returns the device ordinals to which each replica is assigned.
+  std::vector<int> DeviceOrdinals() const;
+
   StatusOr<LocalShapedBuffer*> Execute(
       absl::Span<LocalShapedBuffer* const> argument_handles);
 
@@ -194,47 +144,22 @@ class CompiledLocalComputation {
       absl::Span<const std::vector<LocalShapedBuffer*> > argument_handles);
 
  private:
-  std::unique_ptr<LocalExecutable> executable_;
+  const std::unique_ptr<xla::LocalExecutable> executable_;
+  const xla::DeviceAssignment device_assignment_;
+  xla::LocalClient* const client_;
 };
 
-// Represents a compiled computation that can be executed given handles to
-// device-allocated literals. Specifically, wraps an XRT computation handle.
-class CompiledXrtComputation {
- public:
-  // Accepts a `session_target` argument, used in constructing the
-  // `tensorflow::ClientSession` instance in which the execution graph is run.
-  CompiledXrtComputation(const ProgramShape& program_shape, int64 handle,
-                         const string& session_target);
-  ~CompiledXrtComputation();
-
-  StatusOr<XrtAllocation*> Execute(
-      absl::Span<XrtAllocation* const> argument_handles);
-
-  const ProgramShape& program_shape() const;
-  int64 handle() const;
-
- private:
-  const ProgramShape program_shape_;
-  const int64 handle_;
-  const string session_target_;
-};
-
-// Wraps a XlaComputation produced by a LocalComputationBuilder. The
+// Wraps a XlaComputation produced by a ComputationBuilder. The
 // Compile method compiles the computation to a (local) executable via
 // the client library's local client. This class is intended to be
 // made available to Python via SWIG.
-class LocalComputation {
+class Computation {
  public:
-  LocalComputation(XlaComputation computation);
+  Computation(XlaComputation computation);
 
-  StatusOr<CompiledLocalComputation*> Compile(
+  StatusOr<LocalExecutable*> Compile(
       const std::vector<Shape>& argument_shapes,
-      const ExecutableBuildOptions* build_options);
-
-  // Accepts a `session_target` argument, used in constructing the
-  // `tensorflow::ClientSession` instance in which the compilation graph is run.
-  StatusOr<CompiledXrtComputation*> CompileForXrt(
-      const std::vector<Shape>& argument_shapes, const string& session_target);
+      const ExecutableBuildOptions* build_options, const LocalClient& client);
 
   const XlaComputation& computation() const;
 
@@ -243,6 +168,15 @@ class LocalComputation {
   // string on failure.
   string GetSerializedProto() const;
 
+  // Returns the computation in human-readable HLO text format.
+  StatusOr<string> GetHloText() const;
+
+  // Returns the computation in graphviz dot format.
+  StatusOr<string> GetHloDotGraph() const;
+
+  // Returns the program shape for this computation.
+  StatusOr<ProgramShape> GetProgramShape() const;
+
   // Returns the return-value shape for this computation.
   StatusOr<Shape> GetReturnValueShape() const;
 
@@ -250,7 +184,7 @@ class LocalComputation {
   XlaComputation computation_;
 };
 
-// Wraps a XlaOp produced by a LocalComputationBuilder. This class is intended
+// Wraps a XlaOp produced by a ComputationBuilder. This class is intended
 // to be made available to Python via SWIG.
 class LocalOp {
  public:
@@ -267,20 +201,20 @@ class LocalOp {
 //   Python.
 // - Set up the underlying builder to use the client library's
 //   LocalClient.
-// - Wrap Computations in LocalComputations for Python access.
-// - Correspondingly unwrap incoming LocalComputations.
-class LocalComputationBuilder {
+// - Wrap Computations in Computations for Python access.
+// - Correspondingly unwrap incoming Computations.
+class ComputationBuilder {
  public:
-  LocalComputationBuilder(const string& computation_name);
+  ComputationBuilder(const string& computation_name);
 
   void SetOpMetadata(const OpMetadata& metadata);
   void ClearOpMetadata();
 
-  // Returns an owned LocalComputation to the caller on success.
-  StatusOr<LocalComputation*> Build();
+  // Returns an owned Computation to the caller on success.
+  StatusOr<Computation*> Build();
 
-  // Returns an owned LocalComputation to the caller on success with given root.
-  StatusOr<LocalComputation*> BuildWithRoot(const LocalOp& root);
+  // Returns an owned Computation to the caller on success with given root.
+  StatusOr<Computation*> BuildWithRoot(const LocalOp& root);
 
   LocalOp Parameter(int64 parameter_number, const Shape& shape,
                     const string& name);
@@ -339,11 +273,11 @@ class LocalComputationBuilder {
   LocalOp ConcatInDim(absl::Span<const LocalOp> operands, int64 dimension);
 
   LocalOp SelectAndScatterWithGeneralPadding(
-      const LocalOp& operand, const LocalComputation& select,
+      const LocalOp& operand, const Computation& select,
       absl::Span<const int64> window_dimensions,
       absl::Span<const int64> window_strides,
       absl::Span<const std::pair<int64, int64> > padding, const LocalOp& source,
-      const LocalOp& init_value, const LocalComputation& scatter);
+      const LocalOp& init_value, const Computation& scatter);
 
   LocalOp Tuple(absl::Span<const LocalOp> elements);
 
@@ -369,7 +303,7 @@ class LocalComputationBuilder {
   LocalOp BitcastConvertType(const LocalOp& operand,
                              PrimitiveType new_element_type);
 
-  LocalOp Call(const LocalComputation& local_computation,
+  LocalOp Call(const Computation& local_computation,
                absl::Span<const LocalOp> operands);
 
   LocalOp CustomCall(const string& call_target_name,
@@ -384,16 +318,16 @@ class LocalComputationBuilder {
   LocalOp Rev(const LocalOp& operand, absl::Span<const int64> dimensions);
 
   LocalOp Map(absl::Span<const LocalOp> operands,
-              const LocalComputation& local_computation,
+              const Computation& local_computation,
               absl::Span<const int64> dimensions);
 
   LocalOp Reduce(const LocalOp& operand, const LocalOp& init_value,
-                 const LocalComputation& local_computation,
+                 const Computation& local_computation,
                  absl::Span<const int64> dimensions_to_reduce);
 
   LocalOp ReduceWindowWithGeneralPadding(
       const LocalOp& operand, const LocalOp& init_value,
-      const LocalComputation& local_computation,
+      const Computation& local_computation,
       absl::Span<const int64> window_dimensions,
       absl::Span<const int64> window_strides,
       absl::Span<const int64> base_dilations,
@@ -405,13 +339,13 @@ class LocalComputationBuilder {
 
   LocalOp RngUniform(const LocalOp& a, const LocalOp& b, const Shape& shape);
 
-  LocalOp While(const LocalComputation& condition, const LocalComputation& body,
+  LocalOp While(const Computation& condition, const Computation& body,
                 const LocalOp& init);
 
   LocalOp Conditional(const LocalOp& predicate, const LocalOp& true_operand,
-                      const LocalComputation& true_computation,
+                      const Computation& true_computation,
                       const LocalOp& false_operand,
-                      const LocalComputation& false_computation);
+                      const Computation& false_computation);
 
   StatusOr<bool> IsConstant(const LocalOp& operand);
 
@@ -424,19 +358,21 @@ class LocalComputationBuilder {
 
   LocalOp Cholesky(const LocalOp& a);
 
+  // `transpose_a` is the integer value of a TriangularSolveOptions::Transpose
+  // enum. We use an integer here so we don't have to teach SWIG about the
+  // enum.
   LocalOp TriangularSolve(const LocalOp& a, const LocalOp& b, bool left_side,
-                          bool lower, bool transpose_a, bool conjugate_a);
+                          bool lower, bool unit_diagonal, int transpose_a);
 
   LocalOp Gather(const LocalOp& input, const LocalOp& start_indices,
                  const GatherDimensionNumbers& dimension_numbers,
                  absl::Span<const int64> slice_sizes);
 
   LocalOp Scatter(const LocalOp& input, const LocalOp& scatter_indices,
-                  const LocalOp& updates,
-                  const LocalComputation& update_computation,
+                  const LocalOp& updates, const Computation& update_computation,
                   const ScatterDimensionNumbers& dimension_numbers);
 
-  StatusOr<LocalComputation*> BuildConstantSubGraph(const LocalOp& operand);
+  StatusOr<Computation*> BuildConstantSubGraph(const LocalOp& operand);
 
 #define _FORWARD(method_name, return_sig, args_sig) \
   return_sig method_name args_sig;
@@ -478,6 +414,7 @@ class LocalComputationBuilder {
   _FORWARD_BINOP(Pow)
   _FORWARD_BINOP(Complex)
   _FORWARD_UNOP(Not)
+  _FORWARD_UNOP(Clz)
   _FORWARD_UNOP(Abs)
   _FORWARD_UNOP(Exp)
   _FORWARD_UNOP(Expm1)
@@ -525,10 +462,8 @@ class LocalComputationBuilder {
 
 // Functions for freeing resources from the Python side.
 void DeleteLocalShapedBuffer(LocalShapedBuffer* local_shaped_buffer);
-void DeleteXrtAllocation(XrtAllocation* allocation);
-void DeleteCompiledLocalComputation(CompiledLocalComputation* computation);
-void DeleteCompiledXrtComputation(CompiledXrtComputation* computation);
-void DeleteLocalComputation(LocalComputation* computation);
+void DeleteLocalExecutable(LocalExecutable* computation);
+void DeleteComputation(Computation* computation);
 
 }  // namespace swig
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 7b2f69d6ecf44f492f70351b38997530567b5277..7d7a860baa03e99cc254b7596fb5f9d41acbef20 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -23,11 +23,13 @@ limitations under the License.
 //    C++                                  Python
 // -------------------------------------+---------------------------------------
 //  Span<int64>                        <-  sequence of int
+//  vector<int>                        ->  sequence of int
 //  Span<LocalOp>                      <-  sequence of LocalOp
 //  Literal                            <-> (nested tuple of) numpy ndarray
 //  std::vector<Literal>               <-  sequence of (nested tuple of) ndarray
 //  Shape                               -> pair holding (dtype, dimensions)
 //                                     <-  object duck-typed as xla_client.Shape
+//  ProgramShape                       ->  pair of ([arg_shapes], ret_shape)
 //  std::vector<Shape>                 <-  sequence of xla_client.Shape objects
 //  PrimitiveType                      <-  int
 //  Span<pair<int64, in64>>            <-  sequence of int pairs
@@ -97,7 +99,7 @@ limitations under the License.
 // wrapped in a Python class (xla_client.Shape) so as not to expose
 // the raw pair externally.
 //
-// Other SWIG object wrappers (e.g. of LocalComputation) are further
+// Other SWIG object wrappers (e.g. of Computation) are further
 // wrapped by xla_client in order to set up a custom destructor that
 // triggers memory deallocation on the C++ side.
 
@@ -107,6 +109,7 @@ limitations under the License.
 %nothread;
 
 %include "tensorflow/python/platform/base.i"
+%include "tensorflow/compiler/xla/python/xla_data.i"
 
 %{
 // Must be included first
@@ -124,87 +127,6 @@ limitations under the License.
 using namespace xla;
 using namespace xla::swig;
 
-namespace xla {
-
-namespace swig {
-
-bool GetIntAttr(PyObject* o, const char* field, int64* result) {
-  PyObject* fo = PyObject_GetAttrString(o, field);
-  if (!fo) {
-    return false;
-  }
-  const int64 value = numpy::PyIntOrPyLongToLong(fo);
-  if (value == -1 && PyErr_Occurred()) {
-    Py_DECREF(fo);
-    return false;
-  }
-  Py_DECREF(fo);
-  *result = value;
-  return true;
-}
-
-// Returns "ok"; true if there is no error, false if there was an error.
-bool HandleStringAttribute(PyObject* o,
-                           const char* attr_name,
-                           std::function<void(string s)> f) {
-  if (!PyObject_HasAttrString(o, attr_name)) {
-    return true;  // It's ok for the object to not have the attribute.
-  }
-  PyObject* attr = PyObject_GetAttrString(o, attr_name);
-  if (attr == nullptr) {
-    return false;  // An error occurred getting the attribute.
-  }
-  if (attr == Py_None) {
-    Py_DECREF(attr);
-    return true;  // The attribute is None, which we consider ok.
-  }
-  if (!PyString_Check(attr)) {
-    string message = absl::StrFormat("%s must be a string or none; got %s",
-        attr_name, numpy::PyObjectCppRepr(attr));
-    PyErr_SetString(PyExc_TypeError, message.c_str());
-    Py_DECREF(attr);
-    return false;  // Type error, not ok.
-  }
-  f(PyString_AsString(attr));
-  Py_DECREF(attr);
-  return true;  // Handled string attribute, ok!
-}
-
-bool HandleRepeatedInt64Attribute(
-    PyObject* o, const char* attr_name,
-    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* field) {
-  PyObject* seq = PyObject_GetAttrString(o, attr_name);
-  if (!seq) {
-    return false;
-  }
-
-  int length = PySequence_Size(seq);
-  if (length == -1) {
-    Py_DECREF(seq);
-    return false;
-  }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(seq, i);
-    if (!item) {
-      Py_DECREF(seq);
-      return false;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(seq);
-      return false;
-    }
-    *field->Add() = dimension;
-    Py_DECREF(item);
-  }
-  Py_DECREF(seq);
-  return true;
-}
-
-}  // namespace swig
-}  // namespace xla
 %}
 
 // Required to use PyArray_* functions.
@@ -212,57 +134,6 @@ bool HandleRepeatedInt64Attribute(
 tensorflow::ImportNumpy();
 %}
 
-// Basic types
-
-%typemap(out) StatusOr<bool> {
-  if ($1.ok()) {
-    $result = PyBool_FromLong($1.ConsumeValueOrDie());
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(out) Status {
-  if (!$1.ok()) {
-    PyErr_SetString(
-        PyExc_RuntimeError, $1.ToString().c_str());
-    SWIG_fail;
-  }
-  Py_INCREF(Py_None);
-  $result = Py_None;
-}
-
-%typemap(in) absl::Span<const int64>
-    (std::vector<int64> temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  temps.resize(size);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    PyObject* py_int = numpy::PyNumberToPyInt(o);
-    if (!py_int) {
-      PyErr_SetString(
-          PyExc_TypeError,
-          "Argument sequence element cannot be converted to int");
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    temps[i] = numpy::PyIntOrPyLongToLong(py_int);
-    if (temps[i] == -1 && PyErr_Occurred()) {
-      Py_DECREF(py_int);
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    Py_DECREF(py_int);
-    Py_DECREF(o);
-  }
-  $1 = temps;
-}
-
 // Computation builder types
 
 %typemap(in) absl::Span<const xla::swig::LocalOp>(
@@ -287,12 +158,12 @@ tensorflow::ImportNumpy();
 
 // Computation and buffer/allocation types
 
-%typemap(out) StatusOr<xla::swig::CompiledLocalComputation*> {
+%typemap(out) StatusOr<xla::swig::LocalClient> {
   if ($1.ok()) {
-    auto* value = $1.ValueOrDie();
+    xla::swig::LocalClient value = $1.ValueOrDie();
     {
-      auto* $1 = value;
-      $typemap(out, xla::swig::CompiledLocalComputation*)
+      auto $1 = value;
+      $typemap(out, xla::swig::LocalClient)
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
@@ -300,12 +171,12 @@ tensorflow::ImportNumpy();
   }
 }
 
-%typemap(out) StatusOr<xla::swig::CompiledXrtComputation*> {
+%typemap(out) StatusOr<xla::swig::LocalExecutable*> {
   if ($1.ok()) {
     auto* value = $1.ValueOrDie();
     {
       auto* $1 = value;
-      $typemap(out, xla::swig::CompiledXrtComputation*)
+      $typemap(out, xla::swig::LocalExecutable*)
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
@@ -339,38 +210,12 @@ tensorflow::ImportNumpy();
   }
 }
 
-%typemap(out) StatusOr<xla::swig::XrtAllocation*> {
+%typemap(out) StatusOr<xla::swig::Computation*> {
   if ($1.ok()) {
     auto* value = $1.ValueOrDie();
     {
       auto* $1 = value;
-      $typemap(out, xla::swig::XrtAllocation*)
-    }
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(out) StatusOr<xla::swig::XrtAllocationTuple*> {
-  if ($1.ok()) {
-    auto* value = $1.ValueOrDie();
-    {
-      auto* $1 = value;
-      $typemap(out, xla::swig::XrtAllocationTuple*)
-    }
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(out) StatusOr<xla::swig::LocalComputation*> {
-  if ($1.ok()) {
-    auto* value = $1.ValueOrDie();
-    {
-      auto* $1 = value;
-      $typemap(out, xla::swig::LocalComputation*)
+      $typemap(out, xla::swig::Computation*)
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
@@ -430,473 +275,6 @@ tensorflow::ImportNumpy();
   $1 = temps;
 }
 
-%typemap(in) absl::Span<xla::swig::XrtAllocation* const>
-    (std::vector<XrtAllocation*> temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  temps.reserve(size);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    XrtAllocation* xrta;
-    if ((SWIG_ConvertPtr(o, (void**) &xrta, $descriptor(xla::swig::XrtAllocation*),
-                         SWIG_POINTER_EXCEPTION)) == -1) {
-      SWIG_fail;
-    }
-    temps.push_back(xrta);
-    Py_DECREF(o);
-  }
-  $1 = temps;
-}
-
-// Literal
-
-%typemap(out) StatusOr<Literal> {
-  if ($1.ok()) {
-    Literal value = $1.ConsumeValueOrDie();
-    $result = numpy::PyObjectFromXlaLiteral(*value);
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(in) const Literal& (StatusOr<Literal> literal_status) {
-  literal_status = numpy::XlaLiteralFromPyObject($input);
-  if (!literal_status.ok()) {
-    PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
-    SWIG_fail;
-  }
-  $1 = &literal_status.ValueOrDie();
-}
-
-%typemap(out) Literal {
-  $result = numpy::PyObjectFromXlaLiteral(*$1);
-}
-
-%typemap(out) StatusOr<Literal> {
-  if (!$1.ok()) {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-  $result = numpy::PyObjectFromXlaLiteral($1.ValueOrDie());
-}
-
-%typemap(in) const std::vector<Literal>& (std::vector<Literal> temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    StatusOr<Literal> literal_status = numpy::XlaLiteralFromPyObject(o);
-    if (!literal_status.ok()) {
-      PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    temps.push_back(literal_status.ConsumeValueOrDie());
-    Py_DECREF(o);
-  }
-  $1 = &temps;
-}
-
-// OpMetadata
-
-%typemap(in) const OpMetadata& (OpMetadata temp) {
-  StatusOr<OpMetadata> statusor = numpy::OpMetadataFromPyObject($input);
-  if (!statusor.ok()) {
-    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-    SWIG_fail;
-  }
-  temp = std::move(statusor).ValueOrDie();
-  $1 = &temp;
-}
-
-// Shape
-
-%typemap(out) const Shape& {
-  $result = numpy::PyShapeInfoFromXlaShape(*$1);
-}
-
-%typemap(out) StatusOr<Shape> {
-  if ($1.ok()) {
-    $result = numpy::PyShapeInfoFromXlaShape($1.ConsumeValueOrDie());
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(in) const Shape& (Shape temp) {
-  StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
-  if (!statusor.ok()) {
-    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-    SWIG_fail;
-  }
-  temp = std::move(statusor).ValueOrDie();
-  $1 = &temp;
-}
-
-%typemap(in) const absl::optional<Shape>& (
-    absl::optional<Shape> temp) {
-  if ($input == Py_None) {
-    temp = absl::nullopt;
-    $1 = &temp;
-  } else {
-    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
-    if (!statusor.ok()) {
-      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-      SWIG_fail;
-    }
-    temp = std::move(statusor).ValueOrDie();
-    $1 = &temp;
-  }
-}
-
-%typemap(out) std::unique_ptr<Shape> {
-  $result = numpy::PyShapeInfoFromXlaShape(*$1);
-}
-
-%typemap(in) const std::vector<Shape>& (std::vector<Shape> temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
-    Py_DECREF(o);
-    if (!statusor.ok()) {
-      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-      SWIG_fail;
-    }
-    temps.push_back(statusor.ConsumeValueOrDie());
-  }
-  $1 = &temps;
-}
-
-%typemap(in) const std::vector<absl::optional<Shape> >& (
-    std::vector<absl::optional<Shape> > temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    if (o == Py_None) {
-      temps.push_back(absl::nullopt);
-    } else {
-      StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
-      Py_DECREF(o);
-      if (!statusor.ok()) {
-        PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-        SWIG_fail;
-      }
-      temps.push_back(statusor.ConsumeValueOrDie());
-    }
-  }
-  $1 = &temps;
-}
-
-// PrimitiveType
-
-%typemap(in) PrimitiveType {
-  PyObject* py_int = numpy::PyNumberToPyInt($input);
-  if (!py_int) {
-    PyErr_SetString(PyExc_TypeError, "Argument cannot be converted to int");
-    SWIG_fail;
-  }
-  const long value = numpy::PyIntOrPyLongToLong(py_int);
-  if (value == -1 && PyErr_Occurred()) {
-    Py_DECREF(py_int);
-    SWIG_fail;
-  }
-  if (!PrimitiveType_IsValid(value)) {
-    PyErr_SetString(
-        PyExc_TypeError, "Argument not valid for PrimitiveType enum");
-    Py_DECREF(py_int);
-    SWIG_fail;
-  }
-  $1 = static_cast<PrimitiveType>(value);
-}
-
-// Span<pair<int64, in64>>
-
-%typemap(in) absl::Span<const std::pair<int64, int64> >
-    (std::vector<std::pair<int64, int64> > temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  temps.reserve(size);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    if (!o) {
-      SWIG_fail;
-    }
-    PyObject* first = PyTuple_GetItem(o, 0);
-    if (!first) {
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    PyObject* first_pyint = numpy::PyNumberToPyInt(first);
-    if (!first_pyint) {
-      PyErr_SetString(
-          PyExc_TypeError,
-          "First pair item cannot be converted to int");
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    PyObject* second = PyTuple_GetItem(o, 1);
-    if (!second) {
-      Py_DECREF(o);
-      Py_DECREF(first_pyint);
-      SWIG_fail;
-    }
-    PyObject* second_pyint = numpy::PyNumberToPyInt(second);
-    if (!second_pyint) {
-      PyErr_SetString(
-          PyExc_TypeError,
-          "Second pair item cannot be converted to int");
-      Py_DECREF(o);
-      Py_DECREF(first_pyint);
-      SWIG_fail;
-    }
-    const int64 first_value = numpy::PyIntOrPyLongToLong(first_pyint);
-    if (first_value == -1 && PyErr_Occurred()) {
-      Py_DECREF(o);
-      Py_DECREF(first_pyint);
-      Py_DECREF(second_pyint);
-      SWIG_fail;
-    }
-    const int64 second_value = numpy::PyIntOrPyLongToLong(second_pyint);
-    if (second_value == -1 && PyErr_Occurred()) {
-      Py_DECREF(o);
-      Py_DECREF(first_pyint);
-      Py_DECREF(second_pyint);
-      SWIG_fail;
-    }
-    temps.push_back(std::make_pair(first_value, second_value));
-    Py_DECREF(o);
-  }
-  $1 = temps;
-}
-
-// DotDimensionNumbers
-
-%typemap(in) const DotDimensionNumbers&
-    (DotDimensionNumbers dimension_numbers) {
-  if (!HandleRepeatedInt64Attribute(
-        $input, "lhs_contracting_dimensions",
-        dimension_numbers.mutable_lhs_contracting_dimensions())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "rhs_contracting_dimensions",
-        dimension_numbers.mutable_rhs_contracting_dimensions())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "lhs_batch_dimensions",
-        dimension_numbers.mutable_lhs_batch_dimensions())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "rhs_batch_dimensions",
-        dimension_numbers.mutable_rhs_batch_dimensions())) {
-    SWIG_fail;
-  }
-
-  $1 = &dimension_numbers;
-}
-
-// PaddingConfig
-
-%typemap(in) const PaddingConfig&
-    (PaddingConfig padding_config) {
-  PyObject* dimensions = PyObject_GetAttrString($input, "dimensions");
-  if (!dimensions) {
-    SWIG_fail;
-  }
-
-  int length = PySequence_Size(dimensions);
-  if (length == -1) {
-    Py_DECREF(dimensions);
-    SWIG_fail;
-  }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(dimensions, i);
-    if (!item) {
-      Py_DECREF(dimensions);
-      SWIG_fail;
-    }
-    int64 edge_padding_low, edge_padding_high, interior_padding;
-    if (!GetIntAttr(item, "edge_padding_low", &edge_padding_low)
-        || !GetIntAttr(item, "edge_padding_high", &edge_padding_high)
-        || !GetIntAttr(item, "interior_padding", &interior_padding)) {
-      Py_DECREF(item);
-      Py_DECREF(dimensions);
-      SWIG_fail;
-    }
-    Py_DECREF(item);
-
-    PaddingConfig::PaddingConfigDimension* dimension =
-        padding_config.add_dimensions();
-    dimension->set_edge_padding_low(edge_padding_low);
-    dimension->set_edge_padding_high(edge_padding_high);
-    dimension->set_interior_padding(interior_padding);
-  }
-  Py_DECREF(dimensions);
-
-  $1 = &padding_config;
-}
-
-// ConvolutionDimensionNumbers
-
-%typemap(in) const ConvolutionDimensionNumbers&
-    (ConvolutionDimensionNumbers dimension_numbers) {
-  int64 value;
-
-  if (!GetIntAttr($input, "input_batch_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_input_batch_dimension(value);
-
-  if (!GetIntAttr($input, "input_feature_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_input_feature_dimension(value);
-
-  if (!GetIntAttr($input, "output_batch_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_output_batch_dimension(value);
-
-  if (!GetIntAttr($input, "output_feature_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_output_feature_dimension(value);
-
-  if (!GetIntAttr($input, "kernel_output_feature_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_kernel_output_feature_dimension(value);
-
-  if (!GetIntAttr($input, "kernel_input_feature_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_kernel_input_feature_dimension(value);
-
-  if (!HandleRepeatedInt64Attribute(
-        $input, "input_spatial_dimensions",
-        dimension_numbers.mutable_input_spatial_dimensions())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "kernel_spatial_dimensions",
-        dimension_numbers.mutable_kernel_spatial_dimensions())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "output_spatial_dimensions",
-        dimension_numbers.mutable_output_spatial_dimensions())) {
-    SWIG_fail;
-  }
-
-  $1 = &dimension_numbers;
-}
-
-// GatherDimensionNumbers
-
-%typemap(in) const GatherDimensionNumbers&
-    (GatherDimensionNumbers dimension_numbers) {
-  if (!HandleRepeatedInt64Attribute(
-        $input, "offset_dims",
-        dimension_numbers.mutable_offset_dims())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "collapsed_slice_dims",
-        dimension_numbers.mutable_collapsed_slice_dims())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "start_index_map",
-        dimension_numbers.mutable_start_index_map())) {
-    SWIG_fail;
-  }
-
-  int64 value;
-  if (!GetIntAttr($input, "index_vector_dim", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_index_vector_dim(value);
-
-  $1 = &dimension_numbers;
-}
-
-// ScatterDimensionNumbers
-
-%typemap(in) const ScatterDimensionNumbers&
-    (ScatterDimensionNumbers dimension_numbers) {
-  if (!HandleRepeatedInt64Attribute(
-        $input, "update_window_dims",
-        dimension_numbers.mutable_update_window_dims())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "inserted_window_dims",
-        dimension_numbers.mutable_inserted_window_dims())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "scatter_dims_to_operand_dims",
-        dimension_numbers.mutable_scatter_dims_to_operand_dims())) {
-    SWIG_fail;
-  }
-
-  int64 value;
-  if (!GetIntAttr($input, "index_vector_dim", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_index_vector_dim(value);
-
-  $1 = &dimension_numbers;
-}
-
-// Span<const ReplicaGroup>
-
-%typemap(in) absl::Span<const ReplicaGroup >
-    (std::vector<ReplicaGroup > temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  temps.reserve(size);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    ReplicaGroup rgrp;
-    if (!HandleRepeatedInt64Attribute(
-            o, "replica_ids",
-            rgrp.mutable_replica_ids())) {
-        SWIG_fail;
-    }
-    temps.push_back(rgrp);
-    Py_DECREF(o);
-  }
-  $1 = temps;
-}
-
-
 // ExecutableBuildOptions
 
 %typemap(in) const ExecutableBuildOptions*
@@ -966,160 +344,151 @@ tensorflow::ImportNumpy();
 %ignoreall
 %unignore xla;
 %unignore xla::swig;
-%unignore xla::swig::InitializeReplicaCount;
-%unignore xla::swig::InitializePlatformName;
-%unignore xla::swig::GetReplicaCount;
 %unignore xla::swig::RegisterCpuCustomCallTarget;
-%unignore xla::swig::TransferToInfeedLocal;
-%unignore xla::swig::TransferToInfeedLocalReplica;
-%unignore xla::swig::TransferFromOutfeedLocalReplica;
+%unignore xla::swig::LocalClient;
+%unignore xla::swig::LocalClient::Get;
+%unignore xla::swig::LocalClient::DeviceCount;
+%unignore xla::swig::LocalClient::TransferToInfeed;
+%unignore xla::swig::LocalClient::TransferFromOutfeed;
 %unignore xla::swig::LocalShapedBuffer;
 %unignore xla::swig::LocalShapedBuffer::FromLiteral;
 %unignore xla::swig::LocalShapedBuffer::ToLiteral;
 %unignore xla::swig::LocalShapedBuffer::shape;
+%unignore xla::swig::LocalShapedBuffer::DestructureTuple;
 %unignore xla::swig::LocalShapedBufferTuple;
 %unignore xla::swig::LocalShapedBufferTuple::Release;
 %unignore xla::swig::LocalShapedBufferTuple::size;
-%unignore xla::swig::XrtAllocation;
-%unignore xla::swig::XrtAllocation::FromLiteral;
-%unignore xla::swig::XrtAllocation::ToLiteral;
-%unignore xla::swig::XrtAllocation::shape;
-%unignore xla::swig::XrtAllocationTuple;
-%unignore xla::swig::XrtAllocationTuple::Release;
-%unignore xla::swig::XrtAllocationTuple::size;
-%unignore xla::swig::CompiledLocalComputation;
-%unignore xla::swig::CompiledLocalComputation::Execute;
-%unignore xla::swig::CompiledLocalComputation::ExecutePerReplica;
-%unignore xla::swig::CompiledXrtComputation;
-%unignore xla::swig::CompiledXrtComputation::Execute;
-%unignore xla::swig::LocalComputation;
-%unignore xla::swig::LocalComputation::Compile;
-%unignore xla::swig::LocalComputation::CompileForXrt;
-%unignore xla::swig::LocalComputation::GetReturnValueShape;
-%unignore xla::swig::LocalComputation::GetSerializedProto;
+%unignore xla::swig::LocalExecutable;
+%unignore xla::swig::LocalExecutable::DeviceOrdinals;
+%unignore xla::swig::LocalExecutable::Execute;
+%unignore xla::swig::LocalExecutable::ExecutePerReplica;
+%unignore xla::swig::Computation;
+%unignore xla::swig::Computation::Compile;
+%unignore xla::swig::Computation::GetProgramShape;
+%unignore xla::swig::Computation::GetReturnValueShape;
+%unignore xla::swig::Computation::GetSerializedProto;
+%unignore xla::swig::Computation::GetHloText;
+%unignore xla::swig::Computation::GetHloDotGraph;
 %unignore xla::swig::LocalOp;
-%unignore xla::swig::LocalComputationBuilder;
-%unignore xla::swig::LocalComputationBuilder::LocalComputationBuilder;
-%unignore xla::swig::LocalComputationBuilder::Build;
-%unignore xla::swig::LocalComputationBuilder::BuildWithRoot;
-%unignore xla::swig::LocalComputationBuilder::SetOpMetadata;
-%unignore xla::swig::LocalComputationBuilder::ClearOpMetadata;
-%unignore xla::swig::LocalComputationBuilder::Parameter;
-%unignore xla::swig::LocalComputationBuilder::GetShape;
-%unignore xla::swig::LocalComputationBuilder::GetReturnValueShape;
-%unignore xla::swig::LocalComputationBuilder::Infeed;
-%unignore xla::swig::LocalComputationBuilder::Outfeed;
-%unignore xla::swig::LocalComputationBuilder::ConstantLiteral;
-%unignore xla::swig::LocalComputationBuilder::ConstantR0;
-%unignore xla::swig::LocalComputationBuilder::Iota;
-%unignore xla::swig::LocalComputationBuilder::BroadcastedIota;
-%unignore xla::swig::LocalComputationBuilder::Broadcast;
-%unignore xla::swig::LocalComputationBuilder::BroadcastInDim;
-%unignore xla::swig::LocalComputationBuilder::Pad;
-%unignore xla::swig::LocalComputationBuilder::Reshape;
-%unignore xla::swig::LocalComputationBuilder::Collapse;
-%unignore xla::swig::LocalComputationBuilder::AllToAll;
-%unignore xla::swig::LocalComputationBuilder::CrossReplicaSum;
-%unignore xla::swig::LocalComputationBuilder::Slice;
-%unignore xla::swig::LocalComputationBuilder::SliceInDim;
-%unignore xla::swig::LocalComputationBuilder::DynamicSlice;
-%unignore xla::swig::LocalComputationBuilder::DynamicUpdateSlice;
-%unignore xla::swig::LocalComputationBuilder::ConcatInDim;
-%unignore xla::swig::LocalComputationBuilder::SelectAndScatterWithGeneralPadding;
-%unignore xla::swig::LocalComputationBuilder::Select;
-%unignore xla::swig::LocalComputationBuilder::Tuple;
-%unignore xla::swig::LocalComputationBuilder::GetTupleElement;
-%unignore xla::swig::LocalComputationBuilder::ConvertElementType;
-%unignore xla::swig::LocalComputationBuilder::BitcastConvertType;
-%unignore xla::swig::LocalComputationBuilder::Call;
-%unignore xla::swig::LocalComputationBuilder::Transpose;
-%unignore xla::swig::LocalComputationBuilder::Rev;
-%unignore xla::swig::LocalComputationBuilder::Clamp;
-%unignore xla::swig::LocalComputationBuilder::Map;
-%unignore xla::swig::LocalComputationBuilder::Reduce;
-%unignore xla::swig::LocalComputationBuilder::ReduceWindowWithGeneralPadding;
-%unignore xla::swig::LocalComputationBuilder::RngNormal;
-%unignore xla::swig::LocalComputationBuilder::RngUniform;
-%unignore xla::swig::LocalComputationBuilder::RngBernoulli;
-%unignore xla::swig::LocalComputationBuilder::While;
-%unignore xla::swig::LocalComputationBuilder::Conditional;
-%unignore xla::swig::LocalComputationBuilder::IsConstant;
-%unignore xla::swig::LocalComputationBuilder::Eq;
-%unignore xla::swig::LocalComputationBuilder::Ne;
-%unignore xla::swig::LocalComputationBuilder::Ge;
-%unignore xla::swig::LocalComputationBuilder::Gt;
-%unignore xla::swig::LocalComputationBuilder::Lt;
-%unignore xla::swig::LocalComputationBuilder::Le;
-%unignore xla::swig::LocalComputationBuilder::Dot;
-%unignore xla::swig::LocalComputationBuilder::DotGeneral;
-%unignore xla::swig::LocalComputationBuilder::ConvGeneralDilated;
-%unignore xla::swig::LocalComputationBuilder::Add;
-%unignore xla::swig::LocalComputationBuilder::Sub;
-%unignore xla::swig::LocalComputationBuilder::Mul;
-%unignore xla::swig::LocalComputationBuilder::Div;
-%unignore xla::swig::LocalComputationBuilder::Rem;
-%unignore xla::swig::LocalComputationBuilder::Max;
-%unignore xla::swig::LocalComputationBuilder::Min;
-%unignore xla::swig::LocalComputationBuilder::And;
-%unignore xla::swig::LocalComputationBuilder::Or;
-%unignore xla::swig::LocalComputationBuilder::Xor;
-%unignore xla::swig::LocalComputationBuilder::ShiftLeft;
-%unignore xla::swig::LocalComputationBuilder::ShiftRightArithmetic;
-%unignore xla::swig::LocalComputationBuilder::ShiftRightLogical;
-%unignore xla::swig::LocalComputationBuilder::Not;
-%unignore xla::swig::LocalComputationBuilder::Abs;
-%unignore xla::swig::LocalComputationBuilder::Exp;
-%unignore xla::swig::LocalComputationBuilder::Expm1;
-%unignore xla::swig::LocalComputationBuilder::Floor;
-%unignore xla::swig::LocalComputationBuilder::Ceil;
-%unignore xla::swig::LocalComputationBuilder::Round;
-%unignore xla::swig::LocalComputationBuilder::Log;
-%unignore xla::swig::LocalComputationBuilder::Log1p;
-%unignore xla::swig::LocalComputationBuilder::Sign;
-%unignore xla::swig::LocalComputationBuilder::Cos;
-%unignore xla::swig::LocalComputationBuilder::Sin;
-%unignore xla::swig::LocalComputationBuilder::Tanh;
-%unignore xla::swig::LocalComputationBuilder::Atan2;
-%unignore xla::swig::LocalComputationBuilder::IsFinite;
-%unignore xla::swig::LocalComputationBuilder::Pow;
-%unignore xla::swig::LocalComputationBuilder::Neg;
-%unignore xla::swig::LocalComputationBuilder::Sort;
-%unignore xla::swig::LocalComputationBuilder::SortKeyVal;
-%unignore xla::swig::LocalComputationBuilder::Sqrt;
-%unignore xla::swig::LocalComputationBuilder::Rsqrt;
-%unignore xla::swig::LocalComputationBuilder::Square;
-%unignore xla::swig::LocalComputationBuilder::Reciprocal;
-%unignore xla::swig::LocalComputationBuilder::Erfc;
-%unignore xla::swig::LocalComputationBuilder::Erf;
-%unignore xla::swig::LocalComputationBuilder::ErfInv;
-%unignore xla::swig::LocalComputationBuilder::Lgamma;
-%unignore xla::swig::LocalComputationBuilder::Digamma;
-%unignore xla::swig::LocalComputationBuilder::Acos;
-%unignore xla::swig::LocalComputationBuilder::Asin;
-%unignore xla::swig::LocalComputationBuilder::Atan;
-%unignore xla::swig::LocalComputationBuilder::Tan;
-%unignore xla::swig::LocalComputationBuilder::Acosh;
-%unignore xla::swig::LocalComputationBuilder::Asinh;
-%unignore xla::swig::LocalComputationBuilder::Atanh;
-%unignore xla::swig::LocalComputationBuilder::Cosh;
-%unignore xla::swig::LocalComputationBuilder::Sinh;
-%unignore xla::swig::LocalComputationBuilder::Real;
-%unignore xla::swig::LocalComputationBuilder::Imag;
-%unignore xla::swig::LocalComputationBuilder::Conj;
-%unignore xla::swig::LocalComputationBuilder::Complex;
-%unignore xla::swig::LocalComputationBuilder::Cholesky;
-%unignore xla::swig::LocalComputationBuilder::QR;
-%unignore xla::swig::LocalComputationBuilder::TriangularSolve;
-%unignore xla::swig::LocalComputationBuilder::CustomCall;
-%unignore xla::swig::LocalComputationBuilder::Gather;
-%unignore xla::swig::LocalComputationBuilder::Scatter;
-%unignore xla::swig::DeleteLocalComputation;
-%unignore xla::swig::DestructureLocalShapedBufferTuple;
-%unignore xla::swig::DestructureXrtAllocationTuple;
+%unignore xla::swig::ComputationBuilder;
+%unignore xla::swig::ComputationBuilder::ComputationBuilder;
+%unignore xla::swig::ComputationBuilder::Build;
+%unignore xla::swig::ComputationBuilder::BuildWithRoot;
+%unignore xla::swig::ComputationBuilder::SetOpMetadata;
+%unignore xla::swig::ComputationBuilder::ClearOpMetadata;
+%unignore xla::swig::ComputationBuilder::Parameter;
+%unignore xla::swig::ComputationBuilder::GetShape;
+%unignore xla::swig::ComputationBuilder::GetReturnValueShape;
+%unignore xla::swig::ComputationBuilder::Infeed;
+%unignore xla::swig::ComputationBuilder::Outfeed;
+%unignore xla::swig::ComputationBuilder::ConstantLiteral;
+%unignore xla::swig::ComputationBuilder::ConstantR0;
+%unignore xla::swig::ComputationBuilder::Iota;
+%unignore xla::swig::ComputationBuilder::BroadcastedIota;
+%unignore xla::swig::ComputationBuilder::Broadcast;
+%unignore xla::swig::ComputationBuilder::BroadcastInDim;
+%unignore xla::swig::ComputationBuilder::Pad;
+%unignore xla::swig::ComputationBuilder::Reshape;
+%unignore xla::swig::ComputationBuilder::Collapse;
+%unignore xla::swig::ComputationBuilder::AllToAll;
+%unignore xla::swig::ComputationBuilder::CrossReplicaSum;
+%unignore xla::swig::ComputationBuilder::Slice;
+%unignore xla::swig::ComputationBuilder::SliceInDim;
+%unignore xla::swig::ComputationBuilder::DynamicSlice;
+%unignore xla::swig::ComputationBuilder::DynamicUpdateSlice;
+%unignore xla::swig::ComputationBuilder::ConcatInDim;
+%unignore xla::swig::ComputationBuilder::SelectAndScatterWithGeneralPadding;
+%unignore xla::swig::ComputationBuilder::Select;
+%unignore xla::swig::ComputationBuilder::Tuple;
+%unignore xla::swig::ComputationBuilder::GetTupleElement;
+%unignore xla::swig::ComputationBuilder::ConvertElementType;
+%unignore xla::swig::ComputationBuilder::BitcastConvertType;
+%unignore xla::swig::ComputationBuilder::Call;
+%unignore xla::swig::ComputationBuilder::Transpose;
+%unignore xla::swig::ComputationBuilder::Rev;
+%unignore xla::swig::ComputationBuilder::Clamp;
+%unignore xla::swig::ComputationBuilder::Map;
+%unignore xla::swig::ComputationBuilder::Reduce;
+%unignore xla::swig::ComputationBuilder::ReduceWindowWithGeneralPadding;
+%unignore xla::swig::ComputationBuilder::RngNormal;
+%unignore xla::swig::ComputationBuilder::RngUniform;
+%unignore xla::swig::ComputationBuilder::RngBernoulli;
+%unignore xla::swig::ComputationBuilder::While;
+%unignore xla::swig::ComputationBuilder::Conditional;
+%unignore xla::swig::ComputationBuilder::IsConstant;
+%unignore xla::swig::ComputationBuilder::Eq;
+%unignore xla::swig::ComputationBuilder::Ne;
+%unignore xla::swig::ComputationBuilder::Ge;
+%unignore xla::swig::ComputationBuilder::Gt;
+%unignore xla::swig::ComputationBuilder::Lt;
+%unignore xla::swig::ComputationBuilder::Le;
+%unignore xla::swig::ComputationBuilder::Dot;
+%unignore xla::swig::ComputationBuilder::DotGeneral;
+%unignore xla::swig::ComputationBuilder::ConvGeneralDilated;
+%unignore xla::swig::ComputationBuilder::Add;
+%unignore xla::swig::ComputationBuilder::Sub;
+%unignore xla::swig::ComputationBuilder::Mul;
+%unignore xla::swig::ComputationBuilder::Div;
+%unignore xla::swig::ComputationBuilder::Rem;
+%unignore xla::swig::ComputationBuilder::Max;
+%unignore xla::swig::ComputationBuilder::Min;
+%unignore xla::swig::ComputationBuilder::And;
+%unignore xla::swig::ComputationBuilder::Or;
+%unignore xla::swig::ComputationBuilder::Xor;
+%unignore xla::swig::ComputationBuilder::ShiftLeft;
+%unignore xla::swig::ComputationBuilder::ShiftRightArithmetic;
+%unignore xla::swig::ComputationBuilder::ShiftRightLogical;
+%unignore xla::swig::ComputationBuilder::Not;
+%unignore xla::swig::ComputationBuilder::Clz;
+%unignore xla::swig::ComputationBuilder::Abs;
+%unignore xla::swig::ComputationBuilder::Exp;
+%unignore xla::swig::ComputationBuilder::Expm1;
+%unignore xla::swig::ComputationBuilder::Floor;
+%unignore xla::swig::ComputationBuilder::Ceil;
+%unignore xla::swig::ComputationBuilder::Round;
+%unignore xla::swig::ComputationBuilder::Log;
+%unignore xla::swig::ComputationBuilder::Log1p;
+%unignore xla::swig::ComputationBuilder::Sign;
+%unignore xla::swig::ComputationBuilder::Cos;
+%unignore xla::swig::ComputationBuilder::Sin;
+%unignore xla::swig::ComputationBuilder::Tanh;
+%unignore xla::swig::ComputationBuilder::Atan2;
+%unignore xla::swig::ComputationBuilder::IsFinite;
+%unignore xla::swig::ComputationBuilder::Pow;
+%unignore xla::swig::ComputationBuilder::Neg;
+%unignore xla::swig::ComputationBuilder::Sort;
+%unignore xla::swig::ComputationBuilder::SortKeyVal;
+%unignore xla::swig::ComputationBuilder::Sqrt;
+%unignore xla::swig::ComputationBuilder::Rsqrt;
+%unignore xla::swig::ComputationBuilder::Square;
+%unignore xla::swig::ComputationBuilder::Reciprocal;
+%unignore xla::swig::ComputationBuilder::Erfc;
+%unignore xla::swig::ComputationBuilder::Erf;
+%unignore xla::swig::ComputationBuilder::ErfInv;
+%unignore xla::swig::ComputationBuilder::Lgamma;
+%unignore xla::swig::ComputationBuilder::Digamma;
+%unignore xla::swig::ComputationBuilder::Acos;
+%unignore xla::swig::ComputationBuilder::Asin;
+%unignore xla::swig::ComputationBuilder::Atan;
+%unignore xla::swig::ComputationBuilder::Tan;
+%unignore xla::swig::ComputationBuilder::Acosh;
+%unignore xla::swig::ComputationBuilder::Asinh;
+%unignore xla::swig::ComputationBuilder::Atanh;
+%unignore xla::swig::ComputationBuilder::Cosh;
+%unignore xla::swig::ComputationBuilder::Sinh;
+%unignore xla::swig::ComputationBuilder::Real;
+%unignore xla::swig::ComputationBuilder::Imag;
+%unignore xla::swig::ComputationBuilder::Conj;
+%unignore xla::swig::ComputationBuilder::Complex;
+%unignore xla::swig::ComputationBuilder::Cholesky;
+%unignore xla::swig::ComputationBuilder::QR;
+%unignore xla::swig::ComputationBuilder::TriangularSolve;
+%unignore xla::swig::ComputationBuilder::CustomCall;
+%unignore xla::swig::ComputationBuilder::Gather;
+%unignore xla::swig::ComputationBuilder::Scatter;
+%unignore xla::swig::DeleteComputation;
 %unignore xla::swig::DeleteLocalShapedBuffer;
-%unignore xla::swig::DeleteXrtAllocation;
-%unignore xla::swig::DeleteCompiledLocalComputation;
-%unignore xla::swig::DeleteCompiledXrtComputation;
+%unignore xla::swig::DeleteLocalExecutable;
 
 %thread;
 %include "tensorflow/compiler/xla/python/local_computation_builder.h"
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
index 52c5c621f7294c5da341879d15b77559fe870551..74f45b7cdcfd7d7b10a5832be37ac1fb34057743 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.cc
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -26,6 +26,10 @@ namespace swig {
 
 namespace numpy {
 
+Safe_PyObjectPtr make_safe(PyObject* object) {
+  return Safe_PyObjectPtr(object);
+}
+
 int PrimitiveTypeToNumpyType(PrimitiveType primitive_type) {
   switch (primitive_type) {
     case PRED:
@@ -123,28 +127,42 @@ bool NumpyTypeIsValid(int np_type) {
   }
 }
 
-PyObject* PyShapeInfoFromXlaShape(const Shape& shape) {
+Safe_PyObjectPtr PyShapeInfoFromXlaShape(const Shape& shape) {
   int np_typenum = PrimitiveTypeToNumpyType(shape.element_type());
   PyArray_Descr* np_dtype = PyArray_DescrFromType(np_typenum);
 
-  PyObject* dimensions;
+  Safe_PyObjectPtr dimensions;
   if (shape.IsTuple()) {
     int num_elements = ShapeUtil::TupleElementCount(shape);
-    dimensions = PyTuple_New(ShapeUtil::TupleElementCount(shape));
+    dimensions = make_safe(PyTuple_New(ShapeUtil::TupleElementCount(shape)));
     for (int i = 0; i < num_elements; ++i) {
       PyTuple_SET_ITEM(
-          dimensions, i,
-          PyShapeInfoFromXlaShape(ShapeUtil::GetTupleElementShape(shape, i)));
+          dimensions.get(), i,
+          PyShapeInfoFromXlaShape(ShapeUtil::GetTupleElementShape(shape, i))
+              .release());
     }
   } else {
     int rank = shape.rank();
-    dimensions = PyTuple_New(rank);
+    dimensions = make_safe(PyTuple_New(rank));
     for (int i = 0; i < rank; ++i) {
-      PyTuple_SET_ITEM(dimensions, i,
+      PyTuple_SET_ITEM(dimensions.get(), i,
                        LongToPyIntOrPyLong(ShapeUtil::GetDimension(shape, i)));
     }
   }
-  return PyTuple_Pack(2, np_dtype, dimensions);
+  return make_safe(PyTuple_Pack(2, np_dtype, dimensions.release()));
+}
+
+Safe_PyObjectPtr PyProgramShapeInfoFromXlaProgramShape(
+    const ProgramShape& shape) {
+  Safe_PyObjectPtr arg_shapes = make_safe(PyTuple_New(shape.parameters_size()));
+  for (int i = 0; i < shape.parameters_size(); ++i) {
+    PyTuple_SET_ITEM(arg_shapes.get(), i,
+                     PyShapeInfoFromXlaShape(shape.parameters(i)).release());
+  }
+
+  Safe_PyObjectPtr result_shape = PyShapeInfoFromXlaShape(shape.result());
+  return make_safe(
+      PyTuple_Pack(2, arg_shapes.release(), result_shape.release()));
 }
 
 // Precondition: o->ob_type == &PyArrayDescr_Type
@@ -349,13 +367,17 @@ StatusOr<OpMetadata> OpMetadataFromPyObject(PyObject* o) {
   return result;
 }
 
-PyObject* PyObjectFromXlaLiteral(const LiteralSlice& literal) {
+StatusOr<Safe_PyObjectPtr> PyObjectFromXlaLiteral(const LiteralSlice& literal) {
   if (literal.shape().IsTuple()) {
     int num_elements = ShapeUtil::TupleElementCount(literal.shape());
-    PyObject* tuple = PyTuple_New(num_elements);
+    std::vector<Safe_PyObjectPtr> elems(num_elements);
+    for (int i = 0; i < num_elements; i++) {
+      TF_ASSIGN_OR_RETURN(elems[i],
+                          PyObjectFromXlaLiteral(LiteralSlice(literal, {i})));
+    }
+    Safe_PyObjectPtr tuple = make_safe(PyTuple_New(num_elements));
     for (int i = 0; i < num_elements; i++) {
-      PyTuple_SET_ITEM(tuple, i,
-                       PyObjectFromXlaLiteral(LiteralSlice(literal, {i})));
+      PyTuple_SET_ITEM(tuple.get(), i, elems[i].release());
     }
     return tuple;
   } else {
@@ -365,10 +387,10 @@ PyObject* PyObjectFromXlaLiteral(const LiteralSlice& literal) {
       dimensions[i] = ShapeUtil::GetDimension(literal.shape(), i);
     }
     int np_type = PrimitiveTypeToNumpyType(literal.shape().element_type());
-    PyObject* array =
-        PyArray_EMPTY(rank, dimensions.data(), np_type, /*fortran=*/0);
-    CopyLiteralToNumpyArray(np_type, literal,
-                            reinterpret_cast<PyArrayObject*>(array));
+    Safe_PyObjectPtr array = make_safe(
+        PyArray_EMPTY(rank, dimensions.data(), np_type, /*fortran=*/0));
+    TF_RETURN_IF_ERROR(CopyLiteralToNumpyArray(
+        np_type, literal, reinterpret_cast<PyArrayObject*>(array.get())));
     return array;
   }
 }
@@ -408,6 +430,12 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
     case NPY_BOOL:
       CopyNumpyArrayToLiteral<bool>(py_array, literal);
       break;
+    case NPY_INT8:
+      CopyNumpyArrayToLiteral<int8>(py_array, literal);
+      break;
+    case NPY_INT16:
+      CopyNumpyArrayToLiteral<int16>(py_array, literal);
+      break;
     case NPY_INT32:
       CopyNumpyArrayToLiteral<int32>(py_array, literal);
       break;
@@ -417,6 +445,9 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
     case NPY_UINT8:
       CopyNumpyArrayToLiteral<uint8>(py_array, literal);
       break;
+    case NPY_UINT16:
+      CopyNumpyArrayToLiteral<uint16>(py_array, literal);
+      break;
     case NPY_UINT32:
       CopyNumpyArrayToLiteral<uint32>(py_array, literal);
       break;
@@ -445,12 +476,18 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
   return Status::OK();
 }
 
-void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
-                             PyArrayObject* py_array) {
+Status CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
+                               PyArrayObject* py_array) {
   switch (np_type) {
     case NPY_BOOL:
       CopyLiteralToNumpyArray<bool>(literal, py_array);
       break;
+    case NPY_INT8:
+      CopyLiteralToNumpyArray<int8>(literal, py_array);
+      break;
+    case NPY_INT16:
+      CopyLiteralToNumpyArray<int16>(literal, py_array);
+      break;
     case NPY_INT32:
       CopyLiteralToNumpyArray<int32>(literal, py_array);
       break;
@@ -460,6 +497,9 @@ void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
     case NPY_UINT8:
       CopyLiteralToNumpyArray<uint8>(literal, py_array);
       break;
+    case NPY_UINT16:
+      CopyLiteralToNumpyArray<uint16>(literal, py_array);
+      break;
     case NPY_UINT32:
       CopyLiteralToNumpyArray<uint32>(literal, py_array);
       break;
@@ -482,8 +522,10 @@ void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
       CopyLiteralToNumpyArray<complex128>(literal, py_array);
       break;
     default:
-      LOG(FATAL) << "No XLA literal container for Numpy type" << np_type;
+      return InvalidArgument(
+          "No XLA literal container for Numpy type number: %d", np_type);
   }
+  return Status::OK();
 }
 
 PyObject* LongToPyIntOrPyLong(long x) {  // NOLINT
@@ -525,6 +567,92 @@ PyObject* PyNumberToPyInt(PyObject* o) {
 
 }  // namespace numpy
 
+bool GetIntAttr(PyObject* o, const char* field, int64* result) {
+  PyObject* fo = PyObject_GetAttrString(o, field);
+  if (!fo) {
+    return false;
+  }
+  const int64 value = numpy::PyIntOrPyLongToLong(fo);
+  if (value == -1 && PyErr_Occurred()) {
+    Py_DECREF(fo);
+    return false;
+  }
+  Py_DECREF(fo);
+  *result = value;
+  return true;
+}
+
+// Returns "ok"; true if there is no error, false if there was an error.
+bool HandleStringAttribute(PyObject* o, const char* attr_name,
+                           std::function<void(string s)> f) {
+  if (!PyObject_HasAttrString(o, attr_name)) {
+    return true;  // It's ok for the object to not have the attribute.
+  }
+  PyObject* attr = PyObject_GetAttrString(o, attr_name);
+  if (attr == nullptr) {
+    return false;  // An error occurred getting the attribute.
+  }
+  if (attr == Py_None) {
+    Py_DECREF(attr);
+    return true;  // The attribute is None, which we consider ok.
+  }
+#if PY_MAJOR_VERSION < 3
+  if (!PyString_Check(attr)) {
+    string message = absl::StrFormat("%s must be a string or none; got %s",
+                                     attr_name, numpy::PyObjectCppRepr(attr));
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+    Py_DECREF(attr);
+    return false;  // Type error, not ok.
+  }
+  f(PyString_AsString(attr));
+#else
+  if (!PyBytes_Check(attr)) {
+    string message = absl::StrFormat("%s must be a string or none; got %s",
+                                     attr_name, numpy::PyObjectCppRepr(attr));
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+    Py_DECREF(attr);
+    return false;  // Type error, not ok.
+  }
+  f(PyBytes_AsString(attr));
+#endif
+
+  Py_DECREF(attr);
+  return true;  // Handled string attribute, ok!
+}
+
+bool HandleRepeatedInt64Attribute(
+    PyObject* o, const char* attr_name,
+    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* field) {
+  PyObject* seq = PyObject_GetAttrString(o, attr_name);
+  if (!seq) {
+    return false;
+  }
+
+  int length = PySequence_Size(seq);
+  if (length == -1) {
+    Py_DECREF(seq);
+    return false;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(seq, i);
+    if (!item) {
+      Py_DECREF(seq);
+      return false;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(seq);
+      return false;
+    }
+    *field->Add() = dimension;
+    Py_DECREF(item);
+  }
+  Py_DECREF(seq);
+  return true;
+}
+
 }  // namespace swig
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h b/tensorflow/compiler/xla/python/numpy_bridge.h
index 40ff2d9ad214cc4dcad42234fa296834cbc92882..eff8cda334f00050605febad66a61aa1c518c500 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.h
+++ b/tensorflow/compiler/xla/python/numpy_bridge.h
@@ -36,6 +36,16 @@ namespace swig {
 
 namespace numpy {
 
+struct PyDecrefDeleter {
+  void operator()(PyObject* p) const { Py_DECREF(p); }
+};
+
+// Safe container for an owned PyObject. On destruction, the reference count of
+// the contained object will be decremented.
+using Safe_PyObjectPtr = std::unique_ptr<PyObject, PyDecrefDeleter>;
+
+Safe_PyObjectPtr make_safe(PyObject* object);
+
 // Maps XLA primitive types (PRED, S8, F32, ..., and TUPLE) to numpy
 // dtypes (NPY_BOOL, NPY_INT8, NPY_FLOAT32, ..., and NPY_OBJECT), and
 // vice versa.
@@ -54,7 +64,13 @@ bool NumpyTypeIsValid(int np_type);
 // providing the array dimensions.
 //
 // The return value is a new reference.
-PyObject* PyShapeInfoFromXlaShape(const Shape& shape);
+Safe_PyObjectPtr PyShapeInfoFromXlaShape(const Shape& shape);
+
+// Returns a pair of (arg_shapes, result_shape), where arg_shapes is a tuple
+// of argument shapes and result_shape is the result shape. Each shape is as
+// described in in PyShapeInfoFromXlaShape's comment.
+Safe_PyObjectPtr PyProgramShapeInfoFromXlaProgramShape(
+    const ProgramShape& shape);
 
 // Converts a Python object with a method interface mathing that of
 // xla_client.Shape into an XLA Shape object.
@@ -74,7 +90,7 @@ StatusOr<OpMetadata> OpMetadataFromPyObject(PyObject* o);
 // array data.
 //
 // The return value is a new reference.
-PyObject* PyObjectFromXlaLiteral(const LiteralSlice& literal);
+StatusOr<Safe_PyObjectPtr> PyObjectFromXlaLiteral(const LiteralSlice& literal);
 
 // Converts a Numpy ndarray or a nested Python tuple thereof to a
 // corresponding XLA literal.
@@ -90,8 +106,8 @@ StatusOr<Literal> XlaLiteralFromPyObject(PyObject* o);
 Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
                                Literal* literal);
 
-void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
-                             PyArrayObject* py_array);
+Status CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
+                               PyArrayObject* py_array);
 
 template <typename NativeT>
 void CopyNumpyArrayToLiteral(PyArrayObject* py_array, Literal* literal) {
@@ -120,6 +136,18 @@ PyObject* PyNumberToPyInt(PyObject* o);
 
 }  // namespace numpy
 
+// Miscellaneous swig helpers that don't have a better home.
+
+bool GetIntAttr(PyObject* o, const char* field, int64* result);
+
+// Returns "ok"; true if there is no error, false if there was an error.
+bool HandleStringAttribute(PyObject* o, const char* attr_name,
+                           std::function<void(string s)> f);
+
+bool HandleRepeatedInt64Attribute(
+    PyObject* o, const char* attr_name,
+    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* field);
+
 }  // namespace swig
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/pywrap_xla_exported_symbols.lds b/tensorflow/compiler/xla/python/pywrap_xla_exported_symbols.lds
index bce6c1acf8a1cc0005ca93e0466c5a0e29d880de..ef77ed3d95850fdfc7145e6fe1df4833d20bb7df 100644
--- a/tensorflow/compiler/xla/python/pywrap_xla_exported_symbols.lds
+++ b/tensorflow/compiler/xla/python/pywrap_xla_exported_symbols.lds
@@ -1 +1,2 @@
 _PyInit__pywrap_xla
+_init_pywrap_xla
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 37cae0e3b6b8635ca53e282994f0d078974df5a9..9019a979a61c6ebb62adaa5503560c604e2b30f8 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""An in-process, local XLA client in Python, supporting AOT compilation."""
+"""An XLA client in Python, supporting AOT compilation."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 import collections
 import enum  # pylint: disable=g-bad-import-order
 import inspect
@@ -33,13 +34,32 @@ from tensorflow.compiler.xla import xla_data_pb2
 from tensorflow.compiler.xla.python import pywrap_xla as c_api
 from tensorflow.compiler.xla.service import hlo_pb2
 
+# Import the XRT backend, if available.
+try:
+  # pylint: disable=g-import-not-at-top
+  from tensorflow.compiler.xla.python import pywrap_xrt as xrt_api
+except ImportError:
+  xrt_api = None
+
 
 # Most functions are snake_case for consistency with other modules, whereas
-# method names of ComputationBuilder and LocalComputation are CamelCase for
+# method names of ComputationBuilder and Computation are CamelCase for
 # consistency with XLA.
 # pylint: disable=invalid-name
 
 
+# Version of the XLA Python client.
+#
+# JAX packages the XLA python plugin as a binary pip module (jaxlib) that is
+# packaged separately from the Python code that consumes it (jax).
+#
+# We occasionally need to make backwards-incompatible changes to jaxlib, in
+# which case we need to be able to detect when incompatible versions are
+# installed.
+def version():
+  return (0, 1, 8)
+
+
 _OP_METADATA_FIELDS = [
     'op_type',
     'op_name',
@@ -49,13 +69,163 @@ _OP_METADATA_FIELDS = [
 OpMetadata = collections.namedtuple('OpMetadata', _OP_METADATA_FIELDS)
 
 
+@six.add_metaclass(abc.ABCMeta)
+class Backend(object):
+  """Abstract base class for XLA backends."""
+
+  @abc.abstractmethod
+  def device_count(self):
+    """Returns the number of devices known to the backend."""
+
+  @abc.abstractmethod
+  def buffer_from_pyval(self, pyval, device=0):
+    """Allocates a fresh buffer and populates it with `pyval`."""
+
+  @abc.abstractmethod
+  def delete_buffer(self, c_buffer):
+    """Deletes buffer `c_buffer`."""
+
+  @abc.abstractmethod
+  def destructure_tuple(self, c_buffer):
+    """Destructures a tuple buffer into a sequence of buffers."""
+
+  @abc.abstractmethod
+  def compile(self, computation, argument_shapes, result_shape,
+              compile_options):
+    """Compiles a computation. Returns an executable."""
+
+  @abc.abstractmethod
+  def delete_executable(self, executable):
+    """Deletes an executable."""
+
+  @abc.abstractmethod
+  def execute(self, executable, args):
+    """Runs an executable without replication."""
+
+  @abc.abstractmethod
+  def execute_replicated(self, executable, per_replica_args):
+    """Runs an executable in a replicated manner."""
+
+
+def _maybe_encode_string(s):
+  if six.PY3:
+    return s.encode('utf-8')
+  else:
+    return s
+
+
+class XlaLocalBackend(Backend):
+  """XLA backend implemented using the in-process xla::LocalClient API."""
+
+  def __init__(self, platform=None):
+    platform = platform or _get_default_platform_name()
+    self.client = c_api.LocalClient.Get(_maybe_encode_string(platform))
+    self._delete_buffer = c_api.DeleteLocalShapedBuffer
+    self._delete_executable = c_api.DeleteLocalExecutable
+
+  def device_count(self):
+    return self.client.DeviceCount()
+
+  def buffer_from_pyval(self, pyval, device=0):
+    return c_api.LocalShapedBuffer.FromLiteral(pyval, None, self.client, device)
+
+  def delete_buffer(self, c_buffer):
+    self._delete_buffer(c_buffer)
+
+  def destructure_tuple(self, c_buffer):
+    result = c_buffer.DestructureTuple()
+    return [result.Release(i) for i in xrange(result.size())]
+
+  def compile(self, c_computation, argument_shapes, result_shape,
+              compile_options):
+    return c_computation.Compile(argument_shapes, compile_options, self.client)
+
+  def delete_executable(self, executable):
+    self._delete_executable(executable)
+
+  def execute(self, executable, args):
+    return executable.Execute(args)
+
+  def execute_replicated(self, executable, per_replica_args):
+    output_buffer_tup = executable.ExecutePerReplica(per_replica_args)
+    size = output_buffer_tup.size()
+    return [output_buffer_tup.Release(i) for i in xrange(size)]
+
+
+class XrtBackend(Backend):
+  """XLA backend implemented using XRT."""
+
+  def __init__(self, target):
+    self.target = target
+    self._delete_buffer = xrt_api.DeleteXrtAllocation
+    self._delete_executable = xrt_api.DeleteXrtExecutable
+
+  def device_count(self):
+    return 1  # Multidevice execution not implemented.
+
+  def buffer_from_pyval(self, pyval, device=0):
+    if device != 0:
+      raise NotImplementedError(
+          'Multi-replica execution is not yet supported via the XRT backend.')
+    return xrt_api.XrtAllocation.FromLiteral(pyval,
+                                             _maybe_encode_string(self.target))
+
+  def delete_buffer(self, c_buffer):
+    self._delete_buffer(c_buffer)
+
+  def destructure_tuple(self, c_buffer):
+    result = xrt_api.DestructureXrtAllocationTuple(
+        c_buffer, _maybe_encode_string(self.target))
+    return [result.Release(i) for i in xrange(result.size())]
+
+  def compile(self, c_computation, argument_shapes, result_shape,
+              compile_options):
+    return xrt_api.XrtExecutable.CompileForXrt(
+        c_computation.GetSerializedProto(), argument_shapes, result_shape,
+        _maybe_encode_string(self.target))
+
+  def delete_executable(self, executable):
+    self._delete_executable(executable)
+
+  def execute(self, executable, args):
+    return executable.Execute(args)
+
+  def execute_replicated(self, executable, per_replica_args):
+    if len(per_replica_args) != 1:
+      raise NotImplementedError(
+          'Multi-replica execution is not yet supported via the XRT backend.')
+    return [executable.Execute(per_replica_args[0])]
+
+
+_default_platform_name = 'Host'
+_default_backend = None
+
+
+def _get_default_platform_name():
+  return _default_platform_name
+
+
+def _get_default_local_backend():
+  global _default_backend
+  global _default_platform_name
+  if _default_backend is None:
+    _default_backend = XlaLocalBackend(_default_platform_name)
+  return _default_backend
+
+
 class BackendType(enum.Enum):
   XLA_LOCAL = 1
   XRT = 2
 
 
-BackendSpec = collections.namedtuple('Backend', ('backend_type', 'target'))
-XLA_LOCAL_BACKEND = BackendSpec(BackendType.XLA_LOCAL, 'local')
+def BackendSpec(backend, target):
+  """Compatibility wrapper to support older clients. Do not use in new code."""
+  if backend == BackendType.XLA_LOCAL:
+    return _get_default_local_backend()
+  elif backend == BackendType.XRT:
+    return XrtBackend(target)
+  else:
+    raise ValueError('Unknown backend {}'.format(backend))
 
 
 def OpMetadataToProto(pyobj):
@@ -78,13 +248,6 @@ def CurrentSourceInfoMetadata(op_type=None, op_name=None, skip_frames=1):
       source_line=lineno)
 
 
-def _maybe_encode_string(s):
-  if six.PY3:
-    return s.encode('utf-8')
-  else:
-    return s
-
-
 class PaddingType(enum.Enum):
   VALID = 1
   SAME = 2
@@ -122,6 +285,7 @@ def _convert_padding_type_to_pad_values(padding_type, lhs_dims, rhs_dims,
 
 _UNARY_OPS = [
     'Not',
+    'Clz',
     'Abs',
     'Exp',
     'Expm1',
@@ -223,33 +387,18 @@ class LocalBuffer(object):
   means the referent is in device memory.
   """
 
-  def __init__(self, c_buffer, backend, replica):
+  def __init__(self, c_buffer, backend, device):
     self.c_buffer = c_buffer
     self._backend = backend
-    self._replica = replica
-    if backend.backend_type == BackendType.XRT:
-      self._delete = c_api.DeleteXrtAllocation
-    else:
-      self._delete = c_api.DeleteLocalShapedBuffer
+    self._device = device
 
   @staticmethod
-  def from_pyval(pyval, replica=0, backend=XLA_LOCAL_BACKEND):
+  def from_pyval(pyval, device=0, backend=None):
     """Allocate and copy to XLA the given python value."""
+    backend = backend or _get_default_local_backend()
     pyval = require_numpy_array_layout(pyval)
-    num_replicas = get_replica_count()
-    if not 0 <= replica < num_replicas:
-      raise ValueError(
-          'Attempt to place buffer on replica {} when the replica count is {}'
-          .format(replica, num_replicas))
-    if backend.backend_type == BackendType.XRT:
-      if replica != 0:
-        raise NotImplementedError(
-            'Multi-replica execution is not yet supported via the XRT backend.')
-      cbuf = c_api.XrtAllocation.FromLiteral(
-          pyval, _maybe_encode_string(backend.target))
-    else:
-      cbuf = c_api.LocalShapedBuffer.FromLiteral(pyval, None, replica)
-    return LocalBuffer(cbuf, backend, replica)
+    cbuf = backend.buffer_from_pyval(pyval, device)
+    return LocalBuffer(cbuf, backend, device)
 
   def to_py(self):
     return self.c_buffer.ToLiteral()
@@ -257,29 +406,22 @@ class LocalBuffer(object):
   def shape(self):
     return _wrap_shape(self.c_buffer.shape())
 
-  def replica(self):
-    return self._replica
+  def device(self):
+    return self._device
 
   def delete(self):
     if self.c_buffer is not None:
-      self._delete(self.c_buffer)
+      self._backend.delete_buffer(self.c_buffer)
       self.c_buffer = None
 
   def destructure(self):
     """Assuming a tuple buffer, unpack it into constituent tuple elements."""
     assert self.c_buffer is not None
-    if self._backend.backend_type == BackendType.XRT:
-      result = c_api.DestructureXrtAllocationTuple(
-          self.c_buffer, _maybe_encode_string(self._backend.target))
-    else:
-      result = c_api.DestructureLocalShapedBufferTuple(self.c_buffer)
+    result = self._backend.destructure_tuple(self.c_buffer)
     self.delete()
-    size = result.size()
-    destructured = tuple(
-        LocalBuffer(
-            result.Release(i), replica=self._replica, backend=self._backend)
-        for i in xrange(size))
-    return destructured
+    return tuple(
+        LocalBuffer(sub_buffer, device=self._device, backend=self._backend)
+        for sub_buffer in result)
 
   def is_deleted(self):
     return self.c_buffer is None
@@ -428,6 +570,34 @@ class Shape(object):
     updated._check_minor_to_major()  # pylint: disable=protected-access
     return updated
 
+  def with_major_to_minor_layout_if_absent(self):
+    """Returns a copy of a shape with missing layouts set to major-to-minor."""
+
+    def f(a):
+      if a.minor_to_major():
+        return None
+      return a.update_minor_to_major(tuple(xrange(a.rank() - 1, -1, -1)))
+
+    return self.map_leaves(f)
+
+  def serialize(self, proto):
+    """Serializes 'shape' into proto."""
+    if self.is_tuple():
+      proto.element_type = xla_data_pb2.TUPLE
+      for shape in self.tuple_shapes():
+        shape.serialize(proto.tuple_shapes.add())
+    else:
+      proto.element_type = dtype_to_etype(self.element_type())
+      proto.dimensions.extend(self.dimensions())
+      proto.is_dynamic_dimension.extend([False for _ in self.dimensions()])
+      if self.minor_to_major():
+        proto.layout.format = xla_data_pb2.DENSE
+        proto.layout.minor_to_major.extend(self.minor_to_major())
+
+
+ProgramShape = collections.namedtuple('ProgramShape',
+                                      ('parameter_shapes', 'result_shape'))
+
 
 def _wrap_shape(shape_info):
   dtype, dims = shape_info
@@ -439,6 +609,12 @@ def _wrap_shape(shape_info):
     return Shape.array_shape(dtype, dims)
 
 
+def _wrap_program_shape(shape_info):
+  arg_shapes, result_shape = shape_info
+  return ProgramShape([_wrap_shape(arg) for arg in arg_shapes],
+                      _wrap_shape(result_shape))
+
+
 def require_numpy_array_layout(value):
   if isinstance(value, tuple):
     return tuple(require_numpy_array_layout(x) for x in value)
@@ -462,7 +638,7 @@ class CompileOptions(object):
     self.num_replicas = get_replica_count()
 
 
-def transfer_to_infeed(value, replica_number=None):
+def transfer_to_infeed(value, device_ordinal=0):
   """Transfers the given value into the XLA infeed queue.
 
   XLA's infeed queue is a single queue that feeds the "XLA virtual machine" with
@@ -472,64 +648,50 @@ def transfer_to_infeed(value, replica_number=None):
   Args:
     value: the value that the caller would like to enqueue into the XLA infeed
       queue
-    replica_number: the replica number to infeed the value to -- if not
-      provided, then the default replica (trivially replica 0) is used.
+    device_ordinal: the device to infeed the value to. Each device has a
+      distinct infeed queue.
   """
-  if replica_number is None:
-    c_api.TransferToInfeedLocal(require_numpy_array_layout(value))
-  else:
-    c_api.TransferToInfeedLocalReplica(
-        require_numpy_array_layout(value), replica_number)
+  # TODO(phawkins): support non-default backends.
+  backend = _get_default_local_backend()
+  backend.client.TransferToInfeed(
+      require_numpy_array_layout(value), device_ordinal)
 
 
-def transfer_from_outfeed(shape, replica_number=None):
-  """Transfers a literal of the given shape from replica_number's outfeed.
+def transfer_from_outfeed(shape, device_ordinal=0):
+  """Transfers a literal of the given shape from `device_ordinal`'s outfeed.
 
   Args:
     shape: The shape of the value to transfer from outfeed.
-    replica_number: The replica number ordinal to transfer the outfeed value
-      from. (Each replica has a distinct outfeed queue.)
+    device_ordinal: The device ordinal to transfer the outfeed value from. Each
+      device has a distinct outfeed queue..
 
   Returns:
     The literal value that is produced from the outfeed queue.
   """
-  return c_api.TransferFromOutfeedLocalReplica(shape, replica_number or 0)
+  # TODO(phawkins): support non-default backends.
+  backend = _get_default_local_backend()
+  return backend.client.TransferFromOutfeed(shape, device_ordinal)
 
 
-class LocalComputation(object):
-  """Python wrapper for a local XLA Computation.
+class Computation(object):
+  """Python wrapper for an XLA Computation.
 
-  A LocalComputation can be executed if it is compiled. Otherwise, it
-  can still be used as a Computation where required by the
-  ComputationBuilder methods.
+  A Computation can be compiled to form an Executable, or used as a
+  subcomputation in ComputationBuilder methods.
   """
 
-  def __init__(self, c_computation, is_compiled, backend=XLA_LOCAL_BACKEND):
+  def __init__(self, c_computation, backend=None):
     self._c_computation = c_computation
+    # The backend argument is deprecated. Pass a backend to Compile() instead.
     self._backend = backend
-    self._is_compiled = is_compiled
-
-    # Ensure a reference to C-based destructor for use in __del__.
-    if is_compiled:
-      if backend.backend_type == BackendType.XRT:
-        assert isinstance(c_computation, c_api.CompiledXrtComputation)
-        self._delete = c_api.DeleteCompiledXrtComputation
-      else:
-        assert isinstance(c_computation, c_api.CompiledLocalComputation)
-        self._delete = c_api.DeleteCompiledLocalComputation
-    else:
-      assert isinstance(c_computation, c_api.LocalComputation)
-      self._delete = c_api.DeleteLocalComputation
+    self._delete_computation = c_api.DeleteComputation
 
   @property
   def computation(self):
-    if self._is_compiled:
-      raise ValueError(
-          'Attempt to read the XLA computation of a compiled LocalComputation.')
     return self._c_computation
 
   def GetProto(self):
-    """Get the HloModuleProto proto object in this local computation.
+    """Get the HloModuleProto proto object in this computation.
 
     Returns:
        An HloModuleProto proto object that has the whole-graph information.
@@ -538,30 +700,41 @@ class LocalComputation(object):
     proto = hlo_pb2.HloModuleProto.FromString(serialized)
     return proto
 
-  def Compile(self, argument_shapes=(), compile_options=None, layout_fn=None):
-    """Compiles an un-compiled local computation.
+  def GetHloText(self):
+    """Get the textual HLO representation of this computation.
+
+    Returns:
+       A string containing the textual HLO.
+    """
+    return self.computation.GetHloText()
+
+  def GetHloDotGraph(self):
+    """Get a Graphviz Dot representation of this computation.
+
+    Returns:
+       A string containing the graphviz dot graph.
+    """
+    return self.computation.GetHloDotGraph()
 
-    Local computations are the result of a "LocalComputationBuild'ing" process
-    -- they start in uncompiled form, and via a call to Compile() turn into a
-    compiled local computation.
+  def Compile(self, argument_shapes=(), compile_options=None, layout_fn=None,
+              backend=None):
+    """Compiles a computation.
 
-    Raises:
-      ValueError: if this is already a compiled local computation.
+    Computations are the result of a "ComputationBuild'ing" process.
 
     Arguments:
       argument_shapes: parameter shapes -- they are first laid out by layout_fn
         if layout_fn is provided. Otherwise, the default layout for those shapes
         will be used.
-      compile_options: options to use for compilation, includes an optional
-        laid out result shape for the computation.
+      compile_options: options to use for compilation, includes an optional laid
+        out result shape for the computation.
       layout_fn: lambda that is used to lay out the argument/result shapes.
+      backend: a `Backend` for which an executable should be generated.
 
     Returns:
-      A newly *compiled* local computation instance.
+      A Executable instance.
     """
-    if self._is_compiled:
-      raise ValueError('Attempt to compile a compiled local XLA computation.')
-
+    backend = backend or self._backend or _get_default_local_backend()
     result_shape = _wrap_shape(self.computation.GetReturnValueShape())
 
     if layout_fn:
@@ -574,32 +747,52 @@ class LocalComputation(object):
 
     compile_options = compile_options or CompileOptions()
     compile_options.result_shape = result_shape
-    if self._backend.backend_type == BackendType.XRT:
-      c = self.computation.CompileForXrt(
-          argument_shapes, _maybe_encode_string(self._backend.target))
-    else:
-      c = self.computation.Compile(argument_shapes, compile_options)
-    return LocalComputation(c, is_compiled=True, backend=self._backend)
+    c = backend.compile(self.computation, argument_shapes, result_shape,
+                        compile_options)
+    return Executable(c, backend=backend)
 
   def CompileWithExampleArguments(self,
                                   arguments=(),
                                   compile_options=None,
-                                  layout_fn=None):
+                                  layout_fn=None,
+                                  backend=None):
     return self.Compile(
         argument_shapes=[Shape.from_pyval(arg) for arg in arguments],
         compile_options=compile_options,
-        layout_fn=layout_fn)
+        layout_fn=layout_fn,
+        backend=backend)
+
+  def GetProgramShape(self):
+    return _wrap_program_shape(self._c_computation.GetProgramShape())
 
   def GetReturnValueShape(self):
     return _wrap_shape(self._c_computation.GetReturnValueShape())
 
+  def __del__(self):
+    if self._c_computation:
+      self._delete_computation(self._c_computation)
+
+
+class Executable(object):
+  """Python wrapper for an XLA Executable."""
+
+  def __init__(self, c_executable, backend=None):
+    self._c_executable = c_executable
+    self._device_ordinals = c_executable.DeviceOrdinals()
+    self._backend = backend
+
+  def DeviceOrdinals(self):
+    """Returns a list containing the device ordinals for each replica."""
+    return self._device_ordinals
+
   def Execute(self, arguments=(), check_for_deleted_args=True):
     """Execute on one replica with LocalBuffer arguments and return value."""
     if check_for_deleted_args and any(arg.is_deleted() for arg in arguments):
       raise ValueError('Executing with deleted local buffer argument')
     raw_args = [arg.c_buffer for arg in arguments]
-    output_buffer = self._c_computation.Execute(raw_args)
-    return LocalBuffer(output_buffer, backend=self._backend, replica=0)
+    output_buffer = self._backend.execute(self._c_executable, raw_args)
+    return LocalBuffer(
+        output_buffer, backend=self._backend, device=self._device_ordinals[0])
 
   def ExecutePerReplica(self, arguments=None):
     """Execute on many replicas with LocalBuffer arguments and return value.
@@ -609,14 +802,12 @@ class LocalComputation(object):
         sequence comprises the arguments for execution on the i'th replica.
 
     Returns:
-      A list of the computation's outputs on each replica, as a LocalBuffer. If
+      A list of the computation's outputs for each replica, as a LocalBuffer. If
       a shallow sequence of arguments was passed in for `arguments`, then the
       sole, zero'th replica's output is returned instead, as a LocalBuffer.
     """
-    if not self._is_compiled:
-      raise ValueError('Cannot execute an uncompiled local XLA computation.')
     if arguments is None:
-      arguments = ((),) * get_replica_count()
+      arguments = ((),) * len(self._device_ordinals)
     else:
       arguments = [list(replica_args) for replica_args in arguments]
 
@@ -625,37 +816,35 @@ class LocalComputation(object):
       for arg in replica_args:
         if arg.is_deleted():
           raise ValueError('Executing with deleted local buffer argument')
-        if arg.replica() != replica:
+        if arg.device() != self._device_ordinals[replica]:
           raise ValueError(
-              'Executing on replica {} with argument from replica {}'.format(
-                  replica, arg.replica()))
+              'Executing on device {} with argument from device {}'.format(
+                  self._device_ordinals[replica], arg.device()))
 
     # Pull out argument buffer handles
+    # pylint: disable=g-complex-comprehension
     stripped_args = [
         [arg.c_buffer for arg in replica_args] for replica_args in arguments
     ]
 
     # Execute
-    if self._backend.backend_type == BackendType.XRT:
-      if len(stripped_args) > 1:
-        raise NotImplementedError(
-            'Multi-replica execution is not yet supported via the XRT backend.')
-      output_buffers = [self._c_computation.Execute(stripped_args[0])]
-    else:
-      output_buffer_tup = self._c_computation.ExecutePerReplica(stripped_args)
-      size = output_buffer_tup.size()
-      output_buffers = [output_buffer_tup.Release(i) for i in xrange(size)]
+    output_buffers = self._backend.execute_replicated(self._c_executable,
+                                                      stripped_args)
 
     # Wrap output handles in LocalBuffer instances
     return tuple(
-        LocalBuffer(output_buffer, backend=self._backend, replica=replica)
+        LocalBuffer(
+            output_buffer,
+            backend=self._backend,
+            device=self._device_ordinals[replica])
         for replica, output_buffer in enumerate(output_buffers))
 
   def ExecuteWithPythonValues(self, arguments=()):
     """Execute on one replica with Python values as arguments and output."""
 
     def put(arg):
-      return LocalBuffer.from_pyval(arg, backend=self._backend)
+      return LocalBuffer.from_pyval(
+          arg, device=self._device_ordinals[0], backend=self._backend)
 
     arguments = [put(arg) for arg in arguments]
     return self.Execute(arguments).to_py()
@@ -663,16 +852,19 @@ class LocalComputation(object):
   def ExecuteWithPythonValuesPerReplica(self, arguments):
     """Execute on many replicas with Python values as arguments and output."""
 
-    def put(arg, replica):
-      return LocalBuffer.from_pyval(arg, replica, backend=self._backend)
+    def put(arg, device):
+      return LocalBuffer.from_pyval(arg, device, backend=self._backend)
 
-    arguments = [[put(arg, replica)
-                  for arg in replica_args]
-                 for replica, replica_args in enumerate(arguments)]
+    # pylint: disable=g-complex-comprehension
+    arguments = [[
+        put(arg, self._device_ordinals[replica]) for arg in replica_args
+    ] for replica, replica_args in enumerate(arguments)]
     return [out.to_py() for out in self.ExecutePerReplica(arguments)]
 
   def __del__(self):
-    self._delete(self._c_computation)
+    # Python may have freed c_api first.
+    if c_api and self._c_executable:
+      self._backend.delete_executable(self._c_executable)
 
 
 def _make_replica_group_proto(replica_group):
@@ -685,8 +877,8 @@ class ComputationBuilder(object):
   """XLA computation builder.
 
   Enqueues XLA ops in sequence and in order to build a
-  LocalComputation, which in turn can be compiled into a
-  CompiledLocalComputation, which in turn can be locally executed.
+  Computation, which in turn can be compiled into a
+  LocalExecutable, which in turn can be locally executed.
   """
 
   # The methods of this class map 1-to-1 onto the XLA C++
@@ -697,16 +889,23 @@ class ComputationBuilder(object):
   # pylint: disable=g-doc-args
 
   def __init__(self, name):
-    self._client = c_api.LocalComputationBuilder(name.encode('utf8'))
+    self._client = c_api.ComputationBuilder(name.encode('utf8'))
     self._parameter_numbering = itertools.count()
 
-  def Build(self, root=None, backend=XLA_LOCAL_BACKEND):
+  def Build(self, root=None, backend=None):
+    """Builds a `Computation` from the contents of the builder.
+
+    Args:
+      root: if not None, the operator containing the return value of the
+        computation.
+      backend: deprecated. Pass a `backend` to `Computation.Compile` instead.
+    Returns:
+      A `Computation`.
+    """
     if root is not None:
-      return LocalComputation(
-          self._client.BuildWithRoot(root), is_compiled=False, backend=backend)
+      return Computation(self._client.BuildWithRoot(root), backend=backend)
     else:
-      return LocalComputation(
-          self._client.Build(), is_compiled=False, backend=backend)
+      return Computation(self._client.Build(), backend=backend)
 
   def SetOpMetadata(self, op_metadata):
     """Set metadata for operations that are about to be enqueued."""
@@ -1358,7 +1557,7 @@ class ComputationBuilder(object):
 
     Args:
       operand: a LocalOp to test.
-    Returns: a LocalComputation that is rooted on the given `operand` which is a
+    Returns: a Computation that is rooted on the given `operand` which is a
       compile-time constant.
     """
     return self._client.BuildConstantSubGraph(operand)
@@ -1523,11 +1722,23 @@ class ComputationBuilder(object):
     """Enqueues a QR decomposition onto the computation."""
     return self._client.QR(a, full_matrices)
 
-  def TriangularSolve(self, a, b, left_side=False, lower=False,
-                      transpose_a=False, conjugate_a=False):
+  def TriangularSolve(self,
+                      a,
+                      b,
+                      left_side=False,
+                      lower=False,
+                      transpose_a=False,
+                      conjugate_a=False,
+                      unit_diagonal=False):
     """Enqueues a triangular-solve operation onto the computation."""
-    return self._client.TriangularSolve(
-        a, b, left_side, lower, transpose_a, conjugate_a)
+    if not transpose_a:
+      transpose = 1
+      if conjugate_a:
+        a = self.Conj(a)
+    else:
+      transpose = 3 if conjugate_a else 2
+    return self._client.TriangularSolve(a, b, left_side, lower, unit_diagonal,
+                                        transpose)
 
   def Gather(self, a, start_indices, dimension_numbers, slice_sizes):
     """Enqueues a Gather operation onto the computation."""
@@ -1547,7 +1758,7 @@ def _forward_methods_to_local_builder():
 
   Set up methods, corresponding to unary and binary XLA operations,
   whose calls are forwarded in a boilerplate manner to the underlying
-  LocalComputationBuilder C-extension API.
+  ComputationBuilder C-extension API.
   """
 
   def forward_to_local_builder_with_handles(target_method, is_binop=False):
@@ -1567,13 +1778,13 @@ def _forward_methods_to_local_builder():
 
   for method_name in _UNARY_OPS:
     forward = forward_to_local_builder_with_handles(
-        getattr(c_api.LocalComputationBuilder, method_name))
+        getattr(c_api.ComputationBuilder, method_name))
     forward.__name__ = method_name
     setattr(ComputationBuilder, method_name, forward)
 
   for method_name in _BINARY_OPS:
     forward = forward_to_local_builder_with_handles(
-        getattr(c_api.LocalComputationBuilder, method_name), is_binop=True)
+        getattr(c_api.ComputationBuilder, method_name), is_binop=True)
     forward.__name__ = method_name
     setattr(ComputationBuilder, method_name, forward)
 
@@ -1581,8 +1792,14 @@ def _forward_methods_to_local_builder():
 _forward_methods_to_local_builder()
 
 
+_default_replica_count = 1
+
+
 def initialize_replica_count(replica_count):
-  """Initializes the desired replica count to use on XLA service init.
+  """Initializes the default replica count to use.
+
+  Deprecated; pass `num_replicas` as an option to `Computation.Compile()`
+  instead.
 
   Args:
     replica_count: number of replicas that are desired for set up during XLA
@@ -1591,29 +1808,30 @@ def initialize_replica_count(replica_count):
   Raises:
     A runtime exception if the XLA service has already been initialized.
   """
-  c_api.InitializeReplicaCount(replica_count)
+  global _default_replica_count
+  _default_replica_count = replica_count
 
 
-def initialize_platform_name(platform_name):
-  """Initializes the desired platform name to use on XLA service init.
-
-  Args:
-    platform_name: string name of platform.
+def get_replica_count():
+  """Returns the default replica count.
 
-  Raises:
-    A runtime exception if the XLA service has already been initialized.
+  Deprecated; pass `num_replicas` as an option to `Computation.Compile()`
+  instead.
   """
-  platform_name = _maybe_encode_string(platform_name)
-  c_api.InitializePlatformName(platform_name)
+  return _default_replica_count
 
 
-def get_replica_count():
-  """Returns the current replica count used for the XLA service.
+def initialize_platform_name(platform_name):
+  """Initializes the default platform name to use for XLA.
 
-  Note: this will return a value whether the XLA service has been initialized
-  yet or not.
+  Args:
+    platform_name: string name of platform.
   """
-  return c_api.GetReplicaCount()
+  global _default_platform_name
+  _default_platform_name = platform_name
+
+  # Make sure the platform is valid by trying to instantiate it.
+  _get_default_local_backend()
 
 
 def register_cpu_custom_call_target(name, fn):
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index c80e792464560f4722b657694d8eb6f5e03956a9..51ef7d7f3a17f341e955f48615b05a886813430b 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -29,7 +29,7 @@ from tensorflow.compiler.xla.python import xla_client
 import unittest
 
 
-class LocalComputationTest(unittest.TestCase):
+class ComputationTest(unittest.TestCase):
   """Base class for running an XLA Computation through the local client."""
 
   def _NewComputation(self, name=None):
@@ -85,9 +85,35 @@ def NumpyArrayBool(*args, **kwargs):
   return np.array(*args, dtype=np.bool, **kwargs)
 
 
-class ComputationsWithConstantsTest(LocalComputationTest):
+class ComputationPrinting(unittest.TestCase):
+
+  def ExampleComputation(self):
+    builder = xla_client.ComputationBuilder("acomputation")
+    p0 = builder.ParameterFromNumpy(np.float32(0))
+    p1 = builder.ParameterFromNumpy(np.zeros((4,), np.float32))
+    builder.Mul(p0, p1)
+    return builder.Build()
+
+  def testComputationToHloText(self):
+    computation = self.ExampleComputation()
+    hlo_text = computation.GetHloText()
+    self.assertTrue(hlo_text.startswith("HloModule acomputation"))
+
+  def testComputationToHloGraph(self):
+    computation = self.ExampleComputation()
+    hlo_dot_graph = computation.GetHloDotGraph()
+    self.assertTrue(hlo_dot_graph.startswith("digraph "))
+
+
+class ComputationsWithConstantsTest(ComputationTest):
   """Tests focusing on Constant ops."""
 
+  def testConstantScalarSumS8(self):
+    c = self._NewComputation()
+    root = c.Add(c.Constant(np.int8(1)), c.Constant(np.int8(2)))
+    self.assertEqual(c.GetShape(root), c.GetReturnValueShape())
+    self._ExecuteAndCompareExact(c, expected=np.int8(3))
+
   def testConstantScalarSumF32(self):
     c = self._NewComputation()
     root = c.Add(c.ConstantF32Scalar(1.11), c.ConstantF32Scalar(3.14))
@@ -298,7 +324,7 @@ class ComputationsWithConstantsTest(LocalComputationTest):
     self._ExecuteAndCompareClose(c, expected=0.75)
 
 
-class ParametersTest(LocalComputationTest):
+class ParametersTest(ComputationTest):
   """Tests focusing on Parameter ops and argument-passing."""
 
   def setUp(self):
@@ -378,7 +404,7 @@ class ParametersTest(LocalComputationTest):
         expected=[-4.3, 1.3, -6.3, 3.3])
 
 
-class LocalBufferTest(LocalComputationTest):
+class LocalBufferTest(ComputationTest):
   """Tests focusing on execution with LocalBuffers."""
 
   def _Execute(self, c, arguments):
@@ -476,7 +502,7 @@ class LocalBufferTest(LocalComputationTest):
     self.assertEqual(np.dtype(xla_shape.element_type()), np.dtype(np.float32))
 
 
-class SingleOpTest(LocalComputationTest):
+class SingleOpTest(ComputationTest):
   """Tests for single ops.
 
   The goal here is smoke testing - to exercise the most basic functionality of
@@ -751,6 +777,12 @@ class SingleOpTest(LocalComputationTest):
     c.Not(c.Constant(arr))
     self._ExecuteAndCompareClose(c, expected=~arr)
 
+  def testCountLeadingZeros(self):
+    c = self._NewComputation()
+    arr = NumpyArrayS32([0x7FFF, 0x12345678])
+    c.Clz(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=[17, 3])
+
   def testExp(self):
     c = self._NewComputation()
     arr = NumpyArrayF32([3.3, 12.1])
@@ -1169,7 +1201,7 @@ class SingleOpTest(LocalComputationTest):
     np.testing.assert_allclose(g, expected, rtol=1e-4)
 
 
-class EmbeddedComputationsTest(LocalComputationTest):
+class EmbeddedComputationsTest(ComputationTest):
   """Tests for XLA graphs with embedded computations (such as maps)."""
 
   def _CreateConstantS32Computation(self):
@@ -1633,7 +1665,7 @@ class EmbeddedComputationsTest(LocalComputationTest):
     self._ExecuteAndCompareClose(c, expected=expected)
 
 
-class ErrorTest(LocalComputationTest):
+class ErrorTest(ComputationTest):
 
   def setUp(self):
     self.f32_scalar_2 = NumpyArrayF32(2.0)
@@ -1650,7 +1682,7 @@ class ErrorTest(LocalComputationTest):
         lambda: c.Build().CompileWithExampleArguments([self.f32_scalar_2]))
 
 
-class ComputationRootTest(LocalComputationTest):
+class ComputationRootTest(ComputationTest):
   """Tests related to setting the root of the computation."""
 
   def testComputationRootDifferentFromLastOp(self):
diff --git a/tensorflow/compiler/xla/python/xla_data.i b/tensorflow/compiler/xla/python/xla_data.i
new file mode 100644
index 0000000000000000000000000000000000000000..974f314af24f61c0015a8d51c16dff1bfc84c7cc
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xla_data.i
@@ -0,0 +1,654 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// SWIG typemaps and declarations for building, compiling, and
+// executing XLA computations, wrapping most of what is declared in
+// xla_data.h.
+//
+// The typemaps below implement/assert the following correspondences
+// (with elaborations below):
+//
+//    C++                                  Python
+// -------------------------------------+---------------------------------------
+//  Span<int64>                        <-  sequence of int
+//  vector<int>                        ->  sequence of int
+//  Span<LocalOp>                      <-  sequence of LocalOp
+//  Literal                            <-> (nested tuple of) numpy ndarray
+//  std::vector<Literal>               <-  sequence of (nested tuple of) ndarray
+//  Shape                               -> pair holding (dtype, dimensions)
+//                                     <-  object duck-typed as xla_client.Shape
+//  ProgramShape                       ->  pair of ([arg_shapes], ret_shape)
+//  std::vector<Shape>                 <-  sequence of xla_client.Shape objects
+//  PrimitiveType                      <-  int
+//  Span<pair<int64, in64>>            <-  sequence of int pairs
+//  PaddingConfig proto                <-  corresponding Python proto
+//  ConvolutionDimensionNumbers proto  <-  corresponding Python proto
+//  DotDimensionNumbers proto          <-  corresponding Python proto
+//  GatherDimensionNumbers proto       <-  corresponding Python proto
+//  ScatterDimensionNumbers proto      <-  corresponding Python proto
+//  Span<ReplicaGroup proto>           <-  sequence of ReplicaGroup Python proto
+//
+// Arrows indicate whether a conversion only ever occurs in one
+// direction, or whether it is maintained bidirectionally.
+//
+// The Python objects corresponding to C++ Literals have the type:
+//
+//   T = ndarray | (T, ...)
+//
+// where a terminal numpy ndarray translates to a Literal with a
+// non-tuple Shape, an XLA primitive element type corresponding to the
+// ndarray's dtype. Meanwhile, a non-terminal "tuple of T" translates
+// to a tuple-shaped Literal whose tuple components are translated
+// recursively. For example, if x is a numpy ndarray in Python, with
+// shape (2, 3) and dtype of dtype('float32'), then x translates to a
+// Literal with rank 2, dimension 2 and 3, and XLA primitive type
+// F32. Meanwhile,
+//
+//   (x, (x, x), (x,)),
+//
+// translates to a tuple-shaped XLA Literal, whose component subshapes
+// are a 2x3 F32-shaped literal followed by two tuple-shaped literals.
+//
+// Shapes output by C++ become Python objects with the type:
+//
+//   T            = (dtype, S)
+//   S            = DIMENSIONS | TUPLE_SHAPES
+//   DIMENSIONS   = (int, ...)
+//   TUPLE_SHAPES = (T, ...)
+//
+// In the pair described by the T rule, the terminal dtype determines
+// whether S expands as DIMENSIONS or TUPLE_SHAPES. Namely if it is
+// dtype('O'), numpy's object dtype, the structure represents a tuple
+// shape and the expansion of the non-terminal S is
+// TUPLE_SHAPES. Otherwise, dtype describes a primitive element type
+// and S expands into DIMENSIONS giving dimension sizes. For example:
+//
+//   (dtype('float32'), (3, 5, 7))
+//
+// describes a 3x5x7 array of F32s, and
+//
+//   (dtype('O'), ((dtype('float32'), (2, 3)),
+//                 (dtype('float64'), (4, 5))))
+//
+// describes a tuple shape with two subshapes: the first a 2x3 F32,
+// and the other a 4x5 F64.
+//
+// The Python int corresponding to a PrimitiveType enum must be valid
+// per xla_data.proto (e.g. xla_data.PRED, xla_data.F32).
+//
+// The SWIG object wrappers generated by this file are not intended
+// for end use, but rather for internal use in the Python XLA client,
+// xla_client.py.
+//
+// One central reason for the Python-side indirection is that the
+// Python-side objects produced by the typemaps in this file are
+// further packaged up by xla_client before being passed on. For
+// instance, the Python pair produced for a C++ Shape is further
+// wrapped in a Python class (xla_client.Shape) so as not to expose
+// the raw pair externally.
+//
+// Other SWIG object wrappers (e.g. of Computation) are further
+// wrapped by xla_client in order to set up a custom destructor that
+// triggers memory deallocation on the C++ side.
+
+%module(threads="1") xla_data
+
+// Keep the GIL except where explicitly specified.
+%nothread;
+
+%include "tensorflow/python/platform/base.i"
+
+%{
+// Must be included first
+#include "tensorflow/python/lib/core/numpy.h"
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/python/numpy_bridge.h"
+
+using namespace xla;
+using namespace xla::swig;
+
+%}
+
+// Basic types
+
+
+%typemap(out) std::vector<int> {
+  PyObject* out = PyList_New($1.size());
+  for (int i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM(out, i, PyInt_FromLong($1[i]));
+  }
+  $result = out;
+}
+
+%typemap(out) StatusOr<bool> {
+  if ($1.ok()) {
+    $result = PyBool_FromLong($1.ConsumeValueOrDie());
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+%typemap(out) StatusOr<string> {
+  if ($1.ok()) {
+    $result = PyString_FromString($1.ConsumeValueOrDie().c_str());
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+%typemap(out) Status {
+  if (!$1.ok()) {
+    PyErr_SetString(
+        PyExc_RuntimeError, $1.ToString().c_str());
+    SWIG_fail;
+  }
+  Py_INCREF(Py_None);
+  $result = Py_None;
+}
+
+%typemap(in) absl::Span<const int64>
+    (std::vector<int64> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.resize(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    PyObject* py_int = numpy::PyNumberToPyInt(o);
+    if (!py_int) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "Argument sequence element cannot be converted to int");
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    temps[i] = numpy::PyIntOrPyLongToLong(py_int);
+    if (temps[i] == -1 && PyErr_Occurred()) {
+      Py_DECREF(py_int);
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    Py_DECREF(py_int);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+// Literal
+
+%typemap(in) const Literal& (StatusOr<Literal> literal_status) {
+  literal_status = numpy::XlaLiteralFromPyObject($input);
+  if (!literal_status.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
+    SWIG_fail;
+  }
+  $1 = &literal_status.ValueOrDie();
+}
+
+%typemap(out) Literal (StatusOr<numpy::Safe_PyObjectPtr> obj_status) {
+  obj_status = numpy::PyObjectFromXlaLiteral(*$1);
+  if (!obj_status.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, obj_status.status().ToString().c_str());
+    SWIG_fail;
+  }
+  $result = obj_status.ValueOrDie().release();
+}
+
+%typemap(out) StatusOr<Literal> (StatusOr<numpy::Safe_PyObjectPtr> obj_status) {
+  if (!$1.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+  obj_status = numpy::PyObjectFromXlaLiteral($1.ValueOrDie());
+  if (!obj_status.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, obj_status.status().ToString().c_str());
+    SWIG_fail;
+  }
+  $result = obj_status.ValueOrDie().release();
+}
+
+%typemap(in) const std::vector<Literal>& (std::vector<Literal> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    StatusOr<Literal> literal_status = numpy::XlaLiteralFromPyObject(o);
+    if (!literal_status.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    temps.push_back(literal_status.ConsumeValueOrDie());
+    Py_DECREF(o);
+  }
+  $1 = &temps;
+}
+
+// OpMetadata
+
+%typemap(in) const OpMetadata& (OpMetadata temp) {
+  StatusOr<OpMetadata> statusor = numpy::OpMetadataFromPyObject($input);
+  if (!statusor.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+    SWIG_fail;
+  }
+  temp = std::move(statusor).ValueOrDie();
+  $1 = &temp;
+}
+
+// Shape
+
+%typemap(out) const Shape& {
+  $result = numpy::PyShapeInfoFromXlaShape(*$1).release();
+}
+
+%typemap(out) StatusOr<Shape> {
+  if ($1.ok()) {
+    $result = numpy::PyShapeInfoFromXlaShape($1.ConsumeValueOrDie()).release();
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+
+%typemap(out) StatusOr<ProgramShape> {
+  if ($1.ok()) {
+    $result = numpy::PyProgramShapeInfoFromXlaProgramShape(
+        $1.ConsumeValueOrDie()).release();
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+
+%typemap(in) const Shape& (Shape temp) {
+  StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
+  if (!statusor.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+    SWIG_fail;
+  }
+  temp = std::move(statusor).ValueOrDie();
+  $1 = &temp;
+}
+
+%typemap(in) const absl::optional<Shape>& (
+    absl::optional<Shape> temp) {
+  if ($input == Py_None) {
+    temp = absl::nullopt;
+    $1 = &temp;
+  } else {
+    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
+    if (!statusor.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+      SWIG_fail;
+    }
+    temp = std::move(statusor).ValueOrDie();
+    $1 = &temp;
+  }
+}
+
+%typemap(out) std::unique_ptr<Shape> {
+  $result = numpy::PyShapeInfoFromXlaShape(*$1).release();
+}
+
+%typemap(in) const std::vector<Shape>& (std::vector<Shape> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
+    Py_DECREF(o);
+    if (!statusor.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+      SWIG_fail;
+    }
+    temps.push_back(statusor.ConsumeValueOrDie());
+  }
+  $1 = &temps;
+}
+
+%typemap(in) const std::vector<absl::optional<Shape> >& (
+    std::vector<absl::optional<Shape> > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    if (o == Py_None) {
+      temps.push_back(absl::nullopt);
+    } else {
+      StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
+      Py_DECREF(o);
+      if (!statusor.ok()) {
+        PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+        SWIG_fail;
+      }
+      temps.push_back(statusor.ConsumeValueOrDie());
+    }
+  }
+  $1 = &temps;
+}
+
+// PrimitiveType
+
+%typemap(in) PrimitiveType {
+  PyObject* py_int = numpy::PyNumberToPyInt($input);
+  if (!py_int) {
+    PyErr_SetString(PyExc_TypeError, "Argument cannot be converted to int");
+    SWIG_fail;
+  }
+  const long value = numpy::PyIntOrPyLongToLong(py_int);
+  if (value == -1 && PyErr_Occurred()) {
+    Py_DECREF(py_int);
+    SWIG_fail;
+  }
+  if (!PrimitiveType_IsValid(value)) {
+    PyErr_SetString(
+        PyExc_TypeError, "Argument not valid for PrimitiveType enum");
+    Py_DECREF(py_int);
+    SWIG_fail;
+  }
+  $1 = static_cast<PrimitiveType>(value);
+}
+
+// Span<pair<int64, in64>>
+
+%typemap(in) absl::Span<const std::pair<int64, int64> >
+    (std::vector<std::pair<int64, int64> > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    if (!o) {
+      SWIG_fail;
+    }
+    PyObject* first = PyTuple_GetItem(o, 0);
+    if (!first) {
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    PyObject* first_pyint = numpy::PyNumberToPyInt(first);
+    if (!first_pyint) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "First pair item cannot be converted to int");
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    PyObject* second = PyTuple_GetItem(o, 1);
+    if (!second) {
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      SWIG_fail;
+    }
+    PyObject* second_pyint = numpy::PyNumberToPyInt(second);
+    if (!second_pyint) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "Second pair item cannot be converted to int");
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      SWIG_fail;
+    }
+    const int64 first_value = numpy::PyIntOrPyLongToLong(first_pyint);
+    if (first_value == -1 && PyErr_Occurred()) {
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      Py_DECREF(second_pyint);
+      SWIG_fail;
+    }
+    const int64 second_value = numpy::PyIntOrPyLongToLong(second_pyint);
+    if (second_value == -1 && PyErr_Occurred()) {
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      Py_DECREF(second_pyint);
+      SWIG_fail;
+    }
+    temps.push_back(std::make_pair(first_value, second_value));
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+// DotDimensionNumbers
+
+%typemap(in) const DotDimensionNumbers&
+    (DotDimensionNumbers dimension_numbers) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "lhs_contracting_dimensions",
+        dimension_numbers.mutable_lhs_contracting_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "rhs_contracting_dimensions",
+        dimension_numbers.mutable_rhs_contracting_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "lhs_batch_dimensions",
+        dimension_numbers.mutable_lhs_batch_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "rhs_batch_dimensions",
+        dimension_numbers.mutable_rhs_batch_dimensions())) {
+    SWIG_fail;
+  }
+
+  $1 = &dimension_numbers;
+}
+
+// PaddingConfig
+
+%typemap(in) const PaddingConfig&
+    (PaddingConfig padding_config) {
+  PyObject* dimensions = PyObject_GetAttrString($input, "dimensions");
+  if (!dimensions) {
+    SWIG_fail;
+  }
+
+  int length = PySequence_Size(dimensions);
+  if (length == -1) {
+    Py_DECREF(dimensions);
+    SWIG_fail;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(dimensions, i);
+    if (!item) {
+      Py_DECREF(dimensions);
+      SWIG_fail;
+    }
+    int64 edge_padding_low, edge_padding_high, interior_padding;
+    if (!GetIntAttr(item, "edge_padding_low", &edge_padding_low)
+        || !GetIntAttr(item, "edge_padding_high", &edge_padding_high)
+        || !GetIntAttr(item, "interior_padding", &interior_padding)) {
+      Py_DECREF(item);
+      Py_DECREF(dimensions);
+      SWIG_fail;
+    }
+    Py_DECREF(item);
+
+    PaddingConfig::PaddingConfigDimension* dimension =
+        padding_config.add_dimensions();
+    dimension->set_edge_padding_low(edge_padding_low);
+    dimension->set_edge_padding_high(edge_padding_high);
+    dimension->set_interior_padding(interior_padding);
+  }
+  Py_DECREF(dimensions);
+
+  $1 = &padding_config;
+}
+
+// ConvolutionDimensionNumbers
+
+%typemap(in) const ConvolutionDimensionNumbers&
+    (ConvolutionDimensionNumbers dimension_numbers) {
+  int64 value;
+
+  if (!GetIntAttr($input, "input_batch_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_input_batch_dimension(value);
+
+  if (!GetIntAttr($input, "input_feature_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_input_feature_dimension(value);
+
+  if (!GetIntAttr($input, "output_batch_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_output_batch_dimension(value);
+
+  if (!GetIntAttr($input, "output_feature_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_output_feature_dimension(value);
+
+  if (!GetIntAttr($input, "kernel_output_feature_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_kernel_output_feature_dimension(value);
+
+  if (!GetIntAttr($input, "kernel_input_feature_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_kernel_input_feature_dimension(value);
+
+  if (!HandleRepeatedInt64Attribute(
+        $input, "input_spatial_dimensions",
+        dimension_numbers.mutable_input_spatial_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "kernel_spatial_dimensions",
+        dimension_numbers.mutable_kernel_spatial_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "output_spatial_dimensions",
+        dimension_numbers.mutable_output_spatial_dimensions())) {
+    SWIG_fail;
+  }
+
+  $1 = &dimension_numbers;
+}
+
+// GatherDimensionNumbers
+
+%typemap(in) const GatherDimensionNumbers&
+    (GatherDimensionNumbers dimension_numbers) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "offset_dims",
+        dimension_numbers.mutable_offset_dims())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "collapsed_slice_dims",
+        dimension_numbers.mutable_collapsed_slice_dims())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "start_index_map",
+        dimension_numbers.mutable_start_index_map())) {
+    SWIG_fail;
+  }
+
+  int64 value;
+  if (!GetIntAttr($input, "index_vector_dim", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_index_vector_dim(value);
+
+  $1 = &dimension_numbers;
+}
+
+// ScatterDimensionNumbers
+
+%typemap(in) const ScatterDimensionNumbers&
+    (ScatterDimensionNumbers dimension_numbers) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "update_window_dims",
+        dimension_numbers.mutable_update_window_dims())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "inserted_window_dims",
+        dimension_numbers.mutable_inserted_window_dims())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "scatter_dims_to_operand_dims",
+        dimension_numbers.mutable_scatter_dims_to_operand_dims())) {
+    SWIG_fail;
+  }
+
+  int64 value;
+  if (!GetIntAttr($input, "index_vector_dim", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_index_vector_dim(value);
+
+  $1 = &dimension_numbers;
+}
+
+// Span<const ReplicaGroup>
+
+%typemap(in) absl::Span<const ReplicaGroup >
+    (std::vector<ReplicaGroup > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    ReplicaGroup rgrp;
+    if (!HandleRepeatedInt64Attribute(
+            o, "replica_ids",
+            rgrp.mutable_replica_ids())) {
+        SWIG_fail;
+    }
+    temps.push_back(rgrp);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
diff --git a/tensorflow/compiler/xla/python/xrt.cc b/tensorflow/compiler/xla/python/xrt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c55abc17f87c369e3d5b2140a84014e07921a9a
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xrt.cc
@@ -0,0 +1,297 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/python/xrt.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/cc/client/client_session.h"
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/compiler/xrt/cc/ops/xrt_compile_ops.h"
+#include "tensorflow/compiler/xrt/cc/ops/xrt_execute_op.h"
+#include "tensorflow/compiler/xrt/cc/ops/xrt_state_ops.h"
+#include "tensorflow/compiler/xrt/xrt.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace swig {
+
+XrtAllocation::XrtAllocation(int64 handle, Shape shape,
+                             const string& session_target)
+    : handle_(handle), shape_(shape), session_target_(session_target) {}
+
+XrtAllocation::~XrtAllocation() {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto allocation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto release =
+      tensorflow::ops::XRTReleaseAllocationHandle(root, allocation_handle);
+  if (!root.status().ok()) {
+    LOG(ERROR) << root.status();
+    return;
+  }
+
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({allocation_handle, handle()});
+  std::vector<tensorflow::Tensor> outputs;
+  auto status = session.Run(inputs, {}, {release}, &outputs);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+    return;
+  }
+}
+
+/* static */
+StatusOr<XrtAllocation*> XrtAllocation::FromLiteral(
+    const Literal& argument, const string& session_target) {
+  xrt::XLAAllocation alloc;
+  *alloc.mutable_value() = argument.ToProto();
+
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto literal_string =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
+  auto literal_handle = tensorflow::ops::XRTAllocate(root, literal_string);
+  TF_RETURN_IF_ERROR(root.status());
+
+  tensorflow::ClientSession session(root, session_target);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({literal_string, alloc.SerializeAsString()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {literal_handle}, &outputs));
+
+  int64 handle = outputs[0].scalar<int64>()();
+  return new XrtAllocation(handle, argument.shape(), session_target);
+}
+
+const int64 XrtAllocation::handle() const { return handle_; }
+
+const Shape& XrtAllocation::shape() const { return shape_; }
+
+StatusOr<Literal> XrtAllocation::ToLiteral() const {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto allocation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto read_literal = tensorflow::ops::XRTReadLiteral(root, allocation_handle);
+  TF_RETURN_IF_ERROR(root.status());
+
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({allocation_handle, handle()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {read_literal}, &outputs));
+
+  xla::LiteralProto response;
+  TF_RET_CHECK(response.ParseFromString(outputs[0].scalar<string>()()));
+  return Literal::CreateFromProto(response);
+}
+
+XrtAllocationTuple::XrtAllocationTuple(std::vector<XrtAllocation*> elements)
+    : elements_(std::move(elements)) {
+  for (auto* element : elements_) {
+    CHECK(element != nullptr);
+  }
+}
+
+XrtAllocationTuple::~XrtAllocationTuple() {
+  for (XrtAllocation* element : elements_) {
+    if (element != nullptr) {
+      delete element;
+    }
+  }
+}
+
+StatusOr<XrtAllocation*> XrtAllocationTuple::Release(int i) {
+  XrtAllocation* element = elements_[i];
+  if (element == nullptr) {
+    return InvalidArgument("Attempted to release already-released element %d.",
+                           i);
+  }
+  elements_[i] = nullptr;
+  return element;
+}
+
+int64 XrtAllocationTuple::size() const { return elements_.size(); }
+
+StatusOr<XrtExecutable*> XrtExecutable::CompileForXrt(
+    const string& hlo_module_proto, const std::vector<Shape>& argument_shapes,
+    const Shape& result_shape, const string& session_target) {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto program = tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
+  auto compile = tensorflow::ops::XRTCompile(root, program);
+  TF_RETURN_IF_ERROR(root.status());
+
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  ProgramShape program_shape;
+  for (auto& shape : argument_shapes) {
+    *program_shape.add_parameters() = shape;
+  }
+  *program_shape.mutable_result() = result_shape;
+
+  LayoutUtil::SetToDefaultLayout(&program_shape);
+  *config->mutable_program_shape() = program_shape.ToProto();
+  c.mutable_hlo_snapshot()
+      ->mutable_hlo()
+      ->mutable_hlo_module()
+      ->ParsePartialFromString(hlo_module_proto);
+
+  tensorflow::ClientSession session(root, session_target);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({program, c.SerializeAsString()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {compile.handle}, &outputs));
+
+  int64 handle = outputs[0].scalar<int64>()();
+  return new XrtExecutable(program_shape, handle, session_target);
+}
+
+XrtExecutable::XrtExecutable(const ProgramShape& program_shape, int64 handle,
+                             const string& session_target)
+    : program_shape_(program_shape),
+      handle_(handle),
+      session_target_(session_target) {}
+
+XrtExecutable::~XrtExecutable() {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto computation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto release =
+      tensorflow::ops::XRTReleaseCompilationHandle(root, computation_handle);
+  if (!root.status().ok()) {
+    LOG(ERROR) << root.status();
+    return;
+  }
+
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({computation_handle, handle()});
+  std::vector<tensorflow::Tensor> outputs;
+  auto status = session.Run(inputs, {}, {release}, &outputs);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+    return;
+  }
+}
+
+StatusOr<XrtAllocation*> XrtExecutable::Execute(
+    absl::Span<XrtAllocation* const> argument_handles) {
+  const int num_expected_arguments = program_shape().parameters().size();
+
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  std::vector<tensorflow::Output> arguments;
+  arguments.reserve(num_expected_arguments);
+  for (int i = 0; i < num_expected_arguments; ++i) {
+    arguments.push_back(
+        tensorflow::ops::Placeholder(root, tensorflow::DT_INT64));
+  }
+  auto computation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto execution_config =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
+  auto execute = tensorflow::ops::XRTExecute(root, computation_handle,
+                                             execution_config, arguments);
+  TF_RETURN_IF_ERROR(root.status());
+
+  TF_RET_CHECK(argument_handles.size() == arguments.size());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(false);
+  e.set_release_compilation_handle(false);
+
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  for (int i = 0; i < arguments.size(); ++i) {
+    inputs.insert({arguments[i], argument_handles[i]->handle()});
+  }
+  inputs.insert({computation_handle, handle()});
+  inputs.insert({execution_config, e.SerializeAsString()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {execute}, &outputs));
+
+  int64 output = outputs[0].scalar<int64>()();
+  return new XrtAllocation(output, program_shape().result(), session_target_);
+}
+
+const ProgramShape& XrtExecutable::program_shape() const {
+  return program_shape_;
+}
+
+int64 XrtExecutable::handle() const { return handle_; }
+
+void DeleteXrtAllocation(XrtAllocation* allocation) { delete allocation; }
+
+void DeleteXrtExecutable(XrtExecutable* computation) { delete computation; }
+
+StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
+    XrtAllocation* allocation, const string& session_target) {
+  const Shape& tuple_shape = allocation->shape();
+
+  if (!tuple_shape.IsTuple()) {
+    return InvalidArgument(
+        "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
+        "shape; shape: %s",
+        ShapeUtil::HumanString(tuple_shape));
+  }
+
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto base_handle = tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto shape_index = tensorflow::ops::Placeholder(root, tensorflow::DT_INT32);
+  auto subtuple = tensorflow::ops::XRTSubTuple(root, base_handle, shape_index);
+  TF_RETURN_IF_ERROR(root.status());
+
+  tensorflow::ClientSession session(root, session_target);
+  tensorflow::ClientSession::FeedType inputs;
+  std::vector<XrtAllocation*> results;
+  for (int32 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
+    inputs.clear();
+    inputs.insert({base_handle, allocation->handle()});
+    inputs.insert({shape_index, {i}});
+    std::vector<tensorflow::Tensor> outputs;
+    auto status = session.Run(inputs, {subtuple}, &outputs);
+    if (!status.ok()) {
+      // Clean up before returning non-ok status.
+      for (int j = 0; j < results.size(); ++j) {
+        delete results[j];
+      }
+      return status;
+    }
+    const int64 subtuple_handle = outputs[0].scalar<int64>()();
+    const Shape& subtuple_shape =
+        ShapeUtil::GetTupleElementShape(tuple_shape, i);
+    results.push_back(
+        new XrtAllocation(subtuple_handle, subtuple_shape, session_target));
+  }
+  return new XrtAllocationTuple(std::move(results));
+}
+
+}  // namespace swig
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/python/xrt.h b/tensorflow/compiler/xla/python/xrt.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd5bba6d5c9641dadc323f70745e870c14543321
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xrt.h
@@ -0,0 +1,118 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_XRT_H_
+#define TENSORFLOW_COMPILER_XLA_PYTHON_XRT_H_
+
+#include <string>
+#include <vector>
+
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape.h"
+
+namespace xla {
+namespace swig {
+
+// Represents a reference to literals that live in a device-allocated buffer via
+// XRT. Specifically, wraps an int64 handle produced by running the allocation
+// graph, and an XLA shape to track the referent's shape.
+class XrtAllocation {
+ public:
+  // Accepts a `session_target` argument, used in constructing the
+  // `tensorflow::ClientSession` instance in which allocation and deallocation
+  // graphs are run.
+  static StatusOr<XrtAllocation*> FromLiteral(const Literal& argument,
+                                              const string& session_target);
+
+  XrtAllocation(int64 handle, Shape shape, const string& session_target);
+  ~XrtAllocation();
+  StatusOr<Literal> ToLiteral() const;
+  const Shape& shape() const;
+  const int64 handle() const;
+
+ private:
+  const int64 handle_;
+  const Shape shape_;
+  const string session_target_;
+};
+
+// Result of a tuple destructuring operation on an XrtAllocation.
+class XrtAllocationTuple {
+ public:
+  // Note: any XrtAllocation elements that are not Release()'d will be
+  // deallocated in the destructor.
+  explicit XrtAllocationTuple(std::vector<XrtAllocation*> elements);
+
+  ~XrtAllocationTuple();
+
+  // Releases the ith element to the caller. Further attempts to release the ith
+  // element will return an invalid argument error.
+  StatusOr<XrtAllocation*> Release(int i);
+
+  // Returns the number of elements in the destructured tuple.
+  int64 size() const;
+
+ private:
+  std::vector<XrtAllocation*> elements_;
+};
+
+// Destructures a tuple-valued XrtAllocation into its constitutent elements
+// in XrtAllocationTuple form.
+//
+// Accepts a `session_target` argument, used in constructing the
+// `tensorflow::ClientSession` instance in which the sub-tupling graph is run,
+// and passed along in constructing each constituent XrtAllocation.
+StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
+    XrtAllocation* allocation, const string& session_target);
+
+// Represents a compiled computation that can be executed given handles to
+// device-allocated literals. Specifically, wraps an XRT computation handle.
+class XrtExecutable {
+ public:
+  // Accepts a `session_target` argument, used in constructing the
+  // `tensorflow::ClientSession` instance in which the compilation graph is run.
+  static StatusOr<XrtExecutable*> CompileForXrt(
+      const string& hlo_module_proto, const std::vector<Shape>& argument_shapes,
+      const Shape& result_shape, const string& session_target);
+
+  // Accepts a `session_target` argument, used in constructing the
+  // `tensorflow::ClientSession` instance in which the execution graph is run.
+  XrtExecutable(const ProgramShape& program_shape, int64 handle,
+                const string& session_target);
+  ~XrtExecutable();
+
+  std::vector<int> DeviceOrdinals() const { return {0}; }
+
+  StatusOr<XrtAllocation*> Execute(
+      absl::Span<XrtAllocation* const> argument_handles);
+
+  const ProgramShape& program_shape() const;
+  int64 handle() const;
+
+ private:
+  const ProgramShape program_shape_;
+  const int64 handle_;
+  const string session_target_;
+};
+
+// Functions for freeing resources from the Python side.
+void DeleteXrtAllocation(XrtAllocation* allocation);
+void DeleteXrtExecutable(XrtExecutable* computation);
+
+}  // namespace swig
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_PYTHON_XRT_H_
diff --git a/tensorflow/compiler/xla/python/xrt.i b/tensorflow/compiler/xla/python/xrt.i
new file mode 100644
index 0000000000000000000000000000000000000000..456dd7be86e479b46815fc16b51a10431fe2060d
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xrt.i
@@ -0,0 +1,124 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Wrappers for XRT ops.
+
+%module(threads="1") xrt
+
+// Keep the GIL except where explicitly specified.
+%nothread;
+
+%include "tensorflow/python/platform/base.i"
+%include "tensorflow/compiler/xla/python/xla_data.i"
+
+%{
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/python/xrt.h"
+
+using namespace xla;
+using namespace xla::swig;
+
+%}
+
+// Computation and buffer/allocation types
+
+%typemap(out) StatusOr<xla::swig::XrtExecutable*> {
+  if ($1.ok()) {
+    auto* value = $1.ValueOrDie();
+    {
+      auto* $1 = value;
+      $typemap(out, xla::swig::XrtExecutable*)
+    }
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+%typemap(out) StatusOr<xla::swig::XrtAllocation*> {
+  if ($1.ok()) {
+    auto* value = $1.ValueOrDie();
+    {
+      auto* $1 = value;
+      $typemap(out, xla::swig::XrtAllocation*)
+    }
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+%typemap(out) StatusOr<xla::swig::XrtAllocationTuple*> {
+  if ($1.ok()) {
+    auto* value = $1.ValueOrDie();
+    {
+      auto* $1 = value;
+      $typemap(out, xla::swig::XrtAllocationTuple*)
+    }
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+
+%typemap(in) absl::Span<xla::swig::XrtAllocation* const>
+    (std::vector<XrtAllocation*> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    XrtAllocation* xrta;
+    if ((SWIG_ConvertPtr(o, (void**) &xrta, $descriptor(xla::swig::XrtAllocation*),
+                         SWIG_POINTER_EXCEPTION)) == -1) {
+      SWIG_fail;
+    }
+    temps.push_back(xrta);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+
+%ignoreall
+%unignore xla;
+%unignore xla::swig;
+%unignore xla::swig::XrtAllocation;
+%unignore xla::swig::XrtAllocation::FromLiteral;
+%unignore xla::swig::XrtAllocation::ToLiteral;
+%unignore xla::swig::XrtAllocation::shape;
+%unignore xla::swig::XrtAllocationTuple;
+%unignore xla::swig::XrtAllocationTuple::Release;
+%unignore xla::swig::XrtAllocationTuple::size;
+%unignore xla::swig::XrtExecutable;
+%unignore xla::swig::XrtExecutable::CompileForXrt;
+%unignore xla::swig::XrtExecutable::DeviceOrdinals;
+%unignore xla::swig::XrtExecutable::Execute;
+%unignore xla::swig::DestructureXrtAllocationTuple;
+%unignore xla::swig::DeleteXrtAllocation;
+%unignore xla::swig::DeleteXrtExecutable;
+
+%thread;
+%include "tensorflow/compiler/xla/python/xrt.h"
+%nothread;
+
+%unignoreall
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 4f6509c1cb9dddac3f90cb8bea9b8ee989e4da4b..8d8394cb43ee013b9396a54e3a4d037445fcc0e1 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -114,6 +114,7 @@ tf_cc_test(
         ":bfloat16_normalization",
         ":bfloat16_support",
         ":hlo",
+        ":hlo_creation_utils",
         ":hlo_verifier",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -679,7 +680,6 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
@@ -1203,7 +1203,6 @@ cc_library(
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
@@ -1461,11 +1460,15 @@ cc_library(
     hdrs = ["hlo_creation_utils.h"],
     deps = [
         ":hlo",
+        ":hlo_module_config",
         ":shape_inference",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:comparators",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -1510,6 +1513,20 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "op_expander_pass",
+    srcs = ["op_expander_pass.cc"],
+    hdrs = ["op_expander_pass.h"],
+    deps = [
+        ":hlo",
+        ":hlo_creation_utils",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
+
 cc_library(
     name = "gather_expander",
     srcs = ["gather_expander.cc"],
@@ -1518,6 +1535,7 @@ cc_library(
         ":hlo",
         ":hlo_creation_utils",
         ":hlo_pass",
+        ":op_expander_pass",
         ":while_util",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:statusor",
@@ -1541,6 +1559,28 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "triangular_solve_expander",
+    srcs = ["triangular_solve_expander.cc"],
+    hdrs = ["triangular_solve_expander.h"],
+    deps = [
+        ":op_expander_pass",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:math",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/client/lib:slicing",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+    ],
+)
+
 tf_cc_test(
     name = "batchnorm_expander_test",
     size = "small",
@@ -1602,7 +1642,7 @@ tf_cc_test(
         ":algebraic_simplifier",
         ":hlo",
         ":hlo_casting_utils",
-        ":hlo_matchers",
+        ":hlo_creation_utils",
         ":hlo_parser",
         ":hlo_pass",
         ":pattern_matcher",
@@ -2163,6 +2203,8 @@ tf_cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -2306,6 +2348,7 @@ tf_cc_test(
     srcs = ["hlo_dataflow_analysis_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_creation_utils",
         ":hlo_dataflow_analysis",
         ":hlo_graph_dumper",
         ":hlo_matchers",
@@ -2476,6 +2519,7 @@ tf_cc_test(
     srcs = ["tuple_points_to_analysis_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_creation_utils",
         ":hlo_matchers",
         ":instruction_fusion",
         ":tuple_points_to_analysis",
@@ -2851,7 +2895,6 @@ cc_library(
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
@@ -3029,8 +3072,6 @@ cc_library(
         ":hlo_pass",
         ":shape_inference",
         "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
     ],
 )
@@ -3192,32 +3233,6 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "hlo_tfgraph_builder",
-    srcs = ["hlo_tfgraph_builder.cc"],
-    hdrs = ["hlo_tfgraph_builder.h"],
-    deps = [
-        ":hlo",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:protos_all_cc",
-        "@com_google_absl//absl/strings",
-    ],
-)
-
-tf_cc_test(
-    name = "hlo_tfgraph_builder_test",
-    srcs = ["hlo_tfgraph_builder_test.cc"],
-    deps = [
-        ":hlo_tfgraph_builder",
-        "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
 cc_library(
     name = "hlo_graph_dumper",
     srcs = [
@@ -3229,7 +3244,6 @@ cc_library(
         ":hlo",
         ":hlo_casting_utils",
         ":hlo_execution_profile",
-        ":hlo_tfgraph_builder",
         ":pattern_matcher",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -3273,7 +3287,6 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
     ],
 )
@@ -3489,6 +3502,37 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "stable_sort_expander",
+    srcs = ["stable_sort_expander.cc"],
+    hdrs = ["stable_sort_expander.h"],
+    deps = [
+        ":hlo",
+        ":hlo_casting_utils",
+        ":hlo_pass",
+        ":op_expander_pass",
+        "//tensorflow/compiler/xla:statusor",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+    ],
+)
+
+tf_cc_test(
+    name = "stable_sort_expander_test",
+    srcs = ["stable_sort_expander_test.cc"],
+    deps = [
+        ":algebraic_simplifier",
+        ":hlo_matchers",
+        ":hlo_parser",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
+        ":stable_sort_expander",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "tuple_util",
     srcs = ["tuple_util.cc"],
@@ -3585,7 +3629,6 @@ cc_library(
         ":while_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:inlined_vector",
     ],
@@ -3641,7 +3684,6 @@ cc_library(
         ":hlo_evaluator",
         ":hlo_pass",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -3804,7 +3846,6 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "@com_google_absl//absl/container:flat_hash_map",
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index cd06cfcdd38d56a43def8a531fb7f018b22ed888..bd17e96106abd9de0dd3bbf418439b0fb3edb746 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -280,15 +280,51 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
         hlo));
   }
 
-  // Helper method to perform and add reduction in a single dimension.
-  HloInstruction* AddReduce(HloInstruction* hlo, int64 dim) {
+  // Converts to primitive type if the input hlo is not that type, otherwise
+  // returns the original hlo.
+  HloInstruction* AsType(HloInstruction* hlo,
+                         const PrimitiveType element_type) {
+    if (hlo->shape().element_type() == element_type) {
+      return hlo;
+    }
+    return computation_->AddInstruction(HloInstruction::CreateConvert(
+        ShapeUtil::ChangeElementType(hlo->shape(), element_type), hlo));
+  }
+
+  // Transposes a dot operand such that the batch dimensions are the msot major,
+  // and the contracting dimensions are most minor.
+  StatusOr<HloInstruction*> NormalizeDotOperandToBatchMajorAndContractingMinor(
+      HloInstruction* dot_operand, absl::Span<const int64> batch_dimensions,
+      absl::Span<const int64> contracting_dimensions) {
+    std::vector<int64> transpose_dimensions(batch_dimensions.begin(),
+                                            batch_dimensions.end());
+    for (int64 i = 0; i < dot_operand->shape().rank(); ++i) {
+      if (!(absl::c_linear_search(batch_dimensions, i) ||
+            absl::c_linear_search(contracting_dimensions, i))) {
+        transpose_dimensions.push_back(i);
+      }
+    }
+    transpose_dimensions.insert(transpose_dimensions.end(),
+                                contracting_dimensions.begin(),
+                                contracting_dimensions.end());
+    return MakeTransposeHlo(dot_operand, transpose_dimensions);
+  }
+
+  // Helper method to perform and add reduction on a list of dimensions.
+  HloInstruction* AddReduce(HloInstruction* hlo, absl::Span<const int64> dims) {
     HloInstruction* zero =
         computation_->AddInstruction(HloInstruction::CreateConstant(
             LiteralUtil::Zero(hlo->shape().element_type()).Clone()));
     HloComputation* AddReduce_computation = GetOrCreateScalarAddComputation();
-    Shape shape = ShapeUtil::DeleteDimension(dim, hlo->shape());
+    Shape shape = ShapeUtil::FilterDimensions(
+        [&](int64 dim) { return !absl::c_linear_search(dims, dim); },
+        hlo->shape());
     return computation_->AddInstruction(HloInstruction::CreateReduce(
-        shape, hlo, zero, {dim}, AddReduce_computation));
+        shape, hlo, zero, dims, AddReduce_computation));
+  }
+
+  HloInstruction* AddReduce(HloInstruction* hlo, int64 dim) {
+    return AddReduce(hlo, std::vector<int64>{dim});
   }
 
   // Convenience method for replacing an instruction with a bitcast. If operand
@@ -892,7 +928,6 @@ std::unique_ptr<HloInstruction> TryDivideToShift(HloInstruction* divide,
 }  // namespace
 
 Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
-  Shape* shape;
   HloInstruction *a, *b, *c, *d;
   CHECK(Match(divide, m::Divide(m::Op(&a), m::Op(&b))));
   // A/1 => A
@@ -955,6 +990,7 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
       break;
   }
 
+  Shape* shape;
   // exp(A)/exp(B) => exp(A-B)
   if (Match(divide, m::Divide(m::Exp(m::Op(&a)), m::Exp(m::Op(&b)))
                         .WithShape(m::Shape(&shape)))) {
@@ -1005,8 +1041,9 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
   // (Backends can do this transformation, but generally only if the constant is
   // a scalar.)
   if (Match(divide, m::Divide(m::NonConstant(&a), m::Constant(&b)))) {
-    Literal new_literal(b->shape());
-    switch (b->shape().element_type()) {
+    Shape result_shape = b->literal().shape();
+    Literal new_literal(result_shape);
+    switch (result_shape.element_type()) {
       case F16:
         TF_RETURN_IF_ERROR(InvertConstant<half>(*b, &new_literal));
         break;
@@ -1089,7 +1126,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
   const int64 rhs_rank = rhs->shape().rank();
   const int64 lhs_rank = lhs->shape().rank();
   const auto& dnums = dot->dot_dimension_numbers();
-  if (dnums.rhs_contracting_dimensions_size() > 1) {
+  if (dnums.rhs_contracting_dimensions_size() != 1) {
     return false;
   }
   if (dot_rank > 2 && (lhs_rank != rhs_rank || lhs_rank != dot_rank)) {
@@ -1119,16 +1156,8 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
     std::swap(rhs_collapsing_dim, rhs_kept_dim);
   }
 
-  auto as_type = [&](HloInstruction* hlo, const PrimitiveType element_type) {
-    if (hlo->shape().element_type() == element_type) {
-      return hlo;
-    }
-    return computation_->AddInstruction(HloInstruction::CreateConvert(
-        ShapeUtil::ChangeElementType(hlo->shape(), element_type), hlo));
-  };
-
   auto reshape_if_necessary = [&](HloInstruction* hlo) {
-    hlo = as_type(hlo, dot->shape().element_type());
+    hlo = AsType(hlo, dot->shape().element_type());
     if (!ShapeUtil::SameDimensions(hlo->shape(), dot->shape())) {
       hlo = computation_->AddInstruction(
           HloInstruction::CreateReshape(dot->shape(), hlo));
@@ -1137,7 +1166,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
   };
 
   auto add_reduce_in_f32 = [&](HloInstruction* hlo, const int64 dim) {
-    return AddReduce(as_type(hlo, F32), dim);
+    return AddReduce(AsType(hlo, F32), dim);
   };
 
   auto broadcast = [&](HloInstruction* hlo, const Shape& shape,
@@ -1246,8 +1275,8 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
     return dims;
   };
 
-  // If the contracting dimension is 1, remove the degnerate dimnesions from the
-  // lhs and rhs, broadcast each to the result shape and multiply.
+  // If the contracting dimension is 1, remove the degnerate dimnensions from
+  // the lhs and rhs, broadcast each to the result shape and multiply.
   if (lhs->shape().dimensions(lhs_collapsing_dim) == 1 &&
       (rhs_kept_dim == rhs_rank - 1 ||
        (rhs_collapsing_dim == rhs_rank - 1 && rhs_kept_dim == rhs_rank - 2))) {
@@ -1584,7 +1613,9 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfGather(
 Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   HloInstruction *lhs, *rhs;
   CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
-
+  if (options_.is_layout_sensitive()) {
+    return Status::OK();
+  }
   // Replace a zero element dot with a broadcast of the constant 0.
   if (ShapeUtil::IsZeroElementArray(dot->shape()) ||
       ShapeUtil::IsZeroElementArray(lhs->shape()) ||
@@ -1601,6 +1632,117 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
       dot->shape().element_type() != BF16) {
     return Status::OK();
   }
+
+  // If there are no contracting dimensions, a dot can be rewritten as
+  // mul(broadcast(transpose(x)),broadcast(transpose(y)))
+  if (dot->dot_dimension_numbers().lhs_contracting_dimensions_size() == 0) {
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * new_lhs,
+        NormalizeDotOperandToBatchMajorAndContractingMinor(
+            lhs,
+            AsInt64Slice(dot->dot_dimension_numbers().lhs_batch_dimensions()),
+            AsInt64Slice(
+                dot->dot_dimension_numbers().lhs_contracting_dimensions())));
+    if (dot->shape().rank() != lhs->shape().rank()) {
+      std::vector<int64> lhs_broadcast_dims(lhs->shape().rank());
+      absl::c_iota(lhs_broadcast_dims, 0);
+      new_lhs = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+          dot->shape(), new_lhs, lhs_broadcast_dims));
+    }
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * new_rhs,
+        NormalizeDotOperandToBatchMajorAndContractingMinor(
+            rhs,
+            AsInt64Slice(dot->dot_dimension_numbers().rhs_batch_dimensions()),
+            AsInt64Slice(
+                dot->dot_dimension_numbers().rhs_contracting_dimensions())));
+    if (dot->shape().rank() != rhs->shape().rank()) {
+      std::vector<int64> rhs_broadcast_dims(
+          dot->dot_dimension_numbers().lhs_batch_dimensions_size());
+      absl::c_iota(rhs_broadcast_dims, 0);
+      for (int64 i = lhs->shape().rank(); i < dot->shape().rank(); ++i) {
+        rhs_broadcast_dims.push_back(i);
+      }
+      new_rhs = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+          dot->shape(), new_rhs, rhs_broadcast_dims));
+    }
+    return ReplaceWithNewInstruction(
+        dot, HloInstruction::CreateBinary(dot->shape(), HloOpcode::kMultiply,
+                                          new_lhs, new_rhs));
+  }
+
+  // If the lhs or rhs have only batch and contracting dimensions, a dot can be
+  // rewritten as reduce(mul(broadcast(transpose(x)),broadcast(transpose(y))))
+  if ((dot->dot_dimension_numbers().lhs_batch_dimensions_size() +
+           dot->dot_dimension_numbers().lhs_contracting_dimensions_size() ==
+       lhs->shape().rank()) ||
+      (dot->dot_dimension_numbers().rhs_contracting_dimensions_size() +
+           dot->dot_dimension_numbers().rhs_batch_dimensions_size() ==
+       rhs->shape().rank())) {
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * new_lhs,
+        NormalizeDotOperandToBatchMajorAndContractingMinor(
+            lhs,
+            AsInt64Slice(dot->dot_dimension_numbers().lhs_batch_dimensions()),
+            AsInt64Slice(
+                dot->dot_dimension_numbers().lhs_contracting_dimensions())));
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * new_rhs,
+        NormalizeDotOperandToBatchMajorAndContractingMinor(
+            rhs,
+            AsInt64Slice(dot->dot_dimension_numbers().rhs_batch_dimensions()),
+            AsInt64Slice(
+                dot->dot_dimension_numbers().rhs_contracting_dimensions())));
+
+    int64 lhs_outer_dims =
+        lhs->shape().rank() -
+        (dot->dot_dimension_numbers().lhs_batch_dimensions_size() +
+         dot->dot_dimension_numbers().lhs_contracting_dimensions_size());
+    int64 rhs_outer_dims =
+        rhs->shape().rank() -
+        (dot->dot_dimension_numbers().rhs_batch_dimensions_size() +
+         dot->dot_dimension_numbers().rhs_contracting_dimensions_size());
+    CHECK(lhs_outer_dims == 0 || rhs_outer_dims == 0);
+    if (rhs_outer_dims > 0) {
+      std::vector<int64> lhs_broadcast_dims(
+          dot->dot_dimension_numbers().lhs_batch_dimensions_size());
+      absl::c_iota(lhs_broadcast_dims, 0);
+      lhs_broadcast_dims.resize(lhs->shape().rank());
+      std::iota(lhs_broadcast_dims.begin() +
+                    dot->dot_dimension_numbers().lhs_batch_dimensions_size(),
+                lhs_broadcast_dims.end(),
+                dot->dot_dimension_numbers().lhs_batch_dimensions_size() +
+                    rhs_outer_dims);
+      new_lhs = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+          new_rhs->shape(), new_lhs, lhs_broadcast_dims));
+    } else if (lhs_outer_dims > 0) {
+      std::vector<int64> rhs_broadcast_dims(
+          dot->dot_dimension_numbers().rhs_batch_dimensions_size());
+      absl::c_iota(rhs_broadcast_dims, 0);
+      rhs_broadcast_dims.resize(rhs->shape().rank());
+      std::iota(rhs_broadcast_dims.begin() +
+                    dot->dot_dimension_numbers().rhs_batch_dimensions_size(),
+                rhs_broadcast_dims.end(),
+                dot->dot_dimension_numbers().rhs_batch_dimensions_size() +
+                    lhs_outer_dims);
+      new_rhs = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+          new_lhs->shape(), new_rhs, rhs_broadcast_dims));
+    }
+
+    TF_ASSIGN_OR_RETURN(HloInstruction * new_dot,
+                        MakeBinaryHlo(HloOpcode::kMultiply, new_lhs, new_rhs));
+    std::vector<int64> reduce_dims(
+        dot->dot_dimension_numbers().lhs_contracting_dimensions_size());
+    new_dot = AsType(new_dot, F32);
+    const int64 outer_dims = std::max(rhs_outer_dims, lhs_outer_dims);
+    absl::c_iota(
+        reduce_dims,
+        outer_dims + dot->dot_dimension_numbers().lhs_batch_dimensions_size());
+    new_dot = AddReduce(new_dot, reduce_dims);
+    new_dot = AsType(new_dot, dot->shape().element_type());
+    return ReplaceInstruction(dot, new_dot);
+  }
+
   if (lhs->shape().rank() > 2 || rhs->shape().rank() > 2 ||
       dot->shape().rank() > 2) {
     if (options_.enable_dot_strength_reduction() &&
@@ -1639,7 +1781,11 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   }
 
   // Simplify dot(transpose(a), transpose(b)) to transpose(dot(b,a)).
-  if (lhs->IsRank2Transpose() && rhs->IsRank2Transpose()) {
+  if (dot->dot_dimension_numbers().lhs_batch_dimensions_size() == 0 &&
+      dot->dot_dimension_numbers().lhs_contracting_dimensions_size() == 1 &&
+      dot->dot_dimension_numbers().lhs_contracting_dimensions(0) == 1 &&
+      dot->dot_dimension_numbers().rhs_contracting_dimensions(0) == 0 &&
+      lhs->IsRank2Transpose() && rhs->IsRank2Transpose()) {
     DotDimensionNumbers dot_dimension_numbers;
     dot_dimension_numbers.add_lhs_contracting_dimensions(1);
     dot_dimension_numbers.add_rhs_contracting_dimensions(0);
@@ -2529,11 +2675,11 @@ StatusOr<bool> AlgebraicSimplifierVisitor::TrySimplifyScalarSlice(
         int64 start = slice->slice_starts(i);
         int64 low = padding_config.dimensions(i).edge_padding_low();
         int64 data = pad->operand(0)->shape().dimensions(i);
-        if (start >= low && start < low + data) {
-          return false;
+        if (start < low || start >= low + data) {
+          return true;
         }
       }
-      return true;
+      return false;
     }();
 
     if (in_padding) {
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index f55a1886b8f86af4893c8a7fc18ed935d223eca0..af03fcb100813e8942efcaefc296b971c01a6aaa 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -25,9 +25,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
@@ -46,7 +46,6 @@ namespace {
 
 using ::testing::ElementsAre;
 namespace m = match;
-namespace op = xla::testing::opcode_matchers;
 
 class AlgebraicSimplifierTest : public HloTestBase {
  protected:
@@ -2749,12 +2748,14 @@ TEST_F(AlgebraicSimplifierTest, SliceOfReshapeUnchanged) {
 
 TEST_F(AlgebraicSimplifierTest, RemoveNoopSort) {
   auto builder = HloComputation::Builder(TestName());
+  auto module = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {1});
   auto keys = builder.AddInstruction(
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
-  builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys));
-  auto module = CreateNewVerifiedModule();
+  TF_ASSERT_OK(MakeSortHlo(keys_shape, {keys}, 0, /*is_stable=*/false, &builder,
+                           module.get())
+                   .status());
   HloComputation* computation = module->AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
@@ -2763,6 +2764,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSort) {
 
 TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) {
   auto builder = HloComputation::Builder(TestName());
+  auto module = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {5, 0});
   Shape values_shape = ShapeUtil::MakeShape(S32, {5, 0});
@@ -2772,10 +2774,11 @@ TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) {
       HloInstruction::CreateParameter(1, values_shape, "values0"));
   auto values1 = builder.AddInstruction(
       HloInstruction::CreateParameter(2, values_shape, "values1"));
-  builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({keys_shape, values_shape, values_shape}), 0,
-      keys, {values0, values1}));
-  auto module = CreateNewVerifiedModule();
+  TF_ASSERT_OK(MakeSortHlo(ShapeUtil::MakeTupleShape(
+                               {keys_shape, values_shape, values_shape}),
+                           {keys, values0, values1}, 0, /*is_stable=*/false,
+                           &builder, module.get())
+                   .status());
   HloComputation* computation = module->AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
@@ -3711,8 +3714,8 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
   HloInstruction* y =
       builder.AddInstruction(HloInstruction::CreateParameter(1, r1f32, "y"));
   DotDimensionNumbers dot_dnums;
-  dot_dnums.add_lhs_contracting_dimensions(1);
-  dot_dnums.add_rhs_contracting_dimensions(0);
+  dot_dnums.add_lhs_batch_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(0);
   builder.AddInstruction(HloInstruction::CreateDot(r1f32, x, y, dot_dnums,
                                                    DefaultPrecisionConfig(2)));
   std::unique_ptr<HloComputation> dot_computation(builder.Build());
@@ -3957,7 +3960,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) {
       param = f32[3,4] parameter(0)
       constant = f32[] constant(0.0)
       pad = f32[8,10] pad(f32[3,4] param, f32[] constant), padding=3_2x1_5
-      ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[5:6],[9:10]}
+      ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[5:6],[4:5]}
     }
   )";
   TF_ASSERT_OK_AND_ASSIGN(auto module,
@@ -3968,6 +3971,27 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) {
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 }
 
+TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalarConstant) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param = f32[3,4] parameter(0)
+      constant = f32[] constant(0.0)
+      pad = f32[8,10] pad(f32[3,4] param, f32[] constant), padding=3_2x1_5
+      ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[5:6],[9:10]}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant())));
+}
+
 TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) {
   const char* hlo_string = R"(
     HloModule module
@@ -3989,6 +4013,29 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) {
   EXPECT_THAT(root, GmockMatch(m::Parameter()));
 }
 
+TEST_F(AlgebraicSimplifierTest, SliceOfPadSomeDimsInPadding) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY entry () -> f32[1]{0} {
+      constant.val = f32[] constant(4)
+      constant.pad = f32[] constant(-7)
+      reshape.1 = f32[1,1,1]{2,1,0} reshape(f32[] constant.val)
+      pad = f32[3,3,3]{2,1,0} pad(f32[1,1,1]{2,1,0} reshape.1, f32[] constant.pad), padding=0_2x0_2x2_0
+      slice = f32[1,1,1]{2,1,0} slice(f32[3,3,3]{2,1,0} pad), slice={[0:1], [0:1], [0:1]}
+      ROOT reshape.2 = f32[1]{0} reshape(f32[1,1,1]{2,1,0} slice)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Reshape(m::ConstantScalar(-7.0))));
+}
+
 TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) {
   const char* hlo_string = R"(
     HloModule module
@@ -4219,10 +4266,24 @@ TEST_P(BatchDotStrengthReductionTest, BatchDotStrengthReduction) {
   int m, k, n;
   PrimitiveType element_type;
   std::tie(m, k, n, element_type) = GetParam();
-
-  Shape dot_shape = ShapeUtil::MakeShape(element_type, {1, 3, 5, m, n});
-  Shape lhs_shape = ShapeUtil::MakeShape(element_type, {1, 3, 5, m, k});
-  Shape rhs_shape = ShapeUtil::MakeShape(element_type, {1, 3, 5, k, n});
+  std::vector<int64> lhs_dims = {1, 3, 5};
+  std::vector<int64> rhs_dims = lhs_dims;
+  std::vector<int64> output_dims = lhs_dims;
+  if (m > 0) {
+    lhs_dims.push_back(m);
+    output_dims.push_back(m);
+  }
+  if (k > 0) {
+    lhs_dims.push_back(k);
+    rhs_dims.push_back(k);
+  }
+  if (n > 0) {
+    rhs_dims.push_back(n);
+    output_dims.push_back(n);
+  }
+  Shape dot_shape = ShapeUtil::MakeShape(element_type, output_dims);
+  Shape lhs_shape = ShapeUtil::MakeShape(element_type, lhs_dims);
+  Shape rhs_shape = ShapeUtil::MakeShape(element_type, rhs_dims);
   HloComputation::Builder builder(TestName());
 
   auto lhs = builder.AddInstruction(
@@ -4236,16 +4297,18 @@ TEST_P(BatchDotStrengthReductionTest, BatchDotStrengthReduction) {
   dot_dnums.add_rhs_batch_dimensions(0);
   dot_dnums.add_rhs_batch_dimensions(1);
   dot_dnums.add_rhs_batch_dimensions(2);
-  dot_dnums.add_lhs_contracting_dimensions(4);
-  dot_dnums.add_rhs_contracting_dimensions(3);
+  if (k > 0) {
+    dot_dnums.add_lhs_contracting_dimensions(m > 0 ? 4 : 3);
+    dot_dnums.add_rhs_contracting_dimensions(3);
+  }
   builder.AddInstruction(HloInstruction::CreateDot(
       dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2)));
   auto computation = module->AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(default_options_);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(module.get()));
-  const bool dot_should_be_transformed = m == 1 || k == 1 || n == 1;
-  const bool computation_should_be_modified = dot_should_be_transformed;
-  EXPECT_EQ(changed, computation_should_be_modified);
+  const bool dot_should_be_transformed =
+      m == 1 || k == 1 || n == 1 || m == -1 || k == -1 || n == -1;
+  EXPECT_EQ(changed, dot_should_be_transformed);
   bool has_no_dot = true;
   for (const auto& hlo : computation->instructions()) {
     if (hlo->opcode() == HloOpcode::kDot) {
@@ -4256,10 +4319,12 @@ TEST_P(BatchDotStrengthReductionTest, BatchDotStrengthReduction) {
   EXPECT_EQ(has_no_dot, dot_should_be_transformed);
 }
 
-INSTANTIATE_TEST_SUITE_P(
-    BatchDotStrengthReductionTestInstantiation, BatchDotStrengthReductionTest,
-    ::testing::Combine(::testing::Values(1, 2), ::testing::Values(1, 2),
-                       ::testing::Values(1, 2), ::testing::Values(F32, BF16)));
+INSTANTIATE_TEST_SUITE_P(BatchDotStrengthReductionTestInstantiation,
+                         BatchDotStrengthReductionTest,
+                         ::testing::Combine(::testing::Values(-1, 1, 2),
+                                            ::testing::Values(-1, 1, 2),
+                                            ::testing::Values(-1, 1, 2),
+                                            ::testing::Values(F32, BF16)));
 
 class DotStrengthReductionTest
     : public AlgebraicSimplifierTest,
@@ -4789,7 +4854,29 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedReshapeWithoutLayout) {
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   HloInstruction* root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Constant());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
+}
+
+TEST_F(AlgebraicSimplifierTest, DividedByConstantInstructionWithoutLayout) {
+  Shape shape = ShapeUtil::MakeShape(F32, {});
+  shape.clear_layout();
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+
+  HloInstruction* const_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(20.0f)));
+  builder.AddInstruction(HloInstruction::CreateBinary(shape, HloOpcode::kDivide,
+                                                      param, const_value));
+
+  std::unique_ptr<VerifiedHloModule> module = CreateNewVerifiedModule();
+  module->AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Multiply()));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
index f8dff6a700cc9d5843053e3d451a7b005539ca26..52d6982c70f7962ea9f54db0a4b1f2089a122c1c 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
@@ -29,19 +29,16 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 
-namespace {
-
 namespace m = match;
 
 // Checks if the argument instruction is an AllReduce, followed by a certain
 // sequence of instructions and then a CRS. It must be possible to move
 // the AR past each instruction in the sequence. Returns the CRS, which is the
 // last instruction in the sequence.
-absl::optional<HloInstruction*> MatchesArCrsPattern(
+absl::optional<ArCrsCombiner::ArCrsPair> ArCrsCombiner::MatchesArCrsPattern(
     HloInstruction* instruction) {
   auto can_ar_move_past_instruction = [](HloInstruction* instruction) -> bool {
     if (instruction->user_count() != 1) {
@@ -78,23 +75,23 @@ absl::optional<HloInstruction*> MatchesArCrsPattern(
     return absl::nullopt;
   }
   auto next = instruction->users()[0];
+  int64 distance = 1;
   while (!next->IsCrossReplicaAllReduce()) {
     if (can_ar_move_past_instruction(next)) {
       next = next->users()[0];
     } else {
       return absl::nullopt;
     }
+    ++distance;
   }
   if (!Cast<HloAllReduceInstruction>(next)->IsNoop() &&
       computation_is_addition(next->called_computations()[0])) {
-    return absl::optional<HloInstruction*>(next);
+    return absl::optional<ArCrsPair>(ArCrsPair(instruction, next, distance));
   } else {
     return absl::nullopt;
   }
 }
 
-}  // namespace
-
 absl::optional<HloInstruction*> ArCrsCombiner::WhileFromBodyParameter(
     HloInstruction* instruction) {
   CHECK_EQ(HloOpcode::kParameter, instruction->opcode());
@@ -236,15 +233,55 @@ bool ArCrsCombiner::InstructionsComputeSameValue(
 }
 
 void ArCrsCombiner::GroupAllReducesById(HloModule* module) {
+  // Say that two or more ARs lead to the same CRS: (AR1, CRS), (AR2, CRS),
+  // ... , (ARn, CRS).
+  // If as we traverse the HLO graph we start tracking the pair (AR2, CRS),
+  // and later find that AR1's distance from the CRS is longer, we discard
+  // AR2 and start tracking AR1. We put the discarded ids in this set, in order
+  // to skip processing of short paths when we encounter the other ARs that
+  // have the same id as AR2.
+  absl::flat_hash_set<int64> discarded_ar_ids;
   for (HloComputation* computation : module->MakeNonfusionComputations()) {
     for (HloInstruction* instruction : computation->instructions()) {
-      auto maybe_crs = MatchesArCrsPattern(instruction);
-      if (maybe_crs) {
-        auto crs = *maybe_crs;
+      auto maybe_pair = MatchesArCrsPattern(instruction);
+      if (maybe_pair) {
+        auto pair = *maybe_pair;
         int64 ar_id = *(instruction->all_reduce_id());
-        if (crs_reserved_map_.find(crs) == crs_reserved_map_.end()) {
-          all_reduce_map_[ar_id].push_back(instruction);
-          crs_reserved_map_[crs] = ar_id;
+        if (discarded_ar_ids.find(ar_id) != discarded_ar_ids.end()) {
+          continue;
+        }
+        auto it = crs_reserved_map_.find(pair.crs);
+        if (it != crs_reserved_map_.end()) {
+          auto prev_ar_id = it->second;
+          // Since there is another AR paired with CRS,
+          // all_reduce_map_[prev_ar_id] should exist, but
+          // all_reduce_map_[ar_id] shouldn't.
+          CHECK(all_reduce_map_.find(ar_id) == all_reduce_map_.end());
+          CHECK_NE(prev_ar_id, ar_id);
+          auto prev_pair = all_reduce_map_[prev_ar_id].back();
+          int64 prev_distance = prev_pair.distance;
+          if (prev_distance < pair.distance) {
+            // The current AR's distance to CRS is longer than the previously
+            // tracked AR, so we discard the previous AR.
+            all_reduce_map_.erase(prev_ar_id);
+            discarded_ar_ids.insert(prev_ar_id);
+            all_reduce_map_[ar_id].push_back(pair);
+            crs_reserved_map_[pair.crs] = ar_id;
+          } else {
+            // Discard the current AR id because we are keeping the previously
+            // tracked AR.
+            discarded_ar_ids.insert(ar_id);
+          }
+        } else {
+          if (all_reduce_map_.find(ar_id) != all_reduce_map_.end()) {
+            int64 prev_distance = all_reduce_map_[ar_id].back().distance;
+            CHECK_EQ(prev_distance, pair.distance)
+                << "All ARs with the same AR ID must have the same distance "
+                   "from the corresponding CRSs. Found: "
+                << prev_distance << " and " << pair.distance;
+          }
+          all_reduce_map_[ar_id].push_back(pair);
+          crs_reserved_map_[pair.crs] = ar_id;
         }
       }
     }
@@ -254,11 +291,11 @@ void ArCrsCombiner::GroupAllReducesById(HloModule* module) {
 void ArCrsCombiner::KeepProvablyEqualInstructionGroups() {
   for (auto it : all_reduce_map_) {
     auto all_reduce_id = it.first;
-    auto instruction_vec = it.second;
-    CHECK_EQ(instruction_vec.size(), num_spatial_partitions_);
-    auto instr_0 = instruction_vec[0];
-    for (int i = 1; i < instruction_vec.size(); ++i) {
-      auto instr_i = instruction_vec[i];
+    auto pairs_vec = it.second;
+    CHECK_EQ(pairs_vec.size(), num_spatial_partitions_);
+    auto instr_0 = pairs_vec[0].ar;
+    for (int i = 1; i < pairs_vec.size(); ++i) {
+      auto instr_i = pairs_vec[i].ar;
       auto next_0 = instr_0->users()[0];
       auto next_i = instr_i->users()[0];
       absl::flat_hash_map<int64, int64> visited_pairs;
@@ -282,8 +319,9 @@ StatusOr<bool> ArCrsCombiner::RewriteGraph() {
     return false;
   }
   for (auto it : all_reduce_map_) {
-    auto instruction_vec = it.second;
-    for (auto all_reduce : instruction_vec) {
+    auto pairs_vec = it.second;
+    for (auto pair : pairs_vec) {
+      auto all_reduce = pair.ar;
       auto parent_computation = all_reduce->parent();
       auto all_reduce_id = all_reduce->all_reduce_id();
       auto prev = all_reduce->mutable_operand(0);
@@ -304,16 +342,23 @@ StatusOr<bool> ArCrsCombiner::RewriteGraph() {
                                      ? next->operands()[1]
                                      : next->operands()[0];
             // To move the AR past the addition/subtraction, we need to divide
-            // other_operand by the number of spatial partitions.
-            auto shape = other_operand->shape();
-            Literal lit(shape);
-            lit.PopulateWithValue<float>(num_spatial_partitions_);
-            auto divisor = parent_computation->AddInstruction(
-                HloInstruction::CreateConstant(lit.Clone()));
-            auto division =
-                parent_computation->AddInstruction(HloInstruction::CreateBinary(
-                    shape, HloOpcode::kDivide, other_operand, divisor));
-            TF_CHECK_OK(other_operand->ReplaceUseWith(next, division));
+            // other_operand by the number of spatial partitions, except if
+            // other_operand is a cross-module AR, which can be eliminated.
+            if (other_operand->IsCrossModuleAllReduce() &&
+                other_operand->user_count() == 1) {
+              TF_CHECK_OK(other_operand->ReplaceAllUsesWith(
+                  other_operand->mutable_operand(0)));
+            } else {
+              auto shape = other_operand->shape();
+              Literal lit(shape);
+              lit.PopulateWithValue<float>(num_spatial_partitions_);
+              auto divisor = parent_computation->AddInstruction(
+                  HloInstruction::CreateConstant(lit.Clone()));
+              auto division = parent_computation->AddInstruction(
+                  HloInstruction::CreateBinary(shape, HloOpcode::kDivide,
+                                               other_operand, divisor));
+              TF_CHECK_OK(other_operand->ReplaceUseWith(next, division));
+            }
             break;
           }
           default:
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.h b/tensorflow/compiler/xla/service/ar_crs_combiner.h
index e61ef5d4f9072979a6c356a9456c91e19405b01e..f503e1d5f2b519687e40818a61f0c0be9dfd3ab0 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner.h
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.h
@@ -26,11 +26,47 @@ limitations under the License.
 namespace xla {
 
 // When the HLO graph contains a cross-module AllReduce, followed by some simple
-// linear operations, followed by a cross-replica AllReduce, we can combine the
-// CMAR and the CRAR, to use an efficient AllReduce implementation that fully
-// utilizes the interconnect bandwidth.
+// linear operations, followed by a cross-replica AllReduce (also known as
+// cross-replica sum, or CRS), we can combine the CMAR and the CRAR, to use an
+// efficient AllReduce implementation that fully utilizes the interconnect
+// bandwidth.
 // Such sequences appear in spatially partitioned models.
-// This pass must run right after spatial partitioning.
+// This pass must run right after spatial partitioning, when the code is still
+// in a single HLO module.
+//
+// The steps are:
+// 1) Find CMARs followed by simple ops followed by CRARs.
+// 2) Group CMARs by all_reduce_id. They must all be rewritten.
+// 3) Prove that the CMAR patterns in each core produce the same result.
+// 4) Eliminate the CMAR, and if it feeds an addition/subtraction, divide the
+//    other operand by the number of spatial partitions.
+// 5) Turn the CRAR into an all-core AllReduce.
+//
+// The pass also handles the case where multiple CMARs lead to the same CRAR,
+// and eliminates all CMARs. This graph:
+//
+//        Y
+//        |
+//  X   CMAR_2   Z
+//  |      \    /
+// CMAR_1     +
+//    \     /
+//       +
+//       |
+//     CRAR
+//
+// gets rewritten to:
+//
+//           Z   num_partitions
+//            \  /
+//       Y    div
+//        \   /
+//    X     +
+//     \   /
+//       +
+//       |
+//  all-core AR
+//
 class ArCrsCombiner : public HloModulePass {
  public:
   ArCrsCombiner(int num_spatial_partitions)
@@ -43,6 +79,28 @@ class ArCrsCombiner : public HloModulePass {
                                                HloInstruction* i2);
 
  private:
+  // We used this struct because multiple ARs could be paired with the same CRS.
+  // In this case, we want to select the AR that is furthest from the CRS,
+  // because it makes it easier to eliminate all ARs during RewriteGraph.
+  struct ArCrsPair {
+    HloInstruction* ar;
+    HloInstruction* crs;
+    // The length of the path from AR to CRS in the HLO graph.
+    int64 distance;
+
+    ArCrsPair(HloInstruction* all_reduce, HloInstruction* cross_replica_sum,
+              int64 dist)
+        : ar(all_reduce), crs(cross_replica_sum), distance(dist) {}
+
+    string ToString() {
+      return absl::StrCat("(AR: ", ar->name(), ", CRS: ", crs->name(),
+                          ", distance: ", distance, ")");
+    }
+  };
+
+  absl::optional<ArCrsCombiner::ArCrsPair> MatchesArCrsPattern(
+      HloInstruction* instruction);
+
   // If the passed instruction is a while parameter, and the while body is only
   // called by a single while instruction, return the while instruction.
   absl::optional<HloInstruction*> WhileFromBodyParameter(
@@ -80,8 +138,8 @@ class ArCrsCombiner : public HloModulePass {
 
   int num_spatial_partitions_;
 
-  // Map from all-reduce ids to the all reduce instructions.
-  absl::flat_hash_map<int64, std::vector<HloInstruction*>> all_reduce_map_;
+  // Map from all-reduce ids to the AR/CRS pairs.
+  absl::flat_hash_map<int64, std::vector<ArCrsPair>> all_reduce_map_;
 
   // Map from a CRS instruction to the all-reduce ID of the AR paired with the
   // CRS. Sometimes, several ARs in the code could be paired with the same CRS.
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
index 5152f0dc884a153f9b0ade06acd479832d87ff25..9c9db74fd2fdab836f91d2f749d08ad93f8879b0 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
@@ -1005,11 +1005,11 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
               op::Tuple(op::AllReduce(op::Add(
                             op::Add(op::Parameter(),
                                     op::Divide(op::Constant(), op::Constant())),
-                            op::Divide(op::AllReduce(), op::Constant()))),
+                            op::Parameter())),
                         op::AllReduce(op::Add(
                             op::Add(op::Parameter(),
                                     op::Divide(op::Constant(), op::Constant())),
-                            op::Divide(op::AllReduce(), op::Constant())))));
+                            op::Parameter()))));
   auto crs_after =
       module->entry_computation()->root_instruction()->operands()[0];
   auto replica_groups_after = crs_after->replica_groups();
@@ -1093,15 +1093,17 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
   ArCrsCombiner combiner(2);
   auto changed = combiner.Run(module.get()).ValueOrDie();
   EXPECT_TRUE(changed);
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Tuple(op::AllReduce(op::Add(
-                            op::Parameter(),
-                            op::Divide(op::Add(op::AllReduce(), op::Constant()),
-                                       op::Constant()))),
-                        op::AllReduce(op::Add(
-                            op::Parameter(),
-                            op::Divide(op::Add(op::AllReduce(), op::Constant()),
-                                       op::Constant())))));
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(op::AllReduce(op::Add(
+                    op::Parameter(),
+                    op::Add(op::Parameter(),
+                            op::Divide(op::Constant(), op::Constant())))),
+                op::AllReduce(op::Add(
+                    op::Parameter(),
+                    op::Add(op::Parameter(),
+                            op::Divide(op::Constant(), op::Constant()))))));
+
   auto crs_after =
       module->entry_computation()->root_instruction()->operands()[0];
   auto replica_groups_after = crs_after->replica_groups();
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 215e8ced4bb3f98a26ac4eb9912a7fd4d917852f..d016d3e03d5e994841b81cda6214b6ff7cb550be 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/byte_order.h"
@@ -67,18 +66,38 @@ const absl::optional<std::set<int>>& BackendOptions::allowed_devices() const {
   return allowed_devices_;
 }
 
+namespace {
+
+class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface {
+ public:
+  explicit EigenThreadPoolWrapper(tensorflow::thread::ThreadPool* pool)
+      : pool_(pool) {}
+  ~EigenThreadPoolWrapper() override {}
+
+  void Schedule(std::function<void()> fn) override {
+    pool_->Schedule(std::move(fn));
+  }
+  int NumThreads() const override { return pool_->NumThreads(); }
+  int CurrentThreadId() const override { return pool_->CurrentThreadId(); }
+
+ private:
+  tensorflow::thread::ThreadPool* pool_ = nullptr;
+};
+
+}  // namespace
+
 // Define this in .cc file to avoid having to include eigen or forward declare
 // these types in the header.
-struct Backend::EigenThreadPoolWrapper {
-  explicit EigenThreadPoolWrapper(const int num_threads)
+struct Backend::IntraOpThreadPool {
+  explicit IntraOpThreadPool(const int num_threads)
       : pool(new tensorflow::thread::ThreadPool(tensorflow::Env::Default(),
                                                 "XLAEigen", num_threads)),
-        wrapper(new tensorflow::EigenThreadPoolWrapper(pool.get())),
+        wrapper(new EigenThreadPoolWrapper(pool.get())),
         device(new Eigen::ThreadPoolDevice(wrapper.get(),
                                            wrapper->NumThreads())) {}
 
   std::unique_ptr<tensorflow::thread::ThreadPool> pool;
-  std::unique_ptr<tensorflow::EigenThreadPoolWrapper> wrapper;
+  std::unique_ptr<EigenThreadPoolWrapper> wrapper;
   std::unique_ptr<Eigen::ThreadPoolDevice> device;
 };
 
@@ -146,8 +165,7 @@ Backend::Backend(se::Platform* platform, Compiler* compiler,
     const int num_threads = intra_op_parallelism_threads > 0
                                 ? intra_op_parallelism_threads
                                 : tensorflow::port::NumSchedulableCPUs();
-    intra_op_thread_pool_wrapper_.reset(
-        new EigenThreadPoolWrapper(num_threads));
+    intra_op_thread_pool_.reset(new IntraOpThreadPool(num_threads));
   }
 }
 
@@ -159,17 +177,17 @@ int Backend::default_device_ordinal() const {
 
 const Eigen::ThreadPoolDevice* Backend::eigen_intra_op_thread_pool_device()
     const {
-  if (intra_op_thread_pool_wrapper_ == nullptr) {
+  if (intra_op_thread_pool_ == nullptr) {
     return nullptr;
   }
-  return intra_op_thread_pool_wrapper_->device.get();
+  return intra_op_thread_pool_->device.get();
 }
 
 tensorflow::thread::ThreadPool* Backend::eigen_intra_op_thread_pool() const {
-  if (intra_op_thread_pool_wrapper_ == nullptr) {
+  if (intra_op_thread_pool_ == nullptr) {
     return nullptr;
   }
-  return intra_op_thread_pool_wrapper_->pool.get();
+  return intra_op_thread_pool_->pool.get();
 }
 
 StatusOr<se::StreamExecutor*> Backend::stream_executor(
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index c35f033dc0180409ae3888c2050021da83f5c72a..e7f29a044b95015aa7e547373c24971646833280 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -156,7 +156,6 @@ class Backend {
   Status ResetDevices();
 
  private:
-  struct EigenThreadPoolWrapper;
   Backend(se::Platform* platform, Compiler* compiler,
           absl::Span<se::StreamExecutor* const> stream_executors,
           TransferManager* transfer_manager,
@@ -183,7 +182,8 @@ class Backend {
   std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
 
   // For the CPU backend, an Eigen threadpool device for use by Eigen code.
-  std::unique_ptr<EigenThreadPoolWrapper> intra_op_thread_pool_wrapper_;
+  struct IntraOpThreadPool;
+  std::unique_ptr<IntraOpThreadPool> intra_op_thread_pool_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification.cc b/tensorflow/compiler/xla/service/batch_dot_simplification.cc
index eda026ac5685dc469a6230094eb28b3618e36400..dbabd82dd55465dd4c85a56aea849a3e3702d6bf 100644
--- a/tensorflow/compiler/xla/service/batch_dot_simplification.cc
+++ b/tensorflow/compiler/xla/service/batch_dot_simplification.cc
@@ -28,6 +28,13 @@ BatchDotSimplification::ElideDegenerateBatchDimensionFromBatchDot(
                  *rhs = batch_dot->mutable_operand(1);
   const Shape& lhs_shape = lhs->shape();
 
+  // A dot with no contracting dims will be rewritten into a multiply by
+  // AlgebraicSimplifier. Dots with multiple contracting dims are currently
+  // unsupported.
+  if (dim_numbers.lhs_contracting_dimensions_size() != 1) {
+    return false;
+  }
+
   std::vector<int64> degenerate_dims;
   for (int64 batch_dim : dim_numbers.lhs_batch_dimensions()) {
     if (lhs_shape.dimensions(batch_dim) == 1) {
diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
index 52ec1a794c5e9f4452a4bf2b648f453d8acfe976..a81f394a38f091b89b7f1e4d26653ff549f35b75 100644
--- a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
+++ b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
@@ -169,5 +169,47 @@ main {
                   /*lhs_contracting_dim=*/3, /*rhs_contracting_dim=*/2)));
 }
 
+TEST_F(BatchDotSimplificationTest,
+       ElideMultipleDegenerateBatchDotDimsNonContracting) {
+  const char* hlo_text = R"(
+HloModule BatchDot
+
+main {
+  a = f32[1,101] parameter(0)
+  b = f32[1,101] parameter(1)
+  ROOT dot = f32[1,101,101] dot(a,b), lhs_batch_dims={0},
+                                      lhs_contracting_dims={},
+                                      rhs_batch_dims={0},
+                                      rhs_contracting_dims={}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
+  BatchDotSimplification pass;
+  ASSERT_FALSE(pass.Run(m.get()).ValueOrDie());
+}
+
+TEST_F(BatchDotSimplificationTest,
+       ElideMultipleDegenerateBatchDotDimsMultipleContracting) {
+  const char* hlo_text = R"(
+HloModule BatchDot
+
+main {
+  lhs = f32[1,5,17,10,13] parameter(0)
+  rhs = f32[1,9,10,13,6,5] parameter(1)
+  ROOT dot = f32[10,1,17,9,6] dot(lhs,rhs), lhs_batch_dims={3,0},
+                                            rhs_batch_dims={2,0},
+                                            lhs_contracting_dims={1,4},
+                                            rhs_contracting_dims={5,3}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
+  BatchDotSimplification pass;
+  ASSERT_FALSE(pass.Run(m.get()).ValueOrDie());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
index 551ac4be73a7630d213a53ca3606aa7f890cd794..2caa979745b3b40817acb1b6951e1de5ffa294a4 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
 #include "tensorflow/compiler/xla/service/bfloat16_support.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -282,8 +283,11 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSort) {
   HloInstruction* value = builder.AddInstruction(
       HloInstruction::CreateParameter(1, s32_shape, "value"));
 
-  HloInstruction* sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({bf16_shape, s32_shape}), 0, key, {value}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort,
+      MakeSortHlo(ShapeUtil::MakeTupleShape({bf16_shape, s32_shape}),
+                  {key, value}, 0, /*is_stable=*/false, &builder,
+                  module.get()));
   HloInstruction* gte = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(bf16_shape, sort, 0));
 
@@ -308,8 +312,11 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSortRoot) {
   HloInstruction* value = builder.AddInstruction(
       HloInstruction::CreateParameter(1, bf16_shape, "value"));
 
-  HloInstruction* sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({bf16_shape, bf16_shape}), 0, key, {value}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort,
+      MakeSortHlo(ShapeUtil::MakeTupleShape({bf16_shape, f32_shape}),
+                  {key, value}, 0, /*is_stable=*/false, &builder,
+                  module.get()));
 
   auto computation = module->AddEntryComputation(builder.Build());
 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index e1b91b500191c7756f3d1a4b160a0dd1e09cfe7d..cbebbdc8a2d7d0b65f12accbe424bea383ff5355 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -191,6 +191,7 @@ Status GatherComputationsByAllocationType(
           case HloOpcode::kReduceWindow:
           case HloOpcode::kScatter:
           case HloOpcode::kSelectAndScatter:
+          case HloOpcode::kSort:
           case HloOpcode::kFusion:
             // Map/reduce etc computations are always thread-local.
             worklist.push_back(std::make_pair(subcomputation,
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index 94af788c54f6c722997311bec50da3ed93aa3cee..98304757cae91d22466ed25f8c6e36ce90a848db 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -64,6 +64,7 @@ CallContext GetInstructionCallContext(HloOpcode opcode) {
     case HloOpcode::kReduceWindow:
     case HloOpcode::kScatter:
     case HloOpcode::kSelectAndScatter:
+    case HloOpcode::kSort:
     case HloOpcode::kFusion:
       return CallContext::kParallel;
     default:
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index c02ffda575278905f6549b362e5e7d94f5713b36..57a636fd740995d6cce933fe19d5592a64bde5cf 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -30,7 +30,7 @@ namespace xla {
 
 // The context in which a computation is called by another computation.
 enum class CallContext {
-  // In a parallel contex the computation is applied to each element of the
+  // In a parallel context the computation is applied to each element of the
   // array argument(s). kMap and kReduce instructions call computations in
   // parallel context.
   kParallel,
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index d4535b204d7f3ad8d4e24beea5d0dd79e7a15ab0..42672bc3875af2d732d80691df6bf85b9d8080cd 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -136,6 +136,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:reshape_mover",
         "//tensorflow/compiler/xla/service:sort_simplifier",
         "//tensorflow/compiler/xla/service:transpose_folding",
+        "//tensorflow/compiler/xla/service:triangular_solve_expander",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
         "//tensorflow/compiler/xla/service:while_loop_constant_sinking",
         "//tensorflow/compiler/xla/service:while_loop_invariant_code_motion",
diff --git a/tensorflow/compiler/xla/service/cpu/build_defs.bzl b/tensorflow/compiler/xla/service/cpu/build_defs.bzl
index e78330b21689fdd818cd97128bbcaaa9e0118602..ffa1cd4ec8e26e7dbe92e7b99cf65e99db5400b9 100644
--- a/tensorflow/compiler/xla/service/cpu/build_defs.bzl
+++ b/tensorflow/compiler/xla/service/cpu/build_defs.bzl
@@ -1,12 +1,11 @@
 """build_defs for service/cpu."""
 
-
 def runtime_copts():
-  """Returns copts used for CPU runtime libraries."""
-  return (["-DEIGEN_AVOID_STL_ARRAY"] + select({
-      "//tensorflow:android_arm": ["-mfpu=neon"],
-      "//conditions:default": []
-  }) + select({
-      "//tensorflow:android": ["-O2"],
-      "//conditions:default": []
-  }))
+    """Returns copts used for CPU runtime libraries."""
+    return (["-DEIGEN_AVOID_STL_ARRAY"] + select({
+        "//tensorflow:android_arm": ["-mfpu=neon"],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow:android": ["-O2"],
+        "//conditions:default": [],
+    }))
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index eafda68510d93ee54f2aead60a84f3e97b3fe1f4..19ab3bddb567afeeddb7c01b9a847b51bea5d957 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -95,6 +95,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/scatter_expander.h"
 #include "tensorflow/compiler/xla/service/sort_simplifier.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
+#include "tensorflow/compiler/xla/service/triangular_solve_expander.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
 #include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h"
@@ -105,6 +106,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/dynamic_annotations.h"
 
 namespace xla {
 namespace cpu {
@@ -255,6 +257,8 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
 
   pipeline.AddPass<MapInliner>();
 
+  pipeline.AddPass<TriangularSolveExpander>();
+
   // TODO(b/65775800): Fix wrong output bug in Call and remove the CallInliner
   // pass.
   pipeline.AddPass<CallInliner>();
@@ -633,7 +637,13 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
                        std::move(instruction_to_profile_idx),
                        std::move(computation_to_profile_idx),
-                       &target_machine_features);
+                       &target_machine_features,
+#ifdef MEMORY_SANITIZER
+                       /*emit_code_for_msan=*/true
+#else
+                       /*emit_code_for_msan=*/false
+#endif
+  );
 
   TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals());
 
@@ -670,9 +680,9 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   if (embed_ir_in_executable) {
     ir_module_string = llvm_ir::DumpModuleToString(*llvm_module);
   }
-  TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
 
   XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(*llvm_module));
+  TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
 
   // JIT compile the LLVM IR module to in-memory machine code.
   jit->AddModule(std::move(llvm_module));
@@ -831,7 +841,9 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
     IrEmitter ir_emitter(*module, *assignment, &llvm_module,
                          std::move(instruction_to_profile_idx),
                          std::move(computation_to_profile_idx),
-                         &target_machine_features);
+                         &target_machine_features,
+                         // TODO(b/66051036): Run full msan for AOT.
+                         /*emit_code_for_msan=*/false);
 
     TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals());
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc
index 7fbe0fa157c57eb0c274662a1de95cf5328ccfa8..4ac61f44d9f38425da2d1fc6b9495cb4deba5047 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
index 3361a5973f5e8c91802b26d68477347b196d3cac..fae9670051a654f38f09856368ffb700b0c7a085 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 0fecbaf391bc3122646af30b508fc1a88b6641e9..2bf22ec6e43ea9944935a4d0d5dcd22c5d190c17 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -963,8 +963,8 @@ Status EmitBatchDotOperation(
   KernelSupportLibrary ksl(b);
 
   return ksl.ForWithStatus(
-      "bdot", /*start=*/0, /*end=*/batch_count, /*step=*/1,
-      [&](llvm::Value* indvar) {
+      llvm_ir::IrName(&dot, "bdot"), /*start=*/0, /*end=*/batch_count,
+      /*step=*/1, [&](llvm::Value* indvar) {
         DotDimensionNumbers adjusted_dim_numbers = dot.dot_dimension_numbers();
         adjusted_dim_numbers.clear_lhs_batch_dimensions();
         adjusted_dim_numbers.clear_rhs_batch_dimensions();
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index efdda8599a1a66a0b2e43d17cfb35e3514e905b0..2418d96440f9994842a54769cf6d561610ccfa18 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -77,7 +77,6 @@ namespace {
 using llvm_ir::AsStringRef;
 using llvm_ir::IrName;
 using llvm_ir::SetToFirstInsertPoint;
-namespace gtl = tensorflow::gtl;
 }  // namespace
 
 namespace cpu {
@@ -87,7 +86,8 @@ IrEmitter::IrEmitter(
     llvm::Module* llvm_module,
     std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx,
     std::unordered_map<const HloComputation*, int64> computation_to_profile_idx,
-    const TargetMachineFeatures* target_machine_features)
+    const TargetMachineFeatures* target_machine_features,
+    bool emit_code_for_msan)
     : assignment_(assignment),
       module_(llvm_module),
       arch_type_(llvm::Triple(llvm_module->getTargetTriple()).getArch()),
@@ -97,7 +97,8 @@ IrEmitter::IrEmitter(
       alias_analysis_(hlo_module, assignment, &llvm_module->getContext()),
       hlo_module_config_(hlo_module.config()),
       is_top_level_computation_(false),
-      target_machine_features_(*target_machine_features) {
+      target_machine_features_(*target_machine_features),
+      emit_code_for_msan_(emit_code_for_msan) {
   b_.setFastMathFlags(llvm_ir::GetFastMathFlags(
       /*fast_math_enabled=*/hlo_module_config_.debug_options()
           .xla_cpu_enable_fast_math()));
@@ -517,6 +518,7 @@ Status IrEmitter::HandleSort(HloInstruction* hlo) {
     case U8:
     case S16:
     case U16:
+    case BF16:
     case F16:
     case S32:
     case U32:
@@ -577,72 +579,14 @@ Status IrEmitter::HandleSort(HloInstruction* hlo) {
     lower_dimensions *= normalized_keys_shape.dimensions(i);
   }
 
-  llvm::FunctionType* less_than_type = llvm::FunctionType::get(
-      b_.getInt1Ty(), {b_.getInt8PtrTy(), b_.getInt8PtrTy()},
-      /*isVarArg=*/false);
-  auto less_than_function = llvm_ir::CreateFunction(
-      less_than_type, llvm::GlobalValue::InternalLinkage,
-      /*enable_fast_math=*/false,
-      /*optimize_for_size=*/true, absl::StrCat(IrName(sort), "_comparator"),
-      module_);
-  // Emit the code for the less_than function.
-  {
-    llvm::IRBuilder<>::InsertPointGuard guard(b_);
-
-    auto* entry_bb =
-        llvm::BasicBlock::Create(b_.getContext(), "entry", less_than_function);
-
-    b_.SetInsertPoint(entry_bb);
-    auto keys_ir_type = llvm_ir::PrimitiveTypeToIrType(keys_type, module_);
-    CHECK_EQ(less_than_function->arg_size(), 2);
-    llvm::Value* keys_lhs_ptr = less_than_function->arg_begin();
-    keys_lhs_ptr = PointerCast(keys_lhs_ptr, keys_ir_type->getPointerTo());
-    llvm::Value* keys_rhs_ptr = less_than_function->arg_begin() + 1;
-    keys_rhs_ptr = PointerCast(keys_rhs_ptr, keys_ir_type->getPointerTo());
-
-    // TODO(b/122298745): Replace the custom compare logic with a call to the
-    // computation specified for the Sort op.
-    llvm::Value* keys_lhs = Load(keys_ir_type, keys_lhs_ptr);
-    llvm::Value* keys_rhs = Load(keys_ir_type, keys_rhs_ptr);
-    bool is_signed_comparison = true;
-    if (primitive_util::IsFloatingPointType(keys_type)) {
-      // We would like a total order of floating point numbers so that the
-      // sort has a predictable behavior in the presence of NaNs. Rather
-      // than using floating point comparison, we use the following trick:
-      // If f is a float, and
-      // x = bit_cast<int32>(f);
-      // y = x < 0 ? 0x7FFFFFFF - x : x;
-      // then y is ordered as an int32 such that finite values have the
-      // obvious order, -0 is ordered before 0, and -NaN and NaN appear at
-      // the beginning and end of the ordering.
-      auto k = b_.getInt(llvm::APInt::getSignedMaxValue(
-          keys_lhs->getType()->getPrimitiveSizeInBits()));
-      auto comparison_type = k->getType();
-      auto zero = llvm::ConstantInt::get(comparison_type, 0);
-      auto maybe_flip = [&](llvm::Value* v) {
-        return b_.CreateSelect(b_.CreateICmp(llvm::ICmpInst::ICMP_SLT, v, zero),
-                               b_.CreateSub(k, v), v);
-      };
-      keys_lhs = b_.CreateBitCast(keys_lhs, comparison_type);
-      keys_rhs = b_.CreateBitCast(keys_rhs, comparison_type);
-      keys_lhs = maybe_flip(keys_lhs);
-      keys_rhs = maybe_flip(keys_rhs);
-    } else if (!primitive_util::IsSignedIntegralType(keys_type)) {
-      is_signed_comparison = false;
-    }
-    llvm::Value* result =
-        b_.CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT
-                                           : llvm::ICmpInst::ICMP_ULT,
-                      keys_lhs, keys_rhs);
-    llvm::ReturnInst::Create(b_.getContext(),
-                             /*retVal=*/result, entry_bb);
-  }
-
+  auto less_than_function = FindOrDie(emitted_functions_, sort->to_apply());
+  CHECK(absl::c_binary_search(thread_local_computations_, sort->to_apply()));
   llvm::FunctionType* key_value_sort_type = llvm::FunctionType::get(
       b_.getVoidTy(),
       {b_.getInt64Ty(), b_.getInt64Ty(), b_.getInt64Ty(),
        b_.getInt8PtrTy()->getPointerTo(), b_.getInt32Ty(),
-       b_.getInt32Ty()->getPointerTo(), less_than_function->getType()},
+       b_.getInt32Ty()->getPointerTo(), b_.getInt1Ty(), b_.getInt8PtrTy(),
+       b_.getInt64Ty()->getPointerTo(), less_than_function->getType()},
       /*isVarArg=*/false);
   auto* key_value_sort_func = llvm::dyn_cast<llvm::Function>(
       module_
@@ -673,7 +617,9 @@ Status IrEmitter::HandleSort(HloInstruction* hlo) {
   Call(key_value_sort_func,
        {b_.getInt64(higher_dimensions), b_.getInt64(sort_dimension_elements),
         b_.getInt64(lower_dimensions), values,
-        b_.getInt32(sort->operand_count()), sizes, less_than_function});
+        b_.getInt32(sort->operand_count()), sizes,
+        b_.getInt1(sort->is_stable()), GetExecutableRunOptionsArgument(),
+        GetProfileCountersArgument(), less_than_function});
 
   if (sort->values_count() > 0) {
     llvm_ir::EmitTuple(GetIrArrayFor(sort), destination_addresses, &b_,
@@ -1914,7 +1860,7 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduce(
 }
 
 Status IrEmitter::HandleReduce(HloInstruction* reduce) {
-  // TODO(b/112040122): Support variadic reduce.
+  // TODO(b/118333695): Support variadic reduce.
   if (!reduce->shape().IsArray()) {
     return Unimplemented("Variadic reduce is not supported on CPU");
   }
@@ -2281,6 +2227,25 @@ Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) {
         InBoundsGEP(operands_alloca, {b_.getInt64(i)});
     Store(operand_as_i8ptr, slot_in_operands_alloca);
   }
+  if (emit_code_for_msan_) {
+    // Mark the alloca as initialized for msan. The buffer gets read by the
+    // custom callee, which might be msan-instrumented.
+    // TODO(b/66051036): Run the msan instrumentation pass instead.
+    const llvm::DataLayout& dl = module_->getDataLayout();
+    llvm::Type* intptr_type = b_.getIntPtrTy(dl);
+    auto* msan_unpoison_ir_function = llvm::cast<llvm::Function>(
+        module_
+            ->getOrInsertFunction(
+                "__msan_unpoison",
+                llvm::FunctionType::get(
+                    /*Result=*/b_.getVoidTy(),
+                    /*Params=*/{i8_ptr_type, intptr_type}, /*isVarArg=*/false))
+            .getCallee());
+    Call(msan_unpoison_ir_function,
+         {PointerCast(operands_alloca, i8_ptr_type),
+          llvm::ConstantInt::get(
+              intptr_type, *operands_alloca->getAllocationSizeInBits(dl) / 8)});
+  }
   auto* custom_call_ir_function = llvm::dyn_cast<llvm::Function>(
       module_
           ->getOrInsertFunction(
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 974dd7cd3f2254bfbc86fffae02c06c481af8902..0e372335f3aae919f9a9c559f86d4d61ab799b70 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -72,13 +72,15 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   //              index in the profiling array.
   // computation_to_profile_idx: the mapping from HLO computations to their
   //              index in the profiling array.
+  // emit_code_for_msan: whether emitted code should be compatible with msan.
   IrEmitter(const HloModule& hlo_module, const BufferAssignment& assignment,
             llvm::Module* llvm_module,
             std::unordered_map<const HloInstruction*, int64>
                 instruction_to_profile_idx,
             std::unordered_map<const HloComputation*, int64>
                 computation_to_profile_idx,
-            const TargetMachineFeatures* target_machine);
+            const TargetMachineFeatures* target_machine,
+            bool emit_code_for_msan);
   ~IrEmitter() override;
 
   // Emit and return the given HLO computation as an LLVM IR
@@ -574,6 +576,8 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   std::vector<const HloComputation*> thread_local_computations_;
   std::vector<const HloComputation*> global_computations_;
 
+  bool emit_code_for_msan_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(IrEmitter);
 };
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
index a0667d0d9d1cde246f4b74626859955beeec08b0..70a6d0af02c0c2db7208db561cf29e35a74707b2 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
@@ -32,8 +32,9 @@ using tensorflow::int64;
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSort(
     int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes,
-    bool (*less_than)(char*, char*)) {
+    int32* values_primitive_type_size_in_bytes, bool is_stable,
+    char* run_options, int64* prof_counters,
+    void (*less_than)(char*, char*, char**, char**, tensorflow::int64*)) {
   // 'values' and 'values_primitive_type_size_in_bytes' are managed by the JIT
   // code, so msan can't tell they are initialized.
   TF_ANNOTATE_MEMORY_IS_INITIALIZED(values, values_count * sizeof(char*));
@@ -54,6 +55,7 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSort(
   int64 sort_dimension_offset = c;
 
   std::unique_ptr<int64[]> indices(new int64[sort_dimension_elements]);
+  std::unique_ptr<char*[]> comparison_values(new char*[2 * values_count]);
   std::iota(indices.get(), indices.get() + sort_dimension_elements, 0);
   std::unique_ptr<std::string[]> reordered_values(
       new std::string[sort_dimension_elements]);
@@ -67,16 +69,27 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSort(
     int64 base_offset =
         index % sort_dimension_offset +
         (index - index % sort_dimension_offset) * sort_dimension_elements;
-    std::stable_sort(
-        indices.get(), indices.get() + sort_dimension_elements,
-        [&](int64 a, int64 b) {
-          int64 memory_index_lhs = (base_offset + a * sort_dimension_offset) *
-                                   values_primitive_type_size_in_bytes[0];
-          int64 memory_index_rhs = (base_offset + b * sort_dimension_offset) *
-                                   values_primitive_type_size_in_bytes[0];
-          return less_than(values[0] + memory_index_lhs,
-                           values[0] + memory_index_rhs);
-        });
+    auto compare_function = [&](int64 a, int64 b) -> bool {
+      int64 memory_index_lhs = (base_offset + a * sort_dimension_offset) *
+                               values_primitive_type_size_in_bytes[0];
+      int64 memory_index_rhs = (base_offset + b * sort_dimension_offset) *
+                               values_primitive_type_size_in_bytes[0];
+      for (int32 i = 0; i < values_count; ++i) {
+        comparison_values[i * 2] = values[i] + memory_index_lhs;
+        comparison_values[i * 2 + 1] = values[i] + memory_index_rhs;
+      }
+      char result = 0;  // Overwritten by less_than.
+      less_than(&result, run_options, comparison_values.get(), nullptr,
+                prof_counters);
+      return result != 0u;
+    };
+    if (is_stable) {
+      std::stable_sort(indices.get(), indices.get() + sort_dimension_elements,
+                       compare_function);
+    } else {
+      std::sort(indices.get(), indices.get() + sort_dimension_elements,
+                compare_function);
+    }
 
     // Reorder the values according to the order defined by 'indices'.
     for (int32 idx = 0; idx < values_count; ++idx) {
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
index 5460af3485b94aaef1a5822a79e4fa325bcb67ea..50c2911c3bd392b6df12717c34d250ce86ad26e0 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
@@ -22,18 +22,25 @@ limitations under the License.
 extern "C" {
 
 // Each entry in 'values' represents a 3-dimensional shape with dimensions
-// [a, b, c]. The 'b' dimension of the first shape is sorted into ascending
-// order according to the results of comparisons using the provided 'less_than'
+// [a, b, c]. The 'b' dimension of each shape is sorted into ascending order
+// according to the results of comparisons using the provided 'less_than'
 // function. 'values_count' must be > 0 and specifies the number of entries in
 // 'values' and 'values_primitive_type_size_in_bytes'. The size of the primitive
 // type of the i-th shape has exactly 'values_primitive_type_size_in_bytes[i]'
-// bytes. The elements in each 'values' shape are reordered in the same way
-// according to the comparisons using the first shape.
+// bytes. 'is_stable' specifies whether the sorting should be stable.
+// 'run_options' and 'prof_counters' are passed through to the less-than
+// function, which expects the following arguments:
+// - pointer to the return value buffer (char*)
+// - xla::ExecutableRunOptions = 'run_options' (char*)
+// - pointers to the parameter buffers (char**)
+// - pointers to the buffer tables = nullptr for thread local functions (char**)
+// - profile counters = 'prof_counters' (int64*)
 extern void __xla_cpu_runtime_KeyValueSort(
     tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
     char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes,
-    bool (*less_than)(char*, char*));
+    tensorflow::int32* values_primitive_type_size_in_bytes, bool is_stable,
+    char* run_options, tensorflow::int64* prof_counters,
+    void (*less_than)(char*, char*, char**, char**, tensorflow::int64*));
 }
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_KEY_VALUE_SORT_H_
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 9c2685674fbc133de1220caef81ac3b60a1c0f7c..f7b64738b7b314b56f4ae60336d9c85c90287219 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -337,6 +337,11 @@ bool RegisterKnownJITSymbols() {
                      reinterpret_cast<void*>(memset_pattern16));
 #endif
 
+#ifdef MEMORY_SANITIZER
+  registry->Register("__msan_unpoison",
+                     reinterpret_cast<void*>(__msan_unpoison));
+#endif
+
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc
index 3934c03a04c978009282b3cd0d39bacf9b12a356..762ee67db9a1b2a753c6ec5538dee1d13282942e 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc
@@ -26,10 +26,16 @@ TEST_F(CpuKeyValueSortTest, SortR1) {
   const string hlo_text = R"(
 HloModule KeyValueSort
 
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY main {
   a = f32[10] parameter(0)
 
-  ROOT result = f32[10] sort(f32[10] a), dimensions={0}
+  ROOT result = f32[10] sort(f32[10] a), dimensions={0}, to_apply=compare
 }
 )";
 
diff --git a/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc
index eb6c44b70ab34d0a294880b5de4fe0b3ba5e19e5..9fc472ff767441e60cf618ac9022e5c50ea20023 100644
--- a/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc
@@ -938,6 +938,53 @@ void TiledSmallGemmEmitter::EmitTiledGemm(
   });
 }
 
+llvm::Type* GetPointerToElementType(llvm::Type* pointer_type) {
+  llvm::Type* type =
+      llvm::cast<llvm::PointerType>(pointer_type)->getElementType();
+  while (auto* array_type = llvm::dyn_cast<llvm::ArrayType>(type)) {
+    type = array_type->getElementType();
+  }
+
+  return type->getPointerTo();
+}
+
+struct GemvBuffersWithCanonicalType {
+  llvm::Value* lhs_canonicalized;
+  llvm::Value* rhs_canonicalized;
+  llvm::Value* addend_canonicalized;
+  llvm::Value* result_canonicalized;
+};
+
+GemvBuffersWithCanonicalType GetGemvBuffersWithCanonicalType(
+    llvm::Value* lhs, llvm::Value* rhs, llvm::Value* addend,
+    llvm::Value* result, llvm::IRBuilder<>* b) {
+  // We characterize a GEMV operation via M and K, since N is implicitly 1.
+  // This means the GEMV that multiplies (say) [5,6] with [6,1] is implemented
+  // by the same GEMV that multiplies [5,6] with [1,6].  However, the
+  // `llvm::Types` for the inputs to the two GEMVs don't match (in a trivial
+  // sense -- the in memory representations are the same) since they're computed
+  // from the `xla::Shape`s.  Since we want to be able to call the same
+  // `llvm::Function` for the two GEMVs we canonicalize the types of the GEMV
+  // inputs here into the same type.
+  GemvBuffersWithCanonicalType buffers_with_canonical_type;
+  llvm::Type* lhs_type = lhs->getType();
+  llvm::Type* rhs_type = rhs->getType();
+  llvm::Type* addend_type = addend ? addend->getType() : nullptr;
+  llvm::Type* result_type = result->getType();
+
+  buffers_with_canonical_type.lhs_canonicalized =
+      b->CreateBitCast(lhs, GetPointerToElementType(lhs_type));
+  buffers_with_canonical_type.rhs_canonicalized =
+      b->CreateBitCast(rhs, GetPointerToElementType(rhs_type));
+  buffers_with_canonical_type.addend_canonicalized =
+      addend ? b->CreateBitCast(addend, GetPointerToElementType(addend_type))
+             : nullptr;
+  buffers_with_canonical_type.result_canonicalized =
+      b->CreateBitCast(result, GetPointerToElementType(result_type));
+
+  return buffers_with_canonical_type;
+}
+
 }  // namespace
 
 void EmitRowMajorGemv(PrimitiveType scalar_type, int64 tile_rows,
@@ -950,12 +997,18 @@ void EmitRowMajorGemv(PrimitiveType scalar_type, int64 tile_rows,
       /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols,
       /*m=*/m, /*k=*/k, /*has_addend=*/addend != nullptr);
 
+  GemvBuffersWithCanonicalType canonical_inputs =
+      GetGemvBuffersWithCanonicalType(lhs, rhs, addend, result, b);
+
   KernelSupportLibrary::EmitAndCallOutlinedKernel(
       /*enable_fast_math=*/enable_fast_math,
-      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(), lhs,
-      rhs, addend, result,
-      [&](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* addend,
-          llvm::Value* result) {
+      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(),
+      canonical_inputs.lhs_canonicalized, canonical_inputs.rhs_canonicalized,
+      canonical_inputs.addend_canonicalized,
+      canonical_inputs.result_canonicalized,
+      [&config, b, &canonical_inputs](llvm::Value* lhs, llvm::Value* rhs,
+                                      llvm::Value* addend,
+                                      llvm::Value* result) {
         RowMajorMatrixVectorProductEmitter emitter(config, lhs, rhs, addend,
                                                    result, b);
         emitter.Emit();
@@ -972,12 +1025,18 @@ void EmitColumnMajorGemv(PrimitiveType scalar_type, int64 tile_rows,
       /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols,
       /*m=*/m, /*k=*/k, /*has_addend=*/addend != nullptr);
 
+  GemvBuffersWithCanonicalType canonical_inputs =
+      GetGemvBuffersWithCanonicalType(lhs, rhs, addend, result, b);
+
   KernelSupportLibrary::EmitAndCallOutlinedKernel(
       /*enable_fast_math=*/enable_fast_math,
-      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(), lhs,
-      rhs, addend, result,
-      [&](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* addend,
-          llvm::Value* result) {
+      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(),
+      canonical_inputs.lhs_canonicalized, canonical_inputs.rhs_canonicalized,
+      canonical_inputs.addend_canonicalized,
+      canonical_inputs.result_canonicalized,
+      [&config, b, &canonical_inputs](llvm::Value* lhs, llvm::Value* rhs,
+                                      llvm::Value* addend,
+                                      llvm::Value* result) {
         ColumnMajorMatrixVectorProductEmitter emitter(config, lhs, rhs, addend,
                                                       result, b);
         emitter.Emit();
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index c54e954c222ff7ca9c0739ec8a55b9d79b74a437..af039776b7f157a407f8f4cf3d7cabdbee45b014 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -105,6 +105,7 @@ class DfsHloVisitorBase {
   }
   virtual Status HandleConvolution(HloInstructionPtr hlo) = 0;
   virtual Status HandleFft(HloInstructionPtr fft) = 0;
+  virtual Status HandleTriangularSolve(HloInstructionPtr hlo) = 0;
   virtual Status HandleAllReduce(HloInstructionPtr hlo) = 0;
   virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0;
   virtual Status HandleCollectivePermute(HloInstructionPtr hlo) = 0;
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 33b2cc3fb098ec0d92f68756526fcc4a761d7149..341bb37b8355e9987a0331d0a66bb8fe87f019cf 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -91,6 +91,9 @@ class DfsHloVisitorWithDefaultBase
   Status HandleFft(HloInstructionPtr fft) override {
     return DefaultAction(fft);
   }
+  Status HandleTriangularSolve(HloInstructionPtr hlo) override {
+    return DefaultAction(hlo);
+  }
   Status HandleAllReduce(HloInstructionPtr crs) override {
     return DefaultAction(crs);
   }
diff --git a/tensorflow/compiler/xla/service/dot_decomposer.cc b/tensorflow/compiler/xla/service/dot_decomposer.cc
index e8bc6d05716a2ef02e0280e86c7df4ac22fe78c4..559b9c1f2c9f341293ca89adc61e3312fd9f313c 100644
--- a/tensorflow/compiler/xla/service/dot_decomposer.cc
+++ b/tensorflow/compiler/xla/service/dot_decomposer.cc
@@ -297,7 +297,7 @@ StatusOr<bool> DotDecomposer::Run(HloModule* module) {
       const DotDimensionNumbers& dnums = instruction->dot_dimension_numbers();
       // A dot it not canonical if there are more than one contracting
       // dimension.
-      if (dnums.lhs_contracting_dimensions_size() > 1) {
+      if (dnums.lhs_contracting_dimensions_size() != 1) {
         non_canonical_dots.push_back(instruction);
         continue;
       }
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 727e0bfa52d45b6f8c67d7d04613e4865f18a53c..808929be75ec6fd0cfb15418a231431b8d51e089 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -440,14 +440,16 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
                                           {operand_value},
                                           {operand_value->getType()}, b_);
     case HloOpcode::kSign: {
-      // TODO(b/32151903): Ensure consistent sign behavior for -0.0.
       auto type = operand_value->getType();
       auto zero = llvm::ConstantFP::get(type, 0.0);
-      auto oeq = FCmpOEQ(operand_value, zero);
-      auto olt = FCmpOLT(operand_value, zero);
-      return Select(oeq, zero,
-                    Select(olt, llvm::ConstantFP::get(type, -1.0),
-                           llvm::ConstantFP::get(type, 1.0)));
+      auto ne0_i1 = FCmpONE(operand_value, zero);
+      auto ne0_float = UIToFP(ne0_i1, type);
+      llvm::Value* result = llvm_ir::EmitCallToIntrinsic(
+          llvm::Intrinsic::copysign, {ne0_float, operand_value},
+          {operand_value->getType()}, b_);
+      auto is_nan = FCmpUNO(operand_value, operand_value);
+      result = Select(is_nan, operand_value, result);
+      return result;
     }
     case HloOpcode::kIsFinite: {
       // abs(x) o!= inf, this works because the comparison returns false if
@@ -855,6 +857,9 @@ llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value,
   return llvm_ir::EmitFloatMin(lhs_value, rhs_value, b_);
 }
 
+// TODO(b/123355973): We have an implementation of erfinv in math.cc.  We
+// shouldn't have two implementations, especially since this one isn't testable
+// (it's only observable via a normally-distributed RNG).
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type,
                                                       llvm::Value* x) {
   if (prim_type != F16 && prim_type != F32 && prim_type != F64) {
@@ -1362,26 +1367,69 @@ StatusOr<llvm::Value*> ElementalIrEmitter::ConvertValueForDistribution(
       llvm_ir::PrimitiveTypeToIrType(elem_prim_ty, module_);
   llvm::Type* raw_value_ty = raw_value->getType();
 
-  // Convert raw integer to float in range [0, 1) if the element is a float.
+  // If we're generating a floating-point value, convert the raw integer R (i.e.
+  // `raw_value`) to a float in the range [0, 1).
+  //
+  // The basic approach is to choose a significand and exponent such that the
+  // significand is uniformly distributed and the exponent is distributed, well,
+  // exponentially (it's more likely to be close to 0 than far from 0).
+  //
+  // An easy way to do this is to say that the significand is the first S bits
+  // of R, and the exponent is determined by the number of trailing zeroes in R,
+  // exp = 2^-(cttz(R) + 1).  (+1 because the largest exponent should be -1;
+  // this way the largest value we can return is 1.999... * 2^-1 = 1-ε.)
+  //
+  // This results in a small bias.  Namely, if R has enough trailing zeroes, the
+  // significand and exponent will "overlap".  As a concrete example, consider
+  //
+  //         20 X's                 12 zeroes
+  //   R = 0bXXXXXXXXXXXXXXXXXXXX000000000000
+  //
+  // Here the exponent is 2^-13 because R has 12 trailing zeroes.  The
+  // significand is made up of the first 23 most-significant bits of R, which we
+  // observe contain 3 zeroes.  This is biased because any random value with
+  // exponent 2^-12 will have a significand which ends in `000`.
+  //
+  // For f32s, this problem occurs only when there are more than 32-23 = 9
+  // trailing zeros, which happens with probability 0.5^10 = ~0.1%. Moreover the
+  // probability of a large bias (i.e. many trailing 0s in the significand) is
+  // exponentially low.  So we deem this acceptable.
   llvm::Value* elem_value = raw_value;
   if (elem_ir_ty->isFloatingPointTy()) {
-    unsigned raw_value_size_in_bits = raw_value_ty->getPrimitiveSizeInBits();
-    CHECK(raw_value_size_in_bits == 32 || raw_value_size_in_bits == 64);
-    // Perform the division using the float type with the same number of bits
-    // as the raw value to avoid overflow.
-    if (raw_value_size_in_bits == 32) {
-      elem_value = UIToFP(elem_value, b_->getFloatTy());
-      elem_value = FDiv(elem_value,
-                        llvm::ConstantFP::get(b_->getFloatTy(), std::exp2(32)));
-    } else {
-      elem_value = UIToFP(elem_value, b_->getDoubleTy());
-      elem_value = FDiv(
-          elem_value, llvm::ConstantFP::get(b_->getDoubleTy(), std::exp2(64)));
-    }
-
-    if (elem_ir_ty != elem_value->getType()) {
-      elem_value = FPTrunc(elem_value, elem_ir_ty);
-    }
+    const auto& dest_flt_semantics = elem_ir_ty->getFltSemantics();
+    const int bits = raw_value_ty->getPrimitiveSizeInBits();
+    CHECK_GE(bits, llvm::APFloat::semanticsSizeInBits(dest_flt_semantics));
+
+    // Subtract 1 because semanticsPrecision includes the "hidden bit", i.e. the
+    // implicit "1." at the beginning of the significand.
+    const int significand_bits =
+        llvm::APFloat::semanticsPrecision(dest_flt_semantics) - 1;
+
+    llvm::Value* cttz = llvm_ir::EmitCallToIntrinsic(
+        llvm::Intrinsic::cttz, {raw_value, /*is_zero_undef=*/b_->getFalse()},
+        {raw_value->getType()}, b_);
+    llvm::Value* significand = LShr(raw_value, bits - significand_bits);
+
+    // Exponent bias is -127 for f32, meaning that if the exponent is E and the
+    // significand is S, then the value of the number is 2^(E - 127) * (1.S).
+    //
+    // We want cttz == 0 to correspond to 2^-1, so our exponent is computed as
+    // E = 126 - cttz.
+    //
+    // For f64, this is all the same, except the bias is -1023.
+    //
+    // In IEEE floating point, the absolute value of the exponent bias equals
+    // the value of the largest possible exponent.
+    const int bias = -llvm::APFloat::semanticsMaxExponent(dest_flt_semantics);
+    llvm::Value* exponent =
+        Sub(llvm::ConstantInt::get(cttz->getType(), -bias - 1), cttz);
+
+    // Now just slot everything into place!  The `Trunc` is here because
+    // raw_value may be larger than our float destination.
+    elem_value =
+        BitCast(Trunc(Or(Shl(exponent, significand_bits), significand),
+                      b_->getIntNTy(elem_ir_ty->getPrimitiveSizeInBits())),
+                elem_ir_ty);
   }
 
   // Convert the value for the requested distribution.
@@ -1767,18 +1815,10 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
     auto index_typed_const = [&](uint64 c) -> llvm::Constant* {
       return llvm::ConstantInt::get(index_type, c);
     };
-    // TODO(b/118437727): Remove the R1 path.
-    llvm::Value* start_index_value;
-    if (hlo->operand(1)->shape().rank() == 1) {
-      llvm_ir::IrArray::Index dim_index(1, index_typed_const(i));
-      TF_ASSIGN_OR_RETURN(start_index_value,
-                          operand_to_generator.at(hlo->operand(1))(dim_index));
-    } else {
-      llvm_ir::IrArray::Index zero_index(index_type);
-      TF_ASSIGN_OR_RETURN(
-          start_index_value,
-          operand_to_generator.at(hlo->operand(1 + i))(zero_index));
-    }
+    llvm_ir::IrArray::Index zero_index(index_type);
+    TF_ASSIGN_OR_RETURN(
+        llvm::Value * start_index_value,
+        operand_to_generator.at(hlo->operand(1 + i))(zero_index));
 
     // Clamp the start index so that the sliced portion fits in the operand:
     // start_index = clamp(start_index, 0, operand_dim_size - output_dim_size)
@@ -1924,18 +1964,10 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
       return llvm::ConstantInt::get(index_type, c);
     };
 
-    llvm::Value* start_index_value;
-    // TODO(b/118437727): Remove the R1 path.
-    if (hlo->operand(2)->shape().rank() == 1) {
-      llvm_ir::IrArray::Index dim_index(1, index_typed_const(i));
-      TF_ASSIGN_OR_RETURN(start_index_value,
-                          operand_to_generator.at(hlo->operand(2))(dim_index));
-    } else {
-      llvm_ir::IrArray::Index zero_index(index_type);
-      TF_ASSIGN_OR_RETURN(
-          start_index_value,
-          operand_to_generator.at(hlo->operand(2 + i))(zero_index));
-    }
+    llvm_ir::IrArray::Index zero_index(index_type);
+    TF_ASSIGN_OR_RETURN(
+        llvm::Value * start_index_value,
+        operand_to_generator.at(hlo->operand(2 + i))(zero_index));
 
     // Clamp the start index so that the update region fits in the operand.
     // start_index = clamp(start_index, 0, input_dim_size - update_dim_size)
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index d3e2acaabd4f602171def70ccd3d4fd5adce0d0d..7d360fe38cfeda17878c363253c41883ec9fd64f 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -216,8 +216,11 @@ class ElementalIrEmitter : public IrBuilderMixin<ElementalIrEmitter> {
   llvm_ir::ElementGenerator MakePhiloxRngElementGenerator(
       const HloInstruction* hlo,
       const HloToElementGeneratorMap& operand_to_generator);
+
   // Converts the raw value generated by a random number generation algorithm
   // to the distribution requested by the RNG HloInstruction.
+  //
+  // Precondition: raw_value has at least as many bits as hlo's element type.
   StatusOr<llvm::Value*> ConvertValueForDistribution(
       const HloInstruction* hlo,
       const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 10b8c01ff1383658fcfb2271c177ba54347f985a..1518d83083b3b0ce876da9344c483a23cd5b073c 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/platform/env.h"
 
-
 namespace xla {
 
 StatusOr<std::vector<ScopedShapedBuffer>> Executable::ExecuteOnStreams(
@@ -173,11 +172,13 @@ Status Executable::DumpHloSnapshot() {
   }
   filename = SanitizeFileName(std::move(filename));
   string file_path = tensorflow::io::JoinPath(directory_path, filename);
-  string result;
-  TF_RET_CHECK(
-      tensorflow::SerializeToStringDeterministic(hlo_session, &result));
-  return tensorflow::WriteStringToFile(tensorflow::Env::Default(), file_path,
-                                       result);
+  const size_t size = hlo_session.ByteSizeLong();
+  auto serialized = absl::make_unique<char[]>(size);
+  TF_RET_CHECK(tensorflow::SerializeToBufferDeterministic(
+      hlo_session, serialized.get(), size));
+  return tensorflow::WriteStringToFile(
+      tensorflow::Env::Default(), file_path,
+      absl::string_view(serialized.get(), size));
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc
index 590942cddcdd138981ee829f090ae17b0d038e1a..a58ac39dffad56315308f784b08e6b6087b8e30a 100644
--- a/tensorflow/compiler/xla/service/gather_expander.cc
+++ b/tensorflow/compiler/xla/service/gather_expander.cc
@@ -296,7 +296,7 @@ static StatusOr<HloInstruction*> PermuteBatchAndOffsetDims(
 // [3,1] out of operand into an accumulator of shape [4,3,1].  We then
 // reshape this result to [2,2,3] and finally transpose it to [2,3,2].
 
-StatusOr<HloInstruction*> GatherExpander::ExpandGather(
+StatusOr<HloInstruction*> GatherExpander::ExpandInstruction(
     HloInstruction* gather_instr) {
   CHECK(!ShapeUtil::IsZeroElementArray(gather_instr->shape()));
 
@@ -361,25 +361,11 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
                                    output_rank);
 }
 
-StatusOr<bool> GatherExpander::Run(HloModule* module) {
-  auto is_nontrivial_gather = [](HloInstruction* inst) {
-    return inst->opcode() == HloOpcode::kGather &&
-           // Avoid expanding gather ops that produce zero sized tensors,
-           // instead punt these to ZeroSizedHloElimination.
-           !ShapeUtil::IsZeroElementArray(inst->shape());
-  };
-
-  std::vector<HloInstruction*> gather_instrs;
-  for (HloComputation* computation : module->MakeNonfusionComputations()) {
-    absl::c_copy_if(computation->instructions(),
-                    std::back_inserter(gather_instrs), is_nontrivial_gather);
-  }
-
-  for (HloInstruction* inst : gather_instrs) {
-    TF_ASSIGN_OR_RETURN(HloInstruction * expanded_root, ExpandGather(inst));
-    TF_RETURN_IF_ERROR(inst->parent()->ReplaceInstruction(inst, expanded_root));
-  }
-
-  return !gather_instrs.empty();
+bool GatherExpander::InstructionMatchesPattern(HloInstruction* inst) {
+  return inst->opcode() == HloOpcode::kGather &&
+         // Avoid expanding gather ops that produce zero sized tensors,
+         // instead punt these to ZeroSizedHloElimination.
+         !ShapeUtil::IsZeroElementArray(inst->shape());
 }
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gather_expander.h b/tensorflow/compiler/xla/service/gather_expander.h
index 8af9c6b71fbc391bf7c0e9809e979b65135a6df3..5625a37cb46ca5b70f69d86bc424f6512bfb293f 100644
--- a/tensorflow/compiler/xla/service/gather_expander.h
+++ b/tensorflow/compiler/xla/service/gather_expander.h
@@ -16,20 +16,22 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GATHER_EXPANDER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GATHER_EXPANDER_H_
 
-#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
 
 namespace xla {
 
 // This pass rewrites gather operations into (roughly) while loops of dynamic
 // slices.  This lets backends that don't support gather directly to
 // nevertheless have a minimum level of support.
-class GatherExpander : public HloModulePass {
+class GatherExpander : public OpExpanderPass {
  public:
   absl::string_view name() const override { return "gather_expander"; }
-  StatusOr<bool> Run(HloModule* module) override;
 
  protected:
-  StatusOr<HloInstruction*> ExpandGather(HloInstruction* gather_instr);
+  bool InstructionMatchesPattern(HloInstruction* instruction) override;
+
+  StatusOr<HloInstruction*> ExpandInstruction(
+      HloInstruction* gather_inst) override;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index 7d450f4b53cdea209f2ef10ba785be6ec3b8bf8d..cb43c27be961262bf29d4a3958de62cfada19aed 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 85fb2dd47abdad7073bf15a2f8b974a3ae0f01e4..25c4f70d89b4ebc483a61f1e28c7a55eb31f4bdf 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -305,6 +305,7 @@ cc_library(
         "sequential_thunk.cc",
         "thunk.cc",
         "thunk_schedule.cc",
+        "triangular_solve_thunk.cc",
         "tuple_thunk.cc",
         "while_thunk.cc",
     ],
@@ -324,6 +325,7 @@ cc_library(
         "sequential_thunk.h",
         "thunk.h",
         "thunk_schedule.h",
+        "triangular_solve_thunk.h",
         "tuple_thunk.h",
         "while_thunk.h",
     ],
@@ -364,6 +366,8 @@ cc_library(
         "//tensorflow/core/platform/default/build_config:cufft_plugin",
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",  # build_cleaner: keep
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:blas",
+        "//tensorflow/stream_executor:device_memory",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
@@ -396,18 +400,21 @@ cc_library(
     srcs = ["cudnn_conv_algorithm_picker.cc"],
     hdrs = ["cudnn_conv_algorithm_picker.h"],
     deps = [
+        ":autotuning_proto",
         ":backend_configs",
         ":buffer_comparator",
         ":cudnn_conv_runner",
         ":gpu_executable",
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:protobuf_util",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
+        "//tensorflow/core:logger",
         "//tensorflow/core:stream_executor_no_cuda",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -758,6 +765,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
         "//tensorflow/compiler/xla/service:sort_simplifier",
+        "//tensorflow/compiler/xla/service:stable_sort_expander",
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
         "//tensorflow/compiler/xla/service:while_loop_constant_sinking",
@@ -1081,3 +1089,12 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
     ],
 )
+
+xla_proto_library(
+    name = "autotuning_proto",
+    srcs = ["autotuning.proto"],
+    deps = [
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_proto",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/gpu/autotuning.proto b/tensorflow/compiler/xla/service/gpu/autotuning.proto
new file mode 100644
index 0000000000000000000000000000000000000000..b4a08963b4f2ebc55c89ed57325093536f343bd1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/autotuning.proto
@@ -0,0 +1,81 @@
+// This file defines protos that store the results of autotuning XLA:GPU
+// operations.
+//
+// They are in proto format because we want to log them structured. They offer
+// tremendous statistical, testing, and debugging value.
+syntax = "proto3";
+
+package xla.gpu;
+
+import "google/protobuf/duration.proto";
+import "tensorflow/compiler/xla/xla_data.proto";
+import "tensorflow/compiler/xla/service/hlo.proto";
+
+message CudnnVersion {
+  int32 major = 1;
+  int32 minor = 2;
+  int32 patch = 3;
+}
+
+message ComputeCapability {
+  int32 major = 1;
+  int32 minor = 2;
+}
+
+message AutotuneResult {
+  message SuccessResult {
+    int64 scratch_bytes = 1;
+    google.protobuf.Duration run_time = 2;
+  }
+
+  message ConvKey {
+    int64 algorithm = 1;
+    bool tensor_ops_enabled = 2;
+  }
+
+  // If the conv runs successfully, success will be populated with the
+  // autotuning result. Otherwise, the error message is propagated.
+  oneof result {
+    SuccessResult success = 3;
+    string error_string = 4;
+  }
+
+  oneof key {
+    ConvKey conv = 5;
+  }
+
+  // Sometimes we run a correctness checker during autotuning. It compares the
+  // result buffer content between two algorithms, say, "reference" and "test"
+  // algorithms. The "test" algorithm is the one associated with this
+  // AutotuneResult.
+  //
+  // This field records the reference algorithm used. Notice that naming it
+  // "reference" doesn't mean it's always correct. However, empirically it's
+  // more correct, as it's "algo 0", less fancy than the compared one.
+  //
+  // Notice that the checker_failure may exist even in the success case.
+  // This is because the error string in `result` comes from the underlying
+  // implementation like cuDNN, which isn't aware that it produced an incorrect
+  // result. And even if the checker detects an incorrect result, we can still
+  // retrieve scratch_bytes and runtime_ms.
+  oneof checker_failure {
+    ConvKey reference_conv = 6;
+  }
+}
+
+message AutotuneLog {
+  message Instruction {
+    xla.HloInstructionProto instruction = 1;
+    repeated xla.ShapeProto operand_shapes = 2;
+  }
+
+  oneof instr_oneof {
+    Instruction instr = 1;
+  }
+
+  // Records all auto-tuning results per algorithm.
+  repeated AutotuneResult results = 3;
+
+  CudnnVersion cudnn_version = 4;
+  ComputeCapability compute_capability = 5;
+}
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
index 309b0aca64954e64509d731dce28ce9d8da4ee43..603af5a654589e0b02c762b57d70a8b7628b1d0f 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
@@ -16,14 +16,17 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
+#include "absl/time/time.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/platform/logger.h"
 #include "tensorflow/core/platform/mutex.h"
 
 namespace xla {
@@ -32,7 +35,6 @@ namespace {
 
 using absl::optional;
 using se::DeviceMemoryBase;
-using se::dnn::AlgorithmConfig;
 using se::dnn::AlgorithmDesc;
 
 class ScratchAllocator : public se::ScratchAllocator {
@@ -132,6 +134,31 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
   return tensorflow::mutex_lock{it->second};
 }
 
+xla::gpu::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
+  xla::gpu::CudnnVersion cudnn_version;
+  if (auto* dnn = stream_executor->AsDnn()) {
+    StatusOr<se::dnn::VersionInfo> version_or = dnn->GetVersion();
+    if (version_or.ok()) {
+      const auto& version = version_or.ValueOrDie();
+      cudnn_version.set_major(version.major_version());
+      cudnn_version.set_minor(version.minor_version());
+      cudnn_version.set_patch(version.patch());
+    }
+  }
+  return cudnn_version;
+}
+
+xla::gpu::ComputeCapability GetComputeCapability(
+    se::StreamExecutor* stream_executor) {
+  xla::gpu::ComputeCapability cc;
+  int cc_major, cc_minor;
+  stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
+                                                                  &cc_minor);
+  cc.set_major(cc_major);
+  cc.set_minor(cc_minor);
+  return cc;
+}
+
 }  // anonymous namespace
 
 // We could have caching here so that we don't redo this work for two identical
@@ -145,8 +172,7 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
 // cache misses and doing extra work.  Overall, caching doesn't seem worth the
 // trouble, but we may want to revisit this if we ever find a model where
 // caching would speed up compilation a lot.
-StatusOr<CudnnConvAlgorithmPicker::AutotuneResult>
-CudnnConvAlgorithmPicker::PickBestAlgorithm(
+StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm(
     const HloCustomCallInstruction* instr) {
   // TODO(timshen): for now only check fp16. It can be expanded to other types,
   // with some work on the HLO routines.
@@ -233,8 +259,6 @@ CudnnConvAlgorithmPicker::PickBestAlgorithm(
           &stream, ShapeUtil::ByteSizeOf(instr->shape().tuple_shapes(0))));
   initialize_buffer(result_buffer);
 
-  se::dnn::ProfileResult best_result;
-  int64 best_result_bytes_used = 0;
   TF_ASSIGN_OR_RETURN(auto backend_config,
                       instr->backend_config<CudnnConvBackendConfig>());
 
@@ -244,6 +268,7 @@ CudnnConvAlgorithmPicker::PickBestAlgorithm(
   // this algorithm considered correct, though.
   optional<AlgorithmDesc> first_algorithm;
   TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(instr));
+  std::vector<AutotuneResult> profile_results;
   for (const AlgorithmDesc& alg : GetAlgorithms(kind, stream_exec_)) {
     ScratchAllocator scratch_allocator(device_ordinal, allocator);
     se::dnn::ProfileResult profile_result;
@@ -254,73 +279,108 @@ CudnnConvAlgorithmPicker::PickBestAlgorithm(
     RunConvOptions options;
     options.profile_result = &profile_result;
     options.algo_override = alg;
-    bool launch_ok =
+    Status launch_status =
         RunCudnnConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
-                     &scratch_allocator, &stream, options)
-            .ok();
-
-    if (launch_ok && profile_result.is_valid()) {
-      const bool crash_on_checking_failure =
-          instr->GetModule()
-              ->config()
-              .debug_options()
-              .xla_gpu_crash_on_verification_failures();
-      if (comparator.has_value()) {
-        StatusOr<bool> result = comparator->CompareEqual(
-            se::DeviceMemory<Eigen::half>(result_buffer));
-        if (!result.ok()) {
-          LOG(ERROR) << "Unable to compare "
-                     << AlgorithmToString(*first_algorithm) << " against "
-                     << AlgorithmToString(alg) << " for " << instr->ToString()
-                     << ": " << result.status();
-          CHECK(!crash_on_checking_failure);
-        } else if (!result.ValueOrDie()) {
-          LOG(ERROR) << "Results mismatch between different convolution "
-                        "algorithms. This is likely a bug in convolution, or "
-                        "an excessive loss of precision in convolution. "
-                     << instr->ToString() << " for "
-                     << AlgorithmToString(*first_algorithm) << " vs "
-                     << AlgorithmToString(alg);
-          CHECK(!crash_on_checking_failure);
-        }
-      } else if (cross_check_enabled) {
-        auto comp = F16BufferComparator::Create(
-            se::DeviceMemory<Eigen::half>(result_buffer), compiler_, allocator,
-            &stream);
-        if (comp.ok()) {
-          comparator.emplace(comp.ConsumeValueOrDie());
-          first_algorithm.emplace(alg);
-        } else {
-          LOG(ERROR) << "Fail to initialize buffer comparator: "
-                     << comp.status() << ", instruction: " << instr->ToString();
-          CHECK(!crash_on_checking_failure);
-        }
+                     &scratch_allocator, &stream, options);
+
+    profile_results.emplace_back();
+    AutotuneResult& result = profile_results.back();
+    result.mutable_conv()->set_algorithm(alg.algo_id());
+    result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled());
+
+    if (!launch_status.ok()) {
+      result.set_error_string(launch_status.error_message());
+      continue;
+    }
+
+    if (!profile_result.is_valid()) {
+      result.set_error_string("Invalid profile result");
+      continue;
+    }
+
+    int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
+    result.mutable_success()->set_scratch_bytes(scratch_bytes_used);
+    *result.mutable_success()->mutable_run_time() =
+        protobuf_util::ToDurationProto(
+            absl::Milliseconds(profile_result.elapsed_time_in_ms()));
+
+    const bool crash_on_checking_failure =
+        instr->GetModule()
+            ->config()
+            .debug_options()
+            .xla_gpu_crash_on_verification_failures();
+
+    if (comparator.has_value()) {
+      StatusOr<bool> compare_result = comparator->CompareEqual(
+          se::DeviceMemory<Eigen::half>(result_buffer));
+      if (!compare_result.ok()) {
+        LOG(ERROR) << "Unable to compare "
+                   << AlgorithmToString(*first_algorithm) << " against "
+                   << AlgorithmToString(alg) << " for " << instr->ToString()
+                   << ": " << compare_result.status();
+        CHECK(!crash_on_checking_failure);
+      } else if (!compare_result.ValueOrDie()) {
+        LOG(ERROR) << "Results mismatch between different convolution "
+                      "algorithms. This is likely a bug in convolution, or "
+                      "an excessive loss of precision in convolution. "
+                   << instr->ToString() << " for "
+                   << AlgorithmToString(*first_algorithm) << " vs "
+                   << AlgorithmToString(alg);
+        CHECK(!crash_on_checking_failure);
+        auto* failure = result.mutable_reference_conv();
+        failure->set_algorithm(first_algorithm->algo_id());
+        failure->set_tensor_ops_enabled(first_algorithm->tensor_ops_enabled());
       }
-      int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
-      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg)
-              << " succeeded, taking " << profile_result.elapsed_time_in_ms()
-              << "ms and using " << NumBytesToString(scratch_bytes_used)
-              << " of scratch (Best result: "
-              << best_result.elapsed_time_in_ms() << "ms, "
-              << NumBytesToString(best_result_bytes_used) << " of scratch)";
-      if (profile_result.elapsed_time_in_ms() <
-          best_result.elapsed_time_in_ms()) {
-        best_result = profile_result;
-        best_result_bytes_used = scratch_bytes_used;
+    } else if (cross_check_enabled) {
+      auto comp = F16BufferComparator::Create(
+          se::DeviceMemory<Eigen::half>(result_buffer), compiler_, allocator,
+          &stream);
+      if (comp.ok()) {
+        comparator.emplace(comp.ConsumeValueOrDie());
+        first_algorithm.emplace(alg);
+      } else {
+        LOG(ERROR) << "Fail to initialize buffer comparator: " << comp.status()
+                   << ", instruction: " << instr->ToString();
+        CHECK(!crash_on_checking_failure);
       }
-    } else {
-      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg) << " failed.";
     }
   }
-  if (best_result.is_valid()) {
-    VLOG(2) << "Best algorithm for " << instr->ToString() << ": "
-            << AlgorithmToString(best_result.algorithm()) << ", takes "
-            << best_result.elapsed_time_in_ms() << "ms, and uses "
-            << best_result_bytes_used << "B of scratch memory.";
-    return AutotuneResult{best_result.algorithm().algo_id(),
-                          best_result.algorithm().tensor_ops_enabled(),
-                          best_result_bytes_used,
-                          absl::Milliseconds(best_result.elapsed_time_in_ms())};
+
+  // Log the autotuning result.
+  {
+    AutotuneLog log;
+    *log.mutable_instr()->mutable_instruction() = instr->ToProto();
+    for (const auto* op : instr->operands()) {
+      *log.mutable_instr()->add_operand_shapes() = op->shape().ToProto();
+    }
+    for (const auto& profile : profile_results) {
+      *log.add_results() = profile;
+    }
+    *log.mutable_compute_capability() = GetComputeCapability(stream_exec_);
+    *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec_);
+    VLOG(2) << "Autotuning result:\n" << log.DebugString();
+    tensorflow::Logger::Singleton()->LogProto(log);
+  }
+
+  auto* profile_results_end = profile_results.data() + profile_results.size();
+
+  const AutotuneResult* best_result = std::min_element(
+      profile_results.data(), profile_results_end,
+      [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
+        // The successful one should have a smaller key, since we are doing
+        // min_element. If they are both unsuccessful, keep the earlier one in
+        // the vector by comparing pointers.
+        return std::make_tuple(
+                   !lhs.has_success(),
+                   protobuf_util::FromDurationProto(lhs.success().run_time()),
+                   &lhs) < std::make_tuple(!rhs.has_success(),
+                                           protobuf_util::FromDurationProto(
+                                               rhs.success().run_time()),
+                                           &rhs);
+      });
+
+  if (best_result != profile_results_end && best_result->has_success()) {
+    return *best_result;
   }
 
   return InternalError(
@@ -341,22 +401,23 @@ StatusOr<bool> CudnnConvAlgorithmPicker::RunOnInstruction(
   }
 
   auto best_algo = std::move(best_algo_or).ValueOrDie();
-  VLOG(1) << "Setting cudnn conv to use algorithm " << best_algo.algorithm
-          << " and " << NumBytesToString(best_algo.scratch_bytes)
+  VLOG(1) << "Setting cudnn conv to use algorithm "
+          << best_algo.conv().algorithm() << " and "
+          << NumBytesToString(best_algo.success().scratch_bytes())
           << " of scratch memory: " << instr->ToString()
-          << " tensor_ops_enabled: " << best_algo.tensor_ops_enabled;
+          << " tensor_ops_enabled: " << best_algo.conv().tensor_ops_enabled();
 
   // Replace instr with a new CustomCall which has the correct algorithm, and
   // whose output shape has the appropriate amount of scratch memory.
   HloComputation* computation = instr->parent();
   Shape new_call_shape = ShapeUtil::MakeTupleShape(
       {instr->shape().tuple_shapes(0),
-       ShapeUtil::MakeShape(U8, {best_algo.scratch_bytes})});
+       ShapeUtil::MakeShape(U8, {best_algo.success().scratch_bytes()})});
 
   TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config,
                       instr->backend_config<CudnnConvBackendConfig>());
-  backend_config.set_algorithm(best_algo.algorithm);
-  backend_config.set_tensor_ops_enabled(best_algo.tensor_ops_enabled);
+  backend_config.set_algorithm(best_algo.conv().algorithm());
+  backend_config.set_tensor_ops_enabled(best_algo.conv().tensor_ops_enabled());
 
   HloInstruction* new_call = computation->AddInstruction(
       instr->CloneWithNewOperands(new_call_shape, instr->operands()));
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
index 4991db0948589e479a202f4082d96df275f6e088..2e34ba9672314a62290b8a557960a605a98996c7 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/gpu/autotuning.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -47,13 +48,6 @@ class CudnnConvAlgorithmPicker : public HloModulePass {
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
-  struct AutotuneResult {
-    int64 algorithm;
-    bool tensor_ops_enabled;
-    int64 scratch_bytes;
-    absl::Duration runtime;
-  };
-
   StatusOr<bool> RunOnComputation(HloComputation* computation);
   StatusOr<bool> RunOnInstruction(HloInstruction* instr);
   StatusOr<AutotuneResult> PickBestAlgorithm(
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index ffd4214958275dc79bbcb060328893f8b68c737a..80ddb3e5cde708f535f71b75587171ed975f939c 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -448,7 +448,7 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
         return Load(accum_ptr);
       };
     case HloOpcode::kReduce:
-      // TODO(b/112040122): This should be supported.
+      // TODO(b/118332391): This should be supported.
       CHECK_EQ(hlo->operand_count(), 2) << "Did not expect variadic reduce";
       return [=, &operand_to_generator](
                  const IrArray::Index& output_index) -> StatusOr<llvm::Value*> {
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
index 86c9bc6a345047fb5329af0be45c8981cc427f50..a7053e6a013be3ccf5725cbe003558be77104af1 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
@@ -428,7 +428,8 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
         scratch_data = scratch_mem->device_memory();
       }
       const MatrixDescriptor scratch_descriptor(
-          scratch_data, false, output_num_cols, output_num_rows, batch_size);
+          scratch_data, false, output_matrix.num_rows, output_matrix.num_cols,
+          batch_size);
 
       StatusOr<se::blas::AlgorithmType> best_algorithm = GetGemmAutotuneFn(
           element_type)(lhs_matrix, rhs_matrix, scratch_descriptor, alpha_,
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
index e9d7ba1c4cfa865532a0d06c2ed883a2fea4e2cd..9f0de3f794decb7b878b67c96030f8e11b0555fe 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
@@ -48,7 +48,7 @@ bool IsInputFusibleReduction(const HloInstruction& instr);
 
 // Whether instruction shapes are compatible for multi-output fusion, i.e.
 // whether the emitters support lowering the resulting fusion.
-// This function works for both, sibling and producer-conumser multi-output
+// This function works for both, sibling and producer-consumer multi-output
 // fusion.
 // So far, multi-output fusion is supported for loop fusions and reduce
 // input fusions only. It is up to the caller to ensure the instructions
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc
index 4268fb2c7a813b3b53e4cd48746028a7b369f28e..4765f67c4b17e97419182e341573f75ad3d6ac30 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
index 58bdd4209a2315cdb7d29e920faded4d1a6a5876..a6d80f0b6dddb3d8d0fd00c639e11c71da6a9f09 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -240,6 +240,32 @@ Status GpuLayoutAssignment::AddBackendConstraints(
         TF_RETURN_IF_ERROR(
             constraints->SetBufferLayout(keys_layout, *output_buffer));
       }
+    } else if (instruction->opcode() == HloOpcode::kTriangularSolve) {
+      // TODO(phawkins): Ideally we would relax this constraint. What we
+      // actually want is that:
+      // a) the batch dimensions are major, in no particular order.
+      // b) the two minor dimensions are in fortran (column-major) order,
+      // although for the 'a' argument we could potentially accept row-major
+      // order and fold the transpose into the operator.
+      auto set_fortran_layout = [](Shape* shape) {
+        LayoutUtil::SetToDefaultLayout(shape);
+        int n = shape->mutable_layout()->minor_to_major_size();
+        CHECK_GE(n, 2);
+        std::swap(shape->mutable_layout()->mutable_minor_to_major()->at(0),
+                  shape->mutable_layout()->mutable_minor_to_major()->at(1));
+      };
+      Shape op0_shape = instruction->operand(0)->shape();
+      Shape op1_shape = instruction->operand(1)->shape();
+      Shape output_shape = instruction->shape();
+      set_fortran_layout(&op0_shape);
+      set_fortran_layout(&op1_shape);
+      set_fortran_layout(&output_shape);
+      TF_RETURN_IF_ERROR(
+          constraints->SetOperandLayout(op0_shape, instruction, 0));
+      TF_RETURN_IF_ERROR(
+          constraints->SetOperandLayout(op1_shape, instruction, 1));
+      TF_RETURN_IF_ERROR(
+          constraints->SetInstructionLayout(output_shape, instruction));
     }
   }
   return Status::OK();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
index 29756d27260b0f41b2dd4b649ea9b1610ff90268..391029e574622925b2a7e801a7d41d95e49a1cfb 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -368,12 +368,21 @@ TEST_F(LayoutAssignmentTest, DotLayout) {
 TEST_F(LayoutAssignmentTest, SortLayout) {
   const char* hlo_text = R"(
   HloModule SortLayout
+
+  compare {
+    p.0.lhs = f32[] parameter(0)
+    p.0.rhs = f32[] parameter(1)
+    p.1.lhs = f32[] parameter(2)
+    p.1.rhs = f32[] parameter(3)
+    ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+  }
+
   ENTRY sort {
     keys = f32[3,2]{0,1} constant({{0,1},{0,1},{0,1}})
     values = f32[2,3]{1,0} parameter(0)
     transpose = f32[3,2]{1,0} transpose(values), dimensions={1,0}
     ROOT sort = (f32[3,2]{1,0}, f32[3,2]{1,0}) sort(keys, transpose),
-      dimensions={1}
+      dimensions={1}, to_apply=compare
   })";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index 8c6a6914792a96ab517fa5f20ff2215e4785490e..e593f535642e15f28a4a1c1f321881ba3c694548 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 82bdd677d96d3d0826bb4127b32d074eb632b1a3..3ed6553f9205803cfa17772b890c449cfb457c89 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -20,7 +20,6 @@ limitations under the License.
 
 #include "llvm/IR/Module.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 0007a9a8a3369d8ac010640127e1561615a6d813..8f010ab27a6c99b97e7808218de908ce558b0fe7 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -492,8 +492,11 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
       result = llvm::ConstantAggregateZero::get(lhs_array.GetElementLlvmType());
       result = InsertValue(result, value.first, {0});
       result = InsertValue(result, value.second, {1});
-    } else {
+    } else if (ShapeUtil::ElementIsFloating(lhs_shape)) {
       result = FMul(lhs_value, rhs_value);
+    } else {
+      TF_RET_CHECK(ShapeUtil::ElementIsIntegral(lhs_shape));
+      result = Mul(lhs_value, rhs_value);
     }
     target_array.EmitWriteArrayElement(/*index=*/element_index, result, &b_);
     return Status::OK();
@@ -583,9 +586,13 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
     llvm::Value* accum_imag = Imag(accum, &b_);
     llvm::Value* imag_sum = FAdd(accum_imag, value.second);
     updated_accum = InsertValue(updated_accum, imag_sum, {1});
-  } else {
+  } else if (ShapeUtil::ElementIsFloating(lhs_shape)) {
     llvm::Value* product = FMul(lhs_element, rhs_element);
     updated_accum = FAdd(accum, product);
+  } else {
+    TF_RET_CHECK(ShapeUtil::ElementIsIntegral(lhs_shape));
+    llvm::Value* product = Mul(lhs_element, rhs_element);
+    updated_accum = Add(accum, product);
   }
   Store(updated_accum, accum_address);
 
@@ -647,7 +654,7 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
 }
 
 Status IrEmitter::HandleReduce(HloInstruction* reduce) {
-  // TODO(b/112040122): Support variadic reduce.
+  // TODO(b/118332391): Support variadic reduce.
   if (!reduce->shape().IsArray()) {
     return Unimplemented("Variadic reduce is not supported on GPU");
   }
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 294a454931b5cfa368bf094c428a1e942f4556b8..0cc65ebb52737aa9bb8866eb07278a2319aa797b 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -38,7 +38,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
-#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
@@ -60,6 +59,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/tuple_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/while_thunk.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
@@ -487,6 +487,41 @@ Status IrEmitterUnnested::HandleFft(HloInstruction* fft) {
   return Status::OK();
 }
 
+Status IrEmitterUnnested::HandleTriangularSolve(HloInstruction* hlo) {
+  auto has_fortran_layout = [](const Layout& layout) {
+    int n = layout.minor_to_major_size();
+    return layout.minor_to_major(0) == n - 2 &&
+           layout.minor_to_major(1) == n - 1;
+  };
+  TF_RET_CHECK(has_fortran_layout(hlo->operand(0)->shape().layout()));
+  TF_RET_CHECK(has_fortran_layout(hlo->operand(1)->shape().layout()));
+  TF_RET_CHECK(has_fortran_layout(hlo->shape().layout()));
+
+  std::vector<std::unique_ptr<Thunk>> thunks;
+
+  // Triangular solve is in-place on 'b', so copy 'b' to the output if they
+  // aren't the same buffer.
+  auto operand_buffer = GetAllocationSlice(*hlo->operand(1));
+  auto destination_buffer = GetAllocationSlice(*hlo);
+  if (operand_buffer != destination_buffer) {
+    thunks.push_back(absl::make_unique<DeviceToDeviceCopyThunk>(
+        /*source_address=*/operand_buffer,
+        /*destination_buffer=*/destination_buffer,
+        /*mem_size=*/ShapeUtil::ByteSizeOf(hlo->operand(1)->shape()), hlo));
+  }
+
+  thunks.push_back(BuildTriangularSolveThunk(hlo));
+
+  // Elide the sequential thunk if there's no copy.
+  if (thunks.size() == 1) {
+    AddThunkToThunkSequence(std::move(thunks[0]));
+  } else {
+    AddThunkToThunkSequence(
+        absl::make_unique<SequentialThunk>(std::move(thunks), hlo));
+  }
+  return Status::OK();
+}
+
 Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
   HloInstruction* root = fusion->fused_expression_root();
   if (HloInstruction::FusionKind::kInput == fusion->fusion_kind()) {
@@ -546,7 +581,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
         // a 1D array. The specialized version requires a initializer thunk that
         // initializes the output array to the initial value of the reduce.
         if (root->opcode() == HloOpcode::kReduce && root->shape().IsTuple()) {
-          // TODO(b/112040122): Support variadic reduce.
+          // TODO(b/118332391): Support variadic reduce.
           return Unimplemented("Variadic reduce is not supported on GPU");
         }
         return EmitReductionToVector(fusion);
@@ -635,7 +670,7 @@ Status IrEmitterUnnested::EmitExtraOutputsForReduce(
 }
 
 Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
-  // TODO(b/112040122): Support multi-output reduce.
+  // TODO(b/118332391): Support multi-output reduce.
   if (!reduce->shape().IsArray()) {
     return Unimplemented("Multi-output reduce is not supported on GPU");
   }
@@ -959,18 +994,18 @@ Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) {
       BuildKernelThunk(scatter,
                        /*implements_whole_instruction=*/thunks.empty()));
 
-  TF_RETURN_IF_ERROR(
-      EmitScatter(thunks.back().get(), scatter,
-                  /*scatter_indices_gen=*/
-                  [=](const IrArray::Index& index) {
-                    return GetIrArray(*scatter_indices, *scatter)
-                        .EmitReadArrayElement(index, &b_, "scatter_index");
-                  },
-                  /*updates_gen=*/
-                  [=](const IrArray::Index& index) {
-                    return GetIrArray(*updates, *scatter)
-                        .EmitReadArrayElement(index, &b_, "update");
-                  }));
+  TF_RETURN_IF_ERROR(EmitScatter(
+      thunks.back().get(), scatter,
+      /*scatter_indices_gen=*/
+      [=](const IrArray::Index& index) {
+        return GetIrArray(*scatter_indices, *scatter)
+            .EmitReadArrayElement(index, &b_, "scatter_index");
+      },
+      /*updates_gen=*/
+      [=](const IrArray::Index& index) {
+        return GetIrArray(*updates, *scatter)
+            .EmitReadArrayElement(index, &b_, "update");
+      }));
 
   // Elide the sequential thunk if there's no copy.
   if (thunks.size() == 1) {
@@ -1118,17 +1153,7 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
   std::vector<std::unique_ptr<Thunk>> thunks;
   Shape keys_shape = sort->operand(0)->shape();
   int64 dimension_to_sort = sort->dimensions(0);
-  // In case there is a 'values' parameter that is a iota, we take note and use
-  // it later to ensure a stable sort. Otherwise, we don't guarantee a stable
-  // sort.
-  int64 iota_values_parameter_index = -1;
   for (int64 i = 0; i < sort->operand_count(); ++i) {
-    if (i > 0 && sort->operand(i)->opcode() == HloOpcode::kIota &&
-        ShapeUtil::ElementIsIntegral(sort->operand(i)->shape()) &&
-        Cast<HloIotaInstruction>(sort->operand(i))->iota_dimension() ==
-            dimension_to_sort) {
-      iota_values_parameter_index = i;
-    }
     ShapeIndex shape_index =
         sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({});
     // We assume that the layout of all involved operands and outputs is the
@@ -1241,25 +1266,23 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
                                              : standard_launch_dimensions;
     UpdateLaunchDimensions(launch_dimensions, thunks.back().get(),
                            ir_emitter_context_->llvm_module());
-    IrArray keys_array;
     std::vector<IrArray> values_arrays;
-    values_arrays.reserve(sort->operand_count() - 1);
+    values_arrays.reserve(sort->operand_count());
     for (int64 i = 0; i < sort->operand_count(); ++i) {
       ShapeIndex shape_index =
           sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({});
-      if (i == 0) {
-        keys_array = GetIrArray(*sort, *sort, shape_index);
-      } else {
-        values_arrays.push_back(GetIrArray(*sort, *sort, shape_index));
-      }
+      values_arrays.push_back(GetIrArray(*sort, *sort, shape_index));
     }
     return llvm_ir::EmitSortInPlace(
-        dimension_to_sort, keys_array, values_arrays,
-        iota_values_parameter_index, IrName(sort), xor_masks, &b_,
+        dimension_to_sort, values_arrays, IrName(sort), xor_masks, &b_,
         launch_dimensions,
         xor_masks.size() > 1 ? num_iterations_in_sort_dim
                              : standard_num_iterations_in_sort_dim,
-        kTileSize);
+        kTileSize,
+        [&](absl::Span<llvm::Value* const> operands, llvm::Value* output) {
+          return EmitCallToNestedComputation(*sort->to_apply(), operands,
+                                             output);
+        });
   };
   std::vector<int64> xor_masks;
   for (int64 stage = 0; stage < num_stages; ++stage) {
@@ -1758,6 +1781,29 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildFftThunk(
       /*output_shape=*/inst->shape(), inst);
 }
 
+std::unique_ptr<Thunk> IrEmitterUnnested::BuildTriangularSolveThunk(
+    const HloInstruction* inst) {
+  const HloInstruction* a = inst->operand(0);
+  const HloInstruction* b = inst->operand(1);
+  int64 m = b->shape().dimensions(b->shape().rank() - 2);
+  int64 n = b->shape().dimensions(b->shape().rank() - 1);
+  int64 batch_size = std::accumulate(
+      b->shape().dimensions().begin(), b->shape().dimensions().end() - 2,
+      int64{1}, [](int64 a, int64 b) { return a * b; });
+  int64 elem_size =
+      ShapeUtil::ByteSizeOfPrimitiveType(inst->shape().element_type());
+  int64 a_batch_stride = inst->triangular_solve_options().left_side()
+                             ? m * m * elem_size
+                             : n * n * elem_size;
+  int64 b_batch_stride = m * n * elem_size;
+  return absl::make_unique<TriangularSolveThunk>(
+      inst->triangular_solve_options(),
+      /*a_input_buffer=*/GetAllocationSlice(*a),
+      /*b_input_buffer=*/GetAllocationSlice(*inst),
+      inst->shape().element_type(), batch_size, m, n, a_batch_stride,
+      b_batch_stride, inst);
+}
+
 StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
     HloInstruction* hlo, const ShapeIndex& index) {
   bool fused = HloOpcode::kFusion == hlo->opcode();
@@ -2133,7 +2179,6 @@ std::vector<IrArray> IrEmitterUnnested::ConstructIrArrayForInputs(
   return param_arrays;
 }
 
-
 int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape(
     const HloInstruction& hlo, const std::vector<IrArray>& param_arrays,
     const std::vector<llvm::Value*>& param_buffers,
@@ -2967,12 +3012,11 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
   // since we touch threadIdx.x and blockIdx.x at the beginning of the kernel
   // *anyway*.
   if (!reduction_info && unnested_hlo->IsMultiOutputFusion()) {
-    KernelSupportLibrary{&b_}.If(
-        "emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
-          llvm_ir::EmitTuple(GetIrArray(*unnested_hlo, *unnested_hlo),
-                             ConstructIrArrayForOutputs(*unnested_hlo), &b_,
-                             module_);
-        });
+    KernelSupportLibrary{&b_}.If("emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
+      llvm_ir::EmitTuple(GetIrArray(*unnested_hlo, *unnested_hlo),
+                         ConstructIrArrayForOutputs(*unnested_hlo), &b_,
+                         module_);
+    });
   }
 
   // For each tiled parameter, cast its input IrArray to the corresponding
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index 21b842bb2cd63ac454f85556df20ae5877cecbe1..f85e18bbf0798ef3d5b87e81d287d8aed691dfc4 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -176,6 +176,7 @@ class IrEmitterUnnested : public IrEmitter {
   Status HandleScatter(HloInstruction* scatter) override;
   Status HandleSelect(HloInstruction* select) override;
   Status HandleSort(HloInstruction* sort) override;
+  Status HandleTriangularSolve(HloInstruction* hlo) override;
   Status HandleTupleSelect(HloInstruction* tuple_select) override;
   Status HandleAllReduce(HloInstruction* crs) override;
   Status HandleAfterAll(HloInstruction* after_all) override;
@@ -319,6 +320,9 @@ class IrEmitterUnnested : public IrEmitter {
   // Returns a FftThunk that calls cuFFT to implement `inst`.
   std::unique_ptr<Thunk> BuildFftThunk(const HloInstruction* inst);
 
+  // Returns a TriangularSolveThunk that calls cuBlas to implement `inst`.
+  std::unique_ptr<Thunk> BuildTriangularSolveThunk(const HloInstruction* inst);
+
   // Returns a GemmThunk that calls gemm to implement `inst`. The caller needs
   // to make sure `inst` outlives the lifetime of the returned Thunk object.
   std::unique_ptr<Thunk> BuildGemmThunk(const HloInstruction* inst);
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index 1f4f1766618c71c9ef8705f3038676a0518b3ddd..6e00e4b4ff8c493f00fae3355215fb13fb5f4f10 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -82,6 +82,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/sort_simplifier.h"
+#include "tensorflow/compiler/xla/service/stable_sort_expander.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
@@ -118,6 +119,9 @@ std::vector<string> GetCudaRootCandidates(
     const HloModuleConfig& hlo_module_config) {
   std::vector<string> potential_cuda_roots = tensorflow::CandidateCudaRoots();
 
+  // "." is our last resort, even though it probably won't work.
+  potential_cuda_roots.push_back(".");
+
   // CUDA location explicitly specified by user via --xla_gpu_cuda_data_dir has
   // highest priority.
   string xla_gpu_cuda_data_dir =
@@ -129,9 +133,23 @@ std::vector<string> GetCudaRootCandidates(
   return potential_cuda_roots;
 }
 
+void PrintCantFindCudaMessage(absl::string_view msg,
+                              const HloModuleConfig& hlo_module_config) {
+  LOG(WARNING) << msg;
+  LOG(WARNING) << "Searched in the following directories:";
+  for (const auto& dir : GetCudaRootCandidates(hlo_module_config)) {
+    LOG(WARNING) << "  " << dir;
+  }
+  LOG(WARNING)
+      << "You can choose the search directory by setting xla_gpu_cuda_data_dir "
+         "in HloModule's DebugOptions.  For most apps, setting the environment "
+         "variable XLA_FLAGS=--xla_gpu_cuda_data_dir=/path/to/cuda will work.";
+}
+
 // Returns the directory containing nvvm libdevice files.
 string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) {
-  for (const string& cuda_root : GetCudaRootCandidates(hlo_module_config)) {
+  const auto& candidate_dirs = GetCudaRootCandidates(hlo_module_config);
+  for (const string& cuda_root : candidate_dirs) {
     string libdevice_dir =
         tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice");
     VLOG(2) << "Looking for libdevice at " << libdevice_dir;
@@ -140,8 +158,14 @@ string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) {
       return libdevice_dir;
     }
   }
-  LOG(WARNING) << "Unable to find libdevice dir. Using '.'";
-  // Last resort: maybe in the current folder.
+  PrintCantFindCudaMessage(
+      "Can't find directory containing CUDA libevice.  This may result in "
+      "compilation or runtime failures, if the program we try to run uses "
+      "routines from libdevice.",
+      hlo_module_config);
+
+  // GetCudaRotCandidates always inclues ".", but but if everything fails, we
+  // return it anyway.  Better than returning the empty string.
   return ".";
 }
 
@@ -172,6 +196,8 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     pipeline.AddPass<ConvolutionGroupConverter>(
         cost_model,
         /*convert_batch_groups_only=*/true);
+    // Expand the sort op to support stable sorting if required.
+    pipeline.AddPass<StableSortExpander>();
     // Convert BF16 operations to F32 operations so that the GPU backend can
     // support BF16 operations without directly implementing a BF16 lowering for
     // most ops.
@@ -772,14 +798,19 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   std::unique_ptr<HloProfileIndexMap> profile_index_map;
   std::unique_ptr<HloProfilePrinterData> profile_printer;
 
-  if (module->config().hlo_profiling_enabled()) {
+  if (module->config().hlo_profiling_enabled() || VLOG_IS_ON(1)) {
     HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
     cost_analysis.set_bytes_per_second(
         stream_exec->GetDeviceDescription().memory_bandwidth());
     TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
-    profile_index_map = absl::make_unique<HloProfileIndexMap>(*module);
-    profile_printer = CreateHloProfilePrinterData(
-        *profile_index_map, cost_analysis, entry_computation->name());
+    VLOG(1) << "HLO memory read+written: "
+            << tensorflow::strings::HumanReadableNumBytes(
+                   cost_analysis.bytes_accessed());
+    if (module->config().hlo_profiling_enabled()) {
+      profile_index_map = absl::make_unique<HloProfileIndexMap>(*module);
+      profile_printer = CreateHloProfilePrinterData(
+          *profile_index_map, cost_analysis, entry_computation->name());
+    }
   }
 
   auto* gpu_executable = new GpuExecutable(
@@ -843,10 +874,11 @@ std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(
             log_warning = !warning_done.exchange(true);
           }
           if (log_warning) {
-            LOG(WARNING)
-                << "Failed to compile ptx to cubin.  Will attempt to let "
-                   "GPU driver compile the ptx. "
-                << maybe_cubin.status();
+            PrintCantFindCudaMessage(
+                "Can't find ptxas binary.  Will back to the GPU driver "
+                "for PTX -> sass compilation.  This is OK so long as you don't "
+                "see a warning below about an out-of-date driver version.",
+                hlo_module_config);
           }
 
           // We're going to use the driver to JIT our PTX->SASS, so warn if
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
index 8154d75d23a6d49153ccb6824402aff73f365617..cb012649200c6386d3ae25d088aa3b16bd40be82 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
index a1ed8499040359fe7265a7317b0577a990a2234c..d33e9cf714ee3810b1fb2fa8c05c3ed399d27bfb 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
index f91a22d482bc8bc046977870a7a4d18ca1acde68..06b06a5b1ee1fb9996be3ebe326893c4160a7e29 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.cc b/tensorflow/compiler/xla/service/gpu/thunk.cc
index c78605cebbc671272b8df9faf0e0cc54be2f5b1c..a677617727c04811584cbaa295d164ed27273bb2 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/thunk.cc
@@ -48,6 +48,8 @@ std::ostream& operator<<(std::ostream& os, Thunk::Kind kind) {
       return os << "kOutfeed";
     case Thunk::kSequential:
       return os << "kSequential";
+    case Thunk::kTriangularSolve:
+      return os << "kTriangularSolve";
     case Thunk::kTuple:
       return os << "kTuple";
     case Thunk::kWhile:
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index e68bee035a029178844282995429eaa960cc4817..bc69af897a01775d2d33d46067464b10e049f3e1 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -56,6 +56,7 @@ class Thunk {
     kMemzero,
     kOutfeed,
     kSequential,
+    kTriangularSolve,
     kTuple,
     kWhile,
   };
diff --git a/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.cc b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5200a2af412979c7e38d95c5a9bd5bc2ab64f086
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.cc
@@ -0,0 +1,149 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h"
+
+#include <string>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/blas.h"
+#include "tensorflow/stream_executor/device_memory.h"
+
+namespace xla {
+namespace gpu {
+
+TriangularSolveThunk::TriangularSolveThunk(
+    const TriangularSolveOptions& options,
+    const BufferAllocation::Slice& a_buffer,
+    const BufferAllocation::Slice& b_buffer, PrimitiveType type,
+    int64 batch_size, int64 m, int64 n, int64 a_batch_stride,
+    int64 b_batch_stride, const HloInstruction* hlo)
+    : Thunk(Kind::kTriangularSolve, hlo),
+      uplo_(options.lower() ? se::blas::UpperLower::kLower
+                            : se::blas::UpperLower::kUpper),
+      side_(options.left_side() ? se::blas::Side::kLeft
+                                : se::blas::Side::kRight),
+      unit_diagonal_(options.unit_diagonal() ? se::blas::Diagonal::kUnit
+                                             : se::blas::Diagonal::kNonUnit),
+      a_buffer_(a_buffer),
+      b_buffer_(b_buffer),
+      type_(type),
+      batch_size_(batch_size),
+      m_(m),
+      n_(n),
+      a_batch_stride_(a_batch_stride),
+      b_batch_stride_(b_batch_stride) {
+  transpose_a_ = [&] {
+    switch (options.transpose_a()) {
+      case TriangularSolveOptions::NO_TRANSPOSE:
+        return se::blas::Transpose::kNoTranspose;
+      case TriangularSolveOptions::TRANSPOSE:
+        return se::blas::Transpose::kTranspose;
+      case TriangularSolveOptions::ADJOINT:
+        return se::blas::Transpose::kConjugateTranspose;
+      default:
+        LOG(ERROR) << "Invalid triangular solve transpose value "
+                   << options.transpose_a();
+        return se::blas::Transpose::kNoTranspose;
+    }
+  }();
+}
+
+Status TriangularSolveThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream,
+    HloExecutionProfiler* profiler) {
+  VLOG(3) << "uplo=" << se::blas::UpperLowerString(uplo_)
+          << " side=" << se::blas::SideString(side_)
+          << " diagonal=" << se::blas::DiagonalString(unit_diagonal_)
+          << " batch_size=" << batch_size_ << " m=" << m_ << " n=" << n_
+          << " a_batch_stride=" << a_batch_stride_
+          << " b_batch_stride=" << b_batch_stride_;
+
+  const int lda = side_ == se::blas::Side::kLeft ? m_ : n_;
+  const int ldb = m_;
+
+  char* a_base = static_cast<char*>(
+      buffer_allocations.GetDeviceAddress(a_buffer_).opaque());
+  char* b_base = static_cast<char*>(
+      buffer_allocations.GetDeviceAddress(b_buffer_).opaque());
+  for (int64 i = 0; i < batch_size_; ++i) {
+    bool launch_ok;
+    se::DeviceMemoryBase a_data =
+        se::DeviceMemoryBase(a_base + i * a_batch_stride_, a_batch_stride_);
+    se::DeviceMemoryBase b_data =
+        se::DeviceMemoryBase(b_base + i * b_batch_stride_, b_batch_stride_);
+    switch (type_) {
+      case F32: {
+        se::DeviceMemory<float> b_data_typed(b_data);
+        launch_ok = stream
+                        ->ThenBlasTrsm(side_, uplo_, transpose_a_,
+                                       unit_diagonal_, m_, n_, /*alpha=*/1.0f,
+                                       se::DeviceMemory<float>(a_data), lda,
+                                       &b_data_typed, ldb)
+                        .ok();
+        break;
+      }
+      case F64: {
+        se::DeviceMemory<double> b_data_typed(b_data);
+        launch_ok = stream
+                        ->ThenBlasTrsm(side_, uplo_, transpose_a_,
+                                       unit_diagonal_, m_, n_, /*alpha=*/1.0,
+                                       se::DeviceMemory<double>(a_data), lda,
+                                       &b_data_typed, ldb)
+                        .ok();
+        break;
+      }
+      case C64: {
+        se::DeviceMemory<std::complex<float>> b_data_typed(b_data);
+        launch_ok =
+            stream
+                ->ThenBlasTrsm(side_, uplo_, transpose_a_, unit_diagonal_, m_,
+                               n_, /*alpha=*/1.0f,
+                               se::DeviceMemory<std::complex<float>>(a_data),
+                               lda, &b_data_typed, ldb)
+                .ok();
+        break;
+      }
+      case C128: {
+        se::DeviceMemory<std::complex<double>> b_data_typed(b_data);
+        launch_ok =
+            stream
+                ->ThenBlasTrsm(side_, uplo_, transpose_a_, unit_diagonal_, m_,
+                               n_, /*alpha=*/1.0,
+                               se::DeviceMemory<std::complex<double>>(a_data),
+                               lda, &b_data_typed, ldb)
+                .ok();
+        break;
+      }
+      default:
+        return InvalidArgument("Invalid type for triangular solve %d", type_);
+    }
+    if (!launch_ok) {
+      return InternalError("Unable to launch triangular solve for thunk %p",
+                           this);
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h
new file mode 100644
index 0000000000000000000000000000000000000000..c947162ea32f197f808d099859eadbbc55a65ab1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h
@@ -0,0 +1,75 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRIANGULAR_SOLVE_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRIANGULAR_SOLVE_THUNK_H_
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/blas.h"
+
+namespace xla {
+namespace gpu {
+
+// This class stores everything that StreamExecutor needs to launch a triangular
+// solve (BlasTrsm). It is generated by IrEmitter.
+//
+// Thread-compatible.
+class TriangularSolveThunk : public Thunk {
+ public:
+  TriangularSolveThunk(const TriangularSolveOptions& options,
+                       const BufferAllocation::Slice& a_buffer,
+                       const BufferAllocation::Slice& b_buffer,
+                       PrimitiveType type, int64 batch_size, int64 m, int64 n,
+                       int64 a_batch_stride, int64 b_batch_stride,
+                       const HloInstruction* hlo);
+
+  TriangularSolveThunk(const TriangularSolveThunk&) = delete;
+  TriangularSolveThunk& operator=(const TriangularSolveThunk&) = delete;
+
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream,
+                         HloExecutionProfiler* profiler) override;
+
+ private:
+  const se::blas::UpperLower uplo_;
+  const se::blas::Side side_;
+  const se::blas::Diagonal unit_diagonal_;
+  se::blas::Transpose transpose_a_;
+
+  const BufferAllocation::Slice a_buffer_;
+  const BufferAllocation::Slice b_buffer_;
+
+  const PrimitiveType type_;
+  const int64 batch_size_;
+  const int64 m_;
+  const int64 n_;
+  const int64 a_batch_stride_;
+  const int64 b_batch_stride_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRIANGULAR_SOLVE_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc
index c552c2925497f1c4808d74a615d35cdbeeba1858..bbbcc2dbb0f71d08462a1aad6d97e7fd07b2a1fb 100644
--- a/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 263b42a29dbb0dbc0fb6eca7968674ff242f45ed..ae9e3169fd9b7a4655ab91ffb1589b845402ba8d 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -34,7 +34,7 @@ import "tensorflow/compiler/xla/xla_data.proto";
 option cc_enable_arenas = true;
 
 // Serialization of HloInstruction.
-// Next ID: 59
+// Next ID: 62
 message HloInstructionProto {
   reserved 10;
   reserved "parameter_name";
@@ -175,6 +175,9 @@ message HloInstructionProto {
   // partners.
   bool is_host_transfer = 47;
 
+  // Whether this Sort instruction should be stable.
+  bool is_stable = 60;
+
   xla.ScatterDimensionNumbers scatter_dimension_numbers = 48;
 
   // Precision configuration for the instruction. Has backend-specific meaning.
@@ -193,6 +196,12 @@ message HloInstructionProto {
   // operand.
   bool constrain_layout = 56;
   repeated xla.ShapeProto operand_shapes_with_layout = 57;
+
+  // Options for TriangularSolve
+  xla.TriangularSolveOptions triangular_solve_options = 59;
+
+  // Describes how parameters behave with regards to replicas.
+  xla.ParameterReplication parameter_replication = 61;
 }
 
 // Serialization of HloComputation.
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 40fe91398be33f5681e1389e1b6fadcbd87487bb..817e15f9ff10a9b7e1a502265c85f70fdd681dd9 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -296,7 +296,7 @@ void ComputeComputationPostOrder(HloComputation* computation,
 }  // namespace
 
 void HloComputation::ComputeInstructionPostOrder(
-    const HloComputation::ChannelDependencyMap& channel_dependency_map,
+    const HloComputation::ChannelDependencyGroup& channel_dependency_group,
     std::vector<HloInstruction*>* post_order, HloInstruction* root,
     absl::flat_hash_map<HloInstruction*, VisitState>* visited) const {
   std::vector<HloInstruction*> dfs_stack;
@@ -320,66 +320,75 @@ void HloComputation::ComputeInstructionPostOrder(
 
     visited->insert({current, kVisiting});
 
-    // Add the operands to the stack in reverse order so the first operand is
-    // processed first. This will produce a more natural ordering and a nicer
-    // result for things like HLO stringification.
-    const auto& operands = current->operands();
-    for (int64 i = operands.size() - 1; i >= 0; --i) {
-      dfs_stack.emplace_back(operands[i]);
-    }
-
-    for (HloInstruction* op : current->control_predecessors()) {
-      dfs_stack.emplace_back(op);
-    }
-
-    // Add inputs for send->recv_done dependencies and all-reduce
-    // dependencies.
-    switch (current->opcode()) {
-      case HloOpcode::kRecvDone: {
-        auto it = channel_dependency_map.find(current->channel_id());
-        if (it != channel_dependency_map.end()) {
-          for (HloInstruction* op : it->second) {
-            dfs_stack.emplace_back(op);
-          }
-        }
-        break;
+    const auto get_channel_id =
+        [](HloInstruction* inst) -> absl::optional<int64> {
+      switch (inst->opcode()) {
+        case HloOpcode::kRecvDone:
+          return inst->channel_id();
+        case HloOpcode::kAllReduce:
+          return inst->all_reduce_id();
+        default:
+          return absl::nullopt;
       }
-      case HloOpcode::kAllReduce: {
-        auto all_reduce_id = current->all_reduce_id();
-        if (all_reduce_id) {
-          auto it = channel_dependency_map.find(all_reduce_id.value());
-          if (it != channel_dependency_map.end()) {
-            for (HloInstruction* op : it->second) {
-              dfs_stack.emplace_back(op);
-            }
-          }
+    };
+
+    // When adding a predecessor to the dfs_stack, we need to also add its
+    // associated channel dependencies.
+    const auto add_dfs_stack = [&](HloInstruction* inst) {
+      auto channel_id = get_channel_id(inst);
+      if (channel_id && channel_dependency_group.count(*channel_id)) {
+        auto it = channel_dependency_group.find(*channel_id);
+        for (HloInstruction* cinst : it->second) {
+          dfs_stack.emplace_back(cinst);
         }
-        break;
+      } else {
+        dfs_stack.emplace_back(inst);
       }
-      default:
-        break;
+    };
+
+    const auto add_predecessors = [&](HloInstruction* inst) {
+      // Add the operands to the stack in reverse order so the first operand is
+      // processed first. This will produce a more natural ordering and a nicer
+      // result for things like HLO stringification.
+      const auto& operands = inst->operands();
+      for (int64 i = operands.size() - 1; i >= 0; --i) {
+        add_dfs_stack(operands[i]);
+      }
+
+      for (HloInstruction* op : inst->control_predecessors()) {
+        add_dfs_stack(op);
+      }
+    };
+
+    // If the current instruction is a channel instruction, add the dependencies
+    // from all associated instructions of the channel.
+    auto channel_id = get_channel_id(current);
+    if (channel_id && channel_dependency_group.count(*channel_id)) {
+      auto it = channel_dependency_group.find(*channel_id);
+      for (HloInstruction* cinst : it->second) {
+        add_predecessors(cinst);
+      }
+    } else {
+      add_predecessors(current);
     }
   }
 }
 
-HloComputation::ChannelDependencyMap
+HloComputation::ChannelDependencyGroup
 HloComputation::ComputeChannelDependencies() const {
-  ChannelDependencyMap channel_dependency_map;
+  ChannelDependencyGroup channel_dependency_group;
   for (const auto& instruction : instructions_) {
     switch (instruction->opcode()) {
-      case HloOpcode::kSend: {
-        channel_dependency_map[instruction->channel_id()].push_back(
+      case HloOpcode::kSend:
+      case HloOpcode::kRecvDone:
+        channel_dependency_group[instruction->channel_id()].push_back(
             instruction.get());
         break;
-      }
       case HloOpcode::kAllReduce: {
         auto all_reduce_id = instruction->all_reduce_id();
         if (all_reduce_id) {
-          auto& dependencies = channel_dependency_map[all_reduce_id.value()];
-          absl::c_copy(instruction->operands(),
-                       std::back_inserter(dependencies));
-          absl::c_copy(instruction->control_predecessors(),
-                       std::back_inserter(dependencies));
+          channel_dependency_group[all_reduce_id.value()].push_back(
+              instruction.get());
         }
         break;
       }
@@ -387,11 +396,11 @@ HloComputation::ComputeChannelDependencies() const {
         break;
     }
   }
-  return channel_dependency_map;
+  return channel_dependency_group;
 }
 
 std::vector<HloInstruction*> HloComputation::MakeInstructionPostOrder() const {
-  auto channel_dependency_map = ComputeChannelDependencies();
+  auto channel_dependency_group = ComputeChannelDependencies();
   std::vector<HloInstruction*> post_order;
   post_order.reserve(instruction_count());
   std::vector<HloInstruction*> trace_instructions;
@@ -404,7 +413,7 @@ std::vector<HloInstruction*> HloComputation::MakeInstructionPostOrder() const {
       // users).
       trace_instructions.push_back(instruction.get());
     } else if (instruction->users().empty()) {
-      ComputeInstructionPostOrder(channel_dependency_map, &post_order,
+      ComputeInstructionPostOrder(channel_dependency_group, &post_order,
                                   instruction.get(), &visited);
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index 0cb9caddd089011f3e9a4473995847dc966dd402..212dfa15a13185f1050103739fad8b560270d401 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -369,13 +369,13 @@ class HloComputation {
   // channel complete).
   bool IsRemovable(const HloInstruction* instruction);
 
-  // Returns a map from channel-id to directed dependencies of the channel
-  // instructions. For send&recv pairs it means the send instruction and for
-  // all-reduce the union of the dependencies for all participating
-  // instructions.
-  using ChannelDependencyMap =
+  // Returns a map from channel-id to the group of instructions associated with
+  // the channel. These instructions will be considered as a single node for
+  // dependency purposes. Send and RecvDone are in the group, and AllReduces
+  // with the same channel id are in the group.
+  using ChannelDependencyGroup =
       absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>>;
-  ChannelDependencyMap ComputeChannelDependencies() const;
+  ChannelDependencyGroup ComputeChannelDependencies() const;
 
   // Returns true if this computation has a side effect. A computation has a
   // side effect if it contains one or more instructions with a side effect.
@@ -391,6 +391,10 @@ class HloComputation {
     fusion_instruction_ = fusion_instruction;
   }
 
+  // Clear the unique ID of the computation so that it can be re-assigned, such
+  // as for the purpose of compacting the unique IDs.
+  void ClearUniqueIdInternal() { unique_id_ = -1; }
+
   // The id of this computation should be unique within the module.
   void SetUniqueId(int64 id) {
     CHECK_EQ(unique_id_, -1);
@@ -434,7 +438,7 @@ class HloComputation {
 
   enum VisitState { kVisiting, kVisited };
   void ComputeInstructionPostOrder(
-      const HloComputation::ChannelDependencyMap& channel_dependency_map,
+      const HloComputation::ChannelDependencyGroup& channel_dependency_map,
       std::vector<HloInstruction*>* post_order, HloInstruction* root,
       absl::flat_hash_map<HloInstruction*, VisitState>* visited) const;
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc
index 3b88e9745c27d6e1f2a46e5c83ac2e8bd8d05150..fe37ca6b3963430c765f27aede4f506366fc5d97 100644
--- a/tensorflow/compiler/xla/service/hlo_computation_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc
@@ -24,7 +24,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -37,6 +39,7 @@ namespace xla {
 namespace {
 
 namespace m = match;
+namespace op = xla::testing::opcode_matchers;
 using ::testing::ElementsAre;
 using ::testing::UnorderedElementsAre;
 
@@ -668,5 +671,34 @@ TEST_F(HloComputationTest, DeepEquality) {
   EXPECT_FALSE(*computation_c == *computation_b);
 }
 
+// Tests that cross-module AllReduce instructions are ordered before all their
+// predecessors and after all their successors.
+TEST_F(HloComputationTest, InstructionPostOrderWithAllReduce) {
+  const char* const hlo_string = R"(
+HloModule Module
+
+add {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY entry {
+  param = f32[128] parameter(0), sharding={maximal device=0}
+  crs0 = f32[128] all-reduce(param),
+    replica_groups={{0}}, all_reduce_id=1, barrier="", to_apply=add,
+    sharding={maximal device=0}
+  crs1 = f32[128] all-reduce(param),
+    replica_groups={{0}}, all_reduce_id=1, barrier="", to_apply=add,
+    sharding={maximal device=1}
+  add = f32[128] add(crs0, crs0), sharding={maximal device=0}
+  ROOT t = (f32[128], f32[128]) tuple(add, crs1)
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+  EXPECT_THAT(module->entry_computation()->MakeInstructionPostOrder(),
+              ElementsAre(op::Parameter(), op::AllReduce(), op::AllReduce(),
+                          op::Add(), op::Tuple()));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 1ee958114ebfa976cea72e901432575b7dc58321..29ac263c5f39b5ec1f02a232704adcd3e3f21f60 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -245,7 +245,7 @@ Status HloCostAnalysis::HandleDot(const HloInstruction* dot) {
   for (auto dim : dnums.lhs_contracting_dimensions()) {
     reduction_width *= lhs_shape.dimensions(dim);
   }
-  // Each output elment requires reduction_widht FMA operations.
+  // Each output elment requires reduction_width FMA operations.
   current_properties_[kFlopsKey] =
       kFmaFlops * ShapeUtil::ElementsIn(dot_shape) * reduction_width;
   return Status::OK();
@@ -546,6 +546,21 @@ Status HloCostAnalysis::HandleFft(const HloInstruction* fft) {
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandleTriangularSolve(const HloInstruction* hlo) {
+  float bytes_accessed = GetShapeSize(hlo->operand(0)->shape()) / 2.0f;
+  bytes_accessed += GetShapeSize(hlo->operand(1)->shape());
+  current_properties_[kBytesAccessedKey] = bytes_accessed;
+
+  const Shape& a_shape = hlo->operand(0)->shape();
+  const Shape& b_shape = hlo->operand(1)->shape();
+  // Estimate as batch * mn^2 / 2 flops.
+  int64 elems = a_shape.dimensions(a_shape.dimensions_size() - 1);
+  elems *= ShapeUtil::ElementsIn(b_shape);
+  // Each output elment requires reduction_widht FMA operations.
+  current_properties_[kFlopsKey] = kFmaFlops * elems;
+  return Status::OK();
+}
+
 Status HloCostAnalysis::HandleAllReduce(const HloInstruction* crs) {
   // We assume 2 replicas, so that each output element is the sum of two input
   // elements.
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index 421786e20a3d9528ea76a44b3087ab2aed81d2b5..96357dec68e390251c43c2c3fc6f5a5612063fbd 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -71,6 +71,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleDot(const HloInstruction* dot) override;
   Status HandleConvolution(const HloInstruction* convolution) override;
   Status HandleFft(const HloInstruction* fft) override;
+  Status HandleTriangularSolve(const HloInstruction* hlo) override;
   Status HandleAllReduce(const HloInstruction* crs) override;
   Status HandleAllToAll(const HloInstruction* hlo) override;
   Status HandleCollectivePermute(const HloInstruction* hlo) override;
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
index d56f673455f9129b72e9d85eaf8cbf03cfee4302..b5d9e8e7f1a703d5d914a12d5226d53821071be6 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
@@ -17,10 +17,15 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_clone_context.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/util.h"
 
@@ -268,6 +273,29 @@ StatusOr<HloInstruction*> MakeSelectHlo(HloInstruction* pred,
       select_shape, HloOpcode::kSelect, pred, on_true, on_false));
 }
 
+StatusOr<HloInstruction*> MakeSortHlo(
+    const Shape& sort_shape, absl::Span<HloInstruction* const> operands,
+    int64 dimension_to_sort, bool is_stable, HloComputation::Builder* builder,
+    HloModule* module) {
+  CHECK(!operands.empty()) << "Sort Hlo requires at least one operand.";
+  HloComputation* compare_computation;
+  XlaBuilder b("Sort.Compare");
+  std::vector<PrimitiveType> operand_types(operands.size());
+  for (int64 i = 0; i < operands.size(); ++i) {
+    operand_types[i] = operands[i]->shape().element_type();
+  }
+  XlaComputation comparator = CreateScalarLtComputation(operand_types, &b);
+  TF_ASSIGN_OR_RETURN(ProgramShape program_shape, comparator.GetProgramShape());
+  HloModuleConfig config(program_shape);
+  TF_ASSIGN_OR_RETURN(auto new_module,
+                      HloModule::CreateFromProto(comparator.proto(), config));
+  HloCloneContext context(module);
+  compare_computation =
+      module->DeepCloneComputation(new_module->entry_computation(), &context);
+  return builder->AddInstruction(HloInstruction::CreateSort(
+      sort_shape, dimension_to_sort, operands, compare_computation, is_stable));
+}
+
 StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n) {
   CHECK_GT(n, 0);
 
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h
index 1c3174e9c89c16cb11589e7c0235bdf13eae6b85..17b7a2da6a9da994ea2d496b549eec79278b56b5 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.h
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h
@@ -123,6 +123,15 @@ StatusOr<HloInstruction*> MakeSelectHlo(HloInstruction* pred,
                                         HloInstruction* on_true,
                                         HloInstruction* on_false);
 
+// Creates a Sort HLO instruction and adds it to the computation containing the
+// operands. All operands must be in the same computation. Also creates a
+// default compare sub-computation which sorts the first operand into ascending
+// order. 'is_stable' specifies whether the sorting should be stable.
+StatusOr<HloInstruction*> MakeSortHlo(
+    const Shape& sort_shape, absl::Span<HloInstruction* const> operands,
+    int64 dimension_to_sort, bool is_stable, HloComputation::Builder* builder,
+    HloModule* module);
+
 // Creates an R1 Constant HLO instruction of the given PrimitiveType with the
 // given values and adds it to the given computation.
 template <typename NativeT>
diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index e602107cbe64320a8e8e740168cb294ec6be9667..849cac278ee379122ba1ff9fade3bf003969b8a7 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 4a7c4963b7b399e625da907b3810c42df7ee2bd3..768e3afb3b80698061b62c4aadef09c20e2f286c 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -2356,14 +2357,17 @@ TEST_F(CanShareOperandBufferWithUserTest, ScatterCanShare) {
 
 TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
   auto builder = HloComputation::Builder(TestName());
+  module_ = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
   auto keys = builder.AddInstruction(
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
-  auto sort =
-      builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort, MakeSortHlo(keys_shape, {keys}, -1, /*is_stable=*/false,
+                              &builder, module_.get()));
 
-  BuildModuleAndRunAnalysis(builder.Build());
+  computation_ = module_->AddEntryComputation(builder.Build());
+  RunAnalysis();
 
   EXPECT_TRUE(
       dataflow_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {}));
@@ -2371,6 +2375,7 @@ TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
 
 TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
   auto builder = HloComputation::Builder(TestName());
+  module_ = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
   Shape values_shape = ShapeUtil::MakeShape(F32, {8});
@@ -2378,11 +2383,14 @@ TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
   auto values = builder.AddInstruction(
       HloInstruction::CreateParameter(1, values_shape, "values"));
-  auto sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys,
-      {values}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort,
+      MakeSortHlo(ShapeUtil::MakeTupleShape({keys_shape, values_shape}),
+                  {keys, values}, 0, /*is_stable=*/false, &builder,
+                  module_.get()));
 
-  BuildModuleAndRunAnalysis(builder.Build());
+  computation_ = module_->AddEntryComputation(builder.Build());
+  RunAnalysis();
 
   // The buffer for the keys can be shared with the first tuple entry.
   EXPECT_TRUE(
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
index 9b0f2b2a0f4dd5d1d1191e9ab0637cc3034b50da..7d6b86056af3fc2128fe1642bbfa0ca6f9ef1da0 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -127,6 +127,7 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
       // These are ops where it does not make sense to convert them.
       if (opcode == HloOpcode::kParameter || opcode == HloOpcode::kConstant ||
           opcode == HloOpcode::kTuple || opcode == HloOpcode::kConvert ||
+          opcode == HloOpcode::kBitcastConvert ||
           opcode == HloOpcode::kGetTupleElement ||
           opcode == HloOpcode::kInfeed || opcode == HloOpcode::kOutfeed) {
         continue;
@@ -145,7 +146,7 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
           opcode == HloOpcode::kMap || opcode == HloOpcode::kReduce ||
           opcode == HloOpcode::kReduceWindow || opcode == HloOpcode::kScatter ||
           opcode == HloOpcode::kSelectAndScatter ||
-          opcode == HloOpcode::kConditional) {
+          opcode == HloOpcode::kSort || opcode == HloOpcode::kConditional) {
         continue;
       }
       TF_RET_CHECK(hlo->called_computations().empty()) << hlo->ToString();
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
index 5b633784e2f306290ca6c096f67c657be1f188c8..4171f738620dbf545e5883b8c26169fae4b93643 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
@@ -176,5 +176,19 @@ ENTRY main {
   EXPECT_THAT(rng1->control_predecessors(), ElementsAre(rng0));
 }
 
+TEST_F(HloElementTypeConverterTest, BitcastConvertIsUnmodified) {
+  const string& hlo_string = R"(
+  HloModule test
+
+  ENTRY test {
+    p = bf16[] parameter(0)
+    ROOT c = u16[] bitcast-convert(p)
+  })";
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  HloElementTypeConverter converter(BF16, F32);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, RunHloPass(&converter, module.get()));
+  EXPECT_FALSE(converted);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 56a1b6f43945adae18313546432b959f66a32dcf..4d6487700b24cfd3b89aece58e5ad6d7bb43a800 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -1461,14 +1462,6 @@ Status HloEvaluator::HandleSort(HloInstruction* sort) {
   }
   Shape key_shape = sort->operand(0)->shape();
   auto rank = key_shape.rank();
-  PrimitiveType keys_type = key_shape.element_type();
-  if (keys_type != F64 && keys_type != U64 && keys_type != S64 &&
-      keys_type != F32 && keys_type != U32 && keys_type != S32 &&
-      keys_type != BF16 && keys_type != F16 && keys_type != U16 &&
-      keys_type != S16 && keys_type != U8 && keys_type != S8) {
-    return InvalidArgument("Unsupported type for Sort: %s",
-                           PrimitiveType_Name(keys_type));
-  }
   std::vector<Literal> result_literals;
   result_literals.reserve(sort->operand_count());
   for (int64 i = 0; i < sort->operand_count(); ++i) {
@@ -1479,6 +1472,7 @@ Status HloEvaluator::HandleSort(HloInstruction* sort) {
   int64 sort_dim = sort->dimensions(0);
   int64 sort_dim_elements = key_shape.dimensions(sort_dim);
   increment[sort_dim] = sort_dim_elements;
+  HloEvaluator embedded_evaluator(max_loop_iterations_);
   // Iterate through each dimension except 'sort_dim'.
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
       key_shape, zero_base, AsInt64Slice(key_shape.dimensions()), increment,
@@ -1499,78 +1493,51 @@ Status HloEvaluator::HandleSort(HloInstruction* sort) {
         }
         std::vector<int64> indices_to_sort(sort_dim_elements);
         std::iota(indices_to_sort.begin(), indices_to_sort.end(), 0);
-        std::stable_sort(
-            indices_to_sort.begin(), indices_to_sort.end(),
-            [keys_type, &literals_to_sort](int64 a, int64 b) {
-              switch (keys_type) {
-                case F64: {
-                  auto key_lhs = literals_to_sort[0].Get<double>({a});
-                  auto key_rhs = literals_to_sort[0].Get<double>({b});
-                  return SafeLess(key_lhs, key_rhs);
-                }
-                case U64: {
-                  auto key_lhs = literals_to_sort[0].Get<uint64>({a});
-                  auto key_rhs = literals_to_sort[0].Get<uint64>({b});
-                  return SafeLess(key_lhs, key_rhs);
-                }
-                case S64: {
-                  auto key_lhs = literals_to_sort[0].Get<int64>({a});
-                  auto key_rhs = literals_to_sort[0].Get<int64>({b});
-                  return SafeLess(key_lhs, key_rhs);
-                }
-                case F32: {
-                  auto key_lhs = literals_to_sort[0].Get<float>({a});
-                  auto key_rhs = literals_to_sort[0].Get<float>({b});
-                  return SafeLess(key_lhs, key_rhs);
-                }
-                case U32: {
-                  auto key_lhs = literals_to_sort[0].Get<uint32>({a});
-                  auto key_rhs = literals_to_sort[0].Get<uint32>({b});
-                  return SafeLess(key_lhs, key_rhs);
-                }
-                case S32: {
-                  auto key_lhs = literals_to_sort[0].Get<int32>({a});
-                  auto key_rhs = literals_to_sort[0].Get<int32>({b});
-                  return SafeLess(key_lhs, key_rhs);
-                }
-                case BF16: {
-                  auto key_lhs = literals_to_sort[0].Get<bfloat16>({a});
-                  auto key_rhs = literals_to_sort[0].Get<bfloat16>({b});
-                  return SafeLess(key_lhs, key_rhs);
-                }
-                case F16: {
-                  auto key_lhs = literals_to_sort[0].Get<Eigen::half>({a});
-                  auto key_rhs = literals_to_sort[0].Get<Eigen::half>({b});
-                  return SafeLess(key_lhs, key_rhs);
-                }
-                case U16: {
-                  auto key_lhs = literals_to_sort[0].Get<uint16>({a});
-                  auto key_rhs = literals_to_sort[0].Get<uint16>({b});
-                  return SafeLess(key_lhs, key_rhs);
-                }
-                case S16: {
-                  auto key_lhs = literals_to_sort[0].Get<int16>({a});
-                  auto key_rhs = literals_to_sort[0].Get<int16>({b});
-                  return SafeLess(key_lhs, key_rhs);
-                }
-                case U8: {
-                  auto key_lhs = literals_to_sort[0].Get<uint8>({a});
-                  auto key_rhs = literals_to_sort[0].Get<uint8>({b});
-                  return SafeLess(key_lhs, key_rhs);
-                }
-                case S8: {
-                  auto key_lhs = literals_to_sort[0].Get<int8>({a});
-                  auto key_rhs = literals_to_sort[0].Get<int8>({b});
-                  return SafeLess(key_lhs, key_rhs);
-                }
-                default:
-                  // We should never reach here, because we checked earlier
-                  // that 'key_type' is one of the cases above.
-                  LOG(FATAL) << "Invalid key type in Sort: %s",
-                      PrimitiveType_Name(keys_type);
-                  return false;
-              }
-            });
+        Status compare_status = Status::OK();
+        auto comparator = [sort, &compare_status, &embedded_evaluator,
+                           &literals_to_sort](int64 a, int64 b) {
+          std::vector<Literal> literals;
+          literals.reserve(2 * sort->operand_count());
+          for (int64 i = 0; i < sort->operand_count(); ++i) {
+            auto lhs = ExtractFromIndexPositions(literals_to_sort[i], {a},
+                                                 /*extract_as_scalar=*/true);
+            if (!lhs.ok()) {
+              compare_status = lhs.status();
+              return false;
+            }
+            literals.push_back(std::move(lhs.ValueOrDie()));
+            auto rhs = ExtractFromIndexPositions(literals_to_sort[i], {b},
+                                                 /*extract_as_scalar=*/true);
+            if (!rhs.ok()) {
+              compare_status = rhs.status();
+              return false;
+            }
+            literals.push_back(std::move(rhs.ValueOrDie()));
+          }
+          std::vector<const Literal*> literal_ptrs;
+          absl::c_transform(literals, std::back_inserter(literal_ptrs),
+                            [](const Literal& literal) { return &literal; });
+
+          auto computed_result =
+              embedded_evaluator.Evaluate(*sort->to_apply(), literal_ptrs);
+          // Clear visit states so that we can use the evaluator again
+          // on the same computation.
+          embedded_evaluator.ResetVisitStates();
+          if (!computed_result.ok()) {
+            compare_status = computed_result.status();
+            return false;
+          }
+          return computed_result.ValueOrDie().Get<bool>({});
+        };
+        if (Cast<HloSortInstruction>(sort)->is_stable()) {
+          std::stable_sort(indices_to_sort.begin(), indices_to_sort.end(),
+                           comparator);
+        } else {
+          std::sort(indices_to_sort.begin(), indices_to_sort.end(), comparator);
+        }
+        if (!compare_status.ok()) {
+          return compare_status;
+        }
         std::vector<int64> slice_dimensions(rank, 1);
         slice_dimensions[sort_dim] = sort_dim_elements;
         std::vector<int64> start_indices(rank, 0);
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 72ea40bcd797def3bc0765986881792b8752d9e1..357975a131d0c7e63c06e96852468b43d97a37f2 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -331,16 +331,16 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   std::vector<const Literal*> arg_literals_;
 
   // Max loop iterations to execute with no maximum if negative.
-  int64 max_loop_iterations_;
+  int64 max_loop_iterations_ = 0;
 
   // Module-level seed handle.
-  uint64 seed_;
+  uint64 seed_ = 0;
   // RNG engine.
   std::minstd_rand0 engine_;
 
   // DynamicDimensionInference is used to evaluate GetDimensionSize, which
   // returns the dynamic dimension size of its operand.
-  DynamicDimensionInference* dynamic_dimension_inference_;
+  DynamicDimensionInference* dynamic_dimension_inference_ = nullptr;
 
   // Optional handler for custom_call ops.
   std::function<StatusOr<Literal>(HloInstruction* custom_call,
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index fb8cd299cef06d549130cd56dd2c430c4c1a0387..383921fde22242b6ede95a6554f2348ab6fd4277 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -111,6 +111,24 @@ class HloEvaluatorTest : public HloTestBase {
     EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
   }
 
+  void TestTernaryOp(HloOpcode opcode, Literal expected, Literal src0,
+                     Literal src1, Literal src2) {
+    HloComputation::Builder b(TestName());
+    auto operand0 =
+        b.AddInstruction(HloInstruction::CreateConstant(std::move(src0)));
+    auto operand1 =
+        b.AddInstruction(HloInstruction::CreateConstant(std::move(src1)));
+    auto operand2 =
+        b.AddInstruction(HloInstruction::CreateConstant(std::move(src2)));
+    b.AddInstruction(HloInstruction::CreateTernary(
+        expected.shape(), opcode, operand0, operand1, operand2));
+    m_->AddEntryComputation(b.Build());
+
+    Literal result = Evaluate();
+
+    EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
+  }
+
  protected:
   explicit HloEvaluatorTest(bool use_bfloat16) : use_bfloat16_(use_bfloat16) {}
   HloEvaluator evaluator_;
@@ -152,6 +170,33 @@ TEST_P(HloEvaluatorBf16Test, DoesClamp) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
+// Verifies that clamping of int64 does not cause loss of precision
+TEST_P(HloEvaluatorBf16Test, DoesClampInt64) {
+  auto ones = [](int bits) { return (int64{1} << bits) - 1; };
+
+  auto low =
+      LiteralUtil::CreateR2<int64>({{0, ones(54)}, {ones(54), ones(58)}});
+  auto value = LiteralUtil::CreateR2<int64>({{0, ones(56)}, {0, ones(58)}});
+  auto high = LiteralUtil::CreateR2<int64>(
+      {{ones(54), ones(55)}, {ones(56), ones(58)}});
+
+  Shape shape = low.shape();
+  HloComputation::Builder b(TestName());
+  auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(low)));
+  auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
+  auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
+  b.AddInstruction(
+      HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
+  m_->AddEntryComputation(b.Build());
+
+  Literal result = Evaluate();
+
+  auto expected =
+      LiteralUtil::CreateR2<int64>({{0, ones(55)}, {ones(54), ones(58)}});
+
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
+}
+
 TEST_P(HloEvaluatorBf16Test, DISABLED_DoesClampSpecialBroadcast) {
   auto low = LiteralUtil::CreateR0<float>(0.f);
   auto value = LiteralUtil::CreateR2<float>({{-1.f, 0.f}, {1.f, 2.f}});
@@ -254,6 +299,20 @@ TEST_F(HloEvaluatorTest, DoesDivideInt64) {
   TestBinaryOp(HloOpcode::kDivide, std::move(expected), std::move(lhs),
                std::move(rhs));
 }
+
+TEST_F(HloEvaluatorTest, DoesClampS64) {
+  auto low = LiteralUtil::CreateR1<int64>(
+      {-8616761059752331528LL, 6780561065411491190LL, -8616761059752331528LL});
+  auto value = LiteralUtil::CreateR1<int64>(
+      {-6780561065411491190LL, 6780561065411491180LL, 4241131823772864090LL});
+  auto high = LiteralUtil::CreateR1<int64>(
+      {-6780561065411491180LL, 8616761059752331528LL, 3832151243857508051LL});
+  auto expected = LiteralUtil::CreateR1<int64>(
+      {-6780561065411491190LL, 6780561065411491190LL, 3832151243857508051LL});
+  TestTernaryOp(HloOpcode::kClamp, std::move(expected), std::move(low),
+                std::move(value), std::move(high));
+}
+
 TEST_P(HloEvaluatorBf16Test, DoesDivideDouble) {
   auto lhs = LiteralUtil::CreateR2<double>({{1.0, 0.0}, {-100.0, 4.0}});
   auto rhs = LiteralUtil::CreateR2<double>({{2.2, 4.0}, {4.0, 4.0}});
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index 742a389ed04eb7303197467587223486c780a31e..d516a6258c80bda168ef4c6fd976e60946eb8b5b 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_TYPED_VISITOR_H_
 
 #include <cmath>
+#include <type_traits>
 
 #include "absl/algorithm/container.h"
 #include "absl/base/casts.h"
@@ -43,46 +44,6 @@ template <typename T>
 using is_complex_t =
     absl::disjunction<std::is_same<T, complex64>, std::is_same<T, complex128>>;
 
-// It's UB to use std::sort with std::less<float>, because of NaNs. Define
-// "safe" less functions which are actually strict weak orders. -NaN and NaN
-// should appear at the beginning and end of the ordering, and -0.0 should
-// appear before 0.0.
-template <
-    typename NativeT,
-    typename std::enable_if<std::is_integral<NativeT>::value>::type* = nullptr>
-bool SafeLess(const NativeT& a, const NativeT& b) {
-  return a < b;
-}
-
-template <typename NativeT, typename std::enable_if<std::is_floating_point<
-                                NativeT>::value>::type* = nullptr>
-bool SafeLess(const NativeT& a, const NativeT& b) {
-  bool lhs_is_negative = std::signbit(a);
-  bool rhs_is_negative = std::signbit(b);
-  // If the signs are different, we can just compare the signs.
-  if (lhs_is_negative != rhs_is_negative) {
-    return lhs_is_negative && !rhs_is_negative;
-  }
-  bool lhs_nan = std::isnan(a);
-  bool rhs_nan = std::isnan(b);
-  // Exactly one number is nan?
-  if (lhs_nan != rhs_nan) {
-    if (lhs_nan) {
-      return lhs_is_negative;
-    }
-    return !rhs_is_negative;
-  }
-  return a < b;
-}
-
-template <typename NativeT,
-          typename std::enable_if<
-              std::is_same<NativeT, bfloat16>::value ||
-              std::is_same<NativeT, Eigen::half>::value>::type* = nullptr>
-bool SafeLess(const NativeT& a, const NativeT& b) {
-  return SafeLess(static_cast<float>(a), static_cast<float>(b));
-}
-
 // ToArithmeticSafeType(T t):
 //  - converts `t` to the bitwise-equivalent `unsigned T` if T is a signed
 //    integer, and
@@ -462,9 +423,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return HandleNegate<ReturnT>(negate);
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  template <typename NativeT,
+            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
+                nullptr>
   Status HandleSign(HloInstruction* sign) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
                         ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) {
@@ -474,6 +435,23 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_same<NativeT, bfloat16>::value ||
+                std::is_same<NativeT, Eigen::half>::value ||
+                std::is_floating_point<NativeT>::value>::type* = nullptr>
+  Status HandleSign(HloInstruction* sign) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
+                        ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) {
+                          return std::isnan(elem_operand)
+                                     ? elem_operand
+                                     : std::copysign(
+                                           elem_operand != ElementwiseT(0),
+                                           elem_operand);
+                        }));
+    return Status::OK();
+  }
+
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
@@ -916,9 +894,29 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return HandleShiftRightLogical<ElementwiseT>(shrl);
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  // Special case for integral type due to MSVC's std::isnan being unable to
+  // handle integral type.
+  template <typename NativeT,
+            typename std::enable_if<!is_complex_t<NativeT>::value &&
+                                    std::is_integral<NativeT>::value>::type* =
+                nullptr>
+  Status HandleClamp(HloInstruction* clamp) {
+    std::function<ElementwiseT(ElementwiseT, ElementwiseT, ElementwiseT)>
+        clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) {
+          return static_cast<ElementwiseT>(
+              std::min(high, std::max(value, low)));
+        };
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[clamp],
+        ElementwiseTernaryOp(clamp,
+                             std::move(ConvertTernaryFunction(clamp_op))));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<!is_complex_t<NativeT>::value &&
+                                    !std::is_integral<NativeT>::value>::type* =
+                nullptr>
   Status HandleClamp(HloInstruction* clamp) {
     std::function<ElementwiseT(ElementwiseT, ElementwiseT, ElementwiseT)>
         clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) {
@@ -926,7 +924,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             return static_cast<ElementwiseT>(NAN);
           }
           return static_cast<ElementwiseT>(
-              std::fmin(high, std::fmax(value, low)));
+              std::min<NativeT>(high, std::max<NativeT>(value, low)));
         };
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[clamp],
@@ -2693,12 +2691,25 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         const Literal& high =
             parent_->GetEvaluatedLiteralFor(random->operand(1));
 
-        std::uniform_real_distribution<NativeT> generator(
-            low.Get<NativeT>({}), high.Get<NativeT>({}));
-
+        // std::uniform_real_distribution(a, b) can sometimes return a value
+        // equal to b.  Unclear if this is a spec bug or an implementation bug
+        // or WAI [0] [1] [2].  Anyway for our purposes we want a half-open
+        // interval, so we have to re-sample if we get `b` out.
+        //
+        // [0] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63176
+        // [1] https://bugs.llvm.org/show_bug.cgi?id=18767
+        // [2] http://open-std.org/JTC1/SC22/WG21/docs/lwg-active.html#2524
+        auto low_val = low.Get<NativeT>({});
+        auto high_val = high.Get<NativeT>({});
+        std::uniform_real_distribution<NativeT> generator(low_val, high_val);
         TF_RETURN_IF_ERROR(
             result.Populate<NativeT>([&](absl::Span<const int64> /*indexes*/) {
-              return generator(parent_->engine_);
+              while (true) {
+                NativeT v = generator(parent_->engine_);
+                if (v != high_val) {
+                  return v;
+                }
+              }
             }));
         break;
       }
@@ -2832,21 +2843,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       absl::Span<HloInstruction* const> start_indices,
       const Shape& result_shape) {
     std::vector<int64> start;
-    // TODO(b/118437727): Remove the R1 code-path. Note that to distinguish
-    // between the cases, this currently assumes there is at least 1 index. That
-    // is wrong in the general case, because for scalar indices, if the operand
-    // is scalar, then there are no indices. This problem with resolve itself.
-    const HloInstruction* first_index = start_indices[0];
-    if (first_index->shape().rank() == 1) {
-      auto start_indices_typed =
-          parent_->GetEvaluatedLiteralFor(first_index).data<IndexT>();
-      start = std::vector<int64>(start_indices_typed.begin(),
-                                 start_indices_typed.end());
-    } else {
-      for (HloInstruction* index : start_indices) {
-        start.push_back(
-            parent_->GetEvaluatedLiteralFor(index).GetFirstElement<IndexT>());
-      }
+
+    for (HloInstruction* index : start_indices) {
+      start.push_back(
+          parent_->GetEvaluatedLiteralFor(index).GetFirstElement<IndexT>());
     }
 
     // Clamp the start indices so the slice is in-bounds w.r.t the operand.
@@ -2879,22 +2879,11 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     auto result = operand_literal.Clone();
     const auto rank = result.shape().rank();
     std::vector<int64> start;
-    // TODO(b/118437727): Remove the R1 code-path. Note that to distinguish
-    // between the cases, this currently assumes there is at least 1 index. That
-    // is wrong in the general case, because for scalar indices, if the operand
-    // is scalar, then there are no indices. This problem with resolve itself.
-    const HloInstruction* first_index = start_indices[0];
-    if (first_index->shape().rank() == 1) {
-      auto start_indices_typed =
-          parent_->GetEvaluatedLiteralFor(first_index).data<IndexT>();
-      start = std::vector<int64>(start_indices_typed.begin(),
-                                 start_indices_typed.end());
-    } else {
-      for (HloInstruction* index : start_indices) {
-        start.push_back(
-            parent_->GetEvaluatedLiteralFor(index).GetFirstElement<IndexT>());
-      }
+    for (HloInstruction* index : start_indices) {
+      start.push_back(
+          parent_->GetEvaluatedLiteralFor(index).GetFirstElement<IndexT>());
     }
+
     // Clamp the update start indices so the slice is in-bounds w.r.t the
     // operand.
     for (int64 i = 0; i < rank; ++i) {
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 46ee99923ee9b6d852e6190cc8de6afe0b99457e..49300b3ffe2f755d103af7877ab3fee5298eeb3e 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -38,7 +38,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -536,7 +535,12 @@ stylesheet=<
     }
   }
 
-  return StrFormat(fmt, graph_label, StrJoin(edge_css_rules, "\n"));
+  // Browsers require that we URI-encode the contents of our data URI.  (It
+  // seems this was a relatively recent change?) In practice, this means that we
+  // need to escape '#'.
+  return StrFormat(
+      fmt, graph_label,
+      absl::StrReplaceAll(StrJoin(edge_css_rules, "\n"), {{"#", "%23"}}));
 }
 
 string HloDotDumper::Footer() { return StrCat(StrJoin(edges_, "\n"), "\n}"); }
@@ -1011,6 +1015,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kConvolution:
     case HloOpcode::kDot:
     case HloOpcode::kFft:
+    case HloOpcode::kTriangularSolve:
       return kDarkBlue;
     case HloOpcode::kReducePrecision:
       return kRed;
@@ -1281,8 +1286,9 @@ namespace {
 
 // Gets a NodeFilter that includes roughly all instructions whose distance from
 // root is <= radius.
-NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
-                                      int64 radius) {
+NodeFilter MakeNodeRadiusAroundFilter(
+    const HloInstruction* root, int64 radius,
+    const absl::flat_hash_set<const HloInstruction*>& boundary) {
   // First, find the neighborhood of nodes with distance from root <= radius.
   // These nodes are our initial set of "normal" nodes.
   absl::flat_hash_map<const HloInstruction*, NodeFilterResult> nodes;
@@ -1298,6 +1304,9 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
     if (depth == radius) {
       continue;
     }
+    if (boundary.contains(instr)) {
+      continue;
+    }
 
     // Traverse into instr's operands.
     //
@@ -1446,9 +1455,6 @@ string SaveGraph(const string& graph,
     case GraphRendererInterface::DOT_GRAPH:
       file_extension = ".dot";
       break;
-    case GraphRendererInterface::TF_GRAPHDEF:
-      file_extension = ".pbtxt";
-      break;
   }
   string path = JoinPath(dest_path, StrCat("hlo_graph_", output_num++, "."));
   auto status = Status::OK();
@@ -1486,25 +1492,27 @@ string ExportGraph(const string& graph,
 
 }  // namespace
 
+string HloComputationToDotGraph(const HloComputation& computation,
+                                const DotGraphOptions& options) {
+  DebugOptions default_debug_options;
+  return HloDotDumper(&computation, options.label,
+                      options.debug_options ? *options.debug_options
+                                            : default_debug_options,
+                      options.show_backend_config, options.profile,
+                      NodeFilter())
+      .Dump();
+}
+
 string DumpGraph(const HloComputation& computation, const string& label,
                  const DebugOptions& debug_options,
                  const HloExecutionProfile* hlo_execution_profile,
                  bool show_backend_config) {
   GraphRendererInterface::GraphKind graph_kind;
-  string graph;
-  if (debug_options.xla_hlo_dump_as_graphdef()) {
-    HloTfGraphBuilder builder(debug_options);
-    TF_CHECK_OK(builder.AddComputation(computation));
-    CHECK(tensorflow::protobuf::TextFormat::PrintToString(builder.GetGraphDef(),
-                                                          &graph));
-    graph_kind = GraphRendererInterface::TF_GRAPHDEF;
-  } else {
-    graph =
-        HloDotDumper(&computation, label, debug_options, show_backend_config,
-                     hlo_execution_profile, NodeFilter())
-            .Dump();
-    graph_kind = GraphRendererInterface::DOT_GRAPH;
-  }
+  string graph =
+      HloDotDumper(&computation, label, debug_options, show_backend_config,
+                   hlo_execution_profile, NodeFilter())
+          .Dump();
+  graph_kind = GraphRendererInterface::DOT_GRAPH;
 
   string graph_url = ExportGraph(graph, graph_kind, debug_options);
   LOG(INFO) << "computation " << computation.name() << " [" << label
@@ -1512,12 +1520,13 @@ string DumpGraph(const HloComputation& computation, const string& label,
   return graph_url;
 }
 
-string DumpNeighborhoodAround(const HloInstruction& node, int radius,
-                              bool show_backend_config) {
+string DumpNeighborhoodAround(
+    const HloInstruction& node, int radius, bool show_backend_config,
+    const absl::flat_hash_set<const HloInstruction*>& boundary) {
   auto debug_options = node.GetModule()->config().debug_options();
   string label =
       StrCat("Neighborhood of ", radius, " nodes around ", node.name());
-  NodeFilter filter = MakeNodeRadiusAroundFilter(&node, radius);
+  NodeFilter filter = MakeNodeRadiusAroundFilter(&node, radius, boundary);
   string graph =
       HloDotDumper(node.parent(), label, debug_options, show_backend_config,
                    /*profile=*/nullptr, filter)
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
index 8e51454ef1cf992386cc7325e32705c08bf7712f..563cea42371d370b4c9ea739418692fd74dca799 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
@@ -26,13 +26,23 @@ limitations under the License.
 namespace xla {
 namespace hlo_graph_dumper {
 
+// Converts a HLO module to a DOT (graphviz) graph. Returns the dot graph as
+// a string.
+struct DotGraphOptions {
+  absl::string_view label;
+  const DebugOptions* debug_options = nullptr;
+  const HloExecutionProfile* profile = nullptr;
+  bool show_backend_config = false;
+};
+string HloComputationToDotGraph(const HloComputation& computation,
+                                const DotGraphOptions& options);
+
 // Abstract interface for classes that render HLO graphs (e.g. DOT graph,
-// tensorflow GraphDef).
+// tensorflow GraphDef) to files or services.
 class GraphRendererInterface {
  public:
   enum GraphKind {
     DOT_GRAPH,
-    TF_GRAPHDEF,
   };
 
   virtual ~GraphRendererInterface() = default;
@@ -63,8 +73,12 @@ string DumpGraph(const HloComputation& computation, const string& label,
 // The number of nodes dumped is controlled by the radius parameter, which
 // (roughly) corresponds to the max distance a node may be from the primary node
 // before it's omitted from the graph.
-string DumpNeighborhoodAround(const HloInstruction& node, int radius,
-                              bool show_backend_config = false);
+//
+// The optional boundary specifies a set of boundary nodes, beyond which nodes
+// will be omitted even if they are within the radius.
+string DumpNeighborhoodAround(
+    const HloInstruction& node, int radius, bool show_backend_config = false,
+    const absl::flat_hash_set<const HloInstruction*>& boundary = {});
 
 // Dumps nodes on any of the paths from `from` to `to`.  If there are more than
 // max_nodes on all paths, restricts to the max_nodes nodes on the shortest
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
index a46a107723de30176241aae01b268a8c10d991d3..265bfdf7f989b0821a98c1f774cb408b78f348fe 100644
--- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 1b677bc25c1d3dfd0205c3e0dfbf1fd30c646fd4..5a6915005838c5c6f0ca0dd6563b2f17f6274a64 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -132,6 +132,14 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
                               absl::Span<const int64>(fft_length));
       break;
     }
+    case HloOpcode::kTriangularSolve: {
+      TF_RET_CHECK(proto.operand_ids_size() == 2)
+          << "Triangular solve instruction should have 2 operands but sees "
+          << proto.operand_ids_size();
+      instruction = CreateTriangularSolve(shape, operands(0), operands(1),
+                                          proto.triangular_solve_options());
+      break;
+    }
     case HloOpcode::kSend:
       TF_RET_CHECK(proto.operand_ids_size() == 2)
           << "Send instruction should have 2 operand but sees "
@@ -201,11 +209,12 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << proto.operand_ids_size();
       TF_RET_CHECK(proto.dimensions().size() == 1)
           << "Sort instruction should have 1 dimension";
+      TF_RET_CHECK(proto.called_computation_ids_size() == 1)
+          << "Sort instruction should one called computation but sees "
+          << proto.called_computation_ids_size();
       auto sort_operands = all_operands();
-      HloInstruction* keys = sort_operands[0];
-      instruction = CreateSort(
-          shape, proto.dimensions(0), keys,
-          absl::Span<HloInstruction* const>(sort_operands).subspan(1));
+      instruction = CreateSort(shape, proto.dimensions(0), all_operands(),
+                               computations(0), proto.is_stable());
       break;
     }
     case HloOpcode::kTranspose:
@@ -295,6 +304,10 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     case HloOpcode::kParameter:
       instruction =
           CreateParameter(proto.parameter_number(), shape, proto.name());
+      if (!proto.parameter_replication().replicated_at_leaf_buffers().empty()) {
+        instruction->set_parameter_replicated_at_leaf_buffers(
+            proto.parameter_replication().replicated_at_leaf_buffers());
+      }
       break;
     case HloOpcode::kGetTupleElement:
       TF_RET_CHECK(proto.operand_ids_size() == 1)
@@ -790,6 +803,13 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
                                               fft_length);
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateTriangularSolve(const Shape& shape, HloInstruction* a,
+                                      HloInstruction* b,
+                                      const TriangularSolveOptions& options) {
+  return absl::make_unique<HloTriangularSolveInstruction>(shape, a, b, options);
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDot(
     const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
     const DotDimensionNumbers& dimension_numbers,
@@ -1153,9 +1173,11 @@ HloInstruction::CreateBroadcastSequence(
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSort(
-    const Shape& shape, int64 dimension, HloInstruction* keys,
-    absl::Span<HloInstruction* const> values) {
-  return absl::make_unique<HloSortInstruction>(shape, dimension, keys, values);
+    const Shape& shape, int64 dimension,
+    absl::Span<HloInstruction* const> operands, HloComputation* compare,
+    bool is_stable) {
+  return absl::make_unique<HloSortInstruction>(shape, dimension, operands,
+                                               compare, is_stable);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateFusion(
@@ -1347,6 +1369,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kDot:
     case HloOpcode::kDomain:
     case HloOpcode::kGetDimensionSize:
+    case HloOpcode::kTriangularSolve:
       clone = CloneWithNewOperandsImpl(shape, new_operands, context);
       break;
     // Unary ops.
@@ -1802,6 +1825,7 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kDot:
     case HloOpcode::kDomain:
     case HloOpcode::kGetDimensionSize:
+    case HloOpcode::kTriangularSolve:
       LOG(FATAL) << "Base class impl called for opcode with subclass: "
                  << opcode();
   }
@@ -1856,7 +1880,11 @@ Status HloInstruction::ReplaceUseWith(HloInstruction* user,
       << "this shape: " << ShapeUtil::HumanString(shape())
       << ", replacement shape: "
       << ShapeUtil::HumanString(new_producer->shape());
+  return ReplaceUseWithDifferentShape(user, new_producer);
+}
 
+Status HloInstruction::ReplaceUseWithDifferentShape(
+    HloInstruction* user, HloInstruction* new_producer) {
   VLOG(3) << "Replacing uses of " << name() << " in " << user->name()
           << " with " << new_producer->name();
 
@@ -1952,6 +1980,7 @@ HloComputation* HloInstruction::to_apply() const {
     case HloOpcode::kReduce:
     case HloOpcode::kAllReduce:
     case HloOpcode::kScatter:
+    case HloOpcode::kSort:
       CHECK_EQ(called_computations_.size(), 1);
       return called_computations_[0];
     default:
@@ -1971,6 +2000,7 @@ void HloInstruction::set_to_apply(HloComputation* computation) {
     case HloOpcode::kReduce:
     case HloOpcode::kAllReduce:
     case HloOpcode::kScatter:
+    case HloOpcode::kSort:
       CHECK_EQ(called_computations_.size(), 1);
       called_computations_[0] = computation;
       break;
@@ -2243,7 +2273,8 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
                opcode() == HloOpcode::kReduceWindow ||
                opcode() == HloOpcode::kReduce ||
                opcode() == HloOpcode::kAllReduce ||
-               opcode() == HloOpcode::kScatter) {
+               opcode() == HloOpcode::kScatter ||
+               opcode() == HloOpcode::kSort) {
       extra.push_back(
           StrCat("to_apply=", PrintName(to_apply()->name(), options)));
     } else if (!called_computations().empty()) {
@@ -2280,6 +2311,7 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
       case HloOpcode::kReduce:
       case HloOpcode::kAllReduce:
       case HloOpcode::kScatter:
+      case HloOpcode::kSort:
         extra.push_back(
             StrCat("to_apply=\n", to_apply()->ToString(new_options)));
         break;
@@ -2585,6 +2617,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleIota(this);
     case HloOpcode::kGetDimensionSize:
       return visitor->HandleGetDimensionSize(this);
+    case HloOpcode::kTriangularSolve:
+      return visitor->HandleTriangularSolve(this);
 
     // These opcodes are not handled here.
     case HloOpcode::kTrace:
@@ -3296,6 +3330,19 @@ int64 HloInstruction::parameter_number() const {
   return Cast<HloParameterInstruction>(this)->parameter_number();
 }
 
+void HloInstruction::set_parameter_replicated_at_leaf_buffers(
+    absl::Span<const bool> parameter_replicated_at_leaf_buffers) {
+  return Cast<HloParameterInstruction>(this)
+      ->set_parameter_replicated_at_leaf_buffers(
+          parameter_replicated_at_leaf_buffers);
+}
+
+const absl::optional<std::vector<bool>>&
+HloInstruction::parameter_replicated_at_leaf_buffers() const {
+  return Cast<HloParameterInstruction>(this)
+      ->parameter_replicated_at_leaf_buffers();
+}
+
 int64 HloInstruction::tuple_index() const {
   return Cast<HloGetTupleElementInstruction>(this)->tuple_index();
 }
@@ -3452,4 +3499,8 @@ const DomainMetadata& HloInstruction::operand_side_metadata() const {
 const DomainMetadata& HloInstruction::user_side_metadata() const {
   return Cast<HloDomainInstruction>(this)->user_side_metadata();
 }
+
+const TriangularSolveOptions& HloInstruction::triangular_solve_options() const {
+  return Cast<HloTriangularSolveInstruction>(this)->triangular_solve_options();
+}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index c11d29d33e918a363a7df5c4ec4e53dbf407e71e..33cbb9a41bab838e02813e75e2ca6327f785b007 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -47,6 +47,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_sharding.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -384,6 +385,14 @@ class HloInstruction {
 
   // Creates a random number generation instruction that fills a shape with
   // random numbers from a given distribution.
+  //
+  // The parameters to the instruction are interpreted as follows:
+  //
+  //  - If `distribution` is RNG_UNIFORM, generates a number in range
+  //    [param0, param1).
+  //
+  //  - If `distribution` is RNG_NORMAL, generates a normally-distributed value
+  //    with mean `param0` and standard deviation `param1`.
   static std::unique_ptr<HloInstruction> CreateRng(
       const Shape& shape, RandomDistribution distribution,
       absl::Span<HloInstruction* const> parameters);
@@ -435,6 +444,10 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand, FftType fft_type,
       absl::Span<const int64> fft_length);
 
+  static std::unique_ptr<HloInstruction> CreateTriangularSolve(
+      const Shape& shape, HloInstruction* a, HloInstruction* b,
+      const TriangularSolveOptions& options);
+
   // Creates a dot op with operands 'lhs' and 'rhs' with contracting and batch
   // dimensions specified in 'dimension_numbers'.
   static std::unique_ptr<HloInstruction> CreateDot(
@@ -489,7 +502,7 @@ class HloInstruction {
   // Data is sent/received according to the (source_replica_id,
   // target_replica_id) pairs in `source_target_pairs`. If a replica id is not a
   // target_replica_id in any pair, the output on that replica is a tensor
-  // conssits of 0(s) in `shape`.
+  // consists of 0(s) in `shape`.
   static std::unique_ptr<HloInstruction> CreateCollectivePermute(
       const Shape& shape, HloInstruction* operand,
       const std::vector<std::pair<int64, int64>>& source_target_pairs);
@@ -598,7 +611,6 @@ class HloInstruction {
   // f_2 = f(f_1.tuple_element(0), ..., f_1.tuple_element(N), input0.value1,
   // ..., inputN.value1)
   // ...
-  // TODO(b/112040122): Add support to this in HLO passes and in backends.
   static std::unique_ptr<HloInstruction> CreateReduce(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
       absl::Span<HloInstruction* const> init_values,
@@ -671,10 +683,15 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand,
       absl::Span<const int64> dimensions);
 
-  // Creates a sort op, with a keys operand, and optional values operands.
+  // Creates a n-ary sort op with a 'compare' computation which is used for
+  // comparisons in the sorting algorithm. 'compare' gets 2 * n parameters,
+  // where parameters 2 * i and 2 * i + 1 are the values of the i-th operand at
+  // specific index positions which should be compared, and should return a
+  // PRED. 'is_stable' specifies whether stable sorting is required.
   static std::unique_ptr<HloInstruction> CreateSort(
-      const Shape& shape, int64 dimension, HloInstruction* keys,
-      absl::Span<HloInstruction* const> values = {});
+      const Shape& shape, int64 dimension,
+      absl::Span<HloInstruction* const> operands, HloComputation* compare,
+      bool is_stable);
 
   // Creates a while instruction, given a condition computation, a body
   // computation, and the initial value for the input of the computations. For
@@ -932,6 +949,10 @@ class HloInstruction {
   // operands of it which could be created due to this replacement.
   Status ReplaceUseWith(HloInstruction* user, HloInstruction* new_producer);
 
+  // Same as ReplaceUseWith(), but new_producer can have a different shape.
+  Status ReplaceUseWithDifferentShape(HloInstruction* user,
+                                      HloInstruction* new_producer);
+
   // Replaces the specified operand with new_operand. The old and new operands
   // must have compatible shapes ignoring floating-point precision.
   //
@@ -1242,6 +1263,10 @@ class HloInstruction {
   // on the instruction's existing name.
   void UniquifyName(NameUniquer* name_uniquer);
 
+  // Clear the unique ID of the instruction so that it can be re-assigned, such
+  // as for the purpose of compacting the instruction unique IDs.
+  void ClearUniqueIdInternal() { unique_id_ = -1; }
+
   // Set the unique id for this instruction to "id"
   void SetUniqueId(int id) {
     CHECK_EQ(unique_id_, -1);  // Should not be assigned already
@@ -1275,6 +1300,9 @@ class HloInstruction {
     backend_config_ = std::move(config_str);
   }
 
+  bool is_default_config() const { return is_default_config_; }
+  void set_default_config() { is_default_config_ = true; }
+
   // Returns a string representation of a proto in the format used by
   // raw_backend_config_string.
   //
@@ -1445,6 +1473,15 @@ class HloInstruction {
   // Delegates to HloParameterInstruction::parameter_number.
   int64 parameter_number() const;
 
+  // Delegates to
+  // HloParameterInstruction::set_parameter_replicated_at_leaf_buffers.
+  void set_parameter_replicated_at_leaf_buffers(
+      absl::Span<const bool> parameter_replicated_at_leaf_buffers);
+
+  // Delegates to HloParameterInstruction::parameter_replicated_at_leaf_buffers.
+  const absl::optional<std::vector<bool>>&
+  parameter_replicated_at_leaf_buffers() const;
+
   // Delegates to HloGetTupleElementInstruction::tuple_index.
   int64 tuple_index() const;
 
@@ -1554,6 +1591,9 @@ class HloInstruction {
   // Delegates to HloDomainInstruction::user_side_metadata().
   const DomainMetadata& user_side_metadata() const;
 
+  // Delegates to HloTriangularSolveInstruction::triangular_solve_options().
+  const TriangularSolveOptions& triangular_solve_options() const;
+
   // Old methods kept for smooth subclassing transition END.
 
  protected:
@@ -1720,6 +1760,10 @@ class HloInstruction {
   // HLO. See the documentation on backend_config().
   string backend_config_;
 
+  // This field is assigned to true when backend_config_ is assigned to
+  // a default configuration.
+  bool is_default_config_ = false;
+
   // String identifier for instruction.
   string name_;
 
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 785206bf7753abaef5788365fe10217b8b74ccc6..905a6fe08b4430ad862edf0886a57c9f7e9f7977 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -201,6 +201,57 @@ std::unique_ptr<HloInstruction> HloFftInstruction::CloneWithNewOperandsImpl(
                                               fft_length_);
 }
 
+HloTriangularSolveInstruction::HloTriangularSolveInstruction(
+    const Shape& shape, HloInstruction* a, HloInstruction* b,
+    const TriangularSolveOptions& options)
+    : HloInstruction(HloOpcode::kTriangularSolve, shape),
+      triangular_solve_options_(options) {
+  AppendOperand(a);
+  AppendOperand(b);
+}
+
+HloInstructionProto HloTriangularSolveInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  *proto.mutable_triangular_solve_options() = triangular_solve_options_;
+  return proto;
+}
+
+std::vector<string> HloTriangularSolveInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  return {
+      StrCat("left_side=",
+             triangular_solve_options_.left_side() ? "true" : "false"),
+      StrCat("lower=", triangular_solve_options_.lower() ? "true" : "false"),
+      StrCat("unit_diagonal=",
+             triangular_solve_options_.unit_diagonal() ? "true" : "false"),
+      StrCat("transpose_a=", TriangularSolveOptions_Transpose_Name(
+                                 triangular_solve_options_.transpose_a()))};
+}
+
+bool HloTriangularSolveInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& casted_other =
+      static_cast<const HloTriangularSolveInstruction&>(other);
+  const auto& options = triangular_solve_options();
+  const auto& other_options = casted_other.triangular_solve_options();
+
+  return options.left_side() == other_options.left_side() &&
+         options.lower() == other_options.lower() &&
+         options.unit_diagonal() == other_options.unit_diagonal() &&
+         options.transpose_a() == other_options.transpose_a();
+}
+
+std::unique_ptr<HloInstruction>
+HloTriangularSolveInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape, absl::Span<HloInstruction* const> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 2);
+  return absl::make_unique<HloTriangularSolveInstruction>(
+      shape, new_operands[0], new_operands[1], triangular_solve_options());
+}
+
 HloSendRecvInstruction::HloSendRecvInstruction(HloOpcode opcode,
                                                const Shape& shape,
                                                int64 channel_id,
@@ -609,14 +660,17 @@ std::unique_ptr<HloInstruction> HloReduceInstruction::CloneWithNewOperandsImpl(
                                                  dimensions(), to_apply());
 }
 
-HloSortInstruction::HloSortInstruction(const Shape& shape, int64 dimension,
-                                       HloInstruction* keys,
-                                       absl::Span<HloInstruction* const> values)
-    : HloInstruction(HloOpcode::kSort, shape), dimensions_({dimension}) {
-  AppendOperand(keys);
-  for (auto* value : values) {
+HloSortInstruction::HloSortInstruction(
+    const Shape& shape, int64 dimension,
+    absl::Span<HloInstruction* const> operands, HloComputation* compare,
+    bool is_stable)
+    : HloInstruction(HloOpcode::kSort, shape),
+      dimensions_({dimension}),
+      is_stable_(is_stable) {
+  for (auto* value : operands) {
     AppendOperand(value);
   }
+  AppendComputation(compare);
 }
 
 HloInstructionProto HloSortInstruction::ToProto() const {
@@ -624,12 +678,18 @@ HloInstructionProto HloSortInstruction::ToProto() const {
   for (int64 dimension : dimensions_) {
     proto.add_dimensions(dimension);
   }
+  proto.set_is_stable(is_stable());
   return proto;
 }
 
 std::vector<string> HloSortInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
-  return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")};
+  std::vector<string> attrs;
+  attrs.push_back(StrCat("dimensions={", StrJoin(dimensions(), ","), "}"));
+  if (is_stable()) {
+    attrs.push_back("is_stable=true");
+  }
+  return attrs;
 }
 
 bool HloSortInstruction::IdenticalSlowPath(
@@ -637,15 +697,20 @@ bool HloSortInstruction::IdenticalSlowPath(
     const std::function<bool(const HloComputation*, const HloComputation*)>&
         eq_computations) const {
   const auto& casted_other = static_cast<const HloSortInstruction&>(other);
-  return dimensions() == casted_other.dimensions();
+  if (dimensions() != casted_other.dimensions()) {
+    return false;
+  }
+  if (is_stable() != casted_other.is_stable()) {
+    return false;
+  }
+  return eq_computations(to_apply(), other.to_apply());
 }
 
 std::unique_ptr<HloInstruction> HloSortInstruction::CloneWithNewOperandsImpl(
     const Shape& shape, absl::Span<HloInstruction* const> new_operands,
     HloCloneContext* context) const {
-  HloInstruction* keys = new_operands[0];
-  return absl::make_unique<HloSortInstruction>(shape, dimensions(0), keys,
-                                               new_operands.subspan(1));
+  return absl::make_unique<HloSortInstruction>(
+      shape, dimensions(0), new_operands, to_apply(), is_stable());
 }
 
 HloTransposeInstruction::HloTransposeInstruction(
@@ -1473,9 +1538,30 @@ HloParameterInstruction::HloParameterInstruction(int64 parameter_number,
 HloInstructionProto HloParameterInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   proto.set_parameter_number(parameter_number_);
+  if (parameter_replicated_at_leaf_buffers_) {
+    for (bool replicated : *parameter_replicated_at_leaf_buffers_) {
+      proto.mutable_parameter_replication()->add_replicated_at_leaf_buffers(
+          replicated);
+    }
+  }
   return proto;
 }
 
+std::vector<string> HloParameterInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& /*options*/) const {
+  std::vector<string> result;
+  if (!parameter_replicated_at_leaf_buffers_) {
+    return result;
+  }
+  std::vector<string> buffers_replicated_strs;
+  for (bool replicated : *parameter_replicated_at_leaf_buffers_) {
+    buffers_replicated_strs.push_back(replicated ? "true" : "false");
+  }
+  result.push_back(StrCat("parameter_replication={",
+                          StrJoin(buffers_replicated_strs, ","), "}"));
+  return result;
+}
+
 string HloParameterInstruction::OperandsToStringWithCanonicalNameMap(
     const HloPrintOptions& options,
     CanonicalNameMap* canonical_name_map) const {
@@ -1963,6 +2049,17 @@ bool HloCustomCallInstruction::IdenticalSlowPath(
   if (batch_group_count_ != casted_other.batch_group_count_) {
     return false;
   }
+  if (layout_constrained() != casted_other.layout_constrained()) {
+    return false;
+  }
+  if (layout_constrained()) {
+    for (int64 i = 0; i < operand_shapes_with_layout_.size(); ++i) {
+      if (!ShapeUtil::Equal(operand_shapes_with_layout_[i],
+                            casted_other.operand_shapes_with_layout_[i])) {
+        return false;
+      }
+    }
+  }
   return custom_call_target_ == casted_other.custom_call_target_ &&
          opaque_ == casted_other.opaque_;
 }
@@ -1973,6 +2070,10 @@ HloCustomCallInstruction::CloneWithNewOperandsImpl(
     HloCloneContext* context) const {
   auto cloned = absl::make_unique<HloCustomCallInstruction>(
       shape, new_operands, custom_call_target(), opaque());
+  if (layout_constrained()) {
+    cloned->layout_constrained_ = true;
+    cloned->operand_shapes_with_layout_ = operand_shapes_with_layout();
+  }
   if (window_ != nullptr) {
     cloned->set_window(*window_);
   }
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index 1b4a94753cda8aba8d50836b9d51b7c3fd5807f6..4d23cb671f24623f56faa9b69015cef21752a799 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -131,6 +131,34 @@ class HloFftInstruction : public HloInstruction {
   std::vector<int64> fft_length_;
 };
 
+class HloTriangularSolveInstruction : public HloInstruction {
+ public:
+  explicit HloTriangularSolveInstruction(const Shape& shape, HloInstruction* a,
+                                         HloInstruction* b,
+                                         const TriangularSolveOptions& options);
+  const TriangularSolveOptions& triangular_solve_options() const {
+    return triangular_solve_options_;
+  }
+
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape, absl::Span<HloInstruction* const> new_operands,
+      HloCloneContext* context) const override;
+
+  TriangularSolveOptions triangular_solve_options_;
+};
+
 class HloSendRecvInstruction : public HloInstruction {
  public:
   // Returns the channel id associated with the instruction. The id is
@@ -418,8 +446,8 @@ class HloReduceInstruction : public HloInstruction {
 class HloSortInstruction : public HloInstruction {
  public:
   explicit HloSortInstruction(const Shape& shape, int64 dimension,
-                              HloInstruction* keys,
-                              absl::Span<HloInstruction* const> values = {});
+                              absl::Span<HloInstruction* const> operands,
+                              HloComputation* compare, bool is_stable);
   // Returns the dimension sizes or numbers associated with this instruction.
   const std::vector<int64>& dimensions() const override { return dimensions_; }
   int64 dimensions(int64 index) const override { return dimensions()[index]; }
@@ -432,6 +460,7 @@ class HloSortInstruction : public HloInstruction {
   HloInstruction* mutable_keys() { return mutable_operand(0); }
   // Returns the number of value operands.
   int64 values_count() const { return operand_count() - 1; }
+  bool is_stable() const { return is_stable_; }
 
  private:
   std::vector<string> ExtraAttributesToStringImpl(
@@ -446,6 +475,7 @@ class HloSortInstruction : public HloInstruction {
       HloCloneContext* context) const override;
 
   std::vector<int64> dimensions_;
+  bool is_stable_;
 };
 
 class HloTransposeInstruction : public HloInstruction {
@@ -787,10 +817,28 @@ class HloParameterInstruction : public HloInstruction {
   explicit HloParameterInstruction(int64 parameter_number, const Shape& shape,
                                    const string& name);
   int64 parameter_number() const { return parameter_number_; }
+
+  // Sets and gets the whether all replicas will receive the same parameter data
+  // for each leaf buffer in data parallelism.
+  void set_parameter_replicated_at_leaf_buffers(
+      absl::Span<const bool> parameter_replicated_at_leaf_buffers) {
+    CHECK_EQ(ShapeUtil::GetLeafCount(shape()),
+             parameter_replicated_at_leaf_buffers.size());
+    parameter_replicated_at_leaf_buffers_.emplace(
+        parameter_replicated_at_leaf_buffers.begin(),
+        parameter_replicated_at_leaf_buffers.end());
+  }
+  const absl::optional<std::vector<bool>>&
+  parameter_replicated_at_leaf_buffers() const {
+    return parameter_replicated_at_leaf_buffers_;
+  }
+
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
  private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
   bool IdenticalSlowPath(
       const HloInstruction& other,
       const std::function<bool(const HloComputation*, const HloComputation*)>&
@@ -804,6 +852,10 @@ class HloParameterInstruction : public HloInstruction {
       HloCloneContext* context) const override;
 
   int64 parameter_number_ = 0;
+
+  // Specifies whether each buffer has the same parameter value on all replicas
+  // in data parallelism.
+  absl::optional<std::vector<bool>> parameter_replicated_at_leaf_buffers_;
 };
 
 class HloGetTupleElementInstruction : public HloInstruction {
@@ -903,9 +955,7 @@ class HloOutfeedInstruction : public HloInstruction {
                                  HloInstruction* token_operand,
                                  absl::string_view outfeed_config);
   // Returns the shape for the Outfeed instruction.
-  const Shape& outfeed_shape() const {
-    return outfeed_shape_;
-  }
+  const Shape& outfeed_shape() const { return outfeed_shape_; }
   // Returns the config for the Outfeed instruction.
   const string& outfeed_config() const { return outfeed_config_; }
   // Returns a serialized representation of this instruction.
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc
index c1a642dfea7e464aaf93ffde1e26e07c1a4b73cd..2255383322873a39c7076e0f4f0dd541bc79014d 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.cc
+++ b/tensorflow/compiler/xla/service/hlo_lexer.cc
@@ -153,6 +153,8 @@ TokKind HloLexer::LexToken() {
         return LexPercent();
       case ':':
         return TokKind::kColon;
+      case '*':
+        return TokKind::kAsterisk;
       case '[':
         return TokKind::kLsquare;
       case ']':
@@ -464,6 +466,8 @@ string TokKindToString(TokKind kind) {
       return "kComma";
     case TokKind::kColon:
       return "kColon";
+    case TokKind::kAsterisk:
+      return "kAsterisk";
     case TokKind::kLsquare:
       return "kLsquare";
     case TokKind::kRsquare:
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h
index 16eed21617bc7254b67090d2b5acf9ccbd82f4ea..383fb4e862b8e32771879d055e663dc821a5c839 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.h
+++ b/tensorflow/compiler/xla/service/hlo_lexer.h
@@ -38,9 +38,10 @@ enum class TokKind {
   kError,
 
   // Tokens with no info.
-  kEqual,  // =
-  kComma,  // ,
-  kColon,  // :
+  kEqual,     // =
+  kComma,     // ,
+  kColon,     // :
+  kAsterisk,  // *
   kLsquare,
   kRsquare,  // [  ]
   kLbrace,
@@ -108,7 +109,7 @@ class HloLexer {
         LOG(FATAL) << "This token does not have string value";
     }
   }
-  tensorflow::int64 GetInt64Val() const {
+  int64 GetInt64Val() const {
     CHECK(GetKind() == TokKind::kInt);
     return token_state_.int64_val;
   }
@@ -171,7 +172,7 @@ class HloLexer {
     const char* token_start = nullptr;
     TokKind current_kind;
     string str_val;
-    tensorflow::int64 int64_val;
+    int64 int64_val;
     double decimal_val;
     PrimitiveType primitive_type_val;
   };
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 759ce6541ef144ad3f84bcb87ddabf507a034305..8322870cfd6a89fc6f863da8fd4a3576e8845cd7 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -246,6 +246,34 @@ HloModuleProto HloModule::ToProto() const {
   return proto;
 }
 
+Status HloModule::CheckUniqueNamesAndIdsForComputationsAndInstructions() const {
+  absl::flat_hash_set<string> computation_names;
+  absl::flat_hash_set<int> computation_ids;
+  absl::flat_hash_set<string> instruction_names;
+  absl::flat_hash_set<int> instruction_ids;
+
+  for (const HloComputation* computation : computations()) {
+    TF_RET_CHECK(!ContainsKey(computation_names, computation->name()))
+        << "Computation name is not unique: " << computation->name();
+    computation_names.insert(computation->name());
+
+    TF_RET_CHECK(!ContainsKey(computation_ids, computation->unique_id()))
+        << "Computation id is not unique: " << computation->unique_id();
+    computation_ids.insert(computation->unique_id());
+
+    for (const HloInstruction* instruction : computation->instructions()) {
+      TF_RET_CHECK(!ContainsKey(instruction_names, instruction->name()))
+          << "Instruction name is not unique: " << instruction->name();
+      instruction_names.insert(instruction->name());
+
+      TF_RET_CHECK(!ContainsKey(instruction_ids, instruction->unique_id()))
+          << "Instruction id is not unique: " << instruction->unique_id();
+      instruction_ids.insert(instruction->unique_id());
+    }
+  }
+  return Status::OK();
+}
+
 /* static */
 StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
     const HloModuleProto& proto, const HloModuleConfig& module_config) {
@@ -329,28 +357,8 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
                       DynamicParameterBinding::CreateFromProto(
                           proto.dynamic_parameter_binding()));
 
-  absl::flat_hash_set<string> computation_names;
-  absl::flat_hash_set<string> instruction_names;
-  absl::flat_hash_set<int> computation_ids;
-  absl::flat_hash_set<int> instruction_ids;
-  for (HloComputation* computation : module->computations()) {
-    TF_RET_CHECK(!ContainsKey(computation_names, computation->name()))
-        << "Computation name is not unique: " << computation->name();
-    computation_names.insert(computation->name());
-
-    TF_RET_CHECK(!ContainsKey(computation_ids, computation->unique_id()))
-        << "Computation id is not unique: " << computation->unique_id();
-    computation_ids.insert(computation->unique_id());
-    for (HloInstruction* instruction : computation->instructions()) {
-      TF_RET_CHECK(!ContainsKey(instruction_names, instruction->name()))
-          << "Instruction name is not unique: " << instruction->name();
-      instruction_names.insert(instruction->name());
-
-      TF_RET_CHECK(!ContainsKey(instruction_ids, instruction->unique_id()))
-          << "Instruction id is not unique: " << instruction->unique_id();
-      instruction_ids.insert(instruction->unique_id());
-    }
-  }
+  TF_RETURN_IF_ERROR(
+      module->CheckUniqueNamesAndIdsForComputationsAndInstructions());
 
   if (proto.has_schedule()) {
     TF_ASSIGN_OR_RETURN(
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index f1310e4b270898a21dbb4f86123edde4ba8993d0..b6fe6a5cdbd0934014f1152acd48c7a5973bead3 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -187,6 +187,7 @@ class HloModule {
   std::vector<HloComputation*> MakeNonfusionComputations() const;
 
   const HloModuleConfig& config() const { return config_; }
+  void set_config(HloModuleConfig& config) { config_ = config; }
 
   // Return a string representation of the module.
   //
@@ -264,6 +265,18 @@ class HloModule {
   const HloSchedule& schedule() const { return *schedule_; }
   HloSchedule& schedule() { return *schedule_; }
 
+  HloComputation* AddComputationAndUnifyNamesAndIds(
+      std::unique_ptr<HloComputation> computation, bool is_entry) {
+    computation->ClearUniqueIdInternal();
+    for (auto* instruction : computation->instructions()) {
+      instruction->ClearUniqueIdInternal();
+    }
+    return AddComputationInternal(std::move(computation), is_entry,
+                                  /*uniquify_identifiers=*/true);
+  }
+
+  Status CheckUniqueNamesAndIdsForComputationsAndInstructions() const;
+
  private:
   HloComputation* AddComputationInternal(
       std::unique_ptr<HloComputation> computation, bool is_entry,
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index 47734bc55cc00d605f4e318400be88639450343c..b877081be5775bf6c75a69ffeba28d0f2cc17f90 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -389,9 +389,10 @@ Status HloModuleGroupMetadata::AddCompanion(HloInstruction* instruction1,
                instruction1->opcode() == HloOpcode::kCall);
   VLOG(2) << "adding as companions:" << instruction1->ToString() << " and "
           << instruction2->ToString();
-
-  if (!ContainsKey(companion_set_index_, instruction1) &&
-      !ContainsKey(companion_set_index_, instruction2)) {
+  if (instruction1 == instruction2) {
+    return Status::OK();
+  } else if (!ContainsKey(companion_set_index_, instruction1) &&
+             !ContainsKey(companion_set_index_, instruction2)) {
     companion_sets_.push_back(
         absl::make_unique<std::vector<HloInstruction*>>());
     auto companion_set = companion_sets_.back().get();
@@ -419,7 +420,10 @@ Status HloModuleGroupMetadata::AddCompanion(HloInstruction* instruction1,
     for (HloInstruction* hlo : Companions(instruction2)) {
       companion_set_index_[hlo] = companion_set_index_[instruction1];
     }
-    companion_sets_.erase(companion_sets_.begin() + index_to_remove);
+    // We can't remove the set from the vector because companion_set_index_
+    // references sets by their index in this vector, so we reset to nullptr
+    // instead.
+    companion_sets_[index_to_remove].reset(nullptr);
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
index 3ed95c10504141139d83eb8679a0b8144b15ad0d..84f7f2f31339ae9e98ea2301b6e6d94fcf4dedbb 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -173,7 +173,8 @@ class HloModuleGroupMetadata {
   // Returns the number of modules for devices (excluding the host module).
   int64 GetDeviceModulesCount() const;
 
-  // Returns the companion instructions for the given instruction.
+  // Returns the companion set for the given instruction, including the
+  // instruction itself.
   //
   // Precondition: IsCompanionWhile(instruction) is true.
   const std::vector<HloInstruction*>& Companions(
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index bf9b3c811704870d9e0a36de5c38a013fba6dfe4..35626ba37541fc7a984ad05d12ebc22e9a08a550 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -137,6 +137,7 @@ namespace xla {
   V(kTanh, "tanh")                                           \
   V(kTrace, "trace")                                         \
   V(kTranspose, "transpose")                                 \
+  V(kTriangularSolve, "triangular-solve")                    \
   V(kTuple, "tuple", kHloOpcodeIsVariadic)                   \
   V(kTupleSelect, "tuple-select")                            \
   V(kWhile, "while")                                         \
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index dd8e8ff3a52a1cdf99a2b07b83e2891f90cf85bb..f448571082e52e4b81db1c68d1e1470935386139 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -63,8 +63,7 @@ HloSchedule ScheduleFromInstructionOrder(HloModule* module) {
 
 // Some functions accept either a linear index or a multi-dimensional index
 // (used for indexing into sparse literals).
-using LinearOrMultiIndex =
-    absl::variant<tensorflow::int64, absl::Span<const int64>>;
+using LinearOrMultiIndex = absl::variant<int64, absl::Span<const int64>>;
 
 // Parser for the HloModule::ToString() format text.
 class HloParser {
@@ -83,6 +82,7 @@ class HloParser {
   // Stand alone parsing utils for various aggregate data types.
   StatusOr<Shape> ParseShapeOnly();
   StatusOr<HloSharding> ParseShardingOnly();
+  StatusOr<std::vector<bool>> ParseParameterReplicationOnly();
   StatusOr<Window> ParseWindowOnly();
   StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbersOnly();
   StatusOr<PaddingConfig> ParsePaddingConfigOnly();
@@ -129,8 +129,8 @@ class HloParser {
   // given value. If the literal is dense, it myst have the default layout.
   //
   // `loc` should be the source location of the value.
-  bool SetValueInLiteral(LocTy loc, tensorflow::int64 value,
-                         LinearOrMultiIndex index, Literal* literal);
+  bool SetValueInLiteral(LocTy loc, int64 value, LinearOrMultiIndex index,
+                         Literal* literal);
   bool SetValueInLiteral(LocTy loc, double value, LinearOrMultiIndex index,
                          Literal* literal);
   bool SetValueInLiteral(LocTy loc, bool value, LinearOrMultiIndex index,
@@ -158,9 +158,9 @@ class HloParser {
   // Describes the start, limit, and stride on every dimension of the operand
   // being sliced.
   struct SliceRanges {
-    std::vector<tensorflow::int64> starts;
-    std::vector<tensorflow::int64> limits;
-    std::vector<tensorflow::int64> strides;
+    std::vector<int64> starts;
+    std::vector<int64> limits;
+    std::vector<int64> strides;
   };
 
   // The data parsed for the kDomain instruction.
@@ -180,9 +180,11 @@ class HloParser {
     kBracedInt64ListList,
     kHloComputation,
     kFftType,
+    kTriangularSolveTranspose,
     kWindow,
     kConvolutionDimensionNumbers,
     kSharding,
+    kParameterReplication,
     kInstructionList,
     kSliceRanges,
     kPaddingConfig,
@@ -247,21 +249,21 @@ class HloParser {
   bool ParseMetadata(OpMetadata* metadata);
   bool ParseSharding(OpSharding* sharding);
   bool ParseSingleSharding(OpSharding* sharding, bool lbrace_pre_lexed);
+  bool ParseParameterReplication(ParameterReplication* parameter_replication);
 
   // Parses the metadata behind a kDOmain instruction.
   bool ParseDomain(DomainData* domain);
 
   // Parses a sub-attribute of the window attribute, e.g.,size=1x2x3.
-  bool ParseDxD(const string& name, std::vector<tensorflow::int64>* result);
+  bool ParseDxD(const string& name, std::vector<int64>* result);
   // Parses window's pad sub-attriute, e.g., pad=0_0x3x3.
-  bool ParseWindowPad(std::vector<std::vector<tensorflow::int64>>* pad);
+  bool ParseWindowPad(std::vector<std::vector<int64>>* pad);
 
   bool ParseSliceRanges(SliceRanges* result);
   bool ParsePrecisionList(std::vector<PrecisionConfig::Precision>* result);
   bool ParseShapeList(std::vector<Shape>* result);
   bool ParseInt64List(const TokKind start, const TokKind end,
-                      const TokKind delim,
-                      std::vector<tensorflow::int64>* result);
+                      const TokKind delim, std::vector<int64>* result);
   // 'parse_and_add_item' is an lambda to parse an element in the list and add
   // the parsed element to the result. It's supposed to capture the result.
   bool ParseList(const TokKind start, const TokKind end, const TokKind delim,
@@ -276,12 +278,14 @@ class HloParser {
                            std::vector<bool>* dynamic_dimensions);
   bool ParseShape(Shape* result);
   bool ParseLayout(Layout* layout);
+  bool ParseTiles(std::vector<Tile>* tiles);
   bool ParseOpcode(HloOpcode* result);
   bool ParseFftType(FftType* result);
+  bool ParseTriangularSolveTranspose(TriangularSolveOptions::Transpose* result);
   bool ParseFusionKind(HloInstruction::FusionKind* result);
   bool ParseRandomDistribution(RandomDistribution* result);
   bool ParsePrecision(PrecisionConfig::Precision* result);
-  bool ParseInt64(tensorflow::int64* result);
+  bool ParseInt64(int64* result);
   bool ParseDouble(double* result);
   bool ParseComplex(std::complex<double>* result);
   bool ParseBool(bool* result);
@@ -643,6 +647,10 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
   std::unordered_map<string, AttrConfig> attrs;
   optional<OpSharding> sharding;
   attrs["sharding"] = {/*required=*/false, AttrTy::kSharding, &sharding};
+  optional<ParameterReplication> parameter_replication;
+  attrs["parameter_replication"] = {/*required=*/false,
+                                    AttrTy::kParameterReplication,
+                                    &parameter_replication};
   optional<std::vector<HloInstruction*>> predecessors;
   attrs["control-predecessors"] = {/*required=*/false, AttrTy::kInstructionList,
                                    &predecessors};
@@ -656,7 +664,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
   HloInstruction* instruction;
   switch (opcode) {
     case HloOpcode::kParameter: {
-      tensorflow::int64 parameter_number;
+      int64 parameter_number;
       if (!ParseToken(TokKind::kLparen,
                       "expects '(' before parameter number") ||
           !ParseInt64(&parameter_number)) {
@@ -688,7 +696,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kIota: {
-      optional<tensorflow::int64> iota_dimension;
+      optional<int64> iota_dimension;
       attrs["iota_dimension"] = {/*required=*/true, AttrTy::kInt64,
                                  &iota_dimension};
       if (!ParseOperands(&operands, /*expected_size=*/0) ||
@@ -853,13 +861,10 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kReplicaId: {
-      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+      if (!ParseOperands(&operands, /*expected_size=*/0) ||
           !ParseAttributes(attrs)) {
         return false;
       }
-      if (!operands.empty()) {
-        return false;
-      }
       instruction = builder->AddInstruction(HloInstruction::CreateReplicaId());
       break;
     }
@@ -894,17 +899,21 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kSort: {
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
+      optional<bool> is_stable = false;
+      attrs["is_stable"] = {/*required=*/false, AttrTy::kBool, &is_stable};
+      optional<HloComputation*> to_apply;
+      attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
+                           &to_apply};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs) ||
           dimensions->size() != 1) {
         return false;
       }
-      instruction = builder->AddInstruction(HloInstruction::CreateSort(
-          shape, dimensions->at(0),
-          /*keys=*/operands[0],
-          /*values=*/absl::Span<HloInstruction* const>(operands).subspan(1)));
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateSort(shape, dimensions->at(0), operands,
+                                     to_apply.value(), is_stable.value()));
       break;
     }
     case HloOpcode::kTuple: {
@@ -930,7 +939,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kRecv: {
-      optional<tensorflow::int64> channel_id;
+      optional<int64> channel_id;
       // If the is_host_transfer attribute is not present then default to false.
       optional<bool> is_host_transfer = false;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
@@ -946,7 +955,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kRecvDone: {
-      optional<tensorflow::int64> channel_id;
+      optional<int64> channel_id;
       // If the is_host_transfer attribute is not present then default to false.
       optional<bool> is_host_transfer = false;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
@@ -964,7 +973,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kSend: {
-      optional<tensorflow::int64> channel_id;
+      optional<int64> channel_id;
       // If the is_host_transfer attribute is not present then default to false.
       optional<bool> is_host_transfer = false;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
@@ -979,7 +988,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kSendDone: {
-      optional<tensorflow::int64> channel_id;
+      optional<int64> channel_id;
       // If the is_host_transfer attribute is not present then default to false.
       optional<bool> is_host_transfer = false;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
@@ -997,7 +1006,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kGetTupleElement: {
-      optional<tensorflow::int64> index;
+      optional<int64> index;
       attrs["index"] = {/*required=*/true, AttrTy::kInt64, &index};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
           !ParseAttributes(attrs)) {
@@ -1080,7 +1089,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     }
     case HloOpcode::kFft: {
       optional<FftType> fft_type;
-      optional<std::vector<tensorflow::int64>> fft_length;
+      optional<std::vector<int64>> fft_length;
       attrs["fft_type"] = {/*required=*/true, AttrTy::kFftType, &fft_type};
       attrs["fft_length"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &fft_length};
@@ -1092,8 +1101,40 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
           shape, operands[0], *fft_type, *fft_length));
       break;
     }
+    case HloOpcode::kTriangularSolve: {
+      optional<bool> left_side;
+      optional<bool> lower;
+      optional<bool> unit_diagonal;
+      optional<TriangularSolveOptions::Transpose> transpose_a;
+      attrs["left_side"] = {/*required=*/false, AttrTy::kBool, &left_side};
+      attrs["lower"] = {/*required=*/false, AttrTy::kBool, &lower};
+      attrs["unit_diagonal"] = {/*required=*/false, AttrTy::kBool,
+                                &unit_diagonal};
+      attrs["transpose_a"] = {/*required=*/false,
+                              AttrTy::kTriangularSolveTranspose, &transpose_a};
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      TriangularSolveOptions options;
+      if (left_side) {
+        options.set_left_side(*left_side);
+      }
+      if (lower) {
+        options.set_lower(*lower);
+      }
+      if (unit_diagonal) {
+        options.set_unit_diagonal(*unit_diagonal);
+      }
+      options.set_transpose_a(
+          transpose_a ? *transpose_a : TriangularSolveOptions::NO_TRANSPOSE);
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateTriangularSolve(
+              shape, operands[0], operands[1], options));
+      break;
+    }
     case HloOpcode::kBroadcast: {
-      optional<std::vector<tensorflow::int64>> broadcast_dimensions;
+      optional<std::vector<int64>> broadcast_dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &broadcast_dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -1105,7 +1146,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kConcatenate: {
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs) ||
@@ -1120,7 +1161,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       optional<HloComputation*> to_apply;
       attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
                            &to_apply};
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/false, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
@@ -1136,7 +1177,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       optional<HloComputation*> reduce_computation;
       attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
                            &reduce_computation};
-      optional<std::vector<tensorflow::int64>> dimensions_to_reduce;
+      optional<std::vector<int64>> dimensions_to_reduce;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions_to_reduce};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
@@ -1157,7 +1198,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kReverse: {
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -1201,7 +1242,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kDynamicSlice: {
-      optional<std::vector<tensorflow::int64>> dynamic_slice_sizes;
+      optional<std::vector<int64>> dynamic_slice_sizes;
       attrs["dynamic_slice_sizes"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &dynamic_slice_sizes};
       LocTy loc = lexer_.GetLoc();
@@ -1240,7 +1281,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kTranspose: {
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -1254,7 +1295,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     case HloOpcode::kBatchNormTraining: {
       optional<float> epsilon;
       attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
-      optional<tensorflow::int64> feature_index;
+      optional<int64> feature_index;
       attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
                                 &feature_index};
       if (!ParseOperands(&operands, /*expected_size=*/3) ||
@@ -1270,7 +1311,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     case HloOpcode::kBatchNormInference: {
       optional<float> epsilon;
       attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
-      optional<tensorflow::int64> feature_index;
+      optional<int64> feature_index;
       attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
                                 &feature_index};
       if (!ParseOperands(&operands, /*expected_size=*/5) ||
@@ -1287,7 +1328,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     case HloOpcode::kBatchNormGrad: {
       optional<float> epsilon;
       attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
-      optional<tensorflow::int64> feature_index;
+      optional<int64> feature_index;
       attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
                                 &feature_index};
       if (!ParseOperands(&operands, /*expected_size=*/5) ||
@@ -1369,8 +1410,8 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kReducePrecision: {
-      optional<tensorflow::int64> exponent_bits;
-      optional<tensorflow::int64> mantissa_bits;
+      optional<int64> exponent_bits;
+      optional<int64> mantissa_bits;
       attrs["exponent_bits"] = {/*required=*/true, AttrTy::kInt64,
                                 &exponent_bits};
       attrs["mantissa_bits"] = {/*required=*/true, AttrTy::kInt64,
@@ -1481,16 +1522,16 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kDot: {
-      optional<std::vector<tensorflow::int64>> lhs_contracting_dims;
+      optional<std::vector<int64>> lhs_contracting_dims;
       attrs["lhs_contracting_dims"] = {
           /*required=*/false, AttrTy::kBracedInt64List, &lhs_contracting_dims};
-      optional<std::vector<tensorflow::int64>> rhs_contracting_dims;
+      optional<std::vector<int64>> rhs_contracting_dims;
       attrs["rhs_contracting_dims"] = {
           /*required=*/false, AttrTy::kBracedInt64List, &rhs_contracting_dims};
-      optional<std::vector<tensorflow::int64>> lhs_batch_dims;
+      optional<std::vector<int64>> lhs_batch_dims;
       attrs["lhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List,
                                  &lhs_batch_dims};
-      optional<std::vector<tensorflow::int64>> rhs_batch_dims;
+      optional<std::vector<int64>> rhs_batch_dims;
       attrs["rhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List,
                                  &rhs_batch_dims};
       optional<std::vector<PrecisionConfig::Precision>> operand_precision;
@@ -1534,19 +1575,19 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kGather: {
-      optional<std::vector<tensorflow::int64>> offset_dims;
+      optional<std::vector<int64>> offset_dims;
       attrs["offset_dims"] = {/*required=*/true, AttrTy::kBracedInt64List,
                               &offset_dims};
-      optional<std::vector<tensorflow::int64>> collapsed_slice_dims;
+      optional<std::vector<int64>> collapsed_slice_dims;
       attrs["collapsed_slice_dims"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &collapsed_slice_dims};
-      optional<std::vector<tensorflow::int64>> start_index_map;
+      optional<std::vector<int64>> start_index_map;
       attrs["start_index_map"] = {/*required=*/true, AttrTy::kBracedInt64List,
                                   &start_index_map};
-      optional<tensorflow::int64> index_vector_dim;
+      optional<int64> index_vector_dim;
       attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64,
                                    &index_vector_dim};
-      optional<std::vector<tensorflow::int64>> slice_sizes;
+      optional<std::vector<int64>> slice_sizes;
       attrs["slice_sizes"] = {/*required=*/true, AttrTy::kBracedInt64List,
                               &slice_sizes};
 
@@ -1568,17 +1609,17 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kScatter: {
-      optional<std::vector<tensorflow::int64>> update_window_dims;
+      optional<std::vector<int64>> update_window_dims;
       attrs["update_window_dims"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &update_window_dims};
-      optional<std::vector<tensorflow::int64>> inserted_window_dims;
+      optional<std::vector<int64>> inserted_window_dims;
       attrs["inserted_window_dims"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &inserted_window_dims};
-      optional<std::vector<tensorflow::int64>> scatter_dims_to_operand_dims;
+      optional<std::vector<int64>> scatter_dims_to_operand_dims;
       attrs["scatter_dims_to_operand_dims"] = {/*required=*/true,
                                                AttrTy::kBracedInt64List,
                                                &scatter_dims_to_operand_dims};
-      optional<tensorflow::int64> index_vector_dim;
+      optional<int64> index_vector_dim;
       attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64,
                                    &index_vector_dim};
 
@@ -1619,7 +1660,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       return TokenError(StrCat("parsing not yet implemented for op: ",
                                HloOpcodeString(opcode)));
     case HloOpcode::kGetDimensionSize:
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -1644,6 +1685,18 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     instruction->set_sharding(
         HloSharding::FromProto(sharding.value()).ValueOrDie());
   }
+  if (parameter_replication) {
+    int leaf_count = ShapeUtil::GetLeafCount(instruction->shape());
+    const auto& replicated =
+        parameter_replication->replicated_at_leaf_buffers();
+    if (leaf_count != replicated.size()) {
+      return Error(lexer_.GetLoc(),
+                   StrCat("parameter has ", leaf_count,
+                          " leaf buffers, but parameter_replication has ",
+                          replicated.size(), " elements."));
+    }
+    instruction->set_parameter_replicated_at_leaf_buffers(replicated);
+  }
   if (predecessors) {
     for (auto* pre : *predecessors) {
       Status status = pre->AddControlDependencyTo(instruction);
@@ -1708,8 +1761,8 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
   LocTy loc = lexer_.GetLoc();
   bool maximal = false;
   bool replicated = false;
-  std::vector<tensorflow::int64> devices;
-  std::vector<tensorflow::int64> tile_assignment_dimensions;
+  std::vector<int64> devices;
+  std::vector<int64> tile_assignment_dimensions;
   while (lexer_.GetKind() != TokKind::kRbrace) {
     switch (lexer_.GetKind()) {
       case TokKind::kw_maximal:
@@ -1735,7 +1788,7 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
           }
 
           do {
-            tensorflow::int64 dim;
+            int64 dim;
             if (!ParseInt64(&dim)) {
               return false;
             }
@@ -1747,7 +1800,7 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
             return false;
           }
           do {
-            tensorflow::int64 device;
+            int64 device;
             if (!ParseInt64(&device)) {
               return false;
             }
@@ -1791,10 +1844,10 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
           "dimensions");
     }
     sharding->set_type(OpSharding::Type::OpSharding_Type_OTHER);
-    for (tensorflow::int64 dim : tile_assignment_dimensions) {
+    for (int64 dim : tile_assignment_dimensions) {
       sharding->add_tile_assignment_dimensions(dim);
     }
-    for (tensorflow::int64 device : devices) {
+    for (int64 device : devices) {
       sharding->add_tile_assignment_devices(device);
     }
   }
@@ -1803,6 +1856,32 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
   return true;
 }
 
+// parameter_replication ::=
+//   '{' ('true' | 'false')* (',' ('true' | 'false'))*  '}'
+bool HloParser::ParseParameterReplication(
+    ParameterReplication* parameter_replication) {
+  if (!ParseToken(TokKind::kLbrace,
+                  "expected '{' to start parameter_replication attribute")) {
+    return false;
+  }
+
+  if (lexer_.GetKind() != TokKind::kRbrace) {
+    do {
+      if (lexer_.GetKind() == TokKind::kw_true) {
+        parameter_replication->add_replicated_at_leaf_buffers(true);
+      } else if (lexer_.GetKind() == TokKind::kw_false) {
+        parameter_replication->add_replicated_at_leaf_buffers(false);
+      } else {
+        return false;
+      }
+      lexer_.Lex();
+    } while (EatIfPresent(TokKind::kComma));
+  }
+
+  return ParseToken(TokKind::kRbrace,
+                    "expected '}' to end parameter_replication attribute");
+}
+
 // domain ::= '{' 'kind=' domain_kind ',' 'entry=' entry_sharding ','
 //            'exit=' exit_sharding '}'
 bool HloParser::ParseDomain(DomainData* domain) {
@@ -1855,22 +1934,18 @@ bool HloParser::ParseInstructionNames(
                     "expects '}' at the end of instruction name list");
 }
 
-bool HloParser::SetValueInLiteral(LocTy loc, tensorflow::int64 value,
+bool HloParser::SetValueInLiteral(LocTy loc, int64 value,
                                   LinearOrMultiIndex index, Literal* literal) {
   const Shape& shape = literal->shape();
   switch (shape.element_type()) {
     case S8:
-      return SetValueInLiteralHelper<tensorflow::int8>(loc, value, index,
-                                                       literal);
+      return SetValueInLiteralHelper<int8>(loc, value, index, literal);
     case S16:
-      return SetValueInLiteralHelper<tensorflow::int16>(loc, value, index,
-                                                        literal);
+      return SetValueInLiteralHelper<int16>(loc, value, index, literal);
     case S32:
-      return SetValueInLiteralHelper<tensorflow::int32>(loc, value, index,
-                                                        literal);
+      return SetValueInLiteralHelper<int32>(loc, value, index, literal);
     case S64:
-      return SetValueInLiteralHelper<tensorflow::int64>(loc, value, index,
-                                                        literal);
+      return SetValueInLiteralHelper<int64>(loc, value, index, literal);
     case U8:
       return SetValueInLiteralHelper<tensorflow::uint8>(loc, value, index,
                                                         literal);
@@ -1958,7 +2033,7 @@ bool HloParser::SetValueInLiteralHelper(LocTy loc, ParsedElemT value,
   }
 
   // Check that the index is in range and assign into the literal
-  if (auto* linear_index = absl::get_if<tensorflow::int64>(&index)) {
+  if (auto* linear_index = absl::get_if<int64>(&index)) {
     if (*linear_index >= ShapeUtil::ElementsIn(literal->shape())) {
       return Error(loc, StrCat("trys to set value ", StringifyValue(value),
                                " to a literal in shape ",
@@ -2063,8 +2138,8 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
   // Create a literal with the given shape in default layout.
   *literal = LiteralUtil::CreateFromDimensions(
       shape.element_type(), AsInt64Slice(shape.dimensions()));
-  tensorflow::int64 nest_level = 0;
-  tensorflow::int64 linear_index = 0;
+  int64 nest_level = 0;
+  int64 linear_index = 0;
   // elems_seen_per_dim[i] is how many elements or sub-arrays we have seen for
   // the dimension i. For example, to parse f32[2,3] {{1, 2, 3}, {4, 5, 6}},
   // when we are parsing the 2nd '{' (right before '1'), we are seeing a
@@ -2072,13 +2147,13 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
   // the first '}' (right after '3'), it means the sub-array ends, and the
   // sub-array is supposed to contain exactly 3 elements, so check if
   // elems_seen_per_dim[1] is 3.
-  std::vector<tensorflow::int64> elems_seen_per_dim(rank);
+  std::vector<int64> elems_seen_per_dim(rank);
   auto get_index_str = [&elems_seen_per_dim](int dim) -> string {
-    std::vector<tensorflow::int64> elems_seen_until_dim(
-        elems_seen_per_dim.begin(), elems_seen_per_dim.begin() + dim);
+    std::vector<int64> elems_seen_until_dim(elems_seen_per_dim.begin(),
+                                            elems_seen_per_dim.begin() + dim);
     return StrCat("[",
                   StrJoin(elems_seen_until_dim, ",",
-                          [](string* out, const tensorflow::int64& num_elems) {
+                          [](string* out, const int64& num_elems) {
                             StrAppend(out, num_elems - 1);
                           }),
                   "]");
@@ -2184,7 +2259,7 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
         } else if (primitive_util::IsIntegralType(shape.element_type()) ||
                    shape.element_type() == PRED) {
           LocTy loc = lexer_.GetLoc();
-          tensorflow::int64 value;
+          int64 value;
           if (!ParseInt64(&value)) {
             return Error(loc, StrCat("expects integer for primitive type: ",
                                      PrimitiveType_Name(shape.element_type())));
@@ -2229,9 +2304,9 @@ bool HloParser::ParseSparseLiteral(Literal* literal, const Shape& shape) {
       break;
     }
 
-    std::vector<tensorflow::int64> index;
+    std::vector<int64> index;
     if (lexer_.GetKind() == TokKind::kInt) {
-      tensorflow::int64 single_index = lexer_.GetInt64Val();
+      int64 single_index = lexer_.GetInt64Val();
       lexer_.Lex();
       index.push_back(single_index);
     } else {
@@ -2255,7 +2330,7 @@ bool HloParser::ParseSparseLiteral(Literal* literal, const Shape& shape) {
       }
       lexer_.Lex();
     } else if (primitive_util::IsIntegralType(shape.element_type())) {
-      tensorflow::int64 value;
+      int64 value;
       if (!ParseInt64(&value)) {
         return Error(value_loc,
                      StrCat("expects integer for primitive type: ",
@@ -2341,7 +2416,7 @@ bool HloParser::CheckParsedValueIsInRange(LocTy loc, ParsedElemT value) {
         -std::numeric_limits<ParsedElemT>::infinity() == value))) {
     // Skip range checking for non-finite value.
   } else if (std::is_unsigned<LiteralNativeT>::value) {
-    CHECK((std::is_same<ParsedElemT, tensorflow::int64>::value ||
+    CHECK((std::is_same<ParsedElemT, int64>::value ||
            std::is_same<ParsedElemT, bool>::value))
         << "Unimplemented checking for ParsedElemT";
 
@@ -2567,24 +2642,23 @@ bool HloParser::ParseAttributeHelper(
         return true;
       }
       case AttrTy::kInt64: {
-        tensorflow::int64 result;
+        int64 result;
         if (!ParseInt64(&result)) {
           return false;
         }
-        static_cast<optional<tensorflow::int64>*>(attr_out_ptr)
-            ->emplace(result);
+        static_cast<optional<int64>*>(attr_out_ptr)->emplace(result);
         return true;
       }
       case AttrTy::kInt32: {
-        tensorflow::int64 result;
+        int64 result;
         if (!ParseInt64(&result)) {
           return false;
         }
-        if (result != static_cast<tensorflow::int32>(result)) {
+        if (result != static_cast<int32>(result)) {
           return Error(attr_loc, "value out of range for int32");
         }
-        static_cast<optional<tensorflow::int32>*>(attr_out_ptr)
-            ->emplace(static_cast<tensorflow::int32>(result));
+        static_cast<optional<int32>*>(attr_out_ptr)
+            ->emplace(static_cast<int32>(result));
         return true;
       }
       case AttrTy::kFloat: {
@@ -2624,6 +2698,15 @@ bool HloParser::ParseAttributeHelper(
         static_cast<optional<FftType>*>(attr_out_ptr)->emplace(result);
         return true;
       }
+      case AttrTy::kTriangularSolveTranspose: {
+        TriangularSolveOptions::Transpose result;
+        if (!ParseTriangularSolveTranspose(&result)) {
+          return false;
+        }
+        static_cast<optional<TriangularSolveOptions::Transpose>*>(attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
       case AttrTy::kWindow: {
         Window result;
         if (!ParseWindow(&result, /*expect_outer_curlies=*/true)) {
@@ -2649,6 +2732,15 @@ bool HloParser::ParseAttributeHelper(
         static_cast<optional<OpSharding>*>(attr_out_ptr)->emplace(sharding);
         return true;
       }
+      case AttrTy::kParameterReplication: {
+        ParameterReplication parameter_replication;
+        if (!ParseParameterReplication(&parameter_replication)) {
+          return false;
+        }
+        static_cast<optional<ParameterReplication>*>(attr_out_ptr)
+            ->emplace(parameter_replication);
+        return true;
+      }
       case AttrTy::kInstructionList: {
         std::vector<HloInstruction*> result;
         if (!ParseInstructionNames(&result)) {
@@ -2668,19 +2760,19 @@ bool HloParser::ParseAttributeHelper(
         return true;
       }
       case AttrTy::kBracedInt64List: {
-        std::vector<tensorflow::int64> result;
+        std::vector<int64> result;
         if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma,
                             &result)) {
           return false;
         }
-        static_cast<optional<std::vector<tensorflow::int64>>*>(attr_out_ptr)
+        static_cast<optional<std::vector<int64>>*>(attr_out_ptr)
             ->emplace(result);
         return true;
       }
       case AttrTy::kBracedInt64ListList: {
-        std::vector<std::vector<tensorflow::int64>> result;
+        std::vector<std::vector<int64>> result;
         auto parse_and_add_item = [&]() {
-          std::vector<tensorflow::int64> item;
+          std::vector<int64> item;
           if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace,
                               TokKind::kComma, &item)) {
             return false;
@@ -2692,8 +2784,7 @@ bool HloParser::ParseAttributeHelper(
                        parse_and_add_item)) {
           return false;
         }
-        static_cast<optional<std::vector<std::vector<tensorflow::int64>>>*>(
-            attr_out_ptr)
+        static_cast<optional<std::vector<std::vector<int64>>>*>(attr_out_ptr)
             ->emplace(result);
         return true;
       }
@@ -2894,7 +2985,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
   absl::string_view rhs = split2[0];
   absl::string_view out = split2[1];
 
-  const tensorflow::int64 rank = lhs.length();
+  const int64 rank = lhs.length();
   if (rank != rhs.length() || rank != out.length()) {
     return TokenError(
         "convolution lhs, rhs, and output must have the same rank");
@@ -3005,7 +3096,7 @@ bool HloParser::ParseSliceRanges(SliceRanges* result) {
   if (!ParseToken(TokKind::kLbrace, "expects '{' to start ranges")) {
     return false;
   }
-  std::vector<std::vector<tensorflow::int64>> ranges;
+  std::vector<std::vector<int64>> ranges;
   if (lexer_.GetKind() == TokKind::kRbrace) {
     // empty
     return ParseToken(TokKind::kRbrace, "expects '}' to end ranges");
@@ -3075,9 +3166,9 @@ bool HloParser::ParseShapeList(std::vector<Shape>* result) {
 //   ::= int64_val (delim int64_val)*
 bool HloParser::ParseInt64List(const TokKind start, const TokKind end,
                                const TokKind delim,
-                               std::vector<tensorflow::int64>* result) {
+                               std::vector<int64>* result) {
   auto parse_and_add_item = [&]() {
-    tensorflow::int64 i;
+    int64 i;
     if (!ParseInt64(&i)) {
       return false;
     }
@@ -3153,7 +3244,7 @@ bool HloParser::ParseParamList() {
 bool HloParser::ParseDimensionSizes(std::vector<int64>* dimension_sizes,
                                     std::vector<bool>* dynamic_dimensions) {
   auto parse_and_add_item = [&]() {
-    tensorflow::int64 i;
+    int64 i;
     bool is_dynamic = false;
     if (lexer_.GetKind() == TokKind::kLeq) {
       is_dynamic = true;
@@ -3170,22 +3261,108 @@ bool HloParser::ParseDimensionSizes(std::vector<int64>* dimension_sizes,
                    parse_and_add_item);
 }
 
-// layout ::= '{' int64_list '}'
+// tiles
+//   ::= /*empty*/
+//   ::= 'T' '(' dim_list ')'
+// dim_list
+//   ::= /*empty*/
+//   ::= (int64 | '*') (',' (int64 | '*'))*
+bool HloParser::ParseTiles(std::vector<Tile>* tiles) {
+  auto parse_and_add_tile_dimension = [&]() {
+    tensorflow::int64 i;
+    if (ParseInt64(&i)) {
+      tiles->back().add_dimensions(i);
+      return true;
+    }
+    if (lexer_.GetKind() == TokKind::kAsterisk) {
+      tiles->back().add_dimensions(Tile::kCombineDimension);
+      lexer_.Lex();
+      return true;
+    }
+    return false;
+  };
+
+  do {
+    tiles->push_back(Tile());
+    if (!ParseList(TokKind::kLparen, TokKind::kRparen, TokKind::kComma,
+                   parse_and_add_tile_dimension)) {
+      return false;
+    }
+  } while (lexer_.GetKind() == TokKind::kLparen);
+  return true;
+}
+
+// layout ::= '{' int64_list (':' tiles element_size_in_bits)? '}'
+// element_size_in_bits
+//   ::= /*empty*/
+//   ::= 'E' '(' int64 ')'
 bool HloParser::ParseLayout(Layout* layout) {
   std::vector<int64> minor_to_major;
+  std::vector<Tile> tiles;
+  tensorflow::int64 element_size_in_bits = 0;
+
   auto parse_and_add_item = [&]() {
-    tensorflow::int64 i;
+    int64 i;
     if (!ParseInt64(&i)) {
       return false;
     }
     minor_to_major.push_back(i);
     return true;
   };
-  if (!ParseList(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma,
-                 parse_and_add_item)) {
+
+  if (!ParseToken(TokKind::kLbrace,
+                  StrCat("expects layout to start with ",
+                         TokKindToString(TokKind::kLbrace)))) {
     return false;
   }
-  *layout = LayoutUtil::MakeLayout(minor_to_major);
+  if (lexer_.GetKind() != TokKind::kRbrace) {
+    if (lexer_.GetKind() == TokKind::kInt) {
+      // Parse minor to major.
+      do {
+        if (!parse_and_add_item()) {
+          return false;
+        }
+      } while (EatIfPresent(TokKind::kComma));
+    }
+
+    if (lexer_.GetKind() == TokKind::kColon) {
+      lexer_.Lex();
+      if (lexer_.GetKind() == TokKind::kIdent && lexer_.GetStrVal() == "T") {
+        lexer_.Lex();
+        ParseTiles(&tiles);
+      }
+
+      if (lexer_.GetKind() == TokKind::kIdent && lexer_.GetStrVal() == "E") {
+        // Parse element size in bits.
+        lexer_.Lex();
+        if (!ParseToken(TokKind::kLparen,
+                        StrCat("expects element size in bits to start with ",
+                               TokKindToString(TokKind::kLparen)))) {
+          return false;
+        }
+        if (!ParseInt64(&element_size_in_bits)) {
+          return false;
+        }
+        if (!ParseToken(TokKind::kRparen,
+                        StrCat("expects element size in bits to end with ",
+                               TokKindToString(TokKind::kRparen)))) {
+          return false;
+        }
+      }
+    }
+  }
+  if (!ParseToken(TokKind::kRbrace,
+                  StrCat("expects layout to end with ",
+                         TokKindToString(TokKind::kRbrace)))) {
+    return false;
+  }
+
+  std::vector<Tile> vec_tiles(tiles.size());
+  for (int i = 0; i < tiles.size(); i++) {
+    vec_tiles[i] = Tile(tiles[i]);
+  }
+  *layout =
+      LayoutUtil::MakeLayout(minor_to_major, vec_tiles, element_size_in_bits);
   return true;
 }
 
@@ -3237,7 +3414,7 @@ bool HloParser::ParseShape(Shape* result) {
     lexer_.Lex();
     const string message =
         "expects a brace-bracketed integer for sparse layout";
-    tensorflow::int64 max_sparse_elements;
+    int64 max_sparse_elements;
     if (!ParseToken(TokKind::kLbrace, message) ||
         !ParseInt64(&max_sparse_elements) ||
         !ParseToken(TokKind::kRbrace, message)) {
@@ -3257,13 +3434,20 @@ bool HloParser::ParseShape(Shape* result) {
   //
   // The open brace could either be the start of a computation or the start of a
   // layout for the f32[123] shape. We consider it the start of a layout if the
-  // next token after the open brace is a integer
+  // next token after the open brace is an integer or a colon.
   if (lexer_.GetKind() == TokKind::kLbrace &&
-      lexer_.LookAhead() == TokKind::kInt) {
+      (lexer_.LookAhead() == TokKind::kInt ||
+       lexer_.LookAhead() == TokKind::kColon)) {
     Layout layout;
     if (!ParseLayout(&layout)) {
       return false;
     }
+    if (layout.minor_to_major_size() != result->rank()) {
+      return Error(
+          lexer_.GetLoc(),
+          StrFormat("Dimensions size is %ld, but minor to major size is %ld.",
+                    result->rank(), layout.minor_to_major_size()));
+    }
     *result->mutable_layout() = layout;
   }
   return true;
@@ -3306,15 +3490,14 @@ bool HloParser::ParseString(string* result) {
   return true;
 }
 
-bool HloParser::ParseDxD(const string& name,
-                         std::vector<tensorflow::int64>* result) {
+bool HloParser::ParseDxD(const string& name, std::vector<int64>* result) {
   LocTy loc = lexer_.GetLoc();
   if (!result->empty()) {
     return Error(loc, StrFormat("sub-attribute '%s=' already exists", name));
   }
   // 1D
   if (lexer_.GetKind() == TokKind::kInt) {
-    tensorflow::int64 number;
+    int64 number;
     if (!ParseInt64(&number)) {
       return Error(loc, StrFormat("expects sub-attribute '%s=i'", name));
     }
@@ -3333,8 +3516,7 @@ bool HloParser::ParseDxD(const string& name,
   return TokenError("expects token type kInt or kDxD");
 }
 
-bool HloParser::ParseWindowPad(
-    std::vector<std::vector<tensorflow::int64>>* pad) {
+bool HloParser::ParseWindowPad(std::vector<std::vector<int64>>* pad) {
   LocTy loc = lexer_.GetLoc();
   if (!pad->empty()) {
     return Error(loc, "sub-attribute 'pad=' already exists");
@@ -3344,7 +3526,7 @@ bool HloParser::ParseWindowPad(
   }
   string str = lexer_.GetStrVal();
   for (const auto& padding_dim_str : absl::StrSplit(str, 'x')) {
-    std::vector<tensorflow::int64> low_high;
+    std::vector<int64> low_high;
     if (!SplitToInt64s(padding_dim_str, '_', &low_high) ||
         low_high.size() != 2) {
       return Error(loc,
@@ -3367,7 +3549,7 @@ bool HloParser::ParsePaddingConfig(PaddingConfig* padding) {
   LocTy loc = lexer_.GetLoc();
   string str = lexer_.GetStrVal();
   for (const auto& padding_dim_str : absl::StrSplit(str, 'x')) {
-    std::vector<tensorflow::int64> padding_dim;
+    std::vector<int64> padding_dim;
     if (!SplitToInt64s(padding_dim_str, '_', &padding_dim) ||
         (padding_dim.size() != 2 && padding_dim.size() != 3)) {
       return Error(loc,
@@ -3389,7 +3571,7 @@ bool HloParser::ParseMetadata(OpMetadata* metadata) {
   optional<string> op_type;
   optional<string> op_name;
   optional<string> source_file;
-  optional<tensorflow::int32> source_line;
+  optional<int32> source_line;
   attrs["op_type"] = {/*required=*/false, AttrTy::kString, &op_type};
   attrs["op_name"] = {/*required=*/false, AttrTy::kString, &op_name};
   attrs["source_file"] = {/*required=*/false, AttrTy::kString, &source_file};
@@ -3441,6 +3623,22 @@ bool HloParser::ParseFftType(FftType* result) {
   return true;
 }
 
+bool HloParser::ParseTriangularSolveTranspose(
+    TriangularSolveOptions::Transpose* result) {
+  VLOG(1) << "ParseTriangularSolveTranspose";
+  if (lexer_.GetKind() != TokKind::kIdent) {
+    return TokenError("expects triangular solve transpose type");
+  }
+  string val = lexer_.GetStrVal();
+  if (!TriangularSolveOptions_Transpose_Parse(val, result) ||
+      !TriangularSolveOptions_Transpose_IsValid(*result)) {
+    return TokenError(
+        StrFormat("expects triangular solve transpose type but sees: %s", val));
+  }
+  lexer_.Lex();
+  return true;
+}
+
 bool HloParser::ParseFusionKind(HloInstruction::FusionKind* result) {
   VLOG(1) << "ParseFusionKind";
   if (lexer_.GetKind() != TokKind::kIdent) {
@@ -3492,7 +3690,7 @@ bool HloParser::ParsePrecision(PrecisionConfig::Precision* result) {
   return true;
 }
 
-bool HloParser::ParseInt64(tensorflow::int64* result) {
+bool HloParser::ParseInt64(int64* result) {
   VLOG(1) << "ParseInt64";
   if (lexer_.GetKind() != TokKind::kInt) {
     return TokenError("expects integer");
@@ -3644,6 +3842,21 @@ StatusOr<HloSharding> HloParser::ParseShardingOnly() {
   return HloSharding::FromProto(op_sharding);
 }
 
+StatusOr<std::vector<bool>> HloParser::ParseParameterReplicationOnly() {
+  lexer_.Lex();
+  ParameterReplication parameter_replication;
+  if (!ParseParameterReplication(&parameter_replication)) {
+    return InvalidArgument("Syntax error:\n%s", GetError());
+  }
+  if (lexer_.GetKind() != TokKind::kEof) {
+    return InvalidArgument(
+        "Syntax error:\nExtra content after parameter replication");
+  }
+  return std::vector<bool>(
+      parameter_replication.replicated_at_leaf_buffers().begin(),
+      parameter_replication.replicated_at_leaf_buffers().end());
+}
+
 StatusOr<Window> HloParser::ParseWindowOnly() {
   lexer_.Lex();
   Window window;
@@ -3759,6 +3972,11 @@ StatusOr<HloSharding> ParseSharding(absl::string_view str) {
   return parser.ParseShardingOnly();
 }
 
+StatusOr<std::vector<bool>> ParseParameterReplication(absl::string_view str) {
+  HloParser parser(str);
+  return parser.ParseParameterReplicationOnly();
+}
+
 StatusOr<Window> ParseWindow(absl::string_view str) {
   HloParser parser(str);
   return parser.ParseWindowOnly();
diff --git a/tensorflow/compiler/xla/service/hlo_parser.h b/tensorflow/compiler/xla/service/hlo_parser.h
index 450a54c54c156c2ae27475d145a8e83dc841b431..a96260b4d75e515a4cb23d315444142cae1b9587 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.h
+++ b/tensorflow/compiler/xla/service/hlo_parser.h
@@ -44,11 +44,16 @@ Status ParseHloString(absl::string_view str, HloModule* module);
 // creates a HloModule with default config.
 StatusOr<std::unique_ptr<HloModule>> ParseHloString(absl::string_view str);
 
-// ParseHloString sharding from str. str is supposed to contain the body of the
-// sharding, i.e. just the rhs of the "sharding={...}" attribute string,
-// e.g., "{replicated}".
+// Parses sharding from str. str is supposed to contain the body of the
+// sharding, i.e. just the rhs of the "sharding={...}" attribute string, e.g.,
+// "{replicated}".
 StatusOr<HloSharding> ParseSharding(absl::string_view str);
 
+// Parses parameter replication from str. str is supposed to contain the body of
+// the parameter replication, i.e. just the rhs of the
+// "parameter_replication={...}" attribute string, e.g., "{true, false}".
+StatusOr<std::vector<bool>> ParseParameterReplication(absl::string_view str);
+
 // Parses the result of window_util::ToString(const Window&).
 StatusOr<Window> ParseWindow(absl::string_view str);
 
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index b525f66f9bc837f720531b8828436ebb9c1f6b31..8e3f1e44b9562334130aa565ed447a78899fad53 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -63,6 +63,19 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
   ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
 }
 
+)"
+},
+// parameter replication
+{
+"ParamReplication",
+R"(HloModule param_replication_module
+
+ENTRY %param_replication (a: f32[], b: (f32[2,4], (f32[2,4]))) -> (f32[], (f32[2,4], (f32[2,4]))) {
+  %a = f32[] parameter(0), parameter_replication={true}
+  %b = (f32[2,4]{1,0}, (f32[2,4]{1,0})) parameter(1), parameter_replication={false,true}
+  ROOT %tuple = (f32[], (f32[2,4]{1,0}, (f32[2,4]{1,0}))) tuple(f32[] %a, (f32[2,4]{1,0}, (f32[2,4]{1,0})) %b)
+}
+
 )"
 },
 // pred constant
@@ -1047,9 +1060,15 @@ ENTRY ReducePrecision {
 "SortKey",
 R"(HloModule sort
 
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY Sort {
   x = f32[1024]{0} parameter(0)
-  ROOT sorted = f32[1024]{0} sort(x), dimensions={0}
+  ROOT sorted = f32[1024]{0} sort(x), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -1059,10 +1078,18 @@ ENTRY Sort {
 "SortKeyValue",
 R"(HloModule sort
 
+compare {
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY Sort {
   keys = f32[1024]{0} parameter(0)
   values = s32[1024]{0} parameter(1)
-  ROOT sorted = (f32[1024]{0}, s32[1024]{0}) sort(keys, values), dimensions={0}
+  ROOT sorted = (f32[1024]{0}, s32[1024]{0}) sort(keys, values), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -1072,9 +1099,15 @@ ENTRY Sort {
 "SortKeyR2",
 R"(HloModule sort
 
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY Sort {
   x = f32[1024,16]{0,1} parameter(0)
-  ROOT sorted = f32[1024,16]{0,1} sort(x), dimensions={0}
+  ROOT sorted = f32[1024,16]{0,1} sort(x), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -1084,10 +1117,18 @@ ENTRY Sort {
 "SortKeyValueR2",
 R"(HloModule sort
 
+compare {
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY Sort {
   keys = f32[1024,16]{0,1} parameter(0)
   values = s32[1024,16]{0,1} parameter(1)
-  ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}) sort(keys, values), dimensions={0}
+  ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}) sort(keys, values), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -1097,12 +1138,42 @@ ENTRY Sort {
 "SortManyValues",
 R"(HloModule sort
 
+compare {
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  p.2.lhs = u32[] parameter(4)
+  p.2.rhs = u32[] parameter(5)
+  p.3.lhs = f32[] parameter(6)
+  p.3.rhs = f32[] parameter(7)
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY Sort {
   keys = f32[1024,16]{0,1} parameter(0)
   values.0 = s32[1024,16]{0,1} parameter(1)
   values.1 = u32[1024,16]{0,1} parameter(2)
   values.2 = f32[1024,16]{0,1} parameter(3)
-  ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}, u32[1024,16]{0,1}, f32[1024,16]{0,1}) sort(keys, values.0, values.1, values.2), dimensions={0}
+  ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}, u32[1024,16]{0,1}, f32[1024,16]{0,1}) sort(keys, values.0, values.1, values.2), dimensions={0}, to_apply=compare
+}
+
+)"
+},
+// Sort (Key) is_stable=true
+{
+"SortKeyStable",
+R"(HloModule sort
+
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
+ENTRY Sort {
+  x = f32[1024]{0} parameter(0)
+  ROOT sorted = f32[1024]{0} sort(x), dimensions={0}, is_stable=true, to_apply=compare
 }
 
 )"
@@ -1280,6 +1351,17 @@ ENTRY CollectivePermute {
   ROOT root = f32[128,32]{0,1} collective-permute(input), source_target_pairs={{0,1},{1,2},{2,3}}
 }
 
+)"
+},
+// replica-id
+{
+"ReplicaId",
+R"(HloModule replica-id
+
+ENTRY Replica-id {
+  ROOT replica-id = u32[] replica-id()
+}
+
 )"
 },
 // Iota
@@ -1309,10 +1391,18 @@ ENTRY Computation {
 "ScheduledModule",
 R"(HloModule scheduled_module, is_scheduled=true
 
+compare {
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lhs = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY Sort {
   keys = f32[1024]{0} parameter(0)
   values = s32[1024]{0} parameter(1)
-  ROOT sorted = (f32[1024]{0}, s32[1024]{0}) sort(keys, values), dimensions={0}
+  ROOT sorted = (f32[1024]{0}, s32[1024]{0}) sort(keys, values), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -1396,7 +1486,7 @@ class HloParameterizedParserTest
  protected:
   // Expects "ToString(ParseHloString(string)) == string", that is, parses the
   // string, asserts that it succeeded, stringifies the parsed module, and
-  // checks that the it equals the original string.
+  // checks that it equals the original string.
   void ExpectEqual() {
     const string& original = GetParam().module_string;
     TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
@@ -2515,6 +2605,60 @@ TEST_F(HloParserTest, ParseShapeStringWithLayout) {
       << "actual:   " << ShapeUtil::HumanString(actual);
 }
 
+TEST_F(HloParserTest, ParseShapeStringWithTilingLayout) {
+  // One tile.
+  string shape_string = "f32[123,456]{0,1:T(2,128)}";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected =
+      ShapeUtil::MakeShapeWithLayout(F32, {123, 456}, {0, 1}, {Tile({2, 128})});
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Tile with negative dimension size for combining dimensions.
+  shape_string = "f32[123,456,789]{0,1,2:T(2, * , 128)}";
+  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  expected =
+      ShapeUtil::MakeShapeWithLayout(F32, {123, 456, 789}, {0, 1, 2},
+                                     {Tile({2, Tile::kCombineDimension, 128})});
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Two tiles.
+  shape_string = "bf16[123,456,789]{2,1,0:T(2,*,128)(2,1)}";
+  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  expected = ShapeUtil::MakeShapeWithLayout(
+      BF16, {123, 456, 789}, {2, 1, 0},
+      {Tile({2, Tile::kCombineDimension, 128}), Tile({2, 1})});
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Tile with element size in bits.
+  shape_string = "pred[123,456]{1,0:T(2,128)E(1)}";
+  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  expected = ShapeUtil::MakeShapeWithLayout(PRED, {123, 456}, {1, 0},
+                                            {Tile({2, 128})}, 1);
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Element size in bits without tile.
+  shape_string = "pred[123,456]{1,0:E(1)}";
+  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  expected = ShapeUtil::MakeShapeWithLayout(PRED, {123, 456}, {1, 0}, {}, 1);
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Wrong minor_to_major.
+  shape_string = "f32[123,456,789]{1:T(2, * , 128)}";
+  auto result = ParseShape(shape_string);
+  ExpectHasSubstr(result.status().error_message(),
+                  "Dimensions size is 3, but minor to major size is 1.");
+}
+
 TEST_F(HloParserTest, ParseShapeStringWithSparseLayout) {
   string shape_string = "f32[123,456]sparse{10}";
   TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
@@ -2579,5 +2723,16 @@ TEST_F(HloParserTest, NegativeParameterNumber) {
               ::testing::HasSubstr("parameter number must be >= 0"));
 }
 
+TEST_F(HloParserTest, WrongNumberOfParameterLeafBuffersInReplication) {
+  const string hlo_string =
+      "par0 = (f32[3,5], f32[]) parameter(0), "
+      "parameter_replication={true,false,true}";
+  auto result = ParseHloString(hlo_string);
+  ASSERT_FALSE(result.status().ok());
+  EXPECT_THAT(result.status().error_message(),
+              ::testing::HasSubstr("parameter has 2 leaf buffers, but "
+                                   "parameter_replication has 3 elements"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.cc b/tensorflow/compiler/xla/service/hlo_reachability.cc
index 0fced7f15bdaf1dbe349e3b0fc6ada68393c6512..b7f507b1184dbe021effc1102a68040286480ed2 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability.cc
+++ b/tensorflow/compiler/xla/service/hlo_reachability.cc
@@ -77,28 +77,51 @@ std::unique_ptr<HloReachabilityMap> HloReachabilityMap::Build(
     const HloComputation* computation) {
   const auto& all = computation->MakeInstructionPostOrder();
   auto result = absl::make_unique<HloReachabilityMap>(all);
-  auto channel_dependency_map = computation->ComputeChannelDependencies();
+  auto channel_group = computation->ComputeChannelDependencies();
 
-  std::vector<HloInstruction*> inputs;
   for (const HloInstruction* hlo : all) {
-    inputs.assign(hlo->operands().begin(), hlo->operands().end());
-    inputs.insert(inputs.end(), hlo->control_predecessors().begin(),
-                  hlo->control_predecessors().end());
+    std::vector<HloInstruction*> inputs;
+    const auto add_input = [&channel_group, &inputs](HloInstruction* input) {
+      inputs.push_back(input);
+      if (input->opcode() == HloOpcode::kAllReduce && input->all_reduce_id()) {
+        auto it = channel_group.find(*input->all_reduce_id());
+        if (it != channel_group.end()) {
+          inputs.insert(inputs.end(), it->second.begin(), it->second.end());
+        }
+      }
+    };
+
+    const auto add_dependencies = [&add_input](const HloInstruction* hlo) {
+      for (HloInstruction* operand : hlo->operands()) {
+        add_input(operand);
+      }
+      for (HloInstruction* predecessor : hlo->control_predecessors()) {
+        add_input(predecessor);
+      }
+    };
+
+    add_dependencies(hlo);
 
     switch (hlo->opcode()) {
       case HloOpcode::kRecvDone: {
-        auto it = channel_dependency_map.find(hlo->channel_id());
-        if (it != channel_dependency_map.end()) {
-          absl::c_copy(it->second, std::back_inserter(inputs));
+        auto it = channel_group.find(hlo->channel_id());
+        if (it != channel_group.end()) {
+          for (HloInstruction* channel : it->second) {
+            if (channel->opcode() == HloOpcode::kSend) {
+              add_input(channel);
+            }
+          }
         }
         break;
       }
       case HloOpcode::kAllReduce: {
         auto all_reduce_id = hlo->all_reduce_id();
         if (all_reduce_id) {
-          auto it = channel_dependency_map.find(all_reduce_id.value());
-          if (it != channel_dependency_map.end()) {
-            absl::c_copy(it->second, std::back_inserter(inputs));
+          auto it = channel_group.find(all_reduce_id.value());
+          if (it != channel_group.end()) {
+            for (HloInstruction* all_reduce : it->second) {
+              add_dependencies(all_reduce);
+            }
           }
         }
         break;
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index d7d66ae1c4592723ca991d5ee971fa72cc1af90a..5a5401e351384867016a3a9addfd43d57091848c 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -168,6 +168,35 @@ StatusOr<Literal> HloRunner::Execute(std::unique_ptr<HloModule> module,
       /*profile=*/profile);
 }
 
+StatusOr<Literal> HloRunner::Execute(
+    std::unique_ptr<Executable> executable,
+    const absl::Span<const Literal* const> arguments,
+    ExecutionProfile* profile) {
+  TF_ASSIGN_OR_RETURN(std::vector<ScopedShapedBuffer> argument_buffers,
+                      TransferLiteralsToDevice(arguments));
+  TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
+                      ExecuteWithDeviceBuffers(
+                          /*executable=*/executable.get(),
+                          /*arguments=*/argument_buffers,
+                          /*profile=*/profile));
+  return TransferLiteralFromDevice(result);
+}
+
+StatusOr<Literal> HloRunner::Execute(std::unique_ptr<Executable> executable,
+                                     const absl::Span<const Literal> arguments,
+                                     ExecutionProfile* profile) {
+  // Construct a vector of plain pointers for the arguments.
+  std::vector<const Literal*> argument_pointers;
+  argument_pointers.reserve(arguments.size());
+  for (const auto& argument : arguments) {
+    argument_pointers.push_back(&argument);
+  }
+  return Execute(
+      /*module=*/std::move(executable),
+      /*arguments=*/argument_pointers,
+      /*profile=*/profile);
+}
+
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
     std::unique_ptr<HloModule> module,
     const absl::Span<const ShapedBuffer* const> arguments, bool run_hlo_passes,
@@ -206,7 +235,7 @@ StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
 }
 
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
-    std::unique_ptr<Executable> executable,
+    Executable* executable,
     const absl::Span<const ShapedBuffer* const> arguments,
     ExecutionProfile* profile) {
   // Get service run options.
@@ -225,7 +254,7 @@ StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
 }
 
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
-    std::unique_ptr<Executable> executable,
+    Executable* executable,
     const absl::Span<const ScopedShapedBuffer> arguments,
     ExecutionProfile* profile) {
   std::vector<const ShapedBuffer*> argument_pointers;
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index bb792cf8c9825ff67ca33bbcf2c3c32b1a0ecb85..098989cd4c78fb5ad57cd6700fbf99c50064f225 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -60,7 +60,7 @@ class HloRunner {
     // The number of times the infeed literal should be fed to the HLO module.
     // For a clean exit, this should match the iterations-per-loop parameter
     // used when generating the HLO module proto (that is usually the main
-    // while bounary counter). A value higher then iterations-per-loop would
+    // while boundary counter). A value higher then iterations-per-loop would
     // lead to infeed threads feeding to a gone computation, while a lower
     // value would trigger a stuck ExecuteReplicated() call (the computation
     // will be trying to infeed data which will never come).
@@ -124,6 +124,14 @@ class HloRunner {
                             bool run_hlo_passes = true,
                             ExecutionProfile* profile = nullptr);
 
+  StatusOr<Literal> Execute(std::unique_ptr<Executable> executable,
+                            const absl::Span<const Literal* const> arguments,
+                            ExecutionProfile* profile = nullptr);
+
+  StatusOr<Literal> Execute(std::unique_ptr<Executable> executable,
+                            const absl::Span<const Literal> arguments,
+                            ExecutionProfile* profile = nullptr);
+
   // As Execute(), but accepts and returns device buffers instead of host
   // buffers.
   StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
@@ -136,13 +144,16 @@ class HloRunner {
       const absl::Span<const ScopedShapedBuffer> arguments,
       bool run_hlo_passes = true, ExecutionProfile* profile = nullptr);
 
+  // In the following two calls, "executable" is not a unique_ptr to allow
+  // reuse of the Executable.  This call may update the profile information in
+  // *executable.
   StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
-      std::unique_ptr<Executable> executable,
+      Executable* executable,
       const absl::Span<const ShapedBuffer* const> arguments,
       ExecutionProfile* profile = nullptr);
 
   StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
-      std::unique_ptr<Executable> executable,
+      Executable* executable,
       const absl::Span<const ScopedShapedBuffer> arguments,
       ExecutionProfile* profile = nullptr);
 
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 37cc146bd7a6f2aef9373bd4afd8572ffac6473c..f1d7e60f2b5a68408f6d428a0ec47fba3c9c4f12 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -96,13 +96,13 @@ string HloSharding::ToString() const {
 
   if (replicated_) {
     return "{replicated}";
-  } else if (maximal_) {
+  }
+  if (maximal_) {
     return StrCat(
         "{maximal device=", static_cast<int64>(*tile_assignment_.begin()), "}");
-  } else {
-    return StrCat("{devices=[", StrJoin(tile_assignment_.dimensions(), ","),
-                  "]", StrJoin(tile_assignment_, ","), "}");
   }
+  return StrCat("{devices=[", StrJoin(tile_assignment_.dimensions(), ","), "]",
+                StrJoin(tile_assignment_, ","), "}");
 }
 
 bool HloSharding::UsesDevice(int64 device) const {
@@ -328,8 +328,8 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
             status = tensorflow::errors::InvalidArgument(
                 StrCat("core ", core, " is not unique in tile assignment"));
           }
+          seen_cores.insert(core);
         }
-        seen_cores.insert(core);
       });
   if (!status.ok()) {
     return status;
@@ -347,7 +347,7 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
         ToString(), ", input_shape=", ShapeUtil::HumanString(shape));
   }
 
-  // The correct constructor have to be used to create tile maximal shardings.
+  // The correct constructor has to be used to create tile maximal shardings.
   if (tile_assignment_.num_elements() == 1) {
     return tensorflow::errors::InvalidArgument(
         "Tile assignment only contains a single device. If a replicated "
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index 5789ae09988d2a85247c5b8c037a172b3699f3b7..dd57ea83f1cb33aa052facb607bc040d2e708633 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -118,7 +118,7 @@ class HloSharding {
   // Returns true if the sharding defines an operation on the given device.
   bool UsesDevice(int64 device) const;
 
-  // Retrieves an histogram of the devices used by the sharding. The returned
+  // Retrieves a histogram of the devices used by the sharding. The returned
   // map has the device number as key, and the occurrence count as value.
   // If a sharding does not have a device, it will not be incuded in the
   // histogram. The count argument, if not nullptr, will receive the total
@@ -260,6 +260,19 @@ class HloSharding {
   bool replicated_;
   bool maximal_;
   bool tuple_;
+  // This field is only used if replicated_ is false. If maximal_ is true, then
+  // the field contains a rank 1 array with a single element, which is the
+  // device the HLO is assigned to. If maximal_ is false, the field contains an
+  // array with the same rank as the corresponding HLO. The dimension sizes of
+  // the array describe the number of ways the HLO is partitioned along each
+  // dimension. The values of the array specify which device each tile of
+  // the HLO is assigned to. The index of each value determines which tile it
+  // takes.
+  // For example, {{{2, 3}}, {{5, 7}}} (whose ToString representation is
+  // "{devices=[2,1,2]2,3,5,7}"), means that dimension 1 is split two way and
+  // dimension 3 is split 2 way. Core 5, whose index is [2,1,1] will take the
+  // tile that contains the 2nd half of dimension 1 and the 1st half of
+  // dimension 3.
   Array<int64> tile_assignment_;
   // Only non-empty when tuple_ is true. If a tuple is empty then one entry is
   // present for the root. This is a flattened list of all the leaf shardings in
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
index 80634677e78e4a35dcb9bf7de018a88122c3c030..9e234e025586ff14f99da73afc5610c627303a36 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
@@ -84,7 +84,7 @@ TEST_F(HloShardingTest, Tile) {
   }
 
   {
-    // Test should fail because of more devices used then `num_device`.
+    // Test should fail because of more devices used than `num_device`.
     HloSharding sharding = HloSharding::Tile(MakeArray({2, 2}, {0, 1, 2, 3}));
     EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(U32, {4, 6}),
                                        /*num_devices=*/2));
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
deleted file mode 100644
index c1f69db74eafb7743e85f499f2f4828ed0375501..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-LIcensed under the Apache License, Version 2.0 (the "License");
-You may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_join.h"
-#include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/tensor_shape.pb.h"
-
-namespace xla {
-namespace hlo_graph_dumper {
-namespace {
-
-using absl::StrAppend;
-using absl::StrCat;
-using tensorflow::GraphDef;
-using tensorflow::NodeDef;
-using tensorflow::TensorShapeProto;
-
-string GetOpDefName(const HloInstruction* instruction) {
-  string name = StrCat("hlo-", HloOpcodeString(instruction->opcode()));
-  tensorflow::str_util::TitlecaseString(&name, "-");  // non-absl ok
-  name.erase(std::remove(name.begin(), name.end(), '-'), name.end());
-
-  if (instruction->opcode() == HloOpcode::kFusion) {
-    string fusion_name = ToString(instruction->fusion_kind());
-    StrAppend(&name, absl::string_view(fusion_name).substr(1));
-  }
-  return name;
-}
-
-TensorShapeProto GetTensorShape(const HloInstruction* instruction) {
-  TensorShapeProto tensor_shape;
-  const Shape& shape = instruction->shape();
-  for (auto dim : shape.dimensions()) {
-    tensor_shape.add_dim()->set_size(dim);
-  }
-  return tensor_shape;
-}
-
-string GetDeviceName(int device) { return StrCat("/device/XLA:", device); }
-
-void CleanNodeName(string* name) {
-  name->erase(std::remove(name->begin(), name->end(), '%'), name->end());
-  const string chars_to_replace = "<>[]";
-  auto pred = [&](char c) {
-    return absl::c_linear_search(chars_to_replace, c);
-  };
-  std::replace_if(name->begin(), name->end(), pred, '_');
-}
-
-}  // namespace
-
-HloTfGraphBuilder::HloTfGraphBuilder(const DebugOptions& debug_options)
-    : debug_options_(debug_options) {}
-
-Status HloTfGraphBuilder::AddComputation(const HloComputation& computation) {
-  VLOG(2) << "Adding computation " << computation.name();
-  for (auto embedded : computation.MakeEmbeddedComputationsList()) {
-    for (auto* instruction : embedded->instructions()) {
-      TF_RETURN_IF_ERROR(AddInstruction(instruction));
-    }
-  }
-  for (auto* instruction : computation.instructions()) {
-    TF_RETURN_IF_ERROR(AddInstruction(instruction));
-  }
-  return Status::OK();
-}
-
-const GraphDef& HloTfGraphBuilder::GetGraphDef() const { return graph_def_; }
-
-const string& HloTfGraphBuilder::GetNodeNameForInstruction(
-    const HloInstruction* instruction) {
-  if (ContainsKey(instruction_to_node_name_, instruction)) {
-    return instruction_to_node_name_[instruction];
-  }
-  auto append = [](string* str, const string& other) {
-    if (str->empty()) {
-      *str = other;
-    } else if (!other.empty()) {
-      StrAppend(str, "/", other);
-    }
-  };
-  string node_name;
-  if (debug_options_.xla_hlo_tfgraph_device_scopes()) {
-    auto device = instruction->sharding_unique_device();
-    if (device) {
-      node_name = StrCat("dev", *device);
-    }
-  }
-  // If an instruction is fused, put it in the subgraph of the fusion;
-  // otherwise, put it in the computation subgraph.
-  const HloComputation* computation = instruction->parent();
-  if (computation->IsFusionComputation()) {
-    append(&node_name,
-           GetNodeNameForInstruction(computation->FusionInstruction()));
-  } else {
-    append(&node_name, computation->name());
-    if (!instruction->metadata().op_name().empty()) {
-      // Always make computations contain TF ops but not the other way around.
-      append(&node_name, instruction->metadata().op_name());
-    }
-  }
-  string instruction_name = instruction->name();
-  if (instruction->opcode() == HloOpcode::kParameter) {
-    StrAppend(&instruction_name, ".", instruction->parameter_number());
-  }
-  append(&node_name, instruction_name);
-  CleanNodeName(&node_name);
-  auto ret =
-      instruction_to_node_name_.insert(std::make_pair(instruction, node_name));
-  CHECK(ret.second);
-  return ret.first->second;
-}
-
-void HloTfGraphBuilder::SetNodeAttrs(const HloInstruction* instruction,
-                                     NodeDef* node_def) const {
-  auto& attrs = *node_def->mutable_attr();
-
-  // Set the number of arguments for instructions that have variadic operands.
-  if (HloOpcodeIsVariadic(instruction->opcode())) {
-    tensorflow::AttrValue attr_value;
-    attr_value.set_i(instruction->operands().size());
-    attrs["arg_num"] = attr_value;
-  }
-
-  // Set the node type.
-  attrs["type"].set_s(
-      xla::PrimitiveType_Name(instruction->shape().element_type()));
-
-  // Set the framework op (e.g. Tensorflow op) that generated this XLA op.
-  attrs["tf_op_type"].set_s(instruction->metadata().op_type());
-  attrs["tf_op_name"].set_s(instruction->metadata().op_name());
-
-  // Set the shape of the output tensor. "_output_shapes" is a special attribute
-  // name used by Tensorboard for shapes of output tensors.
-  tensorflow::AttrValue shapes;
-  *shapes.mutable_list()->add_shape() = GetTensorShape(instruction);
-  attrs["_output_shapes"] = shapes;
-
-  // Set the layout.
-  if (LayoutUtil::HasLayout(instruction->shape())) {
-    string layout_string;
-    if (instruction->shape().IsTuple()) {
-      // For tuples, emit the full shape because the layout of a tuple is not
-      // represented in a single Layout field.
-      layout_string = ShapeUtil::HumanStringWithLayout(instruction->shape());
-    } else {
-      layout_string = StrCat(
-          "{",
-          absl::StrJoin(LayoutUtil::MinorToMajor(instruction->shape()), ","),
-          "}");
-    }
-    attrs["layout"].set_s(layout_string);
-  }
-
-  // Set op-specific attributes.
-  switch (instruction->opcode()) {
-    case HloOpcode::kConcatenate:
-    case HloOpcode::kBroadcast:
-    case HloOpcode::kReduce:
-    case HloOpcode::kReverse:
-    case HloOpcode::kTranspose:
-      for (auto dim : instruction->dimensions()) {
-        attrs["dims"].mutable_list()->add_i(dim);
-      }
-      break;
-    case HloOpcode::kGetTupleElement:
-      attrs["index"].set_i(instruction->tuple_index());
-      break;
-    case HloOpcode::kRng:
-      attrs["dist"].set_s(
-          RandomDistribution_Name(instruction->random_distribution()));
-      break;
-    case HloOpcode::kConstant:
-      if (ShapeUtil::IsScalar(instruction->shape())) {
-        attrs["value"].set_s(instruction->literal().GetAsString({}));
-      }
-      break;
-    case HloOpcode::kCustomCall:
-      attrs["custom_call_target"].set_s(instruction->custom_call_target());
-      break;
-    case HloOpcode::kSend:
-    case HloOpcode::kRecv:
-      attrs["channel_id"].set_i(instruction->channel_id());
-      break;
-    default:
-      break;
-  }
-}
-
-Status HloTfGraphBuilder::AddInstruction(const HloInstruction* instruction) {
-  if (!visited_instructions_.insert(instruction).second) {
-    // Skip instructions that have already been added.
-    return Status::OK();
-  }
-
-  NodeDef* node_def = graph_def_.add_node();
-  node_def->set_name(GetNodeNameForInstruction(instruction));
-  node_def->set_op(GetOpDefName(instruction));
-
-  auto device = instruction->sharding_unique_device();
-  if (device) {
-    node_def->set_device(GetDeviceName(*device));
-  }
-  SetNodeAttrs(instruction, node_def);
-  if (instruction->opcode() == HloOpcode::kFusion) {
-    for (auto* fused_instruction : instruction->fused_instructions()) {
-      TF_RETURN_IF_ERROR(AddInstruction(fused_instruction));
-    }
-  }
-  // Add all edges including control edges.
-  for (unsigned i = 0; i < instruction->operands().size(); ++i) {
-    *node_def->add_input() = GetNodeNameForInstruction(instruction->operand(i));
-  }
-  // Called computations are control dependencies.
-  for (const auto* called_computation : instruction->called_computations()) {
-    *node_def->add_input() = StrCat(
-        "^", GetNodeNameForInstruction(called_computation->root_instruction()));
-  }
-  return Status::OK();
-}
-
-}  // namespace hlo_graph_dumper
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
deleted file mode 100644
index c4876b852e32d34693202f4023aa20ad2b301ffd..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
-
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/xla.pb.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-
-namespace xla {
-namespace hlo_graph_dumper {
-
-// This constructs a tensorflow graph for HLO computations.
-class HloTfGraphBuilder {
- public:
-  HloTfGraphBuilder(const DebugOptions& debug_options = DebugOptions());
-
-  // Adds a computation to the graph.
-  Status AddComputation(const HloComputation& computation);
-
-  const tensorflow::GraphDef& GetGraphDef() const;
-
- private:
-  // Gets the node name of an instruction. The node name is hierarchical. For
-  // example, if an instruction is fused, it will be put in a subgraph of the
-  // fusion instruction.
-  const string& GetNodeNameForInstruction(const HloInstruction* instruction);
-
-  void SetNodeAttrs(const HloInstruction* instruction,
-                    tensorflow::NodeDef* node_def) const;
-
-  Status AddInstruction(const HloInstruction* instruction);
-
-  DebugOptions debug_options_;
-  tensorflow::GraphDef graph_def_;
-  // This records instructions that have been visited.
-  std::unordered_set<const HloInstruction*> visited_instructions_;
-  // A cache that maps instruction to the node name.
-  std::unordered_map<const HloInstruction*, string> instruction_to_node_name_;
-};
-
-}  // namespace hlo_graph_dumper
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
deleted file mode 100644
index 1e2b31a1f2bb4865faafc3d14e2b194e3aa171a1..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
+++ /dev/null
@@ -1,183 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
-#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/tensor_shape.pb.h"
-
-namespace xla {
-namespace hlo_graph_dumper {
-namespace {
-
-using ::tensorflow::GraphDef;
-
-class HloTfGraphBuilderTest : public HloTestBase {
- protected:
-  HloTfGraphBuilderTest() {}
-  HloTfGraphBuilder generator_;
-
-  // Create a computation which takes a scalar and returns its negation.
-  std::unique_ptr<HloComputation> CreateNegateComputation() {
-    auto builder = HloComputation::Builder("Negate");
-    auto param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, r0f32_, "param0"));
-    builder.AddInstruction(
-        HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, param));
-    return builder.Build();
-  }
-
-  // Creates a computation which calls map with the given computation.
-  std::unique_ptr<HloComputation> CreateMapComputation(
-      HloComputation *map_computation) {
-    auto builder = HloComputation::Builder("Map");
-    auto param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, r0f32_, "param0"));
-    builder.AddInstruction(
-        HloInstruction::CreateMap(r0f32_, {param}, map_computation));
-    return builder.Build();
-  }
-  Shape r0f32_ = ShapeUtil::MakeShape(PrimitiveType::F32, {});
-};
-
-static const tensorflow::AttrValue &GetNodeAttr(const tensorflow::NodeDef &node,
-                                                const string &attr_name) {
-  auto attr = node.attr().find(attr_name);
-  CHECK(attr != node.attr().end());
-  return attr->second;
-}
-
-TEST_F(HloTfGraphBuilderTest, CheckConcatenateDimsAndShapes) {
-  auto builder = HloComputation::Builder("Concatenate");
-  Shape shape = ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2});
-  auto param_1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, shape, "param0"));
-  auto param_2 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, shape, "param1"));
-  builder.AddInstruction(HloInstruction::CreateConcatenate(
-      ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), {param_1, param_2}, 1));
-  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 3);
-  const auto &node = graph_def.node(2);
-  EXPECT_EQ(node.name(), "Concatenate/concatenate");
-
-  // Check dimensions.
-  auto dims_value = GetNodeAttr(node, "dims");
-  EXPECT_EQ(dims_value.list().i_size(), 1);
-  EXPECT_EQ(dims_value.list().i(0), 1);
-
-  // Check shapes.
-  auto shape_value = GetNodeAttr(node, "_output_shapes");
-  EXPECT_EQ(shape_value.list().shape_size(), 1);
-  EXPECT_EQ(shape_value.list().shape(0).dim_size(), 2);
-  EXPECT_EQ(shape_value.list().shape(0).dim(0).size(), 2);
-  EXPECT_EQ(shape_value.list().shape(0).dim(1).size(), 4);
-}
-
-TEST_F(HloTfGraphBuilderTest, CheckScalarValue) {
-  auto builder = HloComputation::Builder("Const");
-  HloInstruction *instruction = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0(123)));
-  OpMetadata metadata;
-  metadata.set_op_name("x");
-  metadata.set_op_type("y");
-  instruction->set_metadata(metadata);
-  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 1);
-  const auto &node = graph_def.node(0);
-  EXPECT_EQ(GetNodeAttr(node, "value").s(), "123");
-  EXPECT_EQ(GetNodeAttr(node, "type").s(), "S32");
-  EXPECT_EQ(GetNodeAttr(node, "tf_op_name").s(), "x");
-  EXPECT_EQ(GetNodeAttr(node, "tf_op_type").s(), "y");
-}
-
-TEST_F(HloTfGraphBuilderTest, SimpleNegateComputation) {
-  auto negate_computation = CreateNegateComputation();
-  TF_CHECK_OK(generator_.AddComputation(*negate_computation));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 2);
-  EXPECT_EQ(graph_def.node(0).name(), "Negate/param0.0");
-  EXPECT_EQ(graph_def.node(0).op(), "HloParameter");
-  EXPECT_EQ(graph_def.node(1).name(), "Negate/negate");
-  EXPECT_EQ(graph_def.node(1).op(), "HloNegate");
-  EXPECT_EQ(graph_def.node(1).input_size(), 1);
-  EXPECT_EQ(graph_def.node(1).input(0), "Negate/param0.0");
-}
-
-TEST_F(HloTfGraphBuilderTest, GreaterThanOrEqualTo) {
-  auto builder = HloComputation::Builder("GE");
-  auto param_1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, r0f32_, "param0"));
-  auto param_2 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, r0f32_, "param1"));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(r0f32_, HloOpcode::kGe, param_1, param_2));
-  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 3);
-  EXPECT_EQ(graph_def.node(0).name(), "GE/param0.0");
-  EXPECT_EQ(graph_def.node(1).name(), "GE/param1.1");
-  EXPECT_EQ(graph_def.node(2).input_size(), 2);
-  EXPECT_EQ(graph_def.node(2).name(), "GE/greater-than-or-equal-to");
-  EXPECT_EQ(graph_def.node(2).op(), "HloGreaterThanOrEqualTo");
-}
-
-TEST_F(HloTfGraphBuilderTest, IncorparateTfOpsStructure) {
-  auto builder = HloComputation::Builder("GE");
-  auto param_1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, r0f32_, "param0"));
-  auto param_2 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, r0f32_, "param1"));
-  auto ge = builder.AddInstruction(
-      HloInstruction::CreateBinary(r0f32_, HloOpcode::kGe, param_1, param_2));
-  OpMetadata metadata;
-  metadata.set_op_name("x/y");
-  metadata.set_op_type("Y");
-  ge->set_metadata(metadata);
-  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 3);
-  EXPECT_EQ(graph_def.node(0).name(), "GE/param0.0");
-  EXPECT_EQ(graph_def.node(1).name(), "GE/param1.1");
-  EXPECT_EQ(graph_def.node(2).input_size(), 2);
-  EXPECT_EQ(graph_def.node(2).name(), "GE/x/y/greater-than-or-equal-to");
-  EXPECT_EQ(graph_def.node(2).op(), "HloGreaterThanOrEqualTo");
-}
-
-TEST_F(HloTfGraphBuilderTest, EmbeddedComputationsDiamond) {
-  // Create computations with a diamond-shaped callgraph.
-  auto negate_computation = CreateNegateComputation();
-  auto map1_computation = CreateMapComputation(negate_computation.get());
-  auto map2_computation = CreateMapComputation(negate_computation.get());
-
-  auto builder = HloComputation::Builder(TestName());
-  auto param = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, r0f32_, "param0"));
-  auto map1 = builder.AddInstruction(
-      HloInstruction::CreateMap(r0f32_, {param}, map1_computation.get()));
-  auto map2 = builder.AddInstruction(
-      HloInstruction::CreateMap(r0f32_, {param}, map2_computation.get()));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, map1, map2));
-  auto computation = builder.Build();
-  TF_CHECK_OK(generator_.AddComputation(*computation));
-  EXPECT_GT(generator_.GetGraphDef().node_size(), 0);
-}
-
-}  // namespace
-}  // namespace hlo_graph_dumper
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 4caaa5a32b1e213ff475591e32809f744bcb86ad..97e6ea9dad04238c2e6f1a49fba6d880ef3169c2 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -50,6 +50,7 @@ bool IsCallerInstruction(HloInstruction* hlo) {
     case HloOpcode::kReduceWindow:
     case HloOpcode::kScatter:
     case HloOpcode::kSelectAndScatter:
+    case HloOpcode::kSort:
     case HloOpcode::kFusion:
       return true;
     default:
@@ -167,6 +168,15 @@ Status ShapeVerifier::HandleFft(HloInstruction* fft) {
   return CheckShape(fft, expected);
 }
 
+Status ShapeVerifier::HandleTriangularSolve(HloInstruction* hlo) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(hlo, 2));
+  TF_ASSIGN_OR_RETURN(const Shape expected,
+                      ShapeInference::InferTriangularSolveShape(
+                          hlo->operand(0)->shape(), hlo->operand(1)->shape(),
+                          hlo->triangular_solve_options()));
+  return CheckShape(hlo, expected);
+}
+
 Status ShapeVerifier::HandleAllReduce(HloInstruction* crs) {
   std::vector<const Shape*> operand_shapes;
   for (const HloInstruction* operand : crs->operands()) {
@@ -327,13 +337,48 @@ Status ShapeVerifier::HandleSort(HloInstruction* sort) {
     return InternalError("Expected at least 1 operand for %s instruction: %s",
                          HloOpcodeString(sort->opcode()), sort->ToString());
   }
+  HloComputation* compare = sort->to_apply();
+
+  // Check that the 'compare' computation returns a PRED.
+  Shape compare_shape = compare->root_instruction()->shape();
+  if (!ShapesSame(compare_shape, ShapeUtil::MakeShape(PRED, {}))) {
+    return InternalError(
+        "The Sort compare computation shape does not lead to a scalar "
+        "predicate shape: %s",
+        StringifyShape(compare_shape));
+  }
+
+  // Check that the number of parameters of the 'compare' computation is
+  // correct.
+  TF_RETURN_IF_ERROR(
+      CheckParameterCount(sort, compare, sort->operand_count() * 2));
+
+  // Verify that the operands of the compare computation have the correct scalar
+  // shapes.
+  for (int64 parameter_idx = 0; parameter_idx < compare->num_parameters();
+       ++parameter_idx) {
+    int64 operand_idx = parameter_idx / 2;
+    Shape expected_scalar_shape = ShapeUtil::MakeShape(
+        sort->operand(operand_idx)->shape().element_type(), {});
+    Shape actual_parameter_shape =
+        compare->parameter_instruction(parameter_idx)->shape();
+    if (!ShapeUtil::CompatibleIgnoringFpPrecision(expected_scalar_shape,
+                                                  actual_parameter_shape)) {
+      return InternalError(
+          "Expected the %lld-th parameter of the compare computation of sort "
+          "to have shape %s, but got %s",
+          parameter_idx, StringifyShape(expected_scalar_shape),
+          StringifyShape(actual_parameter_shape));
+    }
+  }
+
+  // Verify that all operand shapes have the same dimensions.
   for (int64 operand = 1; operand < sort->operand_count(); ++operand) {
     if (!ShapeUtil::SameDimensions(sort->operand(0)->shape(),
                                    sort->operand(operand)->shape())) {
       return InternalError(
-          "Expected sort to have to have the same dimensions for the keys "
-          "and the values. Keys shape is: %s\n, Values shape (operand index "
-          "%lld) is: %s",
+          "Expected sort to have to have the same dimensions for all operands. "
+          "First operand shape is: %s\n, shape (operand index %lld) is: %s",
           StringifyShape(sort->operand(0)->shape()), operand,
           StringifyShape(sort->operand(operand)->shape()));
     }
@@ -376,6 +421,24 @@ Status ShapeVerifier::HandleGetTupleElement(HloInstruction* get_tuple_element) {
                         get_tuple_element->tuple_index()));
 }
 
+namespace {
+Status SameElementTypesForOperandsAndToApplyParameters(
+    const HloInstruction& instruction, int64 num_operands_to_check) {
+  const ProgramShape& to_apply = instruction.to_apply()->ComputeProgramShape();
+  for (int i = 0; i < num_operands_to_check; ++i) {
+    const Shape& parameter_shape = to_apply.parameters(i);
+    const Shape& operand_shape = instruction.operands()[i]->shape();
+    if (!ShapeUtil::SameElementType(parameter_shape, operand_shape)) {
+      return InvalidArgument(
+          "Shape mismatch between to_apply computation"
+          " parameter and operand %d in %s.",
+          i, instruction.ToString().c_str());
+    }
+  }
+  return Status::OK();
+}
+}  // namespace
+
 Status ShapeVerifier::HandleReduce(HloInstruction* reduce) {
   if (reduce->operand_count() % 2 != 0) {
     return InternalError(
@@ -387,9 +450,15 @@ Status ShapeVerifier::HandleReduce(HloInstruction* reduce) {
   for (const HloInstruction* operand : reduce->operands()) {
     operand_shapes.push_back(&operand->shape());
   }
-  return CheckShape(reduce, ShapeInference::InferReduceShape(
-                                operand_shapes, reduce->dimensions(),
-                                reduce->to_apply()->ComputeProgramShape()));
+  TF_RETURN_IF_ERROR(
+      CheckShape(reduce, ShapeInference::InferReduceShape(
+                             operand_shapes, reduce->dimensions(),
+                             reduce->to_apply()->ComputeProgramShape())));
+
+  return allow_mixed_precision_
+             ? Status::OK()
+             : SameElementTypesForOperandsAndToApplyParameters(
+                   *reduce, reduce->operands().size() - 1);
 }
 
 Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) {
@@ -545,19 +614,31 @@ Status ShapeVerifier::HandleMap(HloInstruction* map) {
   // arbitrary map dimensions.
   std::vector<int64> map_dims(max_operand_rank);
   std::iota(map_dims.begin(), map_dims.end(), 0);
-  return CheckShape(map, ShapeInference::InferMapShape(
-                             operand_shapes,
-                             map->to_apply()->ComputeProgramShape(), map_dims));
+
+  TF_RETURN_IF_ERROR(CheckShape(
+      map,
+      ShapeInference::InferMapShape(
+          operand_shapes, map->to_apply()->ComputeProgramShape(), map_dims)));
+
+  return allow_mixed_precision_
+             ? Status::OK()
+             : SameElementTypesForOperandsAndToApplyParameters(
+                   *map, map->operands().size());
 }
 
 Status ShapeVerifier::HandleReduceWindow(HloInstruction* reduce_window) {
   TF_RETURN_IF_ERROR(CheckOperandCount(reduce_window, 2));
-  return CheckShape(
+  TF_RETURN_IF_ERROR(CheckShape(
       reduce_window,
       ShapeInference::InferReduceWindowShape(
           reduce_window->operand(0)->shape(),
           reduce_window->operand(1)->shape(), reduce_window->window(),
-          reduce_window->to_apply()->ComputeProgramShape()));
+          reduce_window->to_apply()->ComputeProgramShape())));
+
+  return allow_mixed_precision_
+             ? Status::OK()
+             : SameElementTypesForOperandsAndToApplyParameters(*reduce_window,
+                                                               1);
 }
 
 Status ShapeVerifier::HandleSelectAndScatter(HloInstruction* instruction) {
@@ -1244,8 +1325,8 @@ Status CheckFusionInstruction(HloInstruction* fusion) {
   return Status::OK();
 }
 
-// Checks that the non-scalar operand shapes are compatible to the output
-// shape, i.e., that there are no implicit broadcasts of size-one dimensions.
+// Checks that the operand shapes are compatible to the output shape, i.e.,
+// that there are no implicit broadcasts.
 Status CheckElementwiseInstruction(HloInstruction* instruction) {
   const Shape& out_shape = instruction->shape();
   for (HloInstruction* operand : instruction->operands()) {
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index facb76a124a4166e2a29c34f01194c9ebb62498b..a9b5e9a3e6eec19e125188a192694fcaadfe2322 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -52,6 +52,7 @@ class ShapeVerifier : public DfsHloVisitor {
   Status HandleDot(HloInstruction* dot) override;
   Status HandleConvolution(HloInstruction* convolution) override;
   Status HandleFft(HloInstruction* fft) override;
+  Status HandleTriangularSolve(HloInstruction* hlo) override;
   Status HandleAllReduce(HloInstruction* crs) override;
   Status HandleAllToAll(HloInstruction* hlo) override;
   Status HandleCollectivePermute(HloInstruction* hlo) override;
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
index 4f69bd155b8713041ba539098808125956e86259..523890b3c7268c06cdb6aaa67749f26a1cb62855 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -552,5 +552,67 @@ TEST_F(HloVerifierTest, IotaNonArrayResult) {
               HasSubstr("does not support non-array result"));
 }
 
+static const char* const kMapOperandComputationMismatchHlo = R"(
+  HloModule MapOperandComputationMismatch
+
+  Computation {
+    param0 = f32[] parameter(0)
+    constant = f32[] constant(1)
+    ROOT add = f32[] add(param0, constant)
+  }
+
+  ENTRY kernelEntry {
+  param = f64[] parameter(0)
+  ROOT map = f32[] map(param), dimensions={}, to_apply=Computation
+})";
+
+TEST_F(HloVerifierTest, MapOperandComputationMismatch) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kMapOperandComputationMismatchHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(
+      status.error_message(),
+      HasSubstr(
+          "Shape mismatch between to_apply computation parameter and operand"));
+}
+
+TEST_F(HloVerifierTestAllowMixedPrecision, MapOperandComputationMismatch) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kMapOperandComputationMismatchHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
+static const char* const kReduceOperandComputationMismatchHlo = R"(
+  HloModule ReduceOperandComputationMismatch
+  computation {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  ENTRY kernelEntry {
+    arg0 = f16[64,64,224,224]{3,2,1,0} parameter(0)
+    constant = f16[] constant(0)
+    reduce = f16[64]{0} reduce(arg0, constant), dimensions={0,2,3}, to_apply=computation
+  })";
+
+TEST_F(HloVerifierTest, ReduceOperandComputationMismatch) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kReduceOperandComputationMismatchHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Expected instruction to have shape equal to f32[64]"));
+}
+
+TEST_F(HloVerifierTestAllowMixedPrecision, ReduceOperandComputationMismatch) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kReduceOperandComputationMismatchHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.cc b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
index 76bf48870d55e82497ba5f63e9e2e2a322cb330e..c5d32a4b9ad8c708ec0870173fa72320238e8464 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
-namespace gtl = ::tensorflow::gtl;
 
 namespace {
 using Analysis = IndexedArrayAnalysis;
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index d4794acb2f463c4cf8ce5e969f221d52e3742453..a5767774f2dcebf8fd309823d48dcc3269b3f594 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -158,6 +158,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kSort:
     case HloOpcode::kTanh:
     case HloOpcode::kTrace:
+    case HloOpcode::kTriangularSolve:
     case HloOpcode::kWhile:
     case HloOpcode::kGetDimensionSize:
       return true;
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index a305c6e8005045f7dbca3b8099a3b8ddebb092af..8cd936268994c2a25c2c0debe0a003d1d05cbd0b 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -50,6 +50,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:map_inliner",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
+        "//tensorflow/compiler/xla/service:triangular_solve_expander",
         "//tensorflow/compiler/xla/service:while_loop_simplifier",
         "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index 0827b1daf89bebb68c045784ef2b9da677792880..792773c676984aa280c1b20cb7fd0fc7c9425f6c 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/map_inliner.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
+#include "tensorflow/compiler/xla/service/triangular_solve_expander.h"
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -79,6 +80,7 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
   HloPassPipeline pipeline("Interpreter");
 
   pipeline.AddPass<DynamicIndexSplitter>();
+  pipeline.AddPass<TriangularSolveExpander>();
   pipeline.AddPass<LayoutAssignment>(
       hlo_module->mutable_entry_computation_layout(),
       LayoutAssignment::InstructionCanChangeLayout);
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 9376a3c8f8963551a89dcedd77068a39ffd05301..2dd9e055503e01f5c1ace5e6cd3dc64012828b71 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -2070,6 +2070,7 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kSort:
     case HloOpcode::kSubtract:
     case HloOpcode::kTanh:
+    case HloOpcode::kTriangularSolve:
     case HloOpcode::kTupleSelect:
     case HloOpcode::kWhile:
       return false;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
index c66eaec8fb0e4c03f6967fec0cf0ae9661cdf470..3acceccfa556103c15fe229c41e96e618ac59c80 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
@@ -113,20 +113,10 @@ Status EmitDynamicUpdateSliceInPlace(absl::Span<const IrArray> operand_arrays,
   Shape output_shape = output_array.GetShape();
   Shape update_shape = update_array.GetShape();
 
-  IndexGenerator start_indices_generator;
-  // TODO(b/118437727): Remove the R1 path, and rename the variables.
-  if (start_indices_array.GetShape().rank() == 1) {
-    start_indices_generator = [&](int64 index) {
-      return start_indices_array.EmitReadArrayElement(
-          IrArray::Index({b->getInt64(index)}), b);
-    };
-  } else {
-    start_indices_generator = [&](int64 index) {
-      return operand_arrays[2 + index].EmitReadArrayElement(
-          IrArray::Index(b->getInt64Ty()), b);
-    };
-  }
-
+  IndexGenerator start_indices_generator = [&](int64 index) {
+    return operand_arrays[2 + index].EmitReadArrayElement(
+        IrArray::Index(b->getInt64Ty()), b);
+  };
   ElementGenerator update_array_generator = [&](const IrArray::Index& index) {
     return update_array.EmitReadArrayElement(index, b);
   };
@@ -178,21 +168,11 @@ static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
   TF_RETURN_IF_ERROR(dynamic_update_slice->Accept(&fused_emitter));
   ElementGenerator update_array_generator = fused_emitter.GetGenerator(update);
 
-  // TODO(b/118437727): Remove the R1 path, and rename the variables.
-  IndexGenerator start_indices_generator;
-  if (start_indices->shape().rank() == 1) {
-    start_indices_generator = [&](int64 index) {
-      return fused_emitter.GetGenerator(start_indices)(
-          IrArray::Index({b->getInt64(index)}));
-    };
-  } else {
-    start_indices_generator = [&](int64 index) {
-      ElementGenerator element_generator =
-          fused_emitter.GetGenerator(dynamic_update_slice->operand(2 + index));
-      return element_generator(IrArray::Index(b->getInt64Ty()));
-    };
-  }
-
+  IndexGenerator start_indices_generator = [&](int64 index) {
+    ElementGenerator element_generator =
+        fused_emitter.GetGenerator(dynamic_update_slice->operand(2 + index));
+    return element_generator(IrArray::Index(b->getInt64Ty()));
+  };
   bool is_signed = ShapeUtil::ElementIsSigned(start_indices->shape());
   return EmitDynamicUpdateSliceInPlaceImpl(
       update_shape, start_indices_generator, is_signed, update_array_generator,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
index cf5083e8c13b9485035923895cec1ad05049c644..02c719502ee7b0a732ae74acec364f89d51ae0c1 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
@@ -269,6 +269,11 @@ class IrBuilderMixin {
     return mixin_builder()->CreateFCmpUNE(std::forward<Args>(args)...);
   }
 
+  template <class... Args>
+  llvm::Value* FCmpUNO(Args&&... args) {
+    return mixin_builder()->CreateFCmpUNO(std::forward<Args>(args)...);
+  }
+
   template <class... Args>
   llvm::Value* FDiv(Args&&... args) {
     return mixin_builder()->CreateFDiv(std::forward<Args>(args)...);
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
index fe320bbe727111fbc986cc1fbc217feed74d30f1..3a35405a2da0af386e01bb48bed56ad194048543 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index 0dc120e0b0df47f261435f490a8459b49d989b53..a689881e65ec3a7ddf606c36bdd64b749cfe358e 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
index 89b6a36f96beedbcb7322e6164ac59221650d3d8..d71addec9b7317dfe16e9d7e5380c3cfda0b8c06 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "llvm/ADT/APInt.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
@@ -45,13 +46,14 @@ namespace llvm_ir {
 namespace {
 
 // Adds the inner comparison loop body where we compare elements.
-void EmitCompareLoopBody(
-    int64 iteration_bound, PrimitiveType key_type, int64 num_values,
-    int64 iota_values_parameter_index, llvm::Value* element_pair_index,
+Status EmitCompareLoopBody(
+    int64 iteration_bound, int64 num_values, llvm::Value* element_pair_index,
     int64 xor_mask, llvm::Type* index_type,
-    std::function<llvm::Value*(int64 operand, llvm::Value* index)> read_element,
+    std::function<llvm::Value*(int64 operand, llvm::Value* index)>
+        element_address,
     std::function<void(int64 operand, llvm::Value* index, llvm::Value* value)>
         write_element,
+    const EmitCallToNestedComputationCallback& emit_compare_callback,
     llvm::IRBuilder<>* b, bool needs_bounds_checks = true) {
   auto index_typed_constant = [&](int64 value) {
     return llvm::ConstantInt::get(index_type, value);
@@ -108,74 +110,44 @@ void EmitCompareLoopBody(
 
   // if (is_smaller_index && index_is_inbounds)
   KernelSupportLibrary ksl(b);
-  ksl.If("smaller_comparison_index", do_comparison, [&]() {
-    auto key1 = read_element(0, current_keys_index);
-    auto key2 = read_element(0, compare_keys_index);
-    auto compare_key1 = key1;
-    auto compare_key2 = key2;
-    bool is_signed_comparison = true;
-    if (primitive_util::IsFloatingPointType(key_type)) {
-      // We would like a total order of floating point numbers so that the
-      // sort has a predictable behavior in the presence of NaNs. Rather
-      // than using floating point comparison, we use the following trick:
-      // If f is a float, and
-      // x = bit_cast<int32>(f);
-      // y = x < 0 ? 0x7FFFFFFF - x : x;
-      // then y is ordered as an int32 such that finite values have the
-      // obvious order, -0 is ordered before 0, and -NaN and NaN appear at
-      // the beginning and end of the ordering.
-      auto k = b->getInt(llvm::APInt::getSignedMaxValue(
-          key1->getType()->getPrimitiveSizeInBits()));
-      auto comparison_type = k->getType();
-      auto zero = llvm::ConstantInt::get(comparison_type, 0);
-      auto maybe_flip = [&](llvm::Value* v) {
-        return b->CreateSelect(b->CreateICmp(llvm::ICmpInst::ICMP_SLT, v, zero),
-                               b->CreateSub(k, v), v);
-      };
-      compare_key1 = b->CreateBitCast(key1, comparison_type);
-      compare_key2 = b->CreateBitCast(key2, comparison_type);
-      compare_key1 = maybe_flip(compare_key1);
-      compare_key2 = maybe_flip(compare_key2);
-    } else if (!primitive_util::IsSignedIntegralType(key_type)) {
-      is_signed_comparison = false;
-    }
-    // If key2 < key1
-    auto is_smaller_than =
-        b->CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT
-                                           : llvm::ICmpInst::ICMP_ULT,
-                      compare_key2, compare_key1);
-    if (iota_values_parameter_index >= 0) {
-      auto keys_equal = b->CreateICmpEQ(compare_key1, compare_key2);
-      auto key_index1 =
-          read_element(iota_values_parameter_index, current_keys_index);
-      auto key_index2 =
-          read_element(iota_values_parameter_index, compare_keys_index);
-      auto index_is_smaller_than =
-          b->CreateICmp(llvm::ICmpInst::ICMP_ULT, key_index2, key_index1);
-      is_smaller_than = b->CreateOr(
-          is_smaller_than, b->CreateAnd(keys_equal, index_is_smaller_than));
+  return ksl.IfWithStatus("smaller_comparison_index", do_comparison, [&]() {
+    std::vector<llvm::Value*> values_to_compare;
+    for (int i = 0; i < num_values; ++i) {
+      values_to_compare.push_back(element_address(i, compare_keys_index));
+      values_to_compare.push_back(element_address(i, current_keys_index));
     }
+    llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
+    llvm::Value* compare_return_buffer = llvm_ir::EmitAllocaAtFunctionEntry(
+        llvm_ir::PrimitiveTypeToIrType(PRED, module), "compare_return_buffer",
+        b);
+    TF_RETURN_IF_ERROR(
+        emit_compare_callback(values_to_compare, compare_return_buffer));
+    llvm::Value* result = b->CreateLoad(compare_return_buffer);
+
+    // Check if the 'compare' function returns true.
+    llvm::Value* is_smaller_than =
+        b->CreateICmpNE(result, llvm::ConstantInt::get(result->getType(), 0),
+                        "boolean_predicate");
     ksl.If("is_smaller_than", is_smaller_than, [&]() {
-      // Swap key1 with key2.
-      write_element(0, current_keys_index, key2);
-      write_element(0, compare_keys_index, key1);
-      for (int64 i = 1; i <= num_values; ++i) {
-        // Also swap the values.
-        auto value1 = read_element(i, current_keys_index);
-        auto value2 = read_element(i, compare_keys_index);
-        write_element(i, current_keys_index, value2);
-        write_element(i, compare_keys_index, value1);
+      for (int64 i = 0; i < num_values; ++i) {
+        // Swap the values.
+        auto value1 = b->CreateLoad(values_to_compare[i * 2]);
+        auto value2 = b->CreateLoad(values_to_compare[i * 2 + 1]);
+        write_element(i, current_keys_index, value1);
+        write_element(i, compare_keys_index, value2);
       }
     });
+    return Status::OK();
   });
 }
 
-void EmitTiledCompareLoop(
+Status EmitTiledCompareLoop(
     const IrArray::Index& tiled_keys_index, int64 dimension_to_sort,
-    int64 dimension_to_sort_bound, PrimitiveType keys_type,
-    absl::Span<const int64> xor_masks, const std::vector<IrArray>& params,
-    const std::vector<llvm::Value*>& param_shmem_buffers,
-    int64 iota_values_parameter_index, int64 tile_size, llvm::IRBuilder<>* b) {
+    int64 dimension_to_sort_bound, absl::Span<const int64> xor_masks,
+    const std::vector<IrArray>& params,
+    const std::vector<llvm::Value*>& param_shmem_buffers, int64 tile_size,
+    const EmitCallToNestedComputationCallback& emit_compare_callback,
+    llvm::IRBuilder<>* b) {
   KernelSupportLibrary ksl(b);
   llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b);
@@ -200,7 +172,7 @@ void EmitTiledCompareLoop(
             [&]() {
               auto cache_index = b->CreateShl(thread_id, value_one);
               read_or_write(cache_index, current_keys_index);
-              // Increment to go the next index position.
+              // Increment to go to the next index position.
               current_keys_index = b->CreateAdd(current_keys_index, value_one);
               // Here we check whether the next index position is within bounds.
               ksl.If("inner_smaller_keys_index",
@@ -230,10 +202,18 @@ void EmitTiledCompareLoop(
   llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, b);
 
   // Now emit the bodies of the comparison loops.
-  auto read_element = [&](int64 operand, llvm::Value* index) {
-    return b->CreateLoad(
+  auto element_address = [&](int64 operand, llvm::Value* index) {
+    auto shared_memory_address =
         b->CreateGEP(param_shmem_buffers[operand],
-                     {tiled_keys_index.GetConstantWithIndexType(0), index}));
+                     {tiled_keys_index.GetConstantWithIndexType(0), index});
+    auto ptr_type = shared_memory_address->getType();
+    // We need a generic pointer with address space 0 instead of a pointer to
+    // shared memory (address space 3) so that we can pass it to the comparison
+    // computation.
+    return b->CreateAddrSpaceCast(
+        shared_memory_address,
+        llvm::PointerType::get(ptr_type->getPointerElementType(),
+                               /*AddressSpace=*/0));
   };
   auto write_element = [&](int64 operand, llvm::Value* index,
                            llvm::Value* value) {
@@ -252,7 +232,7 @@ void EmitTiledCompareLoop(
     if (dimension_to_sort_bound % tile_size) {
       // Otherwise we need a bounds check for the last tile. The last tile has
       // size 'dimension_to_sort_bound' % 'tile_size'.
-      ksl.If(
+      TF_RETURN_IF_ERROR(ksl.IfWithStatus(
           "is_last_tile",
           b->CreateICmpUGE(
               b->CreateMul(tiled_keys_index[dimension_to_sort],
@@ -260,24 +240,24 @@ void EmitTiledCompareLoop(
               tiled_keys_index.GetConstantWithIndexType(
                   RoundDownToNearest(dimension_to_sort_bound, tile_size))),
           [&]() {
-            EmitCompareLoopBody(dimension_to_sort_bound % tile_size, keys_type,
-                                params.size() - 1, iota_values_parameter_index,
-                                element_pair_index, xor_mask,
-                                tiled_keys_index.GetType(), read_element,
-                                write_element, b);
+            return EmitCompareLoopBody(
+                dimension_to_sort_bound % tile_size, params.size(),
+                element_pair_index, xor_mask, tiled_keys_index.GetType(),
+                element_address, write_element, emit_compare_callback, b);
           },
           [&]() {
-            EmitCompareLoopBody(tile_size, keys_type, params.size() - 1,
-                                iota_values_parameter_index, element_pair_index,
-                                xor_mask, tiled_keys_index.GetType(),
-                                read_element, write_element, b,
-                                /*needs_bounds_checks=*/false);
-          });
+            return EmitCompareLoopBody(
+                tile_size, params.size(), element_pair_index, xor_mask,
+                tiled_keys_index.GetType(), element_address, write_element,
+                emit_compare_callback, b,
+                /*needs_bounds_checks=*/false);
+          }));
     } else {
-      EmitCompareLoopBody(tile_size, keys_type, params.size() - 1,
-                          iota_values_parameter_index, element_pair_index,
-                          xor_mask, tiled_keys_index.GetType(), read_element,
-                          write_element, b, /*needs_bounds_checks=*/false);
+      TF_RETURN_IF_ERROR(EmitCompareLoopBody(
+          tile_size, params.size(), element_pair_index, xor_mask,
+          tiled_keys_index.GetType(), element_address, write_element,
+          emit_compare_callback, b,
+          /*needs_bounds_checks=*/false));
     }
     // Wait until all comparisons have happened.
     llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, b);
@@ -301,17 +281,16 @@ void EmitTiledCompareLoop(
   // same location in shared memory because we have exactly tile_size / 2 many
   // threads, and the linear index calculated by ParallelLoopEmitter uses
   // linear_index = blockIdx.x * blockDim.x + threadIdx.x;
+  return Status::OK();
 }
 }  // namespace
 
-Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
-                       const std::vector<IrArray>& values_arrays,
-                       int64 iota_values_parameter_index,
-                       absl::string_view name,
-                       absl::Span<const int64> xor_masks, llvm::IRBuilder<>* b,
-                       const gpu::LaunchDimensions& launch_dimensions,
-                       int64 num_iterations_in_sort_dim,
-                       const int64 tile_size) {
+Status EmitSortInPlace(
+    int64 dimension_to_sort, const std::vector<IrArray>& values_arrays,
+    absl::string_view name, absl::Span<const int64> xor_masks,
+    llvm::IRBuilder<>* b, const gpu::LaunchDimensions& launch_dimensions,
+    int64 num_iterations_in_sort_dim, const int64 tile_size,
+    const EmitCallToNestedComputationCallback& emit_compare_callback) {
   // Iterate through the keys shape in physical order, but skip the dimension to
   // sort and make it the innermost loop which is the loop where the comparisons
   // happen. In the dimension to sort, if we use tiling, we iterate through it
@@ -321,7 +300,7 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
   // within those 64 elements and are therefore independent of the other
   // comparisons).
 
-  const Shape& keys_shape = keys_array.GetShape();
+  const Shape& keys_shape = values_arrays[0].GetShape();
   int64 rank = keys_shape.rank();
   int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
   std::vector<int64> dimensions_in_iteration_order(rank);
@@ -338,18 +317,16 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
 
   Shape iteration_shape = ShapeUtil::MakeShape(keys_shape.element_type(),
                                                dimensions_in_iteration_order);
-  std::vector<IrArray> params(1, keys_array);
-  params.insert(params.end(), values_arrays.begin(), values_arrays.end());
 
   // Allocate shared memory for the tiled compare loop.
-  std::vector<llvm::Value*> param_shmem_buffers(params.size(), nullptr);
+  std::vector<llvm::Value*> param_shmem_buffers(values_arrays.size(), nullptr);
   if (xor_masks.size() > 1) {
     llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
-    for (int64 i = 0; i < params.size(); ++i) {
-      llvm::Type* tile_type =
-          llvm::ArrayType::get(llvm_ir::PrimitiveTypeToIrType(
-                                   params[i].GetShape().element_type(), module),
-                               tile_size);
+    for (int64 i = 0; i < values_arrays.size(); ++i) {
+      llvm::Type* tile_type = llvm::ArrayType::get(
+          llvm_ir::PrimitiveTypeToIrType(
+              values_arrays[i].GetShape().element_type(), module),
+          tile_size);
       param_shmem_buffers[i] = llvm_ir::AllocateSharedMemoryTile(
           module, tile_type, absl::StrCat(name, "_tile_param_", i));
     }
@@ -376,25 +353,24 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
       keys_index[iteration_order_to_logical_order[i]] = tiles_index[i];
     }
     if (xor_masks.size() > 1) {
-      EmitTiledCompareLoop(keys_index, dimension_to_sort,
-                           dimension_to_sort_bound, keys_shape.element_type(),
-                           xor_masks, params, param_shmem_buffers,
-                           iota_values_parameter_index, tile_size, b);
+      TF_RETURN_IF_ERROR(EmitTiledCompareLoop(
+          keys_index, dimension_to_sort, dimension_to_sort_bound, xor_masks,
+          values_arrays, param_shmem_buffers, tile_size, emit_compare_callback,
+          b));
     } else {
-      auto read_element = [&](int64 operand, llvm::Value* index) {
+      auto element_address = [&](int64 operand, llvm::Value* index) {
         keys_index[dimension_to_sort] = index;
-        return params[operand].EmitReadArrayElement(keys_index, b);
+        return values_arrays[operand].EmitArrayElementAddress(keys_index, b);
       };
       auto write_element = [&](int64 operand, llvm::Value* index,
                                llvm::Value* value) {
         keys_index[dimension_to_sort] = index;
-        params[operand].EmitWriteArrayElement(keys_index, value, b);
+        values_arrays[operand].EmitWriteArrayElement(keys_index, value, b);
       };
-      EmitCompareLoopBody(dimension_to_sort_bound, keys_shape.element_type(),
-                          values_arrays.size(), iota_values_parameter_index,
-                          tiles_index[rank - 1], xor_masks[0],
-                          tiles_index.GetType(), read_element, write_element,
-                          b);
+      TF_RETURN_IF_ERROR(EmitCompareLoopBody(
+          dimension_to_sort_bound, values_arrays.size(), tiles_index[rank - 1],
+          xor_masks[0], tiles_index.GetType(), element_address, write_element,
+          emit_compare_callback, b));
     }
     return Status::OK();
   };
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
index 685f9383acba416f51681270e4037d56abb4b6ea..b9341a34d1f2203db6e02c3df5d607174b6d0f74 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
@@ -28,19 +28,18 @@ limitations under the License.
 
 namespace xla {
 namespace llvm_ir {
+using EmitCallToNestedComputationCallback =
+    std::function<Status(absl::Span<llvm::Value* const>, llvm::Value*)>;
 // Emits llvm IR to do pairwise comparisons/swaps in the 'dimension_to_sort'
-// dimension of 'keys_array'. All other dimensions are kept as-is. This
-// implements the inner loop of BitonicSort. It is assumed that 'xor_masks'
-// contains only powers of 2, or values 2^k - 1 (k > 0). If
-// 'iota_values_parameter_index' is >= 0, it points at a 'values_arrays' operand
-// that is a iota and can be used to make the sorting stable.
-Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
-                       const std::vector<IrArray>& values_arrays,
-                       int64 iota_values_parameter_index,
-                       absl::string_view name,
-                       absl::Span<const int64> xor_masks, llvm::IRBuilder<>* b,
-                       const gpu::LaunchDimensions& launch_dimensions,
-                       int64 num_iterations_in_sort_dim, int64 tile_size);
+// dimension of each array in 'values_arrays'. All other dimensions are kept
+// as-is. This implements the inner loop of BitonicSort. It is assumed that
+// 'xor_masks' contains only powers of 2, or values 2^k - 1 (k > 0).
+Status EmitSortInPlace(
+    int64 dimension_to_sort, const std::vector<IrArray>& values_arrays,
+    absl::string_view name, absl::Span<const int64> xor_masks,
+    llvm::IRBuilder<>* b, const gpu::LaunchDimensions& launch_dimensions,
+    int64 num_iterations_in_sort_dim, int64 tile_size,
+    const EmitCallToNestedComputationCallback& emit_compare_callback);
 }  // namespace llvm_ir
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/op_expander_pass.cc b/tensorflow/compiler/xla/service/op_expander_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..02c9d4b387b112be39c204d35fe4fa1013ed064c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/op_expander_pass.cc
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
+
+#include <utility>
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+
+StatusOr<bool> OpExpanderPass::Run(HloModule* module) {
+  std::vector<HloInstruction*> matching_instructions;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    absl::c_copy_if(
+        computation->instructions(), std::back_inserter(matching_instructions),
+        [&](HloInstruction* inst) { return InstructionMatchesPattern(inst); });
+  }
+
+  for (HloInstruction* inst : matching_instructions) {
+    TF_ASSIGN_OR_RETURN(HloInstruction * expanded_root,
+                        ExpandInstruction(inst));
+    if (expanded_root == nullptr) {
+      continue;
+    }
+    TF_RETURN_IF_ERROR(inst->parent()->ReplaceInstruction(inst, expanded_root));
+  }
+
+  return !matching_instructions.empty();
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/op_expander_pass.h b/tensorflow/compiler/xla/service/op_expander_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..276e3d70b8ecd8742e0b277698765063198fe872
--- /dev/null
+++ b/tensorflow/compiler/xla/service/op_expander_pass.h
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_OP_EXPANDER_PASS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_OP_EXPANDER_PASS_H_
+
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// This pass is an abstract superclass for passes that replace operations that
+// match a pattern. It is intended to be subclassed, not used directly.
+//
+// This pass is useful for legalizing HLO instructions that a particular backend
+// does not support into other HLO instructions.
+class OpExpanderPass : public HloModulePass {
+ public:
+  StatusOr<bool> Run(HloModule* module) override;
+
+ protected:
+  // Returns `true` if `instruction` should be expanded by this pass.
+  virtual bool InstructionMatchesPattern(HloInstruction* instruction) = 0;
+
+  // Returns a replacement for `instruction`, or nullptr if no replacement is
+  // neeeded (e.g. only the to_apply subcomputation of the instruction was
+  // modified).
+  virtual StatusOr<HloInstruction*> ExpandInstruction(
+      HloInstruction* instruction) = 0;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_OP_EXPANDER_PASS_H_
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 3ddcaae193ba266f35fa6f9922fe4f3a4970cdc5..3f4456c1bbf0f620609459256424b9cb30a04e13 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -534,6 +534,10 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
                     p.edge_padding_high() +
                     std::max<int64>(operand_shape.dimensions(i) - 1, 0LL) *
                         p.interior_padding();
+    if (dimensions[i] < 0) {
+      return InvalidArgument("Padding result in negative size for dimension %d",
+                             i);
+    }
     is_dynamic[i] = operand_shape.is_dynamic_dimension(i);
   }
 
@@ -832,7 +836,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
           ShapeUtil::HumanString(larger_shape));
     }
     if (small_is_dynamic != large_is_dynamic) {
-      if ((small_dimension_size == 1 && !small_is_dynamic) ||
+      if (small_dimension_size == large_dimension_size ||
+          (small_dimension_size == 1 && !small_is_dynamic) ||
           (large_dimension_size == 1 && !large_is_dynamic)) {
         // Do nothing. It's OK when the size-1 dimension is not static.
       } else {
@@ -1858,6 +1863,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
               fft_length[i]);
         }
       }
+      if (ShapeUtil::IsZeroElementArray(in)) {
+        return in;
+      }
       Shape result = ShapeUtil::ChangeElementType(in, C64);
       result.set_dimensions(result.dimensions_size() - 1,
                             fft_length[fft_rank - 1] / 2 + 1);
@@ -1899,6 +1907,49 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 #undef RET_CHECK_RANK
 }
 
+/* static */ StatusOr<Shape> ShapeInference::InferTriangularSolveShape(
+    const Shape& a, const Shape& b, const TriangularSolveOptions& options) {
+  if (a.rank() < 2) {
+    return InvalidArgument(
+        "The 'a' argument to TriangularSolve must have rank >= 2, got shape %s",
+        a.ToString());
+  }
+  if (b.rank() != a.rank()) {
+    return InvalidArgument(
+        "Arguments to triangular solve must have equal rank; got %s and %s.",
+        b.ToString(), a.ToString());
+  }
+  if (a.dimensions(a.rank() - 2) != a.dimensions(a.rank() - 1)) {
+    return InvalidArgument(
+        "The two minor dimensions of 'a' must have equal size, got %s.",
+        a.ToString());
+  }
+  if (a.dimensions(a.rank() - 1) !=
+      b.dimensions(b.rank() - (options.left_side() ? 2 : 1))) {
+    return InvalidArgument(
+        "The shared dimension of 'a' and 'b' does not match, got shapes %s and "
+        "%s",
+        a.ToString(), b.ToString());
+  }
+  absl::Span<const int64> a_batch_dims(a.dimensions());
+  absl::Span<const int64> b_batch_dims(b.dimensions());
+  a_batch_dims.remove_suffix(2);
+  b_batch_dims.remove_suffix(2);
+  if (a_batch_dims != b_batch_dims) {
+    return InvalidArgument(
+        "The leading batch dimensions of the arguments to triangular solve "
+        "must be equal; got %s and %s.",
+        b.ToString(), a.ToString());
+  }
+  if (!TriangularSolveOptions_Transpose_IsValid(options.transpose_a()) ||
+      options.transpose_a() == TriangularSolveOptions::TRANSPOSE_INVALID) {
+    return InvalidArgument(
+        "Invalid transpose option value for triangular solve (%d).\n",
+        options.transpose_a());
+  }
+  return b;
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferAllReduceShape(
     absl::Span<const Shape* const> operand_shapes) {
   for (const Shape* operand_shape : operand_shapes) {
@@ -2345,8 +2396,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 
     if (operand_shape.rank() != number_of_indices) {
       return InvalidArgument(
-          "Dynamic update slice start number of dimensions %d must match rank "
-          "%d of slice input (%s).",
+          "Dynamic update slice start number of dimensions %d must match "
+          "rank %d of slice input (%s).",
           number_of_indices, operand_shape.rank(),
           ShapeUtil::HumanString(operand_shape));
     }
@@ -2433,7 +2484,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         ShapeUtil::HumanString(arg));
   }
 
-  if (index >= arg.tuple_shapes_size()) {
+  if (index < 0 || index >= arg.tuple_shapes_size()) {
     return InvalidArgument(
         "Cannot infer shape: attempt to index out of tuple bounds: %d "
         ">= %d in shape %s.",
@@ -2582,7 +2633,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         operand_shape.dimensions(i) != 1) {
       return InvalidArgument(
           "Input dimension should be either 1 or equal to the output dimension "
-          "it's broadcasting into; the %lldth operand dimension is %lld, the "
+          "it is broadcasting into; the %lldth operand dimension is %lld, the "
           "%lldth output dimension is %lld.",
           i, operand_shape.dimensions(i), broadcast_dimensions[i],
           output_shape.dimensions(broadcast_dimensions[i]));
@@ -2650,11 +2701,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     const Shape& operand, absl::Span<const int64> dimensions) {
   TF_RETURN_IF_ERROR(ExpectArray(operand, "transpose"));
 
-  std::vector<int64> indices(operand.rank());
-  std::iota(indices.begin(), indices.end(), 0);
-  if (dimensions.size() != operand.rank() ||
-      !std::is_permutation(dimensions.begin(), dimensions.end(),
-                           indices.begin())) {
+  if (!IsPermutation(dimensions, operand.rank())) {
     return InvalidArgument(
         "Transpose dimensions [%s] are not a permutation of the operand "
         "dimensions (operand shape is %s).",
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 7d39ef38e05abf0a81683c1fb0f3999908b27d23..acb071ab18824472153fc608b812ad2d9c52651e 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -116,6 +116,10 @@ class ShapeInference {
   static StatusOr<Shape> InferFftShape(const Shape& in, FftType fft_type,
                                        absl::Span<const int64> fft_length);
 
+  // Infers the shape produced by the given triangular solve operation.
+  static StatusOr<Shape> InferTriangularSolveShape(
+      const Shape& a, const Shape& b, const TriangularSolveOptions& options);
+
   // Infers the shape produced by a cross replica sum with the given operand
   // shapes.
   static StatusOr<Shape> InferAllReduceShape(
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 26120a06b823c9fddf378991cec434a880fb888d..f400ef51f07b006eef2ea674feff1dd72f836e77 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -252,7 +252,7 @@ TEST_F(ShapeInferenceTest, ClampBadShapes) {
 
 TEST_F(ShapeInferenceTest, Complex) {
   auto complex_shape = [&](const Shape& lhs, const Shape& rhs,
-                           const absl::Span<const int64>& bcast) {
+                           absl::Span<const int64> bcast) {
     return ShapeInference::InferBinaryOpShape(HloOpcode::kComplex, lhs, rhs,
                                               bcast);
   };
@@ -896,6 +896,20 @@ TEST_F(ShapeInferenceTest, InferConstIndexShape) {
   ASSERT_TRUE(ShapeUtil::Equal(s32_, inferred1_status.ValueOrDie()));
 }
 
+TEST_F(ShapeInferenceTest, InferTupleElementShapeOutOfBound) {
+  Shape tuple_shape = ShapeUtil::MakeTupleShape({f32_, s32_});
+  auto inferredNegative_status =
+      ShapeInference::InferGetTupleElementShape(tuple_shape, -1);
+  auto inferred2_status =
+      ShapeInference::InferGetTupleElementShape(tuple_shape, 2);
+  ASSERT_FALSE(inferredNegative_status.ok());
+  ASSERT_FALSE(inferred2_status.ok());
+  EXPECT_THAT(inferredNegative_status.status().error_message(),
+              HasSubstr("attempt to index out of tuple bounds"));
+  EXPECT_THAT(inferred2_status.status().error_message(),
+              HasSubstr("attempt to index out of tuple bounds"));
+}
+
 TEST_F(ShapeInferenceTest, InferPowShape) {
   auto ten_floats = ShapeUtil::MakeShape(F32, {10});
   auto inferred_status = ShapeInference::InferBinaryOpShape(
@@ -1467,6 +1481,14 @@ TEST_F(ShapeInferenceTest, Pad) {
   Shape inferred_shape = inferred_status.ValueOrDie();
   ASSERT_TRUE(
       ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {39, 31}), inferred_shape));
+
+  dimension1->set_edge_padding_low(-20);
+  dimension1->set_edge_padding_high(-10);
+  auto negative_dimension_size = ShapeInference::InferPadShape(
+      input_shape, padding_value_shape, padding_config);
+  ASSERT_FALSE(negative_dimension_size.ok());
+  ASSERT_THAT(negative_dimension_size.status().error_message(),
+              HasSubstr("negative size for dimension 1"));
 }
 
 TEST_F(ShapeInferenceTest, Reverse) {
@@ -1550,6 +1572,16 @@ TEST_F(ShapeInferenceTest, Transpose) {
                                     ShapeUtil::MakeShape(F32, {3, 4, 5, 2})));
 }
 
+TEST_F(ShapeInferenceTest, Rank1Transpose) {
+  Shape a_shape = ShapeUtil::MakeShape(F32, {5});
+  auto inferred_shape_and_status =
+      ShapeInference::InferTransposeShape(a_shape, {0});
+  EXPECT_IS_OK(inferred_shape_and_status);
+  Shape inferred_shape = inferred_shape_and_status.ValueOrDie();
+  EXPECT_TRUE(
+      ShapeUtil::Compatible(inferred_shape, ShapeUtil::MakeShape(F32, {5})));
+}
+
 TEST_F(ShapeInferenceTest, Conditional) {
   auto inferred_status0 = ShapeInference::InferConditionalShape(
       pred_, vector_32_, vector_64_,
diff --git a/tensorflow/compiler/xla/service/sort_simplifier.cc b/tensorflow/compiler/xla/service/sort_simplifier.cc
index 4a00e8d7b227f14d462ca53f695189f3f48754ee..122366a0f322a66963b364e1b19629cbd2d9aabe 100644
--- a/tensorflow/compiler/xla/service/sort_simplifier.cc
+++ b/tensorflow/compiler/xla/service/sort_simplifier.cc
@@ -14,12 +14,15 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/sort_simplifier.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/statusor.h"
+
+#include <memory>
+#include <vector>
 
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/statusor.h"
 
 namespace xla {
 namespace {
@@ -39,8 +42,7 @@ StatusOr<bool> RemoveUnusedOperandFromSort(HloInstruction* sort) {
     return false;
   }
 
-  // Index 0 is the sorting key used by the sort HLO itself.
-  absl::flat_hash_set<int64> used_indices{0};
+  absl::flat_hash_set<int64> used_indices;
   for (const HloInstruction* user : sort->users()) {
     if (user->opcode() != HloOpcode::kGetTupleElement) {
       // Can't analyse users other then get-tuple-element.
@@ -49,15 +51,25 @@ StatusOr<bool> RemoveUnusedOperandFromSort(HloInstruction* sort) {
     used_indices.insert(user->tuple_index());
   }
 
+  // Also note which parameters are used by the comparator computation.
+  auto comparator = sort->to_apply();
+  for (int64 i = 0; i < sort->operand_count() * 2; ++i) {
+    if (comparator->parameter_instruction(i)->user_count() > 0) {
+      // operand i corresponds to parameters 2 * i and 2 * i + 1 of the
+      // computation.
+      used_indices.insert(i / 2);
+    }
+  }
+
   if (used_indices.size() == sort->operand_count()) {
     // All operands are used.
     return false;
   }
 
-  std::vector<HloInstruction*> operands{sort->mutable_operand(0)};
-  std::vector<Shape> new_shapes{sort->operand(0)->shape()};
-  for (int64 i = 1; i < sort->operand_count(); ++i) {
-    if (used_indices.count(i)) {
+  std::vector<HloInstruction*> operands;
+  std::vector<Shape> new_shapes;
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    if (used_indices.contains(i)) {
       operands.push_back(sort->mutable_operand(i));
       new_shapes.push_back(sort->operand(i)->shape());
     }
@@ -68,6 +80,32 @@ StatusOr<bool> RemoveUnusedOperandFromSort(HloInstruction* sort) {
                              : ShapeUtil::MakeTupleShape(new_shapes);
   HloInstruction* new_sort = computation->AddInstruction(
       sort->CloneWithNewOperands(new_sort_shape, operands));
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      replacements;
+  int64 parameter_number = 0;
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    auto* old_lhs_parameter = comparator->parameter_instruction(i * 2);
+    auto* old_rhs_parameter = comparator->parameter_instruction(i * 2 + 1);
+    if (used_indices.contains(i)) {
+      Shape scalar_shape =
+          ShapeUtil::MakeShape(sort->operand(i)->shape().element_type(), {});
+      replacements[old_lhs_parameter] = HloInstruction::CreateParameter(
+          parameter_number, scalar_shape,
+          absl::StrCat("p.", parameter_number / 2, ".lhs"));
+      ++parameter_number;
+      replacements[old_rhs_parameter] = HloInstruction::CreateParameter(
+          parameter_number, scalar_shape,
+          absl::StrCat("p.", parameter_number / 2, ".rhs"));
+      ++parameter_number;
+    } else {
+      replacements[old_lhs_parameter] = nullptr;
+      replacements[old_rhs_parameter] = nullptr;
+    }
+  }
+  HloModule* module = sort->GetModule();
+  HloComputation* new_compare = module->AddEmbeddedComputation(
+      comparator->CloneWithReplacements(std::move(replacements)));
+  new_sort->set_to_apply(new_compare);
 
   // Map from original get-tuple-element tuple index to new HLO instruction
   absl::flat_hash_map<int64, HloInstruction*> result_map;
@@ -83,7 +121,8 @@ StatusOr<bool> RemoveUnusedOperandFromSort(HloInstruction* sort) {
       }
     }
   } else {
-    result_map[0] = new_sort;
+    CHECK_EQ(used_indices.size(), 1);
+    result_map[*used_indices.begin()] = new_sort;
   }
   std::vector<HloInstruction*> users(sort->users().begin(),
                                      sort->users().end());
diff --git a/tensorflow/compiler/xla/service/sort_simplifier_test.cc b/tensorflow/compiler/xla/service/sort_simplifier_test.cc
index cd05fcf830d32e8bac4f8b260d3dd143ab98ad7b..696ac1b465848894f8dcb1c88bc48c6a5b268ef4 100644
--- a/tensorflow/compiler/xla/service/sort_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/sort_simplifier_test.cc
@@ -34,13 +34,21 @@ TEST_F(SortSimplifierTest, RemoveUnusedSortOperandArrayResult) {
   const char* hlo_string = R"(
    HloModule permutation_sort
 
-    ENTRY sort_computation {
-      keys = f32[64,8732]{1,0} parameter(0)
-      values = s32[64,8732]{1,0} parameter(1)
-      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
-        dimensions={1}
-      ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
-    })";
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} parameter(1)
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
@@ -58,17 +66,27 @@ TEST_F(SortSimplifierTest, RemoveUnusedSortOperandTuple) {
   const char* hlo_string = R"(
    HloModule permutation_sort
 
-    ENTRY sort_computation {
-      keys = f32[64,87] parameter(0)
-      values.0 = s32[64,87] parameter(1)
-      values.1 = u32[64,87] parameter(2)
-      sort = (f32[64,87], s32[64,87], u32[64,87]) sort(
-          keys, values.0, values.1),
-        dimensions={1}
-      gte.0 = f32[64,87] get-tuple-element(sort), index=0
-      gte.1 = u32[64,87] get-tuple-element(sort), index=2
-      ROOT tuple = (f32[64,87], u32[64,87]) tuple(gte.0, gte.1)
-    })";
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     p.2.lhs = u32[] parameter(4)
+     p.2.rhs = u32[] parameter(5)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,87] parameter(0)
+     values.0 = s32[64,87] parameter(1)
+     values.1 = u32[64,87] parameter(2)
+     sort = (f32[64,87], s32[64,87], u32[64,87]) sort(
+         keys, values.0, values.1),
+       dimensions={1}, to_apply=compare
+     gte.0 = f32[64,87] get-tuple-element(sort), index=0
+     gte.1 = u32[64,87] get-tuple-element(sort), index=2
+     ROOT tuple = (f32[64,87], u32[64,87]) tuple(gte.0, gte.1)
+   })";
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
@@ -86,17 +104,57 @@ TEST_F(SortSimplifierTest, DontRemoveUnusedSortKey) {
   const char* hlo_string = R"(
    HloModule permutation_sort
 
-    ENTRY sort_computation {
-      keys = f32[64,8732]{1,0} parameter(0)
-      values = s32[64,8732]{1,0} parameter(1)
-      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}
-      ROOT gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
-    })";
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} parameter(1)
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}, to_apply=compare
+     ROOT gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
+   })";
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
   SortSimplifier simplifier;
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 }
+
+TEST_F(SortSimplifierTest, RemoveUnusedFirstOperand) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] less-than(p.1.lhs, p.1.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} parameter(1)
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare
+     ROOT gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  SortSimplifier simplifier;
+  uint64 num_executions = 0;
+  do {
+    num_executions++;
+  } while (simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_EQ(num_executions, 2);
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Sort(m::Parameter(1))));
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/stable_sort_expander.cc b/tensorflow/compiler/xla/service/stable_sort_expander.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1aa7e5fe7c0d57ee3303480e4727c456727f64c8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stable_sort_expander.cc
@@ -0,0 +1,204 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/stable_sort_expander.h"
+
+#include <limits>
+#include <memory>
+#include <vector>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Looks for a iota operand that can be used as tie breaker in the computation.
+// If no matching iota operand is found, a iota operand is added to Sort. The
+// comparison computation is adjusted to break ties using the values from the
+// iota operand.
+StatusOr<HloInstruction*> StableSortExpander::ExpandInstruction(
+    HloInstruction* instruction) {
+  auto* sort = Cast<HloSortInstruction>(instruction);
+  HloComputation* computation = sort->parent();
+
+  HloInstruction* expanded_sort = nullptr;
+  absl::flat_hash_set<int64> used_indices;
+  int64 iota_index = -1;
+  for (const HloInstruction* operand : sort->operands()) {
+    // We can only use the iota operand if it has an iota dimension which is the
+    // same as the dimension to sort. Also it should have an integral type that
+    // is large enough for the number of elements in the sort dimension. For
+    // now, we only allow S32, because we expect to find a S32 iota operand for
+    // all Sort ops which are created by TopK.
+    // TODO(b/122298745): Also support other types.
+    if (operand->opcode() == HloOpcode::kIota &&
+        Cast<HloIotaInstruction>(operand)->iota_dimension() ==
+            sort->sort_dimension() &&
+        operand->shape().element_type() == S32) {
+      iota_index = sort->operand_index(operand);
+      break;
+    }
+  }
+
+  // If there is currently no iota operand which we could use for making the
+  // sort stable, we will have to add a new such operand.
+  if (iota_index == -1) {
+    Shape iota_shape = sort->operand(0)->shape();
+    // We might need to use S64 if the number of elements in the sort dimension
+    // is bigger than 2^31 - 1.
+    // TODO(b/122298745): Handle Sort ops where S32 is too small for the number
+    // of elements in the sort dimension.
+    if (iota_shape.dimensions(sort->sort_dimension()) >
+        std::numeric_limits<int32>::max()) {
+      return Unimplemented(
+          "Stable sorting of more than 2^31-1 elements is not implemented");
+    }
+    iota_shape.set_element_type(S32);
+    auto iota = computation->AddInstruction(
+        HloInstruction::CreateIota(iota_shape, sort->sort_dimension()));
+
+    // Create a new comparator.
+    auto comparator = sort->to_apply();
+    absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+        replacements;
+    std::vector<std::unique_ptr<HloInstruction>> extra_parameters;
+    std::vector<HloInstruction*> extra_parameter_ptrs;
+    Shape scalar_shape = ShapeUtil::MakeShape(S32, {});
+    extra_parameters.push_back(HloInstruction::CreateParameter(
+        sort->operand_count() * 2, scalar_shape,
+        absl::StrCat("p.", sort->operand_count(), ".lhs")));
+    extra_parameter_ptrs.push_back(extra_parameters.back().get());
+    extra_parameters.push_back(HloInstruction::CreateParameter(
+        sort->operand_count() * 2 + 1, scalar_shape,
+        absl::StrCat("p.", sort->operand_count(), ".rhs")));
+    extra_parameter_ptrs.push_back(extra_parameters.back().get());
+    sort->set_to_apply(sort->GetModule()->AddEmbeddedComputation(
+        comparator->CloneWithReplacements(std::move(replacements),
+                                          extra_parameter_ptrs)));
+
+    // Replace the original sort op.
+    std::vector<HloInstruction*> new_operands(sort->operands().begin(),
+                                              sort->operands().end());
+    new_operands.push_back(iota);
+    std::vector<Shape> new_shapes = sort->operand_count() == 1
+                                        ? std::vector<Shape>{sort->shape()}
+                                        : sort->shape().tuple_shapes();
+    new_shapes.push_back(iota_shape);
+    Shape new_sort_shape = ShapeUtil::MakeTupleShape(new_shapes);
+    HloInstruction* new_sort = computation->AddInstruction(
+        sort->CloneWithNewOperands(new_sort_shape, new_operands));
+
+    // Add a "wrapper" around the new sort op to make sure we have the same
+    // shape as before. For the rank 1 case, we only need a GetTupleElement,
+    // otherwise we create a Tuple consisting of GetTupleElements of the new
+    // sort.
+    std::vector<HloInstruction*> tuple_elements;
+    tuple_elements.reserve(sort->operand_count());
+    for (int64 i = 0; i < sort->operand_count(); ++i) {
+      tuple_elements.push_back(
+          computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+              sort->operand(i)->shape(), new_sort, i)));
+    }
+    expanded_sort = tuple_elements[0];
+    if (tuple_elements.size() > 1) {
+      expanded_sort = computation->AddInstruction(
+          HloInstruction::CreateTuple(tuple_elements));
+    }
+    sort = Cast<HloSortInstruction>(new_sort);
+    iota_index = sort->operand_count() - 1;
+  }
+
+  // Modify the computation to break ties using the iota operand.
+  auto comparator = sort->to_apply();
+  std::vector<HloInstruction*> instructions_postorder =
+      comparator->MakeInstructionPostOrder();
+  absl::flat_hash_map<HloInstruction*, HloInstruction*> replacements;
+  // Look up instr in the replacements map, and return either the replacement,
+  // or instr, if the replacement isn't present.
+  auto replace = [&](HloInstruction* instr) {
+    auto it = replacements.find(instr);
+    if (it == replacements.end()) {
+      return instr;
+    }
+    return it->second;
+  };
+  HloInstruction* old_root = comparator->root_instruction();
+  // The comparison computation gets 2 * n parameters (n being the number of
+  // operands of Sort), where parameters 2 * i and 2 * i + 1 correspond to two
+  // different scalars of operand i of Sort which are to be compared. The
+  // comparison computation should induce a strict weak order, so if
+  // to_apply(p1.lhs, p1.rhs, ..., pn.lhs, pn.rhs) is equal to
+  // to_apply(p1.rhs, p1.lhs, ..., pn.rhs, pn.lhs), we can conclude that the
+  // values to be compared are equivalent, and perform a tie-breaker comparison.
+  //
+  // We clone each instruction with at least one operand, but use as new
+  // operands of the instruction the replacements of the original operands.
+  // Parameter 2 * i is replaced by parameter 2 * i + 1 and vice versa. This
+  // should make sure that the cloned root instruction gives the result of the
+  // comparison computation when being called with each scalar pair reversed.
+  // parameters corresponding to the iota operand.
+  for (int64 i = 0; i < comparator->num_parameters(); ++i) {
+    replacements[comparator->parameter_instruction(i)] =
+        comparator->parameter_instruction(i ^ 1);
+  }
+  HloInstruction* cloned_root = nullptr;
+  for (HloInstruction* inst : instructions_postorder) {
+    if (inst->operand_count() == 0) {
+      continue;
+    }
+    std::vector<HloInstruction*> new_operands;
+    new_operands.reserve(inst->operand_count());
+    for (HloInstruction* operand : inst->operands()) {
+      new_operands.push_back(replace(operand));
+    }
+    auto new_instruction =
+        inst->CloneWithNewOperands(inst->shape(), new_operands);
+    replacements[inst] = new_instruction.get();
+    if (inst == old_root) {
+      cloned_root = new_instruction.get();
+    }
+    comparator->AddInstruction(std::move(new_instruction));
+  }
+  CHECK_NE(cloned_root, nullptr);
+  Shape scalar_pred = ShapeUtil::MakeShape(PRED, {});
+  HloInstruction* same =
+      comparator->AddInstruction(HloInstruction::CreateBinary(
+          scalar_pred, HloOpcode::kEq, old_root, cloned_root));
+  HloInstruction* tie_breaker =
+      comparator->AddInstruction(HloInstruction::CreateBinary(
+          scalar_pred, HloOpcode::kLt,
+          comparator->parameter_instruction(2 * iota_index),
+          comparator->parameter_instruction(2 * iota_index + 1)));
+  HloInstruction* new_root =
+      comparator->AddInstruction(HloInstruction::CreateTernary(
+          ShapeUtil::MakeShape(PRED, {}), HloOpcode::kSelect, same, tie_breaker,
+          old_root));
+  comparator->set_root_instruction(new_root);
+
+  return expanded_sort;
+}
+
+bool StableSortExpander::InstructionMatchesPattern(
+    HloInstruction* instruction) {
+  return instruction->opcode() == HloOpcode::kSort &&
+         Cast<HloSortInstruction>(instruction)->is_stable();
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/stable_sort_expander.h b/tensorflow/compiler/xla/service/stable_sort_expander.h
new file mode 100644
index 0000000000000000000000000000000000000000..31b6fd92d25370218017c58072f1aa5e64df00c3
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stable_sort_expander.h
@@ -0,0 +1,42 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_STABLE_SORT_EXPANDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_STABLE_SORT_EXPANDER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// HLO pass which expands Sort ops that have the is_stable field set to true
+// into equivalent Sort ops which guarantee stable sorting without relying on
+// the is_stable field.
+class StableSortExpander : public OpExpanderPass {
+ public:
+  absl::string_view name() const override { return "stable-sort-expander"; }
+
+ private:
+  bool InstructionMatchesPattern(HloInstruction* instruction) override;
+  StatusOr<HloInstruction*> ExpandInstruction(
+      HloInstruction* instruction) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_STABLE_SORT_EXPANDER_H_
diff --git a/tensorflow/compiler/xla/service/stable_sort_expander_test.cc b/tensorflow/compiler/xla/service/stable_sort_expander_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a62d953e6e8fa2f3c1ecfd9e4a7900eee74f9dca
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stable_sort_expander_test.cc
@@ -0,0 +1,358 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/stable_sort_expander.h"
+
+#include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+namespace m = match;
+
+using StableSortExpanderTest = HloTestBase;
+
+// Checks whether 'a' and 'b' are roots of equivalent computations, except that
+// parameters 2 * i and 2 * i + 1 are switched.
+bool IsSameComputationExceptParams(const HloInstruction* a,
+                                   const HloInstruction* b) {
+  if (a->opcode() != b->opcode() || a->operand_count() != b->operand_count()) {
+    return false;
+  }
+  if (a->opcode() == HloOpcode::kParameter) {
+    // Check that parameters were switched.
+    return a->parameter_number() == (b->parameter_number() ^ 1);
+  }
+  // If the operation has no operands, it should actually be the same.
+  if (a->operand_count() == 0) {
+    return a == b;
+  }
+  // Otherwise recursively compare all operands.
+  for (int64 i = 0; i < a->operand_count(); ++i) {
+    if (!IsSameComputationExceptParams(a->operand(i), b->operand(i))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Check that the comparison computation has been modified to add a tie breaker
+// using 'iota_parameter'.
+void CheckComputationHasTieBreaker(const HloInstruction* root,
+                                   int64 iota_parameter) {
+  // With the tie breaker, the root instruction should be
+  //   Select(Eq(Comp(), CompReverse()), Lt(), Comp())
+  // with Comp() being the original comparison function, and CompReverse() being
+  // the copied comparison function where the parameters are reversed. Lt() is
+  // the tie breaker comparison using the Iota operand.
+  ASSERT_EQ(root->opcode(), HloOpcode::kSelect);
+  ASSERT_EQ(root->operand(0)->opcode(), HloOpcode::kEq);
+
+  // Check that the tie breaker instruction is correct.
+  EXPECT_THAT(root->operand(1),
+              GmockMatch(m::Lt(m::Parameter(iota_parameter * 2),
+                               m::Parameter(iota_parameter * 2 + 1))));
+  EXPECT_EQ(root->operand(2), root->operand(0)->operand(0));
+
+  // Check that Comp() and CompReverse() are equivalent except that
+  // CompReverse() has reversed parameters.
+  EXPECT_TRUE(IsSameComputationExceptParams(root->operand(0)->operand(0),
+                                            root->operand(0)->operand(1)));
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortReuseIotaOperand) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} iota(), iota_dimension=1
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(), /*iota_parameter=*/1);
+}
+
+TEST_F(StableSortExpanderTest,
+       StabilizeSortReuseIotaOperandComplicatedComparison) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     max = u32[] constant(2147483647)
+     zero = s32[] constant(0)
+     lhs.signed = s32[] bitcast-convert(p.0.lhs)
+     lhs.unsigned = u32[] bitcast-convert(p.0.lhs)
+     lhs.flipped = u32[] subtract(max, lhs.unsigned)
+     lhs.flipped.signed = s32[] bitcast-convert(lhs.flipped)
+     lhs.is_negative = pred[] less-than(lhs.flipped.signed, zero)
+     lhs.converted = s32[] select(lhs.is_negative, lhs.flipped.signed, lhs.signed)
+     rhs.signed = s32[] bitcast-convert(p.0.rhs)
+     rhs.unsigned = u32[] bitcast-convert(p.0.rhs)
+     rhs.flipped = u32[] subtract(max, rhs.unsigned)
+     rhs.flipped.signed = s32[] bitcast-convert(rhs.flipped)
+     rhs.is_negative = pred[] less-than(rhs.flipped.signed, zero)
+     rhs.converted = s32[] select(rhs.is_negative, rhs.flipped.signed, rhs.signed)
+     ROOT lt = pred[] less-than(lhs.converted, rhs.converted)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} iota(), iota_dimension=1
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(), /*iota_parameter=*/1);
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortAddIotaOperandAndChangeRoot) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} parameter(1)
+     ROOT sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(
+      root, GmockMatch(m::Tuple(
+                m::GetTupleElement(
+                    m::Sort(m::Parameter(0), m::Parameter(1), m::Iota()), 0),
+                m::GetTupleElement(
+                    m::Sort(m::Parameter(0), m::Parameter(1), m::Iota()), 1))));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->operand(0)->to_apply()->root_instruction(),
+      /*iota_parameter=*/2);
+}
+
+TEST_F(StableSortExpanderTest, HonorIsStableFlag) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} iota(), iota_dimension=1
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=false
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_FALSE(stabilizer.Run(module.get()).ValueOrDie());
+}
+
+TEST_F(StableSortExpanderTest,
+       StabilizeSortDontReuseIotaOperandWrongDimension) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} iota(), iota_dimension=0
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  // Simplify away the "wrapper" tuple around the new sort.
+  AlgebraicSimplifier simplifier(AlgebraicSimplifierOptions(
+      [](const Shape&, const Shape&) { return false; }));
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota(), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(),
+      /*iota_parameter=*/2);
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortDontReuseIotaOperandWrongType) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = f32[] parameter(2)
+     p.1.rhs = f32[] parameter(3)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = f32[64,8732]{1,0} iota(), iota_dimension=1
+     sort = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  // Simplify away the "wrapper" tuple around the new sort.
+  AlgebraicSimplifier simplifier(AlgebraicSimplifierOptions(
+      [](const Shape&, const Shape&) { return false; }));
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota(), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(),
+      /*iota_parameter=*/2);
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortR1) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = s32[] parameter(0)
+     p.0.rhs = s32[] parameter(1)
+     mask = s32[] constant(65535)
+     lhs = s32[] and(p.0.lhs, mask)
+     rhs = s32[] and(p.0.rhs, mask)
+     ROOT lt = pred[] less-than(lhs, rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = s32[64,8732]{1,0} parameter(0)
+     ROOT sort = s32[64,8732]{1,0} sort(keys), dimensions={0}, to_apply=compare,
+       is_stable=true
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(), /*iota_parameter=*/1);
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortR1NoRoot) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = s32[] parameter(0)
+     p.0.rhs = s32[] parameter(1)
+     mask = s32[] constant(65535)
+     lhs = s32[] and(p.0.lhs, mask)
+     rhs = s32[] and(p.0.rhs, mask)
+     ROOT lt = pred[] less-than(lhs, rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = s32[64,8732]{1,0} parameter(0)
+     sort = s32[64,8732]{1,0} sort(keys), dimensions={0}, to_apply=compare,
+       is_stable=true
+     ROOT neg = s32[64,8732]{1,0} negate(sort)
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Negate(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota()), 0))));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->operand(0)->to_apply()->root_instruction(),
+      /*iota_parameter=*/1);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/triangular_solve.cc b/tensorflow/compiler/xla/service/triangular_solve_expander.cc
similarity index 82%
rename from tensorflow/compiler/xla/client/lib/triangular_solve.cc
rename to tensorflow/compiler/xla/service/triangular_solve_expander.cc
index ba7fde118fde990fbb4aa9a34dd0f0e67ff5a93b..b26cdc1db59b30d82b9ac58a8a2ac762220086be 100644
--- a/tensorflow/compiler/xla/client/lib/triangular_solve.cc
+++ b/tensorflow/compiler/xla/service/triangular_solve_expander.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
+#include "tensorflow/compiler/xla/service/triangular_solve_expander.h"
 
 #include <memory>
 #include <vector>
@@ -33,6 +33,8 @@ limitations under the License.
 
 namespace xla {
 
+namespace {
+
 // Get the diagonal blocks of the coefficient matrix
 XlaOp DiagonalBlocks(XlaOp a, int64 block_size) {
   XlaBuilder* builder = a.builder();
@@ -345,9 +347,10 @@ XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks,
   });
 }
 
-XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
-                      bool transpose_a, bool conjugate_a, int64 block_size,
-                      PrecisionConfig::Precision precision) {
+XlaOp BuildTriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                           bool transpose_a, bool conjugate_a,
+                           bool unit_diagonal, int64 block_size,
+                           PrecisionConfig::Precision precision) {
   XlaBuilder* builder = a.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
@@ -406,6 +409,20 @@ XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
       return b;
     }
 
+    // TODO(phawkins): consider pushing triangle masking into
+    // InvertDiagonalBlocks.
+    if (unit_diagonal) {
+      // Mask everything but the subdiagonal/superdiagonal elements.
+      a = lower ? Select(TriangleMask(a, -1), a, ZerosLike(a))
+                : Select(TriangleMask(a, 0), ZerosLike(a), a);
+      int64 k = ShapeUtil::GetDimension(a_shape, -1);
+      a = xla::Add(a, IdentityMatrix(builder, a_shape.element_type(), k, k),
+                   /*broadcast_dimensions=*/{ndims - 2, ndims - 1});
+    } else {
+      // Mask off the ignored elements of the triangular matrix a.
+      a = Triangle(a, lower);
+    }
+
     // We find the diagonal blocks of the coefficient matrix
     auto diag_blocks = DiagonalBlocks(a, block_size);
 
@@ -413,11 +430,6 @@ XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
     auto inv_diag_blocks = InvertDiagonalBlocks(diag_blocks, lower, transpose_a,
                                                 conjugate_a, precision);
 
-    // Mask off the ignored elements of the triangular matrix a.
-    // TODO(phawkins): it would probably be preferable to perform this masking
-    // block by block inside SolveWithInvertedDiagonalBlocks.
-    a = Triangle(a, lower);
-
     // We now find the solution using GEMMs
     auto x =
         SolveWithInvertedDiagonalBlocks(a, b, inv_diag_blocks, left_side, lower,
@@ -427,4 +439,66 @@ XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
   });
 }
 
+}  // namespace
+
+bool TriangularSolveExpander::InstructionMatchesPattern(
+    HloInstruction* instruction) {
+  return instruction->opcode() == HloOpcode::kTriangularSolve;
+}
+
+StatusOr<HloInstruction*> TriangularSolveExpander::ExpandInstruction(
+    HloInstruction* instruction) {
+  const TriangularSolveOptions& options =
+      instruction->triangular_solve_options();
+  const string name = absl::StrFormat(
+      "xla.triangular_solve_%s_%s_%s_%s_%s_%s",
+      instruction->operand(0)->shape().ToString(),
+      instruction->operand(1)->shape().ToString(),
+      options.left_side() ? "left" : "right",
+      options.lower() ? "lower" : "upper",
+      TriangularSolveOptions_Transpose_Name(options.transpose_a()),
+      options.unit_diagonal() ? "unit" : "nonunit");
+
+  HloModule* module = instruction->parent()->parent();
+
+  HloComputation*& computation =
+      computation_cache_.emplace(name, nullptr).first->second;
+  if (!computation) {
+    // Builds a new expansion.
+    //
+    // We do something unusual here: we build the computation using the
+    // XlaBuilder API, which is nominally an XLA client API. We do this because
+    // the external APIs for building complicated computations (XlaBuilder)
+    // are much more ergonomic than the internal ones. As it turns out,
+    // XlaBuilder isn't really a client API—what it does is build a
+    // HloModuleProto protocol buffer, that we can then deserialize and clone
+    // into our HloModule. Ideally we would avoid the protocol buffer step;
+    // that is left as an exercise for future work.
+    XlaBuilder builder(name);
+    XlaOp a = Parameter(&builder, 0, instruction->operand(0)->shape(), "a");
+    XlaOp b = Parameter(&builder, 1, instruction->operand(1)->shape(), "b");
+    bool transpose_a =
+        options.transpose_a() != TriangularSolveOptions::NO_TRANSPOSE;
+    bool conjugate_a = options.transpose_a() == TriangularSolveOptions::ADJOINT;
+
+    BuildTriangularSolve(a, b, options.left_side(), options.lower(),
+                         transpose_a, conjugate_a, options.unit_diagonal(),
+                         /*block_size=*/128,
+                         /*precision=*/PrecisionConfig::HIGHEST);
+    TF_ASSIGN_OR_RETURN(XlaComputation xla_computation, builder.Build());
+
+    TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
+                        xla_computation.GetProgramShape());
+    HloModuleConfig config(program_shape);
+    TF_ASSIGN_OR_RETURN(auto new_module, HloModule::CreateFromProto(
+                                             xla_computation.proto(), config));
+    HloCloneContext context(module);
+    computation =
+        module->DeepCloneComputation(new_module->entry_computation(), &context);
+  }
+
+  return instruction->parent()->AddInstruction(HloInstruction::CreateCall(
+      instruction->shape(), instruction->operands(), computation));
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/triangular_solve_expander.h b/tensorflow/compiler/xla/service/triangular_solve_expander.h
new file mode 100644
index 0000000000000000000000000000000000000000..be2374ef8c86254d8db5ac1acac385aa0de7d3a5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/triangular_solve_expander.h
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_TRIANGULAR_SOLVE_EXPANDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_TRIANGULAR_SOLVE_EXPANDER_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
+
+namespace xla {
+
+class TriangularSolveExpander : public OpExpanderPass {
+ public:
+  absl::string_view name() const override {
+    return "triangular_solve_expander";
+  }
+
+ protected:
+  bool InstructionMatchesPattern(HloInstruction* instruction) override;
+
+  StatusOr<HloInstruction*> ExpandInstruction(
+      HloInstruction* instruction) override;
+
+ private:
+  // Mapping from op signatures to existing computations.
+  absl::flat_hash_map<string, HloComputation*> computation_cache_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_TRIANGULAR_SOLVE_EXPANDER_H_
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 5e505aaf02f157d0cba9dff42b1a9b89a6691504..cc82e9bb0287b5a586fb21fee35d3124a6d6f121 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -699,6 +699,8 @@ bool TuplePointsToAnalysis::HasUniqueFusedUseOfOperandAt(
 // (4) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index
 //     0.
 // (5) The 'user' of 'operand' is Sort, and it is the only user.
+// (6) The 'user' of 'operand' is TriangularSolve, it is the second operand,
+//     and it is the only user.
 //
 // (2) and (3) can only be determined if points-to analysis is available.
 bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
@@ -779,6 +781,14 @@ bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
     std::vector<int64> operand_indices = user->OperandIndices(operand);
     return operand_indices.size() == 1 && user_index[0] == operand_indices[0];
   }
+  if (user->opcode() == HloOpcode::kTriangularSolve) {
+    // Only valid if there are no other users.
+    if (operand->users().size() != 1) {
+      return false;
+    }
+    std::vector<int64> operand_indices = user->OperandIndices(operand);
+    return operand_indices.size() == 1 && operand_indices[0] == 1;
+  }
   if (user->opcode() == HloOpcode::kCall) {
     // TODO(b/62548313): Remove when buffer assignment is module scoped and
     // does not assign buffers to calls.
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index fd5759e44230db8223822d6ae0f511027f73d8f9..6f61fc44166298e86a88dfc4f0ce8526d65ffd02 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/instruction_fusion.h"
@@ -1065,14 +1066,17 @@ TEST_F(CanShareOperandBufferWithUserTest, ScatterCanShare) {
 
 TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
   auto builder = HloComputation::Builder(TestName());
+  module_ = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
   auto keys = builder.AddInstruction(
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
-  auto sort =
-      builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort, MakeSortHlo(keys_shape, {keys}, 0, /*is_stable=*/false,
+                              &builder, module_.get()));
 
-  BuildModuleAndRunAnalysis(builder.Build());
+  computation_ = module_->AddEntryComputation(builder.Build());
+  RunAnalysis();
 
   EXPECT_TRUE(
       points_to_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {}));
@@ -1080,6 +1084,7 @@ TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
 
 TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
   auto builder = HloComputation::Builder(TestName());
+  module_ = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
   Shape values_shape = ShapeUtil::MakeShape(F32, {8});
@@ -1087,11 +1092,14 @@ TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
   auto values = builder.AddInstruction(
       HloInstruction::CreateParameter(1, values_shape, "values"));
-  auto sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys,
-      {values}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort,
+      MakeSortHlo(ShapeUtil::MakeTupleShape({keys_shape, values_shape}),
+                  {keys, values}, 0, /*is_stable=*/false, &builder,
+                  module_.get()));
 
-  BuildModuleAndRunAnalysis(builder.Build());
+  computation_ = module_->AddEntryComputation(builder.Build());
+  RunAnalysis();
 
   // The buffer for the keys can be shared with the first tuple entry.
   EXPECT_TRUE(
diff --git a/tensorflow/compiler/xla/shape.cc b/tensorflow/compiler/xla/shape.cc
index a36d3547a0987422c2658b0f3046f7b1f83369c6..94854047e530babe2234381a615aeb805f0d5933 100644
--- a/tensorflow/compiler/xla/shape.cc
+++ b/tensorflow/compiler/xla/shape.cc
@@ -34,8 +34,12 @@ Shape::Shape(const ShapeProto& shape_proto) {
   // instead of a constructor.
   if (shape_proto.dimensions_size() !=
       shape_proto.is_dynamic_dimension_size()) {
-    LOG(ERROR) << "Malformed shape proto: number of is_dynamic_dimension "
-                  "fields does not match number of dimension fields";
+    if (shape_proto.is_dynamic_dimension_size() != 0) {
+      LOG(ERROR) << "Malformed shape proto: number of is_dynamic_dimension "
+                    "fields does not match number of dimension fields";
+    } else {
+      LOG(WARNING) << "Malformed shape proto: is_dynamic_dimension is empty";
+    }
   }
   int64 num_dynamic_dimension_fields = std::min(
       shape_proto.dimensions_size(), shape_proto.is_dynamic_dimension_size());
@@ -143,26 +147,15 @@ bool Shape::Equal::operator()(const Shape& lhs, const Shape& rhs) {
       return false;
     }
     if (LayoutUtil::IsDenseArray(lhs)) {
-      if (!absl::c_equal(LayoutUtil::MinorToMajor(lhs),
-                         LayoutUtil::MinorToMajor(rhs))) {
-        VLOG(3) << "CompareShapes: lhs layout != rhs layout";
-        return false;
-      }
-
-      const auto& lhs_tiles = lhs.layout().tiles();
-      const auto& rhs_tiles = rhs.layout().tiles();
-      if (lhs_tiles.size() != rhs_tiles.size()) {
-        return false;
+      Layout::Equal equal;
+      if (ignore_tiles_in_layout_) {
+        equal.IgnoreTiles();
       }
-      for (int64 i = 0; i < lhs_tiles.size(); i++) {
-        if (!absl::c_equal(lhs_tiles[i].dimensions(),
-                           rhs_tiles[i].dimensions())) {
-          return false;
-        }
+      if (ignore_element_size_in_layout_) {
+        equal.IgnoreElementSize();
       }
-
-      if (lhs.layout().element_size_in_bits() !=
-          rhs.layout().element_size_in_bits()) {
+      if (!equal(lhs.layout(), rhs.layout())) {
+        VLOG(3) << "CompareShapes: lhs layout != rhs layout";
         return false;
       }
     }
diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h
index 1d594904e0b9e6f1779674e75b41b7a597788bac..78cea83c6d71e5965f10cd3a917ffccabd630462 100644
--- a/tensorflow/compiler/xla/shape.h
+++ b/tensorflow/compiler/xla/shape.h
@@ -146,10 +146,10 @@ class Shape {
   //
   // Examples:
   //
-  // - Comparing two shapes ignoring they layout difference:
+  // - Comparing two shapes ignoring their layout difference:
   //   Equal().IgnoreLayout()(shape1, shape2);
   //
-  // - Comparing two shapes ignoring they layout and element type difference:
+  // - Comparing two shapes ignoring their layout and element type difference:
   //   Equal().IgnoreLayout().IgnoreElementType()(shape1, shape2);
   class Equal {
    public:
@@ -161,6 +161,14 @@ class Shape {
       ignore_layout_ = true;
       return *this;
     }
+    Equal& IgnoreTilesInLayout() {
+      ignore_tiles_in_layout_ = true;
+      return *this;
+    }
+    Equal& IgnoreElementSizeInLayout() {
+      ignore_element_size_in_layout_ = true;
+      return *this;
+    }
     Equal& IgnoreElementType() {
       ignore_element_type_ = true;
       return *this;
@@ -174,8 +182,10 @@ class Shape {
       return *this;
     }
 
-   public:
+   private:
     bool ignore_layout_ = false;
+    bool ignore_tiles_in_layout_ = false;
+    bool ignore_element_size_in_layout_ = false;
     bool ignore_element_type_ = false;
     bool ignore_fp_precision_ = false;
     bool ignore_dynamic_dimension_ = false;
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 1ada4bc0362f86bc770d4adfcd4d4b0ff7379c77..d045fc7a9e291258640eca75166e116cf7390a7b 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/inlined_vector.h"
 #include "absl/strings/ascii.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
@@ -89,7 +90,8 @@ namespace {
 // its Layout.
 StatusOr<Shape> MakeShapeWithLayoutInternal(
     PrimitiveType element_type, absl::Span<const int64> dimensions,
-    absl::Span<const int64> minor_to_major) {
+    absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles,
+    int64 element_size_in_bits) {
   if (dimensions.size() != minor_to_major.size()) {
     return InvalidArgument("Dimensions size is %ld, but layout size is %ld.",
                            dimensions.size(), minor_to_major.size());
@@ -100,11 +102,8 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
   }
   TF_ASSIGN_OR_RETURN(Shape shape,
                       ShapeUtil::MakeValidatedShape(element_type, dimensions));
-  auto min2maj = shape.mutable_layout()->mutable_minor_to_major();
-  min2maj->clear();
-  for (int64 value : minor_to_major) {
-    min2maj->push_back(value);
-  }
+  *shape.mutable_layout() =
+      LayoutUtil::MakeLayout(minor_to_major, tiles, element_size_in_bits);
   if (!shape.has_layout()) {
     return InvalidArgument("Shape has no layout.");
   }
@@ -189,8 +188,10 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 
 /* static */ Shape ShapeUtil::MakeShapeWithLayout(
     PrimitiveType element_type, absl::Span<const int64> dimensions,
-    absl::Span<const int64> minor_to_major) {
-  return MakeShapeWithLayoutInternal(element_type, dimensions, minor_to_major)
+    absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles,
+    int64 element_size_in_bits) {
+  return MakeShapeWithLayoutInternal(element_type, dimensions, minor_to_major,
+                                     tiles, element_size_in_bits)
       .ValueOrDie();
 }
 
@@ -1256,6 +1257,43 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
     const Shape& input_shape, const Shape& output_shape) {
   CHECK(input_shape.IsArray());
   CHECK(output_shape.IsArray());
+  // Removing trivial dimensions from the shape simplifies the alignment
+  // algorithm since ones can go in any position.
+  if (HasDegenerateDimensions(input_shape) ||
+      HasDegenerateDimensions(output_shape)) {
+    auto simple_output_shape =
+        AlignLayouts(DropDegenerateDimensions(input_shape),
+                     DropDegenerateDimensions(output_shape));
+    if (!simple_output_shape) {
+      return absl::nullopt;
+    }
+
+    auto layout = simple_output_shape->layout().minor_to_major();
+    // For each one sized dimension in the output, increment the dimension
+    // numbers in layout that are more minor than the one.
+    absl::InlinedVector<int64, 8> dim_map;
+    dim_map.reserve(simple_output_shape->rank());
+    for (int64 i = 0; i < output_shape.rank(); ++i) {
+      if (output_shape.dimensions(i) != 1) {
+        dim_map.push_back(i);
+      }
+    }
+    for (int64& d : layout) {
+      d = dim_map[d];
+    }
+
+    // Add the ones in descending order to the layout. Descending layouts tend
+    // to reduce the number of copies inserted in layout assignment.
+    for (int64 i = output_shape.rank() - 1; i >= 0; --i) {
+      if (output_shape.dimensions(i) == 1) {
+        layout.push_back(i);
+      }
+    }
+    Shape output_shape_with_layout = output_shape;
+    *output_shape_with_layout.mutable_layout()->mutable_minor_to_major() =
+        layout;
+    return output_shape_with_layout;
+  }
 
   int64 input_rank = input_shape.rank();
   int64 output_rank = output_shape.rank();
@@ -1304,10 +1342,10 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   if (input_dimension_product != output_dimension_product) {
     return absl::nullopt;
   }
+
   // We also need to store an end element so that we know where the last
   // alignment part ends.
   alignment.push_back({input_rank, output_rank});
-
   // Now check if the physical layout can potentially be aligned to the output
   // shape by changing the physical layout of the output shape. We need to check
   // that all dimension numbers that belong to the same alignment part appear
@@ -1319,40 +1357,23 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   for (int64 i = 0; i < input_rank;) {
     int64 current_dimension_number = input_dimension_numbers[i];
 
-    // Skip trivial dimensions with a bound of 1.
-    if (input_shape.dimensions(current_dimension_number) == 1) {
-      ++i;
-      continue;
-    }
-
-    // Calculate the number of non-trivial dimension bounds in the input shape
-    // belonging to the current alignment part.
+    // Trivial dimensions are stripped.
+    CHECK_NE(input_shape.dimensions(current_dimension_number), 1);
     const int64 current_alignment_index =
         dimension_to_alignment_index[current_dimension_number];
     // Because of the special end element that we added, we can be sure that
     // 'current_alignment_index' is < alignment.size() - 1.
     CHECK_LT(current_alignment_index, alignment.size() - 1);
-    int64 num_non_trivial_dimensions_in_alignment_part = 0;
-    for (int64 j = alignment[current_alignment_index].first;
-         j < alignment[current_alignment_index + 1].first; ++j) {
-      if (input_shape.dimensions(j) != 1) {
-        ++num_non_trivial_dimensions_in_alignment_part;
-      }
-    }
 
     // Check that the following 'num_non_trivial_dimensions_in_alignment_part'
     // dimension numbers (ignoring dimension numbers with dimension bound 1) are
     // in descending order and belong to the current alignment part.
-    for (int64 j = 0; j < num_non_trivial_dimensions_in_alignment_part;
+    for (int64 j = 0; j < alignment[current_alignment_index + 1].first -
+                              alignment[current_alignment_index].first;
          ++i, ++j) {
       if (i == input_rank) {
         return absl::nullopt;
       }
-      // Skip trivial dimensions with a bound of 1.
-      if (input_shape.dimensions(input_dimension_numbers[i]) == 1) {
-        --j;
-        continue;
-      }
       // If the current dimension number belongs to a different alignment part,
       // or the dimension numbers are not in descending order, we can return
       // early.
@@ -1363,22 +1384,11 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
       }
       current_dimension_number = input_dimension_numbers[i];
     }
-
     // The output dimension numbers that belong to the current alignment part
-    // need to appear in the same descending order as in the input. Again, we
-    // can skip dimensions with a bound of 1.
+    // need to appear in the same descending order as in the input.
     for (int64 j = alignment[current_alignment_index + 1].second - 1;
          j >= alignment[current_alignment_index].second; --j) {
-      if (output_shape.dimensions(j) != 1) {
-        output_layout.push_back(j);
-      }
-    }
-  }
-  // Now add all the dimensions with dimension bound 1 at the end of
-  // 'output_layout'.
-  for (int64 i = 0; i < output_rank; ++i) {
-    if (output_shape.dimensions(i) == 1) {
-      output_layout.push_back(i);
+      output_layout.push_back(j);
     }
   }
   CHECK_EQ(output_layout.size(), output_rank);
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index fb6da7460e2475732d6f02758e5519fbdb7c0f8d..7f610a6085d6fbe3d3143d5027cdc43d4b07bcbf 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -398,7 +398,9 @@ class ShapeUtil {
   // Returns a value shape such that shape.has_layout().
   static Shape MakeShapeWithLayout(PrimitiveType element_type,
                                    absl::Span<const int64> dimensions,
-                                   absl::Span<const int64> minor_to_major);
+                                   absl::Span<const int64> minor_to_major,
+                                   absl::Span<const Tile> tiles = {},
+                                   int64 element_size_in_bits = 0);
 
   static Shape MakeShapeWithSparseLayout(PrimitiveType element_type,
                                          absl::Span<const int64> dimensions,
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 126ae58293d12182e9b6e30f779f681829729526..020b062f6b1b032bab958772d3a6a1e35daee38b 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -761,8 +761,15 @@ TEST(AlignmentTest, AlignLayoutsWithTrivialDimensions) {
   auto aligned_shape = ShapeUtil::AlignLayouts(
       input, ShapeUtil::MakeShape(xla::F32, {1, 4, 1, 3, 2, 7, 5, 11, 1}));
   EXPECT_TRUE(aligned_shape);
-  EXPECT_THAT(aligned_shape.value().layout().minor_to_major(),
-              ElementsAre(6, 5, 4, 3, 1, 7, 0, 2, 8));
+  EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value()));
+}
+
+TEST(AlignmentTest, AlignLayoutsWithAllTrivialDimensions) {
+  Shape input =
+      ShapeUtil::MakeShapeWithLayout(xla::F32, {1, 1, 1, 1}, {0, 1, 3, 2});
+  auto aligned_shape = ShapeUtil::AlignLayouts(
+      input, ShapeUtil::MakeShape(xla::F32, {1, 1, 1, 1, 1}));
+  EXPECT_TRUE(aligned_shape);
   EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value()));
 }
 
diff --git a/tensorflow/compiler/xla/status_macros.cc b/tensorflow/compiler/xla/status_macros.cc
index b88fe367d7416a26c1147fd5e10fb20772814fe5..aa7238f07d432aabb44d2cbed66786217e6a846c 100644
--- a/tensorflow/compiler/xla/status_macros.cc
+++ b/tensorflow/compiler/xla/status_macros.cc
@@ -25,6 +25,13 @@ limitations under the License.
 namespace xla {
 namespace status_macros {
 
+ABSL_CONST_INIT const char kPossibleAutoJitAlternative[] =
+    "This error might be occurring with the use of xla.compile. If it is not "
+    "necessary that every Op be compiled with XLA, an alternative is to use "
+    "auto_jit with OptimizerOptions.global_jit_level = ON_2 or the environment "
+    "variable TF_XLA_FLAGS=\"tf_xla_auto_jit=2\" which will attempt to use xla "
+    "to compile as much of the graph as the compiler is able to.";
+
 static Status MakeStatus(tensorflow::error::Code code, const string& message) {
   return Status(code, message);
 }
diff --git a/tensorflow/compiler/xla/status_macros.h b/tensorflow/compiler/xla/status_macros.h
index e51dd64e2a3dc7c359918cb08c6c94b2b4d9e91b..315136acc71670fa3ad48da4dc064e384ddadaa9 100644
--- a/tensorflow/compiler/xla/status_macros.h
+++ b/tensorflow/compiler/xla/status_macros.h
@@ -30,6 +30,10 @@ limitations under the License.
 namespace xla {
 namespace status_macros {
 
+// This is a useful error message when encountering XLA Compiler errors that
+// could be handled with the non-strict AutoJit mode.
+extern const char kPossibleAutoJitAlternative[];
+
 // Stream object used to collect error messages in MAKE_ERROR macros
 // or append error messages with APPEND_ERROR.  It accepts any
 // arguments with operator<< to build an error string, and then has an
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index e8e779fb2a3f201ae056e6385eacfe6a63503749..562854756628df64fbf92d40af859f8b218b0cc2 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -317,6 +317,11 @@ xla_test(
     name = "conv_depthwise_backprop_filter_test",
     timeout = "long",
     srcs = ["conv_depthwise_backprop_filter_test.cc"],
+    # these backends do not natively handle batch group counts.
+    blacklisted_backends = [
+        "gpu",
+        "cpu",
+    ],
     shard_count = 6,
     deps = [
         "//tensorflow/compiler/xla:execution_options_util",
@@ -676,7 +681,7 @@ xla_test(
     tags = [
         "optonly",
         # This is a big test that we skip for capacity reasons in OSS testing.
-        "nooss",
+        "no_oss",
     ],
     deps = [
         ":client_library_test_base",
@@ -1141,7 +1146,7 @@ xla_test(
 xla_test(
     name = "reduce_test",
     srcs = ["reduce_test.cc"],
-    shard_count = 40,
+    shard_count = 31,
     tags = [
         "optonly",
     ],
@@ -1389,8 +1394,8 @@ xla_test(
 )
 
 xla_test(
-    name = "fmax_test",
-    srcs = ["fmax_test.cc"],
+    name = "fmax_fmin_test",
+    srcs = ["fmax_fmin_test.cc"],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -2155,3 +2160,26 @@ xla_test(
         "//tensorflow/compiler/xla:test",
     ],
 )
+
+xla_test(
+    name = "triangular_solve_test",
+    srcs = ["triangular_solve_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+        "noasan",  # sometimes times out, http://b/78650012
+    ],
+    deps = [
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:math",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 7379fbcc22745f46f2a29732c4bda46f352d07e7..acdd3c9da92efe8fae1336eaa861c01d5bb9b158 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc
index 96f4aedf8b996b152b77628252841348e732756f..dfbf0478e62713635446d11557367cfac6ab0dce 100644
--- a/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc
+++ b/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc
@@ -37,6 +37,8 @@ struct BatchGroupedConvolution2DSpec {
   std::vector<int64> activation_dims;
   std::vector<int64> kernel_dims;
   std::vector<int64> output_dims;
+  std::vector<int64> activation_and_kernel_layout;
+  std::vector<int64> output_layout;
 };
 
 class BatchGroupedConvolution2DTest
@@ -47,8 +49,9 @@ class BatchGroupedConvolution2DTest
 static std::vector<BatchGroupedConvolution2DSpec> GetConv2DTestCases() {
   std::vector<BatchGroupedConvolution2DSpec> config_set;
   std::vector<std::vector<int64>> config_options = {
-      {8, 5, 3, 2},   {4, 5, 5, 2},   {8, 7, 4, 128},  {16, 20, 20, 256},
-      {256, 7, 5, 4}, {256, 6, 6, 4}, {256, 8, 8, 512}};
+      {8, 5, 3, 2},      {4, 5, 5, 2},    {8, 7, 4, 128},
+      {16, 20, 20, 256}, {256, 7, 5, 4},  {256, 6, 6, 4},
+      {256, 8, 8, 512},  {64, 7, 7, 960}, {64, 14, 14, 576}};
 
   for (auto option : config_options) {
     int64 feature = option[3];
@@ -68,8 +71,14 @@ static std::vector<BatchGroupedConvolution2DSpec> GetConv2DTestCases() {
     int64 output_space_size = 3 + activation_size - kernel_size;
     config.output_dims = {output_space_size, output_space_size, feature, 1};
 
+    config.activation_and_kernel_layout = {0, 3, 1, 2};
+    config.output_layout = {2, 3, 0, 1};
     config_set.push_back(config);
 
+    BatchGroupedConvolution2DSpec different_layout_config = config;
+    different_layout_config.activation_and_kernel_layout = {3, 0, 1, 2};
+    config_set.push_back(different_layout_config);
+
     // Add configurations for window dilation cases.
     if (activation_size % 2 == 0 && activation_size == kernel_size) {
       BatchGroupedConvolution2DSpec config;
@@ -79,11 +88,17 @@ static std::vector<BatchGroupedConvolution2DSpec> GetConv2DTestCases() {
       config.activation_dims = {batch, activation_size, activation_size,
                                 feature};
       config.kernel_dims = {batch, kernel_size / 2, kernel_size / 2, feature};
+      config.activation_and_kernel_layout = {0, 3, 1, 2};
+      config.output_layout = {2, 3, 0, 1};
 
       int64 output_space_size = 5;
       config.output_dims = {output_space_size, output_space_size, feature, 1};
 
       config_set.push_back(config);
+
+      BatchGroupedConvolution2DSpec different_layout_config = config;
+      different_layout_config.activation_and_kernel_layout = {3, 0, 1, 2};
+      config_set.push_back(different_layout_config);
     }
   }
 
@@ -97,8 +112,11 @@ string BatchGroupedConvolution2DTestDataToString(
   const string data_type = GetFloatDataType(::testing::get<1>(data.param));
   string str = absl::StrCat(
       "activation_dims_", absl::StrJoin(spec.activation_dims, "x"),
-      "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"), "_output_dims_",
-      absl::StrJoin(spec.output_dims, "x"), data_type);
+      "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"),
+      "_activation_layout_",
+      absl::StrJoin(spec.activation_and_kernel_layout, "_"), "_output_dims_",
+      absl::StrJoin(spec.output_dims, "x"), data_type, "_output_layout_",
+      absl::StrJoin(spec.output_layout, "_"));
 
   // Test names are not allowed to contain the '-' character.
   absl::c_replace(str, '-', 'n');
@@ -110,23 +128,28 @@ string BuildHloTextBatchGroupedConvolution2D(
   const string data_type = GetFloatDataType(use_bfloat16);
   return absl::StrFormat(
       R"(
-    HloModule TensorFlowDepthwiseConv
+    HloModule TensorFlowDepthwiseConv, is_scheduled=true
 
     ENTRY main {
-      activation = %s[%s] parameter(0)
-      kernel = %s[%s] parameter(1)
-      ROOT conv = %s[%s] convolution(%s[%s] activation, %s[%s] kernel),
+      activation = %s[%s]{%s} parameter(0)
+      kernel = %s[%s]{%s} parameter(1)
+      ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
           window={size=%dx%d pad=1_%dx1_%d rhs_dilate=%dx%d}, dim_labels=f01b_i01o->01fb,
           batch_group_count=%d
     }
     )",
-      data_type, absl::StrJoin(spec.activation_dims, ","), data_type,
-      absl::StrJoin(spec.kernel_dims, ","), data_type,
-      absl::StrJoin(spec.output_dims, ","), data_type,
-      absl::StrJoin(spec.activation_dims, ","), data_type,
-      absl::StrJoin(spec.kernel_dims, ","), spec.window, spec.window,
-      spec.window_dilation, spec.window_dilation, spec.window_dilation,
-      spec.window_dilation, spec.output_batch);
+      data_type, absl::StrJoin(spec.activation_dims, ","),
+      absl::StrJoin(spec.activation_and_kernel_layout, ","), data_type,
+      absl::StrJoin(spec.kernel_dims, ","),
+      absl::StrJoin(spec.activation_and_kernel_layout, ","), data_type,
+      absl::StrJoin(spec.output_dims, ","),
+      absl::StrJoin(spec.output_layout, ","), data_type,
+      absl::StrJoin(spec.activation_dims, ","),
+      absl::StrJoin(spec.activation_and_kernel_layout, ","), data_type,
+      absl::StrJoin(spec.kernel_dims, ","),
+      absl::StrJoin(spec.activation_and_kernel_layout, ","), spec.window,
+      spec.window, spec.window_dilation, spec.window_dilation,
+      spec.window_dilation, spec.window_dilation, spec.output_batch);
 }
 
 XLA_TEST_P(BatchGroupedConvolution2DTest, DoIt) {
@@ -135,13 +158,13 @@ XLA_TEST_P(BatchGroupedConvolution2DTest, DoIt) {
   const string hlo_text =
       BuildHloTextBatchGroupedConvolution2D(spec, use_bfloat16);
 
-  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0.01, 0.01},
-                            [](HloModule* module) -> Status {
-                              BFloat16MixedPrecisionRemoval remover;
-                              TF_RETURN_IF_ERROR(remover.Run(module).status());
-                              Despecializer despecializer;
-                              return despecializer.Run(module).status();
-                            }));
+  EXPECT_TRUE(RunAndCompareNoHloPasses(
+      hlo_text, ErrorSpec{0.01, 0.01}, [](HloModule* module) -> Status {
+        BFloat16MixedPrecisionRemoval remover;
+        TF_RETURN_IF_ERROR(remover.Run(module).status());
+        Despecializer despecializer;
+        return despecializer.Run(module).status();
+      }));
 }
 
 INSTANTIATE_TEST_CASE_P(
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index cad43d1b5547d74701760fa623e50466fc15c263..4687ed61a7de91bc1bce0efeadf1965ad7d52d55 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -172,8 +172,10 @@ XLA_TEST_F(CustomCallTest, LayoutConstrained) {
 
   const Shape& r2f32_dim0_major =
       ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0});
-  b.AddInstruction(HloInstruction::CreateCustomCall(
+  auto custom_call = b.AddInstruction(HloInstruction::CreateCustomCall(
       r2f32_dim0_major, {input}, "Add1ToValues", {r2f32_dim0_major}));
+  b.AddInstruction(
+      custom_call->CloneWithNewOperands(r2f32_dim0_major, {custom_call}));
 
   module->AddEntryComputation(b.Build());
   ForceParameterLayout(module.get(), 0, LayoutUtil::MakeLayout({1, 0}));
@@ -182,7 +184,7 @@ XLA_TEST_F(CustomCallTest, LayoutConstrained) {
   Literal argument = LiteralUtil::CreateR2<float>({{1.f, 2.f}, {3.f, 4.f}});
 
   Literal result = ExecuteAndTransfer(std::move(module), {&argument});
-  LiteralTestUtil::ExpectR2Equal<float>({{2.f, 3.f}, {4.f, 5.f}}, result);
+  LiteralTestUtil::ExpectR2Equal<float>({{3.f, 4.f}, {5.f, 6.f}}, result);
 }
 
 XLA_TEST_F(CustomCallTest, TupleOutput) {
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index f740f4815810727890583405b2244fceaec0bd3f..6ee2178a227a12b7baa933f036a44db8ec630a4c 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -1188,6 +1188,16 @@ std::vector<EinsumParamType> GetEinsumTestCases() {
       p{v{8, 55, 11, 3}, v{55, 11, 3, 29}, "mkBC,kBCn->BCnm"},
       p{v{5, 6}, v{6, 7}, "ab,cd->dcba"},
       p{v{6}, v{6, 7}, "b,bc->c"},
+      p{v{5, 6, 7}, v{5, 6, 7}, "abc,abc->ab"},
+      p{v{5, 6, 7}, v{7, 6, 5}, "abc,cba->ca"},
+      p{v{77}, v{77}, "a,a->a"},
+      p{v{77}, v{77, 55}, "a,ab->ba"},
+      p{v{2, 3, 77}, v{77, 2, 3, 55}, "ija,aijb->baij"},
+      p{v{55}, v{}, "a,->a"},
+      p{v{11, 111}, v{11}, "ab,a->ab"},
+      p{v{16, 34}, v{16, 34}, "ab,ab->ab"},
+      p{v{16, 3, 34}, v{3, 16, 34}, "abc,bac->abc"},
+      p{v{5, 19}, v{}, "ab,->ab"},
   };
   return test_cases;
 }
@@ -1257,5 +1267,82 @@ ENTRY %test {
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
 }
 
+XLA_TEST_F(DotOperationTextTest, CpuTiledDotEmitterCachingBug_1) {
+  // Tests for a caching bug in the XLA CPU backend.
+  absl::string_view hlo_string =
+      R"(
+HloModule CpuTiledDotEmitterCachingBug
+
+ENTRY main {
+  lhs = f32[20,40] parameter(0)
+  rhs_0 = f32[40,1] parameter(2)
+  rhs_1 = f32[1,40] parameter(1)
+
+  dot_0 = f32[20,1] dot(lhs, rhs_0), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  dot_1 = f32[20,1] dot(lhs, rhs_1), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+
+  ROOT result = f32[20,1] divide(dot_0, dot_1)
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, CpuTiledDotEmitterCachingBug_2) {
+  // Tests for a caching bug in the XLA CPU backend.
+  absl::string_view hlo_string =
+      R"(
+HloModule CpuTiledDotEmitterCachingBug
+
+ENTRY main {
+  lhs_0 = f32[20,40] parameter(0)
+  rhs_0 = f32[40,1] parameter(1)
+  lhs_1 = f32[1,40] parameter(2)
+  rhs_1 = f32[20,40] parameter(3)
+
+  dot_0 = f32[20,1] dot(lhs_0, rhs_0), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  dot_1 = f32[1,20] dot(lhs_1, rhs_1), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+
+  dot_0_reshaped = f32[20] reshape(dot_0)
+  dot_1_reshaped = f32[20] reshape(dot_1)
+
+  ROOT result = f32[20] divide(dot_0_reshaped, dot_1_reshaped)
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, DISABLED_ON_CPU(GpuIntegerDotCodegen)) {
+  absl::string_view hlo_string =
+      R"(
+HloModule SmallIntegerDot
+
+ENTRY SmallIntegerDot {
+  arg0 = s32[1,2,2] parameter(0)
+  arg1 = s32[1,2,1] parameter(1)
+  ROOT dot = s32[1,2,1] dot(arg0, arg1), lhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, DISABLED_ON_CPU(GpuTransposeOutput)) {
+  absl::string_view hlo_string =
+      R"(
+HloModule TransposeOutput
+
+ENTRY TransposeOutput {
+  p0 = f32[32,32] parameter(0)
+  p1 = f32[32,64] parameter(1)
+  dot = f32[32,64] dot(p0, p1), lhs_contracting_dims={0}, rhs_contracting_dims={0}
+  ROOT tr = f32[64,32] transpose(dot), dimensions={1,0}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/fmax_fmin_test.cc b/tensorflow/compiler/xla/tests/fmax_fmin_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7423ac0bcdb0bc305ee384fb98bd17413404ecef
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/fmax_fmin_test.cc
@@ -0,0 +1,88 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class FmaxSimpleTest : public ClientLibraryTestBase {};
+
+TEST_F(FmaxSimpleTest, FmaxTenValues) {
+  SetFastMathDisabled(true);
+  XlaBuilder builder(TestName());
+  auto x = ConstantR1<float>(
+      &builder, {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0});
+  auto y = ConstantR1<float>(
+      &builder, {-0.0, -1.0, -2.0, 3.0, 4.0, -5.0, -6.0, 7.0, 8.0, -9.0});
+  Max(x, y);
+
+  std::vector<float> expected = {-0.0, 1.0, 2.0, 3.0, 4.0,
+                                 5.0,  6.0, 7.0, 8.0, 9.0};
+  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+}
+
+TEST_F(FmaxSimpleTest, FmaxEdgeCases) {
+  SetFastMathDisabled(true);
+  XlaBuilder builder(TestName());
+  XlaOp param0, param1;
+  std::unique_ptr<GlobalData> param0_data = CreateR1Parameter<float>(
+      {INFINITY, INFINITY, INFINITY, -INFINITY, INFINITY, -INFINITY, NAN,
+       INFINITY, -INFINITY, NAN},
+      /*parameter_number=*/0, /*name=*/"param0",
+      /*builder=*/&builder, /*data_handle=*/&param0);
+  std::unique_ptr<GlobalData> param1_data = CreateR1Parameter<float>(
+      {INFINITY, -INFINITY, NAN, NAN, -4.0, -5.0, -6.0, 7.0, 8.0, 9.0},
+      /*parameter_number=*/1, /*name=*/"param1",
+      /*builder=*/&builder, /*data_handle=*/&param1);
+
+  Max(param0, param1);
+  std::vector<float> expected = {INFINITY, INFINITY, NAN,      NAN, INFINITY,
+                                 -5,       NAN,      INFINITY, 8,   NAN};
+  ComputeAndCompareR1<float>(&builder, expected,
+                             {param0_data.get(), param1_data.get()},
+                             ErrorSpec(0.0001));
+}
+
+TEST_F(FmaxSimpleTest, FminEdgeCases) {
+  SetFastMathDisabled(true);
+  XlaBuilder builder(TestName());
+  XlaOp param0, param1;
+  std::unique_ptr<GlobalData> param0_data = CreateR1Parameter<float>(
+      {INFINITY, INFINITY, INFINITY, -INFINITY, INFINITY, -INFINITY, NAN,
+       INFINITY, -INFINITY, NAN},
+      /*parameter_number=*/0, /*name=*/"param0",
+      /*builder=*/&builder, /*data_handle=*/&param0);
+  std::unique_ptr<GlobalData> param1_data = CreateR1Parameter<float>(
+      {INFINITY, -INFINITY, NAN, NAN, -4.0, -5.0, -6.0, 7.0, 8.0, 9.0},
+      /*parameter_number=*/1, /*name=*/"param1",
+      /*builder=*/&builder, /*data_handle=*/&param1);
+
+  Min(param0, param1);
+  std::vector<float> expected = {INFINITY,  -INFINITY, NAN, NAN,       -4,
+                                 -INFINITY, NAN,       7,   -INFINITY, NAN};
+  ComputeAndCompareR1<float>(&builder, expected,
+                             {param0_data.get(), param1_data.get()},
+                             ErrorSpec(0.0001));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/fmax_test.cc b/tensorflow/compiler/xla/tests/fmax_test.cc
deleted file mode 100644
index c5bbbe778df15d63a2586bd6291a7a33fc82aa52..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tests/fmax_test.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <vector>
-
-#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
-#include "tensorflow/compiler/xla/tests/literal_test_util.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace xla {
-namespace {
-
-class FmaxSimpleTest : public ClientLibraryTestBase {};
-
-TEST_F(FmaxSimpleTest, FmaxTenValues) {
-  XlaBuilder builder(TestName());
-  auto x = ConstantR1<float>(
-      &builder, {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0});
-  auto y = ConstantR1<float>(
-      &builder, {-0.0, -1.0, -2.0, 3.0, 4.0, -5.0, -6.0, 7.0, 8.0, -9.0});
-  Max(x, y);
-
-  std::vector<float> expected = {-0.0, 1.0, 2.0, 3.0, 4.0,
-                                 5.0,  6.0, 7.0, 8.0, 9.0};
-  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
-}
-
-}  // namespace
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 66f72ba8d20b8ef1f436da4425b2bb6518ee9a94..0151981ef16aabe9e363bc4d7f9ba96d4a1f170f 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -205,6 +205,17 @@ Literal HloTestBase::ExecuteAndTransfer(std::unique_ptr<HloModule> module,
   return test_runner_.Execute(std::move(module), arguments).ValueOrDie();
 }
 
+StatusOr<std::vector<Literal>> HloTestBase::ExecuteReplicated(
+    std::unique_ptr<HloModule> module, absl::Span<Literal* const> arguments,
+    int64 num_replicas) {
+  HloRunner::ReplicatedExecuteOptions options;
+  options.num_replicas = num_replicas;
+  for (auto argument : arguments) {
+    options.arguments.push_back(argument);
+  }
+  return test_runner_.ExecuteReplicated(std::move(module), options);
+}
+
 StatusOr<std::unique_ptr<HloModule>> HloTestBase::MakeReferenceModule(
     const HloModule& test_module,
     const std::function<void(HloModule*)>& reference_preprocessor) {
@@ -313,7 +324,10 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
                        reference_preprocessor);
 }
 
-::testing::AssertionResult HloTestBase::Run(string_view hlo_string) {
+::testing::AssertionResult HloTestBase::Run(string_view hlo_string,
+                                            bool run_hlo_passes,
+                                            ExecutionProfile* profile,
+                                            string backend_config) {
   auto module_or_status =
       HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
   if (!module_or_status.ok()) {
@@ -321,19 +335,108 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
            << "Error while parsing HLO text format: "
            << module_or_status.status().ToString();
   }
+
+  std::unique_ptr<HloModule> module = std::move(module_or_status.ValueOrDie());
   const auto& fake_arguments =
-      MakeFakeArguments(module_or_status.ValueOrDie().get())
-          .ConsumeValueOrDie();
+      MakeFakeArguments(module.get()).ConsumeValueOrDie();
   std::vector<Literal*> fake_argument_ptrs;
   absl::c_transform(
       fake_arguments, std::back_inserter(fake_argument_ptrs),
       [](const Literal& literal) { return const_cast<Literal*>(&literal); });
-  return test_runner_
-                 .Execute(std::move(module_or_status.ValueOrDie()),
-                          fake_argument_ptrs, /*run_hlo_passes=*/true)
-                 .ok()
+
+  if (profile != nullptr) {
+    // We have to enable HLO profiling since otherwise currently the
+    // ExecutionProfile is not correct.
+    //
+    // TODO(b/119432044): Fix collection of the ExecutionProfile
+    // so that this is not necessary.
+    HloModuleConfig config = module->config();
+    DebugOptions debug_options = config.debug_options();
+    debug_options.set_xla_hlo_profile(true);
+    config.set_debug_options(debug_options);
+    module->set_config(config);
+  }
+
+  if (!backend_config.empty()) {
+    // Set backend configuration if it is given.
+    HloInstruction* instruction =
+        module->entry_computation()->root_instruction();
+    instruction->set_raw_backend_config_string(backend_config);
+  }
+
+  // return ::testing::AssertionSuccess();
+  auto output = test_runner_.Execute(std::move(module), fake_argument_ptrs,
+                                     /*run_hlo_passes=*/run_hlo_passes,
+                                     /*profile=*/profile);
+
+  return output.ok()
              ? ::testing::AssertionSuccess()
-             : ::testing::AssertionFailure();
+             : ::testing::AssertionFailure() << output.status().error_message();
+}
+
+::testing::AssertionResult HloTestBase::RunMultipleTimes(
+    string_view hlo_string, bool run_hlo_passes,
+    std::vector<ExecutionProfile>* profiles, string backend_config) {
+  int n = profiles->size();
+  std::vector<std::vector<Literal*>> fake_argument_ptrs(n);
+  std::vector<std::vector<Literal>> fake_arguments(n);
+  std::vector<std::unique_ptr<Executable>> executables(n);
+
+  for (int i = 0; i < n; ++i) {
+    auto module_or_status =
+        HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+    if (!module_or_status.ok()) {
+      return ::testing::AssertionFailure()
+             << "Error while parsing HLO text format: "
+             << module_or_status.status().ToString();
+    }
+    std::unique_ptr<HloModule> module =
+        std::move(module_or_status.ValueOrDie());
+
+    fake_arguments[i] = MakeFakeArguments(module.get()).ConsumeValueOrDie();
+    absl::c_transform(
+        fake_arguments[i], std::back_inserter(fake_argument_ptrs[i]),
+        [](const Literal& literal) { return const_cast<Literal*>(&literal); });
+
+    if (profiles != nullptr) {
+      // We have to enable HLO profiling since otherwise currently the
+      // ExecutionProfile is not correct.
+      //
+      // TODO(b/119432044): Fix collection of the ExecutionProfile
+      // so that this is not necessary.
+      HloModuleConfig config = module->config();
+      DebugOptions debug_options = config.debug_options();
+      debug_options.set_xla_hlo_profile(true);
+      config.set_debug_options(debug_options);
+      module->set_config(config);
+    }
+
+    if (!backend_config.empty()) {
+      // Set backend configuration if it is given.
+      HloInstruction* instruction =
+          module->entry_computation()->root_instruction();
+      instruction->set_raw_backend_config_string(backend_config);
+    }
+
+    auto executable =
+        test_runner_.CreateExecutable(std::move(module), run_hlo_passes);
+    if (!executable.ok()) {
+      return ::testing::AssertionFailure()
+             << executable.status().error_message();
+    }
+    executables[i] = std::move(executable.ValueOrDie());
+  }
+
+  for (int i = 0; i < n; ++i) {
+    auto output =
+        test_runner_.Execute(std::move(executables[i]), fake_argument_ptrs[i],
+                             /*profile=*/&((*profiles)[i]));
+    if (!output.ok()) {
+      return ::testing::AssertionFailure() << output.status().error_message();
+    }
+  }
+
+  return ::testing::AssertionSuccess();
 }
 
 ::testing::AssertionResult HloTestBase::RunAndCompareFromFile(
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 69a4f96288c7285010e9adbdc33f1b394f58d8d2..3c2bcbb5df5ce94dd37f63d0c0e609f3ad2b60aa 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -173,6 +173,11 @@ class HloTestBase : public ::testing::Test {
   Literal ExecuteAndTransfer(std::unique_ptr<HloModule> module,
                              absl::Span<Literal* const> arguments);
 
+  // Executes the given module on multiple replicas.
+  StatusOr<std::vector<Literal>> ExecuteReplicated(
+      std::unique_ptr<HloModule> module, absl::Span<Literal* const> arguments,
+      int64 num_replicas);
+
   // Executes the given hlo module on two backends and compares results.
   //
   // 'arguments': the input of the hlo module.
@@ -221,8 +226,14 @@ class HloTestBase : public ::testing::Test {
       const absl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
       TF_MUST_USE_RESULT;
-  ::testing::AssertionResult Run(const absl::string_view hlo_string)
-      TF_MUST_USE_RESULT;
+  ::testing::AssertionResult Run(const absl::string_view hlo_string,
+                                 bool run_hlo_passes = true,
+                                 ExecutionProfile* profile = nullptr,
+                                 string backend_config = "") TF_MUST_USE_RESULT;
+  ::testing::AssertionResult RunMultipleTimes(
+      const absl::string_view hlo_string, bool run_hlo_passes,
+      std::vector<ExecutionProfile>* profiles,
+      string backend_config = "") TF_MUST_USE_RESULT;
   ::testing::AssertionResult RunAndCompareFromFile(
       const string& filename, const absl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
diff --git a/tensorflow/compiler/xla/tests/plugin.bzl b/tensorflow/compiler/xla/tests/plugin.bzl
index 8a5d91363b619c6b214a96ad96e92742e3052541..107869fe59d43d0a9a3e2b14af2c09e4906d9f15 100644
--- a/tensorflow/compiler/xla/tests/plugin.bzl
+++ b/tensorflow/compiler/xla/tests/plugin.bzl
@@ -33,4 +33,3 @@
 # }
 
 plugins = {}
-
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index f80d29b9de440b11c36e8c9bc65d4a93353a6267..e2cf4c0be289b52d5cc581ea07752ed6e98da76f 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -34,7 +34,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 95c89b0ba6f29c453abab88e29bca13ee006455a..67d2258928f75c078588c9425359f9468f4463ed 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -238,6 +238,79 @@ StatusOr<Literal> MakeFakeLiteralInternal(const Shape& shape,
   return std::move(literal);
 }
 
+template <typename IntT>
+void PopulateWithRandomIntegralDataWithBounds(Literal* literal,
+                                              std::minstd_rand0* engine,
+                                              IntT min, IntT max) {
+  CHECK(engine != nullptr);
+  CHECK_EQ(literal->shape().element_type(),
+           primitive_util::NativeToPrimitiveType<IntT>());
+  std::uniform_int_distribution<IntT> generator(min, max);
+  for (IntT& value : literal->data<IntT>()) {
+    value = generator(*engine);
+  }
+}
+
+// Same as MakeFakeLiteralInternal but generates random numbers in the given
+// range [min, max]. Currently this works only for INT types.
+StatusOr<Literal> MakeFakeLiteralInternalWithBounds(const Shape& shape,
+                                                    std::minstd_rand0* engine,
+                                                    int64 min, int64 max) {
+  if (shape.IsTuple()) {
+    std::vector<Literal> elements;
+    for (const Shape& element_shape : shape.tuple_shapes()) {
+      TF_ASSIGN_OR_RETURN(
+          Literal element,
+          MakeFakeLiteralInternalWithBounds(element_shape, engine, min, max));
+      elements.push_back(std::move(element));
+    }
+    return LiteralUtil::MakeTupleOwned(std::move(elements));
+  }
+  if (engine == nullptr) {
+    return Literal::CreateFromShape(shape);
+  }
+  Literal literal(shape);
+  switch (shape.element_type()) {
+    case S8:
+      PopulateWithRandomIntegralDataWithBounds<int8>(
+          &literal, engine, static_cast<int8>(min), static_cast<int8>(max));
+      break;
+    case U8:
+      PopulateWithRandomIntegralDataWithBounds<uint8>(
+          &literal, engine, static_cast<uint8>(min), static_cast<uint8>(max));
+      break;
+    case S16:
+      PopulateWithRandomIntegralDataWithBounds<int16>(
+          &literal, engine, static_cast<int16>(min), static_cast<int16>(max));
+      break;
+    case U16:
+      PopulateWithRandomIntegralDataWithBounds<uint16>(
+          &literal, engine, static_cast<uint16>(min), static_cast<uint16>(max));
+      break;
+    case S32:
+      PopulateWithRandomIntegralDataWithBounds<int32>(
+          &literal, engine, static_cast<int32>(min), static_cast<int32>(max));
+      break;
+    case U32:
+      PopulateWithRandomIntegralDataWithBounds<uint32>(
+          &literal, engine, static_cast<uint32>(min), static_cast<uint32>(max));
+      break;
+    case S64:
+      PopulateWithRandomIntegralDataWithBounds<int64>(
+          &literal, engine, static_cast<int64>(min), static_cast<int64>(max));
+      break;
+    case U64:
+      PopulateWithRandomIntegralDataWithBounds<uint64>(
+          &literal, engine, static_cast<uint64>(min), static_cast<uint64>(max));
+      break;
+    default:
+      return Unimplemented(
+          "Unsupported type for fake random literal generation with bounds: %s",
+          ShapeUtil::HumanString(shape));
+  }
+  return std::move(literal);
+}
+
 enum class ConstantType { kUnknown, kZero, kOne };
 
 // Return the constant type required by this computation, if known.
@@ -297,6 +370,10 @@ std::vector<HloInstruction*> FindConstrainedUses(
       if ((opcode == HloOpcode::kDynamicSlice && op_num >= 1) ||
           (opcode == HloOpcode::kDynamicUpdateSlice && op_num >= 2)) {
         constrained_uses.push_back(instruction);
+      } else if ((opcode == HloOpcode::kGather ||
+                  opcode == HloOpcode::kScatter) &&
+                 op_num == 1) {
+        constrained_uses.push_back(instruction);
       } else if (opcode == HloOpcode::kFusion) {
         const HloInstruction* const to_analyze =
             instruction->fused_parameter(op_num);
@@ -356,6 +433,22 @@ StatusOr<Literal> CreateLiteralForConstrainedUses(
         }
         break;
       }
+      case HloOpcode::kGather:
+      case HloOpcode::kScatter: {
+        const Shape& operand_shape = use->operand(0)->shape();
+        if (use->operand(1) == &param) {
+          auto index_map =
+              use->opcode() == HloOpcode::kGather
+                  ? use->gather_dimension_numbers().start_index_map()
+                  : use->scatter_dimension_numbers()
+                        .scatter_dims_to_operand_dims();
+          for (const auto dim_in_operand : index_map) {
+            index_bound =
+                std::min(index_bound, operand_shape.dimensions(dim_in_operand));
+          }
+        }
+        break;
+      }
       case HloOpcode::kReduce:
       case HloOpcode::kReduceWindow:
         needs_constant = true;
@@ -385,8 +478,8 @@ StatusOr<Literal> CreateLiteralForConstrainedUses(
     return Unimplemented("Conflicting operand generation constraints.");
   }
   if (index_bound != INT64_MAX) {
-    return MakeRandomIndex(index_bound, engine)
-        .Reshape(param.shape().dimensions());
+    return MakeFakeLiteralInternalWithBounds(param.shape(), engine, -1,
+                                             index_bound);
   } else if (needs_constant) {
     switch (constant_type) {
       case ConstantType::kZero:
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
index 591d6c19228a313f530cdae18f4be37e7b517601..f68ee04565f3898bd3db455e3e102bc2edb6255a 100644
--- a/tensorflow/compiler/xla/tests/test_utils_test.cc
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -92,12 +92,13 @@ XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicSlices) {
                           MakeFakeArguments(module.get()));
   ASSERT_EQ(args.size(), 5);
 
-  EXPECT_EQ(args[0].Get<int32>({}), 0);
+  EXPECT_GE(args[0].Get<int32>({}), -1);
+  EXPECT_LE(args[0].Get<int32>({}), 1);
 
-  EXPECT_GE(args[1].Get<int32>({}), 0);
-  EXPECT_LE(args[0].Get<int32>({}), 2);
+  EXPECT_GE(args[1].Get<int32>({}), -1);
+  EXPECT_LE(args[1].Get<int32>({}), 2);
 
-  EXPECT_GE(args[2].Get<int32>({}), 0);
+  EXPECT_GE(args[2].Get<int32>({}), -1);
   EXPECT_LE(args[2].Get<int32>({}), 3);
 }
 
@@ -122,12 +123,13 @@ XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicUpdateSlices) {
                           MakeFakeArguments(module.get()));
   ASSERT_EQ(args.size(), 7);
 
-  EXPECT_EQ(args[0].Get<int32>({}), 0);
+  EXPECT_GE(args[0].Get<int32>({}), -1);
+  EXPECT_LE(args[0].Get<int32>({}), 1);
 
-  EXPECT_GE(args[1].Get<int32>({}), 0);
-  EXPECT_LE(args[0].Get<int32>({}), 2);
+  EXPECT_GE(args[1].Get<int32>({}), -1);
+  EXPECT_LE(args[1].Get<int32>({}), 2);
 
-  EXPECT_GE(args[2].Get<int32>({}), 0);
+  EXPECT_GE(args[2].Get<int32>({}), -1);
   EXPECT_LE(args[2].Get<int32>({}), 3);
 }
 
@@ -136,10 +138,18 @@ XLA_TEST_F(TestUtilsTest, NoDuplicatesFloats) {
   auto module = ParseHloString(R"(
 HloModule sort.148.1589
 
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY %sort.148.1589 (parameter.0: f32[1048576], parameter.1: s32[1048576]) -> (f32[1048576], s32[1048576]) {
   %parameter.0 = f32[1048576]{0} parameter(0)
   %parameter.1 = s32[1048576]{0} parameter(1)
-  ROOT %sort.148.1589 = (f32[1048576]{0}, s32[1048576]{0}) sort(f32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}
+  ROOT %sort.148.1589 = (f32[1048576]{0}, s32[1048576]{0}) sort(f32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}, to_apply=compare
 }
 )")
                     .ValueOrDie();
@@ -159,10 +169,18 @@ XLA_TEST_F(TestUtilsTest, NoDuplicatesInt32) {
   auto module = ParseHloString(R"(
 HloModule sort.148.1589
 
+compare {
+  p.0.lhs = s32[] parameter(0)
+  p.0.rhs = s32[] parameter(1)
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY %sort.148.1589 (parameter.0: s32[1048576], parameter.1: s32[1048576]) -> (s32[1048576], s32[1048576]) {
   %parameter.0 = s32[1048576]{0} parameter(0)
   %parameter.1 = s32[1048576]{0} parameter(1)
-  ROOT %sort.148.1589 = (s32[1048576]{0}, s32[1048576]{0}) sort(s32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}
+  ROOT %sort.148.1589 = (s32[1048576]{0}, s32[1048576]{0}) sort(s32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}, to_apply=compare
 }
 )")
                     .ValueOrDie();
@@ -182,10 +200,18 @@ XLA_TEST_F(TestUtilsTest, NoDuplicatesBfloat16) {
   auto module = ParseHloString(R"(
 HloModule sort, is_scheduled=true
 
+compare {
+  p.0.lhs = bf16[] parameter(0)
+  p.0.rhs = bf16[] parameter(1)
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY %sort. (parameter.0: bf16[2,1452], parameter.1: s32[2,1452]) -> (bf16[2,1452], s32[2,1452]) {
   %parameter.0 = bf16[2,1452]{1,0} parameter(0)
   %parameter.1 = s32[2,1452]{1,0} parameter(1)
-  ROOT %sort = (bf16[2,1452]{1,0}, s32[2,1452]{1,0}) sort(bf16[2,1452]{1,0} %parameter.0, s32[2,1452]{1,0} %parameter.1), dimensions={1}
+  ROOT %sort = (bf16[2,1452]{1,0}, s32[2,1452]{1,0}) sort(bf16[2,1452]{1,0} %parameter.0, s32[2,1452]{1,0} %parameter.1), dimensions={1}, to_apply=compare
 }
 )")
                     .ValueOrDie();
@@ -228,5 +254,77 @@ ENTRY %module (parameter.0: s32[], parameter.1: f32[20,20]) -> f32[] {
       << ShapeUtil::HumanString(args[1].shape());
 }
 
+XLA_TEST_F(TestUtilsTest, MakeFakeArgumentsForGather) {
+  auto module = ParseHloString(R"(
+  HloModule Test
+
+ENTRY %module(paramater.0: f32[200,100,300], parameter.1: s32[10,2]) ->
+                                                          f32[10,300] {
+  %parameter.0 = f32[200,100,300] parameter(0)
+  %parameter.1 = s32[10,2] parameter(1)
+  ROOT gather = f32[10,300] gather(f32[200,100,300] %parameter.0,
+                                   s32[10,2] %parameter.1),
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
+      index_vector_dim=1,
+      slice_sizes={1,1,300}
+}
+)")
+                    .ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 2);
+
+  const Shape& indices_shape = args[1].shape();
+  EXPECT_TRUE(
+      ShapeUtil::Equal(indices_shape, ShapeUtil::MakeShape(S32, {10, 2})))
+      << ShapeUtil::HumanString(indices_shape);
+  auto indices = args[1].data<int32>();
+  for (const auto index : indices) {
+    EXPECT_GE(index, -1);
+    EXPECT_LE(index, 100);
+  }
+}
+
+XLA_TEST_F(TestUtilsTest, MakeFakeArgumentsForScatter) {
+  auto module = ParseHloString(R"(
+  HloModule Test
+
+scatter_update (lhs: f32[], rhs: f32[]) -> f32[] {
+  lhs = f32[] parameter(0)
+  ROOT rhs = f32[] parameter(1)
+}
+
+ENTRY main {
+  operand = f32[200,100,300] parameter(0)
+  indices = s32[10,2] parameter(1)
+  updates = f32[10,300] parameter(2)
+  ROOT scatter = f32[200,100,300] scatter(operand, indices, updates),
+    to_apply=scatter_update,
+    update_window_dims={1},
+    inserted_window_dims={0,1},
+    scatter_dims_to_operand_dims={0,1},
+    index_vector_dim=1
+  }
+)")
+                    .ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 3);
+
+  const Shape& indices_shape = args[1].shape();
+  EXPECT_TRUE(
+      ShapeUtil::Equal(indices_shape, ShapeUtil::MakeShape(S32, {10, 2})))
+      << ShapeUtil::HumanString(indices_shape);
+  auto indices = args[1].data<int32>();
+  for (const auto index : indices) {
+    EXPECT_GE(index, -1);
+    EXPECT_LE(index, 100);
+  }
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc b/tensorflow/compiler/xla/tests/triangular_solve_test.cc
similarity index 77%
rename from tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
rename to tensorflow/compiler/xla/tests/triangular_solve_test.cc
index 284a2e9d183a6a7923fb59ac134ce3b3a3a96e35..24ab12136ff396bd9ac37bb058311b0d2d6f2515 100644
--- a/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
+++ b/tensorflow/compiler/xla/tests/triangular_solve_test.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
-
 #include <memory>
 #include <numeric>
 #include <vector>
@@ -54,6 +52,20 @@ Array2D<float> AValsUpper() {
           {kNan, kNan, kNan, 11}};
 }
 
+Array2D<float> AValsLowerUnitDiagonal() {
+  return {{kNan, kNan, kNan, kNan},
+          {3, kNan, kNan, kNan},
+          {4, 7, kNan, kNan},
+          {5, 8, 10, kNan}};
+}
+
+Array2D<float> AValsUpperUnitDiagonal() {
+  return {{kNan, 3, 4, 5},
+          {kNan, kNan, 7, 8},
+          {kNan, kNan, kNan, 10},
+          {kNan, kNan, kNan, kNan}};
+}
+
 Array2D<float> BValsRight() {
   return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
 }
@@ -96,8 +108,8 @@ XLA_TEST_F(TriangularSolveTest, EmptyArrays) {
       CreateR2Parameter<float>(Array2D<float>(0, 10), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/true, /*lower=*/true,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
 
   ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 10),
                              {a_data.get(), b_data.get()});
@@ -111,8 +123,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTranspose) {
   auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/false, /*lower=*/true,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
 
   Array2D<float> expected({
       {0.5, 0.08333334, 0.04629629, 0.03367003},
@@ -132,8 +144,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerNotranspose) {
   auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/false, /*lower=*/true,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
 
   Array2D<float> expected({
       {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
@@ -153,8 +165,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightUpperTranspose) {
   auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/false, /*lower=*/false,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
 
   Array2D<float> expected({
       {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
@@ -174,8 +186,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightUpperNotranspose) {
   auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/false, /*lower=*/false,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
 
   Array2D<float> expected({
       {0.5, 0.08333334, 0.04629629, 0.03367003},
@@ -195,8 +207,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) {
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/true, /*lower=*/true,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
 
   Array2D<float> expected({
       {-0.89646465, -0.69444444, -0.49242424},
@@ -217,8 +229,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) {
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/true, /*lower=*/true,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
 
   Array2D<float> expected({
       {0.5, 1.0, 1.5},
@@ -231,6 +243,25 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) {
                              ErrorSpec(1e-2, 1e-2));
 }
 
+XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNoTransposeUnitDiagonal) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data =
+      CreateR2Parameter<float>(AValsLowerUnitDiagonal(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/true,
+                  /*unit_diagonal=*/true,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
+
+  Array2D<float> expected(
+      {{1., 2., 3.}, {1., -1., -3.}, {-4., 7., 18.}, {37., -61., -159.}});
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
 XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotransposeIrregularblock) {
   XlaBuilder builder(TestName());
 
@@ -239,8 +270,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotransposeIrregularblock) {
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/true, /*lower=*/true,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/3);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
 
   Array2D<float> expected({
       {0.5, 1.0, 1.5},
@@ -261,8 +292,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) {
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/true, /*lower=*/false,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
 
   Array2D<float> expected({
       {0.5, 1.0, 1.5},
@@ -283,8 +314,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) {
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/true, /*lower=*/false,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
 
   Array2D<float> expected({
       {-0.89646465, -0.69444444, -0.49242424},
@@ -297,6 +328,27 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) {
                              ErrorSpec(1e-2, 1e-2));
 }
 
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotransposeUnitDiagonal) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data =
+      CreateR2Parameter<float>(AValsUpperUnitDiagonal(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/false,
+                  /*unit_diagonal=*/true,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
+
+  Array2D<float> expected({{-1402., -1538., -1674.},
+                           {575., 631., 687.},
+                           {-93., -102., -111.},
+                           {10., 11., 12.}});
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
 XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) {
   XlaBuilder builder(TestName());
 
@@ -307,8 +359,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) {
       CreateR2Parameter<complex64>(BValsRightComplex(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/false, /*lower=*/true,
-                  /*transpose_a=*/true, /*conjugate_a=*/true,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::ADJOINT);
 
   Array2D<complex64> expected({
       {0.5, complex64(0.08333333, 0.08333333),
@@ -333,8 +385,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) {
       CreateR2Parameter<complex64>(BValsLeftComplex(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/true, /*lower=*/false,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
 
   Array2D<complex64> expected({
       {0.5, 1., 1.5},
@@ -368,11 +420,12 @@ XLA_TEST_F(TriangularSolveTest, BatchedLeftUpper) {
   XlaOp a, b;
   auto a_data = CreateR3Parameter<float>(avals, 0, "a", &builder, &a);
   auto b_data = CreateR3Parameter<float>(bvals, 1, "b", &builder, &b);
-  BatchDot(ConstantR3FromArray3D(&builder, avals),
-           TriangularSolve(a, b,
-                           /*left_side=*/true, /*lower=*/false,
-                           /*transpose_a=*/false, /*conjugate_a=*/false,
-                           /*block_size=*/2));
+  BatchDot(
+      ConstantR3FromArray3D(&builder, avals),
+      TriangularSolve(a, b,
+                      /*left_side=*/true, /*lower=*/false,
+                      /*unit_diagonal=*/false,
+                      /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE));
 
   ComputeAndCompareR3<float>(&builder, bvals, {a_data.get(), b_data.get()},
                              ErrorSpec(1e-2, 1e-2));
@@ -382,7 +435,7 @@ struct TriangularSolveTestSpec {
   int m, n;  // A is mxm, B is mxn
   bool left_side;
   bool lower;
-  bool transpose_a;
+  TriangularSolveOptions::Transpose transpose_a;
 };
 
 class TriangularSolveParametricTest
@@ -408,11 +461,11 @@ XLA_TEST_P(TriangularSolveParametricTest, Random) {
   XlaOp a, b;
   auto a_data = CreateR2Parameter<float>(avals, 0, "a", &builder, &a);
   auto b_data = CreateR2Parameter<float>(bvals, 1, "b", &builder, &b);
-  auto x = TriangularSolve(a, b, spec.left_side, spec.lower, spec.transpose_a,
-                           /*conjugate_a=*/false,
-                           /*block_size=*/3);
+  auto x = TriangularSolve(a, b, spec.left_side, spec.lower,
+                           /*unit_diagonal=*/false, spec.transpose_a);
   auto a_tri = Triangle(a, spec.lower);
-  a_tri = MaybeTransposeInMinorDims(a_tri, spec.transpose_a);
+  a_tri = MaybeTransposeInMinorDims(
+      a_tri, spec.transpose_a != TriangularSolveOptions::NO_TRANSPOSE);
   if (spec.left_side) {
     BatchDot(a_tri, x);
   } else {
@@ -429,7 +482,9 @@ std::vector<TriangularSolveTestSpec> TriangularSolveTests() {
     for (int n : {5, 10}) {
       for (bool left_side : {false, true}) {
         for (bool lower : {false, true}) {
-          for (bool transpose_a : {false, true}) {
+          for (TriangularSolveOptions::Transpose transpose_a :
+               {TriangularSolveOptions::NO_TRANSPOSE,
+                TriangularSolveOptions::TRANSPOSE}) {
             specs.push_back({m, n, left_side, lower, transpose_a});
           }
         }
diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc
index 4fbd7f2fb174ac899c1e3b23801986cb52db96a2..c51f30f3b5db95962a719ec226dd03f41142a782 100644
--- a/tensorflow/compiler/xla/tests/unary_op_test.cc
+++ b/tensorflow/compiler/xla/tests/unary_op_test.cc
@@ -64,7 +64,9 @@ class UnaryOpTest : public ClientLibraryTestBase {
         &builder, {-2, 25, 0, static_cast<T>(-0.0), -123, inf<T>(), -inf<T>()});
     Sign(arg);
 
-    ComputeAndCompareR1<T>(&builder, {-1, 1, 0, 0, -1, 1, -1}, {});
+    ComputeAndCompareR1<T>(
+        &builder,
+        {-1, 1, static_cast<T>(+0.0), static_cast<T>(-0.0), -1, 1, -1}, {});
   }
 
   template <typename T>
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index c7337e8caae8f2ee25f4b25dc22439e08d2ecc25..7b7b8f5d02dc99607b30f898e18c5b448d421e07 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -40,8 +40,6 @@ limitations under the License.
 namespace xla {
 namespace {
 
-namespace gtl = ::tensorflow::gtl;
-
 class HloProfileTest : public ClientLibraryTestBase {};
 
 struct ParsedProfileOutputLine {
diff --git a/tensorflow/compiler/xla/text_literal_writer.cc b/tensorflow/compiler/xla/text_literal_writer.cc
index 7289ae7df65e56652eeeb67e536e4c721d97d999..fc7949d889dc8ed9fac425982cc555a6c42a7f1d 100644
--- a/tensorflow/compiler/xla/text_literal_writer.cc
+++ b/tensorflow/compiler/xla/text_literal_writer.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 52fee4770ab940741723514d742e998b25765f24..ebd4bb1e42c9d1dc1f72a75514e916a2d900c30e 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -177,26 +177,6 @@ tf_cc_binary(
     ],
 )
 
-tf_cc_binary(
-    name = "dumped_computation_to_tf_graphdef",
-    srcs = ["dumped_computation_to_tf_graphdef.cc"],
-    deps = [
-        "//tensorflow/compiler/xla:debug_options_flags",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla/client",
-        "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/service",
-        "//tensorflow/compiler/xla/service:hlo_graph_dumper",
-        "//tensorflow/compiler/xla/service:hlo_proto",
-        "//tensorflow/compiler/xla/service:interpreter_plugin",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
 tf_cc_binary(
     name = "hlo_proto_to_json",
     srcs = ["hlo_proto_to_json.cc"],
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
index 4375e7c138c9e8d193feaa7a39d63946c4ea3086..df2d3d18b9ff86c0dd2047c2415527aeb1c1f154 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index 723569862c7550387e95003e3a673743464b67b8..35bb82ca22f46d2cdeaac3b9a87b253efe9a07d9 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
deleted file mode 100644
index f8bb9a6b1e217fc4e6e15c8a3302be61ed339c82..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Usage: dumped_computation_to_tf_graph some_binary_snapshot_proto*
-//
-// Dumps a tensorflow GraphDef in text format for a snapshot computation. The
-// dumped graph is an HLO computation with HLO instructions as nodes and can be
-// visualized on Tensorboard. Upload the dumped files on Tensorboard.
-//
-// some_binary_snapshot_proto is obtained by serializing the SessionModule from
-// ServiceInterface::SnapshotComputation to disk.
-
-#include <stdio.h>
-#include <memory>
-#include <string>
-
-#include "absl/types/span.h"
-#include "tensorflow/compiler/xla/client/client.h"
-#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/debug_options_flags.h"
-#include "tensorflow/compiler/xla/service/hlo.pb.h"
-#include "tensorflow/compiler/xla/service/service.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/platform/logging.h"
-
-using tensorflow::Env;
-
-namespace xla {
-namespace tools {
-
-void RealMain(absl::Span<char* const> args) {
-  Client* client = ClientLibrary::LocalClientOrDie();
-  for (char* arg : args) {
-    HloSnapshot module;
-    TF_CHECK_OK(
-        tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
-    XlaComputation computation =
-        client->LoadSnapshot(module).ConsumeValueOrDie();
-    DebugOptions debug_options = GetDebugOptionsFromFlags();
-    debug_options.set_xla_generate_hlo_graph(".*");
-    debug_options.set_xla_hlo_dump_as_graphdef(true);
-    ComputationStats stats =
-        client->GetComputationStats(computation, debug_options)
-            .ConsumeValueOrDie();
-    fprintf(stdout, ">>> %s :: %s\n", arg, stats.DebugString().c_str());
-  }
-}
-
-}  // namespace tools
-}  // namespace xla
-
-int main(int argc, char** argv) {
-  std::vector<tensorflow::Flag> flag_list;
-  xla::AppendDebugOptionsFlags(&flag_list);
-  xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
-  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
-  if (!parse_result) {
-    LOG(ERROR) << "\n" << usage;
-    return 2;
-  }
-
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
-
-  absl::Span<char* const> args(argv, argc);
-  args.remove_prefix(1);  // Pop off the binary name, argv[0]
-  xla::tools::RealMain(args);
-  return 0;
-}
diff --git a/tensorflow/compiler/xla/tools/interactive_graphviz.cc b/tensorflow/compiler/xla/tools/interactive_graphviz.cc
index ac865707f8697e0b94173a2a33e7be52a9564867..0c7c078b9b9d30427cb01b8930bd012046d852d3 100644
--- a/tensorflow/compiler/xla/tools/interactive_graphviz.cc
+++ b/tensorflow/compiler/xla/tools/interactive_graphviz.cc
@@ -139,9 +139,10 @@ HloComputation* FindComputation(const HloModule& module,
 // Print a help message describing the various available commands.
 void DoHelpCommand() {
   std::cout << R"(Commands:
-  <instruction> [<width>]
-    Renders a neighborhood of <width> nodes around <instruction>.  If <width>
-    is not provided, the default value is )"
+  <instruction> [<width>] [/ <boundary_instruction>+]
+    Renders a neighborhood of <width> nodes around <instruction>, without going
+    beyond the optional boundary instructions.  If <width> is not provided, 
+    the default value is )"
             << kDefaultWidth << R"(.
   allpaths <instruction> <instruction> [<n>]
     Renders a subset of all paths from one instruction to the other.  Either
@@ -457,12 +458,6 @@ void DoAllPathsCommand(const Options& opts, const HloModule& module,
 // Plot a given instruction neighborhood or computation with graphviz.
 void DoPlotCommand(const Options& opts, const HloModule& module,
                    const std::vector<string>& tokens) {
-  if (tokens.size() > 2) {
-    std::cerr << R"(Illegal input.  Enter e.g. "%fusion.1 42" or "%fusion.1".)"
-              << std::endl;
-    return;
-  }
-
   string node_name = tokens[0];
 
   // Find the node with the given name.
@@ -475,16 +470,43 @@ void DoPlotCommand(const Options& opts, const HloModule& module,
   }
 
   uint64 graph_width = kDefaultWidth;
-  if (tokens.size() == 2) {
+  absl::flat_hash_set<const HloInstruction*> boundary;
+  if (tokens.size() >= 2) {
     if (comp) {
       std::cerr << "Can only use graph-size parameter with instructions, but "
                 << node_name << " is a computation." << std::endl;
       return;
     }
-    if (!absl::SimpleAtoi(tokens[1], &graph_width)) {
-      std::cerr << "Can't parse '" << tokens[1] << "' as an integer."
-                << std::endl;
-      return;
+
+    int bound_index = 1;
+    // Get the <width> if present.
+    if (absl::SimpleAtoi(tokens[bound_index], &graph_width)) {
+      bound_index++;
+    } else {
+      // <width> not found, need to reset graph_width.
+      graph_width = kDefaultWidth;
+    }
+    // Get the '/'.
+    if (bound_index < tokens.size()) {
+      // This token must be a '/'.
+      if (tokens[bound_index] != "/") {
+        std::cerr << "Expect a /, but get a '" << tokens[bound_index] << "'."
+                  << std::endl;
+        return;
+      }
+      bound_index++;
+    }
+    // Get the boundary nodes.
+    while (bound_index < tokens.size()) {
+      string bnode_name = tokens[bound_index];
+      const HloInstruction* binstr = FindInstruction(module, bnode_name);
+      if (!binstr) {
+        std::cerr << "Couldn't find HloInstruction named " << bnode_name << "."
+                  << std::endl;
+        return;
+      }
+      boundary.insert(binstr);
+      bound_index++;
     }
   }
 
@@ -496,7 +518,9 @@ void DoPlotCommand(const Options& opts, const HloModule& module,
         /*show_backend_config=*/show_backend_config));
   } else {
     DisplayGraphHandle(opts, hlo_graph_dumper::DumpNeighborhoodAround(
-        *instr, graph_width, /*show_backend_config=*/show_backend_config));
+                                 *instr, graph_width,
+                                 /*show_backend_config=*/show_backend_config,
+                                 /*boundary=*/boundary));
   }
 }
 
@@ -515,7 +539,7 @@ void InteractiveDumpGraphs(const Options& opts, const HloModule& module) {
                 << std::endl;
       continue;
     }
-    std::vector<string> tokens = absl::StrSplit(line, ' ');
+    std::vector<string> tokens = absl::StrSplit(line, ' ', absl::SkipEmpty());
     if (tokens[0] == "quit" || tokens[0] == "exit") {
       break;
     } else if (tokens[0] == "help") {
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index 21217c23f6561a509cb3e30bf3dc841f8dc5db87..d66561315b4ad7a5e3f1f7b1bc1e557b71da6705 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -102,8 +102,9 @@ StatusOr<std::unique_ptr<LocalExecutable>> CompileExecutable(
     argument_layouts.push_back(Shape(param));
     argument_layout_ptrs.push_back(&argument_layouts.back());
   }
-  return client->Compile(computation, argument_layout_ptrs,
-                         ExecutableBuildOptions());
+  ExecutableBuildOptions exec_build_options;
+  *exec_build_options.mutable_debug_options() = GetDebugOptionsFromFlags();
+  return client->Compile(computation, argument_layout_ptrs, exec_build_options);
 }
 
 absl::optional<Shape> GetXfeedShape(bool is_infeed,
@@ -328,7 +329,10 @@ StatusOr<HloSnapshot> ParseInputFile(const string& filename,
   fprintf(stderr, "%s: is not HloProto. Trying HLO text.\n", filename.c_str());
   string contents;
   TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(env, filename, &contents));
-  StatusOr<std::unique_ptr<HloModule>> module = ParseHloString(contents);
+  HloModuleConfig config;
+  config.set_debug_options(GetDebugOptionsFromFlags());
+  StatusOr<std::unique_ptr<HloModule>> module =
+      ParseHloString(contents, config);
   if (module.ok()) {
     *snapshot.mutable_hlo()->mutable_hlo_module() =
         module.ValueOrDie()->ToProto();
diff --git a/tensorflow/compiler/xla/tools/show_signature.cc b/tensorflow/compiler/xla/tools/show_signature.cc
index cdf306dfd1027cf6022c5d8ae844b4308f580e8d..b80d0db8d812380d8144713109d1c05168713c77 100644
--- a/tensorflow/compiler/xla/tools/show_signature.cc
+++ b/tensorflow/compiler/xla/tools/show_signature.cc
@@ -37,7 +37,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index 34b73b5206fa20d6dff7567afd78fd89897c8c33..bb8bbf57c4252b16836553334901a3c896a17f39 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <stdarg.h>
 #include <numeric>
 
+#include "absl/container/inlined_vector.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
@@ -80,13 +81,9 @@ bool IsPermutation(absl::Span<const int64> permutation, int64 rank) {
   if (rank != permutation.size()) {
     return false;
   }
-  std::vector<int64> output(permutation.size(), -1);
-  for (auto index : permutation) {
-    CHECK_GE(index, 0);
-    CHECK_LT(index, rank);
-    output[index] = 0;
-  }
-  return !absl::c_linear_search(output, -1);
+  absl::InlinedVector<int64, 8> trivial_permutation(rank);
+  absl::c_iota(trivial_permutation, 0);
+  return absl::c_is_permutation(permutation, trivial_permutation);
 }
 
 std::vector<int64> InversePermutation(
diff --git a/tensorflow/compiler/xla/xla.bzl b/tensorflow/compiler/xla/xla.bzl
index 60adea5a4a242e5843b41927ba77c197e8fac444..cda2d7c7c6b2403868f6d01a485753fa29a8d95f 100644
--- a/tensorflow/compiler/xla/xla.bzl
+++ b/tensorflow/compiler/xla/xla.bzl
@@ -17,7 +17,9 @@ def xla_proto_library(name, srcs = [], deps = [], visibility = None, testonly =
     cc_proto_library(
         name = name,
         srcs = srcs,
-        deps = deps,
+        # Append well-known proto dep. As far as I know this is the only way
+        # for xla_proto_library to access google.protobuf.{Any,Duration,...}.
+        deps = deps + ["@protobuf_archive//:cc_wkt_protos"],
         cc_libs = if_static(
             ["@protobuf_archive//:protobuf"],
             otherwise = ["@protobuf_archive//:protobuf_headers"],
@@ -28,6 +30,11 @@ def xla_proto_library(name, srcs = [], deps = [], visibility = None, testonly =
         **kwargs
     )
 
+def xla_py_proto_library(**kwargs):
+    # Note: we don't currently define a proto library target for Python in OSS.
+    _ignore = kwargs
+    pass
+
 def xla_py_grpc_library(**kwargs):
     # Note: we don't currently define any special targets for Python GRPC in OSS.
     _ignore = kwargs
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 92834dbb02cdcd6383ceec3ffd079834b163ee6a..925fcbf88c1e8dd81ab1339d292e05eae52e0d13 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -15,11 +15,11 @@ limitations under the License.
 
 syntax = "proto3";
 
-import "tensorflow/compiler/xla/xla_data.proto";
-import "tensorflow/compiler/xla/service/hlo.proto";
-
 package xla;
 
+import "tensorflow/compiler/xla/service/hlo.proto";
+import "tensorflow/compiler/xla/xla_data.proto";
+
 // Options for the HLO insert-reduce-precision-operations pass.
 message HloReducePrecisionOptions {
   // Where and when the reduce-precision operations will be added.
@@ -72,8 +72,7 @@ message DebugOptions {
   // Path to dump HLO graphs to.
   string xla_hlo_graph_path = 4;
 
-  // Dump HLO graphs as TensorFlow GraphDefs.
-  bool xla_hlo_dump_as_graphdef = 5;
+  reserved 5;  // Was xla_hlo_dump_as_graphdef
 
   // HLO modules matching this regex will be dumped to LOG(INFO). Set to ".*" to
   // dump *all* HLO modules.
@@ -171,9 +170,7 @@ message DebugOptions {
   // HLO graph.
   bool xla_hlo_graph_sharding_color = 92;
 
-  // Prefix the name scopes of the TF graph exports with "devX" device
-  // assignments, if available.
-  bool xla_hlo_tfgraph_device_scopes = 93;
+  reserved 93;  // Was xla_hlo_tfgraph_device_scopes
 
   // If true, the GPU backend is free to use cudnn for HLO batch normalization
   // ops.
@@ -234,7 +231,23 @@ message DebugOptions {
   // versions of DynamicSlice and DynamicUpdateSlice. Only used for testing.
   bool xla_allow_scalar_index_dynamic_ops = 107;
 
-  // Next id: 108
+  enum StepMarkerLocation {
+    // Generate step mark at each iteration of top level while loop, which
+    // is assumed to be a training loop. This is the default.
+    STEP_MARK_AT_ENTRY = 0;
+    // Generate step mark at program entry. This handles the case where each
+    // step are done by one or multiple programs execution. Only the first
+    // program will be tagged for generating step mark at program entry.
+    STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP = 1;
+    // No step mark.
+    STEP_MARK_NONE = 2;
+  }
+  // Option to emit a target-specific marker to indicate the start of a training
+  // step. The location of the marker (if any) is determined by the option
+  // value.
+  StepMarkerLocation xla_step_marker_location = 108;
+
+  // Next id: 109
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
@@ -306,8 +319,7 @@ message TransferToInfeedRequest {
   DeviceHandle device_handle = 3;
 }
 
-message TransferToInfeedResponse {
-}
+message TransferToInfeedResponse {}
 
 message TransferFromOutfeedRequest {
   // This optional field directs the service to return the literal in this
@@ -326,8 +338,7 @@ message ResetDeviceRequest {
   DeviceHandle device_handle = 1;
 }
 
-message ResetDeviceResponse {
-}
+message ResetDeviceResponse {}
 
 message ComputationGraphStatsRequest {
   HloModuleProto computation = 1;
@@ -350,8 +361,7 @@ message UnregisterRequest {
   repeated GlobalDataHandle data = 1;
 }
 
-message UnregisterResponse {
-}
+message UnregisterResponse {}
 
 message CompileRequest {
   // The graph to be compiled.
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index a64e2f5df5cacca05e83f31c941c57abd5ccf4de..226299a7186ef0acb41f6d01fdeffeee06f13d4d 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -545,6 +545,26 @@ enum RandomDistribution {
   // Next: 4
 }
 
+message TriangularSolveOptions {
+  // If true, solves ax = b. If false, solves xa = b.
+  bool left_side = 1;
+
+  // If true, 'a' is lower triangular. If false, 'a' is upper triangular.
+  bool lower = 2;
+
+  // If true, the diagonal elements of 'a' are assumed to be 1 and not accessed.
+  bool unit_diagonal = 3;
+
+  // Should we transpose or use the adjoint of 'a'?
+  enum Transpose {
+    TRANSPOSE_INVALID = 0;
+    NO_TRANSPOSE = 1;  // Don't transpose 'a'.
+    TRANSPOSE = 2;     // Transpose 'a'.
+    ADJOINT = 3;       // Complex conjugate and transpose 'a'.
+  };
+  Transpose transpose_a = 4;
+}
+
 message OpSharding {
   enum Type {
     // This sharding is replicated across all devices (implies maximal,
@@ -604,3 +624,15 @@ message PrecisionConfig {
 
   // Next: 2
 }
+
+// Describes whether all data-parallelism replicas will receive the same
+// parameter data at each buffer.
+message ParameterReplication {
+  // A list of boolean values for the flattened leaf buffers. Each value
+  // indicates whether the corresponding leaf buffer is replicated.
+  //
+  // If this field is empty, it means no buffer is replicated. Otherwise, the
+  // number of elements in this field must match the number of leaf buffers in
+  // the HLO instruction's shape.
+  repeated bool replicated_at_leaf_buffers = 1;
+}
diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD
index 2dae746d034a1bf52e84de74dfb0c6e23aaed4d1..b2718c5c283358d98da175a8d3b21bb1f2b01c75 100644
--- a/tensorflow/compiler/xrt/BUILD
+++ b/tensorflow/compiler/xrt/BUILD
@@ -11,9 +11,15 @@ package(
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_py_library",
     "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
 )
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_py",
+)
 
 xla_proto_library(
     name = "xrt_proto",
@@ -27,6 +33,12 @@ xla_proto_library(
     ],
 )
 
+tf_proto_library_py(
+    name = "xrt_proto",  # bzl adds a _py suffix
+    srcs = ["xrt.proto"],
+    visibility = ["//visibility:public"],
+)
+
 cc_library(
     name = "xrt_utils",
     srcs = [
@@ -78,6 +90,25 @@ tf_gen_op_libs(
     ],
 )
 
+tf_gen_op_wrapper_py(
+    name = "xrt_ops_wrapper_py",
+    out = "xrt_ops.py",
+    deps = [
+        ":xrt_compile_ops_op_lib",
+        ":xrt_execute_op_op_lib",
+        ":xrt_state_ops_op_lib",
+    ],
+)
+
+tf_custom_op_py_library(
+    name = "xrt_ops",
+    kernels = ["//tensorflow/compiler/xrt/kernels:xrt_ops"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":xrt_ops_wrapper_py",
+    ],
+)
+
 cc_library(
     name = "xrt_server",
     visibility = ["//visibility:public"],
diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD
index dc02fd272fd8700c7f8fa64adf7ab57c88bab706..1e325191bba828e3d5e4599f87dcf4f4d0674945 100644
--- a/tensorflow/compiler/xrt/kernels/BUILD
+++ b/tensorflow/compiler/xrt/kernels/BUILD
@@ -51,7 +51,10 @@ cc_library(
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xrt:xrt_compile_ops_op_lib",
+        "//tensorflow/compiler/xrt:xrt_execute_op_op_lib",
         "//tensorflow/compiler/xrt:xrt_proto",
+        "//tensorflow/compiler/xrt:xrt_state_ops_op_lib",
         "//tensorflow/compiler/xrt:xrt_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
index 2ee1a6cd1aebcdbd65892b33e5044489070ab5c4..b791519c09758a4f4124c95add5351a9433ecb8f 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
@@ -68,9 +68,11 @@ class XRTCompileOp : public OpKernel {
 
 Status CompilationCacheKey(const xrt::XLAComputation& computation,
                            string* key) {
-  string serialized;
-  TF_RET_CHECK(SerializeToStringDeterministic(computation, &serialized));
-  uint64 fingerprint = Fingerprint64(serialized);
+  const size_t size = computation.ByteSizeLong();
+  auto serialized = absl::make_unique<char[]>(size);
+  TF_RET_CHECK(
+      SerializeToBufferDeterministic(computation, serialized.get(), size));
+  uint64 fingerprint = Fingerprint64(absl::string_view(serialized.get(), size));
   *key = absl::StrCat(fingerprint);
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
index 116c193cab65410a5a7c3058f98cc2be2cbe9e67..42ef88168af4b6f391ffc2e69ab4c4000d1cbee1 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/compiler/xrt/xrt.pb.h"
 #include "tensorflow/compiler/xrt/xrt_compilation_cache.h"
 #include "tensorflow/compiler/xrt/xrt_device.h"
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
index 6a7f10652533920ba3fa48fba1d5161f7c4d4530..343f43b7159b55bad184eed2cada55c76085ffa0 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
@@ -122,6 +122,17 @@ REGISTER_KERNEL_BUILDER(Name("XRTReadLiteralAndRelease")
                             .HostMemory("literal"),
                         XRTReadLiteralOp<true, XRTGenericDeviceAccessor>);
 
+REGISTER_KERNEL_BUILDER(Name("XRTReadToTensor")
+                            .Device(DEVICE_XLA_GPU)
+                            .HostMemory("handles")
+                            .HostMemory("tensors"),
+                        XRTReadToTensorOp<XRTGenericDeviceAccessor>);
+REGISTER_KERNEL_BUILDER(Name("XRTReadToTensor")
+                            .Device(DEVICE_XLA_CPU)
+                            .HostMemory("handles")
+                            .HostMemory("tensors"),
+                        XRTReadToTensorOp<XRTGenericDeviceAccessor>);
+
 REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllocationHandle")
                             .Device(DEVICE_XLA_GPU)
                             .HostMemory("handle"),
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
index e2c223b3dbb2311d0f42e1a36e316fd9d5f66040..6af73ecc85351a9b38ba526db076e9176d1cb2f1 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
@@ -25,6 +25,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
@@ -40,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
@@ -215,27 +217,29 @@ class XRTAllocateFromTensorOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("shapes", &tf_shapes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("dtypes", &dtypes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("make_tuple", &make_tuple));
+    std::vector<int64> minor_to_major;
     if (ctx->HasAttr("layouts")) {
-      OP_REQUIRES_OK(ctx, ctx->GetAttr("layouts", &minor_to_major_));
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("layouts", &minor_to_major));
     }
     OP_REQUIRES(
         ctx, tf_shapes_.size() == dtypes_.size(),
         errors::InvalidArgument("shapes and dtypes must be the same length"));
     std::vector<xla::Shape> xla_shapes;
+    xla_shapes.reserve(tf_shapes_.size());
     for (int i = 0; i < tf_shapes_.size(); i++) {
       xla::Shape xla_shape;
       OP_REQUIRES_OK(
           ctx, TensorShapeToXLAShape(dtypes_[i], tf_shapes_[i], &xla_shape));
-      xla_shapes.push_back(xla_shape);
+      xla_shapes.push_back(std::move(xla_shape));
     }
     if (xla_shapes.size() > 1 || make_tuple) {
       shape_ = xla::ShapeUtil::MakeTupleShape(xla_shapes);
     } else {
       shape_.Swap(&xla_shapes.front());
     }
-    if (!minor_to_major_.empty()) {
+    if (!minor_to_major.empty()) {
       xla::Shape shape_with_layouts;
-      OP_REQUIRES_OK(ctx, GetShapeWithLayout(shape_, minor_to_major_,
+      OP_REQUIRES_OK(ctx, GetShapeWithLayout(shape_, minor_to_major,
                                              /*layout_func=*/nullptr,
                                              &shape_with_layouts));
       shape_.Swap(&shape_with_layouts);
@@ -304,7 +308,6 @@ class XRTAllocateFromTensorOp : public OpKernel {
  private:
   std::vector<TensorShape> tf_shapes_;
   DataTypeVector dtypes_;
-  std::vector<int64> minor_to_major_;
   xla::Shape shape_;
 };
 
@@ -487,7 +490,7 @@ class XRTReadLiteralOp : public OpKernel {
     OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(
                             ctx, allocation->device_ordinal(), &device_ref));
 
-    xla::Literal literal;
+    xla::Literal literal(allocation->on_host_shape());
     OP_REQUIRES_OK(
         ctx, allocation->ToLiteral(device_ref.backend(),
                                    device_ref.device_ordinal(), &literal));
@@ -499,6 +502,96 @@ class XRTReadLiteralOp : public OpKernel {
   }
 };
 
+// Op that reads a device-resident tuple to host memory and returns it as a
+// literal.
+template <class DeviceAccessor>
+class XRTReadToTensorOp : public OpKernel {
+ public:
+  explicit XRTReadToTensorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("release_handles", &discard_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtypes", &dtypes_));
+  }
+  ~XRTReadToTensorOp() override = default;
+  XRTReadToTensorOp(const XRTReadToTensorOp&) = delete;
+  XRTReadToTensorOp& operator=(const XRTReadToTensorOp&) = delete;
+
+  void Compute(OpKernelContext* ctx) override {
+    VLOG(1) << "XRTReadToTensorOp::Compute";
+
+    const Tensor& handle_tensor = ctx->input(0);
+    // TODO(phawkins,dlibenzi): accept multiple handles (i.e., vectors, not
+    // just scalars.)
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(handle_tensor.shape()),
+        errors::Internal("computation input should be an int64 scalar"));
+    int64 allocation_handle = handle_tensor.scalar<int64>()();
+
+    ResourceMgr* rm;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
+
+    XRTTupleAllocation* allocation;
+    OP_REQUIRES_OK(
+        ctx, XRTTupleAllocation::Lookup(rm, allocation_handle, &allocation));
+    core::ScopedUnref allocation_unref(allocation);
+
+    if (discard_) {
+      VLOG(2) << "Releasing handle " << allocation_handle;
+      OP_REQUIRES_OK(ctx, XRTTupleAllocation::DeleteFromResourceManager(
+                              rm, allocation_handle));
+    }
+
+    // We are guaranteed that the underlying device object won't be deleted out
+    // from under us, while the ScopedRef is live.
+    class DeviceAccessor::ScopedRef device_ref;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(
+                            ctx, allocation->device_ordinal(), &device_ref));
+
+    xla::Shape shape = allocation->on_host_shape();
+    int output = 0;
+    Status status = xla::ShapeUtil::ForEachMutableSubshapeWithStatus(
+        &shape,
+        [&](xla::Shape* subshape, const xla::ShapeIndex& index) -> Status {
+          if (subshape->IsTuple()) return Status::OK();
+
+          xla::PrimitiveType xla_type;
+          TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(
+              ctx->expected_output_dtype(output), &xla_type));
+          if (xla_type != subshape->element_type()) {
+            return errors::InvalidArgument(
+                "Type mismatch between buffer type (", subshape->ToString(),
+                ") and tensor type (",
+                DataTypeString(ctx->expected_output_dtype(output)),
+                ") for output tensor ", output);
+          }
+
+          TensorShape output_shape;
+          TF_RETURN_IF_ERROR(XLAShapeToTensorShape(*subshape, &output_shape));
+
+          Tensor* output_tensor;
+          TF_RETURN_IF_ERROR(
+              ctx->allocate_output(output, output_shape, &output_tensor));
+
+          XRTTupleAllocation* sub;
+          TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer(
+              allocation, index, &sub, /*alias_parent_allocation=*/true));
+          core::ScopedUnref sub_unref(sub);
+
+          xla::MutableBorrowingLiteral literal;
+          TF_RETURN_IF_ERROR(HostTensorToMutableBorrowingLiteral(
+              xla::LayoutUtil::GetWithDefaultLayout(*subshape), output_tensor,
+              &literal));
+          TF_RETURN_IF_ERROR(sub->ToLiteral(
+              device_ref.backend(), device_ref.device_ordinal(), &literal));
+
+          ++output;
+          return Status::OK();
+        });
+    OP_REQUIRES_OK(ctx, status);
+  }
+  bool discard_;
+  DataTypeVector dtypes_;
+};
+
 // Op that writes a new literal value into device-resident memory.
 template <class DeviceAccessor>
 class XRTWriteLiteralOp : public OpKernel {
diff --git a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
index 2e743fec4963a52ee1abf64525f26e3d89479670..8832270fb2730d1ba64fa069b38f4a04b61773ef 100644
--- a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
@@ -151,6 +151,27 @@ releases the handle.
 'literal' is a serialized xla::LiteralProto proto.
 )");
 
+REGISTER_OP("XRTReadToTensor")
+    .Input("handles: int64")
+    .Attr("release_handles: bool = False")
+    .Attr("dtypes: list(type)")
+    .Output("tensors: dtypes")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape)
+    .Doc(
+        R"(
+Copies allocated values from device memory and returns them as zero or more
+Tensors. If a handle refers to a non-tuple buffer, a single tensor is returned.
+In general, the tensors returned for a handle correspond to an in-order traversal
+of a the tuple-tree value referenced by the handle.
+
+'handles' contains ids returned from Ops that produced on-device allocations.
+At present, only a single (scalar) handle is supported.
+'dtypes' are the expected types for each `Tensor` to be returned. If the
+expected and actual tensor types do not match, an error is returned.
+'release_handles': if True, `handles` are released.
+'tensors' are the output Tensors.
+)");
+
 REGISTER_OP("XRTReleaseAllocationHandle")
     .Input("handle: int64")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc
index 1e2a9584f88b73d7c92a929e93af60376a59170b..1b3bcbea4c1228944a6604fc923228024e74d700 100644
--- a/tensorflow/compiler/xrt/xrt_state.cc
+++ b/tensorflow/compiler/xrt/xrt_state.cc
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -221,7 +220,7 @@ XRTTupleAllocation::~XRTTupleAllocation() {
 }
 
 Status XRTTupleAllocation::ToLiteral(xla::Backend* backend, int device_ordinal,
-                                     xla::Literal* literal) {
+                                     xla::MutableLiteralBase* literal) {
   auto transfer_manager = backend->transfer_manager();
   TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal));
 
@@ -235,9 +234,8 @@ Status XRTTupleAllocation::ToLiteral(xla::Backend* backend, int device_ordinal,
                                      " has been released");
     }
   }
-  TF_ASSIGN_OR_RETURN(*literal, transfer_manager->TransferLiteralFromDevice(
-                                    stream.get(), shaped_buffer));
-  return Status::OK();
+  return transfer_manager->TransferLiteralFromDevice(stream.get(),
+                                                     shaped_buffer, *literal);
 }
 
 Status XRTTupleAllocation::WriteLiteral(xla::Backend* backend,
diff --git a/tensorflow/compiler/xrt/xrt_state.h b/tensorflow/compiler/xrt/xrt_state.h
index ddf2656e6f51775024a6d1cd0d7a387605faae6f..6519da30d02e41da5a862cadd2133bd8dd8b42d7 100644
--- a/tensorflow/compiler/xrt/xrt_state.h
+++ b/tensorflow/compiler/xrt/xrt_state.h
@@ -147,7 +147,7 @@ class XRTTupleAllocation : public ResourceBase {
 
   // Copies the allocation from device to host and returns it in literal.
   Status ToLiteral(xla::Backend* backend, int device_ordinal,
-                   xla::Literal* literal);
+                   xla::MutableLiteralBase* literal);
 
   // Write a new literal value to the allocation.
   Status WriteLiteral(xla::Backend* backend, const xla::Literal& literal);
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 25f2640e35af5f65eab25dc60c44e3ed7ce4e512..0173b8bb064c7b2fb8a0df018204515b24cfa718 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -218,7 +218,6 @@ cc_library(
         "//tensorflow/contrib/tensor_forest:stats_ops_op_lib",
         "//tensorflow/contrib/tensor_forest:tensor_forest_ops_op_lib",
         "//tensorflow/contrib/text:all_ops",
-        "//tensorflow/contrib/tpu:all_ops",
     ] + select({
         "//tensorflow:android": [],
         "//tensorflow:ios": [],
diff --git a/tensorflow/contrib/android/BUILD b/tensorflow/contrib/android/BUILD
index f0b1c92cf7e4b760381da38febd9682ce2a4f27c..5608e7ddafa25757484d8c845c8c84a5691e143c 100644
--- a/tensorflow/contrib/android/BUILD
+++ b/tensorflow/contrib/android/BUILD
@@ -73,8 +73,7 @@ cc_binary(
         "-z defs",
         "-s",
         "-Wl,--gc-sections",
-        "-Wl,--version-script",  # This line must be directly followed by LINKER_SCRIPT.
-        "$(location {})".format(LINKER_SCRIPT),
+        "-Wl,--version-script,$(location {})".format(LINKER_SCRIPT),
     ]),
     linkshared = 1,
     linkstatic = 1,
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
index 6138d7912601344ef7422fd50fb35c8401fd2e63..f0637595db08cbeb3b3ee0c94c5399df4c8c83e6 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 
 namespace tensorflow {
-
 namespace {
 
 class BigtableClientOp : public OpKernel {
@@ -341,8 +340,8 @@ class ToBigtableOp : public AsyncOpKernel {
   }
 
   template <typename T>
-  Status ParseScalarArgument(OpKernelContext* ctx,
-                             const StringPiece& argument_name, T* output) {
+  Status ParseScalarArgument(OpKernelContext* ctx, StringPiece argument_name,
+                             T* output) {
     const Tensor* argument_t;
     TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
     if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
@@ -360,5 +359,4 @@ REGISTER_KERNEL_BUILDER(Name("DatasetToBigtable").Device(DEVICE_CPU),
 
 }  // namespace
 }  // namespace data
-
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
index e6fda9e61757f1441b3691c2a3d57c6f1a5a0d42..d9fce6e09f47ab05074f0b4c03dd8e672ed3d2ce 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
@@ -335,6 +335,17 @@ grpc::Status BigtableTestClient::ReadModifyWriteRow(
   return grpc::Status(grpc::StatusCode::UNIMPLEMENTED,
                       "ReadModifyWriteRow not implemented.");
 }
+std::unique_ptr<grpc::ClientAsyncResponseReaderInterface<
+    google::bigtable::v2::ReadModifyWriteRowResponse>>
+BigtableTestClient::AsyncReadModifyWriteRow(
+    grpc::ClientContext* context,
+    google::bigtable::v2::ReadModifyWriteRowRequest const& request,
+    grpc::CompletionQueue* cq) {
+  LOG(WARNING) << "Call to AsyncReadModifyWriteRow:" << __func__
+               << "(); this will likely cause a crash!";
+  return nullptr;
+}
+
 std::unique_ptr<
     grpc::ClientReaderInterface<google::bigtable::v2::ReadRowsResponse>>
 BigtableTestClient::ReadRows(
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
index 8e1326f2ce841368ea81fc7194a0588e5d6cd637..63d59b32dd17a2f58d3413932b69f4d704c84e48 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
@@ -46,6 +46,13 @@ class BigtableTestClient : public ::google::cloud::bigtable::DataClient {
       google::bigtable::v2::ReadModifyWriteRowRequest const& request,
       google::bigtable::v2::ReadModifyWriteRowResponse* response) override;
 
+  std::unique_ptr<grpc::ClientAsyncResponseReaderInterface<
+      google::bigtable::v2::ReadModifyWriteRowResponse>>
+  AsyncReadModifyWriteRow(
+      grpc::ClientContext* context,
+      google::bigtable::v2::ReadModifyWriteRowRequest const& request,
+      grpc::CompletionQueue* cq) override;
+
   std::unique_ptr<
       grpc::ClientReaderInterface<google::bigtable::v2::ReadRowsResponse>>
   ReadRows(grpc::ClientContext* context,
diff --git a/tensorflow/contrib/bigtable/ops/bigtable_ops.cc b/tensorflow/contrib/bigtable/ops/bigtable_ops.cc
index 416b719e30aa5f2504449d151a48e95c9105c68b..39c2a2e775d5d5287b137bf33eef66251738e6d3 100644
--- a/tensorflow/contrib/bigtable/ops/bigtable_ops.cc
+++ b/tensorflow/contrib/bigtable/ops/bigtable_ops.cc
@@ -59,7 +59,7 @@ REGISTER_OP("BigtablePrefixKeyDataset")
     .Input("table: resource")
     .Input("prefix: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -68,14 +68,14 @@ REGISTER_OP("BigtableRangeKeyDataset")
     .Input("start_key: string")
     .Input("end_key: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("BigtableSampleKeysDataset")
     .Input("table: resource")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -85,7 +85,7 @@ REGISTER_OP("BigtableSampleKeyPairsDataset")
     .Input("start_key: string")
     .Input("end_key: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -100,7 +100,7 @@ REGISTER_OP("BigtableScanDataset")
     .Input("columns: string")
     .Input("probability: float")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
index a178820841c4c8bcb7f5742babdb6d0f4825de31..5ffbb9067081d7440ab5e11290697b822051bee5 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
@@ -84,12 +84,10 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
         dict by the key 'leaf_index'. It is a Tensor of rank 2 and its shape is
-        [batch_size, num_trees].
-        For example,
-        result_iter = classifier.predict(...)
-        for result_dict in result_iter:
-          # access leaf index list by result_dict["leaf_index"]
-          # which contains one leaf index per tree
+        [batch_size, num_trees]. For example, result_iter =
+        classifier.predict(...)
+        for result_dict in result_iter: # access leaf index list by
+          result_dict["leaf_index"] # which contains one leaf index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -179,8 +177,8 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
         `[batch_size, label_dimension]`).
       num_trees: An int, number of trees to build.
       feature_columns: A list of feature columns.
-      label_name: String, name of the key in label dict. Can be null if label
-          is a tensor (single headed models).
+      label_name: String, name of the key in label dict. Can be null if label is
+        a tensor (single headed models).
       weight_column_name: Name of the column for weights, or None if not
         weighted.
       model_dir: Directory for model exports, etc.
@@ -195,11 +193,11 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
         opposed to contrib) version of tensorflow.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -286,11 +284,11 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
         opposed to contrib) version of tensorflow.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -353,10 +351,9 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator):
         layer. It can also be a function that computes the number of examples
         based on the depth of the layer that's being built.
       head: `Head` instance.
-      ranking_model_pair_keys: Keys to distinguish between features
-        for left and right part of the training pairs for ranking. For example,
-        for an Example with features "a.f1" and "b.f1", the keys would be
-        ("a", "b").
+      ranking_model_pair_keys: Keys to distinguish between features for left and
+        right part of the training pairs for ranking. For example, for an
+        Example with features "a.f1" and "b.f1", the keys would be ("a", "b").
       num_trees: An int, number of trees to build.
       feature_columns: A list of feature columns.
       weight_column_name: Name of the column for weights, or None if not
@@ -376,12 +373,10 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator):
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
         dict by the key 'leaf_index'. It is a Tensor of rank 2 and its shape is
-        [batch_size, num_trees].
-        For example,
-        result_iter = classifier.predict(...)
-        for result_dict in result_iter:
-          # access leaf index list by result_dict["leaf_index"]
-          # which contains one leaf index per tree
+        [batch_size, num_trees]. For example, result_iter =
+        classifier.predict(...)
+        for result_dict in result_iter: # access leaf index list by
+          result_dict["leaf_index"] # which contains one leaf index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -417,12 +412,12 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator):
         config=config,
         feature_engineering_fn=feature_engineering_fn)
 
+
 # When using this estimator, make sure to regularize the hessian (at least l2,
 # min_node_weight)!
 # TODO(nponomareva): extend to take multiple quantiles in one go.
 class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
-  """An estimator that does quantile regression and returns quantile estimates.
-  """
+  """An estimator that does quantile regression and returns quantile estimates."""
 
   def __init__(self,
                learner_config,
@@ -449,8 +444,8 @@ class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
         layer. It can also be a function that computes the number of examples
         based on the depth of the layer that's being built.
       quantiles: a list of quantiles for the loss, each between 0 and 1.
-      label_dimension: Dimension of regression label. This is the size
-        of the last dimension of the labels `Tensor` (typically, this has shape
+      label_dimension: Dimension of regression label. This is the size of the
+        last dimension of the labels `Tensor` (typically, this has shape
         `[batch_size, label_dimension]`). When label_dimension>1, it is
         recommended to use multiclass strategy diagonal hessian or full hessian.
       num_trees: An int, number of trees to build.
@@ -469,11 +464,11 @@ class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
         opposed to contrib) version of tensorflow.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -519,6 +514,7 @@ class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
         config=config,
         feature_engineering_fn=feature_engineering_fn)
 
+
 # ================== New Estimator interface===================================
 # The estimators below use new core Estimator interface and must be used with
 # new feature columns and heads.
@@ -534,10 +530,8 @@ def core_multiclass_head(
 
   def loss_fn(labels, logits):
     result = losses.per_example_maxent_loss(
-        labels=labels,
-        logits=logits,
-        weights=weight_column,
-        num_classes=n_classes)
+        # Don't pass the weights: head already multiplies by them.
+        labels=labels, logits=logits, weights=None, num_classes=n_classes)
     return result[0]
 
   # pylint:disable=protected-access
@@ -564,7 +558,8 @@ def core_quantile_regression_head(
     result = losses.per_example_quantile_regression_loss(
         labels=labels,
         predictions=logits,
-        weights=weight_column,
+        # Don't pass the weights: head already multiplies by them.
+        weights=None,
         quantile=quantiles)
     return result[0]
 
@@ -623,11 +618,11 @@ class CoreGradientBoostedDecisionTreeEstimator(core_estimator.Estimator):
         the bias.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       num_quantiles: Number of quantiles to build for numeric feature values.
     """
 
@@ -685,10 +680,9 @@ class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator):
         layer. It can also be a function that computes the number of examples
         based on the depth of the layer that's being built.
       head: `Head` instance.
-      ranking_model_pair_keys: Keys to distinguish between features
-        for left and right part of the training pairs for ranking. For example,
-        for an Example with features "a.f1" and "b.f1", the keys would be
-        ("a", "b").
+      ranking_model_pair_keys: Keys to distinguish between features for left and
+        right part of the training pairs for ranking. For example, for an
+        Example with features "a.f1" and "b.f1", the keys would be ("a", "b").
       num_trees: An int, number of trees to build.
       feature_columns: A list of feature columns.
       weight_column_name: Name of the column for weights, or None if not
@@ -703,12 +697,10 @@ class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator):
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
         dict by the key 'leaf_index'. It is a Tensor of rank 2 and its shape is
-        [batch_size, num_trees].
-        For example,
-        result_iter = classifier.predict(...)
-        for result_dict in result_iter:
-          # access leaf index list by result_dict["leaf_index"]
-          # which contains one leaf index per tree
+        [batch_size, num_trees]. For example, result_iter =
+        classifier.predict(...)
+        for result_dict in result_iter: # access leaf index list by
+          result_dict["leaf_index"] # which contains one leaf index per tree
       num_quantiles: Number of quantiles to build for numeric feature values.
 
     Raises:
@@ -748,8 +740,7 @@ class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator):
 # TODO(nponomareva): extend to take multiple quantiles in one go.
 class CoreGradientBoostedDecisionTreeQuantileRegressor(
     core_estimator.Estimator):
-  """An estimator that does quantile regression and returns quantile estimates.
-  """
+  """An estimator that does quantile regression and returns quantile estimates."""
 
   def __init__(self,
                learner_config,
@@ -775,8 +766,8 @@ class CoreGradientBoostedDecisionTreeQuantileRegressor(
         layer. It can also be a function that computes the number of examples
         based on the depth of the layer that's being built.
       quantiles: a list of quantiles for the loss, each between 0 and 1.
-      label_dimension: Dimension of regression label. This is the size
-        of the last dimension of the labels `Tensor` (typically, this has shape
+      label_dimension: Dimension of regression label. This is the size of the
+        last dimension of the labels `Tensor` (typically, this has shape
         `[batch_size, label_dimension]`). When label_dimension>1, it is
         recommended to use multiclass strategy diagonal hessian or full hessian.
       num_trees: An int, number of trees to build.
@@ -795,11 +786,11 @@ class CoreGradientBoostedDecisionTreeQuantileRegressor(
         the bias.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       num_quantiles: Number of quantiles to build for numeric feature values.
     """
     if len(quantiles) > 1:
@@ -814,7 +805,9 @@ class CoreGradientBoostedDecisionTreeQuantileRegressor(
           params={
               'head':
                   core_quantile_regression_head(
-                      quantiles[0], label_dimension=label_dimension),
+                      quantiles[0],
+                      label_dimension=label_dimension,
+                      weight_column=weight_column_name),
               'feature_columns':
                   feature_columns,
               'learner_config':
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index 47d910d42a27db4b857eeb12209dfbb429dd1be2..5a8b2ba9caf0a9813cb5b3409b8a0dc3de0a45d7 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -399,8 +399,8 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
   def testQuantileRegression(self):
     learner_config = learner_pb2.LearnerConfig()
     learner_config.num_classes = 2
-    learner_config.constraints.max_tree_depth = 3
-    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.max_tree_depth = 6
+    learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER
     learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
@@ -413,7 +413,7 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     model_upper = estimator.GradientBoostedDecisionTreeQuantileRegressor(
         quantiles=[0.95],
         learner_config=learner_config,
-        num_trees=100,
+        num_trees=12,
         examples_per_layer=_QUANTILE_REGRESSION_SIZE,
         center_bias=False)
 
@@ -428,31 +428,12 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     self.assertTrue(frac_below_upper >= 0.92)
     self.assertTrue(frac_below_upper <= 0.98)
 
-    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns()
-    model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.05],
-        learner_config=learner_config,
-        num_trees=100,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_lower.fit(input_fn=train_input_fn, steps=1000)
-    result_iter = model_lower.predict(input_fn=test_input_fn)
-    lower = []
-    for prediction_dict in result_iter:
-      lower.append(prediction_dict["scores"])
-
-    frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_above_lower >= 0.92)
-    self.assertTrue(frac_above_lower <= 0.98)
-
   # Multi-dimensional quantile regression.
   def testQuantileRegressionMultiDimLabel(self):
     learner_config = learner_pb2.LearnerConfig()
     learner_config.num_classes = 2
-    learner_config.constraints.max_tree_depth = 3
-    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.max_tree_depth = 6
+    learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER
     learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
@@ -467,7 +448,7 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
         quantiles=[0.95],
         learner_config=learner_config,
         label_dimension=2,
-        num_trees=100,
+        num_trees=18,
         examples_per_layer=_QUANTILE_REGRESSION_SIZE,
         center_bias=False)
 
@@ -490,35 +471,6 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     self.assertTrue(frac_both_below_upper >= 0.91)
     self.assertTrue(frac_both_below_upper <= 0.99)
 
-    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
-        two_dimension=True)
-    model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.05],
-        learner_config=learner_config,
-        label_dimension=2,
-        num_trees=100,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_lower.fit(input_fn=train_input_fn, steps=1000)
-    result_iter = model_lower.predict(input_fn=test_input_fn)
-    lower = []
-    for prediction_dict in result_iter:
-      lower.append(prediction_dict["scores"])
-
-    count_above_lower = np.count_nonzero(lower < y, axis=0)
-    count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1))
-    frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3)
-    frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3)
-    frac_both_above_lower = round(1. * count_both_aboce_lower / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_above_lower_0 >= 0.92)
-    self.assertTrue(frac_above_lower_0 <= 0.98)
-    self.assertTrue(frac_above_lower_1 >= 0.92)
-    self.assertTrue(frac_above_lower_1 <= 0.98)
-    self.assertTrue(frac_both_above_lower >= 0.91)
-    self.assertTrue(frac_both_above_lower <= 0.99)
-
 
 class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
 
@@ -712,11 +664,12 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
     est.evaluate(input_fn=input_fn, steps=1)
     est.predict(input_fn=input_fn)
 
-  # One dimensional quantile regression.
-  def testQuantileRegression(self):
+  # Quantile regression in core is the same as in non core estimator, so we
+  # just check that it does not fail.
+  def testQuantileRegressionDoesNotThroughException(self):
     learner_config = learner_pb2.LearnerConfig()
     learner_config.num_classes = 2
-    learner_config.constraints.max_tree_depth = 3
+    learner_config.constraints.max_tree_depth = 1
     learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
     learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
@@ -731,112 +684,12 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
     model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
         quantiles=[0.95],
         learner_config=learner_config,
-        num_trees=100,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_upper.train(input_fn=train_input_fn, steps=1000)
-    result_iter = model_upper.predict(input_fn=test_input_fn)
-    upper = []
-    for prediction_dict in result_iter:
-      upper.append(prediction_dict["predictions"])
-
-    frac_below_upper = round(1. * np.count_nonzero(upper > y) / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_below_upper >= 0.92)
-    self.assertTrue(frac_below_upper <= 0.98)
-
-    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns()
-    model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.05],
-        learner_config=learner_config,
-        num_trees=100,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_lower.train(input_fn=train_input_fn, steps=1000)
-    result_iter = model_lower.predict(input_fn=test_input_fn)
-    lower = []
-    for prediction_dict in result_iter:
-      lower.append(prediction_dict["predictions"])
-
-    frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_above_lower >= 0.92)
-    self.assertTrue(frac_above_lower <= 0.98)
-
-  # Multi-dimensional quantile regression.
-  def testQuantileRegressionMultiDimLabel(self):
-    learner_config = learner_pb2.LearnerConfig()
-    learner_config.num_classes = 2
-    learner_config.constraints.max_tree_depth = 3
-    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
-    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
-    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
-    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
-    learner_config.regularization.tree_complexity = (
-        1.0 / _QUANTILE_REGRESSION_SIZE)
-
-    train_input_fn, test_input_fn, y = _quantile_regression_input_fns(
-        two_dimension=True)
-    y = y.reshape(_QUANTILE_REGRESSION_SIZE, 2)
-
-    # 95% percentile.
-    model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.95],
-        learner_config=learner_config,
-        num_trees=100,
-        label_dimension=2,
+        num_trees=1,
         examples_per_layer=_QUANTILE_REGRESSION_SIZE,
         center_bias=False)
 
     model_upper.train(input_fn=train_input_fn, steps=1000)
     result_iter = model_upper.predict(input_fn=test_input_fn)
-    upper = []
-    for prediction_dict in result_iter:
-      upper.append(prediction_dict["predictions"])
-
-    count_below_upper = np.count_nonzero(upper > y, axis=0)
-    count_both_below_upper = np.count_nonzero(np.prod(upper > y, axis=1))
-    frac_below_upper_0 = round(1. * count_below_upper[0] / len(y), 3)
-    frac_below_upper_1 = round(1. * count_below_upper[1] / len(y), 3)
-    frac_both_below_upper = round(1. * count_both_below_upper / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_below_upper_0 >= 0.92)
-    self.assertTrue(frac_below_upper_0 <= 0.98)
-    self.assertTrue(frac_below_upper_1 >= 0.92)
-    self.assertTrue(frac_below_upper_1 <= 0.98)
-    self.assertTrue(frac_both_below_upper >= 0.91)
-    self.assertTrue(frac_both_below_upper <= 0.99)
-
-    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
-        two_dimension=True)
-    model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.05],
-        learner_config=learner_config,
-        num_trees=100,
-        label_dimension=2,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_lower.train(input_fn=train_input_fn, steps=1000)
-    result_iter = model_lower.predict(input_fn=test_input_fn)
-    lower = []
-    for prediction_dict in result_iter:
-      lower.append(prediction_dict["predictions"])
-
-    count_above_lower = np.count_nonzero(lower < y, axis=0)
-    count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1))
-    frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3)
-    frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3)
-    frac_both_above_lower = round(1. * count_both_aboce_lower / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_above_lower_0 >= 0.92)
-    self.assertTrue(frac_above_lower_0 <= 0.98)
-    self.assertTrue(frac_above_lower_1 >= 0.92)
-    self.assertTrue(frac_above_lower_1 <= 0.98)
-    self.assertTrue(frac_both_above_lower >= 0.91)
-    self.assertTrue(frac_both_above_lower <= 0.99)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/model.py b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
index a6e422847d3914188bca9e6dff797ba1ffb06749..eecf3c5aeb6c6785cae3fd5808954a73db6190d6 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/model.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
@@ -25,6 +25,7 @@ from tensorflow.contrib.boosted_trees.estimator_batch import estimator_utils
 from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
 from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training_util
@@ -88,6 +89,12 @@ def model_builder(features,
 
   if config is None:
     raise ValueError("Missing estimator RunConfig.")
+  if config.session_config is not None:
+    session_config = config.session_config
+    session_config.allow_soft_placement = True
+  else:
+    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+  config = config.replace(session_config=session_config)
 
   center_bias = params["center_bias"]
 
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
index 6d78e27e8f69ea289b686af8402bd91967f997f4..65276242abaf96de8b1936365278b18f8bba93a9 100644
--- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -538,7 +538,6 @@ class BuildSparseInequalitySplitsOp : public OpKernel {
           partition_boundaries[non_empty_partitions[root_idx]];
 
       float best_gain = std::numeric_limits<float>::lowest();
-      int32 best_dimension_idx = 0;
       bool default_right = false;
       int32 best_element_idx = 0;
 
@@ -571,7 +570,6 @@ class BuildSparseInequalitySplitsOp : public OpKernel {
       // Iterate through dimensions.
       for (int j = 0; j < dimension_boundaries.size() - 1; ++j) {
         const DimensionBoundary& dimension_and_start = dimension_boundaries[j];
-        const int32 dimension_id = dimension_and_start.dimension_id;
 
         int start_index = dimension_and_start.start_index;
         // Even for the last dimension, we always have additional dummy
@@ -630,7 +628,6 @@ class BuildSparseInequalitySplitsOp : public OpKernel {
               best_right_node_stats = right_stats_default_left;
               best_element_idx = element_idx;
               default_right = false;
-              best_dimension_idx = dimension_id;
             }
           }
           // Consider calculating the default direction only when there were
@@ -648,7 +645,6 @@ class BuildSparseInequalitySplitsOp : public OpKernel {
               best_right_node_stats = right_stats_default_right;
               best_element_idx = element_idx;
               default_right = true;
-              best_dimension_idx = dimension_id;
             }
           }
         }
diff --git a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
index c3685b54e201f73039f6623443c67ba2b217a51e..ad6ff0a861af896ef0dd254bd47752d76378d63a 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
@@ -33,7 +33,7 @@ from tensorflow.contrib.boosted_trees.python.ops.gen_model_ops import tree_ensem
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 ops.NotDifferentiable("TreeEnsembleVariable")
 ops.NotDifferentiable("TreeEnsembleSerialize")
diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
index 0c319cc9bd1f720eb404a9da05227c5807ec874f..aff7105e94729942efc6e3e9d3ae23b733e8f5ed 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
@@ -33,7 +33,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 # Pattern to remove all non alpha numeric from a string.
 _PATTERN = re.compile(r"[\W_]+")
diff --git a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
index ad1191d41236e71008bff8c8a7fbd42c16e3f9c5..2a0a206d97bbf01ac382531df31a66d429842bbb 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
@@ -26,7 +26,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 # Pattern to remove all non alpha numeric from a string.
 _PATTERN = re.compile(r"[\W_]+")
diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py
index 99ed4959fad9699f265183d71a1f3b609d7e6d30..7b3df962542a656af8052e9f2eae6e83744411f2 100644
--- a/tensorflow/contrib/checkpoint/__init__.py
+++ b/tensorflow/contrib/checkpoint/__init__.py
@@ -27,7 +27,7 @@ Managing dependencies:
 @@NoDependency
 @@split_dependency
 
-Checkpointable data structures:
+Trackable data structures:
 @@List
 @@Mapping
 @@UniqueNameTracker
@@ -49,17 +49,16 @@ from tensorflow.contrib.checkpoint.python.python_state import NumpyState
 from tensorflow.contrib.checkpoint.python.python_state import PythonStateWrapper
 from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency
 from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint
-from tensorflow.core.protobuf.checkpointable_object_graph_pb2 import CheckpointableObjectGraph
+from tensorflow.core.protobuf.trackable_object_graph_pb2 import TrackableObjectGraph as CheckpointableObjectGraph
 from tensorflow.python.training.checkpoint_management import CheckpointManager
-from tensorflow.python.training.checkpointable.base import Checkpointable as CheckpointableBase
-from tensorflow.python.training.checkpointable.data_structures import List
-from tensorflow.python.training.checkpointable.data_structures import Mapping
-from tensorflow.python.training.checkpointable.data_structures import NoDependency
-from tensorflow.python.training.checkpointable.tracking import AutoCheckpointable as Checkpointable
-from tensorflow.python.training.checkpointable.util import capture_dependencies
-from tensorflow.python.training.checkpointable.util import list_objects
-from tensorflow.python.training.checkpointable.util import object_metadata
-
+from tensorflow.python.training.tracking.base import Trackable as CheckpointableBase
+from tensorflow.python.training.tracking.data_structures import List
+from tensorflow.python.training.tracking.data_structures import Mapping
+from tensorflow.python.training.tracking.data_structures import NoDependency
+from tensorflow.python.training.tracking.tracking import AutoTrackable as Checkpointable
+from tensorflow.python.training.tracking.util import capture_dependencies
+from tensorflow.python.training.tracking.util import list_objects
+from tensorflow.python.training.tracking.util import object_metadata
 from tensorflow.python.util.all_util import remove_undocumented
 
 remove_undocumented(module_name=__name__)
diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD
index 4e529322c7c76797938468b405cd175609dc0a73..cd9c94c9bd72d398d183d3f3d485ab48cb2fd617 100644
--- a/tensorflow/contrib/checkpoint/python/BUILD
+++ b/tensorflow/contrib/checkpoint/python/BUILD
@@ -12,7 +12,7 @@ py_library(
         ":python_state",
         ":split_dependency",
         ":visualize",
-        "//tensorflow/python/training/checkpointable:data_structures",
+        "//tensorflow/python/training/tracking:data_structures",
     ],
 )
 
@@ -22,8 +22,8 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:data_structures",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:data_structures",
     ],
 )
 
@@ -36,8 +36,8 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
@@ -47,7 +47,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -64,7 +64,7 @@ tf_py_test(
         "//tensorflow/python:session",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
@@ -76,7 +76,7 @@ py_library(
     deps = [
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:training",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
@@ -89,8 +89,8 @@ tf_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
@@ -101,8 +101,8 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/python:pywrap_tensorflow",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
@@ -118,6 +118,7 @@ tf_py_test(
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:util",
     ],
+    tags = ["no_oss"],  # b/124472244
 )
diff --git a/tensorflow/contrib/checkpoint/python/containers.py b/tensorflow/contrib/checkpoint/python/containers.py
index 97936d9e9dfd5d6e62fdf8312707a276b63e1267..a25d51980ea760dfb7f323497a397fbd94fd5f23 100644
--- a/tensorflow/contrib/checkpoint/python/containers.py
+++ b/tensorflow/contrib/checkpoint/python/containers.py
@@ -1,4 +1,4 @@
-"""Checkpointable data structures."""
+"""Trackable data structures."""
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,12 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.training.checkpointable import base as checkpointable_lib
-from tensorflow.python.training.checkpointable import data_structures
+from tensorflow.python.training.tracking import base as trackable_lib
+from tensorflow.python.training.tracking import data_structures
 
 
-class UniqueNameTracker(data_structures.CheckpointableDataStructure):
-  """Adds dependencies on checkpointable objects with name hints.
+class UniqueNameTracker(data_structures.TrackableDataStructure):
+  """Adds dependencies on trackable objects with name hints.
 
   Useful for creating dependencies with locally unique names.
 
@@ -43,30 +43,30 @@ class UniqueNameTracker(data_structures.CheckpointableDataStructure):
 
   def __init__(self):
     super(UniqueNameTracker, self).__init__()
-    self._maybe_initialize_checkpointable()
+    self._maybe_initialize_trackable()
     self._name_counts = {}
 
   @property
   def _values(self):
     return [dep.ref for dep in self._checkpoint_dependencies]
 
-  def track(self, checkpointable, base_name):
-    """Add a dependency on `checkpointable`.
+  def track(self, trackable, base_name):
+    """Add a dependency on `trackable`.
 
     Args:
-      checkpointable: An object to add a checkpoint dependency on.
+      trackable: An object to add a checkpoint dependency on.
       base_name: A name hint, which is uniquified to determine the dependency
         name.
     Returns:
-      `checkpointable`, for chaining.
+      `trackable`, for chaining.
     Raises:
-      ValueError: If `checkpointable` is not a checkpointable object.
+      ValueError: If `trackable` is not a trackable object.
     """
 
-    if not isinstance(checkpointable, checkpointable_lib.Checkpointable):
+    if not isinstance(trackable, trackable_lib.Trackable):
       raise ValueError(
-          ("Expected a checkpointable value, got %s which does not inherit "
-           "from CheckpointableBase.") % (checkpointable,))
+          ("Expected a trackable value, got %s which does not inherit "
+           "from tf.track.Trackable.") % (trackable,))
 
     def _format_name(prefix, number):
       if number > 0:
@@ -80,5 +80,5 @@ class UniqueNameTracker(data_structures.CheckpointableDataStructure):
       count += 1
       candidate = _format_name(base_name, count)
     self._name_counts[base_name] = count + 1
-    self._track_value(checkpointable, name=candidate)
-    return checkpointable
+    self._track_value(trackable, name=candidate)
+    return trackable
diff --git a/tensorflow/contrib/checkpoint/python/containers_test.py b/tensorflow/contrib/checkpoint/python/containers_test.py
index a2d453ec6eb3dcf9aba4c52fe866756a92673c63..bace21939602666aa48a05d2abfe05ae6aae41e2 100644
--- a/tensorflow/contrib/checkpoint/python/containers_test.py
+++ b/tensorflow/contrib/checkpoint/python/containers_test.py
@@ -26,9 +26,9 @@ from tensorflow.python.keras import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 
 
 class UniqueNameTrackerTests(test.TestCase):
@@ -52,7 +52,7 @@ class UniqueNameTrackerTests(test.TestCase):
     save_root = util.Checkpoint(slots=slots)
     save_path = save_root.save(checkpoint_prefix)
 
-    restore_slots = tracking.AutoCheckpointable()
+    restore_slots = tracking.AutoTrackable()
     restore_root = util.Checkpoint(
         slots=restore_slots)
     status = restore_root.restore(save_path)
@@ -68,7 +68,7 @@ class UniqueNameTrackerTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testExample(self):
-    class SlotManager(tracking.AutoCheckpointable):
+    class SlotManager(tracking.AutoTrackable):
 
       def __init__(self):
         self.slotdeps = containers.UniqueNameTracker()
diff --git a/tensorflow/contrib/checkpoint/python/python_state.py b/tensorflow/contrib/checkpoint/python/python_state.py
index 969c90c78871ebff02b360f8f09623df56c9c077..737a6c30c1dce65dd7638ee52e6c26a8a40f8321 100644
--- a/tensorflow/contrib/checkpoint/python/python_state.py
+++ b/tensorflow/contrib/checkpoint/python/python_state.py
@@ -23,7 +23,7 @@ import six
 
 import numpy
 
-from tensorflow.python.training.checkpointable import base
+from tensorflow.python.training.tracking import base
 
 # pylint: disable=g-import-not-at-top
 try:
@@ -34,8 +34,8 @@ except ImportError:
 # pylint: enable=g-import-not-at-top
 
 
-class NumpyState(base.Checkpointable):
-  """A checkpointable object whose NumPy array attributes are saved/restored.
+class NumpyState(base.Trackable):
+  """A trackable object whose NumPy array attributes are saved/restored.
 
   Example usage:
 
@@ -72,7 +72,7 @@ class NumpyState(base.Checkpointable):
     """Create placeholder NumPy arrays for to-be-restored attributes.
 
     Typically `_lookup_dependency` is used to check by name whether a dependency
-    exists. We cheat slightly by creating a checkpointable object for `name` if
+    exists. We cheat slightly by creating a trackable object for `name` if
     we don't already have one, giving us attribute re-creation behavior when
     loading a checkpoint.
 
@@ -85,7 +85,7 @@ class NumpyState(base.Checkpointable):
     value = super(NumpyState, self)._lookup_dependency(name)
     if value is None:
       value = _NumpyWrapper(numpy.array([]))
-      new_reference = base.CheckpointableReference(name=name, ref=value)
+      new_reference = base.TrackableReference(name=name, ref=value)
       self._unconditional_checkpoint_dependencies.append(new_reference)
       self._unconditional_dependency_names[name] = value
       super(NumpyState, self).__setattr__(name, value)
@@ -101,7 +101,7 @@ class NumpyState(base.Checkpointable):
   def __setattr__(self, name, value):
     """Automatically wrap NumPy arrays assigned to attributes."""
     # TODO(allenl): Consider supporting lists/tuples, either ad-hoc or by making
-    # ndarrays checkpointable natively and using standard checkpointable list
+    # ndarrays trackable natively and using standard trackable list
     # tracking.
     if isinstance(value, (numpy.ndarray, numpy.generic)):
       try:
@@ -110,19 +110,19 @@ class NumpyState(base.Checkpointable):
         return
       except AttributeError:
         value = _NumpyWrapper(value)
-        self._track_checkpointable(value, name=name, overwrite=True)
+        self._track_trackable(value, name=name, overwrite=True)
     elif (name not in ("_setattr_tracking", "_update_uid")
           and getattr(self, "_setattr_tracking", True)):
-      # Mixing restore()-created attributes with user-added checkpointable
+      # Mixing restore()-created attributes with user-added trackable
       # objects is tricky, since we can't use the `_lookup_dependency` trick to
       # re-create attributes (we might accidentally steal the restoration for
-      # another checkpointable object). For now `NumpyState` objects must be
+      # another trackable object). For now `NumpyState` objects must be
       # leaf nodes. Theoretically we could add some extra arguments to
       # `_lookup_dependency` to figure out whether we should create a NumPy
       # array for the attribute or not.
       raise NotImplementedError(
           ("Assigned %s to the %s property of %s, which is not a NumPy array. "
-           "Currently mixing NumPy arrays and other checkpointable objects is "
+           "Currently mixing NumPy arrays and other trackable objects is "
            "not supported. File a feature request if this limitation bothers "
            "you.")
           % (value, name, self))
@@ -130,7 +130,7 @@ class NumpyState(base.Checkpointable):
 
 
 @six.add_metaclass(abc.ABCMeta)
-class PythonStateWrapper(base.Checkpointable):
+class PythonStateWrapper(base.Trackable):
   """Wraps a Python object for storage in an object-based checkpoint."""
 
   @abc.abstractmethod
diff --git a/tensorflow/contrib/checkpoint/python/python_state_test.py b/tensorflow/contrib/checkpoint/python/python_state_test.py
index 45494351ff4e6c8c75634d8563c3fb63c6089036..40d8fe836402c8b6c8240ef9f665b753c54ede0d 100644
--- a/tensorflow/contrib/checkpoint/python/python_state_test.py
+++ b/tensorflow/contrib/checkpoint/python/python_state_test.py
@@ -26,7 +26,7 @@ from tensorflow.python.eager import test
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import util
 
 
 class NumpyStateTests(test.TestCase):
diff --git a/tensorflow/contrib/checkpoint/python/split_dependency.py b/tensorflow/contrib/checkpoint/python/split_dependency.py
index 3e9700ad74618e24843181d169f3fb39ac96bff6..d7b02b538909305b14e638761bd8ba67a948d2b4 100644
--- a/tensorflow/contrib/checkpoint/python/split_dependency.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency.py
@@ -21,7 +21,7 @@ import functools
 
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 
 
 class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
@@ -43,7 +43,7 @@ class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
     return self._restore_callback(tensor)
 
 
-class _SplitDependency(checkpointable.Checkpointable):
+class _SplitDependency(trackable.Trackable):
   """Looks like a regular variable while synchronizing save/restores."""
 
   def __init__(self, save_buffer, restore_buffer, name, dtype, num_components,
@@ -81,9 +81,9 @@ class _SplitDependency(checkpointable.Checkpointable):
       return control_flow_ops.no_op()
 
   def _gather_saveables_for_checkpoint(self):
-    """Looks to Checkpointable like a regular variable."""
+    """Looks to Trackable like a regular variable."""
     return {
-        checkpointable.VARIABLE_VALUE_KEY:
+        trackable.VARIABLE_VALUE_KEY:
         functools.partial(_CallbackSaveable,
                           dtype=self._dtype,
                           save_callback=self._save,
@@ -117,7 +117,7 @@ def split_dependency(component_names, component_dtypes,
       may return `None`).
 
   Returns:
-    A dictionary mapping from names to Checkpointable objects. If one is
+    A dictionary mapping from names to Trackable objects. If one is
     reachable from an object as a dependency, the others should be too; adding
     dependencies on some but not all of the objects will result in errors.
   """
diff --git a/tensorflow/contrib/checkpoint/python/split_dependency_test.py b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
index 664a4e76ab31bf31c7a57924e4af866f2d746804..9bc01059481ff69064e3f9c682a764146b79a250 100644
--- a/tensorflow/contrib/checkpoint/python/split_dependency_test.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
@@ -23,9 +23,9 @@ from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 
 
 def _split_variable_closure(variable):
@@ -44,7 +44,7 @@ def _combine_variable_closure(variable):
   return _consume_restore_buffer_fn
 
 
-class SaveTensorSlicesAsDeps(base.Checkpointable):
+class SaveTensorSlicesAsDeps(base.Trackable):
 
   def __init__(self):
     self.combined = resource_variable_ops.ResourceVariable([0., 0., 0., 0.])
@@ -56,17 +56,17 @@ class SaveTensorSlicesAsDeps(base.Checkpointable):
         consume_restore_buffer_fn=_combine_variable_closure(
             self.combined))
     for name, dep in split_dependencies.items():
-      self._track_checkpointable(dep, name=name)
+      self._track_trackable(dep, name=name)
 
 
-class HasRegularDeps(tracking.AutoCheckpointable):
+class HasRegularDeps(tracking.AutoTrackable):
 
   def __init__(self):
     self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
     self.second_half = resource_variable_ops.ResourceVariable([0., 0.])
 
 
-class OnlyOneDep(tracking.AutoCheckpointable):
+class OnlyOneDep(tracking.AutoTrackable):
 
   def __init__(self):
     self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
diff --git a/tensorflow/contrib/checkpoint/python/visualize.py b/tensorflow/contrib/checkpoint/python/visualize.py
index bac071c4cff383f60b707b6e42c13faf5e0ac948..faf90f018476b3c70a7bfa1346a5b590edbbddcd 100644
--- a/tensorflow/contrib/checkpoint/python/visualize.py
+++ b/tensorflow/contrib/checkpoint/python/visualize.py
@@ -18,8 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 def dot_graph_from_checkpoint(save_path):
@@ -51,7 +51,7 @@ def dot_graph_from_checkpoint(save_path):
     A graph in DOT format as a string.
   """
   reader = pywrap_tensorflow.NewCheckpointReader(save_path)
-  object_graph = checkpointable_utils.object_metadata(save_path)
+  object_graph = trackable_utils.object_metadata(save_path)
   shape_map = reader.get_variable_to_shape_map()
   dtype_map = reader.get_variable_to_dtype_map()
   graph = 'digraph {\n'
@@ -63,7 +63,7 @@ def dot_graph_from_checkpoint(save_path):
       slot_ids.add(slot_reference.slot_variable_node_id)
   for node_id, node in enumerate(object_graph.nodes):
     if (len(node.attributes) == 1
-        and node.attributes[0].name == checkpointable.VARIABLE_VALUE_KEY):
+        and node.attributes[0].name == trackable.VARIABLE_VALUE_KEY):
       if node_id in slot_ids:
         color = 'orange'
         tooltip_prefix = 'Slot variable'
diff --git a/tensorflow/contrib/checkpoint/python/visualize_test.py b/tensorflow/contrib/checkpoint/python/visualize_test.py
index 583e3bc442893d825c337d73fb999d1e586738a1..98a22d573fdb6172cde100df461d9ae520c2c483 100644
--- a/tensorflow/contrib/checkpoint/python/visualize_test.py
+++ b/tensorflow/contrib/checkpoint/python/visualize_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training import adam
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 try:
   import pydot  # pylint: disable=g-import-not-at-top
@@ -57,7 +57,7 @@ class DotGraphTests(test.TestCase):
       model = MyModel()
       optimizer = adam.AdamOptimizer(0.001)
       optimizer_step = resource_variable_ops.ResourceVariable(12)
-      save_checkpoint = checkpointable_utils.Checkpoint(
+      save_checkpoint = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model, optimizer_step=optimizer_step)
       optimizer.minimize(functools.partial(model, input_value))
       checkpoint_directory = self.get_temp_dir()
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index e570c09ecb5e64130ed6f3375a51d74850cc3989..30b4e2dbdee1117df12ae7ab8ce902e667234fb0 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,7 +17,7 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG 69b6c047bc767b4d80e7af4d00ccb7c45b683dae)
+set(GRPC_TAG 62688b6a05cc85b47fb77dd408611734253e47e2)
 
 if(WIN32)
   # We use unsecure gRPC because boringssl does not build on windows
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 21ae9a08a6bb8f71e5935ddde2d7bb3ed0cd8bbc..3d86ab9abbb4cc90c406edc6237c0d2abe440122 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -13,6 +13,7 @@ tensorflow/core/lib
 tensorflow/core/lib/core
 tensorflow/core/profiler
 tensorflow/core/protobuf
+tensorflow/core/protobuf/tpu
 tensorflow/core/util
 tensorflow/examples
 tensorflow/examples/tutorials
@@ -71,7 +72,7 @@ tensorflow/python/tools
 tensorflow/python/tools/api
 tensorflow/python/tools/api/generator
 tensorflow/python/training
-tensorflow/python/training/checkpointable
+tensorflow/python/training/tracking
 tensorflow/python/user_ops
 tensorflow/python/util
 tensorflow/python/util/protobuf
@@ -437,7 +438,6 @@ tensorflow/contrib/timeseries/python/timeseries/state_space_models
 tensorflow/contrib/tpu
 tensorflow/contrib/tpu/ops
 tensorflow/contrib/tpu/profiler
-tensorflow/contrib/tpu/proto
 tensorflow/contrib/tpu/python
 tensorflow/contrib/tpu/python/ops
 tensorflow/contrib/tpu/python/profiler
diff --git a/tensorflow/contrib/cmake/python_protos.txt b/tensorflow/contrib/cmake/python_protos.txt
index 013180c89083748b240ad061b342300e886d3568..b4603206da419f44af0857b9b933eb7df1b255ff 100644
--- a/tensorflow/contrib/cmake/python_protos.txt
+++ b/tensorflow/contrib/cmake/python_protos.txt
@@ -1,6 +1,7 @@
 tensorflow/core
 tensorflow/core/kernels/boosted_trees
 tensorflow/core/profiler
+tensorflow/core/protobuf/tpu
 tensorflow/python
 tensorflow/contrib/boosted_trees/proto
 tensorflow/contrib/cloud/kernels
@@ -12,7 +13,6 @@ tensorflow/contrib/mpi_collectives
 tensorflow/contrib/session_bundle
 tensorflow/contrib/tensor_forest/proto
 tensorflow/contrib/tensorboard/plugins/projector
-tensorflow/contrib/tpu/proto
 tensorflow/contrib/tpu/profiler
 tensorflow/contrib/training/python/training
 tensorflow/contrib/verbs
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index d8d1cc3aa2ca4fff3c950654b7cbd7085c76010c..cc263d7995c01100f1c51436bcb584b600c8c161 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -125,9 +125,9 @@ endfunction()
 
 file(GLOB_RECURSE tf_protos_cc_srcs RELATIVE ${tensorflow_source_dir}
     "${tensorflow_source_dir}/tensorflow/core/*.proto"
+    "${tensorflow_source_dir}/tensorflow/core/protobuf/tpu/*.proto"
     "${tensorflow_source_dir}/tensorflow/compiler/xla/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/proto/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tpu/proto/*.proto"
 )
 
 RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
diff --git a/tensorflow/contrib/compiler/BUILD b/tensorflow/contrib/compiler/BUILD
index e32097ceddfec95b8677fc762d641d09078e5343..79c61589112b739837b401010690e7f4ca917d07 100644
--- a/tensorflow/contrib/compiler/BUILD
+++ b/tensorflow/contrib/compiler/BUILD
@@ -23,6 +23,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":xla",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
diff --git a/tensorflow/contrib/compiler/__init__.py b/tensorflow/contrib/compiler/__init__.py
index c4937dadfb8be3211377f0ae7017b95e7642dab0..797e5e8164e231e8b3806d40b32774711879b050 100644
--- a/tensorflow/contrib/compiler/__init__.py
+++ b/tensorflow/contrib/compiler/__init__.py
@@ -19,3 +19,4 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.compiler import jit
+from tensorflow.contrib.compiler import xla
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index ca92c31236a7a3882415834eb32a994a120b6d2d..403f30909520dc5cd5f5919af843291fe1400b91 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -58,7 +58,7 @@ from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import momentum
 from tensorflow.python.training import rmsprop
 from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 CUDNN_LSTM = cudnn_rnn_ops.CUDNN_LSTM
@@ -709,7 +709,7 @@ class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase):
     self._TestSaveRestoreHelper(CUDNN_RNN_RELU)
 
 
-class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
+class CudnnRNNTestSaveRestoreTrackable(test_util.TensorFlowTestCase):
 
   def _VerifyCheckpoint(
       self, checkpoint_path, compatible_cell_fn, cudnn_cell_fn,
@@ -718,7 +718,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     with ops.device("gpu:0"):
       cudnn_layer = cudnn_cell_fn()
-      cudnn_checkpoint = checkpointable_utils.Checkpoint(cell=cudnn_layer)
+      cudnn_checkpoint = trackable_utils.Checkpoint(cell=cudnn_layer)
       status = cudnn_checkpoint.restore(checkpoint_path)
       inputs = 3. * array_ops.ones([num_applications, num_layers, input_size],
                                    dtype=dtypes.float32)
@@ -726,7 +726,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
       status.run_restore_ops()
     second_save_path = cudnn_checkpoint.save(checkpoint_prefix)
     restore_layer = compatible_cell_fn()
-    restore_layer_checkpoint = checkpointable_utils.Checkpoint(
+    restore_layer_checkpoint = trackable_utils.Checkpoint(
         cell=restore_layer)
     status = restore_layer_checkpoint.restore(second_save_path)
     current_state = restore_layer.zero_state(1, dtypes.float32)
@@ -742,7 +742,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
     self.assertAllClose(self.evaluate(restore_layer_output),
                         self.evaluate(cudnn_output)[-1, -1:, ...])
 
-  def _CheckpointableSingleCellUnidirectionalTestTemplate(
+  def _TrackableSingleCellUnidirectionalTestTemplate(
       self, single_cell_fn, cudnn_cell_fn):
     # Single-layer cuDNN cells with object-based checkpointing should be
     # checkpoint compatible with either single CudnnCompatible cells or
@@ -759,7 +759,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
       value = np.random.normal(size=variable.shape)
       expected_values.append(value)
       self.evaluate(variable.assign(value))
-    save_checkpoint = checkpointable_utils.Checkpoint(cell=save_cell_layer)
+    save_checkpoint = trackable_utils.Checkpoint(cell=save_cell_layer)
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     first_save_path = save_checkpoint.save(checkpoint_prefix)
@@ -775,10 +775,10 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   @test_util.run_in_graph_and_eager_modes
-  def testLSTMCheckpointableSingleLayer(self):
+  def testLSTMTrackableSingleLayer(self):
     num_units = 2
     direction = CUDNN_RNN_UNIDIRECTION
-    self._CheckpointableSingleCellUnidirectionalTestTemplate(
+    self._TrackableSingleCellUnidirectionalTestTemplate(
         single_cell_fn=functools.partial(
             cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units),
         cudnn_cell_fn=functools.partial(
@@ -788,19 +788,19 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   @test_util.run_in_graph_and_eager_modes
-  def testGRUCheckpointableSingleLayer(self):
+  def testGRUTrackableSingleLayer(self):
     num_units = 2
     direction = CUDNN_RNN_UNIDIRECTION
     with self.assertRaises(NotImplementedError):
       # TODO(allenl): Implement object-based saving for GRUs and other cells.
-      self._CheckpointableSingleCellUnidirectionalTestTemplate(
+      self._TrackableSingleCellUnidirectionalTestTemplate(
           single_cell_fn=functools.partial(
               cudnn_rnn_ops.CudnnCompatibleGRUCell, num_units=num_units),
           cudnn_cell_fn=functools.partial(
               cudnn_rnn.CudnnGRU, num_layers=1, num_units=num_units,
               direction=direction, name="awesome_gru"))
 
-  def _CheckpointableMultiLayerTestTemplate(
+  def _TrackableMultiLayerTestTemplate(
       self, single_cell_fn, cudnn_cell_fn, num_layers):
 
     def _MultiCellFn():
@@ -819,7 +819,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
         value = np.random.normal(size=variable.shape)
         expected_values.append(value)
         self.evaluate(variable.assign(value))
-      save_checkpoint = checkpointable_utils.Checkpoint(cell=save_layer)
+      save_checkpoint = trackable_utils.Checkpoint(cell=save_layer)
       checkpoint_directory = self.get_temp_dir()
       checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
       first_save_path = save_checkpoint.save(checkpoint_prefix)
@@ -837,7 +837,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
     num_units = 2
     num_layers = 3
     direction = CUDNN_RNN_UNIDIRECTION
-    self._CheckpointableMultiLayerTestTemplate(
+    self._TrackableMultiLayerTestTemplate(
         single_cell_fn=functools.partial(
             cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units),
         cudnn_cell_fn=functools.partial(
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index 86ad8ae8073714657c78badb1e0b4a6d8c8ed5f0..1cb477716dfc6a9cc793939059784f9d89bcdd8a 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -518,8 +518,8 @@ class _CudnnRNN(base_layer.Layer):
         direction=self.direction,
         scope=vs.get_variable_scope(),
         name="%s_saveable" % self.trainable_variables[0].name.split(":")[0])
-    self._saveable._add_checkpointable_dependencies(  # pylint: disable=protected-access
-        checkpointable=self, dtype=self._plain_dtype)
+    self._saveable._add_trackable_dependencies(  # pylint: disable=protected-access
+        trackable=self, dtype=self._plain_dtype)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
 
 
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index f36e8d5022bc7e3f8268a161089153e5510dffc6..7d848e2ec2d99cd2a78ff3e813207c0cd5bb97cf 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -33,7 +33,7 @@ from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking as checkpointable_lib
+from tensorflow.python.training.tracking import tracking as trackable_lib
 
 CUDNN_RNN_UNIDIRECTION = "unidirectional"
 CUDNN_RNN_BIDIRECTION = "bidirectional"
@@ -737,13 +737,13 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     return state_ops.assign(
         self._variables, opaque_params, validate_shape=False)
 
-  def _checkpointable_save(self, save_buffer):
+  def _trackable_save(self, save_buffer):
     weights, biases = self.format_converter.opaque_to_tf_canonical(
         self._variables)
     for name, tensor in zip(self._param_names, weights + biases):
       save_buffer[name] = array_ops.identity(tensor)
 
-  def _checkpointable_restore(self, restore_buffer):
+  def _trackable_restore(self, restore_buffer):
     tensors = [
         array_ops.identity(restore_buffer[name]) for name in self._param_names
     ]
@@ -752,26 +752,26 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
         restored_shapes=None  # Unused
     )
 
-  def _add_checkpointable_dependencies(self, checkpointable, dtype):
-    """Add canonical weight dependencies to `checkpointable`.
+  def _add_trackable_dependencies(self, trackable, dtype):
+    """Add canonical weight dependencies to `trackable`.
 
     When saving or restoring, converts to or from the opaque buffer
     format. Weights are saved and loaded in the configuration expected by
     cuDNN-compatible cells.
 
     Args:
-      checkpointable: An object inheriting from `CheckpointableBase` to add
+      trackable: An object inheriting from `Trackable` to add
         dependencies too (typically the cuDNN `Layer`).
       dtype: The dtype for the canonical parameter Tensors.
     """
     split_dependencies = split_dependency.split_dependency(
         component_names=self._param_names,
         component_dtypes=(dtype,) * len(self._param_names),
-        fill_save_buffer_fn=self._checkpointable_save,
-        consume_restore_buffer_fn=self._checkpointable_restore)
-    self._checkpointable_track_params(checkpointable, split_dependencies)
+        fill_save_buffer_fn=self._trackable_save,
+        consume_restore_buffer_fn=self._trackable_restore)
+    self._trackable_track_params(trackable, split_dependencies)
 
-  def _checkpointable_track_params(self, checkpointable, params):
+  def _trackable_track_params(self, trackable, params):
     """Tracks parameters in a canonical configuration."""
     return  # NotImplementedError raised by the Layer.
 
@@ -819,7 +819,7 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
     tf_weights_names.append(prefix + "/kernel")
     tf_bias_names.append(prefix + "/bias")
 
-  def _checkpointable_track_params(self, checkpointable, params):
+  def _trackable_track_params(self, trackable, params):
     """Track parameters for compatibility with CudnnCompatibleLSTMCell."""
     biases = []
     weights = []
@@ -833,12 +833,12 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
       # wrapping.
       kernel, = weights  # pylint: disable=unbalanced-tuple-unpacking
       bias, = biases  # pylint: disable=unbalanced-tuple-unpacking
-      checkpointable._track_checkpointable(kernel, name="kernel")  # pylint: disable=protected-access
-      checkpointable._track_checkpointable(bias, name="bias")  # pylint: disable=protected-access
+      trackable._track_trackable(kernel, name="kernel")  # pylint: disable=protected-access
+      trackable._track_trackable(bias, name="bias")  # pylint: disable=protected-access
     assert len(biases) == len(weights)
     for cell_index, (bias, kernel) in enumerate(zip(biases, weights)):
-      cell = checkpointable_lib.AutoCheckpointable()
-      checkpointable._track_checkpointable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
+      cell = trackable_lib.AutoTrackable()
+      trackable._track_trackable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
       cell.bias = bias
       cell.kernel = kernel
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
index 6c5f8c6b00975b3fba041271309a93cecd9f5057..4db711c1f3f2815e7b8cf275af315c062ce4c02e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
@@ -25,11 +25,13 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class AssertElementShapeTest(test_base.DatasetTestBase):
 
   def test_assert_element_shape(self):
diff --git a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
index b9840b1ff1a3df5a05db0e64f436637220f49f80..220f9934b67d1d2a97f6c0fd4ba7779f011e1b09 100644
--- a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
@@ -27,12 +27,14 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 prefix_path = "tensorflow/core/lib"
 
 
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class LMDBDatasetTest(test_base.DatasetTestBase):
 
   def setUp(self):
diff --git a/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py b/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py
index e7281d531870c75c638b5c48fa3fc6dc606a3623..78019fcc7d810da444f1407f3885d54e76a741c6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py
@@ -25,10 +25,12 @@ from tensorflow.contrib.data.python.ops import grouping
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
diff --git a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
index 2527706709fae8e459aca3489324d4db3c784be6..9275a36582a8c82b936659041129b71e100f883e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
@@ -26,11 +26,13 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 509eb78128d062c7ea44730c2797b7c919cd0d69..2ab94d00565376bfebd80ee61094831e09ed3e68 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -131,6 +131,16 @@ py_library(
     ],
 )
 
+cuda_py_test(
+    name = "one_device_strategy_test",
+    srcs = ["one_device_strategy_test.py"],
+    additional_deps = [
+        ":strategy_test_lib",
+        ":combinations",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
 py_library(
     name = "collective_all_reduce_strategy",
     srcs = ["collective_all_reduce_strategy.py"],
@@ -192,18 +202,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "one_device_strategy_test",
-    srcs = ["one_device_strategy_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":strategy_test_lib",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python/distribute:one_device_strategy",
-        "//tensorflow/python/eager:test",
-    ],
-)
-
 # TODO(priyag): Rename this test to mirrored_strategy_test
 cuda_py_test(
     name = "mirrored_strategy_multigpu_test",
@@ -517,6 +515,7 @@ cuda_py_test(
     name = "cross_device_ops_test",
     srcs = ["cross_device_ops_test.py"],
     additional_deps = [
+        ":collective_all_reduce_strategy",
         ":combinations",
         ":multi_worker_test_base",
         ":mirrored_strategy",
@@ -540,6 +539,7 @@ py_library(
     srcs = [
         "keras_backward_compat_test.py",
         "keras_test.py",
+        "keras_utils_test.py",
     ],
     deps = [
         ":combinations",
@@ -572,6 +572,24 @@ distribute_py_test(
     ],
 )
 
+distribute_py_test(
+    name = "keras_utils_test",
+    srcs = ["keras_utils_test.py"],
+    full_precision = True,
+    main = "keras_utils_test.py",
+    shard_count = 32,
+    tags = [
+        "multi_and_single_gpu",
+        "no_oss",  # TODO(b/117919883): Fix python error.
+        "no_windows_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":keras_test",
+        ":keras_test_lib",
+    ],
+)
+
 # TODO(b/121200287): Remove this in 2.0
 distribute_py_test(
     name = "keras_backward_compat_test",
@@ -783,6 +801,6 @@ tf_xla_py_test(
         ":tpu_strategy",
         "//tensorflow/compiler/tests:xla_test",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
diff --git a/tensorflow/contrib/distribute/python/checkpointing_test.py b/tensorflow/contrib/distribute/python/checkpointing_test.py
index aa5b9f57b8a5bc12ee94399ec1fc5a55177a5b5d..eadf7233f2ae5ee50b71836ebfcc895163124ac2 100644
--- a/tensorflow/contrib/distribute/python/checkpointing_test.py
+++ b/tensorflow/contrib/distribute/python/checkpointing_test.py
@@ -30,15 +30,15 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import adam as adam_v1
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
-class NonLayerCheckpointable(tracking.AutoCheckpointable):
+class NonLayerTrackable(tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
         self, name="a_variable", shape=[])
 
 
@@ -49,8 +49,8 @@ class Subclassed(training.Model):
     super(Subclassed, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
@@ -76,7 +76,7 @@ class TrainingCheckpointTests(xla_test.XLATestCase):
       with strategy.scope():
         model = Subclassed()
         optimizer = adam_v1.AdamOptimizer(0.001)
-        root = checkpointable_utils.Checkpoint(
+        root = trackable_utils.Checkpoint(
             optimizer=optimizer, model=model,
             optimizer_step=training_util.get_or_create_global_step())
         root.restore(checkpoint_management.latest_checkpoint(
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
index acbe4677b401cbea4fd0ec415415f25c920e68e4..ee7640dd1cea15e62ae9912ebedbd853778364a6 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
@@ -410,6 +410,7 @@ class DistributedCollectiveAllReduceStrategyTest(
         num_gpus=num_gpus,
         use_core_strategy=use_core_strategy)
 
+  # TODO(b/124344198): Re-enable after fixing this flaky test.
   # TODO(yuefengz): Update how we use num_gpus and required_gpus
   @combinations.generate(
       combinations.combine(
@@ -418,7 +419,8 @@ class DistributedCollectiveAllReduceStrategyTest(
           required_gpus=1,
           use_dataset=[True, False],
           use_core_strategy=[True, False]))
-  def testMakeInputFnIterator(self, num_gpus, use_dataset, use_core_strategy):
+  def DISABLED_testMakeInputFnIterator(self, num_gpus, use_dataset,
+                                       use_core_strategy):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     if use_dataset:
@@ -553,7 +555,7 @@ class LocalCollectiveAllReduceStrategy(
           required_gpus=2,
           use_dataset=[True, False],
           use_core_strategy=[True, False]))
-  def testMakeInputFnIterator(self, use_dataset, use_core_strategy):
+  def DISABLED_testMakeInputFnIterator(self, use_dataset, use_core_strategy):
     num_gpus = 2
     if use_dataset:
       fn = lambda: dataset_ops.Dataset.range(5 * num_gpus)
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index 798a1591c73c4f4f3f37b015d20ec31c40aaa939..7c0f8033fbc046580bc46f90ee9945ffa2a718f9 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -352,6 +352,9 @@ default_strategy = NamedDistribution(
 one_device_strategy = NamedDistribution(
     "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
     required_gpus=None)
+one_device_strategy_gpu = NamedDistribution(
+    "OneDeviceGPU", lambda: one_device_lib.OneDeviceStrategy("/gpu:0"),
+    required_gpus=1)
 tpu_strategy = NamedDistribution(
     "TPU", _get_tpu_strategy_creator(steps_per_run=2),
     required_tpu=True)
diff --git a/tensorflow/contrib/distribute/python/cross_device_ops_test.py b/tensorflow/contrib/distribute/python/cross_device_ops_test.py
index 54cce2988383fcf5e063726948fbbf62c7094ce5..2b8e0197961ae37b67dc8958054a03e164242dcd 100644
--- a/tensorflow/contrib/distribute/python/cross_device_ops_test.py
+++ b/tensorflow/contrib/distribute/python/cross_device_ops_test.py
@@ -23,6 +23,7 @@ import itertools
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.contrib.distribute.python import collective_all_reduce_strategy
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import multi_worker_test_base
@@ -204,15 +205,15 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
   reduction_to_one_combinations = combinations.combine(
       cross_device_ops=[
           combinations.NamedObject(
-              "DefaultReductionToOneDeviceCrossDeviceOps",
-              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
+              "DefaultReductionToOneDevice",
+              cross_device_ops_lib.ReductionToOneDevice()),
           combinations.NamedObject(
               "ReductionToCPUDeviceCrossDeviceOps",
-              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+              cross_device_ops_lib.ReductionToOneDevice(
                   reduce_to_device=_cpu_device)),
           combinations.NamedObject(
               "AccumulateNCrossDeviceOp",
-              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+              cross_device_ops_lib.ReductionToOneDevice(
                   accumulation_fn=math_ops.accumulate_n)),
       ],
       distribution=[
@@ -228,20 +229,23 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
           combinations.NamedObject(
               "AllReduce",
               cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
-          combinations.NamedObject(
-              "HierarchicalCopy",
-              cross_device_ops_lib.AllReduceCrossDeviceOps(
-                  "hierarchical_copy", 8, 0, 0)),
           combinations.NamedObject(
               "AllReduceNoGradientRepacking",
               cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
+          combinations.NamedObject("NcclAllReduce",
+                                   cross_device_ops_lib.NcclAllReduce()),
+          combinations.NamedObject(
+              "HierarchicalCopy",
+              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
           combinations.NamedObject(
               "HierarchicalCopyAggregateSmallTensors",
               cross_device_ops_lib.AllReduceCrossDeviceOps(
                   "hierarchical_copy", 0, 100, 10))
       ],
-      distribution=[combinations.mirrored_strategy_with_two_gpus,
-                    combinations.core_mirrored_strategy_with_two_gpus],
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus
+      ],
       mode=["graph", "eager"])
 
   @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
@@ -306,8 +310,8 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
       combinations.combine(
           cross_device_ops_instance=[
               combinations.NamedObject(
-                  "ReductionToOneDeviceCrossDeviceOps",
-                  cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
+                  "ReductionToOneDevice",
+                  cross_device_ops_lib.ReductionToOneDevice()),
               combinations.NamedObject(
                   "AllReduceCrossDeviceOps",
                   cross_device_ops_lib.AllReduceCrossDeviceOps())
@@ -426,6 +430,9 @@ class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
       self._testReductionAndBroadcast(cross_device_ops, distribution)
 
 
+NUM_WORKERS = 3
+
+
 class MultiWorkerCollectiveAllReduceTest(
     multi_worker_test_base.MultiWorkerTestBase, parameterized.TestCase):
 
@@ -433,9 +440,9 @@ class MultiWorkerCollectiveAllReduceTest(
 
   @classmethod
   def setUpClass(cls):
-    """Create a local cluster with 2 workers."""
+    """Create a local cluster with 3 workers."""
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
-        num_workers=3, num_ps=0)
+        num_workers=NUM_WORKERS, num_ps=0)
 
   def setUp(self):
     super(MultiWorkerCollectiveAllReduceTest, self).setUp()
@@ -443,7 +450,12 @@ class MultiWorkerCollectiveAllReduceTest(
     # collective key base for different tests.
     MultiWorkerCollectiveAllReduceTest.collective_key_base += 100000
 
-  def _get_test_objects(self, task_type, task_id, num_gpus=0, local_mode=False):
+  def _get_test_objects(self,
+                        task_type,
+                        task_id,
+                        num_gpus=0,
+                        use_strategy_object=False,
+                        local_mode=False):
     collective_keys = cross_device_utils.CollectiveKeys(
         group_key_start=10 * num_gpus +
         MultiWorkerCollectiveAllReduceTest.collective_key_base,
@@ -452,16 +464,24 @@ class MultiWorkerCollectiveAllReduceTest(
         instance_key_with_id_start=num_gpus * 10000 +
         MultiWorkerCollectiveAllReduceTest.collective_key_base)
     if local_mode:
-      collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
-          1, num_gpus, collective_keys=collective_keys)
       if num_gpus:
         devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
       else:
         devices = ["/device:CPU:0"]
-      return collective_all_reduce_ops, devices, ""
+
+      if use_strategy_object:
+        # Still using contrib CollectiveAllReduceStrategy because we can specify
+        # num_gpus in its constructor.
+        strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
+            num_gpus_per_worker=num_gpus)
+        strategy.extended._collective_keys = collective_keys
+        strategy.extended._cross_device_ops._collective_keys = collective_keys
+        return strategy, devices, ""
+      else:
+        collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
+            1, num_gpus, collective_keys=collective_keys)
+        return collective_all_reduce_ops, devices, ""
     else:
-      collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
-          3, num_gpus, collective_keys=collective_keys)
       if num_gpus:
         devices = [
             "/job:%s/task:%d/device:GPU:%d" % (task_type, task_id, i)
@@ -469,8 +489,23 @@ class MultiWorkerCollectiveAllReduceTest(
         ]
       else:
         devices = ["/job:%s/task:%d" % (task_type, task_id)]
-      return (collective_all_reduce_ops, devices,
-              "grpc://" + self._cluster_spec[task_type][task_id])
+
+      if use_strategy_object:
+        strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
+            num_gpus_per_worker=num_gpus)
+        strategy.configure(
+            cluster_spec=self._cluster_spec,
+            task_type=task_type,
+            task_id=task_id)
+        strategy.extended._collective_keys = collective_keys
+        strategy.extended._cross_device_ops._collective_keys = collective_keys
+        return (strategy, devices,
+                "grpc://" + self._cluster_spec[task_type][task_id])
+      else:
+        collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
+            NUM_WORKERS, num_gpus, collective_keys=collective_keys)
+        return (collective_all_reduce_ops, devices,
+                "grpc://" + self._cluster_spec[task_type][task_id])
 
   def _assert_values_equal(self, left, right, sess):
     if isinstance(left, list):
@@ -490,9 +525,18 @@ class MultiWorkerCollectiveAllReduceTest(
       for l, r in zip(left_values, right_values):
         self.assertEqual(l, r)
 
-  def _test_reduction(self, task_type, task_id, num_gpus, local_mode=False):
+  def _test_reduction(self,
+                      task_type,
+                      task_id,
+                      num_gpus,
+                      use_strategy_object=False,
+                      local_mode=False):
     collective_all_reduce, devices, master_target = self._get_test_objects(
-        task_type, task_id, num_gpus, local_mode=local_mode)
+        task_type,
+        task_id,
+        num_gpus,
+        use_strategy_object=use_strategy_object,
+        local_mode=local_mode)
     if local_mode:
       num_workers = 1
       worker_device = None
@@ -500,6 +544,27 @@ class MultiWorkerCollectiveAllReduceTest(
       num_workers = len(self._cluster_spec.get("chief", [])) + len(
           self._cluster_spec.get("worker", []))
       worker_device = "/job:%s/task:%d" % (task_type, task_id)
+
+    def _reduce(test_object, reduce_op, per_replica, destinations):
+      if use_strategy_object:
+        with test_object.scope():
+          # Mimic the behavior that distribution strategy usually strips the
+          # wrapper if there is only one value.
+          if len(per_replica.values) == 1:
+            per_replica = per_replica.values[0]
+          return test_object.extended.reduce_to(reduce_op, per_replica,
+                                                destinations)
+      else:
+        return test_object.reduce(reduce_op, per_replica, destinations)
+
+    def _batch_reduce(test_object, reduce_op, value_destination_pairs):
+      if use_strategy_object:
+        with test_object.scope():
+          return test_object.extended.batch_reduce_to(reduce_op,
+                                                      value_destination_pairs)
+      else:
+        return test_object.batch_reduce(reduce_op, value_destination_pairs)
+
     with ops.Graph().as_default(), \
          ops.device(worker_device), \
          self.cached_session(target=master_target) as sess:
@@ -524,26 +589,30 @@ class MultiWorkerCollectiveAllReduceTest(
       # test reduce()
       for destinations in all_destinations:
         self._assert_values_equal(
-            collective_all_reduce.reduce(
+            _reduce(
+                collective_all_reduce,
                 reduce_util.ReduceOp.MEAN,
                 per_replica,
-                destinations=destinations),
-            _fake_mirrored(mean, destinations), sess)
+                destinations=destinations), _fake_mirrored(mean, destinations),
+            sess)
         self._assert_values_equal(
-            collective_all_reduce.reduce(
+            _reduce(
+                collective_all_reduce,
                 reduce_util.ReduceOp.MEAN,
                 per_replica_2,
-                destinations=destinations),
-            _fake_mirrored(mean_2, destinations), sess)
+                destinations=destinations), _fake_mirrored(
+                    mean_2, destinations), sess)
         self._assert_values_equal(
-            collective_all_reduce.reduce(
+            _reduce(
+                collective_all_reduce,
                 reduce_util.ReduceOp.SUM,
                 per_replica,
                 destinations=destinations),
             _fake_mirrored(mean * len(devices) * num_workers, destinations),
             sess)
         self._assert_values_equal(
-            collective_all_reduce.reduce(
+            _reduce(
+                collective_all_reduce,
                 reduce_util.ReduceOp.SUM,
                 per_replica_2,
                 destinations=destinations),
@@ -553,17 +622,13 @@ class MultiWorkerCollectiveAllReduceTest(
       # test batch_reduce()
       for d1, d2 in itertools.product(all_destinations, all_destinations):
         self._assert_values_equal(
-            collective_all_reduce.batch_reduce(reduce_util.ReduceOp.MEAN,
-                                               [(per_replica, d1),
-                                                (per_replica_2, d2)]),
-            [
-                _fake_mirrored(mean, d1),
-                _fake_mirrored(mean_2, d2)
-            ], sess)
+            _batch_reduce(collective_all_reduce, reduce_util.ReduceOp.MEAN,
+                          [(per_replica, d1), (per_replica_2, d2)]),
+            [_fake_mirrored(mean, d1),
+             _fake_mirrored(mean_2, d2)], sess)
         self._assert_values_equal(
-            collective_all_reduce.batch_reduce(reduce_util.ReduceOp.SUM,
-                                               [(per_replica, d1),
-                                                (per_replica_2, d2)]),
+            _batch_reduce(collective_all_reduce, reduce_util.ReduceOp.SUM,
+                          [(per_replica, d1), (per_replica_2, d2)]),
             [
                 _fake_mirrored(mean * len(devices) * num_workers, d1),
                 _fake_mirrored(mean_2 * len(devices) * num_workers, d2)
@@ -572,18 +637,36 @@ class MultiWorkerCollectiveAllReduceTest(
     return True
 
   @combinations.generate(
-      combinations.combine(mode=["graph"], num_gpus=[0, 1, 2], required_gpus=1))
-  def testReductionDistributed(self, num_gpus):
+      combinations.combine(
+          mode=["graph"],
+          num_gpus=[0, 1, 2],
+          required_gpus=1,
+          use_strategy_object=[True, False]))
+  def testReductionDistributed(self, num_gpus, use_strategy_object):
     if context.num_gpus() < num_gpus:
       return
-    self._run_between_graph_clients(self._test_reduction, self._cluster_spec,
-                                    num_gpus)
+    self._run_between_graph_clients(
+        self._test_reduction,
+        self._cluster_spec,
+        num_gpus,
+        use_strategy_object=use_strategy_object)
 
   # Collective ops doesn't support strategy with one device.
-  def testReductionLocal(self, num_gpus=2):
+  @combinations.generate(
+      combinations.combine(
+          mode=["graph"],
+          num_gpus=[2],
+          required_gpus=2,
+          use_strategy_object=[True, False]))
+  def testReductionLocal(self, num_gpus, use_strategy_object):
     if context.num_gpus() < num_gpus:
       return
-    self._test_reduction(None, None, num_gpus, local_mode=True)
+    self._test_reduction(
+        None,
+        None,
+        num_gpus,
+        use_strategy_object=use_strategy_object,
+        local_mode=True)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
index e17085628ba6d1dfc79839fd824801723f07a518..1ff1e7c1d255492e0535175dae7594d2ceb4010b 100644
--- a/tensorflow/contrib/distribute/python/estimator_integration_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -22,7 +22,6 @@ import shutil
 import tempfile
 from absl.testing import parameterized
 import numpy as np
-import six
 
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.optimizer_v2 import adagrad
@@ -117,7 +116,7 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
       scores = estimator.evaluate(eval_input_fn)
 
     self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
+    self.assertIn('loss', scores)
 
     predictions = np.array([
         x[prediction_keys.PredictionKeys.PREDICTIONS]
diff --git a/tensorflow/contrib/distribute/python/input_lib_test.py b/tensorflow/contrib/distribute/python/input_lib_test.py
index 10a58316ec5b3d9d968a88c5c39ff70c277daa65..204f52b034f2366a42fbdab41c467feddb5969a0 100644
--- a/tensorflow/contrib/distribute/python/input_lib_test.py
+++ b/tensorflow/contrib/distribute/python/input_lib_test.py
@@ -22,7 +22,6 @@ from absl.testing import parameterized
 
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import multi_worker_test_base
-from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import input_lib
@@ -214,33 +213,5 @@ class InputIteratorMultiWorkerTest(
                           expected_values, sess)
 
 
-class SplitDatasetBatchTest(test.TestCase):
-
-  def testBatchDataset(self):
-    dataset = dataset_ops.Dataset.range(100).batch(20)
-    split_batch_by = 2
-    result_dataset = input_lib._split_dataset_batch(dataset, split_batch_by)
-    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
-    result = [self.evaluate(el) for el in result_dataset]
-    self.assertAllEqual(expected_values, result)
-
-  def testMapAndBatchDataset(self):
-    dataset = dataset_ops.Dataset.range(100)
-    dataset = dataset.apply(batching.map_and_batch(lambda x: x, 20))
-    split_batch_by = 2
-    result_dataset = input_lib._split_dataset_batch(dataset, split_batch_by)
-    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
-    result = [self.evaluate(el) for el in result_dataset]
-    self.assertAllEqual(expected_values, result)
-
-  def testPrefetchDataset(self):
-    dataset = dataset_ops.Dataset.range(100).batch(20).prefetch(1)
-    split_batch_by = 2
-    result_dataset = input_lib._split_dataset_batch(dataset, split_batch_by)
-    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
-    result = [self.evaluate(el) for el in result_dataset]
-    self.assertAllEqual(expected_values, result)
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_backward_compat_test.py b/tensorflow/contrib/distribute/python/keras_backward_compat_test.py
index 9a581e7141af4a6625246539bc48835e6a920887..c49b5522f9135efd9ae3005e92099caf54a76a3a 100644
--- a/tensorflow/contrib/distribute/python/keras_backward_compat_test.py
+++ b/tensorflow/contrib/distribute/python/keras_backward_compat_test.py
@@ -31,10 +31,10 @@ from tensorflow.python.framework import random_seed
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import distributed_training_utils
 from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.ops.parsing_ops import gen_parsing_ops
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import rmsprop
-from tensorflow.python.training.mode_keys import ModeKeys
 
 _RANDOM_SEED = 1337
 _TRAIN_SIZE = 200
diff --git a/tensorflow/contrib/distribute/python/keras_dnn_correctness_test.py b/tensorflow/contrib/distribute/python/keras_dnn_correctness_test.py
index dae32188917cce9209b8e51032ef808352bc257c..61202e30c4f33892d2675080fae07cc4d7102337 100644
--- a/tensorflow/contrib/distribute/python/keras_dnn_correctness_test.py
+++ b/tensorflow/contrib/distribute/python/keras_dnn_correctness_test.py
@@ -47,7 +47,9 @@ class TestDistributionStrategyDnnCorrectness(
       # We add few non-linear layers to make it non-trivial.
       model = keras.Sequential()
       model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,)))
-      model.add(keras.layers.Dense(10, activation='relu'))
+      model.add(keras.layers.Dense(
+          10, activation='relu',
+          kernel_regularizer=keras.regularizers.l2(1e-4)))
       model.add(keras.layers.Dense(10, activation='relu'))
       model.add(keras.layers.Dense(1))
 
diff --git a/tensorflow/contrib/distribute/python/keras_image_model_correctness_test.py b/tensorflow/contrib/distribute/python/keras_image_model_correctness_test.py
index f625664372dfb6814ccbe9539f6abe018d2a4447..3c2961456b2eede9570ce29f7a8900834f2ccfb7 100644
--- a/tensorflow/contrib/distribute/python/keras_image_model_correctness_test.py
+++ b/tensorflow/contrib/distribute/python/keras_image_model_correctness_test.py
@@ -23,7 +23,7 @@ from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import keras_correctness_test_base
 from tensorflow.python import keras
 from tensorflow.python.eager import test
-from tensorflow.python.training import gradient_descent
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
 
 
 class DistributionStrategyCnnCorrectnessTest(
@@ -33,7 +33,8 @@ class DistributionStrategyCnnCorrectnessTest(
     with keras_correctness_test_base.MaybeDistributionScope(distribution):
       image = keras.layers.Input(shape=(28, 28, 3), name='image')
       c1 = keras.layers.Conv2D(
-          name='conv1', filters=16, kernel_size=(3, 3), strides=(4, 4))(
+          name='conv1', filters=16, kernel_size=(3, 3), strides=(4, 4),
+          kernel_regularizer=keras.regularizers.l2(1e-4))(
               image)
       if self.with_batch_norm:
         c1 = keras.layers.BatchNormalization(name='bn1')(c1)
@@ -47,7 +48,7 @@ class DistributionStrategyCnnCorrectnessTest(
         model.set_weights(initial_weights)
 
       model.compile(
-          optimizer=gradient_descent.GradientDescentOptimizer(
+          optimizer=gradient_descent.SGD(
               learning_rate=0.1),
           loss='sparse_categorical_crossentropy',
           metrics=['sparse_categorical_accuracy'])
diff --git a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
index 952b11932b83d16558ac9f5ce780886d94e72744..c93d7afa7ceef2c9c272e91997e2871655cea079 100644
--- a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
@@ -64,7 +64,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase, parameterized.TestCase):
 
         def loss_fn():
           replica_id = _replica_id()
-          return math_ops.cast(replica_id + 1, dtype=dtypes.float32) * var
+          return math_ops.cast(replica_id + 1, dtype=dtypes.float32) * 0.5 * var
 
         train_op = optimizer.minimize(loss_fn, var_list=[var])
 
diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py
index dd975c6c36d5d5387035e9da4170e4072406d79c..77e241974f7c4c27382ab548a202891fdbbc6ba0 100644
--- a/tensorflow/contrib/distribute/python/keras_test.py
+++ b/tensorflow/contrib/distribute/python/keras_test.py
@@ -17,9 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import os
-import tempfile
 from absl.testing import parameterized
 import numpy as np
 
@@ -27,17 +25,17 @@ from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import tpu_strategy
 from tensorflow.python import keras
+from tensorflow.python.data.experimental.ops import cardinality
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import values
 from tensorflow.python.eager import test
 from tensorflow.python.estimator import keras as keras_lib
 from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import distributed_training_utils
 from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.parsing_ops import gen_parsing_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.summary.writer import writer_cache
@@ -72,6 +70,20 @@ def simple_functional_model():
   return model
 
 
+def simple_subclassed_model(num_labels=_NUM_CLASS):
+
+  class _SimpleMLP(keras.Model):
+
+    def __init__(self, num_labels):
+      super(_SimpleMLP, self).__init__()
+      self.dense = keras.layers.Dense(num_labels)
+
+    def call(self, inputs):
+      return self.dense(inputs)
+
+  return _SimpleMLP(num_labels)
+
+
 def simple_multi_inputs_multi_outputs_model():
   input_a = keras.layers.Input(shape=(16,), name='input_a')
   input_b = keras.layers.Input(shape=(16,), name='input_b')
@@ -216,6 +228,22 @@ def get_predict_dataset(distribution):
   return dataset
 
 
+def convert_numpy_to_dataset_with_unknown_cardinality(inputs,
+                                                      targets=None):
+  if targets is not None:
+    input_slices = (inputs, targets)
+    dummy_op = (lambda inp, target: True)
+  else:
+    input_slices = inputs
+    dummy_op = (lambda inp: True)
+
+  original_dataset = (dataset_ops.Dataset.from_tensor_slices(
+      input_slices))
+  ds_with_unknown_cardinality = (original_dataset.filter(dummy_op).
+                                 batch(10, drop_remainder=True))
+  return ds_with_unknown_cardinality
+
+
 def multi_input_output_model():
   a = keras.layers.Input(shape=(3,), name='input_a')
   b = keras.layers.Input(shape=(5,), name='input_b')
@@ -230,9 +258,12 @@ def multi_input_output_model():
   return model
 
 
+# TODO(josh11b): Add combinations.one_device_strategy_gpu once it works with
+# TestDistributionStrategyWithCallbacks.test_callbacks_in_predict.
 strategies_minus_tpu = [
     combinations.default_strategy,
     combinations.one_device_strategy,
+    combinations.one_device_strategy_gpu,
     combinations.mirrored_strategy_with_gpu_and_cpu,
     combinations.mirrored_strategy_with_two_gpus,
     combinations.core_mirrored_strategy_with_gpu_and_cpu,
@@ -244,15 +275,13 @@ tpu_strategies = [
 
 
 def strategy_minus_tpu_combinations():
-  return combinations.combine(
-      distribution=strategies_minus_tpu,
-      mode=['graph', 'eager'])
+  return combinations.combine(distribution=strategies_minus_tpu,
+                              mode=['graph', 'eager'])
 
 
 def tpu_strategy_combinations():
-  return combinations.combine(
-      distribution=tpu_strategies,
-      mode=['graph'])
+  return combinations.combine(distribution=tpu_strategies,
+                              mode=['graph'])
 
 
 def all_strategy_combinations():
@@ -263,6 +292,7 @@ def all_strategy_combinations_minus_default():
   strategy_minus_default_combinations = combinations.combine(
       distribution=[
           combinations.one_device_strategy,
+          combinations.one_device_strategy_gpu,
           combinations.mirrored_strategy_with_gpu_and_cpu,
           combinations.mirrored_strategy_with_two_gpus,
           combinations.core_mirrored_strategy_with_gpu_and_cpu,
@@ -286,12 +316,6 @@ def strategy_and_optimizer_combinations():
       ]))
 
 
-def strategy_for_numpy_input_combinations():
-  return combinations.combine(
-      distribution=strategies_minus_tpu + tpu_strategies,
-      mode=['graph'])
-
-
 class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
                                         parameterized.TestCase):
 
@@ -447,7 +471,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
 class TestDistributionStrategyWithNumpyArrays(test.TestCase,
                                               parameterized.TestCase):
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calculating_input_params_no_steps_no_batch_size(self, distribution):
     # Calculate the per_replica_batch_size scaling factor for strategies
     # that use per_core_batch_size
@@ -478,7 +502,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         distributed_training_utils.get_input_params(
             distribution, input_63_samples, steps=None, batch_size=None)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calculating_input_params_with_steps_no_batch_size(self,
                                                              distribution):
     # Calculate the per_replica_batch_size scaling factor for strategies
@@ -524,7 +548,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
           distributed_training_utils.get_input_params(
               distribution, input_63_samples, steps=1, batch_size=None)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calculating_input_params_no_steps_with_batch_size(self,
                                                              distribution):
     # Calculate the per_replica_batch_size scaling factor for strategies
@@ -558,7 +582,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         distributed_training_utils.get_input_params(
             distribution, input_64_samples, steps=None, batch_size=3)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calculating_input_params_with_steps_with_batch_size(self,
                                                                distribution):
     with self.cached_session():
@@ -575,7 +599,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         distributed_training_utils.get_input_params(
             distribution, input_64_samples, steps=10, batch_size=13)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calling_model_with_numpy_arrays(self, distribution):
     with self.cached_session():
       with distribution.scope():
@@ -606,7 +630,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         # with batch_size
         model.predict(inputs, batch_size=8)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calling_model_with_nested_numpy_arrays(self, distribution):
     with self.cached_session():
       with distribution.scope():
@@ -658,7 +682,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       model.fit(inputs, targets, sample_weight=sample_weights, epochs=1,
                 steps_per_epoch=2, verbose=1)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_flatten_predict_outputs(self, distribution):
     with self.cached_session():
       with distribution.scope():
@@ -841,9 +865,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
       model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1)
 
-  # TODO(b/122743976): Include TPUStrategy for this test as well once
-  # step inference is supported.
-  @combinations.generate(strategy_minus_tpu_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_fit_eval_and_predict_methods_on_dataset_without_steps(
       self, distribution):
     with self.cached_session():
@@ -864,7 +886,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
       predict_with_numpy = model.predict(inputs, batch_size=10)
 
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.batch(10)
+      dataset = dataset.batch(10, drop_remainder=True)
       fit_with_ds = model.fit(dataset, epochs=1).history
       eval_with_ds = model.evaluate(dataset)
       predict_dataset = dataset_ops.Dataset.from_tensor_slices(inputs)
@@ -877,6 +899,61 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
       self.assertAllClose(
           predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4)
 
+  @combinations.generate(all_strategy_combinations())
+  def test_on_dataset_with_unknown_cardinality_without_steps(
+      self, distribution):
+    with self.cached_session():
+      with distribution.scope():
+        model = get_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+        loss = 'mse'
+        metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+        model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((1000, 3), dtype=np.float32)
+      targets = np.zeros((1000, 4), dtype=np.float32)
+      # steps/steps_per_epoch are calculated when using numpy arrays as
+      # input data.
+      fit_with_numpy = model.fit(inputs, targets, epochs=1,
+                                 batch_size=10).history
+      fit_with_numpy_multiple_epochs = model.fit(
+          inputs, targets, epochs=2, batch_size=10).history
+      eval_with_numpy = model.evaluate(inputs, targets, batch_size=10)
+      predict_with_numpy = model.predict(inputs, batch_size=10)
+
+      dataset = convert_numpy_to_dataset_with_unknown_cardinality(
+          inputs, targets)
+      predict_dataset = convert_numpy_to_dataset_with_unknown_cardinality(
+          inputs)
+
+      self.assertEqual(keras.backend.get_value(cardinality.cardinality(
+          dataset)), cardinality.UNKNOWN)
+      self.assertEqual(keras.backend.get_value(cardinality.cardinality(
+          predict_dataset)), cardinality.UNKNOWN)
+
+      eval_with_ds = model.evaluate(dataset)
+      predict_with_ds = model.predict(predict_dataset)
+      self.assertAllClose(
+          eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4)
+      self.assertAllClose(
+          predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4)
+
+      if (distributed_training_utils.is_tpu_strategy(distribution) and
+          distribution.extended.steps_per_run != 1):
+        with self.assertRaisesRegexp(ValueError, '`steps_per_epoch` '
+                                     'should be specified'):
+          fit_with_ds = model.fit(dataset, epochs=1)
+      else:
+        fit_with_ds = model.fit(dataset,
+                                epochs=1).history
+        fit_with_ds_multiple_epochs = model.fit(dataset,
+                                                epochs=2).history
+        self.assertAllClose(
+            fit_with_numpy, fit_with_ds, atol=1e-4, rtol=1e-4)
+        self.assertAllClose(
+            fit_with_numpy_multiple_epochs,
+            fit_with_ds_multiple_epochs, atol=1e-4, rtol=1e-4)
+
   @combinations.generate(all_strategy_combinations())
   def test_fit_eval_and_predict_methods_on_dataset(self, distribution):
     with self.cached_session():
@@ -1123,397 +1200,126 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
           atol=1e-4, rtol=1e-4)
 
 
-class Counter(keras.callbacks.Callback):
-  """Counts the number of times each callback method was run.
+class TestRegularizerLoss(test.TestCase, parameterized.TestCase):
+  class IdentityRegularizer(keras.regularizers.Regularizer):
 
-  Attributes:
-    method_counts: dict. Contains the counts of time  each callback method was
-      run.
-  """
+    def __call__(self, x):
+      return array_ops.identity(x)
 
-  def __init__(self):
-    self.method_counts = collections.defaultdict(int)
-    methods_to_count = [
-        'on_batch_begin', 'on_batch_end', 'on_epoch_begin', 'on_epoch_end',
-        'on_predict_batch_begin', 'on_predict_batch_end', 'on_predict_begin',
-        'on_predict_end', 'on_test_batch_begin', 'on_test_batch_end',
-        'on_test_begin', 'on_test_end', 'on_train_batch_begin',
-        'on_train_batch_end', 'on_train_begin', 'on_train_end'
-    ]
-    for method_name in methods_to_count:
-      setattr(self, method_name,
-              self.wrap_with_counts(method_name, getattr(self, method_name)))
+  class AddLayer(keras.layers.Layer):
 
-  def wrap_with_counts(self, method_name, method):
+    def build(self, _):
+      self.v = self.add_weight(
+          'v', (), initializer='ones',
+          regularizer=TestRegularizerLoss.IdentityRegularizer())
 
-    def _call_and_count(*args, **kwargs):
-      self.method_counts[method_name] += 1
-      return method(*args, **kwargs)
+    def call(self, inputs):
+      return inputs + self.v
 
-    return _call_and_count
+  @staticmethod
+  def loss_fn(_, y_pred):
+    return math_ops.reduce_mean(y_pred)
 
-
-class TestDistributionStrategyWithCallbacks(test.TestCase,
-                                            parameterized.TestCase):
-
-  @combinations.generate(all_strategy_combinations())
-  def test_callbacks_in_fit(self, distribution):
+  @combinations.generate(all_strategy_combinations_minus_default())
+  def test_regularizer_loss(self, distribution):
+    batch_size = 2
+    if not distributed_training_utils.global_batch_size_supported(distribution):
+      batch_size //= distribution.num_replicas_in_sync
+
+      # Given an input x, which is always 1, and variable v, this model computes
+      # Loss=x+v+regularizer_loss, where regularizer_loss=v and the variable is
+      # initialized to 1. Therefore, this model computes Loss=1+2v, and so the
+      # gradient dLoss/dv = 2. This gradient of 2 is averaged over all examples
+      # in a batch and then multiplied by the learning rate of 1. As a result,
+      # the model update for one batch should subtract 2 from v, resulting in v
+      # being -1. If the regularizer loss is not scaled correctly by number of
+      # replicas, the variable value will be incorrect when number of replicas
+      # >1. For e.g. it will be -2 if num replicas = 2.
     with distribution.scope():
-      model = get_model()
-      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
-
-    dataset = get_dataset(distribution)
-    counter = Counter()
-
-    epochs = 2
-    steps_per_epoch = 5
-    validation_steps = 3
-
-    model.fit(
-        dataset,
-        epochs=epochs,
-        steps_per_epoch=steps_per_epoch,
-        verbose=0,
-        validation_data=dataset,
-        validation_steps=validation_steps,
-        callbacks=[counter])
-
-    if isinstance(distribution, tpu_strategy.TPUStrategy):
-      # TPU Strategy can have multi step training, from extended.steps_per_run
-      # if steps_per_run = 1, then num_batch_call_per_epoch = steps_per_epoch
-      steps_per_run = distribution.extended.steps_per_run
-      num_batch_call_per_epoch = steps_per_epoch // steps_per_run
-      if steps_per_epoch % steps_per_run:
-        num_batch_call_per_epoch += 1
-    else:
-      num_batch_call_per_epoch = steps_per_epoch
-
-    self.assertDictEqual(
-        counter.method_counts, {
-            'on_batch_begin': epochs * num_batch_call_per_epoch,
-            'on_batch_end': epochs * num_batch_call_per_epoch,
-            'on_epoch_begin': epochs,
-            'on_epoch_end': epochs,
-            'on_test_batch_begin': epochs * validation_steps,
-            'on_test_batch_end': epochs * validation_steps,
-            'on_test_begin': epochs,
-            'on_test_end': epochs,
-            'on_train_batch_begin': epochs * num_batch_call_per_epoch,
-            'on_train_batch_end': epochs * num_batch_call_per_epoch,
-            'on_train_begin': 1,
-            'on_train_end': 1
-        })
+      x = keras.layers.Input(shape=(), batch_size=batch_size)
+      y = TestRegularizerLoss.AddLayer()(x)
+      model = keras.models.Model(inputs=x, outputs=y)
+      opt = gradient_descent_keras.SGD(1.)
+      model.compile(opt, loss=TestRegularizerLoss.loss_fn)
+      model.fit(
+          x=np.array([[1.], [1.]], dtype=np.float32),
+          y=np.array([[1.], [1.]], dtype=np.float32),
+          batch_size=batch_size)
+      v = model.get_weights()[0]
+      self.assertEqual(-1.0, v)
+
+
+class TestDistributionStrategyWithKerasModels(test.TestCase,
+                                              parameterized.TestCase):
 
   @combinations.generate(all_strategy_combinations())
-  def test_callbacks_in_eval(self, distribution):
+  def test_distribution_strategy_on_sequential_model(self, distribution):
     with distribution.scope():
-      model = get_model()
-      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
-
-    dataset = get_dataset(distribution)
-    counter = Counter()
+      model = simple_sequential_model()
+      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss)
 
-    model.evaluate(dataset, steps=5, callbacks=[counter])
+      inputs = np.zeros((20, 10), np.float32)
+      targets = np.zeros((20, 2), np.float32)
 
-    self.assertDictEqual(
-        counter.method_counts, {
-            'on_test_batch_begin': 5,
-            'on_test_batch_end': 5,
-            'on_test_begin': 1,
-            'on_test_end': 1
-        })
+    model.fit(inputs, targets, epochs=1, steps_per_epoch=2)
+    model.predict(inputs, steps=1)
+    model.evaluate(inputs, targets, steps=1)
 
   @combinations.generate(all_strategy_combinations())
-  def test_callbacks_in_predict(self, distribution):
+  def test_distribution_strategy_on_functional_model(self, distribution):
     with distribution.scope():
       model = get_model()
-      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
-
-    dataset = get_dataset(distribution)
-    counter = Counter()
-
-    model.predict(get_predict_dataset(dataset), steps=5, callbacks=[counter])
-
-    self.assertDictEqual(
-        counter.method_counts, {
-            'on_predict_batch_begin': 5,
-            'on_predict_batch_end': 5,
-            'on_predict_begin': 1,
-            'on_predict_end': 1
-        })
-
-
-class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
-
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_validating_dataset_input_tensors_with_shape_mismatch(self,
-                                                                distribution):
-    with self.cached_session():
-      a = constant_op.constant([1, 2], shape=(1, 2))
-      b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2))
-      device_map = values.ReplicaDeviceMap(('/device:CPU:0', '/device:GPU:0'))
-      x = values.DistributedValues(device_map, (a, b))
-      y = values.DistributedValues(device_map, (a, a))
-      # Removed device and input tensor shape details from the error message
-      # since the order of the device and the corresponding input tensor shape
-      # is not deterministic over different runs.
-      with self.assertRaisesRegexp(ValueError,
-                                   'Input tensor shapes do not match for '
-                                   'distributed tensor inputs '
-                                   'DistributedValues:.+'):
-        with distribution.scope():
-          distributed_training_utils.validate_distributed_dataset_inputs(
-              distribution, x, y)
-
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_validating_dataset_input_tensors_with_dtype_mismatch(self,
-                                                                distribution):
-    with self.cached_session():
-      a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32)
-      b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64)
-      device_map = values.ReplicaDeviceMap(('/device:CPU:0', '/device:GPU:0'))
-      x = values.DistributedValues(device_map, (a, b))
-      y = values.DistributedValues(device_map, (a, a))
-      # Removed device and input tensor dtype details from the error message
-      # since the order of the device and the corresponding input tensor dtype
-      # is not deterministic over different runs.
-      with self.assertRaisesRegexp(ValueError,
-                                   'Input tensor dtypes do not match for '
-                                   'distributed tensor inputs '
-                                   'DistributedValues:.+'):
-        with distribution.scope():
-          distributed_training_utils.validate_distributed_dataset_inputs(
-              distribution, x, y)
-
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_unsupported_features(self, distribution):
-    with self.cached_session():
-      with distribution.scope():
-        model = get_model()
-        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-        loss = 'mse'
-        metrics = ['mae']
-        model.compile(optimizer, loss, metrics=metrics)
-
-      dataset = get_dataset(distribution)
-
-      # Test with validation split
-      with self.assertRaisesRegexp(
-          ValueError, '`validation_split` argument is not '
-                      'supported when input `x` is a dataset or a '
-                      'dataset iterator.+'):
-        model.fit(dataset,
-                  epochs=1, steps_per_epoch=2, verbose=0,
-                  validation_split=0.5, validation_steps=2)
-
-      # Test with sample weight.
-      sample_weight = np.random.random((10,))
-      with self.assertRaisesRegexp(
-          ValueError, '`sample_weight` argument is not supported when input '
-                      '`x` is a dataset or a dataset iterator.'):
-        model.fit(
-            dataset,
-            epochs=1,
-            steps_per_epoch=2,
-            verbose=0,
-            sample_weight=sample_weight)
-
-      # Test with not specifying the `steps` argument for dataset with infinite
-      # cardinality.
-      dataset = dataset.repeat()
-      with self.assertRaisesRegexp(ValueError, 'When passing an infinitely '
-                                   'repeating dataset, you must specify the '
-                                   '`steps_per_epoch` argument'):
-        model.fit(dataset, epochs=1, verbose=0)
-      with self.assertRaisesRegexp(ValueError, 'When passing an infinitely '
-                                   'repeating dataset, you must specify the '
-                                   '`steps` argument'):
-        model.evaluate(dataset, verbose=0)
-
-      with self.assertRaisesRegexp(ValueError, 'When passing an infinitely '
-                                   'repeating dataset, you must specify the '
-                                   '`steps` argument'):
-        model.predict(dataset, verbose=0)
-
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_calling_with_unsupported_predefined_callbacks(self, distribution):
-    with self.cached_session():
-      with distribution.scope():
-        model = get_model()
-        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-        loss = 'mse'
-        metrics = ['mae']
-        model.compile(optimizer, loss, metrics=metrics)
-
-      dataset = get_dataset(distribution)
-
-      def schedule(_):
-        return 0.001
-      with self.assertRaisesRegexp(ValueError,
-                                   'You must specify a Keras Optimizer V2 when '
-                                   'using'):
-        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                  callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
-
-      with self.assertRaisesRegexp(ValueError,
-                                   'You must specify a Keras Optimizer V2 when '
-                                   'using'):
-        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                  callbacks=[keras.callbacks.ReduceLROnPlateau()])
-
-
-class TestDistributionStrategyWithLossMasking(test.TestCase,
-                                              parameterized.TestCase):
-
-  # TODO(priyag): Enable all strategies for this test. Currently it does not
-  # work for TPU due to some invalid datatype.
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_masking(self, distribution):
-    with self.cached_session():
-      np.random.seed(1337)
-      x = np.array([[[1], [1]], [[0], [0]]])
-      with distribution.scope():
-        model = keras.models.Sequential()
-        model.add(keras.layers.Masking(mask_value=0, input_shape=(2, 1)))
-        model.add(
-            keras.layers.TimeDistributed(
-                keras.layers.Dense(1, kernel_initializer='one')))
-        model.compile(loss='mse',
-                      optimizer=gradient_descent.GradientDescentOptimizer(0.01))
-      y = np.array([[[1], [1]], [[1], [1]]])
-      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
-      hist = model.fit(x=dataset, epochs=1, steps_per_epoch=2)
-      self.assertEqual(hist.history['loss'][0], 0)
-
-
-class TestDistributionStrategyWithNormalizationLayer(
-    test.TestCase, parameterized.TestCase):
-
-  @combinations.generate(combinations.times(
-      all_strategy_combinations(),
-      combinations.combine(fused=[True, False])))
-  def test_batchnorm_correctness(self, distribution, fused):
-    with self.cached_session():
-      with distribution.scope():
-        model = keras.models.Sequential()
-        norm = keras.layers.BatchNormalization(
-            input_shape=(10,), momentum=0.8, fused=fused)
-        model.add(norm)
-        model.compile(loss='mse',
-                      optimizer=gradient_descent.GradientDescentOptimizer(0.01))
-
-      # centered on 5.0, variance 10.0
-      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10))
-      x = x.astype('float32')
-      dataset = dataset_ops.Dataset.from_tensor_slices((x, x))
-      dataset = dataset.repeat(100)
-      dataset = batch_wrapper(dataset, 32, distribution)
-
-      predict_dataset = dataset_ops.Dataset.from_tensor_slices(x)
-      predict_dataset = predict_dataset.repeat(100)
-      predict_dataset = batch_wrapper(predict_dataset, 32, distribution)
-
-      model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10)
-      out = model.predict(predict_dataset, steps=2)
-      out -= keras.backend.eval(norm.beta)
-      out /= keras.backend.eval(norm.gamma)
-      np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
-      np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
-
-
-class TestDistributionStrategySaveLoadWeights(test.TestCase,
-                                              parameterized.TestCase):
-
-  @combinations.generate(all_strategy_combinations_minus_default())
-  def test_save_load_h5(self, distribution):
-    with self.cached_session():
-      dataset = get_dataset(distribution)
-      with distribution.scope():
-        model = get_model()
-        model.compile(gradient_descent_keras.SGD(0.01), 'mse')
-        model.fit(dataset, epochs=1, steps_per_epoch=1)
+      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss)
 
-        weights_file = tempfile.mktemp('.h5')
-        model.save_weights(weights_file)
+      inputs = np.zeros((64, 3), dtype=np.float32)
+      targets = np.zeros((64, 4), dtype=np.float32)
 
-        model_2 = get_model()
-        model_2.compile(gradient_descent_keras.SGD(0.01), 'mse')
-        model_2.load_weights(weights_file)
-        model_2.predict(get_predict_dataset(distribution), steps=2)
-        model_2.fit(dataset, epochs=1, steps_per_epoch=1)
+    model.fit(inputs, targets, epochs=1, steps_per_epoch=2)
+    model.predict(inputs, steps=1)
+    model.evaluate(inputs, targets, steps=1)
 
+  # TODO(b/124377929): Remove error assertions once subclassed models
+  # are supported in DistributedStrategy.
   @combinations.generate(all_strategy_combinations_minus_default())
-  def test_save_load_checkpointable(self, distribution):
-    # TODO(sourabhbajaj): Test fails with optimizer v2 without h5
-    with self.cached_session():
-      dataset = get_dataset(distribution)
-      with distribution.scope():
-        model = get_model()
-        model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
-        model.fit(dataset, epochs=1, steps_per_epoch=1)
-
-        weights_file = tempfile.mktemp()
-        model.save_weights(weights_file)
+  def test_distribution_strategy_on_subclassed_model(self, distribution):
+    with distribution.scope():
+      model = simple_subclassed_model()
+      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss)
 
-        model_2 = get_model()
-        model_2.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
-        model_2.load_weights(weights_file)
-        model_2.predict(get_predict_dataset(distribution), steps=2)
-        model_2.fit(dataset, epochs=1, steps_per_epoch=1)
+      inputs = np.zeros((64, 3), dtype=np.float32)
+      targets = np.zeros((64, 2), dtype=np.float32)
 
+    with self.assertRaisesRegexp(AttributeError, 'has no attribute'):
+      model.fit(inputs, targets, epochs=1, steps_per_epoch=2)
 
-class TestDistributionStrategyValidation(test.TestCase,
-                                         parameterized.TestCase):
+    with self.assertRaisesRegexp(AttributeError, 'has no attribute'):
+      model.predict(inputs, steps=1)
 
-  @combinations.generate(all_strategy_combinations_minus_default())
-  def test_layer_outside_scope(self, distribution):
-    with self.cached_session():
-      with self.assertRaisesRegexp(
-          ValueError, 'was not created in the distribution strategy'):
-        x = keras.layers.Input(shape=(3,), name='input')
-        y = keras.layers.Dense(4, name='dense')(x)
-        with distribution.scope():
-          model = keras.Model(x, y)
-          optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-          loss = 'mse'
-          metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-          model.compile(optimizer, loss, metrics=metrics)
+    with self.assertRaisesRegexp(AttributeError, 'has no attribute'):
+      model.evaluate(inputs, targets, steps=1)
 
   @combinations.generate(all_strategy_combinations_minus_default())
-  def test_model_outside_scope(self, distribution):
-    with self.cached_session():
-      with self.assertRaisesRegexp(
-          ValueError, 'was not created in the distribution strategy'):
-        x = keras.layers.Input(shape=(3,), name='input')
-        y = keras.layers.Dense(4, name='dense')(x)
-        model = keras.Model(x, y)
-        with distribution.scope():
-          optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-          loss = 'mse'
-          metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-          model.compile(optimizer, loss, metrics=metrics)
+  def test_distribution_strategy_one_dimensional(self, distribution):
+    with distribution.scope():
+      inp = keras.layers.Input(shape=(10,))
+      out = keras.layers.Dense(3, activation='softmax')(inp)
+      model = keras.Model(inputs=[inp], outputs=[out])
+      model.compile(
+          optimizer='rmsprop',
+          loss='sparse_categorical_crossentropy',
+          metrics=['sparse_categorical_accuracy'],
+      )
+
+      x = np.random.random((64, 10)).astype('float32')
+      y = np.random.randint(3, size=64)
+
+      model.fit(x, y, epochs=1, steps_per_epoch=2)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/distribute/python/keras_utils_test.py b/tensorflow/contrib/distribute/python/keras_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..36eaee77f21a9f6d62a7c3f616d0126b7a4a8902
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_utils_test.py
@@ -0,0 +1,471 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.keras models with callbacks, checkpointing with dist strategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import tempfile
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import keras_test as keras_test_lib
+from tensorflow.contrib.distribute.python import tpu_strategy
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.keras.engine import distributed_training_utils
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
+from tensorflow.python.training import gradient_descent
+
+
+class Counter(keras.callbacks.Callback):
+  """Counts the number of times each callback method was run.
+
+  Attributes:
+    method_counts: dict. Contains the counts of time  each callback method was
+      run.
+  """
+
+  def __init__(self):
+    self.method_counts = collections.defaultdict(int)
+    methods_to_count = [
+        'on_batch_begin', 'on_batch_end', 'on_epoch_begin', 'on_epoch_end',
+        'on_predict_batch_begin', 'on_predict_batch_end', 'on_predict_begin',
+        'on_predict_end', 'on_test_batch_begin', 'on_test_batch_end',
+        'on_test_begin', 'on_test_end', 'on_train_batch_begin',
+        'on_train_batch_end', 'on_train_begin', 'on_train_end'
+    ]
+    for method_name in methods_to_count:
+      setattr(self, method_name,
+              self.wrap_with_counts(method_name, getattr(self, method_name)))
+
+  def wrap_with_counts(self, method_name, method):
+
+    def _call_and_count(*args, **kwargs):
+      self.method_counts[method_name] += 1
+      return method(*args, **kwargs)
+
+    return _call_and_count
+
+
+class TestDistributionStrategyWithCallbacks(test.TestCase,
+                                            parameterized.TestCase):
+
+  @combinations.generate(keras_test_lib.all_strategy_combinations())
+  def test_callbacks_in_fit(self, distribution):
+    with distribution.scope():
+      model = keras_test_lib.get_model()
+      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
+
+    dataset = keras_test_lib.get_dataset(distribution)
+    counter = Counter()
+
+    epochs = 2
+    steps_per_epoch = 5
+    validation_steps = 3
+
+    model.fit(
+        dataset,
+        epochs=epochs,
+        steps_per_epoch=steps_per_epoch,
+        verbose=0,
+        validation_data=dataset,
+        validation_steps=validation_steps,
+        callbacks=[counter])
+
+    if isinstance(distribution, tpu_strategy.TPUStrategy):
+      # TPU Strategy can have multi step training, from extended.steps_per_run
+      # if steps_per_run = 1, then num_batch_call_per_epoch = steps_per_epoch
+      steps_per_run = distribution.extended.steps_per_run
+      num_batch_call_per_epoch = steps_per_epoch // steps_per_run
+      if steps_per_epoch % steps_per_run:
+        num_batch_call_per_epoch += 1
+    else:
+      num_batch_call_per_epoch = steps_per_epoch
+
+    self.assertDictEqual(
+        counter.method_counts, {
+            'on_batch_begin': epochs * num_batch_call_per_epoch,
+            'on_batch_end': epochs * num_batch_call_per_epoch,
+            'on_epoch_begin': epochs,
+            'on_epoch_end': epochs,
+            'on_test_batch_begin': epochs * validation_steps,
+            'on_test_batch_end': epochs * validation_steps,
+            'on_test_begin': epochs,
+            'on_test_end': epochs,
+            'on_train_batch_begin': epochs * num_batch_call_per_epoch,
+            'on_train_batch_end': epochs * num_batch_call_per_epoch,
+            'on_train_begin': 1,
+            'on_train_end': 1
+        })
+
+  @combinations.generate(keras_test_lib.all_strategy_combinations())
+  def test_callbacks_in_eval(self, distribution):
+    with distribution.scope():
+      model = keras_test_lib.get_model()
+      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
+
+    dataset = keras_test_lib.get_dataset(distribution)
+    counter = Counter()
+
+    model.evaluate(dataset, steps=5, callbacks=[counter])
+
+    self.assertDictEqual(
+        counter.method_counts, {
+            'on_test_batch_begin': 5,
+            'on_test_batch_end': 5,
+            'on_test_begin': 1,
+            'on_test_end': 1
+        })
+
+  @combinations.generate(keras_test_lib.all_strategy_combinations())
+  def test_callbacks_in_predict(self, distribution):
+    with distribution.scope():
+      model = keras_test_lib.get_model()
+      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
+
+    dataset = keras_test_lib.get_dataset(distribution)
+    counter = Counter()
+
+    model.predict(
+        keras_test_lib.get_predict_dataset(dataset),
+        steps=5,
+        callbacks=[counter])
+
+    self.assertDictEqual(
+        counter.method_counts, {
+            'on_predict_batch_begin': 5,
+            'on_predict_batch_end': 5,
+            'on_predict_begin': 1,
+            'on_predict_end': 1
+        })
+
+
+class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_validating_dataset_input_tensors_with_shape_mismatch(
+      self, distribution):
+    with self.cached_session():
+      a = constant_op.constant([1, 2], shape=(1, 2))
+      b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2))
+      device_map = values.ReplicaDeviceMap(('/device:CPU:0', '/device:GPU:0'))
+      x = values.DistributedValues(device_map, (a, b))
+      y = values.DistributedValues(device_map, (a, a))
+      # Removed device and input tensor shape details from the error message
+      # since the order of the device and the corresponding input tensor shape
+      # is not deterministic over different runs.
+      with self.assertRaisesRegexp(
+          ValueError, 'Input tensor shapes do not match for '
+          'distributed tensor inputs '
+          'DistributedValues:.+'):
+        with distribution.scope():
+          distributed_training_utils.validate_distributed_dataset_inputs(
+              distribution, x, y)
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_validating_dataset_input_tensors_with_dtype_mismatch(
+      self, distribution):
+    with self.cached_session():
+      a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32)
+      b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64)
+      device_map = values.ReplicaDeviceMap(('/device:CPU:0', '/device:GPU:0'))
+      x = values.DistributedValues(device_map, (a, b))
+      y = values.DistributedValues(device_map, (a, a))
+      # Removed device and input tensor dtype details from the error message
+      # since the order of the device and the corresponding input tensor dtype
+      # is not deterministic over different runs.
+      with self.assertRaisesRegexp(
+          ValueError, 'Input tensor dtypes do not match for '
+          'distributed tensor inputs '
+          'DistributedValues:.+'):
+        with distribution.scope():
+          distributed_training_utils.validate_distributed_dataset_inputs(
+              distribution, x, y)
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_unsupported_features(self, distribution):
+    with self.cached_session():
+      with distribution.scope():
+        model = keras_test_lib.get_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+        loss = 'mse'
+        metrics = ['mae']
+        model.compile(optimizer, loss, metrics=metrics)
+
+      dataset = keras_test_lib.get_dataset(distribution)
+
+      # Test with validation split
+      with self.assertRaisesRegexp(
+          ValueError, '`validation_split` argument is not '
+          'supported when input `x` is a dataset or a '
+          'dataset iterator.+'):
+        model.fit(
+            dataset,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            validation_split=0.5,
+            validation_steps=2)
+
+      # Test with sample weight.
+      sample_weight = np.random.random((10,))
+      with self.assertRaisesRegexp(
+          ValueError, '`sample_weight` argument is not supported when input '
+          '`x` is a dataset or a dataset iterator.'):
+        model.fit(
+            dataset,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            sample_weight=sample_weight)
+
+      # Test with not specifying the `steps` argument for dataset with infinite
+      # cardinality.
+      dataset = dataset.repeat()
+      with self.assertRaisesRegexp(
+          ValueError, 'When passing an infinitely '
+          'repeating dataset, you must specify the '
+          '`steps_per_epoch` argument'):
+        model.fit(dataset, epochs=1, verbose=0)
+      with self.assertRaisesRegexp(
+          ValueError, 'When passing an infinitely '
+          'repeating dataset, you must specify the '
+          '`steps` argument'):
+        model.evaluate(dataset, verbose=0)
+
+      with self.assertRaisesRegexp(
+          ValueError, 'When passing an infinitely '
+          'repeating dataset, you must specify the '
+          '`steps` argument'):
+        model.predict(dataset, verbose=0)
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_calling_with_unsupported_predefined_callbacks(self, distribution):
+    with self.cached_session():
+      with distribution.scope():
+        model = keras_test_lib.get_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+        loss = 'mse'
+        metrics = ['mae']
+        model.compile(optimizer, loss, metrics=metrics)
+
+      dataset = keras_test_lib.get_dataset(distribution)
+
+      def schedule(_):
+        return 0.001
+
+      with self.assertRaisesRegexp(
+          ValueError, 'You must specify a Keras Optimizer V2 when '
+          'using'):
+        model.fit(
+            dataset,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
+
+      with self.assertRaisesRegexp(
+          ValueError, 'You must specify a Keras Optimizer V2 when '
+          'using'):
+        model.fit(
+            dataset,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            callbacks=[keras.callbacks.ReduceLROnPlateau()])
+
+
+class TestDistributionStrategyWithLossMasking(test.TestCase,
+                                              parameterized.TestCase):
+
+  # TODO(priyag): Enable all strategies for this test. Currently it does not
+  # work for TPU due to some invalid datatype.
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_masking(self, distribution):
+    with self.cached_session():
+      np.random.seed(1337)
+      x = np.array([[[1], [1]], [[0], [0]]])
+      with distribution.scope():
+        model = keras.models.Sequential()
+        model.add(keras.layers.Masking(mask_value=0, input_shape=(2, 1)))
+        model.add(
+            keras.layers.TimeDistributed(
+                keras.layers.Dense(1, kernel_initializer='one')))
+        model.compile(
+            loss='mse',
+            optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+      y = np.array([[[1], [1]], [[1], [1]]])
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+      hist = model.fit(x=dataset, epochs=1, steps_per_epoch=2)
+      self.assertEqual(hist.history['loss'][0], 0)
+
+
+class TestDistributionStrategyWithNormalizationLayer(test.TestCase,
+                                                     parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(keras_test_lib.all_strategy_combinations(),
+                         combinations.combine(fused=[True, False])))
+  def test_batchnorm_correctness(self, distribution, fused):
+    with self.cached_session():
+      with distribution.scope():
+        model = keras.models.Sequential()
+        norm = keras.layers.BatchNormalization(
+            input_shape=(10,), momentum=0.8, fused=fused)
+        model.add(norm)
+        model.compile(
+            loss='mse',
+            optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+      # centered on 5.0, variance 10.0
+      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10))
+      x = x.astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, x))
+      dataset = dataset.repeat(100)
+      dataset = keras_test_lib.batch_wrapper(dataset, 32, distribution)
+
+      predict_dataset = dataset_ops.Dataset.from_tensor_slices(x)
+      predict_dataset = predict_dataset.repeat(100)
+      predict_dataset = keras_test_lib.batch_wrapper(predict_dataset, 32,
+                                                     distribution)
+
+      model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10)
+      out = model.predict(predict_dataset, steps=2)
+      out -= keras.backend.eval(norm.beta)
+      out /= keras.backend.eval(norm.gamma)
+      np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
+      np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+
+
+class TestDistributionStrategySaveLoadWeights(test.TestCase,
+                                              parameterized.TestCase):
+
+  @combinations.generate(
+      keras_test_lib.all_strategy_combinations_minus_default())
+  def test_save_load_h5(self, distribution):
+    with self.cached_session():
+      dataset = keras_test_lib.get_dataset(distribution)
+      with distribution.scope():
+        model = keras_test_lib.get_model()
+        model.compile(gradient_descent_keras.SGD(0.01), 'mse')
+        model.fit(dataset, epochs=1, steps_per_epoch=1)
+
+        weights_file = tempfile.mktemp('.h5')
+        model.save_weights(weights_file)
+
+        model_2 = keras_test_lib.get_model()
+        model_2.compile(gradient_descent_keras.SGD(0.01), 'mse')
+        model_2.load_weights(weights_file)
+        model_2.predict(
+            keras_test_lib.get_predict_dataset(distribution), steps=2)
+        model_2.fit(dataset, epochs=1, steps_per_epoch=1)
+
+  @combinations.generate(
+      keras_test_lib.all_strategy_combinations_minus_default())
+  def test_save_load_trackable(self, distribution):
+    # TODO(sourabhbajaj): Test fails with optimizer v2 without h5
+    with self.cached_session():
+      dataset = keras_test_lib.get_dataset(distribution)
+      with distribution.scope():
+        model = keras_test_lib.get_model()
+        model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
+        model.fit(dataset, epochs=1, steps_per_epoch=1)
+
+        weights_file = tempfile.mktemp()
+        model.save_weights(weights_file)
+
+        model_2 = keras_test_lib.get_model()
+        model_2.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
+        model_2.load_weights(weights_file)
+        model_2.predict(
+            keras_test_lib.get_predict_dataset(distribution), steps=2)
+        model_2.fit(dataset, epochs=1, steps_per_epoch=1)
+
+
+class TestDistributionStrategyValidation(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      keras_test_lib.all_strategy_combinations_minus_default())
+  def test_layer_outside_scope(self, distribution):
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          ValueError, 'was not created in the distribution strategy'):
+        x = keras.layers.Input(shape=(3,), name='input')
+        y = keras.layers.Dense(4, name='dense')(x)
+        with distribution.scope():
+          model = keras.Model(x, y)
+          optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+          loss = 'mse'
+          metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+          model.compile(optimizer, loss, metrics=metrics)
+
+  @combinations.generate(
+      keras_test_lib.all_strategy_combinations_minus_default())
+  def test_model_outside_scope(self, distribution):
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          ValueError, 'was not created in the distribution strategy'):
+        x = keras.layers.Input(shape=(3,), name='input')
+        y = keras.layers.Dense(4, name='dense')(x)
+        model = keras.Model(x, y)
+        with distribution.scope():
+          optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+          loss = 'mse'
+          metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+          model.compile(optimizer, loss, metrics=metrics)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index 0b8df787e6b1bde8dce30ea420a3f0e19da23ca4..5ce731816ccefe36c1f876c79589e448f00b86f5 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -116,7 +116,8 @@ class MirroredTwoDeviceDistributionTest(
     self._test_input_fn_iterator(iterator, distribution.extended.worker_devices,
                                  expected_values)
 
-  def testMakeInputFnIteratorWithCallable(self, distribution):
+  # TODO(b/124344198): Re-enable after fixing this flaky test.
+  def DISABLED_testMakeInputFnIteratorWithCallable(self, distribution):
     def fn():
       dataset = dataset_ops.Dataset.range(2).interleave(
           (lambda _: dataset_ops.Dataset.range(10)), cycle_length=2)
@@ -606,6 +607,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             aggregation="invalid")
 
   def testNonMatchingVariableCreation(self, distribution):
+    self.skipTest("b/123075960")
     def model_fn(name):
       v = variable_scope.variable(1.0, name=name)
       ds_context.get_replica_context().merge_call(lambda _: _)
@@ -1454,7 +1456,7 @@ class MultiWorkerMirroredStrategyTest(
       self._test_input_fn_iterator(
           iterator, distribution.extended.worker_devices, expected_values, sess)
 
-  def testMakeInputFnIteratorWithCallable(self, distribution):
+  def DISABLED_testMakeInputFnIteratorWithCallable(self, distribution):
     self._configure_distribution_strategy(distribution)
     def fn():
       dataset = dataset_ops.Dataset.range(100)
diff --git a/tensorflow/contrib/distribute/python/multi_worker_test_base.py b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
index b05aac431f65b4281d9ed9c2fa95c210d55f4008..7dca13a5b41d1a2db474c44c82f1da88be84df05 100644
--- a/tensorflow/contrib/distribute/python/multi_worker_test_base.py
+++ b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
@@ -409,3 +409,25 @@ class IndependentWorkerTestBase(test.TestCase):
 
   def join_independent_workers(self, worker_threads):
     self._coord.join(worker_threads)
+
+
+def get_tf_config_task():
+  return json.loads(os.environ['TF_CONFIG'])['task']
+
+
+def get_tf_config_cluster_spec():
+  return json.loads(os.environ['TF_CONFIG'])['cluster']
+
+
+def get_task_type():
+  return get_tf_config_task()['type']
+
+
+def get_task_index():
+  return get_tf_config_task()['index']
+
+
+def is_chief():
+  return ('chief' not in get_tf_config_cluster_spec()
+          and get_task_type() == 'worker'
+          and get_task_index() == 0)
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
index 906bffc8525688f63474c3f1fbc5d7f0a024431b..0e56f663d6a1ed7945befd933f2f4a83c5f64342 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
@@ -18,36 +18,35 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import one_device_strategy
+from tensorflow.python.eager import context
 from tensorflow.python.eager import test
-from tensorflow.python.framework import test_util
 
 
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.one_device_strategy,
+        combinations.one_device_strategy_gpu],
+    mode=["eager", "graph"]))
 class OneDeviceStrategyTest(
     strategy_test_lib.DistributionTestBase,
     strategy_test_lib.OneDeviceDistributionTestBase):
 
-  def _get_distribution_strategy(self):
-    return one_device_strategy.OneDeviceStrategy("/device:CPU:0")
+  def testMinimizeLoss(self, distribution):
+    if context.executing_eagerly():
+      self._test_minimize_loss_eager(distribution)
+    else:
+      self._test_minimize_loss_graph(distribution)
 
-  def testMinimizeLossEager(self):
-    self._test_minimize_loss_eager(self._get_distribution_strategy())
+  def testReplicaId(self, distribution):
+    self._test_replica_id(distribution)
 
-  def testMinimizeLossGraph(self):
-    self._test_minimize_loss_graph(self._get_distribution_strategy())
+  def testCallAndMergeExceptions(self, distribution):
+    self._test_call_and_merge_exceptions(distribution)
 
-  def testReplicaId(self):
-    self._test_replica_id(self._get_distribution_strategy())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testCallAndMergeExceptions(self):
-    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMakeInputFnIteratorWithDataset(self):
-    d = one_device_strategy.OneDeviceStrategy("/device:CPU:0")
+  def testMakeInputFnIteratorWithDataset(self, distribution):
     dataset_fn = lambda: dataset_ops.Dataset.range(10)
     expected_values = [[i] for i in range(10)]
     input_fn = self._input_fn_to_test_input_context(
@@ -55,13 +54,11 @@ class OneDeviceStrategyTest(
         expected_num_replicas_in_sync=1,
         expected_num_input_pipelines=1,
         expected_input_pipeline_id=0)
-    iterator = d.make_input_fn_iterator(input_fn)
+    iterator = distribution.make_input_fn_iterator(input_fn)
     self._test_input_fn_iterator(
-        iterator, d.extended.worker_devices, expected_values)
+        iterator, distribution.extended.worker_devices, expected_values)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testMakeInputFnIteratorWithCallable(self):
-    d = one_device_strategy.OneDeviceStrategy("/device:CPU:0")
+  def testMakeInputFnIteratorWithCallable(self, distribution):
     def fn():
       dataset = dataset_ops.Dataset.range(10)
       it = dataset.make_one_shot_iterator()
@@ -72,32 +69,31 @@ class OneDeviceStrategyTest(
         expected_num_replicas_in_sync=1,
         expected_num_input_pipelines=1,
         expected_input_pipeline_id=0)
-    iterator = d.make_input_fn_iterator(input_fn)
+    iterator = distribution.make_input_fn_iterator(input_fn)
     self._test_input_fn_iterator(
-        iterator, d.extended.worker_devices, expected_values,
+        iterator, distribution.extended.worker_devices, expected_values,
         test_reinitialize=False)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testNumpyIterator(self):
-    self._test_numpy_iterator(self._get_distribution_strategy())
+  def testNumpyIterator(self, distribution):
+    self._test_numpy_iterator(distribution)
 
-  def testAllReduceSum(self):
-    self._test_all_reduce_sum(self._get_distribution_strategy())
+  def testAllReduceSum(self, distribution):
+    self._test_all_reduce_sum(distribution)
 
-  def testAllReduceSumGradients(self):
-    self._test_all_reduce_sum_gradients(self._get_distribution_strategy())
+  def testAllReduceSumGradients(self, distribution):
+    self._test_all_reduce_sum_gradients(distribution)
 
-  def testAllReduceSumGradientTape(self):
-    self._test_all_reduce_sum_gradient_tape(self._get_distribution_strategy())
+  def testAllReduceSumGradientTape(self, distribution):
+    self._test_all_reduce_sum_gradient_tape(distribution)
 
-  def testAllReduceMean(self):
-    self._test_all_reduce_mean(self._get_distribution_strategy())
+  def testAllReduceMean(self, distribution):
+    self._test_all_reduce_mean(distribution)
 
-  def testAllReduceMeanGradients(self):
-    self._test_all_reduce_mean_gradients(self._get_distribution_strategy())
+  def testAllReduceMeanGradients(self, distribution):
+    self._test_all_reduce_mean_gradients(distribution)
 
-  def testAllReduceMeanGradientTape(self):
-    self._test_all_reduce_mean_gradient_tape(self._get_distribution_strategy())
+  def testAllReduceMeanGradientTape(self, distribution):
+    self._test_all_reduce_mean_gradient_tape(distribution)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
index fede253d13804087476fef8b7211a6bfe5789906..3de2041ae35775de6df5bca02c0f1d04a9c2f24e 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -696,6 +696,7 @@ class ParameterServerStrategyTest(
   def testMinimizeLossGraphLocal(self, num_gpus, use_core_strategy):
     self._test_minimize_loss_graph(None, None, num_gpus, use_core_strategy)
 
+  # TODO(b/124344198): Re-enable after fixing this flaky test.
   # TODO(priyag): Refactor this and other multi worker tests.
   @combinations.generate(
       combinations.combine(
@@ -704,8 +705,8 @@ class ParameterServerStrategyTest(
           required_gpus=1,
           use_core_strategy=[True, False],
           use_dataset=[True, False]))
-  def testMakeInputFnIteratorDistributed(self, num_gpus, use_core_strategy,
-                                         use_dataset):
+  def DISABLED_testMakeInputFnIteratorDistributed(
+      self, num_gpus, use_core_strategy, use_dataset):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     if use_dataset:
@@ -732,6 +733,7 @@ class ParameterServerStrategyTest(
         test_reinitialize=use_dataset,
         use_core_strategy=use_core_strategy)
 
+  # TODO(b/124344198): Re-enable after fixing this flaky test.
   @combinations.generate(
       combinations.combine(
           mode=['graph'],
@@ -739,8 +741,8 @@ class ParameterServerStrategyTest(
           required_gpus=1,
           use_core_strategy=[True, False],
           use_dataset=[True, False]))
-  def testMakeInputFnIteratorLocal(self, num_gpus, use_core_strategy,
-                                   use_dataset):
+  def DISABLED_testMakeInputFnIteratorLocal(self, num_gpus, use_core_strategy,
+                                            use_dataset):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     if use_dataset:
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index 69ce1141d8bea835cb959f503647900fba5f6e25..2d9d221f427422f8bbeba55c5644658af9a9a620 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -26,6 +26,7 @@ import copy
 
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import device_assignment as device_assignment_lib
+from tensorflow.contrib.tpu.python.tpu import functional as tpu_functional_ops
 from tensorflow.contrib.tpu.python.tpu import topology
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
@@ -41,6 +42,7 @@ from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
 from tensorflow.python.distribute.cluster_resolver import TPUClusterResolver
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import device as tf_device
@@ -52,6 +54,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 
 
@@ -69,11 +72,36 @@ def initialize_tpu_system(cluster_resolver=None):
   master = cluster_resolver.master()
 
   logging.info("Initializing the TPU system.")
-  session_config = config_pb2.ConfigProto(allow_soft_placement=True)
 
-  with ops.Graph().as_default():
-    with session_lib.Session(config=session_config, target=master) as sess:
-      serialized_topology = sess.run(tpu.initialize_system())
+  if context.executing_eagerly():
+    # This function looks as it is for the following non-intuitive reasons.
+    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
+    # DistributedTPURewritePass. This pass actually adds real ops that
+    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
+    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
+    # The easiest way to trigger a rewrite is to run the function with
+    # TPUPartitionedCallOp.
+    @function.defun
+    def _tpu_init_fn():
+      return tpu.initialize_system()
+
+    # We can't call _tpu_init_fn normally (because it contains just a dummy op,
+    # see above) but need to define it to get it added to eager context
+    # and get its assigned name.
+    # pylint: disable=protected-access
+    graph_func = _tpu_init_fn._get_concrete_function_internal()
+    func_name = compat.as_str(graph_func._inference_function.name)
+    # pylint: enable=protected-access
+
+    output = tpu_functional_ops.TPUPartitionedCall(
+        args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name)
+    serialized_topology = output[0].numpy()
+  else:
+    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    with ops.Graph().as_default():
+      with session_lib.Session(config=session_config, target=master) as sess:
+        serialized_topology = sess.run(tpu.initialize_system())
+
   logging.info("Finished initializing TPU system.")
   return topology.Topology(serialized=serialized_topology)
 
@@ -133,7 +161,7 @@ def _create_tpu_mirrored_variable(  # pylint: disable=missing-docstring
         strategy, device_map, value_list, aggregation,
         logical_device=logical_device)
 
-  if not context.executing_eagerly():
+  if not (context.executing_eagerly() or ops.inside_function()):
     g = ops.get_default_graph()
     # If "trainable" is True, next_creator() will add the member variables
     # to the TRAINABLE_VARIABLES collection, so we manually remove
@@ -155,8 +183,7 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
   def __init__(self,
                tpu_cluster_resolver=None,
                steps_per_run=None,
-               device_assignment=None,
-               **kwargs):
+               device_assignment=None):
     """Initializes the TPUStrategy object.
 
     Args:
@@ -170,18 +197,9 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
       device_assignment: Optional `tf.contrib.tpu.DeviceAssignment` to specify
           the placement of replicas on the TPU cluster. Currently only supports
           the usecase of using a single core within a TPU cluster.
-      **kwargs: Additional experimental flags. Will be removed in future.
     """
-    if len(kwargs) > 1:
-      raise ValueError("TPUStrategy constructor only takes one experimental "
-                       "flag now")
-    elif len(kwargs) == 1 and "_disable_training_loop_on_host" not in kwargs:
-      raise ValueError("TPUStrategy constructor does not support arguments: "
-                       "{}".format(kwargs))
-
     super(TPUStrategy, self).__init__(TPUExtended(
-        self, tpu_cluster_resolver, steps_per_run, device_assignment,
-        kwargs.get("_disable_training_loop_on_host", False)))
+        self, tpu_cluster_resolver, steps_per_run, device_assignment))
 
   @property
   def steps_per_run(self):
@@ -193,13 +211,9 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
   # This implementation runs a single step. It does not use infeed or outfeed.
   def experimental_run(self, fn, input_iterator=None):
     """See base class."""
-    if context.executing_eagerly():
-      raise NotImplementedError("Eager mode not supported in TPUStrategy.")
-
-    if self.extended._disable_training_loop_on_host:  # pylint: disable=protected-access
+    if context.executing_eagerly() and not ops.inside_function():
       raise NotImplementedError(
-          "`experimental_run` is not compatible with "
-          "`_disable_training_loop_on_host=True`")
+          "Eager mode not supported in TPUStrategy outside TF functions.")
 
     if input_iterator is None:
       inputs = []
@@ -207,13 +221,13 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
       inputs = input_iterator.get_next()
 
     result = [None]
-    def replicated_fn(replica_id, inputs):
+    def replicated_fn(replica_id, replica_input):
       """Wraps user function to provide replica ID and `Tensor` inputs."""
       with _TPUReplicaContext(self, replica_id_in_sync_group=replica_id):
         if input_iterator is None:
           result[0] = fn()
         else:
-          result[0] = fn(inputs)
+          result[0] = fn(replica_input)
       return result[0]
 
     replicate_inputs = []  # By replica.
@@ -241,8 +255,7 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
                container_strategy,
                tpu_cluster_resolver=None,
                steps_per_run=None,
-               device_assignment=None,
-               disable_training_loop_on_host=False):
+               device_assignment=None):
     super(TPUExtended, self).__init__(container_strategy)
 
     if tpu_cluster_resolver is None:
@@ -256,7 +269,6 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
     self._tpu_cluster_resolver = tpu_cluster_resolver
     self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
     self._device_assignment = device_assignment
-    self._disable_training_loop_on_host = disable_training_loop_on_host
 
     # Device assignment is currently only supported for 1 core case.
     if self._device_assignment:
@@ -284,25 +296,14 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
     self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
     self._device_map = values.ReplicaDeviceMap(self._tpu_devices)
 
-    # If the training loop is on the device, we must use the infeed, with input
-    # on the host. Otherwise, we preload the data onto the TPUs.
-    if disable_training_loop_on_host:
-      input_device_map = values.ReplicaDeviceMap(tuple(
-          self.get_host_cpu_device(hid) for hid in range(self.num_hosts)))
-      worker_devices = [
-          (self.get_host(hid), [self.get_host_cpu_device(hid)])
-          for hid in range(self.num_hosts)
-      ]
-      self._input_workers = input_lib.InputWorkers(
-          input_device_map, worker_devices)
-    else:
-      input_worker_devices = collections.OrderedDict()
-      for tpu_device in self._tpu_devices:
-        host_device = _get_host_for_device(tpu_device)
-        input_worker_devices.setdefault(host_device, [])
-        input_worker_devices[host_device].append(tpu_device)
-      self._input_workers = input_lib.InputWorkers(
-          self._device_map, tuple(input_worker_devices.items()))
+    # Preload the data onto the TPUs.
+    input_worker_devices = collections.OrderedDict()
+    for tpu_device in self._tpu_devices:
+      host_device = _get_host_for_device(tpu_device)
+      input_worker_devices.setdefault(host_device, [])
+      input_worker_devices[host_device].append(tpu_device)
+    self._input_workers = input_lib.InputWorkers(
+        self._device_map, tuple(input_worker_devices.items()))
 
     # TODO(sourabhbajaj): Remove this once performance of running one step
     # at a time is comparable to multiple steps.
@@ -402,17 +403,6 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
   # a mechanism to infer the outputs of `fn`. Pending b/110550782.
   def _experimental_run_steps_on_iterator(
       self, fn, multi_worker_iterator, iterations, initial_loop_values=None):
-    if self._disable_training_loop_on_host:
-      impl = self._run_steps_on_iterator_with_device_loop
-    else:
-      impl = self._run_steps_on_iterator_with_host_loop
-
-    return impl(
-        fn=fn, multi_worker_iterator=multi_worker_iterator,
-        iterations=iterations, initial_loop_values=initial_loop_values)
-
-  def _run_steps_on_iterator_with_host_loop(
-      self, fn, multi_worker_iterator, iterations, initial_loop_values=None):
     output_shapes = multi_worker_iterator.output_shapes
     shapes = nest.flatten(output_shapes)
     if any(not s.is_fully_defined() for s in shapes):
@@ -507,79 +497,6 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
     _set_last_step_outputs(ctx, last_step_tensor_outputs)
     return ctx
 
-  def _run_steps_on_iterator_with_device_loop(
-      self, fn, multi_worker_iterator, iterations, initial_loop_values=None):
-    output_shapes = multi_worker_iterator.output_shapes
-    shapes = nest.flatten(output_shapes)
-    if any(not s.is_fully_defined() for s in shapes):
-      raise ValueError(
-          "TPU currently requires fully defined shapes. Either use "
-          "set_shape() on the input tensors or use "
-          "dataset.batch(..., drop_remainder=True).")
-    types = nest.flatten(multi_worker_iterator.output_types)
-
-    enqueue_ops = [
-        self._get_enqueue_op_per_host(host_id, multi_worker_iterator, shapes,
-                                      iterations)
-        for host_id in range(self.num_hosts)]
-
-    def dequeue_fn():
-      dequeued = tpu_ops.infeed_dequeue_tuple(dtypes=types, shapes=shapes)
-      return nest.pack_sequence_as(output_shapes, dequeued)
-
-    # Wrap `fn` for repeat.
-    if initial_loop_values is None:
-      initial_loop_values = {}
-    initial_loop_values = nest.flatten(initial_loop_values)
-    ctx = input_lib.MultiStepContext()
-
-    def run_fn(*args, **kwargs):
-      """Single step on the TPU device."""
-      del args, kwargs
-      fn_result = fn(ctx, dequeue_fn())
-      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
-      if flat_last_step_outputs:
-        with ops.control_dependencies([fn_result]):
-          return [array_ops.identity(f) for f in flat_last_step_outputs]
-      else:
-        return fn_result
-
-    def iterate_on_tpu():
-      return training_loop.repeat(iterations, run_fn, initial_loop_values)
-
-    # We capture the control_flow_context at this point, before we run `fn`
-    # inside a while_loop and TPU replicate context. This is useful in cases
-    # where we might need to exit these contexts and get back to the outer
-    # context to do some things, for e.g. create an op which should be
-    # evaluated only once at the end of the loop on the host. One such usage
-    # is in creating metrics' value op.
-    self._outer_control_flow_context = (
-        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
-
-    replicate_inputs = [[]] * self._num_replicas_in_sync
-    replicate_outputs = tpu.replicate(iterate_on_tpu, replicate_inputs)
-
-    del self._outer_control_flow_context
-    ctx.run_op = control_flow_ops.group(replicate_outputs, enqueue_ops)
-
-    # Filter out any ops from the outputs, typically this would be the case
-    # when there were no tensor outputs.
-    last_step_tensor_outputs = [x for x in replicate_outputs
-                                if not isinstance(x, ops.Operation)]
-
-    # Outputs are currently of the structure (grouped by device)
-    # [[output0_device0, output1_device0, output2_device0],
-    #  [output0_device1, output1_device1, output2_device1]]
-    # Convert this to the following structure instead: (grouped by output)
-    # [[output0_device0, output0_device1],
-    #  [output1_device0, output1_device1],
-    #  [output2_device0, output2_device1]]
-    last_step_tensor_outputs = [list(x) for x in
-                                zip(*last_step_tensor_outputs)]
-
-    _set_last_step_outputs(ctx, last_step_tensor_outputs)
-    return ctx
-
   def _call_for_each_replica(self, fn, args, kwargs):
     # TODO(jhseu): Consider making it so call_for_each_replica implies that
     # we're in a tpu.rewrite(), and update TPUMirroredVariable accordingly.
@@ -619,9 +536,10 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
             # name as the absolute name of the variable.
             kwargs["name"] = "%s/replica_%d/" % (var0name, i)
             # Initialize replicas with the same value:
-            if context.executing_eagerly():
-              kwargs["initial_value"] = array_ops.identity(
-                  value_list[0].value())
+            if context.executing_eagerly() or ops.inside_function():
+              with ops.init_scope():
+                kwargs["initial_value"] = array_ops.identity(
+                    value_list[0].value())
             else:
               def initial_value_fn(device=d):
                 with ops.device(device):
@@ -655,19 +573,24 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
       return cross_device_ops_lib.reduce_non_distributed_value(
           reduce_op, self._device_map, value, destinations)
 
-    # Validate that the destination is same as the host device
-    # Note we don't do this when in replicate context as the reduction is
-    # performed on the TPU device itself.
     devices = cross_device_ops_lib.get_devices_from(destinations)
-    if len(devices) == 1:
-      assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
-          self._host_device)
-    else:
+    if len(devices) != 1:
       raise ValueError("Multiple devices are not supported for TPUStrategy")
 
-    output = math_ops.add_n(value)
-    if reduce_op == reduce_util.ReduceOp.MEAN:
-      return output * (1. / len(value))
+    # Always performs the reduction on the TPU host.
+    with ops.device(self._host_device):
+      output = math_ops.add_n(value.values)
+      if reduce_op == reduce_util.ReduceOp.MEAN:
+        output *= (1. / len(value.values))
+
+    # If necessary, copy to requested destination.
+    dest_canonical = device_util.canonicalize(devices[0])
+    host_canonical = device_util.canonicalize(self._host_device)
+
+    if dest_canonical != host_canonical:
+      with ops.device(devices[0]):
+        output = array_ops.identity(output)
+
     return output
 
   def _update(self, var, fn, args, kwargs, group):
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index 51c58b0b2f3dc2ab63e22718825a471b8657f892..9fd251175b8b8e3453e33434b4d86386a078295e 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -656,7 +656,8 @@ class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
   def _save_replica_local_sum(self, distribution):
     """Save variables with mirroring, returns save_path."""
     with self.session(graph=ops.Graph()) as sess:
-      v, replica_local = _make_replica_local("sum", distribution)
+      v, replica_local = _make_replica_local(
+          variable_scope.VariableAggregation.SUM, distribution)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [1.5, 2.])
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index 8966a9befcd3db4a3f397b319e80f37f84ad236b..d441e4735b64fe1176e77a978d281d46a7b287ab 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -144,7 +144,7 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index 78ab155896cfeda4dd259a8529f4b1f77a12cf0b..48925b1bfacc6b59c210b2fb4b53a9a1a851673f 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -37,7 +37,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 class IteratorTest(test.TestCase):
@@ -238,7 +238,7 @@ class IteratorTest(test.TestCase):
     dataset = Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
     dataset = dataset.map(math_ops.square).batch(2)
     iterator = datasets.Iterator(dataset)
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     self.assertAllEqual([1, 4], iterator.get_next().numpy())
     save_path = checkpoint.save(checkpoint_prefix)
     self.assertAllEqual([9, 16], iterator.get_next().numpy())
@@ -257,7 +257,7 @@ class IteratorTest(test.TestCase):
     dataset_2 = Dataset.range(10)
     iterator_3 = datasets.Iterator(dataset_2)
 
-    checkpoint = checkpointable_utils.Checkpoint(
+    checkpoint = trackable_utils.Checkpoint(
         iterator_1=iterator_1, iterator_2=iterator_2, iterator_3=iterator_3)
     self.assertAllEqual([1, 4], iterator_1.get_next().numpy())
     self.assertEqual(0, iterator_3.get_next().numpy())
@@ -279,7 +279,7 @@ class IteratorTest(test.TestCase):
     dataset = Dataset.range(3)
     iterator = datasets.Iterator(dataset)
 
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     self.assertEqual(0, iterator.get_next().numpy())
     self.assertEqual(1, iterator.get_next().numpy())
     save_path = checkpoint.save(checkpoint_prefix)
@@ -293,7 +293,7 @@ class IteratorTest(test.TestCase):
     dataset = Dataset.range(10)
     for i in range(5):
       iterator = datasets.Iterator(dataset)
-      checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+      checkpoint = trackable_utils.Checkpoint(iterator=iterator)
       checkpoint.restore(checkpoint_management.latest_checkpoint(
           checkpoint_directory))
       for j in range(2):
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
index d18a097063c7d25947af3e2e2959ce574edd553f..3143270ccfe4f670428c80bdc1e09fa452584207 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -37,7 +37,7 @@ from tensorflow.contrib.summary import summary_test_util
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 # pylint: enable=g-bad-import-order
 
 
@@ -421,7 +421,7 @@ class SpinnTest(test_util.TensorFlowTestCase):
 
     # 5. Verify that checkpoints exist and contains all the expected variables.
     self.assertTrue(glob.glob(os.path.join(config.logdir, "ckpt*")))
-    object_graph = checkpointable_utils.object_metadata(
+    object_graph = trackable_utils.object_metadata(
         checkpoint_management.latest_checkpoint(config.logdir))
     ckpt_variable_names = set()
     for node in object_graph.nodes:
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index c8d9266672a8b87d32338ea7c4f74fb40d41c767..b32501c2e804838af9d4c77663be131b77bd30b4 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -32,12 +32,12 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 
 _to_replace = re.compile("[^A-Za-z0-9.]")
 
 
-class Metric(checkpointable.Checkpointable):
+class Metric(trackable.Trackable):
   """A metric holds state for aggregating statistics over an evaluation run.
 
   Example use with eager execution:
@@ -269,7 +269,7 @@ class Metric(checkpointable.Checkpointable):
       else:
         collections = [ops.GraphKeys.LOCAL_VARIABLES]
       collections += [ops.GraphKeys.METRIC_VARIABLES]
-    # Variables are Checkpointable dependencies of Metrics regardless of the
+    # Variables are Trackable dependencies of Metrics regardless of the
     # global/local distinction. Users can avoid saving variables by not adding a
     # dependency on the Metric.
     v = self._add_variable_with_custom_getter(
@@ -282,7 +282,7 @@ class Metric(checkpointable.Checkpointable):
         use_resource=True,
         getter=variable_scope.get_variable,
         # Raise duplicate variable exceptions from get_variable rather than
-        # Checkpointable.
+        # Trackable.
         overwrite=True)
     self._vars.append(v)
     if context.executing_eagerly():
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 39e5957f5d1760613f2c33607c0bdb163040efb4..c56d1956fde35b562e60496015e666efe9ebc8f6 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -35,7 +35,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 class MetricsTest(test.TestCase):
@@ -314,7 +314,7 @@ class MetricsTest(test.TestCase):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     mean = metrics.Mean()
-    checkpoint = checkpointable_utils.Checkpoint(mean=mean)
+    checkpoint = trackable_utils.Checkpoint(mean=mean)
     mean.build()
     mean._built = True
     self.evaluate(mean.init_variables())
@@ -327,7 +327,7 @@ class MetricsTest(test.TestCase):
     self.assertAllEqual(200., self.evaluate(mean.value()))
 
     restore_mean = metrics.Mean()
-    restore_checkpoint = checkpointable_utils.Checkpoint(mean=restore_mean)
+    restore_checkpoint = trackable_utils.Checkpoint(mean=restore_mean)
     status = restore_checkpoint.restore(save_path)
     restore_update = restore_mean(300.)
     status.assert_consumed().run_restore_ops()
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py
index 240f213c602395b8589d39c3ecd90b602ffa9848..b3e8daddaf2369e9e33179fde2aab1469e97ea47 100644
--- a/tensorflow/contrib/eager/python/network_test.py
+++ b/tensorflow/contrib/eager/python/network_test.py
@@ -31,7 +31,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 # pylint: disable=not-callable
@@ -65,7 +65,7 @@ class NetworkTest(test.TestCase):
 
   def test_checkpointing_not_implemented(self):
     checkpoint_directory = self.get_temp_dir()
-    checkpoint = checkpointable_utils.Checkpoint(net=MyNetwork())
+    checkpoint = trackable_utils.Checkpoint(net=MyNetwork())
     with self.assertRaises(NotImplementedError):
       checkpoint.save(checkpoint_directory)
 
diff --git a/tensorflow/contrib/eager/python/parameter_server.py b/tensorflow/contrib/eager/python/parameter_server.py
index 7803a6799bb64441fab881bf6ca986d5cf3851a8..258f0a19309235dcd99b31b4de3d35ef8d89b15b 100644
--- a/tensorflow/contrib/eager/python/parameter_server.py
+++ b/tensorflow/contrib/eager/python/parameter_server.py
@@ -30,7 +30,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 
 
 def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
@@ -129,8 +129,8 @@ class SharedVariable(resource_variable_ops.ResourceVariable):
     if constraint is not None and not callable(constraint):
       raise ValueError("The `constraint` argument must be a callable.")
 
-    if isinstance(initial_value, checkpointable.CheckpointInitialValue):
-      self._maybe_initialize_checkpointable()
+    if isinstance(initial_value, trackable.CheckpointInitialValue):
+      self._maybe_initialize_trackable()
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index b82e1bb71bce9a28d7bbbf961cc6d5e25dd18acf..df5b059448f735f7dc1f2963ffbc9c8a8287250a 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -62,7 +62,6 @@ To use, at program startup, call `tf.enable_eager_execution()`.
 
 @@Checkpoint
 @@Checkpointable
-@@CheckpointableSaver
 
 @@executing_eagerly
 @@in_eager_mode
@@ -138,9 +137,8 @@ from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Vari
 from tensorflow.python.ops.variable_scope import EagerVariableStore
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import template
-from tensorflow.python.training.checkpointable.tracking import AutoCheckpointable as Checkpointable
-from tensorflow.python.training.checkpointable.util import CheckpointableSaver
-from tensorflow.python.training.checkpointable.util import Checkpoint
+from tensorflow.python.training.tracking.tracking import AutoTrackable as Checkpointable
+from tensorflow.python.training.tracking.util import Checkpoint
 from tensorflow.python.util.all_util import remove_undocumented
 
 py_func = script_ops.eager_py_func
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index 48a6ef4dca0ca7682f7b99b66177679f29ad9ec9..da2479a0b7b029561136903c82cabed9aae622b8 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -203,10 +203,7 @@ py_test(
     srcs = ["python/ops/kmeans_test.py"],
     shard_count = 4,
     srcs_version = "PY2AND3",
-    tags = [
-        "nomac",  # b/73741358
-        "notsan",  # b/67512932
-    ],
+    tags = ["notsan"],
     deps = [
         ":factorization_py",
         ":factorization_py_CYCLIC_DEPENDENCIES_THAT_NEED_TO_GO",
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
index 8fc5f1cfe7800653ef1e43c6d40d1a66e34f2106..0a9199d61f36f10c98b95d79ece7e86765d2db0e 100644
--- a/tensorflow/contrib/feature_column/BUILD
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -14,7 +14,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":sequence_feature_column",
-        ":sequence_feature_column_v2",
         "//tensorflow/python:util",
     ],
 )
@@ -72,60 +71,3 @@ tf_py_test(
     ],
     tags = ["no_pip"],
 )
-
-py_library(
-    name = "sequence_feature_column_v2",
-    srcs = ["python/feature_column/sequence_feature_column_v2.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:check_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python/feature_column:feature_column_py",
-    ],
-)
-
-tf_py_test(
-    name = "sequence_feature_column_v2_test",
-    srcs = ["python/feature_column/sequence_feature_column_v2_test.py"],
-    additional_deps = [
-        ":sequence_feature_column_v2",
-        "@absl_py//absl/testing:parameterized",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:training",
-        "//tensorflow/python/feature_column:feature_column_py",
-        "//tensorflow/python/feature_column:feature_column_v2_test",
-    ],
-    tags = ["no_pip"],
-)
-
-py_test(
-    name = "sequence_feature_column_v2_integration_test",
-    srcs = ["python/feature_column/sequence_feature_column_v2_integration_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
-        ":sequence_feature_column_v2",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python/feature_column:feature_column_py",
-        "//tensorflow/python/keras:layers",
-    ],
-)
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index 3f6dbe0cbdeeae5e2107755f80bcfe5f7fc310e4..8fd2b5f39bc88b76fe5583f8d18389e232ea9f40 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -32,7 +32,6 @@ tf_custom_op_py_library(
         "python/ops/arg_scope.py",
         "python/ops/audio_ops.py",
         "python/ops/checkpoint_ops.py",
-        "python/ops/critical_section_ops.py",
         "python/ops/ops.py",
         "python/ops/prettyprint_ops.py",
         "python/ops/script_ops.py",
@@ -172,26 +171,6 @@ py_test(
     ],
 )
 
-cuda_py_test(
-    name = "critical_section_test",
-    size = "medium",
-    srcs = ["python/ops/critical_section_test.py"],
-    additional_deps = [
-        "//tensorflow/python:client_testlib",
-        ":framework_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:tensor_array_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/eager:context",
-    ],
-)
-
 py_test(
     name = "ops_test",
     size = "small",
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 3784631dcbfbeb215b6c695e4b6f1bbd02fa708c..063717f08aa88f4de9470d8392db2b7c95b3e4bf 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -94,8 +94,6 @@
 @@smart_constant_value
 @@smart_case
 
-@@CriticalSection
-
 @@BoundedTensorSpec
 @@TensorSpec
 
@@ -129,6 +127,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 _allowed_symbols = ['nest']
 _nest_allowed_symbols = [
     'assert_same_structure',
+    'is_nested',
     'is_sequence',
     'is_sequence_or_composite',
     'flatten',
@@ -139,6 +138,7 @@ _nest_allowed_symbols = [
     'map_structure_with_tuple_paths',
     'assert_shallow_structure',
     'flatten_up_to',
+    'flatten_with_tuple_paths_up_to',
     'map_structure_up_to',
     'map_structure_with_tuple_paths_up_to',
     'get_traverse_shallow_structure',
diff --git a/tensorflow/contrib/framework/python/ops/__init__.py b/tensorflow/contrib/framework/python/ops/__init__.py
index c4976497f5fa95d82e492153b117681f693eaa13..8113bf7c095bd0817e40cfd08bdf1ef7275ba55b 100644
--- a/tensorflow/contrib/framework/python/ops/__init__.py
+++ b/tensorflow/contrib/framework/python/ops/__init__.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.contrib.framework.python.ops.arg_scope import *
 from tensorflow.contrib.framework.python.ops.checkpoint_ops import *
-from tensorflow.contrib.framework.python.ops.critical_section_ops import *
 from tensorflow.contrib.framework.python.ops.ops import *
 from tensorflow.contrib.framework.python.ops.prettyprint_ops import *
 from tensorflow.contrib.framework.python.ops.script_ops import *
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index b6b75ffa248d66cc4cb49339f193d486f05a6a4a..f13a66717f67a1a627f66af9468c6f2897aaf7a4 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -565,6 +565,20 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
         fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
             stream->parent()),
         &algorithms));
+    if (activation_mode == ActivationMode::NONE) {
+      // Only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is supported for
+      // identity activation, other algs seem to quietly do Relu.
+      // See
+      // https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBiasActivationForward
+      algorithms.erase(
+          std::remove_if(
+              algorithms.begin(), algorithms.end(),
+              [](dnn::AlgorithmDesc alg) {
+                return alg.algo_id() !=
+                       CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+              }),
+          algorithms.end());
+    }
     dnn::ProfileResult best_result;
     dnn::ProfileResult best_result_no_scratch;
     for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index db0868fb2c43464a811b3d6dfcd96480ba2463ee..386e4cf69b7aa118a85fb25bcb809a879c5c1bd8 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -377,7 +377,10 @@ py_test(
     name = "classifier_metrics_test",
     srcs = ["python/eval/python/classifier_metrics_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    tags = [
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":classifier_metrics",
         "//tensorflow/core:protos_all_py",
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
index 5b9c54e43a16adf457d5ed0e7e73dcd168ab0d67..66af79d1e81bbc450141673dd54d865e5c7932d5 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
@@ -23,7 +23,6 @@ import tempfile
 
 from absl.testing import parameterized
 import numpy as np
-import six
 
 from tensorflow.contrib import layers
 from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples
@@ -238,10 +237,10 @@ class GANEstimatorIntegrationTest(test.TestCase):
     # Evaluate.
     scores = est.evaluate(eval_input_fn)
     self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
+    self.assertIn('loss', scores)
     self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'],
                      scores['loss'])
-    self.assertIn('mse_custom_metric', six.iterkeys(scores))
+    self.assertIn('mse_custom_metric', scores)
 
     # Predict.
     predictions = np.array([x for x in est.predict(predict_input_fn)])
diff --git a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
index c00ff4399748a77f88d9753df7592bf3859d754e..0fcd1b7924eb02f5d617b45af16852baf2e2bb48 100644
--- a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
@@ -23,7 +23,6 @@ import tempfile
 
 from absl.testing import parameterized
 import numpy as np
-import six
 
 from tensorflow.contrib import layers
 from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples
@@ -235,10 +234,10 @@ class StarGANEstimatorIntegrationTest(test.TestCase):
     # EVALUTE
     scores = est.evaluate(eval_input_fn)
     self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
+    self.assertIn('loss', scores)
     self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'],
                      scores['loss'])
-    self.assertIn('mse_custom_metric', six.iterkeys(scores))
+    self.assertIn('mse_custom_metric', scores)
 
     # PREDICT
     predictions = np.array([x for x in est.predict(predict_input_fn)])
diff --git a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py
index 0d9e6489bdd1d89cc49bfedc2eed784999c31d2b..baf2c28df4b63cff525dcf3ff880730768ad000a 100644
--- a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py
@@ -23,7 +23,6 @@ import tempfile
 
 from absl.testing import parameterized
 import numpy as np
-import six
 
 from tensorflow.contrib import layers
 from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples
@@ -184,11 +183,11 @@ class TPUGANEstimatorIntegrationTest(test.TestCase, parameterized.TestCase):
     # Evaluate.
     num_steps_eval = 2
     scores = est.evaluate(eval_input_fn, steps=num_steps_eval)
-    self.assertIn(ops.GraphKeys.GLOBAL_STEP, six.iterkeys(scores))
-    self.assertIn('loss', six.iterkeys(scores))
+    self.assertIn(ops.GraphKeys.GLOBAL_STEP, scores)
+    self.assertIn('loss', scores)
     self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'],
                      scores['loss'])
-    self.assertIn('mse_custom_metric', six.iterkeys(scores))
+    self.assertIn('mse_custom_metric', scores)
 
     # Predict.
     predictions = np.array([x['generated_data'] for x in
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index bd17571a0535a3c8e9dfee24a8da16eb2e72f165..bc7c1057b478fe2656898e68c1a14013b5a71d12 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -365,7 +365,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     unused_image = array_ops.zeros([2, 299, 299, 3])
     incscore = _run_with_mock(classifier_metrics.inception_score, unused_image)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       incscore_np = sess.run(incscore, {'concat:0': logits})
 
     self.assertAllClose(_expected_inception_score(logits), incscore_np)
@@ -473,7 +473,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
         classifier_fn=lambda x: x,
         max_block_size=600)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual_kid, actual_std = sess.run(kid_op)
 
     expected_kid, expected_std = _expected_kid_and_std(test_pool_real_a,
@@ -500,7 +500,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
         max_block_size=max_block_size)
 
     for block_size in [50, 512, 1000]:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         actual_kid, actual_std = sess.run(kid_op, {max_block_size: block_size})
 
       expected_kid, expected_std = _expected_kid_and_std(
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
index e3c780ac1a0f0ef15ff993bd3a9bf9730dcb45b8..44ee0f52696dc1cdcd91286a80b2d4b42be93a4d 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
@@ -403,7 +403,9 @@ class _PenaltyTest(object):
   def test_all_correct(self):
     loss = self._penalty_fn(**self._kwargs)
     self.assertEqual(self._expected_dtype, loss.dtype)
-    self.assertEqual(self._expected_op_name, loss.op.name)
+    # NOTE: Op names will change, it is inappropriate to include them in tests.
+    # See go/tf-breaking-change.
+    # self.assertEqual(self._expected_op_name, loss.op.name)
     with self.cached_session():
       variables.global_variables_initializer().run()
       self.assertAlmostEqual(self._expected_loss, loss.eval(), 6)
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index f36a5d346e0f27fbbc480e876380db51ed559c09..9bff8090d93d3ad7def69726073accfb234ef301 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -757,7 +757,9 @@ def cyclegan_loss(
 
   return namedtuples.CycleGANLoss(loss_x2y, loss_y2x)
 
-
+# Begin google-internal
+# The four major parts can be found here: http://screen/tMRMBAohDYG.
+# End google-internal
 def stargan_loss(
     model,
     generator_loss_fn=tfgan_losses.stargan_generator_loss_wrapper(
@@ -776,8 +778,6 @@ def stargan_loss(
     add_summaries=True):
   """StarGAN Loss.
 
-  The four major part can be found here: http://screen/tMRMBAohDYG.
-
   Args:
     model: (StarGAN) Model output of the stargan_model() function call.
     generator_loss_fn: The loss function on the generator. Takes a
diff --git a/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc
index b84710d26eb8a64bf2f86b9f920551a8a8dbb233..755cbdff31cd7ca31579e0d64399d681dc24ad81 100644
--- a/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc
+++ b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc
@@ -100,8 +100,7 @@ class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal {
 
     // Logic to be executed on the RecvBufAsync callback.
     auto recv_buf_callback = [this, state, peer_task, to_device, to_alloc_attr,
-                              to_device_ctx, to_tensor, dev_to_dev_stream_index,
-                              done](const Status& s) {
+                              to_device_ctx, to_tensor, done](const Status& s) {
       if (s.ok()) {
         remote_memory_manager_->TensorFromTransportOptions(
             to_tensor, state->call->resp_.transport_options(), to_device,
diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
index 5f8c300155770ed03ad12a9fa5ac74456edaf024..1124dff741309d8fd04954e70c5ebaaf164b940a 100644
--- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
@@ -167,8 +167,11 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous {
 
     // RendezvousMgr already aborted, shouldn't send RPC call any more
     if (!call->status().ok()) {
-      done(call->status(), Args(), Args(), Tensor(), false);
+      // NOTE: `*session()` can potentially be deleted before we return from
+      // `call->done()(...)`, so we must release the worker before calling the
+      // callback.
       session()->worker_cache->ReleaseWorker(src_worker, rwi);
+      done(call->status(), Args(), Args(), Tensor(), false);
       delete call;
       return;
     }
@@ -181,8 +184,11 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous {
       // If StartAbort was called prior to DeregisterCall, then the
       // current status should be bad.
       Status s = call->status();
-      done(s, Args(), call->recv_args(), call->tensor(), call->is_dead());
+      // NOTE: `*session()` can potentially be deleted before we return from
+      // `call->done()(...)`, so we must release the worker before calling the
+      // callback.
       session()->worker_cache->ReleaseWorker(src_worker, rwi);
+      done(s, Args(), call->recv_args(), call->tensor(), call->is_dead());
       delete call;
       Unref();
     });
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index e79ccd8da1f8952758ae322d3a92dec34910a9db..5b37239665d46db38fc249e9004d2200abb3d610 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 from copy import deepcopy
 from functools import partial
 from six import iteritems
-from six import iterkeys
 from six import string_types
 from six import StringIO
 from tensorflow.contrib.graph_editor import reroute
@@ -735,9 +734,8 @@ def graph_replace(target_ts, replacement_ts, dst_scope="",
   # control dependencies.
   graph = util.get_unique_graph(flatten_target_ts, check_types=(tf_ops.Tensor))
   control_ios = util.ControlOutputs(graph)
-  ops = select.get_walks_intersection_ops(list(iterkeys(replacement_ts)),
-                                          flatten_target_ts,
-                                          control_ios=control_ios)
+  ops = select.get_walks_intersection_ops(
+      list(replacement_ts), flatten_target_ts, control_ios=control_ios)
   if not ops:
     raise ValueError("Targets and replacements are not connected!")
 
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
index d319aa7986d81cf9ac2d1dc2e15b053a0aa0c31b..92016e6a83975a9b15a39a15125e0eabc111912e 100644
--- a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
@@ -19,16 +19,25 @@ tf_cc_binary(
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:candidate_sampling_ops_op_lib",
         "//tensorflow/core:control_flow_ops_op_lib",
+        "//tensorflow/core:data_flow_ops_op_lib",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:functional_ops_op_lib",
+        "//tensorflow/core:io_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:list_ops_op_lib",
+        "//tensorflow/core:logging_ops_op_lib",
+        "//tensorflow/core:lookup_ops_op_lib",
         "//tensorflow/core:manip_ops_op_lib",
         "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
+        "//tensorflow/core:parsing_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:random_ops_op_lib",
         "//tensorflow/core:remote_fused_graph_ops_op_lib",
+        "//tensorflow/core:sendrecv_ops_op_lib",
+        "//tensorflow/core:sparse_ops_op_lib",
+        "//tensorflow/core:state_ops_op_lib",
         "//tensorflow/core:string_ops_op_lib",
         "//tensorflow/core:training_ops_op_lib",
         "//tensorflow/core:user_ops_op_lib",
diff --git a/tensorflow/contrib/keras/api/keras/losses/__init__.py b/tensorflow/contrib/keras/api/keras/losses/__init__.py
index c4476a7bbd5056fa898468a46031bf3d8b1e44cf..b12832d2e2a3cccb4948d9e3bf3d226030121ac2 100644
--- a/tensorflow/contrib/keras/api/keras/losses/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/losses/__init__.py
@@ -22,7 +22,7 @@ from __future__ import print_function
 from tensorflow.python.keras.losses import binary_crossentropy
 from tensorflow.python.keras.losses import categorical_crossentropy
 from tensorflow.python.keras.losses import categorical_hinge
-from tensorflow.python.keras.losses import cosine_proximity
+from tensorflow.python.keras.losses import cosine_similarity
 from tensorflow.python.keras.losses import hinge
 from tensorflow.python.keras.losses import kullback_leibler_divergence
 from tensorflow.python.keras.losses import logcosh
diff --git a/tensorflow/contrib/keras/api/keras/metrics/__init__.py b/tensorflow/contrib/keras/api/keras/metrics/__init__.py
index 7317fdb52c5b79e787a49d71be49f5261d6b1fff..095b5d798df9ac9038fa1088cdd402dff304e87e 100644
--- a/tensorflow/contrib/keras/api/keras/metrics/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/metrics/__init__.py
@@ -23,7 +23,7 @@ from tensorflow.python.keras.metrics import binary_accuracy
 from tensorflow.python.keras.metrics import binary_crossentropy
 from tensorflow.python.keras.metrics import categorical_accuracy
 from tensorflow.python.keras.metrics import categorical_crossentropy
-from tensorflow.python.keras.metrics import cosine_proximity
+from tensorflow.python.keras.metrics import cosine_similarity
 from tensorflow.python.keras.metrics import hinge
 from tensorflow.python.keras.metrics import kullback_leibler_divergence
 from tensorflow.python.keras.metrics import mean_absolute_error
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 403b522ce45ac6ad98a321378626b87aaa7738aa..9d9524e4e4b995d795b7c71b5bd083d11c60d5ce 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -2308,7 +2308,7 @@ def layer_norm(inputs,
           initializer=init_ops.ones_initializer(),
           collections=gamma_collections,
           trainable=trainable)
-    # Calculate the moments on the last axis (layer activations).
+    # By default, compute the moments across all the dimensions except the one with index 0.
     norm_axes = list(range(begin_norm_axis, inputs_rank))
     mean, variance = nn.moments(inputs, norm_axes, keep_dims=True)
     # Compute layer normalization using the batch_normalization function.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head_test.py b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
index 7c2d9bb0767cb979dae9c84b5342d129225677ed..a52d25acf402bdda46771e9146a40cfb71e99d53 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
@@ -62,8 +62,8 @@ def _assert_no_variables(test_case):
 def _assert_metrics(test_case, expected_loss, expected_eval_metrics,
                     model_fn_ops):
   test_case.assertAlmostEqual(expected_loss, model_fn_ops.loss.eval(), places=4)
-  for k in six.iterkeys(expected_eval_metrics):
-    test_case.assertIn(k, six.iterkeys(model_fn_ops.eval_metric_ops))
+  for k in expected_eval_metrics:
+    test_case.assertIn(k, model_fn_ops.eval_metric_ops)
   variables.initialize_local_variables().run()
   for key, expected_value in six.iteritems(expected_eval_metrics):
     value_tensor, update_tensor = model_fn_ops.eval_metric_ops[key]
@@ -545,19 +545,19 @@ class MultiLabelHeadTest(test.TestCase):
       with session.Session():
         self.assertListEqual(
             [1, 0, 0], model_fn_ops.predictions["classes"].eval().tolist()[0])
-        self.assertItemsEqual(
-            ["head_name"], six.iterkeys(model_fn_ops.output_alternatives))
+        self.assertItemsEqual(["head_name"],
+                              list(model_fn_ops.output_alternatives))
         self.assertEqual(
             constants.ProblemType.CLASSIFICATION,
             model_fn_ops.output_alternatives["head_name"][0])
 
         predictions_for_serving = (
             model_fn_ops.output_alternatives["head_name"][1])
-        self.assertIn("classes", six.iterkeys(predictions_for_serving))
+        self.assertIn("classes", predictions_for_serving)
         self.assertAllEqual(
             [[b"0", b"1", b"2"], [b"0", b"1", b"2"]],
             predictions_for_serving["classes"].eval())
-        self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
+        self.assertIn("probabilities", predictions_for_serving)
         self.assertAllClose(
             [[0.731059, 0.5, 0.5],
              [0.5, 0.5, 0.731059,]],
@@ -850,18 +850,18 @@ class BinaryClassificationHeadTest(test.TestCase):
       with session.Session():
         self.assertListEqual(
             [1, 1], list(model_fn_ops.predictions["classes"].eval()))
-        self.assertItemsEqual(
-            ["head_name"], six.iterkeys(model_fn_ops.output_alternatives))
+        self.assertItemsEqual(["head_name"],
+                              list(model_fn_ops.output_alternatives))
         self.assertEqual(
             constants.ProblemType.LOGISTIC_REGRESSION,
             model_fn_ops.output_alternatives["head_name"][0])
         predictions_for_serving = (
             model_fn_ops.output_alternatives["head_name"][1])
-        self.assertIn("classes", six.iterkeys(predictions_for_serving))
+        self.assertIn("classes", predictions_for_serving)
         predicted_classes = predictions_for_serving["classes"].eval().tolist()
         self.assertListEqual(
             [b"0", b"1"], predicted_classes[0])
-        self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
+        self.assertIn("probabilities", predictions_for_serving)
 
   def testBinaryClassificationInferMode_withWeightColumn(self):
     n_classes = 2
@@ -1349,18 +1349,18 @@ class MultiClassHeadTest(test.TestCase):
         self.assertAllEqual(
             [0, 2],
             model_fn_ops.predictions["classes"].eval())
-        self.assertItemsEqual(
-            ["head_name"], six.iterkeys(model_fn_ops.output_alternatives))
+        self.assertItemsEqual(["head_name"],
+                              list(model_fn_ops.output_alternatives))
         self.assertEqual(
             constants.ProblemType.CLASSIFICATION,
             model_fn_ops.output_alternatives["head_name"][0])
         predictions_for_serving = (
             model_fn_ops.output_alternatives["head_name"][1])
-        self.assertIn("classes", six.iterkeys(predictions_for_serving))
+        self.assertIn("classes", predictions_for_serving)
         self.assertAllEqual(
             [[b"0", b"1", b"2"], [b"0", b"1", b"2"]],
             predictions_for_serving["classes"].eval())
-        self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
+        self.assertIn("probabilities", predictions_for_serving)
         self.assertAllClose(
             [[0.576117, 0.2119416, 0.2119416],
              [0.2119416, 0.2119416, 0.576117]],
@@ -1401,18 +1401,18 @@ class MultiClassHeadTest(test.TestCase):
         self.assertAllEqual(
             [b"key0", b"key2"],
             model_fn_ops.predictions["classes"].eval())
-        self.assertItemsEqual(
-            ["head_name"], six.iterkeys(model_fn_ops.output_alternatives))
+        self.assertItemsEqual(["head_name"],
+                              list(model_fn_ops.output_alternatives))
         self.assertEqual(
             constants.ProblemType.CLASSIFICATION,
             model_fn_ops.output_alternatives["head_name"][0])
         predictions_for_serving = (
             model_fn_ops.output_alternatives["head_name"][1])
-        self.assertIn("classes", six.iterkeys(predictions_for_serving))
+        self.assertIn("classes", predictions_for_serving)
         self.assertAllEqual(
             [[b"key0", b"key1", b"key2"], [b"key0", b"key1", b"key2"]],
             predictions_for_serving["classes"].eval())
-        self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
+        self.assertIn("probabilities", predictions_for_serving)
         self.assertAllClose(
             [[0.576117, 0.2119416, 0.2119416],
              [0.2119416, 0.2119416, 0.576117]],
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
index a28394964a12013c43d85701b5a0ab5c559afd62..8fda828e994bc2436eaba4475077020436703631 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
@@ -36,7 +36,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.util import deprecation
 
 
-# TODO(rohanj): This should subclass Checkpointable and implement
+# TODO(rohanj): This should subclass Trackable and implement
 # _gather_saveables_for_checkpoint.
 class ShardedMutableDenseHashTable(object):
   """A sharded version of MutableDenseHashTable.
diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index 591eabc66c49f301cf73cd912ebbef70cc9e1e3f..9fe8dafcc8edd6b80625c61a4a0e783e65b44720 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -1483,3 +1483,4 @@ class IdTableWithHashBucketsTest(test.TestCase):
 
 if __name__ == "__main__":
   test.main()
+
diff --git a/tensorflow/contrib/losses/BUILD b/tensorflow/contrib/losses/BUILD
index 728f75f8ef1eb3b107dbd0ab4ffbecd63787bf3e..f4ebbdeee883ddeef0d47cb561901c16e2195bb2 100644
--- a/tensorflow/contrib/losses/BUILD
+++ b/tensorflow/contrib/losses/BUILD
@@ -82,10 +82,11 @@ py_library(
 
 py_test(
     name = "metric_loss_ops_test",
-    size = "large",
+    size = "medium",
     srcs = [
         "python/metric_learning/metric_loss_ops_test.py",
     ],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":metric_learning_py",
diff --git a/tensorflow/contrib/makefile/proto_text_cc_files.txt b/tensorflow/contrib/makefile/proto_text_cc_files.txt
index 9ea94c74330e3e49414a6a84cd5bc0db3778114a..0a0ba36232075460b561bc54a95fc24973017571 100644
--- a/tensorflow/contrib/makefile/proto_text_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_cc_files.txt
@@ -40,7 +40,6 @@ tensorflow/core/lib/wav/wav_io.cc
 tensorflow/core/platform/cpu_info.cc
 tensorflow/core/platform/default/logging.cc
 tensorflow/core/platform/default/mutex.cc
-tensorflow/core/platform/default/protobuf.cc
 tensorflow/core/platform/default/tracing.cc
 tensorflow/core/platform/denormal.cc
 tensorflow/core/platform/env.cc
@@ -53,6 +52,7 @@ tensorflow/core/platform/posix/error.cc
 tensorflow/core/platform/posix/load_library.cc
 tensorflow/core/platform/posix/port.cc
 tensorflow/core/platform/posix/posix_file_system.cc
+tensorflow/core/platform/protobuf.cc
 tensorflow/core/platform/protobuf_util.cc
 tensorflow/core/platform/setround.cc
 tensorflow/core/platform/tensor_coding.cc
diff --git a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
index 8330c45cc16ffa536107e25699379bb5d9e8993b..1c1460ce77c99d29785c7e8b8a8e9f770a45b59f 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
@@ -25,6 +25,7 @@ tensorflow/core/framework/variable.pb.cc
 tensorflow/core/framework/versions.pb.cc
 tensorflow/core/grappler/costs/op_performance_data.pb.cc
 tensorflow/core/lib/core/error_codes.pb.cc
+tensorflow/core/protobuf/trackable_object_graph.pb.cc
 tensorflow/core/protobuf/cluster.pb.cc
 tensorflow/core/protobuf/config.pb.cc
 tensorflow/core/protobuf/eager_service.pb.cc
@@ -34,7 +35,9 @@ tensorflow/core/protobuf/meta_graph.pb.cc
 tensorflow/core/protobuf/named_tensor.pb.cc
 tensorflow/core/protobuf/queue_runner.pb.cc
 tensorflow/core/protobuf/rewriter_config.pb.cc
+tensorflow/core/protobuf/saved_object_graph.pb.cc
 tensorflow/core/protobuf/saver.pb.cc
+tensorflow/core/protobuf/struct.pb.cc
 tensorflow/core/protobuf/tensorflow_server.pb.cc
 tensorflow/core/protobuf/verifier_config.pb.cc
 tensorflow/core/util/event.pb.cc
diff --git a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
index 7257ac8feedfb8ed18c4d691cd85766e70a48ae8..5def632e8a7b65272a1339bdacd92c1fa23012d2 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
@@ -25,6 +25,7 @@ tensorflow/core/framework/variable.pb.h
 tensorflow/core/framework/versions.pb.h
 tensorflow/core/grappler/costs/op_performance_data.pb.h
 tensorflow/core/lib/core/error_codes.pb.h
+tensorflow/core/protobuf/trackable_object_graph.pb.h
 tensorflow/core/protobuf/cluster.pb.h
 tensorflow/core/protobuf/config.pb.h
 tensorflow/core/protobuf/debug.pb.h
@@ -34,7 +35,9 @@ tensorflow/core/protobuf/meta_graph.pb.h
 tensorflow/core/protobuf/named_tensor.pb.h
 tensorflow/core/protobuf/queue_runner.pb.h
 tensorflow/core/protobuf/rewriter_config.pb.h
+tensorflow/core/protobuf/saved_object_graph.pb.h
 tensorflow/core/protobuf/saver.pb.h
+tensorflow/core/protobuf/struct.pb.h
 tensorflow/core/protobuf/tensor_bundle.pb.h
 tensorflow/core/protobuf/tensorflow_server.pb.h
 tensorflow/core/protobuf/verifier_config.pb.h
diff --git a/tensorflow/contrib/makefile/tf_proto_files.txt b/tensorflow/contrib/makefile/tf_proto_files.txt
index 24d86d313b76343ed9450a33cf185d9c426696bb..deb6a5b94020a02b878bdd68a33b3737a97fcf2b 100644
--- a/tensorflow/contrib/makefile/tf_proto_files.txt
+++ b/tensorflow/contrib/makefile/tf_proto_files.txt
@@ -31,6 +31,7 @@ tensorflow/core/framework/versions.proto
 tensorflow/core/grappler/costs/op_performance_data.proto
 tensorflow/core/kernels/boosted_trees/boosted_trees.proto
 tensorflow/core/lib/core/error_codes.proto
+tensorflow/core/protobuf/trackable_object_graph.proto
 tensorflow/core/protobuf/cluster.proto
 tensorflow/core/protobuf/config.proto
 tensorflow/core/protobuf/debug.proto
@@ -40,7 +41,9 @@ tensorflow/core/protobuf/meta_graph.proto
 tensorflow/core/protobuf/named_tensor.proto
 tensorflow/core/protobuf/queue_runner.proto
 tensorflow/core/protobuf/rewriter_config.proto
+tensorflow/core/protobuf/saved_object_graph.proto
 tensorflow/core/protobuf/saver.proto
+tensorflow/core/protobuf/struct.proto
 tensorflow/core/protobuf/tensor_bundle.proto
 tensorflow/core/protobuf/tensorflow_server.proto
 tensorflow/core/protobuf/verifier_config.proto
diff --git a/tensorflow/contrib/memory_stats/BUILD b/tensorflow/contrib/memory_stats/BUILD
index 63843b993c16363a80b64622af665aaa64e05830..93701249cc8bf722c8c8558e91e0b700ca1c4a04 100644
--- a/tensorflow/contrib/memory_stats/BUILD
+++ b/tensorflow/contrib/memory_stats/BUILD
@@ -10,6 +10,7 @@ package(default_visibility = ["//tensorflow:__subpackages__"])
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_cc")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
@@ -45,6 +46,28 @@ tf_gen_op_wrapper_py(
     deps = [":memory_stats_ops_op_lib"],
 )
 
+tf_gen_op_wrapper_cc(
+    name = "memory_stats_ops",
+    out_ops_file = "memory_stats_ops",
+)
+
+cc_library(
+    name = "memory_stats_cc",
+    srcs = ["memory_stats_ops.cc"],
+    hdrs = ["memory_stats_ops.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":memory_stats_kernels",
+        ":memory_stats_ops_op_lib",
+        "//tensorflow/cc:const_op",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+    alwayslink = 1,
+)
+
 tf_custom_op_py_library(
     name = "memory_stats_py",
     srcs = [
diff --git a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
index 974fb537499c5ea4591a0a128f53d2dea67b9e57..7ae1dbeaa2d04d7846e7fada117f3941319cc1c1 100644
--- a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
+++ b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
@@ -24,13 +24,15 @@ class MemoryStatsOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     Allocator* allocator =
         context->device()->GetAllocator(AllocatorAttributes());
-    AllocatorStats allocator_stats;
-    allocator->GetStats(&allocator_stats);
+    absl::optional<AllocatorStats> allocator_stats = allocator->GetStats();
+    if (!allocator_stats) {
+      *allocator_stats = AllocatorStats();
+    }
 
     Tensor* output_tensor = nullptr;
     OP_REQUIRES_OK(
         context, context->allocate_output(0, TensorShape({}), &output_tensor));
-    output_tensor->scalar<int64>()() = ExtractAllocatorStats(allocator_stats);
+    output_tensor->scalar<int64>()() = ExtractAllocatorStats(*allocator_stats);
   }
 
  protected:
@@ -71,7 +73,7 @@ class BytesLimitOp : public MemoryStatsOp {
  private:
   int64 ExtractAllocatorStats(
       const AllocatorStats& allocator_stats) const override {
-    return allocator_stats.bytes_limit;
+    return allocator_stats.bytes_limit ? *allocator_stats.bytes_limit : -1;
   }
 };
 
@@ -93,7 +95,7 @@ class MaxBytesInUseOp : public MemoryStatsOp {
  private:
   int64 ExtractAllocatorStats(
       const AllocatorStats& allocator_stats) const override {
-    return allocator_stats.max_bytes_in_use;
+    return allocator_stats.peak_bytes_in_use;
   }
 };
 
diff --git a/tensorflow/contrib/mpi_collectives/ring.h b/tensorflow/contrib/mpi_collectives/ring.h
index cae57ce60eb09509af69f8ccab9eacedea361548..9b5d52e1b648e62af93d5420885e4f22796e3ea1 100644
--- a/tensorflow/contrib/mpi_collectives/ring.h
+++ b/tensorflow/contrib/mpi_collectives/ring.h
@@ -129,7 +129,7 @@ cudaStream_t CudaStreamForMPI();
  *  has the fully accumulated Segment 1; and so on. The scatter-reduce is
  * complete.
  *
- *  Next, the allgather distributes these fully accumululated chunks across all
+ *  Next, the allgather distributes these fully accumulated chunks across all
  * nodes. Communication proceeds in the same ring, once again in N-1 steps. At
  * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i).
  * For example, at the first iteration, the following transfers will occur:
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 12320d9e456ae93cbf95639a0c9e0c7f414f3518..f30643cf3059754daaeee4093938ac47b26f76ea 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -413,8 +413,9 @@ py_test(
 
 py_test(
     name = "shampoo_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/training/shampoo_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index 0243927ce44aec626973744507e75b20a42253e9..b2ea3daf82ed8daa6e0b9acd8e3cf258b8181615 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -44,14 +44,15 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as core_saver
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 
 
-class NonLayerCheckpointable(tracking.AutoCheckpointable):
+class NonLayerTrackable(tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
+    super(NonLayerTrackable, self).__init__()
     self.a_variable = util.add_variable(
         self, name="a_variable", shape=[])
 
@@ -64,8 +65,8 @@ class MyModel(training.Model):
     super(MyModel, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
@@ -100,7 +101,7 @@ class CheckpointingTests(test.TestCase):
     other_model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
     optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = util.Checkpoint(
+    root_trackable = util.Checkpoint(
         optimizer=optimizer, model=model, optimizer_step=optimizer_step)
     if context.executing_eagerly():
       optimizer.minimize(
@@ -116,11 +117,10 @@ class CheckpointingTests(test.TestCase):
           other_model(input_value),
           global_step=optimizer_step)
       self.evaluate(util.gather_initializers(
-          root_checkpointable))
+          root_trackable))
       self.evaluate(train_op)
-    named_variables, serialized_graph, _ = (
-        util._serialize_object_graph(
-            root_checkpointable, saveables_cache=None))
+    named_variables, serialized_graph, _ = graph_view.ObjectGraphView(
+        root_trackable).serialize_object_graph()
     expected_checkpoint_names = (
         # Created in the root node, so no prefix.
         "optimizer_step",
@@ -208,7 +208,7 @@ class CheckpointingTests(test.TestCase):
   def testSaveRestore(self):
     model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
-    root_checkpointable = util.Checkpoint(
+    root_trackable = util.Checkpoint(
         optimizer=optimizer, model=model)
     input_value = constant_op.constant([[3.]])
     if context.executing_eagerly():
@@ -217,24 +217,24 @@ class CheckpointingTests(test.TestCase):
     else:
       train_op = optimizer.minimize(model(input_value))
       # TODO(allenl): Make initialization more pleasant when graph building.
-      root_checkpointable.save_counter  # pylint: disable=pointless-statement
+      root_trackable.save_counter  # pylint: disable=pointless-statement
       self.evaluate(util.gather_initializers(
-          root_checkpointable))
+          root_trackable))
       self.evaluate(train_op)
     prefix = os.path.join(self.get_temp_dir(), "ckpt")
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
     m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
     self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
-    save_path = root_checkpointable.save(file_prefix=prefix)
+    save_path = root_trackable.save(file_prefix=prefix)
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
-    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    self.evaluate(state_ops.assign(root_trackable.save_counter, 3))
     optimizer_variables = self.evaluate(optimizer.variables())
     self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
     # Immediate restoration
-    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status = root_trackable.restore(save_path=save_path).assert_consumed()
     status.run_restore_ops()
     self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
-    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual(1, self.evaluate(root_trackable.save_counter))
     self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
     if not context.executing_eagerly():
       return  # Restore-on-create is only supported when executing eagerly
@@ -440,7 +440,7 @@ class CheckpointingTests(test.TestCase):
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
-    root = tracking.AutoCheckpointable()
+    root = util.Checkpoint()
     root.var = util.add_variable(
         root, name="var", initializer=0.)
     optimizer = adam.AdamOptimizer(0.1)
@@ -455,21 +455,17 @@ class CheckpointingTests(test.TestCase):
           util.Checkpoint(root=root, optimizer=optimizer)))
       self.evaluate(train_op)
     self.evaluate(state_ops.assign(root.var, 12.))
-    no_slots_path = util.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "no_slots"))
+    no_slots_path = root.save(os.path.join(checkpoint_directory, "no_slots"))
     root.optimizer = optimizer
     self.evaluate(state_ops.assign(root.var, 13.))
     self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
                                    14.))
-    slots_path = util.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "with_slots"))
-    new_root = tracking.AutoCheckpointable()
+    slots_path = root.save(os.path.join(checkpoint_directory, "with_slots"))
+    new_root = util.Checkpoint()
     # Load the slot-containing checkpoint (deferred), then immediately overwrite
     # the non-slot variable (also deferred).
-    slot_status = util.CheckpointableSaver(
-        new_root).restore(slots_path)
-    no_slot_status = util.CheckpointableSaver(
-        new_root).restore(no_slots_path)
+    slot_status = new_root.restore(slots_path)
+    no_slot_status = new_root.restore(no_slots_path)
     with self.assertRaises(AssertionError):
       no_slot_status.assert_consumed()
     new_root.var = util.add_variable(
@@ -508,15 +504,14 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.AutoCheckpointable()
+        obj = util.Checkpoint()
         obj.var = variable_scope.get_variable(name="v", initializer=0.)
         obj.opt = adam.AdamOptimizer(0.1)
         obj.opt.minimize(obj.var.read_value())
         self.evaluate(util.gather_initializers(obj))
-        saver = util.CheckpointableSaver(obj)
-        saver.save(checkpoint_prefix)
+        obj.save(checkpoint_prefix)
         before_ops = graph.get_operations()
-        saver.save(checkpoint_prefix)
+        obj.save(checkpoint_prefix)
         self.assertEqual(before_ops, graph.get_operations())
 
   def testManyRestoresGraph(self):
@@ -526,16 +521,15 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.AutoCheckpointable()
+        obj = util.Checkpoint()
         obj.var = variable_scope.get_variable(name="v", initializer=0.)
         obj.opt = adam.AdamOptimizer(0.1)
         obj.opt.minimize(obj.var.read_value())
         self.evaluate(util.gather_initializers(obj))
-        saver = util.CheckpointableSaver(obj)
-        save_path = saver.save(checkpoint_prefix)
-        saver.restore(save_path)
+        save_path = obj.save(checkpoint_prefix)
+        obj.restore(save_path)
         before_ops = graph.get_operations()
-        saver.restore(save_path)
+        obj.restore(save_path)
         self.assertEqual(before_ops, graph.get_operations())
 
   def testMultipleGraphsNonSlotVariables(self):
@@ -548,11 +542,11 @@ class CheckpointingTests(test.TestCase):
       first_session = session_lib.Session(graph=first_graph)
       with first_graph.as_default(), first_session.as_default():
         first_variable = resource_variable_ops.ResourceVariable([1.])
-        first_root_checkpointable = util.Checkpoint(
+        first_root_trackable = util.Checkpoint(
             optimizer=optimizer, variable=first_variable)
         train_op = optimizer.minimize(first_variable.read_value)
         self.evaluate(util.gather_initializers(
-            first_root_checkpointable))
+            first_root_trackable))
         self.evaluate(train_op)
         self.evaluate(first_variable.assign([1.]))
         self.evaluate(optimizer.get_slot(
@@ -564,23 +558,23 @@ class CheckpointingTests(test.TestCase):
       second_graph = ops.Graph()
       with second_graph.as_default(), session_lib.Session(graph=second_graph):
         second_variable = resource_variable_ops.ResourceVariable([1.])
-        second_root_checkpointable = util.Checkpoint(
+        second_root_trackable = util.Checkpoint(
             optimizer=optimizer, variable=second_variable)
         train_op = optimizer.minimize(second_variable.read_value)
-        second_root_checkpointable.restore(None).initialize_or_restore()
+        second_root_trackable.restore(None).initialize_or_restore()
         self.evaluate(train_op)
         self.evaluate(second_variable.assign([4.]))
         self.evaluate(optimizer.get_slot(
             var=second_variable, name="m").assign([5.]))
         beta_1_power, _ = optimizer._get_beta_accumulators()
         self.evaluate(beta_1_power.assign(6.))
-        save_path = second_root_checkpointable.save(checkpoint_prefix)
+        save_path = second_root_trackable.save(checkpoint_prefix)
         self.evaluate(second_variable.assign([7.]))
         self.evaluate(optimizer.get_slot(
             var=second_variable, name="m").assign([8.]))
         beta_1_power, _ = optimizer._get_beta_accumulators()
         self.assertAllEqual(6., self.evaluate(beta_1_power))
-        status = second_root_checkpointable.restore(save_path)
+        status = second_root_trackable.restore(save_path)
         status.assert_consumed().run_restore_ops()
         self.assertAllEqual([4.], self.evaluate(second_variable))
         self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
@@ -600,7 +594,7 @@ class CheckpointingTests(test.TestCase):
 class TemplateTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
-  def test_checkpointable_save_restore(self):
+  def test_trackable_save_restore(self):
 
     def _templated():
       v = variable_scope.get_variable(
@@ -647,13 +641,13 @@ class CheckpointCompatibilityTests(test.TestCase):
     model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
     optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = util.Checkpoint(
+    root_trackable = util.Checkpoint(
         optimizer=optimizer, model=model, optimizer_step=optimizer_step)
     train_op = optimizer.minimize(
         functools.partial(model, input_value),
         global_step=optimizer_step)
     self.evaluate(util.gather_initializers(
-        root_checkpointable))
+        root_trackable))
     self.evaluate(train_op)
     # A regular variable, a slot variable, and a non-slot Optimizer variable
     # with known values to check when loading.
@@ -662,24 +656,24 @@ class CheckpointCompatibilityTests(test.TestCase):
         var=model._named_dense.bias, name="m").assign([2.]))
     beta_1_power, _ = optimizer._get_beta_accumulators()
     self.evaluate(beta_1_power.assign(3.))
-    return root_checkpointable
+    return root_trackable
 
-  def _set_sentinels(self, root_checkpointable):
-    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+  def _set_sentinels(self, root_trackable):
+    self.evaluate(root_trackable.model._named_dense.bias.assign([101.]))
     self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")
         .assign([102.]))
-    beta_1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    beta_1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.evaluate(beta_1_power.assign(103.))
 
-  def _check_sentinels(self, root_checkpointable):
+  def _check_sentinels(self, root_trackable):
     self.assertAllEqual(
-        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+        [1.], self.evaluate(root_trackable.model._named_dense.bias))
     self.assertAllEqual([2.], self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")))
-    beta_1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")))
+    beta_1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.assertAllEqual(3., self.evaluate(beta_1_power))
 
   def _write_name_based_checkpoint(self):
@@ -704,7 +698,7 @@ class CheckpointCompatibilityTests(test.TestCase):
       self._set_sentinels(root)
       with self.assertRaises(AssertionError):
         self._check_sentinels(root)
-      object_saver = util.CheckpointableSaver(root)
+      object_saver = util.TrackableSaver(graph_view.ObjectGraphView(root))
       self._set_sentinels(root)
       status = object_saver.restore(save_path)
       if context.executing_eagerly():
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 1323ed014c9e51e273491694fa44a8e36cc723d0..a7f978634ed45012144b2cc49ed069f6fca44f66 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -24,7 +24,6 @@ import abc
 
 import six
 
-from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
@@ -39,7 +38,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import optimizer as optimizer_v1
 from tensorflow.python.training import slot_creator
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 
 
@@ -224,7 +223,7 @@ class _OptimizerV2State(object):
       }
     self._slots = {}
     self._non_slot_dict = {}
-    # Extra state to help Optimizers implement Checkpointable. Holds information
+    # Extra state to help Optimizers implement Trackable. Holds information
     # about variables which will be restored as soon as they're created.
     self._deferred_dependencies = {}  # Non-slot variables
     self._deferred_slot_restorations = {}  # Slot variables
@@ -367,8 +366,8 @@ class _OptimizerV2State(object):
     slot variable needs to be restored).
 
     Args:
-      slot_variable_position: A `checkpointable._CheckpointPosition` object
-        indicating the slot variable `Checkpointable` object to be restored.
+      slot_variable_position: A `trackable._CheckpointPosition` object
+        indicating the slot variable `Trackable` object to be restored.
       slot_name: The name of this `Optimizer`'s slot to restore into.
       variable: The variable object this slot is being created for.
       optional_op_name: Name to use when scoping the Variable that needs to be
@@ -386,7 +385,7 @@ class _OptimizerV2State(object):
         # (aside from double initialization), and makes variable creator scopes
         # behave the same way they do when graph building.
         and not ops.get_default_graph()._variable_creator_stack):  # pylint: disable=protected-access
-      initializer = checkpointable.CheckpointInitialValue(
+      initializer = trackable.CheckpointInitialValue(
           checkpoint_position=slot_variable_position)
       slot_variable = self.create_slot(
           var=variable,
@@ -661,7 +660,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
                name=None,
                grad_loss=None,
                stop_gradients=None,
-               scale_loss_by_num_replicas=None):
+               scale_loss_by_num_replicas=False):
     """Add operations to minimize `loss` by updating `var_list`.
 
     This method simply combines calls `compute_gradients()` and
@@ -685,8 +684,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
       stop_gradients: Optional. A Tensor or list of tensors not to differentiate
         through.
       scale_loss_by_num_replicas: Optional boolean. If true, scale the loss down
-        by the number of replicas. By default, auto-detects whether this is
-        needed.
+        by the number of replicas. DEPRECATED and generally no longer needed.
 
     Returns:
       An Operation that updates the variables in `var_list`.  If `global_step`
@@ -732,7 +730,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
                         aggregation_method=None,
                         grad_loss=None,
                         stop_gradients=None,
-                        scale_loss_by_num_replicas=None):
+                        scale_loss_by_num_replicas=False):
     """Compute gradients of `loss` for the variables in `var_list`.
 
     This is the first part of `minimize()`.  It returns a list
@@ -756,8 +754,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
       stop_gradients: Optional. A Tensor or list of tensors not to differentiate
         through.
       scale_loss_by_num_replicas: Optional boolean. If true, scale the loss down
-        by the number of replicas. By default, auto-detects whether this is
-        needed.
+        by the number of replicas. DEPRECATED and generally no longer needed.
 
     Returns:
       A list of (gradient, variable) pairs. Variable is always present, but
@@ -781,9 +778,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
           tape.watch(var_list)
         loss_value = loss()
 
-        # Scale loss for number of replicas (callable-loss case). In this case,
-        # we have to be careful to call distribute_lib.get_loss_reduction()
-        # *after* loss() is evaluated, so we know what loss reduction it uses.
+        # Scale loss for number of replicas (callable-loss case).
         loss_value = self._scale_loss(loss_value, scale_loss_by_num_replicas)
 
       if var_list is None:
@@ -839,9 +834,6 @@ class OptimizerV2(optimizer_v1.Optimizer):
   @staticmethod
   def _scale_loss(loss_value, scale_loss_by_num_replicas):
     """Scale loss for the number of replicas."""
-    if scale_loss_by_num_replicas is None:
-      scale_loss_by_num_replicas = (
-          distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN)
     if scale_loss_by_num_replicas:
       num_replicas = distribute_ctx.get_strategy().num_replicas_in_sync
       if num_replicas > 1:
@@ -1267,10 +1259,10 @@ class OptimizerV2(optimizer_v1.Optimizer):
     return self._per_graph_state.get(var._graph_key, None)
 
   # --------------
-  # Overridden methods from Checkpointable.
+  # Overridden methods from Trackable.
   # --------------
 
-  def _track_checkpointable(self, *args, **kwargs):
+  def _track_trackable(self, *args, **kwargs):
     """Optimizers may not track dependencies. Raises an error."""
     raise NotImplementedError(
         "Optimizers may not have dependencies. File a feature request if this "
@@ -1278,7 +1270,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   @property
   def _checkpoint_dependencies(self):
-    """From Checkpointable. Gather graph-specific non-slot variables to save."""
+    """From Trackable. Gather graph-specific non-slot variables to save."""
     current_graph_non_slot_variables = []
     state = self._get_per_graph_state()
     if state is not None:
@@ -1287,14 +1279,14 @@ class OptimizerV2(optimizer_v1.Optimizer):
           # Avoid comparing variables
           key=lambda item: item[0]):
         current_graph_non_slot_variables.append(
-            checkpointable.CheckpointableReference(
+            trackable.TrackableReference(
                 name=name, ref=variable_object))
     # Note: ignores super(); Optimizers may not have any dependencies outside of
     # state objects.
     return current_graph_non_slot_variables
 
   def _lookup_dependency(self, name):
-    """From Checkpointable. Find a non-slot variable in the current graph."""
+    """From Trackable. Find a non-slot variable in the current graph."""
     state = self._get_per_graph_state()
     if state is None:
       return None
@@ -1303,10 +1295,10 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   @property
   def _deferred_dependencies(self):
-    """Lets Checkpointable know where non-slot variables are created.
+    """Lets Trackable know where non-slot variables are created.
 
     If necessary, creates a new state object for the current default graph.
-    Checkpointable will then add entries to that state's deferred dependency
+    Trackable will then add entries to that state's deferred dependency
     dictionary. The state object will check that dictionary when creating
     non-slot variables, restoring their value if an entry is found.
 
@@ -1319,14 +1311,14 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   def _create_or_restore_slot_variable(self, slot_variable_position, slot_name,
                                        variable):
-    """Checkpointable: Restore a slot variable's value, possibly creating it.
+    """Trackable: Restore a slot variable's value, possibly creating it.
 
     Called when a variable which has an associated slot variable is created or
     restored.
 
     Args:
-      slot_variable_position: A `checkpointable._CheckpointPosition` object
-        indicating the slot variable `Checkpointable` object to be restored.
+      slot_variable_position: A `trackable._CheckpointPosition` object
+        indicating the slot variable `Trackable` object to be restored.
       slot_name: The name of this `Optimizer`'s slot to restore into.
       variable: The variable object this slot is being created for.
     """
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index e0c6da00d86fe4c5f881bcab7b444182da092b8f..a70f748fad60c6467946225ad5035caaf89c2aaf 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -454,7 +454,7 @@ def _CloneWithNewOperands(layer_op, input_tensor, weight_tensor,
         strides=layer_op.get_attr('strides'),
         padding=layer_op.get_attr('padding'),
         use_cudnn_on_gpu=layer_op.get_attr('use_cudnn_on_gpu'),
-        data_format=layer_op.get_attr('data_format'),
+        data_format=layer_op.get_attr('data_format').decode(),
         name=new_layer_name)
   elif layer_op.type == 'MatMul':
     return math_ops.matmul(
@@ -867,7 +867,7 @@ class _OpCloner(object):
         strides=op.get_attr('strides'),
         padding=op.get_attr('padding'),
         use_cudnn_on_gpu=op.get_attr('use_cudnn_on_gpu'),
-        data_format=op.get_attr('data_format'),
+        data_format=op.get_attr('data_format').decode(),
         name=new_name).op
 
   def _CloneDepthwiseConv2d(self, op, inputs, new_name):
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index d65d80df8073ef70d591c4ae2af99132f1c318ef..24fa740d24502a28cb42c994715d09180ee99899 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -102,26 +102,6 @@ cuda_py_tests(
     xla_enabled = True,
 )
 
-cuda_py_tests(
-    name = "core_rnn_cell_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/core_rnn_cell_test.py"],
-    additional_deps = [
-        ":rnn_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:rnn",
-        "//tensorflow/python:rnn_cell",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "@absl_py//absl/testing:parameterized",
-    ],
-)
-
 cuda_py_tests(
     name = "rnn_test",
     size = "medium",
@@ -144,32 +124,6 @@ cuda_py_tests(
     ],
 )
 
-cuda_py_tests(
-    name = "core_rnn_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/core_rnn_test.py"],
-    additional_deps = [
-        ":rnn_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:rnn",
-        "//tensorflow/python:tensor_array_ops",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/eager:context",
-    ],
-    shard_count = 10,
-)
-
 tf_py_test(
     name = "fused_rnn_cell_test",
     size = "medium",
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
deleted file mode 100644
index a70e806211c644c703f49610414854fe3e16a9b7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ /dev/null
@@ -1,1241 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for RNN cells."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.contrib import rnn as contrib_rnn
-from tensorflow.contrib.rnn.python.ops import core_rnn_cell
-from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import test_util
-from tensorflow.python.keras import layers as keras_layers
-from tensorflow.python.layers import base as base_layer
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import rnn
-from tensorflow.python.ops import rnn_cell_impl
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
-
-# pylint: enable=protected-access
-Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
-
-
-class RNNCellTest(test.TestCase, parameterized.TestCase):
-
-  def testLinear(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(1.0)):
-        x = array_ops.zeros([1, 2])
-        l = Linear([x], 2, False)([x])
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([l], {x.name: np.array([[1., 2.]])})
-        self.assertAllClose(res[0], [[3.0, 3.0]])
-
-        # Checks prevent you from accidentally creating a shared function.
-        with self.assertRaises(ValueError):
-          l1 = Linear([x], 2, False)([x])
-
-        # But you can create a new one in a new scope and share the variables.
-        with variable_scope.variable_scope("l1") as new_scope:
-          l1 = Linear([x], 2, False)([x])
-        with variable_scope.variable_scope(new_scope, reuse=True):
-          Linear([l1], 2, False)([l1])
-        self.assertEqual(len(variables_lib.trainable_variables()), 2)
-
-  def testBasicRNNCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        cell = rnn_cell_impl.BasicRNNCell(2)
-        g, _ = cell(x, m)
-        self.assertEqual([
-            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
-        ], [v.name for v in cell.trainable_variables])
-        self.assertFalse(cell.non_trainable_variables)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        self.assertEqual(res[0].shape, (1, 2))
-
-  def testBasicRNNCellNotTrainable(self):
-    with self.cached_session() as sess:
-
-      def not_trainable_getter(getter, *args, **kwargs):
-        kwargs["trainable"] = False
-        return getter(*args, **kwargs)
-
-      with variable_scope.variable_scope(
-          "root",
-          initializer=init_ops.constant_initializer(0.5),
-          custom_getter=not_trainable_getter):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        cell = rnn_cell_impl.BasicRNNCell(2)
-        g, _ = cell(x, m)
-        self.assertFalse(cell.trainable_variables)
-        self.assertEqual([
-            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
-        ], [v.name for v in cell.non_trainable_variables])
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        self.assertEqual(res[0].shape, (1, 2))
-
-  def testIndRNNCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        cell = contrib_rnn_cell.IndRNNCell(2)
-        g, _ = cell(x, m)
-        self.assertEqual([
-            "root/ind_rnn_cell/%s_w:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-            "root/ind_rnn_cell/%s_u:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-            "root/ind_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
-        ], [v.name for v in cell.trainable_variables])
-        self.assertFalse(cell.non_trainable_variables)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        self.assertEqual(res[0].shape, (1, 2))
-
-  def testGRUCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        g, _ = rnn_cell_impl.GRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.175991, 0.175991]])
-      with variable_scope.variable_scope(
-          "other", initializer=init_ops.constant_initializer(0.5)):
-        # Test GRUCell with input_size != num_units.
-        x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 2])
-        g, _ = rnn_cell_impl.GRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.156736, 0.156736]])
-
-  def testIndyGRUCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.185265, 0.17704]])
-      with variable_scope.variable_scope(
-          "other", initializer=init_ops.constant_initializer(0.5)):
-        # Test IndyGRUCell with input_size != num_units.
-        x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 2])
-        g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.155127, 0.157328]])
-
-  def testSRUCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.509682, 0.509682]])
-
-  def testSRUCellKerasRNN(self):
-    """Tests that SRUCell works with keras RNN layer."""
-    cell = contrib_rnn_cell.SRUCell(10)
-    seq_input = ops.convert_to_tensor(
-        np.random.rand(2, 3, 5), name="seq_input", dtype=dtypes.float32)
-    rnn_layer = keras_layers.RNN(cell=cell)
-    rnn_outputs_keras = rnn_layer(seq_input)
-    with self.cached_session() as sess:
-      sess.run([variables_lib.global_variables_initializer()])
-      self.assertEqual(sess.run(rnn_outputs_keras).shape, (2, 10))
-
-  def testSRUCellBiasType(self):
-    """Tests that the bias' dtype is properly set."""
-    cell = contrib_rnn_cell.SRUCell(10)
-    cell.build((2, 3, 5))
-    self.assertEqual(cell._bias.dtype, dtypes.float32_ref)
-
-    cell = contrib_rnn_cell.SRUCell(10, dtype=dtypes.int32)
-    cell.build((2, 3, 5))
-    self.assertEqual(cell._bias.dtype, dtypes.int32_ref)
-
-    cell_input = ops.convert_to_tensor(
-        np.random.rand(2, 5), name="cell_input", dtype=dtypes.float16)
-    cell_state = ops.convert_to_tensor(
-        np.random.rand(2, 10), name="cell_state", dtype=dtypes.float16)
-    cell = contrib_rnn_cell.SRUCell(10)
-    cell(cell_input, [cell_state])
-    self.assertEqual(cell._bias.dtype, dtypes.float16_ref)
-
-  def testSRUCellWithDiffSize(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 2])
-        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.55255556, 0.55255556]])
-
-  def testBasicLSTMCell(self):
-    for dtype in [dtypes.float16, dtypes.float32]:
-      np_dtype = dtype.as_numpy_dtype
-      with self.session(graph=ops.Graph()) as sess:
-        with variable_scope.variable_scope(
-            "root", initializer=init_ops.constant_initializer(0.5)):
-          x = array_ops.zeros([1, 2], dtype=dtype)
-          m = array_ops.zeros([1, 8], dtype=dtype)
-          cell = rnn_cell_impl.MultiRNNCell(
-              [
-                  rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
-                  for _ in range(2)
-              ],
-              state_is_tuple=False)
-          self.assertEqual(cell.dtype, None)
-          self.assertEqual("cell-0", cell._checkpoint_dependencies[0].name)
-          self.assertEqual("cell-1", cell._checkpoint_dependencies[1].name)
-          cell.get_config()  # Should not throw an error
-          g, out_m = cell(x, m)
-          # Layer infers the input type.
-          self.assertEqual(cell.dtype, dtype.name)
-          expected_variable_names = [
-              "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
-              rnn_cell_impl._BIAS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
-              rnn_cell_impl._BIAS_VARIABLE_NAME
-          ]
-          self.assertEqual(expected_variable_names,
-                           [v.name for v in cell.trainable_variables])
-          self.assertFalse(cell.non_trainable_variables)
-          sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run([g, out_m], {
-              x.name: np.array([[1., 1.]]),
-              m.name: 0.1 * np.ones([1, 8])
-          })
-          self.assertEqual(len(res), 2)
-          variables = variables_lib.global_variables()
-          self.assertEqual(expected_variable_names, [v.name for v in variables])
-          # The numbers in results were not calculated, this is just a
-          # smoke test.
-          self.assertAllClose(res[0], np.array(
-              [[0.240, 0.240]], dtype=np_dtype), 1e-2)
-          expected_mem = np.array(
-              [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]],
-              dtype=np_dtype)
-          self.assertAllClose(res[1], expected_mem, 1e-2)
-        with variable_scope.variable_scope(
-            "other", initializer=init_ops.constant_initializer(0.5)):
-          # Test BasicLSTMCell with input_size != num_units.
-          x = array_ops.zeros([1, 3], dtype=dtype)
-          m = array_ops.zeros([1, 4], dtype=dtype)
-          g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m)
-          sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run(
-              [g, out_m], {
-                  x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
-                  m.name: 0.1 * np.ones([1, 4], dtype=np_dtype)
-              })
-          self.assertEqual(len(res), 2)
-
-  def testBasicLSTMCellDimension0Error(self):
-    """Tests that dimension 0 in both(x and m) shape must be equal."""
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        num_units = 2
-        state_size = num_units * 2
-        batch_size = 3
-        input_size = 4
-        x = array_ops.zeros([batch_size, input_size])
-        m = array_ops.zeros([batch_size - 1, state_size])
-        with self.assertRaises(ValueError):
-          g, out_m = rnn_cell_impl.BasicLSTMCell(
-              num_units, state_is_tuple=False)(x, m)
-          sess.run([variables_lib.global_variables_initializer()])
-          sess.run(
-              [g, out_m], {
-                  x.name: 1 * np.ones([batch_size, input_size]),
-                  m.name: 0.1 * np.ones([batch_size - 1, state_size])
-              })
-
-  def testBasicLSTMCellStateSizeError(self):
-    """Tests that state_size must be num_units * 2."""
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        num_units = 2
-        state_size = num_units * 3  # state_size must be num_units * 2
-        batch_size = 3
-        input_size = 4
-        x = array_ops.zeros([batch_size, input_size])
-        m = array_ops.zeros([batch_size, state_size])
-        with self.assertRaises(ValueError):
-          g, out_m = rnn_cell_impl.BasicLSTMCell(
-              num_units, state_is_tuple=False)(x, m)
-          sess.run([variables_lib.global_variables_initializer()])
-          sess.run(
-              [g, out_m], {
-                  x.name: 1 * np.ones([batch_size, input_size]),
-                  m.name: 0.1 * np.ones([batch_size, state_size])
-              })
-
-  def testBasicLSTMCellStateTupleType(self):
-    with self.cached_session():
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m0 = (array_ops.zeros([1, 2]),) * 2
-        m1 = (array_ops.zeros([1, 2]),) * 2
-        cell = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.BasicLSTMCell(2) for _ in range(2)],
-            state_is_tuple=True)
-        self.assertTrue(isinstance(cell.state_size, tuple))
-        self.assertTrue(
-            isinstance(cell.state_size[0], rnn_cell_impl.LSTMStateTuple))
-        self.assertTrue(
-            isinstance(cell.state_size[1], rnn_cell_impl.LSTMStateTuple))
-
-        # Pass in regular tuples
-        _, (out_m0, out_m1) = cell(x, (m0, m1))
-        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
-        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
-
-        # Pass in LSTMStateTuples
-        variable_scope.get_variable_scope().reuse_variables()
-        zero_state = cell.zero_state(1, dtypes.float32)
-        self.assertTrue(isinstance(zero_state, tuple))
-        self.assertTrue(isinstance(zero_state[0], rnn_cell_impl.LSTMStateTuple))
-        self.assertTrue(isinstance(zero_state[1], rnn_cell_impl.LSTMStateTuple))
-        _, (out_m0, out_m1) = cell(x, zero_state)
-        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
-        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
-
-  def testBasicLSTMCellWithStateTuple(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m0 = array_ops.zeros([1, 4])
-        m1 = array_ops.zeros([1, 4])
-        cell = rnn_cell_impl.MultiRNNCell(
-            [
-                rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
-                for _ in range(2)
-            ],
-            state_is_tuple=True)
-        g, (out_m0, out_m1) = cell(x, (m0, m1))
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g, out_m0, out_m1], {
-                x.name: np.array([[1., 1.]]),
-                m0.name: 0.1 * np.ones([1, 4]),
-                m1.name: 0.1 * np.ones([1, 4])
-            })
-        self.assertEqual(len(res), 3)
-        # The numbers in results were not calculated, this is just a smoke test.
-        # Note, however, these values should match the original
-        # version having state_is_tuple=False.
-        self.assertAllClose(res[0], [[0.24024698, 0.24024698]])
-        expected_mem0 = np.array(
-            [[0.68967271, 0.68967271, 0.44848421, 0.44848421]])
-        expected_mem1 = np.array(
-            [[0.39897051, 0.39897051, 0.24024698, 0.24024698]])
-        self.assertAllClose(res[1], expected_mem0)
-        self.assertAllClose(res[2], expected_mem1)
-
-  def testIndyLSTMCell(self):
-    for dtype in [dtypes.float16, dtypes.float32]:
-      np_dtype = dtype.as_numpy_dtype
-      with self.session(graph=ops.Graph()) as sess:
-        with variable_scope.variable_scope(
-            "root", initializer=init_ops.constant_initializer(0.5)):
-          x = array_ops.zeros([1, 2], dtype=dtype)
-          state_0 = (array_ops.zeros([1, 2], dtype=dtype),) * 2
-          state_1 = (array_ops.zeros([1, 2], dtype=dtype),) * 2
-          cell = rnn_cell_impl.MultiRNNCell(
-              [contrib_rnn_cell.IndyLSTMCell(2) for _ in range(2)])
-          self.assertEqual(cell.dtype, None)
-          self.assertEqual("cell-0", cell._checkpoint_dependencies[0].name)
-          self.assertEqual("cell-1", cell._checkpoint_dependencies[1].name)
-          cell.get_config()  # Should not throw an error
-          g, (out_state_0, out_state_1) = cell(x, (state_0, state_1))
-          # Layer infers the input type.
-          self.assertEqual(cell.dtype, dtype.name)
-          expected_variable_names = [
-              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s_w:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s_u:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s:0" %
-              rnn_cell_impl._BIAS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s_w:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s_u:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s:0" %
-              rnn_cell_impl._BIAS_VARIABLE_NAME
-          ]
-          self.assertEqual(expected_variable_names,
-                           [v.name for v in cell.trainable_variables])
-          self.assertFalse(cell.non_trainable_variables)
-          sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run(
-              [g, out_state_0, out_state_1], {
-                  x.name: np.array([[1., 1.]]),
-                  state_0[0].name: 0.1 * np.ones([1, 2]),
-                  state_0[1].name: 0.1 * np.ones([1, 2]),
-                  state_1[0].name: 0.1 * np.ones([1, 2]),
-                  state_1[1].name: 0.1 * np.ones([1, 2]),
-              })
-          self.assertEqual(len(res), 3)
-          variables = variables_lib.global_variables()
-          self.assertEqual(expected_variable_names, [v.name for v in variables])
-          # Only check the range of outputs as this is just a smoke test.
-          self.assertAllInRange(res[0], -1.0, 1.0)
-          self.assertAllInRange(res[1], -1.0, 1.0)
-          self.assertAllInRange(res[2], -1.0, 1.0)
-        with variable_scope.variable_scope(
-            "other", initializer=init_ops.constant_initializer(0.5)):
-          # Test IndyLSTMCell with input_size != num_units.
-          x = array_ops.zeros([1, 3], dtype=dtype)
-          state = (array_ops.zeros([1, 2], dtype=dtype),) * 2
-          g, out_state = contrib_rnn_cell.IndyLSTMCell(2)(x, state)
-          sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run(
-              [g, out_state], {
-                  x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
-                  state[0].name: 0.1 * np.ones([1, 2], dtype=np_dtype),
-                  state[1].name: 0.1 * np.ones([1, 2], dtype=np_dtype),
-              })
-          self.assertEqual(len(res), 2)
-
-  def testLSTMCell(self):
-    with self.cached_session() as sess:
-      num_units = 8
-      num_proj = 6
-      state_size = num_units + num_proj
-      batch_size = 3
-      input_size = 2
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([batch_size, input_size])
-        m = array_ops.zeros([batch_size, state_size])
-        cell = rnn_cell_impl.LSTMCell(
-            num_units=num_units,
-            num_proj=num_proj,
-            forget_bias=1.0,
-            state_is_tuple=False)
-        output, state = cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [output, state], {
-                x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]),
-                m.name: 0.1 * np.ones((batch_size, state_size))
-            })
-        self.assertEqual(len(res), 2)
-        # The numbers in results were not calculated, this is mostly just a
-        # smoke test.
-        self.assertEqual(res[0].shape, (batch_size, num_proj))
-        self.assertEqual(res[1].shape, (batch_size, state_size))
-        # Different inputs so different outputs and states
-        for i in range(1, batch_size):
-          self.assertTrue(
-              float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) > 1e-6)
-          self.assertTrue(
-              float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) > 1e-6)
-
-  def testLSTMCellVariables(self):
-    with self.cached_session():
-      num_units = 8
-      num_proj = 6
-      state_size = num_units + num_proj
-      batch_size = 3
-      input_size = 2
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([batch_size, input_size])
-        m = array_ops.zeros([batch_size, state_size])
-        cell = rnn_cell_impl.LSTMCell(
-            num_units=num_units,
-            num_proj=num_proj,
-            forget_bias=1.0,
-            state_is_tuple=False)
-        cell(x, m)  # Execute to create variables
-      variables = variables_lib.global_variables()
-      self.assertEquals(variables[0].op.name, "root/lstm_cell/kernel")
-      self.assertEquals(variables[1].op.name, "root/lstm_cell/bias")
-      self.assertEquals(variables[2].op.name,
-                        "root/lstm_cell/projection/kernel")
-
-  def testLSTMCellLayerNorm(self):
-    with self.cached_session() as sess:
-      num_units = 2
-      num_proj = 3
-      batch_size = 1
-      input_size = 4
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([batch_size, input_size])
-        c = array_ops.zeros([batch_size, num_units])
-        h = array_ops.zeros([batch_size, num_proj])
-        state = rnn_cell_impl.LSTMStateTuple(c, h)
-        cell = contrib_rnn_cell.LayerNormLSTMCell(
-            num_units=num_units,
-            num_proj=num_proj,
-            forget_bias=1.0,
-            layer_norm=True,
-            norm_gain=1.0,
-            norm_shift=0.0)
-        g, out_m = cell(x, state)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g, out_m], {
-                x.name: np.ones((batch_size, input_size)),
-                c.name: 0.1 * np.ones((batch_size, num_units)),
-                h.name: 0.1 * np.ones((batch_size, num_proj))
-            })
-        self.assertEqual(len(res), 2)
-        # The numbers in results were not calculated, this is mostly just a
-        # smoke test.
-        self.assertEqual(res[0].shape, (batch_size, num_proj))
-        self.assertEqual(res[1][0].shape, (batch_size, num_units))
-        self.assertEqual(res[1][1].shape, (batch_size, num_proj))
-        # Different inputs so different outputs and states
-        for i in range(1, batch_size):
-          self.assertTrue(
-              float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) < 1e-6)
-          self.assertTrue(
-              float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) < 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testWrapperCheckpointing(self):
-    for wrapper_type in [
-        rnn_cell_impl.DropoutWrapper,
-        rnn_cell_impl.ResidualWrapper,
-        lambda cell: rnn_cell_impl.MultiRNNCell([cell])]:
-      cell = rnn_cell_impl.BasicRNNCell(1)
-      wrapper = wrapper_type(cell)
-      wrapper(array_ops.ones([1, 1]),
-              state=wrapper.zero_state(batch_size=1, dtype=dtypes.float32))
-      self.evaluate([v.initializer for v in cell.variables])
-      checkpoint = checkpointable_utils.Checkpoint(wrapper=wrapper)
-      prefix = os.path.join(self.get_temp_dir(), "ckpt")
-      self.evaluate(cell._bias.assign([40.]))
-      save_path = checkpoint.save(prefix)
-      self.evaluate(cell._bias.assign([0.]))
-      checkpoint.restore(save_path).assert_consumed().run_restore_ops()
-      self.assertAllEqual([40.], self.evaluate(cell._bias))
-
-  def testOutputProjectionWrapper(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 3])
-        cell = contrib_rnn.OutputProjectionWrapper(rnn_cell_impl.GRUCell(3), 2)
-        g, new_m = cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, new_m], {
-            x.name: np.array([[1., 1., 1.]]),
-            m.name: np.array([[0.1, 0.1, 0.1]])
-        })
-        self.assertEqual(res[1].shape, (1, 3))
-        # The numbers in results were not calculated, this is just a smoke test.
-        self.assertAllClose(res[0], [[0.231907, 0.231907]])
-
-  def testInputProjectionWrapper(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 3])
-        cell = contrib_rnn.InputProjectionWrapper(
-            rnn_cell_impl.GRUCell(3), num_proj=3)
-        g, new_m = cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, new_m], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1, 0.1]])
-        })
-        self.assertEqual(res[1].shape, (1, 3))
-        # The numbers in results were not calculated, this is just a smoke test.
-        self.assertAllClose(res[0], [[0.154605, 0.154605, 0.154605]])
-
-  @parameterized.parameters(
-      [rnn_cell_impl.ResidualWrapper, rnn_cell_impl.ResidualWrapperV2])
-  @test_util.run_in_graph_and_eager_modes
-  def testResidualWrapper(self, wrapper_type):
-    x = ops.convert_to_tensor(np.array([[1., 1., 1.]]))
-    m = ops.convert_to_tensor(np.array([[0.1, 0.1, 0.1]]))
-    base_cell = rnn_cell_impl.GRUCell(
-        3, kernel_initializer=init_ops.constant_initializer(0.5),
-        bias_initializer=init_ops.constant_initializer(0.5))
-    g, m_new = base_cell(x, m)
-    wrapper_object = wrapper_type(base_cell)
-    (name, dep), = wrapper_object._checkpoint_dependencies
-    wrapper_object.get_config()  # Should not throw an error
-    self.assertIs(dep, base_cell)
-    self.assertEqual("cell", name)
-
-    g_res, m_new_res = wrapper_object(x, m)
-    self.evaluate([variables_lib.global_variables_initializer()])
-    res = self.evaluate([g, g_res, m_new, m_new_res])
-    # Residual connections
-    self.assertAllClose(res[1], res[0] + [1., 1., 1.])
-    # States are left untouched
-    self.assertAllClose(res[2], res[3])
-
-  @parameterized.parameters(
-      [rnn_cell_impl.ResidualWrapper, rnn_cell_impl.ResidualWrapperV2])
-  @test_util.run_in_graph_and_eager_modes
-  def testResidualWrapperWithSlice(self, wrapper_type):
-    x = ops.convert_to_tensor(np.array([[1., 1., 1., 1., 1.]]))
-    m = ops.convert_to_tensor(np.array([[0.1, 0.1, 0.1]]))
-    base_cell = rnn_cell_impl.GRUCell(
-        3, kernel_initializer=init_ops.constant_initializer(0.5),
-        bias_initializer=init_ops.constant_initializer(0.5))
-    g, m_new = base_cell(x, m)
-
-    def residual_with_slice_fn(inp, out):
-      inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3])
-      return inp_sliced + out
-
-    g_res, m_new_res = wrapper_type(
-        base_cell, residual_with_slice_fn)(x, m)
-    self.evaluate([variables_lib.global_variables_initializer()])
-    res_g, res_g_res, res_m_new, res_m_new_res = self.evaluate(
-        [g, g_res, m_new, m_new_res])
-    # Residual connections
-    self.assertAllClose(res_g_res, res_g + [1., 1., 1.])
-    # States are left untouched
-    self.assertAllClose(res_m_new, res_m_new_res)
-
-  def testDeviceWrapper(self):
-    with variable_scope.variable_scope(
-        "root", initializer=init_ops.constant_initializer(0.5)):
-      x = array_ops.zeros([1, 3])
-      m = array_ops.zeros([1, 3])
-      wrapped = rnn_cell_impl.GRUCell(3)
-      cell = rnn_cell_impl.DeviceWrapper(wrapped, "/cpu:14159")
-      (name, dep), = cell._checkpoint_dependencies
-      cell.get_config()  # Should not throw an error
-      self.assertIs(dep, wrapped)
-      self.assertEqual("cell", name)
-
-      outputs, _ = cell(x, m)
-      self.assertTrue("cpu:14159" in outputs.device.lower())
-
-  def _retrieve_cpu_gpu_stats(self, run_metadata):
-    cpu_stats = None
-    gpu_stats = None
-    step_stats = run_metadata.step_stats
-    for ds in step_stats.dev_stats:
-      if "cpu:0" in ds.device[-5:].lower():
-        cpu_stats = ds.node_stats
-      if "gpu:0" == ds.device[-5:].lower():
-        gpu_stats = ds.node_stats
-    return cpu_stats, gpu_stats
-
-  def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self):
-    if not test.is_gpu_available():
-      # Can't perform this test w/o a GPU
-      return
-
-    gpu_dev = test.gpu_device_name()
-    with self.session(use_gpu=True) as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 1, 3])
-        cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), gpu_dev)
-        with ops.device("/cpu:0"):
-          outputs, _ = rnn.dynamic_rnn(
-              cell=cell, inputs=x, dtype=dtypes.float32)
-        run_metadata = config_pb2.RunMetadata()
-        opts = config_pb2.RunOptions(
-            trace_level=config_pb2.RunOptions.FULL_TRACE)
-
-        sess.run([variables_lib.global_variables_initializer()])
-        _ = sess.run(outputs, options=opts, run_metadata=run_metadata)
-
-      cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata)
-      self.assertFalse([s for s in cpu_stats if "gru_cell" in s.node_name])
-      self.assertTrue([s for s in gpu_stats if "gru_cell" in s.node_name])
-
-  def testEmbeddingWrapper(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 1], dtype=dtypes.int32)
-        m = array_ops.zeros([1, 2])
-        embedding_cell = contrib_rnn.EmbeddingWrapper(
-            rnn_cell_impl.GRUCell(2), embedding_classes=3, embedding_size=2)
-        self.assertEqual(embedding_cell.output_size, 2)
-        g, new_m = embedding_cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, new_m], {
-            x.name: np.array([[1]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        self.assertEqual(res[1].shape, (1, 2))
-        # The numbers in results were not calculated, this is just a smoke test.
-        self.assertAllClose(res[0], [[0.17139, 0.17139]])
-
-  def testEmbeddingWrapperWithDynamicRnn(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope("root"):
-        inputs = ops.convert_to_tensor([[[0], [0]]], dtype=dtypes.int64)
-        input_lengths = ops.convert_to_tensor([2], dtype=dtypes.int64)
-        embedding_cell = contrib_rnn.EmbeddingWrapper(
-            rnn_cell_impl.BasicLSTMCell(1, state_is_tuple=True),
-            embedding_classes=1,
-            embedding_size=2)
-        outputs, _ = rnn.dynamic_rnn(
-            cell=embedding_cell,
-            inputs=inputs,
-            sequence_length=input_lengths,
-            dtype=dtypes.float32)
-        sess.run([variables_lib.global_variables_initializer()])
-        # This will fail if output's dtype is inferred from input's.
-        sess.run(outputs)
-
-  def testMultiRNNCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 4])
-        multi_rnn_cell = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
-            state_is_tuple=False)
-        _, ml = multi_rnn_cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(ml, {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1, 0.1, 0.1]])
-        })
-        # The numbers in results were not calculated, this is just a smoke test.
-        self.assertAllClose(res, [[0.175991, 0.175991, 0.13248, 0.13248]])
-        self.assertEqual(len(multi_rnn_cell.weights), 2 * 4)
-        self.assertTrue(
-            [x.dtype == dtypes.float32 for x in multi_rnn_cell.weights])
-
-  def testMultiRNNCellWithStateTuple(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m_bad = array_ops.zeros([1, 4])
-        m_good = (array_ops.zeros([1, 2]), array_ops.zeros([1, 2]))
-
-        # Test incorrectness of state
-        with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"):
-          rnn_cell_impl.MultiRNNCell(
-              [rnn_cell_impl.GRUCell(2) for _ in range(2)],
-              state_is_tuple=True)(x, m_bad)
-
-        _, ml = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
-            state_is_tuple=True)(x, m_good)
-
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            ml, {
-                x.name: np.array([[1., 1.]]),
-                m_good[0].name: np.array([[0.1, 0.1]]),
-                m_good[1].name: np.array([[0.1, 0.1]])
-            })
-
-        # The numbers in results were not calculated, this is just a
-        # smoke test.  However, these numbers should match those of
-        # the test testMultiRNNCell.
-        self.assertAllClose(res[0], [[0.175991, 0.175991]])
-        self.assertAllClose(res[1], [[0.13248, 0.13248]])
-
-  @parameterized.parameters(
-      [[rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2],
-       [rnn_cell_impl.ResidualWrapper, rnn_cell_impl.ResidualWrapperV2]])
-  @test_util.run_in_graph_and_eager_modes
-  def testWrapperKerasStyle(self, wrapper, wrapper_v2):
-    """Tests if wrapper cell is instantiated in keras style scope."""
-    wrapped_cell_v2 = wrapper_v2(rnn_cell_impl.BasicRNNCell(1))
-    self.assertTrue(wrapped_cell_v2._keras_style)
-
-    wrapped_cell = wrapper(rnn_cell_impl.BasicRNNCell(1))
-    self.assertFalse(wrapped_cell._keras_style)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
-  @test_util.run_in_graph_and_eager_modes
-  def testWrapperV2VariableNames(self, wrapper):
-    """Tests that variables names do not depend on wrapper in RNN layer."""
-
-    def _rnn_input(apply_wrapper, name):
-      """Creates a RNN layer with/without wrapper and returns built rnn cell."""
-      with base_layer.keras_style_scope():
-        base_cell = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.BasicRNNCell(1, name="basic_rnn_cell")
-             for _ in range(2)])
-      if apply_wrapper:
-        rnn_cell = wrapper(base_cell)
-      else:
-        rnn_cell = base_cell
-      rnn_layer = keras_layers.RNN(rnn_cell, name=name)
-      inputs = ops.convert_to_tensor([[[1]]], dtype=dtypes.float32)
-      _ = rnn_layer(inputs)
-      return base_cell._cells[0]
-
-    rnn_1 = _rnn_input(True, name="rnn_0")
-    rnn_2 = _rnn_input(False, name="rnn_1")
-
-    for i, cell in enumerate([rnn_1, rnn_2]):
-      var_prefix = "rnn_{}/cell_0/basic_rnn_cell/".format(i)
-      self.assertCountEqual([v.name for v in cell.weights],
-                            (var_prefix + "kernel:0", var_prefix + "bias:0"))
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
-  @test_util.run_in_graph_and_eager_modes
-  def testWrapperWeights(self, wrapper):
-    """Tests that wrapper weights contain wrapped cells weights."""
-
-    with base_layer.keras_style_scope():
-      base_cell = rnn_cell_impl.BasicRNNCell(1, name="basic_rnn_cell")
-    rnn_cell = wrapper(base_cell)
-    rnn_layer = keras_layers.RNN(rnn_cell)
-    inputs = ops.convert_to_tensor([[[1]]], dtype=dtypes.float32)
-    rnn_layer(inputs)
-
-    expected_weights = ["rnn/" + var for var in ("kernel:0", "bias:0")]
-    self.assertEqual(len(rnn_cell.weights), 2)
-    self.assertCountEqual([v.name for v in rnn_cell.weights], expected_weights)
-    self.assertCountEqual([v.name for v in rnn_cell.trainable_variables],
-                          expected_weights)
-    self.assertCountEqual([v.name for v in rnn_cell.non_trainable_variables],
-                          [])
-    self.assertCountEqual([v.name for v in rnn_cell._cell.weights],
-                          expected_weights)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
-  @test_util.run_in_graph_and_eager_modes
-  def testWrapperV2Caller(self, wrapper):
-    """Tests that wrapper V2 is using the LayerRNNCell's caller."""
-
-    with base_layer.keras_style_scope():
-      base_cell = rnn_cell_impl.MultiRNNCell(
-          [rnn_cell_impl.BasicRNNCell(1) for _ in range(2)])
-    rnn_cell = wrapper(base_cell)
-    inputs = ops.convert_to_tensor([[1]], dtype=dtypes.float32)
-    state = ops.convert_to_tensor([[1]], dtype=dtypes.float32)
-    _ = rnn_cell(inputs, [state, state])
-    weights = base_cell._cells[0].weights
-    self.assertLen(weights, expected_len=2)
-    self.assertTrue(all(["_wrapper" in v.name for v in weights]))
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
-  @test_util.run_in_graph_and_eager_modes
-  def testWrapperV2Build(self, wrapper):
-    cell = rnn_cell_impl.LSTMCell(10)
-    wrapper = wrapper(cell)
-    wrapper.build((1,))
-    self.assertTrue(cell.built)
-
-
-@test_util.run_all_in_graph_and_eager_modes
-class DropoutWrapperTest(test.TestCase, parameterized.TestCase):
-
-  def _testDropoutWrapper(self,
-                          batch_size=None,
-                          time_steps=None,
-                          parallel_iterations=None,
-                          wrapper_type=None,
-                          scope="root",
-                          **kwargs):
-    if batch_size is None and time_steps is None:
-      # 2 time steps, batch size 1, depth 3
-      batch_size = 1
-      time_steps = 2
-      x = constant_op.constant(
-          [[[2., 2., 2.]], [[1., 1., 1.]]], dtype=dtypes.float32)
-      m = rnn_cell_impl.LSTMStateTuple(
-          *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32)] * 2)
-    else:
-      x = constant_op.constant(
-          np.random.randn(time_steps, batch_size, 3).astype(np.float32))
-      m = rnn_cell_impl.LSTMStateTuple(*[
-          constant_op.
-          constant([[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32)] * 2)
-    outputs, final_state = rnn.dynamic_rnn(
-        cell=wrapper_type(
-            rnn_cell_impl.LSTMCell(
-                3, initializer=init_ops.constant_initializer(0.5)),
-            dtype=x.dtype, **kwargs),
-        time_major=True,
-        parallel_iterations=parallel_iterations,
-        inputs=x,
-        initial_state=m,
-        scope=scope)
-    self.evaluate([variables_lib.global_variables_initializer()])
-    res = self.evaluate([outputs, final_state])
-    self.assertEqual(res[0].shape, (time_steps, batch_size, 3))
-    self.assertEqual(res[1].c.shape, (batch_size, 3))
-    self.assertEqual(res[1].h.shape, (batch_size, 3))
-    return res
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperProperties(self, wrapper_type):
-    cell = rnn_cell_impl.BasicRNNCell(10)
-    wrapper = wrapper_type(cell)
-    # Github issue 15810
-    self.assertEqual(wrapper.wrapped_cell, cell)
-    self.assertEqual(wrapper.state_size, 10)
-    self.assertEqual(wrapper.output_size, 10)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperZeroState(self, wrapper_type):
-    class _Cell(rnn_cell_impl.BasicRNNCell):
-
-      def zero_state(self, batch_size=None, dtype=None):
-        return "wrapped_cell_zero_state"
-    wrapper = wrapper_type(_Cell(10))
-    self.assertEqual(wrapper.zero_state(10, dtypes.float32),
-                     "wrapped_cell_zero_state")
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperKeepAllConstantInput(self, wrapper_type):
-    keep = array_ops.ones([])
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep,
-        wrapper_type=wrapper_type)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    true_full_final_c = np.array(
-        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
-    self.assertAllClose(true_full_output, res[0])
-    self.assertAllClose(true_full_output[1], res[1].h)
-    self.assertAllClose(true_full_final_c, res[1].c)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperKeepAll(self, wrapper_type):
-    keep = variable_scope.get_variable("all", initializer=1.0)
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep,
-        wrapper_type=wrapper_type)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    true_full_final_c = np.array(
-        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
-    self.assertAllClose(true_full_output, res[0])
-    self.assertAllClose(true_full_output[1], res[1].h)
-    self.assertAllClose(true_full_final_c, res[1].c)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperWithSeed(self, wrapper_type):
-    keep_some = 0.5
-    random_seed.set_random_seed(2)
-    ## Use parallel_iterations = 1 in both calls to
-    ## _testDropoutWrapper to ensure the (per-time step) dropout is
-    ## consistent across both calls.  Otherwise the seed may not end
-    ## up being munged consistently across both graphs.
-    res_standard_1 = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        seed=10,
-        parallel_iterations=1,
-        wrapper_type=wrapper_type,
-        scope="root_1")
-    random_seed.set_random_seed(2)
-    res_standard_2 = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        seed=10,
-        parallel_iterations=1,
-        wrapper_type=wrapper_type,
-        scope="root_2")
-    self.assertAllClose(res_standard_1[0], res_standard_2[0])
-    self.assertAllClose(res_standard_1[1].c, res_standard_2[1].c)
-    self.assertAllClose(res_standard_1[1].h, res_standard_2[1].h)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperKeepNoOutput(self, wrapper_type):
-    keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-6)
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_all,
-        output_keep_prob=keep_none,
-        state_keep_prob=keep_all,
-        wrapper_type=wrapper_type)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    true_full_final_c = np.array(
-        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
-    self.assertAllClose(np.zeros(res[0].shape), res[0])
-    self.assertAllClose(true_full_output[1], res[1].h)
-    self.assertAllClose(true_full_final_c, res[1].c)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperKeepNoStateExceptLSTMCellMemory(self, wrapper_type):
-    keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-6)
-    # Even though we dropout state, by default DropoutWrapper never
-    # drops out the memory ("c") term of an LSTMStateTuple.
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_all,
-        output_keep_prob=keep_all,
-        state_keep_prob=keep_none,
-        wrapper_type=wrapper_type)
-    true_c_state = np.array([[1.713925, 1.713925, 1.713925]], dtype=np.float32)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    self.assertAllClose(true_full_output[0], res[0][0])
-    # Second output is modified by zero input state
-    self.assertGreater(np.linalg.norm(true_full_output[1] - res[0][1]), 1e-4)
-    # h state has been set to zero
-    self.assertAllClose(np.zeros(res[1].h.shape), res[1].h)
-    # c state of an LSTMStateTuple is NEVER modified.
-    self.assertAllClose(true_c_state, res[1].c)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperKeepNoInput(self, wrapper_type):
-    keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-6)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    true_full_final_c = np.array(
-        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
-    # All outputs are different because inputs are zeroed out
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_none,
-        output_keep_prob=keep_all,
-        state_keep_prob=keep_all,
-        wrapper_type=wrapper_type)
-    self.assertGreater(np.linalg.norm(res[0] - true_full_output), 1e-4)
-    self.assertGreater(np.linalg.norm(res[1].h - true_full_output[1]), 1e-4)
-    self.assertGreater(np.linalg.norm(res[1].c - true_full_final_c), 1e-4)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperRecurrentOutput(self, wrapper_type):
-    keep_some = 0.8
-    keep_all = variable_scope.get_variable("all", initializer=1.0)
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_all,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_all,
-        variational_recurrent=True,
-        wrapper_type=wrapper_type,
-        input_size=3,
-        batch_size=5,
-        time_steps=7)
-    # Ensure the same dropout pattern for all time steps
-    output_mask = np.abs(res[0]) > 1e-6
-    for m in output_mask[1:]:
-      self.assertAllClose(output_mask[0], m)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperRecurrentStateInputAndOutput(self, wrapper_type):
-    keep_some = 0.9
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        variational_recurrent=True,
-        wrapper_type=wrapper_type,
-        input_size=3,
-        batch_size=5,
-        time_steps=7)
-
-    # Smoke test for the state/input masks.
-    output_mask = np.abs(res[0]) > 1e-6
-    for time_step in output_mask:
-      # Ensure the same dropout output pattern for all time steps
-      self.assertAllClose(output_mask[0], time_step)
-      for batch_entry in time_step:
-        # Assert all batch entries get the same mask
-        self.assertAllClose(batch_entry, time_step[0])
-
-    # For state, ensure all batch entries have the same mask
-    state_c_mask = np.abs(res[1].c) > 1e-6
-    state_h_mask = np.abs(res[1].h) > 1e-6
-    for batch_entry in state_c_mask:
-      self.assertAllClose(batch_entry, state_c_mask[0])
-    for batch_entry in state_h_mask:
-      self.assertAllClose(batch_entry, state_h_mask[0])
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperRecurrentStateInputAndOutputWithSeed(
-      self, wrapper_type):
-    keep_some = 0.9
-    random_seed.set_random_seed(2347)
-    np.random.seed(23487)
-    res0 = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        variational_recurrent=True,
-        wrapper_type=wrapper_type,
-        input_size=3,
-        batch_size=5,
-        time_steps=7,
-        seed=-234987,
-        scope="root_0")
-    random_seed.set_random_seed(2347)
-    np.random.seed(23487)
-    res1 = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        variational_recurrent=True,
-        wrapper_type=wrapper_type,
-        input_size=3,
-        batch_size=5,
-        time_steps=7,
-        seed=-234987,
-        scope="root_1")
-
-    output_mask = np.abs(res0[0]) > 1e-6
-    for time_step in output_mask:
-      # Ensure the same dropout output pattern for all time steps
-      self.assertAllClose(output_mask[0], time_step)
-      for batch_entry in time_step:
-        # Assert all batch entries get the same mask
-        self.assertAllClose(batch_entry, time_step[0])
-
-    # For state, ensure all batch entries have the same mask
-    state_c_mask = np.abs(res0[1].c) > 1e-6
-    state_h_mask = np.abs(res0[1].h) > 1e-6
-    for batch_entry in state_c_mask:
-      self.assertAllClose(batch_entry, state_c_mask[0])
-    for batch_entry in state_h_mask:
-      self.assertAllClose(batch_entry, state_h_mask[0])
-
-    # Ensure seeded calculation is identical.
-    self.assertAllClose(res0[0], res1[0])
-    self.assertAllClose(res0[1].c, res1[1].c)
-    self.assertAllClose(res0[1].h, res1[1].h)
-
-
-def basic_rnn_cell(inputs, state, num_units, scope=None):
-  if state is None:
-    if inputs is not None:
-      batch_size = inputs.get_shape()[0]
-      dtype = inputs.dtype
-    else:
-      batch_size = 0
-      dtype = dtypes.float32
-    init_output = array_ops.zeros(
-        array_ops.stack([batch_size, num_units]), dtype=dtype)
-    init_state = array_ops.zeros(
-        array_ops.stack([batch_size, num_units]), dtype=dtype)
-    init_output.set_shape([batch_size, num_units])
-    init_state.set_shape([batch_size, num_units])
-    return init_output, init_state
-  else:
-    with variable_scope.variable_scope(scope, "basic_rnn_cell",
-                                       [inputs, state]):
-      output = math_ops.tanh(
-          Linear([inputs, state], num_units, True)([inputs, state]))
-    return output, output
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index d7ee7fb8faacb0876218a983d68f007e1905c11e..dfac2df6a0d4143106ad0f090805597c26659280 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -22,6 +22,7 @@ import itertools
 
 import numpy as np
 
+from tensorflow.contrib.rnn.python.ops import core_rnn_cell as legacy_rnn_cell
 from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
@@ -53,6 +54,294 @@ from tensorflow.python.util import nest
 
 class RNNCellTest(test.TestCase):
 
+  def testIndRNNCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        cell = contrib_rnn_cell.IndRNNCell(2)
+        g, _ = cell(x, m)
+        self.assertEqual([
+            "root/ind_rnn_cell/%s_w:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/ind_rnn_cell/%s_u:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/ind_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
+        ], [v.name for v in cell.trainable_variables])
+        self.assertFalse(cell.non_trainable_variables)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        self.assertEqual(res[0].shape, (1, 2))
+
+  def testIndyGRUCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.185265, 0.17704]])
+      with variable_scope.variable_scope(
+          "other", initializer=init_ops.constant_initializer(0.5)):
+        # Test IndyGRUCell with input_size != num_units.
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.155127, 0.157328]])
+
+  def testIndyLSTMCell(self):
+    for dtype in [dtypes.float16, dtypes.float32]:
+      np_dtype = dtype.as_numpy_dtype
+      with self.session(graph=ops.Graph()) as sess:
+        with variable_scope.variable_scope(
+            "root", initializer=init_ops.constant_initializer(0.5)):
+          x = array_ops.zeros([1, 2], dtype=dtype)
+          state_0 = (array_ops.zeros([1, 2], dtype=dtype),) * 2
+          state_1 = (array_ops.zeros([1, 2], dtype=dtype),) * 2
+          cell = rnn_cell_impl.MultiRNNCell(
+              [contrib_rnn_cell.IndyLSTMCell(2) for _ in range(2)])
+          self.assertEqual(cell.dtype, None)
+          self.assertEqual("cell-0", cell._checkpoint_dependencies[0].name)
+          self.assertEqual("cell-1", cell._checkpoint_dependencies[1].name)
+          cell.get_config()  # Should not throw an error
+          g, (out_state_0, out_state_1) = cell(x, (state_0, state_1))
+          # Layer infers the input type.
+          self.assertEqual(cell.dtype, dtype.name)
+          expected_variable_names = [
+              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s_w:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s_u:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s:0" %
+              rnn_cell_impl._BIAS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s_w:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s_u:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s:0" %
+              rnn_cell_impl._BIAS_VARIABLE_NAME
+          ]
+          self.assertEqual(expected_variable_names,
+                           [v.name for v in cell.trainable_variables])
+          self.assertFalse(cell.non_trainable_variables)
+          sess.run([variables.global_variables_initializer()])
+          res = sess.run(
+              [g, out_state_0, out_state_1], {
+                  x.name: np.array([[1., 1.]]),
+                  state_0[0].name: 0.1 * np.ones([1, 2]),
+                  state_0[1].name: 0.1 * np.ones([1, 2]),
+                  state_1[0].name: 0.1 * np.ones([1, 2]),
+                  state_1[1].name: 0.1 * np.ones([1, 2]),
+              })
+          self.assertEqual(len(res), 3)
+          global_variables = variables.global_variables()
+          self.assertEqual(expected_variable_names,
+                           [v.name for v in global_variables])
+          # Only check the range of outputs as this is just a smoke test.
+          self.assertAllInRange(res[0], -1.0, 1.0)
+          self.assertAllInRange(res[1], -1.0, 1.0)
+          self.assertAllInRange(res[2], -1.0, 1.0)
+        with variable_scope.variable_scope(
+            "other", initializer=init_ops.constant_initializer(0.5)):
+          # Test IndyLSTMCell with input_size != num_units.
+          x = array_ops.zeros([1, 3], dtype=dtype)
+          state = (array_ops.zeros([1, 2], dtype=dtype),) * 2
+          g, out_state = contrib_rnn_cell.IndyLSTMCell(2)(x, state)
+          sess.run([variables.global_variables_initializer()])
+          res = sess.run(
+              [g, out_state], {
+                  x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
+                  state[0].name: 0.1 * np.ones([1, 2], dtype=np_dtype),
+                  state[1].name: 0.1 * np.ones([1, 2], dtype=np_dtype),
+              })
+          self.assertEqual(len(res), 2)
+
+  def testLSTMCellLayerNorm(self):
+    with self.cached_session() as sess:
+      num_units = 2
+      num_proj = 3
+      batch_size = 1
+      input_size = 4
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([batch_size, input_size])
+        c = array_ops.zeros([batch_size, num_units])
+        h = array_ops.zeros([batch_size, num_proj])
+        state = rnn_cell_impl.LSTMStateTuple(c, h)
+        cell = contrib_rnn_cell.LayerNormLSTMCell(
+            num_units=num_units,
+            num_proj=num_proj,
+            forget_bias=1.0,
+            layer_norm=True,
+            norm_gain=1.0,
+            norm_shift=0.0)
+        g, out_m = cell(x, state)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run(
+            [g, out_m], {
+                x.name: np.ones((batch_size, input_size)),
+                c.name: 0.1 * np.ones((batch_size, num_units)),
+                h.name: 0.1 * np.ones((batch_size, num_proj))
+            })
+        self.assertEqual(len(res), 2)
+        # The numbers in results were not calculated, this is mostly just a
+        # smoke test.
+        self.assertEqual(res[0].shape, (batch_size, num_proj))
+        self.assertEqual(res[1][0].shape, (batch_size, num_units))
+        self.assertEqual(res[1][1].shape, (batch_size, num_proj))
+        # Different inputs so different outputs and states
+        for i in range(1, batch_size):
+          self.assertTrue(
+              float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) < 1e-6)
+          self.assertTrue(
+              float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) < 1e-6)
+
+  def testOutputProjectionWrapper(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 3])
+        cell = legacy_rnn_cell.OutputProjectionWrapper(
+            rnn_cell_impl.GRUCell(3), 2)
+        g, new_m = cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1, 0.1]])
+        })
+        self.assertEqual(res[1].shape, (1, 3))
+        # The numbers in results were not calculated, this is just a smoke test.
+        self.assertAllClose(res[0], [[0.231907, 0.231907]])
+
+  def testInputProjectionWrapper(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 3])
+        cell = legacy_rnn_cell.InputProjectionWrapper(
+            rnn_cell_impl.GRUCell(3), num_proj=3)
+        g, new_m = cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1, 0.1]])
+        })
+        self.assertEqual(res[1].shape, (1, 3))
+        # The numbers in results were not calculated, this is just a smoke test.
+        self.assertAllClose(res[0], [[0.154605, 0.154605, 0.154605]])
+
+  def testEmbeddingWrapper(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 1], dtype=dtypes.int32)
+        m = array_ops.zeros([1, 2])
+        embedding_cell = legacy_rnn_cell.EmbeddingWrapper(
+            rnn_cell_impl.GRUCell(2), embedding_classes=3, embedding_size=2)
+        self.assertEqual(embedding_cell.output_size, 2)
+        g, new_m = embedding_cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        self.assertEqual(res[1].shape, (1, 2))
+        # The numbers in results were not calculated, this is just a smoke test.
+        self.assertAllClose(res[0], [[0.17139, 0.17139]])
+
+  def testEmbeddingWrapperWithDynamicRnn(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope("root"):
+        inputs = ops.convert_to_tensor([[[0], [0]]], dtype=dtypes.int64)
+        input_lengths = ops.convert_to_tensor([2], dtype=dtypes.int64)
+        embedding_cell = legacy_rnn_cell.EmbeddingWrapper(
+            rnn_cell_impl.BasicLSTMCell(1, state_is_tuple=True),
+            embedding_classes=1,
+            embedding_size=2)
+        outputs, _ = rnn.dynamic_rnn(
+            cell=embedding_cell,
+            inputs=inputs,
+            sequence_length=input_lengths,
+            dtype=dtypes.float32)
+        sess.run([variables.global_variables_initializer()])
+        # This will fail if output's dtype is inferred from input's.
+        sess.run(outputs)
+
+  def testSRUCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.509682, 0.509682]])
+
+  def testSRUCellKerasRNN(self):
+    """Tests that SRUCell works with keras RNN layer."""
+    cell = contrib_rnn_cell.SRUCell(10)
+    seq_input = ops.convert_to_tensor(
+        np.random.rand(2, 3, 5), name="seq_input", dtype=dtypes.float32)
+    rnn_layer = keras_layers.RNN(cell=cell)
+    rnn_outputs_keras = rnn_layer(seq_input)
+    with self.cached_session() as sess:
+      sess.run([variables.global_variables_initializer()])
+      self.assertEqual(sess.run(rnn_outputs_keras).shape, (2, 10))
+
+  def testSRUCellBiasType(self):
+    """Tests that the bias' dtype is properly set."""
+    cell = contrib_rnn_cell.SRUCell(10)
+    cell.build((2, 3, 5))
+    self.assertEqual(cell._bias.dtype, dtypes.float32_ref)
+
+    cell = contrib_rnn_cell.SRUCell(10, dtype=dtypes.int32)
+    cell.build((2, 3, 5))
+    self.assertEqual(cell._bias.dtype, dtypes.int32_ref)
+
+    cell_input = ops.convert_to_tensor(
+        np.random.rand(2, 5), name="cell_input", dtype=dtypes.float16)
+    cell_state = ops.convert_to_tensor(
+        np.random.rand(2, 10), name="cell_state", dtype=dtypes.float16)
+    cell = contrib_rnn_cell.SRUCell(10)
+    cell(cell_input, [cell_state])
+    self.assertEqual(cell._bias.dtype, dtypes.float16_ref)
+
+  def testSRUCellWithDiffSize(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.55255556, 0.55255556]])
+
   def testCoupledInputForgetGateLSTMCell(self):
     with self.cached_session() as sess:
       num_units = 2
diff --git a/tensorflow/contrib/rnn/python/ops/rnn.py b/tensorflow/contrib/rnn/python/ops/rnn.py
index 0266b72dcb15e4aba01a9a31b4be75c5b84d44da..41b1698321e20f4360d75fa2db79f9bd8a806cea 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn.py
@@ -131,7 +131,8 @@ def stack_bidirectional_dynamic_rnn(cells_fw,
                                     sequence_length=None,
                                     parallel_iterations=None,
                                     time_major=False,
-                                    scope=None):
+                                    scope=None,
+                                    swap_memory=False):
   """Creates a dynamic bidirectional recurrent neural network.
 
   Stacks several bidirectional rnn layers. The combined forward and backward
@@ -171,6 +172,10 @@ def stack_bidirectional_dynamic_rnn(cells_fw,
       data is batch-major, so by default this function accepts input and emits
       output in batch-major form.
     scope: VariableScope for the created subgraph; defaults to None.
+    swap_memory: Transparently swap the tensors produced in forward inference
+      but needed for back prop from GPU to CPU.  This allows training RNNs
+      which would typically not fit on a single GPU, with very minimal (or no)
+      performance penalty.
 
   Returns:
     A tuple (outputs, output_state_fw, output_state_bw) where:
@@ -230,6 +235,7 @@ def stack_bidirectional_dynamic_rnn(cells_fw,
             sequence_length=sequence_length,
             parallel_iterations=parallel_iterations,
             dtype=dtype,
+            swap_memory=swap_memory,
             time_major=time_major)
         # Concat the outputs to create the new input.
         prev_layer = array_ops.concat(outputs, 2)
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 482e547a16be85804beec88a91fa03b053d09b27..d25afc8b9c4381fb3b0092ef21f46646353e1b8e 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -3153,7 +3153,7 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell):
   r"""Independently Gated Recurrent Unit cell.
 
   Based on IndRNNs (https://arxiv.org/abs/1803.04831) and similar to GRUCell,
-  yet with the \(U_r\), \(U_z\), and \(U\) matrices in equations 5, 6, and
+  yet with the \\(U_r\\), \\(U_z\\), and \\(U\\) matrices in equations 5, 6, and
   8 of http://arxiv.org/abs/1406.1078 respectively replaced by diagonal
   matrices, i.e. a Hadamard product with a single vector:
 
@@ -3164,12 +3164,10 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell):
     $$\tilde{h}^{(t)}_j = \phi\left([\mathbf W \mathbf x]_j +
       [\mathbf u \circ \mathbf r \circ \mathbf h_{(t-1)}]_j\right)$$
 
-  where \(\circ\) denotes the Hadamard operator. This means that each IndyGRU
+  where \\(\circ\\) denotes the Hadamard operator. This means that each IndyGRU
   node sees only its own state, as opposed to seeing all states in the same
   layer.
 
-  TODO(gonnet): Write a paper describing this and add a reference here.
-
   Args:
     num_units: int, The number of units in the GRU cell.
     activation: Nonlinearity to use.  Default: `tanh`.
@@ -3254,7 +3252,7 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell):
     self.built = True
 
   def call(self, inputs, state):
-    """Gated recurrent unit (GRU) with nunits cells."""
+    """Recurrently independent Gated Recurrent Unit (GRU) with nunits cells."""
 
     gate_inputs = math_ops.matmul(inputs, self._gate_kernel_w) + (
         gen_array_ops.tile(state, [1, 2]) * self._gate_kernel_u)
@@ -3278,10 +3276,9 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
   r"""Basic IndyLSTM recurrent network cell.
 
   Based on IndRNNs (https://arxiv.org/abs/1803.04831) and similar to
-  BasicLSTMCell, yet with the \(U_f\), \(U_i\), \(U_o\) and \(U_c\)
-  matrices in
-  https://en.wikipedia.org/wiki/Long_short-term_memory#LSTM_with_a_forget_gate
-  replaced by diagonal matrices, i.e. a Hadamard product with a single vector:
+  BasicLSTMCell, yet with the \\(U_f\\), \\(U_i\\), \\(U_o\\) and \\(U_c\\)
+  matrices in the regular LSTM equations replaced by diagonal matrices, i.e. a
+  Hadamard product with a single vector:
 
     $$f_t = \sigma_g\left(W_f x_t + u_f \circ h_{t-1} + b_f\right)$$
     $$i_t = \sigma_g\left(W_i x_t + u_i \circ h_{t-1} + b_i\right)$$
@@ -3289,8 +3286,8 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
     $$c_t = f_t \circ c_{t-1} +
             i_t \circ \sigma_c\left(W_c x_t + u_c \circ h_{t-1} + b_c\right)$$
 
-  where \(\circ\) denotes the Hadamard operator. This means that each IndyLSTM
-  node sees only its own state \(h\) and \(c\), as opposed to seeing all
+  where \\(\circ\\) denotes the Hadamard operator. This means that each IndyLSTM
+  node sees only its own state \\(h\\) and \\(c\\), as opposed to seeing all
   states in the same layer.
 
   We add forget_bias (default: 1) to the biases of the forget gate in order to
@@ -3298,11 +3295,6 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
 
   It does not allow cell clipping, a projection layer, and does not
   use peep-hole connections: it is the basic baseline.
-
-  For advanced models, please use the full `tf.nn.rnn_cell.LSTMCell`
-  that follows.
-
-  TODO(gonnet): Write a paper describing this and add a reference here.
   """
 
   def __init__(self,
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
index 0392ed9eee79391c60318faf68d8dfd6eb64a994..a61e9579b84a60d74b73e45a6100a2c772d9cff8 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
@@ -22,5 +22,5 @@ from tensorflow.python.keras import saving
 
 
 # TODO(kathywu): Remove all contrib callers, switch to tf.keras.
-save_keras_model = saving.export
+save_keras_model = saving.export_saved_model
 load_keras_model = saving.load_from_saved_model
diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD
index 2a70b08f5c46e11e7fd83fe134741b9a241153f5..8e2ce82294287dda07d2067c5b9f012f510dbd08 100644
--- a/tensorflow/contrib/seq2seq/BUILD
+++ b/tensorflow/contrib/seq2seq/BUILD
@@ -269,4 +269,5 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
     ],
+    shard_count = 4,
 )
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index d815f81f847ad79ddcc6c6ecf5c050598e185d8d..1a5692f7b5be5e87b78dac9d1ae51f280ca089f8 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -358,7 +358,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00597103),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.6))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.4))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -387,7 +387,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052615386),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.3333333333))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.4))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -454,7 +454,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052615386),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.3333333333333333))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.4))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -696,7 +696,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0025896581),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.6))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.73333333))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -707,12 +707,12 @@ class AttentionWrapperTest(test.TestCase):
             shape=(5, 6), dtype=dtype('float32'), mean=-0.00069823361),
         time=3,
         alignments=ResultSummary(
-            shape=(5, 8), dtype=dtype('float32'), mean=0.028698336),
+            shape=(5, 8), dtype=dtype('float32'), mean=0.029914695),
         attention_state=ResultSummary(
-            shape=(5, 8), dtype=dtype('float32'), mean=0.028698336),
+            shape=(5, 8), dtype=dtype('float32'), mean=0.029914695),
         alignment_history=())
     expected_final_alignment_history = ResultSummary(
-        shape=(3, 5, 8), dtype=dtype('float32'), mean=0.04865776002407074)
+        shape=(3, 5, 8), dtype=dtype('float32'), mean=0.0465225502849)
 
     self._testWithAttention(
         create_attention_mechanism,
@@ -921,9 +921,9 @@ class AttentionWrapperTest(test.TestCase):
 
     expected_final_output = BasicDecoderOutput(
         rnn_output=ResultSummary(
-            shape=(5, 3, 20), dtype=dtype('float32'), mean=0.11723966),
+            shape=(5, 3, 20), dtype=dtype('float32'), mean=0.115853324533),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=7.266666666666667))
+            shape=(5, 3), dtype=dtype('int32'), mean=8.6))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -931,7 +931,7 @@ class AttentionWrapperTest(test.TestCase):
             h=ResultSummary(
                 shape=(5, 9), dtype=dtype('float32'), mean=-0.0018327223)),
         attention=ResultSummary(
-            shape=(5, 20), dtype=dtype('float32'), mean=0.11601614207),
+            shape=(5, 20), dtype=dtype('float32'), mean=0.11462739855),
         time=3,
         alignments=(ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py
index 7ff04e1780c4c44df14d6e87c5afdbf533ca5c90..5ee01f66f165bd2ac22cae10807f24f6b97f0c64 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py
@@ -17,14 +17,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.seq2seq.python.ops import attention_wrapper as wrapper
-from tensorflow.python.framework import ops
+from tensorflow.contrib.seq2seq.python.ops import basic_decoder
+from tensorflow.contrib.seq2seq.python.ops import sampler as sampler_py
+from tensorflow.python import keras
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import initializers
+from tensorflow.python.ops import rnn_cell
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.util import nest
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -37,13 +46,10 @@ class AttentionMechanismTest(test.TestCase, parameterized.TestCase):
     self.memory_size = 6
     self.units = 8
 
-    self.memory = ops.convert_to_tensor(
-        np.random.random((self.batch, self.timestep, self.memory_size)),
-        dtype=np.float32)
-    self.query = ops.convert_to_tensor(
-        np.random.random((self.batch, self.units)), dtype=np.float32)
-    self.state = ops.convert_to_tensor(
-        np.random.random((self.batch, self.timestep)), dtype=np.float32)
+    self.memory = np.random.randn(self.batch, self.timestep,
+                                  self.memory_size).astype(np.float32)
+    self.query = np.random.randn(self.batch, self.units).astype(np.float32)
+    self.state = np.random.randn(self.batch, self.timestep).astype(np.float32)
 
   @parameterized.named_parameters(
       ("luong", wrapper.LuongAttentionV2),
@@ -52,8 +58,8 @@ class AttentionMechanismTest(test.TestCase, parameterized.TestCase):
       ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
   )
   def test_attention_shape_inference(self, attention_cls):
-    attention = attention_cls(self.units)
-    attention_score = attention([self.query, self.state, self.memory])
+    attention = attention_cls(self.units, self.memory)
+    attention_score = attention([self.query, self.state])
     self.assertLen(attention_score, 2)
     self.assertEqual(attention_score[0].shape, (self.batch, self.timestep))
     self.assertEqual(attention_score[1].shape, (self.batch, self.timestep))
@@ -65,7 +71,7 @@ class AttentionMechanismTest(test.TestCase, parameterized.TestCase):
       ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
   )
   def test_get_config(self, attention_cls):
-    attention = attention_cls(self.units)
+    attention = attention_cls(self.units, self.memory)
     config = attention.get_config()
 
     attention_from_config = attention_cls.from_config(config)
@@ -80,9 +86,8 @@ class AttentionMechanismTest(test.TestCase, parameterized.TestCase):
       ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
   )
   def test_layer_output(self, attention_cls):
-    attention = attention_cls(self.units)
-
-    score = attention([self.query, self.state, self.memory])
+    attention = attention_cls(self.units, self.memory)
+    score = attention([self.query, self.state])
     self.evaluate(variables.variables_initializer(attention.variables))
 
     score_val = self.evaluate(score)
@@ -90,5 +95,651 @@ class AttentionMechanismTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(score_val[0].shape, (self.batch, self.timestep))
     self.assertEqual(score_val[1].shape, (self.batch, self.timestep))
 
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_passing_memory_from_call(self, attention_cls):
+    attention = attention_cls(self.units, self.memory)
+    weights_before_query = attention.get_weights()
+    ref_score = attention([self.query, self.state])
+
+    self.evaluate(variables.global_variables_initializer())
+    ref_score_val = self.evaluate(ref_score)
+
+    all_weights = attention.get_weights()
+    config = attention.get_config()
+    # Simulate the twice invocation of calls here.
+    attention_from_config = attention_cls.from_config(config)
+    attention_from_config.build(self.memory.shape)
+    attention_from_config.set_weights(weights_before_query)
+    attention_from_config(self.memory, setup_memory=True)
+    attention_from_config.build([self.query.shape, self.state.shape])
+    attention_from_config.set_weights(all_weights)
+    score = attention_from_config([self.query, self.state])
+
+    score_val = self.evaluate(score)
+    self.assertAllClose(ref_score_val, score_val)
+
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_save_load_layer(self, attention_cls):
+    vocab = 20
+    embedding_dim = 6
+    inputs = keras.layers.Input(shape=[self.timestep])
+    encoder_input = keras.layers.Embedding(
+        vocab, embedding_dim, mask_zero=True)(
+            inputs)
+    encoder_output = keras.layers.UnifiedLSTM(
+        self.memory_size, return_sequences=True)(
+            encoder_input)
+
+    attention = attention_cls(self.units, encoder_output)
+    query = keras.layers.Input(shape=[self.units])
+    state = keras.layers.Input(shape=[self.timestep])
+
+    score = attention([query, state])
+
+    x = np.random.randint(vocab, size=(self.batch, self.timestep))
+    x_test = np.random.randint(vocab, size=(self.batch, self.timestep))
+    y = np.random.randn(self.batch, self.timestep)
+    model = keras.models.Model([inputs, query, state], score)
+    model.compile("rmsprop", "mse")
+    model.fit([x, self.query, self.state], (y, y))
+    y_ref = model.predict_on_batch([x_test, self.query, self.state])
+
+    config = model.get_config()
+    weights = model.get_weights()
+    loaded_model = keras.models.Model.from_config(
+        config, custom_objects={attention_cls.__name__: attention_cls})
+    loaded_model.set_weights(weights)
+
+    y = loaded_model.predict_on_batch([x_test, self.query, self.state])
+
+    self.assertAllClose(y_ref, y)
+
+  # TODO(scottzhu): Add tests for model.compile(run_eagerly=True)
+
+
+class ResultSummary(
+    collections.namedtuple("ResultSummary", ("shape", "dtype", "mean"))):
+  pass
+
+
+def get_result_summary(x):
+  if isinstance(x, np.ndarray):
+    return ResultSummary(x.shape, x.dtype, x.mean())
+  return x
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class AttentionWrapperV2Test(test.TestCase, parameterized.TestCase):
+
+  def assertAllCloseOrEqual(self, x, y, **kwargs):
+    if isinstance(x, np.ndarray) or isinstance(x, float):
+      return super(AttentionWrapperV2Test, self).assertAllClose(
+          x, y, atol=1e-3, **kwargs)
+    else:
+      self.assertAllEqual(x, y, **kwargs)
+
+  def setUp(self):
+    super(AttentionWrapperV2Test, self).setUp()
+    self.batch = 64
+    self.units = 128
+    self.encoder_timestep = 10
+    self.encoder_dim = 256
+    self.decoder_timestep = 12
+    self.encoder_outputs = np.random.randn(self.batch, self.encoder_timestep,
+                                           self.encoder_dim)
+    self.encoder_sequence_length = np.random.randint(
+        self.encoder_timestep, size=(self.batch,)).astype(np.int32)
+    self.decoder_inputs = np.random.randn(self.batch, self.decoder_timestep,
+                                          self.units)
+    self.decoder_sequence_length = np.random.randint(
+        self.decoder_timestep, size=(self.batch,)).astype(np.int32)
+
+  def _testWithAttention(self,
+                         create_attention_mechanism,
+                         expected_final_output,
+                         expected_final_state,
+                         attention_mechanism_depth=3,
+                         alignment_history=False,
+                         expected_final_alignment_history=None,
+                         attention_layer_size=6,
+                         attention_layer=None,
+                         create_query_layer=False,
+                         create_memory_layer=True,
+                         create_attention_kwargs=None):
+    attention_layer_sizes = ([attention_layer_size]
+                             if attention_layer_size is not None else None)
+    attention_layers = ([attention_layer]
+                        if attention_layer is not None else None)
+    self._testWithMaybeMultiAttention(
+        is_multi=False,
+        create_attention_mechanisms=[create_attention_mechanism],
+        expected_final_output=expected_final_output,
+        expected_final_state=expected_final_state,
+        attention_mechanism_depths=[attention_mechanism_depth],
+        alignment_history=alignment_history,
+        expected_final_alignment_history=expected_final_alignment_history,
+        attention_layer_sizes=attention_layer_sizes,
+        attention_layers=attention_layers,
+        create_query_layer=create_query_layer,
+        create_memory_layer=create_memory_layer,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def _testWithMaybeMultiAttention(self,
+                                   is_multi,
+                                   create_attention_mechanisms,
+                                   expected_final_output,
+                                   expected_final_state,
+                                   attention_mechanism_depths,
+                                   alignment_history=False,
+                                   expected_final_alignment_history=None,
+                                   attention_layer_sizes=None,
+                                   attention_layers=None,
+                                   create_query_layer=False,
+                                   create_memory_layer=True,
+                                   create_attention_kwargs=None):
+    # Allow is_multi to be True with a single mechanism to enable test for
+    # passing in a single mechanism in a list.
+    assert len(create_attention_mechanisms) == 1 or is_multi
+    encoder_sequence_length = [3, 2, 3, 1, 1]
+    decoder_sequence_length = [2, 0, 1, 2, 3]
+    batch_size = 5
+    encoder_max_time = 8
+    decoder_max_time = 4
+    input_depth = 7
+    encoder_output_depth = 10
+    cell_depth = 9
+    create_attention_kwargs = create_attention_kwargs or {}
+
+    if attention_layer_sizes is not None:
+      # Compute sum of attention_layer_sizes. Use encoder_output_depth if None.
+      attention_depth = sum(attention_layer_size or encoder_output_depth
+                            for attention_layer_size in attention_layer_sizes)
+    elif attention_layers is not None:
+      # Compute sum of attention_layers output depth.
+      attention_depth = sum(
+          attention_layer.compute_output_shape(
+              [batch_size, cell_depth + encoder_output_depth]).dims[-1].value
+          for attention_layer in attention_layers)
+    else:
+      attention_depth = encoder_output_depth * len(create_attention_mechanisms)
+
+    decoder_inputs = np.random.randn(batch_size, decoder_max_time,
+                                     input_depth).astype(np.float32)
+    encoder_outputs = np.random.randn(batch_size, encoder_max_time,
+                                      encoder_output_depth).astype(np.float32)
+
+    attention_mechanisms = []
+    for creator, depth in zip(create_attention_mechanisms,
+                              attention_mechanism_depths):
+      # Create a memory layer with deterministic initializer to avoid randomness
+      # in the test between graph and eager.
+      if create_query_layer:
+        create_attention_kwargs["query_layer"] = keras.layers.Dense(
+            depth, kernel_initializer="ones", use_bias=False)
+      if create_memory_layer:
+        create_attention_kwargs["memory_layer"] = keras.layers.Dense(
+            depth, kernel_initializer="ones", use_bias=False)
+
+      attention_mechanisms.append(
+          creator(
+              units=depth,
+              memory=encoder_outputs,
+              memory_sequence_length=encoder_sequence_length,
+              **create_attention_kwargs))
+
+    with self.cached_session(use_gpu=True):
+      attention_layer_size = attention_layer_sizes
+      attention_layer = attention_layers
+      if not is_multi:
+        if attention_layer_size is not None:
+          attention_layer_size = attention_layer_size[0]
+        if attention_layer is not None:
+          attention_layer = attention_layer[0]
+      cell = rnn_cell.LSTMCell(cell_depth, initializer="ones")
+      cell = wrapper.AttentionWrapper(
+          cell,
+          attention_mechanisms if is_multi else attention_mechanisms[0],
+          attention_layer_size=attention_layer_size,
+          alignment_history=alignment_history,
+          attention_layer=attention_layer)
+      # Set the attention_layer within AttentionWrapper to have deterministic
+      # kernel initializer, for testing purpose.
+      if cell._attention_layers is not None:
+        for layer in cell._attention_layers:
+          if getattr(layer, "kernel_initializer") is None:
+            layer.kernel_initializer = initializers.ones()
+
+      sampler = sampler_py.TrainingSampler()
+      my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler)
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      final_outputs, final_state, _ = my_decoder(
+          decoder_inputs,
+          initial_state=initial_state,
+          sequence_length=decoder_sequence_length)
+
+      self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput)
+      self.assertIsInstance(final_state, wrapper.AttentionWrapperState)
+      self.assertIsInstance(final_state.cell_state, rnn_cell.LSTMStateTuple)
+
+      expected_time = (
+          expected_final_state.time if context.executing_eagerly() else None)
+      self.assertEqual((batch_size, expected_time, attention_depth),
+                       tuple(final_outputs.rnn_output.get_shape().as_list()))
+      self.assertEqual((batch_size, expected_time),
+                       tuple(final_outputs.sample_id.get_shape().as_list()))
+
+      self.assertEqual((batch_size, attention_depth),
+                       tuple(final_state.attention.get_shape().as_list()))
+      self.assertEqual((batch_size, cell_depth),
+                       tuple(final_state.cell_state.c.get_shape().as_list()))
+      self.assertEqual((batch_size, cell_depth),
+                       tuple(final_state.cell_state.h.get_shape().as_list()))
+
+      if alignment_history:
+        if is_multi:
+          state_alignment_history = []
+          for history_array in final_state.alignment_history:
+            history = history_array.stack()
+            self.assertEqual((expected_time, batch_size, encoder_max_time),
+                             tuple(history.get_shape().as_list()))
+            state_alignment_history.append(history)
+          state_alignment_history = tuple(state_alignment_history)
+        else:
+          state_alignment_history = final_state.alignment_history.stack()
+          self.assertEqual((expected_time, batch_size, encoder_max_time),
+                           tuple(state_alignment_history.get_shape().as_list()))
+        nest.assert_same_structure(cell.state_size,
+                                   cell.zero_state(batch_size, dtypes.float32))
+        # Remove the history from final_state for purposes of the
+        # remainder of the tests.
+        final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
+      else:
+        state_alignment_history = ()
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "final_outputs": final_outputs,
+          "final_state": final_state,
+          "state_alignment_history": state_alignment_history,
+      })
+
+      final_output_info = nest.map_structure(get_result_summary,
+                                             eval_result["final_outputs"])
+      final_state_info = nest.map_structure(get_result_summary,
+                                            eval_result["final_state"])
+      print("final_output_info: ", final_output_info)
+      print("final_state_info: ", final_state_info)
+
+      nest.map_structure(self.assertAllCloseOrEqual, expected_final_output,
+                         final_output_info)
+      nest.map_structure(self.assertAllCloseOrEqual, expected_final_state,
+                         final_state_info)
+      if alignment_history:  # by default, the wrapper emits attention as output
+        final_alignment_history_info = nest.map_structure(
+            get_result_summary, eval_result["state_alignment_history"])
+        print("final_alignment_history_info: ", final_alignment_history_info)
+        nest.map_structure(
+            self.assertAllCloseOrEqual,
+            # outputs are batch major but the stacked TensorArray is time major
+            expected_final_alignment_history,
+            final_alignment_history_info)
+
+  @parameterized.parameters([np.float16, np.float32, np.float64])
+  def _testBahdanauNormalizedDType(self, dtype):
+    encoder_outputs = self.encoder_outputs.astype(dtype)
+    decoder_inputs = self.decoder_inputs.astype(dtype)
+    attention_mechanism = wrapper.BahdanauAttentionV2(
+        units=self.units,
+        memory=encoder_outputs,
+        memory_sequence_length=self.encoder_sequence_length,
+        normalize=True,
+        dtype=dtype)
+    cell = rnn_cell.LSTMCell(self.units)
+    cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+    sampler = sampler_py.TrainingSampler()
+    my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler)
+
+    final_outputs, final_state, _ = my_decoder(
+        decoder_inputs,
+        initial_state=cell.zero_state(dtype=dtype, batch_size=self.batch),
+        sequence_length=self.decoder_sequence_length)
+    self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput)
+    self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+    self.assertIsInstance(final_state, wrapper.AttentionWrapperState)
+    self.assertIsInstance(final_state.cell_state, rnn_cell.LSTMStateTuple)
+
+  @parameterized.parameters([np.float16, np.float32, np.float64])
+  def testLuongScaledDType(self, dtype):
+    # Test case for GitHub issue 18099
+    encoder_outputs = self.encoder_outputs.astype(dtype)
+    decoder_inputs = self.decoder_inputs.astype(dtype)
+    attention_mechanism = wrapper.LuongAttentionV2(
+        units=self.units,
+        memory=encoder_outputs,
+        memory_sequence_length=self.encoder_sequence_length,
+        scale=True,
+        dtype=dtype,
+    )
+    cell = rnn_cell.LSTMCell(self.units)
+    cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+    sampler = sampler_py.TrainingSampler()
+    my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler)
+
+    final_outputs, final_state, _ = my_decoder(
+        decoder_inputs,
+        initial_state=cell.zero_state(dtype=dtype, batch_size=self.batch),
+        sequence_length=self.decoder_sequence_length)
+    self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput)
+    self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+    self.assertIsInstance(final_state, wrapper.AttentionWrapperState)
+    self.assertIsInstance(final_state.cell_state, rnn_cell.LSTMStateTuple)
+
+  def testBahdanauNotNormalized(self):
+    create_attention_mechanism = wrapper.BahdanauAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones"}
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype(np.float32), mean=4.8290324),
+        sample_id=ResultSummary(shape=(5, 3), dtype=np.dtype(np.int32), mean=0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype(np.float32), mean=1.6432636),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype(np.float32), mean=0.75866824)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype(np.float32), mean=6.7445569),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype(np.float32), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype(np.float32), mean=0.125),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype(np.float32), mean=0.125)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        alignment_history=True,
+        create_query_layer=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testBahdanauNormalized(self):
+    create_attention_mechanism = wrapper.BahdanauAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones", "normalize": True}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=3.9548259),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=0.0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=1.4652209),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.70997983)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=6.3075728),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        alignment_history=())
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        create_query_layer=True,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testLuongNotNormalized(self):
+    create_attention_mechanism = wrapper.LuongAttentionV2
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=2.6605489),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=0.0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.88403547),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.37819088)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=4.084631),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        alignment_history=())
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depth=9)
+
+  def testLuongScaled(self):
+    create_attention_mechanism = wrapper.LuongAttentionV2
+    create_attention_kwargs = {"scale": True}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=2.6605489),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=0.0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.88403547),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.37819088)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=4.0846314),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        alignment_history=())
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depth=9,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testNotUseAttentionLayer(self):
+    create_attention_mechanism = wrapper.BahdanauAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones"}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 10), dtype=np.dtype("float32"), mean=0.072406612),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=3.86666666))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=1.032002),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.61177742)),
+        attention=ResultSummary(
+            shape=(5, 10), dtype=np.dtype("float32"), mean=0.011346335),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        alignment_history=())
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_layer_size=None,
+        create_query_layer=True,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testBahdanauMonotonicNotNormalized(self):
+    create_attention_mechanism = wrapper.BahdanauMonotonicAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones"}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=5.9850435),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=0.0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=1.6752492),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.76052248)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=8.361186),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.10989678),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.10989678),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.117412611)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        create_query_layer=True,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testBahdanauMonotonicNormalized(self):
+    create_attention_mechanism = wrapper.BahdanauMonotonicAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones",
+                               "normalize": True}
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=4.5706983),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=0.0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=1.6005473),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.77863038)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=7.3326721),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.12258384),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.12258384),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.12258384)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        create_query_layer=True,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testLuongMonotonicNotNormalized(self):
+    create_attention_mechanism = wrapper.LuongMonotonicAttentionV2
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=3.159497),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=0.0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=1.072384),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.50331038)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=5.3079605),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.11467695),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.11467695),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.11899644)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depth=9,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history)
+
+  def testLuongMonotonicScaled(self):
+    create_attention_mechanism = wrapper.LuongMonotonicAttentionV2
+    create_attention_kwargs = {"scale": True}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=3.159497),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=0.0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=1.072384),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.50331038)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=5.3079605),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.11467695),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.11467695),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.11899644)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depth=9,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        create_attention_kwargs=create_attention_kwargs)
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_v2_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_v2_test.py
index 15fb688fc4dd4909e5bab36def7ac58e9d7be4ea..2341ebb77ab6ecad1e979bc8bed0080128a804da 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_v2_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_v2_test.py
@@ -127,7 +127,7 @@ class BasicDecoderTest(keras_parameterized.TestCase):
           np.argmax(eval_result["step_outputs"].rnn_output, -1),
           eval_result["step_outputs"].sample_id)
 
-  def testStepWithGreedyEmbeddingHelper(self):
+  def DISABLED_testStepWithGreedyEmbeddingHelper(self):
     batch_size = 5
     vocabulary_size = 7
     cell_depth = vocabulary_size  # cell's logits must match vocabulary size
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
index 5e28e651c666b1c448f778fc9c02d637ce817bae..56f2a0acc9f2e6f951c5df26a53a31645697da4f 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
@@ -25,10 +25,13 @@ from tensorflow.contrib.seq2seq.python.ops import attention_wrapper
 from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
 from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
 from tensorflow.contrib.seq2seq.python.ops import decoder
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import layers
 from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn_ops
@@ -530,11 +533,10 @@ class BeamSearchDecoderTest(test.TestCase):
           return (shape[1], shape[0]) + shape[2:]
         return shape
 
-      self.assertTrue(
-          isinstance(final_outputs,
-                     beam_search_decoder.FinalBeamSearchDecoderOutput))
-      self.assertTrue(
-          isinstance(final_state, beam_search_decoder.BeamSearchDecoderState))
+      self.assertIsInstance(
+          final_outputs, beam_search_decoder.FinalBeamSearchDecoderOutput)
+      self.assertIsInstance(
+          final_state, beam_search_decoder.BeamSearchDecoderState)
 
       beam_search_decoder_output = final_outputs.beam_search_decoder_output
       self.assertEqual(
@@ -574,5 +576,119 @@ class BeamSearchDecoderTest(test.TestCase):
         with_alignment_history=True)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class BeamSearchDecoderV2Test(test.TestCase):
+
+  def _testDynamicDecodeRNN(self, time_major, has_attention,
+                            with_alignment_history=False):
+    encoder_sequence_length = np.array([3, 2, 3, 1, 1])
+    decoder_sequence_length = np.array([2, 0, 1, 2, 3])
+    batch_size = 5
+    decoder_max_time = 4
+    input_depth = 7
+    cell_depth = 9
+    attention_depth = 6
+    vocab_size = 20
+    end_token = vocab_size - 1
+    start_token = 0
+    embedding_dim = 50
+    max_out = max(decoder_sequence_length)
+    output_layer = layers.Dense(vocab_size, use_bias=True, activation=None)
+    beam_width = 3
+
+    with self.cached_session():
+      batch_size_tensor = constant_op.constant(batch_size)
+      embedding = np.random.randn(vocab_size, embedding_dim).astype(np.float32)
+      cell = rnn_cell.LSTMCell(cell_depth)
+      initial_state = cell.zero_state(batch_size, dtypes.float32)
+      coverage_penalty_weight = 0.0
+      if has_attention:
+        coverage_penalty_weight = 0.2
+        inputs = array_ops.placeholder_with_default(
+            np.random.randn(batch_size, decoder_max_time, input_depth).astype(
+                np.float32),
+            shape=(None, None, input_depth))
+        tiled_inputs = beam_search_decoder.tile_batch(
+            inputs, multiplier=beam_width)
+        tiled_sequence_length = beam_search_decoder.tile_batch(
+            encoder_sequence_length, multiplier=beam_width)
+        attention_mechanism = attention_wrapper.BahdanauAttention(
+            num_units=attention_depth,
+            memory=tiled_inputs,
+            memory_sequence_length=tiled_sequence_length)
+        initial_state = beam_search_decoder.tile_batch(
+            initial_state, multiplier=beam_width)
+        cell = attention_wrapper.AttentionWrapper(
+            cell=cell,
+            attention_mechanism=attention_mechanism,
+            attention_layer_size=attention_depth,
+            alignment_history=with_alignment_history)
+      cell_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size_tensor * beam_width)
+      if has_attention:
+        cell_state = cell_state.clone(cell_state=initial_state)
+      bsd = beam_search_decoder.BeamSearchDecoderV2(
+          cell=cell,
+          beam_width=beam_width,
+          output_layer=output_layer,
+          length_penalty_weight=0.0,
+          coverage_penalty_weight=coverage_penalty_weight,
+          output_time_major=time_major,
+          maximum_iterations=max_out)
+
+      final_outputs, final_state, final_sequence_lengths = bsd(
+          embedding,
+          start_tokens=array_ops.fill([batch_size_tensor], start_token),
+          end_token=end_token,
+          initial_state=cell_state)
+
+      def _t(shape):
+        if time_major:
+          return (shape[1], shape[0]) + shape[2:]
+        return shape
+
+      self.assertIsInstance(
+          final_outputs, beam_search_decoder.FinalBeamSearchDecoderOutput)
+      self.assertIsInstance(
+          final_state, beam_search_decoder.BeamSearchDecoderState)
+
+      beam_search_decoder_output = final_outputs.beam_search_decoder_output
+      expected_seq_length = 3 if context.executing_eagerly() else None
+      self.assertEqual(
+          _t((batch_size, expected_seq_length, beam_width)),
+          tuple(beam_search_decoder_output.scores.get_shape().as_list()))
+      self.assertEqual(
+          _t((batch_size, expected_seq_length, beam_width)),
+          tuple(final_outputs.predicted_ids.get_shape().as_list()))
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_results = self.evaluate({
+          'final_outputs': final_outputs,
+          'final_sequence_lengths': final_sequence_lengths
+      })
+
+      max_sequence_length = np.max(eval_results['final_sequence_lengths'])
+
+      # A smoke test
+      self.assertEqual(
+          _t((batch_size, max_sequence_length, beam_width)),
+          eval_results['final_outputs'].beam_search_decoder_output.scores.shape)
+      self.assertEqual(
+          _t((batch_size, max_sequence_length, beam_width)), eval_results[
+              'final_outputs'].beam_search_decoder_output.predicted_ids.shape)
+
+  def testDynamicDecodeRNNBatchMajorNoAttention(self):
+    self._testDynamicDecodeRNN(time_major=False, has_attention=False)
+
+  def testDynamicDecodeRNNBatchMajorYesAttention(self):
+    self._testDynamicDecodeRNN(time_major=False, has_attention=True)
+
+  def testDynamicDecodeRNNBatchMajorYesAttentionWithAlignmentHistory(self):
+    self._testDynamicDecodeRNN(
+        time_major=False,
+        has_attention=True,
+        with_alignment_history=True)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index ae3e7f1b5d8c9f06b5defbaee9cad3810e58abd4..79c2ac2f500307ba23b6d97a7a30c6d04cea5176 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -25,10 +25,13 @@ import math
 import numpy as np
 
 from tensorflow.contrib.framework.python.framework import tensor_util
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras import initializers
 from tensorflow.python.keras import layers
+from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.layers import base as layers_base
 from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
@@ -225,18 +228,39 @@ class _BaseAttentionMechanismV2(AttentionMechanism, layers.Layer):
     1. Storing the query and memory layers.
     2. Preprocessing and storing the memory.
 
-  Note that this layer only support Keras functional API since it takes multiple
-  input tensors, which is not available in sequential model.
+  Note that this layer takes memory as its init parameter, which is an
+  anti-pattern of Keras API, we have to keep the memory as init parameter for
+  performance and dependency reason. Under the hood, during `__init__()`, it
+  will invoke `base_layer.__call__(memory, setup_memory=True)`. This will let
+  keras to keep track of the memory tensor as the input of this layer. Once
+  the `__init__()` is done, then user can query the attention by
+  `score = att_obj([query, state])`, and use it as a normal keras layer.
+
+  Special attention is needed when adding using this class as the base layer for
+  new attention:
+    1. Build() could be invoked at least twice. So please make sure weights are
+       not duplicated.
+    2. Layer.get_weights() might return different set of weights if the instance
+       has `query_layer`. The query_layer weights is not initialized until the
+       memory is configured.
+
+  Also note that this layer does not work with Keras model when
+  `model.compile(run_eagerly=True)` due to the fact that this layer is stateful.
+  The support for that will be added in a future version.
   """
 
   def __init__(self,
+               memory,
                probability_fn,
                query_layer=None,
                memory_layer=None,
+               memory_sequence_length=None,
                **kwargs):
     """Construct base AttentionMechanism class.
 
     Args:
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
       probability_fn: A `callable`. Converts the score and previous alignments
         to probabilities. Its signature should be:
         `probabilities = probability_fn(score, state)`.
@@ -247,6 +271,9 @@ class _BaseAttentionMechanismV2(AttentionMechanism, layers.Layer):
         depth must match the depth of `query_layer`.
         If `memory_layer` is not provided, the shape of `memory` must match
         that of `query_layer`.
+      memory_sequence_length (optional): Sequence lengths for the batch entries
+        in memory. If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
       **kwargs: Dictionary that contains other common arguments for layer
         creation.
     """
@@ -273,20 +300,127 @@ class _BaseAttentionMechanismV2(AttentionMechanism, layers.Layer):
     self.batch_size = None
     self._memory_initialized = False
     self._check_inner_dims_defined = True
+    self.supports_masking = True
+    self.score_mask_value = dtypes.as_dtype(self.dtype).as_numpy_dtype(-np.inf)
+
+    if memory is not None:
+      # Setup the memory by self.__call__() with memory and memory_seq_length.
+      # This will make the attention follow the keras convention which takes
+      # all the tensor inputs via __call__().
+      if memory_sequence_length is None:
+        inputs = memory
+      else:
+        inputs = [memory, memory_sequence_length]
+
+      self.values = super(_BaseAttentionMechanismV2, self).__call__(
+          inputs, setup_memory=True)
 
   def build(self, input_shape):
-    # The layer suppose to take 3 inputs, [query, state, memory].
-    query_input_shape, _, memory_input_shape = input_shape
-    if self.query_layer is not None:
-      self.query_layer.build(query_input_shape)
-    if self.memory_layer is not None:
-      self.memory_layer.build(memory_input_shape)
-    # dtype of the layer is known at this moment, create the score_mask_value if
-    # needed.
-    self.score_mask_value = dtypes.as_dtype(self.dtype).as_numpy_dtype(-np.inf)
-    self.built = True
+    if not self._memory_initialized:
+      # This is for setting up the memory, which contains memory and optional
+      # memory_sequence_length. Build the memory_layer with memory shape.
+      if self.memory_layer is not None and not self.memory_layer.built:
+        if isinstance(input_shape, list):
+          self.memory_layer.build(input_shape[0])
+        else:
+          self.memory_layer.build(input_shape)
+    else:
+      # The input_shape should be query.shape and state.shape. Use the query
+      # to init the query layer.
+      if self.query_layer is not None and not self.query_layer.built:
+        self.query_layer.build(input_shape[0])
+
+  def __call__(self, inputs, **kwargs):
+    """Preprocess the inputs before calling `base_layer.__call__()`.
+
+    Note that there are situation here, one for setup memory, and one with
+    actual query and state.
+    1. When the memory has not been configured, we just pass all the param to
+    base_layer.__call__(), which will then invoke self.call() with proper
+    inputs, which allows this class to setup memory.
+    2. When the memory has already been setup, the input should contain query
+    and state, and optionally processed memory. If the processed memory is
+    not included in the input, we will have to append it to the inputs and
+    give it to the base_layer.__call__(). The processed memory is the output
+    of first invocation of self.__call__(). If we don't add it here, then from
+    keras perspective, the graph is disconnected since the output from
+    previous call is never used.
 
-  def _setup_memory(self, memory, memory_mask=None):
+    Args:
+      inputs: the inputs tensors.
+      **kwargs: dict, other keyeword arguments for the `__call__()`
+    """
+    if self._memory_initialized:
+      if len(inputs) not in (2, 3):
+        raise ValueError("Expect the inputs to have 2 or 3 tensors, got %d" %
+                         len(inputs))
+      if len(inputs) == 2:
+        # We append the calculated memory here so that the graph will be
+        # connected.
+        inputs.append(self.values)
+    return super(_BaseAttentionMechanismV2, self).__call__(inputs, **kwargs)
+
+  def call(self, inputs, mask=None, setup_memory=False, **kwargs):
+    """Setup the memory or query the attention.
+
+    There are two case here, one for setup memory, and the second is query the
+    attention score. `setup_memory` is the flag to indicate which mode it is.
+    The input list will be treated differently based on that flag.
+
+    Args:
+      inputs: a list of tensor that could either be `query` and `state`, or
+        `memory` and `memory_sequence_length`.
+        `query` is the tensor of dtype matching `memory` and shape
+        `[batch_size, query_depth]`.
+        `state` is the tensor of dtype matching `memory` and shape
+        `[batch_size, alignments_size]`. (`alignments_size` is memory's
+        `max_time`).
+        `memory` is the memory to query; usually the output of an RNN encoder.
+        The tensor should be shaped `[batch_size, max_time, ...]`.
+        `memory_sequence_length` (optional) is the sequence lengths for the
+         batch entries in memory. If provided, the memory tensor rows are masked
+        with zeros for values past the respective sequence lengths.
+      mask: optional bool tensor with shape `[batch, max_time]` for the mask of
+        memory. If it is not None, the corresponding item of the memory should
+        be filtered out during calculation.
+      setup_memory: boolean, whether the input is for setting up memory, or
+        query attention.
+      **kwargs: Dict, other keyword arguments for the call method.
+    Returns:
+      Either processed memory or attention score, based on `setup_memory`.
+    """
+    if setup_memory:
+      if isinstance(inputs, list):
+        if len(inputs) not in (1, 2):
+          raise ValueError("Expect inputs to have 1 or 2 tensors, got %d" %
+                           len(inputs))
+        memory = inputs[0]
+        memory_sequence_length = inputs[1] if len(inputs) == 2 else None
+        memory_mask = mask
+      else:
+        memory, memory_sequence_length = inputs, None
+        memory_mask = mask
+      self._setup_memory(memory, memory_sequence_length, memory_mask)
+      # We force the self.built to false here since only memory is initialized,
+      # but the real query/state has not been call() yet. The layer should be
+      # build and call again.
+      self.built = False
+      # Return the processed memory in order to create the Keras connectivity
+      # data for it.
+      return self.values
+    else:
+      if not self._memory_initialized:
+        raise ValueError("Cannot query the attention before the setup of "
+                         "memory")
+      if len(inputs) not in (2, 3):
+        raise ValueError("Expect the inputs to have query, state, and optional "
+                         "processed memory, got %d items" % len(inputs))
+      # Ignore the rest of the inputs and only care about the query and state
+      query, state = inputs[0], inputs[1]
+      return self._calculate_attention(query, state)
+
+  def _setup_memory(self, memory, memory_sequence_length=None,
+                    memory_mask=None):
     """Pre-process the memory before actually query the memory.
 
     This should only be called once at the first invocation of call().
@@ -294,17 +428,30 @@ class _BaseAttentionMechanismV2(AttentionMechanism, layers.Layer):
     Args:
       memory: The memory to query; usually the output of an RNN encoder. This
         tensor should be shaped `[batch_size, max_time, ...]`.
-      memory_mask: The boolean tensor with shape `[batch_size, max_time]`. For
-        any value equal to False, the corresponding value in memory should be
-        ignored.
+      memory_sequence_length (optional): Sequence lengths for the batch entries
+        in memory. If provided, the memory tensor rows are masked with zeros for
+        values past the respective sequence lengths.
+      memory_mask: (Optional) The boolean tensor with shape `[batch_size,
+        max_time]`. For any value equal to False, the corresponding value in
+        memory should be ignored.
     """
     if self._memory_initialized:
       raise ValueError("The memory for the attention has already been setup.")
+    if memory_sequence_length is not None and memory_mask is not None:
+      raise ValueError("memory_sequence_length and memory_mask cannot be "
+                       "used at same time for attention.")
     with ops.name_scope(
         self.name, "BaseAttentionMechanismInit", nest.flatten(memory)):
       self.values = _prepare_memory(
-          memory, memory_mask=memory_mask,
+          memory,
+          memory_sequence_length=memory_sequence_length,
+          memory_mask=memory_mask,
           check_inner_dims_defined=self._check_inner_dims_defined)
+      # Mark the value as check since the memory and memory mask might not
+      # passed from __call__(), which does not have proper keras metadata.
+      # TODO(omalleyt): Remove this hack once the mask the has proper keras
+      # history.
+      base_layer_utils.mark_checked(self.values)
       if self.memory_layer is not None:
         self.keys = self.memory_layer(self.values)
       else:
@@ -315,36 +462,25 @@ class _BaseAttentionMechanismV2(AttentionMechanism, layers.Layer):
       self._alignments_size = (tensor_shape.dimension_value(self.keys.shape[1])
                                or array_ops.shape(self.keys)[1])
       if memory_mask is not None:
-        self.probability_fn = lambda score, prev: (  # pylint:disable=g-long-lambda
-            self.probability_fn(_maybe_mask_score(
-                score, self.score_mask_value, memory_mask=memory_mask), prev))
+        unwrapped_probability_fn = self.probability_fn
+        def _mask_probability_fn(score, prev):
+          return unwrapped_probability_fn(
+              _maybe_mask_score(
+                  score,
+                  memory_mask=memory_mask,
+                  memory_sequence_length=memory_sequence_length,
+                  score_mask_value=self.score_mask_value), prev)
+        self.probability_fn = _mask_probability_fn
     self._memory_initialized = True
 
-  def call(self, inputs, mask=None, **kwargs):
-    """Base method to calculate the attention score.
-
-    Args:
-      inputs: a list of tensor that contains `query`, `state`, and `memory`.
-        `query` is the tensor of dtype matching `memory` and shape
-        `[batch_size, query_depth]`.
-        `state` is the tensor of dtype matching `memory` and shape
-        `[batch_size, alignments_size]`. (`alignments_size` is memory's
-        `max_time`).
-        `memory` is the memory to query; usually the output of an RNN encoder.
-        This tensor should be shaped `[batch_size, max_time, feature]`.
-      mask: optional bool tensor with shape `[batch, max_time]` for the mask of
-        memory. If it is not None, the corresponding item of the memory should
-        be filtered out during calculation.
-      **kwargs: Dict, other keyword arguments for the call method.
-    """
-    query, state, memory, memory_mask = self._process_inputs(inputs, mask)
-    if not self._memory_initialized:
-      self._setup_memory(memory, memory_mask=memory_mask)
-    return self.calculate_attention(query, state)
-
-  def calculate_attention(self, query, state):
+  def _calculate_attention(self, query, state):
     raise NotImplementedError(
-        "calculate_attention need to be implemented by subclasses.")
+        "_calculate_attention need to be implemented by subclasses.")
+
+  def compute_mask(self, inputs, mask=None):
+    # There real input of the attention is query and state, and the memory layer
+    # mask shouldn't be pass down. Returning None for all output mask here.
+    return None, None
 
   def get_config(self):
     config = {}
@@ -361,16 +497,12 @@ class _BaseAttentionMechanismV2(AttentionMechanism, layers.Layer):
           "class_name": self.memory_layer.__class__.__name__,
           "config": self.memory_layer.get_config(),
       }
+    # memory is a required init parameter and its a tensor. It cannot be
+    # serialized to config, so we put a placeholder for it.
+    config["memory"] = None
     base_config = super(_BaseAttentionMechanismV2, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  def _process_inputs(self, inputs, mask):
-    if len(inputs) != 3:
-      raise ValueError(
-          "Expect to have 3 inputs for attention, got %d" % len(inputs))
-    query, state, memory = inputs
-    return query, state, memory, mask
-
   def _process_probability_fn(self, func_name):
     """Helper method to retrieve the probably function by string input."""
     valid_probability_fns = {
@@ -418,6 +550,46 @@ class _BaseAttentionMechanismV2(AttentionMechanism, layers.Layer):
   def alignments_size(self):
     return self._alignments_size
 
+  @property
+  def state_size(self):
+    return self._alignments_size
+
+  def initial_alignments(self, batch_size, dtype):
+    """Creates the initial alignment values for the `AttentionWrapper` class.
+
+    This is important for AttentionMechanisms that use the previous alignment
+    to calculate the alignment at the next time step (e.g. monotonic attention).
+
+    The default behavior is to return a tensor of all zeros.
+
+    Args:
+      batch_size: `int32` scalar, the batch_size.
+      dtype: The `dtype`.
+
+    Returns:
+      A `dtype` tensor shaped `[batch_size, alignments_size]`
+      (`alignments_size` is the values' `max_time`).
+    """
+    max_time = self._alignments_size
+    return _zero_state_tensors(max_time, batch_size, dtype)
+
+  def initial_state(self, batch_size, dtype):
+    """Creates the initial state values for the `AttentionWrapper` class.
+
+    This is important for AttentionMechanisms that use the previous alignment
+    to calculate the alignment at the next time step (e.g. monotonic attention).
+
+    The default behavior is to return the same output as initial_alignments.
+
+    Args:
+      batch_size: `int32` scalar, the batch_size.
+      dtype: The `dtype`.
+
+    Returns:
+      A structure of all-zero tensors with shapes as described by `state_size`.
+    """
+    return self.initial_alignments(batch_size, dtype)
+
 
 def _luong_score(query, keys, scale):
   """Implements Luong-style (multiplicative) scoring function.
@@ -587,6 +759,8 @@ class LuongAttentionV2(_BaseAttentionMechanismV2):
 
   def __init__(self,
                units,
+               memory,
+               memory_sequence_length=None,
                scale=False,
                probability_fn="softmax",
                dtype=None,
@@ -596,6 +770,11 @@ class LuongAttentionV2(_BaseAttentionMechanismV2):
 
     Args:
       units: The depth of the attention mechanism.
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length: (optional): Sequence lengths for the batch entries
+        in memory.  If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
       scale: Python boolean. Whether to scale the energy term.
       probability_fn: (optional) string, the name of function to convert the
         attention score to probabilities. The default is `softmax` which is
@@ -618,26 +797,27 @@ class LuongAttentionV2(_BaseAttentionMechanismV2):
     if not memory_layer:
       memory_layer = layers.Dense(
           units, name="memory_layer", use_bias=False, dtype=dtype)
+    self.units = units
+    self.scale = scale
+    self.scale_weight = None
     super(LuongAttentionV2, self).__init__(
+        memory=memory,
+        memory_sequence_length=memory_sequence_length,
         query_layer=None,
         memory_layer=memory_layer,
         probability_fn=wrapped_probability_fn,
         name=name,
         dtype=dtype,
         **kwargs)
-    self.units = units
-    self.scale = scale
 
   def build(self, input_shape):
     super(LuongAttentionV2, self).build(input_shape)
-    if self.scale:
+    if self.scale and self.scale_weight is None:
       self.scale_weight = self.add_weight(
           "attention_g", initializer=init_ops.ones_initializer, shape=())
-    else:
-      self.scale_weight = None
     self.built = True
 
-  def calculate_attention(self, query, state):
+  def _calculate_attention(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
@@ -851,8 +1031,11 @@ class BahdanauAttentionV2(_BaseAttentionMechanismV2):
 
   def __init__(self,
                units,
+               memory,
+               memory_sequence_length=None,
                normalize=False,
                probability_fn="softmax",
+               kernel_initializer="glorot_uniform",
                dtype=None,
                name="BahdanauAttention",
                **kwargs):
@@ -860,12 +1043,19 @@ class BahdanauAttentionV2(_BaseAttentionMechanismV2):
 
     Args:
       units: The depth of the query mechanism.
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length: (optional): Sequence lengths for the batch entries
+        in memory.  If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
       normalize: Python boolean.  Whether to normalize the energy term.
       probability_fn: (optional) string, the name of function to convert the
         attention score to probabilities. The default is `softmax` which is
         `tf.nn.softmax`. Other options is `hardmax`, which is hardmax() within
         this module. Any other value will result into validation error. Default
         to use `softmax`.
+      kernel_initializer: (optional), the name of the initializer for the
+        attention kernel.
       dtype: The data type for the query and memory layers of the attention
         mechanism.
       name: Name to use when creating ops.
@@ -885,33 +1075,39 @@ class BahdanauAttentionV2(_BaseAttentionMechanismV2):
     if not memory_layer:
       memory_layer = layers.Dense(
           units, name="memory_layer", use_bias=False, dtype=dtype)
+    self.units = units
+    self.normalize = normalize
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.attention_v = None
+    self.attention_g = None
+    self.attention_b = None
     super(BahdanauAttentionV2, self).__init__(
+        memory=memory,
+        memory_sequence_length=memory_sequence_length,
         query_layer=query_layer,
         memory_layer=memory_layer,
         probability_fn=wrapped_probability_fn,
         name=name,
         dtype=dtype,
         **kwargs)
-    self.units = units
-    self.normalize = normalize
 
   def build(self, input_shape):
     super(BahdanauAttentionV2, self).build(input_shape)
-    self.attention_v = self.add_weight(
-        "attention_v", [self.units], dtype=self.dtype)
-    if self.normalize:
+    if self.attention_v is None:
+      self.attention_v = self.add_weight(
+          "attention_v", [self.units],
+          dtype=self.dtype,
+          initializer=self.kernel_initializer)
+    if self.normalize and self.attention_g is None and self.attention_b is None:
       self.attention_g = self.add_weight(
           "attention_g", initializer=init_ops.constant_initializer(
               math.sqrt((1. / self.units))), shape=())
       self.attention_b = self.add_weight(
           "attention_b", shape=[self.units],
           initializer=init_ops.zeros_initializer())
-    else:
-      self.attention_g = None
-      self.attention_b = None
     self.built = True
 
-  def calculate_attention(self, query, state):
+  def _calculate_attention(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
@@ -940,6 +1136,7 @@ class BahdanauAttentionV2(_BaseAttentionMechanismV2):
         "units": self.units,
         "normalize": self.normalize,
         "probability_fn": self.probability_fn_name,
+        "kernel_initializer": initializers.serialize(self.kernel_initializer)
     }
     base_config = super(BahdanauAttentionV2, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -1299,11 +1496,14 @@ class BahdanauMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
 
   def __init__(self,
                units,
+               memory,
+               memory_sequence_length=None,
                normalize=False,
                sigmoid_noise=0.,
                sigmoid_noise_seed=None,
                score_bias_init=0.,
                mode="parallel",
+               kernel_initializer="glorot_uniform",
                dtype=None,
                name="BahdanauMonotonicAttention",
                **kwargs):
@@ -1311,6 +1511,11 @@ class BahdanauMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
 
     Args:
       units: The depth of the query mechanism.
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length: (optional): Sequence lengths for the batch entries
+        in memory.  If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
       normalize: Python boolean. Whether to normalize the energy term.
       sigmoid_noise: Standard deviation of pre-sigmoid noise. See the docstring
         for `_monotonic_probability_fn` for more information.
@@ -1321,6 +1526,8 @@ class BahdanauMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
       mode: How to compute the attention distribution. Must be one of
         'recursive', 'parallel', or 'hard'. See the docstring for
         `tf.contrib.seq2seq.monotonic_attention` for more information.
+      kernel_initializer: (optional), the name of the initializer for the
+        attention kernel.
       dtype: The data type for the query and memory layers of the attention
         mechanism.
       name: Name to use when creating ops.
@@ -1341,32 +1548,39 @@ class BahdanauMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
     if not memory_layer:
       memory_layer = layers.Dense(
           units, name="memory_layer", use_bias=False, dtype=dtype)
+    self.units = units
+    self.normalize = normalize
+    self.sigmoid_noise = sigmoid_noise
+    self.sigmoid_noise_seed = sigmoid_noise_seed
+    self.score_bias_init = score_bias_init
+    self.mode = mode
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.attention_v = None
+    self.attention_score_bias = None
+    self.attention_g = None
+    self.attention_b = None
     super(BahdanauMonotonicAttentionV2, self).__init__(
+        memory=memory,
+        memory_sequence_length=memory_sequence_length,
         query_layer=query_layer,
         memory_layer=memory_layer,
         probability_fn=wrapped_probability_fn,
         name=name,
         dtype=dtype,
         **kwargs)
-    self.units = units
-    self.normalize = normalize
-    self.sigmoid_noise = sigmoid_noise
-    self.sigmoid_noise_seed = sigmoid_noise_seed
-    self.score_bias_init = score_bias_init
-    self.mode = mode
 
   def build(self, input_shape):
     super(BahdanauMonotonicAttentionV2, self).build(input_shape)
-    self.attention_v = self.add_weight(
-        "attention_v", [self.units], dtype=self.dtype)
-    self.attention_score_bias = self.add_weight(
-        "attention_score_bias", shape=(), dtype=self.dtype,
-        initializer=init_ops.constant_initializer(
-            self.score_bias_init, dtype=self.dtype))
-    if not self.normalize:
-      self.attention_g = None
-      self.attention_b = None
-    else:
+    if self.attention_v is None:
+      self.attention_v = self.add_weight(
+          "attention_v", [self.units], dtype=self.dtype,
+          initializer=self.kernel_initializer)
+    if self.attention_score_bias is None:
+      self.attention_score_bias = self.add_weight(
+          "attention_score_bias", shape=(), dtype=self.dtype,
+          initializer=init_ops.constant_initializer(
+              self.score_bias_init, dtype=self.dtype))
+    if self.normalize and self.attention_g is None and self.attention_b is None:
       self.attention_g = self.add_weight(
           "attention_g", dtype=self.dtype,
           initializer=init_ops.constant_initializer(
@@ -1377,7 +1591,7 @@ class BahdanauMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
           initializer=init_ops.zeros_initializer())
     self.built = True
 
-  def calculate_attention(self, query, state):
+  def _calculate_attention(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
@@ -1409,6 +1623,7 @@ class BahdanauMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
         "sigmoid_noise_seed": self.sigmoid_noise_seed,
         "score_bias_init": self.score_bias_init,
         "mode": self.mode,
+        "kernel_initializer": initializers.serialize(self.kernel_initializer),
     }
     base_config = super(BahdanauMonotonicAttentionV2, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -1542,6 +1757,8 @@ class LuongMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
 
   def __init__(self,
                units,
+               memory,
+               memory_sequence_length=None,
                scale=False,
                sigmoid_noise=0.,
                sigmoid_noise_seed=None,
@@ -1554,6 +1771,11 @@ class LuongMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
 
     Args:
       units: The depth of the query mechanism.
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length: (optional): Sequence lengths for the batch entries
+        in memory.  If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
       scale: Python boolean.  Whether to scale the energy term.
       sigmoid_noise: Standard deviation of pre-sigmoid noise.  See the docstring
         for `_monotonic_probability_fn` for more information.
@@ -1580,34 +1802,37 @@ class LuongMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
     if not memory_layer:
       memory_layer = layers.Dense(
           units, name="memory_layer", use_bias=False, dtype=dtype)
+    self.units = units
+    self.scale = scale
+    self.sigmoid_noise = sigmoid_noise
+    self.sigmoid_noise_seed = sigmoid_noise_seed
+    self.score_bias_init = score_bias_init
+    self.mode = mode
+    self.attention_g = None
+    self.attention_score_bias = None
     super(LuongMonotonicAttentionV2, self).__init__(
+        memory=memory,
+        memory_sequence_length=memory_sequence_length,
         query_layer=None,
         memory_layer=memory_layer,
         probability_fn=wrapped_probability_fn,
         name=name,
         dtype=dtype,
         **kwargs)
-    self.units = units
-    self.scale = scale
-    self.sigmoid_noise = sigmoid_noise
-    self.sigmoid_noise_seed = sigmoid_noise_seed
-    self.score_bias_init = score_bias_init
-    self.mode = mode
 
   def build(self, input_shape):
     super(LuongMonotonicAttentionV2, self).build(input_shape)
-    if self.scale:
+    if self.scale and self.attention_g is None:
       self.attention_g = self.add_weight(
           "attention_g", initializer=init_ops.ones_initializer, shape=())
-    else:
-      self.attention_g = None
-    self.attention_score_bias = self.add_weight(
-        "attention_score_bias", shape=(),
-        initializer=init_ops.constant_initializer(
-            self.score_bias_init, dtype=self.dtype))
+    if self.attention_score_bias is None:
+      self.attention_score_bias = self.add_weight(
+          "attention_score_bias", shape=(),
+          initializer=init_ops.constant_initializer(
+              self.score_bias_init, dtype=self.dtype))
     self.built = True
 
-  def calculate_attention(self, query, state):
+  def _calculate_attention(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
@@ -1695,7 +1920,15 @@ class AttentionWrapperState(
     def with_same_shape(old, new):
       """Check and set new tensor's shape."""
       if isinstance(old, ops.Tensor) and isinstance(new, ops.Tensor):
-        return tensor_util.with_same_shape(old, new)
+        if not context.executing_eagerly():
+          return tensor_util.with_same_shape(old, new)
+        else:
+          if old.shape.as_list() != new.shape.as_list():
+            raise ValueError("The shape of the AttentionWrapperState is "
+                             "expected to be same as the one to clone. "
+                             "self.shape: %s, input.shape: %s" %
+                             (old.shape, new.shape))
+          return new
       return new
 
     return nest.map_structure(
@@ -1739,41 +1972,26 @@ def _prepare_memory(memory, memory_sequence_length=None, memory_mask=None,
                          "but saw shape: %s" % (m.name, m.get_shape()))
     nest.map_structure(_check_dims, memory)
   if memory_sequence_length is None and memory_mask is None:
-    seq_len_mask = None
-    seq_len_batch_size = None
+    return memory
   elif memory_sequence_length is not None:
     seq_len_mask = array_ops.sequence_mask(
         memory_sequence_length,
         maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
         dtype=nest.flatten(memory)[0].dtype)
-    seq_len_batch_size = (
-        tensor_shape.dimension_value(memory_sequence_length.shape[0])
-        or array_ops.shape(memory_sequence_length)[0])
   else:
     # For memory_mask is not None
-    seq_len_mask = memory_mask
-    seq_len_batch_size = (
-        tensor_shape.dimension_value(memory_mask.shape[0])
-        or array_ops.shape(memory_mask)[0])
+    seq_len_mask = math_ops.cast(
+        memory_mask, dtype=nest.flatten(memory)[0].dtype)
   def _maybe_mask(m, seq_len_mask):
     """Mask the memory based on the memory mask."""
     rank = m.get_shape().ndims
     rank = rank if rank is not None else array_ops.rank(m)
     extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
-    m_batch_size = tensor_shape.dimension_value(
-        m.shape[0]) or array_ops.shape(m)[0]
-    if seq_len_batch_size is not None:
-      message = ("memory_sequence_length and memory tensor batch sizes do not "
-                 "match.")
-      with ops.control_dependencies([
-          check_ops.assert_equal(
-              seq_len_batch_size, m_batch_size, message=message)]):
-        seq_len_mask = array_ops.reshape(
-            seq_len_mask,
-            array_ops.concat((array_ops.shape(seq_len_mask), extra_ones), 0))
-        return m * seq_len_mask
-    else:
-      return m
+    seq_len_mask = array_ops.reshape(
+        seq_len_mask,
+        array_ops.concat((array_ops.shape(seq_len_mask), extra_ones), 0))
+    return m * seq_len_mask
+
   return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), memory)
 
 
@@ -1819,8 +2037,14 @@ def hardmax(logits, name=None):
 def _compute_attention(attention_mechanism, cell_output, attention_state,
                        attention_layer):
   """Computes the attention and alignments for a given attention_mechanism."""
-  alignments, next_attention_state = attention_mechanism(
-      cell_output, state=attention_state)
+  if isinstance(attention_mechanism, _BaseAttentionMechanismV2):
+    alignments, next_attention_state = attention_mechanism(
+        [cell_output, attention_state])
+  else:
+    # For other class, assume they are following _BaseAttentionMechanism, which
+    # takes query and state as separate parameter.
+    alignments, next_attention_state = attention_mechanism(
+        cell_output, state=attention_state)
 
   # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
   expanded_alignments = array_ops.expand_dims(alignments, 1)
@@ -1833,13 +2057,13 @@ def _compute_attention(attention_mechanism, cell_output, attention_state,
   # the batched matmul is over memory_time, so the output shape is
   #   [batch_size, 1, memory_size].
   # we then squeeze out the singleton dim.
-  context = math_ops.matmul(expanded_alignments, attention_mechanism.values)
-  context = array_ops.squeeze(context, [1])
+  context_ = math_ops.matmul(expanded_alignments, attention_mechanism.values)
+  context_ = array_ops.squeeze(context_, [1])
 
   if attention_layer is not None:
-    attention = attention_layer(array_ops.concat([cell_output, context], 1))
+    attention = attention_layer(array_ops.concat([cell_output, context_], 1))
   else:
-    attention = context
+    attention = context_
 
   return attention, alignments, next_attention_state
 
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index 8f8f057702951094758b277ce060955f3dc6e99d..1d773a449890cd7335b2225db39d79ca958a3276 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -24,11 +24,12 @@ import numpy as np
 from tensorflow.contrib.seq2seq.python.ops import attention_wrapper
 from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
 from tensorflow.contrib.seq2seq.python.ops import decoder
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.layers import base as layers_base
+from tensorflow.python.keras import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import embedding_ops
@@ -182,11 +183,12 @@ def gather_tree_from_array(t, parent_ids, sequence_length):
   return ordered
 
 
-def _check_maybe(t):
+def _check_ndims(t):
   if t.shape.ndims is None:
     raise ValueError(
         "Expected tensor (%s) to have known rank, but ndims == None." % t)
 
+
 def _check_static_batch_beam_maybe(shape, batch_size, beam_width):
   """Raises an exception if dimensions are known statically and can not be
   reshaped to [batch_size, beam_size, -1].
@@ -205,6 +207,7 @@ def _check_static_batch_beam_maybe(shape, batch_size, beam_width):
     return False
   return True
 
+
 def _check_batch_beam(t, batch_size, beam_width):
   """Returns an Assert operation checking that the elements of the stacked
   TensorArray can be reshaped to [batch_size, beam_size, -1]. At this point,
@@ -229,70 +232,30 @@ def _check_batch_beam(t, batch_size, beam_width):
   return control_flow_ops.Assert(condition, [error_message])
 
 
+class BeamSearchDecoderMixin(object):
+  """BeamSearchDecoderMixin contains the common methods for BeamSearchDecoder.
 
-class BeamSearchDecoder(decoder.Decoder):
-  """BeamSearch sampling decoder.
-
-    **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
-    `AttentionWrapper`, then you must ensure that:
-
-    - The encoder output has been tiled to `beam_width` via
-      `tf.contrib.seq2seq.tile_batch` (NOT `tf.tile`).
-    - The `batch_size` argument passed to the `zero_state` method of this
-      wrapper is equal to `true_batch_size * beam_width`.
-    - The initial state created with `zero_state` above contains a
-      `cell_state` value containing properly tiled final state from the
-      encoder.
-
-    An example:
-
-    ```
-    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
-        encoder_outputs, multiplier=beam_width)
-    tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
-        encoder_final_state, multiplier=beam_width)
-    tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
-        sequence_length, multiplier=beam_width)
-    attention_mechanism = MyFavoriteAttentionMechanism(
-        num_units=attention_depth,
-        memory=tiled_inputs,
-        memory_sequence_length=tiled_sequence_length)
-    attention_cell = AttentionWrapper(cell, attention_mechanism, ...)
-    decoder_initial_state = attention_cell.zero_state(
-        dtype, batch_size=true_batch_size * beam_width)
-    decoder_initial_state = decoder_initial_state.clone(
-        cell_state=tiled_encoder_final_state)
-    ```
-
-    Meanwhile, with `AttentionWrapper`, coverage penalty is suggested to use
-    when computing scores(https://arxiv.org/pdf/1609.08144.pdf). It encourages
-    the translation to cover all inputs.
+  It is expected to be used a base class for concrete BeamSearchDecoder. Since
+  this is a mixin class, it is expected to be used together with other class as
+  base.
   """
 
   def __init__(self,
                cell,
-               embedding,
-               start_tokens,
-               end_token,
-               initial_state,
                beam_width,
                output_layer=None,
                length_penalty_weight=0.0,
                coverage_penalty_weight=0.0,
-               reorder_tensor_arrays=True):
-    """Initialize the BeamSearchDecoder.
+               reorder_tensor_arrays=True,
+               **kwargs):
+    """Initialize the BeamSearchDecoderMixin.
 
     Args:
       cell: An `RNNCell` instance.
-      embedding: A callable that takes a vector tensor of `ids` (argmax ids),
-        or the `params` argument for `embedding_lookup`.
-      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
-      end_token: `int32` scalar, the token that marks end of decoding.
-      initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
       beam_width:  Python integer, the number of beams.
-      output_layer: (Optional) An instance of `tf.layers.Layer`, i.e.,
-        `tf.layers.Dense`.  Optional layer to apply to the RNN output prior
-        to storing the result or sampling.
+      output_layer: (Optional) An instance of `tf.keras.layers.Layer`, i.e.,
+        `tf.keras.layers.Dense`.  Optional layer to apply to the RNN output
+        prior to storing the result or sampling.
       length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
       coverage_penalty_weight: Float weight to penalize the coverage of source
         sentence. Disabled with 0.0.
@@ -302,59 +265,35 @@ class BeamSearchDecoder(decoder.Decoder):
         Otherwise, the `TensorArray` will be returned as is. Set this flag to
         `False` if the cell state contains `TensorArray`s that are not amenable
         to reordering.
+      **kwargs: Dict, other keyword arguments for parent class.
 
     Raises:
       TypeError: if `cell` is not an instance of `RNNCell`,
-        or `output_layer` is not an instance of `tf.layers.Layer`.
-      ValueError: If `start_tokens` is not a vector or
-        `end_token` is not a scalar.
+        or `output_layer` is not an instance of `tf.keras.layers.Layer`.
     """
     rnn_cell_impl.assert_like_rnncell("cell", cell)  # pylint: disable=protected-access
     if (output_layer is not None and
-        not isinstance(output_layer, layers_base.Layer)):
+        not isinstance(output_layer, layers.Layer)):
       raise TypeError(
           "output_layer must be a Layer, received: %s" % type(output_layer))
     self._cell = cell
     self._output_layer = output_layer
     self._reorder_tensor_arrays = reorder_tensor_arrays
 
-    if callable(embedding):
-      self._embedding_fn = embedding
-    else:
-      self._embedding_fn = (
-          lambda ids: embedding_ops.embedding_lookup(embedding, ids))
-
-    self._start_tokens = ops.convert_to_tensor(
-        start_tokens, dtype=dtypes.int32, name="start_tokens")
-    if self._start_tokens.get_shape().ndims != 1:
-      raise ValueError("start_tokens must be a vector")
-    self._end_token = ops.convert_to_tensor(
-        end_token, dtype=dtypes.int32, name="end_token")
-    if self._end_token.get_shape().ndims != 0:
-      raise ValueError("end_token must be a scalar")
-
-    self._batch_size = array_ops.size(start_tokens)
+    self._start_tokens = None
+    self._end_token = None
+    self._batch_size = None
     self._beam_width = beam_width
     self._length_penalty_weight = length_penalty_weight
     self._coverage_penalty_weight = coverage_penalty_weight
-    self._initial_cell_state = nest.map_structure(
-        self._maybe_split_batch_beams, initial_state, self._cell.state_size)
-    self._start_tokens = array_ops.tile(
-        array_ops.expand_dims(self._start_tokens, 1), [1, self._beam_width])
-    self._start_inputs = self._embedding_fn(self._start_tokens)
-
-    self._finished = array_ops.one_hot(
-        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
-        depth=self._beam_width,
-        on_value=False,
-        off_value=True,
-        dtype=dtypes.bool)
+    super(BeamSearchDecoderMixin, self).__init__(**kwargs)
 
   @property
   def batch_size(self):
     return self._batch_size
 
   def _rnn_output_size(self):
+    """Get the output shape from the RNN layer."""
     size = self._cell.output_size
     if self._output_layer is None:
       return size
@@ -393,50 +332,6 @@ class BeamSearchDecoder(decoder.Decoder):
         predicted_ids=tensor_shape.TensorShape([self._beam_width]),
         parent_ids=tensor_shape.TensorShape([self._beam_width]))
 
-  @property
-  def output_dtype(self):
-    # Assume the dtype of the cell is the output_size structure
-    # containing the input_state's first component's dtype.
-    # Return that structure and int32 (the id)
-    dtype = nest.flatten(self._initial_cell_state)[0].dtype
-    return BeamSearchDecoderOutput(
-        scores=nest.map_structure(lambda _: dtype, self._rnn_output_size()),
-        predicted_ids=dtypes.int32,
-        parent_ids=dtypes.int32)
-
-  def initialize(self, name=None):
-    """Initialize the decoder.
-
-    Args:
-      name: Name scope for any created operations.
-
-    Returns:
-      `(finished, start_inputs, initial_state)`.
-    """
-    finished, start_inputs = self._finished, self._start_inputs
-
-    dtype = nest.flatten(self._initial_cell_state)[0].dtype
-    log_probs = array_ops.one_hot(  # shape(batch_sz, beam_sz)
-        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
-        depth=self._beam_width,
-        on_value=ops.convert_to_tensor(0.0, dtype=dtype),
-        off_value=ops.convert_to_tensor(-np.Inf, dtype=dtype),
-        dtype=dtype)
-    init_attention_probs = get_attention_probs(
-        self._initial_cell_state, self._coverage_penalty_weight)
-    if init_attention_probs is None:
-      init_attention_probs = ()
-
-    initial_state = BeamSearchDecoderState(
-        cell_state=self._initial_cell_state,
-        log_probs=log_probs,
-        finished=finished,
-        lengths=array_ops.zeros(
-            [self._batch_size, self._beam_width], dtype=dtypes.int64),
-        accumulated_attention_probs=init_attention_probs)
-
-    return (finished, start_inputs, initial_state)
-
   def finalize(self, outputs, final_state, sequence_lengths):
     """Finalize and return the predicted_ids.
 
@@ -562,7 +457,7 @@ class BeamSearchDecoder(decoder.Decoder):
     """
     if isinstance(t, tensor_array_ops.TensorArray):
       return t
-    _check_maybe(t)
+    _check_ndims(t)
     if t.shape.ndims >= 1:
       return self._split_batch_beams(t, s)
     else:
@@ -586,7 +481,7 @@ class BeamSearchDecoder(decoder.Decoder):
     """
     if isinstance(t, tensor_array_ops.TensorArray):
       return t
-    _check_maybe(t)
+    _check_ndims(t)
     if t.shape.ndims >= 2:
       return self._merge_batch_beams(t, s)
     else:
@@ -609,11 +504,18 @@ class BeamSearchDecoder(decoder.Decoder):
     if not isinstance(t, tensor_array_ops.TensorArray):
       return t
     # pylint: disable=protected-access
-    if (not t._infer_shape or not t._element_shape
-        or t._element_shape[0].ndims is None
-        or t._element_shape[0].ndims < 1):
+    # This is a bad hack due to the implementation detail of eager/graph TA.
+    # TODO(b/124374427): Update this to use public property of TensorArray.
+    if context.executing_eagerly():
+      element_shape = t._element_shape
+    else:
+      element_shape = t._element_shape[0]
+    if (not t._infer_shape
+        or not t._element_shape
+        or element_shape.ndims is None
+        or element_shape.ndims < 1):
       shape = (
-          t._element_shape[0] if t._infer_shape and t._element_shape
+          element_shape if t._infer_shape and t._element_shape
           else tensor_shape.TensorShape(None))
       tf_logging.warn("The TensorArray %s in the cell state is not amenable to "
                       "sorting based on the beam search result. For a "
@@ -621,10 +523,10 @@ class BeamSearchDecoder(decoder.Decoder):
                       "defined and have at least a rank of 1, but saw shape: %s"
                       % (t.handle.name, shape))
       return t
-    shape = t._element_shape[0]
     # pylint: enable=protected-access
     if not _check_static_batch_beam_maybe(
-        shape, tensor_util.constant_value(self._batch_size), self._beam_width):
+        element_shape, tensor_util.constant_value(self._batch_size),
+        self._beam_width):
       return t
     t = t.stack()
     with ops.control_dependencies(
@@ -684,6 +586,359 @@ class BeamSearchDecoder(decoder.Decoder):
     return (beam_search_output, beam_search_state, next_inputs, finished)
 
 
+class BeamSearchDecoder(BeamSearchDecoderMixin, decoder.Decoder):
+  # Note that the inheritance hierarchy is important here. The Mixin has to be
+  # the first parent class since we will use super().__init__(), and Mixin which
+  # is a object will properly invoke the __init__ method of other parent class.
+  """BeamSearch sampling decoder.
+
+    **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
+    `AttentionWrapper`, then you must ensure that:
+
+    - The encoder output has been tiled to `beam_width` via
+      `tf.contrib.seq2seq.tile_batch` (NOT `tf.tile`).
+    - The `batch_size` argument passed to the `zero_state` method of this
+      wrapper is equal to `true_batch_size * beam_width`.
+    - The initial state created with `zero_state` above contains a
+      `cell_state` value containing properly tiled final state from the
+      encoder.
+
+    An example:
+
+    ```
+    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
+        encoder_outputs, multiplier=beam_width)
+    tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
+        encoder_final_state, multiplier=beam_width)
+    tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
+        sequence_length, multiplier=beam_width)
+    attention_mechanism = MyFavoriteAttentionMechanism(
+        num_units=attention_depth,
+        memory=tiled_inputs,
+        memory_sequence_length=tiled_sequence_length)
+    attention_cell = AttentionWrapper(cell, attention_mechanism, ...)
+    decoder_initial_state = attention_cell.zero_state(
+        dtype, batch_size=true_batch_size * beam_width)
+    decoder_initial_state = decoder_initial_state.clone(
+        cell_state=tiled_encoder_final_state)
+    ```
+
+    Meanwhile, with `AttentionWrapper`, coverage penalty is suggested to use
+    when computing scores (https://arxiv.org/pdf/1609.08144.pdf). It encourages
+    the decoder to cover all inputs.
+  """
+
+  def __init__(self,
+               cell,
+               embedding,
+               start_tokens,
+               end_token,
+               initial_state,
+               beam_width,
+               output_layer=None,
+               length_penalty_weight=0.0,
+               coverage_penalty_weight=0.0,
+               reorder_tensor_arrays=True):
+    """Initialize the BeamSearchDecoder.
+
+    Args:
+      cell: An `RNNCell` instance.
+      embedding: A callable that takes a vector tensor of `ids` (argmax ids),
+        or the `params` argument for `embedding_lookup`.
+      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
+      end_token: `int32` scalar, the token that marks end of decoding.
+      initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
+      beam_width:  Python integer, the number of beams.
+      output_layer: (Optional) An instance of `tf.keras.layers.Layer`, i.e.,
+        `tf.keras.layers.Dense`.  Optional layer to apply to the RNN output
+        prior to storing the result or sampling.
+      length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
+      coverage_penalty_weight: Float weight to penalize the coverage of source
+        sentence. Disabled with 0.0.
+      reorder_tensor_arrays: If `True`, `TensorArray`s' elements within the cell
+        state will be reordered according to the beam search path. If the
+        `TensorArray` can be reordered, the stacked form will be returned.
+        Otherwise, the `TensorArray` will be returned as is. Set this flag to
+        `False` if the cell state contains `TensorArray`s that are not amenable
+        to reordering.
+
+    Raises:
+      TypeError: if `cell` is not an instance of `RNNCell`,
+        or `output_layer` is not an instance of `tf.keras.layers.Layer`.
+      ValueError: If `start_tokens` is not a vector or
+        `end_token` is not a scalar.
+    """
+    super(BeamSearchDecoder, self).__init__(
+        cell,
+        beam_width,
+        output_layer=output_layer,
+        length_penalty_weight=length_penalty_weight,
+        coverage_penalty_weight=coverage_penalty_weight,
+        reorder_tensor_arrays=reorder_tensor_arrays)
+
+    if callable(embedding):
+      self._embedding_fn = embedding
+    else:
+      self._embedding_fn = (
+          lambda ids: embedding_ops.embedding_lookup(embedding, ids))
+
+    self._start_tokens = ops.convert_to_tensor(
+        start_tokens, dtype=dtypes.int32, name="start_tokens")
+    if self._start_tokens.get_shape().ndims != 1:
+      raise ValueError("start_tokens must be a vector")
+    self._end_token = ops.convert_to_tensor(
+        end_token, dtype=dtypes.int32, name="end_token")
+    if self._end_token.get_shape().ndims != 0:
+      raise ValueError("end_token must be a scalar")
+
+    self._batch_size = array_ops.size(start_tokens)
+    self._initial_cell_state = nest.map_structure(
+        self._maybe_split_batch_beams, initial_state, self._cell.state_size)
+    self._start_tokens = array_ops.tile(
+        array_ops.expand_dims(self._start_tokens, 1), [1, self._beam_width])
+    self._start_inputs = self._embedding_fn(self._start_tokens)
+
+    self._finished = array_ops.one_hot(
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width,
+        on_value=False,
+        off_value=True,
+        dtype=dtypes.bool)
+
+  def initialize(self, name=None):
+    """Initialize the decoder.
+
+    Args:
+      name: Name scope for any created operations.
+
+    Returns:
+      `(finished, start_inputs, initial_state)`.
+    """
+    finished, start_inputs = self._finished, self._start_inputs
+
+    dtype = nest.flatten(self._initial_cell_state)[0].dtype
+    log_probs = array_ops.one_hot(  # shape(batch_sz, beam_sz)
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width,
+        on_value=ops.convert_to_tensor(0.0, dtype=dtype),
+        off_value=ops.convert_to_tensor(-np.Inf, dtype=dtype),
+        dtype=dtype)
+    init_attention_probs = get_attention_probs(
+        self._initial_cell_state, self._coverage_penalty_weight)
+    if init_attention_probs is None:
+      init_attention_probs = ()
+
+    initial_state = BeamSearchDecoderState(
+        cell_state=self._initial_cell_state,
+        log_probs=log_probs,
+        finished=finished,
+        lengths=array_ops.zeros(
+            [self._batch_size, self._beam_width], dtype=dtypes.int64),
+        accumulated_attention_probs=init_attention_probs)
+
+    return (finished, start_inputs, initial_state)
+
+  @property
+  def output_dtype(self):
+    # Assume the dtype of the cell is the output_size structure
+    # containing the input_state's first component's dtype.
+    # Return that structure and int32 (the id)
+    dtype = nest.flatten(self._initial_cell_state)[0].dtype
+    return BeamSearchDecoderOutput(
+        scores=nest.map_structure(lambda _: dtype, self._rnn_output_size()),
+        predicted_ids=dtypes.int32,
+        parent_ids=dtypes.int32)
+
+
+class BeamSearchDecoderV2(BeamSearchDecoderMixin, decoder.BaseDecoder):
+  # Note that the inheritance hierarchy is important here. The Mixin has to be
+  # the first parent class since we will use super().__init__(), and Mixin which
+  # is a object will properly invoke the __init__ method of other parent class.
+  """BeamSearch sampling decoder.
+
+    **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
+    `AttentionWrapper`, then you must ensure that:
+
+    - The encoder output has been tiled to `beam_width` via
+      `tf.contrib.seq2seq.tile_batch` (NOT `tf.tile`).
+    - The `batch_size` argument passed to the `zero_state` method of this
+      wrapper is equal to `true_batch_size * beam_width`.
+    - The initial state created with `zero_state` above contains a
+      `cell_state` value containing properly tiled final state from the
+      encoder.
+
+    An example:
+
+    ```
+    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
+        encoder_outputs, multiplier=beam_width)
+    tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
+        encoder_final_state, multiplier=beam_width)
+    tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
+        sequence_length, multiplier=beam_width)
+    attention_mechanism = MyFavoriteAttentionMechanism(
+        num_units=attention_depth,
+        memory=tiled_inputs,
+        memory_sequence_length=tiled_sequence_length)
+    attention_cell = AttentionWrapper(cell, attention_mechanism, ...)
+    decoder_initial_state = attention_cell.zero_state(
+        dtype, batch_size=true_batch_size * beam_width)
+    decoder_initial_state = decoder_initial_state.clone(
+        cell_state=tiled_encoder_final_state)
+    ```
+
+    Meanwhile, with `AttentionWrapper`, coverage penalty is suggested to use
+    when computing scores (https://arxiv.org/pdf/1609.08144.pdf). It encourages
+    the decoding to cover all inputs.
+  """
+
+  def __init__(self,
+               cell,
+               beam_width,
+               embedding_fn=None,
+               output_layer=None,
+               length_penalty_weight=0.0,
+               coverage_penalty_weight=0.0,
+               reorder_tensor_arrays=True,
+               **kwargs):
+    """Initialize the BeamSearchDecoderV2.
+
+    Args:
+      cell: An `RNNCell` instance.
+      beam_width:  Python integer, the number of beams.
+      embedding_fn: A callable that takes a vector tensor of `ids` (argmax ids).
+      output_layer: (Optional) An instance of `tf.keras.layers.Layer`, i.e.,
+        `tf.keras.layers.Dense`.  Optional layer to apply to the RNN output
+        prior to storing the result or sampling.
+      length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
+      coverage_penalty_weight: Float weight to penalize the coverage of source
+        sentence. Disabled with 0.0.
+      reorder_tensor_arrays: If `True`, `TensorArray`s' elements within the cell
+        state will be reordered according to the beam search path. If the
+        `TensorArray` can be reordered, the stacked form will be returned.
+        Otherwise, the `TensorArray` will be returned as is. Set this flag to
+        `False` if the cell state contains `TensorArray`s that are not amenable
+        to reordering.
+      **kwargs: Dict, other keyword arguments for initialization.
+
+    Raises:
+      TypeError: if `cell` is not an instance of `RNNCell`,
+        or `output_layer` is not an instance of `tf.keras.layers.Layer`.
+    """
+    super(BeamSearchDecoderV2, self).__init__(
+        cell,
+        beam_width,
+        output_layer=output_layer,
+        length_penalty_weight=length_penalty_weight,
+        coverage_penalty_weight=coverage_penalty_weight,
+        reorder_tensor_arrays=reorder_tensor_arrays,
+        **kwargs)
+
+    if embedding_fn is None or callable(embedding_fn):
+      self._embedding_fn = embedding_fn
+    else:
+      raise ValueError("embedding_fn is expected to be a callable, got %s" %
+                       type(embedding_fn))
+
+  def initialize(self,
+                 embedding,
+                 start_tokens,
+                 end_token,
+                 initial_state):
+    """Initialize the decoder.
+
+    Args:
+      embedding: A tensor from the embedding layer output, which is the
+        `params` argument for `embedding_lookup`.
+      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
+      end_token: `int32` scalar, the token that marks end of decoding.
+      initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
+    Returns:
+      `(finished, start_inputs, initial_state)`.
+    Raises:
+      ValueError: If `start_tokens` is not a vector or `end_token` is not a
+        scalar.
+    """
+    if embedding is not None and self._embedding_fn is not None:
+      raise ValueError(
+          "embedding and embedding_fn cannot be provided at same time")
+    elif embedding is not None:
+      self._embedding_fn = (
+          lambda ids: embedding_ops.embedding_lookup(embedding, ids))
+
+    self._start_tokens = ops.convert_to_tensor(
+        start_tokens, dtype=dtypes.int32, name="start_tokens")
+    if self._start_tokens.get_shape().ndims != 1:
+      raise ValueError("start_tokens must be a vector")
+    self._end_token = ops.convert_to_tensor(
+        end_token, dtype=dtypes.int32, name="end_token")
+    if self._end_token.get_shape().ndims != 0:
+      raise ValueError("end_token must be a scalar")
+
+    self._batch_size = array_ops.size(start_tokens)
+    self._initial_cell_state = nest.map_structure(
+        self._maybe_split_batch_beams, initial_state, self._cell.state_size)
+    self._start_tokens = array_ops.tile(
+        array_ops.expand_dims(self._start_tokens, 1), [1, self._beam_width])
+    self._start_inputs = self._embedding_fn(self._start_tokens)
+
+    self._finished = array_ops.one_hot(
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width,
+        on_value=False,
+        off_value=True,
+        dtype=dtypes.bool)
+
+    finished, start_inputs = self._finished, self._start_inputs
+
+    dtype = nest.flatten(self._initial_cell_state)[0].dtype
+    log_probs = array_ops.one_hot(  # shape(batch_sz, beam_sz)
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width,
+        on_value=ops.convert_to_tensor(0.0, dtype=dtype),
+        off_value=ops.convert_to_tensor(-np.Inf, dtype=dtype),
+        dtype=dtype)
+    init_attention_probs = get_attention_probs(
+        self._initial_cell_state, self._coverage_penalty_weight)
+    if init_attention_probs is None:
+      init_attention_probs = ()
+
+    initial_state = BeamSearchDecoderState(
+        cell_state=self._initial_cell_state,
+        log_probs=log_probs,
+        finished=finished,
+        lengths=array_ops.zeros(
+            [self._batch_size, self._beam_width], dtype=dtypes.int64),
+        accumulated_attention_probs=init_attention_probs)
+
+    return (finished, start_inputs, initial_state)
+
+  @property
+  def output_dtype(self):
+    # Assume the dtype of the cell is the output_size structure
+    # containing the input_state's first component's dtype.
+    # Return that structure and int32 (the id)
+    dtype = nest.flatten(self._initial_cell_state)[0].dtype
+    return BeamSearchDecoderOutput(
+        scores=nest.map_structure(lambda _: dtype, self._rnn_output_size()),
+        predicted_ids=dtypes.int32,
+        parent_ids=dtypes.int32)
+
+  def call(self, embeddning, start_tokens, end_token, initial_state, **kwargs):
+    init_kwargs = kwargs
+    init_kwargs["start_tokens"] = start_tokens
+    init_kwargs["end_token"] = end_token
+    init_kwargs["initial_state"] = initial_state
+    return decoder.dynamic_decode(self,
+                                  output_time_major=self.output_time_major,
+                                  impute_finished=self.impute_finished,
+                                  maximum_iterations=self.maximum_iterations,
+                                  parallel_iterations=self.parallel_iterations,
+                                  swap_memory=self.swap_memory,
+                                  decoder_init_input=embeddning,
+                                  decoder_init_kwargs=init_kwargs)
+
+
 def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
                       beam_width, end_token, length_penalty_weight,
                       coverage_penalty_weight):
@@ -1068,7 +1323,7 @@ def _maybe_tensor_gather_helper(gather_indices, gather_from, batch_size,
   """
   if isinstance(gather_from, tensor_array_ops.TensorArray):
     return gather_from
-  _check_maybe(gather_from)
+  _check_ndims(gather_from)
   if gather_from.shape.ndims >= len(gather_shape):
     return _tensor_gather_helper(
         gather_indices=gather_indices,
diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD
index f88b03ec4c2b1f250091594ea12d7d1862029fa2..7dd52df6b68caea6111813837ba1e872acbeccdb 100644
--- a/tensorflow/contrib/summary/BUILD
+++ b/tensorflow/contrib/summary/BUILD
@@ -4,17 +4,14 @@ exports_files([
     "LICENSE",
 ])
 
-load(
-    "//tensorflow:tensorflow.bzl",
-    "py_test",
-    "tf_gen_op_wrapper_py",
-)
+load("//tensorflow:tensorflow.bzl", "py_test")
 
 py_test(
     name = "summary_ops_test",
     srcs = ["summary_ops_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":summary",
         ":summary_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:errors",
@@ -22,7 +19,6 @@ py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform",
         "//tensorflow/python:state_ops",
-        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python/eager:function",
         "//tensorflow/python/eager:test",
@@ -35,6 +31,7 @@ py_test(
     srcs = ["summary_ops_graph_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":summary",
         ":summary_test_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -43,7 +40,6 @@ py_test(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
         "@six_archive//:six",
diff --git a/tensorflow/contrib/summary/summary_ops_graph_test.py b/tensorflow/contrib/summary/summary_ops_graph_test.py
index 807741e05f92f6b666c175269742dc1af50c0054..8e13f7f56b23e47f046120b285b1519c6371ddab 100644
--- a/tensorflow/contrib/summary/summary_ops_graph_test.py
+++ b/tensorflow/contrib/summary/summary_ops_graph_test.py
@@ -22,6 +22,7 @@ import time
 
 import six
 
+from tensorflow.contrib.summary import summary as summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
@@ -32,7 +33,6 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index 10e4556dacbc17ec02c2bd698389b04d517d7076..27bfdeb3601f4fdb9897feee509b06d5e8f9b873 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -25,6 +25,7 @@ import sqlite3
 import numpy as np
 import six
 
+from tensorflow.contrib.summary import summary as summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
@@ -36,7 +37,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.training import training_util
 
diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD
index 398ac314f4b520610ec100273b37c33bc4b5b43a..583bbf97c57cf263f65bc3b0a56b32cc2dce5482 100644
--- a/tensorflow/contrib/tensor_forest/BUILD
+++ b/tensorflow/contrib/tensor_forest/BUILD
@@ -537,8 +537,9 @@ py_library(
 
 py_test(
     name = "random_forest_test",
-    size = "large",
+    size = "medium",
     srcs = ["client/random_forest_test.py"],
+    shard_count = 6,
     srcs_version = "PY2AND3",
     tags = [
         "noasan",
diff --git a/tensorflow/contrib/tensor_forest/kernels/model_ops.cc b/tensorflow/contrib/tensor_forest/kernels/model_ops.cc
index b9aad36f3d25b9fb7b8b525be54fb7a39394b373..76b1d2b4da269cda71f5b49878f2933d7d9b5776 100644
--- a/tensorflow/contrib/tensor_forest/kernels/model_ops.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/model_ops.cc
@@ -304,7 +304,7 @@ class TraverseTreeV4Op : public OpKernel {
     auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
     int num_threads = worker_threads->num_threads;
     const int64 costPerTraverse = 500;
-    auto traverse = [this, &set_leaf_ids, &data_set, decision_tree_resource,
+    auto traverse = [&set_leaf_ids, &data_set, decision_tree_resource,
                      num_data](int64 start, int64 end) {
       CHECK(start <= end);
       CHECK(end <= num_data);
diff --git a/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc b/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc
index fe2c91c1047fe56710b1a86b2fa3206caf6ff3bc..0243f106814511c1b53a5aacb830b845214a00a3 100644
--- a/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc
@@ -307,7 +307,7 @@ class ProcessInputOp : public OpKernel {
     // from a digits run on local desktop.  Heuristics might be necessary
     // if it really matters that much.
     const int64 costPerUpdate = 1000;
-    auto update = [this, &target, &leaf_ids_tensor, &num_targets, &data_set,
+    auto update = [&target, &leaf_ids_tensor, &num_targets, &data_set,
                    fertile_stats_resource, &locks, &set_lock, &ready_to_split,
                    num_data](int64 start, int64 end) {
       CHECK(start <= end);
@@ -317,7 +317,7 @@ class ProcessInputOp : public OpKernel {
                   static_cast<int32>(end), &ready_to_split);
     };
 
-    auto update_collated = [this, &target, &num_targets, fertile_stats_resource,
+    auto update_collated = [&target, &num_targets, fertile_stats_resource,
                             tree_resource, &leaf_examples, &set_lock,
                             &ready_to_split, &data_set,
                             num_leaves](int64 start, int64 end) {
diff --git a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
index 290c16fe3966791ea78986539750caf938a37322..40bf7081a3f22dfd68fd46f0f61695ee9ca7863b 100644
--- a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
+++ b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
@@ -35,7 +35,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 
 _model_ops = loader.load_op_library(
diff --git a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
index 9184198cd4c8fd2a7609714d094d5ef2b6868658..80afcfb251f4d6455a9eb8ba5df4a6e43d2feb1c 100644
--- a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
+++ b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
@@ -32,7 +32,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 
 _stats_ops = loader.load_op_library(
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 42bf1eda5179d0f72f4fd8432e6b5684f8e46917..4a959378138dec6f1c1a3f490704d7aebeae9b47 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -21,20 +21,21 @@ from __future__ import print_function
 from tensorflow.python.compiler.tensorrt import trt_convert
 
 
-def create_inference_graph(input_graph_def,
-                           outputs,
-                           max_batch_size=1,
-                           max_workspace_size_bytes=2 << 20,
-                           precision_mode=trt_convert.TrtPrecisionMode.FP32,
-                           minimum_segment_size=3,
-                           is_dynamic_op=False,
-                           maximum_cached_engines=1,
-                           cached_engine_batches=None,
-                           use_calibration=True,
-                           input_saved_model_dir=None,
-                           input_saved_model_tags=None,
-                           output_saved_model_dir=None,
-                           session_config=None):
+def create_inference_graph(
+    input_graph_def,
+    outputs,
+    max_batch_size=1,
+    max_workspace_size_bytes=trt_convert.DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
+    precision_mode=trt_convert.TrtPrecisionMode.FP32,
+    minimum_segment_size=3,
+    is_dynamic_op=False,
+    maximum_cached_engines=1,
+    cached_engine_batches=None,
+    use_calibration=True,
+    input_saved_model_dir=None,
+    input_saved_model_tags=None,
+    output_saved_model_dir=None,
+    session_config=None):
   return trt_convert.create_inference_graph(
       input_graph_def=input_graph_def,
       outputs=outputs,
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index d1be31ddc799ce4c4ef9baa15729fde7925f2f6c..4ba814b9e3d3621f9ab924961e2740885fa93b33 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -161,7 +161,10 @@ py_test(
     ],
     shard_count = 10,
     srcs_version = "PY2AND3",
-    tags = ["no_pip_gpu"],  # b/63391119
+    tags = [
+        "no_pip_gpu",  # b/63391119
+        "notap",  # b/124520733
+    ],
     deps = [
         ":estimators",
         ":feature_keys",
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 1859dee9d08ac4a8f3f496222d537b622c65621e..7c1661d20f15f94a929a46dafc79d59ca73e53cb 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -23,17 +23,12 @@ package(
     ],
 )
 
-cc_library(
-    name = "all_ops",
+py_library(
+    name = "tpu_py",
+    srcs = ["python/ops/tpu_ops.py"],
+    srcs_version = "PY2AND3",
     deps = [
-        ":cross_replica_ops_op_lib",
-        ":heartbeat_ops_op_lib",
-        ":host_compute_ops_op_lib",
-        ":infeed_ops_op_lib",
-        ":outfeed_ops_op_lib",
-        ":replication_ops_op_lib",
-        ":tpu_configuration_ops_op_lib",
-        ":tpu_embedding_ops_op_lib",
+        "//tensorflow/python/tpu:tpu_py",
     ],
 )
 
@@ -42,19 +37,7 @@ py_library(
     srcs = ["python/tpu/async_checkpoint.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:summary",
-        "//tensorflow/python:summary_ops_v2",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/tpu:async_checkpoint",
     ],
 )
 
@@ -75,145 +58,20 @@ py_library(
         ":functional",
         ":tpu_embedding",
         ":tpu_lib",
-        ":tpu_ordinal_selector_py",
         "//tensorflow/contrib/training:training_py",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:function",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:session",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:summary",
-        "//tensorflow/python:summary_ops_v2",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/estimator:estimator_py",
-        "//tensorflow/python/estimator:util",
-        "@six_archive//:six",
-    ],
-)
-
-tf_gen_op_libs(
-    op_lib_names = [
-        "cross_replica_ops",
-        "heartbeat_ops",
-        "host_compute_ops",
-        "infeed_ops",
-        "outfeed_ops",
-        "replication_ops",
-        "tpu_configuration_ops",
-        "tpu_embedding_ops",
-        "tpu_ordinal_selector_op",
-        "functional_ops",
-    ],
-    deps = [
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_cc",
-        "//tensorflow/contrib/tpu/utils:tpu_embedding_optimization_parameters_utils",
-        "//tensorflow/contrib/tpu/utils:tpu_embedding_output_layout_utils",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_proto_parsing",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_custom_op_library(
-    name = "python/ops/_tpu_ops.so",
-    srcs = [
-        "ops/cross_replica_ops.cc",
-        "ops/heartbeat_ops.cc",
-        "ops/host_compute_ops.cc",
-        "ops/infeed_ops.cc",
-        "ops/outfeed_ops.cc",
-        "ops/replication_ops.cc",
-        "ops/tpu_configuration_ops.cc",
-        "ops/tpu_embedding_ops.cc",
-    ],
-    deps = [
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_cc",
-        "//tensorflow/contrib/tpu/utils:tpu_embedding_optimization_parameters_utils",
-        "//tensorflow/contrib/tpu/utils:tpu_embedding_output_layout_utils",
-        "//tensorflow/core:lib_proto_parsing",
-    ],
-)
-
-tf_gen_op_wrapper_py(
-    name = "tpu_ops",
-    hidden = [
-        "SendTPUEmbeddingGradients",
-        "EnqueueTPUEmbeddingIntegerBatch",
-        "EnqueueTPUEmbeddingSparseBatch",
-        "EnqueueTPUEmbeddingSparseTensorBatch",
-    ],
-    deps = [
-        ":cross_replica_ops_op_lib",
-        ":heartbeat_ops_op_lib",
-        ":host_compute_ops_op_lib",
-        ":infeed_ops_op_lib",
-        ":outfeed_ops_op_lib",
-        ":replication_ops_op_lib",
-        ":tpu_configuration_ops_op_lib",
-        ":tpu_embedding_ops_op_lib",
-    ],
-)
-
-tf_custom_op_library(
-    name = "python/ops/_tpu_ordinal_selector_op.so",
-    srcs = ["ops/tpu_ordinal_selector_op.cc"],
-)
-
-tf_custom_op_py_library(
-    name = "tpu_ordinal_selector_py",
-    srcs = ["python/ops/tpu_ordinal_selector_op.py"],
-    dso = [":python/ops/_tpu_ordinal_selector_op.so"],
-    kernels = [
-        ":tpu_ordinal_selector_op_op_lib",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":tpu_ordinal_selector_op",
-    ],
-)
-
-tf_gen_op_wrapper_py(
-    name = "tpu_ordinal_selector_op",
-    deps = [
-        ":tpu_ordinal_selector_op_op_lib",
+        "//tensorflow/python/tpu:tpu_estimator",
     ],
 )
 
-tf_custom_op_library(
-    name = "python/ops/_functional_ops.so",
-    srcs = ["ops/functional_ops.cc"],
-)
-
-tf_gen_op_wrapper_py(
-    name = "gen_functional_ops",
-    out = "python/tpu/gen_functional_ops.py",
-    hidden = [
-        "TPUPartitionedCall",
-    ],
-    deps = [":functional_ops_op_lib"],
-)
-
-tf_custom_op_py_library(
+py_library(
     name = "functional",
     srcs = ["python/tpu/functional.py"],
-    dso = [":python/ops/_functional_ops.so"],
-    kernels = [
-        ":functional_ops_op_lib",
-    ],
     srcs_version = "PY2AND3",
     visibility = [
         "//visibility:public",
     ],
     deps = [
-        ":gen_functional_ops",
+        "//tensorflow/python/tpu:functional",
     ],
 )
 
@@ -222,30 +80,7 @@ py_library(
     srcs = ["python/profiler/__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/tpu/profiler:tpu_profiler_analysis_pb2_grpc",
-        "//tensorflow/contrib/tpu/profiler:tpu_profiler_analysis_proto_py",
-        "//tensorflow/contrib/tpu/profiler:trace_events_proto_py",
-        "//tensorflow/python:util",
-    ],
-)
-
-tf_custom_op_py_library(
-    name = "tpu_py",
-    srcs = ["python/ops/tpu_ops.py"],
-    dso = [":python/ops/_tpu_ops.so"],
-    kernels = [
-        ":all_ops",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":profiler",
-        ":tpu_ops",
-        "//tensorflow/contrib/compiler:xla",
-        "//tensorflow/contrib/util:util_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:util",
+        "//tensorflow/python/tpu/profiler",
     ],
 )
 
@@ -262,6 +97,7 @@ py_library(
         ":tpu_embedding",
         ":tpu_estimator",
         ":tpu_lib",
+        "//tensorflow/python/tpu",
     ],
 )
 
@@ -284,8 +120,8 @@ py_library(
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/distribute",
         "//tensorflow/contrib/framework:framework_py",
-        "//tensorflow/contrib/tpu/proto:compilation_result_proto_py",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/core/protobuf/tpu:compilation_result_proto_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
@@ -325,30 +161,12 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":datasets",
+        ":functional",
         ":profiler",
         ":tpu_py",
-        "//tensorflow/compiler/xla/experimental/xla_sharding",
-        "//tensorflow/compiler/xla/python_api:xla_shape",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/compiler:xla",
-        "//tensorflow/contrib/tpu/proto:compilation_result_proto_py",
-        "//tensorflow/contrib/tpu/proto:dynamic_padding_proto_py",
-        "//tensorflow/contrib/tpu/proto:optimization_parameters_proto_py",
-        "//tensorflow/contrib/tpu/proto:topology_proto_py",
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_py",
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_output_layout_proto_py",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:control_flow_util",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/ops/losses",
+        "//tensorflow/python/tpu:tpu_lib",
     ],
 )
 
@@ -359,125 +177,20 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/ops:batching",
-        "//tensorflow/contrib/data/python/ops:interleave_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python/data/ops:readers",
-    ],
-)
-
-tf_py_test(
-    name = "datasets_test",
-    size = "medium",
-    srcs = ["python/tpu/datasets_test.py"],
-    additional_deps = [
-        "//tensorflow/python:client_testlib",
-        ":datasets",
-    ],
-    grpc_enabled = True,
-    shard_count = 4,
-    tags = ["no_oss"],
-)
-
-tf_py_test(
-    name = "tpu_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:layers",
-    ],
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
-)
-
-tf_py_test(
-    name = "tpu_sharding_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_sharding_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-    ],
-)
-
-tf_py_test(
-    name = "bfloat16_test",
-    size = "small",
-    srcs = ["python/tpu/bfloat16_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-    ],
-)
-
-tf_py_test(
-    name = "tpu_infeed_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_infeed_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_test_lib",
-    ],
-)
-
-tf_py_test(
-    name = "tpu_config_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_config_test.py"],
-    additional_deps = [
-        ":tpu_estimator",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_test_lib",
-    ],
-)
-
-tf_py_test(
-    name = "tpu_estimator_signals_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_estimator_signals_test.py"],
-    additional_deps = [
-        ":tpu_estimator",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_test_lib",
-    ],
-)
-
-tf_py_test(
-    name = "topology_test",
-    size = "medium",
-    srcs = ["python/tpu/topology_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/tpu:datasets",
     ],
 )
 
 py_library(
     name = "tpu_embedding",
-    srcs = ["python/tpu/tpu_embedding.py"],
+    srcs = [
+        "python/tpu/tpu_embedding.py",
+        "python/tpu/tpu_embedding_gradient.py",
+    ],
     srcs_version = "PY2AND3",
     deps = [
         ":tpu_lib",
-        ":tpu_ops",
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:partitioned_variables",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "@six_archive//:six",
+        "//tensorflow/python/tpu:tpu_embedding",
     ],
 )
 
@@ -486,31 +199,6 @@ py_library(
     srcs = ["python/tpu/feature_column.py"],
     deps = [
         ":tpu_lib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python/feature_column:feature_column_py",
-    ],
-)
-
-tf_py_test(
-    name = "feature_column_test",
-    srcs = [
-        "python/tpu/feature_column_test.py",
-    ],
-    additional_deps = [
-        ":feature_column",
-        "//third_party/py/numpy",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:lookup_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python/feature_column:feature_column_py",
+        "//tensorflow/python/tpu:feature_column",
     ],
-    main = "python/tpu/feature_column_test.py",
 )
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index 541fbf33a302a4d850422885fdbbc438bd6b9b7b..9fb29f5e17bde9864a7e9b85d4abc2f6a4681b42 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -2,35 +2,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library_cc")
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
-
-tf_proto_library(
-    name = "tpu_profiler_proto",
-    srcs = ["tpu_profiler.proto"],
-    has_services = 1,
-    cc_api_version = 2,
-    cc_grpc_version = 1,
-    protodeps = [":op_profile_proto"] + tf_additional_all_protos(),
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "dump_tpu_profile",
-    srcs = ["dump_tpu_profile.cc"],
-    hdrs = ["dump_tpu_profile.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":op_profile_proto_cc",
-        ":tpu_profiler_proto_cc",
-        ":trace_events_proto_cc",
-        ":trace_events_to_json",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
 
 cc_library(
     name = "version",
@@ -45,69 +16,9 @@ tf_cc_binary(
     ],
     visibility = ["//visibility:public"],
     deps = [
-        ":dump_tpu_profile",
-        ":tpu_profiler_analysis_proto_cc",
-        ":tpu_profiler_proto_cc",
         ":version",
-        "//tensorflow:grpc++",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
-        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
-        "//tensorflow/core/platform/cloud:gcs_file_system",
+        "//tensorflow/core/profiler/rpc/client:capture_profile",
     ],
 )
-
-tf_proto_library(
-    name = "trace_events_proto",
-    srcs = ["trace_events.proto"],
-    cc_api_version = 2,
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "trace_events_to_json",
-    srcs = ["trace_events_to_json.cc"],
-    hdrs = ["trace_events_to_json.h"],
-    deps = [
-        ":trace_events_proto_cc",
-        "//tensorflow/core:lib",
-        "@jsoncpp_git//:jsoncpp",
-    ],
-)
-
-tf_cc_test(
-    name = "trace_events_to_json_test",
-    srcs = ["trace_events_to_json_test.cc"],
-    deps = [
-        ":trace_events_to_json",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "@jsoncpp_git//:jsoncpp",
-    ],
-)
-
-tf_proto_library(
-    name = "op_profile_proto",
-    srcs = ["op_profile.proto"],
-    cc_api_version = 2,
-    visibility = ["//visibility:public"],
-)
-
-tf_proto_library(
-    name = "tpu_profiler_analysis_proto",
-    srcs = ["tpu_profiler_analysis.proto"],
-    has_services = 1,
-    cc_api_version = 2,
-    cc_grpc_version = 1,
-    protodeps = [":tpu_profiler_proto"] + tf_additional_all_protos(),
-    visibility = ["//visibility:public"],
-)
-
-py_library(
-    name = "tpu_profiler_analysis_pb2_grpc",
-    srcs = ["tpu_profiler_analysis_pb2_grpc.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [":tpu_profiler_analysis_proto_py"],
-)
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index 1c5ea2d997a58ca57ddc212ffd56aad525e961da..f11d1a9f37eeb19b95a876bd68575022e6b91521 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -18,235 +18,11 @@ limitations under the License.
 // Initiates a TPU profiling on the TPUProfiler service at service_addr,
 // receives and dumps the profile data to a tensorboard log directory.
 
-#include "grpcpp/grpcpp.h"
-
-#include <cstdio>
-#include <ctime>
-#include <vector>
-
-#include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h"
-#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
-#include "tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.grpc.pb.h"
 #include "tensorflow/contrib/tpu/profiler/version.h"
-#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/profiler/rpc/client/capture_profile.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
-namespace tensorflow {
-namespace tpu {
-namespace {
-
-using ::tensorflow::TPUProfileAnalysis;
-using ::tensorflow::TPUProfiler;
-
-constexpr uint64 kMaxEvents = 1000000;
-
-string GetCurrentTimeStampAsString() {
-  char s[128];
-  std::time_t t = std::time(nullptr);
-  CHECK_NE(std::strftime(s, sizeof(s), "%F_%T", std::localtime(&t)), 0);
-  return s;
-}
-
-Status ValidateHostPortPair(const string& host_port) {
-  uint32 port;
-  std::vector<string> parts = str_util::Split(host_port, ':');
-  // Must be host:port, port must be a number, host must not contain a '/',
-  // host also must not be empty.
-  if (parts.size() != 2 || !strings::safe_strtou32(parts[1], &port) ||
-      parts[0].find("/") != string::npos || parts[0].empty()) {
-    return errors::InvalidArgument("Could not interpret \"", host_port,
-                                   "\" as a host-port pair.");
-  }
-  return Status::OK();
-}
-
-ProfileRequest PopulateProfileRequest(int duration_ms,
-                                      const string& repository_root,
-                                      const string& session_id,
-                                      const ProfileOptions& opts) {
-  ProfileRequest request;
-  request.set_duration_ms(duration_ms);
-  request.set_max_events(kMaxEvents);
-  if (tensorflow::str_util::StartsWith(repository_root, "gs://")) {
-    // For backward compatibilities, only generate tracetable etc when the
-    // user provide a GCS path for model directory.
-    request.set_repository_root(repository_root);
-    request.set_session_id(session_id);
-  }
-  request.add_tools("op_profile");
-  request.add_tools("input_pipeline");
-  request.add_tools("memory_viewer");
-  request.add_tools("overview_page");
-  *request.mutable_opts() = opts;
-  return request;
-}
-
-// Returns whether the returned trace is empty.
-// Failure are handled by CHECK, i.e. abort()
-bool Profile(const string& service_addr, const string& logdir, int duration_ms,
-             const string& repository_root, const string& session_id,
-             const ProfileOptions& opts) {
-  ProfileRequest request =
-      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
-
-  ::grpc::ClientContext context;
-  ::grpc::ChannelArguments channel_args;
-  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
-  // `ValidateHostPortPair` checks for empty host string case.
-  channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
-                      std::numeric_limits<int32>::max());
-  std::unique_ptr<TPUProfiler::Stub> stub =
-      TPUProfiler::NewStub(::grpc::CreateCustomChannel(
-          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
-          channel_args));
-  ProfileResponse response;
-  TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response)));
-
-  if (!response.encoded_trace().empty()) {
-    TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
-        logdir, session_id, "", response, &std::cout));
-    // Print this at the end so that it's not buried in irrelevant LOG messages.
-    std::cout
-        << "NOTE: using the trace duration " << duration_ms << "ms."
-        << std::endl
-        << "Set an appropriate duration (with --duration_ms) if you "
-           "don't see a full step in your trace or the captured trace is too "
-           "large."
-        << std::endl;
-  }
-
-  return response.encoded_trace().empty();
-}
-
-// Start a new profiling session that include all the hosts included in
-// hostnames, for the time interval of duration_ms. Possibly save the profiling
-// result in the directory specified by repository_root and session_id.
-bool NewSession(const string& service_addr,
-                const std::vector<tensorflow::string>& hostnames,
-                int duration_ms, const string& repository_root,
-                const string& session_id, const ProfileOptions& opts) {
-  NewProfileSessionRequest new_session_request;
-  *new_session_request.mutable_request() =
-      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
-  new_session_request.set_repository_root(repository_root);
-  new_session_request.set_session_id(session_id);
-  for (const auto& hostname : hostnames) {
-    new_session_request.add_hosts(hostname);
-  }
-
-  ::grpc::ClientContext context;
-  ::grpc::ChannelArguments channel_args;
-  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
-  // `ValidateHostPortPair` checks for empty host string case.
-  channel_args.SetMaxReceiveMessageSize(std::numeric_limits<int32>::max());
-  // TODO(jiesun): GRPC support following relevant naming scheme:
-  // 1. dns:///host:port
-  // 2. ipv4:host:port or ipv6:[host]:port
-  // We might need to change the prefix which depends on what TPU name resolver
-  // will give us.
-  std::unique_ptr<TPUProfileAnalysis::Stub> stub =
-      TPUProfileAnalysis::NewStub(::grpc::CreateCustomChannel(
-          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
-          channel_args));
-  NewProfileSessionResponse new_session_response;
-  TF_QCHECK_OK(FromGrpcStatus(
-      stub->NewSession(&context, new_session_request, &new_session_response)));
-
-  std::cout << "Profile session succeed for host(s):"
-            << str_util::Join(hostnames, ",") << std::endl;
-  return new_session_response.empty_trace();
-}
-
-// Starts tracing on a single or multiple TPU hosts and saves the result in the
-// given logdir. If no trace was collected, retries tracing for
-// num_tracing_attempts.
-void StartTracing(const tensorflow::string& service_addr,
-                  const tensorflow::string& logdir,
-                  const tensorflow::string& workers_list,
-                  bool include_dataset_ops, int duration_ms,
-                  int num_tracing_attempts) {
-  // Use the current timestamp as the run name.
-  tensorflow::string session_id = GetCurrentTimeStampAsString();
-  constexpr char kProfilePluginDirectory[] = "plugins/profile/";
-  tensorflow::string repository_root =
-      io::JoinPath(logdir, kProfilePluginDirectory);
-  std::vector<tensorflow::string> hostnames =
-      tensorflow::str_util::Split(workers_list, ",");
-
-  bool empty_trace = false;
-  int remaining_attempts = num_tracing_attempts;
-  tensorflow::ProfileOptions opts;
-  opts.set_include_dataset_ops(include_dataset_ops);
-  while (true) {
-    std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. "
-              << "Remaining attempt(s): " << remaining_attempts-- << std::endl;
-    if (hostnames.empty()) {
-      empty_trace = tensorflow::tpu::Profile(service_addr, logdir, duration_ms,
-                                             repository_root, session_id, opts);
-    } else {
-      tensorflow::string tpu_master = service_addr;
-      empty_trace =
-          tensorflow::tpu::NewSession(tpu_master, hostnames, duration_ms,
-                                      repository_root, session_id, opts);
-    }
-    if (remaining_attempts <= 0 || !empty_trace) break;
-    std::cout << "No trace event is collected. Automatically retrying."
-              << std::endl
-              << std::endl;
-  }
-
-  if (empty_trace) {
-    std::cout << "No trace event is collected after " << num_tracing_attempts
-              << " attempt(s). "
-              << "Perhaps, you want to try again (with more attempts?)."
-              << std::endl
-              << "Tip: increase number of attempts with --num_tracing_attempts."
-              << std::endl;
-  }
-}
-
-MonitorRequest PopulateMonitorRequest(int duration_ms, int monitoring_level) {
-  MonitorRequest request;
-  request.set_duration_ms(duration_ms);
-  request.set_monitoring_level(monitoring_level);
-  return request;
-}
-
-// Repeatedly collects profiles and shows user-friendly metrics for
-// 'num_queries' time(s).
-void StartMonitoring(const tensorflow::string& service_addr, int duration_ms,
-                     int monitoring_level, int num_queries) {
-  for (int query = 0; query < num_queries; ++query) {
-    MonitorRequest request =
-        PopulateMonitorRequest(duration_ms, monitoring_level);
-
-    ::grpc::ClientContext context;
-    ::grpc::ChannelArguments channel_args;
-    channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
-                        std::numeric_limits<int32>::max());
-    std::unique_ptr<TPUProfiler::Stub> stub =
-        TPUProfiler::NewStub(::grpc::CreateCustomChannel(
-            "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
-            channel_args));
-    MonitorResponse response;
-    TF_QCHECK_OK(FromGrpcStatus(stub->Monitor(&context, request, &response)));
-
-    std::cout << "Cloud TPU Monitoring Results (Sample " << query + 1
-              << "):\n\n"
-              << response.data() << std::flush;
-  }
-}
-
-}  // namespace
-}  // namespace tpu
-}  // namespace tensorflow
-
 int main(int argc, char** argv) {
   tensorflow::string FLAGS_service_addr;
   tensorflow::string FLAGS_logdir;
@@ -300,8 +76,9 @@ int main(int argc, char** argv) {
     std::cout << usage.c_str() << std::endl;
     return 2;
   }
-  tensorflow::Status status =
-      tensorflow::tpu::ValidateHostPortPair(FLAGS_service_addr);
+  tensorflow::Status status;
+  status =
+      tensorflow::profiler::client::ValidateHostPortPair(FLAGS_service_addr);
   if (!status.ok()) {
     std::cout << status.error_message() << std::endl;
     std::cout << usage.c_str() << std::endl;
@@ -324,12 +101,17 @@ int main(int argc, char** argv) {
               << FLAGS_service_addr << " for " << duration_ms
               << "ms and show metrics for " << num_queries << " time(s)."
               << std::endl;
-    tensorflow::tpu::StartMonitoring(FLAGS_service_addr, duration_ms,
-                                     FLAGS_monitoring_level, num_queries);
+    tensorflow::profiler::client::StartMonitoring(
+        FLAGS_service_addr, duration_ms, FLAGS_monitoring_level, num_queries);
   } else {
-    tensorflow::tpu::StartTracing(FLAGS_service_addr, FLAGS_logdir,
-                                  FLAGS_workers_list, FLAGS_include_dataset_ops,
-                                  duration_ms, num_tracing_attempts);
+    status = tensorflow::profiler::client::StartTracing(
+        FLAGS_service_addr, FLAGS_logdir, FLAGS_workers_list,
+        FLAGS_include_dataset_ops, duration_ms, num_tracing_attempts);
+    if (!status.ok()) {
+      std::cout << status.error_message() << std::endl;
+      std::cout << usage.c_str() << std::endl;
+      return 2;
+    }
   }
   return 0;
 }
diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ops.py b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
index 500dd2cd39d6b8747cebb95d0a01d8c5680427fe..8605bae5c128513186d8c03835dcf49d3e4b6fd9 100644
--- a/tensorflow/contrib/tpu/python/ops/tpu_ops.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
@@ -1,394 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Operations for TPUs."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import platform
-
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import tf_logging as logging
-
-if platform.system() != "Windows":
-  # pylint: disable=wildcard-import,unused-import,g-import-not-at-top
-  from tensorflow.contrib.tpu.ops import gen_tpu_ops
-  from tensorflow.contrib.tpu.ops.gen_tpu_ops import *
-
-  from tensorflow.contrib.util import loader
-  from tensorflow.python.platform import resource_loader
-  # pylint: enable=wildcard-import,unused-import,g-import-not-at-top
-
-  _tpu_ops = loader.load_op_library(
-      resource_loader.get_path_to_datafile("_tpu_ops.so"))
-
-  def _create_default_group_assignment():
-    num_shards = tpu_function.get_tpu_context().number_of_shards
-    if num_shards is None:
-      logging.warning(
-          "cross_replica_sum should be used within a tpu_shard_context, but "
-          "got unset number_of_shards. Assuming 1.")
-      num_shards = 1
-    group_assignment = [list(range(num_shards))]
-    return group_assignment
-
-  def all_to_all(x,
-                 concat_dimension,
-                 split_dimension,
-                 split_count,
-                 group_assignment=None,
-                 name=None):
-    """Exchange data across TPU replicas.
-
-    Args:
-      x: The local tensor.
-      concat_dimension: The dimension number to concatenate.
-      split_dimension: The dimension number to split.
-      split_count: The number of splits, this number must equal to the sub-group
-        size(group_assignment.get_shape()[1])
-      group_assignment: Optional 2d int32 lists with shape [num_groups,
-        num_replicas_per_group]. `group_assignment[i]` represents the replica
-        ids in the ith subgroup.
-      name: Optional op name.
-
-    Returns:
-      A `Tensor` which is concatenated by data from different replicas.
-    """
-    if group_assignment is None:
-      group_assignment = _create_default_group_assignment()
-    return gen_tpu_ops.all_to_all(
-        x,
-        group_assignment,
-        concat_dimension=concat_dimension,
-        split_dimension=split_dimension,
-        split_count=split_count,
-        name=name)
-
-  @ops.RegisterGradient("AllToAll")
-  def _all_to_all_grad(op, grad):
-    # The gradient of a all-to-all is also a all-to-all but the
-    # split_dimension and concat_dimension is swapped.
-    # The graident with respect to group_assignment is None.
-    return [
-        gen_tpu_ops.all_to_all(
-            grad,
-            op.inputs[1],
-            concat_dimension=op.get_attr("split_dimension"),
-            split_dimension=op.get_attr("concat_dimension"),
-            split_count=op.get_attr("split_count")), None
-    ]
-
-  def cross_replica_sum(x, group_assignment=None, name=None):
-    """Sum the input tensor across replicas according to group_assignment.
-
-    Args:
-      x: The local tensor to the sum.
-      group_assignment: Optional 2d int32 lists with shape [num_groups,
-        num_replicas_per_group]. `group_assignment[i]` represents the replica
-        ids in the ith subgroup.
-      name: Optional op name.
-
-    Returns:
-      A `Tensor` which is summed across replicas.
-    """
-    if group_assignment is None:
-      group_assignment = _create_default_group_assignment()
-
-    return gen_tpu_ops.cross_replica_sum(x, group_assignment, name=name)
-
-  def collective_permute(x, source_target_pairs, name=None):
-    """Permute the input tensor across replicas given source_target_pairs.
-
-    For each source_target_pair <a, b>, we send replica a's input to replica b.
-    Each replica id must only appear once in the source column. Also it must
-    only appear once in the target column.
-    For the replica id not in the target column, this op returns a zero tensor
-    with the same shape and dtype of the input x.
-
-    For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
-    source_target_pairs=`[[0,1],[1,2],[2,3]]` gets the outputs:
-    `[0, A, B, C]`.
-
-    Args:
-      x: The local tensor to be permuted.
-      source_target_pairs: 2d int lists with shape [num_pairs, 2].
-        source_target_pairs[i][0] represents the source replica id and
-        source_target_pairs[i][1] represents the target replica id.
-      name: Optional op name.
-
-    Returns:
-      A `Tensor` which is permuted.
-    """
-    return gen_tpu_ops.collective_permute(x, source_target_pairs, name=name)
-
-  @ops.RegisterGradient("CollectivePermute")
-  def _collective_permute_grad(op, grad):
-    # The gradient of a collective permute operation is also a collective
-    # permute, but with source/target pairs reversed. The gradient with respect
-    # to input argument `source_target_pairs` is `None`.
-    source_target_pairs = op.inputs[1][:, ::-1]
-    return [gen_tpu_ops.collective_permute(grad, source_target_pairs), None]
-
-  @ops.RegisterGradient("CrossReplicaSum")
-  def _cross_replica_sum_grad(op, grad):
-    # The gradient of a cross replica sum is also a cross-replica sum.
-    # The gradient with respect to group_assignment is None.
-    return [gen_tpu_ops.cross_replica_sum(grad, op.inputs[1]), None]
-
-  # This extra type checking exists to give a more helpful error message in
-  # the common case that uint8 and int64 values are infed. Remove when both
-  # types are supported.
-
-  _SUPPORTED_INFEED_DTYPES = set([
-      dtypes.bool, dtypes.int32, dtypes.int64, dtypes.bfloat16, dtypes.float32,
-      dtypes.complex64, dtypes.uint32
-  ])
-
-  def infeed_dequeue(dtype, shape, name=None):
-    """A placeholder op for a value that will be fed into the computation.
-
-    Args:
-      dtype: A `tf.DType`. The type of elements in the tensor.
-      shape: A `tf.TensorShape` or list of `ints`. The shape of the tensor.
-      name: A name for the operation (optional).
-
-    Returns:
-      A `Tensor` of type `dtype`.
-      A tensor that will be provided using the infeed mechanism.
-
-    Raises:
-      TypeError: If 'dtype` is not a supported infeed type.
-    """
-    if dtype not in _SUPPORTED_INFEED_DTYPES:
-      raise TypeError(
-          "{} is not a supported TPU infeed type. Supported types are: "
-          "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
-
-    return gen_tpu_ops.infeed_dequeue(dtype, shape, name=name)
-
-  # pylint: disable=redefined-outer-name
-  def infeed_dequeue_tuple(dtypes, shapes, name=None):
-    """A placeholder op for values fed into the TPU simultaneously as a tuple.
-
-    Args:
-      dtypes: A list of `tf.DType`s that has length `>= 1`.
-        The element types of each element in `outputs`.
-      shapes: A list of shapes (each a `tf.TensorShape` or list of `ints`).
-        The shapes of each tensor in `outputs`.
-      name: A name for the operation (optional).
-
-    Returns:
-      A list of `Tensor` objects of type `dtypes`.
-      A list of tensors that will be provided using the infeed mechanism.
-
-    Raises:
-      TypeError: If a type in 'dtypes` is not a supported infeed type.
-    """
-    for dtype in dtypes:
-      if dtype not in _SUPPORTED_INFEED_DTYPES:
-        raise TypeError(
-            "{} is not a supported TPU infeed type. Supported types are: "
-            "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
-    return gen_tpu_ops.infeed_dequeue_tuple(dtypes, shapes, name=name)
-  # pylint: enable=redefined-outer-name
-
-  # pylint: disable=protected-access
-  def send_tpu_embedding_gradients(inputs,
-                                   config,
-                                   learning_rates=None,
-                                   name=None):
-    """A placeholder op for feeding per-sample gradients to the embedding layer.
-
-    Args:
-      inputs: A TensorList of gradients with which to update embedding tables.
-          This argument has the same length and shapes as the return value of
-          RecvTPUEmbeddingActivations, but contains gradients of the model's
-          loss with respect to the embedding activations. The embedding tables
-          are updated from these gradients via the optimizers specified in the
-          TPU embedding configuration given to tpu.initialize_system.
-      config: Serialized TPUEmbeddingConfiguration proto.
-      learning_rates: A TensorList of float32 scalars, one for each dynamic
-          learning rate tag: see the comments in
-          //third_party/tensorflow/contrib/tpu/proto/
-                                               optimization_parameters.proto.
-          Multiple tables can share the same dynamic learning rate tag as
-          specified in the configuration. If the learning rates for all tables
-          are constant, this list should be empty.
-      name: A name for the operation (optional).
-
-    Returns:
-      A SendTPUEmbeddingGradients operation.
-    """
-    if learning_rates is None:
-      learning_rates = []
-    return gen_tpu_ops._send_tpu_embedding_gradients(
-        inputs=inputs, learning_rates=learning_rates, config=config, name=name)
-
-
-  send_tpu_embedding_gradients.__doc__ = (
-      gen_tpu_ops._send_tpu_embedding_gradients.__doc__)
-
-  # pylint: disable=protected-access
-  def enqueue_tpu_embedding_integer_batch(batch,
-                                          device_ordinal,
-                                          mode_override=None,
-                                          name=None):
-    """A placeholder op for enqueueing embedding IDs to the TPU.
-
-    Args:
-      batch: A list of 1D tensors, one for each embedding table, containing the
-        indices into the tables.
-      device_ordinal: The TPU device to use. Should be >= 0 and less than the
-        number of TPU cores in the task on which the node is placed.
-      mode_override: A string input that overrides the mode specified in the
-        TPUEmbeddingConfiguration. Supported values are {'unspecified',
-        'inference', 'training', 'backward_pass_only'}. When set to
-        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
-        otherwise mode_override is used (optional).
-      name: A name for the operation (optional).
-
-    Returns:
-      An EnqueueTPUEmbeddingIntegerBatch operation.
-    """
-    if mode_override is None:
-      mode_override = "unspecified"
-    return gen_tpu_ops._enqueue_tpu_embedding_integer_batch(
-        batch=batch,
-        device_ordinal=device_ordinal,
-        mode_override=mode_override,
-        name=name)
-
-  enqueue_tpu_embedding_integer_batch.__doc__ = (
-      gen_tpu_ops._enqueue_tpu_embedding_integer_batch.__doc__)
-
-  # pylint: disable=protected-access
-  def enqueue_tpu_embedding_sparse_batch(sample_indices,
-                                         embedding_indices,
-                                         aggregation_weights,
-                                         device_ordinal,
-                                         combiners=None,
-                                         mode_override=None,
-                                         name=None):
-    """A placeholder op for enqueueing embedding IDs to the TPU.
-
-    Args:
-      sample_indices: A list of rank 1 Tensors specifying the training example
-        and feature to which the corresponding embedding_indices and
-        aggregation_weights values belong. sample_indices[i] must equal b * nf +
-        f, where nf is the number of features from the corresponding table, f is
-        in [0, nf), and b is in [0, batch size).
-      embedding_indices: A list of rank 1 Tensors, indices into the embedding
-        tables.
-      aggregation_weights: A list of rank 1 Tensors containing per sample --
-        i.e. per (training example, feature) -- aggregation weights.
-      device_ordinal: The TPU device to use. Should be >= 0 and less than the
-        number of TPU cores in the task on which the node is placed.
-      combiners: A list of string scalars, one for each embedding table that
-        specify how to normalize the embedding activations after weighted
-        summation. Supported combiners are 'mean', 'sum', or 'sqrtn'. It is
-        invalid to have the sum of the weights be 0 for 'mean' or the sum of the
-        squared weights be 0 for 'sqrtn'. If combiners isn't passed, the default
-        is to use 'sum' for all tables (optional).
-      mode_override: A string input that overrides the mode specified in the
-        TPUEmbeddingConfiguration. Supported values are {'unspecified',
-        'inference', 'training', 'backward_pass_only'}. When set to
-        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
-        otherwise mode_override is used (optional).
-      name: A name for the operation (optional).
-
-    Returns:
-      An EnqueueTPUEmbeddingSparseBatch operation.
-    """
-    if mode_override is None:
-      mode_override = "unspecified"
-    return gen_tpu_ops._enqueue_tpu_embedding_sparse_batch(
-        sample_indices=sample_indices,
-        embedding_indices=embedding_indices,
-        aggregation_weights=aggregation_weights,
-        device_ordinal=device_ordinal,
-        combiners=combiners,
-        mode_override=mode_override,
-        name=name)
-
-  enqueue_tpu_embedding_sparse_batch.__doc__ = (
-      gen_tpu_ops._enqueue_tpu_embedding_sparse_batch.__doc__)
-
-  # pylint: disable=protected-access
-  def enqueue_tpu_embedding_sparse_tensor_batch(sample_indices,
-                                                embedding_indices,
-                                                aggregation_weights,
-                                                table_ids,
-                                                device_ordinal,
-                                                combiners=None,
-                                                mode_override=None,
-                                                name=None):
-    """A placeholder op for enqueueing embedding IDs to the TPU.
-
-    Args:
-      sample_indices: A list of rank 1 Tensors specifying the training example
-        to which the corresponding embedding_indices and aggregation_weights
-        values belong. It corresponds to sp_ids.indices[:,0] in
-        embedding_lookup_sparse().
-      embedding_indices: A list of rank 1 Tensors, indices into the embedding
-        tables. It corresponds to sp_ids.values in embedding_lookup_sparse().
-      aggregation_weights: A list of rank 1 Tensors containing per training
-        example aggregation weights. It corresponds to sp_weights.values in
-        embedding_lookup_sparse().
-      table_ids: A list of integers specifying the identifier of the embedding
-        table (offset of TableDescriptor in the TPUEmbeddingConfiguration) to
-        lookup the corresponding input. The ith input is looked up using
-        table_ids[i]. The size of the table_ids list must be equal to that of
-        sample_indices, embedding_indices and aggregation_weights.
-      device_ordinal: The TPU device to use. Should be >= 0 and less than the
-        number of TPU cores in the task on which the node is placed.
-      combiners: A list of string scalars, one for each embedding table that
-        specify how to normalize the embedding activations after weighted
-        summation. Supported combiners are 'mean', 'sum', or 'sqrtn'. It is
-        invalid to have the sum of the weights be 0 for 'mean' or the sum of the
-        squared weights be 0 for 'sqrtn'. If combiners isn't passed, the default
-        is to use 'sum' for all tables (optional).
-      mode_override: A string input that overrides the mode specified in the
-        TPUEmbeddingConfiguration. Supported values are {'unspecified',
-        'inference', 'training', 'backward_pass_only'}. When set to
-        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
-        otherwise mode_override is used (optional).
-      name: A name for the operation (optional).
-
-    Returns:
-      An EnqueueTPUEmbeddingSparseTensorBatch operation.
-    """
-    if mode_override is None:
-      mode_override = "unspecified"
-    return gen_tpu_ops._enqueue_tpu_embedding_sparse_tensor_batch(
-        sample_indices=sample_indices,
-        embedding_indices=embedding_indices,
-        aggregation_weights=aggregation_weights,
-        table_ids=table_ids,
-        device_ordinal=device_ordinal,
-        combiners=combiners,
-        mode_override=mode_override,
-        name=name)
-
-  enqueue_tpu_embedding_sparse_tensor_batch.__doc__ = (
-      gen_tpu_ops._enqueue_tpu_embedding_sparse_tensor_batch.__doc__)
-
-else:
-  # We have already built the appropriate libraries into the binary via CMake
-  # if we have built contrib, so we don't need this
-  pass
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.ops.tpu_ops import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ordinal_selector_op.py b/tensorflow/contrib/tpu/python/ops/tpu_ordinal_selector_op.py
index 5ca38cd1bae5753a7398834bd96d3b26e66b4941..788e1fe0568cf2f406c379e4d928100ea51a37a3 100644
--- a/tensorflow/contrib/tpu/python/ops/tpu_ordinal_selector_op.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ordinal_selector_op.py
@@ -1,38 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Operations to select TPU core to run."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import platform
-
-if platform.system() != "Windows":
-  # pylint: disable=wildcard-import,unused-import,g-import-not-at-top
-  from tensorflow.contrib.tpu.ops.gen_tpu_ordinal_selector_op import *
-
-  from tensorflow.contrib.util import loader
-  from tensorflow.python.platform import resource_loader
-  # pylint: enable=wildcard-import,unused-import,g-import-not-at-top
-
-  _tpu_ordinal_selector_op = loader.load_op_library(
-      resource_loader.get_path_to_datafile("_tpu_ordinal_selector_op.so"))
-
-else:
-  # We have already built the appropriate libraries into the binary via CMake
-  # if we have built contrib, so we don't need this
-  pass
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.ops.tpu_ordinal_selector_op import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/profiler/__init__.py b/tensorflow/contrib/tpu/python/profiler/__init__.py
index 15ce6aceec299adacd7025f0021cf8b6f6ef765b..aeb061dbe114bc287946b50d08a86778c78c7b38 100644
--- a/tensorflow/contrib/tpu/python/profiler/__init__.py
+++ b/tensorflow/contrib/tpu/python/profiler/__init__.py
@@ -1,31 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Classes for TPU trace events."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import,unused-import
-from tensorflow.contrib.tpu.profiler.tpu_profiler_analysis_pb2 import *
-from tensorflow.contrib.tpu.profiler.trace_events_pb2 import *
+from tensorflow.python.tpu.profiler import *
 # pylint: enable=wildcard-import,unused-import
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = ['Trace', 'Resource', 'Device', 'TraceEvent']
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/tpu/python/tpu/__init__.py b/tensorflow/contrib/tpu/python/tpu/__init__.py
index 0dffd7064b19f353aed6afa3ad383564643a4a90..82d4f68c0221013706f70bcf54ae4c97cc7db1d3 100644
--- a/tensorflow/contrib/tpu/python/tpu/__init__.py
+++ b/tensorflow/contrib/tpu/python/tpu/__init__.py
@@ -1,20 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Ops related to Tensor Processing Units."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py b/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py
index 6ce96e5bcdbe5777f68eb969be46423b5b3410cb..41aa4d267812cabe775459723df7e01efaa83c93 100644
--- a/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py
+++ b/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py
@@ -1,273 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""Tooling for support TPU embedding in TPUEstimator."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-
-from tensorflow.contrib.tpu.python.tpu import feature_column as tpu_fc
-from tensorflow.contrib.tpu.python.tpu import tpu_embedding
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.feature_column import feature_column as core_fc
-from tensorflow.python.feature_column import feature_column_lib as core_fc_lib
-
-# pylint: disable=protected-access
-_TPU_EMBEDDING_COLUMN_CLASSES = (tpu_fc._TPUEmbeddingColumn,
-                                 tpu_fc._TPUSharedEmbeddingColumn)
-_EMBEDDING_COLUMN_CLASSES = (core_fc._EmbeddingColumn,
-                             core_fc_lib.EmbeddingColumn,
-                             core_fc._SharedEmbeddingColumn)
-_SUPPORTED_FEATURE_COLUMNS = (core_fc._NumericColumn, core_fc_lib.NumericColumn)
-
-# pylint: enable=protected-access
-
-
-def get_tpu_embedding_config_from_feature_columns(feature_columns):
-  """Create configs for TPUEmbedding from a list of feature columns.
-
-  This function will place one embedding tensor per table and the return is
-  intended to be used as input to TPUEmbedding.
-
-  Args:
-    feature_columns: a list of supported feature columns.
-
-  Returns:
-    A pair of dicts, the first maps tables to their config, the second maps
-    features to tables.
-  """
-
-  allowed = (tpu_fc._TPUEmbeddingColumn, tpu_fc._TPUSharedEmbeddingColumn)  # pylint: disable=protected-access
-
-  for column in feature_columns:
-    if not isinstance(column, allowed):
-      raise TypeError(
-          'Unsupported feature column {}. Supported types are {}.'.format(
-              type(column), allowed))
-
-  table_to_config = {}
-  feature_to_table = {}
-  for column in feature_columns:
-    feature_name = column.get_feature_key_name()
-    table_name = 'tbl_{}'.format(column.get_embedding_var_name())
-    if feature_name in feature_to_table:
-      raise ValueError(
-          'Feature column {} is used with multiple embeddings and this is '
-          'not supported.'.format(feature_name))
-    feature_to_table[feature_name] = table_name
-    vocabulary_size, dimension = column.get_embedding_table_size()
-    table_to_config[table_name] = tpu_embedding.TableConfig(
-        vocabulary_size=vocabulary_size,
-        dimension=dimension,
-        initializer=column.get_initializer(),
-        combiner=column.get_combiner())
-
-  return table_to_config, feature_to_table
-
-
-def _get_tpu_embedding_optimization_parameters(embedding_config_spec):
-  """Get tpu_embedding._OptimizationParameters from EmbeddingConfigSpec."""
-  if embedding_config_spec.optimizer_type == 'adagrad':
-    return tpu_embedding.AdagradParameters(
-        embedding_config_spec.learning_rate,
-        embedding_config_spec.adagrad_initial_accumulator,
-        embedding_config_spec.use_gradient_accumulation)
-  elif embedding_config_spec.optimizer_type == 'sgd':
-    return tpu_embedding.StochasticGradientDescentParameters(
-        embedding_config_spec.learning_rate,
-        embedding_config_spec.use_gradient_accumulattion)
-  elif embedding_config_spec.optimizer_type == 'adam':
-    return tpu_embedding.AdamParameters(
-        embedding_config_spec.learning_rate,
-        embedding_config_spec.adam_parameters.beta1,
-        embedding_config_spec.adam_parameters.beta2,
-        embedding_config_spec.adam_parameters.epsilon,
-        use_gradient_accumulation=embedding_config_spec
-        .use_gradient_accumulation)
-  else:
-    raise ValueError('optimizer_type must be adagrad or sgd or adam for now.')
-
-
-AdamParameters = collections.namedtuple('AdamParameters',
-                                        ['beta1', 'beta2', 'epsilon'])
-
-
-# TODO(shizhiw): Improve the API to support more optimizer parameters in API.
-class EmbeddingConfigSpec(
-    collections.namedtuple('EmbeddingConfigSpec', [
-        'feature_columns', 'learning_rate', 'optimizer_type',
-        'adagrad_initial_accumulator', 'clipping_limit',
-        'use_gradient_accumulation', 'adam_parameters'
-    ])):
-  """Class to keep track of embedding config specification."""
-
-  def __new__(cls,
-              feature_columns,
-              learning_rate,
-              optimizer_type='adagrad',
-              adagrad_initial_accumulator=None,
-              clipping_limit=None,
-              use_gradient_accumulation=False,
-              adam_parameters=None):
-    """Creates an EmbeddingConfigSpec instance.
-
-    Args:
-      feature_columns: All `FeatureColumn`s used by model.
-      learning_rate: embedding optimizer learning rate.
-      optimizer_type: (String) Name of the optimizer for embedding gradients
-        updates. Must be either 'adagrad' ( `tf.train.AdagradOptimizer`, default
-        value), 'sgd' (`tf.train.GradientDescentOptimizer`), or 'adam'
-        (`tf.contrib.opt.LazyAdamOptimizer`) for lazy Adam. This optimizer will
-        be applied to all embedding variables specified by `feature_columns`.
-      adagrad_initial_accumulator: Initial accumulator for Adagrad. Used when
-        optimizer_type is 'adagrad'. Default is `0.1`.
-      clipping_limit: (Optional) Clipping limit (absolute value).
-      use_gradient_accumulation: (Experimental) Whether to accumulate the
-        gradients across TPU embedding mini-batches. Gradient accumulation does
-        not affect SGD and therefore this is applicable only for Adagrad.
-      adam_parameters: AdamParameters. Used when optimizer_type is 'adam'.
-        Default is 0.9 for beta1, 0.999 for beta2 and 1e-8 for epsilon.
-
-    Returns:
-      An EmbeddingConfigSpec instance.
-
-    Raises:
-      ValueError: If the feature_columns are not specified.
-      TypeError: If the feature columns are not of ths correct type (one of
-        _SUPPORTED_FEATURE_COLUMNS, _TPU_EMBEDDING_COLUMN_CLASSES OR
-        _EMBEDDING_COLUMN_CLASSES).
-      ValueError: If use_gradient_accumulation is True for SGD.
-      ValueError: If `optimizer_type` is not one of "adagrad" or "sgd" or
-        "adam".
-    """
-    if not feature_columns:
-      raise ValueError('`feature_columns` cannot be `None` or empty.')
-
-    # It is unknown at this moment, whether the TPUEstimator is running in CPU
-    # or TPU mode. So allow non-TPU embedding columns also.
-    supported_classes = tuple(
-        list(_SUPPORTED_FEATURE_COLUMNS) + list(_TPU_EMBEDDING_COLUMN_CLASSES) +
-        list(_EMBEDDING_COLUMN_CLASSES))
-
-    for column in feature_columns:
-      if not isinstance(column, supported_classes):
-        raise TypeError(
-            'All feature columns must be supported types in {}. Got {}'.format(
-                supported_classes, type(column)))
-
-    if optimizer_type == 'adagrad':
-      if adagrad_initial_accumulator is None:
-        adagrad_initial_accumulator = 0.1
-      if adagrad_initial_accumulator <= 0:
-        raise ValueError('Adagrad initial_accumulator must be positive')
-    elif optimizer_type == 'sgd':
-      if use_gradient_accumulation:
-        raise ValueError('Gradient accumulation makes sense for Adagrad only.')
-    elif optimizer_type == 'adam':
-      if adam_parameters is None:
-        adam_parameters = AdamParameters(0.9, 0.999, 1e-8)
-      if adam_parameters.beta1 < 0. or adam_parameters.beta1 >= 1.:
-        raise ValueError('beta1 must be between 0. and 1; got {}.'.format(
-            adam_parameters.beta1))
-      if adam_parameters.beta2 < 0. or adam_parameters.beta2 >= 1.:
-        raise ValueError('beta2 must be between 0. and 1; got {}.'.format(
-            adam_parameters.beta2))
-      if adam_parameters.epsilon <= 0.:
-        raise ValueError('epsilon must be positive; got {}.'.format(
-            adam_parameters.epsilon))
-    else:
-      raise ValueError('optimizer_type must be adagrad or sgd or adam for now.')
-
-    return super(EmbeddingConfigSpec, cls).__new__(
-        cls,
-        feature_columns=feature_columns,
-        learning_rate=learning_rate,
-        optimizer_type=optimizer_type,
-        adagrad_initial_accumulator=adagrad_initial_accumulator,
-        clipping_limit=clipping_limit,
-        use_gradient_accumulation=use_gradient_accumulation,
-        adam_parameters=adam_parameters)
-
-
-class EmbeddingConfig(object):
-  """This is the internal immutable object for embedding config.
-
-  `_EmbeddingConfig` is responsible to _translate_ user provided
-  `EmbeddingConfigSpec` to internal data structures, mostly constructor
-  arguments of `TPUEmbedding`.
-  """
-
-  def __init__(self, embedding_config_spec, train_batch_size, eval_batch_size,
-               num_hosts, num_cores, master):
-    self._embedding_config_spec = embedding_config_spec
-    self._train_batch_size = train_batch_size
-    self._eval_batch_size = eval_batch_size
-    self._num_hosts = num_hosts
-    self._num_cores = num_cores
-    self._master = master
-
-    self._table_to_config_dict, self._feature_to_table_dict = (
-        get_tpu_embedding_config_from_feature_columns(
-            embedding_config_spec.feature_columns))
-    self._optimization_parameters = _get_tpu_embedding_optimization_parameters(
-        self._embedding_config_spec)
-    self._mode_to_tpu_embedding_dict = {}
-
-  def has_embedding_tables(self):
-    return bool(self._table_to_config_dict)
-
-  def _create_tpu_embedding(self, mode):
-    """Create tpu_embedding.TPUEmbedding based on mode."""
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      batch_size = self._train_batch_size
-    else:
-      batch_size = self._eval_batch_size
-
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      tpu_embedding_mode = tpu_embedding.TRAINING
-    elif (mode == model_fn_lib.ModeKeys.EVAL or
-          mode == model_fn_lib.ModeKeys.PREDICT):
-      tpu_embedding_mode = tpu_embedding.INFERENCE
-    else:
-      raise ValueError('Mode {} is not supported.'.format(mode))
-
-    tpu_embedding_ = tpu_embedding.TPUEmbedding(
-        self._table_to_config_dict,
-        self._feature_to_table_dict,
-        batch_size,
-        tpu_embedding_mode,
-        self._master,
-        self._optimization_parameters,
-    )
-    return tpu_embedding_
-
-  def get_tpu_embedding(self, mode):
-    if mode not in self._mode_to_tpu_embedding_dict:
-      self._mode_to_tpu_embedding_dict[mode] = (
-          self._create_tpu_embedding(mode))
-    return self._mode_to_tpu_embedding_dict[mode]
-
-
-def split_inputs(ctx, features, labels):
-  """Splits the dense and sparse tensors inside the features and labels."""
-  sparse_features = collections.OrderedDict()
-  if ctx.embedding_config:
-    tpu_embedding_ = ctx.embedding_config.tpu_embedding
-    for feature_key in tpu_embedding_.feature_to_table_dict:
-      sparse_features[feature_key] = features.pop(feature_key)
-
-  return features, labels, sparse_features
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu._tpu_estimator_embedding import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
index 1b09ce173a64ba3f93ec019c8fd65dc4710f0fcf..5eb8034e47474873ccef0b6123f2becd0668738c 100644
--- a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
+++ b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
@@ -1,212 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the 'License');
+# Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Hook for asynchronous checkpointing.
-
-This hook dispatches checkpoint writing operations in a separate thread to
-allow execution to continue on the main thread.
-"""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import threading
-import time
-
-from tensorflow.core.util.event_pb2 import SessionLog
-from tensorflow.python.framework import meta_graph
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import training_util
-from tensorflow.python.training.session_run_hook import SessionRunArgs
-from tensorflow.python.training.summary_io import SummaryWriterCache
-
-
-class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
-  """Saves checkpoints every N steps or seconds."""
-
-  def __init__(self,
-               checkpoint_dir,
-               save_secs=None,
-               save_steps=None,
-               saver=None,
-               checkpoint_basename="model.ckpt",
-               scaffold=None,
-               listeners=None):
-    """Initializes a `CheckpointSaverHook`.
-
-    Args:
-      checkpoint_dir: `str`, base directory for the checkpoint files.
-      save_secs: `int`, save every N secs.
-      save_steps: `int`, save every N steps.
-      saver: `Saver` object, used for saving.
-      checkpoint_basename: `str`, base name for the checkpoint files.
-      scaffold: `Scaffold`, use to get saver object.
-      listeners: List of `CheckpointSaverListener` subclass instances. Used for
-        callbacks that run immediately before or after this hook saves the
-        checkpoint.
-
-    Raises:
-      ValueError: One of `save_steps` or `save_secs` should be set.
-      ValueError: At most one of `saver` or `scaffold` should be set.
-    """
-    logging.info("Create AsyncCheckpointSaverHook.")
-    if saver is not None and scaffold is not None:
-      raise ValueError("You cannot provide both saver and scaffold.")
-    self._saver = saver
-    self._save_thread = None
-    self._write_graph_thread = None
-    self._checkpoint_dir = checkpoint_dir
-    self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
-    self._scaffold = scaffold
-    self._timer = basic_session_run_hooks.SecondOrStepTimer(
-        every_secs=save_secs, every_steps=save_steps)
-    self._listeners = listeners or []
-    self._steps_per_run = 1
-    self._summary_writer = None
-    self._global_step_tensor = None
-
-    self._last_checkpoint_step = None
-
-  def _set_steps_per_run(self, steps_per_run):
-    self._steps_per_run = steps_per_run
-
-  def begin(self):
-    self._summary_writer = SummaryWriterCache.get(self._checkpoint_dir)
-    self._global_step_tensor = training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
-    if self._global_step_tensor is None:
-      raise RuntimeError(
-          "Global step should be created to use CheckpointSaverHook.")
-    for l in self._listeners:
-      l.begin()
-
-  def after_create_session(self, session, coord):
-    global_step = session.run(self._global_step_tensor)
-
-    # We do write graph and saver_def at the first call of before_run.
-    # We cannot do this in begin, since we let other hooks to change graph and
-    # add variables in begin. Graph is finalized after all begin calls.
-    def _write_graph_fn(self):
-      training_util.write_graph(
-          ops.get_default_graph().as_graph_def(add_shapes=True),
-          self._checkpoint_dir, "graph.pbtxt")
-    self._write_graph_thread = threading.Thread(target=_write_graph_fn,
-                                                args=[self])
-    self._write_graph_thread.start()
-
-    saver_def = self._get_saver().saver_def if self._get_saver() else None
-    graph = ops.get_default_graph()
-    meta_graph_def = meta_graph.create_meta_graph_def(
-        graph_def=graph.as_graph_def(add_shapes=True), saver_def=saver_def)
-    self._summary_writer.add_graph(graph)
-    self._summary_writer.add_meta_graph(meta_graph_def)
-    # The checkpoint saved here is the state at step "global_step".
-    self._save(session, global_step)
-    self._timer.update_last_triggered_step(global_step)
-
-  def before_run(self, run_context):  # pylint: disable=unused-argument
-    return SessionRunArgs(self._global_step_tensor)
-
-  def after_run(self, run_context, run_values):
-    global_step = run_context.session.run(self._global_step_tensor)
-    if self._timer.should_trigger_for_step(global_step):
-      self._timer.update_last_triggered_step(global_step)
-      logging.info("Triggering checkpoint. %s", global_step)
-      if self._save(run_context.session, global_step):
-        run_context.request_stop()
-
-  def end(self, session):
-    if self._save_thread:
-      logging.info("Waiting for any pending checkpoints to finish.")
-      self._save_thread.join()
-    if self._write_graph_thread:
-      logging.info("Waiting for any pending write_graph to finish.")
-      self._write_graph_thread.join()
-
-    last_step = session.run(self._global_step_tensor)
-
-    if self._last_checkpoint_step != last_step:
-      self._save(session, last_step, asynchronous=False)
-
-    for l in self._listeners:
-      l.end(session, last_step)
-
-  def _save(self, session, step, asynchronous=True):
-    """Saves the latest checkpoint, returns should_stop."""
-
-    # Skip saving on step 0
-    if step == 0:
-      return
-
-    def _save_fn():
-      """Run the saver process."""
-      logging.info("Saving checkpoints for %d into %s.", step, self._save_path)
-
-      start_time = time.time()
-      for l in self._listeners:
-        l.before_save(session, step)
-
-      self._get_saver().save(session, self._save_path, global_step=step)
-      self._summary_writer.add_session_log(
-          SessionLog(
-              status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path),
-          step)
-
-      for l in self._listeners:
-        l.after_save(session, step)
-
-      end_time = time.time()
-      logging.info("Checkpoint actual writing time: (%.3f sec)",
-                   end_time - start_time)
-      logging.info("Checkpoint finished for %d into %s.", step, self._save_path)
-
-    if not asynchronous:
-      self._last_checkpoint_step = step
-      _save_fn()
-      return
-
-    if self._save_thread is not None:
-      self._save_thread.join(timeout=0.1)
-      if self._save_thread.is_alive():
-        logging.info("Saver thread still in progress, skipping checkpoint.")
-        return
-
-    self._last_checkpoint_step = step
-    self._save_thread = threading.Thread(target=_save_fn)
-    self._save_thread.start()
-
-  def _get_saver(self):
-    if self._saver is not None:
-      return self._saver
-    elif self._scaffold is not None:
-      return self._scaffold.saver
-
-    # Get saver from the SAVERS collection if present.
-    collection_key = ops.GraphKeys.SAVERS
-    savers = ops.get_collection(collection_key)
-    if not savers:
-      raise RuntimeError(
-          "No items in collection {}. Please add a saver to the collection "
-          "or provide a saver or scaffold.".format(collection_key))
-    elif len(savers) > 1:
-      raise RuntimeError(
-          "More than one item in collection {}. "
-          "Please indicate which one to use by passing it to the constructor."
-          .format(collection_key))
-
-    self._saver = savers[0]
-    return savers[0]
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.async_checkpoint import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16.py b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
index fa74f651aa63c72d14eb78c8af479263810e9b7d..f3d392a8daec2a80f974d90051324a02be002afd 100644
--- a/tensorflow/contrib/tpu/python/tpu/bfloat16.py
+++ b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
@@ -1,77 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Helper context for running models with bfloat16."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.util import tf_contextlib
-
-
-def _get_custom_getter():
-  """Returns a custom getter that this class's methods must be called under.
-
-  All methods of this class must be called under a variable scope that was
-  passed this custom getter. Example:
-
-  ```python
-  network = ConvNetBuilder(...)
-  with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
-    network.conv(...)
-    # Call more methods of network here
-  ```
-
-  Currently, this custom getter only does anything if self.use_tf_layers is
-  True. In that case, it causes variables to be stored as dtype
-  self.variable_type, then casted to the requested dtype, instead of directly
-  storing the variable as the requested dtype.
-  """
-
-  def inner_custom_getter(getter, *args, **kwargs):
-    """Custom getter that forces variables to have type self.variable_type."""
-    cast_to_bfloat16 = False
-    requested_dtype = kwargs['dtype']
-    if requested_dtype == dtypes.bfloat16:
-      # Only change the variable dtype if doing so does not decrease variable
-      # precision.
-      kwargs['dtype'] = dtypes.float32
-      cast_to_bfloat16 = True
-    var = getter(*args, **kwargs)
-    # This if statement is needed to guard the cast, because batch norm
-    # assigns directly to the return value of this custom getter. The cast
-    # makes the return value not a variable so it cannot be assigned. Batch
-    # norm variables are always in fp32 so this if statement is never
-    # triggered for them.
-    if cast_to_bfloat16:
-      var = math_ops.cast(var, dtypes.bfloat16)
-    return var
-
-  return inner_custom_getter
-
-
-@tf_contextlib.contextmanager
-def bfloat16_scope():
-  """Scope class for bfloat16 variables so that the model uses custom getter.
-
-  This enables variables to be read as bfloat16 type when using get_variable.
-  """
-  with variable_scope.variable_scope(
-      '', custom_getter=_get_custom_getter()) as varscope:
-    yield varscope
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.bfloat16 import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py
index bc0cd41d210ac6f8de1b20ebf744ee1e1dd04137..c20aac7e36aa31c5a9d88ca6fe02a8703f9ed5a3 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets.py
@@ -1,191 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Library of Cloud TPU helper functions for data loading."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.experimental.ops import interleave_ops
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.ops import readers
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import functional_ops
-
-
-def _TextLineDataset(filename):
-  buffer_size = 8 * 1024 * 1024  # 8 MiB per file
-  dataset = readers.TextLineDataset(filename, buffer_size=buffer_size)
-  return dataset
-
-
-def _TFRecordDataset(filename):
-  buffer_size = 8 * 1024 * 1024  # 8 MiB per file
-  dataset = readers.TFRecordDataset(filename, buffer_size=buffer_size)
-  return dataset
-
-
-_FILETYPE_MAP = {
-    'tfrecord': _TFRecordDataset,
-    'textline': _TextLineDataset,
-    'text': _TextLineDataset,
-}
-
-
-def StreamingFilesDataset(files,
-                          filetype=None,
-                          file_reader_job=None,
-                          worker_job=None,
-                          num_epochs=None,
-                          filename_shuffle_buffer_size=None,
-                          num_parallel_reads=None,
-                          batch_transfer_size=None,
-                          sloppy=None):
-  """StreamingFilesDataset constructs a dataset to stream from workers (GCE VM).
-
-  Because Cloud TPUs are allocated over the network, a Cloud TPU cannot read
-  files local to your GCE VM. In order to train using files stored on your local
-  VM (e.g. on local SSD for extreme performance), use the StreamingFilesDataset
-  helper to generate a dataset to feed your Cloud TPU with files from your GCE
-  VM.
-
-  The resulting dataset may return an OutOfRangeError if there are no files
-  found as a result of the fileglob expansion.
-
-  Note: StreamingFilesDataset assumes that the session is using a
-  TPUClusterResolver and has therefore a worker and a coordinator job. File
-  loading will be done on the coordinator job.
-
-  Args:
-    files: A string glob to match files, or a `tf.data.Dataset` generating file
-      names.
-    filetype: A string (one of 'tfrecord', or 'textline') or a single-argument
-      TensorFlow function that when given a filename returns a dataset.
-    file_reader_job: An optional string that corresponds to the job that should
-      perform the file reads.
-    worker_job: An optional string that corresponds to the job that should
-      process the tensors (i.e. your GPU or TPU worker).
-    num_epochs: The number of epochs through the training set that should be
-      generated. By default, it will repeat infinitely.
-    filename_shuffle_buffer_size: An optional integer whose value controls the
-      shuffling of the file names. If you would like to read from the files in
-      the same order, set to 0 or False.
-    num_parallel_reads: An optional integer controlling the number of files to
-      read from concurrently. (Set to 1 for no parallelism.)
-    batch_transfer_size: An optional integer controlling the batching used to
-      amortize the remote function invocation overhead. Set to a very large
-      number to increase throughput. Set to a very small number to reduce memory
-      consumption. Set to False to skip batching.
-    sloppy: (Optional.) If `False`, read input data while maintaining a
-      deterministic order. (This may have significant performance impacts.)
-      sloppy defaults to: True.
-  Returns:
-    A `tf.data.Dataset` with an infinite stream of elements generated by a
-    parallel interleaving of the set of files matched (or generated) by `files`
-    with a type is the output of the dataset specified by `filetype`.
-
-  Raises:
-    ValueError: if any argument is not of the expected type.
-  """
-  if filetype is None:
-    filetype = 'tfrecord'
-
-  if isinstance(filetype, str):
-    if filetype not in _FILETYPE_MAP:
-      raise ValueError('Unexpected filetype: %s' % filetype)
-    reader_fn = _FILETYPE_MAP[filetype]
-  elif callable(filetype):
-    reader_fn = filetype
-  else:
-    raise ValueError('filetype should be a string or a callable')
-
-  file_reader_job = file_reader_job or 'coordinator'
-
-  worker_job = worker_job or 'worker'
-
-  if filename_shuffle_buffer_size is None:
-    filename_shuffle_buffer_size = 4096
-
-  num_parallel_reads = num_parallel_reads or 8
-
-  if batch_transfer_size is None:
-    batch_transfer_size = 256
-
-  if sloppy is None:
-    sloppy = True
-
-  with ops.device('/job:%s' % file_reader_job):
-    if isinstance(files, str):
-      source_dataset = dataset_ops.Dataset.list_files(files)
-    elif isinstance(files, dataset_ops.DatasetV2):
-      source_dataset = files
-    else:
-      raise ValueError('files was not a string or a dataset: %s' % files)
-
-    if filename_shuffle_buffer_size:
-      source_dataset = source_dataset.shuffle(
-          buffer_size=filename_shuffle_buffer_size)
-
-    source_dataset = source_dataset.apply(
-        interleave_ops.parallel_interleave(
-            reader_fn, cycle_length=num_parallel_reads, sloppy=sloppy))
-
-    source_dataset = source_dataset.repeat(num_epochs)
-
-    if batch_transfer_size:
-      source_dataset = source_dataset.batch(batch_transfer_size)
-
-    source_dataset = source_dataset.prefetch(1)
-
-    source_iterator = dataset_ops.make_one_shot_iterator(source_dataset)
-    source_handle = source_iterator.string_handle()
-
-  @function.Defun(dtypes.string)
-  def LoadingFunc(h):
-    remote_iterator = iterator_ops.Iterator.from_string_handle(
-        h, source_dataset.output_types, source_dataset.output_shapes)
-    return remote_iterator.get_next()
-
-  def MapFn(unused_input):
-    if isinstance(source_dataset.output_types, dtypes.DType):
-      output_types = [source_dataset.output_types]
-    elif isinstance(source_dataset.output_types, (list, tuple)):
-      output_types = source_dataset.output_types
-    else:
-      raise ValueError('source dataset has invalid output types')
-    remote_calls = functional_ops.remote_call(
-        args=[source_handle],
-        Tout=output_types,
-        f=LoadingFunc,
-        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)
-    if len(remote_calls) == 1:
-      return remote_calls[0]
-    else:
-      return remote_calls
-
-  with ops.device('/job:%s' % worker_job):
-    output_dataset = dataset_ops.Dataset.range(2).repeat().map(
-        MapFn, num_parallel_calls=4 if sloppy else None)
-    output_dataset = output_dataset.prefetch(1)
-
-    if batch_transfer_size:
-      # Undo the batching used during the transfer.
-      output_dataset = output_dataset.apply(batching.unbatch()).prefetch(1)
-
-  return output_dataset
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.datasets import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/device_assignment.py b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
index 3313dc749c2c7606101b2dc96614df2d052dfed1..05dffef3a1efdae2ad7306ca5ad3bc7a9eac04cf 100644
--- a/tensorflow/contrib/tpu/python/tpu/device_assignment.py
+++ b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
@@ -1,313 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Library of TPU helper functions."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.contrib.tpu.python.tpu.topology import Topology
-
-
-SINGLE_CORE_ASSIGNMENT = [[[0, 0, 0]]]
-
-
-def _compute_task_and_cores_to_replicas(core_assignment, topology):
-  """Computes a nested dict which maps task and logical core to replicas."""
-  task_and_cores_to_replicas = {}
-  for replica in xrange(core_assignment.shape[0]):
-    for logical_core in xrange(core_assignment.shape[1]):
-      coordinates = core_assignment[replica, logical_core, :]
-      task_id = topology.task_ordinal_at_coordinates(coordinates)
-      if task_id not in task_and_cores_to_replicas:
-        task_and_cores_to_replicas[task_id] = {}
-      if logical_core not in task_and_cores_to_replicas[task_id]:
-        task_and_cores_to_replicas[task_id][logical_core] = set()
-
-      task_and_cores_to_replicas[task_id][logical_core].add(replica)
-
-  task_to_sorted_replica_id = {}
-
-  for task, core_to_replicas in task_and_cores_to_replicas.items():
-    core_to_sorted_replicas = {}
-    for core, replicas in core_to_replicas.items():
-      core_to_sorted_replicas[core] = sorted(replicas)
-
-    task_to_sorted_replica_id[task] = core_to_sorted_replicas
-  return task_to_sorted_replica_id
-
-
-class DeviceAssignment(object):
-  """Mapping from logical cores in a computation to the physical TPU topology.
-
-  Prefer to use the `device_assignment()` helper to construct a
-  `DeviceAssignment`; it is easier if less flexible than constructing a
-  `DeviceAssignment` directly.
-  """
-
-  def __init__(self, topology, core_assignment):
-    """Constructs a `DeviceAssignment` object.
-
-    Args:
-      topology: A `Topology` object that describes the physical TPU topology.
-      core_assignment: A logical to physical core mapping, represented as a
-        rank 3 numpy array. See the description of the `core_assignment`
-        property for more details.
-
-    Raises:
-      ValueError: If `topology` is not `Topology` object.
-      ValueError: If `core_assignment` is not a rank 3 numpy array.
-    """
-    if not isinstance(topology, Topology):
-      raise ValueError("topology must be a Topology object, got {}".format(
-          type(topology)))
-    core_assignment = np.asarray(core_assignment, dtype=np.int32)
-
-    self._topology = topology
-
-    if core_assignment.ndim != 3:
-      raise ValueError("core_assignment must be a rank 3 numpy array, "
-                       "got shape {}".format(core_assignment.shape))
-
-    self._num_replicas = core_assignment.shape[0]
-    self._num_cores_per_replica = core_assignment.shape[1]
-
-    if core_assignment.shape[-1] != topology.mesh_rank:
-      raise ValueError(
-          "minor dimension of core_assignment must have size equal to topology "
-          "rank ({}), got shape {}".format(topology.mesh_rank,
-                                           core_assignment.shape))
-
-    self._core_assignment = core_assignment
-    self._task_and_cores_to_replicas = _compute_task_and_cores_to_replicas(
-        self._core_assignment, topology)
-
-  @property
-  def topology(self):
-    """A `Topology` that describes the TPU topology."""
-    return self._topology
-
-  @property
-  def num_cores_per_replica(self):
-    """The number of cores per replica."""
-    return self._num_cores_per_replica
-
-  @property
-  def num_replicas(self):
-    """The number of replicas of the computation."""
-    return self._num_replicas
-
-  @property
-  def core_assignment(self):
-    """The logical to physical core mapping.
-
-    Returns:
-      An integer numpy array of rank 3, with shape
-      `[num_replicas, num_cores_per_replica, topology_rank]`. Maps
-      (replica, logical core) pairs to physical topology coordinates.
-    """
-    return self._core_assignment
-
-  def _coordinates(self, replica, logical_core):
-    """Returns the physical topology coordinates of a logical core."""
-    return tuple(self.core_assignment[replica, logical_core, :])
-
-  def lookup_replicas(self, task_id, logical_core):
-    """Lookup replica ids by task number and logical core.
-
-    Args:
-      task_id: TensorFlow task number.
-      logical_core: An integer, identifying a logical core.
-    Returns:
-      A sorted list of the replicas that are attached to that task and
-      logical_core.
-    Raises:
-      ValueError: If no replica exists in the task which contains the logical
-      core.
-    """
-    try:
-      return self._task_and_cores_to_replicas[task_id][logical_core]
-    except KeyError:
-      raise ValueError(
-          "Can not find any replica in task: {} contains logical_core: {} ".
-          format(task_id, logical_core))
-
-  def tpu_ordinal(self, replica=0, logical_core=0):
-    """Returns the ordinal of the TPU device assigned to a logical core."""
-    coordinates = self._coordinates(replica, logical_core)
-    return self._topology.tpu_device_ordinal_at_coordinates(coordinates)
-
-  def host_device(self, replica=0, logical_core=0, job=None):
-    """Returns the CPU device attached to a logical core."""
-    coordinates = self._coordinates(replica, logical_core)
-    return self._topology.cpu_device_name_at_coordinates(coordinates, job=job)
-
-  def tpu_device(self, replica=0, logical_core=0, job=None):
-    """Returns the name of the TPU device assigned to a logical core."""
-    coordinates = self._coordinates(replica, logical_core)
-    return self._topology.tpu_device_name_at_coordinates(coordinates, job=job)
-
-
-def device_assignment(topology,
-                      computation_shape=None,
-                      computation_stride=None,
-                      num_replicas=1):
-  """Computes a device_assignment of a computation across a TPU topology.
-
-  Attempts to choose a compact grid of cores for locality.
-
-  Returns a `DeviceAssignment` that describes the cores in the topology assigned
-  to each core of each replica.
-
-  `computation_shape` and `computation_stride` values should be powers of 2 for
-  optimal packing.
-
-  Args:
-    topology: A `Topology` object that describes the TPU cluster topology.
-      To obtain a TPU topology, evaluate the `Tensor` returned by
-      `initialize_system` using `Session.run`. Either a serialized
-      `TopologyProto` or a `Topology` object may be passed. Note: you must
-      evaluate the `Tensor` first; you cannot pass an unevaluated `Tensor` here.
-    computation_shape: A rank 1 int32 numpy array with size equal to the
-      topology rank, describing the shape of the computation's block of cores.
-      If None, the `computation_shape` is `[1] * topology_rank`.
-    computation_stride: A rank 1 int32 numpy array of size `topology_rank`,
-      describing the inter-core spacing of the `computation_shape` cores in the
-      TPU topology. If None, the `computation_stride` is `[1] * topology_rank`.
-    num_replicas: The number of computation replicas to run. The replicas will
-      be packed into the free spaces of the topology.
-
-  Returns:
-    A DeviceAssignment object, which describes the mapping between the logical
-    cores in each computation replica and the physical cores in the TPU
-    topology.
-
-  Raises:
-    ValueError: If `topology` is not a valid `Topology` object.
-    ValueError: If `computation_shape` or `computation_stride` are not 1D int32
-      numpy arrays with shape [3] where all values are positive.
-    ValueError: If computation's replicas cannot fit into the TPU topology.
-  """
-  # Deserialize the Topology proto, if it is a string.
-  if isinstance(topology, bytes):
-    topology = Topology(serialized=topology)
-
-  if not isinstance(topology, Topology):
-    raise ValueError("`topology` is not a Topology object; got {}".format(
-        type(topology)))
-
-  topology_rank = len(topology.mesh_shape)
-  mesh_shape = topology.mesh_shape
-  if computation_shape is None:
-    computation_shape = np.array([1] * topology_rank, dtype=np.int32)
-  else:
-    computation_shape = np.asarray(computation_shape, dtype=np.int32)
-
-  if computation_stride is None:
-    computation_stride = np.array([1] * topology_rank, dtype=np.int32)
-  else:
-    computation_stride = np.asarray(computation_stride, dtype=np.int32)
-
-  if computation_shape.shape != (topology_rank,):
-    raise ValueError("computation_shape must have shape [{}]; got {}".format(
-        topology_rank, computation_shape.shape))
-  if computation_stride.shape != (topology_rank,):
-    raise ValueError("computation_stride must have shape [{}]; got {}".format(
-        topology_rank, computation_stride.shape))
-
-  if any(computation_shape < 1):
-    raise ValueError(
-        "computation_shape must be positive; got computation_shape={}".format(
-            computation_shape))
-  if any(computation_stride < 1):
-    raise ValueError(
-        "computation_stride must be positive; got computation_stride={}".format(
-            computation_stride))
-
-  # Computes the physical size of one computation instance.
-  computation_footprint = computation_shape * computation_stride
-  if any(computation_footprint > mesh_shape):
-    raise ValueError(
-        "computation footprint {} does not fit in TPU topology shape {}".format(
-            computation_footprint, mesh_shape))
-
-  # Computes how many copies of the computation footprint fit in the mesh.
-  block_counts = mesh_shape // computation_footprint
-
-  replica_counts = block_counts * computation_stride
-  max_replicas = np.prod(replica_counts)
-  if num_replicas > max_replicas:
-    raise ValueError(
-        "requested {} replicas but only {} replicas with shape {} and "
-        "computation_stride {} fit in a TPU mesh of shape {}".format(
-            num_replicas, max_replicas, computation_shape, computation_stride,
-            mesh_shape))
-
-  def ceil_of_ratio(n, m):
-    return (n + m - 1) // m
-
-  replica_shape = [0] * topology_rank
-  if num_replicas > 0:
-    remaining_replicas = num_replicas
-    remaining_dims = topology_rank
-
-    # Choose dimensions as close to an equal cube as possible, in order of
-    # increasing dimension size. By visiting dimensions in increasing size, we
-    # assign the most constrained dimension first, so we won't make infeasible
-    # choices.
-    #
-    # As a secondary sort order, visit the dimensions in reverse order. This
-    # means we try to use both cores on the same chip in preference to two cores
-    # on different chips.
-    for x, ni in sorted(((x, -i) for (i, x) in enumerate(replica_counts))):
-      i = -ni
-      target_size = int(math.ceil(remaining_replicas**(1.0 / remaining_dims)))
-      replica_shape[i] = min(target_size, x)
-      remaining_replicas = ceil_of_ratio(remaining_replicas, replica_shape[i])
-      remaining_dims -= 1
-
-    assert remaining_replicas == 1 and remaining_dims == 0
-
-  # Assigns an offset to each replica such that no two replicas overlap.
-  replica_offsets = np.full([num_replicas, topology_rank], -1, dtype=np.int32)
-  for replica in xrange(num_replicas):
-    # Chooses a replica number in each axis.
-    t = replica
-    pos = []
-    for dim in replica_shape[::-1]:
-      pos.append(t % dim)
-      t //= dim
-    replica_pos = np.array(pos[::-1], dtype=np.int32)
-
-    # Determines where that replica starts in each axis.
-    outer = replica_pos // computation_stride
-    inner = replica_pos % computation_stride
-    replica_offsets[replica, :] = outer * computation_footprint + inner
-
-  # Computes a complete logical core -> physical core mapping for each replica.
-  indices = [
-      np.arange(0, computation_shape[i] * computation_stride[i],
-                computation_stride[i]) for i in xrange(topology_rank)
-  ]
-  indices = np.concatenate(
-      [i[..., np.newaxis] for i in np.meshgrid(*indices, indexing="ij")],
-      axis=-1)
-  indices = indices.reshape((-1, topology_rank))
-  assignment = indices + replica_offsets[:, np.newaxis, :]
-  return DeviceAssignment(topology, core_assignment=assignment)
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.device_assignment import *
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/error_handling.py b/tensorflow/contrib/tpu/python/tpu/error_handling.py
index 52e1ea42370d653d1de7c12eee4b456ec7ce921c..1b1328b4075d9a737e40693c13e33e0b7c1fbedf 100644
--- a/tensorflow/contrib/tpu/python/tpu/error_handling.py
+++ b/tensorflow/contrib/tpu/python/tpu/error_handling.py
@@ -1,132 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""ErrorRendezvous handler for collecting errors from multiple threads."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-import sys
-import threading
-import time
-
-import six
-
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import tf_logging as logging
-
-_UNINTERESTING_ERRORS = (errors.CancelledError,)
-
-
-class ErrorRendezvous(object):
-  """Resolve errors from multiple threads during TPU execution.
-
-  TPU errors can occur on the infeed or outfeed threads as well as the main
-  training thread.
-
-  Depending on which thread "wins" and receives the session error first, we may
-  end up showing users a confusing and non-actionable error message (session
-  cancelled) instead of a root cause (e.g. a bad filename).
-
-  The rendezvous object provides a location to capture these errors until all
-  threads terminate.  At that point we can choose the most informative error
-  to report.
-  """
-
-  def __init__(self, num_sources):
-    # string -> (message, traceback)
-    self._errors = {}
-    self._num_sources = num_sources
-    self._session_cancel_timer = None
-
-  def record_error(self, source, exc_info, session=None):
-    """Report an exception from the given source.
-
-    If a session is passed, a timer will be registered to close it after a few
-    seconds.  This is necessary to ensure the main training loop does not hang
-    if an infeed/oufeed error occurs.  We sleep a few seconds to allow a more
-    interesting error from another thread to propagate.
-
-    Args:
-      source: string, source of the error
-      exc_info: Output from `sys.exc_info` (type, value, traceback)
-      session: Session to close after delay.
-    """
-    _, value, _ = exc_info
-    self._errors[source] = exc_info
-    logging.info('Error recorded from %s: %s', source, value)
-
-    if session is not None and self._session_cancel_timer is None:
-
-      def _cancel_session():
-        time.sleep(5)
-        try:
-          session.close()
-        except:  # pylint: disable=bare-except
-          pass
-
-      self._session_cancel_timer = threading.Thread(target=_cancel_session,)
-      self._session_cancel_timer.daemon = True
-      self._session_cancel_timer.start()
-
-  def record_done(self, source):
-    """Mark execution source `source` as done.
-
-    If an error was originally reported from `source` it is left intact.
-
-    Args:
-      source: `str`, source being recorded
-    """
-    logging.info('%s marked as finished', source)
-    if source not in self._errors:
-      self._errors[source] = None
-
-  @contextlib.contextmanager
-  def catch_errors(self, source, session=None):
-    """Context manager to report any errors within a block."""
-    try:
-      yield
-    except Exception:  # pylint: disable=broad-except
-      self.record_error(source, sys.exc_info(), session)
-
-  def raise_errors(self, timeout_sec=0):
-    """Wait for up to `timeout` seconds for all error sources to finish.
-
-    Preferentially raise "interesting" errors (errors not in the
-    _UNINTERESTING_ERRORS) set.
-
-    Args:
-      timeout_sec: Seconds to wait for other error sources.
-    """
-    for _ in range(timeout_sec):
-      if len(self._errors) == self._num_sources:
-        break
-      time.sleep(1)
-
-    kept_errors = [(k, v) for (k, v) in self._errors.items() if v is not None]
-
-    # First check for any interesting errors, then fall back on the session
-    # cancelled errors etc.
-    for k, (typ, value, traceback) in kept_errors:
-      if isinstance(value, _UNINTERESTING_ERRORS):
-        continue
-      else:
-        logging.warn('Reraising captured error')
-        six.reraise(typ, value, traceback)
-
-    for k, (typ, value, traceback) in kept_errors:
-      logging.warn('Reraising captured error')
-      six.reraise(typ, value, traceback)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.error_handling import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/feature_column.py b/tensorflow/contrib/tpu/python/tpu/feature_column.py
index 8edf131bc24fd003806263570b63ee8514c49896..ded75e975b10c4265370af260bf804687c9caebc 100644
--- a/tensorflow/contrib/tpu/python/tpu/feature_column.py
+++ b/tensorflow/contrib/tpu/python/tpu/feature_column.py
@@ -1,429 +1,30 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""TPU Feature Column Library."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-from tensorflow.python.feature_column import feature_column as fc
-from tensorflow.python.feature_column import feature_column_lib as fc_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import variable_scope
-# pylint: disable=protected-access
-
-
-_TPU_FC_TO_SCOPE = '_tpu_feature_column_scope'
-_SUPPORTED_CATEGORICAL_COLUMNS = (fc._IdentityCategoricalColumn,
-                                  fc._VocabularyFileCategoricalColumn,
-                                  fc._VocabularyListCategoricalColumn,
-                                  fc._WeightedCategoricalColumn,
-                                  fc_lib.IdentityCategoricalColumn,
-                                  fc_lib.VocabularyFileCategoricalColumn,
-                                  fc_lib.VocabularyListCategoricalColumn,
-                                  fc_lib.WeightedCategoricalColumn)
-
-
-def embedding_column(categorical_column,
-                     dimension,
-                     combiner='mean',
-                     initializer=None):
-  """TPU embedding_column for `tf.feature_column.embedding_column`.
-
-  Note that the interface for TPU embedding_column is different from the non-TPU
-  version. The following args available for the non-TPU version are NOT
-  supported: ckpt_to_load_from, tensor_name_in_ckp, max_norm and trainable.
-
-  Args:
-    categorical_column: A categorical_column returned from
-        categorical_column_with_identity,  weighted_categorical_column,
-        categorical_column_with_vocabulary_list or
-        categorical_column_with_vocabulary_file.
-    dimension: An integer specifying dimension of the embedding, must be > 0.
-    combiner: A string specifying how to reduce if there are multiple entries
-      in a single row. For more information, see
-      `tf.feature_column.embedding_column`.
-    initializer: A variable initializer function to be used in embedding
-      variable initialization. If not specified, defaults to
-      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
-      `1/sqrt(dimension)`.
-
-  Returns:
-    A  _TPUEmbeddingColumn.
-
-  Raises:
-    ValueError: if `dimension` not > 0.
-    ValueError: if `initializer` is specified but not callable.
-  """
-  if not isinstance(categorical_column, _SUPPORTED_CATEGORICAL_COLUMNS):
-    raise TypeError(
-        'categorical_column for tpu '
-        ' embedding_column must be type %s, got %s.' % (' or '.join([
-            cc.__name__ for cc in _SUPPORTED_CATEGORICAL_COLUMNS
-        ]), type(categorical_column)))
-  if (dimension is None) or (dimension < 1):
-    raise ValueError('Invalid dimension {}.'.format(dimension))
-
-  if (initializer is not None) and (not callable(initializer)):
-    raise ValueError('initializer must be callable if specified. '
-                     'Embedding of column_name: {}'.format(
-                         categorical_column.name))
-  if initializer is None:
-    initializer = init_ops.truncated_normal_initializer(
-        mean=0.0, stddev=1 / math.sqrt(dimension))
-
-  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access
-
-  def _creator(weight_collections, scope):
-    embedding_column_layer = fc._EmbeddingColumnLayer(
-        embedding_shape=embedding_shape,
-        initializer=initializer,
-        weight_collections=weight_collections,
-        trainable=True,
-        name='embedding_column_layer')
-    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable
-
-  column = _TPUEmbeddingColumn(
-      categorical_column=categorical_column,
-      dimension=dimension,
-      combiner=combiner,
-      layer_creator=_creator,
-      ckpt_to_load_from=None,
-      tensor_name_in_ckpt=None,
-      max_norm=None,
-      trainable=True)
-  # For Embedding column, the initializer is hidden inside the creator Fn, which
-  # is not accessiable later. So, we attach it to a speicial field. Also note
-  # that non-TPU Embedding column and non-TPU shared Embedding column handle the
-  # initializer differently. See shared_embedding_columns for details.
-  column._tpu_initializer = initializer
-  return column
-
-
-def shared_embedding_columns(categorical_columns,
-                             dimension,
-                             combiner='mean',
-                             initializer=None,
-                             shared_embedding_collection_name=None):
-  """List of dense columns that convert from sparse, categorical input."""
-  for categorical_column in categorical_columns:
-    if not isinstance(categorical_column, _SUPPORTED_CATEGORICAL_COLUMNS):
-      raise TypeError(
-          'categorical_column for tpu '
-          ' shared_embedding_columns must be type %s, got %s.' % (' or '.join([
-              cc.__name__ for cc in _SUPPORTED_CATEGORICAL_COLUMNS
-          ]), type(categorical_column)))
-  columns = fc_lib.shared_embedding_columns(
-      categorical_columns,
-      dimension,
-      combiner=combiner,
-      initializer=initializer,
-      shared_embedding_collection_name=shared_embedding_collection_name,
-      ckpt_to_load_from=None,
-      tensor_name_in_ckpt=None,
-      max_norm=None,
-      trainable=True)
-
-  # Use the initializer and shared_embedding_collection_name to create TPU
-  # version
-  initializer = columns[0].initializer
-  shared_embedding_collection_name = columns[0].shared_embedding_collection_name
-  tpu_columns = []
-
-  # Create the state (_SharedEmbeddingColumnLayer) here.
-  for categorical_column in categorical_columns:
-    column = _TPUSharedEmbeddingColumn(
-        categorical_column=categorical_column,
-        dimension=dimension,
-        combiner=combiner,
-        initializer=initializer,
-        shared_embedding_collection_name=shared_embedding_collection_name,
-        ckpt_to_load_from=None,
-        tensor_name_in_ckpt=None,
-        max_norm=None,
-        trainable=True)
-    tpu_columns.append(column)
-
-  return tpu_columns
-
-
-class _TPUBaseEmbeddingColumn(object):
-  """Base class for TPU Embedding Column."""
-
-  def __init__(self, categorical_column):
-    self._tpu_categorical_column = categorical_column
-
-  def get_combiner(self):
-    """Returns the embedding combiner."""
-    raise NotImplementedError('not implemented')
-
-  def get_embedding_table_size(self):
-    """Returns the embedding table size, tuple of vocab size and dimension."""
-    raise NotImplementedError('not implemented')
-
-  def get_feature_key_name(self):
-    """Returns the feature key name in the features dict."""
-    raise NotImplementedError('not impl')
-
-  def get_weight_key_name(self):
-    """Return the key name for weights."""
-    raise NotImplementedError('not impl')
-
-  def get_embedding_var_name(self):
-    """Returns the embedding variable name.
-
-    Feature key name and embedding variable name are usually one-to-one mapping.
-    But for shared embedding columns, it is many-to-one mapping.
-    """
-    raise NotImplementedError('not impl')
-
-  def get_initializer(self):
-    """Returns the initializer."""
-    raise NotImplementedError('not impl')
-
-  def is_categorical_column_weighted(self):
-    """Check if the categorical column of the embedding column is weighted."""
-    raise NotImplementedError('not impl')
-
-
-class _TPUEmbeddingColumn(_TPUBaseEmbeddingColumn, fc._EmbeddingColumn):
-  """Core Embedding Column."""
-
-  def __new__(cls,
-              categorical_column,
-              dimension,
-              combiner='mean',
-              layer_creator=None,
-              ckpt_to_load_from=None,
-              tensor_name_in_ckpt=None,
-              max_norm=None,
-              trainable=True):
-    # Note, args ckpt_to_load_from, tensor_name_in_ckpt, max_norm and trainable
-    # are not supported on TPU. They are solely for matching the signature of
-    # __new__ of parent class fc._EmbeddingColumn.
-    return fc._EmbeddingColumn.__new__(
-        cls,
-        categorical_column,
-        dimension,
-        combiner=combiner,
-        layer_creator=layer_creator,
-        ckpt_to_load_from=ckpt_to_load_from,
-        tensor_name_in_ckpt=tensor_name_in_ckpt,
-        max_norm=max_norm,
-        trainable=trainable)
-
-  def __init__(self,
-               categorical_column,
-               dimension,
-               combiner='mean',
-               layer_creator=None,
-               ckpt_to_load_from=None,
-               tensor_name_in_ckpt=None,
-               max_norm=None,
-               trainable=True):
-    _TPUBaseEmbeddingColumn.__init__(self, categorical_column)
-    self._key = None
-
-  def get_combiner(self):
-    return self.combiner
-
-  def get_embedding_table_size(self):
-    """Returns num_ids and width."""
-    return (self.categorical_column._num_buckets, self.dimension)
-
-  def get_feature_key_name(self):
-    """get_feature_key_name."""
-    if self.is_categorical_column_weighted():
-      return self.categorical_column.categorical_column.name
-    return self.categorical_column.name
-
-  def get_weight_key_name(self):
-    """get_weight_key_name."""
-    if self.is_categorical_column_weighted():
-      return self.categorical_column.weight_feature_key
-    return None
-
-  def get_embedding_var_name(self):
-    """get_embedding_var_name."""
-    return self.categorical_column.name
-
-  def get_initializer(self):
-    return self._tpu_initializer
-
-  def is_categorical_column_weighted(self):
-    """Check if the categorical column of the embedding column is weighted."""
-    if isinstance(
-        self.categorical_column,
-        (
-            fc._WeightedCategoricalColumn,  # pylint: disable=protected-access
-            fc_lib.WeightedCategoricalColumn)):
-      return True
-    return False
-
-  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
-    if tpu.under_tpu_inference_context():
-      def host_computation():
-        return fc._EmbeddingColumn._get_dense_tensor(
-            self, inputs, weight_collections, trainable)
-      return tpu.outside_compilation(host_computation)
-
-    if _is_running_on_cpu():
-      return fc._EmbeddingColumn._get_dense_tensor(
-          self, inputs, weight_collections, trainable)
-
-    # TPU mode
-    # Get the embeddings from the LazyBuilder.
-    tensor = inputs.get(self.get_feature_key_name())
-
-    # Add to collection for _create_tpu_embedding_variables_and_ops
-    _record_variable_scope_and_name(self.get_embedding_var_name(),
-                                    'embedding_weights')
-
-    return tensor
-
-
-class _TPUSharedEmbeddingColumn(_TPUBaseEmbeddingColumn,
-                                fc._SharedEmbeddingColumn):
-  """Core Shared Embedding Column."""
-
-  def __new__(cls,
-              categorical_column,
-              dimension,
-              combiner='mean',
-              initializer=None,
-              shared_embedding_collection_name=None,
-              ckpt_to_load_from=None,
-              tensor_name_in_ckpt=None,
-              max_norm=None,
-              trainable=True):
-    return fc._SharedEmbeddingColumn.__new__(
-        cls,
-        categorical_column,
-        dimension,
-        combiner=combiner,
-        initializer=initializer,
-        shared_embedding_collection_name=shared_embedding_collection_name,
-        ckpt_to_load_from=ckpt_to_load_from,
-        tensor_name_in_ckpt=tensor_name_in_ckpt,
-        max_norm=max_norm,
-        trainable=trainable)
-
-  def __init__(self,
-               categorical_column,
-               dimension,
-               combiner='mean',
-               initializer=None,
-               shared_embedding_collection_name=None,
-               ckpt_to_load_from=None,
-               tensor_name_in_ckpt=None,
-               max_norm=None,
-               trainable=True):
-
-    _TPUBaseEmbeddingColumn.__init__(self, categorical_column)
-    self._key = None
-
-  def get_combiner(self):
-    return self.combiner
-
-  def get_embedding_table_size(self):
-    """Returns num_ids and width."""
-    return (self.categorical_column._num_buckets, self.dimension)
-
-  def get_feature_key_name(self):
-    """get_feature_key_name."""
-    if self.is_categorical_column_weighted():
-      return self.categorical_column.categorical_column.name
-    return self.categorical_column.name
-
-  def get_weight_key_name(self):
-    """get_weight_key_name."""
-    if self.is_categorical_column_weighted():
-      return self.categorical_column.weight_feature_key
-    return None
-
-  def get_embedding_var_name(self):
-    """get_embedding_var_name."""
-    return self.shared_embedding_collection_name
-
-  def get_initializer(self):
-    return self.initializer
-
-  def is_categorical_column_weighted(self):
-    """Check if the categorical column of the embedding column is weighted."""
-    if isinstance(
-        self.categorical_column,
-        (
-            fc._WeightedCategoricalColumn,  # pylint: disable=protected-access
-            fc_lib.WeightedCategoricalColumn)):
-      return True
-    return False
-
-  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
-    if tpu.under_tpu_inference_context():
-      def host_computation():
-        return fc._SharedEmbeddingColumn._get_dense_tensor(
-            self, inputs, weight_collections, trainable)
-      return tpu.outside_compilation(host_computation)
-
-    if _is_running_on_cpu():
-      return fc._SharedEmbeddingColumn._get_dense_tensor(
-          self, inputs, weight_collections, trainable)
-
-    # TPU mode
-    # Get the embeddings from the LazyBuilder.
-    tensor = inputs.get(self.get_feature_key_name())
-
-    # Add to collection for _create_tpu_embedding_variables_and_ops
-    _record_variable_scope_and_name(
-        self.get_embedding_var_name(),
-        'embedding_weights',
-        is_shared_embedding=True)
-    return tensor
-
-
-def _record_variable_scope_and_name(embedding_var_name,
-                                    embedding_var_name_in_fc,
-                                    is_shared_embedding=False):
-  """Add embedding variable name and scope to collection."""
-  g = ops.get_default_graph()
-  collection = g.get_collection_ref(_TPU_FC_TO_SCOPE)
-  if not collection:
-    collection.append({})
-
-  var_def_dict = collection[0]
-
-  captured_scope = None
-
-  if is_shared_embedding and (embedding_var_name in var_def_dict):
-    if var_def_dict[embedding_var_name][1] != embedding_var_name_in_fc:
-      raise ValueError(
-          'For embedding var name {}, the shared embedding name is different, '
-          'got {}; expected {}'.format(embedding_var_name,
-                                       embedding_var_name_in_fc,
-                                       var_def_dict[embedding_var_name][1]))
-  else:
-    # scope contains var_scope_name.
-    captured_scope = variable_scope.get_variable_scope()
-    var_def_dict[embedding_var_name] = (captured_scope,
-                                        embedding_var_name_in_fc)
-
-
-def _is_running_on_cpu():
-  """Returns True if the current context is CPU model."""
-  return tpu_function.get_tpu_context().number_of_shards is None
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.feature_column import *
+# used by tests
+from tensorflow.python.tpu.feature_column import _is_running_on_cpu
+from tensorflow.python.tpu.feature_column import _record_variable_scope_and_name
+from tensorflow.python.tpu.feature_column import _TPU_FC_TO_SCOPE
+from tensorflow.python.tpu.feature_column import _TPUBaseEmbeddingColumn
+from tensorflow.python.tpu.feature_column import _TPUEmbeddingColumn
+from tensorflow.python.tpu.feature_column import _TPUSharedEmbeddingColumn
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/functional.py b/tensorflow/contrib/tpu/python/tpu/functional.py
index 24c85156e53a9b770f811c4cf3b903eab6553c76..9a5759221ed9660200cc213df69961db56f8d490 100644
--- a/tensorflow/contrib/tpu/python/tpu/functional.py
+++ b/tensorflow/contrib/tpu/python/tpu/functional.py
@@ -1,39 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-"""Functional operations."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import platform
-
-from tensorflow.contrib.tpu.python.tpu import gen_functional_ops
-
-
-TPUPartitionedCall = gen_functional_ops._tpu_partitioned_call  # pylint: disable=invalid-name,protected-access
-
-
-if platform.system() != "Windows":
-  # pylint: disable=wildcard-import,unused-import,g-import-not-at-top
-  from tensorflow.contrib.tpu.ops.gen_tpu_ordinal_selector_op import *
-
-  from tensorflow.contrib.util import loader
-  from tensorflow.python.platform import resource_loader
-  # pylint: enable=wildcard-import,unused-import,g-import-not-at-top
-
-  _tpu_partitioned_call_op = loader.load_op_library(
-      resource_loader.get_path_to_datafile("../ops/_functional_ops.so")
-  )
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.functional import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index 37fe9af8c4b154a2e20a957f6ca5d97df3d413be..6ad4e45e9625f191bb4c01f70b434dc2c4fba638 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -55,8 +55,6 @@ import numpy as np
 import six
 
 from tensorflow.contrib.cluster_resolver.python.training import tpu_cluster_resolver as tpu_cluster_resolver_lib
-from tensorflow.contrib.framework.python.framework import experimental
-from tensorflow.contrib.tpu.proto import compilation_result_pb2 as tpu_compilation_result
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import keras_tpu_variables
 from tensorflow.contrib.tpu.python.tpu import tpu
@@ -64,6 +62,7 @@ from tensorflow.contrib.tpu.python.tpu import tpu_function
 from tensorflow.contrib.tpu.python.tpu import tpu_optimizer
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf.tpu import compilation_result_pb2 as tpu_compilation_result
 from tensorflow.python.client import session as tf_session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
@@ -94,6 +93,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.deprecation import deprecated
 
 
 # TODO(b/114775106): temporary shim to optionally initialize the TPU
@@ -2172,7 +2172,10 @@ Output shape: %(output_shape)s
 # pylint: enable=bad-continuation
 
 
-@experimental
+@deprecated(
+    '2019-02-20', 'Switch to tf.contrib.distribute.TPUStrategy. '
+    'https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy'
+)
 def tpu_model(model, strategy=None):
   """Copy `model` along with weights to the TPU.
 
diff --git a/tensorflow/contrib/tpu/python/tpu/session_support.py b/tensorflow/contrib/tpu/python/tpu/session_support.py
index 5cb2ca6478a1d7589cd2aa2d52c82306b3fd11f4..ed8f9525c9b91208d39805654b01837abdbf3a77 100644
--- a/tensorflow/contrib/tpu/python/tpu/session_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/session_support.py
@@ -1,438 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the 'License');
+# Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Operations for handling session logging and shutdown notifications."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import threading
-
-import time
-from google.protobuf import text_format
-
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.util import event_pb2
-from tensorflow.python.client import session as session_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training_util
-
-_WATCHDOG = None
-
-
-class CoordinatorShutdownException(Exception):
-  """Raised when the coordinator needs to shutdown."""
-  pass
-
-
-def _clone_session(session, graph=None):
-  return session_lib.Session(
-      target=session.sess_str,
-      config=session._config,  # pylint: disable=protected-access
-      graph=graph if graph else session.graph)
-
-
-def _make_heartbeat_op(session, device, request_ph):
-  """Return a heartbeat op or None if heartbeats are not supported by device."""
-  try:
-    # Test if we can connect in a isolated graph + session
-    with ops.Graph().as_default():
-      with _clone_session(session) as temp_session:
-        with ops.device(device):
-          heartbeat_op = tpu_ops.worker_heartbeat('')
-          options = config_pb2.RunOptions(timeout_in_ms=5000)
-          temp_session.run(heartbeat_op, options=options)
-  except errors.InvalidArgumentError as _:
-    logging.warning('Error running heartbeat on %s', device)
-    return None
-  except errors.DeadlineExceededError as _:
-    logging.warning('Timeout connecting to %s when testing heartbeat', device)
-    return None
-
-  # If we successfully connected and pinged the worker, go ahead and construct
-  # the operation.
-  with ops.device(device):
-    return tpu_ops.worker_heartbeat(request_ph)
-
-
-class WorkerHeartbeatManager(object):
-  """Manages the status/heartbeat monitor for a set of workers."""
-
-  def __init__(self, session, devices, heartbeat_ops, request_placeholder):
-    """Construct a new WorkerHeartbeatManager.
-
-    (Prefer using `WorkerHeartbeatManager.from_devices` when possible.)
-
-    Args:
-      session: `tf.Session`, session to use for heartbeat operations.
-      devices: `list[string]` Set of devices to connect to.
-      heartbeat_ops: `list[tf.Operation]` Heartbeat operations.
-      request_placeholder: `tf.Placeholder[String]` Placeholder used to specify
-        the WorkerHeartbeatRequest protocol buffer.
-    """
-    self._session = session
-    self._devices = devices
-    self._ops = heartbeat_ops
-    self._request_placeholder = request_placeholder
-
-  @staticmethod
-  def from_devices(session, devices):
-    """Construct a heartbeat manager for the given devices."""
-    if not devices:
-      logging.error('Trying to create heartbeat manager with no devices?')
-
-    logging.info('Creating heartbeat manager for %s', devices)
-    request_placeholder = array_ops.placeholder(
-        name='worker_heartbeat_request', dtype=dtypes.string)
-
-    heartbeat_ops = []
-    kept_devices = []
-    for device in devices:
-      heartbeat_op = _make_heartbeat_op(session, device, request_placeholder)
-      if heartbeat_op is not None:
-        kept_devices.append(device)
-        heartbeat_ops.append(heartbeat_op)
-      else:
-        logging.warning('Heartbeat support not available for %s', device)
-
-    return WorkerHeartbeatManager(session, kept_devices, heartbeat_ops,
-                                  request_placeholder)
-
-  def num_workers(self):
-    return len(self._devices)
-
-  def configure(self, message):
-    """Configure heartbeat manager for all devices.
-
-    Args:
-      message: `event_pb2.WorkerHeartbeatRequest`
-    Returns: `None`
-    """
-    logging.info('Configuring worker heartbeat: %s',
-                 text_format.MessageToString(message))
-    self._session.run(self._ops,
-                      {self._request_placeholder: message.SerializeToString()})
-
-  def ping(self, request=None, timeout_in_ms=5000):
-    """Ping all workers, returning the parsed status results."""
-    if request is None:
-      request = event_pb2.WorkerHeartbeatRequest()
-
-    options = config_pb2.RunOptions(timeout_in_ms=timeout_in_ms)
-    results = self._session.run(
-        self._ops,
-        feed_dict={self._request_placeholder: request.SerializeToString()},
-        options=options)
-    parsed_results = [
-        event_pb2.WorkerHeartbeatResponse.FromString(res_pb)
-        for res_pb in results
-    ]
-    logging.debug('Ping results: %s', parsed_results)
-    return parsed_results
-
-  def lame_workers(self):
-    """Ping all workers, returning manager containing lame workers (or None)."""
-    ping_results = self.ping()
-    lame_workers = []
-
-    for ping_response, device, op in zip(ping_results, self._devices,
-                                         self._ops):
-      if ping_response.health_status != event_pb2.OK:
-        lame_workers.append((device, op))
-
-    if not lame_workers:
-      return None
-
-    bad_devices, bad_ops = zip(*lame_workers)
-    return WorkerHeartbeatManager(self._session, bad_devices, bad_ops,
-                                  self._request_placeholder)
-
-  def __repr__(self):
-    return 'HeartbeatManager(%s)' % ','.join(self._devices)
-
-  def shutdown(self, timeout_ms=10000):
-    """Shutdown all workers after `shutdown_timeout_secs`."""
-    logging.info('Shutting down %s.', self)
-    req = event_pb2.WorkerHeartbeatRequest(
-        watchdog_config=event_pb2.WatchdogConfig(timeout_ms=timeout_ms),
-        shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR)
-    self.configure(req)
-
-    # Wait for workers to shutdown.  This isn't strictly required
-    # but it avoids triggering multiple checkpoints with the same lame worker.
-    logging.info('Waiting %dms for worker shutdown.', timeout_ms)
-    time.sleep(timeout_ms / 1000)
-
-
-def all_worker_devices(session):
-  """Return a list of devices for each worker in the system."""
-  devices = session.list_devices()
-  return [
-      device.name
-      for device in devices
-      if ':CPU:' in device.name and 'coordinator' not in device.name
-  ]
-
-
-class WatchdogManager(threading.Thread):
-  """Configures worker watchdog timer and handles periodic pings.
-
-  Usage:
-    # Ping workers every minute, shutting down workers if they haven't received
-    # a ping after 1 hour.
-    watchdog_manager = WatchdogManager(
-      ping_interval=60, shutdown_timeout=3600
-    )
-
-    # Use as a context manager, resetting watchdog on context exit:
-    with watchdog_manager:
-      session.run(...)
-
-    # Or setup globally; watchdog will remain active until program exit.
-    watchdog_manager.configure_and_run()
-  """
-
-  def __init__(self,
-               session,
-               devices=None,
-               ping_interval=60,
-               shutdown_timeout=3600):
-    """Initialize a watchdog manager.
-
-    Args:
-      session: Session connected to worker devices.  A cloned session and graph
-        will be created for managing worker pings.
-      devices: Set of devices to monitor.  If none, all workers will be
-        monitored.
-      ping_interval: Time, in seconds, between watchdog pings.
-      shutdown_timeout: Time, in seconds, before watchdog timeout.
-    """
-    threading.Thread.__init__(self)
-    self.ping_interval = ping_interval
-    self.shutdown_timeout = shutdown_timeout
-    self.daemon = True
-    self._config = session._config  # pylint: disable=protected-access
-    self._target = session.sess_str
-    self._running = False
-    self._devices = devices
-
-    self._graph = None
-    self._session = None
-    self._worker_manager = None
-
-  def _reset_manager(self):
-    """Reset the graph, session and worker manager."""
-    self._graph = ops.Graph()
-    self._session = session_lib.Session(
-        target=self._target,
-        graph=self._graph,
-        config=self._config,
-    )
-
-    if self._devices is None:
-      self._devices = all_worker_devices(self._session)
-
-    with self._graph.as_default():
-      self._worker_manager = WorkerHeartbeatManager.from_devices(
-          self._session, self._devices)
-
-    self._worker_manager.configure(
-        event_pb2.WorkerHeartbeatRequest(
-            watchdog_config=event_pb2.WatchdogConfig(
-                timeout_ms=self.shutdown_timeout * 1000,),
-            shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
-
-  def configure_and_run(self):
-    logging.info(
-        'Enabling watchdog timer with %d second timeout '
-        'and %d second ping interval.', self.shutdown_timeout,
-        self.ping_interval)
-    self._reset_manager()
-    self._running = True
-    self.start()
-
-  def stop(self):
-    logging.info('Stopping worker watchdog.')
-    self._worker_manager.configure(
-        event_pb2.WorkerHeartbeatRequest(
-            watchdog_config=event_pb2.WatchdogConfig(timeout_ms=-1,),
-            shutdown_mode=event_pb2.NOT_CONFIGURED))
-    self._running = False
-    self.join()
-
-  def __enter__(self):
-    self.configure_and_run()
-
-  def __exit__(self, exc_type, exc_val, exc_tb):
-    self.stop()
-
-  def run(self):
-    # Don't fetch logs or adjust timing: just ping the watchdog.
-    #
-    # If we hit an exception, reset our session as it is likely broken.
-    while self._running:
-      try:
-        self._worker_manager.ping(request=None)
-        time.sleep(self.ping_interval)
-      except errors.OpError as e:
-        # Catch any TF errors that occur so we don't stop sending heartbeats
-        logging.debug('Caught error while sending heartbeat: %s', e)
-        self._reset_manager()
-
-
-def start_worker_watchdog(session,
-                          devices=None,
-                          ping_interval=60,
-                          shutdown_timeout=3600):
-  """Start global worker watchdog to shutdown workers on coordinator exit."""
-  global _WATCHDOG
-  if _WATCHDOG is None:
-    # Ensure we can send a few pings before we timeout!
-    ping_interval = min(shutdown_timeout / 10., ping_interval)
-    _WATCHDOG = WatchdogManager(session, devices, ping_interval,
-                                shutdown_timeout)
-    _WATCHDOG.configure_and_run()
-
-
-class GracefulShutdownHook(session_run_hook.SessionRunHook):
-  """Session hook that watches for shutdown events.
-
-  If a shutdown is indicated, `saver.save(checkpoint_prefix)` is executed, and a
-  SystemShutdown exception is raised to terminate the main session.  If `saver`
-  is None the `SAVERS` collection will be read to find a saver.
-
-  `on_shutdown_hooks` is an optional list of functions that should be called
-  after checkpointing.  The function is called with (`run_context`,
-  `all_workers`, `lame_workers`).
-
-  If `heartbeat_group` is not specified, it will default to all CPU workers
-  in the system.
-  """
-
-  def __init__(self, checkpoint_prefix, saver=None, on_shutdown_hooks=None):
-    self._saver = saver
-    self._checkpoint_prefix = checkpoint_prefix
-    self._on_shutdown_hooks = on_shutdown_hooks if on_shutdown_hooks else []
-
-    # Worker heartbeats are managed independently of the main training graph.
-    self._graph = ops.Graph()
-    self._workers = None
-    self._session = None
-    self._heartbeat_supported = False
-
-  def after_create_session(self, training_session, coord):  # pylint: disable=unused-argument
-    # N.B. We have to pull the global step here to avoid it being unavailable
-    # at checkpoint time; the graph has been frozen at that point.
-    if training_util.get_global_step() is None and self.saver() is not None:
-      raise ValueError(
-          'Saver defined but no global step.  Run `get_or_create_global_step()`'
-          ' in your model definition to allow checkpointing.')
-
-    with self._graph.as_default():
-      logging.info('Installing graceful shutdown hook.')
-      self._session = _clone_session(training_session, self._graph)
-      self._workers = WorkerHeartbeatManager.from_devices(
-          self._session, all_worker_devices(self._session))
-      self._heartbeat_supported = self._workers.num_workers() > 0
-      if self._heartbeat_supported:
-        self._workers.configure(
-            event_pb2.WorkerHeartbeatRequest(
-                shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
-      else:
-        logging.warn(
-            'No workers support hearbeats. Failure handling will be disabled.')
-
-  def saver(self):
-    if self._saver:
-      return self._saver
-
-    savers = ops.get_collection(ops.GraphKeys.SAVERS)
-    if not savers:
-      return None
-
-    if not isinstance(savers, list):
-      return savers
-
-    if len(savers) > 1:
-      logging.error(
-          'Multiple savers in the SAVERS collection.  On-demand checkpointing '
-          'will be disabled. Pass an explicit `saver` to the constructor to '
-          'override this behavior.')
-      return None
-
-    return savers[0]
-
-  def after_run(self, run_context, run_values):
-    del run_values
-
-    if not self._heartbeat_supported:
-      return
-
-    lame_workers = self._workers.lame_workers()
-    if lame_workers:
-      logging.info('ShutdownHook: lame workers found: %s', lame_workers)
-
-      if self.saver():
-        logging.info('ShutdownHook: saving checkpoint to %s',
-                     self._checkpoint_prefix)
-        self.saver().save(
-            run_context.session,
-            self._checkpoint_prefix,
-            global_step=training_util.get_global_step(),
-            write_state=True,
-        )
-      else:
-        logging.info('ShutdownHook: no Saver defined.')
-
-      for fn in self._on_shutdown_hooks:
-        fn(run_context, self._workers, lame_workers)
-
-
-class RestartComputation(object):
-  """Restart the entire computation.
-
-  This hook shuts down all workers and returns control to the top-level by
-  throwing a CoordinatorShutdownException.
-  """
-
-  def __init__(self, timeout_ms=10000):
-    self.timeout_ms = timeout_ms
-
-  def __call__(self, run_context, all_workers, lame_workers):
-    del run_context, lame_workers
-    all_workers.shutdown(timeout_ms=self.timeout_ms)
-
-    logging.info('Terminating coordinator.')
-    raise CoordinatorShutdownException()
-
-
-class ShutdownLameWorkers(object):
-  """Shutdown lamed workers.
-
-  Processing will continue normally (typically by waiting for the down
-  workers to be restarted).
-  """
-
-  def __init__(self, timeout_ms=10000):
-    self.timeout_in_ms = timeout_ms
-
-  def __call__(self, run_context, all_workers, lame_workers):
-    lame_workers.shutdown(timeout_ms=self.timeout_in_ms)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.session_support import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
index 2c5ea65182e404ec44b24bcd7d0f412c04f1beb1..73db253fd790f26679fb05bd6e7a5da6a99da1a7 100644
--- a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
+++ b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
@@ -1,1481 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ========================================================================
-"""A utility to trace tensor values on TPU."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import os.path
-import re
-import sys
-
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import control_flow_util
-from tensorflow.python.ops import gen_math_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import logging_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging as logging
-
-_TRACER_LOG_PREFIX = ' [>>>TT>>>]'
-_DEVICE_TYPE_TPU = 'tpu'
-_DEVICE_TYPE_CPU = 'cpu'
-_TRACE_MODE_NAN_INF = 'nan-inf'
-_TRACE_MODE_PART_TENSOR = 'part-tensor'
-_TRACE_MODE_PART_TENSOR_SIZE = 3
-_TRACE_MODE_FULL_TENSOR = 'full-tensor'
-_TRACE_MODE_NORM = 'norm'
-_TRACE_MODE_MAX_ABS = 'max-abs'
-_SUBMODE_BRIEF = 'brief'
-_SUBMODE_DETAILED = 'detailed'
-_REASON_OUTSIDE_OP_RANGE = 'not-traced-outside-op-range'
-_REASON_UNSAFE_OP = 'not-traced-unsafe-op'
-_REASON_UNSAFE_SCALAR = 'not-traced-unsafe-scalar'
-_REASON_LESS_INTERESTING_OP = 'not-traced-less-interesting-op'
-_REASON_DEVICE_MISMATCH = 'not-traced-device-mismatch'
-_REASON_DYNAMIC_SHAPE = 'not-traced-dynamic-shape'
-_REASON_SCALAR_GET_TRACED = 'traced-scalar'
-_REASON_TENSOR_GET_TRACED = 'traced-tensor'
-_REASON_USER_INCLUDED = 'traced-user-included'
-_REASON_USER_EXCLUDED = 'not-traced-user-excluded'
-_REASON_NOT_EXECUTED = 'not-traced-not-in-exec-path'
-_REASON_NON_NUMERIC_TENSOR = 'not-traced-non-numeric-tensor'
-_MARKER_SECTION_BEGIN = '!!!!!!! section-begin:'
-_MARKER_SECTION_END = '!!!!!!! section-end:'
-_SECTION_NAME_CONFIG = 'configuration'
-_SECTION_NAME_REASON = 'reason'
-_SECTION_NAME_OP_LIST = 'op-list'
-_SECTION_NAME_TENSOR_LIST = 'tensor-list'
-_SECTION_NAME_CACHE_INDEX_MAP = 'cache-index-map'
-_SECTION_NAME_GRAPH = 'graph'
-_FIELD_NAME_VERSION = 'version:'
-_FIELD_NAME_DEVICE = 'device:'
-_FIELD_NAME_TRACE_MODE = 'trace-mode:'
-_FIELD_NAME_SUBMODE = 'submode:'
-_FIELD_NAME_NUM_REPLICAS = 'num-replicas:'
-_FIELD_NAME_NUM_REPLICAS_PER_HOST = 'num-replicas-per-host:'
-_FIELD_NAME_NUM_HOSTS = 'num-hosts:'
-_FIELD_NAME_NUM_OPS = 'number-of-ops:'
-_FIELD_NAME_NUM_TENSORS = 'number-of-tensors:'
-_FIELD_NAME_NUM_CACHE_INDICES = 'number-of-indices:'
-_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED = 'topological-sort-succeed:'
-_FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS'
-_FLAG_SINGLE_QUOTE_PAT = re.compile(r"\s*--([^=]+)='([^']*)'")
-_FLAG_DOUBLE_QUOTE_PAT = re.compile(r'\s*--([^=]+)="([^"]*)"')
-_FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)')
-_FLAG_NO_EQUAL_PAT = re.compile(r'\s*--([^=]+)\s*')
-_FLAG_NAME_ENABLE = 'enable'
-_FLAG_NAME_TRACE_MODE = 'trace_mode'
-_FLAG_NAME_USE_COMPACT_TRACE = 'compact_trace'
-_FLAG_NAME_SUBMODE = 'submode'
-_FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS = 'include_less_interesting_ops'
-_FLAG_NAME_EXCLUDED_OPNAMES = 'excluded_opnames'
-_FLAG_NAME_EXCLUDED_OPTYPES = 'excluded_optypes'
-_FLAG_NAME_INCLUDED_OPNAMES = 'included_opnames'
-_FLAG_NAME_INCLUDED_OPTYPES = 'included_optypes'
-_FLAG_NAME_TRACE_DIR = 'trace_dir'
-_FLAG_NAME_REPORT_FILE = 'report_file'
-_FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR = 'use_test_undeclared_outputs_dir'
-_FLAG_NAME_OP_RANGE = 'op_range'
-_OP_RANGE_PAT = re.compile(r'(\d+):(\d+)')
-_OUTPUT_STREAM_ESCAPE = 'file://'
-_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR = 'TEST_UNDECLARED_OUTPUTS_DIR'
-_TENSOR_TRACER_COLLECTION = 'tensor_tracer_variables'
-_TENSOR_TRACER_CHECKPOINT = 'tensor_tracer_checkpoint'
-_TRACE_FILE_NAME = 'trace.all'
-_COMPACT_TRACE_FILE_PREFIX = 'compact_trace.'
-_COMPACT_TRACE_ENTRY_INIT_VALUE = -1.0
-_TENSOR_TRACER_STORAGE = 'tensor_tracer_storage'
-_TENSOR_VALUES_CACHE = 'tensor_values_cache'
-_REPLICA_ID_TAG = '#replica-id: '
-
-def tensor_tracepoint(tensor, checkpoint_name):
-  """Adds a checkpoint with the given checkpoint name for the given tensor.
-
-  The tensor will be added to the list of tensors that will be traced by the
-  tensor tracer.
-
-  Args:
-     tensor: the tensor object for which the tracing is requested.
-     checkpoint_name: a string name for the checkpoint. This name has to be a
-     unique name if used within model comparison. The tensors that have the same
-     checkpoint identifier is compared in model comparison.
-  Returns:
-    The provided tensor.
-  """
-
-  tensor.graph.get_collection(_TENSOR_TRACER_COLLECTION)
-  tensor.graph.add_to_collection(_TENSOR_TRACER_COLLECTION,
-                                 (tensor, checkpoint_name))
-  return tensor
-
-
-def keras_layer_tracepoint(layer, checkpoint_name):
-  """An interface for adding the tensor outputs of a keras layer.
-
-  Encapsulates tensor_tracepoint.
-
-  Args:
-     layer: A keras layer.
-     checkpoint_name: a string name for the checkpoint. This name has to be a
-     unique name if used within model comparison. The tensors that have the same
-     checkpoint identifier is compared in model comparison.
-
-  Returns:
-    The provided layer.
-  """
-  try:
-    outputs = layer.output
-    if tensor_util.is_tensor(outputs):
-      tensor_tracepoint(outputs, '%s' % (checkpoint_name))
-    else:
-      idx = 0
-      for output_tensor in outputs:
-        if tensor_util.is_tensor(outputs):
-          tensor_tracepoint(output_tensor, '%s_%d' % (checkpoint_name, idx))
-        idx += 1
-  except AttributeError:
-    pass
-  except RuntimeError:
-    pass
-  return layer
-
-
-def _trace_files_need_precreated(output_dir):
-  """Return True if trace files must be pre-created by users."""
-
-  if not output_dir.startswith('/'):
-    return False
-  if len(output_dir) < 5:
-    return False
-  if output_dir[2] != 'n':
-    return False
-  if output_dir[3] != 's':
-    return False
-  if output_dir[1] != 'c':
-    return False
-  if output_dir[4] != '/':
-    return False
-  return True
-
-
-def _get_tensor_values_cache(graph=None):
-  """Returns the variable that implements tensor-value caching."""
-
-  graph = graph or ops.get_default_graph()
-  collection = graph.get_collection(_TENSOR_TRACER_STORAGE)
-  if len(collection) == 1:
-    return collection[0]
-  elif not collection:
-    raise RuntimeError('%s has not been created'%_TENSOR_VALUES_CACHE)
-  else:
-    raise RuntimeError('Multiple %s created'%_TENSOR_VALUES_CACHE)
-  return None
-
-
-def _create_tensor_values_cache(graph, num_tensors):
-  """Creates a variable as the cache to store intermediate tensor values."""
-
-  graph = graph or ops.get_default_graph()
-  # Create in proper graph and base name_scope.
-  with graph.as_default() as g, g.name_scope(None):
-    return variable_scope.get_variable(
-        _TENSOR_VALUES_CACHE,
-        shape=[num_tensors],
-        dtype=dtypes.float32,
-        initializer=init_ops.constant_initializer(
-            _COMPACT_TRACE_ENTRY_INIT_VALUE),
-        trainable=False,
-        use_resource=True,
-        collections=[_TENSOR_TRACER_STORAGE, ops.GraphKeys.GLOBAL_VARIABLES])
-
-
-def _set_fetches(result_tensor, train_op):
-  """Sets the fetches from the result tensor and training op."""
-
-  fetches = []
-  if result_tensor is not None:
-    fetches.append(result_tensor)
-  if train_op is not None:
-    fetches.append(train_op)
-  if not fetches:
-    return None
-  return fetches
-
-
-class TensorTracer(object):
-  """A software construct for tracing tensor values in a TF graph on TPU.
-
-  This utility is disabled by default. It can be enabled by setting
-  the TENSOR_TRACER_FLAGS env variable as:
-    export TENSOR_TRACER_FLAGS="--enable=1"
-  If it is enabled, it will trace the output tensor values of
-  selected Ops in the graph. It has two outputs: (1) the traces and (2)
-  a report. The traces are dumped to a specified local file on the TPU
-  host. The report is printed to the log.info of the TPU job.
-  By passing options via the env variable, users can change:
-     (1) the trace mode (e.g., detecting NaN/Inf, printing partial or
-         full tensor values)
-     (2) which Ops to be traced (via op.name or op.type)
-     (3) output trace file path.
-  """
-
-  @staticmethod
-  def _match_next_flag(flags, pos):
-    """Returns the match for the next TensorTracer flag.
-
-    Args:
-       flags: a string that contains the flags.
-       pos: where in flags to start the search.
-
-    Returns:
-       A pair where the first element is the regular-expression
-       match found and the second element indicates if the match
-       has a value.
-    """
-
-    match = _FLAG_DOUBLE_QUOTE_PAT.match(flags, pos)
-    if match:
-      return match, True
-    match = _FLAG_SINGLE_QUOTE_PAT.match(flags, pos)
-    if match:
-      return match, True
-    match = _FLAG_NO_QUOTE_PAT.match(flags, pos)
-    if match:
-      return match, True
-    match = _FLAG_NO_EQUAL_PAT.match(flags, pos)
-    if match:
-      # The flag is found but is not given a value.
-      return match, False
-    # The flag is not found.
-    return None, False
-
-  @staticmethod
-  def validate_flag_names():
-    """Validates if the TensorTrace flags passed are valid."""
-    valid_flag_names = [_FLAG_NAME_ENABLE, _FLAG_NAME_TRACE_MODE,
-                        _FLAG_NAME_USE_COMPACT_TRACE,
-                        _FLAG_NAME_SUBMODE,
-                        _FLAG_NAME_EXCLUDED_OPNAMES,
-                        _FLAG_NAME_EXCLUDED_OPTYPES,
-                        _FLAG_NAME_INCLUDED_OPNAMES,
-                        _FLAG_NAME_INCLUDED_OPTYPES,
-                        _FLAG_NAME_TRACE_DIR,
-                        _FLAG_NAME_REPORT_FILE,
-                        _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR,
-                        _FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS,
-                        _FLAG_NAME_OP_RANGE]
-    tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR)
-    if not tensor_tracer_flags:
-      return
-    pos = 0
-    while True:
-      match, _ = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
-      if not match:
-        break
-      flag_name = match.group(1)
-      if flag_name not in valid_flag_names:
-        raise ValueError(
-            'The flag name "%s" passed via the environment variable "%s" '
-            'is invalid. Valid flag names are:'
-            '\n%s'%(flag_name, _FLAGS_ENV_VAR, valid_flag_names))
-      pos = match.end()
-
-  @staticmethod
-  def print_flag_values():
-    """Prints all TensorTracer flags passed via environment variables."""
-
-    tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR)
-    if not tensor_tracer_flags:
-      return 'Env variable "%s" is not set'%_FLAGS_ENV_VAR
-    result = 'Env variable "%s" is set to "%s"\n'%(_FLAGS_ENV_VAR,
-                                                   tensor_tracer_flags)
-    result += 'Individual flag value:\n'
-    pos = 0
-    while True:
-      match, has_value = TensorTracer._match_next_flag(
-          tensor_tracer_flags, pos)
-      if not match:
-        break
-      flag_name = match.group(1)
-      if has_value:
-        flag_value = match.group(2)
-      else:
-        flag_value = None
-      result += '  %s: %s\n'%(flag_name, flag_value)
-      pos = match.end()
-    result += '\n'
-    return result
-
-  @staticmethod
-  def get_flag_value(wanted_flag_name):
-    """Returns the value of a TensorTracer flags.
-
-    Args:
-      wanted_flag_name: the name the the flag we are looking for.
-
-    Returns:
-      A pair where the first element indicates if the flag is
-      found and the second element is the value of the flag.
-
-    Raises:
-      RuntimeError: If supposedly deadcode is reached.
-    """
-
-    tensor_tracer_flags = os.getenv(_FLAGS_ENV_VAR)
-    if not tensor_tracer_flags:
-      return False, None
-    pos = 0
-    while True:
-      match, has_value = TensorTracer._match_next_flag(
-          tensor_tracer_flags, pos)
-      if not match:
-        return False, None
-      flag_name = match.group(1)
-      if has_value:
-        flag_value = match.group(2)
-      else:
-        flag_value = None
-      if flag_name == wanted_flag_name:
-        return True, flag_value
-      pos = match.end()
-    raise RuntimeError('Should not reach here.')
-
-  @staticmethod
-  def flag_value_to_re_list(flag_name):
-    """Converts list of strings to compiled RE."""
-
-    re_list = []
-    found, flag_value = TensorTracer.get_flag_value(flag_name)
-    if not found or not flag_value:
-      return re_list
-    list_of_values = flag_value.split()
-    for v in list_of_values:
-      r = re.compile(v)
-      re_list.append(r)
-    return re_list
-
-  @staticmethod
-  def _is_flag_on(flag_name):
-    """Returns True if the given flag is on."""
-
-    found, flag_value = TensorTracer.get_flag_value(flag_name)
-    if not found:
-      return False
-    if flag_value is None:
-      return True
-    # Depends on the flag value.
-    flag_value = flag_value.lower()
-    enabled = flag_value in ['1', 't', 'true', 'y', 'yes']
-    return enabled
-
-  @staticmethod
-  def is_enabled():
-    """Returns True if TensorTracer is enabled."""
-
-    return TensorTracer._is_flag_on(_FLAG_NAME_ENABLE)
-
-  @staticmethod
-  def use_test_undeclared_outputs_dir():
-    """Decides the output directory of the report and trace files.
-
-    Args:
-       None.
-
-    Returns:
-       True if the output files should be written to the
-       test-undeclared-outputs-directory defined via an
-       env variable.
-    """
-
-    return TensorTracer._is_flag_on(
-        _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR)
-
-  @staticmethod
-  def use_compact_trace():
-    return TensorTracer._is_flag_on(
-        _FLAG_NAME_USE_COMPACT_TRACE)
-
-  @staticmethod
-  def check_device_type(device_type):
-    """Checks if the given device type is valid."""
-
-    if device_type not in [_DEVICE_TYPE_TPU, _DEVICE_TYPE_CPU]:
-      raise ValueError('Invalid device_type "%s"'%device_type)
-
-  @staticmethod
-  def check_trace_mode(trace_mode):
-    """Checks if the given trace mode is valid."""
-
-    valid_trace_modes = [_TRACE_MODE_NAN_INF, _TRACE_MODE_PART_TENSOR,
-                         _TRACE_MODE_FULL_TENSOR, _TRACE_MODE_NORM,
-                         _TRACE_MODE_MAX_ABS]
-    if trace_mode not in valid_trace_modes:
-      raise ValueError('Invalid trace mode "%s" given to the Tensor_Tracer.'
-                       'Valid trace modes are: %s'%(trace_mode,
-                                                    valid_trace_modes))
-
-  @staticmethod
-  def check_submode(submode):
-    """Checks if the given submode is valid."""
-
-    if not submode:
-      return
-    valid_submodes = [_SUBMODE_DETAILED, _SUBMODE_BRIEF]
-    if submode not in valid_submodes:
-      raise ValueError('Invalid submode "%s" given to the Tensor_Tracer.'
-                       'Valid submodes are: %s'%(submode,
-                                                 valid_submodes))
-
-  @staticmethod
-  def unsafe_op(op):
-    """Returns True if this op is not safe to be traced."""
-
-    if control_flow_util.IsInCond(op):
-      return True
-    # Reasons for not including following op types:
-    #    Assign: cause incorrect result with CPU tracing.
-    if op.type in ['Assign']:
-      return True
-    return False
-
-  @staticmethod
-  def device_mismatch(device_type, op):
-    if device_type == _DEVICE_TYPE_TPU:
-      # pylint: disable=protected-access
-      return tpu._TPU_REPLICATE_ATTR not in op.node_def.attr
-      # pylint: enable=protected-access
-    return False
-
-  @staticmethod
-  def unsafe_scalar_trace(op):
-    """Return true if scalar output tensor from Op is not safe to be traced."""
-
-    # Tracing the following causes cycle in the graph on TPU.
-    if op.type in ['LoopCond', 'Enter', 'Merge', 'Const',
-                   'Switch', 'Less', 'ReadVariableOp']:
-      return True
-    # Tracing the following will cause casting-issue
-    # with the norm tracing mode or other compilation issues on CPU.
-    if op.type in ['VarHandleOp', 'IteratorToStringHandle',
-                   'IteratorGetNext', 'OneShotIterator',
-                   'IteratorV2', 'MakeIterator',
-                   'BatchDatasetV2', 'MapDataset',
-                   'FixedLengthRecordDataset', 'TakeDataset', 'ZipDataset',
-                   'Placeholder', 'PlaceholderWithDefault', 'StridedSlice']:
-      return True
-    return False
-
-  @staticmethod
-  def less_interesting_op(op):
-    """Returns True if the given Op is not an interesting one to be traced."""
-
-    found, _ = TensorTracer.get_flag_value(
-        _FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS)
-    if found:
-      # users force to include all ops.
-      return False
-    # Following ops are highly unlikey to cause bugs.
-    return op.type in ['Const', 'Identity', 'Cast', 'Shape']
-
-  @staticmethod
-  def reason(op_idx, details):
-    """Returns reason why the Op at op_idx is traced or not."""
-
-    return '%d %s'%(op_idx, details)
-
-  @staticmethod
-  def topological_sort(g):
-    """Performs topological sort on the given graph.
-
-    Args:
-       g: the graph.
-
-    Returns:
-       A pair where the first element indicates if the topological
-       sort succeeded (True if there is no cycle found; False if a
-       cycle is found) and the second element is either the sorted
-       list of nodes or the cycle of nodes found.
-    """
-
-    def visit(op, cycle, permanently_marked_ops,
-              temporarily_marked_ops, sorted_ops):
-      """Recursively visits all Ops in a graph.
-
-      Args:
-         op: the current Op being visited.
-         cycle: a cycle of Ops found.
-         permanently_marked_ops: the set of Ops that were already visited.
-         temporarily_marked_ops: the set of Ops that we have visited during
-                                 the current descent.
-         sorted_ops: the list of Ops sorted in topological order.
-      """
-
-      if cycle:
-        return
-      if op in permanently_marked_ops:
-        return
-      if op in temporarily_marked_ops:
-        cycle = temporarily_marked_ops
-        return
-      temporarily_marked_ops.add(op)
-      for i in range(len(op.outputs)):
-        out_tensor = op.outputs[i]
-        for consumer_op in out_tensor.consumers():
-          visit(consumer_op, cycle, permanently_marked_ops,
-                temporarily_marked_ops, sorted_ops)
-      # pylint: disable=protected-access
-      for ctrl_output_op in op._control_outputs:
-        # pylint: enable=protected-access
-        visit(ctrl_output_op, cycle, permanently_marked_ops,
-              temporarily_marked_ops, sorted_ops)
-      temporarily_marked_ops.remove(op)
-      permanently_marked_ops.add(op)
-      sorted_ops.insert(0, op)
-
-    graph_cycle = set([])
-    sorted_ops = []
-    permanently_marked_ops = set([])
-    temporarily_marked_ops = set([])
-    unsorted_ops = g.get_operations()
-    for op in unsorted_ops:
-      visit(op, graph_cycle, permanently_marked_ops,
-            temporarily_marked_ops, sorted_ops)
-    if graph_cycle:
-      return (False, graph_cycle)
-    else:
-      assert len(unsorted_ops) == len(sorted_ops)
-      return (True, sorted_ops)
-
-  @staticmethod
-  def _make_op_and_tensor_maps(op_list):
-    """Creates various maps and lists from op_list.
-
-    Args:
-       op_list: a list of Ops
-
-    Returns:
-       opname_idx_map: a map from Op's name to its index in op_list.
-       tensor_list: a list of output tensors of the Ops in op_list.
-       tensorname_idx_map: a map from output tensor name to its index
-                           in tensor_list.
-    """
-
-    opname_idx_map = {}
-    tensor_list = []
-    tensorname_idx_map = {}
-    for op_id, op in enumerate(op_list):
-      if op.name in opname_idx_map:
-        raise ValueError('Duplicated Op name: %s'%op.name)
-      opname_idx_map[op.name] = op_id
-      for output_tensor in op.outputs:
-        if output_tensor.name not in tensorname_idx_map:
-          tensor_list.append(output_tensor)
-          tensorname_idx_map[output_tensor.name] = len(tensor_list)-1
-    return (opname_idx_map, tensor_list, tensorname_idx_map)
-
-  def __init__(self):
-    """Initializes a TensorTracer.
-
-    Sets the various member fields from the flags (if given) or the defaults.
-    """
-    self._version = 'use-outside-compilation'
-    self._device_type = None
-    TensorTracer.validate_flag_names()
-    found, self._trace_mode = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_MODE)
-    if not found or not self._trace_mode:
-      self._trace_mode = _TRACE_MODE_NAN_INF
-    TensorTracer.check_trace_mode(self._trace_mode)
-    found, self._submode = TensorTracer.get_flag_value(_FLAG_NAME_SUBMODE)
-    if not found or not self._submode:
-      self._submode = _SUBMODE_DETAILED
-    TensorTracer.check_submode(self._submode)
-    self._part_tensor_size = _TRACE_MODE_PART_TENSOR_SIZE
-    self._instrument_records = {}
-    self._set_trace_dir()
-    self._set_report_file()
-    self._set_op_range()
-    self._set_excluded_opnames()
-    self._set_excluded_optypes()
-    self._set_included_opnames()
-    self._set_included_optypes()
-    self._num_replicas = None
-    self._num_replicas_per_host = None
-    self._num_hosts = None
-    self._replica_id = None
-
-  def _add_replica_id_to_graph(self, result_tensor):
-    """Adds nodes for computing the replica ID to the graph."""
-
-    if not self._num_replicas:
-      self._replica_id = 'unknown'
-      return result_tensor
-
-    with ops.control_dependencies(None):
-      # Uses None as dependency to run outside of TPU graph rewrites.
-      self._replica_id = tpu_ops.tpu_replicated_input(
-          list(range(self._num_replicas)),
-          name='tt_replica_id')
-    use_replica_id = array_ops.identity(self._replica_id).op
-    with ops.control_dependencies([use_replica_id]):
-      # Adds a control dependency from the result_tensor to
-      # the replica_id to ensure that replica_id will be added to the graph.
-      return array_ops.identity(result_tensor)
-
-  def _set_trace_dir(self):
-    found, self._trace_dir = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_DIR)
-    if found and self._trace_dir \
-       and TensorTracer.use_test_undeclared_outputs_dir():
-      raise ValueError('Cannot not use --%s and --%s at the same time'
-                       %(_FLAG_NAME_TRACE_DIR,
-                         _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR))
-    if TensorTracer.use_test_undeclared_outputs_dir():
-      self._trace_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR)
-
-  def _set_report_file(self):
-    """Sets the path of the output report file."""
-
-    found, self._report_file_path = TensorTracer.get_flag_value(
-        _FLAG_NAME_REPORT_FILE)
-    if found and self._report_file_path \
-       and TensorTracer.use_test_undeclared_outputs_dir():
-      if os.path.isabs(self._report_file_path):
-        raise ValueError('If use_test_undeclared_outputs_dir is set,'
-                         'report_file_path cannot be an absolute path (%s)'
-                         %self._report_file_path)
-      outputs_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR)
-      self._report_file_path = os.path.join(outputs_dir,
-                                            self._report_file_path)
-    if not self._report_file_path:
-      self._report_file = None
-      return
-    try:
-      self._report_file = gfile.Open(self._report_file_path, 'w')
-    except IOError as e:
-      raise e
-
-  def _close_report_file(self):
-    if self._report_file:
-      self._report_file.close()
-
-  def _set_op_range(self):
-    """Sets the index range of the Ops that we will consider tracing."""
-
-    found, op_range = TensorTracer.get_flag_value(_FLAG_NAME_OP_RANGE)
-    if not found or not op_range:
-      self._op_range = (-1, -1)  # this means including all ops.
-      return
-    match = _OP_RANGE_PAT.match(op_range)
-    if not match:
-      self._op_range = (-1, -1)  # this means including all ops.
-      return
-    self._op_range = (int(match.group(1)), int(match.group(2)))
-
-  def _inside_op_range(self, idx):
-    """Return True if the given index is inside the selected range."""
-
-    if idx < self._op_range[0]:
-      return False
-    return self._op_range[1] < 0 or idx <= self._op_range[1]
-
-  def _set_excluded_opnames(self):
-    self._excluded_opname_re_list = TensorTracer.flag_value_to_re_list(
-        _FLAG_NAME_EXCLUDED_OPNAMES)
-
-  def _set_excluded_optypes(self):
-    self._excluded_optype_re_list = TensorTracer.flag_value_to_re_list(
-        _FLAG_NAME_EXCLUDED_OPTYPES)
-
-  def _set_included_opnames(self):
-    self._included_opname_re_list = TensorTracer.flag_value_to_re_list(
-        _FLAG_NAME_INCLUDED_OPNAMES)
-
-  def _set_included_optypes(self):
-    self._included_optype_re_list = TensorTracer.flag_value_to_re_list(
-        _FLAG_NAME_INCLUDED_OPTYPES)
-
-  def _is_user_included_op(self, op):
-    for opname_re in self._included_opname_re_list:
-      if opname_re.match(op.name):
-        return True
-    for optype_re in self._included_optype_re_list:
-      if optype_re.match(op.type):
-        return True
-    return False
-
-  def _is_user_excluded_op(self, op):
-    for opname_re in self._excluded_opname_re_list:
-      if opname_re.match(op.name):
-        return True
-    for optype_re in self._excluded_optype_re_list:
-      if optype_re.match(op.type):
-        return True
-    return False
-
-  def _use_tensor_values_cache(self):
-    """Returns True if immediate tensors should be first saved to a cache."""
-
-    if self._trace_mode not in set([_TRACE_MODE_NAN_INF,
-                                    _TRACE_MODE_NORM, _TRACE_MODE_MAX_ABS]):
-      return False
-    if self._trace_dir and _trace_files_need_precreated(self._trace_dir):
-      return True
-    if TensorTracer.use_compact_trace():
-      return True
-    return False
-
-  def _save_tensor_value_to_cache_op(self, graph, cache_idx, updates):
-    """Returns an Op that will save the given updates to an entry in the cache."""
-
-    cache = _get_tensor_values_cache(graph)
-    indices = constant_op.constant([cache_idx])
-    return state_ops.scatter_update(cache, indices, updates).op
-
-  def _write_report(self, content):
-    """Writes the given content to the report."""
-
-    line = '%s %s'%(_TRACER_LOG_PREFIX, content)
-    if self._report_file:
-      self._report_file.write(line)
-    else:
-      logging.info(line)
-
-  def _write_config_section(self):
-    """Writes the config section of the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_CONFIG))
-    self._write_report('%s %s\n'%(_FIELD_NAME_VERSION, self._version))
-    self._write_report('%s %s\n'%(_FIELD_NAME_DEVICE, self._device_type))
-    self._write_report('%s %s\n'%(_FIELD_NAME_TRACE_MODE, self._trace_mode))
-    self._write_report('%s %s\n'%(_FIELD_NAME_SUBMODE, self._submode))
-    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS, self._num_replicas))
-    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS_PER_HOST,
-                                  self._num_replicas_per_host))
-    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_HOSTS, self._num_hosts))
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_CONFIG))
-
-  def _write_reason_section(self):
-    """Writes the reason section of the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_REASON))
-    for key in sorted(self._instrument_records):
-      self._write_report('"%s" %s\n'%(key, self._instrument_records[key]))
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_REASON))
-
-  def _write_op_list_section(self, op_list):
-    """Writes the Op-list section of the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_OP_LIST))
-    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_OPS, len(op_list)))
-    for i in range(0, len(op_list)):
-      op = op_list[i]
-      line = '%d "%s" %s'%(i, op.name, op.type)
-      for out_tensor in op.outputs:
-        if out_tensor.name not in self._tensorname_idx_map:
-          raise ValueError(
-              'out_tensor %s is not in tensorname_idx_map'%out_tensor.name)
-        line += ' %d'%self._tensorname_idx_map[out_tensor.name]
-      line += '\n'
-      self._write_report(line)
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_OP_LIST))
-
-  def _write_tensor_list_section(self, tensor_list, opname_idx_map):
-    """Writes the tensor-list section of the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
-                                  _SECTION_NAME_TENSOR_LIST))
-    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_TENSORS, len(tensor_list)))
-    for i in range(0, len(tensor_list)):
-      tensor = tensor_list[i]
-      line = '%d "%s"'%(i, tensor.name)
-      for consumer_op in tensor.consumers():
-        if consumer_op.name not in opname_idx_map:
-          raise ValueError(
-              'consumer_op %s is not in opname_idx_map'%consumer_op.name)
-        line += ' %d'%opname_idx_map[consumer_op.name]
-      line += '\n'
-      self._write_report(line)
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
-                                  _SECTION_NAME_TENSOR_LIST))
-
-  def _write_cache_index_map_section(self):
-    """Writes the mapping from cache index to tensor index to the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
-                                  _SECTION_NAME_CACHE_INDEX_MAP))
-    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_CACHE_INDICES,
-                                  len(self._cache_idx_to_tensor_idx)))
-    for cache_idx in range(0, len(self._cache_idx_to_tensor_idx)):
-      tensor_idx = self._cache_idx_to_tensor_idx[cache_idx]
-      line = '%d %d\n'%(cache_idx, tensor_idx)
-      self._write_report(line)
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
-                                  _SECTION_NAME_CACHE_INDEX_MAP))
-
-  def _write_graph_section(self, succeed, sorted_or_cycle):
-    """Writes the graph section of the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_GRAPH))
-    self._write_report('%s %s\n'%(_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED,
-                                  succeed))
-    l = list(sorted_or_cycle)
-    for i in range(0, len(l)):
-      self._write_report('%d "%s"\n'%(i, l[i].name))
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_GRAPH))
-
-  def _preprocess_traced_tensor(self, tensor):
-    """Computes NAN/Norm/Max on TPUs before sending to CPU.
-
-    Args:
-      tensor: The tensor to be traced.
-    Returns:
-      A tensor that should be input to the trace_function.
-    Raises:
-      RuntimeError: If the trace mode is invalid.
-    """
-
-    def _detect_nan_inf(tensor):
-      """Trace function for detecting any NaN/Inf in the tensor."""
-
-      if tensor.dtype.is_floating:
-        mask = math_ops.reduce_any(
-            gen_math_ops.logical_or(
-                gen_math_ops.is_nan(tensor), gen_math_ops.is_inf(tensor)))
-        output_tensor = control_flow_ops.cond(mask,
-                                              lambda: constant_op.constant(1.0),
-                                              lambda: constant_op.constant(0.0))
-      else:
-        output_tensor = constant_op.constant(0.0)
-      # The shape has to be 1. Set it if it does not have the information.
-      output_tensor = array_ops.reshape(output_tensor, [1])
-      return output_tensor
-
-    def _show_norm(tensor):
-      tensor = math_ops.cast(tensor, dtypes.float32)
-      output_tensor = linalg_ops.norm(tensor)
-      # The shape has to be 1. Set it if it does not have the information.
-      output_tensor = array_ops.reshape(output_tensor, [1])
-      return output_tensor
-
-    def _show_max_abs(tensor):
-      tensor = math_ops.cast(tensor, dtypes.float32)
-      output_tensor = math_ops.reduce_max(math_ops.abs(tensor))
-      zero = constant_op.constant(0, dtypes.float32)
-      output_tensor = gen_math_ops.maximum(zero, output_tensor)
-      # The shape has to be 1. Set it if it does not have the information.
-      output_tensor = array_ops.reshape(output_tensor, [1])
-      return output_tensor
-
-    if self._trace_mode == _TRACE_MODE_NAN_INF:
-      return _detect_nan_inf(tensor)
-    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
-      return tensor
-    if self._trace_mode == _TRACE_MODE_FULL_TENSOR:
-      return tensor
-    if self._trace_mode == _TRACE_MODE_NORM:
-      return _show_norm(tensor)
-    if self._trace_mode == _TRACE_MODE_MAX_ABS:
-      return _show_max_abs(tensor)
-    raise RuntimeError(
-        'Tensor trace fun for %s is not yet implemented' % self._trace_mode)
-
-  def _make_tensor_trace_fun(self, tensor_name):
-    """Makes the tensor tracing function called by outside compilation.
-
-    Args:
-      tensor_name: name of the tensor being traced.
-
-    Returns:
-      A function to be passed as the first argument to outside compilation.
-
-    Raises:
-      RuntimeError: If the trace mode is invalid.
-    """
-
-    def _print_tensor(tensor_name, num_elements, tensor, output_tensor):
-      """Prints a tensor value to a file.
-
-      Args:
-        tensor_name: name of the tensor being traced.
-        num_elements: number of elements to print (-1 means print all).
-        tensor: the tensor needs to be returned.
-        output_tensor: the tensor needs to be printed.
-
-      Returns:
-        The same tensor passed via the "tensor" argument.
-
-      Raises:
-        ValueError: If tensor_name is not already in
-                    self._tensorname_idx_map.
-      """
-
-      if self._submode == _SUBMODE_BRIEF:
-        if tensor_name not in self._tensorname_idx_map:
-          raise ValueError(
-              'Tensor name %s is not in the tensorname_idx_map'%tensor_name)
-        msg = '%d'%self._tensorname_idx_map[tensor_name]
-      else:
-        msg = '"%s"'%tensor_name
-
-      if self._trace_dir:
-        output_path = os.path.join(self._trace_dir, _TRACE_FILE_NAME)
-        output_stream = _OUTPUT_STREAM_ESCAPE + output_path
-      else:
-        output_stream = sys.stderr
-      print_op = logging_ops.print_v2(msg, array_ops.shape(output_tensor),
-                                      '@', self._replica_id,
-                                      '\n', output_tensor, '\n',
-                                      summarize=num_elements,
-                                      output_stream=output_stream)
-      with ops.control_dependencies([print_op]):
-        return array_ops.identity(tensor).op
-
-
-    def _show_part_tensor(tensor):
-      """Trace function for printing part of the tensor."""
-
-      return _print_tensor(tensor_name, self._part_tensor_size,
-                           tensor, tensor)
-
-    def _show_full_tensor(tensor):
-      """Trace function for printing the entire tensor."""
-
-      return _print_tensor(tensor_name, -1, tensor, tensor)
-
-    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
-      return _show_part_tensor
-    # The input tensor has a shape of "[1]" for _TRACE_MODE_NAN_INF,
-    # _TRACE_MODE_NORM, and _TRACE_MODE_MAX_ABS, as related computations are
-    # performed within TPUs and only their results are transferred to CPU.
-    # Simply, print the full tensor for these trace modes.
-    if self._trace_mode in [
-        _TRACE_MODE_NAN_INF, _TRACE_MODE_NORM, _TRACE_MODE_FULL_TENSOR,
-        _TRACE_MODE_MAX_ABS
-    ]:
-      return _show_full_tensor
-
-    raise RuntimeError('Tensor trace fun for %s is not yet implemented'
-                       %self._trace_mode)
-
-  def _skip_op(self, op_id, op, user_included, user_excluded,
-               in_exec_path=True):
-    """Returns True if we should not trace Op."""
-
-    if user_included:
-      self._instrument_records[op.name] = TensorTracer.reason(
-          op_id, _REASON_USER_INCLUDED)
-      return False
-    if user_excluded:
-      self._instrument_records[op.name] = TensorTracer.reason(
-          op_id, _REASON_USER_EXCLUDED)
-      return True
-    if not in_exec_path:
-      self._instrument_records[op.name] = TensorTracer.reason(
-          op_id, _REASON_NOT_EXECUTED)
-      return True
-    if not self._inside_op_range(op_id):
-      self._instrument_records[op.name] = TensorTracer.reason(
-          op_id, _REASON_OUTSIDE_OP_RANGE)
-      return True
-    if TensorTracer.unsafe_op(op):
-      self._instrument_records[op.name] = TensorTracer.reason(
-          op_id, _REASON_UNSAFE_OP)
-      return True
-    if TensorTracer.device_mismatch(self._device_type, op):
-      self._instrument_records[op.name] = TensorTracer.reason(
-          op_id, _REASON_DEVICE_MISMATCH)
-      return True
-    if TensorTracer.less_interesting_op(op):
-      self._instrument_records[op.name] = TensorTracer.reason(
-          op_id, _REASON_LESS_INTERESTING_OP)
-      return True
-    return False
-
-  def _skip_tensor(self, op_id, out_tensor, user_included,
-                   user_excluded):
-    """Returns True if we should not trace out_tensor."""
-
-    # Skips a tensor if the tensor has a non-numeric type.
-    #   Note: we cannot use check_ops.is_numeric_tensor(out_tensor)
-    #         because it also excludes tensors with dtypes, bool, and
-    #         float32_ref, which we actually want to trace.
-    non_numeric_tensor_types = set([dtypes.variant, dtypes.resource,
-                                    dtypes.string])
-    if out_tensor.dtype in non_numeric_tensor_types:
-      self._instrument_records[out_tensor.name] = TensorTracer.reason(
-          op_id, _REASON_NON_NUMERIC_TENSOR)
-      return True
-
-    if user_included:
-      self._instrument_records[out_tensor.name] = TensorTracer.reason(
-          op_id, _REASON_USER_INCLUDED)
-      return False
-    if user_excluded:
-      self._instrument_records[out_tensor.name] = TensorTracer.reason(
-          op_id, _REASON_USER_EXCLUDED)
-      return True
-    if not out_tensor.get_shape().is_fully_defined():
-      # If trace mode is nan-inf, norm or max, then the tensor will be reduced
-      # to a scalar before the outside compilation call.
-      if self._trace_mode in [
-          _TRACE_MODE_NAN_INF, _TRACE_MODE_NORM, _TRACE_MODE_MAX_ABS
-      ]:
-        self._instrument_records[out_tensor.name] = TensorTracer.reason(
-            op_id, _REASON_TENSOR_GET_TRACED)
-        return False
-      else:
-        self._instrument_records[out_tensor.name] = TensorTracer.reason(
-            op_id, _REASON_DYNAMIC_SHAPE)
-        return True
-    rank = len(out_tensor.shape)
-    if rank < 1:
-      # scalar
-      if TensorTracer.unsafe_scalar_trace(out_tensor.op):
-        self._instrument_records[out_tensor.name] = TensorTracer.reason(
-            op_id, _REASON_UNSAFE_SCALAR)
-        return True
-      else:
-        self._instrument_records[out_tensor.name] = TensorTracer.reason(
-            op_id, _REASON_SCALAR_GET_TRACED)
-        return False
-    else:
-      # tensor
-      self._instrument_records[out_tensor.name] = TensorTracer.reason(
-          op_id, _REASON_TENSOR_GET_TRACED)
-      return False
-
-  def _filter_execution_path_operations(self, operations, fetches):
-    """Returns the set of ops in the execution path to compute given fetches."""
-
-    # If no fetch provided, then return all operations.
-    if fetches is None:
-      return set(operations)
-    # Convert to list, if a single element is provided.
-    if not isinstance(fetches, (list, tuple)):
-      fetches = [fetches]
-    # If a tensor is given as fetch, convert it to op.
-    op_fetches = []
-    for fetch in fetches:
-      if isinstance(fetch, ops.Operation):
-        op_fetches.append(fetch)
-      elif isinstance(fetch, ops.Tensor):
-        op_fetches.append(fetch.op)
-      else:
-        raise RuntimeError('Given fetch:%s is neither a tensor nor an op.'
-                           %fetch)
-
-    execution_path_operations = set(op_fetches)
-    traverse_stack = list(op_fetches)
-    while True:
-      if not traverse_stack:
-        break
-      head_op = traverse_stack.pop()
-      input_ops = [tensor_input.op for tensor_input in head_op.inputs]
-      input_ops.extend(head_op.control_inputs)
-
-      for input_op in input_ops:
-        if input_op not in execution_path_operations:
-          execution_path_operations.add(input_op)
-          traverse_stack.append(input_op)
-    return execution_path_operations
-
-  def _determine_traced_tensors(self, graph, fetches):
-    """Determines the tensors that will be traced."""
-
-    self._traced_tensorname_to_cache_idx_map = {}
-    self._cache_idx_to_tensor_idx = []
-    operations = graph.get_operations()
-    # Filter out the operations that won't be executed.
-    # if fetches=None, then ops_in_exec_path = set(operations)
-    ops_in_exec_path = self._filter_execution_path_operations(operations,
-                                                              fetches)
-    checkpoint_operations = self._get_checkpoints(graph)
-    for op_id, op in enumerate(operations):
-      if checkpoint_operations and op.name not in checkpoint_operations:
-        continue
-      user_included = self._is_user_included_op(op)
-      user_excluded = self._is_user_excluded_op(op)
-      in_exec_path = op in ops_in_exec_path
-      if self._skip_op(op_id, op, user_included, user_excluded, in_exec_path):
-        continue
-      for i in range(len(op.outputs)):
-        out_tensor = op.outputs[i]
-        if self._skip_tensor(op_id, out_tensor, user_included,
-                             user_excluded):
-          continue
-        tensor_name = out_tensor.name
-        if tensor_name in self._traced_tensorname_to_cache_idx_map:
-          raise ValueError(
-              'Tensor name %s should not be already in '
-              'traced_tensorname_to_cache_idx_map'%tensor_name)
-        if tensor_name not in self._tensorname_idx_map:
-          raise ValueError(
-              'Tensor name %s is not in the tensorname_idx_map'%tensor_name)
-        tensor_idx = self._tensorname_idx_map[tensor_name]
-        cache_idx = len(self._traced_tensorname_to_cache_idx_map)
-        self._traced_tensorname_to_cache_idx_map[tensor_name] = cache_idx
-        self._cache_idx_to_tensor_idx.append(tensor_idx)
-        if len(self._traced_tensorname_to_cache_idx_map) != len(
-            self._cache_idx_to_tensor_idx):
-          raise RuntimeError('len(self._traced_tensorname_to_cache_idx_map) != '
-                             'len(self._cache_idx_to_tensor_idx')
-
-  def _check_trace_files(self):
-    """Checks if any requirements for trace files are satisfied."""
-
-    if not self._trace_dir:
-      # traces will be written to stderr. No need to check trace files.
-      return
-    if _trace_files_need_precreated(self._trace_dir):
-      for replica_id in range(0, self._num_replicas):
-        trace_file_path = os.path.join(
-            self._trace_dir,
-            _COMPACT_TRACE_FILE_PREFIX) + '%d'%replica_id
-        if not gfile.Exists(trace_file_path):
-          raise RuntimeError(
-              '%s must be pre-created with the '
-              'appropriate properties.'%trace_file_path)
-    else:
-      if not gfile.Exists(self._trace_dir):
-        gfile.MkDir(self._trace_dir)
-        if not gfile.Exists(self._trace_dir):
-          raise RuntimeError('Failed to create %s'%self._trace_dir)
-
-  def _pre_tracing(self, graph, fetches):
-    """Work needs to be done prior to TPU or CPU tracing."""
-
-    self._check_trace_files()
-    operations = graph.get_operations()
-    (opname_idx_map, tensor_list, self._tensorname_idx_map) = (
-        TensorTracer._make_op_and_tensor_maps(operations))
-    self._write_config_section()
-    self._write_op_list_section(operations)
-    self._write_tensor_list_section(tensor_list, opname_idx_map)
-    self._determine_traced_tensors(graph, fetches)
-    self._write_cache_index_map_section()
-    # Does the topological sort before adding any nodes to the graph.
-    (succeed, sorted_or_cycle) = TensorTracer.topological_sort(graph)
-    if self._use_tensor_values_cache():
-      _create_tensor_values_cache(graph,
-                                  len(self._cache_idx_to_tensor_idx))
-    return (operations, succeed, sorted_or_cycle)
-
-  def _post_tracing(self, succeed, sorted_or_cycle):
-    """Work needs to be done after TPU or CPU tracing."""
-
-    self._write_reason_section()
-    self._write_graph_section(succeed, sorted_or_cycle)
-    self._close_report_file()
-
-  def _get_checkpoints(self, graph):
-    """Returns the list of Ops that produce the tensors traced with API.
-
-    Args:
-      graph: the graph of Ops.
-
-    Returns:
-      A set of operation names which should be traced.
-    """
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
-                                  _TENSOR_TRACER_CHECKPOINT))
-    checkpoint_operations = set()
-    tensor_tracer_variables = graph.get_collection(_TENSOR_TRACER_COLLECTION)
-    for (tensor, checkpoint_name) in tensor_tracer_variables:
-      self._write_report('%s %s\n'%(tensor.name, checkpoint_name))
-      checkpoint_operations.add(tensor.op.name)
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
-                                  _TENSOR_TRACER_CHECKPOINT))
-    return checkpoint_operations
-
-  def _generate_flush_cache_op(self, graph, start_replica, on_tpu):
-    """Generates an Op that will flush the cache to file.
-
-    Args:
-      graph: the graph of Ops
-      start_replica: the ID of the first replica being flushed by this Op.
-      on_tpu: if the graph is executed on TPU.
-
-    Returns:
-      The Op to flush the cache to file.
-    """
-    def _make_flush_fun(replica_id):
-      """Makes a function for flushing the cache for the given replica."""
-
-      def _fun():
-        """A function that flushes the cache to a file."""
-
-        def _flush_fun(cache):
-          """Flushes the cache to a file."""
-
-          if isinstance(replica_id, str):
-            replica_id_str = replica_id
-          else:
-            replica_id_str = '%d'%replica_id
-          output_path = os.path.join(self._trace_dir,
-                                     _COMPACT_TRACE_FILE_PREFIX) \
-                                     + replica_id_str
-          output_stream = _OUTPUT_STREAM_ESCAPE + output_path
-          new_step_line = _REPLICA_ID_TAG + replica_id_str
-          print_op = logging_ops.print_v2(
-              new_step_line, '\n',
-              cache, '\n',
-              summarize=-1,
-              output_stream=output_stream)
-          with ops.control_dependencies([print_op]):
-            return constant_op.constant(0).op
-
-        cache = _get_tensor_values_cache(graph)
-        if on_tpu:
-          flush_op = tpu.outside_compilation(_flush_fun, cache.value())
-        else:
-          flush_op = _flush_fun(cache.value())
-        with ops.control_dependencies([flush_op]):
-          reset_value = constant_op.constant(_COMPACT_TRACE_ENTRY_INIT_VALUE,
-                                             dtype=cache.dtype,
-                                             shape=cache.shape)
-          assign_op = state_ops.assign(cache, reset_value).op
-          with ops.control_dependencies([assign_op]):
-            return flush_op.outputs[0]
-
-      return _fun
-
-    def _f(replica_id):
-      return _make_flush_fun(replica_id)
-    def _eq(x):
-      return math_ops.equal(x, self._replica_id)
-    def _do_nothing():
-      return constant_op.constant(0)
-
-    return control_flow_ops.case({\
-                                  _eq(start_replica): _f(start_replica), \
-                                  _eq(start_replica+1): _f(start_replica+1), \
-                                  _eq(start_replica+2): _f(start_replica+2), \
-                                  _eq(start_replica+3): _f(start_replica+3), \
-                                  _eq(start_replica+4): _f(start_replica+4), \
-                                  _eq(start_replica+5): _f(start_replica+5), \
-                                  _eq(start_replica+6): _f(start_replica+6), \
-                                  _eq(start_replica+7): _f(start_replica+7), \
-    },
-                                 default=_do_nothing,
-                                 exclusive=True).op
-
-  def _flush_tensor_values_cache(self, graph, result_tensor, train_op, on_tpu):
-    """Flushes the intermediate tensor values in the graph to the cache.
-
-    Args:
-      graph: the graph of Ops
-      result_tensor: a result tensor of evaluating the graph.
-      train_op: the training op.
-      on_tpu: if the graph is executed on TPU.
-
-    Returns:
-      An identical copy of result tensor.
-    """
-
-    train_op_list = []
-    if train_op is not None:
-      train_op_list.append(train_op)
-    with ops.control_dependencies(train_op_list):
-      flush_cache_op_list = []
-      for host in range(self._num_hosts):
-        start_replica = host * 8
-        flush_op = self._generate_flush_cache_op(graph, start_replica, on_tpu)
-        flush_cache_op_list.append(flush_op)
-      with ops.control_dependencies(flush_cache_op_list):
-        return array_ops.identity(result_tensor)
-
-  def trace_tpu(self, graph,
-                result_tensor,
-                train_op,
-                num_replicas=None,
-                num_replicas_per_host=None,
-                num_hosts=None):
-    """Traces the tensors generated by TPU Ops in a TF graph.
-
-    Args:
-      graph: the graph of Ops executed on the TPU.
-      result_tensor: a result tensor of evaluating the graph.
-      train_op: the training op.
-      num_replicas: number of replicas used on the TPU.
-      num_replicas_per_host: number of replicas per TPU host.
-      num_hosts: total number of TPU hosts.
-
-    Returns:
-      A tuple (result_tensor_copy, tracing_ops), where:
-        result_tensor_copy: an exact copy of result_tensor
-        tracing_ops: a list of tracing ops. If this list
-                     is non empty, the caller of this function
-                     should pose control dependencies upon these
-                     Ops so that they will be executed when the
-                     graph is evaluated.
-
-    Raises:
-      RuntimeError: If num_replicas_per_host > 8.
-    """
-
-    def _cast_unsupported_dtypes(tensor):
-      """Casts tensor to a supported type."""
-
-      if tensor.dtype.__eq__(dtypes.int64):
-        # outside-compilation doesn't support int64 input yet.
-        return math_ops.cast(tensor, dtypes.int32)
-      if tensor.dtype.__eq__(dtypes.bfloat16) or tensor.dtype.__eq__(
-          dtypes.float16):
-        # Since host can't handle bf16, convert tensor to f32.
-        return math_ops.cast(tensor, dtypes.float32)
-      return tensor
-
-    self._device_type = _DEVICE_TYPE_TPU
-    self._num_replicas = num_replicas
-    self._num_replicas_per_host = num_replicas_per_host
-    self._num_hosts = num_hosts
-    if self._num_replicas_per_host > 8:
-      # Checks for the assumption in _generate_flush_cache_op().
-      raise RuntimeError(
-          'num_replicas_per_host (%d) is '
-          'greater than 8'%self._num_replicas_per_host)
-
-    TensorTracer.check_device_type(self._device_type)
-    result_tensor_copy = self._add_replica_id_to_graph(result_tensor)
-    fetches = _set_fetches(result_tensor, train_op)
-    (operations, succeed, sorted_or_cycle) = self._pre_tracing(graph, fetches)
-
-    tracing_ops = []
-    for op in operations:
-      for i in range(len(op.outputs)):
-        out_tensor = op.outputs[i]
-        tensor_name = out_tensor.name
-        if tensor_name not in self._traced_tensorname_to_cache_idx_map:
-          continue
-        # Create the list of consumers before calling _preprocess_traced_tensor.
-        # Otherwise, adding control input below, will introduce a cycle in the
-        # graph.
-        consumers = out_tensor.consumers()
-        if not consumers:
-          continue
-        processed_out_tensor = self._preprocess_traced_tensor(out_tensor)
-        processed_out_tensor = _cast_unsupported_dtypes(processed_out_tensor)
-        if self._use_tensor_values_cache():
-          cache_idx = self._traced_tensorname_to_cache_idx_map[tensor_name]
-          trace_op = self._save_tensor_value_to_cache_op(graph,
-                                                         cache_idx,
-                                                         processed_out_tensor)
-        else:
-          trace_op = tpu.outside_compilation(
-              self._make_tensor_trace_fun(tensor_name), processed_out_tensor)
-        for consumer_op in consumers:
-          # pylint: disable=protected-access
-          consumer_op._add_control_input(trace_op)
-          # pylint: enable=protected-access
-    if self._use_tensor_values_cache():
-      result_tensor_final = self._flush_tensor_values_cache(graph,
-                                                            result_tensor_copy,
-                                                            train_op,
-                                                            on_tpu=True)
-    else:
-      result_tensor_final = result_tensor_copy
-    self._post_tracing(succeed, sorted_or_cycle)
-    return (result_tensor_final, tracing_ops)
-
-  def _generate_cpu_result(self, result_tensor, train_op, graph):
-    """Generates the final CPU result."""
-
-    if self._use_tensor_values_cache():
-      result_tensor_final = self._flush_tensor_values_cache(graph,
-                                                            result_tensor,
-                                                            train_op,
-                                                            on_tpu=False)
-    else:
-      result_tensor_final = array_ops.identity(result_tensor)
-    return result_tensor_final
-
-  def trace_cpu(self, graph, result_tensor, train_op):
-    """Traces the tensors generated by CPU Ops in a TF graph.
-
-    Args:
-      graph: the graph of Ops executed on the CPU.
-      result_tensor: a result tensor of evaluating the graph.
-      train_op: the training op.
-
-    Returns:
-      A pair (final_result_tensor, tracing_calls) where:
-         final_result_tensor: an identical copy of result_tensor.
-         tracing_calls: a map from keys to trace calls.
-                     A key is constructed from an Op's name.
-                     A trace call consists of a function and a tensor (
-                     the function will be invoked with the tensor).
-    """
-
-    if result_tensor is None:
-      raise ValueError(
-          'The result_tensor passed to trace_cpu should not be None')
-
-    self._device_type = _DEVICE_TYPE_CPU
-    TensorTracer.check_device_type(self._device_type)
-    self._num_replicas = 1
-    self._num_replicas_per_host = 1
-    self._num_hosts = 1
-    self._replica_id = 0
-    fetches = _set_fetches(result_tensor, train_op)
-    (operations, succeed, sorted_or_cycle) = self._pre_tracing(graph, fetches)
-
-    tracing_calls = {}
-    for op in operations:
-      for i in range(len(op.outputs)):
-        out_tensor = op.outputs[i]
-        tensor_name = out_tensor.name
-        if tensor_name not in self._traced_tensorname_to_cache_idx_map:
-          continue
-        # Create the list of consumers before calling _preprocess_traced_tensor.
-        # Otherwise, adding control input below, will introduce a cycle in the
-        # graph.
-        consumers = out_tensor.consumers()
-        if not consumers:
-          continue
-        processed_out_tensor = self._preprocess_traced_tensor(out_tensor)
-        if self._use_tensor_values_cache():
-          cache_idx = self._traced_tensorname_to_cache_idx_map[tensor_name]
-          trace_op = self._save_tensor_value_to_cache_op(graph,
-                                                         cache_idx,
-                                                         processed_out_tensor)
-          for consumer_op in consumers:
-            # pylint: disable=protected-access
-            consumer_op._add_control_input(trace_op)
-            # pylint: enable=protected-access
-        else:
-          trace_fun = self._make_tensor_trace_fun(tensor_name)
-          trace_call = (trace_fun, [processed_out_tensor])
-          trace_call_key = 'tensor_tracing_cpu-%s:%d'%(op.name, i)
-          tracing_calls[trace_call_key] = trace_call
-
-    self._post_tracing(succeed, sorted_or_cycle)
-    final_result_tensor = self._generate_cpu_result(result_tensor,
-                                                    train_op,
-                                                    graph)
-    return (final_result_tensor, tracing_calls)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tensor_tracer import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/topology.py b/tensorflow/contrib/tpu/python/tpu/topology.py
index 6ae718cc2c9716587849aeee8abcd0a1de82a9ae..5bf805752cf51b0a0f4b7400b18b63aae93cf831 100644
--- a/tensorflow/contrib/tpu/python/tpu/topology.py
+++ b/tensorflow/contrib/tpu/python/tpu/topology.py
@@ -1,220 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Defines the `Topology` class, that describes a TPU fabric topology."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.contrib.tpu.proto import topology_pb2
-
-
-def _tpu_device_name(job, task, device):
-  """Returns the device name for the TPU `device` on `task` of `job`."""
-  if job is None:
-    return "/task:%d/device:TPU:%d" % (task, device)
-  else:
-    return "/job:%s/task:%d/device:TPU:%d" % (job, task, device)
-
-
-def _tpu_host_device_name(job, task):
-  """Returns the device name for the CPU device on `task` of `job`."""
-  if job is None:
-    return "/task:%d/device:CPU:0" % task
-  else:
-    return "/job:%s/task:%d/device:CPU:0" % (job, task)
-
-
-class Topology(object):
-  """Describes a set of TPU devices.
-
-  Represents both the shape of the physical mesh, and the mapping between
-  TensorFlow TPU devices to physical mesh coordinates.
-  """
-
-  def __init__(self, serialized=None, mesh_shape=None, device_coordinates=None):
-    """Builds a Topology object.
-
-    If `serialized` is not `None`, the topology is parsed from `serialized` and
-    the other arguments are ignored. Otherwise, the topology is computed from
-    `mesh_shape` and `device_coordinates`.
-
-    Args:
-      serialized: A serialized `TopologyProto`, or `None`. If not `None`, the
-        serialized proto is parsed to discover the topology.
-      mesh_shape: A sequence of 3 positive integers, or `None`. If not `None`,
-        the shape of the TPU topology, in number of cores. Ignored if
-        `serialized` is not `None`.
-      device_coordinates: A rank 3 numpy array that describes the mapping from
-        TensorFlow TPU devices to TPU fabric coordinates, or `None`. Ignored
-        if `serialized is not `None`.
-
-    Raises:
-      ValueError: If `serialized` does not describe a well-formed topology.
-      ValueError: If `serialized` is `None` and `mesh_shape` is not a sequence
-        of 3 positive integers.
-      ValueError: If `serialized` is `None` and `device_coordinates` is not a
-        rank 3 numpy int32 array that describes a valid coordinate mapping.
-    """
-
-    self._serialized = serialized
-
-    if serialized:
-      self._parse_topology(serialized)
-    else:
-      self._mesh_shape = np.asarray(mesh_shape, dtype=np.int32)
-      self._device_coordinates = np.asarray(device_coordinates, np.int32)
-      if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
-        raise ValueError("`mesh_shape` must be a sequence of 3 positive "
-                         "entries; got {}".format(self._mesh_shape))
-
-      if (len(self._device_coordinates.shape) != 3 or
-          self._device_coordinates.shape[2] != len(self._mesh_shape)):
-        raise ValueError("`device_coordinates` must be a rank 3 int32 array "
-                         "with minor dimension equal to the mesh shape rank")
-
-    self._topology_tasks, self._topology_devices = self._invert_topology()
-
-  def _parse_topology(self, serialized):
-    """Parses a serialized `TopologyProto` into `self`."""
-    proto = topology_pb2.TopologyProto()
-    proto.ParseFromString(serialized)
-
-    self._mesh_shape = np.array(proto.mesh_shape, dtype=np.int32)
-    if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
-      raise ValueError("`mesh_shape` must be a vector of size 3 with positive "
-                       "entries; got {}".format(self._mesh_shape))
-
-    if proto.num_tasks < 0:
-      raise ValueError("`num_tasks` must be >= 0; got {}".format(
-          proto.num_tasks))
-    if proto.num_tpu_devices_per_task < 0:
-      raise ValueError("`num_tpu_devices_per_task` must be >= 0; got {}".format(
-          proto.num_tpu_devices_per_task))
-
-    expected_coordinates_size = (
-        proto.num_tasks * proto.num_tpu_devices_per_task * len(
-            proto.mesh_shape))
-    if len(proto.device_coordinates) != expected_coordinates_size:
-      raise ValueError("`device_coordinates` must have shape num_tasks ({}) * "
-                       "num_tpu_devices_per_task ({}) * len(mesh_shape) ({}); "
-                       "got shape {}".format(proto.num_tasks,
-                                             proto.num_tpu_devices_per_task,
-                                             proto.mesh_shape,
-                                             len(proto.device_coordinates)))
-
-    coords = np.array(proto.device_coordinates, dtype=np.int32)
-    if any(coords < 0):
-      raise ValueError("`device_coordinates` must be >= 0")
-    coords = coords.reshape((proto.num_tasks, proto.num_tpu_devices_per_task,
-                             len(proto.mesh_shape)))
-    self._device_coordinates = coords
-
-  def _invert_topology(self):
-    """Inverts a [task,device,axis] topology to [x,y,z] -> task/device maps."""
-    tasks = np.full(list(self.mesh_shape), -1, dtype=np.int32)
-    devices = np.full(list(self.mesh_shape), -1, dtype=np.int32)
-    for task in xrange(self.device_coordinates.shape[0]):
-      for device in xrange(self.device_coordinates.shape[1]):
-        x, y, z = self.device_coordinates[task, device, :]
-        tasks[x, y, z] = task
-        devices[x, y, z] = device
-    return tasks, devices
-
-  @property
-  def mesh_shape(self):
-    """A rank 1 int32 array describing the shape of the TPU topology."""
-    return self._mesh_shape
-
-  @property
-  def mesh_rank(self):
-    """Returns the number of dimensions in the mesh."""
-    return len(self._mesh_shape)
-
-  @property
-  def device_coordinates(self):
-    """Describes the mapping from TPU devices to topology coordinates.
-
-    Returns:
-      A rank 3 int32 array with shape `[tasks, devices, axis]`.
-      `tasks` is the number of tasks in the TPU cluster, `devices` is the number
-      of TPU devices per task, and `axis` is the number of axes in the TPU
-      cluster topology. Each entry gives the `axis`-th coordinate in the
-      topology of a task/device pair. TPU topologies are 3-dimensional, with
-      dimensions `(x, y, core number)`.
-    """
-    return self._device_coordinates
-
-  def task_ordinal_at_coordinates(self, device_coordinates):
-    """Returns the TensorFlow task number attached to `device_coordinates`.
-
-    Args:
-      device_coordinates: An integer sequence describing a device's physical
-        coordinates in the TPU fabric.
-
-    Returns:
-      Returns the TensorFlow task number that contains the TPU device with those
-      physical coordinates.
-    """
-    return self._topology_tasks[tuple(device_coordinates)]
-
-  def tpu_device_ordinal_at_coordinates(self, device_coordinates):
-    """Returns the TensorFlow device number at `device_coordinates`.
-
-    Args:
-      device_coordinates: An integer sequence describing a device's physical
-        coordinates in the TPU fabric.
-
-    Returns:
-      Returns the TensorFlow device number within the task corresponding to
-      attached to the device with those physical coordinates.
-    """
-    return self._topology_devices[tuple(device_coordinates)]
-
-  def cpu_device_name_at_coordinates(self, device_coordinates, job=None):
-    """Returns the CPU device attached to a logical core."""
-    return _tpu_host_device_name(
-        job, self._topology_tasks[tuple(device_coordinates)])
-
-  def tpu_device_name_at_coordinates(self, device_coordinates, job=None):
-    """Returns the name of the TPU device assigned to a logical core."""
-    return _tpu_device_name(job,
-                            self._topology_tasks[tuple(device_coordinates)],
-                            self._topology_devices[tuple(device_coordinates)])
-
-  @property
-  def num_tasks(self):
-    """Returns the number of TensorFlow tasks in the TPU slice."""
-    return self._device_coordinates.shape[0]
-
-  @property
-  def num_tpus_per_task(self):
-    """Returns the number of TPU devices per task in the TPU slice."""
-    return self._device_coordinates.shape[1]
-
-  def serialized(self):
-    """Returns the serialized form of the topology."""
-    if self._serialized is None:
-      proto = topology_pb2.TopologyProto()
-      proto.mesh_shape[:] = list(self._mesh_shape)
-      proto.num_tasks = self._device_coordinates.shape[0]
-      proto.num_tpu_devices_per_task = self._device_coordinates.shape[1]
-      proto.device_coordinates.extend(list(self._device_coordinates.flatten()))
-      self._serialized = proto.SerializeToString()
-
-    return self._serialized
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.topology import *
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index de2bfd49eca50c87dc506d9aa690d49c8da20460..5364b20f231ac7af8adf943c3d5e21921b7a06a9 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -1,1566 +1,25 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-
-"""Library of TPU helper functions."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.contrib.compiler import xla
-from tensorflow.contrib.framework.python.framework import experimental
-from tensorflow.contrib.tpu.proto import dynamic_padding_pb2 as dynamic_padding
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-
-from tensorflow.core.framework import attr_value_pb2
-from tensorflow.python.compat import compat as api_compat
-from tensorflow.python.framework import device as pydev
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import compat
-from tensorflow.python.util import nest
-
-
-# Operations that indicate some error in the users graph, e.g. a placeholder
-# that's introduced outside of the infeed.
-_BLACKLISTED_OPS = set([
-    "Placeholder",
-])
-
-# XLA doesn't currently support reading of intermediate tensors, thus some ops
-# are not supported.
-_UNSUPPORTED_OPS = set([
-    "AudioSummary",
-    "AudioSummaryV2",
-    "HistogramSummary",
-    "ImageSummary",
-    "MergeSummary",
-    "Print",
-    "ScalarSummary",
-    "TensorSummary",
-    "TensorSummaryV2",
-    ])
-
-_MAX_WARNING_LINES = 5
-
-_TPU_REPLICATE_ATTR = "_tpu_replicate"
-_TPU_COMPILATION_STATUS_ATTR = "_tpu_compilation_status"
-_OUTSIDE_COMPILATION_ATTR = "_xla_outside_compilation"
-
-
-def _tpu_system_device_name(job):
-  """Returns the device name for the TPU_SYSTEM device of `job`."""
-  if job is None:
-    return "/device:TPU_SYSTEM:0"
-  else:
-    return "/job:%s/device:TPU_SYSTEM:0" % job
-
-
-def initialize_system(embedding_config=None, job=None):
-  """Initializes a distributed TPU system for use with TensorFlow.
-
-  Args:
-    embedding_config: If not None, a `TPUEmbeddingConfiguration` proto
-      describing the desired configuration of the hardware embedding lookup
-      tables. If embedding_config is None, no hardware embeddings can be used.
-    job: The job (the XXX in TensorFlow device specification /job:XXX) that
-      contains the TPU devices that will be initialized. If job=None it is
-      assumed there is only one job in the TensorFlow flock, and an error will
-      be returned if this assumption does not hold.
-  Returns:
-    A serialized `TopologyProto` that describes the TPU system. Note:
-      the topology must be evaluated using `Session.run` before it can be used.
-  """
-  config_string = ("" if embedding_config is None else
-                   embedding_config.SerializeToString())
-  with ops.device(_tpu_system_device_name(job)):
-    return tpu_ops.configure_distributed_tpu(embedding_config=config_string)
-
-
-def shutdown_system(job=None):
-  """Shuts down a running a distributed TPU system."""
-  with ops.device(_tpu_system_device_name(job)):
-    shutdown_distributed_tpu = tpu_ops.shutdown_distributed_tpu()
-  return shutdown_distributed_tpu
-
-
-def core(num):
-  """Returns the device name for a core in a replicated TPU computation.
-
-  Args:
-    num: the virtual core number within each replica to which operators should
-    be assigned.
-  Returns:
-    A device name, suitable for passing to `tf.device()`.
-  """
-  return "device:TPU_REPLICATED_CORE:{}".format(num)
-
-
-class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
-  """A `ControlFlowContext` for nodes inside a TPU computation.
-
-  The primary role of `TPUReplicateContext` is to mark operators inside a
-  tpu.replicate() computation with the attribute "_tpu_replicate=XYZ", where XYZ
-  is a unique name.
-
-  We use a `ControlFlowContext` to perform the annotation since it integrates
-  with Tensorflow constructs like ResourceVariables. For example, if a
-  `ResourceVariable` is constructed inside a tpu.replicate() block, the
-  `ResourceVariable` implementation can use
-  `with ops.control_dependencies(None)` to build the variable's definition
-  outside the replicated computation.
-  """
-
-  def __init__(self, name, num_replicas, pivot):
-    """Builds a new TPUReplicateContext.
-
-    Args:
-      name: a unique name for the context, used to populate the `_tpu_replicate`
-        attribute.
-      num_replicas: an integer that gives the number of replicas for the
-        computation.
-      pivot: a pivot node. Nodes in the TPUReplicateContext that do not have any
-        inputs will have a control dependency on the pivot node. This ensures
-        that nodes are correctly included in any enclosing control flow
-        contexts.
-    """
-    super(TPUReplicateContext, self).__init__()
-    self._num_replicas = num_replicas
-    self._outer_device_function_stack = None
-    self._oc_dev_fn_stack = None
-    self._outside_compilation_cluster = None
-    self._outside_compilation_counter = 0
-    self._in_gradient_colocation = None
-    self._gradient_colocation_stack = []
-    self._host_compute_core = []
-    self._name = name
-    self._name_as_bytes = compat.as_bytes(name)
-    self._unsupported_ops = []
-    self._pivot = pivot
-    self._replicated_vars = {}
-
-  def get_replicated_var_handle(self, name, vars_):
-    """Returns a variable handle for replicated TPU variable 'var'.
-
-    This is a method used by an experimental replicated variable implementation
-    and is not intended as a public API.
-
-    Args:
-      name: The common name of the variable.
-      vars_: The replicated TPU variables.
-
-    Returns:
-      The handle of the TPU replicated input node.
-    """
-    handle = self._replicated_vars.get(name)
-    if handle is not None:
-      return handle
-
-    # Builds a TPUReplicatedInput node for the variable, if one does not already
-    # exist. The TPUReplicatedInput node must belong to the enclosing
-    # control-flow scope of the TPUReplicateContext.
-    # TODO(phawkins): consider changing the contract of the TPU encapsulation
-    # so the TPUReplicatedInput nodes go inside the TPUReplicateContext scope
-    # instead.
-
-    # pylint: disable=protected-access
-    graph = ops.get_default_graph()
-    saved_context = graph._get_control_flow_context()
-    graph._set_control_flow_context(self.outer_context)
-    handle = tpu_ops.tpu_replicated_input(
-        [v.handle for v in vars_], name=name + "/handle")
-    graph._set_control_flow_context(saved_context)
-    # pylint: enable=protected-access
-    self._replicated_vars[name] = handle
-    return handle
-
-  def report_unsupported_operations(self):
-    if self._unsupported_ops:
-      op_str = "\n".join(["  %s (%s)" % (op.type, op.name)
-                          for op in self._unsupported_ops[:_MAX_WARNING_LINES]])
-      logging.warning("%d unsupported operations found: \n%s",
-                      len(self._unsupported_ops), op_str)
-      if len(self._unsupported_ops) > _MAX_WARNING_LINES:
-        logging.warning("... and %d more" %
-                        (len(self._unsupported_ops) - _MAX_WARNING_LINES))
-
-  def EnterGradientColocation(self, op, gradient_uid):
-    if op is not None:
-      self._gradient_colocation_stack.append(op)
-      if not self._outside_compilation_cluster:
-        try:
-          outside_attr = op.get_attr(_OUTSIDE_COMPILATION_ATTR)
-          if self._in_gradient_colocation:
-            raise NotImplementedError(
-                "Cannot nest gradient colocation operations outside compilation"
-            )
-          if gradient_uid == "__unsupported__":
-            raise NotImplementedError(
-                "No gradient_uid calling gradient within outside_compilation")
-          # When we take the gradient of an op X in an outside_compilation
-          # cluster C in a forward computation we would like to put the ops
-          # corresponding to the gradient of X into a new outside_compilation
-          # cluster C'. However, if we take the gradient of X twice, the second
-          # one should get yet another new outside_compilation cluster C''.
-          #
-          # The mechanism we adopt is to use a 'root_cluster' which is the
-          # cluster that X was in before we took gradients, and a 'gradient_uid'
-          # which is different for every invocation of gradients, and put the
-          # gradient of X in cluster 'root_cluster.gradient_uid'.
-          #
-          # When taking a gradient of a gradient, some ops will be colocated
-          # with Op in the forward pass (e.g., cluster root_cluster) and some in
-          # the backward pass (e.g., cluster root_cluster.initial_gradient_uid).
-          # We need all of the grad-of-grad ops to be in the same cluster to
-          # avoid cyclic dependencies between clusters. We adopt a heuristic
-          # that puts any op clustered with root_cluster.<xxx> in
-          # root_cluster.gradient_uid, even if xxx was initial_gradient_uid.
-          self._in_gradient_colocation = op
-          parts = outside_attr.split(".")
-          cluster = parts[0] + "." + gradient_uid
-          self._EnterOutsideCompilationScope(cluster=cluster)
-        except ValueError:
-          # The attr was not present: do nothing.
-          pass
-
-  def ExitGradientColocation(self, op, gradient_uid):
-    if op is not None:
-      if not self._gradient_colocation_stack:
-        raise errors.InternalError(
-            op.node_def, op,
-            "Badly nested gradient colocation: empty stack when popping Op " +
-            op.name)
-      last_op = self._gradient_colocation_stack.pop()
-      if op is last_op:
-        if op is self._in_gradient_colocation:
-          self._in_gradient_colocation = None
-          self._ExitOutsideCompilationScope()
-      else:
-        raise errors.InternalError(
-            op.node_def, op, "Badly nested gradient colocation, expected " +
-            last_op + ", got " + op.name)
-
-  def _EnterOutsideCompilationScope(self, cluster=None):
-
-    class FakeOp(object):
-      """A helper class to determine the current device.
-
-      Supports only the type and device set/get methods needed to run the
-      graph's _apply_device_function method.
-      """
-
-      def __init__(self):
-        self._device = ""
-
-      @property
-      def type(self):
-        return "FakeOp"
-
-      @property
-      def device(self):
-        return self._device
-
-      def _set_device(self, device):
-        if isinstance(device, pydev.DeviceSpec):
-          self._device = device.to_string()
-        else:
-          self._device = device
-
-    if self._outside_compilation_cluster:
-      raise NotImplementedError("Cannot nest outside_compilation clusters")
-    if cluster:
-      self._outside_compilation_cluster = cluster
-    else:
-      self._outside_compilation_cluster = str(self._outside_compilation_counter)
-      self._outside_compilation_counter += 1
-    graph = ops.get_default_graph()
-    fake_op = FakeOp()
-    graph._apply_device_functions(fake_op)  # pylint: disable=protected-access
-    device = pydev.DeviceSpec.from_string(fake_op.device)
-    if (device.device_type == "TPU_REPLICATED_CORE" and
-        device.device_index is not None):
-      self._host_compute_core.append(self._outside_compilation_cluster + ":" +
-                                     str(device.device_index))
-    self._oc_dev_fn_stack = graph._device_function_stack  # pylint: disable=protected-access
-    graph._device_function_stack = self._outer_device_function_stack  # pylint: disable=protected-access
-
-  def _ExitOutsideCompilationScope(self):
-    if not self._outside_compilation_cluster:
-      raise NotImplementedError(
-          "Attempted to exit outside_compilation scope when not in scope")
-    self._outside_compilation_cluster = None
-    graph = ops.get_default_graph()
-    graph._device_function_stack = self._oc_dev_fn_stack  # pylint: disable=protected-access
-
-  def Enter(self):
-    if not self._outer_device_function_stack:
-      # Capture the device function stack at the time of first entry
-      # since that is the stack that will be used outside_compilation.
-      graph = ops.get_default_graph()
-      # pylint: disable=protected-access
-      self._outer_device_function_stack = graph._device_function_stack.copy()
-      # pylint: enable=protected-access
-    super(TPUReplicateContext, self).Enter()
-
-  def HostComputeCore(self):
-    return self._host_compute_core
-
-  def _RemoveExternalControlEdges(self, op):
-    """Remove any external control dependency on this op."""
-    internal_control_inputs = []
-    external_control_inputs = []
-    for x in op.control_inputs:
-      # pylint: disable=protected-access
-      is_internal_op = False
-      ctxt = x._get_control_flow_context()
-      while ctxt is not None:
-        if ctxt == self:
-          is_internal_op = True
-          break
-        ctxt = ctxt._outer_context
-      if is_internal_op:
-        internal_control_inputs.append(x)
-      else:
-        external_control_inputs.append(x)
-      # pylint: enable=protected-access
-    # pylint: disable=protected-access
-    op._remove_all_control_inputs()
-    op._add_control_inputs(internal_control_inputs)
-    # pylint: enable=protected-access
-    return internal_control_inputs, external_control_inputs
-
-  def AddOp(self, op):
-    # pylint: disable=protected-access
-    if op.type in _BLACKLISTED_OPS:
-      logging.error("Operation of type %s (%s) is not supported on the TPU. "
-                    "Execution will fail if this op is used in the graph. " %
-                    (op.type, op.name))
-
-    if op.type in _UNSUPPORTED_OPS:
-      self._unsupported_ops.append(op)
-
-    if any(x.dtype._is_ref_dtype for x in op.inputs):
-      raise NotImplementedError(
-          "Non-resource Variables are not supported inside TPU computations "
-          "(operator name: %s)" % op.name)
-    if _TPU_REPLICATE_ATTR in op.node_def.attr:
-      raise ValueError("TPU computations cannot be nested")
-    op._set_attr(_TPU_REPLICATE_ATTR,
-                 attr_value_pb2.AttrValue(s=self._name_as_bytes))
-    if self._outside_compilation_cluster:
-      op._set_attr(
-          _OUTSIDE_COMPILATION_ATTR,
-          attr_value_pb2.AttrValue(
-              s=compat.as_bytes(self._outside_compilation_cluster)))
-    if self._num_replicas > 1 or not self._outside_compilation_cluster:
-      # Prevent feeding or fetching anything that is being compiled,
-      # and any replicated outside_compilation Op.
-      op.graph.prevent_feeding(op)
-      op.graph.prevent_fetching(op)
-
-    # Remove any control edges from outer control flow contexts. These may cause
-    # mismatched frame errors.
-    (internal_control_inputs,
-     external_control_inputs) = self._RemoveExternalControlEdges(op)
-
-    if not op.inputs:
-      # Add a control edge from the control pivot to this op.
-      if not internal_control_inputs:
-        # pylint: disable=protected-access
-        op._add_control_input(self.GetControlPivot())
-        # pylint: enable=protected-access
-    else:
-      for index in xrange(len(op.inputs)):
-        x = op.inputs[index]
-        real_x = self.AddValue(x)
-        if real_x != x:
-          op._update_input(index, real_x)  # pylint: disable=protected-access
-
-    if external_control_inputs:
-      # Use an identity to pull control inputs as data inputs. Note that we
-      # ignore ops which don't have outputs. TODO(phawkins): fix that.
-      with ops.control_dependencies(None):
-        self.Enter()
-        external_control_inputs = [
-            array_ops.identity(x.outputs[0]).op
-            for x in external_control_inputs
-            if x.outputs
-        ]
-        self.Exit()
-      # pylint: disable=protected-access
-      op._add_control_inputs(external_control_inputs)
-      # pylint: enable=protected-access
-
-    # Mark op's outputs as seen by this context and any outer contexts.
-    output_names = [x.name for x in op.outputs]
-    context = self
-    while context is not None:
-      # pylint: disable=protected-access
-      context._values.update(output_names)
-      context = context._outer_context
-      # pylint: enable=protected-access
-
-    if self._outer_context:
-      self._outer_context.AddInnerOp(op)
-
-  def AddValue(self, val):
-    """Add `val` to the current context and its outer context recursively."""
-    if val.name in self._values:
-      # Use the real value if it comes from outer context.
-      result = self._external_values.get(val.name)
-      return val if result is None else result
-
-    result = val
-    self._values.add(val.name)
-    if self._outer_context:
-      result = self._outer_context.AddValue(val)
-      self._values.add(result.name)
-
-    self._external_values[val.name] = result
-
-    return result
-
-  def AddInnerOp(self, op):
-    self.AddOp(op)
-    if self._outer_context:
-      self._outer_context.AddInnerOp(op)
-
-  @property
-  def grad_state(self):
-    # Define the gradient loop state associated with the TPUReplicateContext to
-    # be None as the TPUReplicateContext does not get nested nor does the
-    # grad_state outside the TPUReplicateContext affect the graph inside so the
-    # grad_state should be as if this is the top-level gradient state.
-    return None
-
-  @property
-  def back_prop(self):
-    """Forwards to the enclosing while context, if any."""
-    if self.GetWhileContext():
-      return self.GetWhileContext().back_prop
-    return False
-
-  def GetControlPivot(self):
-    return self._pivot
-
-
-def outside_compilation(computation, *args, **kwargs):
-  """Builds part of a computation outside any current TPU replicate scope.
-
-  Args:
-    computation: A Python function that builds the computation to
-      place on the host.
-    *args: the positional arguments for the computation.
-    **kwargs: the keyword arguments for the computation.
-
-  Returns:
-    The Tensors returned by computation.
-  """
-  args = [] if args is None else args
-  graph = ops.get_default_graph()
-
-  # If we are in a TPUReplicateContext, signal that we are now
-  # outside_compilation
-  initial_context = graph._get_control_flow_context()  # pylint: disable=protected-access
-  context = initial_context
-  while context:
-    if isinstance(context, TPUReplicateContext):
-      context._EnterOutsideCompilationScope()  # pylint: disable=protected-access
-    context = context.outer_context
-
-  retval = computation(*args, **kwargs)
-
-  # If we are in a TPUReplicateContext, signal that we are no longer
-  # outside_compilation
-  final_context = graph._get_control_flow_context()  # pylint: disable=protected-access
-  if initial_context is not final_context:
-    raise NotImplementedError(
-        "Control-flow context cannot be different at start and end of an "
-        "outside_compilation scope")
-  context = initial_context
-  while context:
-    if isinstance(context, TPUReplicateContext):
-      context._ExitOutsideCompilationScope()  # pylint: disable=protected-access
-    context = context.outer_context
-
-  return retval
-
-
-def replicate(computation,
-              inputs=None,
-              infeed_queue=None,
-              device_assignment=None,
-              name=None,
-              maximum_shapes=None):
-  """Builds a graph operator that runs a replicated TPU computation.
-
-  Args:
-    computation: A Python function that builds the computation to replicate.
-    inputs: A list of lists of input tensors or `None` (equivalent to
-      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
-      have the same number of inputs. Each input can be a nested structure
-      containing values that are convertible to tensors. Note that passing an
-      N-dimension list of compatible values will result in a N-dimention list of
-      scalar tensors rather than a single Rank-N tensors. If you need different
-      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to computation.
-    device_assignment: If not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. Uses a default device assignment if `None`. The
-      `DeviceAssignment` may be omitted if each replica of the computation uses
-      only one core, and there is either only one replica, or the number of
-      replicas is equal to the number of cores in the TPU system.
-    name: (Deprecated) Does nothing.
-    maximum_shapes: A nested structure of tf.TensorShape representing the shape
-      to which the respective component of each input element in each replica
-      should be padded. Any unknown dimensions (e.g. tf.Dimension(None) in a
-      tf.TensorShape or -1 in a tensor-like object) will be padded to the
-      maximum size of that dimension over all replicas. Note that if the input
-      dimension is already static, we won't do padding on it and we require the
-      maximum_shapes to have the same value or None on that dimension. The
-      structure of `maximum_shapes` needs to be the same as `inputs[0]`.
-  Returns:
-    A list of outputs, indexed by `[replica_num]` each output can be a nested
-    structure same as what computation() returns with a few exceptions.
-
-    Exceptions include:
-      1) None output: a NoOp would be returned which control-depends on
-         computation.
-      2) Single value output: A tuple containing the value would be returned.
-      3) Operation-only outputs: a NoOp would be returned which
-         control-depends on computation.
-      TODO(b/121383831): Investigate into removing these special cases.
-
-  Raises:
-    ValueError: If all replicas do not have equal numbers of input tensors.
-    ValueError: If the number of inputs per replica does not match
-      the number of formal parameters to `computation`.
-    ValueError: If the static `inputs` dimensions don't match with the values
-      given in `maximum_shapes`.
-    ValueError: If the structure of inputs per replica does not match
-      the structure of `maximum_shapes`.
-  """
-  return split_compile_and_replicate(
-      computation,
-      inputs,
-      infeed_queue,
-      device_assignment,
-      name,
-      maximum_shapes=maximum_shapes)[1]
-
-
-def _pad_all_input(inputs, padded_shapes):
-  """Pad all input tensors given padded_shapes.
-
-  The real shape tensors will be concatenated with the padded original inputs.
-
-  Args:
-    inputs: The original inputs.
-    padded_shapes: A list of padded shapes for each input.
-
-  Returns:
-    The padded inputs and a PaddingMap list which maps the padded input
-    dimension to the real shape argument index.
-  """
-  input_shape_tensors = []
-  for core_idx, inputs_per_core in enumerate(inputs):
-    for idx, input_tensor in enumerate(inputs_per_core):
-      if core_idx == 0:
-        input_shape_tensors.append([])
-      input_shape_tensors[idx].append(array_ops.shape(input_tensor))
-
-  maximum_shapes = []
-  for shapes_per_input in input_shape_tensors:
-    maximum_shapes.append(
-        math_ops.reduce_max(array_ops.stack(shapes_per_input), axis=0))
-
-  padded_inputs = []
-  real_shapes = []
-  padding_maps = []
-  for core_idx, inputs_per_core in enumerate(inputs):
-    padded_inputs.append([])
-    real_shapes.append([])
-    real_shape_idx = len(inputs_per_core) - 1
-    for idx, input_tensor in enumerate(inputs_per_core):
-      input_shape_tensor = input_shape_tensors[idx][core_idx]
-      input_shape = input_tensor.get_shape()
-      padded_shape = padded_shapes[idx]
-
-      # The static shape of inputs should be compatible with the given padded
-      # shapes.
-      input_shape.assert_is_compatible_with(padded_shape)
-
-      if input_shape.is_fully_defined():
-        # Do nothing if the shape of the whole tensor is already static.
-        padded_inputs[core_idx].append(input_tensor)
-      else:
-        # Only pad the non static shape dimension.
-        for i, s in enumerate(input_shape):
-          if s.value is None:
-            if core_idx == 0:
-              real_shape_idx += 1
-              padding_map = dynamic_padding.PaddingMap()
-              padding_map.arg_index = idx
-              padding_map.shape_index = i
-              padding_map.padding_arg_index = real_shape_idx
-              padding_maps.append(padding_map)
-            real_shapes[core_idx].append(
-                math_ops.cast(input_shape_tensor[i], dtypes.uint32))
-
-        paddings = []
-        for i, s in enumerate(padded_shape):
-          if input_shape[i].value:
-            # Don't pad if input shape is already static.
-            padding = [0, 0]
-          else:
-            if s.value:
-              # Pad to the given maximum value.
-              padding = [0, s.value - input_shape_tensor[i]]
-            else:
-              # If maximum value is not given, then pad to the maximum dimension
-              # among all the cores.
-              padding = [0, maximum_shapes[idx][i] - input_shape_tensor[i]]
-          paddings.append(padding)
-
-        padded_input = array_ops.pad(input_tensor, paddings)
-        padded_inputs[core_idx].append(padded_input)
-
-  num_replicas = len(padded_inputs)
-  for i in range(num_replicas):
-    padded_inputs[i].extend(real_shapes[i])
-
-  return padded_inputs, padding_maps
-
-
-def split_compile_and_replicate(computation,
-                                inputs=None,
-                                infeed_queue=None,
-                                device_assignment=None,
-                                name=None,
-                                use_tpu=True,
-                                maximum_shapes=None):
-  """Builds graph operators that runs compilation and replicated computation.
-
-  This is a lower level interface than replicate that returns a separate compile
-  and execute output tensor. In the generated graph the compile op feeds into
-  the execute op and no additional compilation is incurred when running the
-  compile op before the execute op. The compile op returns additional
-  information about the compilation but does not return the compiled program.
-
-  Args:
-    computation: A Python function that builds the computation to replicate.
-    inputs: A list of lists of input tensors or `None` (equivalent to
-      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
-      have the same number of inputs. Each input can be a nested structure
-      containing values that are convertible to tensors. Note that passing an
-      N-dimension list of compatible values will result in a N-dimention list of
-      scalar tensors rather than a single Rank-N tensors. If you need different
-      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to computation.
-    device_assignment: If not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. Uses a default device assignment if `None`. The
-      `DeviceAssignment` may be omitted if each replica of the computation uses
-      only one core, and there is either only one replica, or the number of
-      replicas is equal to the number of cores in the TPU system.
-    name: (Deprecated) Does nothing.
-    use_tpu: When false, the input `computation` is executed on the XLA CPU/GPU
-      backends. Currently, only supports a default placement (computation is
-      placed on GPU if one is available, and on CPU if not).
-    maximum_shapes: A nested structure of tf.TensorShape representing the shape
-      to which the respective component of each input element in each replica
-      should be padded. Any unknown dimensions (e.g. tf.Dimension(None) in a
-      tf.TensorShape or -1 in a tensor-like object) will be padded to the
-      maximum size of that dimension over all replicas. Note that if the input
-      dimension is already static, we won't do padding on it and we require the
-      maximum_shapes to have the same value or None on that dimension. The
-      structure of `maximum_shapes` needs to be the same as `inputs[0]`.
-
-  Returns:
-    A list of lists with the first list corresponding to the compile op and the
-    second a list of output tensors, indexed by `[replica_num][output_num]`.
-  Raises:
-    ValueError: If all replicas do not have equal numbers of input tensors.
-    ValueError: If the number of inputs per replica does not match
-      the number of formal parameters to `computation`.
-    ValueError: If the static `inputs` dimensions don't match with the values
-      given in `maximum_shapes`.
-    ValueError: If the structure of inputs per replica does not match
-      the structure of `maximum_shapes`.
-  """
-  del name
-  inputs = [[]] if inputs is None else inputs
-
-  metadata_kwargs = {}
-  if device_assignment is not None:
-    # Turn the Numpy array into a flattened list so we can pass it as an
-    # operator attribute.
-    metadata_kwargs = {
-        "topology":
-            device_assignment.topology.serialized(),
-        "device_assignment":
-            device_assignment.core_assignment.flatten().tolist()
-    }
-    # TODO(phawkins): remove this case after the forward compatibility window
-    # expires on 2018-10-5.
-    if api_compat.forward_compatible(2018, 10, 5):
-      metadata_kwargs["num_cores_per_replica"] = (
-          device_assignment.num_cores_per_replica)
-    else:
-      metadata_kwargs["computation_shape"] = [
-          device_assignment.num_cores_per_replica
-      ]
-
-  if ((not isinstance(inputs, list)) or
-      any(not isinstance(inp, (list, tuple)) for inp in inputs)):
-    raise TypeError("tpu.replicate() inputs must be a list of lists/tuples")
-
-  num_replicas = len(inputs)
-
-  # No replicas? Nothing to do.
-  if num_replicas == 0:
-    return []
-
-  # Checks all replicas have the same structure.
-  for i in xrange(1, num_replicas):
-    nest.assert_same_structure(inputs[0], inputs[i])
-
-  # Flatten inputs.
-  flat_inputs = [
-      nest.flatten(per_replica_input) for per_replica_input in inputs
-  ]
-  # Converts inputs to Tensors.
-  flat_inputs = [[ops.convert_to_tensor(x) for x in inp] for inp in flat_inputs]
-
-  # Verifies that all replicas have matching numbers and types of inputs
-  flat_input_types = [x.dtype for x in flat_inputs[0]]
-  input_arity = len(inputs[0])
-  flat_input_arity = len(flat_input_types)
-  for i in range(num_replicas):
-    if len(inputs[i]) != input_arity:
-      raise ValueError("Replicas must have the same number of inputs. "
-                       "Replica 0 had {} inputs, replica {} had {} "
-                       "inputs.".format(input_arity, i, len(inputs[i])))
-
-    types = [x.dtype for x in flat_inputs[i]]
-    if types != flat_input_types:
-      raise ValueError("Replicas must have matching input types. Replica 0 had "
-                       "input types {}, replica {} had input types {}".format(
-                           flat_input_types, i, types))
-
-  arg_error = xla.check_function_argument_count(
-      computation, input_arity, infeed_queue)
-  if arg_error is not None:
-    if infeed_queue is None:
-      raise TypeError(
-          "Supplied computation cannot be called with the specified inputs. "
-          "You specified %d inputs: %s, but the computation needs %s" % (
-              input_arity, str([i.name for i in inputs[0]]), arg_error))
-    else:
-      raise TypeError(
-          "Supplied computation cannot be called with the specified inputs. "
-          "You specified %d inputs: %s and %d additional inputs from infeed,"
-          " but the computation needs %s" % (input_arity, str(
-              [i.name
-               for i in inputs[0]]), infeed_queue.number_of_tuple_elements,
-                                             arg_error))
-
-  if maximum_shapes:
-    if infeed_queue:
-      raise ValueError(
-          "Dynamic input shapes are not supported with infeed queues")
-
-    # Make sure maximum_shapes has the same structure as inputs.
-    nest.assert_same_structure(inputs[0], maximum_shapes, check_types=False)
-
-    # Flatten padded shapes.
-    flat_maximum_shapes = nest.flatten(maximum_shapes)
-    flat_maximum_shapes = [
-        tensor_shape.TensorShape(s) for s in flat_maximum_shapes
-    ]
-
-    flat_inputs, padding_maps = _pad_all_input(flat_inputs, flat_maximum_shapes)
-
-    serialized_padding_maps = []
-    for padding_map in padding_maps:
-      serialized_padding_maps.append(padding_map.SerializeToString())
-    metadata_kwargs["padding_map"] = serialized_padding_maps
-
-  graph = ops.get_default_graph()
-
-  # Fan-in: Builds a TPUReplicatedInput node for each input.
-  flat_replicated_inputs = []
-  for i in range(0, len(flat_inputs[0])):
-    replicas = [flat_inputs[replica][i] for replica in xrange(num_replicas)]
-    flat_replicated_inputs.append(
-        tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
-
-  cluster_name = graph.unique_name("cluster")
-  pivot = control_flow_ops.no_op(name=cluster_name + "/pivot")
-  context = TPUReplicateContext(
-      name=cluster_name, num_replicas=num_replicas, pivot=pivot)
-  try:
-    context.Enter()
-
-    metadata = tpu_ops.tpu_replicate_metadata(
-        num_replicas=num_replicas, use_tpu=use_tpu, **metadata_kwargs)
-
-    with tpu_function.tpu_shard_context(
-        num_replicas), ops.control_dependencies([metadata]):
-
-      # Add identity ops so even unused inputs are "consumed" by the
-      # computation. This is to avoid orphaned TPUReplicatedInput nodes.
-      # TODO(phawkins): consider instead pruning unused TPUReplicatedInput
-      # and eliding trivial TPUReplicatedInput/TPUReplicatedOutput pairs.
-      flat_replicated_inputs = [
-          array_ops.identity(x, name="replicated_input_{}".format(i))
-          for i, x in enumerate(flat_replicated_inputs)
-      ]
-      for i in flat_replicated_inputs:
-        # pylint: disable=protected-access
-        # Add an attribute to the identity node so that they could be removed in
-        # encapsulate TPU computation pass if unused. However we don't remove
-        # inputs when dynamic padding is enabled.
-        # TODO(rxsang): Use other ways except argument index in padding_map so
-        # outside compilation can work with dynamic padding correctly.
-        if maximum_shapes is None:
-          i.op._set_attr("_tpu_input_identity",
-                         attr_value_pb2.AttrValue(b=True))
-        # pylint: enable=protected-access
-
-      # Unflatten the computation inputs to match original input structure.
-      computation_inputs = nest.pack_sequence_as(
-          structure=inputs[0],
-          flat_sequence=flat_replicated_inputs[:flat_input_arity])
-
-      # If there is an infeed queue, adds the dequeued values to the
-      # computation's inputs.
-      if infeed_queue is not None:
-        infeed_queue.set_number_of_shards(num_replicas)
-        for t in infeed_queue.generate_dequeue_op():
-          computation_inputs.append(t)
-
-      # Only resource variables work inside a TPU computation, so turn on
-      # resource variables for the computation.
-      # TODO(phawkins): consider removing this code. It will
-      # be less confusing to clients if they knowingly choose to use resource
-      # variables.
-      # Partitioned variables is not supported (b/112311320).
-      vscope = variable_scope.get_variable_scope()
-      saved_use_resource = vscope.use_resource
-      saved_custom_getter = vscope.custom_getter
-
-      def custom_getter(getter, name, *args, **kwargs):
-        """Variables on TPU have a few restrictions."""
-        partitioner = kwargs["partitioner"]
-        if partitioner is not None:
-          kwargs["partitioner"] = None
-          logging.warning(
-              "Partitioned variables are not supported on TPU. Got "
-              "`partitioner` that is {} for variable {}. "
-              "Setting `partitioner` to `None`."
-              .format(partitioner, name))
-        if saved_custom_getter is None:
-          return getter(name, *args, **kwargs)
-        else:
-          return saved_custom_getter(getter, name, *args, **kwargs)
-
-      vscope.set_use_resource(True)
-      vscope.set_custom_getter(custom_getter)
-
-      outputs = computation(*computation_inputs)
-
-      vscope.set_use_resource(saved_use_resource)
-      vscope.set_custom_getter(saved_custom_getter)
-
-    outputs_is_flat = xla.is_flat(outputs)
-    if outputs_is_flat:
-      output_tensors, control_deps = _postprocess_flat_outputs(outputs)
-    else:
-      output_tensors, control_deps = _postprocess_non_flat_outputs(outputs)
-
-    context.ExitResult(output_tensors)
-  finally:
-    context.report_unsupported_operations()
-    context.Exit()
-    host_compute_core = context.HostComputeCore()
-
-  if host_compute_core:
-    attr_value = attr_value_pb2.AttrValue()
-    attr_value.list.s.extend([compat.as_bytes(x) for x in host_compute_core])
-    metadata._set_attr("host_compute_core", attr_value)  # pylint: disable=protected-access
-
-  with ops.control_dependencies([metadata]):
-    if use_tpu:
-      compile_status = tpu_ops.tpu_compilation_result()
-      op = compile_status.op
-      attr_value = attr_value_pb2.AttrValue(s=compat.as_bytes(cluster_name))
-      op._set_attr(_TPU_COMPILATION_STATUS_ATTR, attr_value)  # pylint: disable=protected-access
-    else:
-      compile_status = control_flow_ops.no_op(name="compilation_status")
-
-  if not output_tensors:
-    # Returns a list of NoOps dependent on the replication Op, indexed by
-    # [replica_num].
-    return [
-        compile_status,
-        [
-            control_flow_ops.group(control_deps, name="shard_%d" % i)
-            for i in range(num_replicas)
-        ]
-    ]
-
-  # Fan-out: Builds a TPUReplicatedOutput node for each output.
-  replicated_outputs = [[] for i in xrange(num_replicas)]
-  for i, t in enumerate(output_tensors):
-    # Fan-out: Builds a TPUReplicatedOutput node for each output.
-    ys = tpu_ops.tpu_replicated_output(
-        t, num_replicas, name="output{}".format(i))
-
-    # Wraps the outputs in identity operators so the names of any possible
-    # `fetch` nodes are preserved by the replication rewrite.
-    with ops.control_dependencies(control_deps):
-      for replica in xrange(num_replicas):
-        replicated_outputs[replica].append(
-            array_ops.identity(
-                ys[replica], name="output_%d_shard_%d" % (i, replica)))
-
-  if not outputs_is_flat:
-    replicated_outputs = [
-        nest.pack_sequence_as(outputs, replica_outs)
-        for replica_outs in replicated_outputs
-    ]
-
-  return [compile_status, replicated_outputs]
-
-
-def _postprocess_flat_outputs(outputs):
-  """Validates non-flat outputs, add backs device assignments and other attrs.
-
-  Args:
-    outputs: Output from `computation` inside `tpu.rewrite`.
-
-  Returns:
-    Tensors and Operations extracted from outputs.
-  """
-  # Following code segment is to preserve legacy behavior. Previously we only
-  # supported flat outputs and thus for consistency it was nice to convert even
-  # single element into a tuple. But now that we support arbitrary output
-  # structure, this is no longer necessary.
-  # TODO(b/121383831): Migrate all legacy use cases and delete this special
-  # case.
-  # If the computation returns `None`, make it an empty tuple.
-  if outputs is None:
-    outputs = tuple()
-  # If the computation only returned one value, makes it a tuple.
-  if not isinstance(outputs, collections.Sequence):
-    outputs = (outputs,)
-
-  # Append `no_op` here so that fetching any return value of this function
-  # will trigger TPUExecute node.
-  outputs += (control_flow_ops.no_op(),)
-  try:
-    with ops.device(core(0)):
-      outputs = [
-          o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
-          for o in outputs
-      ]
-  except Exception as e:
-    raise ValueError(
-        "TPU function return values must all either be Operations or "
-        "convertible to Tensors. Got '%s'" % str(e))
-
-  # Separates the returned Operations and Tensors.
-  output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
-  output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)]
-
-  if outputs != output_tensors + output_operations:
-    raise ValueError(
-        "TPU functions must return zero-or more Tensor values followed by "
-        "zero or more Operations.")
-
-  # Wraps outputs in Identity ops. Otherwise a replicated input copied
-  # straight to an output would bypass the replicate(). This would be bad
-  # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
-  # be rewritten away, leading to a runtime error.
-  # TODO(phawkins): extend the rewrite to elide these nodes instead.
-  new_output_tensors = []
-  for t in output_tensors:
-    with ops.device(t.device if t.device else core(0)):
-      o = array_ops.identity(t)
-      # pylint: disable=protected-access
-      o.op._set_attr("_tpu_output_identity", attr_value_pb2.AttrValue(b=True))
-      # pylint: enable=protected-access
-      new_output_tensors.append(o)
-  return new_output_tensors, output_operations
-
-
-def _postprocess_non_flat_outputs(outputs):
-  """Validates non-flat outputs, add backs device assignments and other attrs.
-
-  Args:
-    outputs: Output from `computation` inside `tpu.rewrite`.
-
-  Returns:
-    Tensors extracted from outputs and an empty list because Operations are not
-    allowed in non-flat outputs..
-  """
-
-  # Flatten output items.
-  flat_outputs = nest.flatten(outputs)
-
-  # Convert all non-Operation outputs to Tensors.
-  for i, o in enumerate(flat_outputs):
-    if isinstance(o, ops.Operation):
-      raise ValueError(
-          "tpu.rewrite does not support Operation as return value in non-flat "
-          "output structure. You can set returned Operations as control "
-          "dependencies of returned Tensors so Operations are triggered when "
-          'Tensors are evaluated. Operation found: "%s"' % o.name)
-
-    try:
-      o = ops.convert_to_tensor(o)
-    except Exception as e:
-      raise ValueError(
-          "TPU function return values must all either be Operations or "
-          'convertible to Tensors. Got error: "%s"' % str(e))
-
-    # Wraps outputs in Identity ops. Otherwise a replicated input copied
-    # straight to an output would bypass the replicate(). This would be bad
-    # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
-    # be rewritten away, leading to a runtime error.
-    # TODO(phawkins): extend the rewrite to elide these nodes instead.
-    with ops.device(core(0)):
-      o = array_ops.identity(o)
-      # pylint: disable=protected-access
-      o.op._set_attr("_tpu_output_identity", attr_value_pb2.AttrValue(b=True))
-      # pylint: enable=protected-access
-      flat_outputs[i] = array_ops.identity(o)
-
-  # All flat_outputs are Tensors, and no Operations.
-  return flat_outputs, []
-
-
-def split_compile_and_shard(computation,
-                            inputs=None,
-                            num_shards=1,
-                            input_shard_axes=None,
-                            outputs_from_all_shards=True,
-                            output_shard_axes=None,
-                            infeed_queue=None,
-                            device_assignment=None,
-                            name=None):
-  """Shards `computation` for parallel execution.
-
-  `inputs` must be a list of Tensors or None (equivalent to an empty list), each
-  of which has a corresponding split axis (from `input_shard_axes`). Each input
-  is split into `num_shards` pieces along the corresponding axis, and
-  computation is applied to each shard in parallel.
-
-  Tensors are broadcast to all shards if they are lexically captured by
-  `computation`. e.g.,
-
-  x = tf.constant(7)
-  def computation():
-    return x + 3
-  ... = shard(computation, ...)
-
-  If `outputs_from_all_shards` is true, the outputs from all shards of
-  `computation` are concatenated back together along their `output_shards_axes`.
-  Otherwise, each output is taken from an arbitrary shard.
-
-  Inputs and outputs of the computation must be at least rank-1 Tensors.
-
-  Args:
-    computation: A Python function that builds a computation to apply to each
-      shard of the input.
-    inputs: A list of input tensors or None (equivalent to an empty list). Each
-      input tensor has a corresponding shard axes, given by `input_shard_axes`,
-      which must have size divisible by `num_shards`.
-    num_shards: The number of shards.
-    input_shard_axes: A list of dimensions along which to shard `inputs`, or
-      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
-      there must be one dimension per input.
-    outputs_from_all_shards: Boolean or list of boolean. For each output, if
-      `True`, outputs from all shards are concatenated along the corresponding
-      `output_shard_axes` entry. Otherwise, each output is taken
-      from an arbitrary shard. If the argument is a boolean, the argument's
-      value is used for each output.
-    output_shard_axes: A list of dimensions along which to concatenate the
-      outputs of `computation`, or `None`. `None` means "concatenate all outputs
-      along dimension 0". If not `None`, there must be one dimension per output.
-      Ignored if `outputs_from_all_shards` is False.
-    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
-      of `computation`.
-    device_assignment: If not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. Uses a default device assignment if `None`. The
-      `DeviceAssignment` may be omitted if each shard of the computation uses
-      only one core, and there is either only one shard, or the number of shards
-      is equal to the number of cores in the TPU system.
-    name: (Deprecated) Does nothing.
-  Returns:
-    A tuple of (compile op, [output tensors]).
-  Raises:
-    ValueError: If num_shards <= 0
-    ValueError: If len(input_shard_axes) != len(inputs)
-    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
-  """
-  # TODO(phawkins): consider adding support for broadcasting Tensors passed as
-  # inputs.
-
-  if num_shards <= 0:
-    raise ValueError("num_shards must be a positive integer.")
-
-  inputs = [] if inputs is None else inputs
-  if not isinstance(inputs, list):
-    raise TypeError("tpu.shard()'s inputs must be a list of Tensors or None.")
-
-  # Converts inputs to Tensors.
-  inputs = [ops.convert_to_tensor(x) for x in inputs]
-
-  if input_shard_axes is None:
-    input_shard_axes = [0] * len(inputs)
-  if len(inputs) != len(input_shard_axes):
-    raise ValueError("Length of input_shard_axes must be equal to the number "
-                     "of inputs.")
-
-  if inputs:
-    # Splits the `inputs` along the corresponding `input_shard_axes`, giving
-    # lists with layout [input][shard]
-    split_inputs = [
-        array_ops.split(x, num_shards, axis=axis)
-        for (axis, x) in zip(input_shard_axes, inputs)]
-
-    # Transposes the input lists to have layout [shard][input]
-    transposed_inputs = [list(i) for i in zip(*split_inputs)]
-  else:
-    transposed_inputs = [[]] * num_shards
-
-  compile_op, outputs = split_compile_and_replicate(
-      computation,
-      transposed_inputs,
-      infeed_queue=infeed_queue,
-      device_assignment=device_assignment,
-      name=name)
-
-  # There must be at least one shard since num_shards > 0.
-  # TODO(b/36647078) remove disable when pylint bug is fixed.
-  # pylint: disable=indexing-exception
-  if isinstance(outputs[0], ops.Operation):
-    # pylint: enable=indexing-exception
-    # There were no outputs from the computation and replicate returned a list
-    # of NoOps with control dependencies on the computation. Return the first
-    # one so it can be used as a control dependency or fetch node.
-    # TODO(b/36647078) remove disable when pylint bug is fixed.
-    # pylint: disable=indexing-exception
-    return compile_op, [outputs[0]]
-    # pylint: enable=indexing-exception
-
-  # TODO(b/36647078) remove disable when pylint bug is fixed.
-  # pylint: disable=indexing-exception
-  num_outputs = len(outputs[0])
-  # pylint: enable=indexing-exception
-
-  if output_shard_axes is None:
-    output_shard_axes = [0] * num_outputs
-  if num_outputs != len(output_shard_axes):
-    raise ValueError("Length of output_shard_axes must be equal to the number "
-                     "of outputs.")
-
-  if isinstance(outputs_from_all_shards, bool):
-    outputs_from_all_shards = [outputs_from_all_shards] * num_outputs
-
-  if num_outputs != len(outputs_from_all_shards):
-    raise ValueError("Length of outputs_from_all_shards must be equal to the "
-                     "number of outputs.")
-
-  results = []
-  for (axis, all_shards, x) in zip(output_shard_axes, outputs_from_all_shards,
-                                   zip(*outputs)):
-    if all_shards:
-      # Concatenate all of the outputs together (use stack for scalars).
-      shape = x[0].shape
-      is_scalar = shape is not None and (shape.ndims == 0)
-      results.append((array_ops.stack(list(x)) if is_scalar
-                      else array_ops.concat(list(x), axis=axis)))
-    else:
-      # TODO(phawkins): use a smarter policy, e.g., round-robin across shards.
-      results.append(x[0])
-
-  return compile_op, results
-
-
-def shard(computation,
-          inputs=None,
-          num_shards=1,
-          input_shard_axes=None,
-          outputs_from_all_shards=True,
-          output_shard_axes=None,
-          infeed_queue=None,
-          device_assignment=None,
-          name=None):
-  """Shards `computation` for parallel execution.
-
-  `inputs` must be a list of Tensors or None (equivalent to an empty list), each
-  of which has a corresponding split axis (from `input_shard_axes`). Each input
-  is split into `num_shards` pieces along the corresponding axis, and
-  computation is applied to each shard in parallel.
-
-  Tensors are broadcast to all shards if they are lexically captured by
-  `computation`. e.g.,
-
-  x = tf.constant(7)
-  def computation():
-    return x + 3
-  ... = shard(computation, ...)
-
-  TODO(phawkins): consider adding support for broadcasting Tensors passed
-  as inputs.
-
-  If `outputs_from_all_shards` is true, the outputs from all shards of
-  `computation` are concatenated back together along their `output_shards_axes`.
-  Otherwise, each output is taken from an arbitrary shard.
-
-  Inputs and outputs of the computation must be at least rank-1 Tensors.
-
-  Args:
-    computation: A Python function that builds a computation to apply to each
-      shard of the input.
-    inputs: A list of input tensors or None (equivalent to an empty list). Each
-      input tensor has a corresponding shard axes, given by `input_shard_axes`,
-      which must have size divisible by `num_shards`.
-    num_shards: The number of shards.
-    input_shard_axes: A list of dimensions along which to shard `inputs`, or
-      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
-      there must be one dimension per input.
-    outputs_from_all_shards: Boolean or list of boolean. For each output, if
-      `True`, outputs from all shards are concatenated along the corresponding
-      `output_shard_axes` entry. Otherwise, each output is taken
-      from an arbitrary shard. If the argument is a boolean, the argument's
-      value is used for each output.
-    output_shard_axes: A list of dimensions along which to concatenate the
-      outputs of `computation`, or `None`. `None` means "concatenate all outputs
-      along dimension 0". If not `None`, there must be one dimension per output.
-      Ignored if `outputs_from_all_shards` is False.
-    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
-      of `computation`.
-    device_assignment: If not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. Uses a default device assignment if `None`. The
-      `DeviceAssignment` may be omitted if each shard of the computation uses
-      only one core, and there is either only one shard, or the number of shards
-      is equal to the number of cores in the TPU system.
-    name: (Deprecated) Does nothing.
-  Returns:
-    A list of output tensors.
-  Raises:
-    ValueError: If num_shards <= 0
-    ValueError: If len(input_shard_axes) != len(inputs)
-    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
-  """
-  return split_compile_and_shard(
-      computation,
-      inputs=inputs,
-      num_shards=num_shards,
-      input_shard_axes=input_shard_axes,
-      outputs_from_all_shards=outputs_from_all_shards,
-      output_shard_axes=output_shard_axes,
-      infeed_queue=infeed_queue,
-      device_assignment=device_assignment,
-      name=name)[1]
-
-
-def batch_parallel(computation,
-                   inputs=None,
-                   num_shards=1,
-                   infeed_queue=None,
-                   device_assignment=None,
-                   name=None):
-  """Shards `computation` along the batch dimension for parallel execution.
-
-  Convenience wrapper around shard().
-
-  `inputs` must be a list of Tensors or None (equivalent to an empty list).
-  Each input is split into `num_shards` pieces along the 0-th dimension, and
-  computation is applied to each shard in parallel.
-
-  Tensors are broadcast to all shards if they are lexically captured by
-  `computation`. e.g.,
-
-  x = tf.constant(7)
-  def computation():
-    return x + 3
-  ... = shard(computation, ...)
-
-  The outputs from all shards are concatenated back together along their 0-th
-  dimension.
-
-  Inputs and outputs of the computation must be at least rank-1 Tensors.
-
-  Args:
-    computation: A Python function that builds a computation to apply to each
-      shard of the input.
-    inputs: A list of input tensors or None (equivalent to an empty list). The
-      0-th dimension of each Tensor must have size divisible by `num_shards`.
-    num_shards: The number of shards.
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to `computation`.
-    device_assignment: If not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. Uses a default device assignment if `None`. The
-      `DeviceAssignment` may be omitted if each shard of the computation uses
-      only one core, and there is either only one shard, or the number of shards
-      is equal to the number of cores in the TPU system.
-    name: (Deprecated) Does nothing.
-  Returns:
-    A list of output tensors.
-  Raises:
-    ValueError: If `num_shards <= 0`
-  """
-  return shard(
-      computation,
-      inputs,
-      num_shards=num_shards,
-      infeed_queue=infeed_queue,
-      device_assignment=device_assignment,
-      name=name)
-
-
-def rewrite(computation,
-            inputs=None,
-            infeed_queue=None,
-            device_assignment=None,
-            name=None):
-  """Rewrites `computation` for execution on a TPU system.
-
-  Args:
-    computation: A Python function that builds a computation to apply to the
-      input. If the function takes n inputs, 'inputs' should be a list of n
-      tensors.
-
-      `computation` may return a list of operations and tensors. Tensors must
-      come before operations in the returned list.  The return value of
-      `rewrite` is a list of tensors corresponding to the tensors from the
-      output of `computation`.
-
-      All `Operation`s constructed during `computation` will be executed when
-      evaluating any of the returned output tensors, not just the ones returned.
-    inputs: A list of input tensors or `None` (equivalent to an empty list).
-      Each input can be a nested structure containing values that are
-      convertible to tensors. Note that passing an N-dimension list of
-      compatible values will result in a N-dimention list of scalar tensors
-      rather than a single Rank-N tensors. If you need different behavior,
-      convert part of inputs to tensors with `tf.convert_to_tensor`.
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to `computation`.
-    device_assignment: if not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. May be omitted for a single-core computation, in which
-      case the core attached to task 0, TPU device 0 is used.
-    name: (Deprecated) Does nothing.
-  Returns:
-    Same data structure as if computation(*inputs) is called directly with some
-    exceptions for correctness. Exceptions include:
-      1) None output: a NoOp would be returned which control-depends on
-         computation.
-      2) Single value output: A tuple containing the value would be returned.
-      3) Operation-only outputs: a NoOp would be returned which
-         control-depends on computation.
-      TODO(b/121383831): Investigate into removing these special cases.
-  """
-  # TODO(b/36647078) remove disable when pylint bug is fixed.
-  # pylint: disable=indexing-exception
-  return replicate(
-      computation,
-      None if inputs is None else [inputs],
-      infeed_queue=infeed_queue,
-      device_assignment=device_assignment,
-      name=name)[0]
-  # pylint: enable=indexing-exception
-
-  # Operations that indicate some error in the user's inference graph.
-_BLACKLISTED_INFERENCE_OPS = set([
-    "ReadVariableOp",
-    "AssignVariableOp",
-    "AssignAddVariableOp",
-    "AssignSubVariableOp",
-    "VarHandleOp",
-    "Variable",
-    "VariableV2",
-])
-
-
-def under_tpu_inference_context():
-  """Check if it is currently under `tpu.rewrite_for_inference()`."""
-  graph = ops.get_default_graph()
-
-  context = graph._get_control_flow_context()  # pylint: disable=protected-access
-  while context:
-    if isinstance(context, _TPUInferenceContext):
-      return True
-    context = context.outer_context
-
-  return False
-
-
-class _TPUInferenceContext(control_flow_ops.XLAControlFlowContext):
-  """A `ControlFlowContext` for nodes inside a TPU inference computation.
-
-  The primary role of `TPUReplicateContext` is to sanity check operators inside
-  a tpu.rewrite_for_inference() computation.
-  """
-
-  def __init__(self, name):
-    super(_TPUInferenceContext, self).__init__()
-    self._name = name
-
-  def AddOp(self, op):
-    self._AddOpInternal(op)
-
-  def _AddOpInternal(self, op):
-    # pylint: disable=protected-access
-    if op.type in _BLACKLISTED_INFERENCE_OPS:
-      raise NotImplementedError(
-          "Operation of type %s (%s) is not supported on the TPU for inference."
-          " Execution will fail if this op is used in the graph. Make sure your"
-          " variables are using variable_scope." % (op.type, op.name))
-    if self._outer_context:
-      self._outer_context.AddInnerOp(op)
-
-  def AddValue(self, val):
-    result = val
-    if self._outer_context:
-      result = self._outer_context.AddValue(val)
-    return result
-
-  def AddInnerOp(self, op):
-    self._AddOpInternal(op)
-
-  @property
-  def grad_state(self):
-    return None
-
-
-@experimental
-def validate_inference_rewrite_for_variables(graph):
-  """Validates whether rewrite_for_inference() 'worked' for variables.
-
-     The rewrite_for_inference() method is supposed to append GuaranteeConstOps
-     after ReadVariableOps, but this mechanism works only if you are using
-     tf.get_variable() to create and access variables in your tpu computation.
-     This validation method can be called immediately after calling
-     tpu.rewrite_for_inference() to check whether GuaranteeConstOps where added
-     to the graph.
-
-     Typical usages:
-       tpu.validate_inference_rewrite_for_variables(tf.get_default_graph())
-
-       tpu.validate_inference_rewrite_for_variables(sess.graph)
-
-  Args:
-    graph: The graph which needs to be validated.
-  Raises:
-    RuntimeError: if validation failed.
-  """
-  if not any(x.type == "GuaranteeConst" for x in graph.get_operations()):
-    raise RuntimeError(
-        "No GuaranteeConst ops found in the graph after running "
-        "tpu.rewrite_for_inference(...). Please check that you are using "
-        "tf.get_variable() to create and access variables in your tpu "
-        "computation.")
-
-
-@experimental
-def rewrite_for_inference(computation,
-                          inputs=None,
-                          infeed_queue=None,
-                          device_assignment=None,
-                          name=None):
-  """Rewrites `computation` for inference on a TPU system.
-
-     Other than 'rewriting' the computation to run on a TPU, if using variables
-     in your computation, it moves the ReadVariableOps outside the TPU
-     computation, and adds GuaranteeConst ops just after the ReadVariableOps.
-     This mechanism works only if you are using tf.get_variable() to create and
-     access variables in your tpu computation. You can validate whether this
-     worked, by calling validate_inference_rewrite_for_variables() method
-     immediately after this method to check whether GuaranteeConstOps where
-     added to the graph.
-
-  Args:
-    computation: A Python function that builds a computation to apply to the
-      input. If the function takes n inputs, 'inputs' should be a list of n
-      tensors. If the function returns m outputs, rewrite will return a list of
-      m tensors.
-    inputs: A list of input tensors or `None` (equivalent to an empty list).
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to `computation`.
-    device_assignment: if not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. May be omitted for a single-core computation, in which
-      case the core attached to task 0, TPU device 0 is used.
-    name: The name of the operator.
-  Returns:
-    A list of output tensors.
-  """
-
-  def guarantee_const_getter(getter, name, *args, **kwargs):
-    with ops.control_dependencies(None):
-      return array_ops.guarantee_const(
-          getter(name, *args, **kwargs), name=name + "/GuaranteeConst")
-
-  def wrapped_computation(*args, **kwargs):
-    """Execute computation under `_TPUInferenceContext`."""
-    context = _TPUInferenceContext(
-        name=ops.get_default_graph().unique_name("rewrite_for_inference"))
-    try:
-      context.Enter()
-
-      vscope = variable_scope.get_variable_scope()
-      prev_custom_getter = vscope.custom_getter
-      prev_caching_device = vscope.caching_device
-      vscope.set_custom_getter(guarantee_const_getter)
-      vscope.set_caching_device(lambda op: op.device)
-
-      result = computation(*args, **kwargs)
-
-      vscope.set_custom_getter(prev_custom_getter)
-      vscope.set_caching_device(prev_caching_device)
-    finally:
-      context.Exit()
-    return result
-
-  # pylint: disable=undefined-variable
-  return rewrite(
-      wrapped_computation,
-      inputs=inputs,
-      infeed_queue=infeed_queue,
-      device_assignment=device_assignment,
-      name=name)
-  # pylint: enable=undefined-variable
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.tpu import *
+# used by tests
+from tensorflow.python.tpu.tpu import _TPU_REPLICATE_ATTR
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 9f8d14706845baa1ed45c84b2c15d372915a0eb4..c36aaa38c0e4823bfc438773e4aa5b5109794da4 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -1,275 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-
-"""A RunConfig subclass with TPU support."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import json
-import os
-
-from tensorflow.contrib.tpu.python.tpu import util as util_lib
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.platform import tf_logging as logging
-
-# pylint: disable=protected-access
-_TF_CONFIG_ENV = run_config_lib._TF_CONFIG_ENV
-_SERVICE_KEY = run_config_lib._SERVICE_KEY
-_TPU_WORKER_JOB_NAME = 'tpu_worker_job_name'
-# pylint: enable=protected-access
-
-
-class InputPipelineConfig(object):
-  r"""Please see the definition of these values in TPUConfig."""
-  PER_SHARD_V1 = 1
-  PER_HOST_V1 = 2
-  PER_HOST_V2 = 3
-  BROADCAST = 4
-
-
-class TPUConfig(
-    collections.namedtuple('TPUConfig', [
-        'iterations_per_loop',
-        'num_shards',
-        'num_cores_per_replica',
-        'per_host_input_for_training',
-        'tpu_job_name',
-        'initial_infeed_sleep_secs',
-        'input_partition_dims',
-    ])):
-  r"""TPU related configuration required by `TPUEstimator`.
-
-  Args:
-    iterations_per_loop: This is the number of train steps running in TPU
-      system before returning to CPU host for each `Session.run`. This means
-      global step is increased `iterations_per_loop` times in one `Session.run`.
-      It is recommended to be set as number of global steps for next checkpoint.
-    num_shards: (Deprecated, ignored by TPUEstimator).
-      The number of model replicas in the system. For non-model-parallelism
-      case, this number equals the total number of TPU cores. For
-      model-parallelism, the total number of TPU cores equals
-      num_cores_per_replica * num_shards.
-    num_cores_per_replica: Defaults to `None`, which disables model parallelism.
-      An integer which describes the number of TPU cores per model replica. This
-      is required by model-parallelism which enables partitioning
-      the model to multiple cores. Currently num_cores_per_replica must be
-      1, 2, 4, or 8.
-    per_host_input_for_training: If `True`, `PER_HOST_V1`, or `PER_HOST_V2`,
-      `input_fn` is invoked once on each host. With the per-core input pipeline
-      configuration, it is invoked once for each core.
-      With a global batch size `train_batch_size` in `TPUEstimator` constructor,
-      the batch size for each shard is `train_batch_size` // #hosts in the
-      `True` or `PER_HOST_V1` mode. In `PER_HOST_V2` mode, it is
-      `train_batch_size` // #cores. In `BROADCAST` mode, `input_fn` is only
-      invoked once on host 0 and the tensors are broadcasted to all other
-      replicas. The batch size equals to train_batch_size`. With the per-core
-      input pipeline configuration, the shard batch size is also
-      `train_batch_size` // #cores.
-      Note: per_host_input_for_training==PER_SHARD_V1 only supports mode.TRAIN.
-    tpu_job_name: The name of the TPU job. Typically, this name is auto-inferred
-      within TPUEstimator, however when using ClusterSpec propagation in more
-      esoteric cluster configurations, you may need to specify the job name as a
-      string.
-    initial_infeed_sleep_secs: The number of seconds the infeed thread should
-      wait before enqueueing the first batch. This helps avoid timeouts for
-      models that require a long compilation time.
-    input_partition_dims: A nested list to describe the partition dims
-      for all the tensors from input_fn(). The structure of
-      input_partition_dims must match the structure of `features` and
-      `labels` from input_fn(). The total number of partitions must match
-      `num_cores_per_replica`. For example, if input_fn() returns two tensors:
-      images with shape [N, H, W, C] and labels [N].
-      input_partition_dims = [[1, 2, 2, 1], None] will split the images to 4
-      pieces and feed into 4 TPU cores. labels tensor are directly broadcasted
-      to all the TPU cores since the partition dims is `None`.
-      Current limitations: This feature is only supported with the PER_HOST_V2
-      input mode.
-
-    Raises:
-      ValueError: If `num_cores_per_replica` is not 1, 2, 4, 8 or 16.
-  """
-
-  def __new__(cls,
-              iterations_per_loop=2,
-              num_shards=None,
-              num_cores_per_replica=None,
-              per_host_input_for_training=True,
-              tpu_job_name=None,
-              initial_infeed_sleep_secs=None,
-              input_partition_dims=None):
-
-    # Check iterations_per_loop.
-    util_lib.check_positive_integer(iterations_per_loop,
-                                    'TPUConfig iterations_per_loop')
-
-    # Check num_shards.
-    if num_shards is not None:
-      util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards')
-
-    if input_partition_dims is not None:
-      if len(input_partition_dims) != 1 and len(input_partition_dims) != 2:
-        raise ValueError(
-            'input_partition_dims must be a list/tuple with one or two'
-            ' elements.')
-
-      if per_host_input_for_training is not InputPipelineConfig.PER_HOST_V2:
-        raise ValueError(
-            'input_partition_dims is only supported in PER_HOST_V2 mode.')
-
-      if num_cores_per_replica is None:
-        raise ValueError(
-            'input_partition_dims requires setting num_cores_per_replica.')
-
-    # Check num_cores_per_replica
-    if num_cores_per_replica is not None:
-      if num_cores_per_replica not in [1, 2, 4, 8, 16]:
-        raise ValueError(
-            'num_cores_per_replica must be 1, 2, 4, 8, or 16; got {}'.format(
-                str(num_cores_per_replica)))
-
-    # per_host_input_for_training may be True, False, or integer in [1..3].
-    # Map legacy values (True, False) to numeric values.
-    if per_host_input_for_training is False:
-      per_host_input_for_training = InputPipelineConfig.PER_SHARD_V1
-    elif per_host_input_for_training is True:
-      per_host_input_for_training = InputPipelineConfig.PER_HOST_V1
-
-    # Check initial_infeed_sleep_secs.
-    if initial_infeed_sleep_secs:
-      util_lib.check_positive_integer(initial_infeed_sleep_secs,
-                                      'TPUConfig initial_infeed_sleep_secs')
-
-    tpu_job_name = tpu_job_name or _get_tpu_job_name_from_tf_config()
-
-    return super(TPUConfig, cls).__new__(
-        cls,
-        iterations_per_loop=iterations_per_loop,
-        num_shards=num_shards,
-        num_cores_per_replica=num_cores_per_replica,
-        per_host_input_for_training=per_host_input_for_training,
-        tpu_job_name=tpu_job_name,
-        initial_infeed_sleep_secs=initial_infeed_sleep_secs,
-        input_partition_dims=input_partition_dims)
-
-
-class RunConfig(run_config_lib.RunConfig):
-  """RunConfig with TPU support."""
-
-  def __init__(self,
-               tpu_config=None,
-               evaluation_master=None,
-               master=None,
-               cluster=None,
-               **kwargs):
-    """Constructs a RunConfig.
-
-    Args:
-      tpu_config: the TPUConfig that specifies TPU-specific configuration.
-      evaluation_master: a string. The address of the master to use for eval.
-        Defaults to master if not set.
-      master: a string. The address of the master to use for training.
-      cluster: a ClusterResolver
-      **kwargs: keyword config parameters.
-
-    Raises:
-      ValueError: if cluster is not None and the provided session_config has a
-        cluster_def already.
-    """
-    super(RunConfig, self).__init__(**kwargs)
-    self._tpu_config = tpu_config or TPUConfig()
-    self._cluster = cluster
-
-    # If user sets master and/or evaluation_master explicitly, including empty
-    # string '', take it. Otherwise, take the values set by parent class.
-    if master is not None:
-      if cluster is not None:
-        raise ValueError('Both master and cluster are set.')
-      self._master = master
-    else:
-      if cluster:
-        self._master = cluster.master()
-
-    if evaluation_master is not None:
-      self._evaluation_master = evaluation_master
-    elif (not self._evaluation_master and
-          self.task_type != run_config_lib.TaskType.EVALUATOR):
-      # If the task type is EVALUATOR, it means some cluster manager sets the
-      # TF_CONFIG. In that case, we respect the configuration in TF_CONFIG.
-      #
-      # Otherwise, it means user executes the code without external cluster
-      # manager. For that, we optimize the user experience by setting
-      # evaluation_master to master, unless user overwrites it.
-      self._evaluation_master = self._master
-
-    # Set the ClusterSpec to use
-    if cluster:
-      self._cluster_spec = cluster.cluster_spec()
-
-      # Merge the cluster_def into the ConfigProto.
-      if self._session_config is None:  # pylint: disable=access-member-before-definition
-        self._session_config = config_pb2.ConfigProto(allow_soft_placement=True)
-      if self._session_config.HasField('cluster_def'):
-        raise ValueError(
-            'You cannot provide a ClusterResolver and '
-            'session_config.cluster_def.')
-      if self._cluster_spec:
-        self._session_config.cluster_def.CopyFrom(
-            self._cluster_spec.as_cluster_def())
-
-  def _maybe_overwrite_session_config_for_distributed_training(self):
-    # Overrides the parent class session_config overwrite for between-graph. TPU
-    # runs with in-graph, which should not have device filter. Doing nothing
-    # ("pass") basically disables it.
-    pass
-
-  @property
-  def evaluation_master(self):
-    return self._evaluation_master
-
-  @property
-  def master(self):
-    return self._master
-
-  @property
-  def tpu_config(self):
-    return self._tpu_config
-
-  @property
-  def cluster(self):
-    return self._cluster
-
-  def replace(self, **kwargs):
-    if 'tpu_config' not in kwargs:
-      return super(RunConfig, self).replace(**kwargs)
-
-    tpu_config = kwargs.pop('tpu_config')
-    new_instance = super(RunConfig, self).replace(**kwargs)
-    new_instance._tpu_config = tpu_config  # pylint: disable=protected-access
-    return new_instance
-
-
-def _get_tpu_job_name_from_tf_config():
-  """Extracts the TPU job name from TF_CONFIG env variable."""
-  # TODO(xiejw): Extends this to support both TF_CONFIG env variable and cluster
-  # spec propagation.
-  tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
-  tpu_job_name = tf_config.get(_SERVICE_KEY, {}).get(_TPU_WORKER_JOB_NAME)
-  if tpu_job_name:
-    logging.info('Load TPU job name from TF_CONFIG: %s', tpu_job_name)
-  return tpu_job_name
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_config import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index ed1e0f0401a96c34e6ff9323685857b64e10bd14..b77b010cba6bf32c3b6d170bc522eebfb6a04f77 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -1,763 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""TPU system metadata and associated tooling."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from contextlib import contextmanager
-import copy
-
-from tensorflow.contrib.tpu.python.tpu import _tpu_estimator_embedding
-from tensorflow.contrib.tpu.python.tpu import device_assignment  as tpu_device_assignment
-from tensorflow.contrib.tpu.python.tpu import tpu_config
-from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.platform import tf_logging as logging
-
-
-_DEFAULT_JOB_NAME = 'tpu_worker'
-_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
-_LOCAL_MASTERS = ('', 'local')
-_NUM_CORES_TO_COMPUTATION_SHAPE = {
-    1: [1, 1, 1],
-    2: [1, 1, 2],
-    4: [1, 2, 2],
-    8: [2, 2, 2],
-    16: [4, 2, 2],
-}
-
-
-class TPUContext(object):
-  """A context that holds the current configuration of the TPU computation."""
-
-  def __init__(self,
-               internal_ctx,
-               input_device=None,
-               invocation_index=None,
-               call_from_input_fn=True):
-    self._internal_ctx = internal_ctx
-    self._input_device = input_device
-    self._invocation_index = invocation_index
-    self._call_from_input_fn = call_from_input_fn
-
-  def current_input_fn_deployment(self):
-    """The configuration of the current input_fn invocation.
-
-    The configuration depends on `TPUConfig.per_host_input_for_training`. See
-    `TPUConfig` for details.
-
-    Only set in params dict of input_fn
-
-    Returns:
-      A tuple of
-        1. Device spec string: String, is the current CPU host where the
-           input_fn is invoked.
-        2. Current invocation index: Int, 0-based index of the input_fn
-           invocation. See next item for details.
-        3. Total invocation count: Int, the total number of times to invoke the
-           input_fn on all CPU hosts. Each invocation will be passed with a new
-           `TPUContext` instance with current invocation index set properly.
-        4. Total number of replicas consumed by current_invocation: Int, the
-           number of replicas fed by the data returned by current input_fn. For
-           example, for per_core input pipeline deployment
-           and non-model-parallelism, total invocation count is equal to
-           the number of cores in the system and num replicas consumed by
-           current invocation is 1. For per-host v2 input pipeline deployment,
-           total invocation count is equal to the number of hosts in the system
-           and num replicas consumed by current invocation is equal to number of
-           cores per host.
-
-    Raises:
-      RuntimeError: If this method must not be called from input_fn.
-    """
-    if not self._call_from_input_fn:
-      raise RuntimeError('This TPUContext instance must not be called from'
-                         ' model_fn.')
-
-    if self._internal_ctx.is_input_sharded_per_core():
-      total_invocation_count = (self._internal_ctx.num_hosts
-                                * self._internal_ctx.num_of_replicas_per_host)
-      replicas_consumed = 1
-    elif self._internal_ctx.is_input_broadcast_with_iterators():
-      total_invocation_count = 1
-      replicas_consumed = self._internal_ctx.num_replicas
-    else:
-      total_invocation_count = self._internal_ctx.num_hosts
-      replicas_consumed = self._internal_ctx.num_of_replicas_per_host
-    return (self._input_device, self._invocation_index,
-            total_invocation_count, replicas_consumed)
-
-  @property
-  def num_replicas(self):
-    """The total number of replicas.
-
-    For non-model-parallelism, num_replicas should be the total num of TPU
-    cores in the system.
-
-    Returns:
-      The number of replicas.
-    """
-    return self._internal_ctx.num_replicas
-
-  @property
-  def num_hosts(self):
-    """The number of hosts for the TPU system."""
-    return self._internal_ctx.num_hosts
-
-  @property
-  def current_host(self):
-    """The current host index for the TPU system."""
-    return self._invocation_index
-
-  @property
-  def num_of_replicas_per_host(self):
-    """The number of replicas for each host."""
-    if self._internal_ctx.model_parallelism_enabled:
-      raise ValueError(
-          'num_of_replicas_per_host is not supported for model_parallelism')
-    return self._internal_ctx.num_of_replicas_per_host
-
-  @property
-  def device_assignment(self):
-    """Returns device_assignment object."""
-    if self._call_from_input_fn:
-      raise RuntimeError('This TPUContext instance must not be called from'
-                         ' input_fn.')
-    return self._internal_ctx.device_assignment
-
-  def device_for_replica(self, replica_id):
-    """Returns the tuple of (CPU device and device ordinal) for replica.
-
-    This should be used for full replicate for non-model-parallelism.
-
-    Args:
-       replica_id: Int, the replica index.
-
-    Returns:
-       A tuple of device spec for CPU device and int device ordinal.
-    """
-    # Note that: For the non-model parallelism, the mapping could be
-    # a random permutation. The order should not matter in most cases
-    # as far as model is replicated to all cores in the system.
-    return self._internal_ctx.device_for_replica(replica_id)
-
-  @property
-  def tpu_host_placement_function(self):
-    """Returns the TPU host place function.
-
-    The place function takes host_id as the input and returns the TF device
-    for the correspoding host.
-    """
-
-    def _placement_function(host_id):
-      """Return the host device given host_id."""
-      return self._internal_ctx.tpu_host_placement_function(host_id=host_id)
-
-    return _placement_function
-
-
-class _InternalTPUContext(object):
-  """A context holds immutable states of TPU computation.
-
-  This immutable object holds TPUEstimator config, train/eval batch size, and
-  `TPUEstimator.use_tpu`, which is expected to be passed around. It also
-  provides utility functions, based on the current state, to determine other
-  information commonly required by TPU computation, such as TPU device names,
-  TPU hosts, shard batch size, etc.
-
-  if eval_on_tpu is False, then execution of eval on TPU is disabled.
-  if eval_on_tpu is True, but use_tpu is False, a warning is issued,
-  and TPU execution is disabled for all modes.
-
-  N.B. As `mode` is not immutable state in Estimator, but essential to
-  distinguish between TPU training and evaluation, a common usage for
-  _InternalTPUContext with `mode` is as follows:
-  ```
-  with _ctx.with_mode(mode) as ctx:
-    if ctx.is_running_on_cpu():
-       ...
-  ```
-  """
-
-  def __init__(self,
-               config,
-               train_batch_size,
-               eval_batch_size,
-               predict_batch_size,
-               use_tpu,
-               eval_on_tpu=True,
-               embedding_config_spec=None):
-    self._config = config
-    self._train_batch_size = train_batch_size
-    self._eval_batch_size = eval_batch_size
-    self._predict_batch_size = predict_batch_size
-    self._use_tpu = use_tpu
-    logging.info('_TPUContext: eval_on_tpu %s', eval_on_tpu)
-    if not use_tpu and eval_on_tpu:
-      logging.warning('eval_on_tpu ignored because use_tpu is False.')
-
-    self._eval_on_tpu = eval_on_tpu
-    self._model_parallelism_enabled = (
-        use_tpu and config.tpu_config.num_cores_per_replica)
-    self._mode = None
-    num_cores_per_replica = config.tpu_config.num_cores_per_replica
-    if self._model_parallelism_enabled:
-      self._computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
-          num_cores_per_replica]
-    else:
-      self._computation_shape = None
-    self._lazy_tpu_system_metadata_dict = {}  # key by master address
-    self._lazy_device_assignment_dict = {}  # key by master address
-    self._lazy_validation_dict = {}  # key by ModeKeys
-    self._embedding_config_spec = embedding_config_spec
-    self._lazy_embedding_config_dict = {}  # key by master address
-
-  def _assert_mode(self):
-    if self._mode is None:
-      raise RuntimeError(
-          '`mode` needs to be set via contextmanager `with_mode`.')
-    return self._mode
-
-  @contextmanager
-  def with_mode(self, mode):
-    # NOTE(xiejw): Shallow copy is enough. It will share he lazy dictionaries,
-    # such as _lazy_tpu_system_metadata_dict between new copy and the original
-    # one. Note that all lazy states stored in properties _lazy_foo are sort of
-    # immutable as they should be same for the process lifetime.
-    new_ctx = copy.copy(self)
-    new_ctx._mode = mode  # pylint: disable=protected-access
-    yield new_ctx
-
-  @property
-  def mode(self):
-    return self._assert_mode()
-
-  def _get_master_address(self):
-    mode = self._assert_mode()
-    config = self._config
-    master = (
-        config.master
-        if mode != model_fn_lib.ModeKeys.EVAL else config.evaluation_master)
-    return master
-
-  def _get_tpu_system_metadata(self):
-    """Gets the (maybe cached) TPU system metadata."""
-    master = self._get_master_address()
-    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
-    if tpu_system_metadata is not None:
-      return tpu_system_metadata
-
-    cluster_def = None
-    if (self._config.session_config and
-        self._config.session_config.cluster_def.job):
-      cluster_def = self._config.session_config.cluster_def
-
-    # pylint: disable=protected-access
-    tpu_system_metadata = (
-        tpu_system_metadata_lib._query_tpu_system_metadata(
-            master,
-            cluster_def=cluster_def,
-            query_topology=self.model_parallelism_enabled))
-
-    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
-    return tpu_system_metadata
-
-  def _get_device_assignment(self):
-    """Gets the (maybe cached) TPU device assignment."""
-    master = self._get_master_address()
-    device_assignment = self._lazy_device_assignment_dict.get(master)
-    if device_assignment is not None:
-      return device_assignment
-
-    tpu_system_metadata = self._get_tpu_system_metadata()
-
-    device_assignment = tpu_device_assignment.device_assignment(
-        tpu_system_metadata.topology,
-        computation_shape=self._computation_shape,
-        num_replicas=self.num_replicas)
-
-    logging.info('num_cores_per_replica: %s',
-                 str(self._config.tpu_config.num_cores_per_replica))
-    logging.info('computation_shape: %s', str(self._computation_shape))
-    logging.info('num_replicas: %d', self.num_replicas)
-    logging.info('device_assignment.topology.device_coordinates: %s',
-                 str(device_assignment.topology.device_coordinates))
-    logging.info('device_assignment.core_assignment: %s',
-                 str(device_assignment.core_assignment))
-
-    self._lazy_device_assignment_dict[master] = device_assignment
-    return device_assignment
-
-  @property
-  def embedding_config(self):
-    """Returns the embedding config based on current mode."""
-    master = self._get_master_address()
-    if master in self._lazy_embedding_config_dict:
-      embedding_config = self._lazy_embedding_config_dict[master]
-    else:
-      embedding_config = None
-      if self._use_tpu and self._embedding_config_spec:
-        embedding_config = _tpu_estimator_embedding.EmbeddingConfig(
-            self._embedding_config_spec, self._train_batch_size,
-            self._eval_batch_size, self.num_hosts, self.num_cores, master)
-        if not embedding_config.has_embedding_tables():
-          embedding_config = None
-      self._lazy_embedding_config_dict[master] = embedding_config
-
-    if embedding_config is not None:
-      mode = self._assert_mode()
-      # Dynamically attach tpu_embedding based on mode. With
-      # this, we could keep embedding_config immutable but call site always
-      # accesses the unified API '.tpu_embedding'.
-      embedding_config.tpu_embedding = embedding_config.get_tpu_embedding(mode)
-    return embedding_config
-
-  @property
-  def model_parallelism_enabled(self):
-    return self._model_parallelism_enabled
-
-  @property
-  def input_partition_dims(self):
-    return self._config.tpu_config.input_partition_dims
-
-  @property
-  def device_assignment(self):
-    return (self._get_device_assignment()
-            if self._model_parallelism_enabled else None)
-
-  @property
-  def num_of_cores_per_host(self):
-    metadata = self._get_tpu_system_metadata()
-    return metadata.num_of_cores_per_host
-
-  @property
-  def num_cores(self):
-    metadata = self._get_tpu_system_metadata()
-    return metadata.num_cores
-
-  @property
-  def num_of_replicas_per_host(self):
-    """Return the number of replicas per host."""
-    if self.model_parallelism_enabled:
-      return self.num_replicas // self.num_hosts
-    else:
-      return self.num_of_cores_per_host
-
-  @property
-  def num_replicas(self):
-    num_cores_in_system = self.num_cores
-
-    if self.model_parallelism_enabled:
-      num_cores_per_replica = self._config.tpu_config.num_cores_per_replica
-      if num_cores_per_replica > num_cores_in_system:
-        raise ValueError(
-            'The num of cores required by the model parallelism, specified by '
-            'TPUConfig.num_cores_per_replica, is larger than the total num of '
-            'TPU cores in the system. num_cores_per_replica: {}, num cores '
-            'in the system: {}'.format(num_cores_per_replica,
-                                       num_cores_in_system))
-
-      if num_cores_in_system % num_cores_per_replica != 0:
-        raise RuntimeError(
-            'The num of cores in the system ({}) is not divisible by the num '
-            'of cores ({}) required by the model parallelism, specified by '
-            'TPUConfig.num_cores_per_replica. This should never happen!'.format(
-                num_cores_in_system, num_cores_per_replica))
-
-      return num_cores_in_system // num_cores_per_replica
-    else:
-      return num_cores_in_system
-
-  @property
-  def num_hosts(self):
-    metadata = self._get_tpu_system_metadata()
-    return metadata.num_hosts
-
-  @property
-  def config(self):
-    return self._config
-
-  def is_input_sharded_per_core(self):
-    """Return true if input_fn is invoked per-core (other than per-host)."""
-    mode = self._assert_mode()
-    return (mode == model_fn_lib.ModeKeys.TRAIN and
-            (self._config.tpu_config.per_host_input_for_training is
-             tpu_config.InputPipelineConfig.PER_SHARD_V1))
-
-  def is_input_per_host_with_iterators(self):
-    """Return true if input_fn should be run in the per-host v2 config."""
-    return (self._config.tpu_config.per_host_input_for_training is
-            tpu_config.InputPipelineConfig.PER_HOST_V2)
-
-  def is_input_broadcast_with_iterators(self):
-    """Return true if input_fn should be run in the full_replicae config."""
-    return (self._config.tpu_config.per_host_input_for_training is
-            tpu_config.InputPipelineConfig.BROADCAST)
-
-  def is_running_on_cpu(self, is_export_mode=False):
-    """Determines whether the input_fn and model_fn should be invoked on CPU.
-
-    This API also validates user provided configuration, such as batch size,
-    according the lazy initialized TPU system metadata.
-
-    Args:
-      is_export_mode: Indicates whether the current mode is for exporting the
-        model, when mode == PREDICT. Only with this bool, we could
-        tell whether user is calling the Estimator.predict or
-        Estimator.export_savedmodel, which are running on TPU and CPU
-        respectively. Parent class Estimator does not distinguish these two.
-
-    Returns:
-      bool, whether current input_fn or model_fn should be running on CPU.
-
-    Raises:
-      ValueError: any configuration is invalid.
-    """
-
-    is_running_on_cpu = self._is_running_on_cpu(is_export_mode)
-    if not is_running_on_cpu:
-      self._validate_tpu_configuration()
-    return is_running_on_cpu
-
-  def _is_running_on_cpu(self, is_export_mode):
-    """Determines whether the input_fn and model_fn should be invoked on CPU."""
-    mode = self._assert_mode()
-
-    if not self._use_tpu:
-      return True
-
-    if mode == model_fn_lib.ModeKeys.EVAL and not self._eval_on_tpu:
-      logging.info('_is_running_on_cpu: eval_on_tpu disabled')
-      return True
-
-    if is_export_mode:
-      return True
-
-    return False
-
-  @property
-  def global_batch_size(self):
-    mode = self._assert_mode()
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      return self._train_batch_size
-    elif mode == model_fn_lib.ModeKeys.EVAL:
-      return self._eval_batch_size
-    elif mode == model_fn_lib.ModeKeys.PREDICT:
-      return self._predict_batch_size
-    else:
-      return None
-
-  @property
-  def batch_size_for_input_fn(self):
-    """Returns the shard batch size for `input_fn`."""
-    global_batch_size = self.global_batch_size
-
-    if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()):
-      return global_batch_size
-
-    # On TPU
-    if self.is_input_sharded_per_core() or (
-        self.is_input_per_host_with_iterators()):
-      return global_batch_size // self.num_replicas
-    else:
-      return global_batch_size // self.num_hosts
-
-  @property
-  def batch_size_for_model_fn(self):
-    """Returns the shard batch size for `model_fn`."""
-    global_batch_size = self.global_batch_size
-
-    if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()):
-      return global_batch_size
-
-    # On TPU. always sharded per shard.
-    return global_batch_size // self.num_replicas
-
-  @property
-  def master_job(self):
-    """Returns the job name to use to place TPU computations on.
-
-    Returns:
-      A string containing the job name, or None if no job should be specified.
-
-    Raises:
-      ValueError: If the user needs to specify a tpu_job_name, because we are
-        unable to infer the job name automatically, or if the user-specified job
-        names are inappropriate.
-    """
-    run_config = self._config
-    # If the user specifies the tpu_job_name, use that.
-    if run_config.tpu_config.tpu_job_name:
-      return run_config.tpu_config.tpu_job_name
-
-    # The tpu job is determined by the run_config. Right now, this method is
-    # required as tpu_config is not part of the RunConfig.
-    mode = self._assert_mode()
-    master = (
-        run_config.evaluation_master
-        if mode == model_fn_lib.ModeKeys.EVAL else run_config.master)
-    if master in _LOCAL_MASTERS:
-      return None
-
-    if (not run_config.session_config or
-        not run_config.session_config.cluster_def.job):
-      return _DEFAULT_JOB_NAME
-    cluster_def = run_config.session_config.cluster_def
-    job_names = set([job.name for job in cluster_def.job])
-    if _DEFAULT_JOB_NAME in job_names:
-      # b/37868888 tracks allowing ClusterSpec propagation to reuse job names.
-      raise ValueError('Currently, tpu_worker is not an allowed job name.')
-    if len(job_names) == 1:
-      return cluster_def.job[0].name
-    if len(job_names) == 2:
-      if _DEFAULT_COORDINATOR_JOB_NAME in job_names:
-        job_names.remove(_DEFAULT_COORDINATOR_JOB_NAME)
-        return job_names.pop()
-      # TODO(b/67716447): Include more sophisticated heuristics.
-    raise ValueError(
-        'Could not infer TPU job name. Please specify a tpu_job_name as part '
-        'of your TPUConfig.')
-
-  @property
-  def tpu_host_placement_function(self):
-    """Returns the TPU host place function."""
-
-    master = self.master_job
-
-    def _placement_function(_sentinal=None, replica_id=None, host_id=None):  # pylint: disable=invalid-name
-      """Return the host device given replica_id or host_id."""
-      assert _sentinal is None
-      if replica_id is not None and host_id is not None:
-        raise RuntimeError(
-            'replica_id and host_id can have only one non-None value.')
-
-      if master is None:
-        return '/replica:0/task:0/device:CPU:0'
-      else:
-        if replica_id is not None:
-          if self.model_parallelism_enabled:
-            return self.device_assignment.host_device(
-                replica=replica_id, job=master)
-          else:
-            host_id = replica_id / self.num_of_cores_per_host
-
-        return '/job:%s/task:%d/device:CPU:0' % (master, host_id)
-
-    return _placement_function
-
-  @property
-  def tpu_device_placement_function(self):
-    """Returns a TPU device placement Fn."""
-    master = self.master_job
-    job_device = '' if master is None else ('/job:%s' % master)
-
-    def _placement_function(i):
-      if self.model_parallelism_enabled:
-        return self.device_assignment.tpu_device(replica=i, job=master)
-      else:
-        num_of_cores_per_host = self.num_of_cores_per_host
-        host_id = i / num_of_cores_per_host
-        ordinal_id = i % num_of_cores_per_host
-        return '%s/task:%d/device:TPU:%d' % (job_device, host_id, ordinal_id)
-
-    return _placement_function
-
-  def tpu_ordinal_function(self, host_id):
-    """Returns the TPU ordinal fn."""
-
-    def _tpu_ordinal_function(shard_index_in_host):
-      """Return the TPU ordinal associated with a shard.
-
-      Required because the enqueue ops are placed on CPU.
-
-      Args:
-        shard_index_in_host: the shard index
-
-      Returns:
-        The ordinal of the TPU device the shard's infeed should be placed on.
-      """
-      if self.model_parallelism_enabled:
-        # We put both enqueue/dequeue ops at tpu.core(0) in each replica.
-        replica = self.device_assignment.lookup_replicas(host_id,
-                                                         0)[shard_index_in_host]
-        return self.device_assignment.tpu_ordinal(replica=replica)
-      else:
-        return shard_index_in_host % self.num_of_cores_per_host
-
-    return _tpu_ordinal_function
-
-  def _validate_tpu_configuration(self):
-    """Validates the configuration based on the TPU system metadata."""
-    mode = self._assert_mode()
-    if self._lazy_validation_dict.get(mode):
-      return
-
-    # All following information is obtained from TPU system metadata.
-    num_cores = self.num_cores
-    num_replicas = self.num_replicas
-    num_hosts = self.num_hosts
-
-    if not num_cores:
-      tpu_system_metadata = self._get_tpu_system_metadata()
-      raise RuntimeError(
-          'Cannot find any TPU cores in the system. Please double check '
-          'Tensorflow master address and TPU worker(s). Available devices '
-          'are {}.'.format(tpu_system_metadata.devices))
-
-    if self._config.tpu_config.num_shards:
-      user_provided_num_replicas = self._config.tpu_config.num_shards
-      if user_provided_num_replicas != num_replicas:
-        message = (
-            'TPUConfig.num_shards is not set correctly. According to TPU '
-            'system metadata for Tensorflow master ({}): num_replicas should '
-            'be ({}), got ({}). For non-model-parallelism, num_replicas should '
-            'be the total num of TPU cores in the system. For '
-            'model-parallelism, the total number of TPU cores should be '
-            'num_cores_per_replica * num_replicas. Please set it '
-            'accordingly or leave it as `None`'.format(
-                self._get_master_address(), num_replicas,
-                user_provided_num_replicas))
-
-        raise ValueError(message)
-
-    if self._config.tpu_config.num_cores_per_replica:
-      num_cores_per_replica = self._config.tpu_config.num_cores_per_replica
-      num_cores_per_host = self._get_tpu_system_metadata().num_of_cores_per_host
-      if num_cores_per_replica > num_cores_per_host:
-        raise ValueError(
-            'The num of cores required by the model parallelism, specified by '
-            'TPUConfig.num_cores_per_replica, is larger than the '
-            'num_cores_per_host. num_cores_per_replica: {}, '
-            'num_cores_per_host: {}'.format(num_cores_per_replica,
-                                            num_cores_per_host))
-
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      if (self._train_batch_size % num_replicas != 0 and
-          not self.is_input_broadcast_with_iterators()):
-        raise ValueError(
-            'train batch size {} must be divisible by number of replicas {}'
-            .format(self._train_batch_size, num_replicas))
-
-    elif mode == model_fn_lib.ModeKeys.EVAL:
-      if self._eval_batch_size is None:
-        raise ValueError(
-            'eval_batch_size in TPUEstimator constructor cannot be `None`'
-            'if .evaluate is running on TPU.')
-      if (self._eval_batch_size % num_replicas != 0 and
-          not self.is_input_broadcast_with_iterators()):
-        raise ValueError(
-            'eval batch size {} must be divisible by number of replicas {}'
-            .format(self._eval_batch_size, num_replicas))
-      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
-        raise ValueError(
-            'TPUEstimator.evaluate should be running on single TPU'
-            ' instead of a Pod.')
-    else:
-      assert mode == model_fn_lib.ModeKeys.PREDICT
-      if self._predict_batch_size is None:
-        raise ValueError(
-            'predict_batch_size in TPUEstimator constructor should not be '
-            '`None` if .predict is running on TPU.')
-      if (self._predict_batch_size % num_replicas != 0 and
-          not self.is_input_broadcast_with_iterators()):
-        raise ValueError(
-            'predict batch size {} must be divisible by number of replicas {}'
-            .format(self._predict_batch_size, num_replicas))
-      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
-        raise ValueError(
-            'TPUEstimator.predict should be running on single TPU worker. '
-            'got {}.'.format(num_hosts))
-
-    # Record the state "validated" into lazy dictionary.
-    self._lazy_validation_dict[mode] = True
-
-  def device_for_replica(self, replica_id):
-    """Returns the tuple of (CPU device and device ordinal) for replica.
-
-    This should be used for full replicate for non-model-parallelism.
-
-    Args:
-       replica_id: Int, the replica index.
-
-    Returns:
-       A tuple of device spec for CPU device and int device ordinal.
-    """
-    master = self.master_job
-
-    if self.model_parallelism_enabled:
-      return (self.device_assignment.host_device(
-          replica=replica_id, job=master),
-              self.device_assignment.tpu_ordinal(replica=replica_id))
-
-    job_device = '' if master is None else ('/job:%s' % master)
-
-    num_of_replicas_per_host = self.num_of_replicas_per_host
-    host_id = replica_id / num_of_replicas_per_host
-    ordinal_id = replica_id % num_of_replicas_per_host
-
-    host_device = '%s/task:%d/device:CPU:0' % (job_device, host_id)
-    return (host_device, ordinal_id)
-
-
-class _OneCoreTPUContext(_InternalTPUContext):
-  """Special _InternalTPUContext for one core usage."""
-
-  def __init__(self, config, train_batch_size, eval_batch_size,
-               predict_batch_size, use_tpu):
-
-    super(_OneCoreTPUContext, self).__init__(
-        config, train_batch_size, eval_batch_size,
-        predict_batch_size, use_tpu)
-
-  def _get_tpu_system_metadata(self):
-    """Gets the (maybe cached) TPU system metadata."""
-    master = self._get_master_address()
-    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
-    if tpu_system_metadata is not None:
-      return tpu_system_metadata
-
-    tpu_system_metadata = (
-        tpu_system_metadata_lib._TPUSystemMetadata(  # pylint: disable=protected-access
-            num_cores=1,
-            num_hosts=1,
-            num_of_cores_per_host=1,
-            topology=None,
-            devices=[]))
-
-    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
-    return tpu_system_metadata
-
-
-def _get_tpu_context(config, train_batch_size, eval_batch_size,
-                     predict_batch_size, use_tpu, eval_on_tpu,
-                     embedding_config_spec):
-  """Returns an instance of `_InternalTPUContext`."""
-
-  if (config.tpu_config.num_shards == 1 and
-      config.tpu_config.num_cores_per_replica is None):
-    if embedding_config_spec is not None:
-      raise ValueError('Setting TPUConfig.num_shards==1 is unsupported '
-                       'when embedding_config_spec is not None.')
-    logging.warning(
-        'Setting TPUConfig.num_shards==1 is an unsupported behavior. '
-        'Please fix as soon as possible (leaving num_shards as None.)')
-    return _OneCoreTPUContext(config, train_batch_size, eval_batch_size,
-                              predict_batch_size, use_tpu)
-
-  return _InternalTPUContext(config, train_batch_size, eval_batch_size,
-                             predict_batch_size, use_tpu, eval_on_tpu,
-                             embedding_config_spec)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_context import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py b/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
index 0e4597bd6fae500c93f74fcb1b16a39739d2310c..cb38a8f1a6bee3c2adfbefc203c1d143303c3368 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
@@ -1,10 +1,10 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -12,1076 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TPU embedding APIs."""
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import copy
-import math
-import re
-import six
-
-from tensorflow.contrib.framework.python.framework import experimental
-from tensorflow.contrib.tpu.ops import gen_tpu_ops
-from tensorflow.contrib.tpu.proto import tpu_embedding_configuration_pb2 as elc
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-
-TRAINING = elc.TPUEmbeddingConfiguration.TRAINING
-INFERENCE = elc.TPUEmbeddingConfiguration.INFERENCE
-
-
-class TableConfig(
-    collections.namedtuple(
-        'TableConfig',
-        ['vocabulary_size', 'dimension', 'initializer', 'combiner'])):
-  """Embedding table configuration."""
-
-  @experimental
-  def __new__(cls,
-              vocabulary_size,
-              dimension,
-              initializer=None,
-              combiner='mean'):
-    """Embedding table configuration.
-
-    Args:
-      vocabulary_size: Number of vocabulary (/rows) in the table.
-      dimension: The embedding dimension.
-      initializer: A variable initializer function to be used in embedding
-        variable initialization. If not specified, defaults to
-        `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
-        `1/sqrt(dimension)`.
-      combiner: A string specifying how to reduce if there are multiple entries
-        in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
-        'mean' the default. 'sqrtn' often achieves good accuracy, in particular
-        with bag-of-words columns. For more information, see
-        `tf.nn.embedding_lookup_sparse`.
-
-    Returns:
-      `TableConfig`.
-
-    Raises:
-      ValueError: if `vocabulary_size` is not positive integer.
-      ValueError: if `dimension` is not positive integer.
-      ValueError: if `initializer` is specified and is not callable.
-      ValueError: if `combiner` is not supported.
-    """
-    if not isinstance(vocabulary_size, int) or vocabulary_size < 1:
-      raise ValueError('Invalid vocabulary_size {}.'.format(vocabulary_size))
-
-    if not isinstance(dimension, int) or dimension < 1:
-      raise ValueError('Invalid dimension {}.'.format(dimension))
-
-    if (initializer is not None) and (not callable(initializer)):
-      raise ValueError('initializer must be callable if specified.')
-    if initializer is None:
-      initializer = init_ops.truncated_normal_initializer(
-          mean=0.0, stddev=1 / math.sqrt(dimension))
-
-    if combiner not in ('mean', 'sum', 'sqrtn'):
-      raise ValueError('Invalid combiner {}'.format(combiner))
-
-    return super(TableConfig, cls).__new__(cls, vocabulary_size, dimension,
-                                           initializer, combiner)
-
-
-AdamSlotVariableNames = collections.namedtuple(
-    'AdamSlotVariableNames', ['m', 'v'])
-
-AdagradSlotVariableName = collections.namedtuple(
-    'AdagradSlotVariableName', ['accumulator'])
-
-AdamSlotVariables = collections.namedtuple(
-    'AdamSlotVariables', ['m', 'v'])
-
-AdagradSlotVariable = collections.namedtuple(
-    'AdagradSlotVariable', ['accumulator'])
-
-VariablesAndOps = collections.namedtuple(
-    'VariablesAndOps',
-    ['embedding_variables_by_table', 'slot_variables_by_table',
-     'load_ops', 'retrieve_ops']
-)
-
-
-# TODO(shizhiw): Factor `use_gradient_accumulation` and
-# `pipeline_execution_with_tensor_core` out of `_OptimizationParameters`.
-class _OptimizationParameters(object):
-  """Parameters common to all optimizations."""
-
-  def __init__(self, learning_rate, use_gradient_accumulation,
-               pipeline_execution_with_tensor_core):
-    self.learning_rate = learning_rate
-    self.use_gradient_accumulation = use_gradient_accumulation
-    self.pipeline_execution_with_tensor_core = (
-        pipeline_execution_with_tensor_core)
-
-
-class AdagradParameters(_OptimizationParameters):
-  """Optimization parameters for Adagrad."""
-
-  def __init__(self, learning_rate, initial_accumulator,
-               use_gradient_accumulation=False,
-               pipeline_execution_with_tensor_core=True):
-    """Optimization parameters for Adagrad.
-
-    Args:
-      learning_rate: used for updating embedding table.
-      initial_accumulator: initial accumulator for Adagrad.
-      use_gradient_accumulation: setting this to `True` makes embedding
-         gradients calculation more accurate but slower. Please see
-         `optimization_parameters.proto` for details.
-         for details.
-      pipeline_execution_with_tensor_core: setting this to `True` makes training
-        faster, but trained model will be different if step N and step N+1
-        involve the same set of embedding ID. Please see
-        `tpu_embedding_configuration.proto` for details.
-    """
-    super(AdagradParameters, self).__init__(learning_rate,
-                                            use_gradient_accumulation,
-                                            pipeline_execution_with_tensor_core)
-    self.initial_accumulator = initial_accumulator
-
-
-class AdamParameters(_OptimizationParameters):
-  """Optimization parameters for Adam."""
-
-  def __init__(self, learning_rate,
-               beta1=0.9,
-               beta2=0.999,
-               epsilon=1e-08,
-               lazy_adam=True,
-               sum_inside_sqrt=True,
-               use_gradient_accumulation=False,
-               pipeline_execution_with_tensor_core=True):
-    """Optimization parameters for Adam.
-
-    Args:
-      learning_rate: a floating point value. The learning rate.
-      beta1: A float value.
-        The exponential decay rate for the 1st moment estimates.
-      beta2: A float value.
-        The exponential decay rate for the 2nd moment estimates.
-      epsilon: A small constant for numerical stability.
-      lazy_adam: Use lazy Adam instead of Adam. Lazy Adam trains faster.
-        Please see `optimization_parameters.proto` for details.
-      sum_inside_sqrt: This improves training speed. Please see
-        `optimization_parameters.proto` for details.
-      use_gradient_accumulation: setting this to `True` makes embedding
-        gradients calculation more accurate but slower. Please see
-        `optimization_parameters.proto` for details.
-        for details.
-      pipeline_execution_with_tensor_core: setting this to `True` makes training
-        faster, but trained model will be different if step N and step N+1
-        involve the same set of embedding ID. Please see
-        `tpu_embedding_configuration.proto` for details.
-    """
-    super(AdamParameters, self).__init__(learning_rate,
-                                         use_gradient_accumulation,
-                                         pipeline_execution_with_tensor_core)
-    self.beta1 = beta1
-    self.beta2 = beta2
-    self.epsilon = epsilon
-    self.lazy_adam = lazy_adam
-    self.sum_inside_sqrt = sum_inside_sqrt
-
-
-class StochasticGradientDescentParameters(_OptimizationParameters):
-  """Optimization parameters for stochastic gradient descent.
-
-  Args:
-    learning_rate: a floating point value. The learning rate.
-    use_gradient_accumulation: setting this to `True` makes embedding
-      gradients calculation more accurate but slower. Please see
-         `optimization_parameters.proto` for details.
-    pipeline_execution_with_tensor_core: setting this to `True` makes training
-      faster, but trained model will be different if step N and step N+1
-      involve the same set of embedding ID. Please see
-      `tpu_embedding_configuration.proto` for details.
-    """
-
-  def __init__(self, learning_rate, use_gradient_accumulation=False,
-               pipeline_execution_with_tensor_core=True):
-    super(StochasticGradientDescentParameters, self).__init__(
-        learning_rate, use_gradient_accumulation,
-        pipeline_execution_with_tensor_core)
-
-
-class TPUEmbedding(object):
-  """API for using TPU for embedding.
-
-    Example:
-    ```
-    table_config_user = tpu_embedding.TableConfig(
-        vocabulary_size=4, dimension=2,
-        initializer=initializer, combiner='mean')
-    table_to_config_dict = {'video': table_config_video,
-                          'user': table_config_user}
-    feature_to_table_dict = {'watched': 'video',
-                             'favorited': 'video',
-                             'friends': 'user'}
-    batch_size = 4
-    num_hosts = 1
-    optimization_parameters = tpu_embedding.AdagradParameters(1., 1.)
-    mode = tpu_embedding.TRAINING
-    embedding = tpu_embedding.TPUEmbedding(
-        table_to_config_dict, feature_to_table_dict,
-        batch_size, num_hosts, mode, optimization_parameters)
-
-    batch_size_per_core = embedding.batch_size_per_core
-    sparse_features_list = []
-    for host in hosts:
-      with ops.device(host):
-        for _ in range(embedding.num_cores_per_host):
-          sparse_features = {}
-          sparse_features['watched'] = sparse_tensor.SparseTensor(...)
-          sparse_features['favorited'] = sparse_tensor.SparseTensor(...)
-          sparse_features['friends'] = sparse_tensor.SparseTensor(...)
-          sparse_features_list.append(sparse_features)
-
-    enqueue_ops = embedding.generate_enqueue_ops(sparse_features_list)
-    embedding_variables_and_ops = embedding.create_variables_and_ops()
-
-    def computation():
-      activations = embedding.get_activations()
-      loss = compute_loss(activations)
-
-      base_optimizer = gradient_descent.GradientDescentOptimizer(
-          learning_rate=1)
-      cross_shard_optimizer = tpu_optimizer.CrossShardOptimizer(
-          base_optimizer)
-
-      train_op = cross_shard_optimizer.minimize(loss)
-      # `train_op` and `send_gradients_op` must happen in order.
-      with ops.control_dependencies([train_op]):
-        send_gradients_op = embedding.generate_send_gradients_op()
-      with ops.control_dependencies([send_gradients_op]):
-        loss = array_ops.identity(loss)
-
-    loss = tpu.shard(computation,
-                     num_shards=embedding.num_cores)
-
-    with self.test_session() as sess:
-      sess.run(tpu.initialize_system(embedding_config=
-                                     embedding.config_proto))
-      sess.run(variables.global_variables_initializer())
-      sess.run(embedding.init_ops)
-      sess.run(embedding_variables_and_ops.load_ops)
-      sess.run(enqueue_ops)
-      loss_val = sess.run(loss)
-    ```
-  """
-
-  # TODO(shizhiw): Instead of `feature_to_table_dict` which maps to table
-  # name, consider `feature_to_config_dict` which maps to `FeatureConfig`.
-  # `FeatureConfig` could have fields other than table name. For example, it
-  # could have a field to indicate that the feature should not be used to
-  # update embedding table (cr/204852758, cr/204940540). Also, this can support
-  # different combiners for different features within the same table.
-  # TODO(shizhiw, b/118512626): Remove `batch_size` from `__init__` and move it
-  # to `FeatureConfig`?
-
-  # TODO(shizhiw): will it be cleaner to make `table_to_config_dict` and
-  # `feature_to_table_dict` lists of `TableSpec` and `FeatureSpec` respectively?
-
-  # TODO(shizhiw): Consider adding `input_fn` as an option to remove boilerplate
-  # for-loops around construction of inputs.
-
-  # `optimization_parameter` applies to all tables. If the need arises,
-  # we can add `optimization_parameters` to `TableConfig` to override this
-  # global setting.
-  @experimental
-  def __init__(self,
-               table_to_config_dict,
-               feature_to_table_dict,
-               batch_size,
-               mode,
-               master,
-               optimization_parameters=None):
-    """API for using TPU for embedding lookups.
-
-    Args:
-      table_to_config_dict: A dictionary mapping from string of table name to
-        `TableConfig`. Table refers to an embedding table, e.g. `params`
-        argument to `tf.nn.embedding_lookup_sparse()`.
-      feature_to_table_dict: A dictionary mapping from string of feature name
-        to string of table name. Feature refers to ids to lookup in embedding
-        table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
-      batch_size: An `int` representing the global batch size.
-      mode: `TRAINING` or `INFERENCE`.
-      master: A `string` representing the TensorFlow master to use.
-      optimization_parameters: `AdagradParameters`, `AdamParameters`,
-        `Stochasticgradientdescentparameters`. Must be set in training and must
-        be `None` in inference.
-
-    Raises:
-      ValueError: if any input is invalid.
-    """
-    _validate_table_to_config_dict(table_to_config_dict)
-    # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
-    self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)
-    self._combiners = _create_combiners(self._table_to_config_dict)
-
-    _validate_feature_to_table_dict(table_to_config_dict, feature_to_table_dict)
-    self._feature_to_table_dict = _create_ordered_dict(feature_to_table_dict)
-    self._table_to_features_dict = _create_table_to_features_dict(
-        self._feature_to_table_dict)
-
-    self._batch_size = batch_size
-
-    self._master = master
-    self._tpu_system_metadata = (
-        tpu_system_metadata_lib._query_tpu_system_metadata(self._master))  # pylint: disable=protected-access
-    if self._tpu_system_metadata.num_cores == 0:
-      raise ValueError('TPUEmbedding needs TPUs, but master {} does not have '
-                       'TPUs.'.format(self._master))
-    self._num_hosts = self._tpu_system_metadata.num_hosts
-    self._hosts = [device.name for device in self._tpu_system_metadata.devices
-                   if 'device:CPU:' in device.name]
-    self._num_cores_per_host = self._tpu_system_metadata.num_of_cores_per_host
-    self._num_cores = self._tpu_system_metadata.num_cores
-
-    _validate_batch_size(self._batch_size, self._num_cores)
-    self._batch_size_per_core = self._batch_size // self._num_cores
-
-    self._init_ops = []
-
-    # TODO(shizhiw): remove `mode`?
-    if mode == TRAINING:
-      _validate_optimization_parameters(optimization_parameters)
-      self._optimization_parameters = optimization_parameters
-    elif mode == INFERENCE:
-      if optimization_parameters is not None:
-        raise ValueError('`optimization_parameters` should be `None` '
-                         'for inference mode.')
-      self._optimization_parameters = (
-          StochasticGradientDescentParameters(1.))
-    else:
-      raise ValueError('`mode` only supports {} and {}; got {}.'
-                       .format(TRAINING, INFERENCE, mode))
-    self._mode = mode
-
-    # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
-    # and create special handler for inference that inherits from
-    # StochasticGradientDescentHandler with more user-friendly error message
-    # on get_slot().
-    self._optimizer_handler = _get_optimization_handler(
-        self._optimization_parameters)
-
-    dummy_table_variables_init_op = self._create_dummy_table_variables()
-    self._init_ops.append(dummy_table_variables_init_op)
-
-    self._config_proto = self._create_config_proto()
-
-  @property
-  def hosts(self):
-    """A list of device names for CPU hosts.
-
-    Returns:
-      A list of device names for CPU hosts.
-    """
-    return copy.copy(self._hosts)
-
-  # TODO(shizhiw): change to num_tensor_cores_per_host to be more explicit and
-  # to be consistent with `tpu_embedding_configuration.proto`.
-  @property
-  def num_cores_per_host(self):
-    """Number of TPU cores on a CPU host.
-
-    Returns:
-      Number of TPU cores on a CPU host.
-    """
-    return self._num_cores_per_host
-
-  @property
-  def num_cores(self):
-    """Total number of TPU cores on all hosts.
-
-    Returns:
-      Total number of TPU cores on all hosts.
-    """
-    return self._num_cores
-
-  @property
-  def batch_size_per_core(self):
-    """Batch size for each TPU core.
-
-    The sparse tensors in `sparse_features_list` to `generate_enqueue_ops`
-       must have batch dimension equal to this.
-
-    Returns:
-      Batch size for each TPU core.
-    """
-    return self._batch_size_per_core
-
-  @property
-  def config_proto(self):
-    """Create embedding config proto for `tpu.initialize_system()`.
-
-    Returns:
-      an `TPUEmbeddingConfiguration` proto describing the desired
-         configuration of the hardware embedding lookup tables, which
-         is passed to `tpu.initialize_system()`.
-    """
-    return self._config_proto
-
-  @property
-  def init_ops(self):
-    """Initialization ops for TPU embedding.
-
-    It must be called after all global variables have been initialized,
-    i.e. after `global_variables_initializer()`, as it loads embedding
-    tables into TPU.
-
-    Returns:
-      A list of ops.
-    """
-    return self._init_ops
-
-  @property
-  def feature_to_table_dict(self):
-    return copy.copy(self._feature_to_table_dict)
-
-  def _create_config_proto(self):
-    """Create `TPUEmbeddingConfiguration`."""
-    config_proto = elc.TPUEmbeddingConfiguration()
-    for table in self._table_to_config_dict:
-      table_descriptor = config_proto.table_descriptor.add()
-      table_descriptor.name = table
-
-      table_config = self._table_to_config_dict[table]
-      table_descriptor.vocabulary_size = table_config.vocabulary_size
-      table_descriptor.dimension = table_config.dimension
-
-      features_for_table = self._table_to_features_dict[table]
-      table_descriptor.num_features = len(features_for_table)
-
-      table_descriptor.optimization_parameters.learning_rate.constant = (
-          self._optimization_parameters.learning_rate)
-      table_descriptor.optimization_parameters.use_gradient_accumulation = (
-          self._optimization_parameters.use_gradient_accumulation)
-      self._optimizer_handler.set_optimization_parameters(table_descriptor)
-
-    config_proto.mode = self._mode
-    config_proto.batch_size_per_tensor_core = self._batch_size_per_core
-    config_proto.num_hosts = self._num_hosts
-    config_proto.num_tensor_cores = self._num_cores
-    config_proto.sharding_strategy = elc.TPUEmbeddingConfiguration.DIV_DEFAULT
-    config_proto.pipeline_execution_with_tensor_core = (
-        self._optimization_parameters.pipeline_execution_with_tensor_core)
-
-    return config_proto
-
-  def create_variables_and_ops(self, embedding_variable_name_by_table=None,
-                               slot_variable_names_by_table=None):
-    """Create embedding and slot variables, with ops to load and retrieve them.
-
-    Args:
-      embedding_variable_name_by_table: A dictionary mapping from string of
-        table name to string of embedding variable name. If `None`,
-        defaults from `get_default_slot_variable_names()` will be used.
-      slot_variable_names_by_table: A dictionary mapping from string of table
-        name to `AdamSlotVariableNames`, `AdagradSlotVariableNames` etc. If
-        `None`, defaults from `get_default_slot_variable_names()` will be used.
-
-    Returns:
-      `tpu_embedding.VariablesAndOps` with:
-        A dictionary mapping from string of table name to embedding variables,
-        A dictionary mapping from string of table name to AdagradSlotVariable,
-         AdamSlotVariables etc with slot variables,
-        A list of ops to load embedding and slot variables on CPU to TPU,
-        A list of ops to retrieve embedding and slot variables from TPU to CPU.
-    """
-    embedding_variables_by_table = {}
-    slot_variables_by_table = {}
-    load_ops = []
-    retrieve_ops = []
-    for table in self._table_to_config_dict:
-      if embedding_variable_name_by_table:
-        embedding_variable_name = embedding_variable_name_by_table[table]
-      else:
-        embedding_variable_name = table
-      if slot_variable_names_by_table:
-        slot_variable_names = slot_variable_names_by_table[table]
-      else:
-        slot_variable_names = (
-            self._optimizer_handler.get_default_slot_variable_names(table))
-
-      device_fn = _create_device_fn(self._hosts)
-      with ops.device(device_fn):
-        table_variables = _create_partitioned_variables(
-            name=embedding_variable_name,
-            num_hosts=self._num_hosts,
-            vocabulary_size=self._table_to_config_dict[table].vocabulary_size,
-            embedding_dimension=self._table_to_config_dict[table].dimension,
-            initializer=self._table_to_config_dict[table].initializer,
-            collections=[ops.GraphKeys.GLOBAL_VARIABLES])
-        embedding_variables_by_table[table] = table_variables
-
-        slot_variables_for_table, load_ops_for_table, retrieve_ops_for_table = (
-            self._optimizer_handler.create_variables_and_ops(
-                table, slot_variable_names, self._num_hosts,
-                self._table_to_config_dict[table], table_variables)
-        )
-        slot_variables_by_table[table] = slot_variables_for_table
-        load_ops.extend(load_ops_for_table)
-        retrieve_ops.extend(retrieve_ops_for_table)
-    return VariablesAndOps(embedding_variables_by_table,
-                           slot_variables_by_table,
-                           load_ops, retrieve_ops)
-
-  def _create_dummy_table_variables(self):
-    """Create dummy embedding table variables.
-
-    The sole purpose of these dummy variables are to trigger gradient
-    calcuation wrt them so that the gradients wrt activation can be captured
-    and later sent to TPU embedding.
-
-    Returns:
-      Initializer for these variables.
-
-    Raises:
-      RuntimeError: if collection to store gradients already exists and is not
-      empty.
-    """
-    self._dummy_table_variables = []
-    # TODO(shizhiw): remove table id.
-    for table_id, table in enumerate(self._table_to_features_dict):
-      self._dummy_table_variables.append(
-          variable_scope.get_variable(
-              'tpu_embedding_dummy_table_variable_%s' % table,
-              dtype=dtypes.float32,
-              shape=[1],
-              use_resource=True,
-              trainable=True,
-              # TODO(shizhiw): Remove these dummy variables as
-              # tensorflow optimizer creates slot variable for them which
-              # is undesirable.
-              # e.g. tpu_embedding_dummy_table_variable_mlp_user/Adam{_1}.
-              # Explicitly specifying collections prevents this variable from
-              # being added to the GLOBAL_VARIABLES collection, so that Saver()
-              # ignores it.
-              collections=['tpu_embedding_dummy_table_variables']))
-
-      g = ops.get_default_graph()
-      table_gradients = g.get_collection_ref(
-          'tpu_embedding_gradients_table_%d' % table_id)
-      if table_gradients:
-        raise RuntimeError(
-            'tpu_embedding_gradients_table_%d is not empty.' % table_id)
-      table_gradients.extend([None] * len(self._table_to_features_dict[table]))
-
-    return variables.variables_initializer(
-        self._dummy_table_variables,
-        name='tpu_embedding_dummy_table_variables_init')
-
-  def generate_enqueue_ops(self, sparse_features_list):
-    """Generate enqueue ops.
-
-    Args:
-      sparse_features_list: a list of dictionary mapping from string
-        of feature names to sparse tensor. Each dictionary is for one
-        TPU core. Dictionaries for the same core should be contiguous
-        on the list.
-
-    Returns:
-      Ops to enqueue to TPU for embedding.
-    """
-    self._validate_generate_enqueue_ops_sparse_features_list(
-        sparse_features_list)
-    return [
-        self._generate_enqueue_op(
-            sparse_features, device_ordinal=i % self._num_cores_per_host)
-        for i, sparse_features in enumerate(sparse_features_list)
-    ]
-
-  def _validate_generate_enqueue_ops_sparse_features_list(
-      self, sparse_features_list):
-    """Validate `sparse_features_list`."""
-    if len(sparse_features_list) != self._num_cores:
-      raise ValueError('Length of `sparse_features_list` should match the '
-                       'number of cores; '
-                       '`len(sparse_features_list)` is {}, '
-                       'number of cores is {}.'.format(
-                           len(sparse_features_list), self._num_cores))
-
-    feature_set = set(self._feature_to_table_dict.keys())
-    contiguous_device = None
-    for i, sparse_features in enumerate(sparse_features_list):
-      used_feature_set = set(sparse_features.keys())
-
-      # Check features are valid.
-      missing_feature_set = feature_set - used_feature_set
-      if missing_feature_set:
-        raise ValueError('`sparse_features_list[{}]` misses a feature that is '
-                         'in `feature_to_config_dict`: {}.'.format(
-                             i, missing_feature_set))
-
-      extra_feature_set = used_feature_set - feature_set
-      if extra_feature_set:
-        raise ValueError('`sparse_features_list[{}]` has a feature that is not '
-                         'in `feature_to_config_dict`: {}.'.format(
-                             i, extra_feature_set))
-
-      device = None
-      device_feature = None
-      for feature, tensor in six.iteritems(sparse_features):
-        if not isinstance(tensor, sparse_tensor.SparseTensor):
-          raise ValueError('`sparse_features_list[{}]` has a feature that is '
-                           'not mapped to `SparseTensor`. '
-                           '`feature`: {}, type: {}'.format(
-                               i, feature, type(tensor)))
-
-        # Check all features are on the same device.
-        if device is None:
-          device = tensor.op.device
-          device_feature = feature
-        else:
-          if device != tensor.op.device:
-            raise ValueError('Devices are different between features in '
-                             '`sparse_features_list[{}]`; '
-                             'devices: {}, {}; features: {}, {}.'.format(
-                                 i, device, tensor.op.device, feature,
-                                 device_feature))
-
-      if i % self._num_cores_per_host:
-        if device != contiguous_device:
-          raise ValueError('We expect the `sparse_features` which are on the '
-                           'same host to be contiguous in '
-                           '`sparse_features_list`, '
-                           '`sparse_features_list[{}]` is on device {}, '
-                           'but is expected to be on device {}.'.format(
-                               i, device, contiguous_device))
-      else:
-        contiguous_device = device
-
-  def _generate_enqueue_op(self, sparse_features, device_ordinal):
-    with ops.colocate_with(list(sparse_features.values())[0]):
-      sample_idcs, embedding_idcs, aggregation_weights = (
-          self._format_for_tpu_embedding_sparse_batch(sparse_features))
-      return tpu_ops.enqueue_tpu_embedding_sparse_batch(
-          sample_idcs,
-          embedding_idcs,
-          aggregation_weights,
-          combiners=self._combiners,
-          device_ordinal=device_ordinal)
-
-  def _format_for_tpu_embedding_sparse_batch(self, sparse_features):
-    """Format sparse features for `enqueue_tpu_embedding_sparse_batch()`.
-
-    Args:
-      sparse_features: a `Dict` of `SparseTensor`s for embedding.
-
-    Returns:
-      Arguments for `enqueue_tpu_embedding_sparse_batch()`.
-    """
-
-    sample_idcs, embedding_idcs, aggregation_weights = list(), list(), list()
-    for table in self._table_to_features_dict:
-      sample_t, indices_t, weights_t = list(), list(), list()
-
-      features = self._table_to_features_dict[table]
-      for i, feature in enumerate(features):
-        tensor = sparse_features[feature]
-        sample_indices = tensor.indices[:, 0]
-        embedding_indices = tensor.values
-        weights = array_ops.ones_like(embedding_indices)
-        sample_t.append(i * self._batch_size_per_core + sample_indices)
-        indices_t.append(embedding_indices)
-        weights_t.append(weights)
-
-      sample_idcs.append(
-          math_ops.cast(array_ops.concat(sample_t, axis=0), dtype=dtypes.int32))
-      embedding_idcs.append(
-          math_ops.cast(
-              array_ops.concat(indices_t, axis=0), dtype=dtypes.int32))
-      aggregation_weights.append(
-          math_ops.cast(
-              array_ops.concat(weights_t, axis=0), dtype=dtypes.float32))
-
-    return sample_idcs, embedding_idcs, aggregation_weights
-
-  def get_activations(self):
-    """Get activations for features.
-
-    This should be called within `computation` that is passed to
-      `tpu.replicate` and friends.
-
-    Returns:
-      A dictionary mapping from `String` of feature name to `Tensor`
-        of activation.
-    """
-    recv_activations = tpu_ops.recv_tpu_embedding_activations(
-        num_outputs=len(self._table_to_config_dict),
-        config=self._config_proto.SerializeToString())
-
-    activations = collections.OrderedDict()
-    for table_id, table in enumerate(self._table_to_features_dict):
-      features = self._table_to_features_dict[table]
-      for lookup_id, feature in enumerate(features):
-        start_row = lookup_id * self._batch_size_per_core
-        end_row = start_row + self._batch_size_per_core
-        activations[feature] = gen_tpu_ops.tpu_embedding_activations(
-            self._dummy_table_variables[table_id],
-            recv_activations[table_id][start_row:end_row, :],
-            table_id=table_id,
-            lookup_id=lookup_id)
-    return activations
-
-  # TODO(shizhiw): Make `gradient_multiplier` per feature. Setting it to 0 would
-  # have the effect of `tf.stop_gradients()`.
-  # TODO(shizhiw): Consider alternative ways to capture gradients wrt embedding
-  # layer outputs to remove `_dummy_table_variables`,
-  # `_embedding_activation_grad` and `tpu_embedding_gradients_table_%d'.
-  def generate_send_gradients_op(self, gradient_multipliers=None):
-    """Retrieve gradients from collections and send them to TPU embedding.
-
-    Args:
-      gradient_multipliers: None, or dict mapping table names to gradient
-        multiplier Tensors.
-
-    Returns:
-      SendTPUEmbeddingGradients Op.
-
-    Raises:
-      ValueError: If required gradients have not been defined.
-      RuntimeError: If `mode` is not `TRAINING`.
-    """
-    if self._mode != TRAINING:
-      raise RuntimeError('Only in training mode gradients need to '
-                         'be sent to TPU embedding; got mode {}.'
-                         .format(self._mode))
-
-    g = ops.get_default_graph()
-    gradients = list()
-    for table_id, table in enumerate(self._table_to_config_dict):
-      table_gradients = g.get_collection(
-          'tpu_embedding_gradients_table_%d' % table_id)
-      if any(gradient is None for gradient in table_gradients):
-        raise ValueError(
-            'Table {}/{} has undefined gradients: this is probably because the '
-            'model asked TPUEmbedding to compute activations that were not '
-            'used.'.format(table_id, table))
-      concat_table_grads = array_ops.concat(table_gradients, axis=0)
-      if gradient_multipliers is not None:
-        concat_table_grads *= gradient_multipliers[table.name]
-      gradients.append(concat_table_grads)
-
-    return tpu_ops.send_tpu_embedding_gradients(
-        inputs=gradients, config=self.config_proto.SerializeToString())
-
-
-def _validate_table_to_config_dict(table_to_config_dict):
-  """Validate `table_to_config_dict`."""
-  for k, v in six.iteritems(table_to_config_dict):
-    if not isinstance(v, TableConfig):
-      raise ValueError('Value of `table_to_config_dict` must be of type '
-                       '`TableConfig`, got {} for {}.'.format(type(v), k))
-
-
-def _validate_feature_to_table_dict(table_to_config_dict,
-                                    feature_to_table_dict):
-  """Validate `feature_to_table_dict`."""
-  used_table_set = set(feature_to_table_dict.values())
-  table_set = set(table_to_config_dict.keys())
-
-  unused_table_set = table_set - used_table_set
-  if unused_table_set:
-    raise ValueError('`table_to_config_dict` specifies table that is not '
-                     'used in `feature_to_table_dict`: {}.'
-                     .format(unused_table_set))
-
-  extra_table_set = used_table_set - table_set
-  if extra_table_set:
-    raise ValueError('`feature_to_table_dict` refers to a table that is not '
-                     'specified in `table_to_config_dict`: {}.'
-                     .format(extra_table_set))
-
-
-def _validate_batch_size(batch_size, num_cores):
-  if batch_size % num_cores:
-    raise ValueError('`batch_size` is not a multiple of number of '
-                     'cores. `batch_size`={}, `_num_cores`={}.'.format(
-                         batch_size, num_cores))
-
-
-def _validate_optimization_parameters(optimization_parameters):
-  if not isinstance(optimization_parameters, _OptimizationParameters):
-    raise ValueError('`optimization_parameters` must inherit from '
-                     '`_OptimizationPramaters`. '
-                     '`type(optimization_parameters)`={}'.format(
-                         type(optimization_parameters)))
-
-
-class _OptimizerHandler(object):
-  """Interface class for handling optimizer specific logic."""
-
-  def __init__(self, optimization_parameters):
-    self._optimization_parameters = optimization_parameters
-
-  def set_optimization_parameters(self, table_descriptor):
-    raise NotImplementedError()
-
-  def get_default_slot_variable_names(self, table):
-    raise NotImplementedError()
-
-  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
-                               table_config, table_variables):
-    raise NotImplementedError()
-
-
-class _AdagradHandler(_OptimizerHandler):
-  """Handles Adagrad specific logic."""
-
-  def __init__(self, optimization_parameters):
-    super(_AdagradHandler, self).__init__(optimization_parameters)
-    self._table_to_accumulator_variables_dict = {}
-
-  def set_optimization_parameters(self, table_descriptor):
-    table_descriptor.optimization_parameters.adagrad.SetInParent()
-
-  def get_default_slot_variable_names(self, table):
-    return AdagradSlotVariableName('{}/{}'.format(table, 'Adagrad'))
-
-  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
-                               table_config, table_variables):
-    accumulator_initializer = init_ops.constant_initializer(
-        self._optimization_parameters.initial_accumulator)
-    accumulator_variables = _create_partitioned_variables(
-        name=slot_variable_names.accumulator,
-        num_hosts=num_hosts,
-        vocabulary_size=table_config.vocabulary_size,
-        embedding_dimension=table_config.dimension,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
-        initializer=accumulator_initializer)
-    slot_variables = AdagradSlotVariable(accumulator_variables)
-
-    load_ops = []
-    retrieve_ops = []
-    for host_id, table_variable, accumulator_variable in (zip(
-        range(num_hosts), table_variables, accumulator_variables)):
-      with ops.colocate_with(table_variable):
-        load_parameters_op = (
-            tpu_ops.load_tpu_embedding_adagrad_parameters(
-                parameters=table_variable,
-                accumulators=accumulator_variable,
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieved_table, retrieved_accumulator = (
-            tpu_ops.retrieve_tpu_embedding_adagrad_parameters(
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieve_parameters_op = control_flow_ops.group(
-            state_ops.assign(table_variable, retrieved_table),
-            state_ops.assign(accumulator_variable, retrieved_accumulator))
-
-      load_ops.append(load_parameters_op)
-      retrieve_ops.append(retrieve_parameters_op)
-    return slot_variables, load_ops, retrieve_ops
-
-
-class _AdamHandler(_OptimizerHandler):
-  """Handles Adam specific logic."""
-
-  def __init__(self, optimization_parameters):
-    super(_AdamHandler, self).__init__(optimization_parameters)
-    self._table_to_m_variables_dict = {}
-    self._table_to_v_variables_dict = {}
-
-  def set_optimization_parameters(self, table_descriptor):
-    table_descriptor.optimization_parameters.adam.beta1 = (
-        self._optimization_parameters.beta1)
-    table_descriptor.optimization_parameters.adam.beta2 = (
-        self._optimization_parameters.beta2)
-    table_descriptor.optimization_parameters.adam.epsilon = (
-        self._optimization_parameters.epsilon)
-    table_descriptor.optimization_parameters.adam.use_non_lazy_adam = (
-        not self._optimization_parameters.lazy_adam)
-    table_descriptor.optimization_parameters.adam.use_sum_inside_sqrt = (
-        self._optimization_parameters.sum_inside_sqrt)
-
-  def get_default_slot_variable_names(self, table):
-    return AdamSlotVariableNames('{}/{}/m'.format(table, 'Adam'),
-                                 '{}/{}/v'.format(table, 'Adam'))
-
-  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
-                               table_config, table_variables):
-    m_initializer = init_ops.zeros_initializer()
-    m_variables = _create_partitioned_variables(
-        name=slot_variable_names.m,
-        num_hosts=num_hosts,
-        vocabulary_size=table_config.vocabulary_size,
-        embedding_dimension=table_config.dimension,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
-        initializer=m_initializer)
-    v_initializer = init_ops.zeros_initializer()
-    v_variables = _create_partitioned_variables(
-        name=slot_variable_names.v,
-        num_hosts=num_hosts,
-        vocabulary_size=table_config.vocabulary_size,
-        embedding_dimension=table_config.dimension,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
-        initializer=v_initializer)
-    slot_variables = AdamSlotVariables(m_variables, v_variables)
-
-    load_ops = []
-    retrieve_ops = []
-    for host_id, table_variable, m_variable, v_variable in (zip(
-        range(num_hosts), table_variables,
-        m_variables, v_variables)):
-      with ops.colocate_with(table_variable):
-        load_parameters_op = (
-            tpu_ops.load_tpu_embedding_adam_parameters(
-                parameters=table_variable,
-                momenta=m_variable,
-                velocities=v_variable,
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieved_table, retrieved_m, retrieved_v = (
-            tpu_ops.retrieve_tpu_embedding_adam_parameters(
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieve_parameters_op = control_flow_ops.group(
-            state_ops.assign(table_variable, retrieved_table),
-            state_ops.assign(m_variable, retrieved_m),
-            state_ops.assign(v_variable, retrieved_v))
-
-      load_ops.append(load_parameters_op)
-      retrieve_ops.append(retrieve_parameters_op)
-    return slot_variables, load_ops, retrieve_ops
-
-
-class _StochasticGradientDescentHandler(_OptimizerHandler):
-  """Handles stochastic gradient descent specific logic."""
-
-  def set_optimization_parameters(self, table_descriptor):
-    (table_descriptor.optimization_parameters.stochastic_gradient_descent
-     .SetInParent())
-
-  def get_default_slot_variable_names(self, table):
-    return None
-
-  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
-                               table_config, table_variables):
-    del table_config
-
-    load_ops = []
-    retrieve_ops = []
-    for host_id, table_variable in (zip(
-        range(num_hosts), table_variables)):
-      with ops.colocate_with(table_variable):
-        load_parameters_op = (
-            tpu_ops
-            .load_tpu_embedding_stochastic_gradient_descent_parameters(
-                parameters=table_variable,
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieved_table = (
-            tpu_ops
-            .retrieve_tpu_embedding_stochastic_gradient_descent_parameters(
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieve_parameters_op = control_flow_ops.group(
-            state_ops.assign(table_variable, retrieved_table))
-
-      load_ops.append(load_parameters_op)
-      retrieve_ops.append(retrieve_parameters_op)
-    return None, load_ops, retrieve_ops
-
-
-def _get_optimization_handler(optimization_parameters):
-  if isinstance(optimization_parameters, AdagradParameters):
-    return _AdagradHandler(optimization_parameters)
-  elif isinstance(optimization_parameters, AdamParameters):
-    return _AdamHandler(optimization_parameters)
-  elif isinstance(optimization_parameters, StochasticGradientDescentParameters):
-    return _StochasticGradientDescentHandler(optimization_parameters)
-  else:
-    return NotImplementedError()
-
-
-def _create_ordered_dict(d):
-  """Create an OrderedDict from Dict."""
-  return collections.OrderedDict((k, d[k]) for k in sorted(d))
-
-
-def _create_combiners(table_to_config_dict):
-  return [table_to_config_dict[t].combiner for t in table_to_config_dict]
-
-
-def _create_table_to_features_dict(feature_to_table_dict):
-  """Create mapping from table to a list of its features."""
-  table_to_features_dict_tmp = {}
-  for feature, table in six.iteritems(feature_to_table_dict):
-    if table in table_to_features_dict_tmp:
-      table_to_features_dict_tmp[table].append(feature)
-    else:
-      table_to_features_dict_tmp[table] = [feature]
-
-  table_to_features_dict = collections.OrderedDict()
-  for table in sorted(table_to_features_dict_tmp):
-    table_to_features_dict[table] = sorted(table_to_features_dict_tmp[table])
-  return table_to_features_dict
-
-
-def _create_device_fn(hosts):
-  """Create device_fn() to use with _create_partitioned_variables()."""
-
-  def device_fn(op):
-    """Returns the `device` for `op`."""
-    part_match = re.match(r'.*/part_(\d+)(/|$)', op.name)
-
-    if part_match:
-      idx = int(part_match.group(1))
-    else:
-      raise RuntimeError('Internal Error: '
-                         'Expected %s to contain /part_*.' % op.name)
-
-    device = hosts[idx]
-    return device
-
-  return device_fn
-
-
-def _create_partitioned_variables(name,
-                                  num_hosts,
-                                  vocabulary_size,
-                                  embedding_dimension,
-                                  initializer,
-                                  collections=None):  # pylint: disable=redefined-outer-name
-  """Creates ParitionedVariables based on `num_hosts` for `table`."""
-  # TODO(shizhiw): automatically place embedding lookup elsewhere?
-  if vocabulary_size < num_hosts:
-    raise ValueError('`vocabulary_size`({}) is smaller than `num_hosts`({}). '
-                     'As TPU embedding is not optimized for small tables, '
-                     'please consider other ways for this embedding lookup.')
-
-  return list(variable_scope.get_variable(
-      name,
-      shape=(vocabulary_size, embedding_dimension),
-      partitioner=partitioned_variables.fixed_size_partitioner(num_hosts),
-      dtype=dtypes.float32,
-      initializer=initializer,
-      collections=collections,
-      trainable=False))
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_embedding import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/python/training/mode_keys_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_embedding_gradient.py
similarity index 63%
rename from tensorflow/python/training/mode_keys_test.py
rename to tensorflow/contrib/tpu/python/tpu/tpu_embedding_gradient.py
index c4435b7d4870ac1675a3f2f4d80def111dc85ae5..308adc77e9ad2d912d0461512655b55faa53da60 100644
--- a/tensorflow/python/training/mode_keys_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_embedding_gradient.py
@@ -1,4 +1,4 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,18 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for `tf.train.ModeKeys."""
+"""Stub file to maintain backwards compatibility."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.platform import test
-from tensorflow.python.training import mode_keys
-
-
-class ModeKeysTest(test.TestCase):
-
-  def testKeyEquality(self):
-    self.assertEqual(mode_keys.ModeKeys.PREDICT, 'predict')
-    self.assertEqual(mode_keys.ModeKeys.TRAIN, 'train')
-    self.assertEqual(mode_keys.ModeKeys.TEST, 'test')
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_embedding_gradient import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 6a3ed9bb79502505d64156c6a405d9d57ee83eb5..893118412e1363ce50416e6ef36692bc23d04179 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -1,3733 +1,33 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""TPUEstimator class."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import copy
-import os
-import signal
-import sys
-import threading
-import time
-
-import numpy as np
-import six
-from six.moves import queue as Queue  # pylint: disable=redefined-builtin
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.contrib.tpu.proto import compilation_result_pb2 as tpu_compilation_result
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.ops import tpu_ordinal_selector_op
-from tensorflow.contrib.tpu.python.tpu import _tpu_estimator_embedding
-from tensorflow.contrib.tpu.python.tpu import error_handling
-from tensorflow.contrib.tpu.python.tpu import functional as tpu_functional
-from tensorflow.contrib.tpu.python.tpu import session_support
-from tensorflow.contrib.tpu.python.tpu import tensor_tracer
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.contrib.tpu.python.tpu import tpu_config
-from tensorflow.contrib.tpu.python.tpu import tpu_context
-from tensorflow.contrib.tpu.python.tpu import tpu_feed
-from tensorflow.contrib.tpu.python.tpu import training_loop
-from tensorflow.contrib.tpu.python.tpu import util as util_lib
-from tensorflow.contrib.tpu.python.tpu._tpu_estimator_embedding import AdamParameters  # pylint: disable=unused-import
-from tensorflow.contrib.tpu.python.tpu._tpu_estimator_embedding import EmbeddingConfigSpec  # pylint: disable=unused-import
-from tensorflow.contrib.training.python.training import hparam
-from tensorflow.core.framework import variable_pb2
-from tensorflow.core.framework.summary_pb2 import Summary
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session as tf_session
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest as data_nest
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.export import export_output as export_output_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_ops_v2 as contrib_summary
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.summary import summary
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import evaluation
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training
-from tensorflow.python.training import training_util
-from tensorflow.python.util import function_utils
-from tensorflow.python.util import nest
-from tensorflow.python.util import tf_inspect
-
-_INITIAL_LOSS = 1e7
-_ZERO_LOSS = 0.
-_TPU_ESTIMATOR = 'tpu_estimator'
-_ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
-_BATCH_SIZE_KEY = 'batch_size'
-_CTX_KEY = 'context'
-_USE_TPU_KEY = 'use_tpu'
-_CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
-_ONE_GIGABYTE = 1024 * 1024 * 1024
-_TPU_ENQUEUE_OPS = '_tpu_enqueue_ops'
-_TPU_TRAIN_OP = '_tpu_train_op'
-_REWRITE_FOR_INFERENCE_MODE = '_rewrite_for_inference'
-_KEY_WHEN_PREDICTIONS_IS_A_TENSOR = '_key_when_predictions_is_a_tensor'
-
-# Ideally _USE_TPU_KEY should be reserved as well. However there are already
-# models that make use of this key, thus it can not be reserved now to prevent
-# breakage. In the long run, we would like to mitigate this by migrating models
-# off of using _USE_TPU_KEY.
-_RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY, _CTX_KEY]
-
-# TODO(b/65703635): Flip the value and remove all dead code. Currently, this is
-# only used for per-core based deployments. For per-host based pipelines, if a
-# user returns a Dataset instance it will be automatically wrapped in a
-# tf.while_loop (This can be disabled by returning features and labels
-# explicitly).
-_WRAP_INPUT_FN_INTO_WHILE_LOOP = False
-
-ops.register_proto_function(
-    '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR),
-    proto_type=variable_pb2.VariableDef,
-    to_proto=resource_variable_ops._to_proto_fn,  # pylint: disable=protected-access
-    from_proto=resource_variable_ops._from_proto_fn)  # pylint: disable=protected-access
-
-
-def _is_iterable(obj):
-  """A Python 2 and 3 compatible util to check whether `obj` is iterable."""
-  try:
-    iter(obj)
-    return True
-  except TypeError:
-    return False
-
-
-class CatchInvalidHostcallFunctions(control_flow_ops.XLAControlFlowContext):
-
-  def AddOp(self, op):
-    if op.type in [
-        'AudioSummary', 'AudioSummaryV2', 'HistogramSummary', 'ImageSummary',
-        'MergeSummary', 'ScalarSummary', 'TensorSummary', 'TensorSummaryV2'
-    ]:
-      raise ValueError('Use tf.contrib.summary inside of host_calls.')
-
-
-def _create_global_step(graph):
-  graph = graph or ops.get_default_graph()
-  if training.get_global_step(graph) is not None:
-    raise ValueError('"global_step" already exists.')
-  # Create in proper graph and base name_scope.
-  with graph.as_default() as g, g.name_scope(None):
-    return variable_scope.get_variable(
-        ops.GraphKeys.GLOBAL_STEP,
-        shape=[],
-        dtype=dtypes.int64,
-        initializer=init_ops.zeros_initializer(),
-        trainable=False,
-        use_resource=True,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP])
-
-
-def _create_or_get_iterations_per_loop():
-  """Creates or gets the iterations_per_loop variable.
-
-  In TPUEstimator, the user provided computation, the model_fn, is wrapped
-  inside a tf.while_loop for peak performance. The iterations of the loop are
-  specified by this variable, which adjusts its value on the CPU after each TPU
-  program execution and before the next TPU execution.
-
-  The purpose of using a variable, rather then a constant, is to allow
-  TPUEstimator adapt the TPU training iterations according to the final steps
-  specified by users. For example, if the user sets the iterations_per_loop as 4
-  in TPUConfig and steps as 10 in TPUEstimator.train(), the iterations_per_loop
-  variable will have the following value before each TPU training.
-
-      - 1-th TPU execution: iterations_per_loop = 4
-      - 2-th TPU execution: iterations_per_loop = 4
-      - 3-th TPU execution: iterations_per_loop = 2
-
-  As model_fn increases the global step once per train_op invocation, the global
-  step is 10 after all TPU executions, matching the steps=10 inputs passed in by
-  users.
-
-  Returns:
-    A TF non-trainable resource variable.
-
-  Raises:
-    RuntimeError: If multi iterations_per_loop variables were found.
-  """
-  graph = ops.get_default_graph()
-  collection_name = '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR)
-  iter_vars = graph.get_collection(collection_name)
-  if len(iter_vars) == 1:
-    return iter_vars[0]
-  elif len(iter_vars) > 1:
-    raise RuntimeError('Multiple iterations_per_loop_var in collection.')
-
-  with ops.colocate_with(training_util.get_global_step()):
-    with variable_scope.variable_scope(
-        _TPU_ESTIMATOR, reuse=variable_scope.AUTO_REUSE):
-      return variable_scope.get_variable(
-          _ITERATIONS_PER_LOOP_VAR,
-          initializer=init_ops.zeros_initializer(),
-          shape=[],
-          dtype=dtypes.int32,
-          trainable=False,
-          collections=[collection_name, ops.GraphKeys.LOCAL_VARIABLES],
-          use_resource=True)
-
-
-def _sync_variables_ops(ctx):
-  """Create varriables synchronization ops.
-
-  Gets the variables back from TPU nodes. This means the variables updated
-  by TPU will now be *synced* to host memory.
-  In BROADCAST mode, we skip this sync since the variables are ususally too
-  big to transmit via RPC.
-
-  Args:
-    ctx: A `_InternalTPUContext` instance with mode.
-
-  Returns:
-    A list of sync ops.
-  """
-
-  if not ctx.is_input_broadcast_with_iterators():
-    return [
-        array_ops.check_numerics(v.read_value(),
-                                 'Gradient for %s is NaN' % v.name).op
-        for v in variables.trainable_variables()
-    ]
-  else:
-    return [control_flow_ops.no_op()]
-
-
-def _increase_eval_step_op(iterations_per_loop):
-  """Returns an op to increase the eval step for TPU evaluation.
-
-  Args:
-    iterations_per_loop: Tensor. The number of eval steps running in TPU system
-      before returning to CPU host for each `Session.run`.
-
-  Returns:
-    An operation
-  """
-  eval_step = evaluation._get_or_create_eval_step()  # pylint: disable=protected-access
-  # Estimator evaluate increases 1 by default. So, we increase the difference.
-  return state_ops.assign_add(
-      eval_step,
-      math_ops.cast(iterations_per_loop - 1, dtype=eval_step.dtype),
-      use_locking=True)
-
-
-def _extract_key_names(tensor_or_dict):
-  if isinstance(tensor_or_dict, dict):
-    return sorted(tensor_or_dict.keys())
-  return []
-
-
-class _SIGNAL(object):
-  """Signal used to control the thread of infeed/outfeed.
-
-  All preserved signals must be negative numbers. Positive numbers are used to
-  indicate the number of iterations for next training/evaluation loop.
-  """
-  NEXT_BATCH = -1
-  STOP = -2
-
-
-class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-  """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
-
-  See `EstimatorSpec` for `mode`, `predictions`, `loss`, `train_op`, and
-  `export_outputs`.
-
-  For evaluation, `eval_metrics `is a tuple of `metric_fn` and `tensors`, where
-  `metric_fn` runs on CPU to generate metrics and `tensors` represents the
-  `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`.
-  To be precise, TPU evaluation expects a slightly different signature from the
-  `tf.estimator.Estimator`. While `EstimatorSpec.eval_metric_ops` expects a
-  dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`.
-  The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The
-  `tensors` usually specify the model logits, which are transferred back from
-  TPU system to CPU host. All tensors must have be batch-major, i.e., the batch
-  size is the first dimension. Once all tensors are available at CPU host from
-  all shards, they are concatenated (on CPU) and passed as positional arguments
-  to the `metric_fn` if `tensors` is list or keyword arguments if `tensors` is
-  a dict. `metric_fn` takes the `tensors` and returns a dict from metric string
-  name to the result of calling a metric function, namely a `(metric_tensor,
-  update_op)` tuple. See `TPUEstimator` for MNIST example how to specify the
-  `eval_metrics`.
-
-  `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This
-  function should not capture any Tensors in `model_fn`.
-
-  `host_call` is a tuple of a `function` and a list or dictionary of `tensors`
-  to pass to that function and returns a list of Tensors. `host_call` currently
-  works for train() and evaluate(). The Tensors returned by the function is
-  executed on the CPU on every step, so there is communication overhead when
-  sending tensors from TPU to CPU. To reduce the overhead, try reducing the
-  size of the tensors. The `tensors` are concatenated along their major (batch)
-  dimension, and so must be >= rank 1. The `host_call` is useful for writing
-  summaries with `tf.contrib.summary.create_file_writer`.
-  """
-
-  def __new__(cls,
-              mode,
-              predictions=None,
-              loss=None,
-              train_op=None,
-              eval_metrics=None,
-              export_outputs=None,
-              scaffold_fn=None,
-              host_call=None,
-              training_hooks=None,
-              evaluation_hooks=None,
-              prediction_hooks=None):
-    """Creates a validated `TPUEstimatorSpec` instance."""
-    host_calls = {}
-    if eval_metrics is not None:
-      host_calls['eval_metrics'] = eval_metrics
-    if host_call is not None:
-      host_calls['host_call'] = host_call
-    _OutfeedHostCall.validate(host_calls)
-
-    training_hooks = tuple(training_hooks or [])
-    evaluation_hooks = tuple(evaluation_hooks or [])
-    prediction_hooks = tuple(prediction_hooks or [])
-
-    for hook in training_hooks + evaluation_hooks + prediction_hooks:
-      if not isinstance(hook, session_run_hook.SessionRunHook):
-        raise TypeError('All hooks must be SessionRunHook instances, given: {}'
-                        .format(hook))
-
-    return super(TPUEstimatorSpec, cls).__new__(
-        cls,
-        mode=mode,
-        predictions=predictions,
-        loss=loss,
-        train_op=train_op,
-        eval_metrics=eval_metrics,
-        export_outputs=export_outputs,
-        scaffold_fn=scaffold_fn,
-        host_call=host_call,
-        training_hooks=training_hooks,
-        evaluation_hooks=evaluation_hooks,
-        prediction_hooks=prediction_hooks)
-
-  def as_estimator_spec(self):
-    """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
-    host_calls = {}
-    if self.eval_metrics is not None:
-      host_calls['eval_metrics'] = self.eval_metrics
-    if self.host_call is not None:
-      host_calls['host_call'] = self.host_call
-    host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls)
-    eval_metric_ops = None
-    if self.eval_metrics is not None:
-      eval_metric_ops = host_call_ret['eval_metrics']
-    hooks = None
-    if self.host_call is not None:
-      hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
-    loss = self.loss
-    if tensor_tracer.TensorTracer.is_enabled() \
-       and self.train_op is not None:
-      tt = tensor_tracer.TensorTracer()
-      (loss, tracing_calls) = tt.trace_cpu(ops.get_default_graph(),
-                                           loss, self.train_op)
-      tracing_call_ret = _OutfeedHostCall.create_cpu_hostcall(tracing_calls)
-      tracing_functions = tracing_call_ret.values()
-      if tracing_functions:
-        if hooks:
-          hooks.extend([_OutfeedHostCallHook(tracing_functions)])
-        else:
-          hooks = [_OutfeedHostCallHook(tracing_functions)]
-    hooks = tuple(hooks or [])
-    scaffold = self.scaffold_fn() if self.scaffold_fn else None
-    return model_fn_lib.EstimatorSpec(
-        mode=self.mode,
-        predictions=self.predictions,
-        loss=loss,
-        train_op=self.train_op,
-        eval_metric_ops=eval_metric_ops,
-        export_outputs=self.export_outputs,
-        scaffold=scaffold,
-        training_hooks=self.training_hooks + hooks,
-        evaluation_hooks=self.evaluation_hooks + hooks,
-        prediction_hooks=self.prediction_hooks + hooks)
-
-
-class _OpQueueContext(object):
-  """Manages work queue and thread for a infeed/outfeed thread."""
-
-  def __init__(self, name, target, args):
-    self._name = name
-    self._queue = Queue.Queue()
-    args = (self,) + args
-    self._thread = threading.Thread(name=name, target=target, args=args)
-    self._thread.daemon = True
-    self._thread.start()
-
-  def stop(self):
-    self._queue.put(_SIGNAL.STOP)
-
-  def send_next_batch_signal(self, iterations):
-    self._queue.put(iterations)
-
-  def read_iteration_counts(self):
-    while True:
-      iterations = self._queue.get(block=True)
-      logging.debug('%s read iterations %s', self._name, iterations)
-      if iterations == _SIGNAL.STOP:
-        logging.info('%s received shutdown signal, stopping.', self._name)
-        return
-      yield iterations
-
-  def join(self):
-    logging.info('Shutting down %s thread.', self._name)
-    self.stop()
-    self._thread.join()
-
-
-class _OpSignalOnceQueueContext(_OpQueueContext):
-  """Manages work queue and thread for a infeed/outfeed thread.
-
-  This subclass only signals once.
-  """
-
-  def __init__(self, name, target, args):
-    super(_OpSignalOnceQueueContext, self).__init__(name, target, args)
-    self._has_signaled = False
-
-  def send_next_batch_signal(self, iterations):
-    if not self._has_signaled:
-      self._queue.put(iterations)
-      self._has_signaled = True
-
-
-class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
-  """A Session hook setting up the TPU initialization, infeed, and outfeed.
-
-  This hook does two major things:
-  1. initialize and shutdown TPU system.
-  2. launch and join the threads for infeed enqueue and (optional) outfeed
-     dequeue.
-  """
-
-  def __init__(self,
-               ctx,
-               enqueue_ops,
-               dequeue_ops,
-               tpu_compile_op,
-               run_infeed_loop_on_coordinator=True,
-               rendezvous=None,
-               master=None,
-               session_config=None,
-               tpu_init_ops=None):
-    self._master_job = ctx.master_job
-    self._enqueue_ops = enqueue_ops
-    self._dequeue_ops = dequeue_ops
-    self._rendezvous = rendezvous
-    self._master = master
-    self._session_config = session_config
-    self._init_ops = list(tpu_init_ops or [])
-    if ctx.embedding_config is None:
-      self._embedding_layer_config = None
-    else:
-      self._embedding_layer_config = (
-          ctx.embedding_config.tpu_embedding.config_proto)
-    self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
-    self._initial_infeed_sleep_secs = (
-        ctx.config.tpu_config.initial_infeed_sleep_secs)
-
-    self._feed_error = None
-    self._finished = False
-    self._should_initialize_tpu = True
-    self._tpu_compile_op = tpu_compile_op
-
-  def begin(self):
-    logging.info('TPU job name %s', self._master_job)
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-    if self._should_initialize_tpu:
-      self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
-    else:
-      self._finalize_ops = []
-
-    summary_writer_init_ops = contrib_summary.summary_writer_initializer_op()
-    self._init_ops.extend(summary_writer_init_ops)
-    # Get all the writer resources from the initializer, so we know what to
-    # flush.
-    for op in summary_writer_init_ops:
-      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
-
-  def _run_infeed(self, queue_ctx, session):
-    logging.info('Starting infeed thread controller.')
-    if self._initial_infeed_sleep_secs:
-      logging.info('Infeed thread sleeping for %d seconds.',
-                   self._initial_infeed_sleep_secs)
-      time.sleep(self._initial_infeed_sleep_secs)
-      logging.info('Infeed thread starting after sleep')
-
-    with self._rendezvous.catch_errors(source='infeed', session=session):
-      if self._run_infeed_loop_on_coordinator:
-        for count, steps in enumerate(queue_ctx.read_iteration_counts()):
-          for i in xrange(steps):
-            logging.debug('Infeed enqueue for iteration (%d, %d)', count, i)
-            session.run(self._enqueue_ops)
-      else:
-        for _ in queue_ctx.read_iteration_counts():
-          session.run(self._enqueue_ops)
-      logging.info('Infeed thread finished, shutting down.')
-
-  def _run_outfeed(self, queue_ctx, session):
-    logging.info('Starting outfeed thread controller.')
-    with self._rendezvous.catch_errors(source='outfeed', session=session):
-      for count, steps in enumerate(queue_ctx.read_iteration_counts()):
-        for i in xrange(steps):
-          logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
-          session.run(self._dequeue_ops)
-      logging.info('Outfeed thread finished, shutting down.')
-
-  def _create_infeed_controller(self, name, target, args):
-    return _OpQueueContext(name=name, target=target, args=args)
-
-  def _assertCompilationSucceeded(self, result, coord):
-    proto = tpu_compilation_result.CompilationResultProto()
-    proto.ParseFromString(result)
-    if proto.status_error_message:
-      logging.error('Compilation failed: {}'.format(proto.status_error_message))
-      coord.request_stop()
-    else:
-      logging.info('Compilation succeeded')
-
-  def after_create_session(self, session, coord):
-    if self._should_initialize_tpu:
-      logging.info('Init TPU system')
-      start = time.time()
-      with ops.Graph().as_default():
-        with tf_session.Session(
-            self._master, config=self._session_config) as sess:
-          sess.run(
-              tpu.initialize_system(
-                  job=self._master_job,
-                  embedding_config=self._embedding_layer_config))
-      logging.info('Initialized TPU in %d seconds', time.time() - start)
-
-    session.run(self._init_ops,
-                options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
-
-    if os.environ.get('TPU_SPLIT_COMPILE_AND_EXECUTE', '') == '1':
-      logging.info('Compiling user program: this may take a while...')
-      self._assertCompilationSucceeded(session.run(self._tpu_compile_op), coord)
-
-    self._infeed_controller = self._create_infeed_controller(
-        name='InfeedController', target=self._run_infeed, args=(session,))
-
-    self._outfeed_controller = _OpQueueContext(
-        name='OutfeedController', target=self._run_outfeed, args=(session,))
-
-    # Enable the worker watchdog to terminate workers on coordinator exit.
-    watchdog_timeout = int(os.environ.get('TF_TPU_WATCHDOG_TIMEOUT', '0'))
-    if watchdog_timeout > 0:
-      session_support.start_worker_watchdog(session,
-                                            shutdown_timeout=watchdog_timeout)
-
-  def before_run(self, run_context):
-    self._feed_error = None
-
-    iterations = run_context.session.run(self._iterations_per_loop_var)
-
-    logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
-    self._infeed_controller.send_next_batch_signal(iterations)
-
-    logging.info('Dequeue next (%d) batch(es) of data from outfeed.',
-                 iterations)
-    self._outfeed_controller.send_next_batch_signal(iterations)
-
-  def end(self, session):
-    self._finished = True
-    logging.info('Stop infeed thread controller')
-    self._infeed_controller.join()
-    self._rendezvous.record_done('infeed')
-
-    logging.info('Stop output thread controller')
-    self._outfeed_controller.join()
-    self._rendezvous.record_done('outfeed')
-
-    logging.info('Shutdown TPU system.')
-    session.run(self._finalize_ops)
-
-
-class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
-
-  def __init__(self, ctx, enqueue_ops, dequeue_ops, tpu_compile_op,
-               rendezvous=None, master=None, session_config=None):
-    super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
-        ctx,
-        enqueue_ops,
-        dequeue_ops,
-        tpu_compile_op=tpu_compile_op,
-        run_infeed_loop_on_coordinator=False,
-        rendezvous=rendezvous,
-        master=master,
-        session_config=session_config)
-
-  def _create_infeed_controller(self, name, target, args):
-    return _OpSignalOnceQueueContext(name=name, target=target, args=args)
-
-
-class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at a specified step.
-
-  This hook is similar to the `session_run_hook._StopAfterNEvalsHook` with
-  following differences for TPU training:
-
-  1. This hook sets the variable for iterations_per_loop, which is used by
-     `TPUInfeedOutfeedSessionHook` to control the iterations for infeed/outfeed.
-     As the hook execution order is not guaranteed, the variable update is
-     handled in `after_create_session` and `after_run` as
-     `TPUInfeedOutfeedSessionHook` reads the variable value in `before_run`.
-
-  2. For each training loop (session.run), the global step could be increased
-     multiple times on TPU. The global step tensor value will be explicitly read
-     again in `after_run` to ensure the latest value is retrieved to avoid race
-     condition.
-  """
-
-  def __init__(self, iterations, num_steps=None, last_step=None):
-    """Initializes a `StopAtStepHook`.
-
-    Args:
-      iterations: The number of iterations to run optimizer per training loop.
-      num_steps: Number of steps to execute.
-      last_step: Step after which to stop.
-
-    Raises:
-      ValueError: If one of the arguments is invalid.
-    """
-    if num_steps is None and last_step is None:
-      raise ValueError('One of num_steps or last_step must be specified.')
-    if num_steps is not None and last_step is not None:
-      raise ValueError('Only one of num_steps or last_step can be specified.')
-    self._num_steps = num_steps
-    self._last_step = last_step
-    self._iterations = iterations
-
-  def _next_iterations(self, global_step, last_step):
-    gap = last_step - global_step
-    return min(gap, self._iterations)
-
-  def begin(self):
-    self._global_step_tensor = training_util.get_global_step()
-    if self._global_step_tensor is None:
-      raise RuntimeError('Global step should be created.')
-
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  def after_create_session(self, session, coord):
-    global_step = session.run(self._global_step_tensor)
-    if self._last_step is None:
-      self._last_step = global_step + self._num_steps
-
-    iterations = self._next_iterations(global_step, self._last_step)
-
-    self._iterations_per_loop_var.load(iterations, session=session)
-
-  def after_run(self, run_context, run_values):
-    # Global step cannot be retrieved via SessionRunArgs and before_run due to
-    # race condition.
-    global_step = run_context.session.run(self._global_step_tensor)
-    if global_step >= self._last_step:
-      run_context.request_stop()
-    else:
-      iterations = self._next_iterations(global_step, self._last_step)
-      self._iterations_per_loop_var.load(
-          iterations, session=run_context.session)
-
-
-class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at a specified step."""
-
-  def __init__(self, num_steps):
-    """Initializes a `_SetEvalIterationsHook`.
-
-    Args:
-      num_steps: Number of steps to execute.
-    """
-    self._num_steps = num_steps
-
-  def begin(self):
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  def after_create_session(self, session, coord):
-    self._iterations_per_loop_var.load(self._num_steps, session=session)
-
-
-class _StoppingPredictHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop according to the stopping signal in prediction."""
-
-  def __init__(self, scalar_stopping_signal):
-    self._scalar_stopping_signal = scalar_stopping_signal
-
-  def begin(self):
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  def after_create_session(self, session, coord):
-    # This is not necessary as we do not run infeed enqueue and outfeed dequeue
-    # in side threads for prediction model. But it makes the
-    # TPUInfeedOutfeedSessionHook prints nice message.
-    self._iterations_per_loop_var.load(1, session=session)
-
-  def before_run(self, run_context):
-    return session_run_hook.SessionRunArgs(self._scalar_stopping_signal)
-
-  def after_run(self, run_context, run_values):
-    _ = run_context
-    scalar_stopping_signal = run_values.results
-    if _StopSignals.should_stop(scalar_stopping_signal):
-      # NOTE(xiejw): In prediction, stopping signals are inserted for each
-      # batch. And we append one more batch to signal the system it should stop.
-      # The data flow might look like
-      #
-      #  batch   0: images, labels, stop = 0  (user provided)
-      #  batch   1: images, labels, stop = 0  (user provided)
-      #  ...
-      #  batch  99: images, labels, stop = 0  (user provided)
-      #  batch 100: images, labels, stop = 1  (TPUEstimator appended)
-      #
-      # where the final batch (id = 100) is appended by TPUEstimator, so we
-      # should drop it before returning the predictions to user.
-      # To achieve that, we throw the OutOfRangeError in after_run. Once
-      # Monitored Session sees this error in SessionRunHook.after_run, the
-      # "current" prediction, i.e., batch with id=100, will be discarded
-      # immediately
-      raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.')
-
-
-def generate_per_core_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder, host_device, host_id):
-  """Generates infeed enqueue ops for per-core input_fn on a single host."""
-  captured_infeed_queue = _CapturedObject()
-  tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
-
-  def enqueue_ops_fn():
-    """A fn returns enqueue_ops."""
-    num_cores_per_host = ctx.num_of_cores_per_host
-    per_host_sharded_inputs = []
-    for core_ordinal in range(num_cores_per_host):
-      with ops.name_scope('ordinal_%d' % (core_ordinal)):
-        user_context = tpu_context.TPUContext(
-            internal_ctx=ctx,
-            input_device=host_device,
-            invocation_index=host_id * ctx.num_of_cores_per_host + core_ordinal)
-        inputs = _Inputs.from_input_fn(input_fn(user_context))
-        if inputs.is_dataset:
-          raise TypeError(
-              '`input_fn` returning `Dataset`  is not yet supported in '
-              'per-Core input pipeline deployment yet. Please set '
-              'TPUConfig.per_host_input_for_training to True or return '
-              '`features` and `labels` from `input_fn`')
-        features, labels = inputs.features_and_labels()
-
-        inputs_structure_recorder.validate_and_record_structure(
-            features, labels)
-        flattened_inputs = (
-            inputs_structure_recorder.flatten_features_and_labels(
-                features, labels))
-        per_host_sharded_inputs.append(flattened_inputs)
-
-    infeed_queue = tpu_feed.InfeedQueue(
-        number_of_tuple_elements=len(per_host_sharded_inputs[0]))
-    captured_infeed_queue.capture(infeed_queue)
-
-    per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-        per_host_sharded_inputs, tpu_ordinal_function=tpu_ordinal_function_impl)
-    return per_host_enqueue_ops
-
-  return enqueue_ops_fn, captured_infeed_queue
-
-
-def generate_per_host_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder, batch_axis, device, host_id):
-  """Generates infeed enqueue ops for per-host input_fn on a single host."""
-  captured_infeed_queue = _CapturedObject()
-
-  dataset_initializer = None
-
-  with ops.device(device):
-    user_context = tpu_context.TPUContext(
-        internal_ctx=ctx, input_device=device, invocation_index=host_id)
-    inputs = _Inputs.from_input_fn(input_fn(user_context))
-
-    is_dataset = inputs.is_dataset
-    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
-      if not is_dataset:
-        raise TypeError(
-            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
-            '`features` and `labels`.')
-      if batch_axis is not None:
-        raise TypeError('For mode PREDICT, batch_axis is not supported yet.')
-      inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset,
-          batch_size=ctx.batch_size_for_input_fn,
-          add_padding=True)
-
-    if is_dataset:
-      dataset_initializer = inputs.dataset_initializer()
-
-    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
-
-  def enqueue_ops_fn():
-    """A Fn returning the TPU infeed enqueue ops.
-
-    By providing as a Fn, it can be invoked inside the tf.while_loop such that
-    the input pipeline for multiple iterations can be executed by one
-    Session.run call.
-
-    Returns:
-      list of dict of ops.
-    """
-    with ops.device(device):
-      num_of_replicas_per_host = ctx.num_of_replicas_per_host
-      # Convert user input to features and labels.  If the user returns a
-      # dataset, it is initialized and the features and labels extracted via
-      # `dataset.iterator.get_next()`
-      features, labels = inputs.features_and_labels()
-      signals = inputs.signals()
-
-      inputs_structure_recorder.validate_and_record_structure(features, labels)
-      unsharded_tensor_list = (
-          inputs_structure_recorder.flatten_features_and_labels(
-              features, labels, signals))
-
-      infeed_queue = tpu_feed.InfeedQueue(
-          tuple_types=[t.dtype for t in unsharded_tensor_list],
-          tuple_shapes=[t.shape for t in unsharded_tensor_list],
-          shard_dimensions=batch_axis)
-      captured_infeed_queue.capture(infeed_queue)
-      infeed_queue.set_number_of_shards(num_of_replicas_per_host)
-      per_host_enqueue_ops = (
-          infeed_queue.split_inputs_and_generate_enqueue_ops(
-              unsharded_tensor_list,
-              placement_function=lambda x: device,
-              tpu_ordinal_function=tpu_ordinal_function_impl))
-      if signals is None:
-        return per_host_enqueue_ops
-      else:
-        return {
-            'ops': per_host_enqueue_ops,
-            'signals': signals,
-        }
-
-  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
-
-
-def generate_per_host_v2_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder, device, host_id):
-  """Generates infeed enqueue ops for per-host input_fn on a single host."""
-  captured_infeed_queue = _CapturedObject()
-  dataset_initializer = None
-
-  with ops.device(device):
-    user_context = tpu_context.TPUContext(
-        internal_ctx=ctx, input_device=device, invocation_index=host_id)
-    inputs = _Inputs.from_input_fn(input_fn(user_context))
-
-    is_dataset = inputs.is_dataset
-    if not is_dataset:
-      raise TypeError('`input_fn` must return a `Dataset` for the PER_HOST_V2 '
-                      'input pipeline configuration.')
-
-    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
-      inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset,
-          batch_size=ctx.batch_size_for_input_fn,
-          add_padding=True,
-          num_invocations_per_step=ctx.num_of_replicas_per_host)
-
-    dataset_initializer = inputs.dataset_initializer()
-    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
-
-  def enqueue_ops_fn():
-    """Generates the per_host enqueue ops."""
-    control_deps = []
-    per_host_sharded_inputs = []
-    sparse_features_list = []
-    num_replicas_per_host = ctx.num_of_replicas_per_host
-    cached_signals = None
-    with ops.device(device):
-      if not inputs.is_dataset:
-        raise TypeError('`input_fn` must return a `Dataset` for this mode.')
-      for _ in range(num_replicas_per_host):
-        # Use control dependencies to ensure a deterministic ordering.
-        with ops.control_dependencies(control_deps):
-          features, labels = inputs.features_and_labels()  # Calls get_next()
-          signals = inputs.signals()
-
-          # All the replicas share the replica 0's stopping singal.
-          # This avoids inconsistent state among different model replcias.
-          if cached_signals:
-            signals['stopping'] = cached_signals['stopping']
-          else:
-            cached_signals = signals
-
-        features, labels, sparse_features = (
-            _tpu_estimator_embedding.split_inputs(ctx, features, labels))
-        sparse_features_list.append(sparse_features)
-
-        inputs_structure_recorder.validate_and_record_structure(
-            features, labels)
-        flattened_inputs = (
-            inputs_structure_recorder.flatten_features_and_labels(
-                features, labels, signals))
-        control_deps.extend(flattened_inputs)
-        per_host_sharded_inputs.append(flattened_inputs)
-
-      if inputs_structure_recorder.flattened_input_dims:
-        input_partition_dims = inputs_structure_recorder.flattened_input_dims
-        if signals:
-          input_partition_dims += [None] * len(signals)
-        # pylint: disable=protected-access
-        infeed_queue = tpu_feed._PartitionedInfeedQueue(
-            number_of_tuple_elements=len(per_host_sharded_inputs[0]),
-            host_id=host_id,
-            input_partition_dims=input_partition_dims,
-            device_assignment=ctx.device_assignment)
-        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-            per_host_sharded_inputs)
-      else:
-        infeed_queue = tpu_feed.InfeedQueue(
-            number_of_tuple_elements=len(per_host_sharded_inputs[0]))
-        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-            per_host_sharded_inputs,
-            tpu_ordinal_function=tpu_ordinal_function_impl)
-      captured_infeed_queue.capture(infeed_queue)
-
-    if ctx.embedding_config:
-      per_host_enqueue_ops.extend(
-          ctx.embedding_config.tpu_embedding.generate_enqueue_ops(
-              sparse_features_list))
-
-    if signals is None:
-      return per_host_enqueue_ops
-    else:
-      return {
-          'ops': per_host_enqueue_ops,
-          'signals': signals,
-      }
-
-  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
-
-
-def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
-                                      num_hosts):
-  """Generates infeed enqueue ops for one input_fn on all the hosts."""
-  captured_infeed_queue = _CapturedObject()
-  dataset_initializer = None
-  device_0 = ctx.tpu_host_placement_function(host_id=0)
-  with ops.device(device_0):
-    user_context = tpu_context.TPUContext(
-        internal_ctx=ctx, input_device=device_0, invocation_index=0)
-    inputs = _Inputs.from_input_fn(input_fn(user_context))
-
-    is_dataset = inputs.is_dataset
-    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
-      if not is_dataset:
-        raise TypeError(
-            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
-            '`features` and `labels`.')
-
-      inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset,
-          batch_size=ctx.batch_size_for_input_fn,
-          add_padding=True)
-
-    if is_dataset:
-      dataset_initializer = inputs.dataset_initializer()
-    num_replicas_per_host = ctx.num_of_replicas_per_host
-
-  def tpu_ordinal_function_impl(replica_id):
-    if ctx.device_assignment:
-      return ctx.device_assignment.tpu_ordinal(replica=replica_id)
-    else:
-      return replica_id % num_replicas_per_host
-
-  def device_function_impl(replica_id):
-    return ctx.tpu_host_placement_function(replica_id=replica_id)
-
-  def enqueue_ops_fn():
-    """Generates enqueue ops for all the hosts."""
-    broadcasted_inputs = []
-    flattened_inputs = None  # Cache result from input_fn.
-    signals = None
-    for host_id in xrange(num_hosts):
-      with ops.device(ctx.tpu_host_placement_function(host_id=host_id)):
-        for _ in xrange(ctx.num_of_replicas_per_host):
-          # Note: input_fn is only called once at host 0 for the first replica.
-          # The features and labels returned from that invocation are
-          # broadcasted to other replicas(including the replicas on other
-          # hosts).
-          if flattened_inputs is None:
-            features, labels = inputs.features_and_labels()  # Calls get_next()
-            signals = inputs.signals()
-
-            inputs_structure_recorder.validate_and_record_structure(
-                features, labels)
-            flattened_inputs = (
-                inputs_structure_recorder.flatten_features_and_labels(
-                    features, labels, signals))
-          broadcasted_inputs.append(flattened_inputs)
-
-    infeed_queue = tpu_feed.InfeedQueue(
-        number_of_tuple_elements=len(broadcasted_inputs[0]))
-    captured_infeed_queue.capture(infeed_queue)
-    enqueue_ops = infeed_queue.generate_enqueue_ops(
-        broadcasted_inputs,
-        tpu_ordinal_function=tpu_ordinal_function_impl,
-        placement_function=device_function_impl)
-
-    if signals is None:
-      return enqueue_ops
-    else:
-      return {
-          'ops': enqueue_ops,
-          'signals': signals,
-      }
-
-  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
-
-
-class _InputPipeline(object):
-  """`_InputPipeline` handles invoking `input_fn` and piping to infeed queue.
-
-  `_InputPipeline` abstracts the per-core/per-host `input_fn` invocation from
-  call site.  To be precise, based on the configuration in
-  `_InternalTPUContext`,  it invokes `input_fn` for all cores (usually
-  multi-host TPU training) or for one host (usually for single-host TPU
-  evaluation), and sends all `features` and `labels` returned by `input_fn` to
-  TPU infeed. For per-core invocation, `features` and `labels` are piped to
-  infeed directly, one tuple for each core. For per-host invocation,  `features`
-  and `labels` are split at host (with respect to `batch_axis`) and piped to all
-  cores accordingly.
-
-  In addition, flatten/unflatten are handled by `_InputPipeline` also.  Model
-  inputs returned by the `input_fn` can have one of the following forms:
-  1. features
-  2. (features, labels)
-  3. ((arbitrarily nested structure of features), labels)
-
-  Internally, form 1 is reformed to `(features, None)` as features and labels
-  are passed separately to underlying methods. For TPU training, TPUEstimator
-  may expect multiple `features` and `labels` tuples one for each core.
-
-  TPUEstimator allows various different structures for inputs (namely `features`
-  and `labels`).  Both `features` and `labels` can be any nested sturcture
-  supported by TF nest (namely, dict, tuples, namedtuples or any nested
-  structure of such of Tensors).  `labels` could be `None` as well.
-
-  These are flattened before they are passed to the infeed/outfeed library
-  as that expectes flattend lists.
-  """
-
-  class InputsStructureRecorder(object):
-    """The recorder to record inputs structure."""
-
-    def __init__(self, input_partition_dims=None):
-      # Holds the structure of inputs
-      self._feature_structure = {}
-      self._flattened_input_dims = None
-
-      if input_partition_dims:
-        # This should have been validated in TPUConfig.
-        assert len(input_partition_dims) <= 2, 'must have 1 or 2 elements.'
-        if len(input_partition_dims) == 2:
-          self._feature_dims, self._label_dims = input_partition_dims
-        else:
-          self._feature_dims = input_partition_dims[0]
-          self._label_dims = None
-
-        assert self._feature_dims is not None, ('input_partition_dims[0] must '
-                                                'not be None')
-      else:
-        self._feature_dims = None
-        self._label_dims = None
-
-      # Internal state.
-      self._initialized = False
-
-    @property
-    def flattened_input_dims(self):
-      assert self._initialized, 'InputsStructureRecorder is not initialized.'
-      return self._flattened_input_dims
-
-    def has_labels(self):
-      return 'labels' in self._feature_structure
-
-    def _flatten_input_dims(self, feature_dims, feature_dims_names, label_dims,
-                            label_dims_names, label_names, has_labels):
-      """Flatten input dims with the same order as flattened input tensors."""
-      flattened_input_dims = []
-      if feature_dims_names:
-        # We need a fixed ordering for matching the tensors in features.
-        flattened_input_dims.extend(
-            [feature_dims[name] for name in feature_dims_names])
-      else:
-        flattened_input_dims.append(feature_dims)
-
-      if label_dims_names:
-        # We need a fixed ordering for matching the tensors in labels.
-        flattened_input_dims.extend(
-            [label_dims[name] for name in label_dims_names])
-      else:
-        if label_names:
-          num_tensors_in_label = len(label_names)
-        else:
-          num_tensors_in_label = int(has_labels)
-        # Setting `None` in input_partition_dims[1] will apply `None` to
-        # all the tensors in labels, regardless of internal structure.
-        flattened_input_dims.extend([label_dims] * num_tensors_in_label)
-
-      return flattened_input_dims
-
-    def validate_and_record_structure(self, features, labels):
-      """Validates and records the structure of `features` and `labels`."""
-      # Extract structure.
-      has_labels = labels is not None
-      feature_names = _extract_key_names(features)
-      label_names = _extract_key_names(labels)
-
-      if not self._initialized:
-        # Record structure.
-        self._initialized = True
-        if self._feature_dims is not None:
-          feature_dims_names = _extract_key_names(self._feature_dims)
-          if feature_dims_names != feature_names:
-            raise ValueError(
-                'TPUConfig.input_partition_dims[0] mismatched feature'
-                ' keys. Expected {}, got {}'.format(feature_names,
-                                                    feature_dims_names))
-
-          label_dims_names = _extract_key_names(self._label_dims)
-          if self._label_dims is not None and label_dims_names != label_names:
-            raise ValueError(
-                'TPUConfig.input_partition_dims[1] mismatched label'
-                ' keys. Expected {}, got {}'.format(label_names,
-                                                    label_dims_names))
-
-          self._flattened_input_dims = self._flatten_input_dims(
-              self._feature_dims, feature_dims_names, self._label_dims,
-              label_dims_names, label_names, has_labels)
-
-    def flatten_features_and_labels(self, features, labels, signals=None):
-      """Flattens the `features` and `labels` to a single tensor list."""
-      self._feature_structure['features'] = features
-      if labels is not None:
-        self._feature_structure['labels'] = labels
-      if signals is not None:
-        self._feature_structure['signals'] = signals
-      return data_nest.flatten(self._feature_structure)
-
-    def unflatten_features_and_labels(self, flattened_inputs):
-      """Restores the flattened inputs to original features and labels form.
-
-      Args:
-        flattened_inputs: Flattened inputs for each shard.
-
-      Returns:
-        A tuple of (`features`, `labels`), where `labels` could be None.
-        Each one, if present, should have identical structure (single tensor vs
-        dict) as the one returned by input_fn.
-
-      Raises:
-        ValueError: If the number of expected tensors from `flattened_inputs`
-          mismatches the recorded structure.
-      """
-
-      unflattened_inputs = data_nest.pack_sequence_as(self._feature_structure,
-                                                      flattened_inputs)
-      return _Inputs(
-          unflattened_inputs['features'],
-          unflattened_inputs.get('labels'),
-          signals=unflattened_inputs.get('signals'))
-
-  def __init__(self, input_fn, batch_axis, ctx):
-    """Constructor.
-
-    Args:
-      input_fn: input fn for train or eval.
-      batch_axis: A python tuple of int values describing how each tensor
-        produced by the Estimator `input_fn` should be split across the TPU
-        compute shards.
-      ctx: A `_InternalTPUContext` instance with mode.
-
-    Raises:
-      ValueError: If both `sharded_features` and `num_cores` are `None`.
-    """
-    self._inputs_structure_recorder = _InputPipeline.InputsStructureRecorder(
-        ctx.input_partition_dims)
-
-    self._sharded_per_core = ctx.is_input_sharded_per_core()
-    self._input_fn = input_fn
-    self._infeed_queue = None
-    self._ctx = ctx
-    self._batch_axis = batch_axis
-
-  def generate_infeed_enqueue_ops_and_dequeue_fn(self):
-    """Generates infeed enqueue ops and dequeue_fn."""
-    # While tf.while_loop is called, the body function, which invokes
-    # `enqueue_fn` passed in, is called to construct the graph. So, input_fn
-    # structure is recorded.
-    enqueue_ops, all_hooks, run_infeed_loop_on_coordinator = (
-        self._invoke_input_fn_and_record_structure())
-
-    self._validate_input_pipeline()
-
-    def dequeue_fn():
-      """dequeue_fn is used by TPU to retrieve the tensors."""
-      # In the model-parallel case, both the host-side and device-side
-      # computations must agree on the core on which infeed takes place. We
-      # choose to perform infeed on logical core 0 of each replica.
-      values = self._infeed_queue.generate_dequeue_op(tpu_device=0)
-      # The unflatten process uses the structure information recorded above.
-      return self._inputs_structure_recorder.unflatten_features_and_labels(
-          values)
-
-    return (enqueue_ops, dequeue_fn, all_hooks, run_infeed_loop_on_coordinator)
-
-  def _invoke_input_fn_and_record_structure(self):
-    """Deploys the input pipeline and record input structure."""
-    enqueue_ops = []
-    infeed_queues = []
-    all_dataset_initializers = []
-    num_hosts = self._ctx.num_hosts
-    tpu_host_placement_fn = self._ctx.tpu_host_placement_function
-
-    run_infeed_loop_on_coordinator = True
-
-    if self._sharded_per_core:
-      # Per-Core input pipeline deployment.
-      # Invoke input pipeline for each core and placed on the corresponding
-      # host.
-      for host_id in range(num_hosts):
-        host_device = tpu_host_placement_fn(host_id=host_id)
-        with ops.device(host_device):
-          with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            enqueue_ops_fn, captured_infeed_queue = (
-                generate_per_core_enqueue_ops_fn_for_host(
-                    self._ctx, self._input_fn, self._inputs_structure_recorder,
-                    host_device, host_id))
-
-            if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-              run_infeed_loop_on_coordinator = False
-              enqueue_ops.append(
-                  _wrap_computation_in_while_loop(
-                      device=host_device, op_fn=enqueue_ops_fn))
-            else:
-              enqueue_ops.append(enqueue_ops_fn())
-            # Infeed_queue_getter must be called after enqueue_ops_fn is called.
-            infeed_queues.append(captured_infeed_queue.get())
-
-    elif self._ctx.is_input_broadcast_with_iterators():
-      # Only calls input_fn in host 0.
-      host_device = tpu_host_placement_fn(host_id=0)
-      enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
-          generate_broadcast_enqueue_ops_fn(self._ctx, self._input_fn,
-                                            self._inputs_structure_recorder,
-                                            num_hosts))
-      if dataset_initializer:
-        all_dataset_initializers.append(dataset_initializer)
-        run_infeed_loop_on_coordinator = False
-        wrap_fn = (
-            _wrap_computation_in_while_loop
-            if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
-            _wrap_computation_in_while_loop_with_stopping_signals)
-        enqueue_ops.append(wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
-      else:
-        enqueue_ops.append(enqueue_ops_fn())
-      infeed_queues.append(captured_infeed_queue.get())
-    else:
-      for host_id in range(num_hosts):
-        host_device = tpu_host_placement_fn(host_id=host_id)
-        with ops.device(host_device):
-          with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            if self._ctx.is_input_per_host_with_iterators():
-              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
-                  generate_per_host_v2_enqueue_ops_fn_for_host(
-                      self._ctx, self._input_fn,
-                      self._inputs_structure_recorder, host_device, host_id))
-            else:
-              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
-                  generate_per_host_enqueue_ops_fn_for_host(
-                      self._ctx, self._input_fn,
-                      self._inputs_structure_recorder, self._batch_axis,
-                      host_device, host_id))
-
-            # NOTE(xiejw): We dispatch here based on the return type of the
-            # users `input_fn`.
-            #
-            # 1. If input_fn returns a Dataset instance, we initialize the
-            # iterator outside of tf.while_loop, and call the iterator.get_next
-            # inside tf.while_loop.  This should be always safe.
-            #
-            # 2. If input_fn returns (features, labels), it is too late to wrap
-            # them inside tf.while_loop, as resource initialization cannot be
-            # handled in TF control flow properly. In this case, we will use
-            # python loop to enqueue the data into TPU system.  This may be
-            # slow compared to the previous case.
-            if dataset_initializer:
-              all_dataset_initializers.append(dataset_initializer)
-              run_infeed_loop_on_coordinator = False
-              wrap_fn = (
-                  _wrap_computation_in_while_loop
-                  if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
-                  _wrap_computation_in_while_loop_with_stopping_signals)
-              enqueue_ops.append(
-                  wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
-            else:
-              enqueue_ops.append(enqueue_ops_fn())
-            infeed_queues.append(captured_infeed_queue.get())
-    # infeed_queue is used to generate dequeue ops. The only thing it uses for
-    # dequeue is dtypes and types. So, any one can be used. Here, grab the
-    # first one.
-    self._infeed_queue = infeed_queues[0]
-    return enqueue_ops, [
-        util_lib.MultiHostDatasetInitializerHook(all_dataset_initializers)
-    ], run_infeed_loop_on_coordinator
-
-  def _validate_input_pipeline(self):
-    """Validates the input pipeline.
-
-    Perform some sanity checks to log user friendly information. We should
-    error out to give users better error message. But, if
-    _WRAP_INPUT_FN_INTO_WHILE_LOOP is False (legacy behavior), we cannot break
-    user code, so, log a warning.
-
-    Raises:
-      RuntimeError: If the validation failed.
-    """
-    if ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS):
-      err_msg = ('Input pipeline contains one or more QueueRunners. '
-                 'It could be slow and not scalable. Please consider '
-                 'converting your input pipeline to use `tf.data` instead (see '
-                 'https://www.tensorflow.org/guide/datasets for '
-                 'instructions.')
-      if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-        raise RuntimeError(err_msg)
-      else:
-        logging.warn(err_msg)
-
-
-def call_computation(computation,
-                     experimental_exported_model_uses_all_cores=True):
-  """Call computation.
-
-  computation uses a single-core for TPU inference. If
-  `experimental_exported_model_uses_all_cores` is `True`, this function will
-  round-robin
-  computation among all TPU cores visible to the host; otherwise, it will use
-  a single core.
-
-  Args:
-    computation: A Python function that takes no inputs and builds computation
-      graph. If `computation` returns m outputs, this function will return a
-      list of m Tensors.
-    experimental_exported_model_uses_all_cores: Whether to round-robin among all
-      cores visible to the host, or to use a single core.
-
-  Returns:
-    A list of output tensors.
-  """
-  if experimental_exported_model_uses_all_cores:
-    # Using `TPUPartitionedCall` makes it possible to target a different
-    # TPU core with every `Session.run()` call. Note that the entire inference
-    # graph executes on a single core, and that invocations of this graph
-    # will round-robin among the cores attached to a host.
-    @function.Defun()
-    def tpu_subgraph():
-      return computation()
-
-    return tpu_functional.TPUPartitionedCall(
-        args=tpu_subgraph.captured_inputs,
-        device_ordinal=tpu_ordinal_selector_op.tpu_ordinal_selector(),
-        Tout=[o.type for o in tpu_subgraph.definition.signature.output_arg],
-        f=tpu_subgraph)
-  else:
-    return computation()
-
-
-class _ModelFnWrapper(object):
-  """A `model_fn` wrapper.
-
-  This makes calling model_fn on CPU and TPU easier and more consistent and
-  performs necessary check and mutation required by TPU training and evaluation.
-
-  In addition, this wrapper manages converting the `model_fn` to a single TPU
-  train and eval step.
-  """
-
-  def __init__(self, model_fn, config, params, ctx):
-    self._model_fn = model_fn
-    self._config = config
-    self._params = params
-    self._ctx = ctx
-
-  def call_without_tpu(self, features, labels, is_export_mode):
-    return self._call_model_fn(features, labels, is_export_mode=is_export_mode)
-
-  def _add_embedding_features(self, features):
-    if self._ctx.embedding_config:
-      tpu_embedding_ = self._ctx.embedding_config.tpu_embedding
-      embedding_activations = tpu_embedding_.get_activations()
-      features.update(embedding_activations)
-
-  def convert_to_single_tpu_train_step(self, dequeue_fn):
-    """Converts user provided model_fn` as a single train step on TPU.
-
-    The user provided `model_fn` takes input tuple
-    (features, labels) and produces the EstimatorSpec with train_op and loss for
-    train `mode`. This usually represents a single train computation on CPU.
-
-    For TPU training, a train (computation) step is first wrapped in a
-    tf.while_loop control flow to repeat for many times and then replicated to
-    all TPU shards. Besides the input should be taken from TPU infeed rather
-    than input pipeline (input_fn) directly. To fit TPU loop and replicate
-    pattern, the original train computation should be reformed, which is the
-    returned `train_step`.
-
-    Args:
-      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
-        infeed dequeue channel.
-
-    Returns:
-      A tuple of train_fn, host_calls, and captured scaffold_fn. The train_fn
-      representing the train step for TPU.
-    """
-
-    host_call = _OutfeedHostCall(self._ctx)
-    captured_scaffold_fn = _CapturedObject()
-    captured_training_hooks = _CapturedObject()
-
-    def train_step(loss):
-      """Training step function for use inside a while loop."""
-      del loss  # unused; required in function signature.
-      inputs = dequeue_fn()
-      features, labels = inputs.features_and_labels()
-      self._add_embedding_features(features)
-
-      estimator_spec = self._verify_estimator_spec(
-          self._call_model_fn(features, labels))
-      loss, train_op = estimator_spec.loss, estimator_spec.train_op
-
-      if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-        captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
-      else:
-        captured_scaffold_fn.capture(None)
-
-      captured_training_hooks.capture(estimator_spec.training_hooks)
-
-      tracing_ops = []
-      if tensor_tracer.TensorTracer.is_enabled():
-        tt = tensor_tracer.TensorTracer()
-        loss, tracing_ops = tt.trace_tpu(ops.get_default_graph(),
-                                         loss, train_op,
-                                         self._ctx.num_replicas,
-                                         self._ctx.num_of_replicas_per_host,
-                                         self._ctx.num_hosts)
-
-      if self._ctx.embedding_config is None:
-        apply_sparse_grads = []
-      else:
-        tpu_embedding_ = self._ctx.embedding_config.tpu_embedding
-        apply_sparse_grads = [tpu_embedding_.generate_send_gradients_op()]
-
-      # We must run train_op to update the variables prior to running the
-      # outfeed.
-      with ops.control_dependencies([train_op] + tracing_ops +
-                                    apply_sparse_grads):
-        host_call_outfeed_ops = []
-        if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)  # pylint: disable=protected-access
-            and estimator_spec.host_call is not None):
-          host_call.record({'host_call': estimator_spec.host_call})
-          host_call_outfeed_ops = host_call.create_enqueue_op()
-        with ops.control_dependencies(host_call_outfeed_ops):
-          return array_ops.identity(loss)
-
-    return (train_step, host_call, captured_scaffold_fn,
-            captured_training_hooks)
-
-  def convert_to_single_tpu_eval_step(self, dequeue_fn):
-    """Converts user provided model_fn` as a single eval step on TPU.
-
-    Similar to training, the user provided `model_fn` takes input tuple
-    (features, labels) and produces the TPUEstimatorSpec with eval_metrics for
-    eval `mode`. This usually represents a single evaluation computation on CPU.
-
-    For TPU evaluation, a eval (computation) step is first wrapped in a
-    tf.while_loop control flow to repeat for many times and then replicated to
-    all TPU shards. Besides the input and output are slightly different. Input,
-    features and labels, should be taken from TPU infeed rather than input
-    pipeline (input_fn) directly. Output is managed in two stages.  First, the
-    model outputs as the result of evaluation computation, usually model logits,
-    should be transferred from TPU system to CPU. Then, all model outputs are
-    concatenated first on CPU and sent to the metric_fn for metrics computation.
-    To fit TPU evaluation pattern, the original eval computation should be
-    reformed, which is the returned `eval_step`.
-
-    Args:
-      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
-        infeed dequeue channel.
-
-    Returns:
-      A tuple of eval_fn, host_calls, and captured scaffold_fn. The eval_fn
-      representing the eval step for TPU.
-    """
-    host_calls = _OutfeedHostCall(self._ctx)
-    captured_scaffold_fn = _CapturedObject()
-    captured_eval_hooks = _CapturedObject()
-
-    def eval_step(total_loss):
-      """Evaluation step function for use inside a while loop."""
-      inputs = dequeue_fn()
-      features, labels = inputs.features_and_labels()
-      self._add_embedding_features(features)
-
-      tpu_estimator_spec = self._call_model_fn(features, labels)
-      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-        raise RuntimeError(
-            'estimator_spec used by TPU evaluation must have type'
-            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
-
-      loss = tpu_estimator_spec.loss
-      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
-      captured_eval_hooks.capture(tpu_estimator_spec.evaluation_hooks)
-
-      to_record = {}
-      if tpu_estimator_spec.eval_metrics:
-        to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics
-      if tpu_estimator_spec.host_call is not None:
-        # We assume that evaluate won't update global step, so we don't wrap
-        # this host_call.
-        to_record['host_call'] = tpu_estimator_spec.host_call
-      host_calls.record(to_record)
-
-      with ops.control_dependencies(host_calls.create_enqueue_op()):
-        return math_ops.add(total_loss, loss)
-
-    return eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
-
-  def convert_to_single_tpu_predict_step(self, dequeue_fn):
-    """Converts user provided model_fn` as a single predict step on TPU.
-
-    Args:
-      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
-        infeed dequeue channel.
-
-    Returns:
-      A tuple of predict_fn, host_calls, and captured scaffold_fn. The
-      predict_fn representing the predict step for TPU.
-    """
-    host_calls = _OutfeedHostCall(self._ctx)
-    captured_scaffold_fn = _CapturedObject()
-    captured_predict_hooks = _CapturedObject()
-
-    def predict_step(unused_scalar_stopping_signal):
-      """Evaluation step function for use inside a while loop."""
-      inputs = dequeue_fn()
-      features, labels = inputs.features_and_labels()
-      stopping_signals = inputs.signals()
-
-      assert stopping_signals is not None, (
-          'Internal Error: `signals` is missing.')
-
-      tpu_estimator_spec = self._call_model_fn(
-          features, labels, is_export_mode=False)
-      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-        raise RuntimeError(
-            'estimator_spec used by TPU prediction must have type'
-            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
-
-      self._verify_tpu_spec_predictions(tpu_estimator_spec.predictions)
-
-      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
-      captured_predict_hooks.capture(tpu_estimator_spec.prediction_hooks)
-      to_record = {}
-      identity_fn = lambda **kwargs: kwargs
-      to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions]
-      to_record['signals'] = [identity_fn, stopping_signals]
-      if tpu_estimator_spec.host_call is not None:
-        to_record['host_call'] = tpu_estimator_spec.host_call
-      host_calls.record(to_record)
-
-      with ops.control_dependencies(host_calls.create_enqueue_op()):
-        return _StopSignals.as_scalar_stopping_signal(stopping_signals)
-
-    return (predict_step, host_calls, captured_scaffold_fn,
-            captured_predict_hooks)
-
-  def _verify_tpu_spec_predictions(self, predictions):
-    """Validates TPUEstimatorSpec.predictions dict."""
-    # TODO(xiejw): Adds validation for prediction dictionrary.
-    # TODO(xiejw): Adds support for single tensor as predictions.
-    if not isinstance(predictions, dict):
-      raise TypeError('TPUEstimatorSpec.predictions must be dict of Tensors.')
-
-    for (key, tensor) in predictions.items():
-      if tensor.shape.dims[0].value is None:
-        raise ValueError(
-            'The tensor with key ({}) in TPUEstimatorSpec.predictions has '
-            'dynamic shape (should be static). Tensor: {}'.format(key, tensor))
-    return predictions
-
-  def _validate_model_features_and_labels(self, features, labels,
-                                          is_export_mode):
-    """Validates that the features and labels for the model function are valid.
-
-    A valid features/labels object is the one with:
-    - Type: A tensor or any nested structure of tensors supported by TF nest,
-        namely nested dictionary, tuple, namedtuple, or sequence of tensors.
-    - Static shape if is_export_mode is False.
-
-    Args:
-      features: the features that would be input to the model function.
-      labels: the labels that would be input to the model function.
-      is_export_mode: boolean value specifying if in export mode.
-
-    Raises:
-      TypeError: If features/labels are not of the correct type.
-      ValueError: If features/labels have dynamic shape.
-    """
-
-    def validate(obj, obj_name):
-      """Helper validate function."""
-      if is_export_mode or self._ctx.is_running_on_cpu(is_export_mode):
-        return
-      if isinstance(obj, ops.Tensor):
-        if not obj.get_shape().is_fully_defined():
-          raise ValueError(
-              'The {} to the model returned by input_fn must have static shape.'
-              ' Tensor: {}'.format(obj_name, obj))
-      else:
-        for tensor in data_nest.flatten(obj):
-          if not tensor.get_shape().is_fully_defined():
-            raise ValueError(
-                ('The {} to the model returned by input_fn must have static '
-                 'shape. Tensor: {}').format(obj_name, tensor))
-
-    validate(features, 'features')
-    if labels is not None:
-      validate(labels, 'labels')
-
-  def _call_model_fn(self, features, labels, is_export_mode=False):
-    """Calls the model_fn with required parameters."""
-    self._validate_model_features_and_labels(features, labels, is_export_mode)
-    model_fn_args = function_utils.fn_args(self._model_fn)
-    kwargs = {}
-
-    # Makes deep copy with `config` and params` in case user mutates them.
-    config = copy.deepcopy(self._config)
-    params = copy.deepcopy(self._params)
-
-    if 'labels' in model_fn_args:
-      kwargs['labels'] = labels
-    elif labels is not None:
-      raise ValueError(
-          'model_fn does not take labels, but input_fn returns labels.')
-    if 'mode' in model_fn_args:
-      kwargs['mode'] = self._ctx.mode
-    if 'config' in model_fn_args:
-      kwargs['config'] = config
-    if 'params' in model_fn_args:
-      kwargs['params'] = params
-
-    if 'params' not in model_fn_args:
-      raise ValueError('model_fn ({}) does not include params argument, '
-                       'required by TPUEstimator to pass batch size as '
-                       'params[\'batch_size\']'.format(self._model_fn))
-
-    if is_export_mode:
-      batch_size_for_model_fn = None
-    else:
-      batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
-
-    if batch_size_for_model_fn is not None:
-      _add_item_to_params(params, _BATCH_SIZE_KEY, batch_size_for_model_fn)
-
-    running_on_cpu = self._ctx.is_running_on_cpu(is_export_mode)
-    _add_item_to_params(params, _USE_TPU_KEY, not running_on_cpu)
-
-    if not running_on_cpu:
-      user_context = tpu_context.TPUContext(
-          internal_ctx=self._ctx, call_from_input_fn=False)
-      _add_item_to_params(params, _CTX_KEY, user_context)
-
-    estimator_spec = self._model_fn(features=features, **kwargs)
-    if (running_on_cpu and
-        isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)):  # pylint: disable=protected-access
-      # The estimator_spec will be passed to `Estimator` directly, which expects
-      # type `EstimatorSpec`.
-      return estimator_spec.as_estimator_spec()
-    else:
-      return estimator_spec
-
-  def _verify_estimator_spec(self, estimator_spec):
-    """Validates the estimator_spec."""
-    if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-      return estimator_spec
-
-    err_msg = '{} returned by EstimatorSpec is not supported in TPUEstimator.'
-    if estimator_spec.training_chief_hooks:
-      raise ValueError(
-          err_msg.format('training_chief_hooks') + 'If you want' +
-          ' to pass training hooks, please pass via training_hooks.')
-
-    if estimator_spec.scaffold:
-      logging.warning('EstimatorSpec.Scaffold is ignored by TPU train/eval. '
-                      'Please use TPUEstimatorSpec.')
-    return estimator_spec
-
-
-class _OutfeedHostCall(object):
-  """Support for `eval_metrics` and `host_call` in TPUEstimatorSpec."""
-
-  def __init__(self, ctx):
-    self._ctx = ctx
-    self._names = []
-    # All of these are dictionaries of lists keyed on the name.
-    self._host_fns = {}
-    self._tensor_keys = collections.defaultdict(list)
-    self._tensors = collections.defaultdict(list)
-    self._tensor_dtypes = collections.defaultdict(list)
-    self._tensor_shapes = collections.defaultdict(list)
-
-  @staticmethod
-  def validate(host_calls):
-    """Validates the `eval_metrics` and `host_call` in `TPUEstimatorSpec`."""
-
-    for name, host_call in host_calls.items():
-      if not isinstance(host_call, (tuple, list)):
-        raise ValueError('{} should be tuple or list'.format(name))
-      if len(host_call) != 2:
-        raise ValueError('{} should have two elements.'.format(name))
-      if not callable(host_call[0]):
-        raise TypeError('{}[0] should be callable.'.format(name))
-      if not isinstance(host_call[1], (tuple, list, dict)):
-        raise ValueError('{}[1] should be tuple or list, or dict.'.format(name))
-
-      if isinstance(host_call[1], (tuple, list)):
-        fullargspec = tf_inspect.getfullargspec(host_call[0])
-        fn_args = function_utils.fn_args(host_call[0])
-        # wrapped_hostcall_with_global_step uses varargs, so we allow that.
-        if fullargspec.varargs is None and len(host_call[1]) != len(fn_args):
-          raise RuntimeError(
-              'In TPUEstimatorSpec.{}, length of tensors {} does not match '
-              'method args of the function, which takes {}.'.format(
-                  name, len(host_call[1]), len(fn_args)))
-
-  @staticmethod
-  def create_cpu_hostcall(host_calls):
-    """Runs on the host_call on CPU instead of TPU when use_tpu=False."""
-
-    _OutfeedHostCall.validate(host_calls)
-    ret = {}
-    for name, host_call in host_calls.items():
-      host_fn, tensors = host_call
-      if isinstance(tensors, (tuple, list)):
-        ret[name] = host_fn(*tensors)
-      else:
-        # Must be dict.
-        try:
-          ret[name] = host_fn(**tensors)
-        except TypeError as e:
-          logging.warning(
-              'Exception while calling %s: %s. It is likely the tensors '
-              '(%s[1]) do not match the '
-              'function\'s arguments', name, e, name)
-          raise
-    return ret
-
-  def record(self, host_calls):
-    """Records the host_call structure."""
-
-    for name, host_call in host_calls.items():
-      host_fn, tensor_list_or_dict = host_call
-      self._names.append(name)
-      self._host_fns[name] = host_fn
-
-      if isinstance(tensor_list_or_dict, dict):
-        for (key, tensor) in six.iteritems(tensor_list_or_dict):
-          self._tensor_keys[name].append(key)
-          self._tensors[name].append(tensor)
-          self._tensor_dtypes[name].append(tensor.dtype)
-          self._tensor_shapes[name].append(tensor.shape)
-      else:
-        # List or tuple.
-        self._tensor_keys[name] = None
-        for tensor in tensor_list_or_dict:
-          self._tensors[name].append(tensor)
-          self._tensor_dtypes[name].append(tensor.dtype)
-          self._tensor_shapes[name].append(tensor.shape)
-
-  def create_enqueue_op(self):
-    """Create the op to enqueue the recorded host_calls.
-
-    Returns:
-      A list of enqueue ops, which is empty if there are no host calls.
-    """
-    if not self._names:
-      return []
-
-    tensors = []
-    # TODO(jhseu): Consider deduping tensors.
-    for name in self._names:
-      tensors.extend(self._tensors[name])
-
-    with ops.device(tpu.core(0)):
-      return [tpu_ops.outfeed_enqueue_tuple(tensors)]
-
-  def create_tpu_hostcall(self):
-    """Sends the tensors through outfeed and runs the host_fn on CPU.
-
-    The tensors are concatenated along dimension 0 to form a global tensor
-    across all shards. The concatenated function is passed to the host_fn and
-    executed on the first host.
-
-    Returns:
-      A dictionary mapping name to the return type of the host_call by that
-      name.
-
-    Raises:
-      RuntimeError: If outfeed tensor is scalar.
-    """
-    if not self._names:
-      return {}
-
-    ret = {}
-    # For each i, dequeue_ops[i] is a list containing the tensors from all
-    # shards. This list is concatenated later.
-    dequeue_ops = []
-    tensor_dtypes = []
-    tensor_shapes = []
-    for name in self._names:
-      for _ in self._tensors[name]:
-        dequeue_ops.append([])
-      for dtype in self._tensor_dtypes[name]:
-        tensor_dtypes.append(dtype)
-      for shape in self._tensor_shapes[name]:
-        tensor_shapes.append(shape)
-
-    # Outfeed ops execute on each replica's first logical core. Note: we must
-    # constraint it such that we have at most one outfeed dequeue and enqueue
-    # per replica.
-    for i in xrange(self._ctx.num_replicas):
-      host_device, ordinal_id = self._ctx.device_for_replica(i)
-      with ops.device(host_device):
-        outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
-            dtypes=tensor_dtypes,
-            shapes=tensor_shapes,
-            device_ordinal=ordinal_id)
-        for j, item in enumerate(outfeed_tensors):
-          dequeue_ops[j].append(item)
-
-    # Deconstruct dequeue ops.
-    flat_dequeue_ops = []
-    for l in dequeue_ops:
-      flat_dequeue_ops.extend(l)
-
-    dequeue_ops_by_name = {}
-    pos = 0
-    for name in self._names:
-      dequeue_ops_by_name[name] = dequeue_ops[pos:pos +
-                                              len(self._tensors[name])]
-      pos += len(self._tensors[name])
-
-    def _call_host_fn(fn, *args, **kw):
-      context = CatchInvalidHostcallFunctions()
-      context.Enter()
-      result = fn(*args, **kw)
-      context.Exit()
-      context.ExitResult(result)
-      return result
-
-    # It is assumed evaluation always happens on single host TPU system. So,
-    # place all ops on tpu host if possible.
-    #
-    # TODO(jhseu): Evaluate whether this is right for summaries.
-    with ops.device(self._ctx.tpu_host_placement_function(replica_id=0)):
-      for name in self._names:
-        dequeue_ops = dequeue_ops_by_name[name]
-        for i, item in enumerate(dequeue_ops):
-          if dequeue_ops[i][0].shape.ndims == 0:
-            raise RuntimeError(
-                'All tensors outfed from TPU should preserve batch size '
-                'dimension, but got scalar {}'.format(dequeue_ops[i][0]))
-          # TODO(xiejw): Make the specification of the outfeed combinaton
-          # function more explicit and well-documented.  We may want to give the
-          # user the option of concatenating along any axis.
-          if (self._ctx.config.tpu_config.per_host_input_for_training is
-              tpu_config.InputPipelineConfig.BROADCAST):
-            # If the infeed is in BROADCAST mode (each core recieving the same
-            # input), then we assume that the cores also produce identical
-            # copies of the same output, and we simply take the output from
-            # the first core.  This mode is used by Mesh-TensorFlow.
-            with ops.control_dependencies(dequeue_ops[i]):
-              dequeue_ops[i] = array_ops.identity(dequeue_ops[i][0])
-          else:
-            # Assume that the input has been batch-split and that axis 0 of the
-            # output tensors represents the batch size.  Concatenate along
-            # the axis 0 to re-combine the batch.
-            dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0)
-
-        if self._tensor_keys[name] is not None:
-          # The user-provided eval_metrics[1] is a dict.
-          dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops))
-          try:
-            ret[name] = _call_host_fn(self._host_fns[name], **dequeue_ops)
-          except TypeError as e:
-            logging.warning(
-                'Exception while calling %s: %s. It is likely the tensors '
-                '(%s[1]) do not match the '
-                'function\'s arguments', name, e, name)
-            raise
-        else:
-          ret[name] = _call_host_fn(self._host_fns[name], *dequeue_ops)
-
-    # force all dequeue operations to be run if not consumed by the host calls
-    ret['__force_dequeue'] = control_flow_ops.group(*flat_dequeue_ops)
-    return ret
-
-
-class _OutfeedHostCallHook(session_run_hook.SessionRunHook):
-  """Hook to run host calls when use_tpu=False."""
-
-  def __init__(self, tensors):
-    self._tensors = tensors
-
-  def begin(self):
-    # We duplicate this code from the TPUInfeedOutfeedSessionHook rather than
-    # create a separate hook to guarantee execution order, because summaries
-    # need to be initialized before the outfeed thread starts.
-    # TODO(jhseu): Make a wrapper hook instead?
-    self._init_ops = contrib_summary.summary_writer_initializer_op()
-    # Get all the writer resources from the initializer, so we know what to
-    # flush.
-    self._finalize_ops = []
-    for op in self._init_ops:
-      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
-
-  def after_create_session(self, session, coord):
-    session.run(self._init_ops)
-
-  def before_run(self, run_context):
-    return basic_session_run_hooks.SessionRunArgs(self._tensors)
-
-  def end(self, session):
-    session.run(self._finalize_ops)
-
-
-class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
-  """Calculate and report global_step/sec and examples/sec during runtime."""
-
-  def __init__(self,
-               batch_size,
-               every_n_steps=100,
-               every_n_secs=None,
-               output_dir=None,
-               summary_writer=None):
-    self._batch_size = batch_size
-    super(ExamplesPerSecondHook, self).__init__(
-        every_n_steps=every_n_steps,
-        every_n_secs=every_n_secs,
-        output_dir=output_dir,
-        summary_writer=summary_writer)
-
-  def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
-    global_step_per_sec = elapsed_steps / elapsed_time
-    examples_per_sec = self._batch_size * global_step_per_sec
-    if self._summary_writer is not None:
-      global_step_summary = Summary(value=[
-          Summary.Value(tag='global_step/sec', simple_value=global_step_per_sec)
-      ])
-      example_summary = Summary(value=[
-          Summary.Value(tag='examples/sec', simple_value=examples_per_sec)
-      ])
-      self._summary_writer.add_summary(global_step_summary, global_step)
-      self._summary_writer.add_summary(example_summary, global_step)
-    logging.info('global_step/sec: %g', global_step_per_sec)
-    logging.info('examples/sec: %g', examples_per_sec)
-
-
-class InstallSignalHandlerHook(session_run_hook.SessionRunHook):
-  """Change SIGINT (CTRL^C) handler to force quit the process.
-
-  The default behavior often results in hanging processes.
-  The original handler is restored after training/evaluation.
-  """
-
-  def __init__(self):
-    self._signal_fn = signal.getsignal(signal.SIGINT)
-
-  def before_run(self, run_context):
-    signal.signal(signal.SIGINT, signal.SIG_DFL)
-
-  def end(self, session):
-    signal.signal(signal.SIGINT, self._signal_fn)
-
-
-class TPUEstimator(estimator_lib.Estimator):
-  """Estimator with TPU support.
-
-  TPUEstimator also supports training on CPU and GPU. You don't need to define
-  a separate `tf.estimator.Estimator`.
-
-  TPUEstimator handles many of the details of running on TPU devices, such as
-  replicating inputs and models for each core, and returning to host
-  periodically to run hooks.
-
-  TPUEstimator transforms a global batch size in params to a per-shard batch
-  size when calling the `input_fn` and `model_fn`. Users should specify
-  global batch size in constructor, and then get the batch size for each shard
-  in `input_fn` and `model_fn` by `params['batch_size']`.
-
-  - For training, `model_fn` gets per-core batch size; `input_fn` may get
-    per-core or per-host batch size depending on `per_host_input_for_training`
-    in `TPUConfig` (See docstring for TPUConfig for details).
-
-  - For evaluation and prediction, `model_fn` gets per-core batch size and
-    `input_fn` get per-host batch size.
-
-  Evaluation
-  ==========
-
-  `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics`
-  for TPU evaluation. However, if eval_on_tpu is False, `model_fn` must return
-  `EstimatorSpec` and the evaluation will execute on CPU or GPU; in this case
-  the following discussion on TPU evaluation does not apply.
-
-  `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where
-  `tensors` could be a list of any nested structure of `Tensor`s (See
-  `TPUEstimatorSpec` for details).  `metric_fn` takes the `tensors` and returns
-  a dict from metric string name to the result of calling a metric function,
-  namely a `(metric_tensor, update_op)` tuple.
-
-  One can set `use_tpu` to `False` for testing. All training, evaluation, and
-  predict will be executed on CPU. `input_fn` and `model_fn` will receive
-  `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`.
-
-  Current limitations:
-  --------------------
-
-  1. TPU evaluation only works on a single host (one TPU worker) except
-     BROADCAST mode.
-
-  2. `input_fn` for evaluation should **NOT** raise an end-of-input exception
-     (`OutOfRangeError` or `StopIteration`). And all evaluation steps and all
-     batches should have the same size.
-
-  Example (MNIST):
-  ----------------
-
-  ```
-  # The metric Fn which runs on CPU.
-  def metric_fn(labels, logits):
-    predictions = tf.argmax(logits, 1)
-    return {
-      'accuracy': tf.metrics.precision(
-          labels=labels, predictions=predictions),
-    }
-
-  # Your model Fn which runs on TPU (eval_metrics is list in this example)
-  def model_fn(features, labels, mode, config, params):
-    ...
-    logits = ...
-
-    if mode = tf.estimator.ModeKeys.EVAL:
-      return tpu_estimator.TPUEstimatorSpec(
-          mode=mode,
-          loss=loss,
-          eval_metrics=(metric_fn, [labels, logits]))
-
-  # or specify the eval_metrics tensors as dict.
-  def model_fn(features, labels, mode, config, params):
-    ...
-    final_layer_output = ...
-
-    if mode = tf.estimator.ModeKeys.EVAL:
-      return tpu_estimator.TPUEstimatorSpec(
-          mode=mode,
-          loss=loss,
-          eval_metrics=(metric_fn, {
-              'labels': labels,
-              'logits': final_layer_output,
-          }))
-  ```
-
-  Prediction
-  ==========
-
-  Prediction on TPU is an experimental feature to support large batch inference.
-  It is not designed for latency-critical system. In addition, due to some
-  usability issues, for prediction with small dataset, CPU `.predict`, i.e.,
-  creating a new `TPUEstimator` instance with `use_tpu=False`, might be more
-  convenient.
-
-  Note: In contrast to TPU training/evaluation, the `input_fn` for prediction
-  *should* raise an end-of-input exception (`OutOfRangeError` or
-  `StopIteration`), which serves as the stopping signal to `TPUEstimator`. To be
-  precise, the ops created by `input_fn` produce one batch of the data.
-  The `predict()` API processes one batch at a time. When reaching the end of
-  the data source, an end-of-input exception should be raised by one of these
-  operations. The user usually does not need to do this manually. As long as the
-  dataset is not repeated forever, the `tf.data` API will raise an end-of-input
-  exception automatically after the last batch has been produced.
-
-  Note: Estimator.predict returns a Python generator. Please consume all the
-  data from the generator so that TPUEstimator can shutdown the TPU system
-  properly for user.
-
-  Current limitations:
-  --------------------
-  1. TPU prediction only works on a single host (one TPU worker).
-
-  2. `input_fn` must return a `Dataset` instance rather than `features`. In
-  fact, .train() and .evaluate() also support Dataset as return value.
-
-  Example (MNIST):
-  ----------------
-  ```
-  height = 32
-  width = 32
-  total_examples = 100
-
-  def predict_input_fn(params):
-    batch_size = params['batch_size']
-
-    images = tf.random_uniform(
-        [total_examples, height, width, 3], minval=-1, maxval=1)
-
-    dataset = tf.data.Dataset.from_tensor_slices(images)
-    dataset = dataset.map(lambda images: {'image': images})
-
-    dataset = dataset.batch(batch_size)
-    return dataset
-
-  def model_fn(features, labels, params, mode):
-     # Generate predictions, called 'output', from features['image']
-
-    if mode == tf.estimator.ModeKeys.PREDICT:
-      return tf.contrib.tpu.TPUEstimatorSpec(
-          mode=mode,
-          predictions={
-              'predictions': output,
-              'is_padding': features['is_padding']
-          })
-
-  tpu_est = TPUEstimator(
-      model_fn=model_fn,
-      ...,
-      predict_batch_size=16)
-
-  # Fully consume the generator so that TPUEstimator can shutdown the TPU
-  # system.
-  for item in tpu_est.predict(input_fn=input_fn):
-    # Filter out item if the `is_padding` is 1.
-    # Process the 'predictions'
-  ```
-
-  Exporting
-  =========
-
-  `export_savedmodel` exports 2 metagraphs, one with `tag_constants.SERVING`,
-  and another with `tag_constants.SERVING` and `tag_constants.TPU`.
-  At serving time, these tags are used to select metagraph to load.
-
-  Before running the graph on TPU, TPU system needs to be initialized. If
-  TensorFlow Serving model-server is used, this is done automatically. If
-  not, please call `session.run(tpu.initialize_system())`.
-
-  `tpu.outside_compilation` can be used to wrap TPU incompatible ops in
-  `model_fn`.
-
-  Example:
-  ----------------
-
-  ```
-  def model_fn(features, labels, mode, config, params):
-    ...
-    logits = ...
-    export_outputs = {
-      'logits': export_output_lib.PredictOutput(
-        {'logits': logits})
-    }
-
-    def host_call(logits):
-      class_ids = math_ops.argmax(logits)
-      classes = string_ops.as_string(class_ids)
-      export_outputs['classes'] =
-        export_output_lib.ClassificationOutput(classes=classes)
-
-    tpu.outside_compilation(host_call, logits)
-
-    ...
-  ```
-
-  """
-
-  def __init__(self,
-               model_fn=None,
-               model_dir=None,
-               config=None,
-               params=None,
-               use_tpu=True,
-               train_batch_size=None,
-               eval_batch_size=None,
-               predict_batch_size=None,
-               batch_axis=None,
-               eval_on_tpu=True,
-               export_to_tpu=True,
-               export_to_cpu=True,
-               warm_start_from=None,
-               experimental_exported_model_uses_all_cores=False,
-               experimental_export_device_assignment=False,
-               experimental_embedding_config_spec=None):
-    """Constructs an `TPUEstimator` instance.
-
-    Args:
-      model_fn: Model function as required by `Estimator` which returns
-        EstimatorSpec or TPUEstimatorSpec. `training_hooks`, 'evaluation_hooks',
-        and `prediction_hooks` must not capure any TPU Tensor inside the
-        model_fn.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model. If `None`, the model_dir in
-        `config` will be used if set. If both are set, they must be same. If
-        both are `None`, a temporary directory will be used.
-      config: An `tpu_config.RunConfig` configuration object. Cannot be `None`.
-      params: An optional `dict` of hyper parameters that will be passed into
-        `input_fn` and `model_fn`.  Keys are names of parameters, values are
-        basic python types. There are reserved keys for `TPUEstimator`,
-        including 'batch_size'.
-      use_tpu: A bool indicating whether TPU support is enabled. Currently, -
-        TPU training and evaluation respect this bit, but eval_on_tpu can
-        override execution of eval. See below. - Predict still happens on CPU.
-      train_batch_size: An int representing the global training batch size.
-        TPUEstimator transforms this global batch size to a per-shard batch
-        size, as params['batch_size'], when calling `input_fn` and `model_fn`.
-        Cannot be `None` if `use_tpu` is `True`. Must be divisible by total
-        number of replicas.
-      eval_batch_size: An int representing evaluation batch size. Must be
-        divisible by total number of replicas.
-      predict_batch_size: An int representing the prediction batch size. Must be
-        divisible by total number of replicas.
-      batch_axis: A python tuple of int values describing how each tensor
-        produced by the Estimator `input_fn` should be split across the TPU
-        compute shards. For example, if your input_fn produced (images, labels)
-        where the images tensor is in `HWCN` format, your shard dimensions would
-        be [3, 0], where 3 corresponds to the `N` dimension of your images
-        Tensor, and 0 corresponds to the dimension along which to split the
-        labels to match up with the corresponding images. If None is supplied,
-        and per_host_input_for_training is True, batches will be sharded based
-        on the major dimension. If tpu_config.per_host_input_for_training is
-        False or `PER_HOST_V2`, batch_axis is ignored.
-      eval_on_tpu: If False, evaluation runs on CPU or GPU. In this case, the
-        model_fn must return `EstimatorSpec` when called with `mode` as `EVAL`.
-      export_to_tpu: If True, `export_savedmodel()` exports a metagraph for
-        serving on TPU. Note that unsupported export modes such as EVAL will be
-        ignored. For those modes, only a CPU model will be exported.
-        Currently, export_to_tpu only supports PREDICT.
-      export_to_cpu: If True, `export_savedmodel()` exports a metagraph for
-        serving on CPU.
-      warm_start_from: Optional string filepath to a checkpoint or SavedModel to
-        warm-start from, or a `tf.estimator.WarmStartSettings` object to fully
-        configure warm-starting.  If the string filepath is provided instead of
-        a `WarmStartSettings`, then all variables are warm-started, and it is
-        assumed that vocabularies and Tensor names are unchanged.
-      experimental_exported_model_uses_all_cores: Whether to round-robin among
-        all cores visible to the host which is serving the saved model, or to
-        use a single core. This is a temporary flag to enable using all TPU
-        cores for inference with TPUPartitionedCall(). Once outside compilation
-        is supported in TPUPartitionedCall(), this flag will be enabled by
-        default.
-      experimental_export_device_assignment: Whether to include the device
-        assignment in the exported model. Doing so is useful in case of model
-        parallel inference but will tie the exported model to the TPU topology
-        used to export the model.
-      experimental_embedding_config_spec: Optional EmbeddingConfigSpec instance
-        to support using TPU embedding. IT IS STILL WORK IN PROGRESS, SO PLEASE
-        DO NOT USE.
-
-    Raises:
-      ValueError: `params` has reserved keys already.
-    """
-    if config is None or not isinstance(config, tpu_config.RunConfig):
-      raise ValueError(
-          '`config` must be provided with type `tpu_config.RunConfig`')
-
-    if params is not None and any(k in params for k in _RESERVED_PARAMS_KEYS):
-      raise ValueError('{} are reserved keys but existed in params {}.'.format(
-          _RESERVED_PARAMS_KEYS, params))
-
-    if use_tpu:
-      # Perform some very basic validations. More validations will be found in
-      # _InternalTPUContext.
-      if train_batch_size is None:
-        raise ValueError('`train_batch_size` cannot be `None`')
-      util_lib.check_positive_integer(train_batch_size, 'train_batch_size')
-
-      if (config.tpu_config.per_host_input_for_training is
-          tpu_config.InputPipelineConfig.PER_SHARD_V1 and
-          config.tpu_config.num_cores_per_replica):
-        raise ValueError(
-            'Model parallelism only supports per host input for training. '
-            'Please adjust TPURunconfig.per_host_input_for_training.')
-
-      if eval_batch_size is not None:
-        util_lib.check_positive_integer(eval_batch_size, 'eval_batch_size')
-
-      if predict_batch_size is not None:
-        util_lib.check_positive_integer(predict_batch_size,
-                                        'predict_batch_size')
-
-    # Verifies the model_fn signature according to Estimator framework.
-    estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
-    # We cannot store config and params in this constructor as parent
-    # constructor might change them, such as assigning a temp dir for
-    # config.model_dir.
-    model_function = self._augment_model_fn(model_fn, batch_axis)
-
-    # Overwrite log_step_count_steps to disable TensorLoggingHook and
-    # StepCounterHook from being created in Estimator. TPUEstimator already
-    # added equivalent hooks in _augment_model_fn above.
-    self._log_every_n_steps = config.log_step_count_steps
-    config = config.replace(log_step_count_steps=None)
-
-    # Passing non-None params as wrapped model_fn has it.
-    params = params or {}
-    super(TPUEstimator, self).__init__(
-        model_fn=model_function,
-        model_dir=model_dir,
-        config=config,
-        params=params,
-        warm_start_from=warm_start_from)
-    self._iterations_per_training_loop = (
-        self._config.tpu_config.iterations_per_loop)
-
-    # All properties passed to _InternalTPUContext are immutable.
-    # pylint: disable=protected-access
-    self._ctx = tpu_context._get_tpu_context(
-        self._config, train_batch_size, eval_batch_size, predict_batch_size,
-        use_tpu, eval_on_tpu, experimental_embedding_config_spec)
-
-    self._export_to_cpu = export_to_cpu
-    self._export_to_tpu = export_to_tpu
-    self._experimental_exported_model_uses_all_cores = (
-        experimental_exported_model_uses_all_cores)
-    self._experimental_export_device_assignment = (
-        experimental_export_device_assignment)
-    if (experimental_exported_model_uses_all_cores and
-        experimental_export_device_assignment):
-      raise ValueError('experimental_exported_model_uses_all_cores and '
-                       'experimental_export_device_assignment is not supported '
-                       'at the same time.')
-
-    self._is_input_fn_invoked = None
-    self._rendezvous = {}
-
-  def _add_meta_graph_for_mode(self,
-                               builder,
-                               input_receiver_fn_map,
-                               checkpoint_path,
-                               save_variables=True,
-                               mode=model_fn_lib.ModeKeys.PREDICT,
-                               export_tags=None,
-                               check_variables=True):
-    if self._export_to_tpu and mode != model_fn_lib.ModeKeys.PREDICT:
-      logging.warning('TPUEstimator only handles mode PREDICT for exporting '
-                      'when `export_to_tpu` is `True`; Mode {} will be ignored '
-                      'for TPU.'.format(mode))
-
-    if not self._export_to_cpu and not self._export_to_tpu:
-      raise ValueError('One of export_to_cpu and export_to_tpu must be true.')
-
-    if self._export_to_cpu:
-      (super(TPUEstimator, self)._add_meta_graph_for_mode(
-          builder,
-          input_receiver_fn_map,
-          checkpoint_path,
-          save_variables,
-          mode=mode,
-          export_tags=export_tags,
-          check_variables=check_variables))
-
-    if self._export_to_tpu and mode == model_fn_lib.ModeKeys.PREDICT:
-      input_receiver_fn_map = {
-          _REWRITE_FOR_INFERENCE_MODE: input_receiver_fn_map[mode]
-      }
-      export_tags = [tag_constants.SERVING, tag_constants.TPU]
-      mode = _REWRITE_FOR_INFERENCE_MODE
-
-      # See b/110052256 for why `check_variables` is `False`.
-      if not self._export_to_cpu:
-        check_variables = save_variables = True
-      else:
-        check_variables = save_variables = False
-      (super(TPUEstimator, self)._add_meta_graph_for_mode(
-          builder,
-          input_receiver_fn_map,
-          checkpoint_path,
-          save_variables=save_variables,
-          mode=mode,
-          export_tags=export_tags,
-          check_variables=check_variables))
-
-  def _call_model_fn(self, features, labels, mode, config):
-    if mode == _REWRITE_FOR_INFERENCE_MODE:
-      return self._call_model_fn_for_inference(features, labels, mode, config)
-    else:
-      return super(TPUEstimator, self)._call_model_fn(features, labels, mode,
-                                                      config)
-
-  def _call_model_fn_for_inference(self, features, labels, mode, config):
-    """Wraps `_call_model_fn` for `export_savedmodel`."""
-    if mode != _REWRITE_FOR_INFERENCE_MODE:
-      raise ValueError('mode must be {}; '
-                       'got {}.'.format(_REWRITE_FOR_INFERENCE_MODE, mode))
-
-    computation, capture = self._build_computation_for_inference(
-        features, labels, mode, config)
-    tensors = call_computation(
-        computation,
-        experimental_exported_model_uses_all_cores=self
-        ._experimental_exported_model_uses_all_cores)
-    estimator_spec, export_outputs_dict, predictions_dict, none_indices = (
-        capture.get())
-    predictions_list = tensors[:len(predictions_dict)]
-    export_outputs_list_without_none = tensors[len(predictions_dict):]
-
-    # Reinsert `None`s which we've taken out in
-    # `_build_computation_for_inference()`.
-    export_outputs_list = []
-    while none_indices or export_outputs_list_without_none:
-      if none_indices and none_indices[0] == len(export_outputs_list):
-        export_outputs_list.append(None)
-        none_indices.pop(0)
-      else:
-        export_outputs_list.append(export_outputs_list_without_none.pop(0))
-
-    # Reconstruct `export_outputs` with updated tensors.
-    new_export_outputs_dict = nest.pack_sequence_as(export_outputs_dict,
-                                                    export_outputs_list)
-    export_outputs = estimator_spec.export_outputs
-    new_export_outputs = collections.OrderedDict(
-        (k, _clone_export_output_with_tensors(export_outputs[k], v))
-        for k, v in six.iteritems(new_export_outputs_dict))
-    # Reconstruct `predictions` with updated tensors.
-    new_predictions = nest.pack_sequence_as(predictions_dict, predictions_list)
-    if (len(new_predictions) == 1 and
-        _KEY_WHEN_PREDICTIONS_IS_A_TENSOR in new_predictions):
-      new_predictions = new_predictions[_KEY_WHEN_PREDICTIONS_IS_A_TENSOR]
-
-    return estimator_spec._replace(
-        export_outputs=new_export_outputs, predictions=new_predictions)
-
-  def _build_computation_for_inference(self, features, labels, mode, config):
-    capture = _CapturedObject()
-
-    def computation():
-      """Computation to be passed to `TPUPartitionedCall()`."""
-      tpu_computation, tpu_capture = self._build_tpu_computation_for_inference(
-          features, labels, mode, config)
-
-      if self._experimental_export_device_assignment:
-        # Export the device assignment as part of the model. This is useful for
-        # model parallel usecases where the model relies on the mapping between
-        # logical and physical devices.
-        with self._ctx.with_mode(mode) as ctx:
-          device_assignment = ctx.device_assignment
-      else:
-        device_assignment = None
-      tensors_on_cpu = tpu.rewrite_for_inference(
-          tpu_computation, device_assignment=device_assignment)
-      (estimator_spec, export_outputs_dict, export_outputs_list,
-       predictions_dict) = (
-           tpu_capture.get())
-      predictions_list = tensors_on_cpu[:len(predictions_dict)]
-      export_outputs_tpu_on_cpu_list = tensors_on_cpu[len(predictions_dict):]
-
-      # Reconstruct tensors used in export_outputs, with TPU tensors replaced
-      # with their CPU counterpart returned from `rewrite_for_inference()`.
-      # `function.Defun()` does not like `None`s in return values, so we leave
-      # `None`s out but record their positions for later reconstruction.
-      export_outputs_list_without_none = []
-      none_indices = []
-      for i, t in enumerate(export_outputs_list):
-        if t is None:
-          none_indices.append(i)
-        else:
-          export_outputs_list_without_none.append(
-              export_outputs_tpu_on_cpu_list.pop(0))
-
-      capture.capture((estimator_spec, export_outputs_dict, predictions_dict,
-                       none_indices))
-      return predictions_list + export_outputs_list_without_none
-
-    return computation, capture
-
-  def _build_tpu_computation_for_inference(self, features, labels, mode,
-                                           config):
-    capture = _CapturedObject()
-
-    def computation():
-      """Compute tpu tensors used in export_outputs.
-
-      Passed to rewrite_for_inference so that model_fn will be called under
-      the rewriting contexts. Only tpu tensors are returned, but export_outputs
-      and scaffold are captured.
-
-      Returns:
-         A list of Tensors used in export_outputs and not marked for
-         outside_compilation.
-      """
-      # We should only call model fn once and it should be inside `computation`
-      # so that building the graph will happen under `rewrite_for_inference`.
-      mode = model_fn_lib.ModeKeys.PREDICT
-      estimator_spec = self._call_model_fn(features, labels, mode, config)
-
-      # We pick the TPU tensors out from `export_output` and later return them
-      # from `computation` for rewriting.
-      export_outputs_dict = collections.OrderedDict(
-          (k, _export_output_to_tensors(v))
-          for k, v in six.iteritems(estimator_spec.export_outputs))
-      export_outputs_list = nest.flatten(export_outputs_dict)
-      export_outputs_tpu_list = [
-          t for t in export_outputs_list if t is not None
-      ]
-
-      if isinstance(estimator_spec.predictions, dict):
-        predictions_dict = collections.OrderedDict(
-            (k, v) for k, v in six.iteritems(estimator_spec.predictions))
-      else:
-        predictions_dict = {
-            _KEY_WHEN_PREDICTIONS_IS_A_TENSOR: estimator_spec.predictions
-        }
-      predictions_list = nest.flatten(predictions_dict)
-
-      # We cannot return everything we want through the return values, so
-      # capture the rest here for later use.
-      capture.capture((estimator_spec, export_outputs_dict, export_outputs_list,
-                       predictions_dict))
-      return predictions_list + export_outputs_tpu_list
-
-    return computation, capture
-
-  def _create_global_step(self, graph):
-    """Creates a global step suitable for TPUs.
-
-    Args:
-      graph: The graph in which to create the global step.
-
-    Returns:
-      A global step `Tensor`.
-
-    Raises:
-      ValueError: if the global step tensor is already defined.
-    """
-    return _create_global_step(graph)
-
-  def _convert_train_steps_to_hooks(self, steps, max_steps):
-    with self._ctx.with_mode(model_fn_lib.ModeKeys.TRAIN) as ctx:
-      if ctx.is_running_on_cpu():
-        return super(TPUEstimator, self)._convert_train_steps_to_hooks(
-            steps, max_steps)
-
-    # On TPU.
-    if steps is None and max_steps is None:
-      raise ValueError(
-          'For TPU training, one of `steps` or `max_steps` must be set. '
-          'Cannot be both `None`.')
-
-    # Estimator.train has explicit positiveness check.
-    if steps is not None:
-      util_lib.check_positive_integer(steps, 'Train steps')
-    if max_steps is not None:
-      util_lib.check_positive_integer(max_steps, 'Train max_steps')
-
-    return [
-        _TPUStopAtStepHook(self._iterations_per_training_loop, steps, max_steps)
-    ]
-
-  def _convert_eval_steps_to_hooks(self, steps):
-    with self._ctx.with_mode(model_fn_lib.ModeKeys.EVAL) as ctx:
-      if ctx.is_running_on_cpu():
-        return super(TPUEstimator, self)._convert_eval_steps_to_hooks(steps)
-
-    if steps is None:
-      raise ValueError('Evaluate `steps` must be set on TPU. Cannot be `None`.')
-
-    util_lib.check_positive_integer(steps, 'Eval steps')
-
-    return [
-        evaluation._StopAfterNEvalsHook(  # pylint: disable=protected-access
-            num_evals=steps),
-        _SetEvalIterationsHook(steps)
-    ]
-
-  def _call_input_fn(self, input_fn, mode):
-    """Calls the input function.
-
-    Args:
-      input_fn: The input function.
-      mode: ModeKeys
-
-    Returns:
-      In TPU mode, returns an input_fn to be called later in model_fn.
-      Otherwise, calls the input_fn and returns either fatures or
-        (features, labels).
-
-    Raises:
-      ValueError: if input_fn takes invalid arguments or does not have `params`.
-    """
-    input_fn_args = function_utils.fn_args(input_fn)
-    config = self.config  # a deep copy.
-    kwargs = {}
-    if 'params' in input_fn_args:
-      kwargs['params'] = self.params  # a deep copy.
-    else:
-      raise ValueError('input_fn ({}) does not include params argument, '
-                       'required by TPUEstimator to pass batch size as '
-                       'params["batch_size"]'.format(input_fn))
-    if 'config' in input_fn_args:
-      kwargs['config'] = config
-
-    if 'mode' in input_fn_args:
-      kwargs['mode'] = mode
-
-    # Records the fact input_fn has been invoked.
-    self._is_input_fn_invoked = True
-
-    with self._ctx.with_mode(mode) as ctx:
-      # Setting the batch size in params first. This helps user to have same
-      # input_fn for use_tpu=True/False.
-      batch_size_for_input_fn = ctx.batch_size_for_input_fn
-      if batch_size_for_input_fn is not None:
-        _add_item_to_params(kwargs['params'], _BATCH_SIZE_KEY,
-                            batch_size_for_input_fn)
-
-      # For export_savedmodel, input_fn is never passed to Estimator. So,
-      # `is_export_mode` must be False.
-      if ctx.is_running_on_cpu(is_export_mode=False):
-        with ops.device('/device:CPU:0'):
-          return input_fn(**kwargs)
-
-      # For TPU computation, input_fn should be invoked in a tf.while_loop for
-      # performance. While constructing the tf.while_loop, the structure of
-      # inputs returned by the `input_fn` needs to be recorded. The structure
-      # includes whether features or labels is dict or single Tensor, dict keys,
-      # tensor shapes, and dtypes. The recorded structure is used to create the
-      # infeed dequeue ops, which must be wrapped and passed as a Fn, called
-      # inside the TPU computation, as the TPU computation is wrapped inside a
-      # tf.while_loop also. So, we either pass input_fn to model_fn or pass
-      # dequeue_fn to model_fn. Here, `input_fn` is passed directly as
-      # `features` in `model_fn` signature.
-      def _input_fn(ctx):
-        _add_item_to_params(kwargs['params'], _CTX_KEY, ctx)
-        return input_fn(**kwargs)
-
-      return _input_fn
-
-  def _validate_features_in_predict_input(self, result):
-    """Skip the validation.
-
-    For TPUEstimator, we do not need to check the result type. `_InputPipeline`
-    has stronger check. Parent class's check generates confusing warning msg.
-
-    Args:
-      result: `features` returned by input_fn.
-    """
-    pass
-
-  def train(self,
-            input_fn,
-            hooks=None,
-            steps=None,
-            max_steps=None,
-            saving_listeners=None):
-    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
-    self._rendezvous[model_fn_lib.ModeKeys.TRAIN] = rendezvous
-    try:
-      return super(TPUEstimator, self).train(
-          input_fn=input_fn,
-          hooks=hooks,
-          steps=steps,
-          max_steps=max_steps,
-          saving_listeners=saving_listeners)
-    except Exception:  # pylint: disable=broad-except
-      rendezvous.record_error('training_loop', sys.exc_info())
-    finally:
-      rendezvous.record_done('training_loop')
-      rendezvous.raise_errors()
-
-  def evaluate(self,
-               input_fn,
-               steps=None,
-               hooks=None,
-               checkpoint_path=None,
-               name=None):
-    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
-    self._rendezvous[model_fn_lib.ModeKeys.EVAL] = rendezvous
-    try:
-      return super(TPUEstimator, self).evaluate(
-          input_fn,
-          steps=steps,
-          hooks=hooks,
-          checkpoint_path=checkpoint_path,
-          name=name)
-    except Exception:  # pylint: disable=broad-except
-      rendezvous.record_error('evaluation_loop', sys.exc_info())
-    finally:
-      rendezvous.record_done('evaluation_loop')
-      rendezvous.raise_errors()
-
-  def predict(self,
-              input_fn,
-              predict_keys=None,
-              hooks=None,
-              checkpoint_path=None,
-              yield_single_examples=True):
-    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
-    self._rendezvous[model_fn_lib.ModeKeys.PREDICT] = rendezvous
-    try:
-      for result in super(TPUEstimator, self).predict(
-          input_fn=input_fn,
-          predict_keys=predict_keys,
-          hooks=hooks,
-          checkpoint_path=checkpoint_path,
-          yield_single_examples=yield_single_examples):
-        yield result
-    except Exception:  # pylint: disable=broad-except
-      rendezvous.record_error('prediction_loop', sys.exc_info())
-    finally:
-      rendezvous.record_done('prediction_loop')
-      rendezvous.raise_errors()
-
-    rendezvous.record_done('prediction_loop')
-    rendezvous.raise_errors()
-
-  def _augment_model_fn(self, model_fn, batch_axis):
-    """Returns a new model_fn, which wraps the TPU support."""
-
-    def _model_fn(features, labels, mode, config, params):
-      """A Estimator `model_fn` for TPUEstimator."""
-      with self._ctx.with_mode(mode) as ctx:
-        model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx)
-
-        # `input_fn` is called in `train()`, `evaluate()`, and `predict()`,
-        # but not in `export_savedmodel()`.
-        if self._is_input_fn_invoked:
-          is_export_mode = False
-        else:
-          is_export_mode = True
-
-        # Clear the bit.
-        self._is_input_fn_invoked = None
-
-        # examples_hook is added to training_hooks for both CPU and TPU
-        # execution.
-        if self._log_every_n_steps is not None:
-          examples_hook = ExamplesPerSecondHook(
-              ctx.global_batch_size,
-              # pylint:disable=g-long-ternary
-              output_dir=(self.model_dir
-                          if not config or config.save_summary_steps
-                          else None),
-              # pylint:enable=g-long-ternary
-              every_n_steps=self._log_every_n_steps)
-
-        if ctx.is_running_on_cpu(is_export_mode=is_export_mode):
-          logging.info('Running %s on CPU', mode)
-          estimator_spec = model_fn_wrapper.call_without_tpu(
-              features, labels, is_export_mode=is_export_mode)
-          if self._log_every_n_steps is not None:
-            estimator_spec = estimator_spec._replace(
-                training_hooks=estimator_spec.training_hooks + (examples_hook,))
-          return estimator_spec
-
-        assert labels is None, '`labels` passed to `model_fn` must be `None`.'
-        # TPUEstimator._call_input_fn passes `input_fn` as features to here.
-        assert callable(features), '`input_fn` is not callable.'
-        input_fn = features
-
-        tpu_init_ops = []
-        if ctx.embedding_config:
-          tpu_init_ops.extend(ctx.embedding_config.tpu_embedding.init_ops)
-          embedding_variables_and_ops = (
-              ctx.embedding_config.tpu_embedding.create_variables_and_ops())
-          tpu_init_ops.extend(embedding_variables_and_ops.load_ops)
-
-        input_holders = _InputPipeline(input_fn, batch_axis, ctx)
-        enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
-            input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
-
-        graph = ops.get_default_graph()
-        for enqueue_op in enqueue_ops:
-          if isinstance(enqueue_op, list):
-            graph.get_collection_ref(_TPU_ENQUEUE_OPS).extend(enqueue_op)
-          else:
-            graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op)
-
-        if mode == model_fn_lib.ModeKeys.TRAIN:
-          compile_op, loss, host_call, scaffold, training_hooks = (
-              _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
-          host_ops = host_call.create_tpu_hostcall()
-          if host_ops is None:
-            host_ops = []
-
-          shutdown_hooks = []
-          shutdown_mode = os.environ.get('TF_TPU_GRACEFUL_SHUTDOWN_MODE',
-                                         'shutdown_worker')
-          if shutdown_mode:
-            if shutdown_mode == 'shutdown_worker':
-              finalizer_hooks = [
-                  session_support.ShutdownLameWorkers(timeout_ms=60 * 1000),
-              ]
-            elif shutdown_mode == 'shutdown_computation':
-              finalizer_hooks = [
-                  session_support.RestartComputation(timeout_ms=60 * 1000),
-              ]
-            else:
-              raise ValueError(
-                  'Unknown TF_TPU_GRACEFUL_SHUTDOWN_MODE "%s"' % shutdown_mode)
-
-            shutdown_hooks.append(
-                session_support.GracefulShutdownHook(
-                    checkpoint_prefix=self.model_dir + '/model.ckpt',
-                    on_shutdown_hooks=finalizer_hooks))
-
-          with ops.control_dependencies([loss]):
-            global_step = array_ops.identity(training.get_global_step())
-          hooks = input_hooks + shutdown_hooks
-          hooks.extend([
-              TPUInfeedOutfeedSessionHook(
-                  ctx,
-                  enqueue_ops,
-                  host_ops,
-                  tpu_compile_op=compile_op,
-                  run_infeed_loop_on_coordinator=(
-                      run_infeed_loop_on_coordinator),
-                  rendezvous=self._rendezvous[mode],
-                  master=self._config.master,
-                  session_config=self._session_config,
-                  tpu_init_ops=tpu_init_ops),
-              InstallSignalHandlerHook()
-          ])
-          if self._log_every_n_steps is not None:
-            logging_hook_frequency = (  # Divide and round up
-                (self._log_every_n_steps +
-                 self._config.tpu_config.iterations_per_loop - 1) //
-                self._config.tpu_config.iterations_per_loop)
-            hooks.append(
-                training.LoggingTensorHook({
-                    'loss': array_ops.identity(loss),
-                    'step': global_step,
-                },
-                                           every_n_iter=logging_hook_frequency))
-            examples_hook._set_steps_per_run(  # pylint: disable=protected-access
-                self._config.tpu_config.iterations_per_loop)
-            hooks.append(examples_hook)
-
-          if training_hooks:
-            hooks.extend(training_hooks)
-
-          chief_hooks = []
-          if (self._config.save_checkpoints_secs or
-              self._config.save_checkpoints_steps):
-            checkpoint_hook = training.CheckpointSaverHook(
-                self.model_dir,
-                save_secs=self._config.save_checkpoints_secs,
-                save_steps=self._config.save_checkpoints_steps,
-                scaffold=scaffold)
-            checkpoint_hook._set_steps_per_run(  # pylint: disable=protected-access
-                self._config.tpu_config.iterations_per_loop)
-            chief_hooks.append(checkpoint_hook)
-
-          summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
-          with ops.control_dependencies([loss]):
-            update_ops = _sync_variables_ops(ctx)
-
-          if ctx.embedding_config:
-            update_ops.extend(embedding_variables_and_ops.retrieve_ops)
-
-          # Validate the TPU training graph to catch basic errors
-          _validate_tpu_training_graph()
-
-          train_op = control_flow_ops.group(*update_ops)
-          graph.add_to_collection(_TPU_TRAIN_OP, train_op)
-
-          return model_fn_lib.EstimatorSpec(
-              mode,
-              loss=loss,
-              training_chief_hooks=chief_hooks,
-              training_hooks=hooks,
-              train_op=train_op,
-              scaffold=scaffold)
-
-        if mode == model_fn_lib.ModeKeys.EVAL:
-          compile_op, total_loss, host_calls, scaffold, eval_hooks = (
-              _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
-          iterations_per_loop_var = _create_or_get_iterations_per_loop()
-          mean_loss = math_ops.div(
-              total_loss,
-              math_ops.cast(iterations_per_loop_var, dtype=total_loss.dtype))
-
-          with ops.control_dependencies([mean_loss]):
-            # After TPU evaluation computation is done (the mean_loss tensor),
-            # reads all variables back from TPU and updates the eval step
-            # counter properly
-            internal_ops_to_run = _sync_variables_ops(ctx)
-            internal_ops_to_run.append(
-                _increase_eval_step_op(iterations_per_loop_var))
-
-          host_call_ret = host_calls.create_tpu_hostcall()
-          eval_metric_ops = {}
-          eval_update_ops = []
-
-          eval_metrics = host_call_ret.get('eval_metrics', {})
-          if eval_metrics:
-            # Creates a dummy metric update_op for all metrics. Estimator
-            # expects all metrics in `eval_metric_ops` have update_op and calls
-            # them one by one. The real metric update_ops are invoked in a
-            # separated thread. So, here give Estimator the dummy op for all
-            # metrics.
-            with ops.control_dependencies(internal_ops_to_run):
-              dummy_update_op = control_flow_ops.no_op()
-
-            for k, v in eval_metrics.items():
-              eval_metric_ops[k] = (v[0], dummy_update_op)
-              eval_update_ops.append(v[1])
-          else:
-            # If no eval metrics are passed, create an identity node for the
-            # loss and add `internal_ops_to_run` to its dependencies. So
-            # `internal_ops_to_run` can be executed.
-            with ops.control_dependencies(internal_ops_to_run):
-              mean_loss = array_ops.identity(mean_loss)
-
-          if 'host_call' not in host_call_ret:
-            host_ops = []
-          else:
-            host_ops = host_call_ret['host_call']
-          hooks = [
-              TPUInfeedOutfeedSessionHook(
-                  ctx,
-                  enqueue_ops,
-                  eval_update_ops + host_ops,
-                  tpu_compile_op=compile_op,
-                  run_infeed_loop_on_coordinator=(
-                      run_infeed_loop_on_coordinator),
-                  rendezvous=self._rendezvous[mode],
-                  master=self._config.evaluation_master,
-                  session_config=self._session_config,
-                  tpu_init_ops=tpu_init_ops)
-          ] + input_hooks
-
-          if eval_hooks:
-            hooks.extend(eval_hooks)
-
-          return model_fn_lib.EstimatorSpec(
-              mode,
-              loss=mean_loss,
-              evaluation_hooks=hooks,
-              eval_metric_ops=eval_metric_ops,
-              scaffold=scaffold)
-
-        # Predict
-        assert mode == model_fn_lib.ModeKeys.PREDICT
-
-        (compile_op, dummy_predict_op, host_calls,
-         scaffold, prediction_hooks) = _predict_on_tpu_system(
-             ctx, model_fn_wrapper, dequeue_fn)
-        with ops.control_dependencies([dummy_predict_op]):
-          internal_ops_to_run = _sync_variables_ops(ctx)
-          with ops.control_dependencies(internal_ops_to_run):
-            dummy_predict_op = control_flow_ops.no_op()
-
-        # In train and evaluation, the main TPU program is passed to monitored
-        # training session to run. Infeed enqueue and outfeed dequeue are
-        # executed in side threads. This is not the configuration for
-        # prediction mode.
-        #
-        # For prediction, the Estimator executes the EstimatorSpec.predictions
-        # directly and yield the element (via generator) to call site. So, the
-        # outfeed based prediction must be passed to MonitoredSession directly.
-        # Other parts of the TPU execution are organized as follows.
-        #
-        # 1. All outfeed based Tensors must be grouped with predictions Tensors
-        #    to form a single invocation. This avoid the issue we might trigger
-        #    multiple outfeeds incorrectly. To achieve this, `host_call` is
-        #    placed in control_dependencies of `stopping_signals`, and
-        #    `stopping_signals` is passed into _StoppingPredictHook, which sets
-        #    the `stopping_signals` as SessionRunArgs. MonitoredSession merges
-        #    all SessionRunArgs with the fetch in session.run together.
-        #
-        # 2. The TPU program (dummy_predict_op) and enqueue_ops (infeed Enqueue)
-        #    are grouped together. They will be launched once and only once in
-        #    side threads and they quit naturally according to the SAME stopping
-        #    condition.
-        enqueue_ops.append(dummy_predict_op)
-
-        host_call_ret = host_calls.create_tpu_hostcall()
-        if 'host_call' not in host_call_ret:
-          host_ops = []
-        else:
-          host_ops = host_call_ret['host_call']
-
-        predictions = host_call_ret['predictions']
-        _verify_cross_hosts_transfer_size(
-            predictions,
-            message=(
-                'The estimated size for TPUEstimatorSpec.predictions is too '
-                'large.'))
-        signals = host_call_ret['signals']
-
-        with ops.control_dependencies(host_ops):
-          host_ops = []  # Empty, we do do not need it anymore.
-          scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal(
-              signals)
-          predictions = _PaddingSignals.slice_tensor_or_dict(
-              predictions, signals)
-
-        hooks = [
-            _StoppingPredictHook(scalar_stopping_signal),
-            TPUInfeedOutfeedSessionHookForPrediction(
-                ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode],
-                tpu_compile_op=compile_op,
-                master=self._config.master,
-                session_config=self._session_config),
-        ] + input_hooks
-
-        if prediction_hooks:
-          hooks.extend(prediction_hooks)
-
-        return model_fn_lib.EstimatorSpec(
-            mode,
-            prediction_hooks=hooks,
-            predictions=predictions,
-            scaffold=scaffold)
-
-    return _model_fn
-
-
-def _export_output_to_tensors(export_output):
-  """Get a list of `Tensors` used in `export_output`.
-
-  Args:
-    export_output: an `ExportOutput` object such as `ClassificationOutput`,
-      `RegressionOutput`, or `PredictOutput`.
-
-  Returns:
-    a list of tensors used in export_output.
-
-  Raises:
-    ValueError: if `export_output` is not one of `ClassificationOutput`,
-        `RegressionOutput`, or `PredictOutput`.
-  """
-  if isinstance(export_output, export_output_lib.ClassificationOutput):
-    return [export_output.scores, export_output.classes]
-  elif isinstance(export_output, export_output_lib.RegressionOutput):
-    return [export_output.value]
-  elif isinstance(export_output, export_output_lib.PredictOutput):
-    return list(export_output.outputs.values())
-  else:
-    raise ValueError(
-        '`export_output` must be have type `ClassificationOutput`, '
-        '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output))
-
-
-def _clone_export_output_with_tensors(export_output, tensors):
-  """Clones `export_output` but with new `tensors`.
-
-  Args:
-    export_output: an `ExportOutput` object such as `ClassificationOutput`,
-      `RegressionOutput`, or `PredictOutput`.
-    tensors: a list of `Tensors` used to construct a new `export_output`.
-
-  Returns:
-    A dict similar to `export_output` but with `tensors`.
-
-  Raises:
-    ValueError: if `export_output` is not one of `ClassificationOutput`,
-        `RegressionOutput`, or `PredictOutput`.
-  """
-  if isinstance(export_output, export_output_lib.ClassificationOutput):
-    if len(tensors) != 2:
-      raise ValueError('tensors must be of length 2; '
-                       'got {}.'.format(len(tensors)))
-    return export_output_lib.ClassificationOutput(*tensors)
-  elif isinstance(export_output, export_output_lib.RegressionOutput):
-    if len(tensors) != 1:
-      raise ValueError('tensors must be of length 1; '
-                       'got {}'.format(len(tensors)))
-    return export_output_lib.RegressionOutput(*tensors)
-  elif isinstance(export_output, export_output_lib.PredictOutput):
-    return export_output_lib.PredictOutput(
-        dict(zip(export_output.outputs.keys(), tensors)))
-  else:
-    raise ValueError(
-        '`export_output` must be have type `ClassificationOutput`, '
-        '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output))
-
-
-def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
-  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  (single_tpu_eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
-  ) = model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn)
-
-  def multi_tpu_eval_steps_on_single_shard():
-    return training_loop.repeat(iterations_per_loop_var, single_tpu_eval_step,
-                                [_ZERO_LOSS])
-
-  (compile_op, loss,) = tpu.split_compile_and_shard(
-      multi_tpu_eval_steps_on_single_shard,
-      inputs=[],
-      num_shards=ctx.num_replicas,
-      outputs_from_all_shards=False,
-      device_assignment=ctx.device_assignment)
-
-  loss = loss[0]
-  scaffold = _get_scaffold(captured_scaffold_fn)
-  return compile_op, loss, host_calls, scaffold, captured_eval_hooks.get()
-
-
-def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
-  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  (single_tpu_train_step, host_call, captured_scaffold_fn,
-   captured_training_hooks) = (
-       model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
-
-  def multi_tpu_train_steps_on_single_shard():
-    return training_loop.repeat(iterations_per_loop_var, single_tpu_train_step,
-                                [_INITIAL_LOSS])
-
-  (compile_op, loss,) = tpu.split_compile_and_shard(
-      multi_tpu_train_steps_on_single_shard,
-      inputs=[],
-      num_shards=ctx.num_replicas,
-      outputs_from_all_shards=False,
-      device_assignment=ctx.device_assignment)
-
-  loss = loss[0]
-  scaffold = _get_scaffold(captured_scaffold_fn)
-  return compile_op, loss, host_call, scaffold, captured_training_hooks.get()
-
-
-def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
-  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  (single_tpu_predict_step, host_calls, captured_scaffold_fn,
-   captured_predict_hooks
-  ) = model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn)
-
-  def multi_tpu_predict_steps_on_single_shard():
-
-    def cond(scalar_stopping_signal):
-      return math_ops.logical_not(
-          _StopSignals.should_stop(scalar_stopping_signal))
-
-    inputs = [_StopSignals.NON_STOPPING_SIGNAL]
-    outputs = training_loop.while_loop(
-        cond, single_tpu_predict_step, inputs=inputs, name=b'loop')
-    return outputs
-
-  (compile_op, dummy_predict_op,) = tpu.split_compile_and_shard(
-      multi_tpu_predict_steps_on_single_shard,
-      inputs=[],
-      num_shards=ctx.num_replicas,
-      outputs_from_all_shards=False,
-      device_assignment=ctx.device_assignment)
-
-  dummy_predict_op = dummy_predict_op[0]
-  scaffold = _get_scaffold(captured_scaffold_fn)
-  return (compile_op, dummy_predict_op, host_calls, scaffold,
-          captured_predict_hooks.get())
-
-
-def _wrap_computation_in_while_loop(device, op_fn):
-  """Wraps the ops generated by `op_fn` in tf.while_loop."""
-
-  def computation(i):
-    with ops.control_dependencies(op_fn()):
-      return i + 1
-
-  iterations_per_loop_var = _create_or_get_iterations_per_loop()
-  # By setting parallel_iterations=1, the parallel execution in while_loop is
-  # basically turned off.
-  with ops.device(device):
-    iterations = array_ops.identity(iterations_per_loop_var)
-    return control_flow_ops.while_loop(
-        lambda i: i < iterations,
-        computation, [constant_op.constant(0)],
-        parallel_iterations=1)
-
-
-def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn):
-  """Wraps the ops generated by `op_fn` in tf.while_loop."""
-
-  def cond(scalar_stopping_signal):
-    return math_ops.logical_not(
-        _StopSignals.should_stop(scalar_stopping_signal))
-
-  def computation(unused_scalar_stopping_signal):
-    return_value = op_fn()
-    execute_ops = return_value['ops']
-    signals = return_value['signals']
-    with ops.control_dependencies(execute_ops):
-      return _StopSignals.as_scalar_stopping_signal(signals)
-
-  # By setting parallel_iterations=1, the parallel execution in while_loop is
-  # basically turned off.
-  with ops.device(device):
-    return control_flow_ops.while_loop(
-        cond,
-        computation, [_StopSignals.NON_STOPPING_SIGNAL],
-        parallel_iterations=1)
-
-
-def _validate_tpu_training_graph():
-  """Validate graph before running distributed training.
-
-  Raises:
-    ValueError: If the graph seems invalid for running on device
-  """
-  operations = ops.get_default_graph().get_operations()
-
-  # Check if there is atleast one CrossReplicaSum operation in the graph
-  # This should be introduced by using the CrossShardOptimizer wrapper
-  cross_replica_sum_ops = [
-      o for o in operations if o.type == _CROSS_REPLICA_SUM_OP
-  ]
-  if not cross_replica_sum_ops:
-    raise ValueError(
-        'CrossShardOptimizer must be used for model training on TPUs.')
-
-
-class _CapturedObject(object):
-  """A placeholder to capture an object.
-
-  This is useful when we need to capture a Python object in the Tensorflow
-  control flow body function and use it outside the control flow.
-  """
-
-  def __init__(self):
-    self._object = None
-    self._captured = False
-
-  def capture(self, o):
-    if self._captured:
-      raise RuntimeError(
-          'InternalError: Object can capture only once. Please file bug.')
-
-    self._captured = True
-    self._object = o
-
-  def get(self):
-    if not self._captured:
-      raise RuntimeError(
-          'InternalError: Object is not captured properly before `get`. '
-          'Please file bug.')
-    return self._object
-
-
-def _get_scaffold(captured_scaffold_fn):
-  """Retrieves the Scaffold from `captured_scaffold_fn`."""
-  with _CapturingContext(message='Inside scaffold_fn'):
-    scaffold_fn = captured_scaffold_fn.get()
-    if scaffold_fn:
-      scaffold = scaffold_fn()
-      if scaffold is None:
-        raise ValueError(
-            'TPUEstimatorSpec.scaffold_fn returns None, which is not allowed')
-    else:
-      scaffold = None
-
-  if scaffold:
-    wrapped_finalize = scaffold.finalize
-
-    def _finalize():
-      with _CapturingContext('Inside Scaffold.finalize'):
-        wrapped_finalize()
-
-    scaffold.finalize = _finalize
-  return scaffold
-
-
-class _CapturingContext(control_flow_ops.ControlFlowContext):
-  """Tracks references to Tensors defined in TPU replication."""
-
-  def __init__(self, message):
-    control_flow_ops.ControlFlowContext.__init__(self)
-    self._message = message
-
-  def to_control_flow_context_def(self, context_def, export_scope=None):
-    # pylint: disable=useless-super-delegation
-    # NOTE(slebedev): the method is required by `ControlFlowContext`.
-    super(_CapturingContext, self).to_control_flow_context_def(
-        context_def, export_scope)
-
-  def AddOp(self, op):  # pylint: disable=invalid-name
-    for c in op.inputs:
-      if tpu._TPU_REPLICATE_ATTR in c.op.node_def.attr:  # pylint: disable=protected-access
-        raise ValueError('{}: Op {} depends on TPU computation {}, '
-                         'which is not allowed.'.format(self._message, op, c))
-
-  def __enter__(self):
-    # pylint: disable=protected-access
-    self._g = ops.get_default_graph()
-    self._old = self._g._get_control_flow_context()
-    self._g._set_control_flow_context(self)
-    # pylint: enable=protected-access
-
-  def __exit__(self, _, __, ___):  # pylint: disable=invalid-name
-    self._g._set_control_flow_context(self._old)  # pylint: disable=protected-access
-
-
-class _Inputs(object):
-  """A data structure representing the input_fn returned values.
-
-  This also supports the returned value from input_fn as `Dataset`.
-  """
-
-  def __init__(self, features=None, labels=None, dataset=None, signals=None):
-    if dataset is not None and (features is not None or labels is not None or
-                                signals is not None):
-      raise RuntimeError('Internal Error: Either (features and labels) or '
-                         'dataset should be provided, not both. Please file '
-                         'bug')
-
-    self._features = features
-    self._labels = labels
-    self._signals = signals
-
-    self._dataset = dataset
-    self._iterator = None
-
-  @staticmethod
-  def from_input_fn(return_values):
-    """Returns an `_Inputs` instance according to `input_fn` return value."""
-    if isinstance(return_values, dataset_ops.DatasetV2):
-      dataset = return_values
-      return _Inputs(dataset=dataset)
-
-    features, labels = _Inputs._parse_inputs(return_values)
-    return _Inputs(features, labels)
-
-  @staticmethod
-  def _parse_inputs(return_values):
-    if isinstance(return_values, tuple):
-      features, labels = return_values
-    else:
-      features, labels = return_values, None
-    return features, labels
-
-  @property
-  def is_dataset(self):
-    """Returns True if the return value from input_fn is Dataset."""
-    return self._dataset is not None
-
-  def dataset_initializer(self):
-    """Returns the dataset's initializer.
-
-    The initializer must be run before calling `features_and_labels`.
-    """
-    self._iterator = dataset_ops.make_initializable_iterator(self._dataset)
-    return self._iterator.initializer
-
-  def features_and_labels(self):
-    """Gets `features` and `labels`."""
-    if self.is_dataset:
-      if self._iterator is None:
-        raise RuntimeError('Internal error: Must run dataset_initializer '
-                           'before calling features_and_labels(). Please file '
-                           'a bug!')
-      return _Inputs._parse_inputs(self._iterator.get_next())
-
-    return (self._features, self._labels)
-
-  def signals(self):
-    return self._signals
-
-  @property
-  def dataset(self):
-    return self._dataset
-
-
-class _InputsWithStoppingSignals(_Inputs):
-  """Inputs with `_StopSignals` inserted into the dataset."""
-
-  def __init__(self,
-               dataset,
-               batch_size,
-               add_padding=False,
-               num_invocations_per_step=1):
-
-    assert dataset is not None
-    user_provided_dataset = dataset.map(
-        _InputsWithStoppingSignals.insert_stopping_signal(
-            stop=False, batch_size=batch_size, add_padding=add_padding))
-    if num_invocations_per_step == 1:
-      final_batch_dataset = dataset.take(1).map(
-          _InputsWithStoppingSignals.insert_stopping_signal(
-              stop=True, batch_size=batch_size, add_padding=add_padding))
-    else:
-      # We append (2 * num_invocations_per_step - 1) batches for exhausting the
-      # user_provided_dataset and stop properly.
-      # For example, if num_invocations_per_step is 2, we append 3 additional
-      # padding batches: b1, b2, b3.
-      # If user_provided_dataset contains two batches: a1, a2
-      # Step 1: [a1, a2]
-      # Step 2: [b1, b2] -> STOP
-      # If user_provided_dataset contains three batches: a1, a2, a3.
-      # The training loops:
-      # Step 1: [a1, a2]
-      # Step 2: [a3, b1]
-      # Step 3: [b2, b3] -> STOP.
-      final_batch_dataset = dataset.take(1).map(
-          _InputsWithStoppingSignals.insert_stopping_signal(
-              stop=True, batch_size=batch_size, add_padding=add_padding))
-      final_batch_dataset = final_batch_dataset.repeat(
-          2 * num_invocations_per_step - 1)
-
-      def _set_mask(data_dict):
-        signals = data_dict['signals']
-        signals['padding_mask'] = array_ops.ones_like(signals['padding_mask'])
-        data_dict['signals'] = signals
-        return data_dict
-
-      # Mask out the extra batch.
-      final_batch_dataset = final_batch_dataset.map(_set_mask)
-
-    dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2)
-
-    super(_InputsWithStoppingSignals, self).__init__(dataset=dataset)
-    self._current_inputs = None
-
-  def features_and_labels(self):
-    if self._current_inputs is not None:
-      raise RuntimeError(
-          'Internal Error: The previous inputs have not been properly '
-          'consumed. First call features_and_labels, then call signals.')
-
-    inputs_with_signals = self._iterator.get_next()
-    features = inputs_with_signals['features']
-    labels = inputs_with_signals.get('labels')
-
-    self._current_inputs = inputs_with_signals
-    return features, labels
-
-  def signals(self):
-    """Returns the `Signals` from `_Inputs`."""
-    if self._current_inputs is None:
-      raise RuntimeError(
-          'Internal Error: The current inputs have not been properly '
-          'generated. First call features_and_labels, then call signals.')
-    signals = self._current_inputs['signals']
-    self._current_inputs = None
-    return signals
-
-  @staticmethod
-  def insert_stopping_signal(stop, batch_size, add_padding=False):
-    """Inserts stopping_signal into dataset via _map_fn.
-
-    Here we change the data structure in the dataset, such that the return value
-    is a dictionary now and `features`, `labels`, and `signals` are three
-    distinguished keys in that dict. This provides a better structure, which
-    eases the process to decompose the inputs (see `features_and_labels`).
-
-    Args:
-      stop: bool, state of current stopping signals.
-      batch_size: int, batch size.
-      add_padding: bool, whether to pad the tensor to full batch size.
-
-    Returns:
-      A map_fn passed to dataset.map API.
-    """
-
-    def _map_fn(*args):
-      """The map fn to insert signals."""
-      if len(args) == 1:
-        # Unpack the single Tensor/dict argument as features. This is required
-        # for the input_fn returns no labels.
-        args = args[0]
-      features, labels = _Inputs._parse_inputs(args)
-      new_input_dict = {}
-
-      if add_padding:
-        padding_mask, features, labels = (
-            _PaddingSignals.pad_features_and_labels(features, labels,
-                                                    batch_size))
-
-        new_input_dict['features'] = features
-        if labels is not None:
-          new_input_dict['labels'] = labels
-
-      else:
-        new_input_dict['features'] = features
-        if labels is not None:
-          new_input_dict['labels'] = labels
-        padding_mask = None
-
-      new_input_dict['signals'] = _StopSignals(
-          stop=stop, batch_size=batch_size,
-          padding_mask=padding_mask).as_dict()
-
-      return new_input_dict
-
-    return _map_fn
-
-
-class _StopSignals(object):
-  """Signals class holding all logic to handle TPU stopping condition."""
-
-  NON_STOPPING_SIGNAL = False
-  STOPPING_SIGNAL = True
-
-  def __init__(self, stop, batch_size, padding_mask=None):
-    self._stop = stop
-    self._batch_size = batch_size
-    self._padding_mask = padding_mask
-
-  def as_dict(self):
-    """Returns the signals as Python dict."""
-    shape = [self._batch_size, 1]
-    dtype = dtypes.bool
-
-    if self._stop:
-      stopping = array_ops.ones(shape=shape, dtype=dtype)
-    else:
-      stopping = array_ops.zeros(shape=shape, dtype=dtype)
-
-    signals = {'stopping': stopping}
-    if self._padding_mask is not None:
-      signals['padding_mask'] = self._padding_mask
-    return signals
-
-  @staticmethod
-  def as_scalar_stopping_signal(signals):
-    return array_ops.identity(signals['stopping'][0][0])
-
-  @staticmethod
-  def should_stop(scalar_stopping_signal):
-    """Detects whether scalar_stopping_signal indicates stopping."""
-    if isinstance(scalar_stopping_signal, ops.Tensor):
-      # STOPPING_SIGNAL is a constant True. Here, the logical_and is just the TF
-      # way to express the bool check whether scalar_stopping_signal is True.
-      return math_ops.logical_and(scalar_stopping_signal,
-                                  _StopSignals.STOPPING_SIGNAL)
-    else:
-      # For non Tensor case, it is used in SessionRunHook. So, we cannot modify
-      # the graph anymore. Here, we use pure Python.
-      return bool(scalar_stopping_signal)
-
-
-class _PaddingSignals(object):
-  """Signals class holding all logic to handle padding."""
-
-  @staticmethod
-  def pad_features_and_labels(features, labels, batch_size):
-    """Pads out the batch dimension of features and labels."""
-    real_batch_size = array_ops.shape(
-        _PaddingSignals._find_any_tensor(features))[0]
-
-    batch_size_tensor = constant_op.constant(batch_size, dtypes.int32)
-
-    check_greater = check_ops.assert_greater_equal(
-        batch_size_tensor,
-        real_batch_size,
-        data=(batch_size_tensor, real_batch_size),
-        message='The real batch size should not be greater than batch_size.')
-
-    with ops.control_dependencies([check_greater]):
-      missing_count = batch_size_tensor - real_batch_size
-
-    def pad_single_tensor(tensor):
-      """Pads out the batch dimension of a tensor to the complete batch_size."""
-      rank = len(tensor.shape)
-      assert rank > 0
-      padding = array_ops.stack([[0, missing_count]] + [[0, 0]] * (rank - 1))
-      padded_shape = (batch_size,) + tuple(tensor.shape[1:])
-      padded_tensor = array_ops.pad(tensor, padding)
-      padded_tensor.set_shape(padded_shape)
-      return padded_tensor
-
-    def nest_pad(tensor_or_dict):
-      return nest.map_structure(pad_single_tensor, tensor_or_dict)
-
-    features = nest_pad(features)
-    if labels is not None:
-      labels = nest_pad(labels)
-
-    padding_mask = _PaddingSignals._padding_mask(real_batch_size, missing_count,
-                                                 batch_size)
-
-    return padding_mask, features, labels
-
-  @staticmethod
-  def slice_tensor_or_dict(tensor_or_dict, signals):
-    """Slice the real Tensors according to padding mask in signals."""
-
-    padding_mask = signals['padding_mask']
-    batch_size = array_ops.shape(padding_mask)[0]
-
-    def verify_batch_size(tensor):
-      check_batch_size = math_ops.equal(batch_size, tensor.shape[0])
-      with ops.control_dependencies([check_batch_size]):
-        return array_ops.identity(tensor)
-
-    def slice_single_tensor(tensor):
-      rank = len(tensor.shape)
-      assert rank > 0
-      real_batch_size = batch_size - math_ops.reduce_sum(padding_mask)
-      return verify_batch_size(tensor)[0:real_batch_size]
-
-    # As we split the Tensors to all TPU cores and concat them back, it is
-    # important to ensure the real data is placed before padded ones, i.e.,
-    # order is preserved. By that, the sliced padding mask should have all 0's.
-    # If this assertion failed, # the slice logic here would not hold.
-    sliced_padding_mask = slice_single_tensor(padding_mask)
-    assert_padding_mask = math_ops.equal(
-        math_ops.reduce_sum(sliced_padding_mask), 0)
-
-    with ops.control_dependencies([assert_padding_mask]):
-      should_stop = _StopSignals.should_stop(
-          _StopSignals.as_scalar_stopping_signal(signals))
-
-    is_full_batch = math_ops.equal(math_ops.reduce_sum(padding_mask), 0)
-
-    def slice_fn(tensor):
-      # If the current batch is full batch or part of stopping signals, we do
-      # not need to slice to save performance.
-      return control_flow_ops.cond(
-          math_ops.logical_or(should_stop, is_full_batch),
-          (lambda: verify_batch_size(tensor)),
-          (lambda: slice_single_tensor(tensor)))
-
-    return nest.map_structure(slice_fn, tensor_or_dict)
-
-  @staticmethod
-  def _find_any_tensor(batch_features):
-    tensors = [
-        x for x in nest.flatten(batch_features) if isinstance(x, ops.Tensor)
-    ]
-    if not tensors:
-      raise ValueError('Cannot find any Tensor in features dict.')
-    return tensors[0]
-
-  @staticmethod
-  def _padding_mask(real_batch_size, missing_count, batch_size):
-    padding_mask = array_ops.concat([
-        array_ops.zeros((real_batch_size,), dtype=dtypes.int32),
-        array_ops.ones((missing_count,), dtype=dtypes.int32)
-    ],
-                                    axis=0)
-    padding_mask.set_shape((batch_size,))
-    return padding_mask
-
-
-def _verify_cross_hosts_transfer_size(tensor_dict, message):
-  total_size = 0
-  tensor_structure = {}
-  for key, tensor in tensor_dict.items():
-    shape = tensor.shape
-    size = np.product(shape) * tensor.dtype.size
-    tensor_structure[key] = shape
-    total_size += size
-  if total_size >= _ONE_GIGABYTE:
-    raise ValueError(
-        '{} The transfer size is larger than the protobuf limit. Please '
-        'consider to use Tensors with smaller shapes or reduce batch '
-        'size. Given:\n'
-        '{}'.format(
-            message, '\n'.join([
-                ' -- Key: {}, Shape: {}'.format(k, v)
-                for k, v in tensor_structure.items()
-            ])))
-
-
-def _add_item_to_params(params, key, value):
-  """Adds a new item into `params`."""
-  if isinstance(params, hparam.HParams):
-    # For HParams, we need to use special API.
-    if key in params:
-      params.set_hparam(key, value)
-    else:
-      params.add_hparam(key, value)
-  else:
-    # Now params is Python dict.
-    params[key] = value
-
-
-def export_estimator_savedmodel(estimator,
-                                export_dir_base,
-                                serving_input_receiver_fn,
-                                assets_extra=None,
-                                as_text=False,
-                                checkpoint_path=None,
-                                strip_default_attrs=False):
-  """Export `Estimator` trained model for TPU inference.
-
-  Args:
-    estimator: `Estimator` with which model has been trained.
-    export_dir_base: A string containing a directory in which to create
-      timestamped subdirectories containing exported SavedModels.
-    serving_input_receiver_fn: A function that takes no argument and returns a
-      `ServingInputReceiver` or `TensorServingInputReceiver`.
-    assets_extra: A dict specifying how to populate the assets.extra directory
-      within the exported SavedModel, or `None` if no extra assets are needed.
-    as_text: whether to write the SavedModel proto in text format.
-    checkpoint_path: The checkpoint path to export.  If `None` (the default),
-      the most recent checkpoint found within the model directory is chosen.
-    strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-      removed from the NodeDefs.
-
-  Returns:
-    The string path to the exported directory.
-  """
-  # `TPUEstimator` requires `tpu_config.RunConfig`, so we cannot use
-  # `estimator.config`.
-  config = tpu_config.RunConfig(model_dir=estimator.model_dir)
-  est = TPUEstimator(
-      estimator._model_fn,  # pylint: disable=protected-access
-      config=config,
-      params=estimator.params,
-      use_tpu=True,
-      train_batch_size=2048,  # Does not matter.
-      eval_batch_size=2048,  # Does not matter.
-  )
-  return est.export_savedmodel(export_dir_base, serving_input_receiver_fn,
-                               assets_extra, as_text, checkpoint_path,
-                               strip_default_attrs)
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.tpu_estimator import *
+# used by tests
+from tensorflow.python.tpu.tpu_estimator import _clone_export_output_with_tensors
+from tensorflow.python.tpu.tpu_estimator import _create_global_step
+from tensorflow.python.tpu.tpu_estimator import _export_output_to_tensors
+from tensorflow.python.tpu.tpu_estimator import _get_scaffold
+from tensorflow.python.tpu.tpu_estimator import _Inputs
+from tensorflow.python.tpu.tpu_estimator import _ITERATIONS_PER_LOOP_VAR
+from tensorflow.python.tpu.tpu_estimator import _TPU_ENQUEUE_OPS
+from tensorflow.python.tpu.tpu_estimator import _TPU_ESTIMATOR
+from tensorflow.python.tpu.tpu_estimator import _TPU_TRAIN_OP
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
index d5957b7e8ec40b40c7af8822378cee6134ef0d0f..af2542ea85290170ce6a38223188c4f9b871f032 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
@@ -1,898 +1,25 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-
-"""Helper library for handling infeed between hosts and TPUs.
-"""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import itertools
-
-import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.compiler.xla.experimental.xla_sharding import xla_sharding
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.contrib.tpu.python.tpu import tpu_sharding
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.util import nest
-
-
-class InfeedQueue(object):
-  """A helper object to build a device infeed queue.
-
-  The InfeedQueue builds the host-side and device-side Ops to enqueue and
-  dequeue elements, respectively, and ensures that their types and
-  shapes match.
-  """
-
-  def __init__(self,
-               number_of_tuple_elements=None,
-               tuple_types=None,
-               tuple_shapes=None,
-               shard_dimensions=None,
-               name=None):
-    """Creates a new InfeedQueue with the given configuration.
-
-    The configuration need not be fully specified at creation since it
-    can be modified subsequently by methods that set the values
-    explicitly or infer them from the shapes of inputs.
-
-    Args:
-      number_of_tuple_elements: the number of Tensors fed atomically through the
-        queue, must be present unless it can be inferred from other arguments.
-      tuple_types: if not None, a list of types of the elements of the queue.
-      tuple_shapes: if not None, a list of shapes of the elements of the queue.
-      shard_dimensions: if not None, a list of dimensions on which the
-        elements of the queue should be sharded during automatic
-        parallelization.
-      name: the name of the queue.
-
-    Raises:
-      ValueError: if number_of_tuple_elements <= 0; or
-        number_of_tuple_arguments, tuple_types, tuple_shapes, and
-        shard_dimensions are all None; or the length of tuple_types,
-        tuple_shapes, or shard_dimensions is not equal to
-        number_of_tuple_elements; or any element of shard_dimensions
-        can't be converted to a Dimension.
-      TypeError: if any element of tuple_types or tuple_shapes can't
-        be converted to a dtype or TensorShape, respectively.
-    """
-    self._frozen = False
-    self._generated_enqueue_ops = False
-    self._generated_dequeue_op = False
-    self._name = "InfeedQueue" if name is None else name
-    if number_of_tuple_elements is None:
-      if tuple_types is not None:
-        number_of_tuple_elements = len(tuple_types)
-      elif tuple_shapes is not None:
-        number_of_tuple_elements = len(tuple_shapes)
-      elif shard_dimensions is not None:
-        number_of_tuple_elements = len(shard_dimensions)
-      else:
-        raise ValueError(
-            "number of tuple elements cannot be inferred from InfeedQueue "
-            "constructor")
-    if number_of_tuple_elements <= 0:
-      raise ValueError("number_of_tuple_elements %d must be > 0" %
-                       number_of_tuple_elements)
-    # Make an empty sharding policy for each tuple element.
-    self._sharding_policies = [
-        tpu_sharding.ShardingPolicy()
-        for _ in xrange(number_of_tuple_elements)
-    ]
-    if tuple_types is not None:
-      self.set_tuple_types(tuple_types)
-    else:
-      self._tuple_types = None
-    if tuple_shapes is not None:
-      self.set_tuple_shapes(tuple_shapes)
-    else:
-      self._tuple_shapes = None
-    if shard_dimensions is not None:
-      self.set_shard_dimensions(shard_dimensions)
-    self._validate()
-
-  def _validate(self):
-    """Checks that the configuration is self-consistent.
-
-    Raises:
-      ValueError: if the shapes and sharding policies don't match.
-    """
-    if self.tuple_shapes is not None:
-      for (policy, shape) in zip(self._sharding_policies, self._tuple_shapes):
-        # Raise an error if the policy is incompatible with the shape.
-        _ = policy.get_sharded_shape(shape)
-
-  @property
-  def number_of_tuple_elements(self):
-    """Returns the number of InfeedQueue tuple elements."""
-    return len(self._sharding_policies)
-
-  @property
-  def tuple_types(self):
-    """Returns the types of the InfeedQueue tuple elements."""
-    return self._tuple_types
-
-  def set_tuple_types(self, tuple_types):
-    """Sets the type of each element of the queue.
-
-    tuple_types must be a list of length
-    self.number_of_tuple_elements, and each element must be
-    convertible to a dtype.
-
-    Args:
-      tuple_types: the types of each queue element.
-
-    Raises:
-      ValueError: if tuple_types is not of length
-        self.number_of_tuple_elements.
-      TypeError: if an element of tuple_types cannot be converted to a
-        dtype.
-    """
-    if len(tuple_types) != self.number_of_tuple_elements:
-      raise ValueError("tuple_types is %s, but must be a list of length %d" %
-                       (str(tuple_types), self.number_of_tuple_elements))
-    if self._frozen:
-      for (frozen, updated) in zip(self._tuple_types, tuple_types):
-        if frozen != updated:
-          raise ValueError(
-              "Trying to update InfeedQueue with frozen configuration with an "
-              "incompatible type. Frozen types are %s, updated types are %s" % (
-                  str(self._tuple_types), str(tuple_types)))
-    else:
-      try:
-        self._tuple_types = [dtypes.as_dtype(t) for t in tuple_types]
-      except (TypeError) as e:
-        raise TypeError(
-            "tuple_types is %s, but must be a list of elements each "
-            "convertible to dtype: got error %s" % (str(tuple_types), str(e)))
-
-  @property
-  def tuple_shapes(self):
-    """Returns the shapes of the InfeedQueue tuple elements."""
-    return self._tuple_shapes
-
-  def set_tuple_shapes(self, tuple_shapes):
-    """Sets the shape of each element of the queue.
-
-    tuple_shapes must be a list of length
-    self.number_of_tuple_elements, and each element must be
-    convertible to a TensorShape.
-
-    Args:
-      tuple_shapes: the shapes of each queue element.
-
-    Raises:
-      ValueError: if tuple_shapes is not of length
-        self.number_of_tuple_elements.
-      TypeError: if an element of tuple_shapes cannot be converted to
-        a TensorShape.
-    """
-    if len(tuple_shapes) != self.number_of_tuple_elements:
-      raise ValueError("tuple_shapes is %s, but must be a list of length %d" %
-                       (str(tuple_shapes), self.number_of_tuple_elements))
-    try:
-      tuple_shapes = [tensor_shape.as_shape(shape) for shape in tuple_shapes]
-    except (ValueError, TypeError) as e:
-      raise TypeError(
-          "tuple_shapes is %s, but must be a list of elements each "
-          "convertible to TensorShape: got error %s" % (str(tuple_shapes),
-                                                        str(e)))
-    if self._frozen:
-      for (frozen, updated) in zip(self._tuple_shapes, tuple_shapes):
-        if frozen != updated:
-          raise ValueError(
-              "Trying to update InfeedQueue with frozen configuration with an "
-              "incompatible shape. Frozen shapes are %s, updated shapes are %s"
-              % (str(self._tuple_shapes), str(tuple_shapes)))
-    else:
-      self._tuple_shapes = tuple_shapes
-    self._validate()
-
-  @property
-  def sharding_policies(self):
-    """Returns the sharding policies of the InfeedQueue tuple elements."""
-    return self._sharding_policies
-
-  @property
-  def shard_dimensions(self):
-    """Gets the shard dimension of each tuple element.
-
-    Returns:
-      A list of length number_of_tuple_elements, where each list entry
-      is the shard dimension of that tuple element or None if the
-      shard dimension has not been set.
-    """
-    # The number of shards is always the same for all the policies.
-    return [policy.shard_dimension for policy in self._sharding_policies]
-
-  def set_shard_dimensions(self, shard_dimensions):
-    """Sets the shard_dimension of each element of the queue.
-
-    shard_dimensions must be a list of length
-    self.number_of_tuple_elements, and each element must be
-    convertible to a Dimension compatible with self.tuple_shapes.
-
-    Args:
-      shard_dimensions: the dimensions of each queue element.
-
-    Raises:
-      ValueError: if shard_dimensions is not of length
-        self.number_of_tuple_elements; or an element of
-        shard_dimensions cannot be converted to a Dimension; or an
-        element of shard_dimensions is a Dimension that is out of
-        range for the corresponding tuple element shape.
-    """
-    if len(shard_dimensions) != self.number_of_tuple_elements:
-      raise ValueError("shard_dimensions is %s, but must be a list of length %d"
-                       % (str(shard_dimensions),
-                          self.number_of_tuple_elements))
-    for (policy, dimension) in zip(self._sharding_policies, shard_dimensions):
-      policy.set_shard_dimension(dimension)
-    self._validate()
-
-  @property
-  def number_of_shards(self):
-    """Gets the number of shards to use for the InfeedQueue.
-
-    Returns:
-      Number of shards or None if the number of shards has not been set.
-    """
-    # The number of shards is always the same for all the policies.
-    return self._sharding_policies[0].number_of_shards
-
-  def set_number_of_shards(self, number_of_shards):
-    """Sets the number of shards to use for the InfeedQueue.
-
-    Args:
-      number_of_shards: number of ways to shard the InfeedQueue.
-
-    Raises:
-      ValueError: if number_of_shards is not > 0; or the policies have
-        been frozen and number_of_shards was already set to something
-        else.
-    """
-    for policy in self._sharding_policies:
-      policy.set_number_of_shards(number_of_shards)
-    self._validate()
-
-  def set_configuration_from_input_tensors(self, input_tensors):
-    """Sets the shapes and types of the queue tuple elements.
-
-    input_tensors is a list of Tensors whose types and shapes are used
-    to set the queue configuration.
-
-    Args:
-      input_tensors: list of Tensors of the same types and shapes as
-        the desired queue Tuple.
-
-    Raises:
-      ValueError: if input_tensors is not a list of length
-        self.number_of_tuple_elements
-    """
-    if len(input_tensors) != self.number_of_tuple_elements:
-      raise ValueError("input_tensors is %s, but should be a list of %d Tensors"
-                       % (str(input_tensors), self.number_of_tuple_elements))
-    self.set_tuple_shapes([t.shape for t in input_tensors])
-    self.set_tuple_types([t.dtype for t in input_tensors])
-
-  def set_configuration_from_sharded_input_tensors(self, input_tensors):
-    """Sets the shapes and types of the queue tuple elements.
-
-    input_tensors is a list of lists of Tensors whose types and shapes are used
-    to set the queue configuration. The length of the outer list is the number
-    of shards required, and each inner list is the tuple of Tensors to use to
-    determine the types and shapes of the corresponding shard. This method
-    depends on the shard dimension, and calling it freezes the shard policy.
-
-    Args:
-      input_tensors: list of lists of Tensors. The outer list length corresponds
-        to the desired number of shards, and each inner list is the size
-        and shape of the desired configuration of the corresponding shard.
-
-    Raises:
-      ValueError: if any inner list is not a list of length
-        self.number_of_tuple_elements; or the inner lists do not combine to
-        form a consistent unsharded shape.
-      TypeError: if the types of the Tensors in the inner lists do not match.
-    """
-    if not self._frozen:
-      # Unset the tuple shapes in case the configuration becomes
-      # transiently inconsistent.
-      self._tuple_shapes = None
-    number_of_shards = len(input_tensors)
-    self.set_number_of_shards(number_of_shards)
-    for t in input_tensors:
-      if len(t) != self.number_of_tuple_elements:
-        raise ValueError(
-            "input_tensors is %s but must be a list of lists, where each inner"
-            " list has length number_of_tuple_elements=%d" % (
-                str(input_tensors), self.number_of_tuple_elements))
-    # Transpose the inputs to make a list of shard shapes for each tuple
-    # element.
-    sharded_shapes = [[t[i].shape for t in input_tensors]
-                      for i in xrange(self.number_of_tuple_elements)]
-    # For each tuple, get the unsharded shape using that tuple's policy.
-    unsharded_shapes = [
-        policy.get_unsharded_shape(s)
-        for (policy, s) in zip(self._sharding_policies, sharded_shapes)
-    ]
-    self.set_tuple_shapes(unsharded_shapes)
-    for i in xrange(1, self.number_of_shards):
-      for (t1, t2) in zip(input_tensors[0], input_tensors[i]):
-        if t1.dtype != t2.dtype:
-          raise TypeError(
-              "types of the tuple elements of input_tensors %s are not "
-              "consistent" % str(input_tensors))
-    self.set_tuple_types([t.dtype for t in input_tensors[0]])
-
-  def freeze(self):
-    """Freezes the InfeedQueue so it can no longer be modified.
-
-    The configuration is implicitly frozen before any host-side or
-    device-side Ops are generated. The configuration cannot be frozen
-    until the types and shapes of the tuple elements have been set.
-
-    Raises:
-      ValueError: if the types or shapes of the tuple elements have not been
-      set.
-    """
-    self._frozen = True
-    if self._tuple_types is None:
-      raise ValueError(
-          "Can't freeze an InfeedQueue without setting all tuple types.")
-    if self._tuple_shapes is None:
-      raise ValueError(
-          "Can't freeze an InfeedQueue without setting all tuple shapes.")
-    for shape in self._tuple_shapes:
-      if shape.dims is None:
-        raise ValueError(
-            "Can't freeze an InfeedQueue without setting all tuple shapes.")
-    for policy in self._sharding_policies:
-      policy.freeze()
-    self._validate()
-
-  def generate_dequeue_op(self, tpu_device=0):
-    """Generates the device-side Op to dequeue a tuple from the queue.
-
-    Implicitly freezes the queue configuration if it is not already
-    frozen, which will raise errors if the shapes and types have not
-    been fully specified.
-
-    Args:
-      tpu_device: The TPU device ordinal where the infeed instruction should be
-        placed. If None, no explicit placement will be performed, and it is up
-        to the user to call this API from within a proper TPU device scope.
-        The XLA code will fail if the TPU dequeue instruction is not bound to
-        any device.
-
-    Returns:
-      A list of Outputs corresponding to a shard of infeed dequeued
-      into XLA, suitable for use within a replicated block.
-
-    Raises:
-      ValueError: if the types or shapes of the tuple elements have not been
-      set; or if a dequeue op has already been generated.
-    """
-    self.freeze()
-    if self._generated_dequeue_op:
-      raise ValueError("Can't generate two dequeue Ops from the same queue")
-    self._generated_dequeue_op = True
-    full_name = "%s/dequeue" % self._name
-    sharded_shapes = [
-        policy.get_sharded_shape(shape)
-        for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
-    ]
-    if tpu_device is not None:
-      with ops.device(tpu.core(tpu_device)):
-        return tpu_ops.infeed_dequeue_tuple(
-            dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
-    else:
-      return tpu_ops.infeed_dequeue_tuple(
-          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
-
-  def _generate_enqueue_op(self,
-                           inputs,
-                           name_prefix,
-                           index,
-                           device=None,
-                           tpu_ordinal=-1):
-    """Generate a host-side Op to enqueue a tuple to the queue.
-
-    If device is None the inputs are all required to have the same
-    device specification, and the enqueue Op is colocated with
-    inputs[0]. Otherwise the enqueue Op is placed on 'device'.
-
-    Args:
-      inputs: a list of Tensors with the types and shapes of the tuple elements.
-      name_prefix: the base name for the Op.
-      index: the shard index, used to uniquify the Op name.
-      device: device to place the Op on, or None if it should be
-        colocated with the inputs.
-      tpu_ordinal: ordinal of the TPU device on the host to use for
-      infeed if device is a CPU device. Should be set to -1 if device
-      is a TPU device.
-
-    Returns:
-      An Op corresponding to a shard of infeed enqueued at the host,
-      suitable for use within a replicated block.
-
-    Raises:
-      ValueError: if device is None and inputs do not all have the
-        same device specification.
-    """
-    full_name = "%s/%d" % (name_prefix, index)
-    shapes = [t.shape for t in inputs]
-    if device is None:
-      devices = [t.device for t in inputs]
-      for i in xrange(1, self.number_of_tuple_elements):
-        if devices[0] != devices[i]:
-          raise ValueError(
-              "input devices for shard %d are %s, but should all be the same" %
-              (index, str(devices)))
-      with ops.colocate_with(inputs[0]):
-        return tpu_ops.infeed_enqueue_tuple(
-            inputs=inputs,
-            shapes=shapes,
-            name=full_name,
-            device_ordinal=tpu_ordinal)
-    else:
-      with ops.device(device):
-        return tpu_ops.infeed_enqueue_tuple(
-            inputs=inputs,
-            shapes=shapes,
-            name=full_name,
-            device_ordinal=tpu_ordinal)
-
-  def generate_enqueue_ops(self,
-                           sharded_inputs,
-                           tpu_ordinal_function=None,
-                           placement_function=None):
-    """Generates the host-side Ops to enqueue the shards of a tuple.
-
-    sharded_inputs is a list, one for each shard, of lists of
-    Tensors. sharded_inputs[0] is the tuple of Tensors to use to feed
-    shard 0 if the queue. Returns the host-side Ops that must be run to
-    enqueue the sharded tuple. The Op for shard i is colocated with the inputs
-    for shard i.
-
-    Implicitly freezes the queue configuration if it is not already
-    frozen. If the configuration has already been frozen, and is not
-    compatible with the types and shapes of sharded_inputs, an error
-    will be raised.
-
-    Args:
-      sharded_inputs: a list of lists of Tensors. The length of the outer list
-        determines the number of shards. Each inner list indicates the types
-        and shapes of the tuples in the corresponding shard.
-      tpu_ordinal_function: if not None, a function that takes the
-        shard index as input and returns the ordinal of the TPU device
-        the shard's infeed should be placed on. tpu_ordinal_function must be
-        set if the inputs are placed on CPU devices.
-      placement_function: if not None, a function that takes the shard index as
-        input and returns the host device where the enqueue op should be placed
-        on.
-
-    Returns:
-      A list of host-side Ops, one for each shard, that when executed together
-      will enqueue a full-size element of infeed.
-
-    Raises:
-      ValueError: if the queue configuration has previously been frozen and the
-        shapes of the elements of sharded_inputs are not compatible with the
-        frozen configuration; or if the shapes of the elements of sharded_inputs
-        don't form a consistent unsharded tuple; or if the elements of a tuple
-        have different device constraints.
-      TypeError: if the queue configuration has previously been frozen and the
-        types of the elements of sharded_inputs are not compatible with the
-        frozen configuration; or if the types of the elements of sharded_inputs
-        don't form a consistent unsharded tuple.
-    """
-    self.set_configuration_from_sharded_input_tensors(sharded_inputs)
-    self.freeze()
-    if self._generated_enqueue_ops:
-      raise ValueError("Can't generate two enqueue Ops from the same queue")
-    self._generated_enqueue_ops = True
-    if tpu_ordinal_function is None:
-      tpu_ordinal_function = lambda index: -1
-    name_prefix = "%s/enqueue" % self._name
-    return [
-        self._generate_enqueue_op(
-            shard,
-            name_prefix,
-            index,
-            tpu_ordinal=tpu_ordinal_function(index),
-            device=placement_function(index) if placement_function else None)
-        for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards))
-    ]
-
-  # TODO(misard) Generalize this to the case of systems that don't
-  # have 8 devices per host, and figure out what to do with
-  # model-parallelism.
-  def _default_placement_function(self, index):
-    return "/task:%d/device:CPU:0" % (index / 8)
-
-  def _default_ordinal_function(self, index):
-    return index % 8
-
-  # TODO(b/36470756) remove this from tutorials once we have a better story
-  # for automatic placement of input pipelines.
-  def split_inputs_and_generate_enqueue_ops(self,
-                                            inputs,
-                                            device_assignment=None,
-                                            placement_function=None,
-                                            tpu_ordinal_function=None):
-    """POORLY-PERFORMING ON MULTI-HOST SYSTEMS.
-
-    Generates the host-side Ops to enqueue a tuple.
-
-    This method performs poorly because it takes an entire input on a single
-    host, splits it, and distributes it to all of the cores. It is present only
-    to simplify tutorial examples.
-
-    inputs is a list of Tensors to use to feed the queue. Each input is split
-    into self.number_of_shards shards. Returns an Op for each shard to enqueue
-    the shard. The Op for shard i is placed on device placement_function(i).
-
-    Implicitly freezes the queue configuration if it is not already
-    frozen. If the configuration has already been frozen, and is not
-    compatible with the types and shapes of inputs, an error
-    will be raised.
-
-    Args:
-      inputs: a list of Tensors which indicates the types and shapes of the
-        queue tuple.
-     device_assignment: if not `None`, a TPU `DeviceAssignment`. If
-        device_assignment is not `None`, but `placement_function` and
-        `ordinal_function` are None, then `device_assignment` will be used to
-        place infeeds on the first k TPU shards, where k is the number of shards
-        in the queue. If all three are `None`, then default placement and
-        ordinal functions are used.
-      placement_function: if not None, a function that takes the shard
-        index as input and returns a device string indicating which
-        device the shard's infeed should be placed on. If placement_function
-        and tpu_ordinal_function are None, inputs are sharded round-robin
-        across the devices in the system.
-      tpu_ordinal_function: if not None, a function that takes the
-        shard index as input and returns the ordinal of the TPU device
-        the shard's infeed should be placed on. If placement_function
-        and tpu_ordinal_function are None, inputs are sharded round-robin
-        across the devices in the system.
-
-    Returns:
-      A list of host-side Ops, one for each shard, that when executed together
-      will enqueue a full-size element of infeed.
-
-    Raises:
-      ValueError: if the queue configuration has previously been frozen and the
-        shapes of the elements of inputs are not compatible with the frozen
-        configuration.
-      TypeError: if the queue configuration has previously been frozen and the
-        types of the elements of inputs are not compatible with the frozen
-        configuration.
-    """
-    if device_assignment is None:
-      if placement_function is None:
-        placement_function = self._default_placement_function
-      if tpu_ordinal_function is None:
-        tpu_ordinal_function = self._default_ordinal_function
-    else:
-
-      def _placement_function_from_map(index):
-        return device_assignment.host_device(replica=index)
-
-      def _ordinal_function_from_map(index):
-        return device_assignment.tpu_ordinal(replica=index)
-
-      if placement_function is None:
-        placement_function = _placement_function_from_map
-      if tpu_ordinal_function is None:
-        tpu_ordinal_function = _ordinal_function_from_map
-    self.set_configuration_from_input_tensors(inputs)
-    self.freeze()
-    if self._generated_enqueue_ops:
-      raise ValueError("Can't generate two enqueue Ops from the same queue")
-    self._generated_enqueue_ops = True
-    split_name_prefix = "%s/split" % self._name
-    if self.number_of_shards == 1:
-      transposed_sharded_inputs = [[inp] for inp in inputs]
-    else:
-
-      def split_fn(inp, num_shards, axis, name):
-        with ops.colocate_with(inp):
-          return array_ops.split(inp, num_shards, axis=axis, name=name)
-
-      transposed_sharded_inputs = [
-          split_fn(
-              inp,
-              self.number_of_shards,
-              axis=policy.shard_dimension,
-              name="%s/%d" % (split_name_prefix, index))
-          for (inp, policy, index) in zip(inputs, self._sharding_policies,
-                                          xrange(self.number_of_tuple_elements))
-      ]
-    sharded_inputs = [[shard[i] for shard in transposed_sharded_inputs]
-                      for i in xrange(self.number_of_shards)]
-    name_prefix = "%s/enqueue" % self._name
-    return [
-        self._generate_enqueue_op(
-            shard,
-            name_prefix,
-            index,
-            device=placement_function(index),
-            tpu_ordinal=tpu_ordinal_function(index))
-        for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards))
-    ]
-
-
-class _PartitionedInfeedQueue(InfeedQueue):
-  """A helper object to build a device infeed queue with input partition.
-
-  Args:
-    number_of_tuple_elements: the number of Tensors fed atomically through the
-      queue, must be present unless it can be inferred from other arguments.
-    device_assignment: A TPU `DeviceAssignment` which is used to place all the
-      partitions to different TPU infeed queues.
-    host_id: The id of the host machine.
-    input_partition_dims: A nested list/tuple of integers. Each inner
-      list/tuple describes how to partition the corresponding input tensor.
-    tuple_types: If not None, a list of types of the elements of the queue.
-    tuple_shapes: If not None, a list of shapes of the elements of the queue.
-    name: The name of the queue.
-  """
-
-  def __init__(self,
-               number_of_tuple_elements,
-               device_assignment,
-               host_id,
-               input_partition_dims=None,
-               tuple_types=None,
-               tuple_shapes=None,
-               name=None):
-    super(_PartitionedInfeedQueue, self).__init__(
-        number_of_tuple_elements=number_of_tuple_elements,
-        tuple_types=tuple_types,
-        tuple_shapes=None,
-        shard_dimensions=None,
-        name="PartitionedInfeedQueue" if name is None else name)
-    self._input_partition_dims = input_partition_dims
-    self._host_id = host_id
-    self._device_assignment = device_assignment
-
-  def generate_dequeue_op(self, tpu_device=0):
-    """Generate TPU dequeue ops.
-
-    Args:
-      tpu_device: The TPU device ordinal where the infeed instruction should be
-        placed.
-
-    Returns:
-      A list of Outputs corresponding to a partition of infeed dequeued
-      into XLA, suitable for use within a replicated block.
-
-    Raises:
-      ValueError: if the types or shapes of the tuple elements have not been
-      set; or if a dequeue op has already been generated.
-    """
-    self.freeze()
-    if self._generated_dequeue_op:
-      raise ValueError("Can't generate two dequeue Ops from the same queue")
-    self._generated_dequeue_op = True
-    full_name = "%s/dequeue" % self._name
-    sharded_shapes = [
-        policy.get_sharded_shape(shape)
-        for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
-    ]
-    with ops.device(tpu.core(tpu_device)):
-      values = tpu_ops.infeed_dequeue_tuple(
-          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
-    return self._tag_sharding_attribute_for_dequeued_tensors(
-        values, self._input_partition_dims)
-
-  def generate_enqueue_ops(self, per_host_sharded_inputs):
-    """Generates the host-side Ops to enqueue the partitioned inputs.
-
-    per_host_sharded_inputs is a list, one for each replica, of lists of
-    Tensors. sharded_inputs[i] is the tuple of Tensors to use to feed
-    replica i.
-    sharded_inputs[i][j] is partitioned by self._input_partition_dims[j].
-
-    For example, if sharded_inputs[i][j] is a 2-D Tensor:
-    [[A, B, C, D],
-     [E ,F, G, H]]
-    self._input_partition_dims[j] is [2, 4].
-
-    sharded_inputs[i][j] will be partitioned and flattened into:
-    [A, B, C, D, E, F, G, H] and fed into the logical core ids:
-    [0, 1, 2, 3, 4, 5, 6, 7] respectively.
-
-    Args:
-      per_host_sharded_inputs: a list of lists of Tensors. The length of the
-        outer list determines the number of shards. Each inner list indicates
-        the types and shapes of the tuples in the corresponding shard.
-
-    Returns:
-      A list of host-side Ops, one for each shard, that when executed together
-      will enqueue a full-size element of infeed.
-
-    Raises:
-      ValueError: if the queue configuration has previously been frozen and the
-        shapes of the elements of sharded_inputs are not compatible with the
-        frozen configuration; or if the shapes of the elements of sharded_inputs
-        don't form a consistent unsharded tuple; or if the elements of a tuple
-        have different device constraints; or if the partition dims are invalid.
-      TypeError: if the queue configuration has previously been frozen and the
-        types of the elements of sharded_inputs are not compatible with the
-        frozen configuration; or if the types of the elements of sharded_inputs
-        don't form a consistent unsharded tuple.
-    """
-    self.set_configuration_from_sharded_input_tensors(per_host_sharded_inputs)
-    number_of_replicas_per_host = len(per_host_sharded_inputs)
-    number_of_tuple_elements = len(per_host_sharded_inputs[0])
-
-    assert len(self._input_partition_dims) == number_of_tuple_elements
-    per_host_enqueue_ops = []
-
-    for replica_index in range(number_of_replicas_per_host):
-      flattened_inputs = per_host_sharded_inputs[replica_index]
-      inputs_part_dims_flat = nest.flatten_up_to(flattened_inputs,
-                                                 self._input_partition_dims)
-      inputs_parted_iters = [
-          iter(self._partition_or_replicate_on_host(x, dims)) for x, dims in
-          zip(per_host_sharded_inputs[replica_index], inputs_part_dims_flat)
-      ]
-
-      for logical_core in xrange(self._device_assignment.num_cores_per_replica):
-        # Places different partitions to different logic cores.
-        replica_id = self._device_assignment.lookup_replicas(
-            self._host_id, logical_core)[replica_index]
-        ordinal = self._device_assignment.tpu_ordinal(
-            replica=replica_id, logical_core=logical_core)
-        infeed_inputs = []
-        for it in inputs_parted_iters:
-          input_for_device = next(it, None)
-          if input_for_device is not None:
-            infeed_inputs.append(input_for_device)
-
-        if infeed_inputs:
-          per_host_enqueue_ops.append(
-              tpu_ops.infeed_enqueue_tuple(
-                  inputs=infeed_inputs,
-                  shapes=[x.shape for x in infeed_inputs],
-                  name="enqueue/replica_{0}/input_{1}".format(
-                      replica_index, logical_core),
-                  device_ordinal=ordinal))
-    return per_host_enqueue_ops
-
-  def _check_input_partition_dims(self, tensor, dims):
-    """Checks that input partition dims are valid for the `Tensor`.
-
-    Args:
-      tensor: Input tensor for partitioning.
-      dims: 1-D np.array of the list of integer describes how to partition the
-        input tensor.
-
-    Raises:
-      ValueError: If the tensor can't be partitioned by dims or the
-        num_cores_per_replica doesn't match the number of
-        partitions(dims.prod()).
-    """
-    if (dims < 1).any():
-      raise ValueError("All input partition dims must be >= 1.")
-
-    # No partitioning, so don't perform further checks.
-    if dims.prod() == 1:
-      return
-
-    if dims.prod() != self._device_assignment.num_cores_per_replica:
-      raise ValueError(
-          "The product of each input parition dim should equal to "
-          "num_cores_per_replica. (dim = {}, num_cores_per_replica "
-          "= {})".format(dims, self._device_assignment.num_cores_per_replica))
-    if dims.shape[0] != tensor.shape.ndims:
-      raise ValueError(
-          "Input partition dims must have the same number of dimensions "
-          "as the `Tensor` to be partitioned. (tensor shape = {}, input "
-          "partition dims = {}).".format(tensor.shape.as_list(), dims))
-
-    tensor.shape.assert_is_fully_defined()
-
-  def _partition_or_replicate_on_host(self, tensor, dims):
-    """Partitions or replicates the input tensor.
-
-      The ops inside this function are placed on the host side.
-
-    Args:
-      tensor: The input tensor which will be partioned or replicated.
-      dims: A list of integer describes how to partition the input tensor.
-    Returns:
-      An iterator of `Tensor`s or a list of partioned tensors.
-    """
-    if dims is None:
-      return itertools.repeat(tensor)
-    dims = np.array(dims)
-    self._check_input_partition_dims(tensor, dims)
-    output = [tensor]
-    shape_list = np.array(tensor.shape.as_list())
-    quotients, remainders = np.divmod(shape_list, dims)
-    for axis, (quotient, remainder, dim, original_size) in enumerate(
-        zip(quotients, remainders, dims, shape_list)):
-      if dim <= 1:
-        continue
-      if remainder > 0:
-        # For each dimension, when it cannot be evenly partitioned, XLA assumes
-        # tensors are partitioned in a greedy manner by using
-        # ceil_ratio(size/dim) first. E.g. 2D tensor with shape (5, 14) and dims
-        # are (2, 4). Since 5 % 2 = 1 and 14 % 4 = 2, [5, 14] =>
-        # [[(3, 4), (3, 4), (2, 4), (2, 2)],
-        # [(2, 4), (2, 4), (2, 4), (2, 2)]]
-        ceil_ratio = quotient + 1
-        num_full_slots, left_over = np.divmod(original_size, ceil_ratio)
-        num_or_size_splits = [ceil_ratio] * num_full_slots + [left_over]
-        if len(num_or_size_splits) < dim:
-          num_or_size_splits += [0] * (dim - len(num_or_size_splits))
-        new_output = []
-        for x in output:
-          new_output.append(
-              array_ops.split(
-                  x, num_or_size_splits=num_or_size_splits, axis=axis))
-        output = new_output
-      else:
-        output = [array_ops.split(x, dim, axis=axis) for x in output]
-      output = nest.flatten(output)
-    return output
-
-  def _tag_sharding_attribute_for_dequeued_tensor(self, tensor, dims):
-    """Tags appropriate XLA sharding attribute to the dequeued tensor.
-
-    Args:
-      tensor: The dequeued tensor on TPU.
-      dims: A list of integer describes how the tensor is partitioned.
-
-    Returns:
-      The same tensor with the xla_sharding attribute.
-    """
-    if dims is None:
-      return xla_sharding.replicate(tensor)
-    elif np.prod(dims) == 1:
-      return xla_sharding.assign_device(tensor, 0)
-    else:
-      tile_assignment = np.arange(np.prod(dims)).reshape(dims)
-      return xla_sharding.tile(
-          tensor=tensor,
-          tile_assignment=tile_assignment)
-
-  def _tag_sharding_attribute_for_dequeued_tensors(self, dequeues, dims):
-    """Tags appropriate XLA sharding attribute to the dequeued tensors.
-
-    Args:
-      dequeues: A list of dequeued tensors on TPU.
-      dims: A list of integer describes how the tensor is partitioned.
-
-    Returns:
-      The same dequeues with appropriate xla_sharding attribute.
-    """
-    nest.assert_shallow_structure(dequeues, dims)
-    return nest.map_structure_up_to(
-        dequeues, self._tag_sharding_attribute_for_dequeued_tensor, dequeues,
-        dims)
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.tpu_feed import *
+# used by tests
+from tensorflow.python.tpu.tpu_feed import _PartitionedInfeedQueue
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_function.py b/tensorflow/contrib/tpu/python/tpu/tpu_function.py
index 84d5967ea547f0c036f7c9aa936ac0c99c141304..f2755c6979c2e49dbc19b6800462949601811496 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_function.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_function.py
@@ -1,57 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Helper library for functions used during TPU compilation."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-
-
-class TpuContext(object):
-  """A context object holding state about the TPU computation being built."""
-
-  def __init__(self):
-    """Creates a new TpuContext."""
-    self._number_of_shards = None
-
-  @property
-  def number_of_shards(self):
-    return self._number_of_shards
-
-  def set_number_of_shards(self, number_of_shards):
-    self._number_of_shards = number_of_shards
-
-
-# The Tpu context holds the number of shards when a sharded computation is
-# being built, or None if no computation is being built.
-_current_tpu_context = TpuContext()
-
-
-@contextlib.contextmanager
-def tpu_shard_context(number_of_shards):
-  if _current_tpu_context.number_of_shards is not None:
-    raise NotImplementedError("tpu_shard_context cannot be nested.")
-  try:
-    _current_tpu_context.set_number_of_shards(number_of_shards)
-    yield
-  finally:
-    _current_tpu_context.set_number_of_shards(None)
-
-
-def get_tpu_context():
-  return _current_tpu_context
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_function import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py b/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
index 1e11de6421e360faf0b9ad573a84f9aecdf9c98f..ca58e78d7b342c7ca70400652d99092ccbecbbde 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
@@ -1,203 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Optimizer that implements cross-shard gradient reduction for TPU."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-from tensorflow.python.framework import ops
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import optimizer
-
-
-class CrossShardOptimizer(optimizer.Optimizer):
-  """An optimizer that averages gradients across TPU shards."""
-
-  def __init__(self,
-               opt,
-               reduction=losses.Reduction.MEAN,
-               name="CrossShardOptimizer",
-               group_assignment=None):
-    """Construct a new cross-shard optimizer.
-
-    Args:
-      opt: An existing `Optimizer` to encapsulate.
-      reduction: The reduction to apply to the shard losses.
-      name: Optional name prefix for the operations created when applying
-        gradients. Defaults to "CrossShardOptimizer".
-      group_assignment: Optional 2d int32 lists with shape
-        [num_groups, num_replicas_per_group] which describles how to apply
-        optimizer to subgroups.
-
-    Raises:
-      ValueError: If reduction is not a valid cross-shard reduction.
-    """
-    if reduction not in (losses.Reduction.SUM, losses.Reduction.MEAN):
-      raise ValueError("Unsupported reduction: %s." % reduction)
-
-    super(CrossShardOptimizer, self).__init__(False, name)
-    self._opt = opt
-    self._reduction = reduction
-    self._group_assignment = group_assignment
-
-  def _verify_and_get_subgroup_size(self, group_assignment, num_shards):
-    """Verify group_assignment and get the subgroup size".
-
-    Args:
-      group_assignment: list of group ids for applying the optimizer
-        to subgroups.
-      num_shards: The number of TPU shards.
-
-    Returns:
-      The size of one subgroup in group_assignment.
-
-    Raises:
-      ValueError: If group_assignment is invalid.
-    """
-    if not group_assignment:
-      return None
-    if not (isinstance(group_assignment, list) and
-            all(isinstance(i, list) for i in group_assignment)):
-      raise ValueError("group_assignment must be a list of list. Got {}".format(
-          group_assignment))
-
-    replica_ids = set()
-    for g in group_assignment:
-      for i in g:
-        replica_ids.add(i)
-
-    if set(range(num_shards)) != replica_ids:
-      raise ValueError("group_assignment must be a permutation of range({0})."
-                       " Got group_assignment={1}".format(
-                           num_shards, group_assignment))
-
-    subgroup_size_list = [len(group) for group in group_assignment]
-    if all(subgroup_size_list[0] == size for size in subgroup_size_list):
-      return subgroup_size_list[0]
-    else:
-      raise ValueError("The size of each subgroup in group_assignment must "
-                       "be equal. Got group_assignment={}".format(
-                           self._group_assignment))
-
-  def compute_gradients(self, loss, var_list=None, **kwargs):
-    """Compute gradients of "loss" for the variables in "var_list".
-
-    This simply wraps the compute_gradients() from the real optimizer. The
-    gradients will be aggregated in the apply_gradients() so that user can
-    modify the gradients like clipping with per replica global norm if needed.
-    The global norm with aggregated gradients can be bad as one replica's huge
-    gradients can hurt the gradients from other replicas.
-
-    Args:
-      loss: A Tensor containing the value to minimize.
-      var_list: Optional list or tuple of `tf.Variable` to update to minimize
-        `loss`.  Defaults to the list of variables collected in the graph
-        under the key `GraphKey.TRAINABLE_VARIABLES`.
-      **kwargs: Keyword arguments for compute_gradients().
-
-    Returns:
-      A list of (gradient, variable) pairs.
-
-    Raises:
-      ValueError: If not within a tpu_shard_context or group_assignment is
-        invalid.
-    """
-    num_shards = tpu_function.get_tpu_context().number_of_shards
-    if num_shards is None:
-      logging.warning(
-          "CrossShardOptimizer should be used within a tpu_shard_context, but "
-          "got unset number_of_shards. Assuming 1.")
-      num_shards = 1
-
-    subgroup_size = self._verify_and_get_subgroup_size(self._group_assignment,
-                                                       num_shards)
-
-    if num_shards > 1 and self._reduction == losses.Reduction.MEAN:
-      if self._group_assignment:
-        scale = 1.0 / subgroup_size
-      else:
-        scale = 1.0 / num_shards
-      loss *= scale
-
-    return self._opt.compute_gradients(loss, var_list=var_list, **kwargs)
-
-  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
-    """Apply gradients to variables.
-
-    Calls tpu_ops.cross_replica_sum() to sum gradient contributions across
-    replicas, and then applies the real optimizer.
-
-    Args:
-      grads_and_vars: List of (gradient, variable) pairs as returned by
-        compute_gradients().
-      global_step: Optional Variable to increment by one after the
-        variables have been updated.
-      name: Optional name for the returned operation.  Default to the
-        name passed to the Optimizer constructor.
-
-    Returns:
-      An `Operation` that applies the gradients. If `global_step` was not None,
-      that operation also increments `global_step`.
-
-    Raises:
-      ValueError: If the grads_and_vars is malformed.
-    """
-    summed_grads_and_vars = []
-    for (grad, var) in grads_and_vars:
-      if grad is None:
-        summed_grads_and_vars.append((grad, var))
-      else:
-        with ops.colocate_with(grad):
-          summed_grads_and_vars.append((tpu_ops.cross_replica_sum(
-              grad, self._group_assignment), var))
-    return self._opt.apply_gradients(summed_grads_and_vars, global_step, name)
-
-  def get_slot(self, *args, **kwargs):
-    """Return a slot named "name" created for "var" by the Optimizer.
-
-    This simply wraps the get_slot() from the actual optimizer.
-
-    Args:
-      *args: Arguments for get_slot().
-      **kwargs: Keyword arguments for get_slot().
-
-    Returns:
-      The `Variable` for the slot if it was created, `None` otherwise.
-    """
-    return self._opt.get_slot(*args, **kwargs)
-
-  def get_slot_names(self, *args, **kwargs):
-    """Return a list of the names of slots created by the `Optimizer`.
-
-    This simply wraps the get_slot_names() from the actual optimizer.
-
-    Args:
-      *args: Arguments for get_slot().
-      **kwargs: Keyword arguments for get_slot().
-
-    Returns:
-      A list of strings.
-    """
-    return self._opt.get_slot_names(*args, **kwargs)
-
-  def variables(self):
-    """Forwarding the variables from the underlying optimizer."""
-    return self._opt.variables()
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_optimizer import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py b/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
index f5af03f33ca8f13af517007672e9ce0e12be6205..93c52335a582e5fa83092f78212ca268079b7c12 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
@@ -1,253 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Helper library for sharding during TPU compilation."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.python.framework import tensor_shape
-
-_DEFAULT_NUMBER_OF_SHARDS = 1
-_DEFAULT_SHARD_DIMENSION = 0
-
-
-# TODO(b/36777903) change other parts of tpu.py to use this class.
-class ShardingPolicy(object):
-  """An object use to hold the sharding policy for a Tensor.
-  """
-
-  def __init__(self):
-    self._number_of_shards = None
-    self._shard_dimension = None
-    self._frozen = False
-
-  def __str__(self):
-    if self.number_of_shards is None or self.shard_dimension is None:
-      return "ShardingPolicy(unset)"
-    else:
-      return ("ShardingPolicy(%d shards dimension %d)" %
-              (self.number_of_shards, self.shard_dimension))
-
-  def _fill_default_values(self):
-    if self._number_of_shards is None:
-      self._number_of_shards = _DEFAULT_NUMBER_OF_SHARDS
-    if self._shard_dimension is None:
-      self._shard_dimension = tensor_shape.as_dimension(
-          _DEFAULT_SHARD_DIMENSION)
-
-  def freeze(self):
-    """Prevents further modification to the sharding policy.
-
-    Any values that have not been set when freeze is called are set to
-    defaults. If the ShardingPolicy is already frozen, this is a NoOp.
-    """
-    if not self._frozen:
-      self._fill_default_values()
-      self._frozen = True
-
-  @property
-  def number_of_shards(self):
-    """Returns the number of shards in the policy or None if unspecified."""
-    return self._number_of_shards
-
-  def set_number_of_shards(self, number_of_shards):
-    """Sets the number of shards for the current policy.
-
-    If the policy has been frozen then number_of_shards must match the
-    existing setting.
-
-    Args:
-      number_of_shards: The number of shards to use in the policy.
-
-    Raises:
-      ValueError: If the policy has been frozen and number_of_shards
-        differs from the frozen value; or number_of_shards <= 0.
-    """
-    if self._frozen:
-      if self._number_of_shards != number_of_shards:
-        raise ValueError(
-            "Can't set sharding policy to use %d shards since it has been "
-            "frozen to use %d." % (number_of_shards, self._number_of_shards))
-    else:
-      if number_of_shards > 0:
-        self._number_of_shards = number_of_shards
-      else:
-        raise ValueError(
-            "Can't set sharding policy to use %s shards; value must be >0",
-            str(number_of_shards))
-
-  @property
-  def shard_dimension(self):
-    """Returns the shard dimension of the policy or None if unspecified."""
-    return self._shard_dimension
-
-  def set_shard_dimension(self, shard_dimension):
-    """Sets the shard dimension for the current policy.
-
-    If the policy has been frozen then shard_dimension must match the
-    existing setting.
-
-    Args:
-      shard_dimension: The shard dimension to use in the policy.
-
-    Raises:
-      ValueError: If the policy has been frozen and shard_dimension
-        differs from the frozen value, or shard_dimension can't be
-        interpreted as a Dimension.
-    """
-    if self._frozen:
-      if self._shard_dimension != shard_dimension:
-        raise ValueError(
-            "Can't set shard dimension to %d since it has been frozen to "
-            "use %d." % (shard_dimension, self._shard_dimension))
-    else:
-      self._shard_dimension = tensor_shape.as_dimension(shard_dimension)
-
-  def merge(self, other):
-    """Merges the policy of another policy into the current policy.
-
-    Args:
-      other: The policy to merge into this one.
-
-    Raises:
-      ValueError: If this policy has been frozen and the merge conflicts with
-      the frozen policy.
-    """
-    if other.number_of_shards is not None:
-      self.set_number_of_shards(other.number_of_shards)
-    if other.shard_dimension is not None:
-      self.set_shard_dimension(other.shard_dimension)
-
-  def get_sharded_shape(self, shape, shard_index=None):
-    """Returns the shape of a shard of a full Tensor.
-
-    When given the shape of a 'full-size' Tensor, returns the shape of
-    the sub-Tensor after it has been sharded. Freezes the policy if it
-    has not yet been frozen.
-
-    Args:
-      shape: The shape of the full-size Tensor to be sharded.
-      shard_index: The index of the shard whose shape should be returned.
-        shard_index can be None for sharding policies that use the same
-        shape for every shard.
-      freeze_config:
-
-    Returns:
-      The shape of the sharded version of the Tensor.
-
-    Raises:
-      ValueError: If shard_index is None when shards are of different
-        shapes; or shard_index is not None and
-        !(0<=shard_index<number_of_shards); or shape does not have at
-        least self.shard_dimension+1 dimensions; or the value of
-        shape's shard dimension is not a multiple of
-        self.number_of_shards
-    """
-    if self._shard_dimension is None or self._number_of_shards is None:
-      # Don't raise an error if the config is unset.
-      return None
-    if shard_index is not None:
-      if shard_index < 0 or shard_index >= self.number_of_shards:
-        raise ValueError("shard_index %d, but must be in [0,%d)." %
-                         (shard_index, self._number_of_shards))
-    shape = tensor_shape.as_shape(shape)
-    if self._number_of_shards == 1:
-      # Don't do anything when there's only one shard.
-      return shape
-    ndims = shape.ndims
-    if ndims is None:
-      raise ValueError("shape must be a specified shape not Unknown")
-    if ndims <= self._shard_dimension:
-      raise ValueError("shape %s does not contain shard_dimension %d" %
-                       (shape.as_list(), self._shard_dimension))
-    dims = shape.as_list()
-    if dims[self._shard_dimension] is None:
-      raise ValueError("shape %s must have a fixed size for dimension %d "
-                       "that is known at graph construction time." %
-                       (shape.as_list(), self._shard_dimension))
-    if (dims[self._shard_dimension] % self._number_of_shards) != 0:
-      raise ValueError("shape %s cannot be sharded %d ways along dimension %d" %
-                       (shape.as_list(), self._number_of_shards,
-                        self._shard_dimension))
-    dims[self._shard_dimension] /= self._number_of_shards
-    return tensor_shape.as_shape(dims)
-
-  def _unshard_shape(self, shape):
-    """Return the unsharded shape that would generate a given sharded shape.
-
-    Args:
-      shape: the sharded shape to unshard
-
-    Returns:
-      The unsharded shape.
-
-    Raises:
-      ValueError: if shape is unknown or does not contain
-        self.shard_dimension
-      TypeError: if shape is not convertible to a TensorShape
-    """
-    shape = tensor_shape.as_shape(shape)
-    if self._number_of_shards == 1:
-      # Don't do anything when there's only one shard.
-      return shape
-    ndims = shape.ndims
-    if ndims is None:
-      raise ValueError("shape must be a specified shape not Unknown")
-    if ndims <= self._shard_dimension:
-      raise ValueError("shape %s does not contain shard_dimension %d" %
-                       (shape.as_list(), self._shard_dimension))
-    dims = shape.as_list()
-    dims[self._shard_dimension] *= self._number_of_shards
-    return tensor_shape.as_shape(dims)
-
-  def get_unsharded_shape(self, shapes):
-    """Returns the shape of an unsharded Tensor given a list of shards.
-
-    When given a list of shapes of shards, returns the shape of the
-    unsharded Tensor that would generate the shards. Sets defaults for the
-    policy if number_of_shards or shard_dimension is None.
-
-    Args:
-      shapes: The shapes of the Tensor shards to be combined.
-
-    Returns:
-      The shape of the unsharded version of the Tensor.
-
-    Raises:
-      ValueError: if shapes is not a list of length
-        self.number_of_shards; or any element of shapes is not a valid
-        shape consistent with the sharding policy; or the list of
-        shapes is not a valid sharding of a full shape.
-      TypeError: if an element of shapes is not convertible to a
-        TensorShape
-    """
-    self._fill_default_values()
-    if len(shapes) != self.number_of_shards:
-      raise ValueError(
-          "shapes is %s but must be a list of length number_of_shards=%d" % (
-              str(shapes), self.number_of_shards))
-    unsharded_shapes = [self._unshard_shape(s) for s in shapes]
-    for i in xrange(self.number_of_shards - 1):
-      if not unsharded_shapes[i].is_compatible_with(
-          unsharded_shapes[self.number_of_shards - 1]):
-        raise ValueError(
-            "sharded shapes %s are not consistent shards of a full shape "
-            "sharded %d ways along dimension %d" % (
-                str(shapes), self.number_of_shards, self.shard_dimension))
-    return unsharded_shapes[0]
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.tpu_sharding import *
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
index d66ecfcf4a56b8da1c2d2f518bebe4baa76b315e..258d34ddaf5250e49c5a354caf018e4b64abae62 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
@@ -1,156 +1,25 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""TPU system metadata and associated tooling."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import re
-
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session as session_lib
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import tf_logging as logging
-
-_PINGING_MASTER_TIMEOUT_IN_MS = 60 * 1000  # 1 min
-_RETRY_TIMES = 120
-_INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS = 300 * 1000  # 5 mins
-
-_TPU_DEVICE_REG = re.compile(r'.*task:(\d+)/.*device:TPU:(\d+)$')
-
-# _TPUSystemMetadata is used by TPUEstimator to hold TPU configuration,
-# including num_cores and num_hosts.
-_TPUSystemMetadata = collections.namedtuple('_TPUSystemMetadata', [
-    'num_cores',
-    'num_hosts',
-    'num_of_cores_per_host',
-    'topology',
-    'devices',
-])
-
-
-def _query_tpu_system_metadata(master_address, cluster_def=None,
-                               query_topology=False):
-  """Automatically detects the TPU system metadata in the system."""
-  tpu_core_count = 0
-  devices = []
-  device_dict = collections.defaultdict(list)
-
-  # TODO(b/120564445): Replace with standard library for retries.
-  retry_count = 1
-  while True:
-    logging.info('Querying Tensorflow master (%s) for TPU system metadata.',
-                 master_address)
-    try:
-      with ops.Graph().as_default():
-        with session_lib.Session(
-            master_address,
-            config=get_session_config_with_timeout(
-                _PINGING_MASTER_TIMEOUT_IN_MS,
-                cluster_def)) as sess:
-          devices = sess.list_devices()
-          for device in devices:
-            match = _TPU_DEVICE_REG.match(device.name)
-            if match:
-              host_id = match.group(1)
-              core_id = match.group(2)
-              device_dict[host_id].append(core_id)
-              tpu_core_count += 1
-          break
-    except errors.DeadlineExceededError:
-      msg = ('Failed to connect to the Tensorflow master. The TPU worker may '
-             'not be ready (still scheduling) or the Tensorflow master address '
-             'is incorrect: got (%s).' %
-             (master_address))
-
-      # TODO(xiejw): For local or grpc master we might not need retry logic
-      # here.
-      if retry_count <= _RETRY_TIMES:
-        logging.warning('%s', msg)
-        logging.warning('Retrying (%d/%d).', retry_count, _RETRY_TIMES)
-        retry_count += 1
-      else:
-        raise ValueError(msg)
-
-  num_of_cores_per_host = 0
-  if tpu_core_count:
-    num_cores_per_host_set = set(
-        [len(core_ids) for core_ids in device_dict.values()])
-    if len(num_cores_per_host_set) != 1:
-      raise RuntimeError(
-          'TPU cores on each host is not same. This should not happen!. '
-          'devices: {}'.format(devices))
-    num_of_cores_per_host = num_cores_per_host_set.pop()
-
-  topology = None
-  if query_topology:
-    if not tpu_core_count:
-      raise RuntimeError(
-          'Cannot find any TPU cores in the system (master address {}). '
-          'This usually means the master address is incorrect or the '
-          'TPU worker has some problems. Available devices: {}'.format(
-              master_address, devices))
-
-    topology = _obtain_topology(master_address, cluster_def)
-
-  metadata = _TPUSystemMetadata(
-      num_cores=tpu_core_count,
-      num_hosts=len(device_dict),
-      num_of_cores_per_host=num_of_cores_per_host,
-      topology=topology,
-      devices=devices)
-
-  if tpu_core_count:
-    logging.info('Found TPU system:')
-    logging.info('*** Num TPU Cores: %d', metadata.num_cores)
-    logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
-    logging.info('*** Num TPU Cores Per Worker: %d',
-                 metadata.num_of_cores_per_host)
-    for device in metadata.devices:
-      logging.info('*** Available Device: %s', device)
-  else:
-    logging.info('Failed to find TPU: %s', metadata)
-  return metadata
-
-
-def _obtain_topology(master_address, cluster_def):
-  """Obtains TPU fabric topology."""
-  try:
-    logging.info('Initializing TPU system (master: %s) to fetch topology '
-                 'for model parallelism. This might take a while.',
-                 master_address)
-    with ops.Graph().as_default():
-      session_config = get_session_config_with_timeout(
-          _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, cluster_def)
-      with session_lib.Session(
-          master_address, config=session_config) as sess:
-        topology = sess.run(tpu.initialize_system())
-        return topology
-  except errors.DeadlineExceededError:
-    raise ValueError(
-        'Fail to initialize TPU system with master (%s). '
-        'Please double check the TPU system is functional.' % (
-            master_address))
-
-
-def get_session_config_with_timeout(timeout_in_secs, cluster_def):
-  """Returns a session given a timeout and a cluster configuration."""
-  config = config_pb2.ConfigProto(
-      operation_timeout_in_ms=timeout_in_secs, cluster_def=cluster_def)
-  return config
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_system_metadata import *
+# used by tests
+from tensorflow.python.tpu.tpu_system_metadata import _query_tpu_system_metadata
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/training_loop.py b/tensorflow/contrib/tpu/python/tpu/training_loop.py
index 0187b4bec6ecc55943bf48b9268a74e18ea5b488..673359b232d6857d468723873c449cb3e48168c7 100644
--- a/tensorflow/contrib/tpu/python/tpu/training_loop.py
+++ b/tensorflow/contrib/tpu/python/tpu/training_loop.py
@@ -1,214 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Library for constructing a training loop, suitable for TPUs."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.compiler import xla
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-
-
-def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
-  """Builds a training loop for TPUs.
-
-  The set of loop-carried tensors corresponds to `inputs`.  Both
-  `condition` and `body` take the current value of the loop-carried
-  tensors. 'body' additionally takes a tuple of infeed from
-  infeed_queue if infeed_queue is not None. `condition` must return a
-  single boolean value that determines whether iteration
-  continues. `body` must return an updated list of values for the
-  loop-carried tensors.
-
-  Args:
-    condition: a Python function that builds the loop condition.
-    body: a Python function that builds the loop body.
-    inputs: a list of initial values passed into the training loop, or
-      None (equivalent to an empty list).
-    infeed_queue: if not None, the infeed queue from which to append a tuple
-      of arguments as inputs to condition.
-    name: (Deprecated) Does nothing.
-
-  Returns:
-    The final values of the loop-carried tensors.
-
-  Raises:
-    TypeError: if body or condition has the wrong signature.
-  """
-  del name
-  # Converts inputs to Tensors.
-  inputs = [] if inputs is None else [ops.convert_to_tensor(x) for
-                                      x in inputs]
-  input_types = [x.dtype for x in inputs]
-  input_arity = len(inputs)
-
-  body_arg_error = xla.check_function_argument_count(
-      body, input_arity, infeed_queue)
-  if body_arg_error is not None:
-    if infeed_queue is None:
-      raise TypeError(
-          "Supplied loop body function cannot be called with the specified "
-          "inputs. You specified %d inputs: %s, but the loop body needs %s" % (
-              input_arity, str([i.name for i in inputs]), body_arg_error))
-    else:
-      raise TypeError(
-          "Supplied loop body function cannot be called with the specified "
-          "inputs. You specified %d inputs: %s and %d additional inputs from "
-          "infeed, but the computation needs %s" % (input_arity, str(
-              [i.name for i in inputs]), infeed_queue.number_of_tuple_elements,
-                                                    body_arg_error))
-  condition_arg_error = xla.check_function_argument_count(
-      condition, input_arity, None)
-  if condition_arg_error is not None:
-    if infeed_queue is None:
-      raise TypeError(
-          "Supplied loop condition function cannot be called with the "
-          "specified inputs. You specified %d inputs: %s, but the loop "
-          "condition needs %s" % (input_arity, str([i.name for i in inputs]),
-                                  condition_arg_error))
-    else:
-      raise TypeError(
-          "Supplied loop condition function cannot be called with the "
-          "specified inputs. You specified %d inputs: %s, but the loop "
-          "condition needs %s. Note that infeed is not passed to the loop "
-          "condition." % (input_arity, str([i.name for i in inputs]),
-                          condition_arg_error))
-
-  def condition_wrapper(*inputs):
-    # Discards the dummy output added for arity-0 loops.
-    if input_arity == 0:
-      inputs = []
-    return condition(*inputs)
-
-  def body_wrapper(*inputs):
-    """Wrapper around `body` that handles infeed queues and control deps."""
-    inputs = list(inputs)
-
-    # Discards the dummy output added for arity-0 loops.
-    if input_arity == 0:
-      inputs = []
-
-    # Runs `body` with the dequeue_ops appended.
-    if infeed_queue:
-      number_of_shards = tpu_function.get_tpu_context().number_of_shards
-      if number_of_shards is None:
-        raise ValueError("Can't build training loop with infeed when there is "
-                         "no tpu_shard_context. Are you building a loop or "
-                         "graph directly rather than from inside tpu.rewrite, "
-                         "tpu.batch_parallel, tpu.shard, or tpu.replicate?")
-      infeed_queue.set_number_of_shards(number_of_shards)
-      dequeue_ops = [d for d in infeed_queue.generate_dequeue_op()]
-    else:
-      dequeue_ops = []
-    outputs = body(*(inputs + dequeue_ops))
-
-    # If the computation only returned one value, make it a tuple.
-    if not isinstance(outputs, (list, tuple)):
-      outputs = (outputs,)
-
-    outputs = [
-        o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
-        for o in outputs
-    ]
-
-    # Separates the returned Operations and Tensors.
-    output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
-    output_tensors = [o for o in outputs
-                      if not isinstance(o, ops.Operation)]
-
-    if outputs != output_tensors + output_operations:
-      raise ValueError(
-          "TPU training loop body must return zero or more Tensor values "
-          "followed by zero or more Operations.")
-
-    output_types = [op.dtype for op in output_tensors]
-    if input_types != output_types:
-      raise TypeError(
-          "Mismatch between input types and output types for training loop "
-          "body: {} vs {}".format(input_types, output_types))
-
-    # Add the dequeue operations to output_operations to ensure they are run
-    # by the loop, even if the programmer's loop body does not use them.
-    output_operations += dequeue_ops
-
-    # Add a dummy output, if needed.
-    if not output_tensors:
-      output_tensors = array_ops.constant(0)
-
-    if output_operations:
-      # TODO(phawkins): in principle this is too restrictive since it serializes
-      # the training loop steps. In practice it does not matter since this loop
-      # will be compiled by XLA.
-      return control_flow_ops.tuple(output_tensors,
-                                    control_inputs=output_operations)
-    else:
-      return output_tensors
-
-  # If the body has arity 0, add a dummy loop-carried value to which we can add
-  # control dependencies from any side-effecting operations.
-  if input_arity == 0:
-    inputs = [array_ops.constant(0)]
-  return control_flow_ops.while_loop(
-      condition_wrapper, body_wrapper, inputs, name="", parallel_iterations=1)
-
-
-def repeat(n, body, inputs=None, infeed_queue=None, name=None):
-  """Builds a training loop that executes a fixed number of iterations.
-
-  The set of loop-carried tensors correspond to `inputs`.
-  `body` must be a function that takes and returns the values of the
-  loop-carried tensors.
-
-  Args:
-    n: the number of loop iterations
-    body: a Python function that builds the loop body.
-    inputs: a list of initial values passed into the training loop or
-      None (equivalent to an empty list).
-    infeed_queue: if not None, the infeed queue from which to append a tuple
-      of arguments as inputs to condition.
-    name: (Deprecated) Does nothing.
-  Returns:
-    The final values of the loop-carried tensors.
-  Raises:
-    ValueError: if there is a type error.
-  """
-  def _convert_to_list(xs):
-    if not isinstance(xs, (list, tuple)):
-      return [xs]
-    else:
-      return list(xs)
-
-  def cond(i, *args):
-    del args
-    return i < n
-
-  def body_wrapper(i, *args):
-    return [i + 1] + _convert_to_list(body(*args))
-
-  inputs = [0] if inputs is None else [0] + _convert_to_list(inputs)
-  outputs = while_loop(
-      cond, body_wrapper, inputs=inputs, infeed_queue=infeed_queue, name=name)
-  outputs = _convert_to_list(outputs)
-  if len(outputs) == 1:
-    # Returns the Op rather than an empty list.
-    return outputs[0].op
-  else:
-    return outputs[1:]
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.training_loop import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/util.py b/tensorflow/contrib/tpu/python/tpu/util.py
index dfb8ce1d1821da05c853bb0d10b1db3a857ccb1b..8d9b70d46eb42c9a525eeafc51d07f0ad4241d52 100644
--- a/tensorflow/contrib/tpu/python/tpu/util.py
+++ b/tensorflow/contrib/tpu/python/tpu/util.py
@@ -1,51 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-
-"""Utilities for the functionalities."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-import six
-
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import training
-
-def check_positive_integer(value, name):
-  """Checks whether `value` is a positive integer."""
-  if not isinstance(value, six.integer_types):
-    raise TypeError('{} must be int, got {}'.format(name, type(value)))
-
-  if value <= 0:
-    raise ValueError('{} must be positive, got {}'.format(name, value))
-
-
-# TODO(b/118302029) Remove this copy of MultiHostDatasetInitializerHook after we
-# release a tensorflow_estimator with MultiHostDatasetInitializerHook in
-# python/estimator/util.py.
-class MultiHostDatasetInitializerHook(training.SessionRunHook):
-  """Creates a SessionRunHook that initializes all passed iterators."""
-
-  def __init__(self, dataset_initializers):
-    self._initializers = dataset_initializers
-
-  def after_create_session(self, session, coord):
-    del coord
-    start = time.time()
-    session.run(self._initializers)
-    logging.info('Initialized dataset iterators in %d seconds',
-                 time.time() - start)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.util import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/training/python/training/training.py b/tensorflow/contrib/training/python/training/training.py
index 093765dc2098d2135a1d86aa44b23c13546267ee..4ceb6e9350f5167efc8f7266d4e748cc6fa4ffd6 100644
--- a/tensorflow/contrib/training/python/training/training.py
+++ b/tensorflow/contrib/training/python/training/training.py
@@ -432,7 +432,7 @@ def create_train_op(total_loss,
   else:
     # Make sure that variables_to_train are in tf.trainable_variables()
     for v in variables_to_train:
-      assert v in tf_variables.trainable_variables()
+      assert v.trainable or v in tf_variables.trainable_variables()
 
   assert variables_to_train
 
diff --git a/tensorflow/contrib/util/BUILD b/tensorflow/contrib/util/BUILD
index 07dbd5ca8d65ec8232d33c016a7369c68a4c9e1f..ada08f95ae46ea06b3896ca3b1603277d62bf6fc 100644
--- a/tensorflow/contrib/util/BUILD
+++ b/tensorflow/contrib/util/BUILD
@@ -22,7 +22,9 @@ cc_library(
         "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core/kernels:immutable_constant_op",
     ],
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 3d92a836d1c21845407ec53bd46a24638e158e3b..06c108b38fbf1d4b796c313ce700332803c73ef9 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -77,6 +77,7 @@ load(
     "//tensorflow:tensorflow.bzl",
     "cc_header_only_library",
     "if_android",
+    "if_emscripten",
     "if_ios",
     "if_linux_x86_64",
     "if_mobile",
@@ -87,10 +88,12 @@ load(
     "tf_copts",
     "tf_cuda_library",
     "tf_features_nomodules_if_android",
+    "tf_features_nomodules_if_emscripten",
     "tf_gen_op_libs",
     "tf_generate_proto_text_sources",
     "tf_genrule_cmd_append_to_srcs",
     "tf_opts_nortti_if_android",
+    "tf_opts_nortti_if_emscripten",
     "transitive_hdrs",
 )
 load("//tensorflow:tensorflow.bzl", "tf_cc_test_mkl")
@@ -125,7 +128,6 @@ load(
     "tf_additional_libdevice_srcs",
     "tf_additional_minimal_lib_srcs",
     "tf_additional_mpi_lib_defines",
-    "tf_additional_proto_compiler_hdrs",
     "tf_additional_proto_hdrs",
     "tf_additional_proto_srcs",
     "tf_additional_test_deps",
@@ -144,6 +146,7 @@ load(
     "tf_protos_grappler",
     "tf_protos_grappler_impl",
     "tf_pyclif_proto_library",
+    "tf_grpc_service_all",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -226,13 +229,15 @@ CORE_PROTO_SRCS = COMMON_PROTO_SRCS + ERROR_CODES_PROTO_SRCS
 # ones with individual proto_library targets.
 ADDITIONAL_CORE_PROTO_SRCS = [
     "example/example_parser_configuration.proto",
-    "protobuf/checkpointable_object_graph.proto",
+    "protobuf/trackable_object_graph.proto",
     "protobuf/control_flow.proto",
     # TODO(ebrevdo): Re-enable once CriticalSection is in core.
     # "protobuf/critical_section.proto",
     "protobuf/meta_graph.proto",
     "protobuf/named_tensor.proto",
     "protobuf/saved_model.proto",
+    "protobuf/saved_object_graph.proto",
+    "protobuf/struct.proto",
     "protobuf/tensorflow_server.proto",
     "protobuf/transport_options.proto",
     "util/test_log.proto",
@@ -415,9 +420,8 @@ cc_library(
     name = "platform_protobuf",
     srcs = tf_platform_hdrs([
         "protobuf.h",
-    ]) + tf_platform_srcs([
-        "protobuf.cc",
     ]) + [
+        "platform/protobuf.cc",
         "platform/protobuf_util.cc",
         "lib/core/status.h",
     ],
@@ -436,6 +440,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "grpc_services",
+    srcs = [],
+    hdrs = [
+        "platform/grpc_services.h",
+    ],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = tf_grpc_service_all(),
+)
+
 cc_library(
     name = "human_readable_json",
     srcs = tf_platform_srcs(["human_readable_json.cc"]),
@@ -454,10 +469,7 @@ cc_library(
     hdrs = ["platform/logger.h"],
     copts = tf_copts(),
     visibility = ["//visibility:public"],
-    deps = [
-        ":lib_proto_parsing",
-        "@protobuf_archive//:protobuf",
-    ],
+    deps = [":lib_proto_parsing"],
 )
 
 filegroup(
@@ -664,7 +676,7 @@ cc_library(
     name = "lib_proto_compiler",
     hdrs = [
         "platform/protobuf_compiler.h",
-    ] + tf_additional_proto_compiler_hdrs(),
+    ],
     copts = tf_copts(),
     deps = tf_lib_proto_compiler_deps() + [
         ":lib_proto_parsing",
@@ -1049,13 +1061,13 @@ cc_library(
         "platform/default/integral_types.h",
         "platform/default/logging.h",
         "platform/default/mutex.h",
-        "platform/default/protobuf.h",
         "platform/default/thread_annotations.h",
         "platform/dynamic_annotations.h",
         "platform/macros.h",
         "platform/mutex.h",
         "platform/platform.h",
         "platform/prefetch.h",
+        "platform/protobuf.h",
         "platform/thread_annotations.h",
         "platform/types.h",
         "platform/cpu_info.h",
@@ -1141,6 +1153,13 @@ tf_gen_op_libs(
     deps = [":protos_all_cc"],
 )
 
+tf_gen_op_libs(
+    op_lib_names = [
+        "mkl_array_ops",
+    ],
+    deps = [":protos_all_cc"],
+)
+
 tf_gen_op_libs(
     op_lib_names = [
         "audio_ops",
@@ -1161,6 +1180,29 @@ tf_gen_op_libs(
     deps = [":lib"],
 )
 
+tf_gen_op_libs(
+    op_lib_names = [
+        "tpu_configuration_ops",
+        "tpu_cross_replica_ops",
+        "tpu_embedding_ops",
+        "tpu_functional_ops",
+        "tpu_heartbeat_ops",
+        "tpu_host_compute_ops",
+        "tpu_infeed_ops",
+        "tpu_outfeed_ops",
+        "tpu_ordinal_selector_ops",
+        "tpu_replication_ops",
+    ],
+    deps = [
+        ":lib",
+        ":lib_proto_parsing",
+        ":protos_all_cc",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_configuration_proto_cc",
+        "//tensorflow/core/tpu:tpu_embedding_optimization_parameters_utils",
+        "//tensorflow/core/tpu:tpu_embedding_output_layout_utils",
+    ],
+)
+
 # And one for all user ops
 cc_library(
     name = "user_ops_op_lib",
@@ -1277,10 +1319,23 @@ cc_library(
         ":state_ops_op_lib",
         ":stateless_random_ops_op_lib",
         ":string_ops_op_lib",
+        ":tpu_configuration_ops_op_lib",
+        ":tpu_cross_replica_ops_op_lib",
+        ":tpu_embedding_ops_op_lib",
+        ":tpu_functional_ops_op_lib",
+        ":tpu_heartbeat_ops_op_lib",
+        ":tpu_host_compute_ops_op_lib",
+        ":tpu_infeed_ops_op_lib",
+        ":tpu_outfeed_ops_op_lib",
+        ":tpu_ordinal_selector_ops_op_lib",
+        ":tpu_replication_ops_op_lib",
         ":training_ops_op_lib",
         ":user_ops_op_lib",
         ":word2vec_ops",
-    ] + if_mkl([":mkl_nn_ops_op_lib"]) + tf_additional_cloud_op_deps(),
+    ] + if_mkl([
+        ":mkl_array_ops_op_lib",
+        ":mkl_nn_ops_op_lib",
+    ]) + tf_additional_cloud_op_deps(),
     alwayslink = 1,
 )
 
@@ -1382,7 +1437,7 @@ cc_library(
 # This includes implementations of all kernels built into TensorFlow.
 cc_library(
     name = "all_kernels_impl",
-    visibility = ["//visibility:private"],
+    visibility = ["//tensorflow/core:__subpackages__"],
     deps = [
         "//tensorflow/core/kernels:array",
         "//tensorflow/core/kernels:audio",
@@ -1452,6 +1507,7 @@ cc_library(
         "//tensorflow/core/kernels:mkl_identity_op",
         "//tensorflow/core/kernels:mkl_input_conversion_op",
         "//tensorflow/core/kernels:mkl_lrn_op",
+        "//tensorflow/core/kernels:mkl_requantize_ops",
         "//tensorflow/core/kernels:mkl_pooling_ops",
         "//tensorflow/core/kernels:mkl_relu_op",
         "//tensorflow/core/kernels:mkl_reshape_op",
@@ -1540,6 +1596,7 @@ cc_library(
         ":framework_internal",
         ":lib",
         ":lib_internal",
+        ":ops",
         ":protos_all_cc",
         ":shape_inference_testutil",
         ":tensor_testutil",
@@ -1770,6 +1827,29 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "emscripten_tensorflow_lib_lite_nortti_lite_protos_no_runtime",
+    srcs = if_emscripten(["//tensorflow/core:mobile_srcs_no_runtime"]),
+    copts = ["-DSUPPORT_SELECTIVE_REGISTRATION"] + tf_opts_nortti_if_emscripten(),
+    defines = ["TENSORFLOW_LITE_PROTOS"],
+    linkopts = ["-lz"],
+    tags = [
+        "manual",
+        "notap",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":emscripten_proto_lib_no_rtti_lite_runtime",
+        ":mobile_additional_lib_deps",
+        ":stats_calculator_portable",
+        "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
+        "@nsync//:nsync_cpp",
+        "@zlib_archive//:zlib",
+    ],
+    alwayslink = 1,
+)
+
 # Native library support for iOS applications.
 #
 # bazel  build --config=ios_x86_64 \
@@ -1863,6 +1943,7 @@ filegroup(
             "**/*testutil*",
             "**/*testlib*",
             "**/*main.cc",
+            "**/tpu_*",
         ],
     ),
     visibility = ["//visibility:public"],
@@ -2248,6 +2329,7 @@ cc_library(
             "platform/**/logging.cc",
             "platform/**/human_readable_json.cc",
             "platform/abi.cc",
+            "platform/protobuf.cc",
         ],
     ) + tf_additional_lib_srcs(
         exclude = [
@@ -2274,6 +2356,8 @@ cc_library(
         ":lib_proto_parsing",
         ":abi",
         ":core_stringpiece",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
         "//third_party/eigen3",
         "//tensorflow/core/platform/default/build_config:platformlib",
         "@snappy",
@@ -2653,7 +2737,6 @@ tf_cuda_library(
             "example/**/*.cc",
             "framework/**/*.cc",
             "util/**/*.cc",
-        ] + [
             "graph/edgeset.cc",
             "graph/graph.cc",
             "graph/graph_def_builder.cc",
@@ -2898,6 +2981,7 @@ tf_cuda_library(
 
 CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/allocator_retry.h",
+    "common_runtime/shared_counter.h",
     "common_runtime/base_collective_executor.h",
     "common_runtime/bfc_allocator.h",
     "common_runtime/hierarchical_tree_broadcaster.h",
@@ -2922,6 +3006,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/lower_if_while.h",
     "common_runtime/lower_while_op.h",
     "common_runtime/memory_types.h",
+    "common_runtime/metrics.h",
     "common_runtime/mkl_cpu_allocator.h",
     "common_runtime/optimization_registry.h",
     "common_runtime/pending_counts.h",
@@ -2933,6 +3018,8 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/rendezvous_mgr.h",
     "common_runtime/rendezvous_util.h",
     "common_runtime/ring_reducer.h",
+    "common_runtime/ring_alg.h",
+    "common_runtime/ring_gatherer.h",
     "common_runtime/session_factory.h",
     "common_runtime/single_threaded_cpu_device.h",
     "common_runtime/stats_publisher_interface.h",
@@ -2957,6 +3044,8 @@ tf_cuda_library(
         "common_runtime/collective_param_resolver_local.cc",
         "common_runtime/collective_rma_local.cc",
         "common_runtime/collective_util.cc",
+        "common_runtime/colocation_graph.cc",
+        "common_runtime/colocation_graph.h",
         "common_runtime/constant_folding.cc",
         "common_runtime/copy_tensor.cc",
         "common_runtime/costmodel_manager.cc",
@@ -2977,6 +3066,7 @@ tf_cuda_library(
         "common_runtime/lower_if_while.cc",
         "common_runtime/lower_while_op.cc",
         "common_runtime/memory_types.cc",
+        "common_runtime/metrics.cc",
         "common_runtime/mkl_cpu_allocator.cc",
         "common_runtime/optimization_registry.cc",
         "common_runtime/parallel_concat_optimizer.cc",
@@ -2989,6 +3079,8 @@ tf_cuda_library(
         "common_runtime/renamed_device.cc",
         "common_runtime/rendezvous_mgr.cc",
         "common_runtime/rendezvous_util.cc",
+        "common_runtime/ring_alg.cc",
+        "common_runtime/ring_gatherer.cc",
         "common_runtime/ring_reducer.cc",
         "common_runtime/session.cc",
         "common_runtime/session_factory.cc",
@@ -3047,7 +3139,6 @@ tf_cuda_library(
         ":framework",
         ":graph",
         ":lib",
-        ":metrics",
         ":proto_text",
         ":protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
@@ -3078,15 +3169,6 @@ cc_library(
     deps = [":lib_internal"],
 )
 
-tf_cuda_library(
-    name = "metrics",
-    srcs = ["common_runtime/metrics.cc"],
-    hdrs = ["common_runtime/metrics.h"],
-    deps = [
-        ":lib",
-    ],
-)
-
 tf_cuda_library(
     name = "direct_session_internal",
     srcs = ["common_runtime/direct_session.cc"],
@@ -3103,7 +3185,6 @@ tf_cuda_library(
         ":graph",
         ":lib",
         ":lib_internal",
-        ":metrics",
         ":proto_text",
         ":protos_all_cc",
         "//tensorflow/core/debug:debug_graph_utils",
@@ -3470,6 +3551,7 @@ tf_cc_tests(
         "platform/vmodule_benchmark_test.cc",
     ],
     deps = [
+        ":core_cpu_internal",
         ":lib",
         ":lib_internal",
         ":lib_test_internal",
@@ -3679,6 +3761,20 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "lib_strings_proto_serialization_test",
+    srcs = ["lib/strings/proto_serialization_test.cc"],
+    deps = [
+        ":lib",
+        ":lib_internal",
+        ":lib_test_internal",
+        ":protos_all_cc",
+        ":test",
+        ":test_main",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
 tf_cc_test(
     name = "lib_random_weighted_picker_test",
     size = "medium",
@@ -3886,7 +3982,6 @@ tf_cc_test(
         "ops/cudnn_rnn_ops_test.cc",
     ],
     deps = [
-        ":cudnn_rnn_ops",
         "//tensorflow/core",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -3946,6 +4041,35 @@ tf_cc_tests_gpu(
     ],
 )
 
+tf_cc_tests_gpu(
+    name = "ring_gatherer_test",
+    size = "medium",
+    srcs = [
+        "common_runtime/ring_gatherer_test.cc",
+    ],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":all_kernels",
+        ":core",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session_internal",
+        ":framework",
+        ":framework_internal",
+        ":gpu_runtime",
+        ":lib",
+        ":lib_internal",
+        ":ops",
+        ":protos_all_cc",
+        ":protos_test_cc",
+        ":test",
+        ":test_main",
+        ":testlib",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
 tf_cc_tests_gpu(
     name = "hierarchical_tree_broadcaster_test",
     size = "medium",
@@ -4484,7 +4608,7 @@ tf_cc_test(
         "//tensorflow/cc:scope",
         "//tensorflow/core/kernels:cwise_op",
         "//third_party/eigen3",
-    ],
+    ] + if_mkl([":mkl_array_ops_op_lib"]),
 )
 
 tf_cc_test(
@@ -5037,6 +5161,39 @@ transitive_hdrs(
 # -----------------------------------------------------------------------------
 # Google-internal targets go here (must be at the end).
 
+load("//tensorflow:tensorflow.bzl", "tf_portable_proto_library")
+
+genrule(
+    name = "emscripten_proto_config_lite_runtime",
+    outs = ["emscripten_proto_config_lite_runtime.asciipb"],
+    cmd = tf_genrule_cmd_append_to_srcs("optimize_mode:LITE_RUNTIME"),
+    visibility = ["//visibility:private"],
+)
+
+# We are keeping the "android" version of tf_android_core_proto_headers. All it does is
+# normalize CORE_PROTO_SRCS to generate valid output file names.
+tf_portable_proto_library(
+    name = "emscripten_proto_lib_no_rtti_lite_runtime",
+    config = ":emscripten_proto_config_lite_runtime",
+    copts = tf_opts_nortti_if_emscripten(),
+    features = tf_features_nomodules_if_emscripten(),
+    header_outs = tf_android_core_proto_headers(CORE_PROTO_SRCS) + ["//google/protobuf/any.proto.h"],
+    link_full_protobuf = False,
+    prefix_dir = "emscripten_proto_no_rtti",
+    proto_deps = [
+        ":protos_all_cc",
+        "@protobuf_archive//:protobuf",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+# There is currently no need for a full proto version of emscripten tf lib lite.
+alias(
+    name = "emscripten_lib_lite_no_runtime",
+    actual = "//tensorflow/core:emscripten_tensorflow_lib_lite_nortti_lite_protos_no_runtime",
+    visibility = ["//visibility:public"],
+)
+
 alias(
     name = "android_srcs_no_runtime",
     actual = ":mobile_srcs_no_runtime",
diff --git a/tensorflow/core/api_def/base_api/api_def_AllToAll.pbtxt b/tensorflow/core/api_def/base_api/api_def_AllToAll.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6f28bd022bcd843aa3a7aeb8b1b257a3b3ddfd3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AllToAll.pbtxt
@@ -0,0 +1,67 @@
+op {
+  graph_op_name: "AllToAll"
+  in_arg {
+    name: "input"
+    description: <<END
+The local input to the sum.
+END
+  }
+  in_arg {
+    name: "group_assignment"
+    description: <<END
+An int32 tensor with shape
+[num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
+replica ids in the ith subgroup.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The exchanged result.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The type of elements to be exchanged.
+END
+  }
+  attr {
+    name: "concat_dimension"
+    description: <<END
+The dimension number to concatenate.
+END
+  }
+  attr {
+    name: "split_dimension"
+    description: <<END
+The dimension number to split.
+END
+  }
+  attr {
+    name: "split_count"
+    description: <<END
+The number of splits, this number must equal to the sub-group
+size(group_assignment.get_shape()[1])
+END
+  }
+  summary: "An Op to exchange data across TPU replicas."
+  description: <<END
+On each replica, the input is split into `split_count` blocks along
+`split_dimension` and send to the other replicas given group_assignment. After
+receiving `split_count` - 1 blocks from other replicas, we concatenate the
+blocks along `concat_dimension` as the output.
+
+For example, suppose there are 2 TPU replicas:
+replica 0 receives input: `[[A, B]]`
+replica 1 receives input: `[[C, D]]`
+
+group_assignment=`[[0, 1]]`
+concat_dimension=0
+split_dimension=1
+split_count=2
+
+replica 0's output: `[[A], [C]]`
+replica 1's output: `[[B], [D]]`
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Case.pbtxt b/tensorflow/core/api_def/base_api/api_def_Case.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..56fef3ae6d0d452cb2caa57c36f35a04584864ee
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Case.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "Case"
+  in_arg {
+    name: "branch_index"
+    description: "The branch selector, an int32 Tensor."
+  }
+  in_arg {
+    name: "input"
+    description: "A list of input tensors passed to the branch function."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of return values."
+  }
+  attr { name: "Tin"  description: "A list of input types." }
+  attr { name: "Tout"  description: "A list of output types." }
+  attr {
+    name: "branches"
+    description: <<END
+      A list of functions each of which takes 'inputs' and returns a list of
+      tensors, whose types are the same as what every other branch returns.
+END
+  }
+  summary: "An n-way switch statement which calls a single branch function."
+  description: <<END
+    An n-way switch statement, implementing the following:
+    ```
+    switch (branch_index) {
+      case 0:
+        output = branches[0](input);
+        break;
+      case 1:
+        output = branches[1](input);
+        break;
+      ...
+      case [[nbranches-1]]:
+      default:
+        output = branches[nbranches-1](input);
+        break;
+    }
+    ```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveGather.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3cd833b9455458511787bec71d45531810574eb9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveGather.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveGather"
+  summary: "Mutually accumulates multiple tensors of identical type and shape."
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectivePermute.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectivePermute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aec724a115ed974b39a0ecd00985f97a8e7c2a97
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectivePermute.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "CollectivePermute"
+  in_arg {
+    name: "input"
+    description: <<END
+The local input to be permuted. Currently only supports float and
+bfloat16.
+END
+  }
+  in_arg {
+    name: "source_target_pairs"
+    description: <<END
+A tensor with shape [num_pairs, 2].
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The permuted input.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The type of elements to be exchanged.
+END
+  }
+  summary: "An Op to permute tensors across replicated TPU instances."
+  description: <<END
+Each instance supplies its own input.
+
+For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
+source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs:
+`[D, A, B, C]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ConfigureDistributedTPU.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConfigureDistributedTPU.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a710f605b74773e255f790954d09653b45983c03
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ConfigureDistributedTPU.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "ConfigureDistributedTPU"
+  out_arg {
+    name: "topology"
+    description: <<END
+A serialized tensorflow.tpu.TopologyProto that describes the TPU
+topology.
+END
+  }
+  attr {
+    name: "embedding_config"
+    description: <<END
+Reserved. Do not use.
+END
+  }
+  attr {
+    name: "tpu_embedding_config"
+    description: <<END
+Serialized tensorflow.tpu.TPUEmbeddingConfiguration that
+describes the embedding lookups of the program.
+END
+  }
+  attr {
+    name: "is_global_init"
+    description: <<END
+Reserved. Do not use.
+END
+  }
+  summary: "Sets up the centralized structures for a distributed TPU system."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CrossReplicaSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_CrossReplicaSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd4c3437775beb0cd3f31842f3046b15ee32f9f2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CrossReplicaSum.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "CrossReplicaSum"
+  in_arg {
+    name: "input"
+    description: <<END
+The local input to the sum.
+END
+  }
+  in_arg {
+    name: "group_assignment"
+    description: <<END
+An int32 tensor with shape
+[num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
+replica ids in the ith subgroup.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The sum of all the distributed inputs.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The type of elements to be summed.
+END
+  }
+  summary: "An Op to sum inputs across replicated TPU instances."
+  description: <<END
+Each instance supplies its own input.
+
+For example, suppose there are 8 TPU instances: `[A, B, C, D, E, F, G, H]`.
+Passing group_assignment=`[[0,2,4,6],[1,3,5,7]]` sets `A, C, E, G` as group 0,
+and `B, D, F, H` as group 1. Thus we get the outputs:
+`[A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingIntegerBatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingIntegerBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0317c4ea1d9345b9522fbbf978c15bc09ba534d0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingIntegerBatch.pbtxt
@@ -0,0 +1,27 @@
+op {
+  graph_op_name: "EnqueueTPUEmbeddingIntegerBatch"
+  in_arg {
+    name: "batch"
+    description: <<END
+A list of 1D tensors, one for each embedding table, containing the
+indices into the tables.
+END
+  }
+  in_arg {
+    name: "mode_override"
+    description: <<END
+A string input that overrides the mode specified in the
+TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. Should be >= 0 and less than the number
+of TPU cores in the task on which the node is placed.
+END
+  }
+  summary: "An op that enqueues a list of input batch tensors to TPUEmbedding."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseBatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb476ce3fcf2df6f1d8d7ce2f1085ff885343535
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseBatch.pbtxt
@@ -0,0 +1,65 @@
+op {
+  graph_op_name: "EnqueueTPUEmbeddingSparseBatch"
+  in_arg {
+    name: "sample_indices"
+    description: <<END
+A list of rank 1 Tensors specifying the training example and
+feature to which the corresponding embedding_indices and aggregation_weights
+values belong. sample_indices[i] must equal b * nf + f, where nf is the
+number of features from the corresponding table, f is in [0, nf), and
+b is in [0, batch size).
+END
+  }
+  in_arg {
+    name: "embedding_indices"
+    description: <<END
+A list of rank 1 Tensors, indices into the embedding tables.
+END
+  }
+  in_arg {
+    name: "aggregation_weights"
+    description: <<END
+A list of rank 1 Tensors containing per sample -- i.e. per
+(training example, feature) -- aggregation weights.
+END
+  }
+  in_arg {
+    name: "mode_override"
+    description: <<END
+A string input that overrides the mode specified in the
+TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. Should be >= 0 and less than the number
+of TPU cores in the task on which the node is placed.
+END
+  }
+  attr {
+    name: "combiners"
+    description: <<END
+A list of string scalars, one for each embedding table that specify
+how to normalize the embedding activations after weighted summation.
+Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+all tables.
+END
+  }
+  summary: "An op that enqueues TPUEmbedding input indices from a SparseTensor."
+  description: <<END
+This Op eases the porting of code that uses embedding_lookup_sparse(),
+although some Python preprocessing of the SparseTensor arguments to
+embedding_lookup_sparse() is required to produce the arguments to this Op,
+since only a single EnqueueTPUEmbeddingSparseBatch Op is allowed per training
+step.
+
+The tensors at corresponding positions in the three input lists
+must have the same shape, i.e. rank 1 with dim_size() equal to the total
+number of lookups into the table described by the corresponding table_id.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseTensorBatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseTensorBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cef870a6fe9e37492997baf86e8702412b82802
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseTensorBatch.pbtxt
@@ -0,0 +1,74 @@
+op {
+  graph_op_name: "EnqueueTPUEmbeddingSparseTensorBatch"
+  in_arg {
+    name: "sample_indices"
+    description: <<END
+A list of rank 1 Tensors specifying the training example to
+which the corresponding embedding_indices and aggregation_weights values
+belong. It corresponds to sp_ids.indices[:,0] in  embedding_lookup_sparse().
+END
+  }
+  in_arg {
+    name: "embedding_indices"
+    description: <<END
+A list of rank 1 Tensors, indices into the embedding tables.
+It corresponds to sp_ids.values in embedding_lookup_sparse().
+END
+  }
+  in_arg {
+    name: "aggregation_weights"
+    description: <<END
+A list of rank 1 Tensors containing per training example
+aggregation weights. It corresponds to sp_weights.values in
+embedding_lookup_sparse().
+END
+  }
+  in_arg {
+    name: "mode_override"
+    description: <<END
+A string input that overrides the mode specified in the
+TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. Should be >= 0 and less than the number
+of TPU cores in the task on which the node is placed.
+END
+  }
+  attr {
+    name: "combiners"
+    description: <<END
+A list of string scalars, one for each embedding table that specify
+how to normalize the embedding activations after weighted summation.
+Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+all tables.
+END
+  }
+  attr {
+    name: "table_ids"
+    description: <<END
+A list of integers specifying the identifier of the embedding table
+(offset of TableDescriptor in the TPUEmbeddingConfiguration) to lookup the
+corresponding input. The ith input is looked up using table_ids[i]. The size
+of the table_ids list must be equal to that of sample_indices,
+embedding_indices and aggregation_weights.
+END
+  }
+  summary: "Eases the porting of code that uses tf.nn.embedding_lookup_sparse()."
+  description: <<END
+sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond
+to the ith feature. table_ids[i] indicates which embedding table to look up ith
+feature.
+
+The tensors at corresponding positions in the three input lists (sample_indices,
+embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1
+with dim_size() equal to the total number of lookups into the table described by
+the corresponding feature.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EuclideanNorm.pbtxt b/tensorflow/core/api_def/base_api/api_def_EuclideanNorm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7d815b856bfb3c97b9347aa49b22a3b2f00908b4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EuclideanNorm.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "EuclideanNorm"
+  endpoint {
+    name: "EuclideanNorm"
+  }
+  in_arg {
+    name: "input"
+    description: <<END
+The tensor to reduce.
+END
+  }
+  in_arg {
+    name: "reduction_indices"
+    rename_to: "axis"
+    description: <<END
+The dimensions to reduce. Must be in the range
+`[-rank(input), rank(input))`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The reduced tensor.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If true, retain reduced dimensions with length 1.
+END
+  }
+  summary: "Computes the euclidean norm of elements across dimensions of a tensor."
+  description: <<END
+Reduces `input` along the dimensions given in `reduction_indices`. Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+retained with length 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InfeedDequeue.pbtxt b/tensorflow/core/api_def/base_api/api_def_InfeedDequeue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..99ca55a25ff865ecd0c15f2087e15e6ee41ea6f7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InfeedDequeue.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "InfeedDequeue"
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor that will be provided using the infeed mechanism.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the tensor.
+END
+  }
+  summary: "A placeholder op for a value that will be fed into the computation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InfeedDequeueTuple.pbtxt b/tensorflow/core/api_def/base_api/api_def_InfeedDequeueTuple.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61b6ded66ba2dc156526c040b44312ec1a33f603
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InfeedDequeueTuple.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "InfeedDequeueTuple"
+  out_arg {
+    name: "outputs"
+    description: <<END
+A list of tensors that will be provided using the infeed mechanism.
+END
+  }
+  attr {
+    name: "dtypes"
+    description: <<END
+The element types of each element in `outputs`.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shapes of each tensor in `outputs`.
+END
+  }
+  summary: "Fetches multiple values from infeed as an XLA tuple."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InfeedEnqueue.pbtxt b/tensorflow/core/api_def/base_api/api_def_InfeedEnqueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e08f4e6ff74f10dd7ffa6df18d0c5753e9498456
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InfeedEnqueue.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "InfeedEnqueue"
+  in_arg {
+    name: "input"
+    description: <<END
+A tensor that will be provided using the infeed mechanism.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the tensor.
+END
+  }
+  attr {
+    name: "layout"
+    description: <<END
+A vector holding the requested layout in minor-to-major sequence.
+If a layout attribute is passed, but its values are all -1, the layout will
+be computed by the infeed operation.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+END
+  }
+  summary: "An op which feeds a single Tensor value into the computation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InfeedEnqueueTuple.pbtxt b/tensorflow/core/api_def/base_api/api_def_InfeedEnqueueTuple.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f87d6d76d8bb92450d16d0935c0b533fdc02a94a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InfeedEnqueueTuple.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "InfeedEnqueueTuple"
+  in_arg {
+    name: "inputs"
+    description: <<END
+A list of tensors that will be provided using the infeed mechanism.
+END
+  }
+  attr {
+    name: "dtypes"
+    description: <<END
+The element types of each element in `inputs`.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shapes of each tensor in `inputs`.
+END
+  }
+  attr {
+    name: "layouts"
+    description: <<END
+A vector holding the requested layout in minor-to-major sequence for
+all the tuple shapes, in the order the shapes appear in the "shapes" input.
+The layout elements for a sub-shape can be set to -1, in which case the
+corresponding layout will be computed by the infeed operation.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+END
+  }
+  summary: "Feeds multiple Tensor values into the computation as an XLA tuple."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43901e1e4403a4ab0f3759470d8c26bc95ac8077
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParameters.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingADAMParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "momenta"
+    description: <<END
+Value of momenta used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "velocities"
+    description: <<END
+Value of velocities used in the ADAM optimization algorithm.
+END
+  }
+  summary: "Load ADAM embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aaa52f6aa900a9fc1bca708b5cf4c8ed563f38a8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParametersGradAccumDebug.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingADAMParametersGradAccumDebug"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "momenta"
+    description: <<END
+Value of momenta used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "velocities"
+    description: <<END
+Value of velocities used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the ADAM optimization algorithm.
+END
+  }
+  summary: "Load ADAM embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e306329765f692bdae321c8e4e2e2915f6cd9ad5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParameters.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingAdadeltaParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+Value of updates used in the Adadelta optimization algorithm.
+END
+  }
+  summary: "Load Adadelta embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd2e3b924b0a519fe9f7071d48b90d6e127cdf54
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+Value of updates used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the Adadelta optimization algorithm.
+END
+  }
+  summary: "Load Adadelta parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94db7dfc263e0900e4c18e44a44dd43e23db85ca
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParameters.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingAdagradParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the Adagrad optimization algorithm.
+END
+  }
+  summary: "Load Adagrad embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2d10441dbeafd7885cc2f4c376b48c47ddfd5f4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingAdagradParametersGradAccumDebug"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the Adagrad optimization algorithm.
+END
+  }
+  summary: "Load Adagrad embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingCenteredRMSPropParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingCenteredRMSPropParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43cf8845861e54ac1e70061cfbcd23cd161a14ef
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingCenteredRMSPropParameters.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingCenteredRMSPropParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the centered RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Value of ms used in the centered RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Value of mom used in the centered RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "mg"
+    description: <<END
+Value of mg used in the centered RMSProp optimization algorithm.
+END
+  }
+  summary: "Load centered RMSProp embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d3516888c71b82213048afc449d9064a39fe9c1e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParameters.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingFTRLParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "linears"
+    description: <<END
+Value of linears used in the FTRL optimization algorithm.
+END
+  }
+  summary: "Load FTRL embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6c80f66df6bf4238a0572e48eef3193bf946e48
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingFTRLParametersGradAccumDebug"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "linears"
+    description: <<END
+Value of linears used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the FTRL optimization algorithm.
+END
+  }
+  summary: "Load FTRL embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMDLAdagradLightParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMDLAdagradLightParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..24b334380b91b04d7f8f482760a975c88b1b1d8c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMDLAdagradLightParameters.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingMDLAdagradLightParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the MDL Adagrad Light optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the MDL Adagrad Light optimization algorithm.
+END
+  }
+  in_arg {
+    name: "weights"
+    description: <<END
+Value of weights used in the MDL Adagrad Light optimization algorithm.
+END
+  }
+  in_arg {
+    name: "benefits"
+    description: <<END
+Value of benefits used in the MDL Adagrad Light optimization algorithm.
+END
+  }
+  summary: "Load MDL Adagrad Light embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e98956e65dbb6e3a34eea38c9874db81f5b49470
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParameters.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingMomentumParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Momentum optimization algorithm.
+END
+  }
+  in_arg {
+    name: "momenta"
+    description: <<END
+Value of momenta used in the Momentum optimization algorithm.
+END
+  }
+  summary: "Load Momentum embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fff35adc68e7cefff0f551f7668755efc0e8747d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingMomentumParametersGradAccumDebug"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Momentum optimization algorithm.
+END
+  }
+  in_arg {
+    name: "momenta"
+    description: <<END
+Value of momenta used in the Momentum optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the Momentum optimization algorithm.
+END
+  }
+  summary: "Load Momentum embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..da049a72c7652b8e0784cbe2cd63da72172360a1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParameters.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingProximalAdagradParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the proximal Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the proximal Adagrad optimization algorithm.
+END
+  }
+  summary: "Load proximal Adagrad embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e0f275a44be90cb2169dec5aad76faeda6d0ded
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the proximal Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the proximal Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the proximal Adagrad optimization algorithm.
+END
+  }
+  summary: "Load proximal Adagrad embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e79a3a4c58156a0ea18eba12ef3db3e272da2bcf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParameters.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingRMSPropParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Value of ms used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Value of mom used in the RMSProp optimization algorithm.
+END
+  }
+  summary: "Load RMSProp embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..233e5afbff369f16da51af104708568d51c00460
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Value of ms used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Value of mom used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the RMSProp optimization algorithm.
+END
+  }
+  summary: "Load RMSProp embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingStochasticGradientDescentParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingStochasticGradientDescentParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37d0dcc1f4e1c06a649e191df158446ceaed283d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingStochasticGradientDescentParameters.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingStochasticGradientDescentParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the stochastic gradient descent optimization algorithm.
+END
+  }
+  summary: "Load SGD embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MulNoNan.pbtxt b/tensorflow/core/api_def/base_api/api_def_MulNoNan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..51a5fa95d3d90cb9ac8798492d7bc469336ccb82
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MulNoNan.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "MulNoNan"
+  endpoint {
+    name: "MulNoNan"
+  }
+  summary: "Returns x * y element-wise. Returns zero if y is zero, even if x if infinite or NaN."
+  description: <<END
+*NOTE*: `Mul` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OutfeedDequeue.pbtxt b/tensorflow/core/api_def/base_api/api_def_OutfeedDequeue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f262bdce969f7fbadb6f5ee6d0223cfb8cfd7aa1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OutfeedDequeue.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "OutfeedDequeue"
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor that will be read from the device outfeed.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the tensor.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+END
+  }
+  summary: "Retrieves a single tensor from the computation outfeed."
+  description: <<END
+This operation will block indefinitely until data is available.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OutfeedDequeueTuple.pbtxt b/tensorflow/core/api_def/base_api/api_def_OutfeedDequeueTuple.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..457e495e3c726aefa5e64ecb800f6acf780afb1b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OutfeedDequeueTuple.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "OutfeedDequeueTuple"
+  out_arg {
+    name: "outputs"
+    description: <<END
+A list of tensors that will be read from the outfeed.
+END
+  }
+  attr {
+    name: "dtypes"
+    description: <<END
+The element types of each element in `outputs`.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shapes of each tensor in `outputs`.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+END
+  }
+  summary: "Retrieve multiple values from the computation outfeed."
+  description: <<END
+This operation will block indefinitely until data is available. Output `i`
+corresponds to XLA tuple element `i`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueue.pbtxt b/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fa6cb96de11c8d2a9bd3c04a3dfb19be5f363027
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueue.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "OutfeedEnqueue"
+  in_arg {
+    name: "input"
+    description: <<END
+A tensor that will be inserted into the outfeed queue.
+END
+  }
+  summary: "Enqueue a Tensor on the computation outfeed."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueueTuple.pbtxt b/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueueTuple.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb1ab3d63c0837f0bee7048d1b484f55b9dd619f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueueTuple.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "OutfeedEnqueueTuple"
+  in_arg {
+    name: "inputs"
+    description: <<END
+A list of tensors that will be inserted into the outfeed queue as an
+XLA tuple.
+END
+  }
+  summary: "Enqueue multiple Tensor values on the computation outfeed."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RecvTPUEmbeddingActivations.pbtxt b/tensorflow/core/api_def/base_api/api_def_RecvTPUEmbeddingActivations.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1921fd2624daf0200737dfc6dbaeee2311ad5b1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RecvTPUEmbeddingActivations.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "RecvTPUEmbeddingActivations"
+  out_arg {
+    name: "outputs"
+    description: <<END
+A TensorList of embedding activations containing one Tensor per
+embedding table in the model.
+END
+  }
+  attr {
+    name: "num_outputs"
+    description: <<END
+The number of output activation tensors, equal to the number of
+embedding tables in the model.
+END
+  }
+  attr {
+    name: "config"
+    description: <<END
+Serialized TPUEmbeddingConfiguration proto.
+END
+  }
+  summary: "An op that receives embedding activations on the TPU."
+  description: <<END
+The TPU system performs the embedding lookups and aggregations specified by
+the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The
+results of these aggregations are visible to the Tensorflow Graph as the
+outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing
+one Tensor of activations per table specified in the model. There can be at
+most one RecvTPUEmbeddingActivations op in the TPU graph.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt b/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt
index 07bbd4ac6031765a070c5e5b4ee0726512dbb6ca..cd7d4e3ec2abf68b2c8461a5e301213e34252d1e 100644
--- a/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt
@@ -30,10 +30,12 @@ END
 The type of the input.
 END
   }
-  summary: "Given a quantized tensor described by (input, input_min, input_max), outputs a"
+  summary:
+"Computes a range that covers the actual values present in a quantized tensor."
   description: <<END
-range that covers the actual values present in that tensor.  This op is
-typically used to produce the requested_output_min and requested_output_max for
-Requantize.
+Given a quantized tensor described by `(input, input_min, input_max)`, outputs a
+range that covers the actual values present in that tensor. This op is typically
+used to produce the `requested_output_min` and `requested_output_max` for
+`Requantize`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_RequantizationRangePerChannel.pbtxt b/tensorflow/core/api_def/base_api/api_def_RequantizationRangePerChannel.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58cf1222500e6bf58a22beb17ffccf2949dd4c81
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RequantizationRangePerChannel.pbtxt
@@ -0,0 +1,48 @@
+op {
+  graph_op_name: "RequantizationRangePerChannel"
+  visibility : HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+The original input tensor.
+END
+  }
+  in_arg {
+    name: "input_min"
+    description: <<END
+The minimum value of the input tensor
+END
+  }
+  in_arg {
+    name: "input_max"
+    description: <<END
+The maximum value of the input tensor.
+END
+  }
+  out_arg {
+    name: "output_min"
+    description: <<END
+The minimum value of the final output tensor
+END
+  }
+  out_arg {
+    name: "output_max"
+    description: <<END
+The maximum value of the final output tensor.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The quantized type of input tensor that needs to be converted. 
+END
+  }
+  attr {
+    name: "clip_value_max"
+    description: <<END
+The maximum value of the output that needs to be clipped.
+Example: set this to 6 for Relu6. 
+END
+  }
+  summary: "Computes requantization range per channel."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt
index 1b03f63b261e00c6b1dfdc0b1f11c69d71b536eb..23e1656288d6c97b8facd7eafb5ba5cd862dade1 100644
--- a/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt
@@ -48,13 +48,15 @@ END
 The type of the output. Should be a lower bit depth than Tinput.
 END
   }
-  summary: "Convert the quantized \'input\' tensor into a lower-precision \'output\', using the"
+  summary: 
+"Converts the quantized `input` tensor into a lower-precision `output`."
   description: <<END
-output range specified with 'requested_output_min' and 'requested_output_max'.
+Converts the quantized `input` tensor into a lower-precision `output`, using the
+output range specified with `requested_output_min` and `requested_output_max`.
 
-[input_min, input_max] are scalar floats that specify the range for the float
-interpretation of the 'input' data. For example, if input_min is -1.0f and
-input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+`[input_min, input_max]` are scalar floats that specify the range for the float
+interpretation of the `input` data. For example, if `input_min` is -1.0f and
+`input_max` is 1.0f, and we are dealing with `quint16` quantized data, then a 0
 value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_RequantizePerChannel.pbtxt b/tensorflow/core/api_def/base_api/api_def_RequantizePerChannel.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bce6c3dd5a0b5e6f4e7fe2aa990827a4020bed17
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RequantizePerChannel.pbtxt
@@ -0,0 +1,65 @@
+op {
+  graph_op_name: "RequantizePerChannel"
+  visibility : HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+The original input tensor.
+END
+  }
+  in_arg {
+    name: "input_min"
+    description: <<END
+The minimum value of the input tensor
+END
+  }
+  in_arg {
+    name: "input_max"
+    description: <<END
+The maximum value of the input tensor.
+END
+  }
+  in_arg {
+    name: "requested_output_min"
+    description: <<END
+The minimum value of the output tensor requested.
+END
+  }
+  in_arg {
+    name: "requested_output_max"
+    description: <<END
+The maximum value of the output tensor requested.
+END
+  }  
+  out_arg {
+    name: "output"
+    description: <<END
+Output tensor.
+END
+  }
+  out_arg {
+    name: "output_min"
+    description: <<END
+The minimum value of the final output tensor
+END
+  }
+  out_arg {
+    name: "output_max"
+    description: <<END
+The maximum value of the final output tensor.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The quantized type of input tensor that needs to be converted. 
+END
+  }
+  attr {
+    name: "out_type"
+    description: <<END
+The quantized type of output tensor that needs to be converted.
+END
+  }
+  summary: "Requantizes input with min and max values known per channel."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3de7ad97b128aa61fbcc2b5101f18c5a23919150
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParameters.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingADAMParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "momenta"
+    description: <<END
+Parameter momenta updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "velocities"
+    description: <<END
+Parameter velocities updated by the ADAM optimization algorithm.
+END
+  }
+  summary: "Retrieve ADAM embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ecfd080120e8c25031a312a5834dfeee247cda0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParametersGradAccumDebug.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "momenta"
+    description: <<END
+Parameter momenta updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "velocities"
+    description: <<END
+Parameter velocities updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the ADAM optimization algorithm.
+END
+  }
+  summary: "Retrieve ADAM embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3c7e6881aa22fe648dddb0b58ad3c5285e0e1fb1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParameters.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingAdadeltaParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "updates"
+    description: <<END
+Parameter updates updated by the Adadelta optimization algorithm.
+END
+  }
+  summary: "Retrieve Adadelta embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ee8cdaa882c63d7cb3cc9ddcca63b0c87219c7d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "updates"
+    description: <<END
+Parameter updates updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the Adadelta optimization algorithm.
+END
+  }
+  summary: "Retrieve Adadelta embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f070f5483ec3ba80994fe056a83a040966cdcbe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParameters.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingAdagradParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the Adagrad optimization algorithm.
+END
+  }
+  summary: "Retrieve Adagrad embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..078cda00d6fd1ad4d5393b4e24d16149e98be0b1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the Adagrad optimization algorithm.
+END
+  }
+  summary: "Retrieve Adagrad embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingCenteredRMSPropParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingCenteredRMSPropParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2a4cc4e5087f70452bc3580ab4321277343a9e91
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingCenteredRMSPropParameters.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingCenteredRMSPropParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the centered RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "ms"
+    description: <<END
+Parameter ms updated by the centered RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "mom"
+    description: <<END
+Parameter mom updated by the centered RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "mg"
+    description: <<END
+Parameter mg updated by the centered RMSProp optimization algorithm.
+END
+  }
+  summary: "Retrieve centered RMSProp embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..daf87deff23db6d82179d74d8514115ca8c1939d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParameters.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingFTRLParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "linears"
+    description: <<END
+Parameter linears updated by the FTRL optimization algorithm.
+END
+  }
+  summary: "Retrieve FTRL embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f72d1d1ce1289d5735869981515a2b04b86e2a2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "linears"
+    description: <<END
+Parameter linears updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the FTRL optimization algorithm.
+END
+  }
+  summary: "Retrieve FTRL embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMDLAdagradLightParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMDLAdagradLightParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..478fbda1853e0035226b267c0330870cbc47b72a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMDLAdagradLightParameters.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingMDLAdagradLightParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the MDL Adagrad Light optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the MDL Adagrad Light optimization algorithm.
+END
+  }
+  out_arg {
+    name: "weights"
+    description: <<END
+Parameter weights updated by the MDL Adagrad Light optimization algorithm.
+END
+  }
+  out_arg {
+    name: "benefits"
+    description: <<END
+Parameter benefits updated by the MDL Adagrad Light optimization algorithm.
+END
+  }
+  summary: "Retrieve MDL Adagrad Light embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f71e620c39c4f2830a53912f624b5478c24f46a5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParameters.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingMomentumParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Momentum optimization algorithm.
+END
+  }
+  out_arg {
+    name: "momenta"
+    description: <<END
+Parameter momenta updated by the Momentum optimization algorithm.
+END
+  }
+  summary: "Retrieve Momentum embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f00680bb35906a122c38eaead9702b842e62082
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Momentum optimization algorithm.
+END
+  }
+  out_arg {
+    name: "momenta"
+    description: <<END
+Parameter momenta updated by the Momentum optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the Momentum optimization algorithm.
+END
+  }
+  summary: "Retrieve Momentum embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e279ca4fe9cc2c1768164053ce88617a9fa1bc9f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParameters.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingProximalAdagradParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  summary: "Retrieve proximal Adagrad embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37ae5db81c2d7fc24d0d175e6085e5868c5d0224
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  summary: "Retrieve proximal Adagrad embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aad8ca8dae67daaac71bb6b5eaaa661d6bc1c173
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParameters.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingRMSPropParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "ms"
+    description: <<END
+Parameter ms updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "mom"
+    description: <<END
+Parameter mom updated by the RMSProp optimization algorithm.
+END
+  }
+  summary: "Retrieve RMSProp embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d9bd6659c5f01d8929ded1b9ff17f710eef4a84
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "ms"
+    description: <<END
+Parameter ms updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "mom"
+    description: <<END
+Parameter mom updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the RMSProp optimization algorithm.
+END
+  }
+  summary: "Retrieve RMSProp embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingStochasticGradientDescentParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingStochasticGradientDescentParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..33ca8a7612c204d2cd95650f955ffdbc944fd0c2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingStochasticGradientDescentParameters.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingStochasticGradientDescentParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the stochastic gradient descent optimization algorithm.
+END
+  }
+  summary: "Retrieve SGD embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
index d33a36ce06c37092bd25e241b36f1c564070c6e2..d5643c8a79a92fad53b1737a80172fb56004327f 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the maximum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -29,5 +29,15 @@ If the max is empty for a given segment ID `i`, `output[i] = 0`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_max(c, tf.constant([0, 0, 1]))
+# ==> [[4, 3, 3, 4],
+#      [5, 6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
index afdc39da96df01185af600409435faca49cabc0e..b03649ab077d893ceb17704c57060cde99be1db2 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the mean along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -30,5 +30,15 @@ If the mean is empty for a given segment ID `i`, `output[i] = 0`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1.0,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_mean(c, tf.constant([0, 0, 1]))
+# ==> [[2.5, 2.5, 2.5, 2.5],
+#      [5, 6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
index 026b5b3991f9cbbfce9add2ff4cb7e370a1cc799..6796678555ef8f3bbf27c742db6f1c0c30c483bf 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the minimum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -29,5 +29,14 @@ If the min is empty for a given segment ID `i`, `output[i] = 0`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_min(c, tf.constant([0, 0, 1]))
+# ==> [[1, 2, 2, 1],
+#      [5, 6, 7, 8]]
+```
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
index a168eed87f668c97141f2c8966c68866b82477de..10b368fcca4dab3bb197609e3e10189323bf9bc7 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the product along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -29,5 +29,15 @@ If the product is empty for a given segment ID `i`, `output[i] = 1`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_prod(c, tf.constant([0, 0, 1]))
+# ==> [[4, 6, 6, 4],
+#      [5, 6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
index 876b8608240df108c66bceefa5f7eba82ddb7524..487a6d10746ce684c9f0e27f6336c22994fee3b4 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the sum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -29,5 +29,15 @@ If the sum is empty for a given segment ID `i`, `output[i] = 0`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_sum(c, tf.constant([0, 0, 1]))
+# ==> [[5, 5, 5, 5],
+#      [5, 6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SendTPUEmbeddingGradients.pbtxt b/tensorflow/core/api_def/base_api/api_def_SendTPUEmbeddingGradients.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9a3be3d8eaff24b9d1e58b84ff6709f9448fa186
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SendTPUEmbeddingGradients.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "SendTPUEmbeddingGradients"
+  in_arg {
+    name: "inputs"
+    description: <<END
+A TensorList of gradients with which to update embedding tables.
+This argument has the same length and shapes as the return value of
+RecvTPUEmbeddingActivations, but contains gradients of the model's loss
+with respect to the embedding activations. The embedding tables are updated
+from these gradients via the optimizer specified in the TPU embedding
+configuration given to tpu.initialize_system.
+END
+  }
+  in_arg {
+    name: "learning_rates"
+    description: <<END
+A TensorList of float32 scalars, one for each dynamic learning
+rate tag: see the comments in
+//third_party/tensorflow/core/protobuf/tpu/optimization_parameters.proto.
+Multiple tables can share the same dynamic learning rate tag as specified
+in the configuration. If the learning rates for all tables are constant,
+this list should be empty.
+END
+  }
+  attr {
+    name: "config"
+    description: <<END
+Serialized TPUEmbeddingConfiguration proto.
+END
+  }
+  summary: "Performs gradient updates of embedding tables."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShardDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShardDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd537e05d7c97a2d43c14916b44eb32ae9b2efc1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ShardDataset.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "ShardDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "num_shards"
+    description: <<END
+An integer representing the number of shards operating in parallel.
+END
+  }
+  in_arg {
+    name: "index"
+    description: <<END
+An integer representing the current worker index.
+END
+  }
+  summary: "Creates a `Dataset` that includes only 1/`num_shards` of this dataset."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShutdownDistributedTPU.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShutdownDistributedTPU.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..87d4e8d1a88dc13d49948bb847a9d876055f53f6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ShutdownDistributedTPU.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "ShutdownDistributedTPU"
+  summary: "Shuts down a running distributed TPU system."
+  description: <<END
+The op returns an error if no system is running.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
index 138a6366c8aa1e5b9d876621b93c7d36f16f38e2..0bbc0780dfee4af8cfcb036264969dbd4ec7bbdf 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
@@ -21,9 +21,7 @@ END
   }
   summary: "Computes the mean along sparse segments of a tensor."
   description: <<END
-Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-for an explanation of segments.
+See `tf.sparse.segment_sum` for usage examples.
 
 Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
 dimension, selecting a subset of dimension 0, specified by `indices`.
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
index b8073d88ac3d10cad6bc7771d3fe28bae905d8e5..65b2358830ed9eeccbc099b055b9f8b50e92afdc 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
@@ -31,7 +31,7 @@ Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
index 945bbdcf627c48047ffa65c4c4e5124cbd96e54b..a28bd1a646445c660183d6e35b9e6df64637c4f3 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
@@ -23,8 +23,7 @@ END
   description: <<END
 N is the size of the segment being reduced.
 
-Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-for an explanation of segments.
+See `tf.sparse.segment_sum` for usage examples.
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
index ff328c8a6195f9aca515de4d8a682b50df92117e..8a5d2bb02c4c42e3d67f6e01b8c609be84575270 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
@@ -33,7 +33,7 @@ Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
index a68e14607f81e999f95e85b4481fb0474e691aa4..d7494dc8deb37927dce09ce3a854339c27758286 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
@@ -22,7 +22,7 @@ END
   summary: "Computes the sum along sparse segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
index aa5c1fc8d0d698008787418ef24ecb3c0c635f6a..039ca9a23ba1abf2d67327ddb72bb49b8de1ab68 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
@@ -31,7 +31,7 @@ Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/sparse#Segmentation)
 for an explanation of segments.
 
 For example:
diff --git a/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormalV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormalV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8c145e05ac7d9d928dd22721a394df497f1aabab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormalV2.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "StatefulStandardNormalV2"
+  in_arg {
+    name: "resource"
+    description: <<END
+The handle of the resource variable that stores the state of the RNG.
+END
+  }
+  in_arg {
+    name: "algorithm"
+    description: <<END
+The RNG algorithm.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor of the specified shape filled with random normal values.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs random values from a normal distribution."
+  description: <<END
+The generated values will have mean 0 and standard deviation 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUCompilationResult.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUCompilationResult.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..98643e295c331f2ca17f27ead5e19c08c14a5523
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUCompilationResult.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TPUCompilationResult"
+  summary: "CompilationResultProto indicating the status of the TPU compilation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUEmbeddingActivations.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUEmbeddingActivations.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0763a2644f8e8a59ea2e942386412e27b765f4ac
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUEmbeddingActivations.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "TPUEmbeddingActivations"
+  in_arg {
+    name: "embedding_variable"
+    description: <<END
+A trainable variable, enabling optimizers to find this op.
+END
+  }
+  in_arg {
+    name: "sliced_activations"
+    description: <<END
+The embedding activations Tensor to return.
+END
+  }
+  attr {
+    name: "table_id"
+    description: <<END
+The id of the table in the embedding layer configuration from which
+these activations were computed.
+END
+  }
+  attr {
+    name: "lookup_id"
+    description: <<END
+Identifier of the set of embedding indices which produced these
+activations.
+END
+  }
+  summary: "An op enabling differentiation of TPU Embeddings."
+  description: <<END
+This op simply returns its first input, which is assumed to have been sliced
+from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of
+this op, and its first argument being a trainable Variable, enables automatic
+differentiation of graphs containing embeddings via the TPU Embedding Python
+libraries.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUOrdinalSelector.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUOrdinalSelector.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3c72abc68477184c582fdc26f8db7687b149b134
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUOrdinalSelector.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "TPUOrdinalSelector"
+  out_arg {
+    name: "device_ordinals"
+    description: <<END
+A vector 1 or more TPU cores.
+END
+  }
+  summary: "A TPU core selector Op."
+  description: <<END
+This Op produces a set of TPU cores (for warm-up) or a single TPU core
+(for regular inference) to execute the TPU program on. The output is
+consumed by TPUPartitionedCall.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUPartitionedCall.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUPartitionedCall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4f384c21a9ce76c21a6b675048083172816d3b1e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUPartitionedCall.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "TPUPartitionedCall"
+  in_arg {
+    name: "args"
+    description: <<END
+The arguments to the function.
+END
+  }
+  in_arg {
+    name: "device_ordinal"
+    description: <<END
+The TPU device ordinal to run the function on.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The output of the function call.
+END
+  }
+  attr {
+    name: "Tin"
+    description: <<END
+The types of the arguments to the function.
+END
+  }
+  attr {
+    name: "Tout"
+    description: <<END
+The types of the outputs of the function.
+END
+  }
+  attr {
+    name: "f"
+    description: <<END
+The function to call.
+END
+  }
+  summary: "Calls a function placed on a specified TPU device."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUReplicate.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUReplicate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5664dfb9e6e2b8c896e3f26ed8b7863c1c4beec1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUReplicate.pbtxt
@@ -0,0 +1,99 @@
+op {
+  graph_op_name: "TPUReplicate"
+  in_arg {
+    name: "inputs"
+    description: <<END
+the inputs to 'computation', flattened, in replica-major order.
+END
+  }
+  in_arg {
+    name: "broadcast_inputs"
+    description: <<END
+additional arguments to broadcast to all replicas. The
+broadcast inputs are appended to the per-replica inputs when calling
+computation.
+END
+  }
+  in_arg {
+    name: "guaranteed_constants"
+    description: <<END
+arguments which have been guaranteed to not
+change their values during the session lifetime. These contain tensors marked as
+constant using the GuaranteeConstOp.
+END
+  }
+  out_arg {
+    name: "outputs"
+    description: <<END
+the outputs of 'computation'.
+END
+  }
+  attr {
+    name: "computation"
+    description: <<END
+a function containing the computation to run.
+END
+  }
+  attr {
+    name: "num_replicas"
+    description: <<END
+the number of replicas of the computation to run.
+END
+  }
+  attr {
+    name: "num_cores_per_replica"
+    description: <<END
+the number of logical cores in each replica.
+END
+  }
+  attr {
+    name: "topology"
+    description: <<END
+A serialized tensorflow.tpu.TopologyProto that describes the TPU
+topology.
+END
+  }
+  attr {
+    name: "use_tpu"
+    description: <<END
+a bool indicating if this computation will run on TPU or CPU/GPU.
+Currently, only supports a default placement (computation is placed on GPU
+if one is available, and on CPU if not).
+END
+  }
+  attr {
+    name: "device_assignment"
+    description: <<END
+a flattened array with shape
+[replica, num_cores_per_replica, mesh_dimension] that maps the coordinates
+of logical cores in each replica of a computation to physical coordinates in
+the TPU topology.
+END
+  }
+  attr {
+    name: "Tinputs"
+    description: <<END
+the types of the arguments to 'computation'.
+END
+  }
+  attr {
+    name: "Tbroadcast_inputs"
+    description: <<END
+the types of the additional arguments to broadcast to all
+replicas.
+END
+  }
+  attr {
+    name: "Tguaranteed_constants"
+    description: <<END
+the types of the arguments to 'guaranteed_constants'.
+END
+  }
+  attr {
+    name: "output_types"
+    description: <<END
+the types of the outputs of 'computation'.
+END
+  }
+  summary: "Runs replicated computations on a distributed TPU system."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUReplicateMetadata.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUReplicateMetadata.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b30d1b6218c3a6ea36d796ba77b2a7808b0be083
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUReplicateMetadata.pbtxt
@@ -0,0 +1,46 @@
+op {
+  graph_op_name: "TPUReplicateMetadata"
+  attr {
+    name: "num_replicas"
+    description: <<END
+Number of replicas of the computation
+END
+  }
+  attr {
+    name: "num_cores_per_replica"
+    description: <<END
+Number of cores per replica. Used for model parallelism.
+END
+  }
+  attr {
+    name: "topology"
+    description: <<END
+TopologyProto indicating the topology of the TPU pod slice.
+END
+  }
+  attr {
+    name: "use_tpu"
+    description: <<END
+Whether to place the computation on the TPU.
+END
+  }
+  attr {
+    name: "device_assignment"
+    description: <<END
+The assignment of devices for the computation.
+END
+  }
+  attr {
+    name: "computation_shape"
+    description: <<END
+DEPRECATED. Use num_cores_per_replica instead.
+END
+  }
+  attr {
+    name: "host_compute_core"
+  }
+  attr {
+    name: "padding_map"
+  }
+  summary: "Metadata indicaitng how the TPU computation should be replicated."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUReplicatedInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUReplicatedInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1bba92e7843c3be7cb9536a58cf93ee5ec6b916f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUReplicatedInput.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TPUReplicatedInput"
+  summary: "Connects N inputs to an N-way replicated TPU computation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUReplicatedOutput.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUReplicatedOutput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cab78c7496e85751ee7ba5cc0a92620751d6b2d0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUReplicatedOutput.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TPUReplicatedOutput"
+  summary: "Connects outputs of an N-way replicated computation to N outputs."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListScatterIntoExistingList.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListScatterIntoExistingList.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..23da422971c485e8e8bba1a6b6cf1d9605d8ffbf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListScatterIntoExistingList.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "TensorListScatterIntoExistingList"
+  summary: "Scatters tensor at indices in an input list."
+  description: <<END
+Each member of the TensorList corresponds to one row of the input tensor,
+specified by the given index (see `tf.gather`).
+
+input_handle: The list to scatter into.
+tensor: The input tensor.
+indices: The indices used to index into the list.
+output_handle: The TensorList.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TridiagonalSolve.pbtxt b/tensorflow/core/api_def/base_api/api_def_TridiagonalSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..80f3675d551ce90435fb7b78969f70986a4e8c02
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TridiagonalSolve.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "TridiagonalSolve"
+  visibility: HIDDEN
+  in_arg {
+    name: "diagonals"
+    description: <<END
+Shape is `[..., 3, M]`.
+END
+  }
+  in_arg {
+    name: "rhs"
+    description: <<END
+Shape is `[..., M, K]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Shape is `[..., M, K]`.
+END
+  }
+
+  summary: "Solves tridiagonal systems of equations."
+  description: <<END
+`diagonals` is a tensor of shape `[..., 3, M]` whose inner-most 2 dimensions
+represent matrices with three rows being the superdiagonal, diagonals, and
+subdiagonals, in order. The last element of the superdiagonal and the first
+element of the subdiagonal is ignored.
+`rhs` is a tensor of shape `[..., M, K]`, representing K right-hand sides per
+each left-hand side.
+The output is a tensor of shape `[..., M, K]` containing the solutions.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
index ed4a2bd5588eecb19d9d5effb386b2fe5c0c4409..f282b9fab56a2735519ec56d3292867feb84750a 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the maximum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
@@ -37,5 +37,15 @@ dropped, and will not be included in the result.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
 </div>
+
+For example:
+
+``` python
+c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+tf.unsorted_segment_max(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 4,  3, 3, 4],
+#       [5,  6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
index 7e139ddf4d9fac5cd47fdb56927cb325be45d54d..0360cc09d064295a45032905f652e56055b92986 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the minimum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
@@ -31,6 +31,15 @@ If the minimum is empty for a given segment ID `i`, it outputs the largest
 possible value for the specific numeric type,
 `output[i] = numeric_limits<T>::max()`.
 
+For example:
+
+``` python
+c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+tf.unsorted_segment_min(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 1,  2, 2, 1],
+#       [5,  6, 7, 8]]
+```
+
 If the given segment ID `i` is negative, then the corresponding value is
 dropped, and will not be included in the result.
 END
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
index 9c8ea3b620832dba4f18a1cfbac953bad8bb6f56..67de4734bdedcf82acdcf993e1fb3e36d7b140d9 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the product along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
@@ -28,6 +28,15 @@ entries belonging to a segment such that:
 \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
 `j...` such that `segment_ids[j...] == i`.
 
+For example:
+
+``` python
+c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+tf.unsorted_segment_prod(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 4,  6, 6, 4],
+#       [5,  6, 7, 8]]
+```
+
 If there is no entry for a given segment ID `i`, it outputs 1.
 
 If the given segment ID `i` is negative, then the corresponding value is
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
index 7e5d9265c2ead2028fa8bb80076ea40f858cff39..08139235f4a792c353dda4db667a135c823c6e5a 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the sum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -35,5 +35,13 @@ added to the sum of the segment.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
 </div>
+
+``` python
+c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 5,  5, 5, 5],
+#       [5,  6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_WorkerHeartbeat.pbtxt b/tensorflow/core/api_def/base_api/api_def_WorkerHeartbeat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e886b04ca69e61bc76e2207b33f661f74506e9b3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WorkerHeartbeat.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "WorkerHeartbeat"
+  in_arg {
+    name: "request"
+    description: <<END
+A string tensor containing a serialized WorkerHeartbeatRequest
+END
+  }
+  out_arg {
+    name: "response"
+    description: <<END
+A string tensor containing a serialized WorkerHeartbeatResponse
+END
+  }
+  summary: "Worker heartbeat op."
+  description: <<END
+Heartbeats may be sent periodically to indicate the coordinator is still active,
+to retrieve the current worker status and to expedite shutdown when necessary.
+END
+}
diff --git a/tensorflow/core/api_def/excluded_ops.cc b/tensorflow/core/api_def/excluded_ops.cc
index 02026e94abc5b3284578859e157279b27ba84446..65d2102ac80579b0ba6f9510cd7a95300cd10a3f 100644
--- a/tensorflow/core/api_def/excluded_ops.cc
+++ b/tensorflow/core/api_def/excluded_ops.cc
@@ -24,9 +24,9 @@ const std::unordered_set<std::string>* GetExcludedOps() {
            "GcsConfigureBlockCache", "GcsConfigureCredentials",
 #ifdef INTEL_MKL
            // QuantizedFusedOps for Intel CPU
-           "QuantizedConv2DAndRequantize", "QuantizedConv2DWithBias",
-           "QuantizedConv2DWithBiasAndRequantize", "QuantizedConv2DAndRelu",
-           "QuantizedConv2DAndReluAndRequantize",
+           "QuantizedConcatV2", "QuantizedConv2DAndRequantize",
+           "QuantizedConv2DWithBias", "QuantizedConv2DWithBiasAndRequantize",
+           "QuantizedConv2DAndRelu", "QuantizedConv2DAndReluAndRequantize",
            "QuantizedConv2DWithBiasAndRelu",
            "QuantizedConv2DWithBiasAndReluAndRequantize",
            "QuantizedConv2DWithBiasSumAndRelu",
diff --git a/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt
index cc16523a1567e8d7f2d0146c1c44d9ef11b6c6d5..72c281de342e553280c029d98a275395a93896d0 100644
--- a/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "AvgPool3D"
-  endpoint {
-    name: "nn.avg_pool3d"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Case.pbtxt b/tensorflow/core/api_def/python_api/api_def_Case.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4c8193a35061a93dc21f1ac02bde318095fbf7b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Case.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Case"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
index 590b37c95fb2a43e49d5c5ae4dcfe8cc499a4c6d..edbcba26ce3d31cc8c3d9aecb9efc5286ddd7002 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
@@ -2,6 +2,7 @@ op {
   graph_op_name: "Conv3DBackpropFilterV2"
   endpoint {
     name: "nn.conv3d_backprop_filter"
+    deprecation_version: 2
   }
   endpoint {
     name: "nn.conv3d_backprop_filter_v2"
diff --git a/tensorflow/core/api_def/python_api/api_def_EuclideanNorm.pbtxt b/tensorflow/core/api_def/python_api/api_def_EuclideanNorm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a3ea8859b5426926230f81a9ec31a6083d3a11dd
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_EuclideanNorm.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "EuclideanNorm"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MulNoNan.pbtxt b/tensorflow/core/api_def/python_api/api_def_MulNoNan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..35b2f309fb3850a4393464209422822eebd9e2a4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MulNoNan.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MulNoNan"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormalV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormalV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a1816dd734fadd538ad81f243c203e3c1ccae259
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormalV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatefulStandardNormalV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListScatterIntoExistingList.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListScatterIntoExistingList.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..20d9a43be4168af3f79d96224eb2fbefaa5752a2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListScatterIntoExistingList.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListScatterIntoExistingList"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc
index 03c0e9ce2b6e7c508f902f45e3c2d3284e081028..c9e3cf40860a2b68024dc0fc61e8c65640cbc9be 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.cc
+++ b/tensorflow/core/common_runtime/base_collective_executor.cc
@@ -63,7 +63,7 @@ int64 CollectiveAdapter::AlignedChunkElts(int64 elt_bytes, int64 total_elts,
       (chunk_bytes < EIGEN_MAX_ALIGN_BYTES)
           ? (EIGEN_MAX_ALIGN_BYTES - chunk_bytes)
           : (EIGEN_MAX_ALIGN_BYTES - (chunk_bytes % EIGEN_MAX_ALIGN_BYTES));
-  CHECK_EQ(0, diff % elt_bytes);
+  DCHECK_EQ(0, diff % elt_bytes);
   base_chunk_elts += (diff / elt_bytes);
   DCHECK_EQ(0, ((base_chunk_elts * elt_bytes) % EIGEN_MAX_ALIGN_BYTES))
       << "total_elts=" << total_elts << " num_chunks=" << num_chunks
@@ -78,17 +78,23 @@ class CollectiveAdapterImpl : public CollectiveAdapter {
  public:
   // Takes ownership of output and prepares to properly alias its chunks.
   // Ownership is taken because the shape may temporarily change.
-  CollectiveAdapterImpl(Tensor* output, int64 num_chunks, Allocator* allocator)
+  CollectiveAdapterImpl(Tensor* output, int64 num_chunks, Allocator* allocator,
+                        bool align_chunks)
       : output_(std::move(*output)),
         dt_(output_.dtype()),
         old_shape_(output_.shape()),
         num_chunks_(num_chunks),
         allocator_(allocator),
         total_elts_(output_.NumElements()),
-        chunk_elts_(AlignedChunkElts(sizeof(T), total_elts_, num_chunks_)),
+        chunk_elts_(align_chunks
+                        ? AlignedChunkElts(sizeof(T), total_elts_, num_chunks_)
+                        : total_elts_ / num_chunks_),
         data_start_(reinterpret_cast<T*>(DMAHelper::base(&output_))),
         data_end_(data_start_ + total_elts_) {
-    CHECK_GT(chunk_elts_, 0);
+    if (!align_chunks) {
+      DCHECK_EQ(total_elts_, num_chunks_ * chunk_elts_);
+    }
+    DCHECK_GT(chunk_elts_, 0);
     Flatten();
   }
 
@@ -176,19 +182,24 @@ class CollectiveAdapterImpl : public CollectiveAdapter {
 }  // namespace
 
 CollectiveAdapter* MakeCollectiveAdapter(Tensor* output, int num_chunks,
-                                         Allocator* allocator) {
+                                         Allocator* allocator,
+                                         bool align_chunks) {
   switch (output->dtype()) {
     case DT_FLOAT:
-      return new CollectiveAdapterImpl<float>(output, num_chunks, allocator);
+      return new CollectiveAdapterImpl<float>(output, num_chunks, allocator,
+                                              align_chunks);
       break;
     case DT_DOUBLE:
-      return new CollectiveAdapterImpl<double>(output, num_chunks, allocator);
+      return new CollectiveAdapterImpl<double>(output, num_chunks, allocator,
+                                               align_chunks);
       break;
     case DT_INT32:
-      return new CollectiveAdapterImpl<int32>(output, num_chunks, allocator);
+      return new CollectiveAdapterImpl<int32>(output, num_chunks, allocator,
+                                              align_chunks);
       break;
     case DT_INT64:
-      return new CollectiveAdapterImpl<int64>(output, num_chunks, allocator);
+      return new CollectiveAdapterImpl<int64>(output, num_chunks, allocator,
+                                              align_chunks);
       break;
     default:
       LOG(FATAL) << "Unsupported type " << output->dtype()
@@ -227,6 +238,7 @@ void BaseCollectiveExecutor::ExecuteAsync(OpKernelContext* ctx,
 
   Tensor* output = ctx->mutable_output(0);
   const Tensor* input = (col_params.instance.type == REDUCTION_COLLECTIVE ||
+                         col_params.instance.type == GATHER_COLLECTIVE ||
                          (col_params.instance.type == BROADCAST_COLLECTIVE &&
                           col_params.is_source))
                             ? &ctx->input(0)
@@ -301,6 +313,8 @@ bool BaseCollectiveExecutor::CheckDependencies(
   for (int32 instance : col_params.instance.impl_details.dependencies) {
     auto find_iter = launched_.find(instance);
     if (find_iter == launched_.end() || find_iter->second != 0) {
+      VLOG(1) << "Collective " << col_params.ToString()
+              << " blocked by instance " << instance;
       return false;
     }
   }
@@ -313,6 +327,7 @@ void BaseCollectiveExecutor::WaitForDependencies(
   while (!CheckDependencies(col_params)) {
     launch_cv_.wait(l);
   }
+  VLOG(1) << "Unblocking collective " << col_params.ToString();
 }
 
 void BaseCollectiveExecutor::Launched(const CollectiveParams& col_params) {
@@ -325,6 +340,8 @@ void BaseCollectiveExecutor::Launched(const CollectiveParams& col_params) {
     launched_[col_params.instance.instance_key] = num_devices;
   }
   if (--launched_[col_params.instance.instance_key] == 0) {
+    VLOG(1) << "Unblocking dependencies for collective instance "
+            << col_params.instance.instance_key;
     launch_cv_.notify_all();
   }
 }
diff --git a/tensorflow/core/common_runtime/base_collective_executor.h b/tensorflow/core/common_runtime/base_collective_executor.h
index b711aa6d5004947466d25e5e6e7d1be9216a2afb..bc85b5af5f87bd6d5fc1cdc28b17248eeb33a25d 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.h
+++ b/tensorflow/core/common_runtime/base_collective_executor.h
@@ -78,9 +78,15 @@ class CollectiveAdapter {
 };
 
 // Create a CollectiveAdaptor wrapping 'output', specialized to its
-// data-type and shape.
+// data-type and shape.  If align_chunks == true then chunk size may
+// be larger than output->NumElements() / num_chunks and one or more
+// of the suffix chunks may be empty.  Chunks will be arranged to start
+// and end on alignment boundaries.  If align_chunks == false then
+// output->NumElements() % num_chunks must be 0 and all chunks will
+// have exactly the same size, ignoring alignment issues.
 CollectiveAdapter* MakeCollectiveAdapter(Tensor* output, int num_chunks,
-                                         Allocator* allocator);
+                                         Allocator* allocator,
+                                         bool align_chunks = true);
 
 // Default implementation of CollectiveExecutor.  Delegates the actual
 // work of moving data to a class specialized for the operation type,
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 3843ea9e60cfbac4c428174f9b2201ccafaf505e..0e4ddb102002ec2802761e05013834cf491f7980 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
+#include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -152,6 +153,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
   c->allocation_id = -1;
   c->prev = kInvalidChunkHandle;
   c->next = kInvalidChunkHandle;
+  c->freed_count = 0;
 
   region_manager_.set_handle(c->ptr, h);
 
@@ -180,29 +182,46 @@ void BFCAllocator::DeallocateChunk(ChunkHandle h) {
   free_chunks_list_ = h;
 }
 
-void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes) {
+void* BFCAllocator::AllocateRawInternalWithRetry(
+    size_t unused_alignment, size_t num_bytes,
+    const AllocationAttributes& allocation_attr) {
   // Fast path: Try once to allocate without getting the retry_helper_ involved
-  void* r = AllocateRawInternal(unused_alignment, num_bytes, false);
+  uint64 freed_by_count = 0;
+  if (allocation_attr.freed_by_func != nullptr) {
+    freed_by_count = allocation_attr.freed_by_func();
+  }
+  void* r =
+      AllocateRawInternal(unused_alignment, num_bytes, false, freed_by_count);
   if (r != nullptr) {
     return r;
   } else {
     static const int64 kMaxMillisToWait = 10000;  // 10 seconds
-    return retry_helper_.AllocateRaw(
-        [this](size_t a, size_t nb, bool v) {
-          return AllocateRawInternal(a, nb, v);
+    r = retry_helper_.AllocateRaw(
+        [this, &allocation_attr](size_t a, size_t nb, bool v) {
+          uint64 freed_by_count = 0;
+          if (allocation_attr.freed_by_func != nullptr) {
+            freed_by_count = allocation_attr.freed_by_func();
+          }
+          return AllocateRawInternal(a, nb, v, freed_by_count);
         },
         kMaxMillisToWait, unused_alignment, num_bytes);
+    return r;
   }
 }
 
 void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
                                 const AllocationAttributes& allocation_attr) {
+  VLOG(1) << "AllocateRaw " << Name() << "  " << num_bytes;
   if (allocation_attr.no_retry_on_failure) {
     // Return immediately upon the first failure if this is for allocating an
     // optional scratch space.
     bool dump_log_on_failure = VLOG_IS_ON(2);
-    void* result =
-        AllocateRawInternal(unused_alignment, num_bytes, dump_log_on_failure);
+    uint64 freed_by_count = 0;
+    if (allocation_attr.freed_by_func != nullptr) {
+      freed_by_count = allocation_attr.freed_by_func();
+    }
+    void* result = AllocateRawInternal(unused_alignment, num_bytes,
+                                       dump_log_on_failure, freed_by_count);
     if (result == nullptr) {
       static std::atomic<int32> log_counter{0};
       int32 counter_value = log_counter.load(std::memory_order_relaxed);
@@ -218,7 +237,8 @@ void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
     }
     return result;
   } else {
-    return AllocateRaw(unused_alignment, num_bytes);
+    return AllocateRawInternalWithRetry(unused_alignment, num_bytes,
+                                        allocation_attr);
   }
 }
 
@@ -233,7 +253,8 @@ size_t BFCAllocator::RoundedBytes(size_t bytes) {
 
 void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
                                         size_t num_bytes,
-                                        bool dump_log_on_failure) {
+                                        bool dump_log_on_failure,
+                                        uint64 freed_before) {
   if (num_bytes == 0) {
     LOG(ERROR) << "tried to allocate 0 bytes";
     return nullptr;
@@ -247,14 +268,14 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
   BinNum bin_num = BinNumForSize(rounded_bytes);
 
   mutex_lock l(lock_);
-  void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
+  void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
   if (ptr != nullptr) {
     return ptr;
   }
 
   // Try to extend
   if (Extend(unused_alignment, rounded_bytes)) {
-    ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
+    ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
     if (ptr != nullptr) {
       return ptr;
     }
@@ -274,7 +295,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
 }
 
 void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
-                                 size_t num_bytes) {
+                                 size_t num_bytes, uint64 freed_before) {
   // First identify the first bin that could satisfy rounded_bytes.
   for (; bin_num < kNumBins; bin_num++) {
     // Start searching from the first bin for the smallest chunk that fits
@@ -285,6 +306,9 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
       const BFCAllocator::ChunkHandle h = (*citer);
       BFCAllocator::Chunk* chunk = ChunkFromHandle(h);
       DCHECK(!chunk->in_use());
+      if (freed_before > 0 && freed_before < chunk->freed_count) {
+        continue;
+      }
       if (chunk->size >= rounded_bytes) {
         // We found an existing chunk that fits us that wasn't in use, so remove
         // it from the free bin structure prior to using.
@@ -311,10 +335,10 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
         // Update stats.
         ++stats_.num_allocs;
         stats_.bytes_in_use += chunk->size;
-        stats_.max_bytes_in_use =
-            std::max(stats_.max_bytes_in_use, stats_.bytes_in_use);
-        stats_.max_alloc_size =
-            std::max<std::size_t>(stats_.max_alloc_size, chunk->size);
+        stats_.peak_bytes_in_use =
+            std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
+        stats_.largest_alloc_size =
+            std::max<std::size_t>(stats_.largest_alloc_size, chunk->size);
 
         VLOG(4) << "Returning: " << chunk->ptr;
         if (VLOG_IS_ON(4)) {
@@ -347,6 +371,9 @@ void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
   // The new chunk is not in use.
   new_chunk->allocation_id = -1;
 
+  // It inherits the freed time.
+  new_chunk->freed_count = c->freed_count;
+
   // Maintain the pointers.
   // c <-> c_neighbor becomes
   // c <-> new_chunk <-> c_neighbor
@@ -364,6 +391,7 @@ void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
 }
 
 void BFCAllocator::DeallocateRaw(void* ptr) {
+  VLOG(1) << "DeallocateRaw " << Name() << " " << RequestedSize(ptr);
   DeallocateRawInternal(ptr);
   retry_helper_.NotifyDealloc();
 }
@@ -415,6 +443,9 @@ void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1,
   // Set the new size
   c1->size += c2->size;
 
+  // Pick latest free time.
+  c1->freed_count = std::max(c1->freed_count, c2->freed_count);
+
   DeleteChunk(h2);
 }
 
@@ -460,6 +491,11 @@ void BFCAllocator::FreeAndMaybeCoalesce(BFCAllocator::ChunkHandle h) {
   // Mark the chunk as no longer in use.
   c->allocation_id = -1;
 
+  // Optionally record the free time.
+  if (timing_counter_) {
+    c->freed_count = timing_counter_->next();
+  }
+
   // Updates the stats.
   stats_.bytes_in_use -= c->size;
 
@@ -630,7 +666,10 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
         in_use_by_size[c->size]++;
       }
       LOG(INFO) << (c->in_use() ? "Chunk" : "Free ") << " at " << c->ptr
-                << " of size " << c->size;
+                << " of size " << c->size
+                << (timing_counter_
+                        ? strings::StrCat(" freed_count ", c->freed_count)
+                        : "");
       h = c->next;
     }
   }
@@ -647,16 +686,16 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
   LOG(INFO) << "Stats: \n" << stats_.DebugString();
 }
 
-void BFCAllocator::GetStats(AllocatorStats* stats) {
+absl::optional<AllocatorStats> BFCAllocator::GetStats() {
   mutex_lock l(lock_);
-  *stats = stats_;
+  return stats_;
 }
 
 void BFCAllocator::ClearStats() {
   mutex_lock l(lock_);
   stats_.num_allocs = 0;
-  stats_.max_bytes_in_use = stats_.bytes_in_use;
-  stats_.max_alloc_size = 0;
+  stats_.peak_bytes_in_use = stats_.bytes_in_use;
+  stats_.largest_alloc_size = 0;
 }
 
 std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins>
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 2d74bf2b286a1fac4d3f9b3921fef7a5b838fce8..b0fd0d8667da96d3590965ae3e05675968389089 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
+#include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -50,9 +51,14 @@ class BFCAllocator : public Allocator {
   ~BFCAllocator() override;
 
   string Name() override { return name_; }
-  void* AllocateRaw(size_t alignment, size_t num_bytes) override;
+
+  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
+    return AllocateRaw(alignment, num_bytes, AllocationAttributes());
+  }
+
   void* AllocateRaw(size_t alignment, size_t num_bytes,
                     const AllocationAttributes& allocation_attr) override;
+
   void DeallocateRaw(void* ptr) override;
 
   bool TracksAllocationSizes() override;
@@ -63,15 +69,23 @@ class BFCAllocator : public Allocator {
 
   int64 AllocationId(const void* ptr) override;
 
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
 
   void ClearStats() override;
 
+  void SetTimingCounter(SharedCounter* sc) { timing_counter_ = sc; }
+
  private:
   struct Bin;
 
   void* AllocateRawInternal(size_t alignment, size_t num_bytes,
-                            bool dump_log_on_failure);
+                            bool dump_log_on_failure,
+                            uint64 freed_before_count);
+
+  void* AllocateRawInternalWithRetry(
+      size_t alignment, size_t num_bytes,
+      const AllocationAttributes& allocation_attr);
+
   void DeallocateRawInternal(void* ptr);
 
   // A ChunkHandle is an index into the chunks_ vector in BFCAllocator
@@ -126,6 +140,9 @@ class BFCAllocator : public Allocator {
     // What bin are we in?
     BinNum bin_num = kInvalidBinNum;
 
+    // Optional count when this chunk was most recently made free.
+    uint64 freed_count = 0;
+
     bool in_use() const { return allocation_id != -1; }
 
     string DebugString(BFCAllocator* a,
@@ -314,8 +331,8 @@ class BFCAllocator : public Allocator {
 
   // Returns a pointer to an underlying allocated chunk of size
   // 'rounded_bytes'.
-  void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes)
-      EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes,
+                     uint64 freed_before) EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
   // Splits the chunk specified by 'h' into two chunks, one at least
   // of size 'num_bytes'.
@@ -420,6 +437,7 @@ class BFCAllocator : public Allocator {
 
   std::unique_ptr<SubAllocator> sub_allocator_;
   string name_;
+  SharedCounter* timing_counter_ = nullptr;
 
   // Structures mutable after construction
   mutable mutex lock_;
diff --git a/tensorflow/core/common_runtime/buf_rendezvous_test.cc b/tensorflow/core/common_runtime/buf_rendezvous_test.cc
index 0e798235bf0649428409a2fa72ac3067736c347a..7621787dec76850e346f65b3883cb2b5073c0077 100644
--- a/tensorflow/core/common_runtime/buf_rendezvous_test.cc
+++ b/tensorflow/core/common_runtime/buf_rendezvous_test.cc
@@ -109,7 +109,7 @@ TEST_F(BufRendezvousTest, CorrectUseConsumerFirst) {
 TEST_F(BufRendezvousTest, ErrorDuplicatePut) {
   bool prod_callback_called = false;
   br_->ProvideBuf("key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
-                  [this, &prod_callback_called](const Status& s) {
+                  [&prod_callback_called](const Status& s) {
                     prod_callback_called = true;
                   });
   Status bad_status;
@@ -129,11 +129,11 @@ TEST_F(BufRendezvousTest, ErrorDuplicatePut) {
 
 TEST_F(BufRendezvousTest, ErrorDeleteNonEmpty) {
   Status cons_status;
-  br_->ConsumeBuf(
-      "key0", [this, &cons_status](const Status& s, BufRendezvous::Hook* h) {
-        cons_status = s;
-        EXPECT_EQ(h, nullptr);
-      });
+  br_->ConsumeBuf("key0",
+                  [&cons_status](const Status& s, BufRendezvous::Hook* h) {
+                    cons_status = s;
+                    EXPECT_EQ(h, nullptr);
+                  });
   EXPECT_TRUE(cons_status.ok());
   br_.reset();
   EXPECT_FALSE(cons_status.ok());
@@ -146,13 +146,13 @@ TEST_F(BufRendezvousTest, AbortNonEmpty) {
   Status prod_status;
   Notification prod_note;
   Notification cons_note;
-  br_->ConsumeBuf("key0", [this, &cons_note, &cons_status](
-                              const Status& s, BufRendezvous::Hook* h) {
+  br_->ConsumeBuf("key0", [&cons_note, &cons_status](const Status& s,
+                                                     BufRendezvous::Hook* h) {
     cons_status = s;
     cons_note.Notify();
   });
   br_->ProvideBuf("key1", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
-                  [this, &prod_note, &prod_status](const Status& s) {
+                  [&prod_note, &prod_status](const Status& s) {
                     prod_status = s;
                     prod_note.Notify();
                   });
@@ -175,13 +175,13 @@ TEST_F(BufRendezvousTest, UseAfterAbort) {
   Status prod_status;
   Notification prod_note;
   Notification cons_note;
-  br_->ConsumeBuf("key0", [this, &cons_note, &cons_status](
-                              const Status& s, BufRendezvous::Hook* h) {
+  br_->ConsumeBuf("key0", [&cons_note, &cons_status](const Status& s,
+                                                     BufRendezvous::Hook* h) {
     cons_status = s;
     cons_note.Notify();
   });
   br_->ProvideBuf("key1", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
-                  [this, &prod_note, &prod_status](const Status& s) {
+                  [&prod_note, &prod_status](const Status& s) {
                     prod_status = s;
                     prod_note.Notify();
                   });
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
index f3d86aa633938042b862613162d1c2a94b0fe35a..3eef5ed0a0c5984474c4d75ae417c030d269290d 100644
--- a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
+++ b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
@@ -44,7 +44,7 @@ class CollectiveExecutorMgrTest : public ::testing::Test {
     std::unique_ptr<DeviceResolverInterface> drl(
         new DeviceResolverLocal(device_mgr_.get()));
     std::unique_ptr<ParamResolverInterface> prl(
-        new CollectiveParamResolverLocal(device_mgr_.get(), drl.get(),
+        new CollectiveParamResolverLocal(cp, device_mgr_.get(), drl.get(),
                                          task_name));
     cme_.reset(new CollectiveExecutorMgr(cp, device_mgr_.get(), std::move(drl),
                                          std::move(prl)));
@@ -73,11 +73,11 @@ TEST_F(CollectiveExecutorMgrTest, StepSequenceRelated) {
   EXPECT_EQ(CollectiveExecutor::kInvalidId, cme_->NextStepId(123));
   Notification ss_note;
   Status ss_status;
-  cme_->RefreshStepIdSequenceAsync(
-      123, [this, &ss_status, &ss_note](const Status& s) {
-        ss_status = s;
-        ss_note.Notify();
-      });
+  cme_->RefreshStepIdSequenceAsync(123,
+                                   [&ss_status, &ss_note](const Status& s) {
+                                     ss_status = s;
+                                     ss_note.Notify();
+                                   });
   ss_note.WaitForNotification();
   EXPECT_FALSE(ss_status.ok());
   EXPECT_EQ(ss_status.error_message(),
@@ -87,7 +87,7 @@ TEST_F(CollectiveExecutorMgrTest, StepSequenceRelated) {
   GetStepSequenceRequest* req = nullptr;
   GetStepSequenceResponse* resp = nullptr;
   cme_->GetStepSequenceAsync(req, resp,
-                             [this, &gs_status, &gs_note](const Status& s) {
+                             [&gs_status, &gs_note](const Status& s) {
                                gs_status = s;
                                gs_note.Notify();
                              });
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
index 8907f6d56a6d098b690b8e79cdc12a5da013d7d1..a76708385be59a2c6cec556d6ab1124b9c2bf541 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -38,9 +38,9 @@ void CollectiveParamResolverLocal::InstanceRec::WaitForOutMu(mutex_lock& lock) {
 }
 
 CollectiveParamResolverLocal::CollectiveParamResolverLocal(
-    const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
-    const string& task_name)
-    : nccl_(false),  // (b/111897089): turn on NCCL collectives.
+    const ConfigProto& config, const DeviceMgr* dev_mgr,
+    DeviceResolverInterface* dev_resolver, const string& task_name)
+    : nccl_(config.experimental().collective_nccl()),
       dev_mgr_(dev_mgr),
       dev_resolver_(dev_resolver),
       task_name_(task_name) {}
@@ -144,7 +144,6 @@ void CollectiveParamResolverLocal::CompleteGroupLocal(
 }
 
 namespace {
-
 struct DevRec {
   string task;
   string device;
@@ -361,7 +360,7 @@ void SortDevicesAndTasks(CollectiveParams* cp) {
   for (int i = 0; i < perm.size(); ++i) {
     perm[i] = i;
   }
-  std::sort(perm.begin(), perm.end(), [cp](const int& a, const int& b) {
+  std::sort(perm.begin(), perm.end(), [cp](int a, int b) {
     return cp->instance.device_names[a] < cp->instance.device_names[b];
   });
   std::vector<string> new_devs;
@@ -585,7 +584,7 @@ void CollectiveParamResolverLocal::CallInitInstanceSharedParams(
 void CollectiveParamResolverLocal::CompleteParamsAsync(
     const string& device, CollectiveParams* cp, CancellationManager* cancel_mgr,
     const StatusCallback& done) {
-  VLOG(1) << "CompleteParams " << device << " for " << cp << ": "
+  VLOG(1) << "CompleteParams local " << device << " for " << cp << ": "
           << cp->ToString();
   CompleteGroupLocal(
       device, cp,
@@ -619,6 +618,8 @@ void CollectiveParamResolverLocal::AssignCollectiveType(CollectiveParams* cp) {
     } else {
       cp->instance.impl_details.collective_name = "RingReduce";
     }
+  } else if (cp->instance.type == GATHER_COLLECTIVE) {
+    cp->instance.impl_details.collective_name = "RingGather";
   } else {
     cp->instance.impl_details.collective_name = "undef";
   }
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.h b/tensorflow/core/common_runtime/collective_param_resolver_local.h
index fd408e4ef31bd7bb8ff5f1e28a0b49e4283bc18c..08e2f338f3c642cdfa6cd2df824cb1177c4b4911 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.h
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 class CompleteGroupRequest;
@@ -36,7 +37,8 @@ class DeviceMgr;
 // group leader for param resolution in a multi-task context.
 class CollectiveParamResolverLocal : public ParamResolverInterface {
  public:
-  CollectiveParamResolverLocal(const DeviceMgr* dev_mgr,
+  CollectiveParamResolverLocal(const ConfigProto& config,
+                               const DeviceMgr* dev_mgr,
                                DeviceResolverInterface* dev_resolver,
                                const string& task_name);
 
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
index 94d889c40dff89204ccfc43478f8732815a4ead4..70eb9f8081aedfde33e3eb67b478c72ca2dee72f 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
@@ -41,8 +41,8 @@ class CollectiveParamResolverLocalTest : public ::testing::Test {
     TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices));
     device_mgr_.reset(new DeviceMgr(std::move(devices)));
     drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
-    prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
-                                                task_name));
+    prl_.reset(new CollectiveParamResolverLocal(cp, device_mgr_.get(),
+                                                drl_.get(), task_name));
   }
 
   void RunCompleteDefaultRanking(
@@ -175,7 +175,7 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsReduction1Task) {
     Env::Default()->SchedClosure([this, i, cp, &note, &statuses]() {
       prl_->CompleteParamsAsync(cp->instance.device_names[0], cp,
                                 nullptr /*CancellationManager*/,
-                                [this, &statuses, &note, i](const Status& s) {
+                                [&statuses, &note, i](const Status& s) {
                                   statuses[i] = s;
                                   note[i].Notify();
                                 });
diff --git a/tensorflow/core/common_runtime/collective_rma_local_test.cc b/tensorflow/core/common_runtime/collective_rma_local_test.cc
index 4263f3a4add524bf59e7c08cfb5d927ac9e23e06..2e9d8cd394e36ed6dbbd5cb6e49687b633bf9186 100644
--- a/tensorflow/core/common_runtime/collective_rma_local_test.cc
+++ b/tensorflow/core/common_runtime/collective_rma_local_test.cc
@@ -46,8 +46,8 @@ class CollectiveRemoteAccessLocalTest : public ::testing::Test {
     TF_CHECK_OK(DeviceFactory::AddDevices(options, kTaskName, &devices));
     device_mgr_.reset(new DeviceMgr(std::move(devices)));
     drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
-    prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
-                                                kTaskName));
+    prl_.reset(new CollectiveParamResolverLocal(cp, device_mgr_.get(),
+                                                drl_.get(), kTaskName));
     rma_.reset(new CollectiveRemoteAccessLocal(device_mgr_.get(), drl_.get(),
                                                kStepId));
   }
@@ -70,7 +70,7 @@ TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU0) {
                      "key_0", cpu0 /*to_device*/, nullptr /*to_device_ctx*/,
                      attr /*to_alloc_attr*/, &sink_tensor, dev_locality,
                      0 /*stream_index*/,
-                     [this, &recv_note, &recv_status](const Status& s) {
+                     [&recv_note, &recv_status](const Status& s) {
                        recv_status = s;
                        recv_note.Notify();
                      });
@@ -85,7 +85,7 @@ TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU0) {
   rma_->PostToPeer(kTaskName + "/device:CPU:0", kTaskName, "key_0",
                    cpu0 /*from_device*/, nullptr /*from_device_ctx*/,
                    attr /*to_alloc_attr*/, &source_tensor, dev_locality,
-                   [this, &send_note, &send_status](const Status& s) {
+                   [&send_note, &send_status](const Status& s) {
                      send_status = s;
                      send_note.Notify();
                    });
@@ -113,7 +113,7 @@ TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU1_2) {
                      "key_0", cpu2 /*to_device*/, nullptr /*to_device_ctx*/,
                      attr /*to_alloc_attr*/, &sink_tensor, dev_locality,
                      0 /*stream_index*/,
-                     [this, &recv_note, &recv_status](const Status& s) {
+                     [&recv_note, &recv_status](const Status& s) {
                        recv_status = s;
                        recv_note.Notify();
                      });
@@ -130,7 +130,7 @@ TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU1_2) {
   rma_->PostToPeer(kTaskName + "/device:CPU:2", kTaskName, "key_0",
                    cpu1 /*from_device*/, nullptr /*from_device_ctx*/,
                    attr /*to_alloc_attr*/, &source_tensor, dev_locality,
-                   [this, &send_note, &send_status](const Status& s) {
+                   [&send_note, &send_status](const Status& s) {
                      send_status = s;
                      send_note.Notify();
                    });
diff --git a/tensorflow/core/common_runtime/colocation_graph.cc b/tensorflow/core/common_runtime/colocation_graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..046f00ccc56020baac9a4a8cf2cb94d8125fbc8f
--- /dev/null
+++ b/tensorflow/core/common_runtime/colocation_graph.cc
@@ -0,0 +1,994 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/colocation_graph.h"
+
+#include <memory>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/dump_graph.h"
+#include "tensorflow/core/util/port.h"
+
+namespace tensorflow {
+
+namespace {
+
+// We hoist the conversion from C-style string literal to StringPiece here,
+// so that we can avoid the many repeated calls to strlen().
+const StringPiece kColocationAttrNameStringPiece(kColocationAttrName);
+const StringPiece kColocationGroupPrefixStringPiece(kColocationGroupPrefix);
+
+// Returns a list of devices having type in supported_device_types.  The
+// returned list is sorted by preferred type (higher numeric type is preferred).
+std::vector<Device*> FilterSupportedDevices(
+    const std::vector<Device*>& devices,
+    const PrioritizedDeviceTypeVector& supported_device_types,
+    const Device* default_device) {
+  Device* filtered_default_device = nullptr;
+  std::vector<std::pair<Device*, int32>> prioritized_filtered_devices;
+  for (const auto& supported_device_type : supported_device_types) {
+    for (Device* device : devices) {
+      if (DeviceType(device->attributes().device_type()) ==
+          supported_device_type.first) {
+        if (device == default_device) {
+          filtered_default_device = device;
+        } else {
+          prioritized_filtered_devices.emplace_back(
+              device, supported_device_type.second);
+        }
+      }
+    }
+  }
+
+  auto device_sort = [](const std::pair<Device*, int32>& a,
+                        const std::pair<Device*, int32>& b) {
+    if (a.second != b.second) {
+      return a.second > b.second;
+    }
+
+    auto a_priority =
+        DeviceSet::DeviceTypeOrder(DeviceType(a.first->device_type()));
+    auto b_priority =
+        DeviceSet::DeviceTypeOrder(DeviceType(b.first->device_type()));
+    // First sort by prioritized device type (higher is preferred) and
+    // then by device name (lexicographically).
+    if (a_priority != b_priority) {
+      return a_priority > b_priority;
+    }
+    return StringPiece(a.first->name()) < StringPiece(b.first->name());
+  };
+  std::sort(prioritized_filtered_devices.begin(),
+            prioritized_filtered_devices.end(), device_sort);
+
+  std::vector<Device*> filtered_devices;
+  if (filtered_default_device != nullptr) {
+    filtered_devices.emplace_back(filtered_default_device);
+  }
+  for (const auto& prioritized_filtered_device : prioritized_filtered_devices) {
+    filtered_devices.push_back(prioritized_filtered_device.first);
+  }
+  return filtered_devices;
+}
+
+// Using absl::StrJoin with lambda does not work in tf-lite builds.
+std::vector<string> DevicesToString(const std::vector<Device*> devices) {
+  std::vector<string> v;
+  v.reserve(devices.size());
+  for (Device* d : devices) {
+    v.push_back(d->name());
+  }
+  return v;
+}
+
+// Using absl::StrJoin with lambda does not work in tf-lite builds.
+std::vector<string> DeviceTypeAndPriorityToString(
+    const PrioritizedDeviceTypeVector& devices) {
+  std::vector<string> v;
+  v.reserve(devices.size());
+  for (const std::pair<DeviceType, int32>& device_and_type : devices) {
+    v.push_back(DeviceTypeString(device_and_type.first));
+  }
+  return v;
+}
+
+// While Placer can override requested device on ops processing
+// resources, i.e. node that take (and potentially return) a resource,
+// it must not override requested device on ops generating a resource,
+// e.g. VarHandleOp, _Arg. Such ops are currently no-input, single resource/ref
+// output nodes.
+bool IsResourceGeneratorNode(const Node& node) {
+  return node.num_inputs() == 0 && node.num_outputs() == 1 &&
+         (IsRefType(node.output_type(0)) || node.output_type(0) == DT_RESOURCE);
+}
+
+bool IsExemptFromResourceInputColocation(const Node* node) {
+  // Note: Partitioned function calls, which place and partition their
+  // function bodies, are exempt from this check: they forward resource and
+  // ref inputs to operations that are appropriately placed, instead of
+  // dereferencing them.
+  const string& op_type = node->op_def().name();
+  return op_type == "PartitionedCall" || op_type == "StatefulPartitionedCall";
+}
+
+bool HasPriorities(const PrioritizedDeviceTypeVector& device_types) {
+  for (const auto& prioritized_device_type : device_types) {
+    if (prioritized_device_type.second != 0) return true;
+  }
+  return false;
+}
+
+bool ArePrioritiesSame(const PrioritizedDeviceTypeVector& a_types,
+                       const PrioritizedDeviceTypeVector& b_types) {
+  if (a_types.size() != b_types.size()) {
+    return false;
+  }
+  for (int i = 0; i < a_types.size(); ++i) {
+    if (a_types[i].first != b_types[i].first) {
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+Status Member::SetParentAndSupportedDevices(
+    const Node& node, const std::vector<DeviceType>& types) {
+  int id = node.id();
+  if (id < 0) {
+    return errors::Internal("Placer should not be creating a Member for node: ",
+                            node.DebugString());
+  }
+  parent_ = id;
+  return SupportedDeviceTypesForNode(types, node.def(),
+                                     &supported_device_types_);
+}
+
+Status Member::SetAssignedDeviceName(const string& device_name) {
+  if (DeviceNameUtils::HasSomeDetails(requested_device_name_)) {
+    return errors::Internal(
+        "Setting assigned device name when there is a requested device set "
+        "is unsupported");
+  }
+  if (!DeviceNameUtils::ParseFullName(device_name, &assigned_device_name_)) {
+    return errors::Internal("Malformed assigned device '", device_name, "'");
+  }
+  // Set requested device to assigned_device to maintain the invariant that
+  // requested is a specialization of assigned.
+  requested_device_name_ = assigned_device_name_;
+  return Status::OK();
+}
+
+Status Member::SetRequestedDeviceName(const Node& node) {
+  if (!DeviceNameUtils::ParseFullName(node.requested_device(),
+                                      &requested_device_name_)) {
+    return errors::InvalidArgument("Malformed device specification '",
+                                   node.requested_device(),
+                                   "' in node: ", node.DebugString());
+  }
+  if (DeviceNameUtils::HasSomeDetails(assigned_device_name_)) {
+    return errors::Internal(
+        "Setting requested device name when there is an assigned device set "
+        "is unsupported");
+  }
+  return Status::OK();
+}
+
+Status Member::EnsureCompatibilityAcrossResourceEdge(
+    const Node& src, const Member& src_root,
+    const Node& dst, /*dst_root is this*/
+    bool log_device_placement) {
+  if (!DeviceNameUtils::AreCompatibleDevNames(src_root.assigned_device_name_,
+                                              assigned_device_name_)) {
+    return errors::InvalidArgument(
+        "Cannot place the graph because a reference or resource edge "
+        "connects colocation groups with incompatible assigned devices: ",
+        DeviceNameUtils::ParsedNameToString(src_root.assigned_device_name_),
+        " vs ", DeviceNameUtils::ParsedNameToString(assigned_device_name_));
+  }
+
+  if (DeviceNameUtils::AreCompatibleDevNames(src_root.requested_device_name_,
+                                             requested_device_name_)) {
+    return Status::OK();
+  }
+
+  // If we are here, assigned devices are compatible but requested ones are
+  // not. We will be overriding the requested device for destination node, but
+  // need to preserve the invariant that it will be a specialization of
+  // the assigned device.
+  if (log_device_placement) {
+    LOG(INFO) << "Ignoring device specification "
+              << DeviceNameUtils::ParsedNameToString(requested_device_name_)
+              << " for node '" << dst.name()
+              << "' because the input edge from '" << src.name()
+              << "' is a reference connection and already has a device "
+                 "field set to "
+              << DeviceNameUtils::ParsedNameToString(
+                     src_root.requested_device_name_);
+  }
+  requested_device_name_ = src_root.requested_device_name_;
+  DeviceNameUtils::EnsureSpecification(&requested_device_name_,
+                                       assigned_device_name_);
+  return Status::OK();
+}
+
+void Member::Merge(std::vector<Member>* tree, int x_root, int y_root,
+                   Member** new_root, Member** old_root, bool dry_run) {
+  Member& x_root_member = (*tree)[x_root];
+  Member& y_root_member = (*tree)[y_root];
+
+  // Merge the sets by setting the parent pointer of the smaller tree's root
+  // node to point to the root of the larger tree. Together with path
+  // compression in ColocationGraph::FindRoot, this ensures that we do not
+  // experience pathological performance on graphs such as chains.
+  int new_root_id, old_root_id;
+  if (x_root_member.rank_ < y_root_member.rank_) {
+    // The tree rooted at x_root is shallower, so connect it to
+    // y_root. The rank of y_root is unchanged because its new
+    // child has strictly less rank.
+    if (!dry_run) {
+      x_root_member.parent_ = y_root;
+    }
+    new_root_id = y_root;
+    old_root_id = x_root;
+  } else if (x_root_member.rank_ > y_root_member.rank_) {
+    // The tree rooted at y_root is shallower, so connect it to
+    // x_root. The rank of x_root is unchanged because its new
+    // child has strictly less rank.
+    if (!dry_run) {
+      y_root_member.parent_ = x_root;
+    }
+    new_root_id = x_root;
+    old_root_id = y_root;
+  } else {
+    if (!dry_run) {
+      // Both trees have the same rank, so break the tie by choosing
+      // x_root as the new root.
+      y_root_member.parent_ = x_root;
+      // Increment the rank of the tree rooted at x_root, because it
+      // is now strictly deeper than before.
+      ++x_root_member.rank_;
+    }
+    new_root_id = x_root;
+    old_root_id = y_root;
+  }
+
+  *new_root = &(*tree)[new_root_id];
+  *old_root = &(*tree)[old_root_id];
+}
+
+// tree is non-const because we can change some `parent` pointers in some
+// members for more efficient future lookups. The vector itself is not
+// changed.
+int Member::FindRoot(std::vector<Member>* tree, int node_id) {
+  Member& member = (*tree)[node_id];
+  if (member.parent_ == node_id) {
+    // member.parent is the root of this disjoint tree.  Do nothing.
+  } else {
+    member.parent_ = FindRoot(tree, member.parent_);
+  }
+  // Now it is guaranteed that member.parent is the root of this disjoint
+  // tree.
+  return member.parent_;
+}
+
+Status Member::MergeDeviceNames(const Member& other,
+                                bool allow_soft_placement) {
+  // Assuming the "requested is a specialization of assigned" invariant holds
+  // for this and `other`, it will hold after the two merges below.
+  DeviceNameUtils::ParsedName assigned_device_name_copy = assigned_device_name_;
+  TF_RETURN_IF_ERROR(DeviceNameUtils::MergeDevNames(
+      &assigned_device_name_copy, other.assigned_device_name_));
+
+  DeviceNameUtils::ParsedName requested_device_name_copy =
+      requested_device_name_;
+  TF_RETURN_IF_ERROR(DeviceNameUtils::MergeDevNames(
+      &requested_device_name_copy, other.requested_device_name_,
+      allow_soft_placement));
+
+  // We checked for all errors, now change the devices.
+  assigned_device_name_ = assigned_device_name_copy;
+  requested_device_name_ = requested_device_name_copy;
+  return Status::OK();
+}
+
+// Updates this to contain the intersection of the device types in
+// this and "other".
+bool Member::MergeSupportedDevices(const Member& other) {
+  // Generate intersection with priorities.
+  // Each vector contains the same device types but with different priorities.
+  // The priorities are taken from the corresponding source vector.
+  PrioritizedDeviceTypeVector target_intersection;
+  PrioritizedDeviceTypeVector other_intersection;
+  for (const auto& prioritized_device_type : supported_device_types_) {
+    bool found = false;
+    for (const auto& other_prioritized_device_type :
+         other.supported_device_types_) {
+      if (prioritized_device_type.first ==
+          other_prioritized_device_type.first) {
+        found = true;
+        other_intersection.push_back(other_prioritized_device_type);
+        break;
+      }
+    }
+    if (found) {
+      target_intersection.push_back(prioritized_device_type);
+    }
+  }
+
+  // Sort the devices by priority order.
+  auto device_sort = [](const std::pair<DeviceType, int32>& a,
+                        const std::pair<DeviceType, int32>& b) {
+    // First look at set priorities.
+    if (a.second != b.second) {
+      return a.second > b.second;
+    }
+    // Then fallback to default priorities.
+    auto a_priority = DeviceSet::DeviceTypeOrder(a.first);
+    auto b_priority = DeviceSet::DeviceTypeOrder(b.first);
+    if (a_priority != b_priority) {
+      return a_priority > b_priority;
+    }
+    // Finally just look at the Device type strings.
+    return a.first.type_string() < b.first.type_string();
+  };
+
+  std::sort(target_intersection.begin(), target_intersection.end(),
+            device_sort);
+  std::sort(other_intersection.begin(), other_intersection.end(), device_sort);
+
+  PrioritizedDeviceTypeVector result;
+
+  bool is_target_prioritized = HasPriorities(target_intersection);
+  bool is_other_prioritized = HasPriorities(other_intersection);
+  if (!is_target_prioritized && !is_other_prioritized) {
+    // If neither are prioritized then we just return the original i.e. target
+    // prioritization.
+    result = target_intersection;
+  } else if (is_target_prioritized && !is_other_prioritized) {
+    // If only one is prioritized, then we respect priorities of that in the
+    // intersection.
+    result = target_intersection;
+  } else if (!is_target_prioritized && is_other_prioritized) {
+    result = other_intersection;
+  } else {
+    // If both have priorities and agree then we go with that. If the
+    // prioritization order is different, then we just fallback to the default
+    // i.e. what the DeviceTypeOrder suggests. In that case, we also set the
+    // merged priorities to 0, so that downstream merges work correctly as well.
+    if (ArePrioritiesSame(target_intersection, other_intersection)) {
+      result = target_intersection;
+    } else {
+      for (const auto& prioritized_device : target_intersection) {
+        result.push_back(std::make_pair(prioritized_device.first, 0));
+      }
+      std::sort(result.begin(), result.end(), device_sort);
+    }
+  }
+
+  if (result.empty()) {
+    return false;
+  }
+  supported_device_types_ = result;
+  return true;
+}
+
+Status Member::AssignDevice(const Node& node, bool allow_soft_placement) {
+  if (node.assigned_device_name_index() == assigned_device_name_index_) {
+    return Status::OK();
+  }
+
+  DeviceNameUtils::ParsedName parsed;
+  DeviceNameUtils::ParseFullName(node.assigned_device_name(), &parsed);
+  Status s = DeviceNameUtils::MergeDevNames(&assigned_device_name_, parsed,
+                                            allow_soft_placement);
+  if (!s.ok()) {
+    return errors::Internal(
+        "Constraining by assigned device should not cause an error. Original "
+        "root's assigned device name: ",
+        DeviceNameUtils::ParsedNameToString(assigned_device_name_),
+        " node's assigned device name \"", node.assigned_device_name(),
+        ". Error: ", s.error_message());
+  }
+  s = DeviceNameUtils::MergeDevNames(&requested_device_name_, parsed,
+                                     allow_soft_placement);
+  if (!s.ok()) {
+    return errors::Internal(
+        "Constraining by assigned device should not cause an error. Original "
+        "root's requested device name: \"",
+        DeviceNameUtils::ParsedNameToString(requested_device_name_),
+        "\", node's assigned device name \"", node.assigned_device_name(),
+        "\". Error: ", s.error_message());
+  }
+
+  assigned_device_name_index_ = node.assigned_device_name_index();
+  // Clear cached possible_devices, if any.
+  possible_devices_.clear();
+  return Status::OK();
+}
+string Member::DebugString() {
+  return absl::StrCat(
+      "Member(assigned_device_name_index_=", assigned_device_name_index_,
+      " requested_device_name_=",
+      DeviceNameUtils::ParsedNameToString(requested_device_name_),
+      " assigned_device_name_=",
+      DeviceNameUtils::ParsedNameToString(assigned_device_name_),
+      " supported_device_types_=[",
+      absl::StrJoin(DeviceTypeAndPriorityToString(supported_device_types_),
+                    ", "),
+      "] possible_devices_=[",
+      absl::StrJoin(DevicesToString(possible_devices_), ", "), "]");
+}
+ColocationGraph::ColocationGraph(const Graph* graph,
+                                 const DeviceSet* device_set,
+                                 const Device* default_device,
+                                 bool allow_soft_placement,
+                                 bool log_device_placement)
+    : graph_(graph),
+      device_set_(device_set),
+      device_types_(device_set->PrioritizedDeviceTypeList()),
+      default_device_(default_device),
+      allow_soft_placement_(allow_soft_placement),
+      log_device_placement_(log_device_placement) {
+  members_.resize(graph->num_node_ids());
+}
+
+// Adds each node of the Graph to this ColocationGraph as a singleton.
+//
+// NOTE: The implementation assumes that the ids of nodes passed to
+// this method are dense and zero-based; the memory used will be linear in
+// the largest node ID.
+// NOTE: If this method returns an error, *this is left in an undefined
+// state.
+Status ColocationGraph::ColocateAllNodes() {
+  // This maps from a colocation group identifier to the 'root' of that
+  // colocation group.  Note that the keys in this map are StringPiece; the
+  // actual strings are stored under the NodeDef.  The lifetime of this map
+  // is limited to this ColocateAllNodes() method, and no part of the
+  // NodeDef trees are changed during the lifetime of this method, so using
+  // StringPiece as a key is safe.
+  //
+  // Also, as a further optimization, we remove the "loc:@" prefix from
+  // "class" attribute values, when they are used as keys in this table.
+  // This allows us to use StringPiece values that refer to substrings of
+  // 'string' values stored in NodeDef attribute lists, as well as StringPiece
+  // values that refer to 'string' values from NodeDef::name(), without
+  // performing any string allocations.
+  std::unordered_map<StringPiece, const Node*, StringPieceHasher>
+      colocation_group_root;
+
+  for (const Node* node : graph_->op_nodes()) {
+    // When adding the node, identify whether it is part of a colocation
+    // group.
+
+    // This code is effectively the equivalent of GetNodeAttr() for a string
+    // array, but it avoids all internal allocations (the allocation of the
+    // backing store of the std::vector<string> as well as the copies of the
+    // strings within it).  Instead, we combine the query of the colocation
+    // attribute with the calls to ColocateNodeToGroup.
+    bool found_spec = false;
+    const AttrValue* attr_value =
+        node->attrs().Find(kColocationAttrNameStringPiece);
+    if (attr_value != nullptr && attr_value->has_list()) {
+      for (const string& class_spec : attr_value->list().s()) {
+        StringPiece spec(class_spec);
+        if (str_util::ConsumePrefix(&spec, kColocationGroupPrefixStringPiece)) {
+          found_spec = true;
+          TF_RETURN_IF_ERROR(
+              ColocateNodeToGroup(&colocation_group_root, node, spec));
+        }
+      }
+    }
+
+    // TODO(iga): Even when the node has a spec, we need to colocate the
+    // node to its "name group" because other nodes can still use
+    // "loc:@<this_node_name>" in their colocation specs.
+    if (!found_spec) {
+      // If the node does not specify a colocation group, then use the
+      // name of this node as the colocation group.
+      TF_RETURN_IF_ERROR(
+          ColocateNodeToGroup(&colocation_group_root, node, node->name()));
+    }
+  }
+
+  return Status::OK();
+}
+
+Status ColocationGraph::ColocateResourceOrRefEdge(Node* src, Node* dst) {
+  // Colocate `src` and `dst` to maintain the invariant that nodes
+  // connected by reference edges are colocated.
+  int src_root_id = FindRoot(src->id());
+  int dst_root_id = FindRoot(dst->id());
+  auto& src_root = members_[src_root_id];
+  auto& dst_root = members_[dst_root_id];
+
+  TF_RETURN_IF_ERROR(dst_root.EnsureCompatibilityAcrossResourceEdge(
+      *src, src_root, *dst, log_device_placement_));
+  Status status = ColocateNodes(*src, src_root_id, *dst, dst_root_id);
+  if (!status.ok()) {
+    return AttachDef(
+        errors::InvalidArgument("Nodes were connected by a "
+                                "reference connection (requiring them to "
+                                "be on the same device), but the two nodes "
+                                "were assigned two different devices: ",
+                                status.error_message()),
+        *dst);
+  }
+  return Status::OK();
+}
+
+Status ColocationGraph::ColocateResourceAndRefEdges() {
+  // Enumerate the constraint edges, and use them to update the disjoint
+  // node set.
+  // If `node` has an input edge with reference type, add an edge from the
+  // source of that edge to `node`.
+  for (const Edge* edge : graph_->edges()) {
+    if (edge->IsControlEdge()) {
+      continue;
+    }
+    Node* src = edge->src();
+    Node* dst = edge->dst();
+    DataType input_type = dst->input_type(edge->dst_input());
+    if ((input_type == DT_RESOURCE || IsRefType(input_type)) &&
+        !IsExemptFromResourceInputColocation(dst)) {
+      TF_RETURN_IF_ERROR(ColocateResourceOrRefEdge(src, dst));
+    }
+  }
+  return Status::OK();
+}
+
+Status ColocationGraph::Initialize() {
+  TF_RETURN_IF_ERROR(InitializeMembers());
+  TF_RETURN_IF_ERROR(ColocateResourceAndRefEdges());
+  TF_RETURN_IF_ERROR(ColocateAllNodes());
+  return Status::OK();
+}
+
+Status ColocationGraph::ColocateNodeToGroup(
+    std::unordered_map<StringPiece, const Node*, StringPieceHasher>*
+        colocation_group_root,
+    const Node* node, StringPiece colocation_group) {
+  const Node*& root_node = (*colocation_group_root)[colocation_group];
+  if (root_node == nullptr) {
+    // This is the first node of the colocation group, so
+    // designate this node as the 'root' of that colocation group.
+    root_node = node;
+  } else {
+    // Try to colocate the node with the root.  If there is an
+    // error, return it.
+    Status s = ColocateNodes(*node, *root_node);
+    if (!s.ok()) {
+      if (!allow_soft_placement_) {
+        return AttachDef(s, *node);
+      }
+      if (log_device_placement_) {
+        LOG(INFO) << "Ignoring request to colocate node '" << node->name()
+                  << "' with nodes in colocation group '" << colocation_group
+                  << "' because soft placement is on and an attempt at doing "
+                     "so resulted in the following error: "
+                  << AttachDef(s, *node).ToString();
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// Merge the (possibly disjoint) sets containing nodes "x" and
+// "y". Returns OK if the all nodes in the union of these sets can
+// be placed on the same device type.
+//
+// NOTE: If this method returns an error, *this is left in an undefined
+// state.
+Status ColocationGraph::ColocateNodes(const Node& x, const Node& y) {
+  int x_root = FindRoot(x.id());
+  int y_root = FindRoot(y.id());
+  return ColocateNodes(x, x_root, y, y_root);
+}
+
+// This overload of ColocateNodes() allows a caller to provide the root node
+// ids for the two nodes. For large graphs, this noticeably reduces the
+// graph load time.
+Status ColocationGraph::ColocateNodes(const Node& x, int x_root, const Node& y,
+                                      int y_root) {
+  if (x_root == y_root) {
+    return Status::OK();
+  }
+
+  Member* new_root_member;
+  Member* old_root_member;
+  Member::Merge(&members_, x_root, y_root, &new_root_member, &old_root_member,
+                /*dry_run=*/true);
+
+  // Merge the partial device specifications, and ensure that they are
+  // compatible. NULL options_ is treated as allowing soft placement.
+  // If there is an error, nothing is modified.
+  // TODO(mrry): Consider enriching the error message by pointing
+  // out which nodes have the explicit partial device
+  // specifications that caused this conflict.
+  Status s = new_root_member->MergeDeviceNames(*old_root_member,
+                                               allow_soft_placement_);
+  if (!s.ok()) {
+    return errors::InvalidArgument(
+        "Cannot colocate nodes ",
+        errors::FormatColocationNodeForError(x.name()), " and ",
+        errors::FormatColocationNodeForError(y.name()), ": ",
+        s.error_message());
+  }
+
+  // Ensure that the common root has at least one supported device
+  // type, by computing the intersection of
+  // new_root_member.supported_device_types and
+  // old_root_member.supported_device_types.
+  if (!new_root_member->MergeSupportedDevices(*old_root_member)) {
+    return errors::InvalidArgument(
+        "Cannot colocate nodes ",
+        errors::FormatColocationNodeForError(x.name()), " and ",
+        errors::FormatColocationNodeForError(y.name()),
+        " because no device type supports both of those nodes and the "
+        "other nodes colocated with them.",
+        DebugInfo(x_root), DebugInfo(y_root));
+  }
+
+  // All error checks are done, merge the colocation graphs.
+  Member::Merge(&members_, x_root, y_root, &new_root_member, &old_root_member,
+                /*dry_run=*/false);
+  return Status::OK();
+}
+
+// Limits the possible devices of `node`'s colocation group to the device
+// to which `node` is assigned. This makes sure that all nodes in this
+// colocation group will be assigned to the same device. Without this
+// explicit restriction, heuristics can choose a different possible device
+// for other nodes in the group.
+Status ColocationGraph::LimitToAssignedDevice(const Node& node) {
+  if (node.assigned_device_name_index() < 0) {
+    return errors::Internal(
+        "Expected an assigned node as argument to LimitToAssignedDevice but "
+        "got: ",
+        node.DebugString());
+  }
+  int root = FindRoot(node.id());
+  Member& root_member = members_[root];
+  return root_member.AssignDevice(node, allow_soft_placement_);
+}
+
+// For the given node, subject to the constraints previously given
+// to this ColocationGraph, set its assigned_device_name. Returns OK
+// if a satisfying device can be found, otherwise an error.
+//
+// Note: This method returns a pointer to a field within members_.
+// The caller must not use the returned pointer after there is any possibility
+// that the members_[i].possible_devices field has been modified.
+Status ColocationGraph::GetDevicesForNode(
+    Node* node, const std::vector<Device*>** possible_devices) {
+  *possible_devices = nullptr;
+  const int node_root = FindRoot(node->id());
+  if (!members_[node_root].possible_devices().empty()) {
+    *possible_devices = &members_[node_root].possible_devices();
+    return Status::OK();
+  }
+
+  // We have not yet computed the possible devices for the
+  // colocated node set containing 'node', so we do so now using the
+  // constraints on the root node.
+
+  // "devices" will contain the set of feasible placements for the
+  // colocated node set containing 'node'.
+  std::vector<Device*> devices;
+  if (DeviceNameUtils::HasSomeDetails(
+          members_[node_root].requested_device_name())) {
+    // The root node has a (possibly partial) device
+    // specification, so enumerate the physical devices that
+    // conform to it.
+    device_set_->FindMatchingDevices(
+        members_[node_root].requested_device_name(), &devices);
+
+    if (!devices.empty()) {
+      // Filter devices into those that are compatible with the root
+      // node (and its children).
+      devices = FilterSupportedDevices(
+          devices, members_[node_root].supported_device_types(),
+          default_device_);
+    }
+
+    // Perform soft placement if allow_soft_placement_ is set.
+    if (devices.empty() && allow_soft_placement_) {
+      // The soft_device_name is the same as the node's device name
+      // without specifying the device type or ID.
+      DeviceNameUtils::ParsedName soft_device_name =
+          members_[node_root].requested_device_name();
+      soft_device_name.type.clear();
+      soft_device_name.has_type = false;
+      soft_device_name.has_id = false;
+      device_set_->FindMatchingDevices(soft_device_name, &devices);
+      if (!devices.empty()) {
+        devices = FilterSupportedDevices(
+            devices, members_[node_root].supported_device_types(),
+            default_device_);
+      }
+    }
+
+    if (devices.empty()) {
+      // Return an error when a physical device that matches an explicit
+      // device specification is not found. This ensures that we don't
+      // assign a node to GPU when the user wanted to force it on CPU.
+      string debug_info = DebugInfo(node_root);
+
+      DeviceNameUtils::ParsedName specified_device_name;
+      if (DeviceNameUtils::ParseFullName(node->requested_device(),
+                                         &specified_device_name) &&
+          specified_device_name ==
+              members_[node_root].requested_device_name()) {
+        // The specified device and merged set device match, and
+        // will appear in the GraphDef (for debugging), so just
+        // print the specified device.
+        std::vector<Device*> devices_matching_nodedef;
+        device_set_->FindMatchingDevices(specified_device_name,
+                                         &devices_matching_nodedef);
+        if (devices_matching_nodedef.empty()) {
+          // Sometimes it is almost impossible to understand the problem
+          // without a list of available devices.
+          std::vector<string> device_names;
+          for (const Device* device : device_set_->devices()) {
+            device_names.push_back(device->name());
+          }
+          std::sort(device_names.begin(), device_names.end());
+
+          string gpu_msg = "";
+          if (!IsGoogleCudaEnabled() &&
+              str_util::Lowercase(specified_device_name.type) == "gpu") {
+            gpu_msg =
+                " The requested device appears to be a GPU, but CUDA is not "
+                "enabled.";
+          }
+
+          return errors::InvalidArgument(
+              errors::FormatNodeNameForError(node->name()),
+              "was explicitly assigned to ", node->requested_device(),
+              " but available devices are [ ",
+              str_util::Join(device_names, ", "), " ]. Make sure ",
+              "the device specification refers to a valid device.", gpu_msg);
+        } else if (specified_device_name.has_type) {
+          return errors::InvalidArgument(
+              "Could not satisfy explicit device specification '",
+              node->requested_device(), "' because no supported kernel for ",
+              specified_device_name.type, " devices is available.", debug_info,
+              "\nRegistered kernels:\n",
+              KernelsRegisteredForOp(node->type_string()));
+        } else {
+          return errors::InvalidArgument(
+              "Could not satisfy explicit device specification '",
+              node->requested_device(), debug_info);
+        }
+      } else {
+        // The specified device may be a valid device but the
+        // merged set device is different, so print both.
+        return errors::InvalidArgument(
+            "Could not satisfy explicit device specification '",
+            node->requested_device(), "' because the node ",
+            errors::FormatColocationNodeForError(node->name()),
+            " was colocated with a group of nodes that ",
+            "required incompatible device '",
+            DeviceNameUtils::ParsedNameToString(
+                members_[node_root].requested_device_name()),
+            "'", debug_info);
+      }
+    }
+  } else {
+    // The device is completely unspecified, so enumerate the devices that
+    // support all of the nodes in the set.
+    if (device_set_->devices().empty()) {
+      return errors::Internal("No devices are registered");
+    }
+    devices = FilterSupportedDevices(
+        device_set_->devices(), members_[node_root].supported_device_types(),
+        default_device_);
+
+    if (devices.empty()) {
+      return errors::InvalidArgument(
+          "Node had no OpKernel registered to support this operation: ",
+          "Operation was ", node->type_string(), " and inputs were ",
+          DataTypeVectorString(node->input_types()), DebugInfo(node_root));
+    }
+  }
+
+  // Cache the result of the possible devices for this node group.
+  members_[node_root].set_possible_devices(std::move(devices));
+  *possible_devices = &members_[node_root].possible_devices();
+  return Status::OK();
+}
+
+Status ColocationGraph::InitializeMembers() {
+  for (Node* node : graph_->op_nodes()) {
+    Status status = InitializeMember(*node, &members_[node->id()]);
+    if (!status.ok()) {
+      return AttachDef(status, *node);
+    }
+  }
+  return Status::OK();
+}
+
+string ColocationGraph::DebugString() {
+  std::unordered_set<int> roots;
+  std::vector<string> root_strings;
+  for (const Node* node : graph_->nodes()) {
+    if (!node->IsOp()) {
+      continue;
+    }
+    int node_root = FindRoot(node->id());
+    if (roots.count(node_root) == 0) {
+      root_strings.push_back(DebugInfo(node_root));
+      roots.insert(node_root);
+    }
+  }
+  return absl::StrJoin(root_strings, "\n");
+}
+
+// Returns debugging info for the node referred to by 'node_root'.
+string ColocationGraph::DebugInfo(const int node_root) {
+  string text(
+      "\nColocation Debug Info:\n"
+      "Colocation group had the following types and devices: ");
+
+  // If this node is part of a colocation group, then we want to
+  // collect the mapping of ops to supported devices, so that
+  // the user can see why an unsatisfiable placement occurred.
+
+  std::unordered_map<string, string> type_to_devices;
+  std::vector<const Node*> colocation_nodes;
+  int num_nodes_found = 0;
+
+  for (const Node* node : graph_->nodes()) {
+    if (!node->IsOp()) {
+      continue;
+    }
+    int id = node->id();
+    if (FindRoot(id) != node_root) {
+      continue;
+    }
+    ++num_nodes_found;
+    colocation_nodes.push_back(node);
+    const string& op_type = node->type_string();
+    string devices_registered;
+    for (const auto& device_type : members_[id].supported_device_types()) {
+      strings::StrAppend(&devices_registered,
+                         DeviceTypeString(device_type.first), " ");
+    }
+
+    type_to_devices[op_type] = std::move(devices_registered);
+  }
+
+  for (const auto& td : type_to_devices) {
+    strings::StrAppend(&text, "\n", td.first, ": ", td.second);
+  }
+  strings::StrAppend(&text,
+                     "\n\nColocation members and user-requested devices:");
+  for (const Node* node : colocation_nodes) {
+    strings::StrAppend(&text, "\n  ", node->name(), " (", node->type_string(),
+                       ") ", node->requested_device());
+  }
+  strings::StrAppend(&text, "\n");
+
+  if (num_nodes_found <= 0) {
+    text.clear();
+  }
+  return text;
+}
+
+Status ColocationGraph::InitializeMemberWithAssignedDevice(
+    const string& assigned_device_name, const string& node_type,
+    bool must_be_full_name, Member* member) {
+  // This node has already been assigned to a device, so we
+  // respect this placement, after sanity-checking it.
+  // NOTE: Since any assignment must have been performed by
+  // the TensorFlow runtime, we consider errors in this branch to
+  // be INTERNAL.
+  TF_RETURN_IF_ERROR(member->SetAssignedDeviceName(assigned_device_name));
+  if (!must_be_full_name) {
+    return Status::OK();
+  }
+  // Since assigned device must be a full specification, do extra checks.
+  const Device* assigned_device =
+      device_set_->FindDeviceByName(assigned_device_name);
+  if (assigned_device == nullptr) {
+    return errors::Internal("Assigned device '", assigned_device_name,
+                            "' does not match any device");
+  }
+
+  for (const auto& d : member->supported_device_types()) {
+    if (DeviceType(assigned_device->attributes().device_type()) == d.first) {
+      return Status::OK();
+    }
+  }
+
+  return errors::Internal("Assigned device '", assigned_device_name,
+                          "' does not have registered OpKernel support "
+                          "for ",
+                          node_type);
+}
+
+Status ColocationGraph::InitializeMember(const Node& node, Member* member) {
+  TF_RETURN_IF_ERROR(member->SetParentAndSupportedDevices(node, device_types_));
+
+  if (node.has_assigned_device_name()) {
+    TF_RETURN_IF_ERROR(InitializeMemberWithAssignedDevice(
+        node.assigned_device_name(), node.type_string(), true, member));
+  } else {
+    // This node has not yet been assigned to a device, so we
+    // calculate any constraints due to the set of registered
+    // kernels and any (partial) user-provided device specification
+    // in the NodeDef.
+
+    // If no kernels are registered for this op type, fail with an error.
+    if (member->supported_device_types().empty()) {
+      std::set<string> registered_device_types;
+      for (Device* d : device_set_->devices()) {
+        registered_device_types.insert(d->device_type());
+      }
+      std::vector<string> attr_key_vals;
+      for (const auto& it : node.attrs()) {
+        const string& name = it.first;
+        const AttrValue& attr_value = it.second;
+        attr_key_vals.push_back(
+            strings::StrCat(name, "=", SummarizeAttrValue(attr_value)));
+      }
+      return errors::InvalidArgument(
+          "No OpKernel was registered to support Op '", node.type_string(),
+          "' used by ", errors::FormatNodeNameForError(node.name()),
+          "with these attrs: [", str_util::Join(attr_key_vals, ", "),
+          "]\n"
+          "Registered devices: [",
+          str_util::Join(registered_device_types, ", "), "]\n",
+          "Registered kernels:\n", KernelsRegisteredForOp(node.type_string()));
+    }
+
+    // If the NodeDef contains a device, then we interpret it as a
+    // (partial) device specification.
+    if (!node.requested_device().empty()) {
+      if (IsResourceGeneratorNode(node)) {
+        // Treat requested device on resource generating nodes as assigned
+        // device so that we don't override it.
+        TF_RETURN_IF_ERROR(InitializeMemberWithAssignedDevice(
+            node.requested_device(), node.type_string(), false, member));
+      } else {
+        // The user has specified a device in the NodeDef, try to find a
+        // valid device matching their specification in the set of
+        // devices.
+        // NOTE: The full name may specify a device that is not in
+        // n.supported_device_types(), but we check that in AssignDevice().
+        TF_RETURN_IF_ERROR(member->SetRequestedDeviceName(node));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/colocation_graph.h b/tensorflow/core/common_runtime/colocation_graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..12611496a5f53764fa13eb839753fd4289cca2d6
--- /dev/null
+++ b/tensorflow/core/common_runtime/colocation_graph.h
@@ -0,0 +1,253 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_
+
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/port.h"
+
+namespace tensorflow {
+
+// Represents a node in the disjoint node forest and the
+// accumulated constraints on the device used by that node.
+class Member {
+ public:
+  Member() = default;
+
+  Status SetParentAndSupportedDevices(const Node& node,
+                                      const std::vector<DeviceType>& types);
+
+  const DeviceNameUtils::ParsedName& requested_device_name() const {
+    return requested_device_name_;
+  }
+
+  Status SetAssignedDeviceName(const string& device_name);
+
+  Status SetRequestedDeviceName(const Node& node);
+
+  Status EnsureCompatibilityAcrossResourceEdge(
+      const Node& src, const Member& src_root,
+      const Node& dst, /*dst_root is this*/
+      bool log_device_placement);
+
+  const PrioritizedDeviceTypeVector& supported_device_types() const {
+    return supported_device_types_;
+  }
+
+  // If `dry_run` is true, just sets `new_root` and `old_root` and does not
+  // actually modify anything in the `tree`.
+  static void Merge(std::vector<Member>* tree, int x_root, int y_root,
+                    Member** new_root, Member** old_root, bool dry_run);
+
+  // tree is non-const because we can change some `parent` pointers in some
+  // members for more efficient future lookups. The vector itself is not
+  // changed.
+  static int FindRoot(std::vector<Member>* tree, int node_id);
+
+  Status MergeDeviceNames(const Member& other, bool allow_soft_placement);
+
+  // Updates this to contain the intersection of the device types in
+  // this and "other". If the intersection is empty, returns false and does
+  // not update this. Else returns true and updates this.
+  bool MergeSupportedDevices(const Member& other);
+
+  Status AssignDevice(const Node& node, bool allow_soft_placement);
+
+  void set_possible_devices(std::vector<Device*>&& devices) {
+    possible_devices_ = devices;
+  }
+  const std::vector<Device*>& possible_devices() { return possible_devices_; }
+
+  string DebugString();
+
+ private:
+  // The id of the node that is the parent of this one, or its own
+  // id if it is a root. parent <= 0 indicates that this member is invalid.
+  int parent_ = -1;
+
+  // A proxy for the depth of the tree that is used to prefer
+  // connecting smaller trees to larger trees when merging disjoint
+  // sets.
+  int rank_ = 0;
+
+  // Once colocation groups have been formed, the Placer starts actually
+  // choosing devices. All nodes in a group must be assigned to the same
+  // device. Once we assigned the first device to some node in this group,
+  // we set assigned_device_name_index to this device name's index in the
+  // graph.
+  // The `*_device_name_` fields will contain the parsed name of this device
+  // and `possible_devices`, if computed, will contain just this device.
+  // `assigned_device_name_index` is an optimization to avoid parsing and
+  // comparing device names. The value of -1 signals that a single device
+  // has not been chosen yet.
+  int assigned_device_name_index_ = -1;
+
+  // The merged form of the device requested for this node, with those of all of
+  // its children. requested_device_name_ is always kept a specialization (i.e.
+  // DeviceNameUtils::IsSpecialization) of assigned_device_name_. When no device
+  // is requested, this field is set to assigned_device_name_.  As a
+  // specialization of assigned_device_name_, requested_device_name_ represents
+  // the most specific form of all assigned and requested devices of this node
+  // and its children, if this node is a root. requested_device_name_ is used
+  // to finally select devices for nodes.  We can override requested devices due
+  // to resource colocation constraints but not assigned devices (unless soft
+  // placement is on).
+  DeviceNameUtils::ParsedName requested_device_name_;
+
+  // The merged form of the device assigned for this node, with
+  // those of all of its children.
+  // This field is used to raise errors due to unsatisfiable constraints.
+  // Can be a partial specification.
+  // INVARIANT: requested_device_name_ is always a
+  // DeviceNameUtils::IsSpecialization of assigned_device_name_.
+  DeviceNameUtils::ParsedName assigned_device_name_;
+
+  // The intersection of all device types supported by this node,
+  // and those of all of its children, in priority order
+  // of the preferred device.
+  PrioritizedDeviceTypeVector supported_device_types_;
+
+  // If this node is a root, stores a list of Devices to which this node
+  // and all of its children have been assigned, or nullptr if this
+  // has not yet been computed.
+  std::vector<Device*> possible_devices_;
+};  // namespace
+
+// This class maintains the connected components of a colocation
+// constraint graph, and uses this information to assign a satisfying
+// device placement to the nodes of the graph.
+//
+// The typical usage pattern is:
+//
+//   Graph graph = ...;
+//   DeviceSet device_set = ...;
+//   ColocationGraph colocation_graph(graph, device_set);
+//
+//   // Add all the nodes of the `graph` to the `colocation_graph`.
+//   for (Node* node : graph.nodes()) {
+//     TF_RETURN_IF_ERROR(colocation_graph.AddNode(*node));
+//   }
+//
+//   // Add one or more colocation constraints.
+//   Node node_1 = *graph.FindNodeId(...);
+//   Node node_2 = *graph.FindNodeId(...);
+//   TF_RETURN_IF_ERROR(colocation_graph.ColocateNodes(node_1, node_2));
+//
+//   // Assign devices based on the accumulated constraints.
+//   for (Node* node : graph.nodes()) {
+//     TF_RETURN_IF_ERROR(colocation_graph.AssignDevice(node));
+//   }
+//
+// This implementation uses the Union-Find algorithm to efficiently maintain the
+// connected components and incrementally adds edges via
+// ColocationGraph::ColocateNodes() invocations.
+//
+// ColocationGraph does not assign any devices to graph nodes. The
+// `log_device_placement` argument is used to log messages when requested
+// device is ignored.
+class ColocationGraph {
+ public:
+  ColocationGraph(const Graph* graph, const DeviceSet* device_set,
+                  const Device* default_device, bool allow_soft_placement,
+                  bool log_device_placement);
+
+  // Adds each node of the Graph to this ColocationGraph as a singleton.
+  //
+  // NOTE: The implementation assumes that the ids of nodes passed to
+  // this method are dense and zero-based; the memory used will be linear in
+  // the largest node ID.
+  // NOTE: If this method returns an error, *this is left in an undefined
+  // state.
+  Status ColocateAllNodes();
+
+  Status ColocateResourceOrRefEdge(Node* src, Node* dst);
+
+  Status ColocateResourceAndRefEdges();
+
+  Status Initialize();
+
+  Status ColocateNodeToGroup(
+      std::unordered_map<StringPiece, const Node*, StringPieceHasher>*
+          colocation_group_root,
+      const Node* node, StringPiece colocation_group);
+
+  // Merge the (possibly disjoint) sets containing nodes "x" and
+  // "y". Returns OK if the all nodes in the union of these sets can
+  // be placed on the same device type.
+  //
+  // If this method returns an error, *this is unchanged.
+  Status ColocateNodes(const Node& x, const Node& y);
+
+  // This overload of ColocateNodes() allows a caller to provide the root node
+  // ids for the two nodes. For large graphs, this noticeably reduces the
+  // graph load time.
+  // If this method returns an error, *this is unchanged.
+  Status ColocateNodes(const Node& x, int x_root, const Node& y, int y_root);
+
+  // Limits the possible devices of `node`'s colocation group to the device
+  // to which `node` is assigned. This makes sure that all nodes in this
+  // colocation group will be assigned to the same device. Without this
+  // explicit restriction, heuristics can choose a different possible device
+  // for other nodes in the group.
+  Status LimitToAssignedDevice(const Node& node);
+
+  // For the given node, subject to the constraints previously given
+  // to this ColocationGraph, set its assigned_device_name. Returns OK
+  // if a satisfying device can be found, otherwise an error.
+  //
+  // Note: This method returns a pointer to a field within members_.
+  // The caller must not use the returned pointer after there is any possibility
+  // that the members_[i].possible_devices field has been modified.
+  Status GetDevicesForNode(Node* node,
+                           const std::vector<Device*>** possible_devices);
+
+  Status InitializeMembers();
+
+  string DebugString();
+
+  // Returns debugging info for the node referred to by 'node_root'.
+  string DebugInfo(const int node_root);
+
+  Status InitializeMemberWithAssignedDevice(const string& assigned_device_name,
+                                            const string& node_type,
+                                            bool must_be_full_name,
+                                            Member* member);
+
+  Status InitializeMember(const Node& node, Member* member);
+
+  // Returns the root node of the disjoint tree to which the node with the
+  // given id is connected.
+  int FindRoot(int node_id) { return Member::FindRoot(&members_, node_id); }
+
+  const Graph* const graph_;  // Not owned.
+  std::vector<Member> members_;
+  const DeviceSet* device_set_;  // Not owned.
+  const std::vector<DeviceType> device_types_;
+  const Device* default_device_;
+  const bool allow_soft_placement_;
+  const bool log_device_placement_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_
diff --git a/tensorflow/core/common_runtime/device_resolver_local_test.cc b/tensorflow/core/common_runtime/device_resolver_local_test.cc
index 54f1119e139886096cb7c2007e584003992d86c2..62e82bcc5a3bcaf9faff62b1df557d0c99ddaccc 100644
--- a/tensorflow/core/common_runtime/device_resolver_local_test.cc
+++ b/tensorflow/core/common_runtime/device_resolver_local_test.cc
@@ -56,7 +56,7 @@ TEST_F(DeviceResolverLocalTest, GetDeviceLocalitiesKnown) {
   Notification note;
   Status status;
   drl_->GetDeviceLocalitiesAsync(cp.instance, &localities,
-                                 [this, &note, &status](const Status& s) {
+                                 [&note, &status](const Status& s) {
                                    status = s;
                                    note.Notify();
                                  });
@@ -74,7 +74,7 @@ TEST_F(DeviceResolverLocalTest, GetDeviceLocalitiesUnknown) {
   Notification note;
   Status status;
   drl_->GetDeviceLocalitiesAsync(cp.instance, &localities,
-                                 [this, &note, &status](const Status& s) {
+                                 [&note, &status](const Status& s) {
                                    status = s;
                                    note.Notify();
                                  });
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 80b62f273ce785c700f93ed68af1af9429276f79..40a1ffc42da5020b62932812f3939cbdb7686aff 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -501,7 +501,8 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
       std::unique_ptr<DeviceResolverInterface> drl(
           new DeviceResolverLocal(device_mgr_.get()));
       std::unique_ptr<ParamResolverInterface> cprl(
-          new CollectiveParamResolverLocal(device_mgr_.get(), drl.get(),
+          new CollectiveParamResolverLocal(options_.config, device_mgr_.get(),
+                                           drl.get(),
                                            "/job:localhost/replica:0/task:0"));
       collective_executor_mgr_.reset(new CollectiveExecutorMgr(
           options_.config, device_mgr_.get(), std::move(drl), std::move(cprl)));
@@ -1194,6 +1195,8 @@ Status DirectSession::CreateExecutors(
   if (options_.config.experimental()
           .collective_deterministic_sequential_execution()) {
     options.collective_order = GraphCollectiveOrder::kEdges;
+  } else if (options_.config.experimental().collective_nccl()) {
+    options.collective_order = GraphCollectiveOrder::kAttrs;
   }
 
   std::unique_ptr<FunctionInfo> func_info(new FunctionInfo);
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h
index aa64b5f59bd0cb54b1872c0328c10ebb1de622b6..1b3fbcbd4a89a069c603d1f7d5c77d54ea5b06b0 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.h
+++ b/tensorflow/core/common_runtime/eager/attr_builder.h
@@ -54,10 +54,6 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
 Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
                       TF_AttrType* out, unsigned char* is_list);
 
-// Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'.
-Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
-                      TF_AttrType* out, unsigned char* is_list);
-
 // KernelAndDevice::Init needs a NodeDef only to pass the attribute map through.
 // An AttrBuilder is a convenience class to help with that - providing a smaller
 // interface than NodeDefBuilder and avoiding expensive (unnecessary?) sanity
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 8068905da96eaf8fadc6fd4f6cacbbe1212d9ce4..cdd5632f9de00dfc07a1df4906cfcc70d99d69cb 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -20,9 +20,11 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_resolver_local.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/process_util.h"
+#ifndef __ANDROID__
 #include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
 #include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
 #include "tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h"
+#endif
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/util/env_var.h"
@@ -81,7 +83,8 @@ EagerContext::EagerContext(const SessionOptions& opts,
   std::unique_ptr<DeviceResolverInterface> drl(
       new DeviceResolverLocal(local_device_mgr()));
   std::unique_ptr<ParamResolverInterface> cprl(new CollectiveParamResolverLocal(
-      local_device_mgr(), drl.get(), "/job:localhost/replica:0/task:0"));
+      opts.config, local_device_mgr(), drl.get(),
+      "/job:localhost/replica:0/task:0"));
   collective_executor_mgr_.reset(new CollectiveExecutorMgr(
       opts.config, local_device_mgr(), std::move(drl), std::move(cprl)));
 }
@@ -348,14 +351,27 @@ void EagerContext::AddKernelToCache(Fprint128 cache_key,
   gtl::InsertOrUpdate(&kernel_cache_, cache_key, kernel);
 }
 
-bool EagerContext::ShouldStoreMetadata() {
+bool EagerContext::ShouldStoreGraphs() {
+  mutex_lock ml(metadata_mu_);
+  return should_store_graphs_.load() || metadata_listener_ != nullptr;
+}
+
+bool EagerContext::ShouldStoreStepStats() {
   mutex_lock ml(metadata_mu_);
-  return should_store_metadata_.load() || metadata_listener_ != nullptr;
+  return should_store_step_stats_.load() || metadata_listener_ != nullptr;
+}
+
+void EagerContext::SetShouldStoreGraphs(bool value) {
+  mutex_lock ml(metadata_mu_);
+  should_store_graphs_.store(value);
+  if (!value || metadata_listener_ != nullptr) {
+    run_metadata_.Clear();
+  }
 }
 
-void EagerContext::SetShouldStoreMetadata(bool value) {
+void EagerContext::SetShouldStoreStepStats(bool value) {
   mutex_lock ml(metadata_mu_);
-  should_store_metadata_.store(value);
+  should_store_step_stats_.store(value);
   if (!value || metadata_listener_ != nullptr) {
     run_metadata_.Clear();
   }
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index 1f24109a76793f125bcfd06d699632efd69bd5a2..330936e2330f86d6cba4ec3602b3c03a937ee42c 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -183,8 +183,10 @@ class EagerContext {
 
   // TODO(apassos) clean up RunMetadata storage.
   mutex* MetadataMu() LOCK_RETURNED(metadata_mu_) { return &metadata_mu_; }
-  bool ShouldStoreMetadata() LOCKS_EXCLUDED(metadata_mu_);
-  void SetShouldStoreMetadata(bool value);
+  bool ShouldStoreStepStats() LOCKS_EXCLUDED(metadata_mu_);
+  void SetShouldStoreStepStats(bool value);
+  bool ShouldStoreGraphs() LOCKS_EXCLUDED(metadata_mu_);
+  void SetShouldStoreGraphs(bool value);
   RunMetadata* RunMetadataProto() { return &run_metadata_; }
   void ClearRunMetadata() EXCLUSIVE_LOCKS_REQUIRED(metadata_mu_);
 
@@ -284,7 +286,8 @@ class EagerContext {
       GUARDED_BY(cache_mu_);
 
   // Whether we should compute RunMetadata.
-  std::atomic<bool> should_store_metadata_{false};
+  std::atomic<bool> should_store_step_stats_{false};
+  std::atomic<bool> should_store_graphs_{false};
   mutex metadata_mu_;
   RunMetadata run_metadata_ GUARDED_BY(metadata_mu_);
   RunMetadataListener* metadata_listener_ GUARDED_BY(metadata_mu_) = nullptr;
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index c6e8573cc28dd22e806f1668dcaca0b47e6e77fc..f2af6267bf88f2a3cacceee6e8847ab1958f40d3 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -400,7 +400,7 @@ Status EagerLocalExecute(EagerOperation* op,
           device->name());
     }
     GraphCollector* graph_collector = nullptr;
-    if (ctx->ShouldStoreMetadata()) {
+    if (ctx->ShouldStoreGraphs()) {
       graph_collector = ctx->GetGraphCollector();
     }
     // Treat the function as multi_device only when we are not compiling
@@ -450,13 +450,15 @@ Status EagerLocalExecute(EagerOperation* op,
   }
   status = ValidateInputTypeAndPlacement(
       ctx, device, op, kernel,
-      ctx->ShouldStoreMetadata() ? ctx->RunMetadataProto() : nullptr);
+      ctx->ShouldStoreStepStats() ? ctx->RunMetadataProto() : nullptr);
   if (!status.ok()) return status;
   std::unique_ptr<NodeExecStats> maybe_stats;
   StepStats* maybe_step_stats = nullptr;
   GraphCollector* graph_collector = nullptr;
-  if (ctx->ShouldStoreMetadata()) {
+  if (ctx->ShouldStoreGraphs()) {
     graph_collector = ctx->GetGraphCollector();
+  }
+  if (ctx->ShouldStoreStepStats()) {
     maybe_step_stats = ctx->RunMetadataProto()->mutable_step_stats();
     int64 now_nanos = Env::Default()->NowNanos();
     maybe_stats.reset(new NodeExecStats);
@@ -919,6 +921,31 @@ Status EagerKernelExecute(EagerContext* ctx, Device* device,
                                    maybe_stats, maybe_step_stats,
                                    graph_collector));
   }
+  if (graph_collector != nullptr) {
+    mutex_lock ml(*ctx->MetadataMu());
+    {
+      GraphCollector* collector = ctx->GetGraphCollector();
+      mutex_lock mll(collector->mu);
+
+      // Adding to partition graphs for backward compatibility.
+      for (const auto& graph : collector->partitioned_graphs) {
+        *ctx->RunMetadataProto()->add_partition_graphs() = graph;
+      }
+
+      if (collector->dirty) {
+        auto* function_graphs = ctx->RunMetadataProto()->add_function_graphs();
+        *function_graphs->mutable_post_optimization_graph() =
+            collector->optimized_graph;
+        *function_graphs->mutable_pre_optimization_graph() =
+            collector->raw_graph;
+        for (const auto& graph : collector->partitioned_graphs) {
+          *function_graphs->add_partition_graphs() = graph;
+        }
+      }
+
+      collector->ClearGraphs();
+    }
+  }
   if (maybe_stats != nullptr) {
     int64 nanos = Env::Default()->NowNanos();
     maybe_stats->set_op_end_rel_micros(nanos / EnvTime::kMicrosToNanos -
@@ -927,34 +954,28 @@ Status EagerKernelExecute(EagerContext* ctx, Device* device,
     maybe_stats->set_all_end_rel_micros(nanos / EnvTime::kMicrosToNanos -
                                         maybe_stats->all_start_micros());
     maybe_stats->set_all_end_rel_nanos(nanos - maybe_stats->all_start_nanos());
-    if (ctx->ShouldStoreMetadata()) {
+    if (ctx->ShouldStoreStepStats()) {
       mutex_lock ml(*ctx->MetadataMu());
       {
-        GraphCollector* collector = ctx->GetGraphCollector();
-        mutex_lock mll(collector->mu);
-        for (const auto& graph : collector->graphs) {
-          *ctx->RunMetadataProto()->add_partition_graphs() = graph;
+        auto* step_stats = ctx->RunMetadataProto()->mutable_step_stats();
+        // Lazily initialize the RunMetadata with information about all devices
+        // if this is the first call.
+        while (step_stats->dev_stats_size() < ctx->devices()->size()) {
+          step_stats->add_dev_stats();
         }
-        collector->graphs.clear();
-      }
-      auto* step_stats = ctx->RunMetadataProto()->mutable_step_stats();
-      // Lazily initialize the RunMetadata with information about all devices if
-      // this is the first call.
-      while (step_stats->dev_stats_size() < ctx->devices()->size()) {
-        step_stats->add_dev_stats();
-      }
-      // Find the current device's index.
-      int device_idx = 0;
-      for (int i = 0; i < ctx->devices()->size(); ++i) {
-        if (ctx->devices()->at(i) == device) {
-          device_idx = i;
-          break;
+        // Find the current device's index.
+        int device_idx = 0;
+        for (int i = 0; i < ctx->devices()->size(); ++i) {
+          if (ctx->devices()->at(i) == device) {
+            device_idx = i;
+            break;
+          }
         }
+        // Populate the device stats for this device.
+        auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+        dev_stats->set_device(device->name());
+        *dev_stats->add_node_stats() = *maybe_stats;
       }
-      // Populate the device stats for this device.
-      auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
-      dev_stats->set_device(device->name());
-      *dev_stats->add_node_stats() = *maybe_stats;
     }
   }
   DCHECK_EQ(num_retvals, outputs.size());
diff --git a/tensorflow/core/common_runtime/eager/execute.h b/tensorflow/core/common_runtime/eager/execute.h
index 80d381cecdd421fe9c580cd0a10fbc6db5953080..4945688900a71e9a1343ad7fdad285fb5775aa3b 100644
--- a/tensorflow/core/common_runtime/eager/execute.h
+++ b/tensorflow/core/common_runtime/eager/execute.h
@@ -41,8 +41,8 @@ Status EagerExecute(
     tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>* retvals,
     int* num_retvals);
 
-// Low-level utility to execute the kernel specified by kernel on device device,
-// with the inputs op_inputs, in the context ctx.
+// Low-level utility to execute the kernel specified by kernel on device
+// 'device', with the inputs op_inputs, in the context 'ctx'.
 Status EagerKernelExecute(EagerContext* ctx, Device* device,
                           const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
                           KernelAndDevice* kernel, NodeExecStats* maybe_stats,
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
index 41b4608c7e709ca5733f9d48a9a1bc60cea644e0..60b807588228c91194590558ba1c96513187a458 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -111,9 +111,9 @@ Status KernelAndDeviceFunc::Init(const NodeDef& ndef,
 
     options.optimize_graph_fn = std::bind(
         grappler::OptimizeGraph, std::placeholders::_1, std::placeholders::_2,
-        std::placeholders::_3, std::placeholders::_4, config_proto,
-        function_def->signature().name(), optimization_options,
-        std::placeholders::_5);
+        std::placeholders::_3, std::placeholders::_4, std::placeholders::_5,
+        config_proto, function_def->signature().name(), optimization_options,
+        std::placeholders::_6);
   }
 #endif
   options.graph_collector = graph_collector;
@@ -147,9 +147,11 @@ void UpdateStats(OpKernelContext* context,
     memory->set_peak_bytes(std::get<1>(sizes));
     memory->set_live_bytes(std::get<2>(sizes));
 
-    AllocatorStats allocator_stats;
-    allocator_pair.first->GetStats(&allocator_stats);
-    memory->set_allocator_bytes_in_use(allocator_stats.bytes_in_use);
+    absl::optional<AllocatorStats> allocator_stats =
+        allocator_pair.first->GetStats();
+    if (stats) {
+      memory->set_allocator_bytes_in_use(allocator_stats->bytes_in_use);
+    }
     allocator_pair.second->GetRecordsAndUnRef();
   }
   auto* ms = stats->mutable_memory_stats();
@@ -194,6 +196,7 @@ Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container,
   params.slice_reader_cache = &slice_reader_cache_;
   params.rendezvous = rendez_;
   params.cancellation_manager = &cm_;
+  cm_.Reset();
   params.log_memory = log_memory_;
   std::unique_ptr<StepStatsCollector> step_stats_collector;
   if (stats != nullptr) {
@@ -258,6 +261,7 @@ Status KernelAndDeviceFunc::Run(
   opts.rendezvous = nullptr;
   opts.create_rendezvous = true;
   opts.cancellation_manager = &cm_;
+  cm_.Reset();
   // eager runtime does not yet support collective ops.
   opts.collective_executor = nullptr;
   opts.allow_dead_tensors = true;
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
index c4ea99f53e14287c4852bcb1becb3c352fa9746b..027168dd6eac59a758e943eebfd2af36fd5d0914 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.h
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -151,7 +151,7 @@ class KernelAndDeviceOp final : public KernelAndDevice {
   Device* OutputResourceDevice(int idx) const override;
 
   DataType input_type(int i) const override;
-  const DataTypeVector& output_dtypes() const {
+  const DataTypeVector& output_dtypes() const override {
     return kernel_->output_types();
   }
   int num_inputs() const override { return kernel_->num_inputs(); }
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h
index e4ccb11dba9aee4bd6bea1c4909e37a13957021d..ac99fdbe294a7f50606653694a593ed16c363f7b 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.h
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.h
@@ -133,7 +133,7 @@ class TensorHandle : public core::RefCounted {
 
  private:
   // If the contents of the Tensor pointed to by this handle is yet to be
-  // computed by a EagerNode, this function will block till that compuatation is
+  // computed by a EagerNode, this function will block till that computation is
   // done and the handle is "ready".
   Status WaitReady();
   Status WaitForNode(uint64 node_id, bool return_if_is_ready);
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 05f3e851f7f78a7ef7f3263aca92e74e5699f76c..e4022ad3af764c002f6784f7283bc8d8ac603489 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -46,6 +46,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -55,6 +56,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/context.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -1358,6 +1360,9 @@ class ExecutorState {
 
   // Clean up when this executor is done.
   void Finish();
+  // Schedule Finish() on a separate thread if it needs to wait for deferred
+  // async ops to complete; otherwise run it on the current thread.
+  void ScheduleFinish();
 
   // A standalone routine for this expression so that we can express
   // that we don't want thread safety analysis on this reference (it's
@@ -1778,7 +1783,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
           const bool completed =
               NodeDone(s, state->item->node, ready, stats, nullptr);
           delete state;
-          if (completed) Finish();
+          if (completed) ScheduleFinish();
         };
         nodestats::SetOpStart(stats);
         device->ComputeAsync(async, &state->ctx, done);
@@ -1865,7 +1870,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
   }  // while !inline_ready.empty()
 
   // This thread of computation is done if completed = true.
-  if (completed) Finish();
+  if (completed) ScheduleFinish();
 }
 
 Status ExecutorState::PrepareInputs(const NodeItem& item, Entry* first_input,
@@ -2421,6 +2426,25 @@ void ExecutorState::DumpState() {
   }
 }
 
+void ExecutorState::ScheduleFinish() {
+  int num_deferred_ops;
+  {
+    mutex_lock lock(num_deferred_ops_mu_);
+    num_deferred_ops = num_deferred_ops_;
+  }
+  if (num_deferred_ops > 0) {
+    // Finish() may be blocked waiting for deferred async ops to complete. The
+    // execution of deferred async ops may be waiting for non-enqueued ops of
+    // other executors to complete. So running Finish() on the current thread
+    // (inter-op threadpool thread) may lead to a deadlock due to threadpool
+    // exhaustion. Instead, we run it on a separate thread to unblock the
+    // threadpool thread.
+    Env::Default()->SchedClosure([this]() { Finish(); });
+  } else {
+    Finish();
+  }
+}
+
 void ExecutorState::Finish() {
   mu_.lock();
   auto status = status_;
@@ -2463,6 +2487,20 @@ void ExecutorState::Finish() {
   // device has finished all relevant work at this point.
   if (!device->AllowsSyncOnCompletion()) {
     status.Update(device->RefreshStatus());
+    if (!status.ok()) {
+      // In device async execution mode, it's possible for device execution to
+      // lag behind ExecutorState scheduling so much that this is the first
+      // place a device execution error surfaces.
+      // If so, all ExecutorState::NodeDone calls have already happened with OK
+      // status. This is the last defense where StartCancel must be called to
+      // abort all computation still running on any device.
+      // TODO(b/124523000): Always call Finish in a separate thread, so even if
+      // StartCancel blocks the current thread's execution, we won't encounter
+      // deadlocks caused by inter-op thread exhaustion.
+      if (cancellation_manager_) {
+        cancellation_manager_->StartCancel();
+      }
+    }
     delete this;
     runner([=]() { done_cb(status); });
     return;
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 82bbb7e68900164f86c718ec2f799a28c7125e81..99841588dcc685f4da12c15ccabaedc06e351746 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -803,12 +803,24 @@ void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr<Graph>* g) {
 
 namespace {
 // Removes all stateless nodes that do not contribute to a return
-// value from the function body.  Unlike `RemoveDeadNodes()`, which is
+// value from the function body. Unlike `RemoveDeadNodes()`, which is
 // triggered by `OptimizerOptions.do_function_inlining`, this pass
 // ignores the SINK node, from which (by definition) all nodes are
-// reverse reachable.
-void PruneFunctionBody(Graph* g) {
-  VLOG(2) << "Pruning function body";
+// reverse reachable, and preserves all nodes that are reachable from
+// control output nodes.
+//
+// TODO(ezhulenev, skyewm): Function body should not have special treatment of
+// stateful ops, graph should encode nodes that must execute with `control_ret`
+// and `control_output`.
+void PruneFunctionBody(const FunctionDef& fdef, Graph* g) {
+  VLOG(2) << "Pruning function body: function_name=" << fdef.signature().name();
+
+  // `control_ret` nodes must be always executed.
+  std::unordered_set<StringPiece, StringPieceHasher> control_ret_nodes;
+  for (const auto& control_ret : fdef.control_ret()) {
+    control_ret_nodes.insert(control_ret.second);
+  }
+
   std::unordered_set<const Node*> nodes;
   for (auto n : g->nodes()) {
     // NOTE(mrry): "_Retval" nodes are stateful, and so will be added
@@ -818,8 +830,9 @@ void PruneFunctionBody(Graph* g) {
     // TODO(mrry): Investigate whether the `n->IsControlFlow()` test is
     // still needed. It would be preferable to prune entire loops and/or
     // conditionals if they are not used in the graph.
-    if (n->IsControlFlow() || n->IsDataset() ||
-        (n->op_def().is_stateful() && n->type_string() != kArgOp)) {
+    if (n->IsControlFlow() ||
+        (n->op_def().is_stateful() && n->type_string() != kArgOp) ||
+        (control_ret_nodes.find(n->name()) != control_ret_nodes.end())) {
       nodes.insert(n);
     }
   }
@@ -846,7 +859,7 @@ Status FunctionLibraryRuntimeImpl::CreateItem(Item** item) {
   std::unique_ptr<Graph> g(new Graph(lib_def));
   CopyGraph(*fbody->graph, g.get());
 
-  PruneFunctionBody(g.get());
+  PruneFunctionBody(fbody->fdef, g.get());
   optimizer_.Optimize(this, env(), device(), &g, /*shape_map=*/nullptr);
   TF_RETURN_IF_ERROR(EnsureMemoryTypes(DeviceType(device()->device_type()),
                                        device()->name(), g.get()));
@@ -1392,6 +1405,10 @@ static bool ValidateInlining(const Node* node, const FunctionBody* fbody) {
   if (static_cast<size_t>(node->num_outputs()) != fbody->ret_nodes.size()) {
     return false;
   }
+  // TODO(ezhulenev): Currently common_runtime function inlining can't guarantee
+  // that all side-effectful ops will be executed after inlining. See Grappler
+  // function_optimizer for details. Unify all function inlining mechanism.
+  // Do not inline if `!fbody->control_ret_nodes.empty()`.
   for (int i = 0; i < node->num_inputs(); ++i) {
     if (node->input_type(i) != fbody->arg_types[i]) return false;
   }
@@ -1424,6 +1441,7 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
     if (e->IsControlEdge()) {
       if (input_control_node == nullptr) {
         input_control_node = AddNoOp(g);
+        input_control_node->set_requested_device(caller->def().device());
       }
       g->AddControlEdge(e->src(), input_control_node);
     } else {
@@ -1684,6 +1702,7 @@ FunctionBody::FunctionBody(const FunctionDef& f, DataTypeSlice arg_t,
       graph(g),
       arg_types(arg_t.begin(), arg_t.end()),
       ret_types(ret_t.begin(), ret_t.end()) {
+  // 1. Find regular Arg/Ret nodes.
   this->arg_nodes.resize(arg_types.size());
   this->ret_nodes.resize(ret_types.size());
   for (Node* n : this->graph->op_nodes()) {
@@ -1701,6 +1720,17 @@ FunctionBody::FunctionBody(const FunctionDef& f, DataTypeSlice arg_t,
     CHECK_LT(index, node_vec->size());
     (*node_vec)[index] = n;
   }
+  // 2. Find ControlRet nodes that must be always executed.
+  std::unordered_set<StringPiece, StringPieceHasher> control_ret_node_names;
+  for (const auto& control_ret : fdef.control_ret()) {
+    control_ret_node_names.insert(control_ret.second);
+  }
+  this->control_ret_nodes.reserve(control_ret_node_names.size());
+  for (Node* n : this->graph->op_nodes()) {
+    if (control_ret_node_names.count(n->name()) > 0) {
+      this->control_ret_nodes.push_back(n);
+    }
+  }
 }
 
 FunctionBody::~FunctionBody() { delete this->graph; }
diff --git a/tensorflow/core/common_runtime/function.h b/tensorflow/core/common_runtime/function.h
index 94b6beeb933ea3f27a1829835bb886d842301c19..37df90f5d98e6a2bc3a7aea7357a352dc51b3084 100644
--- a/tensorflow/core/common_runtime/function.h
+++ b/tensorflow/core/common_runtime/function.h
@@ -79,6 +79,7 @@ struct FunctionBody {
   DataTypeVector ret_types;
   gtl::InlinedVector<Node*, 4> arg_nodes;
   gtl::InlinedVector<Node*, 4> ret_nodes;
+  gtl::InlinedVector<Node*, 4> control_ret_nodes;
 
   FunctionBody() {}
   FunctionBody(const FunctionDef& f, DataTypeSlice arg_types,
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index 97e46f406cf96cc284ec14718f9500767f5e9861..83694d2c40709f7b76dad002cf028a105a8cd15c 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -945,6 +945,48 @@ TEST_F(FunctionLibraryRuntimeTest, PruneBody) {
   EXPECT_EQ(expected_node_names, executed_node_names);
 }
 
+TEST_F(FunctionLibraryRuntimeTest, DoNotPruneControlOutputsFromBody) {
+  // `add` node is not required to compute regular output `o`, but it must
+  // execute because it is in `control_ret`.
+  const FunctionDef func =
+      FDH::Create("FunctionWithControlOutputs", {"i: float"}, {"o: float"}, {},
+                  {
+                      {{"add"}, "Add", {"i", "i"}, {{"T", DT_FLOAT}}},
+                      {{"ret"}, "Mul", {"i", "i"}, {{"T", DT_FLOAT}}},
+                  },
+                  /*ret_def=*/{{"o", "ret:z:0"}},
+                  /*control_ret_def=*/{{"must_execute", "add"}});
+
+  Init({func});
+
+  auto x = test::AsTensor<float>({1.25});
+  Tensor z;
+
+  FunctionLibraryRuntime::Handle handle;
+  TF_CHECK_OK(Instantiate(flr1_, "FunctionWithControlOutputs", {}, &handle));
+
+  StepStats stats;
+  StepStatsCollector stats_collector(&stats);
+  FunctionLibraryRuntime::Options opts;
+  opts.stats_collector = &stats_collector;
+  TF_CHECK_OK(Run(flr1_, handle, opts, {x}, {&z}));
+  TF_CHECK_OK(flr1_->ReleaseHandle(handle));
+
+  TF_CHECK_OK(
+      InstantiateAndRun(flr1_, "FunctionWithControlOutputs", {}, {x}, {&z}));
+  test::ExpectTensorEqual<float>(z, test::AsTensor<float>({1.25 * 1.25}));
+
+  stats_collector.FinalizeAndSwap(&stats);
+
+  std::set<string> expected_node_names(
+      {"_SOURCE", "i", "add", "ret", "o_RetVal"});
+  std::set<string> executed_node_names;
+  for (const auto& node_stats : stats.dev_stats()[0].node_stats()) {
+    executed_node_names.insert(node_stats.node_name());
+  }
+  EXPECT_EQ(expected_node_names, executed_node_names);
+}
+
 // Constant folding generates names using a global counter.
 // This function invokes constant folding and parses the counter
 // from the generated node name.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 60e82ed13bc1362f40dedfb93e5c001d946bf77f..9c0abd97a1825bbaf7bdc76473bb79a817497e99 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -36,14 +36,17 @@ namespace tensorflow {
 namespace {
 
 static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
-                       int64 max_bytes_in_use, int64 max_alloc_size) {
-  AllocatorStats stats;
-  a->GetStats(&stats);
-  LOG(INFO) << "Alloc stats: " << std::endl << stats.DebugString();
-  EXPECT_EQ(stats.bytes_in_use, bytes_in_use);
-  EXPECT_EQ(stats.max_bytes_in_use, max_bytes_in_use);
-  EXPECT_EQ(stats.num_allocs, num_allocs);
-  EXPECT_EQ(stats.max_alloc_size, max_alloc_size);
+                       int64 peak_bytes_in_use, int64 largest_alloc_size) {
+  absl::optional<AllocatorStats> stats = a->GetStats();
+  EXPECT_TRUE(stats);
+  if (!stats) {
+    return;
+  }
+  LOG(INFO) << "Alloc stats: " << std::endl << stats->DebugString();
+  EXPECT_EQ(stats->bytes_in_use, bytes_in_use);
+  EXPECT_EQ(stats->peak_bytes_in_use, peak_bytes_in_use);
+  EXPECT_EQ(stats->num_allocs, num_allocs);
+  EXPECT_EQ(stats->largest_alloc_size, largest_alloc_size);
 }
 
 TEST(GPUBFCAllocatorTest, NoDups) {
@@ -291,9 +294,10 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
     a.DeallocateRaw(existing_ptrs[i]);
   }
 
-  AllocatorStats stats;
-  a.GetStats(&stats);
-  LOG(INFO) << "Alloc stats: \n" << stats.DebugString();
+  absl::optional<AllocatorStats> stats = a.GetStats();
+  if (stats) {
+    LOG(INFO) << "Alloc stats: \n" << stats->DebugString();
+  }
 }
 
 TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index c22bfcea2cedab93409d761686d852a5c4bbeeb9..0727196e1ceed88063a666a6a45fb139386203aa 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -132,8 +132,8 @@ int64 GPUDebugAllocator::AllocationId(const void* ptr) {
                                        MASK_BYTES);
 }
 
-void GPUDebugAllocator::GetStats(AllocatorStats* stats) {
-  base_allocator_->GetStats(stats);
+absl::optional<AllocatorStats> GPUDebugAllocator::GetStats() {
+  return base_allocator_->GetStats();
 }
 
 void GPUDebugAllocator::ClearStats() { base_allocator_->ClearStats(); }
@@ -208,8 +208,8 @@ size_t GPUNanResetAllocator::AllocatedSize(const void* ptr) {
   return base_allocator_->AllocatedSize(ptr);
 }
 
-void GPUNanResetAllocator::GetStats(AllocatorStats* stats) {
-  base_allocator_->GetStats(stats);
+absl::optional<AllocatorStats> GPUNanResetAllocator::GetStats() {
+  return base_allocator_->GetStats();
 }
 
 void GPUNanResetAllocator::ClearStats() { base_allocator_->ClearStats(); }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index 17757a106c5c20939b2c2d3525efc1ad659c2902..fa0394c19d0f6c910aeb5847a2e765f292f9de88 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -43,7 +43,7 @@ class GPUDebugAllocator : public Allocator {
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
   int64 AllocationId(const void* ptr) override;
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
   void ClearStats() override;
 
   // For testing.
@@ -71,7 +71,7 @@ class GPUNanResetAllocator : public Allocator {
   void DeallocateRaw(void* ptr) override;
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
   void ClearStats() override;
 
  private:
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 010fdff4e90624191e8cea65013c4e547a0c3398..607193a0013878f33bb96c8b2540f067da8b3284 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -276,6 +276,24 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
       sync_every_op_(sync_every_op),
       max_streams_(max_streams) {
   GPUProcessState::singleton()->EnableGPUDevice();
+  pending_cap_ = options.config.gpu_options().experimental().pending_cap();
+  timestamped_allocator_ =
+      options.config.gpu_options().experimental().timestamped_allocator();
+  if (timestamped_allocator_ || pending_cap_ > 0) {
+    SharedCounter* timing_counter = nullptr;
+    if (timestamped_allocator_) {
+      // In this case the SharedCounter was already created and set in the
+      // associated Allocator, with ownership by GPUProcessState.
+      // The GPUKernelTracker will use this SharedCounter, instead of
+      // owning its own.
+      timing_counter =
+          GPUProcessState::singleton()->GPUAllocatorCounter(tf_gpu_id);
+      DCHECK(timing_counter);
+    } else {
+      DCHECK_GT(pending_cap_, 0);
+    }
+    kernel_tracker_.reset(new GPUKernelTracker(Env::Default(), timing_counter));
+  }
 }
 
 BaseGPUDevice::~BaseGPUDevice() {
@@ -508,6 +526,10 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
       if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
     }
   }
+  if (pending_cap_ > 0) {
+    DCHECK(kernel_tracker_);
+    kernel_tracker_->PauseWhilePendingExceeds(pending_cap_);
+  }
   se::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
   op_kernel->Compute(context);
   if (context->status().ok()) {
@@ -525,6 +547,14 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
       VLOG(1) << "GpuDevice::ComputeHelper scheduled "
               << ComputeOpKernelDebugString(*op_kernel, stream_id);
     }
+    if (kernel_tracker_) {
+      GPUKernelTracker* tracker = kernel_tracker_.get();
+      DCHECK(tracker);
+      uint64 queued_count = tracker->RecordQueued();
+      em_->ThenExecute(stream, [op_kernel, tracker, queued_count]() {
+        tracker->RecordTerminated(queued_count);
+      });
+    }
   } else {
     if (vlog_1) {
       VLOG(1) << "GpuDevice::ComputeHelper failed to schedule "
@@ -721,8 +751,8 @@ Status ParseVisibleDeviceList(const string& visible_device_list,
       if (!strings::safe_strto32(platform_gpu_id_str, &platform_gpu_id)) {
         return errors::InvalidArgument(
             "Could not parse entry in 'visible_device_list': '",
-            platform_gpu_id_str, "'. visible_device_list = ",
-            visible_device_list);
+            platform_gpu_id_str,
+            "'. visible_device_list = ", visible_device_list);
       }
       if (platform_gpu_id < 0 ||
           platform_gpu_id >= gpu_manager->VisibleDeviceCount()) {
@@ -957,15 +987,15 @@ Status BaseGPUDeviceFactory::CreateDevices(
     for (PlatformGpuId platform_gpu_id : valid_platform_gpu_ids) {
       err = cudaSetDevice(platform_gpu_id.value());
       if (err != cudaSuccess) {
-        return errors::Internal("cudaSetDevice() on GPU:",
-                                platform_gpu_id.value(), " failed. Status: ",
-                                cudaGetErrorString(err));
+        return errors::Internal(
+            "cudaSetDevice() on GPU:", platform_gpu_id.value(),
+            " failed. Status: ", cudaGetErrorString(err));
       }
       err = cudaFree(nullptr);
       if (err != cudaSuccess) {
         return errors::Internal("CUDA runtime implicit initialization on GPU:",
-                                platform_gpu_id.value(), " failed. Status: ",
-                                cudaGetErrorString(err));
+                                platform_gpu_id.value(),
+                                " failed. Status: ", cudaGetErrorString(err));
       }
     }
     // Reset to the original device.
@@ -1100,21 +1130,24 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
                             tf_gpu_id.value(), " with ", memory_limit,
                             " bytes of memory.");
   }
-  AllocatorStats stats;
-  gpu_allocator->GetStats(&stats);
+  absl::optional<AllocatorStats> stats = gpu_allocator->GetStats();
+  if (!stats) {
+    return errors::Internal("No allocator statistics");
+  }
   // 'memory_limit' is the required memory size, but if the allocator with given
   // tf_gpu_id was created before, we'll use it instead of creating a new one
   // (as TF gpu device is a shared resource), in which case the actual memory
   // limit represented by 'stats.bytes_limit' used by that allocator may be
   // different (which should be an error).
   //
-  // TODO(laigd): report error if memory_limit doesn't match stats.bytes_limit.
+  // TODO(laigd): report error if memory_limit doesn't match stats->bytes_limit.
+  int64 bytes_limit = stats->bytes_limit ? *stats->bytes_limit : 0;
   std::unique_ptr<BaseGPUDevice> gpu_device = CreateGPUDevice(
-      options, device_name, static_cast<Bytes>(stats.bytes_limit), dev_locality,
+      options, device_name, static_cast<Bytes>(bytes_limit), dev_locality,
       tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, desc),
       gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node));
   LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
-            << (stats.bytes_limit >> 20) << " MB memory) -> physical GPU ("
+            << (bytes_limit >> 20) << " MB memory) -> physical GPU ("
             << GetShortDeviceDescription(platform_gpu_id, desc) << ")";
   TF_RETURN_IF_ERROR(gpu_device->Init(options));
   devices->push_back(std::move(gpu_device));
@@ -1517,6 +1550,115 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
   return Status::OK();
 }
 
+uint64 BaseGPUDevice::SafeAllocFrontier() {
+  if (timestamped_allocator_) {
+    return kernel_tracker_->LastTerminatedCount();
+  } else {
+    return 0;
+  }
+}
+
+int BaseGPUDevice::PendingKernels() {
+  if (kernel_tracker_) {
+    return kernel_tracker_->NumPending();
+  }
+  return 0;
+}
+
+uint64 GPUKernelTracker::RecordQueued() {
+  mutex_lock l(mu_);
+  uint64 queued_count = timing_counter_->next();
+  VLOG(2) << "RecordQueued queued_count=" << queued_count
+          << " first_available_=" << first_available_
+          << " last_completed_=" << last_completed_
+          << " num_pending_=" << num_pending_;
+  pending_kernels_[first_available_].queued_count = queued_count;
+  pending_kernels_[first_available_].terminated = false;
+  ++first_available_;
+  ++num_pending_;
+  if (first_available_ >= pending_kernels_.size()) {
+    first_available_ = 0;
+  }
+  if (first_available_ == last_completed_) {
+    // Ring buffer is full: double it.  All of the same valid PendingKernel
+    // entries exist after the copy, they are just shifted to begin
+    // at index 0 in the new array.
+    std::vector<PendingKernel> new_buffer(pending_kernels_.size() * 2);
+    for (int i = 0; i < pending_kernels_.size(); ++i) {
+      int j = (i + last_completed_) % pending_kernels_.size();
+      new_buffer[i] = pending_kernels_[j];
+    }
+    last_completed_ = 0;
+    first_available_ = pending_kernels_.size();
+    pending_kernels_.swap(new_buffer);
+    VLOG(1) << "last_completed_=" << last_completed_
+            << " first_available_=" << first_available_
+            << " num_pending_=" << num_pending_;
+  }
+  DCHECK_NE(first_available_, last_completed_) << "exhausted pending_kernels";
+  return queued_count;
+}
+
+void GPUKernelTracker::RecordTerminated(uint64 queued_count) {
+  mutex_lock l(mu_);
+  VLOG(2) << "RecordTerminated queued_count=" << queued_count
+          << " first_available_=" << first_available_
+          << " last_completed_=" << last_completed_
+          << " num_pending_=" << num_pending_ << " LC="
+          << ((last_completed_ >= 0)
+                  ? pending_kernels_[last_completed_].queued_count
+                  : -1);
+  DCHECK_NE(first_available_, last_completed_);
+  DCHECK_GT(num_pending_, 0);
+  // Starting just past the last completed entry, find the entry with
+  // this queued_count and mark it done.
+  int index = (last_completed_ + 1) % pending_kernels_.size();
+  while (true) {
+    if (index == first_available_) {
+      // This should never happen.
+      LOG(FATAL) << "Failed to find " << queued_count  // Crash OK
+                 << " in queue";
+    }
+    if (pending_kernels_[index].queued_count == queued_count) {
+      pending_kernels_[index].terminated = true;
+      break;
+    }
+    index = (index + 1) % pending_kernels_.size();
+  }
+  // Next move last_completed_ forward past all completed kernels.  In theory
+  // kernels should always complete in queued order so we should be able to
+  // advance the completed frontier to the last queued PendingKernel.  In
+  // practice we occassionally see the termination callbacks arrive out of order
+  // probably because of thread scheduling.  Eventually we may support out-of-
+  // order completion involving multple compute streams so here we follow a
+  // conservative approach and wait for every single callback to arrive before
+  // advancing the frontier.
+  while (true) {
+    int next_index = (last_completed_ + 1) % pending_kernels_.size();
+    if (next_index == first_available_) break;
+    if (pending_kernels_[next_index].terminated) {
+      last_completed_ = next_index;
+    } else {
+      break;
+    }
+  }
+  // Last decrease num_pending before maybe waking a waiter.
+  --num_pending_;
+  pending_decreased_.notify_one();
+}
+
+uint64 GPUKernelTracker::LastTerminatedCount() {
+  mutex_lock l(mu_);
+  if (last_completed_ < 0) {
+    // This is an edge case that can be encountered only at the beginning of
+    // execution.  There's not yet a safe threshold count. We don't want to
+    // return 0 since that bypasses the count mechanism in BFCAllocator, so
+    // return the least non-zero value.
+    return 1;
+  }
+  return pending_kernels_[last_completed_].queued_count;
+}
+
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index d002d02c51d073ef3019fa1659d555b5d092d883..f8f2a2e2f3221cc3b6acd571ebf2037fb33cf17f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+#include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -46,6 +47,7 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
+class GPUKernelTracker;
 
 class BaseGPUDevice : public LocalDevice {
  public:
@@ -114,6 +116,17 @@ class BaseGPUDevice : public LocalDevice {
     return scoped_allocator_mgr_.get();
   }
 
+  // The following two functions always return 0 unless one of the
+  // related experimental config options has been specified.
+
+  // If returned value is > 0 then GPU Memory chunks freed before this count
+  // are guaranteed not to be in use by any kernel pending on this device.
+  uint64 SafeAllocFrontier() override;
+
+  // Returns the number of kernels that have been queued for execution on
+  // the compute stream and are not yet known to have completed.
+  int PendingKernels();
+
  protected:
   Allocator* gpu_allocator_;  // not owned
   Allocator* cpu_allocator_;  // not owned
@@ -141,6 +154,9 @@ class BaseGPUDevice : public LocalDevice {
   const int32 max_streams_;
   std::unique_ptr<EventMgr> em_;
   std::unique_ptr<thread::ThreadPool> thread_pool_;
+  std::unique_ptr<GPUKernelTracker> kernel_tracker_;
+  int pending_cap_ = 0;
+  bool timestamped_allocator_ = false;
 
   // Initialize scractch buffers used by Eigen.
   Status InitScratchBuffers();
@@ -163,6 +179,83 @@ class BaseGPUDevice : public LocalDevice {
                               StatusCallback done);
 };
 
+// A per-compute-stream utility that keeps track of kernels that have been
+// queued for execution but may not yet have terminated, and also the queued
+// time of the most recently terminated kernel.
+class GPUKernelTracker {
+ public:
+  // If we're going to share a SharedCounter with an allocator, it's owned
+  // by the allocator because allocators are initialized once per process.
+  // Devices are per-session.
+  explicit GPUKernelTracker(Env* env, SharedCounter* timing_counter)
+      : env_(env), timing_counter_(timing_counter), pending_kernels_(64) {
+    if (!timing_counter_) {
+      // There's not a preexisting counter owned by GPUProcessState, i.e.
+      // pending_cap > 0 but timestamped_allocator == false.
+      owned_counter_.reset(new SharedCounter);
+      timing_counter_ = owned_counter_.get();
+    }
+  }
+
+  // Record that a GPU kernel has just been enqueued on the compute stream.
+  // Inserts a new timing counter value in a new PendingKernel record appended
+  // to the end of the ring buffer then returns that same count.
+  uint64 RecordQueued();
+
+  // Takes a count value returned by RecordQueued and finds the corresponding
+  // PendingKernel record in the ring buffer.  Marks the kernel as completed and
+  // advances the completion frontier accordingly.
+  void RecordTerminated(uint64 at_count);
+
+  // Returns the largest timing count such that all kernels queued no
+  // later than that count are known to have terminated.
+  uint64 LastTerminatedCount();
+
+  // Returns the number of kernels enqueued that are not yet known to
+  // have terminated.
+  int NumPending() {
+    mutex_lock l(mu_);
+    return num_pending_;
+  }
+
+  // Yield current thread until number of pending kernels no longer
+  // exceeds the cap.
+  void PauseWhilePendingExceeds(int cap) {
+    mutex_lock l(mu_);
+    while (num_pending_ > cap) {
+      pending_decreased_.wait(l);
+    }
+  }
+
+ private:
+  Env* env_;
+  SharedCounter* timing_counter_;
+  std::unique_ptr<SharedCounter> owned_counter_;
+
+  // Records when a kernel was queued for execution.  Kernel launches are
+  // identified by a unique count value from a per-GPU device timing counter.
+  struct PendingKernel {
+    uint64 queued_count;
+    bool terminated;
+    PendingKernel(const PendingKernel& pk)
+        : queued_count(pk.queued_count), terminated(pk.terminated) {}
+    PendingKernel() : queued_count(0), terminated(false) {}
+  };
+  mutex mu_;
+  // Ring buffer of PendingKernel records.
+  std::vector<PendingKernel> pending_kernels_ GUARDED_BY(mu_);
+  // Next unused slot in pending_kernels_.
+  int first_available_ GUARDED_BY(mu_) = 0;
+  // Last completed PendingKernel such that all prior PendingKernels are
+  // also completed.  With out-of-order completion there may be a mixture
+  // of completed and uncompleted entries between last_completed_ and
+  // first_available_, hence num_pending_ is not guaranteed equal to
+  // their differerence.
+  int last_completed_ GUARDED_BY(mu_) = -1;
+  int num_pending_ GUARDED_BY(mu_) = 0;
+  condition_variable pending_decreased_ GUARDED_BY(mu_);
+};
+
 class BaseGPUDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index ae623b2adbe152de6cbad248db234ac5469f83e1..2628cd413faf63fdf9eee82e263dabc75ca01669 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -276,6 +277,70 @@ TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
   allocator->DeallocateRaw(ptr);
 }
 
+class GPUKernelTrackerTest : public ::testing::Test {
+ protected:
+  void SetUp() {
+    timing_counter_.reset(new SharedCounter);
+    kernel_tracker_.reset(
+        new GPUKernelTracker(Env::Default(), timing_counter_.get()));
+  }
+
+  std::unique_ptr<GPUKernelTracker> kernel_tracker_;
+  std::unique_ptr<SharedCounter> timing_counter_;
+};
+
+TEST_F(GPUKernelTrackerTest, basic) {
+  EXPECT_EQ(0, kernel_tracker_->NumPending());
+  // 1 is the expected value when no kernels have yet terminated.
+  EXPECT_EQ(1, kernel_tracker_->LastTerminatedCount());
+
+  std::deque<int64> queued_counts;
+  for (int i = 0; i < 32; ++i) {
+    queued_counts.push_back(kernel_tracker_->RecordQueued());
+  }
+  EXPECT_EQ(32, kernel_tracker_->NumPending());
+  EXPECT_EQ(1, kernel_tracker_->LastTerminatedCount());
+
+  // Mature the kernels in order until empty.
+  while (!queued_counts.empty()) {
+    int64 x = queued_counts.front();
+    queued_counts.pop_front();
+    kernel_tracker_->RecordTerminated(x);
+    EXPECT_EQ(queued_counts.size(), kernel_tracker_->NumPending());
+    EXPECT_EQ(x, kernel_tracker_->LastTerminatedCount());
+  }
+  EXPECT_EQ(timing_counter_->get(), kernel_tracker_->LastTerminatedCount());
+
+  // Next inject so many kernel events that the ring buffer needs
+  // to grow a couple of times, while maturing a few in random order
+  // to introduce gaps between last_completed_ and first_available_.
+  int64 lower_bound = timing_counter_->get();
+  for (int i = 0; i < 1111; ++i) {
+    queued_counts.push_back(kernel_tracker_->RecordQueued());
+    int64 upper_bound = timing_counter_->get();
+    if (0 == (i % 16)) {
+      size_t index = (random::New64() % queued_counts.size());
+      kernel_tracker_->RecordTerminated(queued_counts[index]);
+      queued_counts.erase(queued_counts.begin() + index);
+      EXPECT_LE(lower_bound, kernel_tracker_->LastTerminatedCount());
+      EXPECT_GE(upper_bound, kernel_tracker_->LastTerminatedCount());
+    }
+  }
+
+  // Next mature the remaining kernels in order until empty.
+  while (!queued_counts.empty()) {
+    int64 x = queued_counts.front();
+    queued_counts.pop_front();
+    kernel_tracker_->RecordTerminated(x);
+    EXPECT_EQ(queued_counts.size(), kernel_tracker_->NumPending());
+    // There may be a gap here where we find a kernel that got terminated
+    // out of order, earlier, so the LastTerminatedCount can actually
+    // jump past x.
+    EXPECT_LE(x, kernel_tracker_->LastTerminatedCount());
+  }
+  EXPECT_EQ(timing_counter_->get(), kernel_tracker_->LastTerminatedCount());
+}
+
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
index 3c1c31aa732d373e76599cdc8fe8ae8561765c9c..6531d6d367b1407d89da16f2023f72b75903daf9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
@@ -241,7 +241,9 @@ void EventMgr::QueueInUse(se::Stream* stream, InUse iu) {
 // events have recorded, and then retire them.  Initial observations
 // suggest that typical behavior in a TensorFlow program is to have
 // 0-3 events pending most of the time, but there are occasionally
-// spikes of up to several hundred outstanding.
+// spikes of up to several hundred outstanding.  (If GPUKernelTracker
+// is used to cap pending kernels there should never be more than
+// that many.)
 //
 // NOTE: If all events are on the same stream, no later event will
 // complete before an earlier event, except possibly if the earlier
@@ -249,13 +251,10 @@ void EventMgr::QueueInUse(se::Stream* stream, InUse iu) {
 // looking past the first kPending event.  However, if we're using
 // multiple streams there may be some gain in looking deeper.
 // As a compromise, PollEvent() calls that are triggered by the queueing
-// of a single event never look past the first kPending event.  Calls
-// coming from the dedicated polling thread always sweep the full queue.
-//
-// Note that allowing the queue to grow very long could cause overall
-// GPU memory use to spike needlessly.  An alternative strategy would
-// be to throttle new Op execution until the pending event queue
-// clears.
+// of a single event never look past the first kPending event.  Consequently
+// those calls do an expected constant amount of work, unaffected by the
+// length of the pending queue.  Calls coming from the dedicated
+// polling thread always sweep the full queue.
 void EventMgr::PollEvents(bool is_dedicated_poller,
                           gtl::InlinedVector<InUse, 4>* to_free) {
   VLOG(2) << "PollEvents  free_events_ " << free_events_.size()
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index 8167cfb9d7dc6cd91a17323b3083d1823cbaa5e0..39883d34f9e5a731cac4c6172c6a4caed9db6602 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/common_runtime/pool_allocator.h"
+#include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/tracking_allocator.h"
@@ -90,7 +91,7 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
   }
 
   AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
-  if (allocator_parts.allocator.get() == nullptr) {
+  if (allocator_parts.allocator == nullptr) {
     // Validate allocator types.
     if (!allocator_type.empty() && allocator_type != "BFC") {
       LOG(ERROR) << "Invalid allocator type: " << allocator_type;
@@ -110,9 +111,15 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
         (options.per_process_gpu_memory_fraction() > 1.0 ||
          options.experimental().use_unified_memory()),
         gpu_visitors_[bus_id], {});
-    Allocator* gpu_allocator =
+    GPUBFCAllocator* gpu_bfc_allocator =
         new GPUBFCAllocator(sub_allocator, total_bytes, options,
                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
+    Allocator* gpu_allocator = gpu_bfc_allocator;
+    SharedCounter* timing_counter = nullptr;
+    if (options.experimental().timestamped_allocator()) {
+      timing_counter = new SharedCounter;
+      gpu_bfc_allocator->SetTimingCounter(timing_counter);
+    }
 
     // If true, checks for memory overwrites by writing
     // distinctive patterns on both ends of allocated memory.
@@ -137,7 +144,9 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
       recording_allocator = new internal::RecordingAllocator(
           &process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
     }
-    allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator), sub_allocator,
+    allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator),
+                       std::unique_ptr<SharedCounter>(timing_counter),
+                       sub_allocator,
                        std::unique_ptr<Allocator>(recording_allocator)};
   }
   if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
@@ -151,6 +160,22 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
 #endif  // GOOGLE_CUDA
 }
 
+SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) {
+  DCHECK(process_state_);
+#if GOOGLE_CUDA
+  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
+  mutex_lock l(mu_);
+  if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
+    return nullptr;
+  }
+
+  AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
+  return allocator_parts.counter.get();
+#else
+  return nullptr;
+#endif
+}
+
 Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
   CHECK(process_state_);
   if (!HasGPUDevice() ||
@@ -224,6 +249,7 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
       allocator = new TrackingAllocator(allocator, true);
     }
     cuda_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
+                                     std::unique_ptr<SharedCounter>(nullptr),
                                      sub_allocator,
                                      std::unique_ptr<Allocator>(nullptr)});
     AllocatorParts& allocator_parts = cuda_host_allocators_.back();
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
index df51c10c8065fa94d736c8f4dfa76faebdc8bc62..861157ca0c6150be6e430e184fe6adb1a326f6fd 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/process_state.h"
+#include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
@@ -33,6 +34,7 @@ namespace tensorflow {
 
 class Allocator;
 class PoolAllocator;
+class SharedCounter;
 
 // Singleton that manages per-process state when GPUs are present.
 class GPUProcessState {
@@ -108,6 +110,8 @@ class GPUProcessState {
   // Returns bus_id for the given GPU id.
   virtual int BusIdForGPU(TfGpuId tf_gpu_id);
 
+  SharedCounter* GPUAllocatorCounter(TfGpuId tf_gpu_id);
+
  protected:
   // GPUProcessState is a singleton that should not normally be deleted except
   // at process shutdown.
@@ -132,6 +136,7 @@ class GPUProcessState {
 
   struct AllocatorParts {
     std::unique_ptr<Allocator> allocator;
+    std::unique_ptr<SharedCounter> counter;
     SubAllocator* sub_allocator;  // owned by allocator
     std::unique_ptr<Allocator> recording_allocator;
   };
diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
index f0656ff53332d7dd4f21d9d874846c16fb669681..12af4a82019d5f30b417b98f66bf9bc95bd15442 100644
--- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
+++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
@@ -616,7 +616,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
         auto* dev_info = device_->tensorflow_gpu_device_info();
         CHECK(dev_info);
         dev_info->default_context->CopyCPUTensorToDevice(
-            &cpu_tensor, device_, &tensor_, [this, &notification](Status s) {
+            &cpu_tensor, device_, &tensor_, [&notification](Status s) {
               TF_CHECK_OK(s);
               notification.Notify();
             });
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index f1fcca194e9ef56bf7b96e6c73717db7620b9812..2a6d6f5a7aefd0f191ebf0ee05164ab0985a9be6 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/common_runtime/process_state.h"
+#include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_feature_guard.h"
@@ -53,15 +54,22 @@ struct LocalDevice::EigenThreadPoolInfo {
 
   explicit EigenThreadPoolInfo(const SessionOptions& options, int numa_node,
                                Allocator* allocator) {
+    // Use session setting if specified.
     int32 intra_op_parallelism_threads =
         options.config.intra_op_parallelism_threads();
+    // If no session setting, use environment setting.
     if (intra_op_parallelism_threads == 0) {
-      intra_op_parallelism_threads = port::NumSchedulableCPUs();
-      if (numa_node != port::kNUMANoAffinity) {
-        // Assume that CPUs are equally distributed over available NUMA nodes.
-        // This may not be true, but there isn't currently a better way of
-        // determining the number of CPUs specific to the requested node.
-        intra_op_parallelism_threads /= port::NUMANumNodes();
+      static int env_num_threads = NumIntraOpThreadsFromEnvironment();
+      intra_op_parallelism_threads = env_num_threads;
+      // If no session setting or environment, compute a reasonable default.
+      if (intra_op_parallelism_threads == 0) {
+        intra_op_parallelism_threads = port::NumSchedulableCPUs();
+        if (numa_node != port::kNUMANoAffinity) {
+          // Assume that CPUs are equally distributed over available NUMA nodes.
+          // This may not be true, but there isn't currently a better way of
+          // determining the number of CPUs specific to the requested node.
+          intra_op_parallelism_threads /= port::NUMANumNodes();
+        }
       }
     }
     ThreadOptions thread_opts;
diff --git a/tensorflow/core/common_runtime/lower_while_op.cc b/tensorflow/core/common_runtime/lower_while_op.cc
index 7552838ca11d5c8e863bddca4398b98caeac2759..f1c3bbd552812bf7c6b0e3866cdf2c4ebd452f6b 100644
--- a/tensorflow/core/common_runtime/lower_while_op.cc
+++ b/tensorflow/core/common_runtime/lower_while_op.cc
@@ -200,6 +200,7 @@ Status LowerWhileHelper::CreateEnterNodes() {
                            .Input(NodeOut(edge->src(), edge->src_output()))
                            .Attr("frame_name", name_)
                            .Attr("parallel_iterations", parallel_iterations_)
+                           .Device(while_op_->requested_device())
                            .Finalize(graph_, &enter_node));
     enter_nodes_[edge->dst_input()] = enter_node;
   }
@@ -216,6 +217,7 @@ Status LowerWhileHelper::CreateEnterNodes() {
     TF_RETURN_IF_ERROR(NodeBuilder(NewName("LoopControlInputs"), "NoOp",
                                    graph_->op_registry(), &debug_info_)
                            .ControlInputs(control_inputs)
+                           .Device(while_op_->requested_device())
                            .Finalize(graph_, &incoming_control_node));
     for (Node* n : enter_nodes_) {
       graph_->AddControlEdge(incoming_control_node, n);
@@ -231,6 +233,7 @@ Status LowerWhileHelper::CreateMergeNodes() {
         NodeBuilder(NewName("merge"), "Merge", graph_->op_registry(),
                     &debug_info_)
             .Input({NodeOut(enter_node, 0), NodeOut(enter_node, 0)})
+            .Device(while_op_->requested_device())
             .Finalize(graph_, &merge_node));
     merge_nodes_.emplace_back(merge_node);
   }
@@ -241,6 +244,7 @@ Status LowerWhileHelper::CreateCondFuncCallNode() {
   for (Node* merge_node : merge_nodes_) {
     cond_call_builder_.Input(NodeOut(merge_node, 0));
   }
+  cond_call_builder_.Device(while_op_->requested_device());
   TF_RETURN_IF_ERROR(cond_call_builder_.Finalize(graph_, &cond_call_node_));
   // Add a control edge to make sure the Const nodes in the cond function
   // are in the same frame as the rest of the function, otherwise
@@ -249,6 +253,7 @@ Status LowerWhileHelper::CreateCondFuncCallNode() {
   TF_RETURN_IF_ERROR(NodeBuilder(NewName("LoopCond"), "LoopCond",
                                  graph_->op_registry(), &debug_info_)
                          .Input(NodeOut(cond_call_node_, 0))
+                         .Device(while_op_->requested_device())
                          .Finalize(graph_, &loop_cond_node_));
   return Status::OK();
 }
@@ -270,6 +275,7 @@ Status LowerWhileHelper::CreateSwitchNodes() {
                                    graph_->op_registry(), &debug_info_)
                            .Input(NodeOut(merge_nodes_[i], 0))
                            .Input(NodeOut(loop_cond_node_, 0))
+                           .Device(while_op_->requested_device())
                            .Finalize(graph_, &switch_node));
     switch_nodes_.emplace_back(switch_node);
   }
@@ -280,6 +286,7 @@ Status LowerWhileHelper::CreateBodyFuncCallNode() {
   for (Node* switch_node : switch_nodes_) {
     body_call_builder_.Input(NodeOut(switch_node, 1));
   }
+  body_call_builder_.Device(while_op_->requested_device());
   TF_RETURN_IF_ERROR(body_call_builder_.Finalize(graph_, &body_call_node_));
   // Add a control edge to make sure the Const nodes in the body function
   // are in the same frame as the rest of the function, otherwise
@@ -296,6 +303,7 @@ Status LowerWhileHelper::CreateBodyFuncCallNode() {
   TF_RETURN_IF_ERROR(NodeBuilder(NewName("loop_body_control"), op_type,
                                  graph_->op_registry(), &debug_info_)
                          .Input(NodeOut(switch_nodes_[0], 1))
+                         .Device(while_op_->requested_device())
                          .Finalize(graph_, &body_control_node_));
   graph_->AddControlEdge(body_control_node_, body_call_node_);
   return Status::OK();
@@ -309,6 +317,7 @@ Status LowerWhileHelper::CreateExitNodes() {
     TF_RETURN_IF_ERROR(NodeBuilder(NewName("exit"), "Exit",
                                    graph_->op_registry(), &debug_info_)
                            .Input(NodeOut(switch_node, 0))
+                           .Device(while_op_->requested_device())
                            .Finalize(graph_, &exit_node));
     exit_nodes_.emplace_back(exit_node);
     outputs.emplace_back(NodeOut(exit_node, 0));
@@ -320,6 +329,7 @@ Status LowerWhileHelper::CreateExitNodes() {
   // 2. Fetching the output of the While node by name in calls to sess.run.
   NodeBuilder ib(name_, "IdentityN", OpRegistry::Global(), &debug_info_);
   ib.Input(outputs);
+  ib.Device(while_op_->requested_device());
   TF_RETURN_IF_ERROR(ib.Finalize(graph_, &lowered_while_output_));
   return Status::OK();
 }
@@ -330,6 +340,7 @@ Status LowerWhileHelper::CreateNextIterationNodes() {
     TF_RETURN_IF_ERROR(NodeBuilder(NewName("next_iteration"), "NextIteration",
                                    graph_->op_registry(), &debug_info_)
                            .Input(NodeOut(body_call_node_, i))
+                           .Device(while_op_->requested_device())
                            .Finalize(graph_, &next_iteration));
     next_iterations_nodes_.emplace_back(next_iteration);
   }
diff --git a/tensorflow/core/common_runtime/metrics.cc b/tensorflow/core/common_runtime/metrics.cc
index a34a580c2ae13e81b8c4e468adfab1669add8287..fcdab26d3dbc4dee7a825d581543f7619860f225 100644
--- a/tensorflow/core/common_runtime/metrics.cc
+++ b/tensorflow/core/common_runtime/metrics.cc
@@ -32,6 +32,10 @@ auto* graph_run_time_usecs = monitoring::Counter<0>::New(
 auto* tf_data_autotune_counter = monitoring::Counter<1>::New(
     "/tensorflow/data/autotune", "tf.data autotuning", "name");
 
+auto* tf_data_bytes_read_counter = monitoring::Counter<1>::New(
+    "/tensorflow/data/bytes_read",
+    "The number of bytes read by tf.data Dataset sources.", "name");
+
 auto* tf_data_elements_counter = monitoring::Counter<1>::New(
     "/tensorflow/data/elements", "tf.data elements", "name");
 
@@ -61,6 +65,10 @@ void RecordTFDataAutotune(const string& name) {
   tf_data_autotune_counter->GetCell(name)->IncrementBy(1);
 }
 
+void RecordTFDataBytesRead(const string& name, int64 num_bytes) {
+  tf_data_bytes_read_counter->GetCell(name)->IncrementBy(num_bytes);
+}
+
 void RecordTFDataElements(const string& name, int64 num_elements) {
   tf_data_elements_counter->GetCell(name)->IncrementBy(num_elements);
 }
diff --git a/tensorflow/core/common_runtime/metrics.h b/tensorflow/core/common_runtime/metrics.h
index 49dbddd911f73577480c4d5fada63cedc6bcbd59..bc73da4acc37008bbd50f3162f1f0d7bd3f4d865 100644
--- a/tensorflow/core/common_runtime/metrics.h
+++ b/tensorflow/core/common_runtime/metrics.h
@@ -21,14 +21,20 @@ limitations under the License.
 namespace tensorflow {
 namespace metrics {
 
-// Records that a tf.data dataset op executed by the program used autotuning.
+// Records that a tf.data.Dataset executed by the program used autotuning.
 //
-// The `name` argument identifies the dataset (e.g. "ParallelMap").
+// The `name` argument identifies the Dataset type (e.g. "ParallelMap").
 void RecordTFDataAutotune(const string& name);
 
-// Records the number of elements produced by a tf.data dataset.
+// Records the number of bytes read from the filesystem by a tf.data.Dataset
+// source.
 //
-// The `name` argument identifies the dataset (e.g. "Batch" or "Map").
+// The `name` argument identifies the Dataset type (e.g. "TFRecordDataset").
+void RecordTFDataBytesRead(const string& name, int64 num_bytes);
+
+// Records the number of elements produced by a tf.data.Dataset.
+//
+// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
 void RecordTFDataElements(const string& name, int64 num_elements);
 
 // Records the number of independent graph changes resulting from the applicaton
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 94f1b1f2044b1888062c3c5108c5be678e560fd8..b467e7b311e3fe73d2eb094e5d92f124a8266a0b 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -82,14 +82,18 @@ class MklSmallSizeAllocator : public Allocator {
     port::AlignedFree(ptr);
   }
 
-  void GetStats(AllocatorStats* stats) override {
+  absl::optional<AllocatorStats> GetStats() override {
     mutex_lock l(mutex_);
-    *stats = stats_;
+    return stats_;
   }
 
   void ClearStats() override {
     mutex_lock l(mutex_);
-    stats_.Clear();
+    stats_.num_allocs = 0;
+    stats_.peak_bytes_in_use = 0;
+    stats_.largest_alloc_size = 0;
+    stats_.bytes_in_use = 0;
+    stats_.bytes_limit = 0;
   }
 
  private:
@@ -98,10 +102,10 @@ class MklSmallSizeAllocator : public Allocator {
     mutex_lock l(mutex_);
     ++stats_.num_allocs;
     stats_.bytes_in_use += alloc_size;
-    stats_.max_bytes_in_use =
-        std::max(stats_.max_bytes_in_use, stats_.bytes_in_use);
-    stats_.max_alloc_size =
-        std::max(alloc_size, static_cast<size_t>(stats_.max_alloc_size));
+    stats_.peak_bytes_in_use =
+        std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
+    stats_.largest_alloc_size =
+        std::max(alloc_size, static_cast<size_t>(stats_.largest_alloc_size));
   }
 
   // Decrement statistics for the allocator handling small allocations.
@@ -244,22 +248,22 @@ class MklCPUAllocator : public Allocator {
     }
   }
 
-  void GetStats(AllocatorStats* stats) override {
-    AllocatorStats l_stats, s_stats;
-    small_size_allocator_->GetStats(&s_stats);
-    large_size_allocator_->GetStats(&l_stats);
+  absl::optional<AllocatorStats> GetStats() override {
+    auto s_stats = small_size_allocator_->GetStats();
+    auto l_stats = large_size_allocator_->GetStats();
 
     // Combine statistics from small-size and large-size allocator.
-    stats->num_allocs = l_stats.num_allocs + s_stats.num_allocs;
-    stats->bytes_in_use = l_stats.bytes_in_use + s_stats.bytes_in_use;
-    stats->max_bytes_in_use =
-        l_stats.max_bytes_in_use + s_stats.max_bytes_in_use;
+    stats_.num_allocs = l_stats->num_allocs + s_stats->num_allocs;
+    stats_.bytes_in_use = l_stats->bytes_in_use + s_stats->bytes_in_use;
+    stats_.peak_bytes_in_use =
+        l_stats->peak_bytes_in_use + s_stats->peak_bytes_in_use;
 
     // Since small-size allocations go to MklSmallSizeAllocator,
     // max_alloc_size from large_size_allocator would be the maximum
     // size allocated by MklCPUAllocator.
-    stats->max_alloc_size = l_stats.max_alloc_size;
-    stats->bytes_limit = std::max(s_stats.bytes_limit, l_stats.bytes_limit);
+    stats_.largest_alloc_size = l_stats->largest_alloc_size;
+    stats_.bytes_limit = std::max(s_stats->bytes_limit, l_stats->bytes_limit);
+    return stats_;
   }
 
   void ClearStats() override {
@@ -308,6 +312,7 @@ class MklCPUAllocator : public Allocator {
 
   SubAllocator* sub_allocator_;  // not owned by this class
   mutable mutex mutex_;
+  AllocatorStats stats_ GUARDED_BY(mutex_);
 
   // Hash map to keep track of "BFC" allocations
   // We do not use BFC allocator for small allocations.
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator_test.cc b/tensorflow/core/common_runtime/mkl_cpu_allocator_test.cc
index e08ab5763856956b435b7eb0451d8316af2d9337..ee1d9cd281bb5514074dd71ba2bdc2379c1ebfc1 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator_test.cc
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator_test.cc
@@ -24,22 +24,21 @@ limitations under the License.
 namespace tensorflow {
 
 TEST(MKLBFCAllocatorTest, TestMaxLimit) {
-  AllocatorStats stats;
   setenv(MklCPUAllocator::kMaxLimitStr, "1000", 1);
   MklCPUAllocator a;
   TF_EXPECT_OK(a.Initialize());
-  a.GetStats(&stats);
-  EXPECT_EQ(stats.bytes_limit, 1000);
+  auto stats = a.GetStats();
+  EXPECT_EQ(stats->bytes_limit, 1000);
 
   unsetenv(MklCPUAllocator::kMaxLimitStr);
   TF_EXPECT_OK(a.Initialize());
-  a.GetStats(&stats);
+  stats = a.GetStats();
   uint64 max_mem_bytes = MklCPUAllocator::kDefaultMaxLimit;
 #if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
   max_mem_bytes =
       (uint64)sysconf(_SC_PHYS_PAGES) * (uint64)sysconf(_SC_PAGESIZE);
 #endif
-  EXPECT_EQ(stats.bytes_limit, max_mem_bytes);
+  EXPECT_EQ(stats->bytes_limit, max_mem_bytes);
 
   setenv(MklCPUAllocator::kMaxLimitStr, "wrong-input", 1);
   EXPECT_TRUE(errors::IsInvalidArgument(a.Initialize()));
diff --git a/tensorflow/core/common_runtime/optimization_registry.cc b/tensorflow/core/common_runtime/optimization_registry.cc
index 8120a2005ad16be504a277057ef51742b3d5c101..e7db3aed27e9dfeb6e2c051c482bb64f4f74f415 100644
--- a/tensorflow/core/common_runtime/optimization_registry.cc
+++ b/tensorflow/core/common_runtime/optimization_registry.cc
@@ -41,11 +41,13 @@ Status OptimizationPassRegistry::RunGrouping(
         Status s = pass->Run(options);
         if (!s.ok()) return s;
         if (VLOG_IS_ON(1)) {
-          DumpGraphToFile(
-              strings::StrCat(
-                  "after_phase_", phase.first, "_", pass->name(), "_",
-                  reinterpret_cast<uintptr_t>((*options.graph).get())),
-              **options.graph);
+          if (options.graph) {
+            DumpGraphToFile(
+                strings::StrCat(
+                    "after_phase_", phase.first, "_", pass->name(), "_",
+                    reinterpret_cast<uintptr_t>((*options.graph).get())),
+                **options.graph);
+          }
           if (options.partition_graphs) {
             for (auto& part : *options.partition_graphs) {
               DumpGraphToFile(
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index 8c0a3fb0c5eb6a844c9a1e3a1c4d0443a8267948..b2f4f1aa49b0fea64e8db0eb34101edde0adc738 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/colocation_graph.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
@@ -32,69 +34,14 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/dump_graph.h"
 #include "tensorflow/core/util/port.h"
 
 namespace tensorflow {
 
 namespace {
 
-// We hoist the conversion from C-style string literal to StringPiece here,
-// so that we can avoid the many repeated calls to strlen().
-const StringPiece kColocationAttrNameStringPiece(kColocationAttrName);
-const StringPiece kColocationGroupPrefixStringPiece(kColocationGroupPrefix);
-
-// Returns a list of devices having type in supported_device_types.  The
-// returned list is sorted by preferred type (higher numeric type is preferred).
-std::vector<Device*> FilterSupportedDevices(
-    const std::vector<Device*>& devices,
-    const PrioritizedDeviceTypeVector& supported_device_types,
-    const Device* default_device) {
-  Device* filtered_default_device = nullptr;
-  std::vector<std::pair<Device*, int32>> prioritized_filtered_devices;
-  for (const auto& supported_device_type : supported_device_types) {
-    for (Device* device : devices) {
-      if (DeviceType(device->attributes().device_type()) ==
-          supported_device_type.first) {
-        if (device == default_device) {
-          filtered_default_device = device;
-        } else {
-          prioritized_filtered_devices.emplace_back(
-              device, supported_device_type.second);
-        }
-      }
-    }
-  }
-
-  auto device_sort = [](const std::pair<Device*, int32>& a,
-                        const std::pair<Device*, int32>& b) {
-    if (a.second != b.second) {
-      return a.second > b.second;
-    }
-
-    auto a_priority =
-        DeviceSet::DeviceTypeOrder(DeviceType(a.first->device_type()));
-    auto b_priority =
-        DeviceSet::DeviceTypeOrder(DeviceType(b.first->device_type()));
-    // First sort by prioritized device type (higher is preferred) and
-    // then by device name (lexicographically).
-    if (a_priority != b_priority) {
-      return a_priority > b_priority;
-    }
-    return StringPiece(a.first->name()) < StringPiece(b.first->name());
-  };
-  std::sort(prioritized_filtered_devices.begin(),
-            prioritized_filtered_devices.end(), device_sort);
-
-  std::vector<Device*> filtered_devices;
-  if (filtered_default_device != nullptr) {
-    filtered_devices.emplace_back(filtered_default_device);
-  }
-  for (const auto& prioritized_filtered_device : prioritized_filtered_devices) {
-    filtered_devices.push_back(prioritized_filtered_device.first);
-  }
-  return filtered_devices;
-}
-
 // Returns true if the node has no inputs and produces outputs
 // that are consumed by a single node.
 //
@@ -106,840 +53,6 @@ bool IsGeneratorNode(const Node* node) {
          !IsRefType(node->output_type(0));
 }
 
-bool IsExemptFromResourceInputColocation(const Node* node) {
-  // Note: Partitioned function calls, which place and partition their
-  // function bodies, are exempt from this check: they forward resource and
-  // ref inputs to operations that are appropriately placed, instead of
-  // dereferencing them.
-  const string& op_type = node->op_def().name();
-  return op_type == "PartitionedCall" || op_type == "StatefulPartitionedCall";
-}
-
-// This class maintains the connected components of a colocation
-// constraint graph, and uses this information to assign a satisfying
-// device placement to the nodes of the graph.
-//
-// The typical usage pattern is:
-//
-//   Graph graph = ...;
-//   DeviceSet device_set = ...;
-//   ColocationGraph colocation_graph(graph, device_set);
-//
-//   // Add all the nodes of the `graph` to the `colocation_graph`.
-//   for (Node* node : graph.nodes()) {
-//     TF_RETURN_IF_ERROR(colocation_graph.AddNode(*node));
-//   }
-//
-//   // Add one or more colocation constraints.
-//   Node node_1 = *graph.FindNodeId(...);
-//   Node node_2 = *graph.FindNodeId(...);
-//   TF_RETURN_IF_ERROR(colocation_graph.ColocateNodes(node_1, node_2));
-//
-//   // Assign devices based on the accumulated constraints.
-//   for (Node* node : graph.nodes()) {
-//     TF_RETURN_IF_ERROR(colocation_graph.AssignDevice(node));
-//   }
-//
-// This implementation uses the Union-Find algorithm to efficiently maintain the
-// connected components and incrementally adds edges via
-// ColocationGraph::ColocateNodes() invocations.
-//
-// ColocationGraph does not assign any devices to graph nodes. The
-// `log_device_placement` argument is used to log messages when requested
-// device is ignored.
-class ColocationGraph {
- public:
-  ColocationGraph(const Graph* graph, const DeviceSet* device_set,
-                  const Device* default_device, bool allow_soft_placement,
-                  bool log_device_placement)
-      : graph_(graph),
-        device_set_(device_set),
-        device_types_(device_set->PrioritizedDeviceTypeList()),
-        default_device_(default_device),
-        allow_soft_placement_(allow_soft_placement),
-        log_device_placement_(log_device_placement) {
-    members_.resize(graph->num_node_ids());
-  }
-
-  // Adds each node of the Graph to this ColocationGraph as a singleton.
-  //
-  // NOTE: The implementation assumes that the ids of nodes passed to
-  // this method are dense and zero-based; the memory used will be linear in
-  // the largest node ID.
-  // NOTE: If this method returns an error, *this is left in an undefined
-  // state.
-  Status ColocateAllNodes() {
-    // This maps from a colocation group identifier to the 'root' of that
-    // colocation group.  Note that the keys in this map are StringPiece; the
-    // actual strings are stored under the NodeDef.  The lifetime of this map
-    // is limited to this ColocateAllNodes() method, and no part of the
-    // NodeDef trees are changed during the lifetime of this method, so using
-    // StringPiece as a key is safe.
-    //
-    // Also, as a further optimization, we remove the "loc:@" prefix from
-    // "class" attribute values, when they are used as keys in this table.
-    // This allows us to use StringPiece values that refer to substrings of
-    // 'string' values stored in NodeDef attribute lists, as well as StringPiece
-    // values that refer to 'string' values from NodeDef::name(), without
-    // performing any string allocations.
-    std::unordered_map<StringPiece, const Node*, StringPieceHasher>
-        colocation_group_root;
-
-    for (const Node* node : graph_->op_nodes()) {
-      // When adding the node, identify whether it is part of a colocation
-      // group.
-
-      // This code is effectively the equivalent of GetNodeAttr() for a string
-      // array, but it avoids all internal allocations (the allocation of the
-      // backing store of the std::vector<string> as well as the copies of the
-      // strings within it).  Instead, we combine the query of the colocation
-      // attribute with the calls to ColocateNodeToGroup.
-      bool found_spec = false;
-      const AttrValue* attr_value =
-          node->attrs().Find(kColocationAttrNameStringPiece);
-      if (attr_value != nullptr && attr_value->has_list()) {
-        for (const string& class_spec : attr_value->list().s()) {
-          StringPiece spec(class_spec);
-          if (str_util::ConsumePrefix(&spec,
-                                      kColocationGroupPrefixStringPiece)) {
-            found_spec = true;
-            TF_RETURN_IF_ERROR(
-                ColocateNodeToGroup(&colocation_group_root, node, spec));
-          }
-        }
-      }
-
-      if (!found_spec) {
-        // If the node does not specify a colocation group, then use the
-        // name of this node as the colocation group.
-        TF_RETURN_IF_ERROR(
-            ColocateNodeToGroup(&colocation_group_root, node, node->name()));
-      }
-    }
-
-    return Status::OK();
-  }
-
-  Status ColocateResourceOrRefEdge(Node* src, Node* dst) {
-    // Colocate `src` and `dst` to maintain the invariant that nodes
-    // connected by reference edges are colocated.
-    int src_root_id = FindRoot(src->id());
-    int dst_root_id = FindRoot(dst->id());
-    auto& src_root = members_[src_root_id];
-    auto& dst_root = members_[dst_root_id];
-    // If both the source node and this node have partially
-    // specified a device, then 'dst's device should be
-    // cleared: the reference edge forces 'node' to be on the
-    // same device as the source node.
-    const auto& source_parsed_name = src_root.device_name;
-    const auto& dest_parsed_name = dst_root.device_name;
-    if (DeviceNameUtils::HasSomeDetails(source_parsed_name) &&
-        DeviceNameUtils::HasSomeDetails(dest_parsed_name)) {
-      // Ignore a specified device for 'dst' if the two names were
-      // incompatible.
-      if (!DeviceNameUtils::AreCompatibleDevNames(source_parsed_name,
-                                                  dest_parsed_name)) {
-        TF_RETURN_IF_ERROR(VerifyResourceAndRefInputsCanBeColocated(
-            dst, src, source_parsed_name));
-        if (log_device_placement_) {
-          LOG(INFO) << "Ignoring device specification "
-                    << DeviceNameUtils::ParsedNameToString(dest_parsed_name)
-                    << " for node '" << dst->name()
-                    << "' because the input edge from '" << src->name()
-                    << "' is a reference connection and already has a device "
-                       "field set to "
-                    << DeviceNameUtils::ParsedNameToString(source_parsed_name);
-        }
-
-        // Make 'dst' colocated with the source
-        dst_root.device_name = source_parsed_name;
-      }
-    }
-    Status status = ColocateNodes(*src, src_root_id, *dst, dst_root_id);
-    if (!status.ok()) {
-      return AttachDef(
-          errors::InvalidArgument("Nodes were connected by a "
-                                  "reference connection (requiring them to "
-                                  "be on the same device), but the two nodes "
-                                  "were assigned two different devices: ",
-                                  status.error_message()),
-          *dst);
-    }
-    return Status::OK();
-  }
-
-  Status ColocateResourceAndRefEdges() {
-    // Enumerate the constraint edges, and use them to update the disjoint
-    // node set.
-    // If `node` has an input edge with reference type, add an edge from the
-    // source of that edge to `node`.
-    for (const Edge* edge : graph_->edges()) {
-      if (edge->IsControlEdge()) {
-        continue;
-      }
-      Node* src = edge->src();
-      Node* dst = edge->dst();
-      DataType input_type = dst->input_type(edge->dst_input());
-      if ((input_type == DT_RESOURCE || IsRefType(input_type)) &&
-          !IsExemptFromResourceInputColocation(dst)) {
-        TF_RETURN_IF_ERROR(ColocateResourceOrRefEdge(src, dst));
-      }
-    }
-    return Status::OK();
-  }
-
-  Status Initialize() {
-    TF_RETURN_IF_ERROR(InitializeMembers());
-    TF_RETURN_IF_ERROR(ColocateAllNodes());
-    return ColocateResourceAndRefEdges();
-  }
-
-  Status ColocateNodeToGroup(
-      std::unordered_map<StringPiece, const Node*, StringPieceHasher>*
-          colocation_group_root,
-      const Node* node, StringPiece colocation_group) {
-    const Node*& root_node = (*colocation_group_root)[colocation_group];
-    if (root_node == nullptr) {
-      // This is the first node of the colocation group, so
-      // designate this node as the 'root' of that colocation group.
-      root_node = node;
-    } else {
-      // Try to colocate the node with the root.  If there is an
-      // error, return it.
-      Status s = ColocateNodes(*node, *root_node);
-      if (!s.ok()) {
-        return AttachDef(s, *node);
-      }
-    }
-    return Status::OK();
-  }
-
-  // Merge the (possibly disjoint) sets containing nodes "x" and
-  // "y". Returns OK if the all nodes in the union of these sets can
-  // be placed on the same device type.
-  //
-  // NOTE: If this method returns an error, *this is left in an undefined
-  // state.
-  Status ColocateNodes(const Node& x, const Node& y) {
-    int x_root = FindRoot(x.id());
-    int y_root = FindRoot(y.id());
-    return ColocateNodes(x, x_root, y, y_root);
-  }
-
-  // This overload of ColocateNodes() allows a caller to provide the root node
-  // ids for the two nodes. For large graphs, this noticeably reduces the
-  // graph load time.
-  Status ColocateNodes(const Node& x, int x_root, const Node& y, int y_root) {
-    if (x_root == y_root) {
-      return Status::OK();
-    }
-
-    DCHECK_EQ(x_root, FindRoot(x.id()));
-    DCHECK_EQ(y_root, FindRoot(y.id()));
-
-    Member& x_root_member = members_[x_root];
-    Member& y_root_member = members_[y_root];
-
-    // Merge the sets by setting the parent pointer of the smaller tree's root
-    // node to point to the root of the larger tree. Together with path
-    // compression in ColocationGraph::FindRoot, this ensures that we do not
-    // experience pathological performance on graphs such as chains.
-    int new_root, old_root;
-    if (x_root_member.rank < y_root_member.rank) {
-      // The tree rooted at x_root is shallower, so connect it to
-      // y_root. The rank of y_root is unchanged because its new
-      // child has strictly less rank.
-      x_root_member.parent = y_root;
-      new_root = y_root;
-      old_root = x_root;
-    } else if (x_root_member.rank > y_root_member.rank) {
-      // The tree rooted at y_root is shallower, so connect it to
-      // x_root. The rank of x_root is unchanged because its new
-      // child has strictly less rank.
-      y_root_member.parent = x_root;
-      new_root = x_root;
-      old_root = y_root;
-    } else {
-      // Both trees have the same rank, so break the tie by choosing
-      // x_root as the new root.
-      y_root_member.parent = x_root;
-      // Increment the rank of the tree rooted at x_root, because it
-      // is now strictly deeper than before.
-      ++x_root_member.rank;
-      new_root = x_root;
-      old_root = y_root;
-    }
-
-    Member& new_root_member = members_[new_root];
-    Member& old_root_member = members_[old_root];
-
-    // Merge the partial device specifications, and ensure that they are
-    // compatible. NULL options_ is treated as allowing soft placement.
-    // TODO(mrry): Consider enriching the error message by pointing
-    // out which nodes have the explicit partial device
-    // specifications that caused this conflict.
-    Status s = DeviceNameUtils::MergeDevNames(&new_root_member.device_name,
-                                              old_root_member.device_name,
-                                              allow_soft_placement_);
-    if (!s.ok()) {
-      return errors::InvalidArgument(
-          "Cannot colocate nodes ",
-          errors::FormatColocationNodeForError(x.name()), " and ",
-          errors::FormatColocationNodeForError(y.name()), ": ",
-          s.error_message());
-    }
-
-    // Ensure that the common root has at least one supported device
-    // type, by computing the intersection of
-    // new_root_member.supported_device_types and
-    // old_root_member.supported_device_types.
-    MergeSupportedDevices(&new_root_member.supported_device_types,
-                          old_root_member.supported_device_types);
-    if (new_root_member.supported_device_types.empty()) {
-      return errors::InvalidArgument(
-          "Cannot colocate nodes ",
-          errors::FormatColocationNodeForError(x.name()), " and ",
-          errors::FormatColocationNodeForError(y.name()),
-          " because no device type supports both of those nodes and the "
-          "other nodes colocated with them.",
-          DebugInfo(x_root), DebugInfo(y_root));
-    }
-
-    return Status::OK();
-  }
-
-  // Limits the possible devices of `node`'s colocation group to the device
-  // to which `node` is assigned. This makes sure that all nodes in this
-  // colocation group will be assigned to the same device. Without this
-  // explicit restriction, heuristics can choose a different possible device
-  // for other nodes in the group.
-  Status LimitToAssignedDevice(const Node& node) {
-    if (node.assigned_device_name_index() < 0) {
-      return errors::Internal(
-          "Expected an assigned node as argument to LimitToAssignedDevice but "
-          "got: ",
-          node.DebugString());
-    }
-    int root = FindRoot(node.id());
-    Member& root_member = members_[root];
-    if (node.assigned_device_name_index() ==
-        root_member.assigned_device_name_index) {
-      return Status::OK();
-    }
-    DeviceNameUtils::ParsedName parsed;
-    DeviceNameUtils::ParseFullName(node.assigned_device_name(), &parsed);
-    Status s = DeviceNameUtils::MergeDevNames(&root_member.device_name, parsed,
-                                              allow_soft_placement_);
-    if (!s.ok()) {
-      return errors::Internal(
-          "Constraining by assigned device should not cause an error. Original "
-          "root device name: ",
-          DeviceNameUtils::ParsedNameToString(root_member.device_name),
-          " assigned device name \"", node.assigned_device_name(),
-          ". Error: ", s.error_message());
-    }
-
-    root_member.assigned_device_name_index = node.assigned_device_name_index();
-    // Clear cached possible_devices, if any.
-    root_member.possible_devices.clear();
-
-    return Status::OK();
-  }
-
-  // For the given node, subject to the constraints previously given
-  // to this ColocationGraph, set its assigned_device_name. Returns OK
-  // if a satisfying device can be found, otherwise an error.
-  //
-  // Note: This method returns a pointer to a field within members_.
-  // The caller must not use the returned pointer after there is any possibility
-  // that the members_[i].possible_devices field has been modified.
-  Status GetDevicesForNode(Node* node,
-                           std::vector<Device*>** possible_devices) {
-    *possible_devices = nullptr;
-    const int node_root = FindRoot(node->id());
-    if (!members_[node_root].possible_devices.empty()) {
-      *possible_devices = &members_[node_root].possible_devices;
-      return Status::OK();
-    }
-
-    // We have not yet computed the possible devices for the
-    // colocated node set containing 'node', so we do so now using the
-    // constraints on the root node.
-
-    // "devices" will contain the set of feasible placements for the
-    // colocated node set containing 'node'.
-    std::vector<Device*> devices;
-    if (DeviceNameUtils::HasSomeDetails(members_[node_root].device_name)) {
-      // The root node has a (possibly partial) device
-      // specification, so enumerate the physical devices that
-      // conform to it.
-      device_set_->FindMatchingDevices(members_[node_root].device_name,
-                                       &devices);
-
-      if (!devices.empty()) {
-        // Filter devices into those that are compatible with the root
-        // node (and its children).
-        devices = FilterSupportedDevices(
-            devices, members_[node_root].supported_device_types,
-            default_device_);
-      }
-
-      // Perform soft placement if allow_soft_placement_ is set.
-      if (devices.empty() && allow_soft_placement_) {
-        // The soft_device_name is the same as the node's device name
-        // without specifying the device type or ID.
-        DeviceNameUtils::ParsedName soft_device_name =
-            members_[node_root].device_name;
-        soft_device_name.type.clear();
-        soft_device_name.has_type = false;
-        soft_device_name.has_id = false;
-        device_set_->FindMatchingDevices(soft_device_name, &devices);
-        if (!devices.empty()) {
-          devices = FilterSupportedDevices(
-              devices, members_[node_root].supported_device_types,
-              default_device_);
-        }
-      }
-
-      if (devices.empty()) {
-        // Return an error when a physical device that matches an explicit
-        // device specification is not found. This ensures that we don't
-        // assign a node to GPU when the user wanted to force it on CPU.
-        string debug_info = DebugInfo(node_root);
-
-        DeviceNameUtils::ParsedName specified_device_name;
-        if (DeviceNameUtils::ParseFullName(node->requested_device(),
-                                           &specified_device_name) &&
-            specified_device_name == members_[node_root].device_name) {
-          // The specified device and merged set device match, and
-          // will appear in the GraphDef (for debugging), so just
-          // print the specified device.
-          std::vector<Device*> devices_matching_nodedef;
-          device_set_->FindMatchingDevices(specified_device_name,
-                                           &devices_matching_nodedef);
-          if (devices_matching_nodedef.empty()) {
-            // Sometimes it is almost impossible to understand the problem
-            // without a list of available devices.
-            std::vector<string> device_names;
-            for (const Device* device : device_set_->devices()) {
-              device_names.push_back(device->name());
-            }
-            std::sort(device_names.begin(), device_names.end());
-
-            string gpu_msg = "";
-            if (!IsGoogleCudaEnabled() &&
-                str_util::Lowercase(specified_device_name.type) == "gpu") {
-              gpu_msg =
-                  " The requested device appears to be a GPU, but CUDA is not "
-                  "enabled.";
-            }
-
-            return errors::InvalidArgument(
-                errors::FormatNodeNameForError(node->name()),
-                "was explicitly assigned to ", node->requested_device(),
-                " but available devices are [ ",
-                str_util::Join(device_names, ", "), " ]. Make sure ",
-                "the device specification refers to a valid device.", gpu_msg);
-          } else if (specified_device_name.has_type) {
-            return errors::InvalidArgument(
-                "Could not satisfy explicit device specification '",
-                node->requested_device(), "' because no supported kernel for ",
-                specified_device_name.type, " devices is available.",
-                debug_info, "\nRegistered kernels:\n",
-                KernelsRegisteredForOp(node->type_string()));
-          } else {
-            return errors::InvalidArgument(
-                "Could not satisfy explicit device specification '",
-                node->requested_device(), debug_info);
-          }
-        } else {
-          // The specified device may be a valid device but the
-          // merged set device is different, so print both.
-          return errors::InvalidArgument(
-              "Could not satisfy explicit device specification '",
-              node->requested_device(), "' because the node ",
-              errors::FormatColocationNodeForError(node->name()),
-              " was colocated with a group of nodes that ",
-              "required incompatible device '",
-              DeviceNameUtils::ParsedNameToString(
-                  members_[node_root].device_name),
-              "'", debug_info);
-        }
-      }
-    } else {
-      // The device is completely unspecified, so enumerate the devices that
-      // support all of the nodes in the set.
-      if (device_set_->devices().empty()) {
-        return errors::Internal("No devices are registered");
-      }
-      devices = FilterSupportedDevices(
-          device_set_->devices(), members_[node_root].supported_device_types,
-          default_device_);
-
-      if (devices.empty()) {
-        return errors::InvalidArgument(
-            "Node had no OpKernel registered to support this operation: ",
-            "Operation was ", node->type_string(), " and inputs were ",
-            DataTypeVectorString(node->input_types()), DebugInfo(node_root));
-      }
-    }
-
-    // Cache the result of the possible devices for this node group.
-    members_[node_root].possible_devices = std::move(devices);
-    *possible_devices = &members_[node_root].possible_devices;
-    return Status::OK();
-  }
-
-  Status InitializeMembers() {
-    for (Node* node : graph_->nodes()) {
-      if (!node->IsOp()) {
-        continue;
-      }
-      Status status = InitializeMember(*node, &members_[node->id()]);
-      if (!status.ok()) {
-        return AttachDef(status, *node);
-      }
-    }
-    return Status::OK();
-  }
-
-  // Represents a node in the disjoint node set forest, and the
-  // accumulated constraints on the device used by that node.
-  struct Member {
-    Member() = default;
-    // The id of the node that is the parent of this one, or its own
-    // id if it is a root. parent <= 0 indicates that this member is invalid.
-    int parent = -1;
-
-    // A proxy for the depth of the tree that is used to prefer
-    // connecting smaller trees to larger trees when merging disjoint
-    // sets.
-    int rank = 0;
-
-    // The intersection of all device types supported by this node,
-    // and those of all of its children, in priority order
-    // of the preferred device.
-    PrioritizedDeviceTypeVector supported_device_types;
-
-    // The merged form of the device requested for this node, with
-    // those of all of its children.
-    DeviceNameUtils::ParsedName device_name;
-
-    // Once colocation groups have been formed and we assigned at least
-    // one node in this group to a device, assigned_device_name_index will
-    // contain this device name's index in the graph. The `device_name` will
-    // contain the parsed name of this device and `possible_devices`, if
-    // computed, will contain just this device.
-    // `assigned_device_name_index` is an optimization to avoid parsing and
-    // comparing device names. The value of -1 signals that a single device
-    // has not been chosen yet.
-    int assigned_device_name_index = -1;
-
-    // If this node is a root, stores a list of Devices to which this node
-    // and all of its children have been assigned, or nullptr if this
-    // has not yet been computed.
-    std::vector<Device*> possible_devices;
-  };
-
-  // Returns debugging info for the node referred to by 'node_root'.
-  string DebugInfo(const int node_root) {
-    string text(
-        "\nColocation Debug Info:\n"
-        "Colocation group had the following types and devices: ");
-
-    // If this node is part of a colocation group, then we want to
-    // collect the mapping of ops to supported devices, so that
-    // the user can see why an unsatisfiable placement occurred.
-
-    std::unordered_map<string, string> type_to_devices;
-    std::vector<const Node*> colocation_nodes;
-    int num_nodes_found = 0;
-
-    for (const Node* node : graph_->nodes()) {
-      if (!node->IsOp()) {
-        continue;
-      }
-      int id = node->id();
-      if (FindRoot(id) != node_root) {
-        continue;
-      }
-      ++num_nodes_found;
-      colocation_nodes.push_back(node);
-      const string& op_type = node->type_string();
-      string devices_registered;
-      for (const auto& device_type : members_[id].supported_device_types) {
-        strings::StrAppend(&devices_registered,
-                           DeviceTypeString(device_type.first), " ");
-      }
-
-      type_to_devices[op_type] = std::move(devices_registered);
-    }
-
-    for (const auto& td : type_to_devices) {
-      strings::StrAppend(&text, "\n", td.first, ": ", td.second);
-    }
-    strings::StrAppend(&text,
-                       "\n\nColocation members and user-requested devices:");
-    for (const Node* node : colocation_nodes) {
-      strings::StrAppend(&text, "\n  ", node->name(), " (", node->type_string(),
-                         ") ", node->requested_device());
-    }
-    strings::StrAppend(&text, "\n");
-
-    if (num_nodes_found <= 1) {
-      text.clear();
-    }
-    return text;
-  }
-
-  Status InitializeMember(const Node& node, Member* member) {
-    const int id = node.id();
-    DCHECK_GE(id, 0);
-    member->parent = id;
-    TF_RETURN_IF_ERROR(SupportedDeviceTypesForNode(
-        device_types_, node.def(), &member->supported_device_types));
-
-    if (node.has_assigned_device_name()) {
-      // This node has already been assigned to a device, so we
-      // respect this placement, after sanity-checking it.  The
-      // device_name and supported_device_types for this node reflect
-      // the assigned device, so any nodes colocated with this node
-      // will be assigned to the same device (assuming this is
-      // possible).
-      // NOTE: Since any assignment must have been performed by
-      // the TensorFlow runtime, we consider errors in this branch to
-      // be INTERNAL.
-      const string& assigned_device_name = node.assigned_device_name();
-      if (!DeviceNameUtils::ParseFullName(assigned_device_name,
-                                          &member->device_name)) {
-        return errors::Internal("Malformed assigned device '",
-                                assigned_device_name, "'");
-      }
-      const Device* assigned_device =
-          device_set_->FindDeviceByName(assigned_device_name);
-      if (assigned_device == nullptr) {
-        return errors::Internal("Assigned device '", assigned_device_name,
-                                "' does not match any device");
-      }
-
-      for (const auto& d : member->supported_device_types) {
-        if (DeviceType(assigned_device->attributes().device_type()) ==
-            d.first) {
-          return Status::OK();
-        }
-      }
-
-      return errors::Internal("Assigned device '", assigned_device_name,
-                              "' does not have registered OpKernel support "
-                              "for ",
-                              node.type_string());
-    } else {
-      // This node has not yet been assigned to a device, so we
-      // calculate any constraints due to the set of registered
-      // kernels and any (partial) user-provided device specification
-      // in the NodeDef.
-
-      // If no kernels are registered for this op type, fail with an error.
-      if (member->supported_device_types.empty()) {
-        std::set<string> registered_device_types;
-        for (Device* d : device_set_->devices()) {
-          registered_device_types.insert(d->device_type());
-        }
-        std::vector<string> attr_key_vals;
-        for (const auto& it : node.attrs()) {
-          const string& name = it.first;
-          const AttrValue& attr_value = it.second;
-          attr_key_vals.push_back(
-              strings::StrCat(name, "=", SummarizeAttrValue(attr_value)));
-        }
-        return errors::InvalidArgument(
-            "No OpKernel was registered to support Op '", node.type_string(),
-            "' used by ", errors::FormatNodeNameForError(node.name()),
-            "with these attrs: [", str_util::Join(attr_key_vals, ", "),
-            "]\n"
-            "Registered devices: [",
-            str_util::Join(registered_device_types, ", "), "]\n",
-            "Registered kernels:\n",
-            KernelsRegisteredForOp(node.type_string()));
-      }
-
-      // If the NodeDef contains a device, then we interpret it as a
-      // (partial) device specification.
-      if (!node.requested_device().empty()) {
-        // The user has specified a device in the NodeDef, try to find a
-        // valid device matching their specification in the set of
-        // devices.
-        // NOTE: The full name may specify a device that is not in
-        // n.supported_device_types(), but we check that in AssignDevice().
-        if (!DeviceNameUtils::ParseFullName(node.requested_device(),
-                                            &member->device_name)) {
-          return errors::InvalidArgument("Malformed device specification '",
-                                         node.requested_device(), "'");
-        }
-      }
-    }
-    return Status::OK();
-  }
-
-  static bool HasPriorities(const PrioritizedDeviceTypeVector& device_types) {
-    for (const auto& prioritized_device_type : device_types) {
-      if (prioritized_device_type.second != 0) return true;
-    }
-    return false;
-  }
-
-  static bool ArePrioritiesSame(const PrioritizedDeviceTypeVector& a_types,
-                                const PrioritizedDeviceTypeVector& b_types) {
-    if (a_types.size() != b_types.size()) {
-      return false;
-    }
-    for (int i = 0; i < a_types.size(); ++i) {
-      if (a_types[i].first != b_types[i].first) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  // Updates target to contain the intersection of the device types in
-  // "target" and "other".
-  static void MergeSupportedDevices(PrioritizedDeviceTypeVector* target,
-                                    const PrioritizedDeviceTypeVector& other) {
-    PrioritizedDeviceTypeVector temp = *target;
-    target->clear();
-
-    // Generate intersection with priorities.
-    PrioritizedDeviceTypeVector target_intersection;
-    PrioritizedDeviceTypeVector other_intersection;
-    for (const auto& prioritized_device_type : temp) {
-      bool found = false;
-      for (const auto& other_prioritized_device_type : other) {
-        if (prioritized_device_type.first ==
-            other_prioritized_device_type.first) {
-          found = true;
-          other_intersection.push_back(other_prioritized_device_type);
-          break;
-        }
-      }
-      if (found) {
-        target_intersection.push_back(prioritized_device_type);
-      }
-    }
-
-    // Sort the devices by priority order.
-    auto device_sort = [](const std::pair<DeviceType, int32>& a,
-                          const std::pair<DeviceType, int32>& b) {
-      // First look at set priorities.
-      if (a.second != b.second) {
-        return a.second > b.second;
-      }
-      // Then fallback to default priorities.
-      auto a_priority = DeviceSet::DeviceTypeOrder(a.first);
-      auto b_priority = DeviceSet::DeviceTypeOrder(b.first);
-      if (a_priority != b_priority) {
-        return a_priority > b_priority;
-      }
-      // Finally just look at the Device type strings.
-      return a.first.type_string() < b.first.type_string();
-    };
-
-    std::sort(target_intersection.begin(), target_intersection.end(),
-              device_sort);
-    std::sort(other_intersection.begin(), other_intersection.end(),
-              device_sort);
-
-    bool is_target_prioritized = HasPriorities(target_intersection);
-    bool is_other_prioritized = HasPriorities(other_intersection);
-    // If neither are prioritized then we just return the original i.e. target
-    // prioritization.
-    if (!is_target_prioritized && !is_other_prioritized) {
-      *target = target_intersection;
-    }
-    // If only one is prioritized, then we respect priorities of that in the
-    // intersection.
-    if (is_target_prioritized && !is_other_prioritized) {
-      *target = target_intersection;
-    }
-    if (!is_target_prioritized && is_other_prioritized) {
-      *target = other_intersection;
-    }
-    // If both have priorities and agree then we go with that. If the
-    // prioritization order is different, then we just fallback to the default
-    // i.e. what the DeviceTypeOrder suggests. In that case, we also set the
-    // merged priorities to 0, so that downstream merges work correctly as well.
-    if (is_target_prioritized && is_other_prioritized) {
-      bool priorities_agree =
-          ArePrioritiesSame(target_intersection, other_intersection);
-      if (priorities_agree) {
-        *target = target_intersection;
-      } else {
-        for (const auto& prioritized_device : target_intersection) {
-          target->push_back(std::make_pair(prioritized_device.first, 0));
-        }
-        std::sort(target->begin(), target->end(), device_sort);
-      }
-    }
-  }
-
-  // Returns the root node of the disjoint tree to which the node with the
-  // given id is connected.
-  int FindRoot(int node_id) {
-    Member& member = members_[node_id];
-    DCHECK_GE(member.parent, 0);
-    if (member.parent == node_id) {
-      // member.parent is the root of this disjoint tree.  Do nothing.
-    } else {
-      member.parent = FindRoot(member.parent);
-    }
-    // Now it is guaranteed that member.parent is the root of this disjoint
-    // tree.
-    DCHECK_GE(member.parent, 0);
-    return member.parent;
-  }
-
-  // Ensures that the devices of 'dst's resource and reference match the device
-  // specified for 'src', which is an input of 'dst' with a partially or fully
-  // specified device.
-  Status VerifyResourceAndRefInputsCanBeColocated(
-      const Node* dst, const Node* src,
-      const DeviceNameUtils::ParsedName& src_parsed_name) {
-    std::vector<const Edge*> edges;
-    TF_RETURN_IF_ERROR(dst->input_edges(&edges));
-    for (const Edge* edge : edges) {
-      DataType input_type = dst->input_type(edge->dst_input());
-      if (input_type == DT_RESOURCE || IsRefType(input_type)) {
-        const Node* input_node = edge->src();
-        if (input_node == src) {
-          continue;
-        }
-        const auto& input_root = members_[FindRoot(input_node->id())];
-        const auto& input_parsed_name = input_root.device_name;
-        if (DeviceNameUtils::HasSomeDetails(input_parsed_name) &&
-            !DeviceNameUtils::AreCompatibleDevNames(input_parsed_name,
-                                                    src_parsed_name)) {
-          return AttachDef(
-              errors::InvalidArgument(
-                  "Could not colocate node with its "
-                  "resource and reference inputs; devices ",
-                  DeviceNameUtils::ParsedNameToString(input_parsed_name),
-                  " and ", DeviceNameUtils::ParsedNameToString(src_parsed_name),
-                  " are not compatible."),
-              *dst);
-        }
-      }
-    }
-    return Status::OK();
-  }
-
-  const Graph* const graph_;  // Not owned.
-  std::vector<Member> members_;
-  const DeviceSet* device_set_;  // Not owned.
-  const std::vector<DeviceType> device_types_;
-  const Device* default_device_;
-  const bool allow_soft_placement_;
-  const bool log_device_placement_;
-};
-
 void LogDeviceAssignment(const Node* node, bool log_device_placement) {
   // Log placement if log_device_placement is set.
   if (log_device_placement) {
@@ -984,6 +97,15 @@ Status Placer::Run() {
     return errors::FailedPrecondition("No devices are registered");
   }
 
+  if (VLOG_IS_ON(3)) {
+    DumpGraphToFile("placer_input", *graph_, nullptr, "/tmp");
+    for (const Node* node : graph_->op_nodes()) {
+      VLOG(3) << "    " << node->name() << ": requested: '"
+              << node->requested_device() << "' assigned: '"
+              << node->assigned_device_name() << "'";
+    }
+  }
+
   ColocationGraph colocation_graph(
       graph_, devices_, default_device_,
       options_ == nullptr || options_->config.allow_soft_placement(),
@@ -991,14 +113,15 @@ Status Placer::Run() {
 
   TF_RETURN_IF_ERROR(colocation_graph.Initialize());
 
-  // For each node, assign a device based on the constraints in the
-  // disjoint node set.
+  // For each node, assign a device based on the constraints in the disjoint
+  // node set.
   std::vector<Node*> second_pass;
   for (Node* node : graph_->op_nodes()) {
     // The graph may have come pre-populated by the framework with assigned
     // devices (e.g., for stateful placements), so the placer should not try to
     // place nodes that are already placed.
     if (node->has_assigned_device_name()) {
+      TF_RETURN_IF_ERROR(colocation_graph.LimitToAssignedDevice(*node));
       LogDeviceAssignment(node, log_device_placement_);
       continue;
     }
@@ -1014,7 +137,7 @@ Status Placer::Run() {
       continue;
     }
 
-    std::vector<Device*>* devices;
+    const std::vector<Device*>* devices;
     Status status = colocation_graph.GetDevicesForNode(node, &devices);
     if (!status.ok()) {
       return AttachDef(
@@ -1062,7 +185,7 @@ Status Placer::Run() {
   // Perform a second pass assignment for those nodes explicitly
   // skipped during the first pass.
   for (Node* node : second_pass) {
-    std::vector<Device*>* devices;
+    const std::vector<Device*>* devices;
     Status status = colocation_graph.GetDevicesForNode(node, &devices);
     if (!status.ok()) {
       return AttachDef(
@@ -1099,6 +222,9 @@ Status Placer::Run() {
                                     log_device_placement_));
   }
 
+  if (VLOG_IS_ON(3)) {
+    DumpGraphToFile("placer_output", *graph_, nullptr, "/tmp");
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 04e77e55f62e1bd9345c8e9113407bbf0a375774..ece4fe058b10ca886da8e8afecb748f69f2e22c7 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <memory>
 #include <string>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -24,11 +25,15 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
@@ -40,6 +45,16 @@ limitations under the License.
 
 namespace tensorflow {
 
+using ::tensorflow::test::function::GDef;
+using ::tensorflow::test::function::NDef;
+using FDH = ::tensorflow::FunctionDefHelper;
+
+constexpr char kCPU[] = "/device:fakecpu:0";
+constexpr char kGPU[] = "/device:fakegpu:0";
+
+constexpr char kFullCPU[] = "/job:a/replica:0/task:0/device:fakecpu:0";
+constexpr char kFullGPU[] = "/job:a/replica:0/task:0/device:fakegpu:0";
+
 namespace {
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -210,6 +225,16 @@ class PlacerTest : public ::testing::Test {
     return Status::OK();
   }
 
+  Status BuildGraph(const GraphDef& graph_def, Graph* out_graph) {
+    GraphConstructorOptions opts;
+    TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, graph_def, out_graph));
+    nodes_by_name_.clear();
+    for (Node* node : out_graph->nodes()) {
+      nodes_by_name_[node->name()] = node->id();
+    }
+    return Status::OK();
+  }
+
   // Invokes the Placer on "graph". If no DeviceSet is specified, the
   // placement will use the default DeviceSet (of 10 CPU and 10 GPU devices).
   //
@@ -248,6 +273,16 @@ class PlacerTest : public ::testing::Test {
                              const DeviceType& expected_device_type);
 };
 
+// Fixture that add a parameter for allow_soft_placement.
+// Test cases that want to test behavior with and without soft placement
+// can use this fixture instead of PlacerTest.
+class SoftPlacementPlacerTest : public PlacerTest,
+                                public ::testing::WithParamInterface<bool> {};
+
+INSTANTIATE_TEST_SUITE_P(, SoftPlacementPlacerTest,
+                         ::testing::Values(false, true),
+                         ::testing::PrintToStringParamName());
+
 #define EXPECT_COLOCATED(g, name_a, name_b)                         \
   do {                                                              \
     Graph& g_ = (g);                                                \
@@ -866,7 +901,7 @@ TEST_F(PlacerTest, TestResourceHandle) {
 }
 
 TEST_F(PlacerTest, TestResourceHandlesOnDifferentDevicesFails) {
-  auto handle_test = [this](bool allow_soft_placement) {
+  auto handle_test = [this](bool allow_soft_placement, bool set_assigned) {
     Graph g(OpRegistry::Global());
     {  // Scope for temporary variables used to construct g.
       GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
@@ -878,27 +913,41 @@ TEST_F(PlacerTest, TestResourceHandlesOnDifferentDevicesFails) {
                     b.opts().WithName("two_handles_in"));
       TF_EXPECT_OK(BuildGraph(b, &g));
 
-      GetNodeByName(g, "var_cpu")
-          ->set_assigned_device_name(
-              "/job:a/replica:0/task:0/device:fakecpu:0");
-      GetNodeByName(g, "var_gpu")
-          ->set_assigned_device_name(
-              "/job:a/replica:0/task:0/device:fakegpu:0");
+      if (set_assigned) {
+        GetNodeByName(g, "var_cpu")
+            ->set_assigned_device_name(
+                "/job:a/replica:0/task:0/device:fakecpu:0");
+        GetNodeByName(g, "var_gpu")
+            ->set_assigned_device_name(
+                "/job:a/replica:0/task:0/device:fakegpu:0");
+      } else {
+        GetNodeByName(g, "var_cpu")
+            ->set_requested_device("/job:a/replica:0/task:0/device:fakecpu:0");
+        GetNodeByName(g, "var_gpu")
+            ->set_requested_device("/job:a/replica:0/task:0/device:fakegpu:0");
+      }
     }
 
     SessionOptions options;
     options.config.set_allow_soft_placement(allow_soft_placement);
     options.config.set_log_device_placement(true);
     Status s = Place(&g, &options);
-    EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+    EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
     EXPECT_TRUE(str_util::StrContains(
         s.error_message(),
-        "Could not colocate node with its resource and reference inputs"));
+        "Cannot place the graph because a reference or resource edge "
+        "connects "
+        "colocation groups with incompatible assigned devices: "
+        "/job:a/replica:0/task:0/device:fakegpu:0 vs "
+        "/job:a/replica:0/task:0/device:fakecpu:0"));
+
     return Status::OK();
   };
 
-  TF_EXPECT_OK(handle_test(false));
-  TF_EXPECT_OK(handle_test(true));
+  TF_EXPECT_OK(handle_test(false, false));
+  TF_EXPECT_OK(handle_test(false, true));
+  TF_EXPECT_OK(handle_test(true, false));
+  TF_EXPECT_OK(handle_test(true, true));
 }
 
 // Test that an assignment of an operator to the wrong device
@@ -1034,7 +1083,7 @@ TEST_F(PlacerTest, TestMultipleColocationGroups) {
   EXPECT_COLOCATED(g, "in", "foo");
 }
 
-TEST_F(PlacerTest, TestInvalidMultipleColocationGroups) {
+TEST_P(SoftPlacementPlacerTest, TestInvalidMultipleColocationGroups) {
   Graph g(OpRegistry::Global());
   {  // Scope for temporary variables used to construct g.
     GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
@@ -1051,12 +1100,24 @@ TEST_F(PlacerTest, TestInvalidMultipleColocationGroups) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  Status s = Place(&g);
-  EXPECT_TRUE(str_util::StrContains(
-      s.error_message(),
-      "Cannot colocate nodes {{colocation_node foo}} and "
-      "{{colocation_node in}} because no device type supports both of those "
-      "nodes and the other nodes colocated with them"));
+  bool allow_soft_placement = GetParam();
+  SessionOptions options;
+  options.config.set_allow_soft_placement(allow_soft_placement);
+  options.config.set_log_device_placement(true);
+  Status s = Place(&g, &options);
+  if (allow_soft_placement) {
+    EXPECT_EQ(error::OK, s.code()) << s.ToString();
+    EXPECT_DEVICE_TYPE(g, "in", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "colocated_1", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "foo", "FakeGPU");
+  } else {
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Cannot colocate nodes {{colocation_node foo}} and "
+        "{{colocation_node in}} because no device type supports both of those "
+        "nodes and the other nodes colocated with them"))
+        << s.ToString();
+  }
 }
 
 TEST_F(PlacerTest, TestColocationGroupWithReferenceConnections) {
@@ -1086,7 +1147,8 @@ TEST_F(PlacerTest, TestColocationGroupWithReferenceConnections) {
   EXPECT_COLOCATED(g, "var2", "assign1");
 }
 
-TEST_F(PlacerTest, TestColocationGroupWithUnsatisfiableReferenceConnections) {
+TEST_P(SoftPlacementPlacerTest,
+       TestColocationGroupWithUnsatisfiableReferenceConnections) {
   Graph g(OpRegistry::Global());
   {  // Scope for temporary variables used to construct g.
     GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
@@ -1116,12 +1178,22 @@ TEST_F(PlacerTest, TestColocationGroupWithUnsatisfiableReferenceConnections) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  Status s = Place(&g);
-  EXPECT_TRUE(str_util::StrContains(
-      s.error_message(),
-      "Cannot colocate nodes {{colocation_node var3}} and {{colocation_node "
-      "assign3}} because no device type supports both of those nodes and the "
-      "other nodes colocated with them."));
+  bool allow_soft_placement = GetParam();
+  SessionOptions options;
+  options.config.set_allow_soft_placement(allow_soft_placement);
+  options.config.set_log_device_placement(true);
+  Status s = Place(&g, &options);
+  if (allow_soft_placement) {
+    EXPECT_EQ(error::OK, s.code()) << s.ToString();
+  } else {
+    EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Cannot colocate nodes {{colocation_node assign3}} and "
+        "{{colocation_node var2}} because no device type supports both of "
+        "those nodes and the other nodes colocated with them."))
+        << s.ToString();
+  }
 }
 
 TEST_F(PlacerTest, TestColocationAndReferenceConnections) {
@@ -1617,5 +1689,160 @@ TEST_F(PlacerTest, TestGeneratorNodeDoesntFollowNonColocatedConsumers) {
   EXPECT_DEVICE_TYPE(g, "in", "FakeGPU");
 }
 
+REGISTER_KERNEL_BUILDER(Name("_Arg").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("_Arg").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("_Retval").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("_Retval").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Identity").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Identity").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Const").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Const").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Mul").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Mul").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Add").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Add").Device("FakeGPU"), DummyOp);
+
+TEST_P(SoftPlacementPlacerTest,
+       RequestedDeviceOnResourceGeneratorIsTreatedAsAssigned) {
+  /*
+   *    a:RES:GPU  b:RES:CPU
+   *       |         |
+   *       |         |
+   *       v         v
+   *      id1       id2
+   *     @loc:id2
+   */
+  FunctionDef func = test::function::ResourceOutput();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}, kGPU),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("id1", "Identity", {"a"},
+               {{"T", DT_RESOURCE},
+                {"_class", gtl::ArraySlice<string>({"loc:@id2"})}}),
+          NDef("id2", "Identity", {"b"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+
+  bool allow_soft_placement = GetParam();
+  SessionOptions options;
+  options.config.set_allow_soft_placement(allow_soft_placement);
+  options.config.set_log_device_placement(true);
+  Status s = Place(&g, &options);
+  if (allow_soft_placement) {
+    EXPECT_EQ(error::OK, s.code()) << s.ToString();
+    EXPECT_DEVICE_TYPE(g, "a", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "id1", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "b", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "id2", "FakeCPU");
+  } else {
+    EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Cannot colocate nodes {{colocation_node id2}} and {{colocation_node "
+        "id1}}: Cannot merge devices with incompatible types: "
+        "'/device:fakecpu:0' and '/device:fakegpu:0'"))
+        << s.ToString();
+  }
+}
+
+TEST_F(PlacerTest, RequestedDeviceCanBeOverridden) {
+  /*
+   *     a:RES      b:RES
+   *       |         |
+   *     id_a:GPU   id_b:CPU
+   *       |         |
+   *       v         v
+   *      id1       id2
+   *     @loc:id2
+   */
+  FunctionDef func = test::function::ResourceOutput();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("id_a", "Identity", {"a"}, {{"T", DT_RESOURCE}}, kGPU),
+          NDef("id_b", "Identity", {"b"}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("id1", "Identity", {"id_a"},
+               {{"T", DT_RESOURCE},
+                {"_class", gtl::ArraySlice<string>({"loc:@id2"})}}),
+          NDef("id2", "Identity", {"id_b"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+  TF_ASSERT_OK(Place(&g));
+
+  // All should be colocated
+  EXPECT_COLOCATED(g, "a", "b");
+  EXPECT_COLOCATED(g, "id_a", "id_b");
+  EXPECT_COLOCATED(g, "id1", "id2");
+  EXPECT_COLOCATED(g, "a", "id_a");
+  EXPECT_COLOCATED(g, "a", "id1");
+}
+
+TEST_P(SoftPlacementPlacerTest,
+       AssignedDevicesAreNotOverriddenDueToResourcesAndColocation) {
+  /*
+   *     a:RES      b:RES
+   *       |         |
+   *     id_a:GPU   id_b:CPU
+   *       |         |
+   *       v         v
+   *      id1       id2
+   *     @loc:id2
+   */
+  FunctionDef func = test::function::ResourceOutput();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("id_a", "Identity", {"a"}, {{"T", DT_RESOURCE}}),
+          NDef("id_b", "Identity", {"b"}, {{"T", DT_RESOURCE}}),
+          NDef("id1", "Identity", {"id_a"},
+               {{"T", DT_RESOURCE},
+                {"_class", gtl::ArraySlice<string>({"loc:@id2"})}}),
+          NDef("id2", "Identity", {"id_b"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+  std::unordered_map<string, Node*> nodes = g.BuildNodeNameIndex();
+  GetNodeByName(g, "id_a")->set_assigned_device_name(kFullGPU);
+  GetNodeByName(g, "id_b")->set_assigned_device_name(kFullCPU);
+
+  bool allow_soft_placement = GetParam();
+
+  SessionOptions options;
+  options.config.set_allow_soft_placement(allow_soft_placement);
+  Status s = Place(&g, &options);
+  if (allow_soft_placement) {
+    EXPECT_EQ(error::OK, s.code()) << s.ToString();
+    EXPECT_DEVICE_TYPE(g, "a", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "id_a", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "id1", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "b", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "id_b", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "id2", "FakeCPU");
+  } else {
+    EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Cannot colocate nodes {{colocation_node id2}} and {{colocation_node "
+        "id1}}: Cannot merge devices with incompatible types: "
+        "'/job:a/replica:0/task:0/device:fakecpu:0' and "
+        "'/job:a/replica:0/task:0/device:fakegpu:0'"))
+        << s.ToString();
+  }
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h
index 8be9c7b678e2bbe7659c9e22e31cb595ce704307..603e28b39e171e2de911f88a12ace9f93c421add 100644
--- a/tensorflow/core/common_runtime/pool_allocator.h
+++ b/tensorflow/core/common_runtime/pool_allocator.h
@@ -99,8 +99,6 @@ class PoolAllocator : public Allocator {
     return pool_size_limit_;
   }
 
-  void GetStats(AllocatorStats* stats) override { stats->Clear(); }
-
  private:
   struct PtrRecord {
     void* ptr;
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index 950a93671c7773c83de8c22add3ff23f884a4b1a..6db1fae77ce5550fb5980246ffeea34d6020f877 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -464,7 +464,8 @@ Status GetGraphAndRets(const string& function_name, AttrSlice attrs,
                        const FunctionDef* fdef,
                        const FunctionLibraryDefinition* lib_def,
                        std::unique_ptr<Graph>* graph,
-                       std::vector<string>* ret_node_names) {
+                       std::vector<string>* ret_node_names,
+                       std::vector<string>* control_ret_node_names) {
   auto get_func_sig = [lib_def](const string& op, const OpDef** sig) {
     return lib_def->LookUpOpDef(op, sig);
   };
@@ -484,6 +485,10 @@ Status GetGraphAndRets(const string& function_name, AttrSlice attrs,
   for (const Node* node : fbody->ret_nodes) {
     ret_node_names->push_back(node->name());
   }
+  control_ret_node_names->reserve(fbody->control_ret_nodes.size());
+  for (const Node* node : fbody->control_ret_nodes) {
+    control_ret_node_names->push_back(node->name());
+  }
   return Status::OK();
 }
 
@@ -522,9 +527,18 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
 
   std::unique_ptr<Graph> graph;
   std::vector<string> ret_node_names;
+  std::vector<string> control_ret_node_names;
 
   TF_RETURN_IF_ERROR(GetGraphAndRets(function_name, attrs, fdef, lib_def,
-                                     &graph, &ret_node_names));
+                                     &graph, &ret_node_names,
+                                     &control_ret_node_names));
+
+  if (options.graph_collector != nullptr) {
+    GraphDef def;
+    graph->ToGraphDef(&def);
+    *def.mutable_library() = lib_def->ReachableDefinitions(def).ToProto();
+    options.graph_collector->CollectRawGraph(def);
+  }
 
   DeviceSet device_set;
   for (auto d : device_mgr_->ListDevices()) {
@@ -577,9 +591,9 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
 
   if (options.optimize_graph_fn) {
     DumpGraph("Before running graph optimization fn", graph.get());
-    Status status = options.optimize_graph_fn(std::move(ret_node_names),
-                                              &data->overlay_lib_, device_set,
-                                              cpu_device, &graph);
+    Status status = options.optimize_graph_fn(
+        std::move(ret_node_names), std::move(control_ret_node_names),
+        &data->overlay_lib_, device_set, cpu_device, &graph);
     if (!status.ok()) {
       LOG(WARNING) << "Ignoring multi-device function optimization failure: "
                    << status.ToString();
@@ -592,6 +606,13 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
       OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, optimization_options));
   DumpGraph("After all optimization passes", graph.get());
 
+  if (options.graph_collector != nullptr) {
+    GraphDef def;
+    graph->ToGraphDef(&def);
+    *def.mutable_library() = lib_def->ReachableDefinitions(def).ToProto();
+    options.graph_collector->CollectOptimizedGraph(def);
+  }
+
   std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
   TF_RETURN_IF_ERROR(
       PartitionFunctionGraph(device_set, std::move(graph), &subgraphs));
@@ -600,7 +621,8 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
     for (const auto& pair : subgraphs) {
       GraphDef def;
       pair.second->ToGraphDef(&def);
-      options.graph_collector->CollectGraph(def);
+      *def.mutable_library() = lib_def->ReachableDefinitions(def).ToProto();
+      options.graph_collector->CollectPartitionedGraph(def);
     }
   }
 
diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h
index f30e440c29d7c0a9fb8848ae6c54327b4ea83c33..bc877db99e451577fe5104fd24200da80d0a11a6 100644
--- a/tensorflow/core/common_runtime/process_state.h
+++ b/tensorflow/core/common_runtime/process_state.h
@@ -130,7 +130,7 @@ class RecordingAllocator : public Allocator {
   bool TracksAllocationSizes() override { return a_->TracksAllocationSizes(); }
   size_t RequestedSize(const void* p) override { return a_->RequestedSize(p); }
   size_t AllocatedSize(const void* p) override { return a_->AllocatedSize(p); }
-  void GetStats(AllocatorStats* stats) override { a_->GetStats(stats); }
+  absl::optional<AllocatorStats> GetStats() override { return a_->GetStats(); }
   void ClearStats() override { a_->ClearStats(); }
   ProcessState::MDMap* mm_;  // not owned
   Allocator* a_;             // not owned
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index e1dc08d64545ece29a8aa2ab2612dd3cd994559e..d42b8d55e4f50606578cf249e1f245b72cd7bd24 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -34,14 +34,23 @@ namespace tensorflow {
 
 namespace {
 
+int32 DefaultNumInterOpThreads() {
+  // Use environment setting if specified (init once)
+  static int env_num_threads = NumInterOpThreadsFromEnvironment();
+  if (env_num_threads > 0) {
+    return env_num_threads;
+  }
+
+  // Default to using the number of cores available in the process.
+  return port::NumSchedulableCPUs();
+}
+
 static thread::ThreadPool* InitComputePool(const SessionOptions& options) {
   int32 inter_op_parallelism_threads =
       options.config.inter_op_parallelism_threads();
   if (inter_op_parallelism_threads == 0) {
-    // Default to using the number of cores available in the process.
-    inter_op_parallelism_threads = port::NumSchedulableCPUs();
+    inter_op_parallelism_threads = DefaultNumInterOpThreads();
   }
-
   return new thread::ThreadPool(Env::Default(), "Compute",
                                 inter_op_parallelism_threads);
 }
@@ -53,6 +62,18 @@ thread::ThreadPool* ComputePool(const SessionOptions& options) {
   return compute_pool;
 }
 
+int32 NumInterOpThreadsFromEnvironment() {
+  int32 num;
+  const char* val = std::getenv("TF_NUM_INTEROP_THREADS");
+  return (val && strings::safe_strto32(val, &num)) ? num : 0;
+}
+
+int32 NumIntraOpThreadsFromEnvironment() {
+  int32 num;
+  const char* val = std::getenv("TF_NUM_INTRAOP_THREADS");
+  return (val && strings::safe_strto32(val, &num)) ? num : 0;
+}
+
 int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
   const int32 inter_op = options.config.inter_op_parallelism_threads();
   if (inter_op != 0) return inter_op;
@@ -67,7 +88,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
 #endif  // _OPENMP
     DCHECK_GE(mkl_intra_op, 1);
     const int32 mkl_inter_op = std::max(
-        (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
+        (DefaultNumInterOpThreads() + mkl_intra_op - 1) / mkl_intra_op, 2);
     VLOG(0)
         << "Creating new thread pool with default inter op setting: "
         << mkl_inter_op
@@ -75,8 +96,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
     return mkl_inter_op;
   }
 #endif  // INTEL_MKL
-  // Default to using the number of cores available in the process.
-  return port::NumSchedulableCPUs();
+  return DefaultNumInterOpThreads();
 }
 
 thread::ThreadPool* NewThreadPoolFromSessionOptions(
diff --git a/tensorflow/core/common_runtime/process_util.h b/tensorflow/core/common_runtime/process_util.h
index 5d9266671617320eea4cea60de1ebd7210f3b674..7ad658be9f785032c85f20224a4d592ded7e283c 100644
--- a/tensorflow/core/common_runtime/process_util.h
+++ b/tensorflow/core/common_runtime/process_util.h
@@ -30,7 +30,18 @@ namespace tensorflow {
 // using 'options'.  Caller does not take ownership over threadpool.
 thread::ThreadPool* ComputePool(const SessionOptions& options);
 
-// Returns number of inter op threads.
+// Returns the TF_NUM_INTEROP_THREADS environment value, or 0 if not specified.
+int32 NumInterOpThreadsFromEnvironment();
+
+// Returns the TF_NUM_INTRAOP_THREADS environment value, or 0 if not specified.
+int32 NumIntraOpThreadsFromEnvironment();
+
+// Returns the number of inter op threads specified in `options` or a default.
+// If no value is specified in the provided options, then the function returns
+// the value defined in the TF_NUM_INTEROP_THREADS environment variable.
+// If neither a value is specified in the options or in the environment,
+// this function will return a reasonable default value based on the number
+// of schedulable CPUs, and any MKL and OpenMP configurations.
 int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options);
 
 // Creates a thread pool with number of inter op threads.
diff --git a/tensorflow/core/common_runtime/ring_alg.cc b/tensorflow/core/common_runtime/ring_alg.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c20cc74bf70e3340848666a179c1bb3617a4ede6
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_alg.cc
@@ -0,0 +1,430 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_alg.h"
+
+#include <stdlib.h>
+#include <atomic>
+#include <functional>
+#include <utility>
+
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/collective_util.h"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/types.h"
+
+// Set true for greater intelligibility of debug mode log messages.
+#define READABLE_KEYS false
+// A ring algorithm exchanges chunks of tensor between devices.  The chunk size
+// depends on the number of subdivisions specified in the algorithm.  If the
+// user does not specify the number of subdivisions we may infer the number
+// dynamically so that the resulting chunk size does not exceed
+// kMaxChunkSizeBytes, empirically set at 4 MiB.
+constexpr size_t kMaxChunkSizeBytes = (4 * 1024 * 1024);
+// kMaxSubdivsPerDev is used to give an upper bound on the number of
+// subdivisions dynamically generated.  A reasonable value would be a small
+// multiple of the number of NICs adjacent to each device.
+constexpr int kMaxSubdivsPerDevice = 2;
+
+namespace tensorflow {
+namespace {
+// Each CollectiveOp implementation is free to define its own
+// BufRendezvous key format.  This function produces the key used by
+// RingAlg instances.  Note that the exec_key will differentiate between
+// different instances consequently we don't need to further differentiate
+// between subclasses of RingAlg.
+string RingAlgBufKey(const string& name, const string& exec_key, int pass,
+                     int section, int source_rank) {
+  if (READABLE_KEYS) {
+    return strings::StrCat(name, "(", exec_key, "):pass(", pass, "):section(",
+                           section, "):srcrank(", source_rank, ")");
+  } else {
+    // TODO(b/78352018): Try out some kind of denser encoding, e.g. 128 bit
+    // hash.
+    return strings::StrCat(exec_key, ":", pass, ":", section, ":", source_rank);
+  }
+}
+
+}  // namespace
+
+void RingAlg::PCQueue::Enqueue(RingField* rf) {
+  mutex_lock l(pcq_mu_);
+  deque_.push_back(rf);
+  if (waiter_count_ > 0) {
+    cv_.notify_one();
+  }
+}
+
+RingAlg::RingField* RingAlg::PCQueue::Dequeue() {
+  mutex_lock l(pcq_mu_);
+  if (deque_.empty()) {
+    ++waiter_count_;
+    while (deque_.empty()) {
+      cv_.wait(l);
+    }
+    --waiter_count_;
+  }
+  RingField* rf = deque_.front();
+  deque_.pop_front();
+  return rf;
+}
+
+RingAlg::RingAlg(CollectiveType type, const string& name)
+    : type_(type),
+      name_(name),
+      col_ctx_(nullptr),
+      col_params_(nullptr),
+      done_(nullptr),
+      group_size_(-1),
+      num_subdivs_(-1) {}
+
+namespace {
+Status GenerateSubdivsInCollectiveParams(CollectiveParams* col_params) {
+  if (col_params->instance.shape.num_elements() == 0) {
+    return errors::Internal("shape in CollectiveParams should be non-empty");
+  }
+  const int kAvgDevPerTask =
+      col_params->group.group_size / col_params->group.num_tasks;
+  const int kMaxNumSubdivs = kMaxSubdivsPerDevice * kAvgDevPerTask;
+  if (kMaxNumSubdivs <= 0) {
+    return errors::Internal("Unexpected kMaxNumSubdivs ", kMaxNumSubdivs,
+                            " in ",
+                            col_params->instance.impl_details.collective_name);
+  }
+  // NOTE(ayushd): If no subdiv_offsets have been specified, dynamically add
+  // as many offsets as needed so that the size of tensor chunks <=
+  // kMaxChunkSizeBytes.  Empirically, chunks that are too small or too large
+  // lead to worse performance.
+  int num_subdivs = 0;
+  const size_t tensor_size = col_params->instance.shape.num_elements() *
+                             DataTypeSize(col_params->instance.data_type);
+  size_t chunk_size;
+  do {
+    ++num_subdivs;
+    int num_chunks = col_params->group.group_size * num_subdivs;
+    chunk_size = tensor_size / num_chunks;
+    VLOG(2) << "num_subdivs " << num_subdivs << " num_chunks " << num_chunks
+            << " chunk_size " << chunk_size;
+  } while (chunk_size > kMaxChunkSizeBytes && num_subdivs < kMaxNumSubdivs);
+  if (num_subdivs <= 0) {
+    return errors::Internal("Unexpected num_subdivs ", num_subdivs, " in ",
+                            col_params->instance.impl_details.collective_name);
+  }
+
+  int subdiv_stride = kAvgDevPerTask / num_subdivs;
+  if (subdiv_stride == 0) subdiv_stride = 1;
+  col_params->instance.impl_details.subdiv_offsets.reserve(num_subdivs);
+  for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+    int subdiv_offset = subdiv_stride * sdi;
+    if (sdi % 2 == 1) subdiv_offset *= -1;
+    col_params->instance.impl_details.subdiv_offsets.push_back(subdiv_offset);
+  }
+
+  if (VLOG_IS_ON(2)) {
+    string subdiv_buf;
+    for (const int subdiv_offset :
+         col_params->instance.impl_details.subdiv_offsets) {
+      strings::StrAppend(&subdiv_buf, " ", subdiv_offset);
+    }
+    VLOG(2) << "Dynamically generated " << num_subdivs
+            << " subdiv_offsets:" << subdiv_buf << " tensor_size "
+            << tensor_size << " chunk_size " << chunk_size;
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+Status RingAlg::InitializeCollectiveParams(CollectiveParams* col_params) {
+  const string& device_name =
+      col_params->instance.device_names[col_params->default_rank];
+  // Each subdiv permutation is a ring formed by rotating each
+  // single-task subsequence of devices by an offset.  This makes most
+  // sense when each task has the same number of devices but we can't
+  // depend on that being the case so we'll compute something that
+  // works in any case.
+
+  // Start by counting the devices in each task.
+  // Precondition: device_names must be sorted so that all devices in
+  // the same task are adjacent.
+  VLOG(2) << "Sorted task names: "
+          << str_util::Join(col_params->instance.task_names, ", ");
+  std::vector<int> dev_per_task;
+  const string* prior_task_name = &col_params->instance.task_names[0];
+  int dev_count = 1;
+  for (int di = 1; di < col_params->group.group_size; ++di) {
+    if (col_params->instance.task_names[di] != *prior_task_name) {
+      dev_per_task.push_back(dev_count);
+      dev_count = 1;
+      prior_task_name = &col_params->instance.task_names[di];
+    } else {
+      ++dev_count;
+    }
+  }
+  dev_per_task.push_back(dev_count);
+  DCHECK_EQ(col_params->group.num_tasks, dev_per_task.size());
+
+  if (col_params->instance.impl_details.subdiv_offsets.empty()) {
+    TF_RETURN_IF_ERROR(GenerateSubdivsInCollectiveParams(col_params));
+  }
+
+  // Generate a ring permutation for requested offset.
+  VLOG(2) << "Setting up perms for col_params " << col_params
+          << " subdiv_permutations "
+          << &col_params->instance.impl_details.subdiv_permutations;
+  col_params->instance.impl_details.subdiv_permutations.resize(
+      col_params->instance.impl_details.subdiv_offsets.size());
+  col_params->subdiv_rank.resize(
+      col_params->instance.impl_details.subdiv_offsets.size(), -1);
+  for (int sdi = 0;
+       sdi < col_params->instance.impl_details.subdiv_offsets.size(); ++sdi) {
+    std::vector<int>& perm =
+        col_params->instance.impl_details.subdiv_permutations[sdi];
+    DCHECK_EQ(perm.size(), 0);
+    int offset = col_params->instance.impl_details.subdiv_offsets[sdi];
+    // A negative subdivision offset is interpreted as follows:
+    //  1. Reverse the local device ordering.
+    //  2. Begin the subdivision at abs(offset) in the reversed ordering.
+    bool reverse = false;
+    if (offset < 0) {
+      offset = abs(offset);
+      reverse = true;
+    }
+    int prior_dev_count = 0;  // sum over prior worker device counts
+    for (int ti = 0; ti < col_params->group.num_tasks; ++ti) {
+      for (int di = 0; di < dev_per_task[ti]; ++di) {
+        int di_offset = (di + offset) % dev_per_task[ti];
+        int offset_di =
+            reverse ? (dev_per_task[ti] - (di_offset + 1)) : di_offset;
+        // Device index in global subdivision permutation.
+        int permuted_di = prior_dev_count + offset_di;
+        int rank = static_cast<int>(perm.size());
+        perm.push_back(permuted_di);
+        if (col_params->instance.device_names[permuted_di] == device_name) {
+          DCHECK_EQ(permuted_di, col_params->default_rank);
+          col_params->subdiv_rank[sdi] = rank;
+        }
+      }
+      prior_dev_count += dev_per_task[ti];
+    }
+    DCHECK_EQ(col_params->group.group_size, perm.size());
+  }
+
+  VLOG(2) << collective_util::SubdivPermDebugString(*col_params);
+  return Status::OK();
+}
+
+Status RingAlg::InitializeCollectiveContext(CollectiveContext* col_ctx) {
+  DCHECK(col_ctx->dev_mgr);
+  col_ctx_ = col_ctx;
+  col_params_ = &col_ctx->col_params;
+  return collective_util::InitializeDeviceAndLocality(
+      col_ctx->dev_mgr, col_ctx->device_name, &col_ctx->device,
+      &col_ctx->device_locality);
+}
+
+string RingAlg::TensorDebugString(const Tensor& tensor) {
+  const DeviceBase::GpuDeviceInfo* gpu_device_info =
+      col_ctx_->op_ctx->device()->tensorflow_gpu_device_info();
+  if (gpu_device_info) {
+    Tensor cpu_tensor(tensor.dtype(), tensor.shape());
+    Notification note;
+    gpu_device_info->default_context->CopyDeviceTensorToCPU(
+        &tensor, "" /*tensor_name*/, col_ctx_->device, &cpu_tensor,
+        [&note](const Status& s) {
+          DCHECK(s.ok());
+          note.Notify();
+        });
+    note.WaitForNotification();
+    return cpu_tensor.SummarizeValue(64);
+  } else {
+    return tensor.SummarizeValue(64);
+  }
+}
+
+void RingAlg::StartAbort(const Status& s) {
+  // In abort mode we stop issuing additional ProvideBuf
+  // and ConsumeBuf calls, but we need to wait for all of the
+  // outstanding callbacks to be invoked before quitting.
+  bool abort_started = false;
+  {
+    mutex_lock l(status_mu_);
+    if (status_.ok()) {
+      LOG(ERROR) << "Aborting Ring" << name_ << " with " << s;
+      abort_started = true;
+      status_.Update(s);
+    }
+  }
+  // If this is the initial entry to abort mode then invoke StartAbort
+  // on the CollectiveExecutor that invoked us.  That should start
+  // cancellation on all of the outstanding CollectiveRemoteAccess
+  // actions.
+  if (abort_started) {
+    col_ctx_->col_exec->StartAbort(s);
+  }
+}
+
+void RingAlg::Finish(bool ok) {
+  if (ok) {
+    // Recover the output from the adaptor.
+    ca_->ConsumeFinalValue(col_ctx_->output);
+  }
+  Status s;
+  {
+    mutex_lock l(status_mu_);
+    s = status_;
+  }
+  rfv_.clear();  // Give up Refs on output tensor.
+  done_(s);
+}
+
+// At the beginning of the algorithm initialize a RingField struct for
+// every independent field of the tensor.
+void RingAlg::InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
+                            int field_idx) {
+  // Note on field indexing: There are group_size_ devices in the
+  // instance, implying the same number of chunks per tensor, where a
+  // chunk is the unit of data transferred in a time step.  However, if
+  // a device can simultaneously send data by 2 or more independent
+  // channels we can speed up the transfer by subdividing chunks and
+  // processing multiple subdivisions at once.  So the actual number
+  // of RingFields is group_size_ * num_subdivs_.
+  DCHECK_EQ(field_idx, (chunk_idx * num_subdivs_) + subdiv_idx);
+  rf->chunk_idx = chunk_idx;
+  rf->subdiv_idx = subdiv_idx;
+  rf->sc_idx = field_idx;
+  rf->rank = col_params_->subdiv_rank[subdiv_idx];
+  rf->second_pass = false;
+  rf->action = RF_INIT;
+  // Recv from the device with preceding rank within the subdivision.
+  int recv_from_rank = (rf->rank + (group_size_ - 1)) % group_size_;
+  int send_to_rank = (rf->rank + 1) % group_size_;
+  rf->recv_dev_idx = col_params_->instance.impl_details
+                         .subdiv_permutations[subdiv_idx][recv_from_rank];
+  int send_dev_idx = col_params_->instance.impl_details
+                         .subdiv_permutations[subdiv_idx][send_to_rank];
+  rf->recv_is_remote = !col_params_->task.is_local[rf->recv_dev_idx];
+  rf->send_is_remote = !col_params_->task.is_local[send_dev_idx];
+  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
+    // In pass 0 we skip Recv when rank = chunk_idx
+    rf->do_recv = (rf->chunk_idx != rf->rank);
+    // In pass 0 we skip Send when rank = chunk_idx-1
+    rf->do_send =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+  }
+  rf->is_final =
+      (rf->rank == ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+  if (rf->do_send || rf->do_recv) {
+    rf->chunk = ca_->ChunkAlias(rf->sc_idx);
+  }
+  VLOG(2) << this << " InitRingField " << rf->DebugString() << " chunk "
+          << ca_->TBounds(rf->chunk);
+}
+
+// When a RingField transitions from first to second recompute the
+// do_send and do_recv values.
+void RingAlg::AdvanceToSecondPass(RingField* rf) {
+  VLOG(3) << "IncrRingField old value " << rf->DebugString();
+  DCHECK(!rf->second_pass);
+  rf->second_pass = true;
+  rf->action = RF_INIT;
+  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
+    // In pass 1 the send/no-send boundary moves down 1 place.
+    rf->do_recv =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+    rf->do_send =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
+  }
+  rf->is_final =
+      (rf->rank == ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
+  VLOG(3) << "IncrRingField new value " << rf->DebugString();
+}
+
+string RingAlg::RingField::DebugString() const {
+  string rv = strings::StrCat("RingField rank=", rank, " chunk_idx=", chunk_idx,
+                              " subdiv=", subdiv_idx, " sc_idx=", sc_idx,
+                              " action=", action);
+  strings::StrAppend(&rv, " pass=", second_pass);
+  strings::StrAppend(&rv, " do_send=", do_send, " do_recv=", do_recv,
+                     " is_final=", is_final, " recv_is_remote=", recv_is_remote,
+                     " recv_dev_idx=", recv_dev_idx, " sc_idx=", sc_idx);
+  return rv;
+}
+
+void RingAlg::DispatchSend(RingField* rf, const StatusCallback& done) {
+  DCHECK(rf->do_send);
+  string send_buf_key = RingAlgBufKey(name_, col_ctx_->exec_key,
+                                      rf->second_pass, rf->sc_idx, rf->rank);
+  VLOG(3) << "DispatchSend rank=" << col_params_->default_rank << " send key "
+          << send_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " sc_idx "
+          << rf->sc_idx;
+  int send_to_rank = (rf->rank + 1) % group_size_;
+  int send_to_dev_idx = col_params_->instance.impl_details
+                            .subdiv_permutations[rf->subdiv_idx][send_to_rank];
+  col_ctx_->col_exec->PostToPeer(
+      col_params_->instance.device_names[send_to_dev_idx],
+      col_params_->instance.task_names[send_to_dev_idx], send_buf_key,
+      col_ctx_->device, col_ctx_->op_ctx->op_device_context(),
+      col_ctx_->op_ctx->output_alloc_attr(0), &rf->chunk,
+      col_ctx_->device_locality, done);
+}
+
+void RingAlg::DispatchRecv(RingField* rf, const StatusCallback& done) {
+  DCHECK(rf->do_recv);
+  string recv_buf_key =
+      RingAlgBufKey(name_, col_ctx_->exec_key, rf->second_pass, rf->sc_idx,
+                    (rf->rank + (group_size_ - 1)) % group_size_);
+  VLOG(3) << "DispatchRecv rank=" << col_params_->default_rank << " recv key "
+          << recv_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " into "
+          << ((col_params_->merge_op != nullptr) ? "tmp_chunk" : "chunk");
+  Tensor* dst_tensor = (!rf->second_pass && (col_params_->merge_op != nullptr))
+                           ? &rf->tmp_chunk
+                           : &rf->chunk;
+  col_ctx_->col_exec->RecvFromPeer(
+      col_params_->instance.device_names[rf->recv_dev_idx],
+      col_params_->instance.task_names[rf->recv_dev_idx],
+      col_params_->task.is_local[rf->recv_dev_idx], recv_buf_key,
+      col_ctx_->device, col_ctx_->op_ctx->op_device_context(),
+      col_ctx_->op_ctx->output_alloc_attr(0), dst_tensor,
+      col_ctx_->device_locality, rf->subdiv_idx, done);
+}
+
+string RingAlg::FieldState() {
+  string s = strings::StrCat(
+      "Ring", name_, " ", strings::Hex(reinterpret_cast<uint64>(this)),
+      " exec ", col_ctx_->exec_key, " step_id=", col_ctx_->step_id,
+      " state of all ", rfv_.size(), " fields:");
+  for (int i = 0; i < rfv_.size(); ++i) {
+    s.append("\n");
+    s.append(rfv_[i].DebugString());
+  }
+  return s;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/ring_alg.h b/tensorflow/core/common_runtime/ring_alg.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc07618f8805e3a9abcaf575c3d2984aa27948b7
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_alg.h
@@ -0,0 +1,124 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RING_ALG_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_RING_ALG_H_
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/framework/collective.h"
+
+namespace tensorflow {
+class Device;
+
+// Basic ring-algorithm implementation to be further specialized
+// for specific collective functions.
+class RingAlg : public CollectiveImplementationInterface {
+ public:
+  explicit RingAlg(CollectiveType type, const string& name);
+  ~RingAlg() override {}
+
+  // Establishes the requested number of subdivision permutations based on the
+  // ring order implicit in the device order.
+  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
+
+  // Initializes members of CollectiveContext not yet initialized, i.e. device
+  // and device_locality.  Also saves the CollectiveContext in this object.
+  Status InitializeCollectiveContext(CollectiveContext* col_ctx) override;
+
+  // No-op for ring alg.
+  Status InitializeInstanceBeforeGroupDiscovery(CollectiveParams*) override {
+    return Status::OK();
+  }
+
+ protected:
+  // Called when a bad status is received that implies we should terminate
+  // execution and return a bad status.
+  void StartAbort(const Status& s);
+  void Finish(bool ok);
+
+  // Current status of a RingField
+  enum RingFieldAction {
+    RF_INIT = 0,    // Just initialized for a pass
+    RF_RECV,        // Recv pending
+    RF_REDUCE,      // Reduce pending
+    RF_FINALIZE,    // FinalOp pending
+    RF_SEND_READY,  // Ready to send
+    RF_SEND,        // Send pending
+    RF_DONE,        // No more work
+  };
+
+  // Tracks progress of actions on a single subfield of the entire tensor.
+  struct RingField {
+    int16 chunk_idx;     // major division index
+    int16 subdiv_idx;    // minor division index
+    int16 sc_idx;        // subchunk index
+    int16 rank;          // rank within subdiv permutation
+    int16 recv_dev_idx;  // dev from which value should be recv'd
+    RingFieldAction action;
+    bool second_pass;
+    bool recv_is_remote = false;
+    bool send_is_remote = false;
+    bool do_send = false;   // is the value sent in this pass?
+    bool do_recv = false;   // is the value recv'd in this pass?
+    bool is_final = false;  // is the last field in the pass for this rank
+    Tensor chunk;           // alias to field values
+    Tensor tmp_chunk;
+    Status status;
+    string DebugString() const;
+  };
+  virtual void InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
+                             int field_idx);
+  void AdvanceToSecondPass(RingField* rf);
+  void DispatchSend(RingField* rf, const StatusCallback& done);
+  void DispatchRecv(RingField* rf, const StatusCallback& done);
+
+  // For constructing log messages for debugging.
+  string FieldState();
+  string TensorDebugString(const Tensor& tensor);
+
+  // Producer/Consumer Queue of RingField structs.
+  class PCQueue {
+   public:
+    void Enqueue(RingField* rf);
+    RingField* Dequeue();
+
+   private:
+    mutex pcq_mu_;
+    condition_variable cv_;
+    int waiter_count_ GUARDED_BY(pcq_mu_) = 0;
+    std::deque<RingField*> deque_ GUARDED_BY(pcq_mu_);
+  };
+
+  const CollectiveType type_;
+  const string name_;
+  CollectiveContext* col_ctx_;          // Not owned
+  const CollectiveParams* col_params_;  // Not owned
+  StatusCallback done_;
+  int group_size_;
+  int num_subdivs_;
+  Tensor group_size_tensor_;
+  Notification group_size_tensor_ready_;
+  std::unique_ptr<CollectiveAdapter> ca_;
+  mutex status_mu_;
+  Status status_ GUARDED_BY(status_mu_);
+  std::vector<RingField> rfv_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_RING_ALG_H_
diff --git a/tensorflow/core/common_runtime/ring_gatherer.cc b/tensorflow/core/common_runtime/ring_gatherer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..58251fc171459ee35820c3157c48e4222e9f1ec2
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_gatherer.cc
@@ -0,0 +1,266 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_gatherer.h"
+
+#include <stdlib.h>
+#include <atomic>
+#include <functional>
+#include <utility>
+
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/collective_util.h"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+Status RingGatherer::InitializeCollectiveParams(CollectiveParams* col_params) {
+  DCHECK_EQ(col_params->instance.type, GATHER_COLLECTIVE);
+  DCHECK_EQ(col_params->instance.impl_details.collective_name, "RingGather");
+  // TODO(tucker): Maybe add subdiv support.  It's only useful with
+  // multiple NICS, and maybe gather performance isn't important enough.
+  // For now, there must always be only a single subdiv at offset 0.
+  if (!col_params->instance.impl_details.subdiv_offsets.empty() &&
+      (col_params->instance.impl_details.subdiv_offsets.size() > 1 ||
+       col_params->instance.impl_details.subdiv_offsets[0] != 0)) {
+    return errors::InvalidArgument(
+        "RingGather cannot take any subdiv offset other than 0.");
+  }
+  if (col_params->instance.impl_details.subdiv_offsets.empty()) {
+    col_params->instance.impl_details.subdiv_offsets.push_back(0);
+  }
+  return RingAlg::InitializeCollectiveParams(col_params);
+}
+
+void RingGatherer::Run(StatusCallback done) {
+  DCHECK(col_ctx_);
+  DCHECK(col_params_);
+  done_ = std::move(done);
+  group_size_ = col_params_->group.group_size;
+  num_subdivs_ = static_cast<int>(
+      col_params_->instance.impl_details.subdiv_permutations.size());
+  DCHECK_GT(num_subdivs_, 0);
+
+  if (VLOG_IS_ON(1)) {
+    string buf;
+    for (int r = 0; r < col_params_->instance.device_names.size(); ++r) {
+      strings::StrAppend(&buf, "dev ", r, " : ",
+                         col_params_->instance.device_names[r], "\n");
+    }
+    for (int sd = 0;
+         sd < col_params_->instance.impl_details.subdiv_permutations.size();
+         ++sd) {
+      strings::StrAppend(&buf, "\nsubdiv ", sd, " perm: ");
+      for (auto x :
+           col_params_->instance.impl_details.subdiv_permutations[sd]) {
+        strings::StrAppend(&buf, x, ", ");
+      }
+    }
+    VLOG(1) << "RingGatherer::Run for device " << col_ctx_->device_name
+            << " default_rank " << col_params_->default_rank << "\n"
+            << buf;
+  }
+
+  // Prepare to alias fields within the output.
+  AllocatorAttributes attr = col_ctx_->op_ctx->output_alloc_attr(0);
+  ca_.reset(MakeCollectiveAdapter(col_ctx_->output, group_size_ * num_subdivs_,
+                                  col_ctx_->device->GetAllocator(attr),
+                                  false /*align_chunks*/));
+
+  // Start by copying input to the rank-specific offset of output.
+  // We are running in a blockable thread and the callback can't block so
+  // just wait here on the copy.
+  Notification note;
+  Status status;
+  Tensor alias_chunk(ca_->ChunkAlias(col_params_->subdiv_rank[0]));
+  CollectiveRemoteAccessLocal::MemCpyAsync(
+      col_ctx_->op_ctx->input_device_context(0),
+      col_ctx_->op_ctx->op_device_context(), col_ctx_->device, col_ctx_->device,
+      col_ctx_->op_ctx->input_alloc_attr(0),
+      col_ctx_->op_ctx->output_alloc_attr(0), col_ctx_->input, &alias_chunk,
+      0 /*dev_to_dev_stream_index*/, [&note, &status](const Status& s) {
+        status.Update(s);
+        note.Notify();
+      });
+  note.WaitForNotification();
+  if (!status.ok()) {
+    done_(status);
+    return;
+  }
+  Finish(RunAsyncParts());
+}
+
+bool RingGatherer::RunAsyncParts() {
+  // This function orchestrates RingGatherer actions on behalf of a
+  // single device. It is entered by a blockable thread that
+  // loops within it until all actions assigned to that device
+  // complete. Hence function local variables are accessible only by that
+  // one thread and do not require an explicit mutex.
+  rfv_.clear();
+  rfv_.resize(group_size_ * num_subdivs_);
+  PCQueue ready_queue;
+  for (int chunk_idx = 0; chunk_idx < group_size_; ++chunk_idx) {
+    for (int subdiv_idx = 0; subdiv_idx < num_subdivs_; ++subdiv_idx) {
+      int rf_index = (chunk_idx * num_subdivs_) + subdiv_idx;
+      InitRingField(&rfv_[rf_index], chunk_idx, subdiv_idx, rf_index);
+      ready_queue.Enqueue(&rfv_[rf_index]);
+    }
+  }
+  const DeviceBase::GpuDeviceInfo* gpu_info =
+      col_ctx_->device->tensorflow_gpu_device_info();
+  if (gpu_info) {
+    // Wait for all currently queued events on the CPU compute stream to
+    // complete before proceeding.  The previous InitRingField calls allocated
+    // temp memory buffers that are not guaranteed to be valid (e.g. for RDMA
+    // write) unless we do.
+    Notification note;
+    Status s = gpu_info->default_context->ThenExecute(
+        col_ctx_->device, gpu_info->stream, [&note]() { note.Notify(); });
+    if (s.ok()) {
+      note.WaitForNotification();
+    } else {
+      mutex_lock l(status_mu_);
+      status_ =
+          errors::Internal("Failed to dispatch ThenExecute in RingGatherer");
+      return false;
+    }
+  }
+
+  int field_done_count = 0;
+  int send_pending_count = 0;
+  int recv_pending_count = 0;
+  std::atomic<bool> aborted(false);
+
+  // Loop until all RingFields have advanced to completion.
+  while (field_done_count < rfv_.size()) {
+    VLOG(4) << FieldState();
+    // Wait for a RingField to appear in the ready_queue.
+    RingField* rf = ready_queue.Dequeue();
+    // Advance the RingField to its next action and execute, repeating
+    // until either an async action has been started or the RingField
+    // is done.
+    bool dispatched = false;  // true if async action was initiated
+    do {
+      if (aborted) {
+        // Requeue this RingField to be counted off below.
+        ready_queue.Enqueue(rf);
+        break;
+      }
+      switch (rf->action) {
+        case RF_INIT:
+          if (rf->do_recv) {
+            rf->action = RF_RECV;
+            auto requeue = [this, rf, &ready_queue, &aborted](Status s) {
+              if (!s.ok()) {
+                aborted = true;
+                StartAbort(s);
+              }
+              ready_queue.Enqueue(rf);
+            };
+            DispatchRecv(rf, requeue);
+            dispatched = true;
+            ++recv_pending_count;
+          } else {
+            rf->action = RF_SEND_READY;
+          }
+          break;
+        case RF_RECV:
+          DCHECK_GT(recv_pending_count, 0);
+          --recv_pending_count;
+          rf->action = RF_SEND_READY;
+          break;
+        case RF_REDUCE:
+          // Never used for Gather, so just fall through.
+          TF_FALLTHROUGH_INTENDED;
+        case RF_FINALIZE:
+          // Never used for Gather, so just fall through.
+          TF_FALLTHROUGH_INTENDED;
+        case RF_SEND_READY:
+          if (rf->do_send) {
+            rf->action = RF_SEND;
+            auto send_complete = [this, rf, &ready_queue, &aborted](Status s) {
+              if (!s.ok()) {
+                aborted = true;
+                StartAbort(s);
+              }
+              ready_queue.Enqueue(rf);
+            };
+            DispatchSend(rf, send_complete);
+            dispatched = true;
+            ++send_pending_count;
+          } else {
+            rf->action = RF_DONE;
+          }
+          break;
+        case RF_SEND:
+          DCHECK_GT(send_pending_count, 0);
+          --send_pending_count;
+          rf->action = RF_DONE;
+          break;
+        case RF_DONE:
+          break;
+      }
+      if (rf->action == RF_DONE) {
+        // There's only one pass.
+        ++field_done_count;
+        break;  // from do while(!dispatched)
+      }
+    } while (!dispatched);
+    if (aborted) break;
+  }  // while (field_done_count < number of fields)
+
+  if (aborted) {
+    // All of the pending data actions should be aborted; field the
+    // callbacks and clear the queue before quitting.
+    while ((send_pending_count > 0) || (recv_pending_count > 0)) {
+      RingField* rf = ready_queue.Dequeue();
+      switch (rf->action) {
+        case RF_RECV:
+          --recv_pending_count;
+          break;
+        case RF_SEND:
+          --send_pending_count;
+          break;
+        default: {
+        }  // Ignore any other actions
+      }
+    }
+  }
+
+  DCHECK_EQ(send_pending_count, 0);
+  DCHECK_EQ(recv_pending_count, 0);
+
+  VLOG(2) << this << " device=" << col_ctx_->device_name << " finish;"
+          << " final value " << TensorDebugString(ca_->Value());
+  return !aborted;
+}
+
+REGISTER_COLLECTIVE(RingGather, RingGatherer);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/ring_gatherer.h b/tensorflow/core/common_runtime/ring_gatherer.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee9634834d2b6c9d986cfb1841ae03c51e22564b
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_gatherer.h
@@ -0,0 +1,51 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RING_GATHERER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_RING_GATHERER_H_
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/ring_alg.h"
+#include "tensorflow/core/framework/collective.h"
+
+namespace tensorflow {
+class Device;
+
+// Ring-algorithm implementation of collective all-gather.
+class RingGatherer : public RingAlg {
+ public:
+  RingGatherer() : RingAlg(GATHER_COLLECTIVE, "Gather") {}
+  ~RingGatherer() override {}
+
+  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
+
+  // Begins async execution of the ring gather algorithm.
+  // Must be called in a blockable thread.
+  // TODO(b/80529858): remove the previous warning when we have a dedicated
+  // collective threadpool.
+  void Run(StatusCallback done) override;
+
+ private:
+  bool RunAsyncParts();
+
+  friend class RingGathererTest;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_RING_GATHERER_H_
diff --git a/tensorflow/core/common_runtime/ring_gatherer_test.cc b/tensorflow/core/common_runtime/ring_gatherer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..97ff7b58fa700d72bde145c0cb789228cf163cc6
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_gatherer_test.cc
@@ -0,0 +1,651 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_gatherer.h"
+
+#include <algorithm>
+#include "absl/memory/memory.h"
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/test_collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+
+// Wraps CollectiveRemoteAccessLocal with the ability to return an
+// error status to the N'th action.
+class FailTestRMA : public CollectiveRemoteAccessLocal {
+ public:
+  FailTestRMA(const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
+              int64 step_id, int fail_after)
+      : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
+        fail_after_(fail_after) {}
+
+  bool MaybeFail(const StatusCallback& done) {
+    bool fail_now = false;
+    {
+      mutex_lock l(mu_);
+      if (fail_after_ > 0) {
+        fail_now = (--fail_after_ == 0);
+      }
+    }
+    if (fail_now) {
+      done(errors::Internal("Deliberate failure"));
+      return true;
+    }
+    return false;
+  }
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    int dev_to_dev_stream_index,
+                    const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::RecvFromPeer(
+        peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
+        to_alloc_attr, to_tensor, client_locality, dev_to_dev_stream_index,
+        done);
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::PostToPeer(
+        peer_device, peer_task, key, from_device, from_device_ctx,
+        from_alloc_attr, from_tensor, client_locality, done);
+  }
+
+  mutex mu_;
+  int fail_after_ GUARDED_BY(mu_);
+};
+
+std::unique_ptr<OpKernel> GetKernel(const NodeDef& node,
+                                    const DeviceType& device_type,
+                                    DeviceBase* device) {
+  Status status;
+  std::unique_ptr<OpKernel> k = CreateOpKernel(
+      device_type, device, device->GetAllocator(AllocatorAttributes()), node,
+      TF_GRAPH_DEF_VERSION, &status);
+  if (!status.ok()) {
+    LOG(FATAL) << status;
+  }
+  return k;
+}
+
+static int64 kStepId = 123;
+
+class RingGathererTest : public ::testing::Test {
+ protected:
+  RingGathererTest() : device_type_(DEVICE_CPU) {}
+
+#ifdef GOOGLE_CUDA
+  void InitGPUDevices() {
+    auto device_factory = DeviceFactory::GetFactory("GPU");
+    CHECK(device_factory);
+    SessionOptions options;
+    Status s = device_factory->CreateDevices(
+        options, "/job:worker/replica:0/task:0", &gpu_devices_);
+    CHECK(s.ok());
+  }
+#endif
+
+  ~RingGathererTest() override {
+    stop_ = true;
+    for (auto i : instances_) delete i;
+    if (col_exec_) col_exec_->Unref();
+  }
+
+  void Init(int num_workers, int num_devices, DataType dtype,
+            const DeviceType& device_type, int num_subdivs, int fail_after) {
+#ifdef GOOGLE_CUDA
+    InitGPUDevices();
+#endif
+    device_type_ = device_type;
+    std::vector<std::unique_ptr<Device>> local_devices;
+    SessionOptions sess_opts;
+    sess_opts.env = Env::Default();
+    Bytes mem_limit(4 << 20);
+    DeviceLocality dev_locality;
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        if (device_type == DEVICE_CPU) {
+          string dev_name =
+              strings::StrCat("/job:worker/replica:0/task:", wi, "/cpu:", di);
+          local_devices.push_back(absl::make_unique<ThreadPoolDevice>(
+              sess_opts, dev_name, mem_limit, dev_locality, cpu_allocator()));
+        } else if (device_type == DEVICE_GPU && !gpu_devices_.empty()) {
+          int dev_idx = (wi * num_devices) + di;
+          if (dev_idx >= static_cast<int>(gpu_devices_.size())) {
+            LOG(INFO) << "dev_mgr has access to limited GPUs, reusing for more "
+                         "than one ring node.";
+          } else {
+            local_devices.push_back(std::move(gpu_devices_[dev_idx]));
+          }
+        } else {
+          LOG(FATAL) << "Unsupported device_type " << device_type;
+        }
+      }
+    }
+    if (!dev_mgr_ || device_type == DEVICE_CPU) {
+      LOG(ERROR) << "resetting dev_mgr for " << local_devices.size()
+                 << " devices: ";
+      dev_mgr_.reset(new DeviceMgr(std::move(local_devices)));
+    }
+    if (!gpu_ring_order_) gpu_ring_order_.reset(new string());
+    dev_resolver_.reset(new DeviceResolverLocal(dev_mgr_.get()));
+    rma_ = new FailTestRMA(dev_mgr_.get(), dev_resolver_.get(), kStepId,
+                           fail_after);
+    col_exec_ = new BaseCollectiveExecutor(
+        &col_exec_mgr_, rma_, kStepId, dev_mgr_.get(), gpu_ring_order_.get());
+    col_params_.name = "test_collective";
+    static const int kGroupKey = 5;
+    col_params_.group.group_key = kGroupKey;
+    col_params_.group.device_type = device_type;
+    col_params_.group.group_size = num_workers * num_devices;
+    static const int kInstanceKey = 17;
+    col_params_.instance.instance_key = kInstanceKey;
+    col_params_.instance.impl_details.subdiv_offsets.clear();
+    col_params_.instance.type = GATHER_COLLECTIVE;
+    col_params_.instance.impl_details.collective_name = "RingGather";
+    col_params_.instance.data_type = dtype;
+    col_params_.instance.impl_details.subdiv_permutations.resize(num_subdivs);
+    col_params_.subdiv_rank.resize(num_subdivs);
+    int subdiv_stride = num_devices / num_subdivs;
+    for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+      col_params_.instance.impl_details.subdiv_offsets.push_back(sdi *
+                                                                 subdiv_stride);
+      col_params_.subdiv_rank[sdi] = sdi * subdiv_stride;
+    }
+
+    // Set up a local device ring order that's not just 0,1,2...
+    std::vector<int> local_ring_order;
+    for (int di = 0; di < num_devices; ++di) {
+      local_ring_order.push_back(di);
+    }
+    for (int di = 0; di < num_devices; ++di) {
+      bool is_odd = ((di % 2) == 1);
+      int other = (di + (is_odd ? 7 : 3)) % num_devices;
+      if (di == other) continue;
+      iter_swap(local_ring_order.begin() + di,
+                local_ring_order.begin() + other);
+    }
+    string lro_buf;
+    for (auto d : local_ring_order) strings::StrAppend(&lro_buf, d, ", ");
+    VLOG(1) << "local_ring_order " << lro_buf;
+
+    // Set up all of the fake device contexts.
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        string task_name = strings::StrCat("/job:worker/replica:0/task:", wi);
+        string dev_name = strings::StrCat(task_name, "/cpu:", di);
+        if (device_type == DEVICE_GPU) {
+          dev_name =
+              strings::StrCat(task_name, "/gpu:", di % gpu_devices_.size());
+        }
+        col_params_.instance.device_names.push_back(dev_name);
+        col_params_.instance.task_names.push_back(task_name);
+        // Normally each device would set is_local to its own perspective but
+        // this test runs in a single process so is_local is always true.
+        col_params_.task.is_local.push_back(true);
+        for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+          int rotated_di =
+              (di + col_params_.instance.impl_details.subdiv_offsets[sdi]) %
+              num_devices;
+          col_params_.instance.impl_details.subdiv_permutations[sdi].push_back(
+              wi * num_devices + local_ring_order[rotated_di]);
+        }
+      }
+    }
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        int rank = wi * num_devices + di;
+        instances_.push_back(new DeviceInstance(
+            rank, col_params_.instance.device_names[rank], device_type_, this));
+      }
+    }
+  }
+
+  void Gather(int fail_after) {
+    std::atomic<int> done(0);
+    for (auto di : instances_) {
+      SchedClosure([di, &done] {
+        di->DoGather();
+        ++done;
+      });
+      if (fail_after > 0) {
+        // Stagger the op execution starts.
+        Env::Default()->SleepForMicroseconds(100);
+      }
+    }
+    while (done < static_cast<int>(instances_.size())) {
+      if (stop_) break;
+      Env::Default()->SleepForMicroseconds(1000);
+    }
+  }
+
+  template <typename T>
+  void RunTest(DataType dtype, const DeviceType& device_type, int num_workers,
+               int num_devices, int num_subdivs, int tensor_len,
+               int fail_after) {
+    Init(num_workers, num_devices, dtype, device_type, num_subdivs, fail_after);
+    int32 output_len = tensor_len * num_workers * num_devices;
+    std::vector<T> expected(output_len, 0.0);
+    for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+      DeviceInstance* instance = instances_[di];
+      int32 instance_offset = di * tensor_len;
+      instance->InitTensor(dtype, TensorShape({tensor_len}),
+                           [instance_offset, &expected, dtype, di](Tensor* t) {
+                             for (size_t i = 0; i < t->NumElements(); ++i) {
+                               // The cast is necessary to prevent clang-tidy
+                               // from insisting that a faster non-open source
+                               // function be substituted.
+                               float value =
+                                   pow(10, static_cast<double>(di)) * i;
+                               if (dtype == DT_INT32 || dtype == DT_INT64) {
+                                 value = di * 10 + i;
+                               }
+                               t->flat<T>()(i) = static_cast<T>(value);
+                               expected[instance_offset + i] = value;
+                             }
+                           });
+    }
+    Gather(fail_after);
+    if (fail_after > 0) {
+      // Confirm that every device terminated with the expected error status.
+      for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+        EXPECT_EQ("Deliberate failure",
+                  instances_[di]->status_.error_message());
+      }
+    } else {
+      // Confirm that every device accumulated the same set of correct
+      // values.
+      for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+        TF_EXPECT_OK(instances_[di]->status_);
+        Tensor* inst = &instances_[di]->output_tensor_;
+        CHECK(inst);
+        Tensor actual(dtype, TensorShape({output_len}));
+        if (device_type_ == DEVICE_CPU) {
+          CHECK(actual.CopyFrom(*inst, inst->shape()));
+          VLOG(1) << "actual " << actual.SummarizeValue(100);
+        } else if (device_type_ == DEVICE_GPU) {
+          Notification note;
+          Device* dev = instances_[di]->device_;
+          auto* dev_info = dev->tensorflow_gpu_device_info();
+          CHECK(dev_info);
+          dev_info->default_context->CopyDeviceTensorToCPU(
+              inst, "" /*tensor_name*/, dev, &actual, [&note](const Status& s) {
+                CHECK(s.ok());
+                note.Notify();
+              });
+          note.WaitForNotification();
+        }
+
+        auto alias = actual.template unaligned_flat<T>();
+        for (int i = 0; i < output_len; ++i) {
+          switch (dtype) {
+            case DT_FLOAT:
+              EXPECT_FLOAT_EQ(expected[i], alias(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            case DT_DOUBLE:
+              EXPECT_DOUBLE_EQ(expected[i], alias(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            case DT_INT32:
+            case DT_INT64:
+              EXPECT_EQ(expected[i], alias(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            default:
+              LOG(FATAL) << "unimplemented";
+          }
+        }
+      }
+    }
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveGather(const CollectiveParams& params,
+                                                Tensor* input,
+                                                const DeviceType& device_type,
+                                                DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_gather_", gather_counter_++),
+        "CollectiveGather");
+    TF_CHECK_OK(builder.Attr("T", params.instance.data_type)
+                    .Attr("group_size", params.group.group_size)
+                    .Attr("group_key", params.group.group_key)
+                    .Attr("instance_key", params.instance.instance_key)
+                    .Attr("shape", params.instance.shape)
+                    .Input(FakeInput(params.instance.data_type))
+                    .Finalize(&node_def));
+    return GetKernel(node_def, device_type, device);
+  }
+
+  void RunSubdivPermsTest(
+      CollectiveParams* cp,
+      const std::vector<std::vector<int>>& expected_subdiv_perms,
+      const std::vector<int>& expected_subdiv_rank) {
+    col_exec_ = nullptr;
+    cp->instance.impl_details.subdiv_permutations.clear();
+    cp->subdiv_rank.clear();
+    // Create a stub ring gatherer only for testing param initialization.
+    RingGatherer gatherer;
+    TF_CHECK_OK(gatherer.InitializeCollectiveParams(cp));
+    EXPECT_EQ(expected_subdiv_perms,
+              cp->instance.impl_details.subdiv_permutations);
+    EXPECT_EQ(expected_subdiv_rank, cp->subdiv_rank);
+  }
+
+  class DeviceInstance {
+   public:
+    DeviceInstance(int rank, const string& dev_name,
+                   const DeviceType& device_type, RingGathererTest* parent)
+        : parent_(parent),
+          dev_name_(dev_name),
+          device_type_(device_type),
+          rank_(rank) {
+      TF_CHECK_OK(parent_->dev_mgr_->LookupDevice(dev_name, &device_))
+          << "Couldn't find device " << dev_name
+          << " existing devices: " << parent_->dev_mgr_->DebugString();
+      col_params_.name = parent_->col_params_.name;
+      col_params_.group.group_key = parent_->col_params_.group.group_key;
+      col_params_.group.device_type = parent_->col_params_.group.device_type;
+      col_params_.group.group_size = parent_->col_params_.group.group_size;
+      col_params_.instance = parent->col_params_.instance;
+      col_params_.task.is_local = parent_->col_params_.task.is_local;
+      col_params_.subdiv_rank = parent_->col_params_.subdiv_rank;
+
+      int num_subdivs = static_cast<int>(col_params_.subdiv_rank.size());
+      int group_size = col_params_.group.group_size;
+      CHECK_EQ(group_size,
+               static_cast<int>(col_params_.instance.device_names.size()));
+      // Id of this device is at rank position in first subdiv perm.
+      int my_device_id =
+          col_params_.instance.impl_details.subdiv_permutations[0][rank];
+      col_params_.default_rank = my_device_id;
+      // Set rank for all other subdivs by finding that device_id.
+      for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+        for (int r = 0; r < static_cast<int>(col_params_.instance.impl_details
+                                                 .subdiv_permutations[sdi]
+                                                 .size());
+             ++r) {
+          if (my_device_id ==
+              col_params_.instance.impl_details.subdiv_permutations[sdi][r]) {
+            col_params_.subdiv_rank[sdi] = r;
+            break;
+          }
+        }
+      }
+    }
+
+    void InitTensor(DataType dtype, const TensorShape& shape,
+                    const std::function<void(Tensor*)>& init_f) {
+      input_tensor_ =
+          Tensor(device_->GetAllocator(AllocatorAttributes()), dtype, shape);
+      if (device_type_ == DEVICE_CPU) {
+        init_f(&input_tensor_);
+      } else if (device_type_ == DEVICE_GPU) {
+        Tensor cpu_tensor(dtype, shape);
+        init_f(&cpu_tensor);
+        auto* dev_info = device_->tensorflow_gpu_device_info();
+        CHECK(dev_info);
+        Notification note;
+        dev_info->default_context->CopyCPUTensorToDevice(
+            &cpu_tensor, device_, &input_tensor_, [&note](const Status& s) {
+              CHECK(s.ok());
+              note.Notify();
+            });
+        note.WaitForNotification();
+      } else {
+        LOG(FATAL) << "Unsupported device_type " << device_type_;
+      }
+    }
+
+    void DoGather() {
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      op_params.step_id = kStepId;
+      op_params.device = device_;
+      gtl::InlinedVector<TensorValue, 4> inputs;
+      inputs.push_back(TensorValue(&input_tensor_));
+      op_params.inputs = &inputs;
+      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
+          {AllocatorAttributes()});
+      op_params.input_alloc_attrs = &input_aa;
+      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+      DeviceContext* dev_ctx = nullptr;
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      if (dev_info) {
+        dev_ctx = dev_info->default_context;
+        dev_ctx->Ref();
+      } else {
+        dev_ctx = new DeviceContext;
+      }
+      input_dc.push_back(dev_ctx);
+      op_params.input_device_contexts = &input_dc;
+      op_params.op_device_context = dev_ctx;
+      AllocatorAttributes generic_alloc_attr;
+      op_params.output_attr_array = &generic_alloc_attr;
+      std::unique_ptr<OpKernel> op = parent_->GetCollectiveGather(
+          col_params_, &input_tensor_, DEVICE_CPU, device_);
+      op_params.op_kernel = op.get();
+      OpKernelContext ctx(&op_params, 1);
+
+      // We never actually execute the kernel, so we need to do the output
+      // allocation it would do, ourselves.
+      Tensor* output_tensor_ptr = nullptr;
+      TensorShape output_shape({static_cast<int64>(
+          parent_->instances_.size() * input_tensor_.shape().num_elements())});
+      TF_CHECK_OK(ctx.forward_input_or_allocate_output({0}, 0, output_shape,
+                                                       &output_tensor_ptr));
+      CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0));
+      // Prepare a RingGatherer instance.
+      string exec_key =
+          strings::StrCat(col_params_.instance.instance_key, ":0:0");
+      RingGatherer gatherer;
+      CollectiveContext col_ctx(parent_->col_exec_, parent_->dev_mgr_.get(),
+                                &ctx, &op_params, col_params_, exec_key,
+                                kStepId, &input_tensor_, output_tensor_ptr);
+      TF_CHECK_OK(gatherer.InitializeCollectiveContext(&col_ctx));
+
+      // Run the all-gather.
+      gatherer.Run([this](Status s) { status_ = s; });
+      if (status_.ok()) {
+        CHECK(output_tensor_.CopyFrom(*ctx.mutable_output(0),
+                                      ctx.mutable_output(0)->shape()));
+      }
+
+      dev_ctx->Unref();
+    }
+
+    const Tensor& input_tensor() { return input_tensor_; }
+    const Tensor& output_tensor() { return output_tensor_; }
+
+    RingGathererTest* parent_;
+    string dev_name_;
+    DeviceType device_type_;
+    int rank_;
+    Tensor input_tensor_;
+    Tensor output_tensor_;
+    Device* device_;
+    CollectiveParams col_params_;
+    std::unique_ptr<CollectiveAdapter> ca_;
+    std::unique_ptr<OpKernelContext> ctx_;
+    Status status_;
+  };
+
+  bool stop_ = false;
+  DeviceType device_type_;
+  TestCollectiveExecutorMgr col_exec_mgr_;
+  CollectiveExecutor* col_exec_;
+  CollectiveRemoteAccessLocal* rma_;
+  std::unique_ptr<DeviceResolverLocal> dev_resolver_;
+  std::vector<DeviceInstance*> instances_;
+  CollectiveParams col_params_;
+  std::vector<std::unique_ptr<tensorflow::Device>> gpu_devices_;
+  std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
+  std::unique_ptr<string> gpu_ring_order_;
+  mutex mu_;
+  int32 gather_counter_ GUARDED_BY(mu_) = 0;
+};
+
+CollectiveParams SetUpCollectiveParams(const int num_devs_per_task,
+                                       const int num_tasks) {
+  CollectiveParams cp;
+  const int kNumDevs = num_devs_per_task * num_tasks;
+  cp.group.group_key = 1;
+  cp.group.group_size = kNumDevs;
+  cp.group.device_type = DeviceType("GPU");
+  cp.group.num_tasks = num_tasks;
+  cp.instance.instance_key = 3;
+  cp.instance.type = GATHER_COLLECTIVE;
+  cp.instance.data_type = DataType(DT_FLOAT);
+  cp.instance.shape = TensorShape({kNumDevs * kNumDevs});
+  cp.instance.impl_details.collective_name = "RingGather";
+  cp.instance.impl_details.subdiv_offsets.push_back(0);
+  cp.is_source = false;
+  for (int i = 0; i < kNumDevs; ++i) {
+    int task_id = i / num_devs_per_task;
+    int dev_id = i % num_devs_per_task;
+    string task_name = strings::StrCat("/job:worker/replica:0/task:", task_id);
+    string device_name = strings::StrCat(task_name, "/device:GPU:", dev_id);
+    cp.instance.task_names.push_back(task_name);
+    cp.instance.device_names.push_back(device_name);
+  }
+  return cp;
+}
+
+TEST_F(RingGathererTest, InitializeParams) {
+  const int kNumDevsPerTask = 8;
+  const int kNumTasks = 3;
+  CollectiveParams cp = SetUpCollectiveParams(kNumDevsPerTask, kNumTasks);
+
+  cp.default_rank = 0;
+  cp.instance.impl_details.subdiv_offsets = {};
+  RunSubdivPermsTest(&cp, {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                            12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
+                     {0});
+
+  cp.instance.impl_details.subdiv_offsets = {0};
+  RunSubdivPermsTest(&cp, {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                            12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
+                     {0});
+
+  cp.default_rank = 3;
+  cp.instance.impl_details.subdiv_offsets = {};
+  RunSubdivPermsTest(&cp, {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                            12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
+                     {3});
+}
+
+// TODO(b/113171733): change to use TEST_P.
+#define DEF_TEST(B, T, W, D, S, L, A)                                         \
+  TEST_F(RingGathererTest,                                                    \
+         DaTy##B##_DevTy##T##_Wkr##W##_Dev##D##_Sdiv##S##_Len##L##_Abrt##A) { \
+    DataType dtype = DT_##B;                                                  \
+    switch (dtype) {                                                          \
+      case DT_FLOAT: {                                                        \
+        RunTest<float>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      case DT_DOUBLE: {                                                       \
+        RunTest<double>(dtype, DEVICE_##T, W, D, S, L, A);                    \
+      } break;                                                                \
+      case DT_INT32: {                                                        \
+        RunTest<int32>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      case DT_INT64: {                                                        \
+        RunTest<int64>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      default:                                                                \
+        LOG(FATAL) << "Unimplemented";                                        \
+    }                                                                         \
+  }
+
+#ifndef GOOGLE_CUDA
+// Success tests
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 1, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 2, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 8, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 16, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 4, 1, 128, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 4096, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 0)
+DEF_TEST(FLOAT, CPU, 4, 4, 1, 32768, 0)
+DEF_TEST(DOUBLE, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(DOUBLE, CPU, 2, 8, 1, 4095, 0)
+DEF_TEST(INT32, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT32, CPU, 2, 8, 1, 4095, 0)
+DEF_TEST(INT64, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT64, CPU, 2, 8, 1, 4095, 0)
+
+// Failure tests
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 1)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 11)
+#endif
+
+#ifdef GOOGLE_CUDA
+// GPU tests.  So long as the device names are all in a single tasks we
+// bypass inter-worker routing code and can fake multiple GPUs with a single
+// GPU, from the perspective of the RingGatherer logic.  So these tests
+// are all single-worker.
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 1, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 2, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 8, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 16, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 4096, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 4095, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 32768, 0)
+DEF_TEST(FLOAT, GPU, 1, 4, 1, 32768, 0)
+DEF_TEST(DOUBLE, GPU, 1, 2, 1, 1001, 0)
+// INT32 values are never on the GPU.
+// DEF_TEST(INT32, GPU, 1, 1, 1, 1001, 0)
+DEF_TEST(INT64, GPU, 1, 2, 1, 1001, 0)
+
+// Failure tests
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 5)
+#endif
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
index 8ed2fc2f1c973467cfe881b4e5a0a0d870fc58fd..3328804cdfb00ecbbc473add3984b414add06b1e 100644
--- a/tensorflow/core/common_runtime/ring_reducer.cc
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -39,212 +39,15 @@ limitations under the License.
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/types.h"
 
-// Set true for greater intelligibility of debug mode log messages.
-#define READABLE_KEYS false
-// RingReduce algorithm exchanges chunks of tensor between devices.  The chunk
-// size depends on the number of subdivisions specified in the algorithm.  If
-// the user does not specify the number of subdivisions, we infer the number
-// dynamically so that the resulting chunk size does not exceed
-// kMaxChunkSizeBytes, empirically set at 4 MiB.
-constexpr size_t kMaxChunkSizeBytes = (4 * 1024 * 1024);
-// kMaxSubdivsPerDev is used to give an upper bound on the number of
-// subdivisions dynamically generated.  A reasonable value would be a small
-// multiple of the number of NICs adjacent to each device.
-constexpr int kMaxSubdivsPerDevice = 2;
-
 namespace tensorflow {
-namespace {
-// Each CollectiveOp implementation is free to define its own
-// BufRendezvous key format.  This function produces the key used by
-// RingReducer.
-string RingReduceBufKey(const string& exec_key, int pass, int section,
-                        int source_rank) {
-  if (READABLE_KEYS) {
-    return strings::StrCat("rred(", exec_key, "):pass(", pass, "):section(",
-                           section, "):srcrank(", source_rank, ")");
-  } else {
-    // TODO(b/78352018): Try out some kind of denser encoding, e.g. 128 bit
-    // hash.
-    return strings::StrCat(exec_key, ":", pass, ":", section, ":", source_rank);
-  }
-}
-
-}  // namespace
-
-void RingReducer::PCQueue::Enqueue(RingField* rf) {
-  mutex_lock l(pcq_mu_);
-  deque_.push_back(rf);
-  if (waiter_count_ > 0) {
-    cv_.notify_one();
-  }
-}
-
-RingReducer::RingField* RingReducer::PCQueue::Dequeue() {
-  mutex_lock l(pcq_mu_);
-  if (deque_.empty()) {
-    ++waiter_count_;
-    while (deque_.empty()) {
-      cv_.wait(l);
-    }
-    --waiter_count_;
-  }
-  RingField* rf = deque_.front();
-  deque_.pop_front();
-  return rf;
-}
-
-RingReducer::RingReducer()
-    : col_ctx_(nullptr),
-      col_params_(nullptr),
-      done_(nullptr),
-      group_size_(-1),
-      num_subdivs_(-1) {}
 
 RingReducer::~RingReducer() { group_size_tensor_ready_.WaitForNotification(); }
 
-Status GenerateSubdivsInCollectiveParams(CollectiveParams* col_params) {
-  if (col_params->instance.shape.num_elements() == 0) {
-    return errors::Internal("shape in CollectiveParams should be non-empty");
-  }
-  const int kAvgDevPerTask =
-      col_params->group.group_size / col_params->group.num_tasks;
-  const int kMaxNumSubdivs = kMaxSubdivsPerDevice * kAvgDevPerTask;
-  if (kMaxNumSubdivs <= 0) {
-    return errors::Internal("Unexpected kMaxNumSubdivs ", kMaxNumSubdivs,
-                            " in RingReducer");
-  }
-  // NOTE(ayushd): If no subdiv_offsets have been specified, dynamically add
-  // as many offsets as needed so that the size of tensor chunks <=
-  // kMaxChunkSizeBytes.  Empirically, chunks that are too small or too large
-  // lead to worse performance.
-  int num_subdivs = 0;
-  const size_t tensor_size = col_params->instance.shape.num_elements() *
-                             DataTypeSize(col_params->instance.data_type);
-  size_t chunk_size;
-  do {
-    ++num_subdivs;
-    int num_chunks = col_params->group.group_size * num_subdivs;
-    chunk_size = tensor_size / num_chunks;
-    VLOG(2) << "num_subdivs " << num_subdivs << " num_chunks " << num_chunks
-            << " chunk_size " << chunk_size;
-  } while (chunk_size > kMaxChunkSizeBytes && num_subdivs < kMaxNumSubdivs);
-  if (num_subdivs <= 0) {
-    return errors::Internal("Unexpected num_subdivs ", num_subdivs,
-                            " in RingReducer");
-  }
-
-  int subdiv_stride = kAvgDevPerTask / num_subdivs;
-  if (subdiv_stride == 0) subdiv_stride = 1;
-  col_params->instance.impl_details.subdiv_offsets.reserve(num_subdivs);
-  for (int sdi = 0; sdi < num_subdivs; ++sdi) {
-    int subdiv_offset = subdiv_stride * sdi;
-    if (sdi % 2 == 1) subdiv_offset *= -1;
-    col_params->instance.impl_details.subdiv_offsets.push_back(subdiv_offset);
-  }
-
-  if (VLOG_IS_ON(2)) {
-    string subdiv_buf;
-    for (const int subdiv_offset :
-         col_params->instance.impl_details.subdiv_offsets) {
-      strings::StrAppend(&subdiv_buf, " ", subdiv_offset);
-    }
-    VLOG(2) << "Dynamically generated " << num_subdivs
-            << " subdiv_offsets:" << subdiv_buf << " tensor_size "
-            << tensor_size << " chunk_size " << chunk_size;
-  }
-
-  return Status::OK();
-}
-
 Status RingReducer::InitializeCollectiveParams(CollectiveParams* col_params) {
   // TODO(b/113171733): change CHECKs to return errors.
   CHECK_EQ(col_params->instance.type, REDUCTION_COLLECTIVE);
   CHECK_EQ(col_params->instance.impl_details.collective_name, "RingReduce");
-  const string& device_name =
-      col_params->instance.device_names[col_params->default_rank];
-  // Each subdiv permutation is a ring formed by rotating each
-  // single-task subsequence of devices by an offset.  This makes most
-  // sense when each task has the same number of devices but we can't
-  // depend on that being the case so we'll compute something that
-  // works in any case.
-
-  // Start by counting the devices in each task.
-  // Precondition: device_names must be sorted so that all devices in
-  // the same task are adjacent.
-  VLOG(2) << "Sorted task names: "
-          << str_util::Join(col_params->instance.task_names, ", ");
-  std::vector<int> dev_per_task;
-  const string* prior_task_name = &col_params->instance.task_names[0];
-  int dev_count = 1;
-  for (int di = 1; di < col_params->group.group_size; ++di) {
-    if (col_params->instance.task_names[di] != *prior_task_name) {
-      dev_per_task.push_back(dev_count);
-      dev_count = 1;
-      prior_task_name = &col_params->instance.task_names[di];
-    } else {
-      ++dev_count;
-    }
-  }
-  dev_per_task.push_back(dev_count);
-  CHECK_EQ(col_params->group.num_tasks, dev_per_task.size());
-
-  if (col_params->instance.impl_details.subdiv_offsets.empty()) {
-    TF_RETURN_IF_ERROR(GenerateSubdivsInCollectiveParams(col_params));
-  }
-
-  // Generate a ring permutation for requested offset.
-  VLOG(2) << "Setting up perms for col_params " << col_params
-          << " subdiv_permutations "
-          << &col_params->instance.impl_details.subdiv_permutations;
-  col_params->instance.impl_details.subdiv_permutations.resize(
-      col_params->instance.impl_details.subdiv_offsets.size());
-  col_params->subdiv_rank.resize(
-      col_params->instance.impl_details.subdiv_offsets.size(), -1);
-  for (int sdi = 0;
-       sdi < col_params->instance.impl_details.subdiv_offsets.size(); ++sdi) {
-    std::vector<int>& perm =
-        col_params->instance.impl_details.subdiv_permutations[sdi];
-    CHECK_EQ(perm.size(), 0);
-    int offset = col_params->instance.impl_details.subdiv_offsets[sdi];
-    // A negative subdivision offset is interpreted as follows:
-    //  1. Reverse the local device ordering.
-    //  2. Begin the subdivision at abs(offset) in the reversed ordering.
-    bool reverse = false;
-    if (offset < 0) {
-      offset = abs(offset);
-      reverse = true;
-    }
-    int prior_dev_count = 0;  // sum over prior worker device counts
-    for (int ti = 0; ti < col_params->group.num_tasks; ++ti) {
-      for (int di = 0; di < dev_per_task[ti]; ++di) {
-        int di_offset = (di + offset) % dev_per_task[ti];
-        int offset_di =
-            reverse ? (dev_per_task[ti] - (di_offset + 1)) : di_offset;
-        // Device index in global subdivision permutation.
-        int permuted_di = prior_dev_count + offset_di;
-        int rank = static_cast<int>(perm.size());
-        perm.push_back(permuted_di);
-        if (col_params->instance.device_names[permuted_di] == device_name) {
-          CHECK_EQ(permuted_di, col_params->default_rank);
-          col_params->subdiv_rank[sdi] = rank;
-        }
-      }
-      prior_dev_count += dev_per_task[ti];
-    }
-    CHECK_EQ(col_params->group.group_size, perm.size());
-  }
-
-  VLOG(2) << collective_util::SubdivPermDebugString(*col_params);
-  return Status::OK();
-}
-
-Status RingReducer::InitializeCollectiveContext(CollectiveContext* col_ctx) {
-  CHECK(col_ctx->dev_mgr);
-  col_ctx_ = col_ctx;
-  col_params_ = &col_ctx->col_params;
-  return collective_util::InitializeDeviceAndLocality(
-      col_ctx->dev_mgr, col_ctx->device_name, &col_ctx->device,
-      &col_ctx->device_locality);
+  return RingAlg::InitializeCollectiveParams(col_params);
 }
 
 void RingReducer::Run(StatusCallback done) {
@@ -303,25 +106,6 @@ void RingReducer::Run(StatusCallback done) {
   ContinueAfterInputCopy();
 }
 
-string RingReducer::TensorDebugString(const Tensor& tensor) {
-  const DeviceBase::GpuDeviceInfo* gpu_device_info =
-      col_ctx_->op_ctx->device()->tensorflow_gpu_device_info();
-  if (gpu_device_info) {
-    Tensor cpu_tensor(tensor.dtype(), tensor.shape());
-    Notification note;
-    gpu_device_info->default_context->CopyDeviceTensorToCPU(
-        &tensor, "" /*tensor_name*/, col_ctx_->device, &cpu_tensor,
-        [&note](const Status& s) {
-          CHECK(s.ok());
-          note.Notify();
-        });
-    note.WaitForNotification();
-    return cpu_tensor.SummarizeValue(64);
-  } else {
-    return tensor.SummarizeValue(64);
-  }
-}
-
 // Note that this function is blocking and must not run in any thread
 // which cannot be blocked.
 void RingReducer::ContinueAfterInputCopy() {
@@ -358,170 +142,16 @@ void RingReducer::ContinueAfterInputCopy() {
   Finish(RunAsyncParts());
 }
 
-void RingReducer::StartAbort(const Status& s) {
-  // In abort mode we stop issuing additional ProvideBuf
-  // and ConsumeBuf calls, but we need to wait for all of the
-  // outstanding callbacks to be invoked before quitting.
-  bool abort_started = false;
-  {
-    mutex_lock l(status_mu_);
-    if (status_.ok()) {
-      LOG(ERROR) << "Aborting RingReduce with " << s;
-      abort_started = true;
-      status_.Update(s);
-    }
-  }
-  // If this is the initial entry to abort mode then invoke StartAbort
-  // on the CollectiveExecutor that invoked us.  That should start
-  // cancellation on all of the outstanding CollectiveRemoteAccess
-  // actions.
-  if (abort_started) {
-    col_ctx_->col_exec->StartAbort(s);
-  }
-}
-
-void RingReducer::Finish(bool ok) {
-  if (ok) {
-    // Recover the output from the adaptor.
-    ca_->ConsumeFinalValue(col_ctx_->output);
-  }
-  Status s;
-  {
-    mutex_lock l(status_mu_);
-    s = status_;
-  }
-  rfv_.clear();  // Give up Refs on output tensor.
-  done_(s);
-}
-
-// At the beginning of the algorithm initialize a RingField struct for
-// every independent field of the tensor.
 void RingReducer::InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
                                 int field_idx) {
-  // Note on field indexing: There are group_size_ devices in the
-  // instance, implying the same number of chunks per tensor, where a
-  // chunk is the unit of data transferred in a time step.  However, if
-  // a device can simultaneously send data by 2 or more independent
-  // channels we can speed up the transfer by subdividing chunks and
-  // processing multiple subdivisions at once.  So the actual number
-  // of RingFields is group_size_ * num_subdivs_.
-  DCHECK_EQ(field_idx, (chunk_idx * num_subdivs_) + subdiv_idx);
-  rf->chunk_idx = chunk_idx;
-  rf->subdiv_idx = subdiv_idx;
-  rf->sc_idx = field_idx;
-  rf->rank = col_params_->subdiv_rank[subdiv_idx];
-  rf->second_pass = false;
-  rf->action = RF_INIT;
-  // Recv from the device with preceding rank within the subdivision.
-  int recv_from_rank = (rf->rank + (group_size_ - 1)) % group_size_;
-  int send_to_rank = (rf->rank + 1) % group_size_;
-  rf->recv_dev_idx = col_params_->instance.impl_details
-                         .subdiv_permutations[subdiv_idx][recv_from_rank];
-  int send_dev_idx = col_params_->instance.impl_details
-                         .subdiv_permutations[subdiv_idx][send_to_rank];
-  rf->recv_is_remote = !col_params_->task.is_local[rf->recv_dev_idx];
-  rf->send_is_remote = !col_params_->task.is_local[send_dev_idx];
-  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
-    // In pass 0 we skip Recv when rank = chunk_idx
-    rf->do_recv = (rf->chunk_idx != rf->rank);
-    // In pass 0 we skip Send when rank = chunk_idx-1
-    rf->do_send =
-        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
-  }
-  rf->is_final =
-      (rf->rank == ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
-  if (rf->do_send || rf->do_recv) {
-    rf->chunk = ca_->ChunkAlias(rf->sc_idx);
-    CHECK(rf->chunk.IsAligned()) << rf->DebugString();
-  }
+  RingAlg::InitRingField(rf, chunk_idx, subdiv_idx, field_idx);
   if (rf->do_recv) {
     rf->tmp_chunk = ca_->TempChunk(rf->sc_idx);
-    CHECK(rf->tmp_chunk.IsAligned()) << rf->DebugString();
-  }
-  VLOG(2) << this << " InitRingField " << rf->DebugString() << " chunk "
-          << ca_->TBounds(rf->chunk);
-}
-
-// When a RingField transitions from first to second recompute the
-// do_send and do_recv values.
-void RingReducer::AdvanceToSecondPass(RingField* rf) {
-  VLOG(3) << "IncrRingField old value " << rf->DebugString();
-  CHECK(!rf->second_pass);
-  rf->second_pass = true;
-  rf->action = RF_INIT;
-  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
-    // In pass 1 the send/no-send boundary moves down 1 place.
-    rf->do_recv =
-        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
-    rf->do_send =
-        (rf->rank != ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
-  }
-  rf->is_final =
-      (rf->rank == ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
-  VLOG(3) << "IncrRingField new value " << rf->DebugString();
-}
-
-string RingReducer::RingField::DebugString() const {
-  string rv = strings::StrCat("RingField rank=", rank, " chunk_idx=", chunk_idx,
-                              " subdiv=", subdiv_idx, " sc_idx=", sc_idx,
-                              " action=", action);
-  strings::StrAppend(&rv, " pass=", second_pass);
-  strings::StrAppend(&rv, " do_send=", do_send, " do_recv=", do_recv,
-                     " is_final=", is_final, " recv_is_remote=", recv_is_remote,
-                     " recv_dev_idx=", recv_dev_idx, " sc_idx=", sc_idx);
-  return rv;
-}
-
-void RingReducer::DispatchSend(RingField* rf, const StatusCallback& done) {
-  CHECK(rf->do_send);
-  string send_buf_key = RingReduceBufKey(col_ctx_->exec_key, rf->second_pass,
-                                         rf->sc_idx, rf->rank);
-  VLOG(3) << "DispatchSend rank=" << col_params_->default_rank << " send key "
-          << send_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " sc_idx "
-          << rf->sc_idx;
-  int send_to_rank = (rf->rank + 1) % group_size_;
-  int send_to_dev_idx = col_params_->instance.impl_details
-                            .subdiv_permutations[rf->subdiv_idx][send_to_rank];
-  col_ctx_->col_exec->PostToPeer(
-      col_params_->instance.device_names[send_to_dev_idx],
-      col_params_->instance.task_names[send_to_dev_idx], send_buf_key,
-      col_ctx_->device, col_ctx_->op_ctx->op_device_context(),
-      col_ctx_->op_ctx->output_alloc_attr(0), &rf->chunk,
-      col_ctx_->device_locality, done);
-}
-
-void RingReducer::DispatchRecv(RingField* rf, const StatusCallback& done) {
-  CHECK(rf->do_recv);
-  string recv_buf_key =
-      RingReduceBufKey(col_ctx_->exec_key, rf->second_pass, rf->sc_idx,
-                       (rf->rank + (group_size_ - 1)) % group_size_);
-  VLOG(3) << "DispatchRecv rank=" << col_params_->default_rank << " recv key "
-          << recv_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " into "
-          << ((col_params_->merge_op != nullptr) ? "tmp_chunk" : "chunk");
-  Tensor* dst_tensor = (!rf->second_pass && (col_params_->merge_op != nullptr))
-                           ? &rf->tmp_chunk
-                           : &rf->chunk;
-  col_ctx_->col_exec->RecvFromPeer(
-      col_params_->instance.device_names[rf->recv_dev_idx],
-      col_params_->instance.task_names[rf->recv_dev_idx],
-      col_params_->task.is_local[rf->recv_dev_idx], recv_buf_key,
-      col_ctx_->device, col_ctx_->op_ctx->op_device_context(),
-      col_ctx_->op_ctx->output_alloc_attr(0), dst_tensor,
-      col_ctx_->device_locality, rf->subdiv_idx, done);
-}
-
-string RingReducer::FieldState() {
-  string s = strings::StrCat(
-      "RingReducer ", strings::Hex(reinterpret_cast<uint64>(this)), " exec ",
-      col_ctx_->exec_key, " step_id=", col_ctx_->step_id, " state of all ",
-      rfv_.size(), " fields:");
-  for (int i = 0; i < rfv_.size(); ++i) {
-    s.append("\n");
-    s.append(rfv_[i].DebugString());
   }
-  return s;
 }
 
+// At the beginning of the algorithm initialize a RingField struct for
+// every independent field of the tensor.
 bool RingReducer::RunAsyncParts() {
   // This function orchestrates RingReduce actions on behalf of a
   // single device. It is entered by a blockable thread that
diff --git a/tensorflow/core/common_runtime/ring_reducer.h b/tensorflow/core/common_runtime/ring_reducer.h
index a5aa8fad70caa27e2c3f1f2d6b50a9ec843b8f07..a681fabd2bdf1c7e3765ede3098ebb5bf596a881 100644
--- a/tensorflow/core/common_runtime/ring_reducer.h
+++ b/tensorflow/core/common_runtime/ring_reducer.h
@@ -21,108 +21,36 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/ring_alg.h"
 #include "tensorflow/core/framework/collective.h"
 
 namespace tensorflow {
 class Device;
 
 // Ring-algorithm implementation of collective all-reduce.
-class RingReducer : public CollectiveImplementationInterface {
+class RingReducer : public RingAlg {
  public:
-  RingReducer();
+  RingReducer() : RingAlg(REDUCTION_COLLECTIVE, "Reduce") {}
   ~RingReducer() override;
 
-  // Establishes the requested number of subdivision permutations based on the
-  // ring order implicit in the device order.
-  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
-
-  // Initializes members of CollectiveContext not yet initialized, i.e. device
-  // and device_locality.  Also saves the CollectiveContext in this object.
-  Status InitializeCollectiveContext(CollectiveContext* col_ctx) override;
-
-  // No-op for ring reducer.
-  Status InitializeInstanceBeforeGroupDiscovery(CollectiveParams*) override {
-    return Status::OK();
-  }
-
   // Begins async execution of the ring reduce algorithm.
   // Must be called in a blockable thread.
   // TODO(b/80529858): remove the previous warning when we have a dedicated
   // collective threadpool.
   void Run(StatusCallback done) override;
 
- private:
-  // Called when a bad status is received that implies we should terminate
-  // execution and return a bad status.
-  void StartAbort(const Status& s);
-  void ContinueAfterInputCopy();
-  void Finish(bool ok);
-  bool RunAsyncParts();
-
-  // Current status of a RingField
-  enum RingFieldAction {
-    RF_INIT = 0,    // Just initialized for a pass
-    RF_RECV,        // Recv pending
-    RF_REDUCE,      // Reduce pending
-    RF_FINALIZE,    // FinalOp pending
-    RF_SEND_READY,  // Ready to send
-    RF_SEND,        // Send pending
-    RF_DONE,        // No more work
-  };
+  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
 
-  // Tracks progress of actions on a single subfield of the entire tensor.
-  struct RingField {
-    int16 chunk_idx;     // major division index
-    int16 subdiv_idx;    // minor division index
-    int16 sc_idx;        // subchunk index
-    int16 rank;          // rank within subdiv permutation
-    int16 recv_dev_idx;  // dev from which value should be recv'd
-    RingFieldAction action;
-    bool second_pass;
-    bool recv_is_remote = false;
-    bool send_is_remote = false;
-    bool do_send = false;   // is the value sent in this pass?
-    bool do_recv = false;   // is the value recv'd in this pass?
-    bool is_final = false;  // is the last field in the pass for this rank
-    Tensor chunk;           // alias to field values
-    Tensor tmp_chunk;
-    Status status;
-    string DebugString() const;
-  };
-  void AdvanceToSecondPass(RingField* rf);
+ protected:
   void InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
-                     int field_idx);
-  void DispatchSend(RingField* rf, const StatusCallback& done);
-  void DispatchRecv(RingField* rf, const StatusCallback& done);
-
-  // For constructing log messages for debugging.
-  string FieldState();
-  string TensorDebugString(const Tensor& tensor);
-
-  // Producer/Consumer Queue of RingField structs.
-  class PCQueue {
-   public:
-    void Enqueue(RingField* rf);
-    RingField* Dequeue();
+                     int field_idx) override;
 
-   private:
-    mutex pcq_mu_;
-    condition_variable cv_;
-    int waiter_count_ GUARDED_BY(pcq_mu_) = 0;
-    std::deque<RingField*> deque_ GUARDED_BY(pcq_mu_);
-  };
+ private:
+  void ContinueAfterInputCopy();
+  bool RunAsyncParts();
 
-  CollectiveContext* col_ctx_;          // Not owned
-  const CollectiveParams* col_params_;  // Not owned
-  StatusCallback done_;
-  int group_size_;
-  int num_subdivs_;
   Tensor group_size_tensor_;
   Notification group_size_tensor_ready_;
-  std::unique_ptr<CollectiveAdapter> ca_;
-  mutex status_mu_;
-  Status status_ GUARDED_BY(status_mu_);
-  std::vector<RingField> rfv_;
 
   friend class RingReducerTest;
 };
diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
index 7feb29a6dbbb17d73967344ad07db9d234411840..7f18cdb5e2caec7690c8f96c6deb32319acb2e10 100644
--- a/tensorflow/core/common_runtime/ring_reducer_test.cc
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -335,19 +335,20 @@ class RingReducerTest : public ::testing::Test {
           note.WaitForNotification();
         }
 
+        auto alias = actual.template unaligned_flat<T>();
         for (int i = 0; i < tensor_len; ++i) {
           switch (dtype) {
             case DT_FLOAT:
-              EXPECT_FLOAT_EQ(expected[i], actual.template flat<T>()(i))
+              EXPECT_FLOAT_EQ(expected[i], alias(i))
                   << "Mismatch at device " << di << " index " << i;
               break;
             case DT_DOUBLE:
-              EXPECT_DOUBLE_EQ(expected[i], actual.template flat<T>()(i))
+              EXPECT_DOUBLE_EQ(expected[i], alias(i))
                   << "Mismatch at device " << di << " index " << i;
               break;
             case DT_INT32:
             case DT_INT64:
-              EXPECT_EQ(expected[i], actual.template flat<T>()(i))
+              EXPECT_EQ(expected[i], alias(i))
                   << "Mismatch at device " << di << " index " << i;
               break;
             default:
diff --git a/tensorflow/core/platform/default/string_coding.cc b/tensorflow/core/common_runtime/shared_counter.h
similarity index 58%
rename from tensorflow/core/platform/default/string_coding.cc
rename to tensorflow/core/common_runtime/shared_counter.h
index 7410ee67820a384e4843a57386b110e40a7e0680..5e378524b203b1b1089fe6836a57d1effb961db5 100644
--- a/tensorflow/core/platform/default/string_coding.cc
+++ b/tensorflow/core/common_runtime/shared_counter.h
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,19 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
-#include "tensorflow/core/platform/default/string_coding.h"
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SHARED_COUNTER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SHARED_COUNTER_H_
 
 namespace tensorflow {
-namespace port {
-
-std::unique_ptr<StringListEncoder> NewStringListEncoder(string* out) {
-  return std::unique_ptr<StringListEncoder>(new StringListEncoder(out));
-}
+// A lightweight thread-safe monotone counter for establishing
+// temporal ordering.
+class SharedCounter {
+ public:
+  int64 get() { return value_; }
+  int64 next() { return ++value_; }
 
-std::unique_ptr<StringListDecoder> NewStringListDecoder(const string& in) {
-  return std::unique_ptr<StringListDecoder>(new StringListDecoder(in));
-}
+ private:
+  std::atomic<int64> value_{0};
+};
 
-}  // namespace port
 }  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SHARED_COUNTER_H_
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index 49265445659ff1daa30b632f60c03845d4a6a7f7..318cfec21a8be19bdad362c45b11398c33438bbb 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -176,9 +176,10 @@ void NodeExecStatsWrapper::AddAllocation(
   memory->set_peak_bytes(std::get<1>(sizes));
   memory->set_live_bytes(std::get<2>(sizes));
 
-  AllocatorStats stats;
-  allocator->GetStats(&stats);
-  memory->set_allocator_bytes_in_use(stats.bytes_in_use);
+  absl::optional<AllocatorStats> stats = allocator->GetStats();
+  if (stats) {
+    memory->set_allocator_bytes_in_use(stats->bytes_in_use);
+  }
   allocations_.push_back(std::make_pair(memory, tracking_allocator));
 }
 
@@ -409,6 +410,21 @@ void StepStatsCollector::Save(const string& device,
   }
 }
 
+void StepStatsCollector::SaveThreadName(const string& device,
+                                        const uint32 thread_id,
+                                        const string& thread_name) {
+  VLOG(1) << "Save dev " << device << " thread id " << thread_id << " name "
+          << thread_name;
+  {
+    mutex_lock l(mu_);
+    if (finalized_) {
+      LOG(WARNING) << "thread_name saved after finalize will not be collected.";
+    }
+    auto& thread_names_map = thread_names_[device];
+    thread_names_map[thread_id] = thread_name;
+  }
+}
+
 NodeExecStatsInterface* StepStatsCollector::CreateNodeExecStats(
     const Node* node) {
   // Only collect statistics for non-transfer nodes.
@@ -531,5 +547,15 @@ void StepStatsCollector::FinalizeInternal() {
       stats->stats()->Swap(dss->add_node_stats());
     }
   }
+  for (const auto& device_thread : thread_names_) {
+    if (dev_stats_pb.find(device_thread.first) == dev_stats_pb.end()) {
+      // skip device without DeviceStepStats.
+      continue;
+    }
+    DeviceStepStats* dss = dev_stats_pb.at(device_thread.first);
+    for (const auto& thread_name : device_thread.second) {
+      (*dss->mutable_thread_names())[thread_name.first] = thread_name.second;
+    }
+  }
 }
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/step_stats_collector.h b/tensorflow/core/common_runtime/step_stats_collector.h
index 7d34383ce8209c9f4b889410a96bce02f6702a64..dfcc51ff4c79b386c327dcf9503c7ee35b20d2c4 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.h
+++ b/tensorflow/core/common_runtime/step_stats_collector.h
@@ -175,6 +175,10 @@ class StepStatsCollector : public StepStatsCollectorInterface {
   void Save(const string& device, NodeExecStats* node_stats_pb);
   void Save(const string& device, NodeExecStatsWrapper* node_stats);
 
+  // Saves thread name.
+  void SaveThreadName(const string& device, const uint32 thread_id,
+                      const string& thread_name);
+
   NodeExecStatsInterface* CreateNodeExecStats(const Node* node) override;
   string ReportAllocsOnResourceExhausted(const string& err) override;
 
@@ -191,12 +195,14 @@ class StepStatsCollector : public StepStatsCollectorInterface {
   static const uint64 kMaxCollectedNodes = 1 << 20;
 
   typedef std::vector<std::unique_ptr<NodeExecStatsWrapper>> NodeStatsVector;
+  typedef std::unordered_map<uint32, string> ThreadNamesMap;
 
   void FinalizeInternal() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   mutex mu_;
   bool finalized_ GUARDED_BY(mu_);
   std::unordered_map<string, NodeStatsVector> dev_stats_ GUARDED_BY(mu_);
+  std::unordered_map<string, ThreadNamesMap> thread_names_ GUARDED_BY(mu_);
   StepStats* step_stats_ GUARDED_BY(mu_);
   uint64 collected_nodes_ GUARDED_BY(mu_) = 0;
 };
diff --git a/tensorflow/core/debug/debug_graph_utils.cc b/tensorflow/core/debug/debug_graph_utils.cc
index 5fc95a8f20d2b3f1b37a660e17d0efee17aacb94..b69eb1da39e68cc470d0d64c69c28ce1a3f6477c 100644
--- a/tensorflow/core/debug/debug_graph_utils.cc
+++ b/tensorflow/core/debug/debug_graph_utils.cc
@@ -299,7 +299,7 @@ Status DebugNodeInserter::CreateCopyNode(
 
   auto builder = NodeDefBuilder(copy_node_name, copy_op_name)
                      .Input(src_node_name, src_output, src_dt)
-                     .Attr("debug_ops_spec", std::move(debug_ops_spec));
+                     .Attr("debug_ops_spec", debug_ops_spec);
 
   if (!builder.Finalize(&node_def).ok()) {
     return Status(
diff --git a/tensorflow/core/debug/debug_grpc_testlib.cc b/tensorflow/core/debug/debug_grpc_testlib.cc
index f70931e926507c72287588da278a3b8d6bb19122..4927caf5a3285a3855d27b614bf597943059e2fb 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.cc
+++ b/tensorflow/core/debug/debug_grpc_testlib.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/debug/debug_graph_utils.h"
 #include "tensorflow/core/debug/debugger_event_metadata.pb.h"
 #include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index 6994dec3b59d997650e07ba9a6fd14233022b201..ebcb046003437eb9fab452c5337204cb249c510c 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/debug/debugger_event_metadata.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -730,7 +731,7 @@ Status DebugGrpcChannel::Connect(const int64 timeout_micros) {
   ::grpc::ChannelArguments args;
   args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
   // Avoid problems where default reconnect backoff is too long (e.g., 20 s).
-  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 1000);
   channel_ = ::grpc::CreateCustomChannel(
       server_stream_addr_, ::grpc::InsecureChannelCredentials(), args);
   if (!channel_->WaitForConnected(
diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc
index 82e0ae5edb1eccd35c7c76da0a8a2ee9ea12d9fd..0926a82fade31904376fac277273b20b13367167 100644
--- a/tensorflow/core/debug/debug_io_utils_test.cc
+++ b/tensorflow/core/debug/debug_io_utils_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/debug/debug_node_key.h"
 #include "tensorflow/core/debug/debugger_event_metadata.pb.h"
 #include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index da88f9cb93e752248d48b381f352987bf7da86da..af744ce790bf5d39895a41fff8f77650f7adc19a 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -17,7 +17,6 @@ filegroup(
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_library")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
 load("//tensorflow:tensorflow.bzl", "tf_copts")
 
 # For platform specific build config
@@ -298,6 +297,7 @@ cc_library(
     deps = [
         ":call_options",
         ":message_wrappers",
+        ":request_id",
         "//tensorflow/core:lib",
         "//tensorflow/core:master_proto_cc",
     ],
@@ -311,6 +311,7 @@ cc_library(
         ":call_options",
         ":master_env",
         ":master_session",
+        ":recent_request_ids",
         ":remote_device",
         ":worker_cache",
         ":worker_interface",
@@ -425,7 +426,6 @@ cc_library(
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:metrics",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/debug",
@@ -644,7 +644,7 @@ tf_cuda_cc_test(
         "manual",  # TODO(b/27683709): Re-enable when not flaky.
         "notap",  # TODO(b/27683709): Re-enable when not flaky.
         "noguitar",  # TODO(b/27683709): Re-enable when not flaky.
-        "nooss",  # TODO(b/27683709): Re-enable when not flaky.
+        "no_oss",  # TODO(b/27683709): Re-enable when not flaky.
     ],
     deps = [
         ":master",
@@ -765,6 +765,7 @@ cc_library(
     srcs = ["recent_request_ids.cc"],
     hdrs = ["recent_request_ids.h"],
     deps = [
+        ":message_wrappers",
         "//tensorflow/core:lib",
         "//tensorflow/core:worker_proto_cc",
     ],
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index de6e4b4a7c51379f6492314de3dc8c69f424c769..a642313275d01f2575575fcb17de8a496cf09239 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -293,8 +293,11 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed,
                                      const Rendezvous::Args& recv_args,
                                      DoneCallback done) {
   VLOG(1) << "RemoteRendezvous Recv " << this << " " << parsed.FullKey();
-  CHECK(is_initialized()) << "RecvAsync called when uninitialized.";
   Status s = ValidateDevices(parsed, false /*!is_src*/);
+  if (s.ok() && !is_initialized()) {
+    s.Update(errors::Internal(
+        "RecvAsync called when uninitialized (key:", parsed.FullKey(), ")."));
+  }
   if (!s.ok()) {
     done(s, Args(), recv_args, Tensor(), false);
     return;
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
index 9f94a24fcdda5ee8f3033a05f44b72540baf4cca..443759ab740b99860e9d50a6a112a2a054d39f1a 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
@@ -82,15 +82,21 @@ CollectiveParamResolverDistributed::CollectiveParamResolverDistributed(
     const ConfigProto& config, const DeviceMgr* dev_mgr,
     DeviceResolverDistributed* dev_resolver, WorkerCacheInterface* worker_cache,
     const string& task_name)
-    : CollectiveParamResolverLocal(dev_mgr, dev_resolver, task_name),
+    : CollectiveParamResolverLocal(config, dev_mgr, dev_resolver, task_name),
       worker_cache_(worker_cache),
       group_leader_(task_name == config.experimental().collective_group_leader()
                         ? ""
-                        : config.experimental().collective_group_leader()) {}
+                        : config.experimental().collective_group_leader()) {
+  VLOG(1) << "CompleteParamResolverDistributed ctor task={" << task_name
+          << "} config.collective_group_leader={"
+          << config.experimental().collective_group_leader() << "}";
+}
 
 void CollectiveParamResolverDistributed::CompleteParamsAsync(
     const string& device, CollectiveParams* cp, CancellationManager* cancel_mgr,
     const StatusCallback& done) {
+  VLOG(1) << "CompleteParams distributed " << device << " for " << cp << ": "
+          << cp->ToString();
   CompleteGroupDistributed(device, cp, cancel_mgr,
                            [this, device, cp, cancel_mgr, done](
                                const Status& s, const GroupRec* gr) {
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index 1065f021a1b0f97dc955e2b00ff333976575b519..144113a04309d8dde8b8eebdce4485f828732595 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -356,6 +356,12 @@ Status GraphMgr::RecvOutputs(const int64 step_id, NamedTensors* out) {
   Rendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id);
   Status s = RecvOutputsFromRendezvous(rendezvous, out, Rendezvous::Args());
   rendezvous->Unref();
+  if (!s.ok()) {
+    // Failing to fetch the outputs should not be possible, so rewrite the error
+    // status to an INTERNAL error.
+    s = errors::Internal("Failed to fetch outputs for step ", step_id,
+                         ". (Original error message: ", s.ToString(), ")");
+  }
   return s;
 }
 
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index 269f620e42e61b67477f9d73336a6e8da63b2eff..fc8d2871ac770bcea9104a206acabcd44ebde77f 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -65,7 +65,8 @@ Master::Master(MasterEnv* env, double session_gc_seconds)
     : env_(env),
       last_1000_steps_(1000),
       step_count_(0),
-      session_gc_seconds_(session_gc_seconds) {
+      session_gc_seconds_(session_gc_seconds),
+      recent_request_ids_(10000) {
   // Right now, a master service must be co-located with a device.
   // Otherwise, fetches do not work.
   CHECK(!env->local_devices.empty());
@@ -510,6 +511,12 @@ void Master::ExtendSession(const ExtendSessionRequest* req,
 
 void Master::PartialRunSetup(const PartialRunSetupRequest* req,
                              PartialRunSetupResponse* resp, MyClosure done) {
+  Status s = recent_request_ids_.TrackUnique(req->request_id(),
+                                             "PartialRunSetup (Master)", *req);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   auto session = FindMasterSession(req->session_handle());
   if (session == nullptr) {
     done(errors::Aborted("Session ", req->session_handle(), " is not found."));
@@ -525,6 +532,12 @@ void Master::PartialRunSetup(const PartialRunSetupRequest* req,
 
 void Master::RunStep(CallOptions* opts, const RunStepRequestWrapper* req,
                      MutableRunStepResponseWrapper* resp, MyClosure done) {
+  Status s = recent_request_ids_.TrackUnique(req->request_id(),
+                                             "RunStep (Master)", req);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   auto start_time = env_->env->NowMicros();
   auto session = FindMasterSession(req->session_handle());
   if (session == nullptr) {
@@ -664,6 +677,12 @@ void Master::Reset(const ResetRequest* req, ResetResponse* resp,
 
 void Master::MakeCallable(const MakeCallableRequest* req,
                           MakeCallableResponse* resp, MyClosure done) {
+  Status s = recent_request_ids_.TrackUnique(req->request_id(),
+                                             "MakeCallable (Master)", *req);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   auto session = FindMasterSession(req->session_handle());
   if (session == nullptr) {
     done(errors::Aborted("Session ", req->session_handle(), " is not found."));
@@ -681,6 +700,12 @@ void Master::MakeCallable(const MakeCallableRequest* req,
 
 void Master::RunCallable(CallOptions* opts, const RunCallableRequest* req,
                          RunCallableResponse* resp, MyClosure done) {
+  Status s = recent_request_ids_.TrackUnique(req->request_id(),
+                                             "RunCallable (Master)", *req);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   auto session = FindMasterSession(req->session_handle());
   if (session == nullptr) {
     done(errors::Aborted("Session ", req->session_handle(), " is not found."));
diff --git a/tensorflow/core/distributed_runtime/master.h b/tensorflow/core/distributed_runtime/master.h
index dbb337fd484960fbd3bfe47d0bfe0497985de66f..0524582ac78846fe192e8de47419280c6dde6177 100644
--- a/tensorflow/core/distributed_runtime/master.h
+++ b/tensorflow/core/distributed_runtime/master.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/master_env.h"
 #include "tensorflow/core/distributed_runtime/master_session.h"
+#include "tensorflow/core/distributed_runtime/recent_request_ids.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/macros.h"
@@ -95,6 +96,9 @@ class Master {
   // closed automatically.
   const double session_gc_seconds_;
 
+  // Used to track ids for incoming requests so we can detect duplicates.
+  RecentRequestIds recent_request_ids_;
+
   // Call CleanupAll on all workers.
   void CleanupWorkers(const ResetRequest& reset);
 
diff --git a/tensorflow/core/distributed_runtime/master_interface.h b/tensorflow/core/distributed_runtime/master_interface.h
index a8ae3cba3cdd3f02aae823d893e027b2bccae2c9..cde47fb9caf55f35db481fec8ae69ad6e6fcd8ed 100644
--- a/tensorflow/core/distributed_runtime/master_interface.h
+++ b/tensorflow/core/distributed_runtime/master_interface.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/message_wrappers.h"
+#include "tensorflow/core/distributed_runtime/request_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/master.pb.h"
@@ -66,7 +67,9 @@ class MasterInterface {
   // The message returned from this method must only be used in a
   // `RunStep()` call on the same `MasterInterface` instance.
   virtual MutableRunStepRequestWrapper* CreateRunStepRequest() {
-    return new MutableProtoRunStepRequest;
+    MutableProtoRunStepRequest* ret = new MutableProtoRunStepRequest;
+    ret->request_.set_request_id(GetUniqueRequestId());
+    return ret;
   }
 
   // Returns a response object for use in calls to
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 48b72fb9483f632012a69a1f3f8bf3e099310fbd..2f14967656fc832550c310d4c2b7821061d11e75 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -1103,6 +1103,8 @@ void BuildBuildGraphOptions(const RunStepRequestWrapper& req,
       req.options().experimental().collective_graph_key();
   if (config.experimental().collective_deterministic_sequential_execution()) {
     opts->collective_order = GraphCollectiveOrder::kEdges;
+  } else if (config.experimental().collective_nccl()) {
+    opts->collective_order = GraphCollectiveOrder::kAttrs;
   }
 }
 
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc
index 40bf564cab6fe465ed66639f42fe0daeb149f132..c9bc558964c2e7a704c5e2e9f52a1f794065a7d3 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers.cc
@@ -97,6 +97,10 @@ bool InMemoryRunStepRequest::store_errors_in_response_body() const {
   return store_errors_in_response_body_;
 }
 
+int64 InMemoryRunStepRequest::request_id() const {
+  return 0;  // no need to track request id for local version.
+}
+
 void InMemoryRunStepRequest::set_store_errors_in_response_body(
     bool store_errors) {
   store_errors_in_response_body_ = store_errors;
@@ -210,6 +214,10 @@ void MutableProtoRunStepRequest::set_store_errors_in_response_body(
   request_.set_store_errors_in_response_body(store_errors);
 }
 
+int64 MutableProtoRunStepRequest::request_id() const {
+  return request_.request_id();
+}
+
 string MutableProtoRunStepRequest::DebugString() const {
   return request_.DebugString();
 }
@@ -272,6 +280,8 @@ bool ProtoRunStepRequest::store_errors_in_response_body() const {
   return request_->store_errors_in_response_body();
 }
 
+int64 ProtoRunStepRequest::request_id() const { return request_->request_id(); }
+
 string ProtoRunStepRequest::DebugString() const {
   return request_->DebugString();
 }
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 474ac0e186a203464ff64e1cbea2b4faaf87b05b..2cdbd1bfaf1be1fb646926ae82488f88377d491d 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -87,6 +87,8 @@ class RunStepRequestWrapper {
   // truncate long metadata messages.
   virtual bool store_errors_in_response_body() const = 0;
 
+  virtual int64 request_id() const = 0;
+
   // Returns a human-readable representation of this message for debugging.
   virtual string DebugString() const = 0;
 
@@ -127,6 +129,7 @@ class InMemoryRunStepRequest : public MutableRunStepRequestWrapper {
   string DebugString() const override;
   const RunStepRequest& ToProto() const override;
   bool store_errors_in_response_body() const override;
+  int64 request_id() const override;
 
   // MutableRunStepRequestWrapper methods.
   void set_session_handle(const string& handle) override;
@@ -177,6 +180,7 @@ class MutableProtoRunStepRequest : public MutableRunStepRequestWrapper {
   string DebugString() const override;
   const RunStepRequest& ToProto() const override;
   bool store_errors_in_response_body() const override;
+  int64 request_id() const override;
 
   // MutableRunStepRequestWrapper methods.
   void set_session_handle(const string& handle) override;
@@ -189,6 +193,7 @@ class MutableProtoRunStepRequest : public MutableRunStepRequestWrapper {
 
  private:
   RunStepRequest request_;
+  friend class MasterInterface;
 };
 
 // Wrapper for immutable RunStep requests that use a non-owned
@@ -216,6 +221,7 @@ class ProtoRunStepRequest : public RunStepRequestWrapper {
   string DebugString() const override;
   const RunStepRequest& ToProto() const override;
   bool store_errors_in_response_body() const override;
+  int64 request_id() const override;
 
  private:
   const RunStepRequest* const request_;  // Not owned.
@@ -234,7 +240,7 @@ class ProtoRunStepRequest : public RunStepRequestWrapper {
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-// Abstract interface for an immutable RunStepRequest message.
+// Abstract interface for an immutable RunGraphRequest message.
 //
 // This interface is typically used by server-side components in the
 // TensorFlow worker.
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.cc b/tensorflow/core/distributed_runtime/recent_request_ids.cc
index 4f6866c5d154ba023b0923af67fe00a7a69b459d..2c953e12c06287e88b1a68bfab48a7234207046a 100644
--- a/tensorflow/core/distributed_runtime/recent_request_ids.cc
+++ b/tensorflow/core/distributed_runtime/recent_request_ids.cc
@@ -28,12 +28,10 @@ RecentRequestIds::RecentRequestIds(int num_tracked_request_ids)
   set_.reserve(num_tracked_request_ids);
 }
 
-Status RecentRequestIds::TrackUnique(int64 request_id,
-                                     const string& method_name,
-                                     const protobuf::Message& request) {
+bool RecentRequestIds::Insert(int64 request_id) {
   if (request_id == 0) {
     // For backwards compatibility, allow all requests with request_id 0.
-    return Status::OK();
+    return true;
   }
 
   mutex_lock l(mu_);
@@ -43,9 +41,7 @@ Status RecentRequestIds::TrackUnique(int64 request_id,
     // request_id's age in the circular_buffer_ if it's tracked again. Strict
     // LRU is not useful here because returning this error will close the
     // current Session.
-    return errors::Aborted("The same ", method_name,
-                           " request was received twice. ",
-                           request.ShortDebugString());
+    return false;
   }
 
   // Remove the oldest request_id from the set_. circular_buffer_ is
@@ -54,7 +50,30 @@ Status RecentRequestIds::TrackUnique(int64 request_id,
   set_.erase(circular_buffer_[next_index_]);
   circular_buffer_[next_index_] = request_id;
   next_index_ = (next_index_ + 1) % circular_buffer_.size();
-  return Status::OK();
+  return true;
+}
+
+Status RecentRequestIds::TrackUnique(int64 request_id,
+                                     const string& method_name,
+                                     const protobuf::Message& request) {
+  if (Insert(request_id)) {
+    return Status::OK();
+  } else {
+    return errors::Aborted("The same ", method_name,
+                           " request was received twice. ",
+                           request.ShortDebugString());
+  }
+}
+Status RecentRequestIds::TrackUnique(int64 request_id,
+                                     const string& method_name,
+                                     const RunStepRequestWrapper* wrapper) {
+  if (Insert(request_id)) {
+    return Status::OK();
+  } else {
+    return errors::Aborted("The same ", method_name,
+                           " request was received twice. ",
+                           wrapper->ToProto().ShortDebugString());
+  }
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.h b/tensorflow/core/distributed_runtime/recent_request_ids.h
index 11cf937c94659d85e3dc88350f20e107a27fab62..4094fcbde72ae97fcc5655a030fdf69426b093c8 100644
--- a/tensorflow/core/distributed_runtime/recent_request_ids.h
+++ b/tensorflow/core/distributed_runtime/recent_request_ids.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "tensorflow/core/distributed_runtime/message_wrappers.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -58,8 +59,13 @@ class RecentRequestIds {
   // ShortDebugString are added to returned errors.
   Status TrackUnique(int64 request_id, const string& method_name,
                      const protobuf::Message& request);
+  // Overloaded versions of the above function for wrapped protos.
+  Status TrackUnique(int64 request_id, const string& method_name,
+                     const RunStepRequestWrapper* wrapper);
 
  private:
+  bool Insert(int64 request_id);
+
   mutex mu_;
   // next_index_ indexes into circular_buffer_, and points to the next storage
   // space to use. When the buffer is full, next_index_ points at the oldest
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 16078f9688cf3d26d8ef6b6a40cde273f084c955..dd22e74ac5466ea675487ee8fbed92164bf38c78 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -46,6 +46,7 @@ cc_library(
         "//tensorflow/core:lib",
         # Required to be able to overload TensorResponse parsing.
         "//tensorflow/core/distributed_runtime:tensor_coding",
+        "//tensorflow/core:lib_internal",
     ],
 )
 
@@ -159,6 +160,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "grpc_response_cache",
+    srcs = ["grpc_response_cache.cc"],
+    hdrs = ["grpc_response_cache.h"],
+    deps = [
+        ":grpc_util",
+        "//tensorflow/core:lib",
+    ],
+)
+
 tf_cuda_library(
     name = "grpc_worker_service",
     srcs = ["grpc_worker_service.cc"],
@@ -166,6 +177,7 @@ tf_cuda_library(
     deps = [
         ":async_service_interface",
         ":grpc_call",
+        ":grpc_response_cache",
         ":grpc_tensor_coding",
         ":grpc_util",
         ":grpc_worker_service_impl",
@@ -183,6 +195,7 @@ tf_cuda_library(
         "//tensorflow/core/distributed_runtime:worker_cache",
         "//tensorflow/core/distributed_runtime:worker_env",
         "//tensorflow/core/distributed_runtime:worker_session",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -394,6 +407,7 @@ cc_library(
         "//tensorflow/core/distributed_runtime:local_master",
         "//tensorflow/core/distributed_runtime:master_interface",
         "//tensorflow/core/distributed_runtime:message_wrappers",
+        "//tensorflow/core/distributed_runtime:request_id",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
index 781b7d65cdd184363d7c7650305bd62f3129c271..64c221805b072313fc9fba20fa9cdefe8cea9bfc 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
@@ -62,7 +62,7 @@ Status ValidateHostPortPair(const string& host_port) {
   args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
   // NOTE(mrry): Some versions of gRPC use a 20-second minimum backoff
   // on connection failure, which makes our tests time out.
-  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 1000);
   if (rpc_options != nullptr) {
     if (rpc_options->compression_algorithm() == "deflate") {
       args.SetCompressionAlgorithm(GRPC_COMPRESS_DEFLATE);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
index b832a2115cb809b5561fc55ab8d9057f2274dcd8..a84559098a3ad59cd1cabe91bb8546194da105e5 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
@@ -43,104 +44,139 @@ class GrpcRemoteMaster : public MasterInterface {
   Status CreateSession(CallOptions* call_options,
                        const CreateSessionRequest* request,
                        CreateSessionResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::CreateSession);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::CreateSession);
   }
 
   Status ExtendSession(CallOptions* call_options,
                        const ExtendSessionRequest* request,
                        ExtendSessionResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::ExtendSession);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::ExtendSession);
   }
 
   Status PartialRunSetup(CallOptions* call_options,
                          const PartialRunSetupRequest* request,
                          PartialRunSetupResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::PartialRunSetup);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::PartialRunSetup);
   }
 
   Status RunStep(CallOptions* call_options, RunStepRequestWrapper* request,
                  MutableRunStepResponseWrapper* response) override {
-    ::grpc::ClientContext ctx;
-    auto trace = TraceRpc("RunStep/Client", &ctx);
-    return Call(&ctx, call_options, &request->ToProto(),
-                get_proto_from_wrapper(response), &MasterServiceStub::RunStep);
+    return CallWithRetry(call_options, &request->ToProto(),
+                         get_proto_from_wrapper(response),
+                         &MasterServiceStub::RunStep, "RunStep/Client");
   }
 
   Status CloseSession(CallOptions* call_options,
                       const CloseSessionRequest* request,
                       CloseSessionResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::CloseSession);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::CloseSession);
   }
 
   Status ListDevices(CallOptions* call_options,
                      const ListDevicesRequest* request,
                      ListDevicesResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::ListDevices);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::ListDevices);
   }
 
   Status Reset(CallOptions* call_options, const ResetRequest* request,
                ResetResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::Reset);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::Reset);
   }
 
   Status MakeCallable(CallOptions* call_options,
                       const MakeCallableRequest* request,
                       MakeCallableResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::MakeCallable);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::MakeCallable);
   }
   Status RunCallable(CallOptions* call_options,
                      const RunCallableRequest* request,
                      RunCallableResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::RunCallable);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::RunCallable);
   }
   Status ReleaseCallable(CallOptions* call_options,
                          const ReleaseCallableRequest* request,
                          ReleaseCallableResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::ReleaseCallable);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::ReleaseCallable);
   }
 
  private:
   // Start tracing, attaching a unique ID to both the trace and the RPC.
-  tracing::ScopedActivity TraceRpc(StringPiece name,
-                                   ::grpc::ClientContext* ctx) {
+  tracing::ScopedActivity* NewTraceRpc(StringPiece name,
+                                       ::grpc::ClientContext* ctx) {
     string trace_id = strings::StrCat(tracing::GetUniqueArg());
     ctx->AddMetadata(GrpcIdKey(), trace_id);
-    return tracing::ScopedActivity(name, trace_id);
-  }
-
-  void SetDeadline(::grpc::ClientContext* ctx, int64 time_in_ms) {
-    if (time_in_ms > 0) {
-      ctx->set_deadline(gpr_time_from_millis(time_in_ms, GPR_TIMESPAN));
-    }
+    return new tracing::ScopedActivity(name, trace_id);
   }
 
   template <typename Request, typename Response>
-  Status Call(::grpc::ClientContext* ctx, CallOptions* call_options,
-              const Request* request, Response* response,
-              ::grpc::Status (MasterServiceStub::*pfunc)(::grpc::ClientContext*,
-                                                         const Request&,
-                                                         Response*)) {
-    ctx->set_fail_fast(false);
-    SetDeadline(ctx, call_options->GetTimeout());
-    return FromGrpcStatus((stub_.get()->*pfunc)(ctx, *request, response));
+  Status CallWithRetry(CallOptions* call_options, const Request* request,
+                       Response* response,
+                       ::grpc::Status (MasterServiceStub::*pfunc)(
+                           ::grpc::ClientContext*, const Request&, Response*),
+                       string trace_string = {}) {
+    int64 timeout_in_ms = call_options->GetTimeout();
+    int64 expired_time_micros = Env::Default()->NowMicros();
+    if (timeout_in_ms > 0) {
+      expired_time_micros += (timeout_in_ms / 1000.);
+    }
+    Status s;
+    for (int num_retries = 0;; ++num_retries) {
+      ::grpc::ClientContext ctx;
+      std::unique_ptr<tracing::ScopedActivity> trace;
+      if (!trace_string.empty()) {
+        trace.reset(NewTraceRpc(trace_string, &ctx));
+      }
+      ctx.set_fail_fast(false);
+      if (timeout_in_ms > 0) {
+        // We do not modify the timeout here to match legacy behavior. However,
+        // this could violate the contract of tensorflow::Session. If we retry
+        // an RPC just before the deadline is exceeded, we will still set the
+        // timeout to the original value. This leads to the overall timeout
+        // being double what was expected.
+        // TODO(b/117162170): investigate fixing this behavior for legacy and
+        // gRPC RPC layers.
+        ctx.set_deadline(gpr_time_from_millis(timeout_in_ms, GPR_TIMESPAN));
+      }
+      s = FromGrpcStatus((stub_.get()->*pfunc)(&ctx, *request, response));
+      if (!errors::IsUnavailable(s)) {
+        return s;
+      }
+      // TODO(b/117162170): we may want to make this configurable.
+      constexpr int kMaxRetries = 10;
+      LOG(WARNING) << "RPC failed with status = \"" << s
+                   << "\" and grpc_error_string = \""
+                   << ctx.debug_error_string() << "\", maybe retrying the RPC";
+      if (num_retries >= kMaxRetries) {
+        LOG(WARNING) << "Too many retries, returning last status: " << s;
+        return s;
+      }
+      const int64 now_micros = Env::Default()->NowMicros();
+      const int64 deadline_with_backoff_micros =
+          now_micros + ComputeBackoffMicroseconds(num_retries);
+      // Wait for a short period of time before retrying the RPC.  If our
+      // backoff would put us past the RPC deadline, we truncate it to ensure
+      // our RPC starts before the deadline.
+      const auto backoff_until =
+          (timeout_in_ms <= 0 ||
+           expired_time_micros > deadline_with_backoff_micros)
+              ? deadline_with_backoff_micros
+              : expired_time_micros;
+      Env::Default()->SleepForMicroseconds(backoff_until - now_micros);
+      if (Env::Default()->NowMicros() > expired_time_micros &&
+          timeout_in_ms > 0) {
+        // If timeout_in_ms is set, exit the retry loop on timeout.
+        return errors::DeadlineExceeded(ctx.debug_error_string());
+      }
+    }
   }
 
   std::unique_ptr<MasterServiceStub> stub_;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 2daefcb399c79324f80278340967b679be5c6574..2479e7368be8de810db36cb18c887977aeae2472 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -39,6 +39,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+const int kMaxWorkerRpcRetries = 10;
+
 class GrpcRemoteWorker : public WorkerInterface {
  public:
   explicit GrpcRemoteWorker(SharedGrpcChannelPtr channel,
@@ -259,17 +261,19 @@ class GrpcRemoteWorker : public WorkerInterface {
   // given callback, `done`, will be called when the RPC completes.
   void IssueRequest(const protobuf::Message* request,
                     protobuf::Message* response, const ::grpc::string& method,
-                    StatusCallback done, CallOptions* call_opts = nullptr) {
+                    StatusCallback done, CallOptions* call_opts = nullptr,
+                    int max_retries = kMaxWorkerRpcRetries) {
     new RPCState<protobuf::Message>(&stub_, cq_, method, *request, response,
                                     std::move(done), call_opts,
-                                    callback_threadpool_);
+                                    callback_threadpool_, max_retries);
   }
   void IssueRequest(const protobuf::Message* request, TensorResponse* response,
                     const ::grpc::string& method, StatusCallback done,
-                    CallOptions* call_opts = nullptr) {
+                    CallOptions* call_opts = nullptr,
+                    int max_retries = kMaxWorkerRpcRetries) {
     new RPCState<TensorResponse>(&stub_, cq_, method, *request, response,
                                  std::move(done), call_opts,
-                                 callback_threadpool_);
+                                 callback_threadpool_, max_retries);
   }
 
   // Helper function for initializing the RpcMethod objects below.
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.cc
new file mode 100644
index 0000000000000000000000000000000000000000..613c290905d4e8914761b130d9353536023f5856
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.cc
@@ -0,0 +1,183 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+struct WorkerCacheEntry {
+  enum class State {
+    PENDING = 0,
+    ACTIVE = 1,
+    FINISHED = 2,
+  };
+
+  State state = State::PENDING;
+  int64 expires_seconds;
+
+  ::grpc::ByteBuffer response_buf;
+  Status response_status;
+
+  // Additional retries may arrive while a request is still executing.  The
+  // callbacks for these calls are queued in `callbacks` and evaluated after
+  // the original request is completed.
+  std::vector<std::pair<RPCResponse, StatusCallback>> callbacks;
+};
+
+void RPCResponse::Encode(::grpc::ByteBuffer* tgt) const {
+  if (buf_ != nullptr) {
+    *tgt = *buf_;
+  } else {
+    CHECK(msg_ != nullptr);
+    ::grpc::Slice slice(msg_->ByteSizeLong());
+    msg_->SerializeWithCachedSizesToArray(
+        const_cast<uint8*>(reinterpret_cast<const uint8*>(slice.begin())));
+    ::grpc::ByteBuffer tmp(&slice, 1);
+    tgt->Swap(&tmp);
+  }
+}
+
+void RPCResponse::CopyFrom(const ::grpc::ByteBuffer& src) {
+  if (buf_ != nullptr) {
+    *buf_ = src;
+    return;
+  }
+
+  CHECK(msg_ != nullptr);
+  // We create a single slice when encoding protocol messages.
+  std::vector<::grpc::Slice> slices;
+  if (src.Dump(&slices).ok()) {
+    msg_->ParseFromArray(slices[0].begin(), slices[0].size());
+  } else {
+    LOG(ERROR) << "Failed to decode cached buffer.";
+  }
+}
+
+void GrpcResponseCache::LookupOrCompute(const string& key, RPCResponse response,
+                                        ComputeFunc compute_func,
+                                        StatusCallback done_cb) {
+  VLOG(1) << "Lookup " << key;
+  std::shared_ptr<WorkerCacheEntry> req;
+  MaybeCleanup();
+  {
+    mutex_lock m(mu_);
+
+    if (requests_.find(key) != requests_.end()) {
+      req = requests_[key];
+    } else {
+      req.reset(new WorkerCacheEntry);
+      requests_[key] = req;
+    }
+
+    if (req->state == WorkerCacheEntry::State::FINISHED) {
+      if (req->expires_seconds > Env::Default()->NowSeconds()) {
+        VLOG(1) << "Reuse cached response for " << key;
+        response.CopyFrom(req->response_buf);
+        done_cb(req->response_status);
+        return;
+      }
+      VLOG(1) << "Found expired cache entry for " << key;
+      req->state = WorkerCacheEntry::State::PENDING;
+      req->response_buf.Clear();
+    }
+
+    req->callbacks.push_back(std::make_pair(response, done_cb));
+
+    if (req->state == WorkerCacheEntry::State::ACTIVE) {
+      VLOG(1) << "Found active request for " << key
+              << ".  Adding entry to response queue.";
+      return;
+    }
+
+    VLOG(2) << "No cache entry for " << key << ", running user computation.";
+    req->state = WorkerCacheEntry::State::ACTIVE;
+    req->expires_seconds = Env::Default()->NowSeconds() + expire_time_seconds_;
+  }
+
+  compute_func([this, key, req, response](Status status) {
+    mutex_lock m(mu_);
+    response.Encode(&req->response_buf);
+    current_bytes_ += req->response_buf.Length();
+
+    req->response_status = status;
+    req->state = WorkerCacheEntry::State::FINISHED;
+
+    VLOG(1) << "Operation for " << key << " finished. "
+            << "Status: " << status << ", " << req->response_buf.Length()
+            << " response bytes, " << req->callbacks.size()
+            << " pending callbacks.";
+    for (auto& cb : req->callbacks) {
+      cb.first.CopyFrom(req->response_buf);
+      cb.second(req->response_status);
+    }
+    req->callbacks.clear();
+  });
+}
+
+// Remove all stale or expired cache entries if the cache is full.
+void GrpcResponseCache::MaybeCleanup() {
+  mutex_lock m(mu_);
+  if (current_bytes_ < max_bytes_) {
+    return;
+  }
+
+  VLOG(1) << "Cleanup: " << current_bytes_ << " -> " << max_bytes_;
+  std::vector<std::pair<string, std::shared_ptr<WorkerCacheEntry>>>
+      ordered_entries;
+  ordered_entries.reserve(requests_.size());
+  for (const auto& p : requests_) {
+    ordered_entries.push_back(std::make_pair(p.first, p.second));
+  }
+
+  std::sort(ordered_entries.begin(), ordered_entries.end(),
+            [](const std::pair<string, std::shared_ptr<WorkerCacheEntry>>& a,
+               const std::pair<string, std::shared_ptr<WorkerCacheEntry>>& b) {
+              return a.second->expires_seconds > b.second->expires_seconds;
+            });
+
+  std::unordered_map<string, std::shared_ptr<WorkerCacheEntry>> kept;
+  int64 now = Env::Default()->NowSeconds();
+  int64 bytes_used = 0;
+
+  // Always keep active requests.
+  for (auto& pair : ordered_entries) {
+    if (pair.second->state != WorkerCacheEntry::State::FINISHED) {
+      kept.insert(pair);
+    }
+  }
+
+  // Keep unexpired, finished requests up to half of max_bytes_.  This reduces
+  // chances of overfilling the cache when active requests complete and
+  // amortizes cache cleanup cost.
+  for (auto& pair : ordered_entries) {
+    if (pair.second->expires_seconds < now || bytes_used >= max_bytes_ / 2) {
+      break;
+    }
+
+    if (pair.second->state == WorkerCacheEntry::State::FINISHED) {
+      kept.insert(pair);
+      bytes_used += pair.second->response_buf.Length();
+    }
+  }
+
+  VLOG(1) << "Cleaned cache.  Bytes used: " << current_bytes_ << " -> "
+          << bytes_used << ". Cache size: " << requests_.size() << " -> "
+          << kept.size();
+  current_bytes_ = bytes_used;
+  std::swap(requests_, kept);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..0892d9f788d165f11803c676717e63585ca808a2
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h
@@ -0,0 +1,91 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RESPONSE_CACHE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RESPONSE_CACHE_H_
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+// gRPC response caching.  Most WorkerService methods cannot be retried directly
+// as they will fail or deadlock.  To enable retrying, we can instead cache
+// responses for a short period of time and reply to duplicate requests from the
+// cache.
+namespace tensorflow {
+
+// Union type to aid caching of either raw buffers (for RecvTensor RPCs) and
+// protocol buffer messages (for all other RPCs).
+class RPCResponse {
+ public:
+  explicit RPCResponse() : buf_(nullptr), msg_(nullptr) {}
+  explicit RPCResponse(::grpc::ByteBuffer* b) : buf_(b), msg_(nullptr) {}
+  explicit RPCResponse(protobuf::Message* m) : buf_(nullptr), msg_(m) {}
+
+  // Encode this response into the target buffer.
+  void Encode(::grpc::ByteBuffer* tgt) const;
+
+  // Copy from `src`: if this is a buffer, make a shallow copy.
+  // For protocol messages, parse the response from `src`.
+  void CopyFrom(const ::grpc::ByteBuffer& src);
+
+ private:
+  ::grpc::ByteBuffer* buf_;
+  protobuf::Message* msg_;
+};
+
+typedef std::function<void(StatusCallback)> ComputeFunc;
+struct WorkerCacheEntry;
+
+// Track and cache the state of worker service RPCs.  An RPC can be in 3 states:
+//
+// * PENDING: this is the first call of the RPC, and it will transition to
+// * ACTIVE: another thread is active processing this RPC
+// * FINISHED: the worker has finished processing the method
+//
+// The response from completed RPCs are LRU cached until either `max_bytes`
+// bytes are in use by the cache or they expire (according to `expire_time`).
+class GrpcResponseCache {
+ public:
+  GrpcResponseCache(int64 max_bytes, int64 expire_time_seconds)
+      : max_bytes_(max_bytes), expire_time_seconds_(expire_time_seconds) {}
+
+  // Lookup the result for key.
+  // If it is finished, invoke `done_cb` immediately after filling `response`.
+  // If active, done_db will be invoked when the current call completes.
+  // Otherwise, invoke `compute_func` to fill the cache and invoke done_cb.
+  void LookupOrCompute(const string& key, RPCResponse response,
+                       ComputeFunc compute_func, StatusCallback done_cb);
+
+  // Remove all stale or expired cache entries if the cache is full.
+  void MaybeCleanup();
+
+ private:
+  int64 current_bytes_ GUARDED_BY(mu_) = 0;
+  const int64 max_bytes_;
+  const int64 expire_time_seconds_;
+
+  std::unordered_map<string, std::shared_ptr<WorkerCacheEntry>> requests_
+      GUARDED_BY(mu_);
+  mutex mu_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RESPONSE_CACHE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
index 7f63cc9344f87010ceb1225dbe4b031bd5272f2c..3635caf3d104760d9200497f6f25d3f0fdfde48c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
@@ -210,7 +210,7 @@ void GrpcRPCFactory::StartCall(const Tensor& address_t, const Tensor& method_t,
       get_stub(index), &completion_queue_, *get_method_ptr(index),
       call->request(), call->response(),
       /*done=*/[call](const Status& s) { call->Done(s); }, call->call_opts(),
-      nullptr /*threadpool*/, fail_fast_, timeout_in_ms_);
+      nullptr /*threadpool*/, fail_fast_, timeout_in_ms_, 0 /* max_retries */);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index 32063fecbbef4347bcdbfbdfda32f008015b5975..c14bfd2155fb4b2276642e220176a3658448f350 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/local_master.h"
 #include "tensorflow/core/distributed_runtime/master_interface.h"
+#include "tensorflow/core/distributed_runtime/request_id.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -312,6 +313,7 @@ Status GrpcSession::PRunSetup(const std::vector<string>& input_names,
   for (const string& target : target_nodes) {
     req.add_target(target);
   }
+  req.set_request_id(GetUniqueRequestId());
   call_options.SetTimeout(options_.config.operation_timeout_in_ms());
   TF_RETURN_IF_ERROR(master_->PartialRunSetup(&call_options, &req, &resp));
   *handle = resp.partial_run_handle();
@@ -408,6 +410,7 @@ Status GrpcSession::MakeCallable(const CallableOptions& callable_options,
   MakeCallableRequest req;
   TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
   *req.mutable_options() = callable_options;
+  req.set_request_id(GetUniqueRequestId());
   MakeCallableResponse resp;
   CallOptions call_options;
   call_options.SetTimeout(options_.config.operation_timeout_in_ms());
@@ -423,6 +426,7 @@ Status GrpcSession::RunCallable(CallableHandle handle,
   RunCallableRequest req;
   TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
   req.set_handle(handle);
+  req.set_request_id(GetUniqueRequestId());
   for (const Tensor& feed : feed_tensors) {
     feed.AsProtoTensorContent(req.mutable_feed()->Add());
   }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index d73638651f2b78fb935ab8865a776a708826c930..0ca64dc159b6680342a9937480a1d67135ad6197 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -32,6 +32,9 @@ limitations under the License.
 namespace tensorflow {
 
 // Object allocated per active RPC.
+// Manage the state of a single asynchronous RPC request.  If `max_retries`
+// is greater than 0, the request will be retried for any transient failures
+// as long as the overall deadline has not elapsed.
 template <class Response>
 class RPCState : public GrpcClientCQTag {
  public:
@@ -39,34 +42,55 @@ class RPCState : public GrpcClientCQTag {
   RPCState(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq,
            const ::grpc::string& method, const protobuf::Message& request,
            Response* response, StatusCallback done, CallOptions* call_opts,
-           thread::ThreadPool* threadpool)
+           thread::ThreadPool* threadpool, int32 max_retries = 0)
       : RPCState(stub, cq, method, request, response, std::move(done),
                  call_opts, threadpool, /*fail_fast=*/false,
-                 /*timeout_in_ms=*/0) {}
+                 /*timeout_in_ms=*/0, max_retries) {}
 
   template <typename Request>
   RPCState(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq,
            const ::grpc::string& method, const Request& request,
            Response* response, StatusCallback done, CallOptions* call_opts,
-           thread::ThreadPool* threadpool, bool fail_fast, int64 timeout_in_ms)
-      : call_opts_(call_opts), threadpool_(threadpool), done_(std::move(done)) {
-    context_.set_fail_fast(fail_fast);
-    if (timeout_in_ms > 0) {
-      context_.set_deadline(gpr_time_from_millis(timeout_in_ms, GPR_TIMESPAN));
-    }
-
-    if (call_opts) {
-      call_opts->SetCancelCallback([this]() { context_.TryCancel(); });
-    }
-
+           thread::ThreadPool* threadpool, bool fail_fast, int64 timeout_in_ms,
+           int32 max_retries)
+      : call_opts_(call_opts),
+        threadpool_(threadpool),
+        done_(std::move(done)),
+        cq_(cq),
+        stub_(stub),
+        method_(method),
+        max_retries_(max_retries),
+        timeout_in_ms_(timeout_in_ms),
+        fail_fast_(fail_fast) {
     response_ = response;
     ::grpc::Status s = GrpcMaybeUnparseProto(request, &request_buf_);
     if (!s.ok()) {
       LOG(ERROR) << "GrpcMaybeUnparseProto returned with non-ok status: "
                  << s.error_message();
+      // Skip retry logic if we fail to parse our request.
+      done_(FromGrpcStatus(s));
+      delete this;
+      return;
+    }
+    StartCall();
+  }
+
+  void StartCall() {
+    context_.reset(new ::grpc::ClientContext());
+    context_->set_fail_fast(fail_fast_);
+
+    if (timeout_in_ms_ > 0) {
+      context_->set_deadline(
+          gpr_time_from_millis(timeout_in_ms_, GPR_TIMESPAN));
     }
-    call_ =
-        std::move(stub->PrepareUnaryCall(&context_, method, request_buf_, cq));
+    if (call_opts_) {
+      call_opts_->SetCancelCallback([this]() { context_->TryCancel(); });
+    }
+
+    VLOG(2) << "Starting call: " << method_;
+
+    call_ = std::move(
+        stub_->PrepareUnaryCall(context_.get(), method_, request_buf_, cq_));
     call_->StartCall();
     call_->Finish(&response_buf_, &status_, this);
   }
@@ -89,16 +113,26 @@ class RPCState : public GrpcClientCQTag {
         threadpool_->Schedule([this]() { ParseAndCallDone(); });
       } else {
         ParseAndCallDone();
-        return;
       }
-    } else {
-      VLOG(2) << "Call returned with non-ok status: " << s;
+      return;
+    }
 
-      // Attach additional GRPC error information if any
+    VLOG(1) << method_ << " returned with non-ok status: " << s
+            << " Retries: " << num_retries_ << " Max: " << max_retries_ << "\n"
+            << context_->debug_error_string();
+    // Retry if we have any attempts left
+    if (++num_retries_ <= max_retries_ &&
+        (errors::IsUnavailable(s) || errors::IsUnknown(s))) {
+      response_buf_.Clear();
+      VLOG(1) << "Retrying call for " << method_ << "Retry: " << num_retries_
+              << " of " << max_retries_;
+      StartCall();
+    } else {
+      // Attach additional GRPC error information if any to the final status
       s = Status(s.code(),
                  strings::StrCat(s.error_message(),
                                  "\nAdditional GRPC error information:\n",
-                                 context_.debug_error_string()));
+                                 context_->debug_error_string()));
       done_(s);
       delete this;
     }
@@ -115,7 +149,7 @@ class RPCState : public GrpcClientCQTag {
 
  private:
   CallOptions* call_opts_;
-  ::grpc::ClientContext context_;
+  std::unique_ptr<::grpc::ClientContext> context_;
   thread::ThreadPool* threadpool_;
   std::unique_ptr<::grpc::GenericClientAsyncResponseReader> call_;
   Response* response_;
@@ -123,6 +157,15 @@ class RPCState : public GrpcClientCQTag {
   ::grpc::ByteBuffer response_buf_;
   ::grpc::Status status_;
   StatusCallback done_;
+  int64 timeout_in_ms_;
+
+  size_t num_retries_ = 0;
+  size_t max_retries_;
+
+  ::grpc::CompletionQueue* cq_;
+  ::grpc::GenericStub* stub_;
+  ::grpc::string method_;
+  bool fail_fast_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
index e211c33732b26777697f11178909edaf6c9b65ed..471e2c16b348e12eca094247c729008a936174f7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
@@ -15,9 +15,61 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
+#include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
 
+namespace {
+
+double GenerateUniformRandomNumber() {
+  return random::New64() * (1.0 / std::numeric_limits<uint64>::max());
+}
+
+double GenerateUniformRandomNumberBetween(double a, double b) {
+  if (a == b) return a;
+  DCHECK_LT(a, b);
+  return a + GenerateUniformRandomNumber() * (b - a);
+}
+
+}  // namespace
+
+int64 ComputeBackoffMicroseconds(int current_retry_attempt, int64 min_delay,
+                                 int64 max_delay) {
+  DCHECK_GE(current_retry_attempt, 0);
+
+  // This function with the constants below is calculating:
+  //
+  // (0.4 * min_delay) + (random[0.6,1.0] * min_delay * 1.3^retries)
+  //
+  // Note that there is an extra truncation that occurs and is documented in
+  // comments below.
+  constexpr double kBackoffBase = 1.3;
+  constexpr double kBackoffRandMult = 0.4;
+
+  // This first term does not vary with current_retry_attempt or a random
+  // number. It exists to ensure the final term is >= min_delay
+  const double first_term = kBackoffRandMult * min_delay;
+
+  // This is calculating min_delay * 1.3^retries
+  double uncapped_second_term = min_delay;
+  while (current_retry_attempt > 0 &&
+         uncapped_second_term < max_delay - first_term) {
+    current_retry_attempt--;
+    uncapped_second_term *= kBackoffBase;
+  }
+  // Note that first_term + uncapped_second_term can exceed max_delay here
+  // because of the final multiply by kBackoffBase.  We fix that problem with
+  // the min() below.
+  double second_term = std::min(uncapped_second_term, max_delay - first_term);
+
+  // This supplies the random jitter to ensure that retried don't cause a
+  // thundering herd problem.
+  second_term *=
+      GenerateUniformRandomNumberBetween(1.0 - kBackoffRandMult, 1.0);
+
+  return std::max(static_cast<int64>(first_term + second_term), min_delay);
+}
+
 ::grpc::Status GrpcMaybeUnparseProto(const protobuf::Message& src,
                                      grpc::ByteBuffer* dst) {
   bool own_buffer;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.h b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
index 45259aa2ece9698d7ffb5a850b716de442f7497f..976f3e6452a7673455d8c2d0946257ee54d762fe 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
@@ -29,6 +29,15 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Given the total number of RPC retries attempted, return a randomized
+// amount of time to delay before retrying the request.
+//
+// The average computed backoff increases with the number of RPCs attempted.
+// See implementation for details on the calculations.
+int64 ComputeBackoffMicroseconds(int current_retry_attempt,
+                                 int64 min_delay = 1000,
+                                 int64 max_delay = 10000000);
+
 // Thin wrapper around ::grpc::ProtoBufferReader to give TensorResponse an
 // efficient byte reader from which to decode a RecvTensorResponse.
 class GrpcByteSource : public TensorResponse::Source {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index f45b9bf97aaded94536074a67c977e0e5db4a585..904862100e460d811dc03648ff2b8aa4f26f672c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -16,11 +16,14 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
 
 #include <deque>
+#include <memory>
 #include <unordered_map>
+#include <vector>
 
 #include "grpcpp/alarm.h"
 #include "grpcpp/server_builder.h"
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/common_runtime/buf_rendezvous.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -32,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_call.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h"
@@ -42,8 +46,12 @@ limitations under the License.
 #include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/protobuf/transport_options.pb.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
@@ -91,10 +99,11 @@ class GrpcWorkerServiceThread {
  public:
   explicit GrpcWorkerServiceThread(
       GrpcWorker* worker, ::grpc::ServerBuilder* builder,
-      std::unordered_map<int, int> queue_depth,
+      std::unordered_map<int, int> queue_depth, GrpcResponseCache* cache,
       grpc::WorkerService::AsyncService* worker_service)
       : worker_(worker),
         queue_depth_(queue_depth),
+        cache_(cache),
         worker_service_(worker_service),
         is_shutdown_(false) {
     cq_ = builder->AddCompletionQueue();
@@ -220,18 +229,32 @@ class GrpcWorkerServiceThread {
       NonOwnedProtoRunGraphResponse* wrapped_response =
           new NonOwnedProtoRunGraphResponse(&call->response);
       call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-      worker_->RunGraphAsync(call_opts, wrapped_request, wrapped_response,
-                             [call, call_opts, wrapped_request,
-                              wrapped_response](const Status& s) {
-                               if (!s.ok()) {
-                                 VLOG(1) << "Bad response from RunGraph:" << s;
-                               }
-                               call->ClearCancelCallback();
-                               delete call_opts;
-                               delete wrapped_request;
-                               delete wrapped_response;
-                               call->SendResponse(ToGrpcStatus(s));
-                             });
+      auto done_cb = [call, call_opts, wrapped_request,
+                      wrapped_response](const Status& s) {
+        VLOG(1) << "RunGraph::Done";
+        if (!s.ok()) {
+          VLOG(1) << "Bad response from RunGraph:" << s;
+        }
+        call->ClearCancelCallback();
+        delete call_opts;
+        delete wrapped_request;
+        delete wrapped_response;
+        call->SendResponse(ToGrpcStatus(s));
+      };
+
+      auto compute_fn = [this, call_opts, wrapped_request,
+                         wrapped_response](StatusCallback done) {
+        worker_->RunGraphAsync(call_opts, wrapped_request, wrapped_response,
+                               done);
+      };
+
+      if (cache_) {
+        string request_key = call->request.ShortDebugString();
+        cache_->LookupOrCompute(request_key, RPCResponse(&call->response),
+                                compute_fn, done_cb);
+      } else {
+        compute_fn(done_cb);
+      }
     });
     ENQUEUE_REQUEST(RunGraph, true);
   }
@@ -241,16 +264,28 @@ class GrpcWorkerServiceThread {
     Schedule([this, call]() {
       CallOptions* call_opts = new CallOptions;
       call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-      worker_->GrpcRecvTensorAsync(
-          call_opts, &call->request, &call->response,
-          [call, call_opts](const Status& s) {
-            call->ClearCancelCallback();
-            delete call_opts;
-            if (!s.ok()) {
-              VLOG(1) << "Bad response from RecvTensor:" << s;
-            }
-            call->SendResponse(ToGrpcStatus(s));
-          });
+
+      auto done_cb = [call, call_opts](const Status& s) {
+        call->ClearCancelCallback();
+        delete call_opts;
+        if (!s.ok()) {
+          VLOG(1) << "Bad response from RecvTensor:" << s;
+        }
+        call->SendResponse(ToGrpcStatus(s));
+      };
+
+      auto compute_fn = [this, &call_opts, &call](StatusCallback done) {
+        worker_->GrpcRecvTensorAsync(call_opts, &call->request, &call->response,
+                                     done);
+      };
+
+      if (cache_) {
+        string request_key = call->request.ShortDebugString();
+        cache_->LookupOrCompute(request_key, RPCResponse(&call->response),
+                                compute_fn, done_cb);
+      } else {
+        compute_fn(done_cb);
+      }
     });
     EnqueueRecvTensorRequestRaw();
   }
@@ -328,6 +363,7 @@ class GrpcWorkerServiceThread {
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
   std::unique_ptr<Thread> thread_;
   std::unordered_map<int, int> queue_depth_;
+  GrpcResponseCache* cache_;
   grpc::WorkerService::AsyncService* const worker_service_;
 
   mutex shutdown_mu_;
@@ -341,9 +377,16 @@ class GrpcWorkerService : public AsyncServiceInterface {
                     GrpcWorkerServiceOptions options)
       : is_shutdown_(false) {
     builder->RegisterService(&worker_service_);
-    for (int i = 0; i < options.num_worker_threads; i++) {
-      threads_.emplace_back(new GrpcWorkerServiceThread(
-          worker, builder, options.queue_depth, &worker_service_));
+    if (options.response_cache_bytes > 0) {
+      cache_.reset(
+          new GrpcResponseCache(options.response_cache_bytes,
+                                options.response_cache_expires_seconds));
+    }
+
+    for (int i = 0; i < options.num_serving_threads; i++) {
+      threads_.emplace_back(
+          new GrpcWorkerServiceThread(worker, builder, options.queue_depth,
+                                      cache_.get(), &worker_service_));
     }
   }
 
@@ -378,6 +421,7 @@ class GrpcWorkerService : public AsyncServiceInterface {
   grpc::WorkerService::AsyncService worker_service_;
   std::vector<std::unique_ptr<GrpcWorkerServiceThread>> threads_;
 
+  std::unique_ptr<GrpcResponseCache> cache_;
   mutex service_shutdown_mu_;
   bool is_shutdown_ GUARDED_BY(service_shutdown_mu_);
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
index 88beb6c21652d22774f1b0d0b59f18af3f76be2e..8f2830c899b9b9854e0b6f02e23651ebd1b06491 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
@@ -16,8 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
 
+#include <memory>
 #include <unordered_map>
 #include "tensorflow/core/distributed_runtime/recent_request_ids.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h"
 #include "tensorflow/core/distributed_runtime/worker.h"
 
@@ -63,7 +65,9 @@ struct GrpcWorkerServiceOptions {
   // Map from GrpcWorkerMethod id to queue depth.  If set this overrides the
   // default queue depth for a method.
   std::unordered_map<int, int> queue_depth;
-  int num_worker_threads = 8;
+  int num_serving_threads = 8;
+  int64 response_cache_bytes = 0;
+  int64 response_cache_expires_seconds = 0;
 };
 
 // Returns an implementation of WorkerService rpc service.
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index 9fb920404f987d6b5b324cce4155da40c7e753b4..ee561e1a8a02a78256b97f5ce015f99ef148a591 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -71,9 +71,12 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
     req_.set_request_id(GetUniqueRequestId());
   }
 
-  void Reset(WorkerCacheInterface* wc) {
-    wc->ReleaseWorker(src_worker_, wi_);
-    wi_ = nullptr;
+  void Reset() {
+    // The RpcRemoteRendezvous using this object is responsible for calling
+    // ReleaseWorker() before Reset().
+    DCHECK_EQ(static_cast<WorkerInterface*>(nullptr), wi_)
+        << "Leaking WorkerInterface in RpcRecvTensorCall::Reset().";
+
     alloc_attrs_ = AllocatorAttributes();
     dst_device_ = nullptr;
     // We don't clear opts_ and assume that Init will set up the state for
@@ -89,9 +92,8 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
 
   ~RpcRecvTensorCall() override {
     // Since only the RpcRecvTensorFreeList will delete an
-    // RpcRecvTensorCall, and it always sets this->wi_ to null when
-    // a call object is released to it, we can assert that this->wi_ is
-    // always null at the point of deletion.
+    // RpcRecvTensorCall, we require that ReleaseWorker() has been called before
+    // the user releases a Call object to the free list.
     CHECK_EQ(static_cast<WorkerInterface*>(nullptr), wi_)
         << "Leaking WorkerInterface in RpcRecvTensorCall destructor.";
   }
@@ -113,6 +115,13 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
     return status_;
   }
 
+  void ReleaseWorker(WorkerCacheInterface* worker_cache) {
+    DCHECK_NE(static_cast<WorkerInterface*>(nullptr), wi_)
+        << "RpcRecvTensorCall::ReleaseWorker() called twice.";
+    worker_cache->ReleaseWorker(src_worker_, wi_);
+    wi_ = nullptr;
+  }
+
   const Tensor& tensor() const { return resp_.tensor(); }
 
   bool is_dead() const { return resp_.metadata().is_dead(); }
@@ -144,7 +153,7 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
 
   string src_worker_;
   string src_rel_device_;
-  WorkerInterface* wi_;
+  WorkerInterface* wi_;  // Not owned.
   AllocatorAttributes alloc_attrs_;
   Device* dst_device_;
   CallOptions opts_;
@@ -180,8 +189,8 @@ class RpcRecvTensorFreeList {
     return new RpcRecvTensorCall;
   }
 
-  void Release(RpcRecvTensorCall* obj, WorkerCacheInterface* wc) {
-    obj->Reset(wc);
+  void Release(RpcRecvTensorCall* obj) {
+    obj->Reset();
     {
       mutex_lock l(mu_);
       if (objects_.size() < kMaxObjects) {
@@ -220,6 +229,9 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
                          " is invalid remote source device.");
   }
   WorkerSession* sess = session();
+  // The worker will be released in a subsequent call to
+  // `sess->worker_cache->ReleaseWorker()` (if the call has not yet been
+  // initialized) or `call->ReleaseWorker()` (if it has been initialized).
   WorkerInterface* rwi = sess->worker_cache->CreateWorker(call->src_worker_);
   if (s.ok() && rwi == nullptr) {
     s = errors::Internal("No worker known as ", call->src_worker_);
@@ -233,7 +245,7 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
     if (rwi != nullptr) {
       sess->worker_cache->ReleaseWorker(call->src_worker_, rwi);
     }
-    get_call_freelist()->Release(call, sess->worker_cache.get());
+    get_call_freelist()->Release(call);
     done(s, Args(), recv_args, Tensor{}, false);
     return;
   }
@@ -246,10 +258,12 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
 
   // RendezvousMgr already aborted, shouldn't send RPC call any more
   if (!call->status().ok()) {
+    // NOTE: `*sess` can potentially be deleted before we return from
+    // `call->done()(...)`, so we must release the worker before calling the
+    // callback.
+    call->ReleaseWorker(sess->worker_cache.get());
     call->done()(call->status(), Args(), Args(), Tensor(), false);
-    session()->worker_cache->ReleaseWorker(call->src_worker_, call->wi_);
-    call->wi_ = nullptr;
-    get_call_freelist()->Release(call, session()->worker_cache.get());
+    get_call_freelist()->Release(call);
     return;
   }
 
@@ -261,10 +275,12 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
     // If StartAbort was called prior to DeregisterCall, then the
     // current status should be bad.
     Status s = call->status();
+    // NOTE: `*session()` can potentially be deleted before we return from
+    // `call->done()(...)`, so we must release the worker before calling the
+    // callback.
+    call->ReleaseWorker(session()->worker_cache.get());
     call->done()(s, Args(), call->recv_args(), call->tensor(), call->is_dead());
-    session()->worker_cache->ReleaseWorker(call->src_worker_, call->wi_);
-    call->wi_ = nullptr;
-    get_call_freelist()->Release(call, session()->worker_cache.get());
+    get_call_freelist()->Release(call);
     Unref();
   });
 }
diff --git a/tensorflow/core/distributed_runtime/tensor_coding.cc b/tensorflow/core/distributed_runtime/tensor_coding.cc
index fe2d1a12934dde814344b70f52fbc972f74347e0..6d20e7cfcada3e3396611143bfcb148ee2a8f0c2 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/tensor_coding.cc
@@ -68,13 +68,14 @@ Status TensorResponse::InitFrom(RecvTensorResponse* response) {
   return s;
 }
 
-void TensorResponse::InitPartial(const RecvTensorResponse& response) {
+void TensorResponse::InitPartial(const RecvTensorResponse& response,
+                                 const AllocationAttributes& allocation_attr) {
   // Everything except content is present in *response.  Content will
   // arrive later; allocate a Tensor with appropriate storage for that
   // content.
   meta_ = response;
   TensorShape shape(meta_.tensor().tensor_shape());
-  Tensor t(allocator_, meta_.tensor().dtype(), shape);
+  Tensor t(allocator_, meta_.tensor().dtype(), shape, allocation_attr);
   tensor_ = std::move(t);
 }
 
diff --git a/tensorflow/core/distributed_runtime/tensor_coding.h b/tensorflow/core/distributed_runtime/tensor_coding.h
index 4c34297990d399e4e42f5776cd23fb660c9090c5..86d95a30631493c713f24cbc2e04a09da80e00b8 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding.h
+++ b/tensorflow/core/distributed_runtime/tensor_coding.h
@@ -76,7 +76,8 @@ class TensorResponse {
 
   // Initialize tensor metadata from response and allocate
   // uninitialized backing storage for actual contents.
-  void InitPartial(const RecvTensorResponse& response);
+  void InitPartial(const RecvTensorResponse& response,
+                   const AllocationAttributes& allocation_attr);
 
   // Return a reference to the parsed tensor.  The tensor will remain
   // live only until *this is destroyed or modified.
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index e942191efe96cde305acf9eb4335cfd8038cb9b1..f21f76fec53d2deac4a0f6467c8744e086c637b7 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -26,14 +26,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-void AllocatorStats::Clear() {
-  this->num_allocs = 0;
-  this->bytes_in_use = 0;
-  this->max_bytes_in_use = 0;
-  this->max_alloc_size = 0;
-  this->bytes_limit = 0;
-}
-
 string AllocatorStats::DebugString() const {
   return strings::Printf(
       "Limit:        %20lld\n"
@@ -41,8 +33,8 @@ string AllocatorStats::DebugString() const {
       "MaxInUse:     %20lld\n"
       "NumAllocs:    %20lld\n"
       "MaxAllocSize: %20lld\n",
-      this->bytes_limit, this->bytes_in_use, this->max_bytes_in_use,
-      this->num_allocs, this->max_alloc_size);
+      this->bytes_limit ? *this->bytes_limit : 0, this->bytes_in_use,
+      this->peak_bytes_in_use, this->num_allocs, this->largest_alloc_size);
 }
 
 constexpr size_t Allocator::kAllocatorAlignment;
@@ -132,10 +124,10 @@ class CPUAllocator : public Allocator {
       mutex_lock l(mu_);
       ++stats_.num_allocs;
       stats_.bytes_in_use += alloc_size;
-      stats_.max_bytes_in_use =
-          std::max<int64>(stats_.max_bytes_in_use, stats_.bytes_in_use);
-      stats_.max_alloc_size =
-          std::max<int64>(stats_.max_alloc_size, alloc_size);
+      stats_.peak_bytes_in_use =
+          std::max<int64>(stats_.peak_bytes_in_use, stats_.bytes_in_use);
+      stats_.largest_alloc_size =
+          std::max<int64>(stats_.largest_alloc_size, alloc_size);
 
       if (stats_.bytes_in_use > TotalAllocationWarningBytes() &&
           total_allocation_warning_count_ < kMaxTotalAllocationWarnings) {
@@ -158,16 +150,16 @@ class CPUAllocator : public Allocator {
     port::AlignedFree(ptr);
   }
 
-  void GetStats(AllocatorStats* stats) override {
+  absl::optional<AllocatorStats> GetStats() override {
     mutex_lock l(mu_);
-    *stats = stats_;
+    return stats_;
   }
 
   void ClearStats() override {
     mutex_lock l(mu_);
     stats_.num_allocs = 0;
-    stats_.max_bytes_in_use = stats_.bytes_in_use;
-    stats_.max_alloc_size = 0;
+    stats_.peak_bytes_in_use = stats_.bytes_in_use;
+    stats_.largest_alloc_size = 0;
   }
 
   size_t AllocatedSizeSlow(const void* ptr) override {
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 3ded86e8e93ea484c9b742c0ac7837e35b388bdb..4dc5eaf16d7f3eb034e44898f61dab33ba4c8d82 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -20,6 +20,8 @@ limitations under the License.
 
 #include <limits>
 
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/framework/type_traits.h"
@@ -46,23 +48,31 @@ struct AllocationAttributes {
   // which Op is performing the allocation, and sets this flag to
   // true.
   bool allocation_will_be_logged = false;
+  // EXPERIMENTAL: If provided, then evaluates to a timing count such that only
+  // a memory chunk whose last-freed count is at this value or earlier may be
+  // returned.
+  std::function<uint64()> freed_by_func = nullptr;
 };
 
-// Runtime statistics collected by an allocator.
+// Runtime statistics collected by an allocator. Exactly the same as
+// stream_executor::AllocatorStats, but independently defined to preserve the
+// mutual independence of StreamExecutor and TensorFlow.
 struct AllocatorStats {
-  int64 num_allocs;        // Number of allocations.
-  int64 bytes_in_use;      // Number of bytes in use.
-  int64 max_bytes_in_use;  // The maximum bytes in use.
-  int64 max_alloc_size;    // The max single allocation seen.
+  int64 num_allocs;          // Number of allocations.
+  int64 bytes_in_use;        // Number of bytes in use.
+  int64 peak_bytes_in_use;   // The peak bytes in use.
+  int64 largest_alloc_size;  // The largest single allocation seen.
 
-  // The upper limit what the allocator can allocate, if such a limit
-  // is known. Certain allocator may return 0 to indicate the limit is
-  // unknown.
-  int64 bytes_limit;
+  // The upper limit of bytes of user allocatable device memory, if such a limit
+  // is known.
+  absl::optional<int64> bytes_limit;
 
-  AllocatorStats() { Clear(); }
+  AllocatorStats()
+      : num_allocs(0),
+        bytes_in_use(0),
+        peak_bytes_in_use(0),
+        largest_alloc_size(0) {}
 
-  void Clear();
   string DebugString() const;
 };
 
@@ -194,7 +204,7 @@ class Allocator {
   }
 
   // Fills in 'stats' with statistics collected by this allocator.
-  virtual void GetStats(AllocatorStats* stats) { stats->Clear(); }
+  virtual absl::optional<AllocatorStats> GetStats() { return absl::nullopt; }
 
   // Clears the internal stats except for the `in_use` field.
   virtual void ClearStats() {}
diff --git a/tensorflow/core/framework/allocator_registry.h b/tensorflow/core/framework/allocator_registry.h
index 9dc74345dab8d075809d586ee1c5e86fe9acb515..d9f3280c62d7c1a4a2bb7a3de117768f836653af 100644
--- a/tensorflow/core/framework/allocator_registry.h
+++ b/tensorflow/core/framework/allocator_registry.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/numa.h"
 
diff --git a/tensorflow/core/framework/allocator_test.cc b/tensorflow/core/framework/allocator_test.cc
index a409cb2de7fbae20f435f464ca07155a36fede4a..85e8ba6a71b7760b004b9d2ebbc425ddff5fbf17 100644
--- a/tensorflow/core/framework/allocator_test.cc
+++ b/tensorflow/core/framework/allocator_test.cc
@@ -25,20 +25,23 @@ limitations under the License.
 namespace tensorflow {
 
 static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
-                       int64 max_bytes_in_use, int64 max_alloc_size) {
-  AllocatorStats stats;
-  a->GetStats(&stats);
-  LOG(INFO) << "Alloc stats: \n" << stats.DebugString();
+                       int64 peak_bytes_in_use, int64 largest_alloc_size) {
+  absl::optional<AllocatorStats> stats = a->GetStats();
+  EXPECT_TRUE(stats);
+  if (!stats) {
+    return;
+  }
+  LOG(INFO) << "Alloc stats: \n" << stats->DebugString();
 #if defined(PLATFORM_GOOGLE) && defined(NDEBUG)
   // NOTE: allocator stats expectation depends on the system malloc,
   // and can vary as that changes.
   static const int64 kSlop = 5 * 1024;
-  EXPECT_GT(stats.bytes_in_use, bytes_in_use - kSlop);
-  EXPECT_LT(stats.bytes_in_use, bytes_in_use + kSlop);
-  EXPECT_GT(stats.max_bytes_in_use, max_bytes_in_use - kSlop);
-  EXPECT_LT(stats.max_bytes_in_use, max_bytes_in_use + kSlop);
-  EXPECT_EQ(stats.num_allocs, num_allocs);
-  EXPECT_EQ(stats.max_alloc_size, max_alloc_size);
+  EXPECT_GT(stats->bytes_in_use, bytes_in_use - kSlop);
+  EXPECT_LT(stats->bytes_in_use, bytes_in_use + kSlop);
+  EXPECT_GT(stats->peak_bytes_in_use, peak_bytes_in_use - kSlop);
+  EXPECT_LT(stats->peak_bytes_in_use, peak_bytes_in_use + kSlop);
+  EXPECT_EQ(stats->num_allocs, num_allocs);
+  EXPECT_EQ(stats->largest_alloc_size, largest_alloc_size);
 #endif
 }
 
diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc
index 79966f06922a62c7d04648f4a2829d05861cd76b..43b435270c49087b43ce101991686e8c9c069de2 100644
--- a/tensorflow/core/framework/attr_value_util.cc
+++ b/tensorflow/core/framework/attr_value_util.cc
@@ -54,9 +54,7 @@ uint64 TensorProtoHash(const TensorProto& tp) {
   DCHECK(success);
   TensorProto p;
   tensor.AsProtoTensorContent(&p);
-  string s;
-  SerializeToStringDeterministic(p, &s);
-  return Hash64(s);
+  return DeterministicProtoHash64(p);
 }
 
 // Do not create large tensors in memory, compute hash based on TensorProto
@@ -64,12 +62,8 @@ uint64 TensorProtoHash(const TensorProto& tp) {
 // different hash code if they are defined with different TensorProto
 // representations.
 uint64 FastTensorProtoHash(const TensorProto& tp) {
-  string s;
   if (TensorByteSize(tp) > kMaxAttrValueTensorByteSize) {
-    string s;
-    bool success = SerializeToStringDeterministic(tp, &s);
-    DCHECK(success);
-    return Hash64(s);
+    return DeterministicProtoHash64(tp);
   } else {
     return TensorProtoHash(tp);
   }
@@ -95,11 +89,7 @@ bool AreTensorProtosEqual(const TensorProto& lhs, const TensorProto& rhs) {
   TensorProto rhs_tp;
   rhs_t.AsProtoTensorContent(&rhs_tp);
 
-  string lhs_str, rhs_str;
-  SerializeToStringDeterministic(lhs_tp, &lhs_str);
-  SerializeToStringDeterministic(rhs_tp, &rhs_str);
-
-  return lhs_str == rhs_str;
+  return AreSerializedProtosEqual(lhs_tp, rhs_tp);
 }
 
 // Do not construct large tensors in memory, compare equality using TensorProto
@@ -139,9 +129,7 @@ uint64 AttrValueHash(const AttrValue& a, const TensorProtoHasher& tensor_hash) {
   }
 
   // If `a` is not a tensor or func, get a hash of serialized string.
-  string s;
-  SerializeToStringDeterministic(a, &s);
-  return Hash64(s);
+  return DeterministicProtoHash64(a);
 }
 
 bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b,
@@ -175,10 +163,7 @@ bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b,
 
   // All other fields in AttrValue have deterministic representations.
   // It is safe to compare their serialized strings.
-  string a_str, b_str;
-  SerializeToStringDeterministic(a, &a_str);
-  SerializeToStringDeterministic(b, &b_str);
-  return a_str == b_str;
+  return AreSerializedProtosEqual(a, b);
 }
 
 string SummarizeString(const string& str) {
diff --git a/tensorflow/core/framework/cancellation.cc b/tensorflow/core/framework/cancellation.cc
index af59500aee32d83dadb7cf94f6d277819f6c65c4..7f639b5ca9a5fd6219b900f08965acaf2c6ee923 100644
--- a/tensorflow/core/framework/cancellation.cc
+++ b/tensorflow/core/framework/cancellation.cc
@@ -27,6 +27,12 @@ CancellationManager::CancellationManager()
       is_cancelled_(false),
       next_cancellation_token_(0) {}
 
+void CancellationManager::Reset() {
+  mutex_lock l(mu_);
+  is_cancelling_ = false;
+  is_cancelled_.store(false);
+}
+
 void CancellationManager::StartCancel() {
   gtl::FlatMap<CancellationToken, CancelCallback> callbacks_to_run;
   {
diff --git a/tensorflow/core/framework/cancellation.h b/tensorflow/core/framework/cancellation.h
index 7a5d9424867d35a4ca07e690230c73afff0b2940..51b200423ec11fba771d233e6985c62708f901ac 100644
--- a/tensorflow/core/framework/cancellation.h
+++ b/tensorflow/core/framework/cancellation.h
@@ -56,6 +56,9 @@ class CancellationManager {
   // Returns true iff StartCancel() has been called.
   bool IsCancelled() { return is_cancelled_.load(std::memory_order_acquire); }
 
+  // Resets the cancellation manager to its original pre-cancelled state.
+  void Reset();
+
   // Returns a token that must be used in calls to RegisterCallback
   // and DeregisterCallback.
   CancellationToken get_cancellation_token();
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 546e3938a828a1007de43c2bdc188eee174c911e..e00cc17961cb89cfdad8d33cbca758d80a5ca274 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -42,6 +42,7 @@ class Tensor;
 enum CollectiveType {
   REDUCTION_COLLECTIVE = 0,
   BROADCAST_COLLECTIVE,
+  GATHER_COLLECTIVE,
   UNDEFINED_COLLECTIVE,
 };
 
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 83bc95065e00dfb37a6a732f668270743ac3ab51..5c974a76aca76f14ef166d285733c5e2f9ad723b 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -1303,6 +1303,12 @@ Status ConcatV2Shape(InferenceContext* c) {
                            c->num_inputs() - 1 /* dim_index */);
 }
 
+Status QuantizedConcatV2Shape(InferenceContext* c, int num_inputs_to_concat) {
+  return ConcatShapeHelper(c, 0 /* start_value_index */,
+                           num_inputs_to_concat /* end_value_index */,
+                           num_inputs_to_concat /* dim_index */);
+}
+
 Status BroadcastBinaryOpOutputShapeFnHelper(InferenceContext* c,
                                             ShapeHandle shape_x,
                                             ShapeHandle shape_y,
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index 14b9688bdc5d41e8cb2e92b1f1a8640fb9687d8c..d421844ee607b18132f4657e7562dec04253c2fa 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -279,6 +279,8 @@ Status ConcatShape(shape_inference::InferenceContext* c,
 // Shape function for concat operations.
 Status ConcatV2Shape(shape_inference::InferenceContext* c);
 
+Status QuantizedConcatV2Shape(InferenceContext* c, int num_inputs_to_concat);
+
 // Shape function for binary operators that broadcast their inputs
 // and with output to output_index.
 // Note: out cannot be NULL.
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 321947aca8e06008c3291fa43befa389b53f998c..89ba662b69b060b1b76a0a22630acd4ecb80bed6 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -246,6 +246,15 @@ class DeviceBase {
     return errors::Internal("Device does not implement MakeTensorFromProto()");
   }
 
+  // Some devices (i.e. GPUs) may free device memory prior to its actual use
+  // being completed on the assumption that subsequent allocations can only be
+  // used serially with respect to pending uses.  If this function returns a
+  // non-zero value it is the value of a device-specific counter such that any
+  // device memory tagged with an earlier freed-at count is really unencumbered
+  // by pending uses.  For this to be useful the device memory allocator must
+  // be tagging deallocated memory chunks using the same counter.
+  virtual uint64 SafeAllocFrontier() { return 0; }
+
  protected:
   // Does not take ownership.
   void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) {
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index b7adfd0c947b60ff9295c867f4afdf756208b126..35d04eb7278f35ff4118c082adf5f7a320cf923c 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -569,6 +569,9 @@ string Print(const FunctionDef& fdef) {
   for (const auto& n : fdef.node_def()) {
     strings::StrAppend(&out, "  ", Print(n), "\n");
   }
+  for (const auto& cr : fdef.control_ret()) {
+    strings::StrAppend(&out, "  @return ", cr.first, " = ", cr.second, "\n");
+  }
   for (const auto& r : fdef.ret()) {
     strings::StrAppend(&out, "  return ", r.first, " = ", r.second, "\n");
   }
@@ -825,6 +828,12 @@ bool FunctionDefsEqual(const FunctionDef& f1, const FunctionDef& f2) {
   std::map<string, string> ret2(f2.ret().begin(), f2.ret().end());
   if (ret1 != ret2) return false;
 
+  std::map<string, string> control_ret1(f1.control_ret().begin(),
+                                        f1.control_ret().end());
+  std::map<string, string> control_ret2(f2.control_ret().begin(),
+                                        f2.control_ret().end());
+  if (control_ret1 != control_ret2) return false;
+
   return true;
 }
 
@@ -849,6 +858,14 @@ uint64 FunctionDefHash(const FunctionDef& fdef) {
     h = Hash64(p.second.data(), p.second.size(), h);
   }
 
+  // control output names
+  std::map<string, string> control_ret(fdef.control_ret().begin(),
+                                       fdef.control_ret().end());
+  for (const auto& p : control_ret) {
+    h = Hash64(p.first.data(), p.first.size(), h);
+    h = Hash64(p.second.data(), p.second.size(), h);
+  }
+
   return h;
 }
 
@@ -1339,7 +1356,7 @@ GET_ATTR(bool)
 
 namespace {
 
-constexpr char kExperimentalApiImplements[] = "experimental_api_implements";
+constexpr char kApiImplements[] = "api_implements";
 
 absl::flat_hash_set<string> ReachableFunctions(
     const FunctionLibraryDefinition& flib,
@@ -1347,10 +1364,10 @@ absl::flat_hash_set<string> ReachableFunctions(
   // Functions that are reachable from the graph.
   absl::flat_hash_set<string> reachable_funcs;
 
-  // For any functions, if it has attribute "experimental_api_implements" =
+  // For any functions, if it has attribute "api_implements" =
   // "some_interface" and it is reachable, then it means any other
   // function with same attribute name and value could also be potentially
-  // reachable, eg via experimental_implementation_selector swapping the
+  // reachable, eg via implementation_selector swapping the
   // nodedef.
   absl::flat_hash_set<string> reachable_api_interface;
 
@@ -1400,7 +1417,7 @@ absl::flat_hash_set<string> ReachableFunctions(
     const string& func_name = func->signature().name();
     reachable_funcs.insert(func_name);
 
-    const auto attr_it = func->attr().find(kExperimentalApiImplements);
+    const auto attr_it = func->attr().find(kApiImplements);
     if (attr_it != func->attr().end()) {
       reachable_api_interface.insert(attr_it->second.s());
     }
@@ -1416,7 +1433,7 @@ absl::flat_hash_set<string> ReachableFunctions(
 
   for (const auto& func_name : flib.ListFunctionNames()) {
     const auto& func_def = flib.Find(func_name);
-    const auto attr_it = func_def->attr().find(kExperimentalApiImplements);
+    const auto attr_it = func_def->attr().find(kApiImplements);
     if (attr_it != func_def->attr().end()) {
       if (reachable_api_interface.contains(attr_it->second.s())) {
         reachable_funcs.insert(func_name);
@@ -1512,7 +1529,8 @@ FunctionDef FunctionDefHelper::Create(
     const string& function_name, gtl::ArraySlice<string> in_def,
     gtl::ArraySlice<string> out_def, gtl::ArraySlice<string> attr_def,
     gtl::ArraySlice<Node> node_def,
-    gtl::ArraySlice<std::pair<string, string>> ret_def) {
+    gtl::ArraySlice<std::pair<string, string>> ret_def,
+    gtl::ArraySlice<std::pair<string, string>> control_ret_def) {
   FunctionDef fdef;
 
   // Signature
@@ -1520,6 +1538,7 @@ FunctionDef FunctionDefHelper::Create(
   for (const auto& i : in_def) b.Input(i);
   for (const auto& o : out_def) b.Output(o);
   for (const auto& a : attr_def) b.Attr(a);
+  for (const auto& c : control_ret_def) b.ControlOutput(c.first);
 
   OpRegistrationData op_reg_data;
   TF_CHECK_OK(b.Finalize(&op_reg_data));
@@ -1535,6 +1554,11 @@ FunctionDef FunctionDefHelper::Create(
     fdef.mutable_ret()->insert({r.first, r.second});
   }
 
+  // Control returns
+  for (const auto& cr : control_ret_def) {
+    fdef.mutable_control_ret()->insert({cr.first, cr.second});
+  }
+
   auto* op_def_registry = OpRegistry::Global();
   // Check if any op is stateful.
   for (const auto& n : node_def) {
@@ -1550,6 +1574,16 @@ FunctionDef FunctionDefHelper::Create(
   return fdef;
 }
 
+/* static */
+FunctionDef FunctionDefHelper::Create(
+    const string& function_name, gtl::ArraySlice<string> in_def,
+    gtl::ArraySlice<string> out_def, gtl::ArraySlice<string> attr_def,
+    gtl::ArraySlice<Node> node_def,
+    gtl::ArraySlice<std::pair<string, string>> ret_def) {
+  return Create(function_name, in_def, out_def, attr_def, node_def, ret_def,
+                /*control_ret_def=*/{});
+}
+
 /* static */
 FunctionDef FunctionDefHelper::Define(const string& name,
                                       gtl::ArraySlice<string> arg_def,
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 79755f599cfc80fa3ccdbadc83cef65667d07250..4f0d595ed3bca0c9c4430512e8b42203f2b56de9 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -122,9 +122,23 @@ class FunctionDefHelper {
     NodeDef ToNodeDef() const;
   };
 
-  // The Create() function uses the new NodeDef field.  `ret_def`
-  // holds a mapping from the function output names from `out_def` to
-  // the node outputs from `node_def`.
+  // Creates a FunctionDef from the given parameters. Node inputs must use
+  // function encoding (node_name:output_name[:output_index]).
+  // - `ret_def` holds a mapping from the function output names from `out_def`
+  //   to the node outputs from `node_def`.
+  // - `control_ret_def` holds a mapping from the function control
+  //   output names to the nodes from `node_def`.
+  static FunctionDef Create(
+      const string& function_name, gtl::ArraySlice<string> in_def,
+      gtl::ArraySlice<string> out_def, gtl::ArraySlice<string> attr_def,
+      gtl::ArraySlice<Node> node_def,
+      gtl::ArraySlice<std::pair<string, string>> ret_def,
+      gtl::ArraySlice<std::pair<string, string>> control_ret_def);
+
+  // Creates a FunctionDef from the given parameters. Node inputs must use
+  // function encoding (node_name:output_name[:output_index]).
+  // - `ret_def` holds a mapping from the function output names from `out_def`
+  //   to the node outputs from `node_def`.
   static FunctionDef Create(const string& function_name,
                             gtl::ArraySlice<string> in_def,
                             gtl::ArraySlice<string> out_def,
@@ -132,7 +146,6 @@ class FunctionDefHelper {
                             gtl::ArraySlice<Node> node_def,
                             gtl::ArraySlice<std::pair<string, string>> ret_def);
 
-  // The two Define() functions use the old FunctionDef::Node field.
   // TODO(josh11b): Get rid of these and transition to the one above.
   static FunctionDef Define(const string& function_name,
                             gtl::ArraySlice<string> arg_def,
@@ -554,6 +567,7 @@ class FunctionLibraryRuntime {
     // If provided, this optimization function will be invoked before
     // the placer for multi-device functions.
     std::function<Status(std::vector<string> /*ret_node_names*/,
+                         std::vector<string> /*keep_node_names*/,
                          FunctionLibraryDefinition*, const DeviceSet&,
                          Device* /*cpu_device*/, std::unique_ptr<Graph>*)>
         optimize_graph_fn;
diff --git a/tensorflow/core/framework/function.proto b/tensorflow/core/framework/function.proto
index e69d3938d93d109a7cb0c940e8f981d30f464599..64f406bfd73c847e64d58553143aa91b2dc5f424 100644
--- a/tensorflow/core/framework/function.proto
+++ b/tensorflow/core/framework/function.proto
@@ -76,6 +76,10 @@ message FunctionDef {
   // A mapping from the output arg names from `signature` to the
   // outputs from `node_def` that should be returned by the function.
   map<string, string> ret = 4;
+
+  // A mapping from control output names from `signature` to node names in
+  // `node_def` which should be control outputs of this function.
+  map<string, string> control_ret = 6;
 }
 
 // GradientDef defines the gradient function of a function defined in
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 6a828e9afaaec536d4d5ef51d50dec88fdd6d391..6fbbabfc95d13d7574f578ddd05f9887435aa0d1 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -156,6 +156,48 @@ ControlDep(x:int32) -> (y:int32) {
   EXPECT_EQ(DebugString(result.nodes), e2);
 }
 
+TEST(TFunc, ControlRet) {
+  auto fdef = FDH::Create(
+      // Name
+      "ControlRet",
+      // Inputs
+      {"x: int32"},
+      // Outputs
+      {"y: int32"},
+      // Attrs
+      {},
+      // Nodes
+      {
+          {{"a"}, "Identity", {"x"}, {{"T", DT_INT32}}},
+      },
+      // Returns
+      {{"y", "a:output:0"}},
+      // Control returns
+      {{"must_execute", "a"}});
+
+  const char* e = R"P(
+ControlRet(x:int32) -> (y:int32) {
+  a = Identity[T=int32](x)
+  @return must_execute = a
+  return y = a:output:0
+}
+)P";
+  EXPECT_EQ(DebugString(fdef), e);
+
+  // Instantiate one with T=float
+  InstantiationResult result;
+  TF_ASSERT_OK(
+      InstantiateFunction(fdef, Attrs({{"T", DT_FLOAT}}), GetOpSig, &result));
+  const char* e2 = R"P(
+(x:int32) -> (a:int32) {
+  a = Identity[T=int32](x)
+}
+)P";
+  EXPECT_EQ(result.arg_types, DataTypeVector({DT_INT32}));
+  EXPECT_EQ(result.ret_types, DataTypeVector({DT_INT32}));
+  EXPECT_EQ(DebugString(result.nodes), e2);
+}
+
 REGISTER_OP("HasDefaultType")
     .Output("out: T")
     .Attr("T: {float, double, int32, int64} = DT_FLOAT");
@@ -1320,7 +1362,7 @@ TEST(FunctionLibraryDefinitionTest, ReachableDefinitions) {
 
     if (!interface_name.empty()) {
       auto* attr = func_def.mutable_attr();
-      (*attr)["experimental_api_implements"].set_s(interface_name);
+      (*attr)["api_implements"].set_s(interface_name);
     }
     return func_def;
   };
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index e369e882a0961e60d20f52e0155e9738bf16415e..fee52375c139ada0e457efe1247a18d471e8aa46 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -515,10 +515,13 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
           ". (Check whether your GraphDef-interpreting binary is up to date "
           "with your GraphDef-generating binary.).");
     }
-    TF_RETURN_WITH_CONTEXT_IF_ERROR(
-        ValidateAttrValue(attr.second, *iter->second),
-        "; NodeDef: ", FormatNodeDefForError(node_def), "; ",
-        SummarizeOpDef(op_def));
+    // If attr value is placeholder, do not check it.
+    if (attr.second.placeholder().empty()) {
+      TF_RETURN_WITH_CONTEXT_IF_ERROR(
+          ValidateAttrValue(attr.second, *iter->second),
+          "; NodeDef: ", FormatNodeDefForError(node_def), "; ",
+          SummarizeOpDef(op_def));
+    }
     // Keep track of which attr names have (not) been found in the NodeDef.
     op_attrs.erase(iter);
   }
diff --git a/tensorflow/core/framework/op.cc b/tensorflow/core/framework/op.cc
index b8309eafb05251235bfaaa7b5489cac06f0024dc..b29d7ae77f031a9fff0dfa6280a43dba75f4ab71 100644
--- a/tensorflow/core/framework/op.cc
+++ b/tensorflow/core/framework/op.cc
@@ -60,6 +60,21 @@ void OpRegistry::Register(const OpRegistrationDataFactory& op_data_factory) {
 
 Status OpRegistry::LookUp(const string& op_type_name,
                           const OpRegistrationData** op_reg_data) const {
+  {
+    tf_shared_lock l(mu_);
+    if (initialized_) {
+      if (const OpRegistrationData* res =
+              gtl::FindWithDefault(registry_, op_type_name, nullptr)) {
+        *op_reg_data = res;
+        return Status::OK();
+      }
+    }
+  }
+  return LookUpSlow(op_type_name, op_reg_data);
+}
+
+Status OpRegistry::LookUpSlow(const string& op_type_name,
+                              const OpRegistrationData** op_reg_data) const {
   *op_reg_data = nullptr;
   const OpRegistrationData* res = nullptr;
 
diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h
index 81ed5f95f0bf020780f1d71692388885ce702b70..538ce04ef44f591c7090489f7723121ee362e54f 100644
--- a/tensorflow/core/framework/op.h
+++ b/tensorflow/core/framework/op.h
@@ -144,6 +144,9 @@ class OpRegistry : public OpRegistryInterface {
   Status RegisterAlreadyLocked(const OpRegistrationDataFactory& op_data_factory)
       const EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
+  Status LookUpSlow(const string& op_type_name,
+                    const OpRegistrationData** op_reg_data) const;
+
   mutable mutex mu_;
   // Functions in deferred_ may only be called with mu_ held.
   mutable std::vector<OpRegistrationDataFactory> deferred_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/framework/op_def.proto b/tensorflow/core/framework/op_def.proto
index aea2d2bb09a2c2c80ae02b10b1222d6882606c3c..e44ecc9f6236210b3bcb21a4914243741c632d2c 100644
--- a/tensorflow/core/framework/op_def.proto
+++ b/tensorflow/core/framework/op_def.proto
@@ -54,6 +54,10 @@ message OpDef {
   // Description of the output(s).
   repeated ArgDef output_arg = 3;
 
+  // Named control outputs for this operation. Useful only for composite
+  // operations (i.e. functions) which want to name different control outputs.
+  repeated string control_output = 20;
+
   // Description of the graph-construction-time configuration of this
   // Op.  That is to say, this describes the attr fields that will
   // be specified in the NodeDef.
diff --git a/tensorflow/core/framework/op_def_builder.cc b/tensorflow/core/framework/op_def_builder.cc
index 8a9bb6318211ad1537727d7e60945897c4a9a63d..0a62a2e871ab1bfeea4c7cbc14e93173bbc1a3c1 100644
--- a/tensorflow/core/framework/op_def_builder.cc
+++ b/tensorflow/core/framework/op_def_builder.cc
@@ -316,6 +316,14 @@ bool ConsumeInOutTimesType(StringPiece* sp, StringPiece* out) {
       .GetResult(sp, out);
 }
 
+bool ConsumeControlOutName(StringPiece* sp, StringPiece* out) {
+  return Scanner(*sp)
+      .One(Scanner::LETTER)
+      .Any(Scanner::LETTER_DIGIT_UNDERSCORE)
+      .StopCapture()
+      .GetResult(sp, out);
+}
+
 #define VERIFY(expr, ...)                                             \
   do {                                                                \
     if (!(expr)) {                                                    \
@@ -409,6 +417,25 @@ void FinalizeInputOrOutput(StringPiece spec, bool is_output, OpDef* op_def,
 
 #undef VERIFY
 
+string ControlOutError(StringPiece orig, const string& op_name) {
+  return strings::StrCat(" from ControlOutput(\"", orig, "\") for Op ",
+                         op_name);
+}
+
+void FinalizeControlOutput(StringPiece name, OpDef* op_def,
+                           std::vector<string>* errors) {
+  StringPiece orig(name);
+
+  // Parse control output name.
+  StringPiece tmp_name;
+  if (!ConsumeControlOutName(&orig, &tmp_name)) {
+    errors->push_back(strings::StrCat("Trouble parsing 'name:'",
+                                      ControlOutError(orig, op_def->name())));
+  }
+
+  *op_def->add_control_output() = string(tmp_name.data(), tmp_name.size());
+}
+
 int num_leading_spaces(StringPiece s) {
   size_t i = 0;
   while (i < s.size() && s[i] == ' ') {
@@ -545,6 +572,11 @@ OpDefBuilder& OpDefBuilder::Output(string spec) {
   return *this;
 }
 
+OpDefBuilder& OpDefBuilder::ControlOutput(string name) {
+  control_outputs_.push_back(std::move(name));
+  return *this;
+}
+
 #ifndef TF_LEAN_BINARY
 OpDefBuilder& OpDefBuilder::Doc(string text) {
   if (!doc_.empty()) {
@@ -614,6 +646,9 @@ Status OpDefBuilder::Finalize(OpRegistrationData* op_reg_data) const {
   for (StringPiece output : outputs_) {
     FinalizeInputOrOutput(output, true, op_def, &errors);
   }
+  for (StringPiece control_output : control_outputs_) {
+    FinalizeControlOutput(control_output, op_def, &errors);
+  }
   FinalizeDoc(doc_, op_def, &errors);
 
   if (errors.empty()) return Status::OK();
diff --git a/tensorflow/core/framework/op_def_builder.h b/tensorflow/core/framework/op_def_builder.h
index 8077b20598c210d9266c168569f3d9a3a190c097..38d3f5cfc608d19b90b56b648b2ffb6bccbdd8f3 100644
--- a/tensorflow/core/framework/op_def_builder.h
+++ b/tensorflow/core/framework/op_def_builder.h
@@ -28,6 +28,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+class FunctionDefHelper;
+
 namespace shape_inference {
 class InferenceContext;
 }
@@ -150,12 +152,20 @@ class OpDefBuilder {
   Status Finalize(OpRegistrationData* op_reg_data) const;
 
  private:
+  friend class FunctionDefHelper;
+
+  // Adds control output to this OpDefBuilder (and returns *this).
+  // The <name> must be a valid node name (matches regexp
+  // [a-zA-Z][a-zA-Z0-9_]*). Named control output can only exist for functions.
+  OpDefBuilder& ControlOutput(string name);
+
   OpDef* op_def() { return &op_reg_data_.op_def; }
 
   OpRegistrationData op_reg_data_;
   std::vector<string> attrs_;
   std::vector<string> inputs_;
   std::vector<string> outputs_;
+  std::vector<string> control_outputs_;
   string doc_;
   std::vector<string> errors_;
 };
diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc
index 3597f43d51987b0d46df90ad0db964927f16adf0..9c47ac0f017779fccfa40ab521a161e83fd1e7df 100644
--- a/tensorflow/core/framework/op_def_util.cc
+++ b/tensorflow/core/framework/op_def_util.cc
@@ -114,6 +114,8 @@ Status ValidateAttrValue(const AttrValue& attr_value,
         length = attr_value.list().shape_size();
       } else if (attr.type() == "list(tensor)") {
         length = attr_value.list().tensor_size();
+      } else if (attr.type() == "list(func)") {
+        length = attr_value.list().func_size();
       }
       if (length < attr.minimum()) {
         return errors::InvalidArgument(
@@ -833,25 +835,37 @@ bool OpDefEqual(const OpDef& o1, const OpDef& o2) {
   // Compare it separately here instead of serializing below.
   if (!RepeatedAttrDefEqual(o1.attr(), o2.attr())) return false;
 
-  // Clear attr field, serialize, and compare serialized strings
+  // `control_output` order doesn't matter.
+  std::set<string> control_output1(o1.control_output().begin(),
+                                   o1.control_output().end());
+  std::set<string> control_output2(o2.control_output().begin(),
+                                   o2.control_output().end());
+  if (control_output1 != control_output2) return false;
+
+  // Clear `attr` and `control_output` fields, serialize, and compare serialized
+  // strings.
   OpDef o1_copy = o1;
   OpDef o2_copy = o2;
   o1_copy.clear_attr();
+  o1_copy.clear_control_output();
   o2_copy.clear_attr();
-  string s1, s2;
-  SerializeToStringDeterministic(o1_copy, &s1);
-  SerializeToStringDeterministic(o2_copy, &s2);
-  if (s1 != s2) return false;
-  return true;
+  o2_copy.clear_control_output();
+
+  return AreSerializedProtosEqual(o1_copy, o2_copy);
 }
 
 uint64 OpDefHash(const OpDef& o) {
   uint64 h = RepeatedAttrDefHash(o.attr());
+
+  // Compute deterministic order-independent control outputs hash.
+  std::set<string> control_output(o.control_output().begin(),
+                                  o.control_output().end());
+  for (const auto& co : control_output) h = Hash64Combine(h, Hash64(co));
+
   OpDef o_copy = o;
   o_copy.clear_attr();
-  string s;
-  SerializeToStringDeterministic(o_copy, &s);
-  return Hash64(s.data(), s.size(), h);
+  o_copy.clear_control_output();
+  return DeterministicProtoHash64(o_copy, h);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 789f0fda7526fadc667e51046a344062a9532670..16ca40c31c73e0cab9cab408d59ac230b95e6cde 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -20,6 +20,9 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include <cstdlib>
+#include <cstring>
+
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
@@ -998,6 +1001,12 @@ static Status IsProbablySafeToLoad(const string& path) {
 
 void LoadDynamicKernelsInternal() {
   Env* env = Env::Default();
+
+  // Override to allow loading unsafe packages for development.
+  // DO NOT USE UNLESS YOU KNOW WHAT ABI ISSUES YOU CAN ENCOUNTER.
+  bool override_abi_check =
+      strcmp(getenv("TF_REALLY_LOAD_UNSAFE_PACKAGES"), "1") == 0;
+
   string bazel_kernel_dir = io::JoinPath(env->GetRunfilesDir(),
                                          "tensorflow",
                                          "core",
@@ -1010,7 +1019,12 @@ void LoadDynamicKernelsInternal() {
       string fullpath = io::JoinPath(bazel_kernel_dir, file);
       if (env->MatchPath(fullpath, dll_spec)) {
         Status s = IsProbablySafeToLoad(fullpath);
-        if (s.ok()) {
+        if (!s.ok() && override_abi_check) {
+          LOG(WARNING) << "Loading UNSAFE library " << fullpath
+                       << " because ABI check override is set: "
+                       << s.error_message();
+        }
+        if (s.ok() || override_abi_check) {
           // TODO(gunan): Store the handles to the opened files.
           void* unused_filehandle;
           TF_CHECK_OK(env->LoadLibrary(fullpath.c_str(), &unused_filehandle));
@@ -1074,6 +1088,11 @@ void OpKernelRegistrar::InitInternal(const KernelDef* kernel_def,
   delete kernel_def;
 }
 
+OpKernel* OpKernelRegistrar::PtrOpKernelFactory::Create(
+    OpKernelConstruction* context) {
+  return (*create_func_)(context);
+}
+
 }  // namespace kernel_factory
 
 namespace {
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 06b90964ad1f7e4c8047f79ec37bee097327be9a..ff0b44650ce5e306c73fc7c1a37d4d6ec2b3e6c2 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -525,11 +525,42 @@ struct TensorValue {
 // Used to store partitioned graphs from function-calling ops.
 struct GraphCollector {
   mutex mu;
-  std::vector<GraphDef> graphs GUARDED_BY(mu);
+  std::vector<GraphDef> partitioned_graphs GUARDED_BY(mu);
+  GraphDef raw_graph GUARDED_BY(mu);
+  GraphDef optimized_graph GUARDED_BY(mu);
 
-  void CollectGraph(const GraphDef& graph) {
+  bool dirty GUARDED_BY(mu);
+
+  GraphCollector() : dirty(false) {}
+
+  void CollectRawGraph(const GraphDef& graph) {
+    mutex_lock ml(mu);
+    raw_graph.MergeFrom(graph);
+    dirty = true;
+  }
+
+  void CollectOptimizedGraph(const GraphDef& graph) {
     mutex_lock ml(mu);
-    graphs.push_back(graph);
+    optimized_graph.MergeFrom(graph);
+    dirty = true;
+  }
+
+  void CollectPartitionedGraph(const GraphDef& graph) {
+    mutex_lock ml(mu);
+    partitioned_graphs.push_back(graph);
+    dirty = true;
+  }
+
+  void ClearGraphs() EXCLUSIVE_LOCKS_REQUIRED(mu) {
+    raw_graph.Clear();
+    optimized_graph.Clear();
+    partitioned_graphs.clear();
+    dirty = false;
+  }
+
+  bool HasUpdatedGraphs() {
+    mutex_lock ml(mu);
+    return dirty;
   }
 };
 
@@ -1436,23 +1467,21 @@ class OpKernelRegistrar {
     // Perform the check in the header to allow compile-time optimization
     // to a no-op, allowing the linker to remove the kernel symbols.
     if (kernel_def != nullptr) {
-      struct PtrOpKernelFactory : public OpKernelFactory {
-        explicit PtrOpKernelFactory(
-            OpKernel* (*create_func)(OpKernelConstruction*))
-            : create_func_(create_func) {}
-
-        OpKernel* Create(OpKernelConstruction* context) override {
-          return (*create_func_)(context);
-        }
-
-        OpKernel* (*create_func_)(OpKernelConstruction*);
-      };
       InitInternal(kernel_def, kernel_class_name,
                    absl::make_unique<PtrOpKernelFactory>(create_fn));
     }
   }
 
  private:
+  struct PtrOpKernelFactory : public OpKernelFactory {
+    explicit PtrOpKernelFactory(OpKernel* (*create_func)(OpKernelConstruction*))
+        : create_func_(create_func) {}
+
+    OpKernel* Create(OpKernelConstruction* context) override;
+
+    OpKernel* (*create_func_)(OpKernelConstruction*);
+  };
+
   void InitInternal(const KernelDef* kernel_def, StringPiece kernel_class_name,
                     std::unique_ptr<OpKernelFactory> factory);
 };
diff --git a/tensorflow/core/framework/reader_base.cc b/tensorflow/core/framework/reader_base.cc
index f84ef0f953cf23e3fb2af210706586f95cfbb8ad..ed4ff240393eab495e04b85d80b25377c578ac1e 100644
--- a/tensorflow/core/framework/reader_base.cc
+++ b/tensorflow/core/framework/reader_base.cc
@@ -241,7 +241,7 @@ Status ReaderBase::RestoreBaseState(const ReaderBaseState& state) {
   num_records_produced_ = state.num_records_produced();
   work_ = state.current_work();
   if (work_started_ < 0 || work_finished_ < 0 || num_records_produced_ < 0) {
-#ifdef __ANDROID__
+#if defined(__ANDROID__) || defined(__EMSCRIPTEN__)
     const string debug_string = "<debug state not available>";
 #else
     const string debug_string = state.DebugString();
@@ -251,7 +251,7 @@ Status ReaderBase::RestoreBaseState(const ReaderBaseState& state) {
         debug_string);
   }
   if (work_started_ > work_finished_) {
-#ifdef __ANDROID__
+#if defined(__ANDROID__) || (__EMSCRIPTEN__)
     const string debug_string = "<debug state not available>";
 #else
     const string debug_string = state.DebugString();
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index 9c381e7d6b4e909689591d3a75bfabbecd886a0d..da547d5829f846ae87857c410d731bcc9457cd3b 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -89,9 +89,17 @@ class ScopedStepContainer {
   // step_id: the unique ID of this step. Doesn't have to be sequential, just
   // has to be unique.
   // cleanup: callback to delete a container of this name.
+  // prefix: optional string prefix to disambiguate step containers.
   ScopedStepContainer(const int64 step_id,
                       std::function<void(const string&)> cleanup)
       : name_(strings::StrCat("__per_step_", step_id)), cleanup_(cleanup) {}
+
+  ScopedStepContainer(const int64 step_id,
+                      std::function<void(const string&)> cleanup,
+                      const string& prefix)
+      : name_(strings::StrCat("__", prefix, "_per_step_", step_id)),
+        cleanup_(cleanup) {}
+
   ~ScopedStepContainer() { cleanup_(name_); }
 
   const string& name() const { return name_; }
@@ -124,14 +132,14 @@ class ResourceMgr {
   //
   // REQUIRES: std::is_base_of<ResourceBase, T>
   // REQUIRES: resource != nullptr
-  template <typename T>
+  template <typename T, bool use_dynamic_cast = false>
   Status Lookup(const string& container, const string& name,
                 T** resource) const TF_MUST_USE_RESULT;
 
   // Similar to Lookup, but looks up multiple resources at once, with only a
   // single lock acquisition.  If containers_and_names[i] is uninitialized
   // then this function does not modify resources[i].
-  template <typename T>
+  template <typename T, bool use_dynamic_cast = false>
   Status LookupMany(absl::Span<std::pair<const string*, const string*> const>
                         containers_and_names,
                     std::vector<std::unique_ptr<T, core::RefCountDeleter>>*
@@ -147,7 +155,7 @@ class ResourceMgr {
   //
   // REQUIRES: std::is_base_of<ResourceBase, T>
   // REQUIRES: resource != nullptr
-  template <typename T>
+  template <typename T, bool use_dynamic_cast = false>
   Status LookupOrCreate(const string& container, const string& name,
                         T** resource,
                         std::function<Status(T**)> creator) TF_MUST_USE_RESULT;
@@ -188,7 +196,7 @@ class ResourceMgr {
   mutable mutex mu_;
   std::unordered_map<string, Container*> containers_ GUARDED_BY(mu_);
 
-  template <typename T>
+  template <typename T, bool use_dynamic_cast = false>
   Status LookupInternal(const string& container, const string& name,
                         T** resource) const
       SHARED_LOCKS_REQUIRED(mu_) TF_MUST_USE_RESULT;
@@ -259,7 +267,7 @@ Status CreateResource(OpKernelContext* ctx, const ResourceHandle& p, T* value);
 //
 // If the lookup is successful, the caller takes the ownership of one ref on
 // `*value`, and must call its `Unref()` method when it has finished using it.
-template <typename T>
+template <typename T, bool use_dynamic_cast = false>
 Status LookupResource(OpKernelContext* ctx, const ResourceHandle& p, T** value);
 
 // Looks up multiple resources pointed by a sequence of resource handles.  If
@@ -429,15 +437,15 @@ Status ResourceMgr::Create(const string& container, const string& name,
   return DoCreate(container, MakeTypeIndex<T>(), name, resource);
 }
 
-template <typename T>
+template <typename T, bool use_dynamic_cast>
 Status ResourceMgr::Lookup(const string& container, const string& name,
                            T** resource) const {
   CheckDeriveFromResourceBase<T>();
   tf_shared_lock l(mu_);
-  return LookupInternal(container, name, resource);
+  return LookupInternal<T, use_dynamic_cast>(container, name, resource);
 }
 
-template <typename T>
+template <typename T, bool use_dynamic_cast>
 Status ResourceMgr::LookupMany(
     absl::Span<std::pair<const string*, const string*> const>
         containers_and_names,
@@ -447,8 +455,9 @@ Status ResourceMgr::LookupMany(
   resources->resize(containers_and_names.size());
   for (size_t i = 0; i < containers_and_names.size(); ++i) {
     T* resource;
-    Status s = LookupInternal(*containers_and_names[i].first,
-                              *containers_and_names[i].second, &resource);
+    Status s = LookupInternal<T, use_dynamic_cast>(
+        *containers_and_names[i].first, *containers_and_names[i].second,
+        &resource);
     if (s.ok()) {
       (*resources)[i].reset(resource);
     }
@@ -456,7 +465,18 @@ Status ResourceMgr::LookupMany(
   return Status::OK();
 }
 
+// Simple wrapper to allow conditional dynamic / static casts.
+template <typename T, bool use_dynamic_cast>
+struct TypeCastFunctor {
+  static T* Cast(ResourceBase* r) { return static_cast<T*>(r); }
+};
+
 template <typename T>
+struct TypeCastFunctor<T, true> {
+  static T* Cast(ResourceBase* r) { return dynamic_cast<T*>(r); }
+};
+
+template <typename T, bool use_dynamic_cast>
 Status ResourceMgr::LookupInternal(const string& container, const string& name,
                                    T** resource) const {
   ResourceBase* found = nullptr;
@@ -464,12 +484,12 @@ Status ResourceMgr::LookupInternal(const string& container, const string& name,
   if (s.ok()) {
     // It's safe to down cast 'found' to T* since
     // typeid(T).hash_code() is part of the map key.
-    *resource = static_cast<T*>(found);
+    *resource = TypeCastFunctor<T, use_dynamic_cast>::Cast(found);
   }
   return s;
 }
 
-template <typename T>
+template <typename T, bool use_dynamic_cast>
 Status ResourceMgr::LookupOrCreate(const string& container, const string& name,
                                    T** resource,
                                    std::function<Status(T**)> creator) {
@@ -478,11 +498,11 @@ Status ResourceMgr::LookupOrCreate(const string& container, const string& name,
   Status s;
   {
     tf_shared_lock l(mu_);
-    s = LookupInternal(container, name, resource);
+    s = LookupInternal<T, use_dynamic_cast>(container, name, resource);
     if (s.ok()) return s;
   }
   mutex_lock l(mu_);
-  s = LookupInternal(container, name, resource);
+  s = LookupInternal<T, use_dynamic_cast>(container, name, resource);
   if (s.ok()) return s;
   TF_RETURN_IF_ERROR(creator(resource));
   s = DoCreate(container, MakeTypeIndex<T>(), name, *resource);
@@ -558,11 +578,12 @@ Status CreateResource(OpKernelContext* ctx, const ResourceHandle& p, T* value) {
   return ctx->resource_manager()->Create(p.container(), p.name(), value);
 }
 
-template <typename T>
+template <typename T, bool use_dynamic_cast>
 Status LookupResource(OpKernelContext* ctx, const ResourceHandle& p,
                       T** value) {
   TF_RETURN_IF_ERROR(internal::ValidateDeviceAndType<T>(ctx, p));
-  return ctx->resource_manager()->Lookup(p.container(), p.name(), value);
+  return ctx->resource_manager()->Lookup<T, use_dynamic_cast>(p.container(),
+                                                              p.name(), value);
 }
 
 template <typename T>
diff --git a/tensorflow/core/framework/step_stats.proto b/tensorflow/core/framework/step_stats.proto
index 67cc9e38459a00394c45bc74b5a966e6128b204a..f8cab135aba799d67183f0978ee1166aba533b99 100644
--- a/tensorflow/core/framework/step_stats.proto
+++ b/tensorflow/core/framework/step_stats.proto
@@ -77,6 +77,8 @@ message NodeExecStats {
 message DeviceStepStats {
   string device = 1;
   repeated NodeExecStats node_stats = 2;
+  // Its key is thread id.
+  map<uint32, string> thread_names = 3;
 }
 
 message StepStats {
diff --git a/tensorflow/core/framework/tensor_util.cc b/tensorflow/core/framework/tensor_util.cc
index 65f6dc1c00b5123287212eae39dc607ad8f68e29..65fb72145f20012ed5217bddcafdf74bfd268cd0 100644
--- a/tensorflow/core/framework/tensor_util.cc
+++ b/tensorflow/core/framework/tensor_util.cc
@@ -15,10 +15,15 @@ limitations under the License.
 
 #include "tensorflow/core/framework/tensor_util.h"
 
+#include <cmath>
 #include <vector>
+
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/tensor_coding.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace tensor {
@@ -37,10 +42,10 @@ Tensor DeepCopy(const Tensor& other) {
              other_data.size());
     }
   } else if (other.dtype() == DT_STRING) {
-    tmp.flat<string>() = other.flat<string>();
+    tmp.unaligned_flat<string>() = other.unaligned_flat<string>();
   } else {
     CHECK_EQ(DT_VARIANT, other.dtype());
-    tmp.flat<Variant>() = other.flat<Variant>();
+    tmp.unaligned_flat<Variant>() = other.unaligned_flat<Variant>();
   }
   return tmp;
 }
@@ -175,7 +180,175 @@ void SetTensorProtoShape(std::vector<size_t> shape,
     shape_proto->mutable_dim()->Add()->set_size(dim);
   }
 }
+
+template <typename T>
+bool CompressTensorContent(float min_compression_ratio,
+                           const TensorShape& shape, TensorProto* tensor) {
+  using TypeHelper = internal::TensorProtoHelper<T>;
+  using FieldType = typename internal::TensorProtoHelper<T>::FieldType;
+  const int64 num_tensor_values = shape.num_elements();
+  const int64 num_bytes = tensor->tensor_content().size();
+  const int64 num_raw_values = num_bytes / sizeof(T);
+  if (num_raw_values != num_tensor_values) {
+    // Invalid or too small.
+    return false;
+  }
+  int64 last_offset = num_bytes - 1;
+  int64 prev_offset = last_offset - sizeof(T);
+  // Inspect individual raw bytes sizeof(T) bytes apart in adjacent elements,
+  // starting from the end, to find the last pair of elements that are not
+  // identical.
+  while (prev_offset >= 0) {
+    if (tensor->tensor_content()[prev_offset] !=
+        tensor->tensor_content()[last_offset]) {
+      break;
+    }
+    --last_offset;
+    --prev_offset;
+  }
+  // Round up to the next whole number of element of type T.
+  const int64 new_num_values = last_offset / sizeof(T) + 1;
+  if (new_num_values * sizeof(FieldType) >
+      static_cast<int64>(num_bytes / min_compression_ratio)) {
+    return false;
+  }
+  // Copy values to truncated repeated field.
+  if (sizeof(FieldType) == sizeof(T)) {
+    FieldType* dst_ptr =
+        TypeHelper::AppendUninitialized(new_num_values, tensor);
+    port::CopySubrangeToArray(tensor->tensor_content(), 0,
+                              new_num_values * sizeof(T),
+                              reinterpret_cast<char*>(dst_ptr));
+    tensor->clear_tensor_content();
+  } else if (sizeof(T) > 1) {
+    // Copy raw bytes to temp array first, then cast.
+    gtl::InlinedVector<T, 64> tmp(new_num_values);
+    port::CopySubrangeToArray(tensor->tensor_content(), 0,
+                              new_num_values * sizeof(T),
+                              reinterpret_cast<char*>(tmp.data()));
+    tensor->clear_tensor_content();
+    TypeHelper::AddValues(tmp.data(), tmp.data() + tmp.size(), tensor);
+  } else {
+    // Copy and cast, one byte at a time.
+    for (int64 i = 0; i < new_num_values; ++i) {
+      char c = tensor->tensor_content()[i];
+      TypeHelper::AddValue(static_cast<FieldType>(c), tensor);
+    }
+    tensor->clear_tensor_content();
+  }
+  return true;
+}
+
+template <typename T>
+inline bool PackedValuesNotEqual(T a, T b) {
+  return a != b;
+}
+template <>
+inline bool PackedValuesNotEqual(float a, float b) {
+  return reinterpret_cast<int32_t&>(a) != reinterpret_cast<int32_t&>(b);
+}
+template <>
+inline bool PackedValuesNotEqual(double a, double b) {
+  return reinterpret_cast<int64_t&>(a) != reinterpret_cast<int64_t&>(b);
+}
+
+template <typename T>
+bool CompressRepeatedField(float min_compression_ratio,
+                           const TensorShape& shape, TensorProto* tensor) {
+  using TypeHelper = internal::TensorProtoHelper<T>;
+  using FieldType = typename internal::TensorProtoHelper<T>::FieldType;
+  const int64 num_tensor_values = shape.num_elements();
+  const int64 num_proto_values = TypeHelper::NumValues(*tensor);
+  if (num_proto_values != num_tensor_values) {
+    // Already compressed or invalid.
+    return false;
+  }
+  T prev_value = TypeHelper::GetValue(num_proto_values - 1, *tensor);
+  int64 last_index = 0;
+  for (int64 i = num_proto_values - 2; i >= 0 && last_index == 0; --i) {
+    const T cur_value = TypeHelper::GetValue(i, *tensor);
+    if (PackedValuesNotEqual(cur_value, prev_value)) {
+      last_index = i + 1;
+    }
+    prev_value = cur_value;
+  }
+  const int64 num_truncated_proto_values = last_index + 1;
+  const int64 num_bytes_as_field =
+      num_truncated_proto_values * sizeof(FieldType);
+  const int64 num_bytes_as_tensor_content = num_tensor_values * sizeof(T);
+  const int64 num_bytes_before = num_proto_values * sizeof(FieldType);
+  if (std::min(num_bytes_as_field, num_bytes_as_tensor_content) >
+      static_cast<int64>(num_bytes_before / min_compression_ratio)) {
+    return false;
+  }
+  if (num_bytes_as_field <= num_bytes_as_tensor_content) {
+    TypeHelper::Truncate(num_truncated_proto_values, tensor);
+  } else {
+    gtl::InlinedVector<T, 64> tmp(num_tensor_values);
+    TypeHelper::CopyValues(tmp.begin(), *tensor);
+    TypeHelper::Truncate(0, tensor);
+    port::CopyFromArray(tensor->mutable_tensor_content(),
+                        reinterpret_cast<const char*>(tmp.data()),
+                        num_bytes_as_tensor_content);
+  }
+  return true;
+}
+
+template <typename T>
+bool CompressTensorProtoInPlaceImpl(int64 min_num_elements,
+                                    float min_compression_ratio,
+                                    TensorProto* tensor) {
+  const TensorShape shape(tensor->tensor_shape());
+  const int64 num_tensor_values = shape.num_elements();
+  if (num_tensor_values < min_num_elements) {
+    return false;
+  }
+  if (tensor->tensor_content().empty()) {
+    return CompressRepeatedField<T>(min_compression_ratio, shape, tensor);
+  } else {
+    return CompressTensorContent<T>(min_compression_ratio, shape, tensor);
+  }
+  return true;
+}
+
 }  // namespace internal
 
+#define HANDLE_COMPRESS_CASE(TF_TYPE)                                  \
+  case TF_TYPE:                                                        \
+    return internal::CompressTensorProtoInPlaceImpl<                   \
+        EnumToDataType<TF_TYPE>::Type>(min_num_elements,               \
+                                       min_compression_ratio, tensor); \
+    break
+
+bool CompressTensorProtoInPlace(int64 min_num_elements,
+                                float min_compression_ratio,
+                                TensorProto* tensor) {
+  switch (tensor->dtype()) {
+    HANDLE_COMPRESS_CASE(DT_FLOAT);
+    HANDLE_COMPRESS_CASE(DT_DOUBLE);
+    HANDLE_COMPRESS_CASE(DT_UINT8);
+    HANDLE_COMPRESS_CASE(DT_INT8);
+    HANDLE_COMPRESS_CASE(DT_UINT16);
+    HANDLE_COMPRESS_CASE(DT_INT16);
+    HANDLE_COMPRESS_CASE(DT_UINT32);
+    HANDLE_COMPRESS_CASE(DT_INT32);
+    HANDLE_COMPRESS_CASE(DT_UINT64);
+    HANDLE_COMPRESS_CASE(DT_INT64);
+    HANDLE_COMPRESS_CASE(DT_BOOL);
+    HANDLE_COMPRESS_CASE(DT_QUINT8);
+    HANDLE_COMPRESS_CASE(DT_QINT8);
+    HANDLE_COMPRESS_CASE(DT_QUINT16);
+    HANDLE_COMPRESS_CASE(DT_QINT16);
+    HANDLE_COMPRESS_CASE(DT_QINT32);
+    // TODO(rmlarsen): Add support for complex and half float types.
+    //    HANDLE_COMPRESS_CASE(DT_HALF);
+    //    HANDLE_COMPRESS_CASE(DT_BFLOAT16);
+    default:
+      return false;
+  }
+}
+
+#undef HANDLE_COMPRESS_CASE
+
 }  // namespace tensor
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/tensor_util.h b/tensorflow/core/framework/tensor_util.h
index a7cf600bab9b2d260277b682946467e9c43f745c..6b7a559a8e43db7083a5b44070c3841db863fec4 100644
--- a/tensorflow/core/framework/tensor_util.h
+++ b/tensorflow/core/framework/tensor_util.h
@@ -16,11 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_UTIL_H_
 #define TENSORFLOW_CORE_FRAMEWORK_TENSOR_UTIL_H_
 
+#include <algorithm>
+#include <vector>
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
 
-#include <vector>
 namespace tensorflow {
 namespace tensor {
 
@@ -60,84 +63,129 @@ namespace internal {
 void SetTensorProtoShape(std::vector<size_t> shape,
                          TensorShapeProto* shape_proto);
 
-// Defines value type dependent methods to manipulate `TensorProto`.
-// Class specializations has to define following methods:
-//   static DataType GetDataType()
-//   static void AddValue(Type value, TensorProto* proto)
 template <typename Type>
-class TensorProtoHelper : public std::false_type {};
+class TensorProtoFieldHelper : public std::false_type {};
+
+#define DEFINE_PROTO_FIELD_HELPER(TYPE, FIELDNAME)                            \
+  template <>                                                                 \
+  class TensorProtoFieldHelper<TYPE> : public std::true_type {                \
+   public:                                                                    \
+    typedef decltype(                                                         \
+        std::declval<TensorProto>().FIELDNAME##_val(0)) FieldType;            \
+    typedef decltype(                                                         \
+        std::declval<TensorProto>().FIELDNAME##_val()) RepeatedFieldType;     \
+    typedef decltype(std::declval<TensorProto>().mutable_##FIELDNAME##_val()) \
+        MutableRepeatedFieldType;                                             \
+    static MutableRepeatedFieldType GetMutableField(TensorProto* proto) {     \
+      return proto->mutable_##FIELDNAME##_val();                              \
+    }                                                                         \
+    static RepeatedFieldType& GetField(const TensorProto& proto) {            \
+      return proto.FIELDNAME##_val();                                         \
+    }                                                                         \
+  }
 
-template <>
-class TensorProtoHelper<string> : public std::true_type {
+DEFINE_PROTO_FIELD_HELPER(float, float);
+DEFINE_PROTO_FIELD_HELPER(double, double);
+DEFINE_PROTO_FIELD_HELPER(int8, int);
+DEFINE_PROTO_FIELD_HELPER(uint8, int);
+DEFINE_PROTO_FIELD_HELPER(int16, int);
+DEFINE_PROTO_FIELD_HELPER(uint16, int);
+DEFINE_PROTO_FIELD_HELPER(int32, int);
+DEFINE_PROTO_FIELD_HELPER(uint32, uint32);
+DEFINE_PROTO_FIELD_HELPER(int64, int64);
+DEFINE_PROTO_FIELD_HELPER(uint64, uint64);
+DEFINE_PROTO_FIELD_HELPER(bool, bool);
+DEFINE_PROTO_FIELD_HELPER(qint8, int);
+DEFINE_PROTO_FIELD_HELPER(quint8, int);
+DEFINE_PROTO_FIELD_HELPER(qint16, int);
+DEFINE_PROTO_FIELD_HELPER(quint16, int);
+DEFINE_PROTO_FIELD_HELPER(qint32, int);
+// TODO(rmlarsen): Add support for complex and half float types.
+// DEFINE_PROTO_FIELD_HELPER(Eigen::hals, half);
+// DEFINE_PROTO_FIELD_HELPER(qint32, half);
+
+#undef DEFINE_PROTO_HELPER
+
+template <typename T>
+class TensorProtoHelper : public std::true_type {
  public:
-  static DataType GetDataType() { return DataType::DT_STRING; }
-  static void AddValue(const string& value, TensorProto* proto) {
-    *proto->mutable_string_val()->Add() = value;
+  using FieldHelper = TensorProtoFieldHelper<T>;
+  using FieldType = typename TensorProtoFieldHelper<T>::FieldType;
+
+  static DataType GetDataType() { return DataTypeToEnum<T>::value; }
+
+  static size_t NumValues(const TensorProto& proto) {
+    return FieldHelper::GetField(proto).size();
   }
-};
 
-template <>
-class TensorProtoHelper<int32> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_INT32; }
-  static void AddValue(int32 value, TensorProto* proto) {
-    proto->mutable_int_val()->Add(value);
+  static void AddValue(const T& value, TensorProto* proto) {
+    FieldHelper::GetMutableField(proto)->Add(static_cast<FieldType>(value));
   }
-};
 
-template <>
-class TensorProtoHelper<int64> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_INT64; }
-  static void AddValue(int64 value, TensorProto* proto) {
-    proto->mutable_int64_val()->Add(value);
+  static T GetValue(size_t index, const TensorProto& proto) {
+    return static_cast<T>(FieldHelper::GetField(proto).Get(index));
   }
-};
 
-template <>
-class TensorProtoHelper<uint32> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_UINT32; }
-  static void AddValue(uint32 value, TensorProto* proto) {
-    proto->mutable_uint32_val()->Add(value);
+  template <typename IterType>
+  static void AddValues(IterType begin, IterType end, TensorProto* proto) {
+    using SrcType = typename std::iterator_traits<IterType>::value_type;
+    size_t n = std::distance(begin, end);
+    FieldType* dst_ptr = AppendUninitialized(n, proto);
+    if (std::is_same<SrcType, FieldType>::value) {
+      std::copy(begin, end, dst_ptr);
+    } else {
+      std::transform(begin, end, dst_ptr, [](const SrcType& x) -> FieldType {
+        return static_cast<FieldType>(x);
+      });
+    }
   }
-};
 
-template <>
-class TensorProtoHelper<uint64> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_UINT64; }
-  static void AddValue(uint64 value, TensorProto* proto) {
-    proto->mutable_uint64_val()->Add(value);
+  template <typename IterType>
+  static void CopyValues(IterType dst, const TensorProto& proto) {
+    using DstType = typename std::iterator_traits<IterType>::value_type;
+    auto begin = FieldHelper::GetField(proto).begin();
+    auto end = FieldHelper::GetField(proto).end();
+    if (std::is_same<DstType, FieldType>::value) {
+      std::copy(begin, end, dst);
+    } else {
+      std::transform(begin, end, dst, [](const FieldType& x) -> DstType {
+        return static_cast<DstType>(x);
+      });
+    }
   }
-};
 
-template <>
-class TensorProtoHelper<float> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_FLOAT; }
-  static void AddValue(float value, TensorProto* proto) {
-    proto->mutable_float_val()->Add(value);
+  static void Truncate(size_t new_size, TensorProto* proto) {
+    FieldHelper::GetMutableField(proto)->Truncate(new_size);
   }
-};
 
-template <>
-class TensorProtoHelper<double> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_DOUBLE; }
-  static void AddValue(double value, TensorProto* proto) {
-    proto->mutable_double_val()->Add(value);
+  static FieldType* AppendUninitialized(size_t n, TensorProto* proto) {
+    auto* field = FieldHelper::GetMutableField(proto);
+    field->Reserve(field->size() + n);
+    return reinterpret_cast<FieldType*>(field->AddNAlreadyReserved(n));
   }
 };
 
+// Specialization for string.
 template <>
-class TensorProtoHelper<bool> : public std::true_type {
+class TensorProtoHelper<string> : public std::true_type {
  public:
-  static DataType GetDataType() { return DataType::DT_BOOL; }
-  static void AddValue(bool value, TensorProto* proto) {
-    proto->mutable_bool_val()->Add(value);
+  static DataType GetDataType() { return DataType::DT_STRING; }
+  static void AddValue(const string& value, TensorProto* proto) {
+    *proto->mutable_string_val()->Add() = value;
+  }
+  template <typename IterType>
+  static void AddValues(IterType begin, IterType end, TensorProto* proto) {
+    for (IterType it = begin; it != end; ++it) {
+      AddValue(*it, proto);
+    }
+  }
+  template <typename IterType>
+  static void CopyToTensorContent(IterType begin, IterType end,
+                                  TensorProto* proto) {
+    AddValues(begin, end, proto);
   }
 };
+
 }  // namespace internal
 
 // Creates a 'TensorProto' with specified shape and values.
@@ -149,15 +197,52 @@ typename std::enable_if<internal::TensorProtoHelper<Type>::value,
 CreateTensorProto(const std::vector<Type>& values,
                   const std::vector<size_t>& shape) {
   TensorProto tensor;
+  TensorShapeProto tensor_shape_proto;
+  internal::SetTensorProtoShape(shape, &tensor_shape_proto);
+  if (TensorShape(tensor_shape_proto).num_elements() != values.size()) {
+    LOG(ERROR) << "Shape and number of values (" << values.size()
+               << ") are incompatible.";
+    return tensor;
+  }
   using TypeHelper = internal::TensorProtoHelper<Type>;
   tensor.set_dtype(TypeHelper::GetDataType());
-  internal::SetTensorProtoShape(shape, tensor.mutable_tensor_shape());
-  for (const auto& value : values) {
-    TypeHelper::AddValue(value, &tensor);
-  }
+  tensor.mutable_tensor_shape()->Swap(&tensor_shape_proto);
+  TypeHelper::AddValues(values.begin(), values.end(), &tensor);
   return tensor;
 }
 
+// Converts values in tensor to run-length encoded compressed form.
+//
+// The elements of a tensor can be stored in a TensorProto in one of the
+// following two forms:
+// 1. As a raw byte string in the field `tensor_content` containing the
+//    serialized in-memory representation of the tensor.
+// 2. As values of a repeated field depending on the datatype, e.g. that
+//    values of a DT_FLOAT tensor would be stored in the repeated field
+//    `float_val`.
+// Storage scheme 2 may use a simple form of run-length encoding to compress
+// data: If the values contains a tail of identical values, the repeated field
+// will be truncated such that the number of values in the repeated field is
+// less than the number of elements implied by the field`tensor_shape`. The
+// original tensor can be recovered by repeating the final value in the repeated
+// field.
+//
+// The TensorProto will be compressed if a) the tensor contains at least
+// min_num_elements elements and b) the compressed tensor proto is would be at
+// most the size of the original tensor proto divided by min_compression_ratio.
+//
+// Returns true if the tensor was compressed.
+bool CompressTensorProtoInPlace(int64 min_num_elements,
+                                float min_compression_ratio,
+                                TensorProto* tensor);
+
+inline bool CompressTensorProtoInPlace(TensorProto* tensor) {
+  static const int64 kDefaultMinNumElements = 64;
+  static const float kDefaultMinCompressionRatio = 2.0f;
+  return CompressTensorProtoInPlace(kDefaultMinNumElements,
+                                    kDefaultMinCompressionRatio, tensor);
+}
+
 }  // namespace tensor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/framework/tensor_util_test.cc b/tensorflow/core/framework/tensor_util_test.cc
index 2b4e1cad2fa24c00f1efc703cd040a105fa68bfe..dbc0ac59f61d4a1b1c4a715f22082d3782f76aed 100644
--- a/tensorflow/core/framework/tensor_util_test.cc
+++ b/tensorflow/core/framework/tensor_util_test.cc
@@ -17,7 +17,11 @@ limitations under the License.
 
 #include <vector>
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -145,6 +149,68 @@ TEST(TensorUtil, DeepCopySlice) {
   }
 }
 
+TEST(TensorUtil, DeepCopySliceString) {
+  Tensor x(DT_STRING, TensorShape({10}));
+  x.flat<string>().setConstant("hello");
+
+  // Slice 'x' -- y still refers to the same buffer.
+  Tensor y = x.Slice(3, 7);
+
+  // Do a deep copy of y, which is a slice.
+  Tensor z = tensor::DeepCopy(y);
+
+  // Set x to be different.
+  x.flat<string>().setConstant("goodbye");
+
+  EXPECT_EQ(TensorShape({10}), x.shape());
+  EXPECT_EQ(TensorShape({4}), y.shape());
+  EXPECT_EQ(TensorShape({4}), z.shape());
+  EXPECT_EQ(DT_STRING, x.dtype());
+  EXPECT_EQ(DT_STRING, y.dtype());
+  EXPECT_EQ(DT_STRING, z.dtype());
+
+  // x and y should now all be 'goodbye', but z should be 'hello'.
+  for (int i = 0; i < 10; ++i) {
+    EXPECT_EQ("goodbye", x.flat<string>()(i));
+  }
+  for (int i = 0; i < 4; ++i) {
+    EXPECT_EQ("goodbye", y.unaligned_flat<string>()(i));
+    EXPECT_EQ("hello", z.flat<string>()(i));
+  }
+}
+
+TEST(TensorUtil, DeepCopySliceVariant) {
+  Tensor x(DT_VARIANT, TensorShape({10}));
+  x.flat<Variant>().setConstant(Tensor(42.0f));
+
+  // Slice 'x' -- y still refers to the same buffer.
+  Tensor y = x.Slice(3, 7);
+
+  // Do a deep copy of y, which is a slice.
+  Tensor z = tensor::DeepCopy(y);
+
+  // Set x to be different.
+  x.flat<Variant>().setConstant(Tensor("foo"));
+
+  EXPECT_EQ(TensorShape({10}), x.shape());
+  EXPECT_EQ(TensorShape({4}), y.shape());
+  EXPECT_EQ(TensorShape({4}), z.shape());
+  EXPECT_EQ(DT_VARIANT, x.dtype());
+  EXPECT_EQ(DT_VARIANT, y.dtype());
+  EXPECT_EQ(DT_VARIANT, z.dtype());
+
+  // Each element of x and y should now be a DT_STRING Tensor containing "foo",
+  // but each element of z should be a DT_FLOAT tensor containing 42.0.
+  for (int i = 0; i < 10; ++i) {
+    EXPECT_EQ("foo", x.flat<Variant>()(i).get<Tensor>()->scalar<string>()());
+  }
+  for (int i = 0; i < 4; ++i) {
+    EXPECT_EQ("foo",
+              y.unaligned_flat<Variant>()(i).get<Tensor>()->scalar<string>()());
+    EXPECT_EQ(42.0, z.flat<Variant>()(i).get<Tensor>()->scalar<float>()());
+  }
+}
+
 TEST(TensorUtil, Concat) {
   std::vector<int64> sizes = {1, 4, 5};
   std::vector<Tensor> to_concat;
@@ -366,5 +432,135 @@ TEST(TensorProtoUtil, CreatesBoolTensorProto) {
             "bool_val: false\n");
 }
 
+TEST(TensorProtoUtil, CompressTensorProtoInPlaceTooSmall) {
+  const int kLength = 63;
+  TensorProto tensor_proto =
+      tensor::CreateTensorProto(std::vector<float>(kLength), {kLength});
+  EXPECT_FALSE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<int>(kLength), {kLength});
+  EXPECT_FALSE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<uint8>(kLength), {kLength});
+  EXPECT_FALSE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<bool>(kLength), {kLength});
+  EXPECT_FALSE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+}
+
+TEST(TensorProtoUtil, CompressTensorProtoInPlaceAllEqual) {
+  const int kLength = 64;
+  TensorProto tensor_proto =
+      tensor::CreateTensorProto(std::vector<float>(kLength), {kLength});
+  EXPECT_TRUE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  EXPECT_EQ(tensor::internal::TensorProtoHelper<float>::NumValues(tensor_proto),
+            1);
+
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<int>(kLength), {kLength});
+  EXPECT_TRUE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  EXPECT_EQ(tensor::internal::TensorProtoHelper<int>::NumValues(tensor_proto),
+            1);
+
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<uint8>(kLength), {kLength});
+  EXPECT_TRUE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  EXPECT_EQ(tensor::internal::TensorProtoHelper<uint8>::NumValues(tensor_proto),
+            1);
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<bool>(kLength), {kLength});
+  EXPECT_TRUE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  EXPECT_EQ(tensor::internal::TensorProtoHelper<bool>::NumValues(tensor_proto),
+            1);
+}
+
+template <typename T>
+std::vector<T> VectorWithConstantTail(int size, int tail_length) {
+  CHECK_LE(tail_length, size);
+  std::vector<T> v(size, T(0));
+  std::iota(v.begin(), v.end() - tail_length, T(1));
+  return v;
+}
+
+template <typename T>
+TensorProto CreateAsProtoTensorContent(int size, int tail_length) {
+  auto values = VectorWithConstantTail<T>(size, tail_length);
+  Tensor tensor(DataTypeToEnum<T>::value, TensorShape({size}));
+  std::copy(values.begin(), values.end(), tensor.flat<T>().data());
+  TensorProto tensor_proto;
+  tensor.AsProtoTensorContent(&tensor_proto);
+  return tensor_proto;
+}
+
+template <typename T>
+TensorProto CreateAsProtoField(int size, int tail_length) {
+  auto values = VectorWithConstantTail<T>(size, tail_length);
+  Tensor tensor(DataTypeToEnum<T>::value, TensorShape({size}));
+  std::copy(values.begin(), values.end(), tensor.flat<T>().data());
+  TensorProto tensor_proto;
+  tensor.AsProtoField(&tensor_proto);
+  return tensor_proto;
+}
+
+template <typename T>
+void CompareTensorValues(const TensorProto& x, const TensorProto& y) {
+  Tensor x_t;
+  EXPECT_TRUE(x_t.FromProto(x));
+  Tensor y_t;
+  EXPECT_TRUE(y_t.FromProto(y));
+  test::ExpectTensorEqual<T>(x_t, y_t);
+}
+
+template <typename T>
+void ConstantTailTest(int64 length, int64 tail_length, bool as_field) {
+  using TensorProtoHelper = tensor::internal::TensorProtoHelper<T>;
+  using FieldType = typename TensorProtoHelper::FieldType;
+  const float kMinCompressionRatio = 2.0;
+  const int64 kMinSize = 64;
+  TensorProto tensor_proto =
+      as_field ? CreateAsProtoField<T>(length, tail_length)
+               : CreateAsProtoTensorContent<T>(length, tail_length);
+  TensorProto original_tensor_proto = tensor_proto;
+  int64 original_size = length * (as_field ? sizeof(FieldType) : sizeof(T));
+  int64 size_as_tensor_content = length * sizeof(T);
+  int64 size_as_field =
+      std::min(length, (length - tail_length + 1)) * sizeof(FieldType);
+  bool will_compress = std::min(size_as_tensor_content, size_as_field) <=
+                       static_cast<int64>(original_size / kMinCompressionRatio);
+
+  EXPECT_EQ(tensor::CompressTensorProtoInPlace(kMinSize, kMinCompressionRatio,
+                                               &tensor_proto),
+            will_compress);
+  if (will_compress) {
+    if (size_as_tensor_content < size_as_field) {
+      EXPECT_EQ(TensorProtoHelper::NumValues(tensor_proto), 0);
+      EXPECT_FALSE(tensor_proto.tensor_content().empty());
+    } else {
+      EXPECT_LE(TensorProtoHelper::NumValues(tensor_proto),
+                (length - tail_length + 1));
+      EXPECT_TRUE(tensor_proto.tensor_content().empty());
+    }
+  }
+  CompareTensorValues<T>(tensor_proto, original_tensor_proto);
+}
+
+TEST(TensorProtoUtil, CompressTensorProtoConstantTail) {
+  const int kLength = 64;
+  for (bool as_field : {true, false}) {
+    for (int tail_length : {0, 1, 2, 32, 33, 63, 64}) {
+      ConstantTailTest<float>(kLength, tail_length, as_field);
+      ConstantTailTest<double>(kLength, tail_length, as_field);
+      ConstantTailTest<int32>(kLength, tail_length, as_field);
+      ConstantTailTest<uint32>(kLength, tail_length, as_field);
+      ConstantTailTest<int64>(kLength, tail_length, as_field);
+      ConstantTailTest<uint64>(kLength, tail_length, as_field);
+      ConstantTailTest<int8>(kLength, tail_length, as_field);
+      ConstantTailTest<uint8>(kLength, tail_length, as_field);
+      ConstantTailTest<int16>(kLength, tail_length, as_field);
+      ConstantTailTest<uint16>(kLength, tail_length, as_field);
+    }
+  }
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/tracking_allocator.cc b/tensorflow/core/framework/tracking_allocator.cc
index 2df402573a58ad3728e03a22d391b32766c49b00..ff454f5847563bb696afecb79eae1743241628a5 100644
--- a/tensorflow/core/framework/tracking_allocator.cc
+++ b/tensorflow/core/framework/tracking_allocator.cc
@@ -152,8 +152,8 @@ int64 TrackingAllocator::AllocationId(const void* ptr) {
   }
 }
 
-void TrackingAllocator::GetStats(AllocatorStats* stats) {
-  allocator_->GetStats(stats);
+absl::optional<AllocatorStats> TrackingAllocator::GetStats() {
+  return allocator_->GetStats();
 }
 
 void TrackingAllocator::ClearStats() { allocator_->ClearStats(); }
diff --git a/tensorflow/core/framework/tracking_allocator.h b/tensorflow/core/framework/tracking_allocator.h
index 5eafce662ec491de2410e5bfdd6e5a69ecaea199..3b45d1cab80f3a82329d19bd9408a2909673de0b 100644
--- a/tensorflow/core/framework/tracking_allocator.h
+++ b/tensorflow/core/framework/tracking_allocator.h
@@ -66,7 +66,7 @@ class TrackingAllocator : public Allocator {
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
   int64 AllocationId(const void* ptr) override;
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
   void ClearStats() override;
 
   // If the underlying allocator tracks allocation sizes, this returns
diff --git a/tensorflow/core/framework/tracking_allocator_test.cc b/tensorflow/core/framework/tracking_allocator_test.cc
index 2cdc7edd2d1e9f2634a96e85879dc45a53f633cc..554af609866e059bc3002a2c5097664d6b173c92 100644
--- a/tensorflow/core/framework/tracking_allocator_test.cc
+++ b/tensorflow/core/framework/tracking_allocator_test.cc
@@ -44,7 +44,7 @@ class TestableSizeTrackingAllocator : public Allocator {
     EXPECT_NE(size_map_.end(), iter);
     return iter->second;
   }
-  void GetStats(AllocatorStats* stats) override { stats->Clear(); }
+  absl::optional<AllocatorStats> GetStats() override { return absl::nullopt; }
 
  private:
   std::unordered_map<const void*, size_t> size_map_;
@@ -58,7 +58,7 @@ class NoMemoryAllocator : public Allocator {
   }
   void DeallocateRaw(void* ptr) override {}
   bool TracksAllocationSizes() override { return true; }
-  void GetStats(AllocatorStats* stats) override { stats->Clear(); }
+  absl::optional<AllocatorStats> GetStats() override { return absl::nullopt; }
 };
 
 TEST(TrackingAllocatorTest, SimpleNoTracking) {
diff --git a/tensorflow/core/framework/variant_tensor_data.cc b/tensorflow/core/framework/variant_tensor_data.cc
index c169e867ec44947d8a761aa1f3be0e06188f974b..993a8989b708c448653bab374dd25bc907b7bf0c 100644
--- a/tensorflow/core/framework/variant_tensor_data.cc
+++ b/tensorflow/core/framework/variant_tensor_data.cc
@@ -20,14 +20,10 @@ limitations under the License.
 
 namespace tensorflow {
 
-VariantTensorData::VariantTensorData() {}
-
 VariantTensorData::VariantTensorData(VariantTensorDataProto proto) {
   FromProto(std::move(proto));
 }
 
-VariantTensorData::~VariantTensorData() {}
-
 int VariantTensorData::tensors_size() const { return tensors_.size(); }
 
 const Tensor& VariantTensorData::tensors(int index) const {
diff --git a/tensorflow/core/framework/variant_tensor_data.h b/tensorflow/core/framework/variant_tensor_data.h
index ca99e83f9c7e575de622349749f8dfb98ab37747..d98cf6b5e1fb8c6d541aad2c2127c2ca9033792c 100644
--- a/tensorflow/core/framework/variant_tensor_data.h
+++ b/tensorflow/core/framework/variant_tensor_data.h
@@ -37,11 +37,11 @@ class VariantTensorDataProto;
 // separate so that kernels do not need to depend on protos.
 class VariantTensorData {
  public:
-  VariantTensorData();
+  VariantTensorData() = default;
+
   // TODO(b/118823936): This silently returns if the proto is invalid.
   // Consider calling FromProto explicitly instead.
   VariantTensorData(VariantTensorDataProto proto);
-  ~VariantTensorData();
 
   // Name of the type of objects being serialized.
   const string& type_name() const { return type_name_; }
diff --git a/tensorflow/core/framework/variant_test.cc b/tensorflow/core/framework/variant_test.cc
index 08d09de7b845101cd2c9604b2ea44bbe25a94171..8947f93887a78659e2e0a0bcd06cedc1ab733d99 100644
--- a/tensorflow/core/framework/variant_test.cc
+++ b/tensorflow/core/framework/variant_test.cc
@@ -186,7 +186,7 @@ TEST(VariantTest, TensorListTest) {
   x.Encode(&serialized);
 
   Variant y = TensorList();
-  y.Decode(std::move(serialized));
+  y.Decode(serialized);
 
   const TensorList& decoded_vec = *y.get<TensorList>();
   for (int i = 0; i < 4; ++i) {
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index 00d3549312aee9669eb588ace593f347263c1a11..3ea222c13c5aa06f708bce61454cef9c24e56c8b 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -85,10 +85,6 @@ const std::unordered_map<string, Node::NodeClass>& Node::kNodeClassTable =
         {"CollectiveBcastSend", NC_COLLECTIVE},
         {"CollectiveBcastRecv", NC_COLLECTIVE},
         {"FakeParam", NC_FAKE_PARAM},
-        {"IteratorGetNext", NC_DATASET},
-        {"IteratorGetNextSync", NC_DATASET},
-        {"DatasetToSingleElement", NC_DATASET},
-        {"ReduceDataset", NC_DATASET},
     });
 
 #undef REF_CLASS
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index f65e4b921efb3298bad090198a0e1d32c31b8fd3..289a3d2a2307280830e23b8b12513e20feccb153 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -174,8 +174,6 @@ class Node {
   bool IsMetadata() const { return class_ == NC_METADATA; }
   bool IsFakeParam() const { return class_ == NC_FAKE_PARAM; }
 
-  bool IsDataset() const { return class_ == NC_DATASET; }
-
   template <typename T>
   void AddAttr(const string& name, const T& val) {
     SetAttrValue(val, AddAttrHelper(name));
@@ -256,7 +254,6 @@ class Node {
     NC_SCOPED_ALLOCATOR,
     NC_COLLECTIVE,
     NC_FAKE_PARAM,
-    NC_DATASET,
     NC_OTHER  // Not a special kind of node
   };
 
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index 990b2fe9b04770dc875b949ec3e17c321fe018be..f36ca8c5a843c8f2e5e2860e8416d0533dc940ed 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -96,7 +96,7 @@ static inline bool IsMklOp(const string& op_name, DataType T) {
 
   // Restrict quantized ops to QUINT8 and QINT8 for now
   if (kernel.find(kMklQuantizedOpLabelPattern) != string::npos) {
-    return (T == DT_QUINT8 || T == DT_QINT8);
+    return (T == DT_QUINT8 || T == DT_QINT8 || T == DT_QINT32);
   }
   // Restrict regular ops to FLOAT
   if (kernel.find(kMklOpLabelPattern) != string::npos) {
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index e934978e76a0c53f755c3ae18644dbe7278112d0..d5dcd16be7f8d459189774b9bd672b7c5dcdb7b9 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -291,10 +291,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.pad = "Pad";
     csinfo_.pad_with_conv2d = "__MklDummyPadWithConv2D";
     csinfo_.pad_with_fused_conv2d = "__MklDummyPadWithFusedConv2D";
-// Temporarily don't convert quantized operators into MKL versions for now.
-// TODO(Intel-tf) Once all the relevant PRs have been merged then remove
-// the ifdef.
-#ifdef INTEL_MKL_QUANTIZED
     csinfo_.quantized_avg_pool = "QuantizedAvgPool";
     csinfo_.quantized_concatv2 = "QuantizedConcatV2";
     csinfo_.quantized_conv2d = "QuantizedConv2D";
@@ -316,14 +312,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
         "QuantizedConv2DWithBiasSumAndReluAndRequantize";
     csinfo_.quant_conv2d_with_bias_signed_sum_and_relu_and_requantize =
         "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize";
-#endif
     csinfo_.relu = "Relu";
     csinfo_.relu_grad = "ReluGrad";
     csinfo_.relu6 = "Relu6";
     csinfo_.relu6_grad = "Relu6Grad";
-#ifdef INTEL_MKL_QUANTIZED
     csinfo_.requantize = "Requantize";
-#endif
     csinfo_.tanh = "Tanh";
     csinfo_.tanh_grad = "TanhGrad";
     csinfo_.reshape = "Reshape";
@@ -443,7 +436,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.pad_with_fused_conv2d,
                       csinfo_.mkl_pad_with_fused_conv2d,
                       CopyAttrsPadWithFusedConv2D, AlwaysRewrite});
-#ifdef INTEL_MKL_QUANTIZED
     rinfo_.push_back({csinfo_.quantized_avg_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.quantized_avg_pool),
                       CopyAttrsQuantizedPooling, AlwaysRewrite});
@@ -499,7 +491,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
          mkl_op_registry::GetMklOpName(
              csinfo_.quant_conv2d_with_bias_signed_sum_and_relu_and_requantize),
          CopyAttrsQuantizedConv2D, AlwaysRewrite});
-#endif
     rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.relu_grad,
@@ -511,11 +502,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.relu6_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.relu6_grad),
                       CopyAttrsDataType, AlwaysRewrite});
-#ifdef INTEL_MKL_QUANTIZED
     rinfo_.push_back({csinfo_.requantize,
                       mkl_op_registry::GetMklOpName(csinfo_.requantize),
                       CopyAttrsRequantize, AlwaysRewrite});
-#endif
     /*
     rinfo_.push_back({csinfo_.tanh,
                       mkl_op_registry::GetMklOpName(csinfo_.tanh),
@@ -3187,9 +3176,7 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
   // Set the Mkl layer label for this op.
   if (DataTypeIsQuantized(orig_node->input_type(0)) ||
       DataTypeIsQuantized(orig_node->output_type(0))) {
-#ifdef INTEL_MKL_QUANTIZED
     nb.Attr("_kernel", mkl_op_registry::kMklQuantizedOpLabel);
-#endif
   } else {
     nb.Attr("_kernel", mkl_op_registry::kMklOpLabel);
   }
@@ -3243,7 +3230,6 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
 // Current implementation reflects only QuantizedConv2D and its fused Ops.
 const MklLayoutRewritePass::RewriteInfo*
 MklLayoutRewritePass::CheckForQuantizedNodeRewrite(const Node* n) const {
-#ifdef INTEL_MKL_QUANTIZED
   DataType Tinput, Tfilter;
   if (!(GetNodeAttr(n->def(), "Tinput", &Tinput).ok() &&
         GetNodeAttr(n->def(), "Tfilter", &Tfilter).ok())) {
@@ -3257,7 +3243,6 @@ MklLayoutRewritePass::CheckForQuantizedNodeRewrite(const Node* n) const {
       }
     }
   }
-#endif
   return nullptr;
 }
 
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 9fe699360feec6686312a4d11b67b4f411832126..77307708fab5c99cb52ad652c72220efc89f8337 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -1,7 +1,6 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_library")
 
 cc_library(
     name = "op_types",
@@ -45,6 +44,7 @@ tf_cc_test(
         "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -71,7 +71,6 @@ cc_library(
     deps = [
         ":graph_view",
         "//tensorflow/core:graph",
-        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index e4f6bf7c862302a217c122cff726b7ab925cc482..e7c72b80886f53ad4068f048dda99ee7dca92293 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -227,14 +227,14 @@ Status SingleMachine::GetPeakMemoryUsage(
 
   device_peak_memory->clear();
   for (Device* device : devices) {
-    AllocatorStats stats;
     auto* allocator = device->GetAllocator(AllocatorAttributes());
     if (!allocator->TracksAllocationSizes()) {
       return Status(error::INVALID_ARGUMENT,
                     "Tracking allocation is not enabled.");
     }
-    allocator->GetStats(&stats);
-    (*device_peak_memory)[device->name()] = stats.max_bytes_in_use;
+    absl::optional<AllocatorStats> stats = allocator->GetStats();
+    (*device_peak_memory)[device->name()] =
+        (stats ? stats->peak_bytes_in_use : 0);
   }
 
   return Status::OK();
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
index eb7ee8dc0a10147d6bfe201f21d437579850b6d9..fdc6b79c8298b7051d1c9247431c151827288720 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/virtual_scheduler.h"
 
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/analytical_cost_estimator.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -102,7 +101,7 @@ TEST_F(AnalyticalCostEstimatorTest, SimpleTest) {
   Costs summary;
   TF_ASSERT_OK(estimator.PredictCosts(item.graph, &run_metadata, &summary));
 
-  EXPECT_EQ(Costs::NanoSeconds(9151), summary.execution_time);
+  EXPECT_EQ(Costs::NanoSeconds(9157), summary.execution_time);
   // Note there are totally 17 nodes (RandomUniform creates 2 nodes), but
   // grappler will not process "label", therefore we have 15 here instead
   EXPECT_EQ(15, summary.num_ops_total);
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index d0ac87c962b48d0fa39126a1b6c7c4e7c89f8bb8..6907988d08f507b8bc4c4e4c3560f06973a0aed3 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -481,6 +481,7 @@ bool IsNumericType(const DataType dtype) {
           DT_QINT8,
           DT_QUINT8,
           DT_QINT16,
+          DT_QUINT16,
           DT_QINT32,
           // Bool.
           DT_BOOL,
@@ -1181,7 +1182,7 @@ class SymbolicShapeRefiner {
     // elements is larger than the given max size.
     for (int i = 0; i < ic->num_outputs(); i++) {
       const ShapeHandle& shape_handle = ic->output(i);
-      if (!ic->FullyDefined(shape_handle) &&
+      if (!ic->FullyDefined(shape_handle) ||
           ic->Value(ic->NumElements(shape_handle)) > max_size) {
         return false;
       }
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 0a7697a21324d3c27b80e04d6c79f5eeb539bc20..fa6b05bd154b824893e3baf1ec3f64192ab00e4f 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -975,6 +975,52 @@ TEST_F(GraphPropertiesTest, IdentityPassingShape) {
   EXPECT_EQ("float: [5,5]", PropToString(out_prop0));
 }
 
+TEST_F(GraphPropertiesTest, SkippingValueInferenceForLargeTensors) {
+  // When using aggressive_shape_inference, we run EvaluateNode() for
+  // whitelisted ops and small input / output tensors. For instance, Fill op is
+  // evaluated and produces output tensor value if output tensor size is smal
+  // (currently, fewer than 17 elements); otherwise we don't run EvalauteNode().
+  // This is to avoid wasting time and memory for producing huge tensors (e.g.,
+  // initializing a large table using Fill.
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output a = ops::Const(s.WithOpName("a"), 4, {2});  // 4x4
+    Output b = ops::Const(s.WithOpName("const"), 0.1f, {});
+    // Shape described by a is small; expect output values of Fill op.
+    Output c = ops::Fill(s.WithOpName("fill"), a, b);
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    GraphProperties properties(item);
+    TF_CHECK_OK(properties.InferStatically(
+        /*assume_valid_feeds=*/false,
+        /*aggressive_shape_inference=*/true));
+    const auto out_props = properties.GetOutputProperties("fill");
+    const OpInfo::TensorProperties out_prop0 = out_props[0];
+    EXPECT_EQ("float: [4,4]", PropToString(out_prop0));
+    EXPECT_TRUE(out_prop0.has_value());
+  }
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output a = ops::Const(s.WithOpName("a"), 1000, {4});  // 1000x1000x1000x1000
+    Output b = ops::Const(s.WithOpName("const"), 0.1f, {});
+    // Shape described by a is huge; in that case we skip value inference.
+    // Otherwise, it'd be too much overhead.
+    Output c = ops::Fill(s.WithOpName("fill"), a, b);
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    GraphProperties properties(item);
+    TF_CHECK_OK(properties.InferStatically(
+        /*assume_valid_feeds=*/false,
+        /*aggressive_shape_inference=*/true));
+    const auto out_props = properties.GetOutputProperties("fill");
+    const OpInfo::TensorProperties out_prop0 = out_props[0];
+    EXPECT_EQ("float: [1000,1000,1000,1000]", PropToString(out_prop0));
+    EXPECT_FALSE(out_prop0.has_value());
+  }
+}
+
 TEST_F(GraphPropertiesTest, PackWithConstInput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output a = ops::Const(s.WithOpName("a"), 1, {});
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 59d20f1fb9a730f45cfe1135526321a0e132fda1..11877d87513ec8f2965a0e3a7fc7a11e0a551728 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -27,7 +27,6 @@ namespace tensorflow {
 namespace grappler {
 
 constexpr int kOpsPerMac = 2;
-constexpr char kConst[] = "Const";
 constexpr char kGuaranteeConst[] = "GuaranteeConst";
 constexpr char kConv2d[] = "Conv2D";
 constexpr char kConv2dBackpropFilter[] = "Conv2DBackpropFilter";
@@ -50,8 +49,6 @@ constexpr char kSqueeze[] = "Squeeze";
 constexpr char kRecv[] = "_Recv";
 constexpr char kSend[] = "_Send";
 constexpr char kBatchMatMul[] = "BatchMatMul";
-constexpr char kVariable[] = "Variable";
-constexpr char kVariableV2[] = "VariableV2";
 constexpr char kRank[] = "Rank";
 constexpr char kShape[] = "Shape";
 constexpr char kShapeN[] = "ShapeN";
@@ -67,7 +64,15 @@ constexpr char kAvgPool[] = "AvgPool";
 constexpr char kAvgPoolGrad[] = "AvgPoolGrad";
 constexpr char kFusedBatchNorm[] = "FusedBatchNorm";
 constexpr char kFusedBatchNormGrad[] = "FusedBatchNormGrad";
+constexpr char kQuantizedMatMul[] = "QuantizedMatMul";
 constexpr char kQuantizedMatMulV2[] = "QuantizedMatMulV2";
+// Persistent ops.
+constexpr char kConst[] = "Const";
+constexpr char kVariable[] = "Variable";
+constexpr char kVariableV2[] = "VariableV2";
+constexpr char kAutoReloadVariable[] = "AutoReloadVariable";
+constexpr char kVarHandleOp[] = "VarHandleOp";
+constexpr char kReadVariableOp[] = "ReadVariableOp";
 
 static const Costs::Duration kMinComputeTime(1);
 
@@ -239,6 +244,7 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
+      {kQuantizedMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kQuantizedMatMulV2, wrap(&OpLevelCostEstimator::PredictMatMul)},
 
       {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
@@ -259,10 +265,6 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {kRecv, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kSend, wrap(&OpLevelCostEstimator::PredictIdentity)},
 
-      {kConst, wrap(&OpLevelCostEstimator::PredictVariable)},
-      {kVariable, wrap(&OpLevelCostEstimator::PredictVariable)},
-      {kVariableV2, wrap(&OpLevelCostEstimator::PredictVariable)},
-
       {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kShapeN, wrap(&OpLevelCostEstimator::PredictMetadata)},
@@ -276,6 +278,11 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
        wrap(&OpLevelCostEstimator::PredictFusedBatchNormGrad)},
   };
 
+  persistent_ops_ = {
+      kConst,       kVariable,       kVariableV2, kAutoReloadVariable,
+      kVarHandleOp, kReadVariableOp,
+  };
+
 #define EIGEN_COST(X) Eigen::internal::functor_traits<Eigen::internal::X>::Cost
 
   // Quantize = apply min and max bounds, multiply by scale factor and round.
@@ -363,21 +370,25 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
 Costs OpLevelCostEstimator::PredictCosts(const OpContext& op_context) const {
   const auto& op_info = op_context.op_info;
   auto it = device_cost_impl_.find(op_info.op());
-  if (it == device_cost_impl_.end()) {
-    if (elementwise_ops_.find(op_info.op()) != elementwise_ops_.end()) {
-      return PredictCwiseOp(op_context);
-    }
+  if (it != device_cost_impl_.end()) {
+    std::function<Costs(const OpContext&)> estimator = it->second;
+    Costs costs = estimator(op_context);
+    VLOG(1) << "Operation " << op_info.op() << " takes "
+            << costs.execution_time.count() << " ns.";
+    return costs;
+  }
 
-    VLOG(1) << "Missing accurate estimator for op: " << op_info.op();
+  if (persistent_ops_.find(op_info.op()) != persistent_ops_.end()) {
+    return PredictVariable(op_context);
+  }
 
-    return PredictCostOfAnUnknownOp(op_context);
+  if (elementwise_ops_.find(op_info.op()) != elementwise_ops_.end()) {
+    return PredictCwiseOp(op_context);
   }
 
-  std::function<Costs(const OpContext&)> estimator = it->second;
-  Costs costs = estimator(op_context);
-  VLOG(1) << "Operation " << op_info.op() << " takes "
-          << costs.execution_time.count() << " ns.";
-  return costs;
+  VLOG(1) << "Missing accurate estimator for op: " << op_info.op();
+
+  return PredictCostOfAnUnknownOp(op_context);
 }
 
 DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
@@ -1240,7 +1251,7 @@ Costs OpLevelCostEstimator::PredictVariable(const OpContext& op_context) const {
   result.num_ops_with_unknown_shapes = result.inaccurate;
 
   result.compute_time = kMinComputeTime;
-  result.execution_time = result.execution_time;
+  result.execution_time = result.compute_time;
   return result;
 }
 
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index f8ba8c6637d9aade6610a6af8dd6c9f3e0be01af..ace8fb218c75886b8bd215a0b04c98ef89f1fed6 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -193,6 +193,7 @@ class OpLevelCostEstimator {
   // If true, assume compute and memory overlap; hence, the op cost is max of
   // compute_time and memory_time, insteaf of sum of those two.
   bool compute_memory_overlap_;
+  std::set<string> persistent_ops_;
 
  private:
   friend class OpLevelCostEstimatorTest;
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index aa0fc9d6c2af621fd0beef75d95c7154e9e88233..04c6ada2bf690a8d528eac60e0984873aeba2b5c 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -499,6 +499,26 @@ class OpLevelCostEstimatorTest : public ::testing::Test {
   OpLevelCostEstimator estimator_;
 };
 
+TEST_F(OpLevelCostEstimatorTest, TestPersistentOpCosts) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  std::unordered_set<string> persisent_ops = {
+      "Const",       "Variable",       "VariableV2", "AutoReloadVariable",
+      "VarHandleOp", "ReadVariableOp",
+  };
+  // Minmum cost for all persistent ops.
+  for (const auto& op : persisent_ops) {
+    op_context.op_info.set_op(op);
+    auto cost = estimator_.PredictCosts(op_context);
+    EXPECT_EQ(Costs::Duration(0), cost.memory_time);
+    EXPECT_EQ(Costs::Duration(1), cost.compute_time);
+    EXPECT_EQ(Costs::Duration(1), cost.execution_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+}
+
 TEST_F(OpLevelCostEstimatorTest, TestGatherCosts) {
   OpContext op_context;
   SetCpuDevice(&op_context.op_info);
diff --git a/tensorflow/core/grappler/mutable_graph_view.cc b/tensorflow/core/grappler/mutable_graph_view.cc
index 64098e7e28a9e9deaec1bc7923910edf9363714d..6a5e60edccee09f861a66d3ec94e7f7a5297031d 100644
--- a/tensorflow/core/grappler/mutable_graph_view.cc
+++ b/tensorflow/core/grappler/mutable_graph_view.cc
@@ -106,6 +106,146 @@ bool HasRegularFaninNode(const MutableGraphView& graph, const NodeDef& node,
   return false;
 }
 
+using FanoutsMap =
+    absl::flat_hash_map<MutableGraphView::OutputPort,
+                        absl::flat_hash_set<MutableGraphView::InputPort>>;
+
+void SwapControlledFanoutInputs(const MutableGraphView& graph,
+                                const FanoutsMap::iterator& control_fanouts,
+                                absl::string_view to_node_name) {
+  absl::string_view from_node_name(control_fanouts->first.node->name());
+  string control = TensorIdToString({to_node_name, Graph::kControlSlot});
+  for (const auto& control_fanout : control_fanouts->second) {
+    const int start = graph.NumFanins(*control_fanout.node,
+                                      /*include_controlling_nodes=*/false);
+    for (int i = start; i < control_fanout.node->input_size(); ++i) {
+      TensorId tensor_id = ParseTensorName(control_fanout.node->input(i));
+      if (tensor_id.node() == from_node_name) {
+        control_fanout.node->set_input(i, control);
+        break;
+      }
+    }
+  }
+}
+
+void SwapRegularFanoutInputs(FanoutsMap* fanouts, NodeDef* from_node,
+                             absl::string_view to_node_name, int max_port) {
+  MutableGraphView::OutputPort port;
+  port.node = from_node;
+  for (int i = 0; i <= max_port; ++i) {
+    port.port_id = i;
+    auto it = fanouts->find(port);
+    if (it == fanouts->end()) {
+      continue;
+    }
+    string input = TensorIdToString({to_node_name, i});
+    for (const auto& fanout : it->second) {
+      fanout.node->set_input(fanout.port_id, input);
+    }
+  }
+}
+
+using MaxOutputPortsMap = absl::flat_hash_map<const NodeDef*, int>;
+
+void SwapFanoutInputs(const MutableGraphView& graph, FanoutsMap* fanouts,
+                      MaxOutputPortsMap* max_output_ports, NodeDef* from_node,
+                      NodeDef* to_node) {
+  auto from_control_fanouts = fanouts->find({from_node, Graph::kControlSlot});
+  if (from_control_fanouts != fanouts->end()) {
+    SwapControlledFanoutInputs(graph, from_control_fanouts, to_node->name());
+  }
+  auto to_control_fanouts = fanouts->find({to_node, Graph::kControlSlot});
+  if (to_control_fanouts != fanouts->end()) {
+    SwapControlledFanoutInputs(graph, to_control_fanouts, from_node->name());
+  }
+  auto from_max_port = max_output_ports->find(from_node);
+  if (from_max_port != max_output_ports->end()) {
+    SwapRegularFanoutInputs(fanouts, from_node, to_node->name(),
+                            from_max_port->second);
+  }
+  auto to_max_port = max_output_ports->find(to_node);
+  if (to_max_port != max_output_ports->end()) {
+    SwapRegularFanoutInputs(fanouts, to_node, from_node->name(),
+                            to_max_port->second);
+  }
+}
+
+void SwapFanoutsMapValues(FanoutsMap* fanouts,
+                          const MutableGraphView::OutputPort& from_port,
+                          const FanoutsMap::iterator& from_fanouts,
+                          const MutableGraphView::OutputPort& to_port,
+                          const FanoutsMap::iterator& to_fanouts) {
+  const bool from_exists = from_fanouts != fanouts->end();
+  const bool to_exists = to_fanouts != fanouts->end();
+
+  if (from_exists && to_exists) {
+    std::swap(from_fanouts->second, to_fanouts->second);
+  } else if (from_exists) {
+    fanouts->emplace(to_port, std::move(from_fanouts->second));
+    fanouts->erase(from_port);
+  } else if (to_exists) {
+    fanouts->emplace(from_port, std::move(to_fanouts->second));
+    fanouts->erase(to_port);
+  }
+}
+
+void SwapRegularFanoutsAndMaxPortValues(FanoutsMap* fanouts,
+                                        MaxOutputPortsMap* max_output_ports,
+                                        NodeDef* from_node, NodeDef* to_node) {
+  auto from_max_port = max_output_ports->find(from_node);
+  auto to_max_port = max_output_ports->find(to_node);
+  bool from_exists = from_max_port != max_output_ports->end();
+  bool to_exists = to_max_port != max_output_ports->end();
+
+  auto forward_fanouts = [fanouts](NodeDef* from, NodeDef* to, int start,
+                                   int end) {
+    for (int i = start; i <= end; ++i) {
+      MutableGraphView::OutputPort from_port(from, i);
+      auto from_fanouts = fanouts->find(from_port);
+      if (from_fanouts != fanouts->end()) {
+        MutableGraphView::OutputPort to_port(to, i);
+        fanouts->emplace(to_port, std::move(from_fanouts->second));
+        fanouts->erase(from_port);
+      }
+    }
+  };
+
+  if (from_exists && to_exists) {
+    const int from = from_max_port->second;
+    const int to = to_max_port->second;
+    const int shared = std::min(from, to);
+    for (int i = 0; i <= shared; ++i) {
+      MutableGraphView::OutputPort from_port(from_node, i);
+      auto from_fanouts = fanouts->find(from_port);
+      MutableGraphView::OutputPort to_port(to_node, i);
+      auto to_fanouts = fanouts->find(to_port);
+      SwapFanoutsMapValues(fanouts, from_port, from_fanouts, to_port,
+                           to_fanouts);
+    }
+    if (to > from) {
+      forward_fanouts(to_node, from_node, shared + 1, to);
+    } else if (from > to) {
+      forward_fanouts(from_node, to_node, shared + 1, from);
+    }
+
+    std::swap(from_max_port->second, to_max_port->second);
+  } else if (from_exists) {
+    forward_fanouts(from_node, to_node, 0, from_max_port->second);
+
+    max_output_ports->emplace(to_node, from_max_port->second);
+    max_output_ports->erase(from_node);
+  } else if (to_exists) {
+    forward_fanouts(to_node, from_node, 0, to_max_port->second);
+
+    max_output_ports->emplace(from_node, to_max_port->second);
+    max_output_ports->erase(to_node);
+  }
+}
+
+bool HasFanoutValue(const FanoutsMap& fanouts, const FanoutsMap::iterator& it) {
+  return it != fanouts.end() && !it->second.empty();
+}
+
 Status MutationError(absl::string_view function_name, absl::string_view params,
                      absl::string_view msg) {
   return errors::InvalidArgument(absl::Substitute(
@@ -180,6 +320,12 @@ Status CheckPortRange(int port, int min, int max, ErrorHandler handler) {
   return Status::OK();
 }
 
+string SwapNodeNamesSwitchControlErrorMsg(absl::string_view node_name) {
+  return absl::Substitute(
+      "can't swap node name '$0' as it will become a Switch control dependency",
+      node_name);
+}
+
 string GeneratedNameForIdentityConsumingSwitch(
     const MutableGraphView::OutputPort& fanin) {
   return AddPrefixToNodeName(
@@ -398,6 +544,182 @@ Status MutableGraphView::UpdateNode(
   return Status::OK();
 }
 
+Status MutableGraphView::UpdateNodeName(absl::string_view from_node_name,
+                                        absl::string_view to_node_name,
+                                        bool update_fanouts) {
+  auto error_status = [from_node_name, to_node_name,
+                       update_fanouts](absl::string_view msg) {
+    string params = absl::Substitute(
+        "from_node_name='$0', to_node_name='$1', update_fanouts=$2",
+        from_node_name, to_node_name, update_fanouts);
+    return MutationError("UpdateNodeName", params, msg);
+  };
+
+  NodeDef* node = GetNode(from_node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(from_node_name, node, error_status));
+
+  if (node->name() == to_node_name) {
+    return Status::OK();
+  }
+  if (HasNode(to_node_name)) {
+    return error_status(
+        "can't update node name because new node name is in use");
+  }
+  auto max_output_port = max_regular_output_port().find(node);
+  const bool has_max_output_port =
+      max_output_port != max_regular_output_port().end();
+  auto control_fanouts = fanouts().find({node, Graph::kControlSlot});
+
+  if (update_fanouts) {
+    SwapControlledFanoutInputs(*this, control_fanouts, to_node_name);
+    if (has_max_output_port) {
+      SwapRegularFanoutInputs(&fanouts(), node, to_node_name,
+                              max_output_port->second);
+    }
+  } else if (has_max_output_port ||
+             HasFanoutValue(fanouts(), control_fanouts)) {
+    return error_status("can't update node name because node has fanouts");
+  }
+
+  nodes().erase(node->name());
+  node->set_name(string(to_node_name));
+  nodes().emplace(node->name(), node);
+  return Status::OK();
+}
+
+Status MutableGraphView::SwapNodeNames(absl::string_view from_node_name,
+                                       absl::string_view to_node_name,
+                                       bool update_fanouts) {
+  auto error_status = [from_node_name, to_node_name,
+                       update_fanouts](absl::string_view msg) {
+    string params = absl::Substitute(
+        "from_node_name='$0', to_node_name='$1', update_fanouts=$2",
+        from_node_name, to_node_name, update_fanouts);
+    return MutationError("SwapNodeNames", params, msg);
+  };
+
+  NodeDef* from_node = GetNode(from_node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(from_node_name, from_node, error_status));
+  if (from_node_name == to_node_name) {
+    return Status::OK();
+  }
+  NodeDef* to_node = GetNode(to_node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(to_node_name, to_node, error_status));
+
+  auto swap_names = [this, from_node, to_node]() {
+    nodes().erase(from_node->name());
+    nodes().erase(to_node->name());
+    std::swap(*from_node->mutable_name(), *to_node->mutable_name());
+    nodes().emplace(from_node->name(), from_node);
+    nodes().emplace(to_node->name(), to_node);
+  };
+
+  if (update_fanouts) {
+    SwapFanoutInputs(*this, &fanouts(), &max_regular_output_port(), from_node,
+                     to_node);
+    swap_names();
+    return Status::OK();
+  }
+
+  bool from_is_switch = IsSwitch(*from_node);
+  MutableGraphView::OutputPort to_control(to_node, Graph::kControlSlot);
+  auto to_control_fanouts = fanouts().find(to_control);
+  if (from_is_switch && HasFanoutValue(fanouts(), to_control_fanouts)) {
+    return error_status(SwapNodeNamesSwitchControlErrorMsg(from_node_name));
+  }
+
+  bool to_is_switch = IsSwitch(*to_node);
+  MutableGraphView::OutputPort from_control(from_node, Graph::kControlSlot);
+  auto from_control_fanouts = fanouts().find(from_control);
+  if (to_is_switch && HasFanoutValue(fanouts(), from_control_fanouts)) {
+    return error_status(SwapNodeNamesSwitchControlErrorMsg(to_node_name));
+  }
+
+  // Swap node names.
+  swap_names();
+
+  // Swap controlling fanouts.
+  SwapFanoutsMapValues(&fanouts(), from_control, from_control_fanouts,
+                       to_control, to_control_fanouts);
+
+  // Swap regular fanouts.
+  SwapRegularFanoutsAndMaxPortValues(&fanouts(), &max_regular_output_port(),
+                                     from_node, to_node);
+
+  // Update fanins to remove self loops.
+  auto update_fanins = [this](NodeDef* node, absl::string_view old_node_name) {
+    for (int i = 0; i < node->input_size(); ++i) {
+      TensorId tensor_id = ParseTensorName(node->input(i));
+      if (tensor_id.node() == node->name()) {
+        const int idx = tensor_id.index();
+        const int node_idx =
+            IsTensorIdControlling(tensor_id) ? Graph::kControlSlot : i;
+
+        MutableGraphView::OutputPort from_fanin(node, idx);
+        absl::flat_hash_set<InputPort>* from_fanouts = &fanouts()[from_fanin];
+        from_fanouts->erase({node, node_idx});
+        UpdateMaxRegularOutputPortForRemovedFanin(from_fanin, *from_fanouts);
+
+        MutableGraphView::OutputPort to_fanin(nodes().at(old_node_name), idx);
+        fanouts()[to_fanin].insert({node, node_idx});
+        UpdateMaxRegularOutputPortForAddedFanin(to_fanin);
+        node->set_input(i, TensorIdToString({old_node_name, idx}));
+      }
+    }
+  };
+  update_fanins(from_node, to_node->name());
+  update_fanins(to_node, from_node->name());
+
+  // Dedup control dependencies.
+  auto dedup_control_fanouts =
+      [this](NodeDef* node, const FanoutsMap::iterator& control_fanouts) {
+        if (CanDedupControlWithRegularInput(*this, *node) &&
+            control_fanouts != fanouts().end()) {
+          for (const auto& control_fanout : control_fanouts->second) {
+            if (HasRegularFaninNode(*this, *control_fanout.node,
+                                    node->name())) {
+              RemoveControllingFaninInternal(control_fanout.node, node);
+            }
+          }
+        }
+      };
+  auto dedup_switch_control = [this, dedup_control_fanouts](NodeDef* node) {
+    OutputPort port;
+    port.node = node;
+    const int max_port =
+        gtl::FindWithDefault(max_regular_output_port(), node, -1);
+    for (int i = 0; i <= max_port; ++i) {
+      port.port_id = i;
+      auto it = fanouts().find(port);
+      if (it == fanouts().end()) {
+        continue;
+      }
+      for (const auto& fanout : it->second) {
+        auto fanout_controls =
+            fanouts().find({fanout.node, Graph::kControlSlot});
+        dedup_control_fanouts(fanout.node, fanout_controls);
+      }
+    }
+  };
+
+  if (!from_is_switch) {
+    if (to_is_switch) {
+      dedup_switch_control(from_node);
+    } else {
+      dedup_control_fanouts(from_node, from_control_fanouts);
+    }
+  }
+  if (!to_is_switch) {
+    if (from_is_switch) {
+      dedup_switch_control(to_node);
+    } else {
+      dedup_control_fanouts(to_node, to_control_fanouts);
+    }
+  }
+
+  return Status::OK();
+}
+
 Status MutableGraphView::UpdateFanouts(absl::string_view from_node_name,
                                        absl::string_view to_node_name) {
   NodeDef* from_node = GetNode(from_node_name);
@@ -625,9 +947,9 @@ NodeDef* MutableGraphView::GetControllingFaninToAdd(absl::string_view node_name,
   if (!IsSwitch(*fanin.node)) {
     return fanin.node;
   } else {
-    TensorId tensor_id(fanin.node->name(), fanin.port_id);
     if (IsOutputPortControlling(fanin)) {
       // Can't add a Switch node control dependency.
+      TensorId tensor_id(fanin.node->name(), fanin.port_id);
       *error_msg = absl::Substitute(
           "can't add fanin '$0' as it will become a Switch control dependency",
           tensor_id.ToString());
@@ -639,18 +961,15 @@ NodeDef* MutableGraphView::GetControllingFaninToAdd(absl::string_view node_name,
     // dependency is only triggered when the corresponding output is triggered.
     // We start by looking for an identity node connected to the output of the
     // switch node, and use it to anchor the control dependency.
-    auto fanouts = GetFanouts(*fanin.node, /*include_controlled_nodes=*/false);
-    for (auto fanout : fanouts) {
+    for (const auto& fanout : GetFanout(fanin)) {
       if (IsIdentity(*fanout.node) || IsIdentityNSingleInput(*fanout.node)) {
-        if (ParseTensorName(fanout.node->input(0)) == tensor_id) {
-          if (fanout.node->name() == node_name) {
-            *error_msg =
-                absl::Substitute("can't add found fanin '$0' to self",
-                                 AsControlDependency(fanout.node->name()));
-            return nullptr;
-          }
-          return fanout.node;
+        if (fanout.node->name() == node_name) {
+          *error_msg =
+              absl::Substitute("can't add found fanin '$0' to self",
+                               AsControlDependency(fanout.node->name()));
+          return nullptr;
         }
+        return fanout.node;
       }
     }
 
diff --git a/tensorflow/core/grappler/mutable_graph_view.h b/tensorflow/core/grappler/mutable_graph_view.h
index 08a1cd18a1e138a5c766751f4843faa13770e324..a09c147be6c1a044d558c202e2047ae6a5d12916 100644
--- a/tensorflow/core/grappler/mutable_graph_view.h
+++ b/tensorflow/core/grappler/mutable_graph_view.h
@@ -84,6 +84,40 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
                     absl::string_view device,
                     absl::Span<const std::pair<string, AttrValue>> attrs);
 
+  // Updates node `from_node_name` name to `to_node_name`. If `to_node_name` is
+  // in use, node `from_node_name` does not exist, or node `from_node_name` has
+  // fanouts and `update_fanouts` is set to false, an error will be returned and
+  // nothing will be modified in the graph.
+  Status UpdateNodeName(absl::string_view from_node_name,
+                        absl::string_view to_node_name, bool update_fanouts);
+
+  // Swap node names `from_node_name` and `to_node_name`. Self loops of one node
+  // are removed by updating the inputs introducing self loops to use the other
+  // node's name. Setting `update_fanouts` to false will exclude other fanouts
+  // from having their inputs updated, but inputs introducing self loops will
+  // always be updated regardless of `update_fanouts.
+  //
+  // Example:
+  //   1. foo(other:3, bar:2, ^bar)
+  //   2. bar(foo:3, other:1, foo:1, ^foo)
+  //   3. other(foo:5, bar:6)
+  //
+  // After calling SwapNodeNames("foo", "bar", false):
+  //   1. bar(other:3, foo:2, ^foo)
+  //   2. foo(bar:3, other:1, bar:1, ^bar)
+  //   3. other(foo:5, bar:6)
+  //
+  // After calling SwapNodeNames("foo", "bar", true):
+  //   1. bar(other:3, foo:2, ^foo)
+  //   2. foo(bar:3, other:1, bar:1, ^bar)
+  //   3. other(bar:5, foo:6)
+  //
+  // If it is not possible to swap node names (i.e. nodes do not exist or Switch
+  // control dependency may be introduced), an error will be returned and
+  // nothing will be modified in the graph.
+  Status SwapNodeNames(absl::string_view from_node_name,
+                       absl::string_view to_node_name, bool update_fanouts);
+
   // Updates all fanouts (input ports fetching output tensors) from
   // `from_node_name` to the `to_node_name`, including control dependencies.
   //
@@ -92,7 +126,7 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   //   2. foo2(bar:1, other:1)
   //   3. foo3(other:2, ^bar)
   //
-  // After calling ForwardOutputs(bar, new_bar):
+  // After calling UpdateFanouts(bar, new_bar):
   //   1. foo1(new_bar:0, new_bar:1, other:0)
   //   2. foo2(new_bar:1, other:1)
   //   3. foo3(other:2, ^new_bar)
@@ -240,7 +274,7 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   //   2. foo2(bar:1, other:1)
   //   3. foo3(other:2, ^bar)
   //
-  // After calling ForwardOutputs(bar, new_bar):
+  // After calling UpdateFanouts(bar, new_bar):
   //   1. foo1(new_bar:0, new_bar:1, other:0)
   //   2. foo2(new_bar:1, other:1)
   //   3. foo3(other:2, ^new_bar)
diff --git a/tensorflow/core/grappler/mutable_graph_view_test.cc b/tensorflow/core/grappler/mutable_graph_view_test.cc
index 06333d3a34491b9071620167ef0380c044f5a6bb..07818d1f526b5b7d7897fd5db2c561b5d90965c7 100644
--- a/tensorflow/core/grappler/mutable_graph_view_test.cc
+++ b/tensorflow/core/grappler/mutable_graph_view_test.cc
@@ -305,6 +305,450 @@ TEST(MutableGraphViewTest, UpdateNodeSwitchControlDependency) {
   CheckGraph(graph);
 }
 
+absl::flat_hash_map<string, std::vector<string>> GetNodeInputsFromGraph(
+    const GraphDef& graph, absl::string_view node_to_exclude) {
+  absl::flat_hash_map<string, std::vector<string>> node_inputs;
+  for (const auto& node : graph.node()) {
+    if (node.name() == node_to_exclude) {
+      continue;
+    }
+    node_inputs[node.name()] =
+        std::vector<string>(node.input().begin(), node.input().end());
+  }
+  return node_inputs;
+}
+
+void CheckUnmodifiedNodeFanins(
+    const GraphDef& graph, absl::string_view node_to_exclude,
+    const absl::flat_hash_map<string, std::vector<string>>&
+        unmodified_node_inputs) {
+  for (const auto& node : graph.node()) {
+    if (node.name() == node_to_exclude) {
+      continue;
+    }
+    auto it = unmodified_node_inputs.find(node.name());
+    ASSERT_NE(it, unmodified_node_inputs.end());
+    ASSERT_EQ(it->second.size(), node.input_size());
+    for (int i = 0; i < node.input_size(); ++i) {
+      EXPECT_EQ(node.input(i), it->second[i]);
+    }
+  }
+}
+
+void TestUpdateNodeName(absl::string_view from_node_name, bool node_exists,
+                        absl::string_view to_node_name, bool update_fanouts,
+                        bool success, const string& error_msg,
+                        absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a"}),
+       NDef("c", "NotImportant", {}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(from_node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, from_node_name);
+
+  Status s = graph.UpdateNodeName(from_node_name, to_node_name, update_fanouts);
+  EXPECT_EQ(s.ok(), success);
+  string updated_node_name;
+  if (success) {
+    updated_node_name = string(to_node_name);
+  } else {
+    updated_node_name = string(from_node_name);
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    EXPECT_EQ(node->name(), updated_node_name);
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, updated_node_name,
+                            unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateNodeName) {
+  string error_msg;
+  // Node has no fanouts.
+  TestUpdateNodeName("b", /*node_exists=*/true, "d", /*update_fanouts=*/false,
+                     /*success=*/true, error_msg, {"a"});
+  // Node has fanouts and rename to self.
+  TestUpdateNodeName("b", /*node_exists=*/true, "b", /*update_fanouts=*/false,
+                     /*success=*/true, error_msg, {"a"});
+  // Node has no fanouts and rename to self.
+  TestUpdateNodeName("a", /*node_exists=*/true, "a", /*update_fanouts=*/false,
+                     /*success=*/true, error_msg, {});
+
+  // New node name is in use.
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='c', to_node_name='b', "
+      "update_fanouts=false) error: can't update node name because new node "
+      "name is in use.";
+  TestUpdateNodeName("c", /*node_exists=*/true, "b", /*update_fanouts=*/false,
+                     /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='a', to_node_name='b', "
+      "update_fanouts=true) error: can't update node name because new node "
+      "name is in use.";
+  TestUpdateNodeName("a", /*node_exists=*/true, "b", /*update_fanouts=*/true,
+                     /*success=*/false, error_msg, {});
+  // Node has fanouts.
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='a', to_node_name='d', "
+      "update_fanouts=false) error: can't update node name because node has "
+      "fanouts.";
+  TestUpdateNodeName("a", /*node_exists=*/true, "d", /*update_fanouts=*/false,
+                     /*success=*/false, error_msg, {});
+  // Node does not exist.
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='d', to_node_name='e', "
+      "update_fanouts=false) error: node 'd' was not found.";
+  TestUpdateNodeName("d", /*node_exists=*/false, "e", /*update_fanouts=*/false,
+                     /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='d', to_node_name='e', "
+      "update_fanouts=true) error: node 'd' was not found.";
+  TestUpdateNodeName("d", /*node_exists=*/false, "e", /*update_fanouts=*/true,
+                     /*success=*/false, error_msg, {});
+}
+
+TEST(MutableGraphViewTest, UpdateNodeNameWithFanouts) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:2"}),
+       NDef("c", "NotImportant", {"b", "^a"}),
+       NDef("d", "NotImportant", {"^b", "^a"}),
+       NDef("e", "NotImportant", {"b:2", "c:4", "b:1", "^a"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateNodeName("b", "f", /*update_fanouts=*/true));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"f", "^c", "^d", "^e"});
+  CheckNode(graph, "f", "NotImportant", "", {}, {"a:2"},
+            {"c", "^d", "e", "e:2"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"f", "^a"}, {"e:1"});
+  CheckNode(graph, "d", "NotImportant", "", {}, {"^f", "^a"}, {});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"f:2", "c:4", "f:1", "^a"},
+            {});
+
+  CheckGraph(graph);
+}
+
+GraphDef SimpleSwapNodeNamesMutationGraph() {
+  return test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("switch_1", "Switch", {"a"}),
+       NDef("identity_1", "Identity", {"switch_1:1"}),
+       NDef("b", "NotImportant", {}, {}), NDef("switch_2", "Switch", {"b"}),
+       NDef("identity_2", "Identity", {"switch_2:0"}),
+       NDef("foo_1", "NotImportant", {"identity_1", "^identity_1"}),
+       NDef("foo_2", "NotImportant", {"identity_2", "^identity_2"})},
+      /*funcs=*/{});
+}
+
+void TestSwapNodeNames(bool update_fanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("foo_1", "foo_2", update_fanouts));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNames) {
+  TestSwapNodeNames(/*update_fanouts=*/false);
+  TestSwapNodeNames(/*update_fanouts=*/true);
+}
+
+void TestSwapNodeNamesWithSameNames(bool update_fanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("identity_1", "identity_1", update_fanouts));
+
+  // No changes to graph.
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesSameName) {
+  TestSwapNodeNamesWithSameNames(/*update_fanouts=*/false);
+  TestSwapNodeNamesWithSameNames(/*update_fanouts=*/true);
+}
+
+TEST(MutableGraphView, SwapNodeNamesBetweenSwitches) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(
+      graph.SwapNodeNames("switch_1", "switch_2", /*update_fanouts=*/false));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"a"}, {"identity_2"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"b"}, {"identity_1"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesBetweenSwitchesAndUpdateFanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(
+      graph.SwapNodeNames("switch_1", "switch_2", /*update_fanouts=*/true));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_2:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_1:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesSwitchAndNonSwitch) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("a", "switch_1", /*update_fanouts=*/false));
+
+  // Dedup controls and fix self loop.
+  CheckNode(graph, "switch_1", "NotImportant", "", {}, {}, {"a", "identity_1"});
+  CheckNode(graph, "a", "Switch", "", {}, {"switch_1"}, {});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"}, {"foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {}, {"identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesSwitchAndNonSwitchAndUpdateFanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("a", "switch_1", /*update_fanouts=*/true));
+
+  CheckNode(graph, "switch_1", "NotImportant", "", {}, {}, {"a"});
+  CheckNode(graph, "a", "Switch", "", {}, {"switch_1"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"a:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesNonSwitchAndSwitch) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("switch_2", "b", /*update_fanouts=*/false));
+
+  // Dedup controls and fix self loop.
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "switch_2", "NotImportant", "", {}, {}, {"b", "identity_2"});
+  CheckNode(graph, "b", "Switch", "", {}, {"switch_2"}, {});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"}, {"foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {}, {"identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesNonSwitchAndSwitchAndUpdateFanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("switch_2", "b", /*update_fanouts=*/true));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "switch_2", "NotImportant", "", {}, {}, {"b"});
+  CheckNode(graph, "b", "Switch", "", {}, {"switch_2"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"b:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+void TestSwapNodeNamesSimpleSelfLoop(bool update_fanouts) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {"b:7"}), NDef("b", "NotImportant", {"a:10"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("a", "b", update_fanouts));
+
+  // No self loops.
+  CheckNode(graph, "a", "NotImportant", "", {}, {"b:10"}, {"b:0"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:7"}, {"a:0"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesSelfLoops) {
+  TestSwapNodeNamesSimpleSelfLoop(/*update_fanouts=*/false);
+  TestSwapNodeNamesSimpleSelfLoop(/*update_fanouts=*/true);
+}
+
+void TestSwapNodeNamesError(absl::string_view from_node_name,
+                            absl::string_view to_node_name, bool update_fanouts,
+                            const string& error_msg) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  Status s = graph.SwapNodeNames(from_node_name, to_node_name, update_fanouts);
+  EXPECT_EQ(s.ok(), false);
+  EXPECT_EQ(s.error_message(), error_msg);
+
+  // No changes to graph.
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+// TODO(lyandy): add tests with update_fanouts == true.
+TEST(MutableGraphView, SwapNodeNamesError) {
+  string error_msg;
+  // Missing nodes.
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_3', "
+      "to_node_name='foo_2', update_fanouts=false) error: node 'foo_3' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_3", "foo_2", /*update_fanouts=*/false, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_3', "
+      "to_node_name='foo_2', update_fanouts=true) error: node 'foo_3' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_3", "foo_2", /*update_fanouts=*/true, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_1', "
+      "to_node_name='foo_4', update_fanouts=false) error: node 'foo_4' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_1", "foo_4", /*update_fanouts=*/false, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_1', "
+      "to_node_name='foo_4', update_fanouts=true) error: node 'foo_4' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_1", "foo_4", /*update_fanouts=*/true, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_5', "
+      "to_node_name='foo_6', update_fanouts=false) error: node 'foo_5' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_5", "foo_6", /*update_fanouts=*/false, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_5', "
+      "to_node_name='foo_6', update_fanouts=true) error: node 'foo_5' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_5", "foo_6", /*update_fanouts=*/true, error_msg);
+
+  // Switch control dependencies.
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='switch_2', "
+      "to_node_name='identity_1', update_fanouts=false) error: can't swap node "
+      "name 'switch_2' as it will become a Switch control dependency.";
+  TestSwapNodeNamesError("switch_2", "identity_1", /*update_fanouts=*/false,
+                         error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='identity_2', "
+      "to_node_name='switch_1', update_fanouts=false) error: can't swap node "
+      "name 'switch_1' as it will become a Switch control dependency.";
+  TestSwapNodeNamesError("identity_2", "switch_1", /*update_fanouts=*/false,
+                         error_msg);
+}
+
 TEST(MutableGraphViewTest, AddAndUpdateFanouts) {
   // Actual node.op() is not important in this test.
   GraphDef graph_def = test::function::GDef(
@@ -461,36 +905,6 @@ GraphDef SimpleMutateFaninGraph() {
   return graph_def;
 }
 
-absl::flat_hash_map<string, std::vector<string>> GetNodeInputsFromGraph(
-    const GraphDef& graph, absl::string_view node_to_exclude) {
-  absl::flat_hash_map<string, std::vector<string>> node_inputs;
-  for (const auto& node : graph.node()) {
-    if (node.name() == node_to_exclude) {
-      continue;
-    }
-    node_inputs[node.name()] =
-        std::vector<string>(node.input().begin(), node.input().end());
-  }
-  return node_inputs;
-}
-
-void CheckUnmodifiedNodeFanins(
-    const GraphDef& graph, absl::string_view node_to_exclude,
-    const absl::flat_hash_map<string, std::vector<string>>&
-        unmodified_node_inputs) {
-  for (const auto& node : graph.node()) {
-    if (node.name() == node_to_exclude) {
-      continue;
-    }
-    auto it = unmodified_node_inputs.find(node.name());
-    ASSERT_NE(it, unmodified_node_inputs.end());
-    ASSERT_EQ(it->second.size(), node.input_size());
-    for (int i = 0; i < node.input_size(); ++i) {
-      EXPECT_EQ(node.input(i), it->second[i]);
-    }
-  }
-}
-
 void TestAddRegularFanin(absl::string_view node_name, bool node_exists,
                          const TensorId& fanin_to_add, bool success,
                          const string& error_msg,
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index e8179b22a8b9e0bf18001f8c5df24f0831e41f90..59400dc479b70a14f9af2443bcd9bfd7fe2f14c9 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -47,24 +47,16 @@ bool IsAnyDiv(const NodeDef& node) {
          node.op() == "FloorDiv" || node.op() == "TruncateDiv";
 }
 
-bool IsAnyMax(const NodeDef& node) {
+bool IsAnyMaxPool(const NodeDef& node) {
   const auto& op = node.op();
-  return op == "Max" || op == "SegmentMax" || op == "UnsortedSegmentMax";
-}
-
-bool IsAnyMin(const NodeDef& node) {
-  const auto& op = node.op();
-  return op == "Min" || op == "SegmentMin" || op == "UnsortedSegmentMin";
+  return op == "MaxPool" || op == "MaxPoolV2" || op == "MaxPool3D" ||
+         op == "MaxPoolWithArgmax" || op == "FractionalMaxPool";
 }
 
 bool IsApproximateEqual(const NodeDef& node) {
   return node.op() == "ApproximateEqual";
 }
 
-bool IsArgMax(const NodeDef& node) { return node.op() == "ArgMax"; }
-
-bool IsArgMin(const NodeDef& node) { return node.op() == "ArgMin"; }
-
 bool IsAvgPoolGrad(const NodeDef& node) { return node.op() == "AvgPoolGrad"; }
 
 bool IsAssign(const NodeDef& node) {
@@ -184,8 +176,7 @@ bool IsElementWiseMonotonic(const NodeDef& node, bool* is_non_decreasing) {
           "Sign",  "Sinh", "Softsign", "Softplus", "Sqrt",  "Tanh",
       }));
   static const gtl::FlatSet<string>* const kMonotonicNonIncreasingOps =
-      CHECK_NOTNULL((new gtl::FlatSet<string>{"Acos", "Erfc", "Inv", "Neg",
-                                              "Reciprocal", "Rsqrt"}));
+      CHECK_NOTNULL((new gtl::FlatSet<string>{"Acos", "Erfc", "Neg", "Rsqrt"}));
   if (kMonotonicNonDecreasingOps->count(node.op()) > 0) {
     if (is_non_decreasing) {
       *is_non_decreasing = true;
@@ -288,8 +279,8 @@ bool IsLogicalOr(const NodeDef& node) { return node.op() == "LogicalOr"; }
 
 bool IsMatMul(const NodeDef& node) {
   const auto& op = node.op();
-  return op == "MatMul" || op == "BatchMatMul" || op == "QuantizedMatMul" ||
-         op == "SparseMatMul";
+  return op == "MatMul" || op == "BatchMatMul" || op == "SparseMatMul" ||
+         IsQuantizedMatMul(node);
 }
 
 bool IsMax(const NodeDef& node) { return node.op() == "Max"; }
@@ -330,6 +321,8 @@ bool IsNextIteration(const NodeDef& node) {
   return op == "NextIteration" || op == "RefNextIteration";
 }
 
+bool IsOnesLike(const NodeDef& node) { return node.op() == "OnesLike"; }
+
 bool IsPack(const NodeDef& node) { return node.op() == "Pack"; }
 
 bool IsPad(const NodeDef& node) {
@@ -357,6 +350,10 @@ bool IsPrint(const NodeDef& node) {
 
 bool IsProd(const NodeDef& node) { return node.op() == "Prod"; }
 
+bool IsQuantizedMatMul(const NodeDef& node) {
+  return node.op() == "QuantizedMatMul" || node.op() == "QuantizedMatMulV2";
+}
+
 bool IsQueue(const NodeDef& node) {
   return str_util::EndsWith(node.op(), "QueueV2");
 }
@@ -550,6 +547,8 @@ bool IsWhile(const NodeDef& node) {
   return op == "While" || op == "StatelessWhile";
 }
 
+bool IsZerosLike(const NodeDef& node) { return node.op() == "ZerosLike"; }
+
 bool IsZeta(const NodeDef& node) { return node.op() == "Zeta"; }
 
 namespace {
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index fc32749d9fa8a1a32d4190ed4ffb0eb0863b3000..bc1bb33772d8c8b664aeef280c779206f6b4e5fa 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -28,11 +28,8 @@ bool IsAll(const NodeDef& node);
 bool IsAngle(const NodeDef& node);
 bool IsAny(const NodeDef& node);
 bool IsAnyDiv(const NodeDef& node);
-bool IsAnyMax(const NodeDef& node);
-bool IsAnyMin(const NodeDef& node);
+bool IsAnyMaxPool(const NodeDef& node);
 bool IsApproximateEqual(const NodeDef& node);
-bool IsArgMax(const NodeDef& node);
-bool IsArgMin(const NodeDef& node);
 bool IsAvgPoolGrad(const NodeDef& node);
 bool IsAssert(const NodeDef& node);
 bool IsAssign(const NodeDef& node);
@@ -104,10 +101,12 @@ bool IsMod(const NodeDef& node);
 bool IsMul(const NodeDef& node);
 bool IsMatMul(const NodeDef& node);
 bool IsNextIteration(const NodeDef& node);
+bool IsOnesLike(const NodeDef& node);
 bool IsPack(const NodeDef& node);
 bool IsPad(const NodeDef& node);
 bool IsPack(const NodeDef& node);
 bool IsPartitionedCall(const NodeDef& node);
+bool IsQuantizedMatMul(const NodeDef& node);
 bool IsNeg(const NodeDef& node);
 bool IsNoOp(const NodeDef& node);
 bool IsNotEqual(const NodeDef& node);
@@ -174,6 +173,7 @@ bool IsTruncateMod(const NodeDef& node);
 bool IsUnpack(const NodeDef& node);
 bool IsVariable(const NodeDef& node);
 bool IsWhile(const NodeDef& node);
+bool IsZerosLike(const NodeDef& node);
 bool IsZeta(const NodeDef& node);
 
 // Return true if the op is an aggregation (e.g. Add, AddN).
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 4c8b4f69d6bf11a7d523052fcc036cbc203222ec..cdf6180ff5fd3a7da3f1b3cfeb0905ee06e8b54b 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -101,6 +101,7 @@ cc_library(
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:symbolic_shapes",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -108,6 +109,9 @@ tf_cc_test(
     name = "constant_folding_test",
     srcs = ["constant_folding_test.cc"],
     shard_count = 5,
+    # Running cuda on cpu will trigger tests guarded by GOOGLE_CUDA but NCHW
+    # won't be available, which result in test failures. So disable that.
+    tags = ["no_cuda_on_cpu_tap"],
     deps = [
         ":constant_folding",
         ":dependency_optimizer",
@@ -268,13 +272,29 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "arithmetic_optimizer_test_utils",
+    testonly = 1,
+    hdrs = [
+        "arithmetic_optimizer_test_utils.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":arithmetic_optimizer",
+        ":constant_folding",
+        ":model_pruner",
+        "//tensorflow/core:test",
+        "//tensorflow/core/grappler/utils:grappler_test",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "arithmetic_optimizer_test",
     size = "small",
     srcs = ["arithmetic_optimizer_test.cc"],
     deps = [
         ":arithmetic_optimizer",
-        ":constant_folding",
+        ":arithmetic_optimizer_test_utils",
         ":model_pruner",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
@@ -289,7 +309,6 @@ tf_cuda_cc_test(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
-        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
@@ -520,9 +539,9 @@ cc_library(
         ":custom_graph_optimizer_registry",
         ":debug_stripper",
         ":dependency_optimizer",
-        ":experimental_implementation_selector",
         ":function_optimizer",
         ":graph_optimizer",
+        ":implementation_selector",
         ":layout_optimizer",
         ":loop_optimizer",
         ":memory_optimizer",
@@ -868,9 +887,9 @@ tf_cc_test(
 )
 
 cc_library(
-    name = "experimental_implementation_selector",
-    srcs = ["experimental_implementation_selector.cc"],
-    hdrs = ["experimental_implementation_selector.h"],
+    name = "implementation_selector",
+    srcs = ["implementation_selector.cc"],
+    hdrs = ["implementation_selector.h"],
     deps = [
         ":custom_graph_optimizer",
         ":custom_graph_optimizer_registry",
@@ -886,14 +905,14 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "experimental_implementation_selector_test",
+    name = "implementation_selector_test",
     size = "small",
-    srcs = ["experimental_implementation_selector_test.cc"],
+    srcs = ["implementation_selector_test.cc"],
     deps = [
         ":custom_graph_optimizer",
         ":custom_graph_optimizer_registry",
-        ":experimental_implementation_selector",
         ":function_api_info",
+        ":implementation_selector",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index e2c8b7cdbc6165397ede305abe24e18477dd445a..902cb3fdd84465d79d52e496315597c533e69252 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -52,7 +52,6 @@ limitations under the License.
 #include "tensorflow/core/util/saved_tensor_slice_util.h"
 #include "tensorflow/core/util/strided_slice_op.h"
 
-using tensorflow::str_util::StringReplace;
 using tensorflow::strings::StrCat;
 
 namespace tensorflow {
@@ -2722,8 +2721,7 @@ class OptimizeMaxOrMinOfMonotonicStage : public ArithmeticOptimizerStage {
   ~OptimizeMaxOrMinOfMonotonicStage() override = default;
 
   bool IsSupported(const NodeDef* node) const override {
-    return IsAnyMax(*node) || IsAnyMin(*node) || IsArgMax(*node) ||
-           IsArgMin(*node);
+    return IsMax(*node) || IsMin(*node) || IsAnyMaxPool(*node);
   }
 
   Status TrySimplify(NodeDef* reduction_node,
@@ -2737,10 +2735,13 @@ class OptimizeMaxOrMinOfMonotonicStage : public ArithmeticOptimizerStage {
     // 0. inner_function is not in the preserve set,
     // 1. inner_function's Op is element-wise monotonic
     // 2. inner_function's output is not being consumed elsewhere.
+    // 3. is monotonic increasing if reduction_node is a pooling operation
+    //    since we don't have MinPool operations.
     bool is_non_decreasing = false;
     if (!IsInPreserveSet(*inner_function) &&
         IsElementWiseMonotonic(*inner_function, &is_non_decreasing) &&
-        ctx().node_map->GetOutputs(inner_function->name()).size() == 1) {
+        ctx().node_map->GetOutputs(inner_function->name()).size() == 1 &&
+        (is_non_decreasing || !IsAnyMaxPool(*reduction_node))) {
       // Swap the first inputs of the inner function Op & the reduction Op.
       NodeDef* inner_input;
       TF_RETURN_IF_ERROR(GetInputNode(inner_function->input(0), &inner_input));
@@ -2754,15 +2755,9 @@ class OptimizeMaxOrMinOfMonotonicStage : public ArithmeticOptimizerStage {
       if (!is_non_decreasing) {
         // Flip Min<->Max if the function is non-increasing, e.g.
         // Max(Neg(x)) = Neg(Min(x)).
-        const string opposite = FlipMinMax(*reduction_node);
+        const string opposite = IsMax(*reduction_node) ? "Min" : "Max";
         reduction_node->set_op(opposite);
       }
-
-      if (IsArgMax(*reduction_node) || IsArgMin(*reduction_node)) {
-        // ArgMax(Sqrt(x)) = ArgMax(x)
-        inner_function->set_op("Identity");
-      }
-
       AddToOptimizationQueue(reduction_node);
       AddToOptimizationQueue(inner_function);
       AddToOptimizationQueue(inner_input);
@@ -2783,16 +2778,6 @@ class OptimizeMaxOrMinOfMonotonicStage : public ArithmeticOptimizerStage {
       AddToOptimizationQueue(consumer);
     }
   }
-
- private:
-  string FlipMinMax(const NodeDef& node) {
-    const string& op = node.op();
-    if (IsAnyMax(node) || IsArgMax(node)) {
-      return str_util::StringReplace(op, "Max", "Min", false);
-    } else {
-      return str_util::StringReplace(op, "Min", "Max", false);
-    }
-  }
 };
 
 // Replace a chain of type&shape preserving unary ops with a
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index dd5ee909b6530a215f6209281730c22a243a091b..277833462298b8286fe30fb12454c64058b9a8a0 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -20,10 +20,9 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
-#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -92,211 +91,6 @@ void VerifyGraphsMatch(const GraphDef& original_graph,
 }
 }  // namespace
 
-class ArithmeticOptimizerTest : public GrapplerTest {
- protected:
-  // Optimize a graph using ArithmeticOptimizer and prune all the nodes that no
-  // longer have any output consumers.
-  void OptimizeAndPrune(ArithmeticOptimizer* optimizer, GrapplerItem* item,
-                        GraphDef* output) {
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-    item->graph.Swap(output);
-    output->Clear();
-    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
-  }
-
-  // Run ArithmeticOptimizer twice to make sure the rewrite is idempotent.
-  void OptimizeTwice(ArithmeticOptimizer* optimizer, GrapplerItem* item,
-                     GraphDef* output) {
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-    item->graph.Swap(output);
-    output->Clear();
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-  }
-
-  // Run ArithmeticOptimizer twice to make sure the rewrite is idempotent.
-  // Optionally run a constant folding pass before pruning.
-  void OptimizeTwiceAndPrune(ArithmeticOptimizer* optimizer, GrapplerItem* item,
-                             GraphDef* output, bool const_folding = false) {
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-
-    item->graph.Swap(output);
-    output->Clear();
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-
-    if (const_folding) {
-      item->graph.Swap(output);
-      output->Clear();
-      TF_EXPECT_OK(ConstantFolding(/*cpu_device=*/nullptr)
-                       .Optimize(nullptr, *item, output));
-    }
-
-    item->graph.Swap(output);
-    output->Clear();
-    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
-  }
-
-  // TODO(ezhulenev): Make private. After migration to stages each test
-  // should explicitly enable required optimization for tests isolation
-  void DisableAllStages(ArithmeticOptimizer* optimizer) {
-    ArithmeticOptimizer::ArithmeticOptimizerOptions options;
-    options.dedup_computations = false;
-    options.combine_add_to_addn = false;
-    options.convert_sqrt_div_to_rsqrt_mul = false;
-    options.convert_pow = false;
-    options.convert_log1p = false;
-    options.optimize_max_or_min_of_monotonic = false;
-    options.fold_conjugate_into_transpose = false;
-    options.fold_multiply_into_conv = false;
-    options.fold_transpose_into_matmul = false;
-    options.hoist_common_factor_out_of_aggregation = false;
-    options.hoist_cwise_unary_chains = false;
-    options.minimize_broadcasts = false;
-    options.remove_identity_transpose = false;
-    options.remove_involution = false;
-    options.remove_idempotent = false;
-    options.remove_redundant_bitcast = false;
-    options.remove_redundant_cast = false;
-    options.remove_redundant_reshape = false;
-    options.remove_negation = false;
-    options.remove_logical_not = false;
-    options.reorder_cast_like_and_value_preserving = false;
-    options.replace_mul_with_square = false;
-    options.simplify_aggregation = false;
-    options.unary_ops_composition = false;
-    optimizer->options_ = options;
-  }
-
-  void DisableAddToAddNCombining(ArithmeticOptimizer* optimizer) {
-    optimizer->options_.combine_add_to_addn = false;
-  }
-
-  void EnableOnlyAddToAddNCombining(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.combine_add_to_addn = true;
-  }
-
-  void EnableOnlyFoldConjugateIntoTranspose(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.fold_conjugate_into_transpose = true;
-  }
-
-  void EnableOnlyFoldMultipleIntoConv(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.fold_multiply_into_conv = true;
-  }
-
-  void EnableOnlyFoldTransposeIntoMatMul(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.fold_transpose_into_matmul = true;
-  }
-
-  void EnableOnlyHoistCommonFactor(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.hoist_common_factor_out_of_aggregation = true;
-  }
-
-  void EnableOnlyMinimizeBroadcasts(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.minimize_broadcasts = true;
-  }
-
-  void EnableOnlyRemoveIdentityTranspose(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_identity_transpose = true;
-  }
-
-  void EnableOnlyRemoveInvolution(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_involution = true;
-  }
-
-  void EnableOnlyRemoveRedundantBitcast(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_redundant_bitcast = true;
-  }
-
-  void EnableOnlyRemoveRedundantCast(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_redundant_cast = true;
-  }
-
-  void EnableOnlyRemoveRedundantReshape(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_redundant_reshape = true;
-  }
-
-  void EnableOnlyRemoveNegation(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_negation = true;
-  }
-
-  void EnableOnlyReorderCastAndTranspose(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.reorder_cast_like_and_value_preserving = true;
-  }
-
-  void EnableOnlyReplaceMulWithSquare(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.replace_mul_with_square = true;
-  }
-
-  void EnableOnlyHoistCWiseUnaryChains(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.hoist_cwise_unary_chains = true;
-  }
-
-  void EnableOnlySqrtDivToRsqrtMul(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.convert_sqrt_div_to_rsqrt_mul = true;
-  }
-
-  void EnableOnlyConvertPow(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.convert_pow = true;
-  }
-
-  void EnableOnlyRemoveIdempotent(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_idempotent = true;
-  }
-
-  void EnableOnlyRemoveLogicalNot(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_logical_not = true;
-  }
-
-  void EnableOnlySimplifyAggregation(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.simplify_aggregation = true;
-  }
-
-  void EnableOnlyLog1p(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.convert_log1p = true;
-  }
-
-  void EnableOnlyOptimizeMaxOrMinOfMonotonic(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.optimize_max_or_min_of_monotonic = true;
-  }
-
-  void EnableOnlyExpm1(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.convert_expm1 = true;
-  }
-
-  void EnableOnlyUnaryOpsComposition(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.unary_ops_composition = true;
-  }
-
-  void EnableOnlyRemoveStackStridedSliceSameAxis(
-      ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_stack_strided_slice_same_axis = true;
-  }
-};
-
 TEST_F(ArithmeticOptimizerTest, NoOp) {
   // This trivial graph is so basic there's nothing to optimize.
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
@@ -3466,47 +3260,6 @@ TEST_F(ArithmeticOptimizerTest, OptimizeMaxOrMinOfMonotonicElementWise) {
   EXPECT_EQ(2, required_node_count);
 }
 
-TEST_F(ArithmeticOptimizerTest, OptimizeArgMaxOrArgMinOfMonotonicElementWise) {
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  const auto x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
-  Output sqrt = ops::Sqrt(s.WithOpName("sqrt"), x);
-  Output arg_max = ops::ArgMax(s.WithOpName("arg_max"), sqrt, 1);
-  Output final_out = ops::Identity(s.WithOpName("final_out"), arg_max);
-
-  GrapplerItem item;
-  item.fetch = {"final_out"};
-  TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  const auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
-  EXPECT_EQ(1, tensors_expected.size());
-
-  GraphDef output;
-  ArithmeticOptimizer optimizer;
-  EnableOnlyOptimizeMaxOrMinOfMonotonic(&optimizer);
-  OptimizeAndPrune(&optimizer, &item, &output);
-  const auto tensors = EvaluateNodes(output, item.fetch);
-  EXPECT_EQ(1, tensors.size());
-
-  test::ExpectTensorEqual<int64>(tensors_expected[0], tensors[0]);
-  EXPECT_EQ(item.graph.node_size() - 1, output.node_size());
-  // Check if the inputs are switched
-  int required_node_count = 0;
-  for (int i = 0; i < output.node_size(); ++i) {
-    const NodeDef& node = output.node(i);
-    if (node.name() == "final_out") {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("arg_max", node.input(0));
-      ++required_node_count;
-    } else if (node.name() == "arg_max") {
-      EXPECT_EQ("ArgMax", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-      ++required_node_count;
-    }
-  }
-  EXPECT_EQ(2, required_node_count);
-}
-
 TEST_F(ArithmeticOptimizerTest,
        OptimizeMaxOrMinOfMonotonicElementWise_DoNotChangeFetchNode) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -3602,6 +3355,75 @@ TEST_F(ArithmeticOptimizerTest,
   EXPECT_EQ(2, required_node_count);
 }
 
+TEST_F(ArithmeticOptimizerTest,
+       OptimizeMaxOrMinOfMonotonicElementWiseNonIncreasingDoNotChangeMaxPool) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), 1.5f, {3, 3, 3, 1});
+  Output neg = ops::Neg(s.WithOpName("neg"), x);
+  Output max_pool = ops::MaxPool(s.WithOpName("max_pool"), neg, {1, 2, 2, 1},
+                                 {1, 2, 2, 1}, "VALID");
+
+  GrapplerItem item;
+  item.fetch = {"max_pool"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  ASSERT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyOptimizeMaxOrMinOfMonotonic(&optimizer);
+  OptimizeTwice(&optimizer, &item, &output);
+
+  // Should be a NoOp
+  VerifyGraphsMatch(item.graph, output, __LINE__);
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  ASSERT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(ArithmeticOptimizerTest, OptimizeMaxOrMinOfMonotonicElementWiseMaxPool) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), 1.5f, {3, 3, 3, 1});
+  Output sqrt = ops::Sqrt(s.WithOpName("sqrt"), x);
+  Output max_pool = ops::MaxPool(s.WithOpName("max_pool"), sqrt, {1, 2, 2, 1},
+                                 {1, 2, 2, 1}, "VALID");
+  Output final_out = ops::Identity(s.WithOpName("final_out"), max_pool);
+
+  GrapplerItem item;
+  item.fetch = {"final_out"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyOptimizeMaxOrMinOfMonotonic(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  // Check if the inputs are switched
+  int required_node_count = 0;
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    if (node.name() == "sqrt") {
+      EXPECT_EQ("Sqrt", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("max_pool", node.input(0));
+      ++required_node_count;
+    } else if (node.name() == "max_pool") {
+      EXPECT_EQ("MaxPool", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      ++required_node_count;
+    }
+  }
+  EXPECT_EQ(2, required_node_count);
+}
+
 TEST_F(ArithmeticOptimizerTest, UnaryOpsComposition) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..94d0adc60923b0b41147891f843b98af76477653
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h
@@ -0,0 +1,236 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_TEST_UTILS_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_TEST_UTILS_H_
+
+#include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class ArithmeticOptimizerTest : public GrapplerTest {
+ protected:
+  // Optimize a graph using ArithmeticOptimizer and prune all the nodes that no
+  // longer have any output consumers.
+  void OptimizeAndPrune(ArithmeticOptimizer* optimizer, GrapplerItem* item,
+                        GraphDef* output) {
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
+  }
+
+  // Run ArithmeticOptimizer twice to make sure the rewrite is idempotent.
+  void OptimizeTwice(ArithmeticOptimizer* optimizer, GrapplerItem* item,
+                     GraphDef* output) {
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+  }
+
+  // Run ArithmeticOptimizer twice to make sure the rewrite is idempotent.
+  // Optionally run a constant folding pass before pruning.
+  void OptimizeTwiceAndPrune(ArithmeticOptimizer* optimizer, GrapplerItem* item,
+                             GraphDef* output, bool const_folding = false) {
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+
+    if (const_folding) {
+      item->graph.Swap(output);
+      output->Clear();
+      TF_EXPECT_OK(ConstantFolding(/*cpu_device=*/nullptr)
+                       .Optimize(nullptr, *item, output));
+    }
+
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
+  }
+
+  // TODO(ezhulenev): Make private. After migration to stages each test
+  // should explicitly enable required optimization for tests isolation
+  void DisableAllStages(ArithmeticOptimizer* optimizer) {
+    ArithmeticOptimizer::ArithmeticOptimizerOptions options;
+    options.dedup_computations = false;
+    options.combine_add_to_addn = false;
+    options.convert_sqrt_div_to_rsqrt_mul = false;
+    options.convert_pow = false;
+    options.convert_log1p = false;
+    options.optimize_max_or_min_of_monotonic = false;
+    options.fold_conjugate_into_transpose = false;
+    options.fold_multiply_into_conv = false;
+    options.fold_transpose_into_matmul = false;
+    options.hoist_common_factor_out_of_aggregation = false;
+    options.hoist_cwise_unary_chains = false;
+    options.minimize_broadcasts = false;
+    options.remove_identity_transpose = false;
+    options.remove_involution = false;
+    options.remove_idempotent = false;
+    options.remove_redundant_bitcast = false;
+    options.remove_redundant_cast = false;
+    options.remove_redundant_reshape = false;
+    options.remove_negation = false;
+    options.remove_logical_not = false;
+    options.reorder_cast_like_and_value_preserving = false;
+    options.replace_mul_with_square = false;
+    options.simplify_aggregation = false;
+    options.unary_ops_composition = false;
+    optimizer->options_ = options;
+  }
+
+  void DisableAddToAddNCombining(ArithmeticOptimizer* optimizer) {
+    optimizer->options_.combine_add_to_addn = false;
+  }
+
+  void EnableOnlyAddToAddNCombining(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.combine_add_to_addn = true;
+  }
+
+  void EnableOnlyFoldConjugateIntoTranspose(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.fold_conjugate_into_transpose = true;
+  }
+
+  void EnableOnlyFoldMultipleIntoConv(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.fold_multiply_into_conv = true;
+  }
+
+  void EnableOnlyFoldTransposeIntoMatMul(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.fold_transpose_into_matmul = true;
+  }
+
+  void EnableOnlyHoistCommonFactor(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.hoist_common_factor_out_of_aggregation = true;
+  }
+
+  void EnableOnlyMinimizeBroadcasts(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.minimize_broadcasts = true;
+  }
+
+  void EnableOnlyRemoveIdentityTranspose(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_identity_transpose = true;
+  }
+
+  void EnableOnlyRemoveInvolution(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_involution = true;
+  }
+
+  void EnableOnlyRemoveRedundantBitcast(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_redundant_bitcast = true;
+  }
+
+  void EnableOnlyRemoveRedundantCast(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_redundant_cast = true;
+  }
+
+  void EnableOnlyRemoveRedundantReshape(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_redundant_reshape = true;
+  }
+
+  void EnableOnlyRemoveNegation(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_negation = true;
+  }
+
+  void EnableOnlyReorderCastAndTranspose(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.reorder_cast_like_and_value_preserving = true;
+  }
+
+  void EnableOnlyReplaceMulWithSquare(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.replace_mul_with_square = true;
+  }
+
+  void EnableOnlyHoistCWiseUnaryChains(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.hoist_cwise_unary_chains = true;
+  }
+
+  void EnableOnlySqrtDivToRsqrtMul(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_sqrt_div_to_rsqrt_mul = true;
+  }
+
+  void EnableOnlyConvertPow(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_pow = true;
+  }
+
+  void EnableOnlyRemoveIdempotent(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_idempotent = true;
+  }
+
+  void EnableOnlyRemoveLogicalNot(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_logical_not = true;
+  }
+
+  void EnableOnlySimplifyAggregation(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.simplify_aggregation = true;
+  }
+
+  void EnableOnlyLog1p(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_log1p = true;
+  }
+
+  void EnableOnlyOptimizeMaxOrMinOfMonotonic(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.optimize_max_or_min_of_monotonic = true;
+  }
+
+  void EnableOnlyExpm1(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_expm1 = true;
+  }
+
+  void EnableOnlyUnaryOpsComposition(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.unary_ops_composition = true;
+  }
+
+  void EnableOnlyRemoveStackStridedSliceSameAxis(
+      ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_stack_strided_slice_same_axis = true;
+  }
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_TEST_UTILS_H_
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index b0c3c5b5181be4b744128fb18ac288c122c59f2a..37fa7d9a3e3cb35032ed0003fb45bfaa70fc3309 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -17,6 +17,10 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 
+#include <cmath>
+
+#include "absl/strings/string_view.h"
+#include "absl/strings/substitute.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -34,6 +38,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/evaluation_utils.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/symbolic_shapes.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -167,6 +172,55 @@ bool HasTPUAttributes(const NodeDef& node) {
   return false;
 }
 
+template <typename T>
+bool PackedValuesNotEqual(T a, T b) {
+  return a != b;
+}
+
+template <>
+bool PackedValuesNotEqual(float a, float b) {
+  return reinterpret_cast<int32_t&>(a) != reinterpret_cast<int32_t&>(b);
+}
+
+template <>
+bool PackedValuesNotEqual(double a, double b) {
+  return reinterpret_cast<int64_t&>(a) != reinterpret_cast<int64_t&>(b);
+}
+
+float QuantizedTypeMinAsFloat(DataType data_type) {
+  switch (data_type) {
+    case DT_QINT8:
+      return Eigen::NumTraits<qint8>::lowest();
+    case DT_QUINT8:
+      return Eigen::NumTraits<quint8>::lowest();
+    case DT_QINT16:
+      return Eigen::NumTraits<qint16>::lowest();
+    case DT_QUINT16:
+      return Eigen::NumTraits<quint16>::lowest();
+    case DT_QINT32:
+      return Eigen::NumTraits<qint32>::lowest();
+    default:
+      return 0.0f;
+  }
+}
+
+float QuantizedTypeMaxAsFloat(DataType data_type) {
+  switch (data_type) {
+    case DT_QINT8:
+      return Eigen::NumTraits<qint8>::highest();
+    case DT_QUINT8:
+      return Eigen::NumTraits<quint8>::highest();
+    case DT_QINT16:
+      return Eigen::NumTraits<qint16>::highest();
+    case DT_QUINT16:
+      return Eigen::NumTraits<quint16>::highest();
+    case DT_QINT32:
+      return Eigen::NumTraits<qint32>::highest();
+    default:
+      return 0.0f;
+  }
+}
+
 }  // namespace
 
 ConstantFolding::ConstantFolding(RewriterConfig::Toggle opt_level,
@@ -716,6 +770,61 @@ Status ConstantFolding::MaterializeReductionIndices(
   return Status::OK();
 }
 
+Status ConstantFolding::MaterializeConstantValuedNode(
+    NodeDef* node, const GraphProperties& properties) {
+  // Nodes that generate constant-valued outputs can be represented compactly in
+  // compressed format, regardless of their shape.
+  const std::vector<OpInfo::TensorProperties>& output_props =
+      properties.GetOutputProperties(node->name());
+  if (output_props.size() != 1) return Status::OK();
+  const auto& output_shape = output_props[0].shape();
+  if (!PartialTensorShape(output_shape).IsFullyDefined()) {
+    return Status::OK();
+  }
+  if (IsFill(*node)) {
+    const auto output_dtype = output_props[0].dtype();
+    NodeDef* input_node = nullptr;
+    for (int i = 0; i < 2; ++i) {
+      input_node = node_map_->GetNode(NodeName(node->input(i)));
+      if (input_node == nullptr || !IsReallyConstant(*input_node)) {
+        return Status::OK();
+      }
+    }
+    TF_RETURN_IF_ERROR(CheckAttrExists(*input_node, "value"));
+    const TensorProto& input_tensor = input_node->attr().at("value").tensor();
+    // TODO(rmlarsen): Handle the case where the value is stored in
+    // tensor_content.
+    if (!input_tensor.tensor_content().empty()) {
+      return Status::OK();
+    }
+    TensorProto* tensor = (*node->mutable_attr())["value"].mutable_tensor();
+    // Copy the input tensor to the fill node, set the output shape, and
+    // change the nodd type to Const.
+    *tensor = input_tensor;
+    *(tensor->mutable_tensor_shape()) = output_shape;
+    (*node->mutable_attr())["dtype"].set_type(output_dtype);
+    node->mutable_attr()->erase("T");
+    node->mutable_attr()->erase("index_type");
+    node->set_op("Const");
+    for (int i = 0; i < 2; i++) {
+      // Change inputs to a control inputs.
+      const string ctrl_dep = AsControlDependency(node->input(i));
+      node_map_->UpdateInput(node->name(), node->input(i), ctrl_dep);
+      node->set_input(i, ctrl_dep);
+    }
+    graph_modified_ = true;
+  } else {
+    double value =
+        (IsZerosLike(*node) ? 0.0 : (IsOnesLike(*node) ? 1.0 : -1.0));
+    bool success = false;
+    if (value >= 0) {
+      TF_RETURN_IF_ERROR(ReplaceOperationWithConstant(
+          value, properties, output_shape, node, graph_, &success));
+    }
+  }
+  return Status::OK();
+}
+
 Status ConstantFolding::MaterializeConstants(
     const GraphProperties& properties) {
   const int node_count = graph_->node_size();
@@ -726,6 +835,8 @@ Status ConstantFolding::MaterializeConstants(
       TF_RETURN_IF_ERROR(MaterializeBroadcastGradientArgs(node, properties));
     } else if (IsReduction(node)) {
       TF_RETURN_IF_ERROR(MaterializeReductionIndices(&node, properties));
+    } else if (IsFill(node) || IsZerosLike(node) || IsOnesLike(node)) {
+      TF_RETURN_IF_ERROR(MaterializeConstantValuedNode(&node, properties));
     }
   }
   return Status::OK();
@@ -870,6 +981,11 @@ Status CreateConstantTensorAttrValue(DataType type, double value,
       SET_TENSOR_VAL_CASE(DT_UINT16, int32, int);
       SET_TENSOR_VAL_CASE(DT_INT8, int32, int);
       SET_TENSOR_VAL_CASE(DT_UINT8, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QINT32, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QINT16, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QUINT16, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QINT8, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QUINT8, int32, int);
       SET_TENSOR_VAL_CASE(DT_BOOL, bool, bool);
     default:
       return errors::InvalidArgument("Unsupported type: ", type);
@@ -897,6 +1013,42 @@ DataType GetDataTypeFromNodeOrProps(const NodeDef& node,
   return dtype;
 }
 
+// Checks whether the shape of the const input of the Mul op is valid to perform
+// the MulConvPushDown optimization.
+bool IsValidConstShapeForMulConvPushDown(
+    const string& data_format, const TensorShapeProto& filter_shape,
+    const TensorShapeProto& mul_const_input_shape) {
+  // If the const is a scalar, or it has fewer or same number of dimensions
+  // than the filter and it only has single element, the optimization should
+  // work.
+  if (mul_const_input_shape.dim_size() <= data_format.size() &&
+      TensorShape(mul_const_input_shape).num_elements() == 1) {
+    return true;
+  }
+
+  // Otherwise, check the eligibility according to data format.
+  if (data_format == "NHWC" || data_format == "NDHWC") {
+    TensorShapeProto new_filter_shape;
+    if (!ShapeAfterBroadcast(filter_shape, mul_const_input_shape,
+                             &new_filter_shape)) {
+      return false;
+    }
+    if (!ShapesSymbolicallyEqual(filter_shape, new_filter_shape)) {
+      return false;
+    }
+    // Only the last dimension could be larger than one, since broadcasting over
+    // the last dimension (the output channel) will result in invalid filter.
+    for (int i = 0; i < mul_const_input_shape.dim_size() - 1; ++i) {
+      if (mul_const_input_shape.dim(i).size() > 1) return false;
+    }
+    return true;
+  } else if (data_format == "NCHW" || data_format == "NCDHW") {
+    // TODO(laigd): support NCHW and NCDHW (b/111214513).
+    return false;
+  }
+  return false;
+}
+
 }  // namespace
 
 // static
@@ -917,29 +1069,28 @@ Status ConstantFolding::CreateNodeDef(const string& name,
   // Use the packed representation whenever possible to avoid generating large
   // graphdefs. Moreover, avoid repeating the last values if they're equal.
   if (tensor->NumElements() > 4) {
-#define POPULATE_TENSOR_PROTO(tensor, t, TYPE, NAME)                  \
-  {                                                                   \
-    const TYPE* val_ptr = tensor->flat<TYPE>().data();                \
-    TYPE last = *val_ptr;                                             \
-    int64 last_index = 0;                                             \
-    for (int64 i = 0; i < tensor->NumElements(); ++i) {               \
-      TYPE cur = *val_ptr++;                                          \
-      if (cur != last) {                                              \
-        last = cur;                                                   \
-        last_index = i;                                               \
-      }                                                               \
-    }                                                                 \
-    if (last_index < kint32max) {                                     \
-      optimized = true;                                               \
-      encoded_size = (last_index + 1) * sizeof(NAME);                 \
-      t->mutable_##NAME##_val()->Reserve(last_index + 1);             \
-      t->mutable_##NAME##_val()->AddNAlreadyReserved(last_index + 1); \
-      val_ptr = tensor->flat<TYPE>().data();                          \
-      for (int64 i = 0; i <= last_index; ++i) {                       \
-        t->set_##NAME##_val(i, *val_ptr++);                           \
-      }                                                               \
-    }                                                                 \
-  }                                                                   \
+#define POPULATE_TENSOR_PROTO(tensor, t, TYPE, NAME)                      \
+  {                                                                       \
+    const auto* val_ptr = tensor->flat<TYPE>().data();                    \
+    auto last = *val_ptr;                                                 \
+    int64 last_index = 0;                                                 \
+    for (int64 i = 0; i < tensor->NumElements(); ++i) {                   \
+      TYPE cur = *val_ptr++;                                              \
+      if (PackedValuesNotEqual(cur, last)) {                              \
+        last = cur;                                                       \
+        last_index = i;                                                   \
+      }                                                                   \
+    }                                                                     \
+    if (last_index < kint32max) {                                         \
+      optimized = true;                                                   \
+      encoded_size = (last_index + 1) * sizeof(NAME);                     \
+      t->mutable_##NAME##_val()->Reserve(last_index + 1);                 \
+      const auto* src_ptr = tensor->flat<TYPE>().data();                  \
+      auto* dst_ptr =                                                     \
+          t->mutable_##NAME##_val()->AddNAlreadyReserved(last_index + 1); \
+      std::copy(src_ptr, src_ptr + last_index + 1, dst_ptr);              \
+    }                                                                     \
+  }                                                                       \
   break
 
     switch (tensor->dtype()) {
@@ -975,6 +1126,8 @@ Status ConstantFolding::CreateNodeDef(const string& name,
     t->set_dtype(tensor->dtype());
     tensor->shape().AsProto(t->mutable_tensor_shape());
   } else {
+    // DT_HALF, DT_BFLOAT16, DT_QINT32, DT_QINT16, DT_QUINT16, DT_QINT8,
+    // DT_QUINT8
     tensor->AsProtoTensorContent(t);
     encoded_size = t->tensor_content().size();
   }
@@ -1059,98 +1212,103 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
   return Status::OK();
 }
 
-Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph,
-                                 bool* result_too_large) {
-  if (IsMerge(*node)) {
-    // Merge nodes are special, in the sense that they execute as soon as one of
-    // their input is ready. We can therefore fold a merge node iff it has at
-    // least one constant input without control dependency.
-    // We still need to ensure that the nodes in the fanin of the merge node are
-    // scheduled. We'll therefore add a control dependency from the merge node
-    // to the folded constant. We end up with:
-    //  * the merge node and its inputs are preserved as is
-    //  * a new constant node C1, driven by the merge node through a control
-    //  dependency, initialized to the value of the folded input
-    //  * a new constant node C2, driven by the merge node through a control
-    //  dependency, initialized to the index of the folded input
-    //  * the fanout of the merge nodes is rewired to be driven by either C1 or
-    //  C2.
-    for (int input_index = 0; input_index < node->input_size(); ++input_index) {
-      const auto& input = node->input(input_index);
-      if (IsControlInput(input)) {
-        // Try the next input.
-        continue;
-      }
-      NodeDef* input_node = node_map_->GetNode(input);
-      if (!IsReallyConstant(*input_node)) {
-        continue;
-      }
-      bool valid_input = true;
-      for (const string& fanin_of_input : input_node->input()) {
-        if (IsControlInput(fanin_of_input)) {
-          valid_input = false;
-          break;
-        }
-      }
-      if (!valid_input) {
-        // Try the next input
-        continue;
+Status ConstantFolding::FoldMergeNode(NodeDef* node, GraphDef* output_graph) {
+  // Merge nodes are special, in the sense that they execute as soon as one of
+  // their input is ready. We can therefore fold a merge node iff it has at
+  // least one constant input without control dependency.
+  // We still need to ensure that the nodes in the fanin of the merge node are
+  // scheduled. We'll therefore add a control dependency from the merge node
+  // to the folded constant. We end up with:
+  //  * the merge node and its inputs are preserved as is
+  //  * a new constant node C1, driven by the merge node through a control
+  //  dependency, initialized to the value of the folded input
+  //  * a new constant node C2, driven by the merge node through a control
+  //  dependency, initialized to the index of the folded input
+  //  * the fanout of the merge nodes is rewired to be driven by either C1 or
+  //  C2.
+  for (int input_index = 0; input_index < node->input_size(); ++input_index) {
+    const auto& input = node->input(input_index);
+    if (IsControlInput(input)) {
+      // Try the next input.
+      continue;
+    }
+    NodeDef* input_node = node_map_->GetNode(input);
+    if (!IsReallyConstant(*input_node)) {
+      continue;
+    }
+    bool valid_input = true;
+    for (const string& fanin_of_input : input_node->input()) {
+      if (IsControlInput(fanin_of_input)) {
+        valid_input = false;
+        break;
       }
+    }
+    if (!valid_input) {
+      // Try the next input
+      continue;
+    }
 
-      string const_out_name = OptimizedNodeName(*node, "_const");
-      string const_index_name = OptimizedNodeName(*node, "_index");
-      if (node_map_->GetNode(const_out_name) ||
-          node_map_->GetNode(const_index_name)) {
-        // Intended name already exists.
-        return errors::AlreadyExists(
-            strings::StrCat(const_out_name, " or ", const_index_name,
-                            " already present in the graph"));
-      }
-
-      NodeDef* const_out = output_graph->add_node();
-      *const_out = *input_node;
-      const_out->set_name(const_out_name);
-      const_out->set_device(node->device());
-      *const_out->add_input() = AsControlDependency(*node);
-      node_map_->AddNode(const_out->name(), const_out);
-      node_map_->AddOutput(node->name(), const_out->name());
-
-      NodeDef* const_index = output_graph->add_node();
-      const_index->set_op("Const");
-      Tensor index(DT_INT32, TensorShape({}));
-      index.flat<int32>()(0) = input_index;
-      (*const_index->mutable_attr())["dtype"].set_type(DT_INT32);
-      index.AsProtoTensorContent(
-          (*const_index->mutable_attr())["value"].mutable_tensor());
-      const_index->set_name(const_index_name);
-      const_index->set_device(node->device());
-      *const_index->add_input() = AsControlDependency(*node);
-      node_map_->AddNode(const_index->name(), const_index);
-      node_map_->AddOutput(node->name(), const_index->name());
-
-      auto outputs = node_map_->GetOutputs(node->name());
-      for (NodeDef* output : outputs) {
-        for (int i = 0; i < output->input_size(); i++) {
-          int port;
-          string node_name = ParseNodeName(output->input(i), &port);
-          if (node_name == node->name()) {
-            if (port == 0) {
-              *output->mutable_input(i) = const_out->name();
-              node_map_->AddOutput(const_out->name(), output->name());
-            } else if (port == 1) {
-              *output->mutable_input(i) = const_index->name();
-              node_map_->AddOutput(const_index->name(), output->name());
-            } else {
-              // This is a control dependency (or an invalid edge since the
-              // merge node has only 2 inputs): preserve them.
-            }
+    string const_out_name = OptimizedNodeName(*node, "_const");
+    string const_index_name = OptimizedNodeName(*node, "_index");
+    if (node_map_->GetNode(const_out_name) ||
+        node_map_->GetNode(const_index_name)) {
+      // Intended name already exists.
+      return errors::AlreadyExists(
+          strings::StrCat(const_out_name, " or ", const_index_name,
+                          " already present in the graph"));
+    }
+
+    NodeDef* const_out = output_graph->add_node();
+    *const_out = *input_node;
+    const_out->set_name(const_out_name);
+    const_out->set_device(node->device());
+    *const_out->add_input() = AsControlDependency(*node);
+    node_map_->AddNode(const_out->name(), const_out);
+    node_map_->AddOutput(node->name(), const_out->name());
+
+    NodeDef* const_index = output_graph->add_node();
+    const_index->set_op("Const");
+    Tensor index(DT_INT32, TensorShape({}));
+    index.flat<int32>()(0) = input_index;
+    (*const_index->mutable_attr())["dtype"].set_type(DT_INT32);
+    index.AsProtoTensorContent(
+        (*const_index->mutable_attr())["value"].mutable_tensor());
+    const_index->set_name(const_index_name);
+    const_index->set_device(node->device());
+    *const_index->add_input() = AsControlDependency(*node);
+    node_map_->AddNode(const_index->name(), const_index);
+    node_map_->AddOutput(node->name(), const_index->name());
+
+    auto outputs = node_map_->GetOutputs(node->name());
+    for (NodeDef* output : outputs) {
+      for (int i = 0; i < output->input_size(); i++) {
+        int port;
+        string node_name = ParseNodeName(output->input(i), &port);
+        if (node_name == node->name()) {
+          if (port == 0) {
+            *output->mutable_input(i) = const_out->name();
+            node_map_->AddOutput(const_out->name(), output->name());
+          } else if (port == 1) {
+            *output->mutable_input(i) = const_index->name();
+            node_map_->AddOutput(const_index->name(), output->name());
+          } else {
+            // This is a control dependency (or an invalid edge since the
+            // merge node has only 2 inputs): preserve them.
           }
         }
       }
-      return Status::OK();
     }
     return Status::OK();
   }
+  return Status::OK();
+}
+
+Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph,
+                                 bool* result_too_large) {
+  *result_too_large = false;
+  if (IsMerge(*node)) {
+    return FoldMergeNode(node, output_graph);
+  }
 
   std::vector<NodeDef> const_nodes;
   TF_RETURN_IF_ERROR(
@@ -1395,7 +1553,8 @@ bool ConstantFolding::IsOnes(const NodeDef& node) const {
   if (feed_nodes_.find(node.name()) != feed_nodes_.end()) {
     return false;
   }
-  if (node.op() == "OnesLike") return true;
+  if (IsOnesLike(node)) return true;
+  if (IsZerosLike(node)) return false;
   if (node.op() == "Fill") {
     NodeDef* values = node_map_->GetNode(NodeName(node.input(1)));
     return values != nullptr && IsOnes(*values);
@@ -1417,6 +1576,11 @@ bool ConstantFolding::IsOnes(const NodeDef& node) const {
     IS_ONES_CASE(DT_INT16);
     IS_ONES_CASE(DT_INT32);
     IS_ONES_CASE(DT_INT64);
+    IS_ONES_CASE(DT_QINT32);
+    IS_ONES_CASE(DT_QINT16);
+    IS_ONES_CASE(DT_QUINT16);
+    IS_ONES_CASE(DT_QINT8);
+    IS_ONES_CASE(DT_QUINT8);
     default:
       VLOG(1) << "Unsupported type " << DataTypeString(dtype);
       return false;
@@ -1428,7 +1592,8 @@ bool ConstantFolding::IsZeros(const NodeDef& node) const {
   if (feed_nodes_.find(node.name()) != feed_nodes_.end()) {
     return false;
   }
-  if (node.op() == "ZerosLike") return true;
+  if (IsOnesLike(node)) return false;
+  if (IsZerosLike(node)) return true;
   if (node.op() == "Fill") {
     NodeDef* values = node_map_->GetNode(NodeName(node.input(1)));
     return values != nullptr && IsZeros(*values);
@@ -1450,6 +1615,11 @@ bool ConstantFolding::IsZeros(const NodeDef& node) const {
     IS_ZEROS_CASE(DT_INT16);
     IS_ZEROS_CASE(DT_INT32);
     IS_ZEROS_CASE(DT_INT64);
+    IS_ZEROS_CASE(DT_QINT32);
+    IS_ZEROS_CASE(DT_QINT16);
+    IS_ZEROS_CASE(DT_QUINT16);
+    IS_ZEROS_CASE(DT_QINT8);
+    IS_ZEROS_CASE(DT_QUINT8);
     default:
       VLOG(1) << "Unsupported type " << DataTypeString(dtype);
       return false;
@@ -1562,6 +1732,7 @@ Status ConstantFolding::ReplaceOperationWithConstant(
     node->set_input(i, ctrl_dep);
   }
   *success = true;
+  graph_modified_ = true;
   return Status::OK();
 }
 
@@ -2458,6 +2629,7 @@ Status ConstantFolding::SimplifyArithmeticOperations(
   *success = false;
   const bool is_mul = IsMul(*node) || IsLogicalAnd(*node);
   const bool is_matmul = IsMatMul(*node);
+  const bool is_quantized_matmul = IsQuantizedMatMul(*node);
   const bool is_add = IsAdd(*node) || IsBiasAdd(*node) || IsLogicalOr(*node);
   const bool is_sub = IsSub(*node);
   const bool is_any_div = IsAnyDiv(*node);
@@ -2552,6 +2724,10 @@ Status ConstantFolding::SimplifyArithmeticOperations(
         if (!replace_op_status.ok()) {
           return replace_op_status;
         } else if (replace_succeed) {
+          if (is_quantized_matmul) {
+            TF_RETURN_IF_ERROR(
+                AddQuantizedMatMulMinMaxOutConstNodes(node, optimized_graph));
+          }
           *success = true;
           return Status::OK();
         }
@@ -2715,118 +2891,110 @@ bool ConstantFolding::MulConvPushDown(GraphDef* optimized_graph, NodeDef* node,
   //                 X  C1                       C1  C2
   //
   // where C1 and C2 are constants and X is non-constant.
-  if (IsMul(*node) && NumNonControlInputs(*node) == 2) {
-    NodeDef* mul_left_child = node_map_->GetNode(node->input(0));
-    NodeDef* mul_right_child = node_map_->GetNode(node->input(1));
-    // One child must be constant, and the second must be Conv op.
-    const bool left_child_is_constant = IsReallyConstant(*mul_left_child);
-    const bool right_child_is_constant = IsReallyConstant(*mul_right_child);
-    if (!left_child_is_constant && !right_child_is_constant) {
-      return false;
-    }
-    NodeDef* conv_node =
-        left_child_is_constant ? mul_right_child : mul_left_child;
-    if (!IsConv2D(*conv_node) && !IsConv3D(*conv_node)) {
-      return false;
-    }
-    if (node->device() != mul_left_child->device() ||
-        node->device() != mul_right_child->device()) {
-      return false;
-    }
-
-    // Make sure that it is safe to change the value of the convolution
-    // output.
-    if (conv_node->input_size() < 2 ||
-        NumNonControlOutputs(*conv_node, *node_map_) > 1 ||
-        nodes_to_preserve_.find(conv_node->name()) !=
-            nodes_to_preserve_.end()) {
-      return false;
-    }
-
-    // Identify the nodes to swap.
-    NodeDef* conv_left_child = node_map_->GetNode(conv_node->input(0));
-    NodeDef* conv_right_child = node_map_->GetNode(conv_node->input(1));
-    const bool conv_left_is_constant = IsReallyConstant(*conv_left_child);
-    const bool conv_right_is_constant = IsReallyConstant(*conv_right_child);
-    if (!conv_left_is_constant && !conv_right_is_constant) {
-      // At least one of the convolution inputs should be constant.
-      return false;
-    }
-    if (conv_left_is_constant && conv_right_is_constant) {
-      // Leverage regular constant folding to handle this.
-      return false;
-    }
-    const auto& mul_props = properties.GetOutputProperties(node->name());
-    const auto& conv_props = properties.GetOutputProperties(conv_node->name());
-    if (mul_props.empty() || conv_props.empty()) {
-      return false;
-    }
-    const auto& mul_shape = mul_props[0].shape();
-    const auto& conv_shape = conv_props[0].shape();
-    if (!ShapesSymbolicallyEqual(mul_shape, conv_shape)) {
-      return false;
-    }
-
-    const auto& input_props = properties.GetInputProperties(conv_node->name());
-    if (input_props.size() < 2) {
-      return false;
-    }
-    const auto& filter_shape = input_props[1].shape();
+  if (!IsMul(*node) || NumNonControlInputs(*node) != 2) return false;
+
+  NodeDef* mul_left_child = node_map_->GetNode(node->input(0));
+  NodeDef* mul_right_child = node_map_->GetNode(node->input(1));
+  // One child must be constant, and the second must be Conv op.
+  const bool left_child_is_constant = IsReallyConstant(*mul_left_child);
+  const bool right_child_is_constant = IsReallyConstant(*mul_right_child);
+  if (!left_child_is_constant && !right_child_is_constant) {
+    return false;
+  }
+  NodeDef* conv_node =
+      left_child_is_constant ? mul_right_child : mul_left_child;
+  if (!IsConv2D(*conv_node) && !IsConv3D(*conv_node)) {
+    return false;
+  }
+  if (node->device() != mul_left_child->device() ||
+      node->device() != mul_right_child->device()) {
+    return false;
+  }
 
-    NodeDef* const_node =
-        left_child_is_constant ? mul_left_child : mul_right_child;
-    const auto& const_props =
-        properties.GetOutputProperties(const_node->name());
-    if (const_props.empty()) {
-      return false;
-    }
-    const auto& const_shape = const_props[0].shape();
+  // Make sure that it is safe to change the value of the convolution
+  // output.
+  if (conv_node->input_size() < 2 ||
+      NumNonControlOutputs(*conv_node, *node_map_) > 1 ||
+      nodes_to_preserve_.find(conv_node->name()) != nodes_to_preserve_.end()) {
+    return false;
+  }
 
-    TensorShapeProto new_filter_shape;
-    if (!ShapeAfterBroadcast(filter_shape, const_shape, &new_filter_shape)) {
-      return false;
-    }
-    if (!ShapesSymbolicallyEqual(filter_shape, new_filter_shape)) {
-      return false;
-    }
+  // Identify the nodes to swap.
+  NodeDef* conv_left_child = node_map_->GetNode(conv_node->input(0));
+  NodeDef* conv_right_child = node_map_->GetNode(conv_node->input(1));
+  const bool conv_left_is_constant = IsReallyConstant(*conv_left_child);
+  const bool conv_right_is_constant = IsReallyConstant(*conv_right_child);
+  if (!conv_left_is_constant && !conv_right_is_constant) {
+    // At least one of the convolution inputs should be constant.
+    return false;
+  }
+  if (conv_left_is_constant && conv_right_is_constant) {
+    // Leverage regular constant folding to handle this.
+    return false;
+  }
+  const auto& mul_props = properties.GetOutputProperties(node->name());
+  const auto& conv_props = properties.GetOutputProperties(conv_node->name());
+  if (mul_props.empty() || conv_props.empty()) {
+    return false;
+  }
+  const auto& mul_shape = mul_props[0].shape();
+  const auto& conv_shape = conv_props[0].shape();
+  if (!ShapesSymbolicallyEqual(mul_shape, conv_shape)) {
+    return false;
+  }
 
-    string mul_new_name =
-        AddPrefixToNodeName("merged_input", conv_node->name());
-    if (node_map_->NodeExists(mul_new_name)) {
-      return false;
-    }
-    // Make sure we don't introduce loops in the graph by removing control
-    // dependencies from the conv2d node to c2.
-    string conv_const_input =
-        conv_left_is_constant ? conv_node->input(0) : conv_node->input(1);
-    if (MaybeRemoveControlInput(conv_node->name(), const_node, optimized_graph,
-                                node_map_.get())) {
-      // Add a control dep from c1 to c2 to ensure c2 is in the right frame
-      MaybeAddControlInput(conv_const_input, const_node, optimized_graph,
-                           node_map_.get());
-    }
+  const auto& input_props = properties.GetInputProperties(conv_node->name());
+  if (input_props.size() < 2) {
+    return false;
+  }
+  const auto& filter_shape = input_props[1].shape();
 
-    conv_node->set_name(node->name());
-    node->set_name(mul_new_name);
-    if (conv_left_is_constant) {
-      node_map_->UpdateInput(conv_node->name(), node->input(0), mul_new_name);
-      conv_node->set_input(0, mul_new_name);
-    } else {
-      node_map_->UpdateInput(conv_node->name(), node->input(1), mul_new_name);
-      conv_node->set_input(1, mul_new_name);
-    }
-    NodeDef* conv_const_node =
-        conv_left_is_constant ? conv_left_child : conv_right_child;
-    if (left_child_is_constant) {
-      node->set_input(1, conv_const_node->name());
-    } else {
-      node->set_input(0, conv_const_node->name());
-    }
-    node_map_->AddNode(mul_new_name, node);
+  NodeDef* const_node =
+      left_child_is_constant ? mul_left_child : mul_right_child;
+  const auto& const_props = properties.GetOutputProperties(const_node->name());
+  if (const_props.empty()) {
+    return false;
+  }
+  const auto& const_shape = const_props[0].shape();
+  if (!IsValidConstShapeForMulConvPushDown(
+          conv_node->attr().at("data_format").s(), filter_shape, const_shape)) {
+    return false;
+  }
 
-    return true;
+  string mul_new_name = AddPrefixToNodeName("merged_input", conv_node->name());
+  if (node_map_->NodeExists(mul_new_name)) {
+    return false;
   }
-  return false;
+  // Make sure we don't introduce loops in the graph by removing control
+  // dependencies from the conv2d node to c2.
+  string conv_const_input =
+      conv_left_is_constant ? conv_node->input(0) : conv_node->input(1);
+  if (MaybeRemoveControlInput(conv_node->name(), const_node, optimized_graph,
+                              node_map_.get())) {
+    // Add a control dep from c1 to c2 to ensure c2 is in the right frame
+    MaybeAddControlInput(conv_const_input, const_node, optimized_graph,
+                         node_map_.get());
+  }
+
+  conv_node->set_name(node->name());
+  node->set_name(mul_new_name);
+  if (conv_left_is_constant) {
+    node_map_->UpdateInput(conv_node->name(), node->input(0), mul_new_name);
+    conv_node->set_input(0, mul_new_name);
+  } else {
+    node_map_->UpdateInput(conv_node->name(), node->input(1), mul_new_name);
+    conv_node->set_input(1, mul_new_name);
+  }
+  NodeDef* conv_const_node =
+      conv_left_is_constant ? conv_left_child : conv_right_child;
+  if (left_child_is_constant) {
+    node->set_input(1, conv_const_node->name());
+  } else {
+    node->set_input(0, conv_const_node->name());
+  }
+  node_map_->AddNode(mul_new_name, node);
+
+  return true;
 }
 
 bool ConstantFolding::PartialConstPropThroughIdentityN(NodeDef* node) {
@@ -3127,6 +3295,65 @@ bool ConstantFolding::MergeConcat(const GraphProperties& properties,
   return true;
 }
 
+Status ConstantFolding::AddQuantizedMatMulMinMaxOutConstNodes(
+    NodeDef* node, GraphDef* optimized_graph) {
+  auto add_quantized_out = [this, node, optimized_graph](
+                               const string& out_const_name, int index) {
+    NodeDef* out_node = optimized_graph->add_node();
+    Tensor value(DT_FLOAT, TensorShape({}));
+    const bool is_min = index == 1;
+    const DataType type_attr = node->attr().at("dtype").type();
+
+    value.flat<float>()(0) = is_min ? QuantizedTypeMinAsFloat(type_attr)
+                                    : QuantizedTypeMaxAsFloat(type_attr);
+    TF_RETURN_IF_ERROR(
+        CreateNodeDef(out_const_name, TensorValue(&value), out_node));
+    node_map_->AddNode(out_const_name, out_node);
+    out_node->set_device(node->device());
+
+    // Copy all inputs from node.
+    out_node->mutable_input()->CopyFrom(node->input());
+    for (const string& input : out_node->input()) {
+      node_map_->AddOutput(NodeName(input), out_const_name);
+    }
+
+    // Update output nodes consuming node:index to new const node.
+    string old_input = absl::StrCat(node->name(), ":", index);
+    int old_node_count = 0;
+    auto outputs = node_map_->GetOutputs(node->name());
+    for (const auto& output : outputs) {
+      for (int i = 0; i < output->input_size(); ++i) {
+        if (output->input(i) == old_input) {
+          output->set_input(i, out_const_name);
+          node_map_->AddOutput(out_const_name, output->name());
+        } else if (NodeName(output->input(i)) == node->name()) {
+          ++old_node_count;
+        }
+      }
+      if (old_node_count == 0) {
+        node_map_->RemoveOutput(node->name(), output->name());
+      }
+    }
+
+    return Status::OK();
+  };
+  const string min_out_const_name =
+      OptimizedNodeName(*node, "-quantized_matmul_min_out");
+  const string max_out_const_name =
+      OptimizedNodeName(*node, "-quantized_matmul_max_out");
+  if (node_map_->GetNode(min_out_const_name) == nullptr &&
+      node_map_->GetNode(max_out_const_name) == nullptr) {
+    TF_RETURN_IF_ERROR(add_quantized_out(min_out_const_name, 1));
+    TF_RETURN_IF_ERROR(add_quantized_out(max_out_const_name, 2));
+  } else {
+    return errors::Internal(absl::Substitute(
+        "Can't create Const for QuantizedMatMul min_out/max_out of "
+        "node '$0' because of node name conflict",
+        node->name()));
+  }
+  return Status::OK();
+}
+
 Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
                                             const GrapplerItem& item,
                                             GraphDef* optimized_graph) {
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index 99200925cb351478bd188361c33b88634caffa26..418176c8932639f4f8bbef8f636c33b56d36f1c2 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -67,8 +67,10 @@ class ConstantFolding : public GraphOptimizer {
                                           const GraphProperties& properties);
   Status MaterializeReductionIndices(NodeDef* node,
                                      const GraphProperties& properties);
-
+  Status MaterializeConstantValuedNode(NodeDef* node,
+                                       const GraphProperties& properties);
   Status MaterializeConstants(const GraphProperties& properties);
+
   bool IsFoldable(const NodeDef& node) const;
 
   Status EvaluateNode(const NodeDef& node,
@@ -78,6 +80,7 @@ class ConstantFolding : public GraphOptimizer {
   Status EvaluateOneFoldable(const NodeDef& node, std::vector<NodeDef>* outputs,
                              bool* result_too_large);
 
+  Status FoldMergeNode(NodeDef* node, GraphDef* output_graph);
   Status FoldNode(NodeDef* node, GraphDef* output_graph,
                   bool* result_too_large);
 
@@ -233,6 +236,9 @@ class ConstantFolding : public GraphOptimizer {
   bool MergeConcat(const GraphProperties& properties, bool use_shape_info,
                    GraphDef* optimized_graph, NodeDef* node);
 
+  Status AddQuantizedMatMulMinMaxOutConstNodes(NodeDef* node,
+                                               GraphDef* optimized_graph);
+
   // Points to an externally provided device or to owned_device_;
   RewriterConfig::Toggle opt_level_;
   DeviceBase* cpu_device_;
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index d7cabf5a8b8ad6659937e868df7635292936d48c..76e149d0ae3c5766205e42f6e2486a825d382b66 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -119,6 +119,100 @@ class ConstantFoldingTest : public GrapplerTest {
       }
     }
   }
+
+  void MulConvPushDownTest(const TensorShape& input_shape,
+                           const TensorShape& filter_shape,
+                           const TensorShape& mul_const_input_shape,
+                           const bool use_3d_conv, const char* padding,
+                           const char* data_format, const bool expect_folded) {
+    // Tests if the following rewrite is performed:
+    //
+    //         *                       Conv2D
+    //        / \                       / \
+    //       c  Conv2D        -->      x  (c * filter)
+    //           / \
+    //          x  filter
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+    Tensor filter_values(DT_FLOAT, filter_shape);
+    for (int i = 0; i < filter_values.NumElements(); ++i) {
+      filter_values.flat<float>()(i) = std::sqrt(static_cast<float>(i));
+    }
+    Output filter =
+        ops::Const(s.WithOpName("filter"), Input::Initializer(filter_values));
+
+    Output input = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                                    ops::Placeholder::Shape(input_shape));
+
+    Output conv;
+    if (use_3d_conv) {
+      conv = ops::Conv3D(s.WithOpName("conv"), input, filter, {1, 1, 1, 1, 1},
+                         padding, ops::Conv3D::DataFormat(data_format));
+    } else {
+      conv = ops::Conv2D(s.WithOpName("conv"), input, filter, {1, 1, 1, 1},
+                         padding, ops::Conv2D::DataFormat(data_format));
+    }
+    Tensor mul_const_input(DT_FLOAT, mul_const_input_shape);
+    for (int i = 0; i < mul_const_input.NumElements(); ++i) {
+      mul_const_input.flat<float>()(i) = static_cast<float>(i + 3);
+    }
+    Output c =
+        ops::Const(s.WithOpName("c"), Input::Initializer(mul_const_input));
+    Output mul = ops::Mul(s.WithOpName("mul"), c, conv);
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
+    GraphDef output;
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+    TF_EXPECT_OK(status);
+
+    EXPECT_EQ(5, output.node_size());
+    int found = 0;
+    if (expect_folded) {
+      for (const auto& node : output.node()) {
+        if (node.name() == "mul") {
+          found++;
+          EXPECT_EQ(use_3d_conv ? "Conv3D" : "Conv2D", node.op());
+          EXPECT_EQ(2, node.input_size());
+          EXPECT_EQ("x", node.input(0));
+          EXPECT_EQ("conv/merged_input", node.input(1));
+        } else if (node.name() == "conv/merged_input") {
+          found++;
+          EXPECT_EQ("Const", node.op());
+          EXPECT_EQ(0, node.input_size());
+        }
+      }
+    } else {
+      for (const auto& node : output.node()) {
+        if (node.name() == "mul") {
+          found++;
+          EXPECT_EQ("Mul", node.op());
+          EXPECT_EQ(2, node.input_size());
+          EXPECT_EQ("c", node.input(0));
+          EXPECT_EQ("conv", node.input(1));
+        } else if (node.name() == "conv") {
+          found++;
+          EXPECT_EQ(use_3d_conv ? "Conv3D" : "Conv2D", node.op());
+          EXPECT_EQ(2, node.input_size());
+          EXPECT_EQ("x", node.input(0));
+          EXPECT_EQ("filter", node.input(1));
+        }
+      }
+    }
+    EXPECT_EQ(2, found);
+
+    // Check that const folded multiplication node has the expected value.
+    std::vector<string> fetch = {"mul"};
+    Tensor value(DT_FLOAT, input_shape);
+    for (int i = 0; i < value.NumElements(); ++i) {
+      value.flat<float>()(i) = i;
+    }
+    auto actual = EvaluateNodes(output, fetch, {{"x", value}});
+    auto expected = EvaluateNodes(item.graph, fetch, {{"x", value}});
+    test::ExpectTensorEqual<float>(expected[0], actual[0]);
+  }
 };
 
 TEST_F(ConstantFoldingTest, SimpleFolding) {
@@ -242,73 +336,147 @@ TEST_F(ConstantFoldingTest, AddTree) {
   }
 }
 
-TEST_F(ConstantFoldingTest, ConvPushDownTest) {
-  // Tests if the following rewrite is performed:
-  //
-  //         *                       Conv2D
-  //        / \                       / \
-  //       c  Conv2D        -->      x  (c * filter)
-  //           / \
-  //          x  filter
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_ScalarConst) {
+  for (string data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    MulConvPushDownTest(
+        /*input_shape=*/data_format == "NHWC" ? TensorShape{4, 10, 10, 3}
+                                              : TensorShape{4, 3, 10, 10},
+        /*filter_shape=*/{2, 2, 3, 5},
+        /*mul_const_input_shape=*/{},
+        /*use_3d_conv=*/false,
+        /*padding=*/"VALID", data_format.c_str(),
+        /*expect_folded=*/true);
+  }
+}
 
-  int input_depth = 3;
-  int filter_count = 5;
-  int filter_size = 2;
-  TensorShape filter_shape(
-      {filter_size, filter_size, input_depth, filter_count});
-  Tensor filter_values(DT_FLOAT, filter_shape);
-  for (int i = 0; i < filter_values.NumElements(); ++i) {
-    filter_values.flat<float>()(i) = std::sqrt(static_cast<float>(i));
-  }
-  Output filter =
-      ops::Const(s.WithOpName("filter"), Input::Initializer(filter_values));
-
-  int batch_size = 4;
-  int input_dim = 10;
-  TensorShape input_shape({batch_size, input_dim, input_dim, input_depth});
-  Output input = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
-                                  ops::Placeholder::Shape(input_shape));
-
-  Output conv =
-      ops::Conv2D(s.WithOpName("conv"), input, filter, {1, 1, 1, 1}, "VALID");
-  Output c = ops::Const(s.WithOpName("c"), 3.0f, {1});
-  Output mul = ops::Mul(s.WithOpName("mul"), c, conv);
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_SingletonConst) {
+  for (string data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    for (auto mul_const_input_shape :
+         {TensorShape{1}, TensorShape{1, 1, 1, 1}}) {
+      MulConvPushDownTest(
+          /*input_shape=*/data_format == "NHWC" ? TensorShape{4, 10, 10, 3}
+                                                : TensorShape{4, 3, 10, 10},
+          /*filter_shape=*/{2, 2, 3, 5}, mul_const_input_shape,
+          /*use_3d_conv=*/false,
+          /*padding=*/"VALID", data_format.c_str(),
+          /*expect_folded=*/true);
+    }
+  }
+}
 
-  GrapplerItem item;
-  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+TEST_F(ConstantFoldingTest,
+       MulConvPushDownTest_Conv2D_SingletonConst_ShapeMismatch) {
+  for (string data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    MulConvPushDownTest(
+        /*input_shape=*/data_format == "NHWC" ? TensorShape{4, 10, 10, 3}
+                                              : TensorShape{4, 3, 10, 10},
+        /*filter_shape=*/{2, 2, 3, 5},
+        /*mul_const_input_shape=*/{1, 1, 1, 1, 1},
+        /*use_3d_conv=*/false,
+        /*padding=*/"VALID", data_format.c_str(),
+        /*expect_folded=*/false);
+  }
+}
 
-  ConstantFolding optimizer(/*cpu_device=*/nullptr);
-  GraphDef output;
-  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
-  TF_EXPECT_OK(status);
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_3x1x3Const) {
+  for (auto data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    MulConvPushDownTest(
+        /*input_shape=*/{3, 3, 3, 3},
+        /*filter_shape=*/{3, 3, 3, 3},
+        /*mul_const_input_shape=*/{3, 1, 3},
+        /*use_3d_conv=*/false,
+        /*padding=*/"SAME", data_format,
+        /*expect_folded=*/false);
+  }
+}
 
-  EXPECT_EQ(5, output.node_size());
-  int found = 0;
-  for (const auto& node : output.node()) {
-    if (node.name() == "mul") {
-      found++;
-      EXPECT_EQ("Conv2D", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-      EXPECT_EQ("conv/merged_input", node.input(1));
-    } else if (node.name() == "conv/merged_input") {
-      found++;
-      EXPECT_EQ("Const", node.op());
-      EXPECT_EQ(0, node.input_size());
-    }
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_NHWC_VectorLikeConst) {
+  for (auto mul_const_input_shape :
+       {TensorShape{3}, TensorShape{1, 3}, TensorShape{1, 1, 1, 3}}) {
+    MulConvPushDownTest(
+        /*input_shape=*/{3, 3, 3, 3},
+        /*filter_shape=*/{3, 3, 3, 3}, mul_const_input_shape,
+        /*use_3d_conv=*/false,
+        /*padding=*/"SAME",
+        /*data_format=*/"NHWC",
+        /*expect_folded=*/true);
   }
-  EXPECT_EQ(2, found);
+}
 
-  // Check that const folded multiplication node has the expected value.
-  std::vector<string> fetch = {"mul"};
-  Tensor value(DT_FLOAT, input_shape);
-  for (int i = 0; i < value.NumElements(); ++i) {
-    value.flat<float>()(i) = i;
+#if GOOGLE_CUDA
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_NCHW_VectorLikeConst) {
+  for (auto mul_const_input_shape :
+       {TensorShape{3}, TensorShape{3, 1, 1}, TensorShape{1, 3, 1, 1}}) {
+    MulConvPushDownTest(
+        /*input_shape=*/{3, 3, 3, 3},
+        /*filter_shape=*/{3, 3, 3, 3}, mul_const_input_shape,
+        /*use_3d_conv=*/false,
+        /*padding=*/"SAME",
+        /*data_format=*/"NCHW",
+        // TODO(laigd): optimization should happen in this case.
+        /*expect_folded=*/false);
   }
-  auto actual = EvaluateNodes(output, fetch, {{"x", value}});
-  auto expected = EvaluateNodes(item.graph, fetch, {{"x", value}});
-  test::ExpectTensorEqual<float>(expected[0], actual[0]);
+}
+#endif  // GOOGLE_CUDA
+
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_3x1Const) {
+  for (auto data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    MulConvPushDownTest(
+        /*input_shape=*/{3, 3, 3, 3},
+        /*filter_shape=*/{3, 3, 3, 3},
+        /*mul_const_input_shape=*/{3, 1},
+        /*use_3d_conv=*/false,
+        /*padding=*/"SAME", data_format,
+        /*expect_folded=*/false);
+  }
+}
+
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv3D_NDHWC_1x1x3Const) {
+  MulConvPushDownTest(
+      /*input_shape=*/{3, 3, 3, 3, 3},
+      /*filter_shape=*/{3, 3, 3, 3, 3},
+      /*mul_const_input_shape=*/{1, 1, 3},
+      /*use_3d_conv=*/true,
+      /*padding=*/"SAME",
+      /*data_format=*/"NDHWC",
+      /*expect_folded=*/true);
+}
+
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv3D_NCDHW_3x1x1x1Const) {
+  MulConvPushDownTest(
+      /*input_shape=*/{3, 3, 3, 3, 3},
+      /*filter_shape=*/{3, 3, 3, 3, 3},
+      /*mul_const_input_shape=*/{3, 1, 1, 1},
+      /*use_3d_conv=*/true,
+      /*padding=*/"SAME",
+      /*data_format=*/"NDHWC",
+      // TODO(laigd): optimization should happen in this case.
+      /*expect_folded=*/false);
 }
 
 TEST_F(ConstantFoldingTest, NeutralElement) {
@@ -378,7 +546,7 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
     const string ones_name = strings::StrCat("ones", suffix);
     const string ctrl_zeros_name = strings::StrCat("^zeros", suffix);
     const string ctrl_ones_name = strings::StrCat("^ones", suffix);
-    EXPECT_EQ(27, output.node_size());
+    EXPECT_EQ(const_type == kFill ? 31 : 27, output.node_size());
     for (int i = 0; i < output.node_size(); ++i) {
       const NodeDef& node = output.node(i);
       const string& name = node.name();
@@ -3466,6 +3634,88 @@ TEST_F(ConstantFoldingCastConstTest, CastConstFolding) {
   }
 }
 
+TEST_F(ConstantFoldingTest, MaterializeConstantValuedNode) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output x =
+      ops::Placeholder(scope.WithOpName("x"), DT_FLOAT,
+                       ops::Placeholder::Shape(TensorShape({1, 2, 3, 4})));
+  Output ones_like = ops::OnesLike(scope.WithOpName("ones_like"), x);
+  Output zeros_like = ops::ZerosLike(scope.WithOpName("zeros_like"), x);
+  Output fill = ops::Fill(scope.WithOpName("fill"), {4, 3, 2, 1}, 42);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch = {"ones_like", "zeros_like", "fill"};
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1, 2, 3, 4}));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"x", x_t}});
+
+  ConstantFolding optimizer(/*opt_level=*/RewriterConfig::AGGRESSIVE,
+                            /*cpu_device=*/nullptr);
+  GraphDef output;
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(output.node_size(), 6);
+  for (const auto& node : output.node()) {
+    if (node.name() != "x") {
+      EXPECT_EQ(node.op(), "Const");
+    }
+    if (node.name() == "ones_like" || node.name() == "zeros_like") {
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "^x");
+    }
+    if (node.name() == "fill") {
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0)[0], '^');
+      EXPECT_EQ(node.input(1)[0], '^');
+    }
+  }
+  auto tensors = EvaluateNodes(output, item.fetch, {{"x", x_t}});
+  ASSERT_EQ(item.fetch.size(), tensors.size());
+  ASSERT_EQ(tensors_expected.size(), tensors.size());
+  for (int i = 0; i < tensors.size(); i++) {
+    if (item.fetch[i] == "fill") {
+      test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
+    } else {
+      test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
+    }
+  }
+}
+
+TEST_F(ConstantFoldingTest, BitcastDenormalFloats) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Tensor x_t(DT_INT64, TensorShape({2, 2}));
+  x_t.flat<int64>()(0) = 9223372036854775807L;
+  x_t.flat<int64>()(1) = 1L;
+  x_t.flat<int64>()(2) = 9223372036854775807L;
+  x_t.flat<int64>()(3) = 1L;
+  Output x = ops::Const(scope.WithOpName("x"), x_t);
+  Output y = ops::Bitcast(scope.WithOpName("y"), x, DT_FLOAT);
+  Output z = ops::Bitcast(scope.WithOpName("z"), y, DT_INT64);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch = {"z"};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {});
+
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
+  GraphDef output;
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  ASSERT_EQ(output.node_size(), 1);
+  const NodeDef& node = output.node(0);
+  EXPECT_EQ(node.name(), "z");
+  EXPECT_EQ(node.op(), "Const");
+
+  auto tensors = EvaluateNodes(output, item.fetch, {});
+  ASSERT_EQ(tensors.size(), 1);
+  ASSERT_EQ(tensors_expected.size(), 1);
+  test::ExpectTensorEqual<int64>(tensors[0], tensors_expected[0]);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index ef02962975efa2c0865fd8c7c5b2c38187c83e16..5f060789889b230070f0ead6df16c24d43755cd4 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -547,11 +547,15 @@ cc_library(
     hdrs = ["rebatch.h"],
     deps = [
         ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "//tensorflow/core:framework",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:mutable_graph_view",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
-    ],
+        "//tensorflow/core/grappler/utils:functions",
+        "//tensorflow/core:lib",
+    ] + tf_protos_all(),
     alwayslink = 1,
 )
 
@@ -596,7 +600,7 @@ cc_library(
         "optimizer_base.h",
     ],
     deps = [
-        "//tensorflow/core:metrics",
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
     ],
 )
diff --git a/tensorflow/core/grappler/optimizers/data/rebatch.cc b/tensorflow/core/grappler/optimizers/data/rebatch.cc
index 187e1a62afa99f4fcdb81f34b8be504ad04b877f..14c3931267598e2a8c9914f3f20cdec54fc5eb5e 100644
--- a/tensorflow/core/grappler/optimizers/data/rebatch.cc
+++ b/tensorflow/core/grappler/optimizers/data/rebatch.cc
@@ -15,10 +15,17 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/rebatch.h"
 
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
-#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -35,7 +42,58 @@ namespace {
 
 constexpr char kCastOp[] = "Cast";
 constexpr char kRealDivOp[] = "RealDiv";
-constexpr char kBatchDatasetOp[] = "BatchDatasetV2";
+constexpr char kConstOp[] = "Const";
+
+constexpr std::array<const char*, 5> kBatchDatasetOps = {
+    "BatchDataset",
+    "BatchDatasetV2",
+    "ExperimentalMapAndBatchDataset",
+    "PaddedBatchDataset",
+    "PaddedBatchDatasetV2"
+};
+
+constexpr std::array<const char*, 2> kMultipleInputsDatasetOps = {
+    "ConcatenateDataset",
+    "ZipDataset"
+};
+
+constexpr std::array<const char*, 17> kPassThroughOps = {
+    "CacheDataset",
+    "FilterDataset",
+    "FilterByLastComponentDataset",
+    "Identity",
+    "MapDataset",
+    "ModelDataset",
+    "OptimizeDataset",
+    "ParallelMapDataset",
+    "PrefetchDataset",
+    "ReduceDataset",
+    "RepeatDataset",
+    "ShardDataset",
+    "ShuffleAndRepeatDataset",
+    "ShuffleDataset",
+    "SkipDataset",
+    "TakeDataset",
+    "WindowDataset"
+};
+
+constexpr std::array<const char*, 3> kFuncDatasetOps = {
+    "FlatMapDataset",
+    "InterleaveDataset",
+    "ParallelInterleaveDatasetV2"
+};
+
+constexpr std::array<const char*, 9> kSourceDatasetOps = {
+    "FixedLengthRecordDataset",
+    "FixedLengthRecordDatasetV2",
+    "GeneratorDataset",
+    "RangeDataset",
+    "SparseTensorsSliceDataset",
+    "TensorDataset",
+    "TensorSliceDataset",
+    "TextLineDataset",
+    "TFRecordDataset"
+};
 
 NodeDef* AddCastNode(const string& input, DataType src_t, DataType dst_t,
                      MutableGraphView* graph) {
@@ -68,40 +126,180 @@ NodeDef* AddFloatDivNode(const string& input_x, const string& input_y,
   return AddBinaryNode(input_x, input_y, kRealDivOp, DT_FLOAT, graph);
 }
 
+template <std::size_t SIZE>
+bool IsDatasetNodeOfType(const NodeDef& node,
+                         const std::array<const char*, SIZE>& arr) {
+  for (const auto& dataset_op_name : arr) {
+    if (node.op() == dataset_op_name) return true;
+  }
+  return false;
+}
+
+// Given a "batch" dataset node, modifies the batch_size input to divide the
+// current batch size by num_workers.
+Status MutateBatchSize(const NodeDef& node, int64 num_workers,
+                       MutableGraphView* graph) {
+  // TODO(rohanj): Fix up the output_shapes attribute as well. For this Dataset
+  // as well as all the downstream datasets.
+  // For all the batching datasets the batch_size is input number 1.
+  NodeDef* batch_size_node = graph_utils::GetInputNode(node, *graph, 1);
+  // By the time this optimization is run, the batch_size is computed and
+  // is a constant.
+  if (batch_size_node->op() != kConstOp) {
+    return errors::Internal("Batch size node should be a Const. Obtained: ",
+                            batch_size_node->op(), " instead.");
+  }
+  Tensor batch_size_tensor;
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(*batch_size_node, "value", &batch_size_tensor));
+  if (!TensorShapeUtils::IsScalar(batch_size_tensor.shape())) {
+    return errors::Internal("Batch size node shape should be scalar");
+  }
+  int64 batch_size = batch_size_tensor.scalar<int64>()();
+  if (batch_size % num_workers != 0) {
+    return errors::InvalidArgument(
+        "Batch size: ", batch_size,
+        " is not divisible by num_workers: ", num_workers);
+  }
+  batch_size /= num_workers;
+  NodeDef* new_batch_size_node =
+      graph_utils::AddScalarConstNode<int64>(batch_size, graph);
+  // We don't call UpdateFanouts here because CSE elimination might lead to
+  // multiple nodes sharing the same batch size constant node. This is also
+  // why we don't delete batch_size_node as well.
+  TF_RETURN_IF_ERROR(graph->UpdateRegularFaninByPort(
+      node.name(), 1, {new_batch_size_node->name(), 0}));
+  return Status::OK();
+}
+
+// There is one Sink node at least that is added to the end of the graph. We
+// find that node and return it. It is possible that there are multiple
+// Identity ops from the final Dataset op to that Sink node, but the recursive
+// graph traversal handles that.
+Status FindSinkNode(const GraphDef& graph_def, NodeDef* sink_node) {
+  absl::flat_hash_map<string, int> all_node_names;
+  absl::flat_hash_map<string, int> node_input_map;
+  for (int i = 0; i < graph_def.node_size(); ++i) {
+    all_node_names.insert_or_assign(graph_def.node(i).name(), i);
+    node_input_map.insert_or_assign(graph_def.node(i).name(), 0);
+  }
+  // Counts how many graph nodes is this node the input to. Candidate sink
+  // nodes are ones which are inputs into zero nodes.
+  for (const NodeDef& node : graph_def.node()) {
+    for (const string& input_name : node.input()) {
+      node_input_map[input_name]++;
+    }
+  }
+  for (const auto& it : node_input_map) {
+    if (it.second == 0) {
+      const NodeDef& sink_graph_node = graph_def.node(all_node_names[it.first]);
+      // Sometimes the searching surfaces Arg nodes in function cases that
+      // have no input. This check rejects those.
+      if (sink_graph_node.input_size() == 0) {
+        continue;
+      }
+      *sink_node = sink_graph_node;
+      return Status::OK();
+    }
+  }
+  return errors::InvalidArgument("Failed to find a sink node");
+}
+
+Status OptimizeGraph(const GrapplerItem& item, int64 num_workers,
+                     GraphDef* output);
+
+// Helper function that starts from a node in the graph and recurses into its
+// inputs trying to find a BatchDataset type operation to modify. During the
+// recursion it handles four kinds of cases.
+// 1. BatchDataset type ops: Mutates the batch_size input node and stops.
+// 2. Zip / Concatenate dataset ops: Recurses into all inputs to these ops
+//      as they are datasets themselves.
+// 3. Core dataset ops + Identity op: Recurses into first input parameter.
+// 4. FlatMap type mapping dataset ops: Recurses into the function definition.
+Status RecursivelyHandleOp(const NodeDef& node, int64 num_workers,
+                           FunctionLibraryDefinition* flib,
+                           MutableGraphView* graph) {
+  if (IsDatasetNodeOfType(node, kBatchDatasetOps)) {
+    return MutateBatchSize(node, num_workers, graph);
+  } else if (IsDatasetNodeOfType(node, kMultipleInputsDatasetOps)) {
+    // For all multiple input datasets, all inputs are datasets themselves.
+    for (int i = 0; i < node.input_size(); ++i) {
+      NodeDef* input_node = graph_utils::GetInputNode(node, *graph, i);
+      TF_RETURN_IF_ERROR(
+          RecursivelyHandleOp(*input_node, num_workers, flib, graph));
+    }
+  } else if (IsDatasetNodeOfType(node, kPassThroughOps)) {
+    // For all the dataset ops that are pass through, the input dataset is
+    // input 0.
+    NodeDef* input_node = graph_utils::GetInputNode(node, *graph, 0);
+    TF_RETURN_IF_ERROR(
+        RecursivelyHandleOp(*input_node, num_workers, flib, graph));
+  } else if (IsDatasetNodeOfType(node, kFuncDatasetOps)) {
+    const string func_name = node.attr().at("f").func().name();
+    const FunctionDef* fdef = flib->Find(func_name);
+    GrapplerFunctionItem f_item;
+    TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
+        *fdef, *flib, graph->graph()->versions().producer(), &f_item));
+    GraphDef optimized_func_graph;
+    Status s = OptimizeGraph(f_item, num_workers, &optimized_func_graph);
+    if (s.ok()) {
+      // Function body optimization might have created new specialized
+      // functions for each instantiation context. Add them to the library.
+      for (const FunctionDef& func_def :
+           optimized_func_graph.library().function()) {
+        if (flib->Find(func_def.signature().name()) == nullptr) {
+          TF_RETURN_IF_ERROR(flib->AddFunctionDef(func_def));
+        }
+      }
+
+      // Convert optimized graph back to FunctionDef.
+      FunctionDef optimized_func;
+      f_item.SwapFunctionBody(std::move(optimized_func_graph));
+      TF_RETURN_IF_ERROR(MakeFunctionDef(f_item, *flib, &optimized_func));
+
+      // Replace optimized function with a new FunctionDef.
+      TF_RETURN_IF_ERROR(flib->ReplaceFunction(func_name, optimized_func));
+    }
+  } else if (IsDatasetNodeOfType(node, kSourceDatasetOps)) {
+    return errors::InvalidArgument(
+        "Reached a source dataset: ", node.op(),
+        " without encountering a batch transformation.");
+  } else {
+    return errors::InvalidArgument("Encountered an unsupported op: ",
+                                   node.op());
+  }
+  return Status::OK();
+}
+
+// Helper function that given a GrapplerItem generates a mutated graph def
+// with the batch size changed. The GrapplerItem could be generated from the
+// main graph or could be a function graph.
+Status OptimizeGraph(const GrapplerItem& item, int64 num_workers,
+                     GraphDef* output) {
+  *output = item.graph;
+  MutableGraphView graph(output);
+
+  FunctionLibraryDefinition flib(OpRegistry::Global(), item.graph.library());
+
+  NodeDef sink_node;
+  TF_RETURN_IF_ERROR(FindSinkNode(item.graph, &sink_node));
+  TF_RETURN_IF_ERROR(
+      RecursivelyHandleOp(sink_node, num_workers, &flib, &graph));
+  *output->mutable_library() = flib.ToProto();
+  return Status::OK();
+}
+
 }  // anonymous namespace
 
-Status RebatchOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                  GraphDef* output) {
+Status RebatchOptimizer::OptimizeAndCollectStats(Cluster* cluster,
+                                                 const GrapplerItem& item,
+                                                 GraphDef* output,
+                                                 OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
 
-  absl::flat_hash_set<string> nodes_to_delete;
-  for (const NodeDef& node : item.graph.node()) {
-    if (node.op() == kBatchDatasetOp) {
-      NodeDef* batch_size_node = graph_utils::GetInputNode(node, graph, 1);
-      NodeDef tmp_node;
-      tmp_node = *batch_size_node;
-      graph_utils::SetUniqueGraphNodeName(tmp_node.op(), graph.graph(),
-                                          &tmp_node);
-      NodeDef* copy_batch_size_node = graph.AddNode(std::move(tmp_node));
-      NodeDef* float_copy_batch_size_node =
-          AddCastNode(copy_batch_size_node->name(), DT_INT64, DT_FLOAT, &graph);
-      NodeDef* num_worker_node =
-          graph_utils::AddScalarConstNode<int64>(num_workers_, &graph);
-      NodeDef* float_num_worker_node =
-          AddCastNode(num_worker_node->name(), DT_INT64, DT_FLOAT, &graph);
-      NodeDef* divided_batch_size_node =
-          AddFloatDivNode(float_copy_batch_size_node->name(),
-                          float_num_worker_node->name(), &graph);
-      NodeDef* cast_new_batch_size_node = AddCastNode(
-          divided_batch_size_node->name(), DT_FLOAT, DT_INT64, &graph);
-      TF_RETURN_IF_ERROR(graph.UpdateFanouts(batch_size_node->name(),
-                                             cast_new_batch_size_node->name()));
-      nodes_to_delete.insert(batch_size_node->name());
-      break;
-    }
-  }
-  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
+  TF_RETURN_IF_ERROR(OptimizeGraph(item, num_workers_, output));
+  stats->num_changes++;
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/data/rebatch.h b/tensorflow/core/grappler/optimizers/data/rebatch.h
index f7aa69f3ff995a44ccfb89bab83b237624275f42..29a610002645b9dd88d8a278f68094b2121697ac 100644
--- a/tensorflow/core/grappler/optimizers/data/rebatch.h
+++ b/tensorflow/core/grappler/optimizers/data/rebatch.h
@@ -15,7 +15,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_REBATCH_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_REBATCH_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -23,10 +25,7 @@ namespace grappler {
 // This optimizer changes the batch size of the output dataset by dividing the
 // current batch size by parameter `num_workers`. Currently, this works only
 // for very simple pipelines with a single BatchDatasetV2 transformation.
-//
-// TODO(rohanj): Extend this logic to correctly handle any input pipeline that
-// uses core tf.data APIs + MapAndBatch.
-class RebatchOptimizer : public CustomGraphOptimizer {
+class RebatchOptimizer : public TFDataOptimizerBase {
  public:
   RebatchOptimizer() = default;
   ~RebatchOptimizer() override = default;
@@ -36,8 +35,9 @@ class RebatchOptimizer : public CustomGraphOptimizer {
   Status Init(
       const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override;
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
index ff64ff1adbcd71d916a1bd6f842b9decc4a68d96..0563460b29505f1b054a57624d470c4e642bea2f 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h"
 
 #include "absl/container/flat_hash_set.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
index 0eee91f241a8e3c09b93a159c93addb43e749b02..0f34d2b7ebe59244a9b02b5209732fc830cd6729 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h"
-#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/platform/test.h"
 
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
index c57a7b125693af2b53d52e772bb4264bfbe00b23..1969ff00e4ae5147f183e1230986b4d2b4620fc7 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
@@ -415,6 +415,10 @@ Status Vectorization::Initialize(const FunctionDef& outer_scope,
 // NodeBuilder
 Status Vectorization::StackTensor(WrappedTensor* unstacked,
                                   TensorDesc* result) {
+  if (unstacked->node->output_type(unstacked->output_index) == DT_VARIANT) {
+    // TODO(b/124069171): "ExpandDims" doesn't work with Variant tensors.
+    return errors::Unimplemented("Cannot stack tensor with Variant type.");
+  }
   // Note that all these nodes are necessary as the size of the batch may not be
   // constant.
   if (unstacked->stacked) {
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 9712ad63b08e7e3139a293251775bcd0cb49bf27..2dfa5e99d6af91cd6b7786fda111be2f6259db35 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -74,23 +74,26 @@ bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) const {
     return false;
   } else if (IsSwitch(*input)) {
     // Don't turn Identity nodes following Switch into NoOp or remove them
-    // if it requires anchoring a control dependencies the Switch node, which
+    // if it requires anchoring a control dependencies to the Switch node, which
     // is not valid.
     MutableGraphView::OutputPort control_port(const_cast<NodeDef*>(&node),
                                               Graph::kControlSlot);
-    if (!graph_view_->GetFanout(control_port).empty()) {
+    auto control_fanouts = graph_view_->GetFanout(control_port);
+    if (!control_fanouts.empty()) {
       return false;
     }
   }
   bool node_has_multiple_inputs =
       graph_view_->NumFanins(node, /*include_controlling_nodes=*/true) > 1;
-  for (auto consumer :
-       graph_view_->GetFanouts(node, /*include_controlled_nodes=*/true)) {
-    if (node_has_multiple_inputs && IsMerge(*consumer.node)) {
+
+  auto fanouts =
+      graph_view_->GetFanouts(node, /*include_controlled_nodes=*/true);
+  for (auto fanout : fanouts) {
+    if (node_has_multiple_inputs && IsMerge(*fanout.node)) {
       return false;
     }
     if (IsSwitch(*input)) {
-      if (graph_view_->HasFanin(*consumer.node,
+      if (graph_view_->HasFanin(*fanout.node,
                                 {node.name(), Graph::kControlSlot})) {
         return false;
       }
@@ -134,22 +137,21 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) const {
 }
 
 int DependencyOptimizer::NumEdgesIfBypassed(
-    const NodeDef& node,
-    const absl::flat_hash_set<MutableGraphView::Edge>& fanout_edges) const {
+    const NodeDef& node, int num_controlling_fanins,
+    const absl::flat_hash_set<MutableGraphView::Edge>& fanin_edges,
+    const absl::flat_hash_set<MutableGraphView::Edge>& fanout_edges,
+    int num_unique_fanout_nodes) const {
   const bool is_multi_input_identity_n =
       IsIdentityN(node) && !IsIdentityNSingleInput(node);
-  const int num_fanins = node.input_size();
-  const int num_fanouts = fanout_edges.size();
+  const int num_fanins = fanin_edges.size();
 
   if (is_multi_input_identity_n) {
     // multi-input identity_n with input/output control dependencies will likely
     // increase number of edges after optimization.
-    int num_edges_if_bypassed(0);
-    int num_non_controlling_fanins =
-        graph_view_->NumFanins(node, /*include_controlling_nodes=*/false);
+    int num_edges_if_bypassed = 0;
+    int num_non_controlling_fanins = num_fanins - num_controlling_fanins;
     num_edges_if_bypassed += num_non_controlling_fanins;
-    num_edges_if_bypassed +=
-        (num_fanins - num_non_controlling_fanins) * num_fanouts;
+    num_edges_if_bypassed += num_controlling_fanins * num_unique_fanout_nodes;
 
     for (const auto& fanout : fanout_edges) {
       if (fanout.dst.port_id == Graph::kControlSlot) {
@@ -160,31 +162,35 @@ int DependencyOptimizer::NumEdgesIfBypassed(
     }
     return num_edges_if_bypassed;
   } else {
-    return num_fanins * num_fanouts;
+    return num_fanins * num_unique_fanout_nodes;
   }
 }
 
 bool DependencyOptimizer::BypassingNodeIsBeneficial(
-    const NodeDef& node,
+    const NodeDef& node, int num_controlling_fanins,
+    const absl::flat_hash_set<MutableGraphView::Edge>& fanin_edges,
     const absl::flat_hash_set<MutableGraphView::Edge>& fanout_edges) const {
   const bool is_identity = IsIdentity(node) || IsIdentityNSingleInput(node);
   const bool is_multi_input_identity_n =
       IsIdentityN(node) && !IsIdentityNSingleInput(node);
-  const int num_fanins = node.input_size();
-  const int num_fanouts = fanout_edges.size();
+  const int num_fanins = fanin_edges.size();
+  absl::flat_hash_set<NodeDef*> unique_fanout_nodes;
+  for (const auto& fanout_edge : fanout_edges) {
+    unique_fanout_nodes.insert(fanout_edge.dst.node);
+  }
+  const int num_unique_fanout_nodes = unique_fanout_nodes.size();
 
-  if (NumEdgesIfBypassed(node, fanout_edges) > num_fanins + num_fanouts) {
+  if (NumEdgesIfBypassed(node, num_controlling_fanins, fanin_edges,
+                         fanout_edges, num_unique_fanout_nodes) >
+      num_fanins + num_unique_fanout_nodes) {
     return false;
   }
 
-  auto fanin_edges =
-      graph_view_->GetFaninEdges(node, /*include_controlling_edges=*/true);
-
   // Make sure that we don't increase the number of edges that cross
   // device boundaries.
-  if ((num_fanins == 1 && num_fanouts > 1 &&
+  if ((num_fanins == 1 && num_unique_fanout_nodes > 1 &&
        fanin_edges.begin()->src.node->device() != node.device()) ||
-      (num_fanins > 1 && num_fanouts == 1 &&
+      (num_fanins > 1 && num_unique_fanout_nodes == 1 &&
        fanout_edges.begin()->dst.node->device() != node.device())) {
     return false;
   }
@@ -198,17 +204,17 @@ bool DependencyOptimizer::BypassingNodeIsBeneficial(
     num_cross_in += static_cast<int>(fanin.src.node->device() != node_dev);
   }
   int num_cross_out = 0;
-  for (const auto& fanout : fanout_edges) {
-    num_cross_out += static_cast<int>(fanout.dst.node->device() != node_dev);
+  for (const auto& fanout : unique_fanout_nodes) {
+    num_cross_out += static_cast<int>(fanout->device() != node_dev);
   }
 
   // Make sure we do not increase the number of device crossings.
   const int num_cross_before = num_cross_in + num_cross_out;
   int num_cross_after = 0;
   for (const auto& fanin : fanin_edges) {
-    for (const auto& fanout : fanout_edges) {
-      num_cross_after += static_cast<int>(fanin.src.node->device() !=
-                                          fanout.dst.node->device());
+    for (const auto& fanout : unique_fanout_nodes) {
+      num_cross_after +=
+          static_cast<int>(fanin.src.node->device() != fanout->device());
     }
   }
   if (num_cross_after > num_cross_before) {
@@ -239,8 +245,9 @@ Status DependencyOptimizer::OptimizeNode(
   // In this case we then can prune all their output control dependencies.
   if (IsConstant(*node) &&
       graph_view_->NumFanins(*node, /*include_controlling_nodes=*/true) == 0) {
-    for (const auto& fanout :
-         graph_view_->GetFanouts(*node, /*include_controlled_nodes=*/true)) {
+    MutableGraphView::OutputPort control_port(node, Graph::kControlSlot);
+    auto control_fanouts = graph_view_->GetFanout(control_port);
+    for (const auto& fanout : control_fanouts) {
       TF_RETURN_IF_ERROR(
           graph_view_->RemoveControllingFanin(fanout.node->name(), node_name));
       nodes_to_simplify->PushBack(fanout.node->name());
@@ -251,14 +258,14 @@ Status DependencyOptimizer::OptimizeNode(
         fetch_nodes_known_ &&
         nodes_to_preserve_.find(node_name) == nodes_to_preserve_.end()) {
       // Mark the node for deletion.
-      nodes_to_delete->insert(node->name());
+      nodes_to_delete->insert(node_name);
     }
     return Status::OK();
   }
 
   // Change ops that only have control dependencies as outputs to NoOps.
   if (!is_noop && SafeToConvertToNoOp(*node)) {
-    VLOG(1) << "***** Replacing  " << node_name << " (" << node->op()
+    VLOG(1) << "***** Replacing " << node_name << " (" << node->op()
             << ") with NoOp.";
     // The outputs of this node are not consumed. Replace its inputs with
     // control dependencies and replace the op itself with the NoOp op.
@@ -271,11 +278,11 @@ Status DependencyOptimizer::OptimizeNode(
     TF_RETURN_IF_ERROR(
         graph_view_->UpdateAllRegularFaninsToControlling(node_name));
     TF_RETURN_IF_ERROR(
-        graph_view_->UpdateNode(node->name(), "NoOp", node->device(), {}));
-    nodes_to_simplify->PushBack(node->name());
+        graph_view_->UpdateNode(node_name, "NoOp", node->device(), {}));
     for (const string& regular_fanin_name : regular_fanin_names) {
       nodes_to_simplify->PushBack(regular_fanin_name);
     }
+    nodes_to_simplify->PushBack(node_name);
     return Status::OK();
   }
 
@@ -329,76 +336,69 @@ Status DependencyOptimizer::OptimizeNode(
 
   if (is_noop || ((is_identity || is_multi_input_identity) &&
                   SafeToRemoveIdentity(*node))) {
+    auto fanin_edges =
+        graph_view_->GetFaninEdges(*node, /*include_controlling_edges=*/true);
+    std::vector<NodeDef*> controlling_fanins;
+    controlling_fanins.reserve(fanin_edges.size());
+    for (const auto& fanin_edge : fanin_edges) {
+      if (fanin_edge.src.port_id == Graph::kControlSlot) {
+        controlling_fanins.push_back(fanin_edge.src.node);
+      }
+    }
     auto fanout_edges =
         graph_view_->GetFanoutEdges(*node, /*include_controlled_edges=*/true);
-    if (!BypassingNodeIsBeneficial(*node, fanout_edges)) {
+    if (!BypassingNodeIsBeneficial(*node, controlling_fanins.size(),
+                                   fanin_edges, fanout_edges)) {
       return Status::OK();
     }
 
-    if (!fanout_edges.empty()) {
-      const int num_regular_fanins =
-          graph_view_->NumFanins(*node, /*include_controlling_nodes=*/false);
-      std::vector<TensorId> regular_fanin_ids;
-      regular_fanin_ids.reserve(num_regular_fanins);
-      for (int i = 0; i < num_regular_fanins; ++i) {
-        regular_fanin_ids.push_back(ParseTensorName(node->input(i)));
-      }
-
-      absl::flat_hash_set<NodeDef*> processed_consumers;
-      VLOG(1) << "***** Rerouting input around\n" << node->DebugString();
-      // Now remove the node and re-wire its inputs to its outputs.
-      for (auto fanout_edge : fanout_edges) {
-        NodeDef* consumer = fanout_edge.dst.node;
-        VLOG(1) << "consumer before:\n" << consumer->DebugString();
-        if (fanout_edge.src.port_id == Graph::kControlSlot) {
-          TF_RETURN_IF_ERROR(
-              graph_view_->RemoveControllingFanin(consumer->name(), node_name));
-          // Add all non controlling fanins of node as controlling fanins to
-          // consumer.
-          for (const TensorId& regular_fanin_id : regular_fanin_ids) {
-            TF_RETURN_IF_ERROR(graph_view_->AddControllingFanin(
-                consumer->name(), regular_fanin_id));
-            nodes_to_simplify->PushBack(string(regular_fanin_id.node()));
-          }
-        } else {
-          TF_RETURN_IF_ERROR(graph_view_->UpdateRegularFaninByPort(
-              consumer->name(), fanout_edge.dst.port_id,
-              ParseTensorName(node->input(fanout_edge.src.port_id))));
-        }
-        processed_consumers.insert(consumer);
-        VLOG(1) << "consumer after:\n" << consumer->DebugString();
-      }
-
-      std::vector<TensorId> controlling_fanin_ids;
-      controlling_fanin_ids.reserve(node->input_size() - num_regular_fanins);
-      for (int i = num_regular_fanins; i < node->input_size(); ++i) {
-        controlling_fanin_ids.push_back(ParseTensorName(node->input(i)));
-      }
-
-      for (NodeDef* consumer : processed_consumers) {
-        // Forward all controlling fanins of node to consumer.
-        VLOG(1) << "consumer before:\n" << consumer->DebugString();
-        for (const TensorId& controlling_fanin_id : controlling_fanin_ids) {
-          TF_RETURN_IF_ERROR(graph_view_->AddControllingFanin(
-              consumer->name(), controlling_fanin_id));
-        }
-        nodes_to_simplify->PushBack(consumer->name());
-        VLOG(1) << "consumer after:\n" << consumer->DebugString();
+    VLOG(1) << "***** Rerouting input around\n" << node->DebugString();
+
+    absl::flat_hash_set<NodeDef*> processed_nodes;
+    for (const auto& fanout_edge : fanout_edges) {
+      NodeDef* consumer = fanout_edge.dst.node;
+      const int src_port = fanout_edge.src.port_id;
+      if ((is_identity && src_port == 0) ||
+          (is_multi_input_identity && src_port > Graph::kControlSlot)) {
+        // Identity regular fanins.
+        const string& input_to_forwards = node->input(src_port);
+        TF_RETURN_IF_ERROR(graph_view_->UpdateRegularFaninByPort(
+            consumer->name(), fanout_edge.dst.port_id,
+            ParseTensorName(input_to_forwards)));
+      } else if (is_identity || is_multi_input_identity) {
+        // Identity control dependency.
+        // TODO(lyandy): Handle IdentityN properly here by adding all regular
+        // fanins as controlling fanins.
+        const string& node_first_input = node->input(0);
+        TF_RETURN_IF_ERROR(graph_view_->UpdateFanin(
+            consumer->name(), {node_name, Graph::kControlSlot},
+            {ParseTensorName(node_first_input).node(), Graph::kControlSlot}));
+      } else {
+        // NoOp.
+        TF_RETURN_IF_ERROR(
+            graph_view_->RemoveControllingFanin(consumer->name(), node_name));
       }
-      for (const TensorId& controlling_fanin_id : controlling_fanin_ids) {
-        nodes_to_simplify->PushBack(string(controlling_fanin_id.node()));
+      processed_nodes.insert(consumer);
+      nodes_to_simplify->PushBack(consumer->name());
+    }
+    for (const auto& processed_node : processed_nodes) {
+      // Forward dependency from input to consumer if it doesn't already
+      // depend on it.
+      for (const auto& controlling_fanin : controlling_fanins) {
+        TF_RETURN_IF_ERROR(graph_view_->AddControllingFanin(
+            processed_node->name(),
+            {controlling_fanin->name(), Graph::kControlSlot}));
+        nodes_to_simplify->PushBack(controlling_fanin->name());
       }
     }
 
     if (fetch_nodes_known_ &&
         nodes_to_preserve_.find(node_name) == nodes_to_preserve_.end()) {
+      // Disconnect the node from its inputs to enable further optimizations.
+      TF_RETURN_IF_ERROR(graph_view_->RemoveAllFanins(
+          node_name, /*keep_controlling_fanins=*/false));
       // Mark the node for deletion.
       nodes_to_delete->insert(node_name);
-
-      // Disconnect the node from its inputs to enable further optimizations.
-      TF_RETURN_IF_ERROR(
-          graph_view_->RemoveAllFanins(node_name,
-                                       /*keep_controlling_fanins=*/false));
     }
   }
   return Status::OK();
@@ -442,16 +442,14 @@ Status DependencyOptimizer::TransitiveReduction() {
   // highest index of a target of any control output from each node.
   int num_controls = 0;
   std::vector<gtl::InlinedVector<int, 4>> inputs(num_nodes);
-  std::vector<gtl::InlinedVector<std::pair<int, int>, 2>> control_outputs(
-      num_nodes);
+  std::vector<gtl::InlinedVector<int, 2>> control_outputs(num_nodes);
   for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
     const NodeDef& node = graph->node(node_idx);
     if (ModifiesFrameInfo(node) || !HasOpDef(node)) {
       // Ignore function nodes and nodes that modify frame info.
       continue;
     }
-    for (int input_slot = 0; input_slot < node.input_size(); ++input_slot) {
-      const string& input = node.input(input_slot);
+    for (const string& input : node.input()) {
       const NodeDef* input_node = graph_view_->GetNode(NodeName(input));
       if (ModifiesFrameInfo(*input_node) || IsMerge(*input_node)) {
         // Ignore edges from nodes that modify frame info and from Merge nodes,
@@ -462,7 +460,7 @@ Status DependencyOptimizer::TransitiveReduction() {
       inputs[node_idx].push_back(input_node_idx);
       if (IsControlInput(input)) {
         ++num_controls;
-        control_outputs[input_node_idx].emplace_back(node_idx, input_slot);
+        control_outputs[input_node_idx].emplace_back(node_idx);
       }
     }
   }
@@ -477,14 +475,12 @@ Status DependencyOptimizer::TransitiveReduction() {
   // such that when we swap them out so we don't clobber the
   // node(target).input() repeated field.
   typedef std::pair<int, int> InputSlotAndSource;
-  absl::flat_hash_map<
-      int, std::set<InputSlotAndSource, std::greater<InputSlotAndSource>>>
-      control_edges_to_remove;
+  absl::flat_hash_map<int, absl::flat_hash_set<int>> control_edges_to_remove;
   for (int source = 0; source < num_nodes; ++source) {
     int highest_control_target = -1;
     for (const auto& control_output : control_outputs[source]) {
-      if (control_output.first > highest_control_target) {
-        highest_control_target = control_output.first;
+      if (control_output > highest_control_target) {
+        highest_control_target = control_output;
       }
     }
     if (highest_control_target <= source) {
@@ -514,10 +510,9 @@ Status DependencyOptimizer::TransitiveReduction() {
     // longer than 1, there exists an alternate path, and we can eliminate the
     // redundant direct control dependency.
     for (const auto& control_output : control_outputs[source]) {
-      const int target = control_output.first;
+      const int target = control_output;
       if (longest_distance[target] > 1) {
-        const int input_slot = control_output.second;
-        control_edges_to_remove[target].emplace(input_slot, source);
+        control_edges_to_remove[target].emplace(source);
       }
     }
   }
@@ -526,11 +521,10 @@ Status DependencyOptimizer::TransitiveReduction() {
     const int target = it.first;
     const NodeDef& target_node = graph->node(target);
     const string target_node_name = target_node.name();
-    for (const InputSlotAndSource& slot_and_source : it.second) {
-      const int input_slot = slot_and_source.first;
-      const TensorId tensor_id = ParseTensorName(target_node.input(input_slot));
-      TF_RETURN_IF_ERROR(graph_view_->RemoveControllingFanin(target_node_name,
-                                                             tensor_id.node()));
+    for (const int& source : it.second) {
+      const NodeDef& source_node = graph->node(source);
+      TF_RETURN_IF_ERROR(graph_view_->RemoveControllingFanin(
+          target_node_name, source_node.name()));
       ++num_controls_removed;
     }
   }
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
index 98b69b4b7a543733b2ca593578ab7a127e8497c3..a60e7a352272748d3b2bb7163e73e96a4f46338d 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -49,11 +49,14 @@ class DependencyOptimizer : public GraphOptimizer {
   // Returns true if bypassing node does not increase the number of edges or
   // number of edges crossing a device boundary.
   bool BypassingNodeIsBeneficial(
-      const NodeDef& node,
+      const NodeDef& node, int num_controlling_fanins,
+      const absl::flat_hash_set<MutableGraphView::Edge>& fanin_edges,
       const absl::flat_hash_set<MutableGraphView::Edge>& fanout_edges) const;
   int NumEdgesIfBypassed(
-      const NodeDef& node,
-      const absl::flat_hash_set<MutableGraphView::Edge>& fanout_edges) const;
+      const NodeDef& node, int num_controlling_fanins,
+      const absl::flat_hash_set<MutableGraphView::Edge>& fanin_edges,
+      const absl::flat_hash_set<MutableGraphView::Edge>& fanout_edges,
+      int num_unique_fanout_nodes) const;
   // Returns true if node is not an Identity node or if it is an Identity
   // that is safe to remove.
   bool SafeToRemoveIdentity(const NodeDef& node) const;
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index 59512d8b5d391e6b62583d8f90a966c623c025b3..80a0189b302366c420fc26bb8dc3ac469839e3b0 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -48,6 +48,25 @@ void VerifyGraphsEqual(const GraphDef& original_graph,
   }
 }
 
+bool NodeHasControllingFanins(const NodeDef& node,
+                              const absl::flat_hash_set<string>& expected) {
+  absl::flat_hash_set<string> actual;
+  for (const string& fanin : node.input()) {
+    if (IsControlInput(fanin)) {
+      actual.insert(fanin);
+    }
+  }
+  if (actual.size() != expected.size()) {
+    return false;
+  }
+  for (const auto& expected_fanin : expected) {
+    if (!actual.contains(expected_fanin)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 TEST_F(DependencyOptimizerTest, NoOp) {
   // This trivial graph is so basic there's nothing to optimize.
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
@@ -484,8 +503,7 @@ TEST_F(DependencyOptimizerTest, RemoveIdentity) {
     if (node.name() == "b_a") {
       ASSERT_EQ(node.input_size(), 3);
       EXPECT_EQ(node.input(0), "x");
-      EXPECT_EQ(node.input(1), "^y");
-      EXPECT_EQ(node.input(2), "^z");
+      EXPECT_TRUE(NodeHasControllingFanins(node, {"^y", "^z"}));
       ++found;
     }
     if (node.name() == "c_a") {
@@ -497,8 +515,7 @@ TEST_F(DependencyOptimizerTest, RemoveIdentity) {
     if (node.name() == "c_b") {
       ASSERT_EQ(node.input_size(), 3);
       EXPECT_EQ(node.input(0), "z");
-      EXPECT_EQ(node.input(1), "^x");
-      EXPECT_EQ(node.input(2), "^y");
+      EXPECT_TRUE(NodeHasControllingFanins(node, {"^x", "^y"}));
       ++found;
     }
   }
@@ -781,7 +798,6 @@ TEST_F(DependencyOptimizerTest, IdentityDeviceCrossingConsumerOnSameDevice) {
   for (const auto& node : output.node()) {
     EXPECT_NE(node.name(), "x_on_2");
     if (node.name() == "result") {
-      LOG(INFO) << node.DebugString();
       ASSERT_EQ(node.input_size(), 2);
       EXPECT_EQ(node.input(0), "x_on_1");
       EXPECT_EQ(node.input(1), "one_on_2");
diff --git a/tensorflow/core/grappler/optimizers/function_api_info.cc b/tensorflow/core/grappler/optimizers/function_api_info.cc
index 497ad6032ea80b22e5b5e2b23b2860b7c99fc57b..9f6352f1f2efa4b299dff163858ad5b4c88b41b8 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info.cc
+++ b/tensorflow/core/grappler/optimizers/function_api_info.cc
@@ -29,10 +29,10 @@ FunctionApiInfo::~FunctionApiInfo() {}
 Status FunctionApiInfo::Init(const FunctionDef& function_def) {
   function_type_ = FunctionApiInfo::FunctionType::INFERENCE;
   for (const auto& attr : function_def.attr()) {
-    if (attr.first == "experimental_api_preferred_device") {
+    if (attr.first == "api_preferred_device") {
       preferred_device_ = attr.second.s();
     }
-    if (attr.first == "experimental_api_implements") {
+    if (attr.first == "api_implements") {
       interface_name_ = attr.second.s();
     }
     if (attr.first == "forward_function_name") {
diff --git a/tensorflow/core/grappler/optimizers/function_api_info.h b/tensorflow/core/grappler/optimizers/function_api_info.h
index 9a5f548951f0931e98fbe4074f7bbd9aacab0c6e..ffa53a7d8d94e29a1e3b6e214a18903e98f47cda 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info.h
+++ b/tensorflow/core/grappler/optimizers/function_api_info.h
@@ -80,6 +80,8 @@ class FunctionLibraryApiInfo {
       const string& function_name, std::vector<string>* other_functions) const;
 
   const FunctionApiInfo* GetApiInfo(const string& function_name) const;
+  bool empty() const { return func_info_.empty(); }
+  std::size_t size() const { return func_info_.size(); }
 
  private:
   // Map between function name to function details.
diff --git a/tensorflow/core/grappler/optimizers/function_api_info_test.cc b/tensorflow/core/grappler/optimizers/function_api_info_test.cc
index b683d26b32f04759b658e9e0704f1b6b661fe178..9bb517faa31f1e347810ed8884b6a2c16b26104b 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_api_info_test.cc
@@ -58,9 +58,9 @@ void PopulateFunction(const string& name, const string& api_interface_name,
 
   auto* func_attr = func_def->mutable_attr();
   if (!api_interface_name.empty())
-    (*func_attr)["experimental_api_implements"].set_s(api_interface_name);
+    (*func_attr)["api_implements"].set_s(api_interface_name);
   if (!preferred_device.empty())
-    (*func_attr)["experimental_api_preferred_device"].set_s(preferred_device);
+    (*func_attr)["api_preferred_device"].set_s(preferred_device);
   if (!forward_function_name.empty())
     (*func_attr)["forward_function_name"].set_s(forward_function_name);
   if (!backward_function_name.empty())
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index c7373989dc5cd81dacfa336f4707fc4e6e554e2d..2787d9d5253ebc0e1f8179549d1613fdf9301c96 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -1378,7 +1378,7 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithFunctionalControlFlow) {
 
   // All `PartitionedCall` nodes in the optimized graph must be inlined, and
   // `If` node must be lowered to `Switch` and `Merge` nodes.
-  EXPECT_EQ(count_nodes_with_op("PartitionedCallOp"), 0);
+  EXPECT_EQ(count_nodes_with_op("PartitionedCall"), 0);
   EXPECT_EQ(count_nodes_with_op("If"), 0);
   EXPECT_EQ(count_nodes_with_op("Switch"), 3);
   EXPECT_EQ(count_nodes_with_op("Merge"), 1);
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc b/tensorflow/core/grappler/optimizers/implementation_selector.cc
similarity index 93%
rename from tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
rename to tensorflow/core/grappler/optimizers/implementation_selector.cc
index 75ad8bffefd8aa00bb1ba88c10ed9b1170a0d25f..a370bf9934e8b6eb057d9ead6558b5ecf57edaef 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
+++ b/tensorflow/core/grappler/optimizers/implementation_selector.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/grappler/optimizers/experimental_implementation_selector.h"
+#include "tensorflow/core/grappler/optimizers/implementation_selector.h"
 
 #include <string>
 
@@ -101,14 +101,14 @@ Status UpdateNodeDef(NodeDef* node_def, const string& funcName,
   return Status::OK();
 }
 
-Status ExperimentalImplementationSelector::LoadFunctions(
+Status ImplementationSelector::LoadFunctions(
     const GraphDef& graph) {
   lib_info_.reset(new FunctionLibraryApiInfo);
   TF_RETURN_IF_ERROR(lib_info_->Init(graph.library()));
   return Status::OK();
 }
 
-Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
+Status ImplementationSelector::MaybeOptimizeFunctionCall(
     NodeDef* node_def) const {
   // There are two ways of calling functions:
   //  1. By specifying an op name as a function name, or
@@ -170,12 +170,16 @@ Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
   return Status::OK();
 }
 
-Status ExperimentalImplementationSelector::SelectImplementation(
+Status ImplementationSelector::SelectImplementation(
     GraphDef* graph) const {
   if (!graph->has_library()) {
     VLOG(2) << "Skipping graph since it does not have function def";
     return Status::OK();
   }
+  if (lib_info_->empty()) {
+    VLOG(2) << "Skipping optimization since lib_info is empty";
+    return Status::OK();
+  }
 
   for (int k = 0; k < graph->node_size(); ++k)
     TF_RETURN_IF_ERROR(MaybeOptimizeFunctionCall(graph->mutable_node(k)));
@@ -183,9 +187,9 @@ Status ExperimentalImplementationSelector::SelectImplementation(
   return Status::OK();
 }
 
-Status ExperimentalImplementationSelector::Optimize(Cluster* cluster,
-                                                    const GrapplerItem& item,
-                                                    GraphDef* optimized_graph) {
+Status ImplementationSelector::Optimize(Cluster* cluster,
+                                        const GrapplerItem& item,
+                                        GraphDef* optimized_graph) {
   *optimized_graph = item.graph;
   TF_RETURN_IF_ERROR(LoadFunctions(*optimized_graph));
   return SelectImplementation(optimized_graph);
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.h b/tensorflow/core/grappler/optimizers/implementation_selector.h
similarity index 80%
rename from tensorflow/core/grappler/optimizers/experimental_implementation_selector.h
rename to tensorflow/core/grappler/optimizers/implementation_selector.h
index 82f7473a14ec9b20492ac7acef3b72e919040ece..c206d21640b4816d2af46b0581eb410564aca175 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.h
+++ b/tensorflow/core/grappler/optimizers/implementation_selector.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_EXPERIMENTAL_IMPLEMENTATION_SELECTOR_H_
-#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_EXPERIMENTAL_IMPLEMENTATION_SELECTOR_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_IMPLEMENTATION_SELECTOR_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_IMPLEMENTATION_SELECTOR_H_
 
 #include <string>
 
@@ -33,7 +33,6 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-// -- EXPERIMENTAL --
 // This transformation replaces function calls by the appropriate function
 // definition based on properties of the runtime system. For instance,
 // we may choose one implementation over another if we have a GPU with
@@ -45,12 +44,12 @@ namespace grappler {
 //
 // For instance, the python code might specify:
 // @Defun(tf.float32,
-//        experimental_api_implements='plus_one',
-//        experimental_api_preferred_device='GPU')
+//        api_implements='plus_one',
+//        api_preferred_device='GPU')
 // def plus_one_gpu(x): return x + 1.0
 //
 // @Defun(tf.float32,
-//        experimental_api_implements='plus_one')
+//        api_implements='plus_one')
 // def plus_one_reference_implementation(x): return x + 1.0
 // input = tf.constant(2.0, dtype=tf.float32)
 //
@@ -62,21 +61,21 @@ namespace grappler {
 // `plus_one_reference_implementation` based on the availability of the GPU.
 //
 // Available annotations:
-//  - experimental_api_implements(string): all functions mapping to the same
+//  - api_implements(string): all functions mapping to the same
 //    string can be interchanged. For now, all functions must have the same
 //    signature and overloads are not allowed. Defuns within defuns are
 //    allowed.
-//  - experimental_api_preferred_device(string): sets which device is preferred.
-class ExperimentalImplementationSelector : public CustomGraphOptimizer {
+//  - api_preferred_device(string): sets which device is preferred.
+class ImplementationSelector : public CustomGraphOptimizer {
  public:
-  ExperimentalImplementationSelector() = default;
-  ~ExperimentalImplementationSelector() override = default;
+  ImplementationSelector() = default;
+  ~ImplementationSelector() override = default;
   Status Init(
       const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
     return Status::OK();
   }
   string name() const override {
-    return "experimental_implementation_selector";
+    return "implementation_selector";
   }
 
   // This call is not thread-safe.
@@ -106,10 +105,10 @@ class ExperimentalImplementationSelector : public CustomGraphOptimizer {
 
   std::unique_ptr<FunctionLibraryApiInfo> lib_info_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ExperimentalImplementationSelector);
+  TF_DISALLOW_COPY_AND_ASSIGN(ImplementationSelector);
 };
 
 }  // namespace grappler
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_EXPERIMENTAL_IMPLEMENTATION_SELECTOR_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_IMPLEMENTATION_SELECTOR_H_
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc b/tensorflow/core/grappler/optimizers/implementation_selector_test.cc
similarity index 82%
rename from tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
rename to tensorflow/core/grappler/optimizers/implementation_selector_test.cc
index e330835e9bc4fea33928e376a3fd98ebe34a74ee..e2f58964a2a089a0cfda57449f288925ed71d858 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
+++ b/tensorflow/core/grappler/optimizers/implementation_selector_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/grappler/optimizers/experimental_implementation_selector.h"
+#include "tensorflow/core/grappler/optimizers/implementation_selector.h"
 
 #include <algorithm>
 #include <memory>
@@ -38,15 +38,14 @@ namespace {
 constexpr char CpuDevice[] = "/device:CPU:0";
 constexpr char GpuDevice[] = "/device:GPU:0";
 
-class ExperimentalImplementationSelectorTest : public GrapplerTest {};
+class ImplementationSelectorTest : public GrapplerTest {};
 
-TEST_F(ExperimentalImplementationSelectorTest, NoUpdate) {
+TEST_F(ImplementationSelectorTest, NoUpdate) {
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {CpuDevice});
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
 
-  std::unique_ptr<CustomGraphOptimizer> optimizer(
-      new ExperimentalImplementationSelector);
+  std::unique_ptr<CustomGraphOptimizer> optimizer(new ImplementationSelector);
   ASSERT_NE(nullptr, optimizer);
   TF_ASSERT_OK(optimizer->Init());
 
@@ -58,19 +57,19 @@ TEST_F(ExperimentalImplementationSelectorTest, NoUpdate) {
   EXPECT_EQ(item.graph.node_size(), output.node_size());
 }
 
-TEST_F(ExperimentalImplementationSelectorTest, SwapImplementation) {
+TEST_F(ImplementationSelectorTest, SwapImplementation) {
   using test::function::NDef;
   auto cpu_def = test::function::XTimesTwo();
   auto* func_attr = cpu_def.mutable_attr();
-  (*func_attr)["experimental_api_implements"].set_s("times_two");
-  (*func_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*func_attr)["api_implements"].set_s("times_two");
+  (*func_attr)["api_preferred_device"].set_s("CPU");
 
   auto gpu_def = test::function::XAddX();
   auto* func2_attr = gpu_def.mutable_attr();
-  (*func2_attr)["experimental_api_implements"].set_s("times_two");
-  (*func2_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*func2_attr)["api_implements"].set_s("times_two");
+  (*func2_attr)["api_preferred_device"].set_s("GPU");
 
-  ExperimentalImplementationSelector optimizer;
+  ImplementationSelector optimizer;
   GraphDef output;
   GrapplerItem item;
   item.graph = test::function::GDef(
@@ -96,19 +95,19 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementation) {
   }
 }
 
-TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationEval) {
+TEST_F(ImplementationSelectorTest, SwapImplementationEval) {
   using test::function::NDef;
   auto cpu_def = test::function::XTimesTwo();
   auto* func_attr = cpu_def.mutable_attr();
-  (*func_attr)["experimental_api_implements"].set_s("random_boost");
-  (*func_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*func_attr)["api_implements"].set_s("random_boost");
+  (*func_attr)["api_preferred_device"].set_s("CPU");
 
   auto gpu_def = test::function::XTimesFour();
   auto* func2_attr = gpu_def.mutable_attr();
-  (*func2_attr)["experimental_api_implements"].set_s("random_boost");
-  (*func2_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*func2_attr)["api_implements"].set_s("random_boost");
+  (*func2_attr)["api_preferred_device"].set_s("GPU");
 
-  ExperimentalImplementationSelector optimizer;
+  ImplementationSelector optimizer;
   GraphDef output;
   GrapplerItem item;
   item.graph = test::function::GDef(
@@ -133,7 +132,7 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationEval) {
                                  test::AsScalar<float>(2.0f));
 }
 
-TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
+TEST_F(ImplementationSelectorTest, SwapImplementationWithGradient) {
   using test::function::NDef;
   using FDH = FunctionDefHelper;
   // boost_1 returns the doubled input and a const as the internal state, the
@@ -146,8 +145,8 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
       /* Mapping between function returns and function node outputs. */
       {{"z", "boost:z:0"}, {"s", "one:output:0"}});
   auto* boost_1_attr = boost_1.mutable_attr();
-  (*boost_1_attr)["experimental_api_implements"].set_s("random_boost");
-  (*boost_1_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*boost_1_attr)["api_implements"].set_s("random_boost");
+  (*boost_1_attr)["api_preferred_device"].set_s("CPU");
   (*boost_1_attr)["backward_function_name"].set_s("BoostCpuGradient");
 
   FunctionDef boost_1_gradient = FDH::Create(
@@ -157,8 +156,8 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
       /* Mapping between function returns and function node outputs. */
       {{"dx", "grad:z:0"}});
   auto* boost_1_grad_attr = boost_1_gradient.mutable_attr();
-  (*boost_1_grad_attr)["experimental_api_implements"].set_s("random_boost");
-  (*boost_1_grad_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*boost_1_grad_attr)["api_implements"].set_s("random_boost");
+  (*boost_1_grad_attr)["api_preferred_device"].set_s("CPU");
   (*boost_1_grad_attr)["forward_function_name"].set_s("BoostCpu");
 
   // boost_2 return the input * 4, and with two extra internal states.
@@ -171,8 +170,8 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
       /* Mapping between function returns and function node outputs. */
       {{"z", "boost:z:0"}, {"s1", "one:output:0"}, {"s2", "two:output:0"}});
   auto* boost_2_attr = boost_2_func.mutable_attr();
-  (*boost_2_attr)["experimental_api_implements"].set_s("random_boost");
-  (*boost_2_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*boost_2_attr)["api_implements"].set_s("random_boost");
+  (*boost_2_attr)["api_preferred_device"].set_s("GPU");
   (*boost_2_attr)["backward_function_name"].set_s("BoostGpuGradient");
 
   FunctionDef boost_2_gradient = FDH::Create(
@@ -182,8 +181,8 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
       /* Mapping between function returns and function node outputs. */
       {{"dx", "grad:z:0"}});
   auto* boost_2_grad_attr = boost_2_gradient.mutable_attr();
-  (*boost_2_grad_attr)["experimental_api_implements"].set_s("random_boost");
-  (*boost_2_grad_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*boost_2_grad_attr)["api_implements"].set_s("random_boost");
+  (*boost_2_grad_attr)["api_preferred_device"].set_s("GPU");
   (*boost_2_grad_attr)["forward_function_name"].set_s("BoostGpu");
 
   // Define the forward function with f = boost2 function but with CPU device.
@@ -203,7 +202,7 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
             {"f", FDH::FunctionRef("Boost2Gradient")}},
            CpuDevice);
 
-  ExperimentalImplementationSelector optimizer;
+  ImplementationSelector optimizer;
   GraphDef output;
   GrapplerItem item;
   item.graph = test::function::GDef(
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index cf5e4db29f418ac560c6a4c6381d4a7f3d88088e..54776e7f80c32ec1de70c1f132d0ebf31d50a72e 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -581,8 +581,19 @@ Status EvaluateBoolOpForConstantOperands(const NodeDef& op_node,
   return Status::OK();
 }
 
+// TODO(lyandy): Consolidate with ConstantFolding implementation.
+bool IsReallyConstant(const NodeDef& node,
+                      const absl::flat_hash_set<string>& feed_nodes) {
+  if (!IsConstant(node)) {
+    return false;
+  }
+  // If the node is fed it's not constant anymore.
+  return feed_nodes.find(node.name()) == feed_nodes.end();
+}
+
 Status CheckForDeadFanout(const MutableGraphView& view,
                           const NodeDef& switch_node, const NodeMap& node_map,
+                          const absl::flat_hash_set<string>& feed_nodes,
                           DeviceBase* cpu_device, ResourceMgr* resource_mgr,
                           bool* has_dead_fanout, int* dead_fanout) {
   *has_dead_fanout = false;
@@ -591,7 +602,7 @@ Status CheckForDeadFanout(const MutableGraphView& view,
       view.GetRegularFanin(switch_loopcond_port).node;
 
   // CASE 1: Control is a constant.
-  if (IsConstant(*switch_predicate)) {
+  if (IsReallyConstant(*switch_predicate, feed_nodes)) {
     Tensor selector;
     CHECK(selector.FromProto(switch_predicate->attr().at("value").tensor()));
     *has_dead_fanout = true;
@@ -630,7 +641,7 @@ Status CheckForDeadFanout(const MutableGraphView& view,
     if (IsMerge(*node)) {
       merge_node = node;
     }
-    if (IsConstant(*node)) {
+    if (IsReallyConstant(*node, feed_nodes)) {
       constant_ctrl_input = node;
       constant_index = i;
     }
@@ -646,7 +657,7 @@ Status CheckForDeadFanout(const MutableGraphView& view,
     if (IsEnter(*node)) {
       enter_node = node;
     }
-    if (IsConstant(*node)) {
+    if (IsReallyConstant(*node, feed_nodes)) {
       constant_init_node = node;
     }
   }
@@ -654,7 +665,7 @@ Status CheckForDeadFanout(const MutableGraphView& view,
     if (constant_init_node != nullptr) return Status::OK();
     for (const auto& input : enter_node->input()) {
       NodeDef* node = node_map.GetNode(input);
-      if (IsConstant(*node)) {
+      if (IsReallyConstant(*node, feed_nodes)) {
         constant_init_node = node;
       }
     }
@@ -710,8 +721,12 @@ Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     // TODO(srjoglekar): Figure out if we can optimize NodeMap creations across
     // optimizer passes.
     NodeMap node_map(optimized_graph);
-    TF_RETURN_IF_ERROR(
-        RemoveDeadBranches(item.NodesToPreserve(), node_map, optimized_graph));
+    absl::flat_hash_set<string> feed_nodes;
+    for (const auto& feed : item.feed) {
+      feed_nodes.insert(NodeName(feed.first));
+    }
+    TF_RETURN_IF_ERROR(RemoveDeadBranches(item.NodesToPreserve(), node_map,
+                                          feed_nodes, optimized_graph));
   }
 
   return Status::OK();
@@ -719,7 +734,8 @@ Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 
 Status LoopOptimizer::RemoveDeadBranches(
     const std::unordered_set<string>& nodes_to_preserve,
-    const NodeMap& node_map, GraphDef* optimized_graph) {
+    const NodeMap& node_map, const absl::flat_hash_set<string>& feed_nodes,
+    GraphDef* optimized_graph) {
   std::unordered_set<const NodeDef*> dead_nodes;
   std::unordered_map<NodeDef*, std::set<int>> dead_merge_inputs;
   // TODO(bsteiner): also rewrite switches as identity. For now we just record
@@ -737,9 +753,9 @@ Status LoopOptimizer::RemoveDeadBranches(
 
     int dead_fanout;
     bool has_dead_fanout;
-    TF_RETURN_IF_ERROR(CheckForDeadFanout(view, node, node_map, cpu_device_,
-                                          resource_mgr_.get(), &has_dead_fanout,
-                                          &dead_fanout));
+    TF_RETURN_IF_ERROR(CheckForDeadFanout(view, node, node_map, feed_nodes,
+                                          cpu_device_, resource_mgr_.get(),
+                                          &has_dead_fanout, &dead_fanout));
     if (!has_dead_fanout) {
       continue;
     }
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.h b/tensorflow/core/grappler/optimizers/loop_optimizer.h
index d467237a9a704a81a0ecc1da71531868c7f3a49b..7fa1976f348391438d62ce51fb9b8f06f34e15a2 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.h
@@ -60,7 +60,9 @@ class LoopOptimizer : public GraphOptimizer {
   };
 
   Status RemoveDeadBranches(const std::unordered_set<string>& nodes_to_preserve,
-                            const NodeMap& node_map, GraphDef* optimized_graph);
+                            const NodeMap& node_map,
+                            const absl::flat_hash_set<string>& feed_nodes,
+                            GraphDef* optimized_graph);
 
   RewriterConfig::Toggle opt_level_;
   DeviceBase* cpu_device_;
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
index 587767c23c370ca1f747fc5b4e2bfa4cba3ae10d..db4494d42e774bf2db3791564d062141e00d1609 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
@@ -504,11 +504,11 @@ void VerifyGraphsEqual(const GraphDef& original_graph,
   for (int i = 0; i < original_graph.node_size(); ++i) {
     const NodeDef& original = original_graph.node(i);
     const NodeDef& optimized = optimized_graph.node(i);
-    EXPECT_EQ(original.name(), optimized.name()) << func;
-    EXPECT_EQ(original.op(), optimized.op()) << func;
-    EXPECT_EQ(original.input_size(), optimized.input_size()) << func;
+    EXPECT_EQ(optimized.name(), original.name()) << func;
+    EXPECT_EQ(optimized.op(), original.op()) << func;
+    ASSERT_EQ(optimized.input_size(), original.input_size()) << func;
     for (int j = 0; j < original.input_size(); ++j) {
-      EXPECT_EQ(original.input(j), optimized.input(j)) << func;
+      EXPECT_EQ(optimized.input(j), original.input(j)) << func;
     }
   }
 }
@@ -528,7 +528,7 @@ TEST_F(LoopOptimizerTest, NoOp) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
-TEST_F(LoopOptimizerTest, RemovePush_NoOp) {
+TEST_F(LoopOptimizerTest, RemovePushNoOp) {
   GrapplerItem item;
   GraphDef& graph = item.graph;
   AddSimpleNode("c", "Const", {}, &graph);
@@ -557,7 +557,7 @@ TEST_F(LoopOptimizerTest, RemovePush_NoOp) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
-TEST_F(LoopOptimizerTest, RemovePush_NoPopButStackLives) {
+TEST_F(LoopOptimizerTest, RemovePushNoPopButStackLives) {
   GrapplerItem item;
   GraphDef& graph = item.graph;
   AddSimpleNode("c", "Const", {}, &graph);
@@ -609,32 +609,32 @@ TEST_F(LoopOptimizerTest, RemovePushWithoutMatchingPop) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(13, output.node_size());
+  EXPECT_EQ(output.node_size(), 13);
   for (int i = 0; i < output.node_size(); ++i) {
     const NodeDef& node = output.node(i);
     if (node.name() == "push1") {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("c", node.input(0));
-      EXPECT_EQ("^stack1", node.input(1));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "c");
+      EXPECT_EQ(node.input(1), "^stack1");
     } else if (node.name() == "push2") {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("enter_c", node.input(0));
-      EXPECT_EQ("^enter_stack2", node.input(1));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "enter_c");
+      EXPECT_EQ(node.input(1), "^enter_stack2");
     } else if (node.name() == "push3") {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("c", node.input(0));
-      EXPECT_EQ("^stack3", node.input(1));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "c");
+      EXPECT_EQ(node.input(1), "^stack3");
     } else {
       const NodeDef& orig_node = item.graph.node(i);
-      EXPECT_EQ(orig_node.ShortDebugString(), node.ShortDebugString());
+      EXPECT_EQ(node.ShortDebugString(), orig_node.ShortDebugString());
     }
   }
 }
 
-TEST_F(LoopOptimizerTest, RemoveDeadBranches_ConstantCondition) {
+TEST_F(LoopOptimizerTest, RemoveDeadBranchesConstantCondition) {
   Scope scope = Scope::NewRootScope();
   Output v_in = ops::Variable(scope.WithOpName("v_in"), {3}, DT_FLOAT);
 
@@ -691,57 +691,57 @@ TEST_F(LoopOptimizerTest, RemoveDeadBranches_ConstantCondition) {
 
   for (const NodeDef& node : output.node()) {
     // These nodes should have been pruned
-    EXPECT_NE("Square1", node.name());
-    EXPECT_NE("Sqrt2", node.name());
-    EXPECT_NE("m5", node.name());
-    EXPECT_NE("m7", node.name());
+    EXPECT_NE(node.name(), "Square1");
+    EXPECT_NE(node.name(), "Sqrt2");
+    EXPECT_NE(node.name(), "m5");
+    EXPECT_NE(node.name(), "m7");
 
     if (node.name() == "m1") {
       // sqrt1 is dead
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("square1", node.input(0));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "square1");
     } else if (node.name() == "m2") {
       // both inputs are alive
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("v_in", node.input(0));
-      EXPECT_EQ("square1", node.input(1));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "v_in");
+      EXPECT_EQ(node.input(1), "square1");
     } else if (node.name() == "m3") {
       // sqrt1 is dead
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("v_in", node.input(0));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "v_in");
     } else if (node.name() == "m4") {
       // both inputs are alive
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("square1", node.input(0));
-      EXPECT_EQ("sqrt2", node.input(1));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "square1");
+      EXPECT_EQ(node.input(1), "sqrt2");
     } else if (node.name() == "m6") {
       // both inputs are alive and the control dependency can get triggered
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(3, node.input_size());
-      EXPECT_EQ("v_in", node.input(0));
-      EXPECT_EQ("square1", node.input(1));
-      EXPECT_EQ("^sqrt2", node.input(2));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 3);
+      EXPECT_EQ(node.input(0), "v_in");
+      EXPECT_EQ(node.input(1), "square1");
+      EXPECT_EQ(node.input(2), "^sqrt2");
     } else if (node.name() == "m8") {
       // The node is to be preserved because of a fetch
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("id1", node.input(0));
-      EXPECT_EQ("id2", node.input(1));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "id1");
+      EXPECT_EQ(node.input(1), "id2");
     } else if (node.name() == "m9") {
       // The node is to be preserved because of a fetch
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("id3", node.input(0));
-      EXPECT_EQ("id4", node.input(1));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ(node.input(0), "id3");
+      EXPECT_EQ(node.input(1), "id4");
     }
   }
 }
 
-TEST_F(LoopOptimizerTest, RemoveDeadBranches_FullyRemoveDeadBranches) {
+TEST_F(LoopOptimizerTest, RemoveDeadBranchesFullyRemoveDeadBranches) {
   const string gdef_ascii = R"EOF(
 node {
   name: "episodicreplaybuffer_add_readvariableop_resource"
@@ -1153,7 +1153,7 @@ versions {
       << "Merge node was deleted, but it shouldn't have been.";
 }
 
-TEST_F(LoopOptimizerTest, RemoveDeadBranches_ZeroIterWhile) {
+TEST_F(LoopOptimizerTest, RemoveDeadBranchesZeroIterWhile) {
   const string gdef_ascii = R"EOF(
 node {
   name: "Const"
@@ -1358,15 +1358,15 @@ versions {
   CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
   item.fetch = {"while/Exit"};
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
-  EXPECT_EQ(1, tensors_expected.size());
+  ASSERT_EQ(tensors_expected.size(), 1);
 
   LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE, nullptr);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_CHECK_OK(status);
   auto tensors_got = EvaluateNodes(output, item.fetch);
-  EXPECT_EQ(1, tensors_got.size());
-  test::ExpectTensorEqual<int32>(tensors_expected[0], tensors_got[0]);
+  ASSERT_EQ(tensors_got.size(), 1);
+  test::ExpectTensorEqual<int32>(tensors_got[0], tensors_expected[0]);
 
   int nodes_present = 0;
   for (const NodeDef& node : output.node()) {
@@ -1382,7 +1382,200 @@ versions {
     }
     ++nodes_present;
   }
-  EXPECT_EQ(8, nodes_present);
+  EXPECT_EQ(nodes_present, 8);
+}
+
+TEST_F(LoopOptimizerTest, RemoveDeadBranchesConstantFeed) {
+  const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        string_val: "I\'m a value!"
+      }
+    }
+  }
+}
+node {
+  name: "cond/Switch_1"
+  op: "Switch"
+  input: "Const"
+  input: "Const_1"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Const"
+      }
+    }
+  }
+}
+node {
+  name: "Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: true
+      }
+    }
+  }
+}
+node {
+  name: "cond/Switch"
+  op: "Switch"
+  input: "Const_1"
+  input: "Const_1"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "cond/switch_t"
+  op: "Identity"
+  input: "cond/Switch:1"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "cond/Const"
+  op: "Const"
+  input: "^cond/switch_t"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        string_val: ""
+      }
+    }
+  }
+}
+node {
+  name: "cond/Merge"
+  op: "Merge"
+  input: "cond/Switch_1"
+  input: "cond/Const"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+}
+node {
+  name: "Identity"
+  op: "Identity"
+  input: "cond/Merge"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 27
+}
+  )EOF";
+
+  GrapplerItem item;
+  CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
+  item.fetch = {"Identity"};
+  Tensor feed_tensor(DT_BOOL, {});
+  feed_tensor.flat<bool>()(0) = false;
+  item.feed.push_back({"Const_1", feed_tensor});
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  ASSERT_EQ(tensors_expected.size(), 1);
+
+  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE, nullptr);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_CHECK_OK(status);
+  auto tensors_got = EvaluateNodes(output, item.fetch);
+  ASSERT_EQ(tensors_got.size(), 1);
+  test::ExpectTensorEqual<string>(tensors_got[0], tensors_expected[0]);
+
+  EXPECT_EQ(output.node_size(), 8);
+
+  // No rewrite because branch has a constant feed node.
+  bool found = false;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "cond/Merge") {
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "cond/Switch_1");
+      EXPECT_EQ(node.input(1), "cond/Const");
+      found = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(found);
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 33942a591f810f61462892083e28d29495878739..36d68a7b0fbd256290c030043d2ddb5e471c6147 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -26,8 +26,8 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
 #include "tensorflow/core/grappler/optimizers/debug_stripper.h"
 #include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
-#include "tensorflow/core/grappler/optimizers/experimental_implementation_selector.h"
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/implementation_selector.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
@@ -148,6 +148,9 @@ Status MetaOptimizer::InitializeOptimizers(
   if (!cfg_.disable_model_pruning()) {
     optimizers->push_back(MakeUnique<ModelPruner>());
   }
+  if (cfg_.implementation_selector() != RewriterConfig::OFF) {
+    optimizers->push_back(MakeUnique<ImplementationSelector>());
+  }
   if (cfg_.function_optimization() != RewriterConfig::OFF) {
     optimizers->push_back(
         MakeUnique<FunctionOptimizer>(cfg_.function_optimization()));
@@ -241,18 +244,10 @@ Status MetaOptimizer::InitializeCustomGraphOptimizers(
         pre_initialized_optimizers.end()) {
       continue;
     }
-    // Initialize the ExperimentalImplementationSelector here instead of
-    // CustomizeOptimizer registry, due the static link issue in TensorRT for
-    // double registry.
-    // TODO(laigd): Remove this hack and change it back to use the registry once
-    // the duplicate static import issue is fixed.
-    std::unique_ptr<CustomGraphOptimizer> custom_optimizer;
-    if (optimizer_config.name() == "ExperimentalImplementationSelector") {
-      custom_optimizer.reset(new ExperimentalImplementationSelector());
-    } else {
-      custom_optimizer = CustomGraphOptimizerRegistry::CreateByNameOrNull(
-          optimizer_config.name());
-    }
+
+    auto custom_optimizer = CustomGraphOptimizerRegistry::CreateByNameOrNull(
+        optimizer_config.name());
+
     if (custom_optimizer) {
       VLOG(2) << "Registered custom configurable graph optimizer: "
               << optimizer_config.name();
@@ -701,9 +696,10 @@ Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
 }
 
 Status OptimizeGraph(
-    std::vector<string> ret_node_names, FunctionLibraryDefinition* flib,
-    const DeviceSet& device_set, Device* cpu_device,
-    const ConfigProto& config_proto, const string& grappler_item_id,
+    std::vector<string> ret_node_names, std::vector<string> keep_node_names,
+    FunctionLibraryDefinition* flib, const DeviceSet& device_set,
+    Device* cpu_device, const ConfigProto& config_proto,
+    const string& grappler_item_id,
     const GrapplerItem::OptimizationOptions& optimization_options,
     std::unique_ptr<tensorflow::Graph>* g) {
   if (!tensorflow::grappler::MetaOptimizerEnabled(config_proto)) {
@@ -723,6 +719,9 @@ Status OptimizeGraph(
   // Add fetches so that the graph can be pruned.
   item.fetch.swap(ret_node_names);
 
+  // Add noes that can't be removed from the graph.
+  item.keep_ops = std::move(keep_node_names);
+
   (*g)->ToGraphDef(&item.graph);
 
   if (flib) {
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index 751a9e5d60a4b921be5fe50258c99faa4e261b29..b8f0c8e6ff56e2f497144417082d88916b3362ec 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -129,9 +129,10 @@ Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
 // complete copy. Therefore, the caller should not keep any references
 // to nodes *g.
 Status OptimizeGraph(
-    std::vector<string> ret_node_names, FunctionLibraryDefinition* lib,
-    const DeviceSet& device_set, Device* cpu_device,
-    const ConfigProto& config_proto, const string& grappler_item_id,
+    std::vector<string> ret_node_names, std::vector<string> keep_node_names,
+    FunctionLibraryDefinition* lib, const DeviceSet& device_set,
+    Device* cpu_device, const ConfigProto& config_proto,
+    const string& grappler_item_id,
     const GrapplerItem::OptimizationOptions& optimization_options,
     std::unique_ptr<tensorflow::Graph>* g);
 
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 375c3e56c80aa65cd9e5ab0e2248b81d3e3db776..7d4dfb052071ce374f7361eaed19f2e94daf64e9 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -40,7 +40,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 template <typename T>
-bool SafeSetScalarTensorValue(double value, Tensor* tensor) {
+bool SafeSetDoubleScalarTensorValue(double value, Tensor* tensor) {
   using RealType = typename Eigen::NumTraits<T>::Real;
   if (value > static_cast<double>(Eigen::NumTraits<RealType>::highest()) ||
       value < static_cast<double>(Eigen::NumTraits<RealType>::lowest())) {
@@ -50,6 +50,17 @@ bool SafeSetScalarTensorValue(double value, Tensor* tensor) {
   return true;
 }
 
+template <typename T>
+bool SafeSetIntScalarTensorValue(int value, Tensor* tensor) {
+  using RealType = typename Eigen::NumTraits<T>::Real;
+  if (value > static_cast<int>(Eigen::NumTraits<RealType>::highest()) ||
+      value < static_cast<int>(Eigen::NumTraits<RealType>::lowest())) {
+    return false;
+  }
+  tensor->flat<T>()(0) = static_cast<T>(value);
+  return true;
+}
+
 // Is 'node' an operator that consumes only the shape of its input, not the
 // data itself?
 // TODO(ezhulenev): move to op_types.h. Requires to break circular dependency.
@@ -410,35 +421,50 @@ void EraseNodesFromGraph(const std::set<string>& nodes_to_delete,
   EraseNodesFromGraphImpl(nodes_idx_to_delete, graph);
 }
 
-#define HANDLE_CASE(DTYPE)                                          \
-  case DTYPE:                                                       \
-    if (!SafeSetScalarTensorValue<EnumToDataType<DTYPE>::Type>(     \
-            static_cast<double>(value), tensor)) {                  \
-      return errors::InvalidArgument("Cannot store value ", value,  \
-                                     " in tensor of type " #DTYPE); \
-    }                                                               \
+#define HANDLE_DOUBLE_CASE(DTYPE)                                     \
+  case DTYPE:                                                         \
+    if (!SafeSetDoubleScalarTensorValue<EnumToDataType<DTYPE>::Type>( \
+            static_cast<double>(value), tensor)) {                    \
+      return errors::InvalidArgument("Cannot store value ", value,    \
+                                     " in tensor of type " #DTYPE);   \
+    }                                                                 \
+    break
+
+#define HANDLE_INT_CASE(DTYPE)                                               \
+  case DTYPE:                                                                \
+    if (!SafeSetIntScalarTensorValue<EnumToDataType<DTYPE>::Type>(value,     \
+                                                                  tensor)) { \
+      return errors::InvalidArgument("Cannot store value ", value,           \
+                                     " in tensor of type " #DTYPE);          \
+    }                                                                        \
     break
 
 Status SetTensorValue(DataType dtype, int value, Tensor* tensor) {
   // TODO(rmlarsen): Support more general shapes.
+  // TODO(lyandy): Change `value` to be int64 once int64 -> qint32 is supported.
   if (tensor->NumElements() != 1) {
     return errors::InvalidArgument(
         "Expected scalar tensor, got num_elements = ", tensor->NumElements());
   }
   switch (dtype) {
-    HANDLE_CASE(DT_HALF);
-    HANDLE_CASE(DT_BFLOAT16);
-    HANDLE_CASE(DT_BOOL);
-    HANDLE_CASE(DT_FLOAT);
-    HANDLE_CASE(DT_DOUBLE);
-    HANDLE_CASE(DT_UINT8);
-    HANDLE_CASE(DT_INT8);
-    HANDLE_CASE(DT_UINT16);
-    HANDLE_CASE(DT_INT16);
-    HANDLE_CASE(DT_INT32);
-    HANDLE_CASE(DT_INT64);
-    HANDLE_CASE(DT_COMPLEX64);
-    HANDLE_CASE(DT_COMPLEX128);
+    HANDLE_DOUBLE_CASE(DT_HALF);
+    HANDLE_DOUBLE_CASE(DT_BFLOAT16);
+    HANDLE_DOUBLE_CASE(DT_BOOL);
+    HANDLE_DOUBLE_CASE(DT_FLOAT);
+    HANDLE_DOUBLE_CASE(DT_DOUBLE);
+    HANDLE_DOUBLE_CASE(DT_UINT8);
+    HANDLE_DOUBLE_CASE(DT_INT8);
+    HANDLE_DOUBLE_CASE(DT_UINT16);
+    HANDLE_DOUBLE_CASE(DT_INT16);
+    HANDLE_DOUBLE_CASE(DT_INT32);
+    HANDLE_DOUBLE_CASE(DT_INT64);
+    HANDLE_DOUBLE_CASE(DT_COMPLEX64);
+    HANDLE_DOUBLE_CASE(DT_COMPLEX128);
+    HANDLE_INT_CASE(DT_QINT8);
+    HANDLE_INT_CASE(DT_QUINT8);
+    HANDLE_INT_CASE(DT_QINT16);
+    HANDLE_INT_CASE(DT_QUINT16);
+    HANDLE_INT_CASE(DT_QINT32);
     default:
       return errors::InvalidArgument("Unsupported type ",
                                      DataTypeString(dtype));
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index 5f38aeac5472f62ac9b24a2e2238cbfabb093c63..2ec9794b68aad4b322e280eda033b26d7e592913 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -311,15 +311,14 @@ GrapplerFunctionItem::GrapplerFunctionItem(
     string func_name, string description, AttrSlice func_attr,
     std::vector<InputArgExpansion> input_arg_expansions,
     std::vector<OutputArgExpansion> output_arg_expansions,
-    std::vector<string> keep_nodes, const int graph_def_version,
+    std::vector<ControlOutput> control_outputs, const int graph_def_version,
     const bool is_stateful, GraphDef&& function_body)
     : description_(std::move(description)),
-      func_attr_(std::move(func_attr)),
+      func_attr_(func_attr),
       input_arg_expansions_(std::move(input_arg_expansions)),
       output_arg_expansions_(std::move(output_arg_expansions)),
+      control_outputs_(std::move(control_outputs)),
       is_stateful_(is_stateful) {
-  // Move assign GrapplerItem members.
-  keep_ops = std::move(keep_nodes);
   id = std::move(func_name);
   graph = std::move(function_body);
 
@@ -336,6 +335,10 @@ GrapplerFunctionItem::GrapplerFunctionItem(
       fetch.push_back(output_node);
     }
   }
+  // We must keep all control output nodes.
+  for (const ControlOutput& control_output : control_outputs_) {
+    keep_ops.push_back(control_output.node_name);
+  }
 
   // Tensorflow functions execution semantics is different from the main graph,
   // and we need to preserve it when we do graph optimizations.
@@ -368,6 +371,15 @@ const std::size_t GrapplerFunctionItem::output_size() const {
   return output_arg_expansions_.size();
 }
 
+const std::vector<ControlOutput>& GrapplerFunctionItem::control_outputs()
+    const {
+  return control_outputs_;
+}
+
+const std::size_t GrapplerFunctionItem::control_output_size() const {
+  return control_outputs_.size();
+}
+
 const AttrSlice& GrapplerFunctionItem::func_attr() const { return func_attr_; }
 
 const GraphDef& GrapplerFunctionItem::function_body() const { return graph; }
@@ -624,15 +636,20 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     outputs.push_back(std::move(output));
   }
 
-  std::vector<string> keep_ops;
-  bool is_stateful = signature.is_stateful();
+  // Control outputs ensure that all side-effectful nodes in the function body
+  // will execute, even if they are not required to compute regular output args.
+  std::vector<ControlOutput> control_outputs;
+  control_outputs.reserve(func.control_ret_size());
+  for (const auto& control_ret : func.control_ret()) {
+    control_outputs.push_back({control_ret.first, control_ret.second});
+  }
 
   *item = GrapplerFunctionItem(
       /*func_name=*/signature.name(),
       /*description=*/signature.description(),
       /*func_attr=*/AttrSlice(&func.attr()), std::move(inputs),
-      std::move(outputs), std::move(keep_ops), graph_def_version, is_stateful,
-      std::move(function_body));
+      std::move(outputs), std::move(control_outputs), graph_def_version,
+      signature.is_stateful(), std::move(function_body));
   return Status::OK();
 }
 
@@ -844,6 +861,13 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
         &(*func->mutable_ret())[output_arg.output_name]));
   }
 
+  // Add function control outputs.
+  for (const ControlOutput& control_out : item.control_outputs()) {
+    func->mutable_control_ret()->insert(
+        {control_out.output_name, control_out.node_name});
+    *func->mutable_signature()->add_control_output() = control_out.output_name;
+  }
+
   // Copy function definition specific attributes.
   for (const auto& attr : item.func_attr()) {
     const auto& attr_name = attr.first;
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index d5a41e74739d67fc2cef0c295efe208edbd6255c..d450f6a41fcf926def615c34b4acc725fae5b3d7 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -74,6 +74,12 @@ struct OutputArgExpansion {
   absl::InlinedVector<string, 1> output_nodes;
 };
 
+// A mapping from control output name to node name in function body graph.
+struct ControlOutput {
+  string output_name;
+  string node_name;
+};
+
 // FunctionDef uses different connectivity encoding for the function body nodes,
 // then a GraphDef (see function.proto for details). Input name in FunctionDef
 // can potentially represent a sequence of tensors (instead just one tensor in
@@ -161,6 +167,9 @@ class GrapplerFunctionItem : public GrapplerItem {
   const OutputArgExpansion& output(int i) const;
   const std::size_t output_size() const;
 
+  const std::vector<ControlOutput>& control_outputs() const;
+  const std::size_t control_output_size() const;
+
   const AttrSlice& func_attr() const;
   const GraphDef& function_body() const;
   GraphDef& mutable_function_body();
@@ -183,8 +192,9 @@ class GrapplerFunctionItem : public GrapplerItem {
                        AttrSlice func_attr,
                        std::vector<InputArgExpansion> input_arg_expansions,
                        std::vector<OutputArgExpansion> output_arg_expansions,
-                       std::vector<string> keep_nodes, int graph_def_version,
-                       bool is_stateful, GraphDef&& function_body);
+                       std::vector<ControlOutput> control_outputs,
+                       int graph_def_version, bool is_stateful,
+                       GraphDef&& function_body);
 
   string description_;
   AttrSlice func_attr_;  // Attributes specific to function definition that
@@ -192,6 +202,7 @@ class GrapplerFunctionItem : public GrapplerItem {
 
   std::vector<InputArgExpansion> input_arg_expansions_;
   std::vector<OutputArgExpansion> output_arg_expansions_;
+  std::vector<ControlOutput> control_outputs_;
 
   bool is_stateful_ = false;
 };
@@ -241,7 +252,7 @@ Status RemoveFunctionOutputs(const absl::flat_hash_set<int>& remove_outputs,
                              GrapplerFunctionItem* item,
                              std::vector<std::pair<int, int>>* output_mapping);
 
-// TODO(ezhulennev, b/120103818): Add RemoveFunctionInputs.
+// TODO(ezhulenev, b/120103818): Add RemoveFunctionInputs.
 
 // Make a GrapplerFunctionItem from the function definition and function
 // instantiation attributes (caller node attributes). Returns error if the given
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 30b6195a9bf95f0285590dc6dbf0de2d770c4e80..813e6a318cf69db536bb6859f1937a3366d03d70 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -646,6 +646,38 @@ TEST_F(FunctionsTest, FromFunctionDefWithSideEffectfulOps) {
   EXPECT_FALSE(opts.allow_pruning_stateful_and_dataset_ops);
 }
 
+TEST_F(FunctionsTest, FromFunctionDefWithControlOutputs) {
+  const Tensor kOne = test::AsScalar<float>(1.0);
+  FunctionDef func = FunctionDefHelper::Create(
+      "WithControlOutputs", /*in_def=*/{"x: Ref(float)"}, /*out_def=*/{}, {},
+      {
+          {{"one"}, "Const", {}, {{"value", kOne}, {"dtype", DT_FLOAT}}},
+          {{"update"}, "AssignAdd", {"x", "one:output:0"}, {{"T", DT_FLOAT}}},
+      },
+      {}, {{"side_effects", "update"}});
+
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
+
+  EXPECT_EQ("WithControlOutputs", item.id);
+  EXPECT_EQ(3, item.function_body().node_size());
+  EXPECT_EQ(1, item.input_size());
+  EXPECT_EQ(0, item.output_size());
+
+  ASSERT_EQ(1, item.keep_ops.size());
+  EXPECT_EQ("update", item.keep_ops[0]);
+
+  ASSERT_EQ(1, item.control_output_size());
+  const ControlOutput &ctrl = item.control_outputs()[0];
+  EXPECT_EQ("side_effects", ctrl.output_name);
+  EXPECT_EQ("update", ctrl.node_name);
+}
+
 TEST_F(FunctionsTest, MakeFunctionDef) {
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
@@ -827,17 +859,14 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
 }
 
 TEST_F(FunctionsTest, FunctionDefGrapplerFunctionItemRoundTrip) {
-  FunctionDef func = FunctionDefHelper::Define(
-      // Name
-      "DoNothing",
-      // Args
-      {"i: int32"},
-      // Return values
-      {"o: int32"},
-      // Attr def
-      {},
-      // Nodes
-      {{{"o"}, "Identity", {"i"}, {{"T", DT_INT32}}}});
+  FunctionDef func = FunctionDefHelper::Create(
+      "DoNothing", /*in_def=*/{"i: int32"}, /*out_def*/ {"o: int32"},
+      /*attr_def*/ {},
+      {
+          {{"id"}, "Identity", {"i"}, {{"T", DT_INT32}}},
+      },
+      /*ret_def=*/{{"o", "id:output:0"}},
+      /*control_ret_def=*/{{"must_execute", "id"}});
 
   constexpr char description[] = "This is a helpful description.";
   func.mutable_signature()->set_description(description);
diff --git a/tensorflow/core/grappler/utils/grappler_test.cc b/tensorflow/core/grappler/utils/grappler_test.cc
index 1b4b9f9a51af17c4472f0fc34331b75192e3d3ae..3a0eec68d1c6adc4236ab2e0e79c8cb66a19b098 100644
--- a/tensorflow/core/grappler/utils/grappler_test.cc
+++ b/tensorflow/core/grappler/utils/grappler_test.cc
@@ -81,6 +81,7 @@ GrapplerTest::GrapplerTest() {
   cfg->set_debug_stripper(RewriterConfig::OFF);
   cfg->set_dependency_optimization(RewriterConfig::OFF);
   cfg->set_function_optimization(RewriterConfig::OFF);
+  cfg->set_implementation_selector(RewriterConfig::OFF);
   cfg->set_layout_optimizer(RewriterConfig::OFF);
   cfg->set_loop_optimization(RewriterConfig::OFF);
   cfg->set_pin_to_host_optimization(RewriterConfig::OFF);
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index f5ae39867ac758efa52d9109b5f85b020c1e7ae4..e30b1c5b730a2c67101b9b6364b414ea2f7003d8 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <unistd.h>
 #include <limits>
 #include <memory>
+
+#include "absl/strings/substitute.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
@@ -124,56 +126,56 @@ class UtilsTest : public ::testing::Test {
 };
 
 TEST_F(UtilsTest, NodeName) {
-  EXPECT_EQ("abc", NodeName("abc"));
-  EXPECT_EQ("abc", NodeName("^abc"));
-  EXPECT_EQ("abc", NodeName("abc:0"));
-  EXPECT_EQ("abc", NodeName("^abc:0"));
-
-  EXPECT_EQ("abc/def", NodeName("abc/def"));
-  EXPECT_EQ("abc/def", NodeName("^abc/def"));
-  EXPECT_EQ("abc/def", NodeName("abc/def:1"));
-  EXPECT_EQ("abc/def", NodeName("^abc/def:1"));
-
-  EXPECT_EQ("abc/def0", NodeName("abc/def0"));
-  EXPECT_EQ("abc/def0", NodeName("^abc/def0"));
-  EXPECT_EQ("abc/def0", NodeName("abc/def0:0"));
-  EXPECT_EQ("abc/def0", NodeName("^abc/def0:0"));
-
-  EXPECT_EQ("abc/def_0", NodeName("abc/def_0"));
-  EXPECT_EQ("abc/def_0", NodeName("^abc/def_0"));
-  EXPECT_EQ("abc/def_0", NodeName("abc/def_0:3"));
-  EXPECT_EQ("abc/def_0", NodeName("^abc/def_0:3"));
-
-  EXPECT_EQ("abc/def_0", NodeName("^abc/def_0:3214"));
+  EXPECT_EQ(NodeName("abc"), "abc");
+  EXPECT_EQ(NodeName("^abc"), "abc");
+  EXPECT_EQ(NodeName("abc:0"), "abc");
+  EXPECT_EQ(NodeName("^abc:0"), "abc");
+
+  EXPECT_EQ(NodeName("abc/def"), "abc/def");
+  EXPECT_EQ(NodeName("^abc/def"), "abc/def");
+  EXPECT_EQ(NodeName("abc/def:1"), "abc/def");
+  EXPECT_EQ(NodeName("^abc/def:1"), "abc/def");
+
+  EXPECT_EQ(NodeName("abc/def0"), "abc/def0");
+  EXPECT_EQ(NodeName("^abc/def0"), "abc/def0");
+  EXPECT_EQ(NodeName("abc/def0:0"), "abc/def0");
+  EXPECT_EQ(NodeName("^abc/def0:0"), "abc/def0");
+
+  EXPECT_EQ(NodeName("abc/def_0"), "abc/def_0");
+  EXPECT_EQ(NodeName("^abc/def_0"), "abc/def_0");
+  EXPECT_EQ(NodeName("abc/def_0:3"), "abc/def_0");
+  EXPECT_EQ(NodeName("^abc/def_0:3"), "abc/def_0");
+
+  EXPECT_EQ(NodeName("^abc/def_0:3214"), "abc/def_0");
 }
 
 TEST_F(UtilsTest, NodePosition) {
-  EXPECT_EQ(2, NodePosition("abc:2"));
-  EXPECT_EQ(123, NodePosition("abc:123"));
-  EXPECT_EQ(-1, NodePosition("^abc:123"));
-  EXPECT_EQ(-1, NodePosition("^abc"));
-  EXPECT_EQ(0, NodePosition(""));
+  EXPECT_EQ(NodePosition("abc:2"), 2);
+  EXPECT_EQ(NodePosition("abc:123"), 123);
+  EXPECT_EQ(NodePosition("^abc:123"), -1);
+  EXPECT_EQ(NodePosition("^abc"), -1);
+  EXPECT_EQ(NodePosition(""), 0);
 }
 
 TEST_F(UtilsTest, NodePositionIfSameNode) {
-  EXPECT_EQ(-2, NodePositionIfSameNode(":123", ""));
-  EXPECT_EQ(-2, NodePositionIfSameNode(":", ""));
-  EXPECT_EQ(-2, NodePositionIfSameNode("", ""));
-  EXPECT_EQ(123, NodePositionIfSameNode("abc:123", "abc"));
-  EXPECT_EQ(-1, NodePositionIfSameNode("^abc", "abc"));
-  EXPECT_EQ(-1, NodePositionIfSameNode("^abc:123", "abc"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("abc", "xyz"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("abc", "abc/xyz"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("abc/xyz", "abc"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("abc:123", "xyz"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("^abc", "xyz"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("^abc:123", "xyz"));
+  EXPECT_EQ(NodePositionIfSameNode(":123", ""), -2);
+  EXPECT_EQ(NodePositionIfSameNode(":", ""), -2);
+  EXPECT_EQ(NodePositionIfSameNode("", ""), -2);
+  EXPECT_EQ(NodePositionIfSameNode("abc:123", "abc"), 123);
+  EXPECT_EQ(NodePositionIfSameNode("^abc", "abc"), -1);
+  EXPECT_EQ(NodePositionIfSameNode("^abc:123", "abc"), -1);
+  EXPECT_EQ(NodePositionIfSameNode("abc", "xyz"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("abc", "abc/xyz"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("abc/xyz", "abc"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("abc:123", "xyz"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("^abc", "xyz"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("^abc:123", "xyz"), -2);
 }
 
 TEST_F(UtilsTest, AddNodeNamePrefix) {
-  EXPECT_EQ("OPTIMIZED/abc", AddPrefixToNodeName("abc", "OPTIMIZED"));
-  EXPECT_EQ("^OPTIMIZED/abc", AddPrefixToNodeName("^abc", "OPTIMIZED"));
-  EXPECT_EQ("OPTIMIZED/", AddPrefixToNodeName("", "OPTIMIZED"));
+  EXPECT_EQ(AddPrefixToNodeName("abc", "OPTIMIZED"), "OPTIMIZED/abc");
+  EXPECT_EQ(AddPrefixToNodeName("^abc", "OPTIMIZED"), "^OPTIMIZED/abc");
+  EXPECT_EQ(AddPrefixToNodeName("", "OPTIMIZED"), "OPTIMIZED/");
 }
 
 TEST_F(UtilsTest, ExecuteWithTimeout) {
@@ -204,17 +206,17 @@ TEST_F(UtilsTest, ExecuteWithTimeout) {
 
 TEST_F(UtilsTest, NumOutputs) {
   GraphDef graph;
-  EXPECT_EQ(2, NumOutputs(CreateConcatOffsetNode(), &graph));
-  EXPECT_EQ(5, NumOutputs(CreateFusedBatchNormNode(), &graph));
-  EXPECT_EQ(1, NumOutputs(CreateDequeueNode(), &graph));
+  EXPECT_EQ(NumOutputs(CreateConcatOffsetNode(), &graph), 2);
+  EXPECT_EQ(NumOutputs(CreateFusedBatchNormNode(), &graph), 5);
+  EXPECT_EQ(NumOutputs(CreateDequeueNode(), &graph), 1);
 }
 
 TEST_F(UtilsTest, AsControlDependency) {
   NodeDef node;
   node.set_name("foo");
-  EXPECT_EQ("^foo", AsControlDependency(node));
-  EXPECT_EQ("^foo", AsControlDependency(node.name()));
-  EXPECT_EQ("^foo", AsControlDependency("^foo"));
+  EXPECT_EQ(AsControlDependency(node), "^foo");
+  EXPECT_EQ(AsControlDependency(node.name()), "^foo");
+  EXPECT_EQ(AsControlDependency("^foo"), "^foo");
 }
 
 TEST_F(UtilsTest, GetTailOfChain) {
@@ -233,22 +235,23 @@ TEST_F(UtilsTest, GetTailOfChain) {
   GraphDef graph;
   TF_CHECK_OK(s.ToGraphDef(&graph));
 
-  ASSERT_EQ("c0", graph.node(0).name());
-  ASSERT_EQ("c1", graph.node(1).name());
-  ASSERT_EQ("neg0", graph.node(2).name());
-  ASSERT_EQ("neg1", graph.node(3).name());
-  ASSERT_EQ("neg2", graph.node(4).name());
-  ASSERT_EQ("id1", graph.node(5).name());
-  ASSERT_EQ("id2", graph.node(6).name());
-  ASSERT_EQ("noop", graph.node(7).name());
+  ASSERT_EQ(graph.node_size(), 8);
+  ASSERT_EQ(graph.node(0).name(), "c0");
+  ASSERT_EQ(graph.node(1).name(), "c1");
+  ASSERT_EQ(graph.node(2).name(), "neg0");
+  ASSERT_EQ(graph.node(3).name(), "neg1");
+  ASSERT_EQ(graph.node(4).name(), "neg2");
+  ASSERT_EQ(graph.node(5).name(), "id1");
+  ASSERT_EQ(graph.node(6).name(), "id2");
+  ASSERT_EQ(graph.node(7).name(), "noop");
 
   NodeMap node_map(&graph);
   auto is_neg = [&](const NodeDef& node) { return node.op() == "Neg"; };
   // We walk backwards, starting as "id1", so tail should be "neg1".
   NodeDef* tail = GetTailOfChain(graph.node(5), node_map,
                                  /*follow_control_input=*/false, is_neg);
-  EXPECT_NE(tail, nullptr);
-  EXPECT_EQ("neg1", tail->name());
+  ASSERT_NE(tail, nullptr);
+  EXPECT_EQ(tail->name(), "neg1");
 
   // We stop at branching nodes, so tail should be "neg2".
   auto is_neg_and_non_branching = [&](const NodeDef& node) {
@@ -257,22 +260,22 @@ TEST_F(UtilsTest, GetTailOfChain) {
   tail =
       GetTailOfChain(graph.node(5), node_map,
                      /*follow_control_input=*/false, is_neg_and_non_branching);
-  EXPECT_NE(tail, nullptr);
-  EXPECT_EQ("neg2", tail->name());
+  ASSERT_NE(tail, nullptr);
+  EXPECT_EQ(tail->name(), "neg2");
 
   // We walk backwards, starting from "noop", also following control inputs,
   // so tail should be "neg0".
   tail = GetTailOfChain(graph.node(7), node_map,
                         /*follow_control_input=*/true, is_neg);
-  EXPECT_NE(tail, nullptr);
-  EXPECT_EQ("neg0", tail->name());
+  ASSERT_NE(tail, nullptr);
+  EXPECT_EQ(tail->name(), "neg0");
 
   // We walk backwards, starting from "noop", not following control inputs,
   // so tail should be "noop" itself.
   tail = GetTailOfChain(graph.node(7), node_map,
                         /*follow_control_input=*/false, is_neg);
-  EXPECT_NE(tail, nullptr);
-  EXPECT_EQ("noop", tail->name());
+  ASSERT_NE(tail, nullptr);
+  EXPECT_EQ(tail->name(), "noop");
 }
 
 TEST_F(UtilsTest, DedupControlInputs) {
@@ -280,40 +283,40 @@ TEST_F(UtilsTest, DedupControlInputs) {
   foo.set_name("foo");
   foo.add_input("bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(1, foo.input_size());
-  EXPECT_EQ("bar", foo.input(0));
+  ASSERT_EQ(foo.input_size(), 1);
+  EXPECT_EQ(foo.input(0), "bar");
 
   foo.set_input(0, "^bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(1, foo.input_size());
-  EXPECT_EQ("^bar", foo.input(0));
+  ASSERT_EQ(foo.input_size(), 1);
+  EXPECT_EQ(foo.input(0), "^bar");
 
   foo.set_input(0, "bar");
   foo.add_input("bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(2, foo.input_size());
-  EXPECT_EQ("bar", foo.input(0));
-  EXPECT_EQ("bar", foo.input(1));
+  ASSERT_EQ(foo.input_size(), 2);
+  EXPECT_EQ(foo.input(0), "bar");
+  EXPECT_EQ(foo.input(1), "bar");
 
   foo.set_input(1, "^bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(1, foo.input_size());
-  EXPECT_EQ("bar", foo.input(0));
+  ASSERT_EQ(foo.input_size(), 1);
+  EXPECT_EQ(foo.input(0), "bar");
 
   foo.set_input(0, "^bar");
   foo.add_input("^bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(1, foo.input_size());
-  EXPECT_EQ("^bar", foo.input(0));
+  ASSERT_EQ(foo.input_size(), 1);
+  EXPECT_EQ(foo.input(0), "^bar");
 
   foo.set_input(0, "bar");
   foo.add_input("gnu");
   foo.add_input("^bar");
   foo.add_input("^gnu");
   DedupControlInputs(&foo);
-  EXPECT_EQ(2, foo.input_size());
-  EXPECT_EQ("bar", foo.input(0));
-  EXPECT_EQ("gnu", foo.input(1));
+  ASSERT_EQ(foo.input_size(), 2);
+  EXPECT_EQ(foo.input(0), "bar");
+  EXPECT_EQ(foo.input(1), "gnu");
 }
 
 TEST_F(UtilsTest, NumNonControlOutputs) {
@@ -347,14 +350,14 @@ TEST_F(UtilsTest, NumNonControlOutputs) {
   NodeMap node_map(&graph);
 
   const NodeDef* add_node = node_map.GetNode("add");
-  ASSERT_TRUE(add_node != nullptr);
+  ASSERT_NE(add_node, nullptr);
 
   // [a, b] are only non-control inputs
-  EXPECT_EQ(2, NumNonControlInputs(*add_node));
+  EXPECT_EQ(NumNonControlInputs(*add_node), 2);
   // [sqrt, shape] are non control outputs
-  EXPECT_EQ(2, NumNonControlOutputs(*add_node, node_map));
+  EXPECT_EQ(NumNonControlOutputs(*add_node, node_map), 2);
   // sqrt is the only data output
-  EXPECT_EQ(1, NumNonControlDataOutputs(*add_node, node_map));
+  EXPECT_EQ(NumNonControlDataOutputs(*add_node, node_map), 1);
 }
 
 TEST(CheckAttrExists, All) {
@@ -465,10 +468,104 @@ TEST_F(UtilsTest, SetTensorValueBFloat16IntMin) {
 }
 
 TEST_F(UtilsTest, TensorIdToString) {
-  EXPECT_EQ("^foo", TensorIdToString({"foo", -1}));
-  EXPECT_EQ("foo", TensorIdToString({"foo", 0}));
-  EXPECT_EQ("foo:1", TensorIdToString({"foo", 1}));
-  EXPECT_EQ("foo:2", TensorIdToString({"foo", 2}));
+  EXPECT_EQ(TensorIdToString({"foo", -1}), "^foo");
+  EXPECT_EQ(TensorIdToString({"foo", 0}), "foo");
+  EXPECT_EQ(TensorIdToString({"foo", 1}), "foo:1");
+  EXPECT_EQ(TensorIdToString({"foo", 2}), "foo:2");
+}
+
+template <typename T>
+void TestSetTensorValue(DataType type, int val, bool success,
+                        absl::string_view error_msg) {
+  Tensor t(type, TensorShape({}));
+  Status s = SetTensorValue(t.dtype(), val, &t);
+  EXPECT_EQ(s.ok(), success);
+  if (s.ok()) {
+    test::ExpectTensorEqual<T>(Tensor(static_cast<T>(val)), t);
+  } else {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+}
+
+TEST(SetTensorValueTest, Quantized) {
+  auto int_min_error = [](DataType type) {
+    return absl::Substitute(
+        "Cannot store value -2147483648 in tensor of type $0",
+        DataType_Name(type));
+  };
+  auto int_max_error = [](DataType type) {
+    return absl::Substitute(
+        "Cannot store value 2147483647 in tensor of type $0",
+        DataType_Name(type));
+  };
+  const int kMinInt = std::numeric_limits<int>::min();
+  const int kMaxInt = std::numeric_limits<int>::max();
+
+  TestSetTensorValue<qint8>(DT_QINT8, -8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, 0, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, 8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, std::numeric_limits<qint8>::min(),
+                            /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, std::numeric_limits<qint8>::max(),
+                            /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, kMinInt, /*success=*/false,
+                            int_min_error(DT_QINT8));
+  TestSetTensorValue<qint8>(DT_QINT8, kMaxInt, /*success=*/false,
+                            int_max_error(DT_QINT8));
+
+  TestSetTensorValue<quint8>(
+      DT_QUINT8, -8, /*success=*/false,
+      /*error_msg=*/"Cannot store value -8 in tensor of type DT_QUINT8");
+  TestSetTensorValue<quint8>(DT_QUINT8, 0, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint8>(DT_QUINT8, 8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint8>(DT_QUINT8, std::numeric_limits<quint8>::min(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint8>(DT_QUINT8, std::numeric_limits<quint8>::max(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint8>(DT_QUINT8, kMinInt, /*success=*/false,
+                             int_min_error(DT_QUINT8));
+  TestSetTensorValue<quint8>(DT_QUINT8, kMaxInt, /*success=*/false,
+                             int_max_error(DT_QUINT8));
+
+  TestSetTensorValue<qint16>(DT_QINT16, -8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, 0, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, 8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, std::numeric_limits<qint16>::min(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, std::numeric_limits<qint16>::max(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, kMinInt, /*success=*/false,
+                             int_min_error(DT_QINT16));
+  TestSetTensorValue<qint16>(DT_QINT16, kMaxInt, /*success=*/false,
+                             int_max_error(DT_QINT16));
+
+  TestSetTensorValue<quint16>(
+      DT_QUINT16, -8, /*success=*/false,
+      /*error_msg=*/"Cannot store value -8 in tensor of type DT_QUINT16");
+  TestSetTensorValue<quint16>(DT_QUINT16, 0, /*success=*/true,
+                              /*error_msg=*/"");
+  TestSetTensorValue<quint16>(DT_QUINT16, 8, /*success=*/true,
+                              /*error_msg=*/"");
+  TestSetTensorValue<quint16>(DT_QUINT16, std::numeric_limits<quint16>::min(),
+                              /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint16>(DT_QUINT16, std::numeric_limits<quint16>::max(),
+                              /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint16>(DT_QUINT16, kMinInt, /*success=*/false,
+                              int_min_error(DT_QUINT16));
+  TestSetTensorValue<quint16>(DT_QUINT16, kMaxInt, /*success=*/false,
+                              int_max_error(DT_QUINT16));
+
+  TestSetTensorValue<qint32>(DT_QINT32, -8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, 0, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, 8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, std::numeric_limits<qint32>::min(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, std::numeric_limits<qint32>::max(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, kMinInt, /*success=*/true,
+                             /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, kMaxInt, /*success=*/true,
+                             /*error_msg=*/"");
 }
 
 }  // namespace
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index b6a421b795c770eb305e3e370e1e59dc2b492be4..d176d3eca166faaf578e40246e1f98973ba82c30 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -33,6 +33,7 @@ load(
     "if_android",
     "if_not_windows",
     "tf_cc_binary",
+    "tf_cc_shared_object",
     "tf_cc_test",
     "tf_cc_test_mkl",
     "tf_cc_tests",
@@ -160,7 +161,6 @@ tf_kernel_library(
     name = "clustering_ops",
     prefix = "clustering_ops",
     deps = [
-        "//tensorflow/core:clustering_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib",
@@ -191,7 +191,6 @@ tf_kernel_library(
     ]),
     prefix = "collective_ops",
     deps = [
-        "//tensorflow/core:collective_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -276,7 +275,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ],
     alwayslink = 1,
@@ -367,7 +365,6 @@ tf_kernel_library(
         "//tensorflow/core/nccl:nccl_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:gpu_headers_lib",
-        "//tensorflow/core:nccl_ops_op_lib",
     ]),
 )
 
@@ -569,7 +566,6 @@ cc_library(
         ":concat_lib_hdrs",
         ":ops_util_hdrs",
         ":split_lib_hdrs",
-        "//tensorflow/core:batch_ops_op_lib",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels/batching_util:periodic_function_dynamic",
@@ -790,7 +786,6 @@ ARRAY_DEPS = [
     ":ops_util",
     ":transpose_functor",
     "//tensorflow/core:array_grad",
-    "//tensorflow/core:array_ops_op_lib",
     "//tensorflow/core:core_cpu",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
@@ -819,7 +814,6 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib",
-        "//tensorflow/core:set_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -1192,7 +1186,6 @@ tf_kernel_library(
     srcs = ["ragged_gather_op.cc"],
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_array_ops_op_lib",
     ],
 )
 
@@ -1204,7 +1197,6 @@ tf_cc_test(
         ":ops_testutil",
         ":ragged_gather_op",
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_array_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -1216,7 +1208,6 @@ tf_kernel_library(
     srcs = ["ragged_range_op.cc"],
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_math_ops_op_lib",
     ],
 )
 
@@ -1227,7 +1218,6 @@ tf_cc_test(
         ":ops_testutil",
         ":ragged_range_op",
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_math_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -1239,7 +1229,6 @@ tf_kernel_library(
     srcs = ["ragged_tensor_to_sparse_kernel.cc"],
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_conversion_ops_op_lib",
     ],
 )
 
@@ -1252,7 +1241,6 @@ tf_cc_test(
         ":ragged_tensor_to_sparse_kernel",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:ragged_conversion_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -1266,7 +1254,6 @@ tf_kernel_library(
     deps = [
         ":bounds_check_lib",
         ":gpu_util_hdrs",
-        "//tensorflow/core:cudnn_rnn_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -1342,7 +1329,6 @@ tf_cuda_cc_test(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -1867,7 +1853,6 @@ tf_kernel_library(
     prefix = "candidate_sampler_ops",
     deps = [
         ":range_sampler",
-        "//tensorflow/core:candidate_sampling_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ],
@@ -1901,7 +1886,6 @@ tf_kernel_library(
     name = "control_flow_ops",
     prefix = "control_flow_ops",
     deps = [
-        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ],
@@ -1913,7 +1897,6 @@ tf_kernel_library(
     deps = [
         ":bounds_check",
         ":ops_util",
-        "//tensorflow/core:ctc_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/util/ctc:ctc_beam_search_lib",
@@ -1994,7 +1977,6 @@ DATA_FLOW_DEPS = [
     ":typed_queue",
     "//third_party/eigen3",
     "//tensorflow/core:core_cpu",
-    "//tensorflow/core:data_flow_ops_op_lib",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
@@ -2059,7 +2041,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:scoped_allocator_ops_op_lib",
     ],
 )
 
@@ -2078,7 +2059,6 @@ tf_cuda_cc_test(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
@@ -2126,7 +2106,6 @@ tf_kernel_library(
 DYNAMIC_DEPS = [
     ":bounds_check",
     "//tensorflow/core:core_cpu",
-    "//tensorflow/core:data_flow_ops_op_lib",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
@@ -2159,7 +2138,6 @@ LOOKUP_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
-    "//tensorflow/core:lookup_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -2198,7 +2176,6 @@ tf_kernel_library(
     deps = [
         ":lookup_table_init_op",
         ":lookup_table_op",
-        "//tensorflow/core:checkpoint_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//third_party/eigen3",
@@ -2209,7 +2186,6 @@ tf_kernel_library(
     name = "load_and_remap_matrix_op",
     srcs = ["load_and_remap_matrix_op.cc"],
     deps = [
-        "//tensorflow/core:checkpoint_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -2354,7 +2330,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:resource_variable_ops_op_lib",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -2372,7 +2347,6 @@ tf_kernel_library(
         ":fill_functor",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:list_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -2383,7 +2357,6 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:user_ops_op_lib",
     ],
 )
 
@@ -2406,7 +2379,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//third_party/eigen3",
@@ -2419,7 +2391,6 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
@@ -2464,7 +2435,6 @@ IMAGE_DEPS = [
     "//third_party/eigen3",
     "//tensorflow/core:framework",
     "//tensorflow/core:gif_internal",
-    "//tensorflow/core:image_ops_op_lib",
     "//tensorflow/core:jpeg_internal",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
@@ -2805,7 +2775,6 @@ cc_library(
 IO_DEPS = [
     ":ops_util",
     "//tensorflow/core:framework",
-    "//tensorflow/core:io_ops_op_lib",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
     "//tensorflow/core:protos_all_cc",
@@ -2849,7 +2818,6 @@ SAVE_RESTORE_DEPS = [
     ":bounds_check_lib",
     ":save_restore_tensor",
     "//tensorflow/core:framework",
-    "//tensorflow/core:io_ops_op_lib",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
     "//tensorflow/core:protos_all_cc",
@@ -2937,6 +2905,7 @@ cc_library(
         ":self_adjoint_eig_op",
         ":self_adjoint_eig_v2_op",
         ":svd_op",
+        ":tridiagonal_solve_op",
     ],
 )
 
@@ -2968,7 +2937,6 @@ LINALG_DEPS = [
     "//third_party/eigen3",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:linalg_ops_op_lib",
 ] + if_cuda([
     ":cuda_solvers",
     ":transpose_functor",
@@ -3055,6 +3023,12 @@ tf_kernel_library(
     ]),
 )
 
+tf_kernel_library(
+    name = "tridiagonal_solve_op",
+    srcs = ["tridiagonal_solve_op.cc"],
+    deps = LINALG_DEPS,
+)
+
 tf_kernel_library(
     name = "qr_op",
     prefix = "qr_op",
@@ -3112,7 +3086,6 @@ LOGGING_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
-    "//tensorflow/core:logging_ops_op_lib",
     "//tensorflow/core:protos_all_cc",
 ]
 
@@ -3184,7 +3157,6 @@ tf_kernel_library(
         ":bounds_check",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:manip_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -3216,7 +3188,6 @@ MATH_DEPS = [
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
     "//tensorflow/core:math_grad",
-    "//tensorflow/core:math_ops_op_lib",
     "//third_party/eigen3",
 ]
 
@@ -3327,7 +3298,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "cwise_op",
     prefix = "cwise_op",
-    deps = MATH_DEPS + ["//tensorflow/core:bitwise_ops_op_lib"],
+    deps = MATH_DEPS,
 )
 
 tf_kernel_library(
@@ -3346,7 +3317,6 @@ tf_kernel_library(
     name = "fft_ops",
     prefix = "fft_ops",
     deps = MATH_DEPS + [
-        "//tensorflow/core:spectral_ops_op_lib",
     ] + if_cuda([
         "//tensorflow/core/platform/default/build_config:cufft_plugin",
     ]),
@@ -3553,10 +3523,7 @@ tf_cuda_cc_test(
         ":quantized_ops",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:client_session",
-        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
-        "//tensorflow/core:math_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -3788,12 +3755,12 @@ tf_kernel_library(
         ":image_resizer_state",
         ":fill_functor",
         ":ops_util",
+        "@com_google_absl//absl/base:dynamic_annotations",
         "@com_google_absl//absl/strings",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + select({
         ":xsmm_convolutions": [
             "@libxsmm_archive//:xsmm_avx",
@@ -3823,7 +3790,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + if_cuda([
         "@cub_archive//:cub",
         "@local_config_cuda//cuda:cudnn_header",
@@ -3843,7 +3809,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + if_cuda([
         "@local_config_cuda//cuda:cudnn_header",
     ]),
@@ -3890,9 +3855,8 @@ NN_DEPS = [
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
     "//tensorflow/core:nn_grad",
-    "//tensorflow/core:nn_ops_op_lib",
     "//third_party/eigen3",
-] + if_mkl(["//tensorflow/core:mkl_nn_ops_op_lib"])
+]
 
 tf_kernel_library(
     name = "batch_norm_op",
@@ -4028,7 +3992,6 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_grad",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + if_cuda(["@cub_archive//:cub"]),
 )
 
@@ -4137,7 +4100,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:stream_executor",
         "//third_party/eigen3",
     ],
@@ -4181,7 +4143,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -4263,7 +4224,6 @@ cc_library(
 PARSING_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:parsing_ops_op_lib",
     "//tensorflow/core:proto_text",
     "//tensorflow/core:protos_all_cc",
 ]
@@ -4332,7 +4292,6 @@ RANDOM_OPS_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
-    "//tensorflow/core:random_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -4382,7 +4341,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:stateful_random_ops_op_lib",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:variant",
     ],
@@ -4396,7 +4354,6 @@ tf_kernel_library(
         ":random_op",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:stateless_random_ops_op_lib",
     ],
 )
 
@@ -4411,8 +4368,6 @@ cc_library(
 REQUIRED_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:no_op_op_lib",
-    "//tensorflow/core:sendrecv_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -4473,7 +4428,6 @@ cc_library(
 SPARSE_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:sparse_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -4721,7 +4675,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:sdca_ops_op_lib",
         "//third_party/eigen3",
         "@farmhash_archive//:farmhash",
     ],
@@ -4761,7 +4714,6 @@ STATE_DEPS = [
     "//third_party/eigen3",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:state_ops_op_lib",
 ] + if_sycl(["//tensorflow/core:sycl_runtime"])
 
 tf_kernel_library(
@@ -4899,7 +4851,6 @@ STRING_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
-    "//tensorflow/core:string_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -5050,7 +5001,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:string_ops_op_lib",
         "//third_party/eigen3",
         "//third_party/icu/data:conversion_data",
         "@icu//:common",
@@ -5072,7 +5022,6 @@ tf_kernel_library(
         ":variable_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:training_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -5134,7 +5083,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:random_ops_op_lib",
     ],
 )
 
@@ -5162,7 +5110,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:random_ops_op_lib",
     ],
 )
 
@@ -5615,6 +5562,7 @@ filegroup(
         "gemm_functors.h",
         "image_resizer_state.h",
         "initializable_lookup_table.h",
+        "logging_ops.h",
         "lookup_table_init_op.h",
         "lookup_table_op.h",
         "lookup_util.h",
@@ -6069,12 +6017,9 @@ tf_kernel_library(
         ":ops_util",
         ":pooling_ops",
         ":quantization_utils",
-        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:math_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
         "@gemmlowp",
     ],
@@ -6560,6 +6505,30 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test_mkl(
+    name = "mkl_quantized_concat_op_test",
+    size = "small",
+    srcs = ["mkl_quantized_concat_op_test.cc"],
+    deps = [
+        ":mkl_concat_op",
+        ":ops_testutil",
+        ":ops_util",
+        ":quantization_utils",
+        ":quantized_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:mkl_array_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test(
     name = "quantized_batch_norm_op_test",
     size = "small",
@@ -6644,7 +6613,6 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:remote_fused_graph_ops_op_lib",
     ],
 )
 
@@ -6799,8 +6767,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
 
@@ -6808,6 +6774,7 @@ tf_cc_test_mkl(
     name = "mkl_conv_ops_test",
     size = "small",
     srcs = ["mkl_conv_ops_test.cc"],
+    linkstatic = 1,  # Fixes dyld error on MacOS.
     deps = [
         ":ops_testutil",
         ":ops_util",
@@ -6834,8 +6801,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
 
@@ -6850,8 +6815,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
 
@@ -6870,8 +6833,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
 
@@ -6885,8 +6846,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ] + mkl_deps(),
 )
@@ -6901,8 +6860,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ] + mkl_deps(),
 )
@@ -6960,6 +6917,65 @@ tf_mkl_kernel_library(
     deps = NN_DEPS + mkl_deps() + [":cwise_op"],
 )
 
+tf_mkl_kernel_library(
+    name = "mkl_requantize_ops",
+    srcs = [
+        "mkl_requantization_range_per_channel_op.cc",
+        "mkl_requantize_per_channel_op.cc",
+    ],
+    hdrs = [
+        "meta_support.h",
+        "no_op.h",
+        "reference_gemm.h",
+    ],
+    deps = if_mkl(
+        [
+            ":concat_lib_hdrs",
+            ":conv_ops",
+            ":cwise_op",
+            ":eigen_helpers",
+            ":image_resizer_state",
+            ":ops_util",
+            ":pooling_ops",
+            ":quantization_utils",
+            ":transpose_functor",
+            "//third_party/eigen3",
+            "@gemmlowp",
+            "@mkl_dnn",
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:lib",
+            "//third_party/mkl:intel_binary_blob",
+        ],
+    ),
+)
+
+tf_cc_test_mkl(
+    name = "mkl_requantize_ops_test",
+    size = "small",
+    srcs = ["mkl_requantize_ops_test.cc"],
+    linkstatic = 1,  # Fixes dyld error on MacOS.
+    deps = [
+        ":mkl_requantize_ops",
+        ":ops_testutil",
+        ":ops_util",
+        ":quantization_utils",
+        ":quantized_ops",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test_mkl(
     name = "mkl_fused_ops_test",
     size = "small",
@@ -7043,7 +7059,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:summary_ops_op_lib",
         "//tensorflow/core/lib/db:sqlite",
         "//tensorflow/core/summary:schema",
         "//tensorflow/core/summary:summary_db_writer",
@@ -7057,7 +7072,6 @@ tf_kernel_library(
         "decode_proto_op.cc",
     ],
     deps = [
-        "//tensorflow/core:decode_proto_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/util/proto:decode",
@@ -7072,7 +7086,6 @@ tf_kernel_library(
     name = "encode_proto_op",
     srcs = ["encode_proto_op.cc"],
     deps = [
-        "//tensorflow/core:encode_proto_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/util/proto:descriptors",
@@ -7090,7 +7103,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:rpc_ops_op_lib",
         "//tensorflow/core/util/rpc:call_container",
         "//tensorflow/core/util/rpc:rpc_factory",
         "//tensorflow/core/util/rpc:rpc_factory_registry",
@@ -7103,7 +7115,6 @@ tf_kernel_library(
     srcs = ["unicode_script_op.cc"],
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:string_ops_op_lib",
         "@icu//:common",
     ],
 )
@@ -7146,3 +7157,31 @@ cc_header_only_library(
         ":cwise_lib",
     ],
 )
+
+# Library to link with when compiling the quantize and dequantize kernels directly,
+# e.g. for selective registration.
+cc_header_only_library(
+    name = "quantize_and_dequantize_op_hdrs",
+    deps = [
+        ":quantize_and_dequantize_op",
+    ],
+)
+
+cc_library(
+    name = "kernel_platform_strings",
+    srcs = ["kernel_platform_strings.h"],
+    deps = [
+        "//tensorflow/core:platform_strings",
+    ],
+    alwayslink = 1,
+)
+
+# Shared object that links all the kernels TF needs.
+tf_cc_shared_object(
+    name = "libtfkernel_all_kernels.so",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":kernel_platform_strings",
+        "//tensorflow/core:all_kernels_impl",
+    ],
+)
diff --git a/tensorflow/core/kernels/adjust_saturation_op.cc b/tensorflow/core/kernels/adjust_saturation_op.cc
index 87d34fcfcc31c9f806754e9b1bc36430938d64c3..98264c4a1de75f7308c0b55e3d77a2dff88ebb49 100644
--- a/tensorflow/core/kernels/adjust_saturation_op.cc
+++ b/tensorflow/core/kernels/adjust_saturation_op.cc
@@ -193,8 +193,8 @@ class AdjustSaturationOp<CPUDevice, float> : public AdjustSaturationOpBase {
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
           kCostPerChannel,
-          [channel_count, &input_data, &output_data, scale_h](
-              int64 start_channel, int64 end_channel) {
+          [&input_data, &output_data, scale_h](int64 start_channel,
+                                               int64 end_channel) {
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
             for (int i = start_channel; i < end_channel; i++) {
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index d5bd36b4ceaa62f6c2f6928bbea704a0e6d01017..89d742c2dafcfd593f0166816d28ec65cb9ac9f9 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -247,7 +247,6 @@ class Barrier : public ResourceBase {
           keys = t[1];
           values.insert(values.begin(), t.begin() + 2, t.end());
           callback(indices, keys, values);
-          return;
         });
   }
 
@@ -509,7 +508,7 @@ class BarrierOpKernel : public AsyncOpKernel {
     Barrier* barrier = nullptr;
     OP_REQUIRES_OK_ASYNC(ctx, GetResourceFromContext(ctx, "handle", &barrier),
                          callback);
-    ComputeAsync(ctx, barrier, [this, callback, barrier]() {
+    ComputeAsync(ctx, barrier, [callback, barrier]() {
       barrier->Unref();
       callback();
     });
@@ -618,7 +617,6 @@ class TakeManyOp : public BarrierOpKernel {
             values_output.set(i, values[i]);
           }
           callback();
-          return;
         });
   }
 
diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
index 5ba461aa9de2a647962c653fb9ca0f199e9110be..338f61ff6642cbc604bb77dfe1908fe28b9fc142 100644
--- a/tensorflow/core/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -720,8 +720,7 @@ class BatchFunctionKernel : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
     BatchResource* br;
-    std::function<Status(BatchResource * *r)> creator = [this,
-                                                         c](BatchResource** r) {
+    std::function<Status(BatchResource**)> creator = [this](BatchResource** r) {
       std::unique_ptr<BatchResource> new_resource;
       TF_RETURN_IF_ERROR(
           BatchResource::Create(num_batch_threads_, max_batch_size_,
@@ -801,16 +800,15 @@ class BatchKernel : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
     BatchResource* br;
-    std::function<Status(BatchResource * *r)> creator =
-        [this](BatchResource** r) {
-          std::unique_ptr<BatchResource> new_resource;
-          TF_RETURN_IF_ERROR(BatchResource::Create(
-              num_batch_threads_, max_batch_size_, batch_timeout_micros_,
-              max_enqueued_batches_, allowed_batch_sizes_, kInvalidHandle,
-              &new_resource));
-          *r = new_resource.release();
-          return Status::OK();
-        };
+    std::function<Status(BatchResource**)> creator = [this](BatchResource** r) {
+      std::unique_ptr<BatchResource> new_resource;
+      TF_RETURN_IF_ERROR(BatchResource::Create(
+          num_batch_threads_, max_batch_size_, batch_timeout_micros_,
+          max_enqueued_batches_, allowed_batch_sizes_, kInvalidHandle,
+          &new_resource));
+      *r = new_resource.release();
+      return Status::OK();
+    };
     OP_REQUIRES_OK_ASYNC(c,
                          c->resource_manager()->LookupOrCreate(
                              container_, shared_name_, &br, creator),
@@ -1066,7 +1064,7 @@ class UnbatchKernel : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
     UnbatchResource* ubr;
-    std::function<Status(UnbatchResource * *r)> creator =
+    std::function<Status(UnbatchResource**)> creator =
         [this](UnbatchResource** r) {
           *r = new UnbatchResource(timeout_micros_);
           return Status::OK();
@@ -1252,8 +1250,8 @@ class UnbatchGradKernel : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
     UnbatchGradResource* ubr;
-    std::function<Status(UnbatchGradResource * *r)> creator =
-        [this](UnbatchGradResource** r) {
+    std::function<Status(UnbatchGradResource**)> creator =
+        [](UnbatchGradResource** r) {
           *r = new UnbatchGradResource();
           return Status::OK();
         };
diff --git a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
index 4e9bab3e21f9f240d32e78a1a489033a693caa73..3aa3bb84b9b973878127fd7db1f2d652f591a34d 100644
--- a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
+++ b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
@@ -52,6 +52,16 @@ message BucketizedSplit {
   // the rule feature <= threshold.
   int32 feature_id = 1;
   int32 threshold = 2;
+  // If feature column is multivalent, this holds the index of the dimension
+  // for the split. Defaults to 0.
+  int32 dimension_id = 5;
+  enum DefaultDirection {
+    // Left is the default direction.
+    DEFAULT_LEFT = 0;
+    DEFAULT_RIGHT = 1;
+  }
+  // default direction for missing values.
+  DefaultDirection default_direction = 6;
 
   // Node children indexing into a contiguous
   // vector of nodes starting from the root.
diff --git a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
index 4ae26fb95b1bb47db6a9462670df08f1bb4e171e..04d3359a90056ad3a4f0cd168f30601c75b59b19 100644
--- a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
@@ -113,8 +113,7 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
       output_tree_ids.setConstant(latest_tree);
       auto do_work = [&resource, &batch_bucketized_features, &cached_tree_ids,
                       &cached_node_ids, &output_partial_logits,
-                      &output_node_ids, batch_size,
-                      latest_tree](int32 start, int32 end) {
+                      &output_node_ids, latest_tree](int32 start, int32 end) {
         for (int32 i = start; i < end; ++i) {
           int32 tree_id = cached_tree_ids(i);
           int32 node_id = cached_node_ids(i);
@@ -129,7 +128,9 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
             // Logic in the loop adds the cached node value again if it is a
             // leaf. If it is not a leaf anymore we need to subtract the old
             // node's value. The following logic handles both of these cases.
-            partial_tree_logit -= resource->node_value(tree_id, node_id);
+            const auto& node_logits = resource->node_value(tree_id, node_id);
+            DCHECK_EQ(node_logits.size(), 1);
+            partial_tree_logit -= node_logits[0];
           } else {
             // No cache exists, start from the very first node.
             node_id = 0;
@@ -137,7 +138,9 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
           float partial_all_logit = 0.0;
           while (true) {
             if (resource->is_leaf(tree_id, node_id)) {
-              partial_tree_logit += resource->node_value(tree_id, node_id);
+              const auto& leaf_logits = resource->node_value(tree_id, node_id);
+              DCHECK_EQ(leaf_logits.size(), 1);
+              partial_tree_logit += leaf_logits[0];
 
               // Tree is done
               partial_all_logit +=
@@ -187,9 +190,6 @@ class BoostedTreesPredictOp : public OpKernel {
                                              &num_bucketized_features_));
     OP_REQUIRES_OK(context,
                    context->GetAttr("logits_dimension", &logits_dimension_));
-    OP_REQUIRES(context, logits_dimension_ == 1,
-                errors::InvalidArgument(
-                    "Currently only one dimensional outputs are supported."));
   }
 
   void Compute(OpKernelContext* const context) override {
@@ -225,18 +225,20 @@ class BoostedTreesPredictOp : public OpKernel {
     }
 
     const int32 last_tree = resource->num_trees() - 1;
-
     auto do_work = [&resource, &batch_bucketized_features, &output_logits,
-                    batch_size, last_tree](int32 start, int32 end) {
+                    last_tree, this](int32 start, int32 end) {
       for (int32 i = start; i < end; ++i) {
-        float tree_logit = 0.0;
+        std::vector<float> tree_logits(logits_dimension_, 0.0);
         int32 tree_id = 0;
         int32 node_id = 0;
         while (true) {
           if (resource->is_leaf(tree_id, node_id)) {
-            tree_logit += resource->GetTreeWeight(tree_id) *
-                          resource->node_value(tree_id, node_id);
-
+            const float tree_weight = resource->GetTreeWeight(tree_id);
+            const auto& leaf_logits = resource->node_value(tree_id, node_id);
+            DCHECK_EQ(leaf_logits.size(), logits_dimension_);
+            for (int32 j = 0; j < logits_dimension_; ++j) {
+              tree_logits[j] += tree_weight * leaf_logits[j];
+            }
             // Stop if it was the last tree.
             if (tree_id == last_tree) {
               break;
@@ -249,7 +251,9 @@ class BoostedTreesPredictOp : public OpKernel {
                                           batch_bucketized_features);
           }
         }
-        output_logits(i, 0) = tree_logit;
+        for (int32 j = 0; j < logits_dimension_; ++j) {
+          output_logits(i, j) = tree_logits[j];
+        }
       }
     };
     // 10 is the magic number. The actual number might depend on (the number of
@@ -329,13 +333,14 @@ class BoostedTreesExampleDebugOutputsOp : public OpKernel {
     // path. Note: feature_ids has one less value than logits_path because the
     // first value of each logit path will be the bias.
     auto do_work = [&resource, &batch_bucketized_features, &output_debug_info,
-                    batch_size, last_tree](int32 start, int32 end) {
+                    last_tree](int32 start, int32 end) {
       for (int32 i = start; i < end; ++i) {
         // Proto to store debug outputs, per example.
         boosted_trees::DebugOutput example_debug_info;
         // Initial bias prediction. E.g., prediction based off training mean.
-        float tree_logit =
-            resource->GetTreeWeight(0) * resource->node_value(0, 0);
+        const auto& tree_logits = resource->node_value(0, 0);
+        DCHECK_EQ(tree_logits.size(), 1);
+        float tree_logit = resource->GetTreeWeight(0) * tree_logits[0];
         example_debug_info.add_logits_path(tree_logit);
         int32 node_id = 0;
         int32 tree_id = 0;
@@ -358,8 +363,9 @@ class BoostedTreesExampleDebugOutputsOp : public OpKernel {
             // Get logit after split.
             node_id = resource->next_node(tree_id, node_id, i,
                                           batch_bucketized_features);
-            tree_logit = resource->GetTreeWeight(tree_id) *
-                         resource->node_value(tree_id, node_id);
+            const auto& tree_logits = resource->node_value(tree_id, node_id);
+            DCHECK_EQ(tree_logits.size(), 1);
+            tree_logit = resource->GetTreeWeight(tree_id) * tree_logits[0];
             // Output logit incorporates sum of leaf logits from prior trees.
             example_debug_info.add_logits_path(tree_logit + past_trees_logit);
           }
diff --git a/tensorflow/core/kernels/boosted_trees/resources.cc b/tensorflow/core/kernels/boosted_trees/resources.cc
index 42df4848815db7a097a70b4f1713fd42484be438..5ab9f97992ce5390b7bd2ae774d13c0ed06ee639 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.cc
+++ b/tensorflow/core/kernels/boosted_trees/resources.cc
@@ -82,15 +82,38 @@ int32 BoostedTreesEnsembleResource::next_node(
   return -1;
 }
 
-float BoostedTreesEnsembleResource::node_value(const int32 tree_id,
-                                               const int32 node_id) const {
+std::vector<float> BoostedTreesEnsembleResource::node_value(
+    const int32 tree_id, const int32 node_id) const {
   DCHECK_LT(tree_id, tree_ensemble_->trees_size());
   DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
   const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
   if (node.node_case() == boosted_trees::Node::kLeaf) {
-    return node.leaf().scalar();
+    // TODO(crawles): only use vector leaf even if # logits=1.
+    if (node.leaf().has_vector()) {
+      std::vector<float> leaf_values;
+      const auto& leaf_value_vector = node.leaf().vector();
+      const int size = leaf_value_vector.value_size();
+      leaf_values.reserve(size);
+      for (int i = 0; i < size; ++i) {
+        leaf_values.push_back(leaf_value_vector.value(i));
+      }
+      return leaf_values;
+    } else {
+      return {node.leaf().scalar()};
+    }
   } else {
-    return node.metadata().original_leaf().scalar();
+    if (node.metadata().original_leaf().has_vector()) {
+      std::vector<float> node_values;
+      const auto& leaf_value_vector = node.metadata().original_leaf().vector();
+      const int size = leaf_value_vector.value_size();
+      node_values.reserve(size);
+      for (int i = 0; i < size; ++i) {
+        node_values.push_back(leaf_value_vector.value(i));
+      }
+      return node_values;
+    } else {
+      return {node.metadata().original_leaf().scalar()};
+    }
   }
 }
 
@@ -452,15 +475,18 @@ void BoostedTreesEnsembleResource::RecursivelyDoPostPrunePreparation(
 
     // Change node back into leaf.
     *node->mutable_leaf() = node_metadata.original_leaf();
-    const float parent_value = node_value(tree_id, node_id);
+    const auto& parent_values = node_value(tree_id, node_id);
+    DCHECK_EQ(parent_values.size(), 1);
+    const float parent_value = parent_values[0];
 
     // Save the old values of weights of children.
     (*nodes_meta)[left_id].first = node_id;
-    (*nodes_meta)[left_id].second = parent_value - node_value(tree_id, left_id);
+    (*nodes_meta)[left_id].second =
+        parent_value - node_value(tree_id, left_id)[0];
 
     (*nodes_meta)[right_id].first = node_id;
     (*nodes_meta)[right_id].second =
-        parent_value - node_value(tree_id, right_id);
+        parent_value - node_value(tree_id, right_id)[0];
 
     // Clear gain for leaf node.
     node->clear_metadata();
diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
index 3c7b2df9b08a2b8912c43b2439e28f34a64b38ef..34a35f173c338964632b62536f21175137e9b371 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.h
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -68,7 +68,7 @@ class BoostedTreesEnsembleResource : public StampedResource {
       const int32 tree_id, const int32 node_id, const int32 index_in_batch,
       const std::vector<TTypes<int32>::ConstVec>& bucketized_features) const;
 
-  float node_value(const int32 tree_id, const int32 node_id) const;
+  std::vector<float> node_value(const int32 tree_id, const int32 node_id) const;
 
   void set_node_value(const int32 tree_id, const int32 node_id,
                       const float logits);
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
index 973cdec13a368ff95ae3185695507c62c173675c..7c025b34b982f410ac3585855a6e14f3b99f5e2f 100644
--- a/tensorflow/core/kernels/boosted_trees/training_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -288,7 +288,9 @@ class BoostedTreesCenterBiasOp : public OpKernel {
       ensemble_resource->AddNewTreeWithLogits(kLayerByLayerTreeWeight, logits);
       current_bias = logits;
     } else {
-      current_bias = ensemble_resource->node_value(0, 0);
+      const auto& current_biases = ensemble_resource->node_value(0, 0);
+      DCHECK_EQ(current_biases.size(), 1);
+      current_bias = current_biases[0];
       continue_centering =
           std::abs(logits / current_bias) > kMinDeltaForCenterBias;
       current_bias += logits;
diff --git a/tensorflow/core/kernels/broadcast_to_op.cc b/tensorflow/core/kernels/broadcast_to_op.cc
index 2810925bbcd645f60af0e6025a74043cd45f21e7..8c4341335fbae66249259b69a9693b8bcf6073f0 100644
--- a/tensorflow/core/kernels/broadcast_to_op.cc
+++ b/tensorflow/core/kernels/broadcast_to_op.cc
@@ -15,13 +15,17 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
 
+#include "tensorflow/core/kernels/broadcast_to_op.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/broadcast_to_op.h"
+#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 
@@ -43,12 +47,42 @@ class BroadcastToOp : public OpKernel {
     OP_REQUIRES_OK(ctx,
                    ctx->op_kernel().MakeShape(shape_tensor, &output_shape));
 
+    // Handle copy.
+    if (output_shape == input_shape) {
+      ctx->set_output(0, input_tensor);
+      return;
+    }
+
+    OP_REQUIRES(ctx, input_shape.dims() <= output_shape.dims(),
+                errors::InvalidArgument(
+                    "Rank of input (", input_shape.dims(),
+                    ") must be no greater than rank of output shape (",
+                    output_shape.dims(), ")."));
+
     Tensor* output_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));
-
-    const Device& d = ctx->eigen_device<Device>();
-    functor::BroadcastTo<Device, T>()(d, ctx, *output_tensor, output_shape,
-                                      input_tensor, input_shape);
+    // Handle empty case.
+    if (output_shape.num_elements() == 0) {
+      return;
+    }
+
+    // Handle broadcast from Scalar.
+    const Device& device = ctx->eigen_device<Device>();
+    if (input_shape.dims() == 0) {
+      functor::FillFunctor<Device, T>()(device, output_tensor->flat<T>(),
+                                        input_tensor.scalar<T>());
+      return;
+    }
+
+    BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(output_shape),
+                /*fewer_dims_optimization=*/true);
+    OP_REQUIRES(ctx, bcast.IsValid(),
+                errors::InvalidArgument(
+                    "Incompatible shapes: ", input_shape.DebugString(), " vs. ",
+                    output_shape.DebugString()));
+
+    functor::BroadcastTo<Device, T>()(device, ctx, *output_tensor, output_shape,
+                                      input_tensor, input_shape, bcast);
   }
 };
 
@@ -65,12 +99,12 @@ TF_CALL_ALL_TYPES(REGISTER_KERNEL);
 #if GOOGLE_CUDA
 
 namespace functor {
-#define DECLARE_GPU_TEMPLATE(Type)                              \
-  template <>                                                   \
-  void BroadcastTo<GPUDevice, Type>::operator()(                \
-      const GPUDevice& d, OpKernelContext* ctx, Tensor& output, \
-      const TensorShape& output_shape, const Tensor& input,     \
-      const TensorShape& input_shape);                          \
+#define DECLARE_GPU_TEMPLATE(Type)                               \
+  template <>                                                    \
+  void BroadcastTo<GPUDevice, Type>::operator()(                 \
+      const GPUDevice& d, OpKernelContext* ctx, Tensor& output,  \
+      const TensorShape& output_shape, const Tensor& input,      \
+      const TensorShape& input_shape, const BCast& bcast) const; \
   extern template struct BroadcastTo<GPUDevice, Type>;
 
 TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_TEMPLATE);
diff --git a/tensorflow/core/kernels/broadcast_to_op.h b/tensorflow/core/kernels/broadcast_to_op.h
index bc11c5f914bfcbcbbc4445cace7126717f3d8d2d..6ae860c2b2995f1a9bb5f47ad40b4546923801a6 100644
--- a/tensorflow/core/kernels/broadcast_to_op.h
+++ b/tensorflow/core/kernels/broadcast_to_op.h
@@ -23,196 +23,81 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 
 namespace functor {
 
-#define BROADCAST_SHAPE(NDIMS, input_shape, output_shape)                 \
-  auto reshape = AsEigenDSizesWithPrefix<NDIMS>(input_shape);             \
-  auto broadcast = output_shape.AsEigenDSizes<NDIMS>();                   \
-  auto reshape_32bit = AsEigenDSizesWithPrefix<NDIMS, int>(input_shape);  \
-  auto broadcast_32bit = output_shape.AsEigenDSizes<NDIMS, int>();        \
-  if (input_shape.dims() > 0) {                                           \
-    for (int i = 0; i < NDIMS; i++) {                                     \
-      if (reshape[i] != broadcast[i]) {                                   \
-        OP_REQUIRES(                                                      \
-            ctx, ((reshape[i] != 0) && (broadcast[i] % reshape[i] == 0)), \
-            errors::InvalidArgument("invalid shape to broadcast from ",   \
-                                    input_shape.DebugString(), " to ",    \
-                                    output_shape.DebugString()));         \
-        broadcast[i] = broadcast[i] / reshape[i];                         \
-      } else {                                                            \
-        broadcast[i] = 1;                                                 \
-      }                                                                   \
-      if (can_use_32bit) {                                                \
-        broadcast_32bit[i] = static_cast<int>(broadcast[i]);              \
-      }                                                                   \
-    }                                                                     \
+template <typename Device, typename T>
+struct BroadcastTo {
+  template <int NDIMS>
+  void DoBCast32Bit(const Device &device, typename TTypes<T, NDIMS>::Tensor out,
+                    typename TTypes<T, NDIMS>::ConstTensor in,
+                    const typename Eigen::array<int, NDIMS> &bcast) const {
+    To32Bit(out).device(device) = To32Bit(in).broadcast(bcast);
   }
 
-#define HANDLE_BROADCAST_FROM_SCALAR()                              \
-  if (std::is_same<Eigen::GpuDevice, Device>::value) {              \
-    FillFunctor<Device, T>()(d, output_tensor.flat<T>(),            \
-                             input_tensor.scalar<T>());             \
-  } else {                                                          \
-    output.device(d) = output.constant(input_tensor.scalar<T>()()); \
+  template <int NDIMS>
+  void DoBCast(
+      const Device &device, typename TTypes<T, NDIMS>::Tensor out,
+      typename TTypes<T, NDIMS>::ConstTensor in,
+      const typename Eigen::array<Eigen::DenseIndex, NDIMS> &bcast) const {
+    out.device(device) = in.broadcast(bcast);
   }
 
-#define HANDLE_BROADCAST_CASE(dim_i)                                        \
-  case dim_i: {                                                             \
-    if (can_use_32bit) {                                                    \
-      auto input = input_tensor.tensor<T, dim_i>();                         \
-      To32Bit(output).device(d) =                                           \
-          To32Bit(input).reshape(reshape_32bit).broadcast(broadcast_32bit); \
-    } else {                                                                \
-      auto input = input_tensor.tensor<T, dim_i>();                         \
-      output.device(d) = input.reshape(reshape).broadcast(broadcast);       \
-    }                                                                       \
-  } break
-
-template <typename Device, typename T>
-struct BroadcastTo {
-  void operator()(const Device &d, OpKernelContext *ctx, Tensor &output_tensor,
-                  const TensorShape &output_shape, const Tensor &input_tensor,
-                  const TensorShape &input_shape) {
-    if (output_shape.num_elements() == 0) {
-      return;
-    }
-    if (output_shape == input_shape) {
-      output_tensor.flat<T>().device(d) = input_tensor.flat<T>();
-      return;
-    }
-
+  template <int NDIMS>
+  void ReshapeAndBCast(const Device &device, Tensor &output_tensor,
+                       const Tensor &input_tensor, const BCast &bcast) const {
     const bool can_use_32bit = std::is_same<Eigen::GpuDevice, Device>::value &&
                                output_tensor.NumElements() < kint32max &&
                                input_tensor.NumElements() < kint32max;
+    if (can_use_32bit) {
+      DoBCast32Bit<NDIMS>(
+          device, output_tensor.template shaped<T, NDIMS>(bcast.result_shape()),
+          input_tensor.template shaped<T, NDIMS>(bcast.x_reshape()),
+          BCast::ToIndexArrayType<int, NDIMS>(bcast.x_bcast()));
+    } else {
+      DoBCast<NDIMS>(
+          device, output_tensor.template shaped<T, NDIMS>(bcast.result_shape()),
+          input_tensor.template shaped<T, NDIMS>(bcast.x_reshape()),
+          BCast::ToIndexArrayType<Eigen::DenseIndex, NDIMS>(bcast.x_bcast()));
+    }
+  }
 
-    switch (output_shape.dims()) {
-      case 0: {
-        if (input_shape.dims() > 0) {
-          ctx->CtxFailure(errors::InvalidArgument(
-              "invalid shape to broadcast from ", input_shape.DebugString(),
-              " to ", output_shape.DebugString()));
-          break;
-        }
-        output_tensor.scalar<T>().device(d) = input_tensor.scalar<T>();
+  // PRECONDITION: rank(input_shape) > 0 &&
+  //               rank(input_shape) <= rank(output_shape)  &&
+  //               output_shape.num_elements() > 0.
+  void operator()(const Device &device, OpKernelContext *ctx,
+                  Tensor &output_tensor, const TensorShape &output_shape,
+                  const Tensor &input_tensor, const TensorShape &input_shape,
+                  const BCast &bcast) const {
+    const int ndims = bcast.y_reshape().size();
+    switch (ndims) {
+      case 1:
+        ReshapeAndBCast<1>(device, output_tensor, input_tensor, bcast);
+        break;
+      case 2:
+        ReshapeAndBCast<2>(device, output_tensor, input_tensor, bcast);
+        break;
+      case 3:
+        ReshapeAndBCast<3>(device, output_tensor, input_tensor, bcast);
+        break;
+      case 4:
+        ReshapeAndBCast<4>(device, output_tensor, input_tensor, bcast);
+        break;
+      case 5:
+        ReshapeAndBCast<5>(device, output_tensor, input_tensor, bcast);
         break;
-      }
-      case 1: {
-        BROADCAST_SHAPE(1, input_shape, output_shape);
-
-        auto output = output_tensor.tensor<T, 1>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
-      case 2: {
-        BROADCAST_SHAPE(2, input_shape, output_shape);
-        auto output = output_tensor.tensor<T, 2>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-            HANDLE_BROADCAST_CASE(2);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
-      case 3: {
-        BROADCAST_SHAPE(3, input_shape, output_shape);
-        auto output = output_tensor.tensor<T, 3>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-            HANDLE_BROADCAST_CASE(2);
-            HANDLE_BROADCAST_CASE(3);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
-      case 4: {
-        BROADCAST_SHAPE(4, input_shape, output_shape);
-        auto output = output_tensor.tensor<T, 4>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-            HANDLE_BROADCAST_CASE(2);
-            HANDLE_BROADCAST_CASE(3);
-            HANDLE_BROADCAST_CASE(4);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
-      case 5: {
-        BROADCAST_SHAPE(5, input_shape, output_shape);
-        auto output = output_tensor.tensor<T, 5>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-            HANDLE_BROADCAST_CASE(2);
-            HANDLE_BROADCAST_CASE(3);
-            HANDLE_BROADCAST_CASE(4);
-            HANDLE_BROADCAST_CASE(5);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
       default:
-        ctx->CtxFailure(errors::InvalidArgument(
-            "invalid shape to broadcast from ", input_shape.DebugString(),
-            " to ", output_shape.DebugString()));
+        ctx->SetStatus(errors::Unimplemented(
+            "Broadcast between ", input_shape.DebugString(), " and ",
+            output_shape.DebugString(), " is not supported yet."));
         break;
     }
   }
-
- private:
-  template <int NDIMS, typename DimType = Eigen::DenseIndex>
-  Eigen::DSizes<DimType, NDIMS> AsEigenDSizesWithPrefix(
-      const TensorShape &shape) const {
-    Eigen::DSizes<DimType, NDIMS> dsizes;
-    for (int d = 0; d < NDIMS - shape.dims(); d++) {
-      dsizes[d] = 1;
-    }
-    for (int d = NDIMS - shape.dims(); d < NDIMS; d++) {
-      dsizes[d] =
-          static_cast<DimType>(shape.dim_size(d - (NDIMS - shape.dims())));
-    }
-    return dsizes;
-  }
 };
 
-#undef BROADCAST_SHAPE
-#undef HANDLE_BROADCAST_FROM_SCALAR
-#undef HANDLE_BROADCAST_CASE
-
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/collective_nccl_reducer.cc b/tensorflow/core/kernels/collective_nccl_reducer.cc
index 113f1487f0189aa351e5f89e37be116b0ac33ee5..c5e6f06c6578d1a6dc777b39e8e04aa963b5aecd 100644
--- a/tensorflow/core/kernels/collective_nccl_reducer.cc
+++ b/tensorflow/core/kernels/collective_nccl_reducer.cc
@@ -149,7 +149,9 @@ void NcclReducer::Run(StatusCallback done) {
           << col_params_->group.num_tasks << " current task "
           << col_params_->instance.task_names[col_params_->default_rank]
           << " num local devices " << num_local_devices
-          << " num global devices " << num_global_devices;
+          << " num global devices " << num_global_devices << " device "
+          << col_ctx_->device_name << " instance "
+          << col_params_->instance.instance_key;
   NcclManager::instance()->AddToAllReduce(
       std::move(participant),
       {nccl_collective_key, num_local_devices, num_global_devices,
diff --git a/tensorflow/core/kernels/collective_ops.cc b/tensorflow/core/kernels/collective_ops.cc
index 04e37e8edcf535a1d9435f527f877f1eb812dc8e..23356283bb52dc4ab7f61193211072e6f95fb1f4 100644
--- a/tensorflow/core/kernels/collective_ops.cc
+++ b/tensorflow/core/kernels/collective_ops.cc
@@ -43,6 +43,10 @@ class CollectiveOpKernel : public AsyncOpKernel {
       // Call in a blockable thread because it's not guaranteed that
       // this call cannot block.
       c->env()->SchedClosure([this, c, done, col_exec]() {
+        VLOG(1) << "CollectiveOpKernel CompleteParams for collective "
+                << col_params_.name << " device " << c->device()->name()
+                << " group " << col_params_.group.group_key << " instance "
+                << col_params_.instance.instance_key;
         col_exec->CompleteParamsAsync(
             c->device()->name(), &col_params_, c->cancellation_manager(),
             [this, c, done](const Status& s) {
@@ -64,6 +68,57 @@ class CollectiveOpKernel : public AsyncOpKernel {
   std::vector<int32> dependencies_;
 };
 
+class CollectiveGatherOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveGatherOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = GATHER_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+    OP_REQUIRES_OK(c, c->GetAttr("shape", &col_params_.instance.shape));
+    const NodeDef& real_node = c->def();
+    col_params_.name = strings::StrCat(real_node.name(), ": Gather");
+    col_params_.group.device_type = c->device_type();
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    // Allocate output on the first pass through this function.  This must be
+    // done immediately, while we're still in the executor thread.  Otherwise
+    // the memory is not guaranteed to be unused by any concurrently executing
+    // GPU kernel.
+    if (c->mutable_output(0) == nullptr) {
+      // Allocate the output tensor.
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK_ASYNC(
+          c, c->allocate_output(0, col_params_.instance.shape, &output), done);
+    }
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    auto actual_done = [c, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveGatherOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveGather").Device(DEVICE_CPU),
+                        CollectiveGatherOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveGather").Device(DEVICE_GPU),
+                        CollectiveGatherOpKernel);
+
 class CollectiveReduceOpKernel : public CollectiveOpKernel {
  public:
   explicit CollectiveReduceOpKernel(OpKernelConstruction* c)
@@ -149,10 +204,18 @@ class CollectiveReduceOpKernel : public CollectiveOpKernel {
       col_params_.instance.shape = c->input(0).shape();
     }
     if (!CanProceedWithCompute(c, col_exec, done)) return;
-    auto actual_done = [c, col_exec, done](const Status& s) {
+
+    int32 instance_key = col_params_.instance.instance_key;
+    auto actual_done = [c, instance_key, done](const Status& s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
+      VLOG(1) << "CollectiveReduceKernel ExecuteAsync done for device "
+              << c->device()->name() << " instance " << instance_key;
     };
+    VLOG(1) << "CollectiveReduceKernel ExecuteAsync start for collective "
+            << col_params_.name << " device " << c->device()->name()
+            << " group " << col_params_.group.group_key << " instance "
+            << instance_key;
     col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
   }
 
@@ -211,10 +274,17 @@ class CollectiveBcastSendOpKernel : public CollectiveOpKernel {
                          " does not match shape of input"),
         done);
 
-    auto actual_done = [c, col_exec, done](const Status& s) {
+    int32 instance_key = col_params_.instance.instance_key;
+    auto actual_done = [c, instance_key, done](const Status& s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
+      VLOG(1) << "CollectiveBcastSendOpKernel ExecuteAsync done for device "
+              << c->device()->name() << " instance " << instance_key;
     };
+    VLOG(1) << "CollectiveBcastSendOpKernel ExecuteAsync start for collective "
+            << col_params_.name << " device " << c->device()->name()
+            << " group " << col_params_.group.group_key << " instance "
+            << instance_key;
     col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
   }
 
@@ -266,10 +336,17 @@ class CollectiveBcastRecvOpKernel : public CollectiveOpKernel {
     }
     if (!CanProceedWithCompute(c, col_exec, done)) return;
 
-    auto actual_done = [c, col_exec, done](const Status& s) {
+    int32 instance_key = col_params_.instance.instance_key;
+    auto actual_done = [c, instance_key, done](const Status& s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
+      VLOG(1) << "CollectiveBcastRecvOpKernel ExecuteAsync done for device "
+              << c->device()->name() << " instance " << instance_key;
     };
+    VLOG(1) << "CollectiveBcastRecvOpKernel ExecuteAsync start for collective "
+            << col_params_.name << " device " << c->device()->name()
+            << " group " << col_params_.group.group_key << " instance "
+            << instance_key;
     col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
   }
 
diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index 161810d8cf933237f526768bc8f2a86bf37c8ffc..853d7c3133d48a5ec4690403eab16d2eaa776700 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -117,6 +117,7 @@ TF_CALL_complex64(REGISTER);
 TF_CALL_complex128(REGISTER);
 TF_CALL_int32(REGISTER);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER);
+TF_CALL_int16(REGISTER);
 TF_CALL_bfloat16(REGISTER);
 TF_CALL_bool(REGISTER);
 TF_CALL_uint8(REGISTER);
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index 1a9adfa7319e42aad0aaa4134b905685dc222038..ae828b5bf48a4b657211cdb7efd62559ad364037 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -203,6 +203,7 @@ TF_CALL_complex64(REGISTER_GPUCONCAT32);
 TF_CALL_complex128(REGISTER_GPUCONCAT32);
 TF_CALL_int32(REGISTER_GPUCONCAT32);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPUCONCAT32);
+TF_CALL_int16(REGISTER_GPUCONCAT32);
 TF_CALL_uint8(REGISTER_GPUCONCAT32);
 REGISTER_GPUCONCAT32(bfloat16);
 REGISTER_GPUCONCAT32(bool);
@@ -212,6 +213,7 @@ TF_CALL_complex64(REGISTER_GPUCONCAT64);
 TF_CALL_complex128(REGISTER_GPUCONCAT64);
 TF_CALL_int32(REGISTER_GPUCONCAT64);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPUCONCAT64);
+TF_CALL_int16(REGISTER_GPUCONCAT64);
 TF_CALL_uint8(REGISTER_GPUCONCAT64);
 REGISTER_GPUCONCAT64(bfloat16);
 REGISTER_GPUCONCAT64(bool);
@@ -221,6 +223,7 @@ TF_CALL_complex64(REGISTER_GPU32);
 TF_CALL_complex128(REGISTER_GPU32);
 TF_CALL_int32(REGISTER_GPU32);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPU32);
+TF_CALL_int16(REGISTER_GPU32);
 TF_CALL_uint8(REGISTER_GPU32);
 REGISTER_GPU32(bfloat16);
 REGISTER_GPU32(bool);
@@ -230,6 +233,7 @@ TF_CALL_complex64(REGISTER_GPU64);
 TF_CALL_complex128(REGISTER_GPU64);
 TF_CALL_int32(REGISTER_GPU64);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPU64);
+TF_CALL_int16(REGISTER_GPU64);
 TF_CALL_uint8(REGISTER_GPU64);
 REGISTER_GPU64(bfloat16);
 REGISTER_GPU64(bool);
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 5520cf7524b6a3bb0f3e2fe94760dd444c33fd44..a988f63a6c629303fe30abe994e8941b99ac5d4d 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
+#include "absl/base/dynamic_annotations.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -184,6 +185,80 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
 };
 #endif
 
+template <typename T>
+struct Conv2DCustomBackpropInputMatMulFunctor {
+  using MatrixMap = Eigen::Map<
+      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+  using ConstMatrixMap = Eigen::Map<
+      const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+
+  void operator()(OpKernelContext* ctx, const T* out_data, const T* filter_data,
+                  const int filter_total_size, const int output_image_size,
+                  const int dims_out_depth, T* im2col_buf) {
+    // Compute gradient into 'im2col_buf'.
+    MatrixMap C(im2col_buf, output_image_size, filter_total_size);
+
+    ConstMatrixMap A(out_data, output_image_size, dims_out_depth);
+    ConstMatrixMap B(filter_data, filter_total_size, dims_out_depth);
+
+    C.noalias() = A * B.transpose();
+  }
+};
+
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+template <>
+struct Conv2DCustomBackpropInputMatMulFunctor<float> {
+  using T = float;
+
+  void operator()(OpKernelContext* ctx, const T* out_data, const T* filter_data,
+                  const int filter_total_size, const int output_image_size,
+                  const int dims_out_depth, T* im2col_buf) {
+    // Inputs are in RowMajor order, we "cheat" by swapping the LHS and RHS:
+    //   RowMajor: C   = A   * B
+    //   ColMajor: C^T = B^T * A^T
+    //
+    // Dimension names:
+    //   out_image_size    -> ois
+    //   filter_total_size -> fts
+    //   dims_out_depth    -> dod
+    //
+    // RowMajor:
+    //   im2col      = out_data    * filter_data^T
+    //   [ois x fts] = [ois x dod] * [fts x dod]^T
+    //
+    // ColMajor:
+    //   im2col^T    = filter_data *  out_data^T
+    //   [fts x ois] = [fts x dod] * [dod x ois]*
+
+    const int m = filter_total_size;
+    const int n = output_image_size;
+    const int k = dims_out_depth;  // contraction dim
+
+    const char transposeA = 'T';  // sgemm(A) == filter_data
+    const char transposeB = 'N';  // sgemm(B) == out_data
+
+    const int ldA = dims_out_depth;
+    const int ldB = dims_out_depth;
+    const int ldC = filter_total_size;
+
+    const float alpha = 1.0;
+    const float beta = 0.0;
+
+    // mkldnn_sgemm code can't be instrumented with msan.
+    ANNOTATE_MEMORY_IS_INITIALIZED(
+        im2col_buf, filter_total_size * output_image_size * sizeof(T));
+
+    mkldnn_status_t st =
+        mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k, &alpha, filter_data,
+                     &ldA, out_data, &ldB, &beta, im2col_buf, &ldC);
+
+    OP_REQUIRES(
+        ctx, st == 0,
+        errors::Internal("Failed to call mkldnn_sgemm. Error code: ", st));
+  }
+};
+#endif
+
 // Based on implementation written by Yangqing Jia (jiayq).
 template <typename Device, class T>
 class Conv2DCustomBackpropInputOp : public OpKernel {
@@ -417,21 +492,14 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
         input_backprop_data += input_offset;
       }
     } else {
-      typedef Eigen::Map<
-          Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
-          MatrixMap;
-      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
-                                             Eigen::RowMajor>>
-          ConstMatrixMap;
-
       for (int image_id = 0; image_id < dims.batch_size;
            image_id += shard_size) {
         const int shard_limit =
             std::min(static_cast<int>(shard_size),
                      static_cast<int>(dims.batch_size) - image_id);
 
-        auto shard = [&dims, &pad_top, &pad_left, &pad_bottom, &pad_right,
-                      &output_image_size, &filter_total_size,
+        auto shard = [&context, &dims, &pad_top, &pad_left, &pad_bottom,
+                      &pad_right, &output_image_size, &filter_total_size,
                       &input_backprop_data, &col_buffer_data,
                       &out_backprop_data, &filter_data, &input_offset,
                       &output_offset, &size_C](int64 start, int64 limit) {
@@ -440,13 +508,9 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
             T* input_data = input_backprop_data + shard_id * input_offset;
             const T* out_data = out_backprop_data + shard_id * output_offset;
 
-            // Compute gradient into 'im2col_buf'.
-            MatrixMap C(im2col_buf, output_image_size, filter_total_size);
-
-            ConstMatrixMap A(out_data, output_image_size, dims.out_depth);
-            ConstMatrixMap B(filter_data, filter_total_size, dims.out_depth);
-
-            C.noalias() = A * B.transpose();
+            Conv2DCustomBackpropInputMatMulFunctor<T>()(
+                context, out_data, filter_data, filter_total_size,
+                output_image_size, dims.out_depth, im2col_buf);
 
             Col2im<T>(im2col_buf, dims.in_depth,
                       dims.spatial_dims[0].input_size,
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc
index 0fd7550830333f749312f5db54d3ffd6ffa22a4a..9ceb51062e832a2e59455d71a0115e98896ef276 100644
--- a/tensorflow/core/kernels/conv_grad_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_ops.cc
@@ -78,9 +78,6 @@ Status ConvBackpropExtractAndVerifyDimension(
         " stride: ", dim->stride, " dilation: ", dim->dilation);
   }
 
-  // TODO(reedwm): Correctly handle explicit padding here. The rest of the
-  // fields set on 'dim' are only used in XLA. TensorFlow ops do not yet support
-  // explicit padding for XLA.
   int64 effective_filter_size = (dim->filter_size - 1) * dim->dilation + 1;
   dim->expanded_output_size = (dim->output_size - 1) * dim->stride + 1;
   const auto padded_out_size = dim->input_size + effective_filter_size - 1;
@@ -102,7 +99,7 @@ Status ConvBackpropComputeDimensionsV2(
     StringPiece label, int num_spatial_dims, const TensorShape& input_shape,
     const TensorShape& filter_shape, const TensorShape& out_backprop_shape,
     const gtl::ArraySlice<int32>& dilations, const std::vector<int32>& strides,
-    Padding padding, const std::vector<int64>& explicit_paddings,
+    Padding padding, absl::Span<const int64> explicit_paddings,
     TensorFormat data_format, ConvBackpropDimensions* dims) {
   // The + 2 in the following line is for the batch and feature dimensions.
   const int num_dims = num_spatial_dims + 2;
diff --git a/tensorflow/core/kernels/conv_grad_ops.h b/tensorflow/core/kernels/conv_grad_ops.h
index c8e8cf28c55e266575738dfe9ef65d588dd0dd2f..173f92806f911edf6dca043510b1fd9b36a0a66f 100644
--- a/tensorflow/core/kernels/conv_grad_ops.h
+++ b/tensorflow/core/kernels/conv_grad_ops.h
@@ -222,7 +222,7 @@ struct ConvBackpropSpatialDimension {
   int64 stride;
   int64 dilation;
 
-  // The following fields are valid only if the padding is not EXPLICIT.
+  // Output size after scaling by the stride.
   int64 expanded_output_size;
 
   // Number of padding elements to be added before/after this dimension of
@@ -270,7 +270,7 @@ Status ConvBackpropComputeDimensionsV2(
     StringPiece label, int num_spatial_dims, const TensorShape& input_shape,
     const TensorShape& filter_shape, const TensorShape& out_backprop_shape,
     const gtl::ArraySlice<int32>& dilations, const std::vector<int32>& strides,
-    Padding padding, const std::vector<int64>& explicit_paddings,
+    Padding padding, absl::Span<const int64> explicit_paddings,
     TensorFormat data_format, ConvBackpropDimensions* dims);
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h
index 105a4b1b825e304175d62c1723aeb46154b46a96..ccd24fcdd4c5e4945f2daf6461727e6038b4dd32 100644
--- a/tensorflow/core/kernels/conv_ops.h
+++ b/tensorflow/core/kernels/conv_ops.h
@@ -110,7 +110,7 @@ Status InitConv2DParameters(const OpKernelConstruction* context,
 
 // Computes and validates convolutions dimensions from Conv2D parameters. If
 // parameters are valid, dimensions will be updated with derived convolution
-// dimensions, otherwise error will be returned.
+// dimensions, otherwise an error will be returned.
 Status ComputeConv2DDimension(const Conv2DParameters& params,
                               const Tensor& input, const Tensor& filter,
                               Conv2DDimensions* dimensions);
diff --git a/tensorflow/core/kernels/crop_and_resize_op_test.cc b/tensorflow/core/kernels/crop_and_resize_op_test.cc
index 6921020d09e94fa7b99d7ca6cb95c82274b2e4c0..0eadf4c1714f6987f0a91c153f59d56ce0254014 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_test.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_test.cc
@@ -423,7 +423,7 @@ TEST_F(CropAndResizeOpTest, TestWithSharding) {
   //  ... (altogether 999 lines)
   //  0, 1, 2, ..., 998
   AddInput<float>(TensorShape({1, kLength, kLength, 1}),
-                  [kLength](int i) -> float { return i % kLength; });
+                  [](int i) -> float { return i % kLength; });
   AddInputFromArray<float>(TensorShape({2, 4}),
                            {0, 0, 0.5, 0.5, 0.5, 0.5, 1, 1});
   AddInputFromArray<int32>(TensorShape({2}), {0, 0});
@@ -437,15 +437,15 @@ TEST_F(CropAndResizeOpTest, TestWithSharding) {
   //  ... (altogether 500 lines)
   //  0, 1, 2, ..., 499
   Tensor result1(allocator(), DT_FLOAT, TensorShape({1, kHalf, kHalf, 1}));
-  test::FillFn<float>(&result1, [kHalf](int i) -> float { return i % kHalf; });
+  test::FillFn<float>(&result1, [](int i) -> float { return i % kHalf; });
 
   // Result 2:
   //  499, 500, 501, ..., 998
   //  ... (altogether 500 lines)
   //  499, 500, 501, ..., 998
   Tensor result2(allocator(), DT_FLOAT, TensorShape({1, kHalf, kHalf, 1}));
-  test::FillFn<float>(
-      &result2, [kHalf](int i) -> float { return i % kHalf + kHalf - 1; });
+  test::FillFn<float>(&result2,
+                      [](int i) -> float { return i % kHalf + kHalf - 1; });
 
   // Expected result is the concat of the two tensors.
   Tensor expected(allocator(), DT_FLOAT, TensorShape({2, kHalf, kHalf, 1}));
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
index 39d0a998fdcfe0710af97e404e142955e57a7c2b..82d92388d401af176d6a555f4f0e51af84caef11 100644
--- a/tensorflow/core/kernels/cuda_solvers.cc
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -643,6 +643,50 @@ static inline Status GesvdImpl(
 
 TF_CALL_LAPACK_TYPES_NO_COMPLEX(GESVD_INSTANCE);
 
+template <typename Scalar, typename BufSizeFnT, typename SolverFnT>
+static inline Status GesvdjBatchedImpl(BufSizeFnT bufsize, SolverFnT solver,
+                                       CudaSolver* cuda_solver,
+                                       OpKernelContext* context,
+                                       cusolverDnHandle_t cusolver_dn_handle,
+                                       cusolverEigMode_t jobz, int m, int n,
+                                       Scalar* A, int lda, Scalar* S, Scalar* U,
+                                       int ldu, Scalar* V, int ldv,
+                                       int* dev_lapack_info, int batch_size) {
+  mutex_lock lock(handle_map_mutex);
+  /* Get amount of workspace memory required. */
+  int lwork;
+  /* Default parameters for gesvdj and gesvdjBatched. */
+  gesvdjInfo_t svdj_info;
+  TF_RETURN_IF_CUSOLVER_ERROR(cusolverDnCreateGesvdjInfo(&svdj_info));
+  TF_RETURN_IF_CUSOLVER_ERROR(bufsize(
+      cusolver_dn_handle, jobz, m, n, CUDAComplex(A), lda, S, CUDAComplex(U),
+      ldu, CUDAComplex(V), ldv, &lwork, svdj_info, batch_size));
+  /* Allocate device memory for workspace. */
+  auto dev_workspace =
+      cuda_solver->GetScratchSpace<Scalar>(lwork, "", /* on_host */ false);
+  TF_RETURN_IF_CUSOLVER_ERROR(solver(
+      cusolver_dn_handle, jobz, m, n, CUDAComplex(A), lda, S, CUDAComplex(U),
+      ldu, CUDAComplex(V), ldv, CUDAComplex(dev_workspace.mutable_data()),
+      lwork, dev_lapack_info, svdj_info, batch_size));
+  TF_RETURN_IF_CUSOLVER_ERROR(cusolverDnDestroyGesvdjInfo(svdj_info));
+  return Status::OK();
+}
+
+#define GESVDJBATCHED_INSTANCE(Scalar, type_prefix)                            \
+  template <>                                                                  \
+  Status CudaSolver::GesvdjBatched<Scalar>(                                    \
+      cusolverEigMode_t jobz, int m, int n, Scalar* dev_A, int lda,            \
+      Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_V, int ldv,           \
+      int* dev_lapack_info, int batch_size) {                                  \
+    return GesvdjBatchedImpl(DN_BUFSIZE_FN(gesvdjBatched, type_prefix),        \
+                             DN_SOLVER_FN(gesvdjBatched, type_prefix), this,   \
+                             context_, cusolver_dn_handle_, jobz, m, n, dev_A, \
+                             lda, dev_S, dev_U, ldu, dev_V, ldv,               \
+                             dev_lapack_info, batch_size);                     \
+  }
+
+TF_CALL_LAPACK_TYPES_NO_COMPLEX(GESVDJBATCHED_INSTANCE);
+
 //=============================================================================
 // Wrappers of cuBlas computational methods begin here.
 //
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index 1fc344731c28df2e2d4cb9e931accfc0ca4592ed..fa8b4e241556afef82537db118706ebd35539987 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -312,6 +312,11 @@ class CudaSolver {
   Status Gesvd(signed char jobu, signed char jobvt, int m, int n, Scalar* dev_A,
                int lda, Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_VT,
                int ldvt, int* dev_lapack_info) TF_MUST_USE_RESULT;
+  template <typename Scalar>
+  Status GesvdjBatched(cusolverEigMode_t jobz, int m, int n, Scalar* dev_A,
+                       int lda, Scalar* dev_S, Scalar* dev_U, int ldu,
+                       Scalar* dev_V, int ldv, int* dev_lapack_info,
+                       int batch_size);
 
  private:
   OpKernelContext* context_;  // not owned.
diff --git a/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc
index 539f07b0d68321a0f9a33b76aca78bd9e38ce6e9..f4059b2b137ae16dfeed199aae26895f74d39133 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc
@@ -21,6 +21,7 @@ namespace tensorflow {
 namespace functor {
 DEFINE_BINARY11(mul, Eigen::half, float, double, uint8, int8, uint16, int16,
                 int32, int64, complex64, complex128);
+DEFINE_BINARY2(mul_no_nan, float, double);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc
index cff0407b83a4bafd27573325615322f92e594d46..93d6e38f54ee1867c0ea8c8e673c22925b4d607d 100644
--- a/tensorflow/core/kernels/cwise_op_mul_1.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_1.cc
@@ -19,6 +19,8 @@ namespace tensorflow {
 
 REGISTER6(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
           int32, bfloat16);
+REGISTER2(BinaryOp, CPU, "MulNoNan", functor::mul_no_nan, float, double);
+
 #if defined(__ANDROID_TYPES_SLIM__)
 // We only register the first type when we have multi-argument calls in the
 // case where we're trying to reduce executable size, but it turns out that the
@@ -39,6 +41,7 @@ REGISTER_KERNEL_BUILDER(Name("Mul")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::mul<int32>>);
+REGISTER2(BinaryOp, GPU, "MulNoNan", functor::mul_no_nan, float, double);
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index b03ee937ca85f82b41dddc5bab9aa0d7764d4b7a..a6a6a7c74f9d1fc1361546148b9425b87fc48698 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -147,24 +147,49 @@ struct functor_traits<safe_div_or_mod_op<T, DivOrMod>> {
   };
 };
 
-template <typename T>
-struct div_no_nan_op {
-  EIGEN_EMPTY_STRUCT_CTOR(div_no_nan_op)
+template <typename T, typename Binary>
+struct no_nan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(no_nan_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a,
                                                            const T& b) const {
     if (b != 0) {
-      return scalar_quotient_op<T>()(a, b);
+      return Binary()(a, b);
     } else {
-      return 0;
+      return T(0);
     }
   }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& a, const Packet& b) const {
+    const Packet mask = pcmp_eq(b, pzero(b));
+    const Packet quotient = Binary().packetOp(a, b);
+    return pandnot(quotient, mask);
+  }
+};
+
+template <typename T>
+struct div_no_nan_op : public no_nan_op<T, scalar_quotient_op<T>> {
+  EIGEN_EMPTY_STRUCT_CTOR(div_no_nan_op)
 };
 
 template <typename T>
 struct functor_traits<div_no_nan_op<T>> {
   enum {
     Cost = functor_traits<scalar_quotient_op<T>>::Cost + NumTraits<T>::AddCost,
-    PacketAccess = false,
+    PacketAccess = true,
+  };
+};
+
+template <typename T>
+struct mul_no_nan_op : public no_nan_op<T, scalar_product_op<T>> {
+  EIGEN_EMPTY_STRUCT_CTOR(mul_no_nan_op)
+};
+
+template <typename T>
+struct functor_traits<mul_no_nan_op<T>> {
+  enum {
+    Cost = functor_traits<scalar_product_op<T>>::Cost + NumTraits<T>::AddCost,
+    PacketAccess = true,
   };
 };
 
@@ -791,6 +816,9 @@ struct mul : base<T, Eigen::internal::scalar_product_op<T>> {
   static const bool use_bcast_optimization = true;
 };
 
+template <typename T>
+struct mul_no_nan : base<T, Eigen::internal::mul_no_nan_op<T>> {};
+
 template <typename T>
 struct div : base<T, Eigen::internal::scalar_quotient_op<T>> {};
 
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index cd6803fb3faa30a82a9951993d7eb9e8c455e6f8..d22bf0dc57cd54b04d7178eb8bf557a8c25c549e 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -143,6 +143,17 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "shard_dataset_op",
+    srcs = ["shard_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 tf_kernel_library(
     name = "window_dataset_op",
     srcs = ["window_dataset_op.cc"],
@@ -252,7 +263,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:metrics",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -310,7 +320,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:metrics",
     ],
 )
 
@@ -344,7 +353,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:metrics",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -432,6 +440,25 @@ tf_kernel_library(
     ],
 )
 
+tf_cc_test(
+    name = "sparse_tensor_slice_dataset_op_test",
+    size = "small",
+    srcs = ["sparse_tensor_slice_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":sparse_tensor_slice_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "tensor_dataset_op",
     srcs = ["tensor_dataset_op.cc"],
@@ -446,12 +473,32 @@ tf_kernel_library(
     name = "tensor_slice_dataset_op",
     srcs = ["tensor_slice_dataset_op.cc"],
     deps = [
+        ":dataset_utils",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
     ],
 )
 
+tf_cc_test(
+    name = "tensor_slice_dataset_op_test",
+    size = "small",
+    srcs = ["tensor_slice_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":tensor_slice_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "zip_dataset_op",
     srcs = ["zip_dataset_op.cc"],
@@ -478,6 +525,7 @@ tf_kernel_library(
     name = "reader_dataset_ops",
     srcs = ["reader_dataset_ops.cc"],
     deps = [
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -553,6 +601,7 @@ cc_library(
     srcs = ["graph_rewrite_dataset.cc"],
     hdrs = ["graph_rewrite_dataset.h"],
     deps = [
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
@@ -589,7 +638,6 @@ tf_kernel_library(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:metrics",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -633,6 +681,7 @@ tf_kernel_library(
         ":range_dataset_op",
         ":reader_dataset_ops",
         ":repeat_dataset_op",
+        ":shard_dataset_op",
         ":shuffle_dataset_op",
         ":skip_dataset_op",
         ":sparse_tensor_slice_dataset_op",
diff --git a/tensorflow/core/kernels/data/dataset_test_base.cc b/tensorflow/core/kernels/data/dataset_test_base.cc
index 36a862ff00ef61b2ebdb3c4815b388f8a426b0bf..e163dbaa3d7f64d636d249be294b70f3704e8c31 100644
--- a/tensorflow/core/kernels/data/dataset_test_base.cc
+++ b/tensorflow/core/kernels/data/dataset_test_base.cc
@@ -18,6 +18,23 @@ limitations under the License.
 namespace tensorflow {
 namespace data {
 
+Status DatasetOpsTestBase::ExpectEqual(const Tensor& a, const Tensor& b) {
+  EXPECT_EQ(a.dtype(), b.dtype());
+  switch (a.dtype()) {
+#define CASE(type)                       \
+  case DataTypeToEnum<type>::value:      \
+    test::ExpectTensorEqual<type>(a, b); \
+    break;
+    TF_CALL_NUMBER_TYPES(CASE);
+    TF_CALL_string(CASE);
+    // TODO(feihugis): figure out how to support variant tensors.
+#undef CASE
+    default:
+      return errors::Internal("Unsupported dtype", a.dtype());
+  }
+  return Status::OK();
+}
+
 Status DatasetOpsTestBase::CreateOpKernel(
     const NodeDef& node_def, std::unique_ptr<OpKernel>* op_kernel) {
   Status status;
diff --git a/tensorflow/core/kernels/data/dataset_test_base.h b/tensorflow/core/kernels/data/dataset_test_base.h
index ee5008318ce96533782bd4f872b884cf38081889..9c3a62c693028c064f19948ee782bb0bc7c2ccf9 100644
--- a/tensorflow/core/kernels/data/dataset_test_base.h
+++ b/tensorflow/core/kernels/data/dataset_test_base.h
@@ -47,6 +47,19 @@ class DatasetOpsTestBase : public ::testing::Test {
 
   ~DatasetOpsTestBase() {}
 
+  // The method validates whether the two tensors have the same shape, dtype,
+  // and value.
+  static Status ExpectEqual(const Tensor& a, const Tensor& b);
+
+  // Creates a tensor with the specified dtype, shape, and value.
+  template <typename T>
+  static Tensor CreateTensor(TensorShape input_shape,
+                             const gtl::ArraySlice<T>& input_data) {
+    Tensor tensor(DataTypeToEnum<T>::value, input_shape);
+    test::FillValues<T>(&tensor, input_data);
+    return tensor;
+  }
+
   // Creates a new op kernel based on the node definition.
   Status CreateOpKernel(const NodeDef& node_def,
                         std::unique_ptr<OpKernel>* op_kernel);
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index 5a6c8bac08dc0647b8bb5c935bf5e0bec17e9522..9def8c916186dc5c7d2b260a9df8e0430d9aa43a 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -231,5 +231,35 @@ Status VariantTensorDataWriter::WriteTensorInternal(StringPiece key,
   return Status::OK();
 }
 
+Status AddToFunctionLibrary(FunctionLibraryDefinition* base,
+                            const FunctionLibraryDefinition& to_add) {
+  for (const auto& fn : to_add.ListFunctionNames()) {
+    if (auto found = base->Find(fn)) {
+      if (!OpDefEqual(found->signature(), to_add.Find(fn)->signature())) {
+        return errors::InvalidArgument("Cannot add function '", fn,
+                                       "' because a different function with "
+                                       "the same signature already exists.");
+      }
+      TF_RETURN_IF_ERROR(base->RemoveFunction(fn));
+    }
+  }
+  return base->AddLibrary(to_add);
+}
+
+Status AddToFunctionLibrary(FunctionLibraryDefinition* base,
+                            const FunctionDefLibrary& to_add) {
+  for (const auto& fd : to_add.function()) {
+    if (auto found = base->Find(fd.signature().name())) {
+      if (!OpDefEqual(found->signature(), fd.signature())) {
+        return errors::InvalidArgument("Cannot add function '",
+                                       fd.signature().name(),
+                                       "' because a different function with "
+                                       "the same signature already exists.");
+      }
+      TF_RETURN_IF_ERROR(base->RemoveFunction(fd.signature().name()));
+    }
+  }
+  return base->AddLibrary(to_add);
+}
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index 0754d1d266f83f0b2490e611d69c969d0a8e9a40..d85e87ca098bece338aed6c1addc0f52030bdcc6 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -98,6 +98,13 @@ class VariantTensorDataWriter : public IteratorStateWriter {
   std::vector<string> keys_;
 };
 
+// Adds the functions in `to_add` to `base`. If a function with a matching
+// signature already exists in `base`, replaces it with the function from
+// `to_add`.
+Status AddToFunctionLibrary(FunctionLibraryDefinition* base,
+                            const FunctionLibraryDefinition& to_add);
+Status AddToFunctionLibrary(FunctionLibraryDefinition* base,
+                            const FunctionDefLibrary& to_add);
 }  // namespace data
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/data/dataset_utils_test.cc b/tensorflow/core/kernels/data/dataset_utils_test.cc
index 290399b3e257117438298a9c8184e559c03e76fe..bddd2d455e55f6e2656953034a27eb351128f534 100644
--- a/tensorflow/core/kernels/data/dataset_utils_test.cc
+++ b/tensorflow/core/kernels/data/dataset_utils_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -77,6 +79,90 @@ TEST(DatasetUtilsTest, VariantTensorDataNonExistentKey) {
             reader.ReadTensor("NonExistentKey", &val_tensor).code());
 }
 
+TEST(DatasetUtilsTest, AddToFunctionLibrary) {
+  auto make_fn_a = [](const string& fn_name) {
+    return FunctionDefHelper::Create(
+        /*function_name=*/fn_name,
+        /*in_def=*/{"arg: int64"},
+        /*out_def=*/{"ret: int64"},
+        /*attr_def=*/{},
+        /*node_def=*/{{{"node"}, "Identity", {"arg"}, {{"T", DT_INT64}}}},
+        /*ret_def=*/{{"ret", "node:output:0"}});
+  };
+
+  auto make_fn_b = [](const string& fn_name) {
+    return FunctionDefHelper::Create(
+        /*function_name=*/fn_name,
+        /*in_def=*/{"arg: int64"},
+        /*out_def=*/{"ret: int64"},
+        /*attr_def=*/{},
+        /*node_def=*/
+        {{{"node"}, "Identity", {"arg"}, {{"T", DT_INT64}}},
+         {{"node2"}, "Identity", {"node:output:0"}, {{"T", DT_INT64}}}},
+        /*ret_def=*/{{"ret", "node2:output:0"}});
+  };
+
+  FunctionDefLibrary fdef_base;
+  *fdef_base.add_function() = make_fn_a("0");
+  *fdef_base.add_function() = make_fn_a("1");
+  *fdef_base.add_function() = make_fn_a("2");
+
+  FunctionDefLibrary fdef_to_add;
+  *fdef_to_add.add_function() = make_fn_b("0");  // Override
+  *fdef_to_add.add_function() = make_fn_a("1");  // Do nothing
+  *fdef_to_add.add_function() = make_fn_b("3");  // Add new function
+
+  FunctionLibraryDefinition flib_0(OpRegistry::Global(), fdef_base);
+  TF_ASSERT_OK(AddToFunctionLibrary(&flib_0, fdef_to_add));
+
+  FunctionLibraryDefinition flib_1(OpRegistry::Global(), fdef_base);
+  FunctionLibraryDefinition flib_to_add(OpRegistry::Global(), fdef_to_add);
+  TF_ASSERT_OK(AddToFunctionLibrary(&flib_1, flib_to_add));
+
+  for (const auto& flib : {flib_0, flib_1}) {
+    EXPECT_TRUE(FunctionDefsEqual(*flib.Find("0"), make_fn_b("0")));
+    EXPECT_TRUE(FunctionDefsEqual(*flib.Find("1"), make_fn_a("1")));
+    EXPECT_TRUE(FunctionDefsEqual(*flib.Find("2"), make_fn_a("2")));
+    EXPECT_TRUE(FunctionDefsEqual(*flib.Find("3"), make_fn_b("3")));
+  }
+}
+
+TEST(DatasetUtilsTest, AddToFunctionLibraryWithConflictingSignatures) {
+  FunctionDefLibrary fdef_base;
+  *fdef_base.add_function() = FunctionDefHelper::Create(
+      /*function_name=*/"0",
+      /*in_def=*/{"arg: int64"},
+      /*out_def=*/{"ret: int64"},
+      /*attr_def=*/{},
+      /*node_def=*/{},
+      /*ret_def=*/{{"ret", "arg"}});
+
+  FunctionDefLibrary fdef_to_add;
+  *fdef_to_add.add_function() = FunctionDefHelper::Create(
+      /*function_name=*/"0",
+      /*in_def=*/{"arg: int64"},
+      /*out_def=*/{"ret: int64", "ret2: int64"},
+      /*attr_def=*/{},
+      /*node_def=*/{},
+      /*ret_def=*/{{"ret", "arg"}, {"ret2", "arg"}});
+
+  FunctionLibraryDefinition flib_0(OpRegistry::Global(), fdef_base);
+  Status s = AddToFunctionLibrary(&flib_0, fdef_to_add);
+  EXPECT_EQ(error::Code::INVALID_ARGUMENT, s.code());
+  EXPECT_EQ(
+      "Cannot add function '0' because a different function with the same "
+      "signature already exists.",
+      s.error_message());
+
+  FunctionLibraryDefinition flib_1(OpRegistry::Global(), fdef_base);
+  FunctionLibraryDefinition flib_to_add(OpRegistry::Global(), fdef_to_add);
+  s = AddToFunctionLibrary(&flib_1, flib_to_add);
+  EXPECT_EQ(error::Code::INVALID_ARGUMENT, s.code());
+  EXPECT_EQ(
+      "Cannot add function '0' because a different function with the same "
+      "signature already exists.",
+      s.error_message());
+}
 }  // namespace
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD
index 9171b91a62f0c9a3fb2c4fa5b6e74acf6e2fbb1b..4f59e63e37c71d48f250cb15bfae5e256753b914 100644
--- a/tensorflow/core/kernels/data/experimental/BUILD
+++ b/tensorflow/core/kernels/data/experimental/BUILD
@@ -125,7 +125,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:metrics",
         "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core/kernels:inplace_ops",
         "//tensorflow/core/kernels/data:captured_function",
diff --git a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
index 0ef56915ebe33b0f843fa462ba981c34b6b257f7..77c1749632340a97e58e39a592b374f2ac6148bf 100644
--- a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
@@ -450,7 +450,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
               result->UpdateStatus(allocate_status, offset);
             } else {
               for (size_t i = 0; i < return_values->size(); ++i) {
-                const Tensor& tensor = return_values->at(i);
+                Tensor& tensor = return_values->at(i);
                 Tensor* batch = &(result->output)[i];
                 if (tensor.NumElements() !=
                     (batch->NumElements() / batch->dim_size(0))) {
@@ -468,8 +468,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                 // TODO(mrry): Add a version of DoParallelConcat that allows us
                 // to move `tensor` where possible, to speed up string tensor
                 // batching.
-                Status copy_status =
-                    batch_util::CopyElementToSlice(tensor, batch, offset);
+                Status copy_status = batch_util::CopyElementToSlice(
+                    std::move(tensor), batch, offset);
                 if (!copy_status.ok()) {
                   result->UpdateStatus(copy_status, offset);
                   break;
diff --git a/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc
index a95773afd12ca0b9cfa89045746ed79eadc2e24a..0397ca01c4ee058bce8079c83c787a4f38f4f578 100644
--- a/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc
@@ -35,9 +35,9 @@ class RebatchDatasetOp : public UnaryDatasetOpKernel {
                    DatasetBase** output) override {
     int64 num_workers;
     OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_workers", &num_workers));
-    OP_REQUIRES(ctx, num_workers > 0,
-                errors::InvalidArgument(
-                    "num_parallel_calls must be greater than zero."));
+    OP_REQUIRES(
+        ctx, num_workers > 0,
+        errors::InvalidArgument("num_workers must be greater than zero."));
 
     Dataset* dataset =
         new Dataset(ctx, input, num_workers, output_types_, output_shapes_);
@@ -62,8 +62,17 @@ class RebatchDatasetOp : public UnaryDatasetOpKernel {
     string DebugString() const override { return "RebatchDatasetOp::Dataset"; }
 
    private:
+    bool ShouldOptimizeFunctions() override {
+      // We only want to optimize functions for some particular datasets like
+      // FlatMapDataset, InterleaveDataset etc. So we disable generalized
+      // function optimization and explicitly handle function modifications
+      // for those datasets in the rewrite.
+      return false;
+    }
+
     RewriterConfig CreateGrapplerRewriteConfig() override {
       RewriterConfig rewriter_config;
+      rewriter_config.set_fail_on_optimizer_errors(true);
       rewriter_config.add_optimizers(kOptimizerName);
       rewriter_config.set_meta_optimizer_iterations(
           RewriterConfig_NumIterationsType_ONE);
diff --git a/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc b/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
index bf96be4eb005e62d1982adf5662984fe15d59091..be5fa4c789ba842952b01ecb256ffa57629d5afa 100644
--- a/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/example/example.pb.h"
-#include "tensorflow/core/example/feature.pb.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
diff --git a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
index 7a16cda0f3dc83d5c00a2006f94bdecde866bfd5..e8fd051a8272c79619f3fba5bfaca7826bd486ea 100644
--- a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
@@ -98,8 +98,9 @@ class ThreadPoolHandleOp : public OpKernel {
                                   EXCLUSIVE_LOCKS_REQUIRED(mu_) {
                                     *ret = new ThreadPoolResource(
                                         ctx->env(), {}, display_name_,
-                                        num_threads_, max_intra_op_parallelism_,
-                                        false /* low_latency_hint */);
+                                        num_threads_,
+                                        /*low_latency_hint=*/false,
+                                        max_intra_op_parallelism_);
                                     return Status::OK();
                                   }));
       initialized_ = true;
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.cc b/tensorflow/core/kernels/data/generator_dataset_op.cc
index 5dff2be39da37e899092c6a764d548b9a4799e22..3469743af63a4d9480de3ed9160c43a650b71410 100644
--- a/tensorflow/core/kernels/data/generator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/generator_dataset_op.cc
@@ -71,7 +71,7 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
         : DatasetIterator<Dataset>(params) {}
 
     ~Iterator() override {
-      if (!finalized_) {
+      if (!finalized_ && initialized_) {
         std::vector<Tensor> ignored;
         Status s =
             instantiated_finalize_func_->RunInstantiated(state_, &ignored);
diff --git a/tensorflow/core/kernels/data/graph_rewrite_dataset.cc b/tensorflow/core/kernels/data/graph_rewrite_dataset.cc
index bc4bb46010112dcd5ceccb6d0e5ba8939688e856..cd8026607e73ccf42d43ddb9a1d595778879478c 100644
--- a/tensorflow/core/kernels/data/graph_rewrite_dataset.cc
+++ b/tensorflow/core/kernels/data/graph_rewrite_dataset.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/data/graph_rewrite_dataset.h"
 
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
@@ -57,14 +59,9 @@ Status GraphRewriteDataset::Optimize(OpKernelContext* ctx) {
 
   // Some functions may have been modified without having their names
   // changed (for example, nested dataset graphs from FlatMap or
-  // Interleave). To avoid name conflicts, we remove these functions from
-  // flib_def_ before adding the optimized function library.
-  for (const FunctionDef& fd : graph_def.library().function()) {
-    if (flib_def_->Find(fd.signature().name()) != nullptr) {
-      TF_RETURN_IF_ERROR(flib_def_->RemoveFunction(fd.signature().name()));
-    }
-  }
-  TF_RETURN_IF_ERROR(flib_def_->AddLibrary(graph_def.library()));
+  // Interleave).
+  TF_RETURN_IF_ERROR(
+      AddToFunctionLibrary(flib_def_.get(), graph_def.library()));
 
   Graph graph(OpRegistry::Global());
   TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr));
@@ -82,9 +79,21 @@ Status GraphRewriteDataset::Optimize(OpKernelContext* ctx) {
 Status GraphRewriteDataset::AsGraphDefInternal(SerializationContext* ctx,
                                                DatasetGraphDefBuilder* b,
                                                Node** output) const {
+  SerializationContext::Params params;
+  // The optimized input needs access to the newly optimized functions when
+  // it is serialized. Here, we use the optimized function library for
+  // serialization, which is the union of the function library from the
+  // OpKernelContext at dataset creation time and newly optimized functions.
+  // This includes all functions that optimized_input_ may use.
+  params.flib_def = flib_def_.get();
+  params.input_list = ctx->input_list();
+  params.optimization_only = ctx->optimization_only();
+  SerializationContext optimized_ctx(params);
+
   // We only serialize the optimized dataset to avoid re-running
   // optimizations when the input pipeline is restored from a checkpoint.
-  TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, optimized_input_, output));
+  TF_RETURN_IF_ERROR(
+      b->AddInputDataset(&optimized_ctx, optimized_input_, output));
   return Status::OK();
 }
 
@@ -161,6 +170,8 @@ Status GraphRewriteDataset::ApplyOptimizations(OpKernelContext* ctx,
   std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
       tensorflow::grappler::GrapplerItemFromMetaGraphDef(
           "graph", meta_graph_def, item_config);
+  grappler_item->optimization_options().optimize_function_library =
+      ShouldOptimizeFunctions();
   std::unordered_map<string, tensorflow::DeviceProperties> device_map;
   tensorflow::grappler::VirtualCluster cluster(device_map);
 
diff --git a/tensorflow/core/kernels/data/graph_rewrite_dataset.h b/tensorflow/core/kernels/data/graph_rewrite_dataset.h
index dedbdce71ff202f026cfae8900f6698bcaa39b64..856fcd3ea727f1223a06783b78af0efc41935516 100644
--- a/tensorflow/core/kernels/data/graph_rewrite_dataset.h
+++ b/tensorflow/core/kernels/data/graph_rewrite_dataset.h
@@ -73,6 +73,9 @@ class GraphRewriteDataset : public DatasetBase {
   // optimizations to be run by the Grappler Meta Optimizer.
   virtual RewriterConfig CreateGrapplerRewriteConfig() = 0;
 
+  // Option specifying whether we want to optimize the function library as well.
+  virtual bool ShouldOptimizeFunctions() { return true; }
+
   Status ApplyOptimizations(OpKernelContext* ctx, GraphDef* graph_def,
                             string* output_node);
 
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 808f834f62dfac78411829951c9c12570c41336e..7e23ca58ce7bdf65ab66675b42e2d840ab1702c9 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -99,7 +99,17 @@ class IteratorResource : public ResourceBase {
       captured_state = iterator_state_;
     }
     if (captured_state) {
-      return captured_state->iterator->Save(ctx, writer);
+      SerializationContext::Params params;
+      // The iterator state may contain functions that are not present
+      // in ctx's function library. Namely, an iterator may be restored from
+      // a serialized iterator with a modified function library (for example, as
+      // a result of OptimizeDataset). These modified functions are needed
+      // to serialize the iterator again.
+      params.flib_def = captured_state->flib_def.get();
+      params.input_list = ctx->input_list();
+      params.optimization_only = ctx->optimization_only();
+      SerializationContext ctx_with_functions(params);
+      return captured_state->iterator->Save(&ctx_with_functions, writer);
     } else {
       return errors::FailedPrecondition(
           "Save() failed because the iterator has not been initialized. "
@@ -134,7 +144,14 @@ class IteratorResource : public ResourceBase {
     std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
     std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
     TF_RETURN_IF_ERROR(ctx->function_library()->Clone(&flib_def, &pflr, &lib));
-    TF_RETURN_IF_ERROR(flib_def->AddLibrary(graph_def.library()));
+
+    // Some function names may be duplicated (for example, if the serialized
+    // graph has an optimized function that retains its original name). We
+    // override functions in flib_def in the event of conflict. It is
+    // safe to assume that any node in the serialized graph is referring to the
+    // serialized function when there is a conflict.
+    TF_RETURN_IF_ERROR(
+        AddToFunctionLibrary(flib_def.get(), graph_def.library()));
     std::unique_ptr<State> new_state = absl::make_unique<State>(
         std::move(flib_def), std::move(pflr), lib, nullptr /* iterator */);
 
@@ -567,7 +584,7 @@ class ToSingleElementOp : public AsyncOpKernel {
       // NOTE(jsimsa): We must destroy the iterator before calling `done()`, to
       // avoid destruction races.
       IteratorBase* raw_iterator = iterator.release();
-      auto cleanup = gtl::MakeCleanup([ctx, raw_iterator, done] {
+      auto cleanup = gtl::MakeCleanup([raw_iterator, done] {
         delete raw_iterator;
         done();
       });
@@ -967,78 +984,58 @@ void IteratorGetNextSyncOp::Compute(OpKernelContext* ctx) {
   }
 }
 
-namespace {
+void IteratorGetNextAsOptionalOp::ComputeAsync(OpKernelContext* ctx,
+                                               DoneCallback done) {
+  IteratorResource* iterator;
+  OP_REQUIRES_OK_ASYNC(
+      ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
+  // The call to `iterator->GetNext()` may block and depend on an
+  // inter-op thread pool thread, so we issue the call from the
+  // owned thread pool.
+  background_worker_.Schedule(std::bind(
+      [this, ctx, iterator](DoneCallback done) {
+        std::vector<Tensor> components;
+        bool end_of_sequence = false;
 
-class IteratorGetNextAsOptionalOp : public AsyncOpKernel {
- public:
-  explicit IteratorGetNextAsOptionalOp(OpKernelConstruction* ctx)
-      : AsyncOpKernel(ctx),
-        background_worker_(ctx->env(),
-                           "tf_data_iterator_get_next_as_optional") {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-  }
+        Status s = iterator->GetNext(IteratorContext(ctx), &components,
+                                     &end_of_sequence);
+        // NOTE(mrry): We must unref the iterator before calling `done()`, to
+        // avoid destruction races.
+        iterator->Unref();
 
-  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
-    IteratorResource* iterator;
-    OP_REQUIRES_OK_ASYNC(
-        ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
-    // The call to `iterator->GetNext()` may block and depend on an
-    // inter-op thread pool thread, so we issue the call from the
-    // owned thread pool.
-    background_worker_.Schedule(std::bind(
-        [this, ctx, iterator](DoneCallback done) {
-          std::vector<Tensor> components;
-          bool end_of_sequence = false;
-
-          Status s = iterator->GetNext(IteratorContext(ctx), &components,
-                                       &end_of_sequence);
-          // NOTE(mrry): We must unref the iterator before calling `done()`, to
-          // avoid destruction races.
-          iterator->Unref();
-
-          if (!s.ok()) {
-            ctx->SetStatus(s);
-          } else if (end_of_sequence) {
-            OP_REQUIRES_OK_ASYNC(ctx, WriteOptionalNoneToOutput(ctx, 0), done);
-          } else {
-            for (int i = 0; i < components.size(); ++i) {
-              OP_REQUIRES_ASYNC(
-                  ctx, components[i].dtype() == output_types_[i],
-                  errors::InvalidArgument(
-                      "The given optional does not match the expected type for "
-                      "component ",
-                      i, ". Expected: ", DataTypeString(output_types_[i]),
-                      ". Actual: ", DataTypeString(components[i].dtype()), "."),
-                  done);
-              OP_REQUIRES_ASYNC(
-                  ctx,
-                  output_shapes_[i].IsCompatibleWith(components[i].shape()),
-                  errors::InvalidArgument(
-                      "The given optional does not match the expected shape "
-                      "for component ",
-                      i, ". Expected: ", output_shapes_[i].DebugString(),
-                      ". Actual: ", components[i].shape().DebugString(), "."),
-                  done);
-            }
-
-            OP_REQUIRES_OK_ASYNC(
-                ctx,
-                WriteOptionalWithValueToOutput(ctx, 0, std::move(components)),
+        if (!s.ok()) {
+          ctx->SetStatus(s);
+        } else if (end_of_sequence) {
+          OP_REQUIRES_OK_ASYNC(ctx, WriteOptionalNoneToOutput(ctx, 0), done);
+        } else {
+          for (int i = 0; i < components.size(); ++i) {
+            OP_REQUIRES_ASYNC(
+                ctx, components[i].dtype() == output_types_[i],
+                errors::InvalidArgument(
+                    "The given optional does not match the expected type for "
+                    "component ",
+                    i, ". Expected: ", DataTypeString(output_types_[i]),
+                    ". Actual: ", DataTypeString(components[i].dtype()), "."),
+                done);
+            OP_REQUIRES_ASYNC(
+                ctx, output_shapes_[i].IsCompatibleWith(components[i].shape()),
+                errors::InvalidArgument(
+                    "The given optional does not match the expected shape "
+                    "for component ",
+                    i, ". Expected: ", output_shapes_[i].DebugString(),
+                    ". Actual: ", components[i].shape().DebugString(), "."),
                 done);
           }
-          done();
-        },
-        std::move(done)));
-  }
 
- private:
-  BackgroundWorker background_worker_;
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-};
-
-}  // namespace
+          OP_REQUIRES_OK_ASYNC(
+              ctx,
+              WriteOptionalWithValueToOutput(ctx, 0, std::move(components)),
+              done);
+        }
+        done();
+      },
+      std::move(done)));
+}
 
 void IteratorToStringHandleOp::Compute(OpKernelContext* ctx) {
   const Tensor& resource_handle_t = ctx->input(0);
diff --git a/tensorflow/core/kernels/data/iterator_ops.h b/tensorflow/core/kernels/data/iterator_ops.h
index cd72269859044e6efd97a10ad43bc00c90df7d7d..7d769d365e9aa8d6952a9a8cdb461bc63957d031 100644
--- a/tensorflow/core/kernels/data/iterator_ops.h
+++ b/tensorflow/core/kernels/data/iterator_ops.h
@@ -19,6 +19,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/ops_util.h"
 
 namespace tensorflow {
@@ -115,6 +117,24 @@ class IteratorGetNextOp : public AsyncOpKernel {
   BackgroundWorker background_worker_;
 };
 
+class IteratorGetNextAsOptionalOp : public AsyncOpKernel {
+ public:
+  explicit IteratorGetNextAsOptionalOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx),
+        background_worker_(ctx->env(),
+                           "tf_data_iterator_get_next_as_optional") {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override;
+
+ private:
+  BackgroundWorker background_worker_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
 class IteratorGetNextSyncOp : public OpKernel {
  public:
   explicit IteratorGetNextSyncOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index 95f4c1c89150b81a91f191e4e53c2b81c30841c4..e516d7791bfdf2ac40805553dfb5a3afef64d802 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -138,7 +138,6 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
       Node* input_graph_node = nullptr;
       TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
 
diff --git a/tensorflow/core/kernels/data/map_dataset_op_test.cc b/tensorflow/core/kernels/data/map_dataset_op_test.cc
index 813b435a722899cbece8a6ba794fd0f4ab7425d1..f9c1cf493647ad4649218dfc24f293e2845f1d22 100644
--- a/tensorflow/core/kernels/data/map_dataset_op_test.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op_test.cc
@@ -244,10 +244,11 @@ TEST_F(MapDatasetOpTest, DatasetOutputShapes) {
       CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
   core::ScopedUnref scored_unref_map_dataset(map_dataset);
 
-  std::vector<PartialTensorShape> expected_shapes({{}});
+  std::vector<PartialTensorShape> expected_shapes({PartialTensorShape({})});
   EXPECT_EQ(map_dataset->output_shapes().size(), expected_shapes.size());
   for (int i = 0; i < map_dataset->output_shapes().size(); ++i) {
-    map_dataset->output_shapes()[i].IsIdenticalTo(expected_shapes[i]);
+    EXPECT_TRUE(
+        map_dataset->output_shapes()[i].IsIdenticalTo(expected_shapes[i]));
   }
 }
 
@@ -398,10 +399,10 @@ TEST_F(MapDatasetOpTest, IteratorOutputShapes) {
   TF_ASSERT_OK(
       map_dataset->MakeIterator(iterator_context.get(), "Iterator", &iterator));
 
-  std::vector<PartialTensorShape> expected_shapes({{}});
+  std::vector<PartialTensorShape> expected_shapes({PartialTensorShape({})});
   EXPECT_EQ(iterator->output_shapes().size(), expected_shapes.size());
   for (int i = 0; i < map_dataset->output_shapes().size(); ++i) {
-    iterator->output_shapes()[i].IsIdenticalTo(expected_shapes[i]);
+    EXPECT_TRUE(iterator->output_shapes()[i].IsIdenticalTo(expected_shapes[i]));
   }
 }
 
diff --git a/tensorflow/core/kernels/data/map_defun_op.cc b/tensorflow/core/kernels/data/map_defun_op.cc
index 8122048702a6c572486ab8ac36a323f822ab9a0f..f1be942a633c347c6c3156e6a27e4deb79be4298 100644
--- a/tensorflow/core/kernels/data/map_defun_op.cc
+++ b/tensorflow/core/kernels/data/map_defun_op.cc
@@ -258,6 +258,7 @@ class MapDefunOp : public AsyncOpKernel {
             "output: ",
             index);
       }
+      Tensor* out;
       {  // Locking scope
         mutex_lock l(compute_opts_->mu);
         if (!compute_opts_->output_shapes.at(index).IsCompatibleWith(
@@ -272,15 +273,15 @@ class MapDefunOp : public AsyncOpKernel {
           // this index. Store the shape and allocate the output accordingly.
           compute_opts_->output_shapes.at(index) = val.shape();
 
-          Tensor* out = nullptr;
           TensorShape actual_shape = val.shape();
           actual_shape.InsertDim(0, compute_opts_->batch_size);
           TF_RETURN_IF_ERROR(
               compute_opts_->output.allocate(index, actual_shape, &out));
+        } else {
+          out = (compute_opts_->output)[index];
         }
-        return batch_util::CopyElementToSlice(
-            val, (compute_opts_->output)[index], iter_);
       }
+      return batch_util::CopyElementToSlice(val, out, iter_);
     }
 
    private:
diff --git a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
index f0c6f5807bdfcf9598cc9e873d6a6e04db9d4fa7..d78ed6006c0207425a1aaf822b3a79406a1002fe 100644
--- a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
+++ b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
@@ -415,14 +415,10 @@ class MultiDeviceIteratorHandleOp : public OpKernel {
           resource = new MultiDeviceIterator(
               output_types_, output_shapes_, devices_, std::move(flib_def),
               std::move(pflr), lib, std::move(function_handle_cache));
+          // NOTE: `mgr->Create()` transfers the one reference on `resource` to
+          // `mgr`.
           OP_REQUIRES_OK(context, mgr->Create<MultiDeviceIterator>(
                                       container_name, unique_name, resource));
-          Status s = VerifyResource(resource);
-          if (TF_PREDICT_FALSE(!s.ok())) {
-            resource->Unref();
-            context->SetStatus(s);
-            return;
-          }
         } else {
           unique_name = cinfo_.name();
           container_name = cinfo_.container();
diff --git a/tensorflow/core/kernels/data/optional_ops.cc b/tensorflow/core/kernels/data/optional_ops.cc
index 6590c7ef9b6b9a795387273ebec33f531625d7d0..473dbebd3062486de3cd48764ed45d9a059832d9 100644
--- a/tensorflow/core/kernels/data/optional_ops.cc
+++ b/tensorflow/core/kernels/data/optional_ops.cc
@@ -23,133 +23,6 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-class OptionalNoneOp : public OpKernel {
- public:
-  explicit OptionalNoneOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    OP_REQUIRES_OK(ctx, WriteOptionalNoneToOutput(ctx, 0));
-  }
-};
-
-class OptionalFromValueOp : public OpKernel {
- public:
-  explicit OptionalFromValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    OpInputList components_input;
-    OP_REQUIRES_OK(ctx, ctx->input_list("components", &components_input));
-    std::vector<Tensor> components(components_input.begin(),
-                                   components_input.end());
-    OP_REQUIRES_OK(
-        ctx, WriteOptionalWithValueToOutput(ctx, 0, std::move(components)));
-  }
-};
-
-class OptionalHasValueOp : public OpKernel {
- public:
-  explicit OptionalHasValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor* optional_input;
-    OP_REQUIRES_OK(ctx, ctx->input("optional", &optional_input));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(optional_input->shape()),
-                errors::InvalidArgument(
-                    "Input to OptionalHasValue must be a scalar tensor "
-                    "containing an OptionalVariant object."));
-    const OptionalVariant* optional =
-        optional_input->scalar<Variant>()().get<OptionalVariant>();
-    OP_REQUIRES(
-        ctx, optional != nullptr,
-        errors::InvalidArgument(
-            "Input to OptionalHasValue must be an OptionalVariant object."));
-    Tensor* result;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {}, &result));
-    result->scalar<bool>()() = optional->has_value();
-  }
-};
-
-class OptionalGetValueOp : public OpKernel {
- public:
-  explicit OptionalGetValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES(
-        ctx, output_shapes_.size() == output_types_.size(),
-        errors::InvalidArgument(
-            "output_types and output_shapes must be same length, got:\n",
-            "output_types: ", output_types_.size(), "\n",
-            "output_shapes: ", output_shapes_.size()));
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor* optional_input;
-    OP_REQUIRES_OK(ctx, ctx->input("optional", &optional_input));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(optional_input->shape()),
-                errors::InvalidArgument(
-                    "Input to OptionalHasValue must be a scalar tensor "
-                    "containing an OptionalVariant object."));
-    const OptionalVariant* optional =
-        optional_input->scalar<Variant>()().get<OptionalVariant>();
-    OP_REQUIRES(
-        ctx, optional != nullptr,
-        errors::InvalidArgument(
-            "Input to OptionalHasValue must be an OptionalVariant object."));
-    OP_REQUIRES(
-        ctx, optional->has_value(),
-        errors::InvalidArgument("The given optional does not have a value."));
-    const auto& components = optional->get_values();
-    OP_REQUIRES(ctx, components.size() == output_types_.size(),
-                errors::InvalidArgument(
-                    "The given optional has ", components.size(),
-                    " components, expected ", output_types_.size()));
-    for (int i = 0; i < components.size(); ++i) {
-      OP_REQUIRES(
-          ctx, components[i].dtype() == output_types_[i],
-          errors::InvalidArgument(
-              "The given optional does not match the expected type for "
-              "component ",
-              i, ". Expected: ", DataTypeString(output_types_[i]),
-              ". Actual: ", DataTypeString(components[i].dtype()), "."));
-      OP_REQUIRES(ctx,
-                  output_shapes_[i].IsCompatibleWith(components[i].shape()),
-                  errors::InvalidArgument(
-                      "The given optional does not match the expected shape "
-                      "for component ",
-                      i, ". Expected: ", output_shapes_[i].DebugString(),
-                      ". Actual: ", components[i].shape().DebugString(), "."));
-      ctx->set_output(i, components[i]);
-    }
-  }
-
- private:
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_CPU).Priority(2),
-                        OptionalNoneOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_GPU).Priority(1),
-                        OptionalNoneOp);
-REGISTER_KERNEL_BUILDER(
-    Name("OptionalFromValue").Device(DEVICE_CPU).Priority(2),
-    OptionalFromValueOp);
-REGISTER_KERNEL_BUILDER(
-    Name("OptionalFromValue").Device(DEVICE_GPU).Priority(1),
-    OptionalFromValueOp);
-
-REGISTER_KERNEL_BUILDER(Name("OptionalHasValue").Device(DEVICE_CPU).Priority(2),
-                        OptionalHasValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalHasValue")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("has_value")
-                            .Priority(1),
-                        OptionalHasValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_CPU).Priority(2),
-                        OptionalGetValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_GPU).Priority(1),
-                        OptionalGetValueOp);
-
 static Status OptionalDeviceCopy(
     const OptionalVariant& from, OptionalVariant* to,
     const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copy) {
@@ -190,6 +63,75 @@ REGISTER_UNARY_VARIANT_DECODE_FUNCTION(OptionalVariant,
 
 }  // namespace
 
+void OptionalNoneOp::Compute(OpKernelContext* ctx) {
+  OP_REQUIRES_OK(ctx, WriteOptionalNoneToOutput(ctx, 0));
+}
+
+void OptionalFromValueOp::Compute(OpKernelContext* ctx) {
+  OpInputList components_input;
+  OP_REQUIRES_OK(ctx, ctx->input_list("components", &components_input));
+  std::vector<Tensor> components(components_input.begin(),
+                                 components_input.end());
+  OP_REQUIRES_OK(ctx,
+                 WriteOptionalWithValueToOutput(ctx, 0, std::move(components)));
+}
+
+void OptionalHasValueOp::Compute(OpKernelContext* ctx) {
+  const Tensor* optional_input;
+  OP_REQUIRES_OK(ctx, ctx->input("optional", &optional_input));
+  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(optional_input->shape()),
+              errors::InvalidArgument(
+                  "Input to OptionalHasValue must be a scalar tensor "
+                  "containing an OptionalVariant object."));
+  const OptionalVariant* optional =
+      optional_input->scalar<Variant>()().get<OptionalVariant>();
+  OP_REQUIRES(
+      ctx, optional != nullptr,
+      errors::InvalidArgument(
+          "Input to OptionalHasValue must be an OptionalVariant object."));
+  Tensor* result;
+  OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {}, &result));
+  result->scalar<bool>()() = optional->has_value();
+}
+
+void OptionalGetValueOp::Compute(OpKernelContext* ctx) {
+  const Tensor* optional_input;
+  OP_REQUIRES_OK(ctx, ctx->input("optional", &optional_input));
+  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(optional_input->shape()),
+              errors::InvalidArgument(
+                  "Input to OptionalHasValue must be a scalar tensor "
+                  "containing an OptionalVariant object."));
+  const OptionalVariant* optional =
+      optional_input->scalar<Variant>()().get<OptionalVariant>();
+  OP_REQUIRES(
+      ctx, optional != nullptr,
+      errors::InvalidArgument(
+          "Input to OptionalHasValue must be an OptionalVariant object."));
+  OP_REQUIRES(
+      ctx, optional->has_value(),
+      errors::InvalidArgument("The given optional does not have a value."));
+  const auto& components = optional->get_values();
+  OP_REQUIRES(
+      ctx, components.size() == output_types_.size(),
+      errors::InvalidArgument("The given optional has ", components.size(),
+                              " components, expected ", output_types_.size()));
+  for (int i = 0; i < components.size(); ++i) {
+    OP_REQUIRES(ctx, components[i].dtype() == output_types_[i],
+                errors::InvalidArgument(
+                    "The given optional does not match the expected type for "
+                    "component ",
+                    i, ". Expected: ", DataTypeString(output_types_[i]),
+                    ". Actual: ", DataTypeString(components[i].dtype()), "."));
+    OP_REQUIRES(ctx, output_shapes_[i].IsCompatibleWith(components[i].shape()),
+                errors::InvalidArgument(
+                    "The given optional does not match the expected shape "
+                    "for component ",
+                    i, ". Expected: ", output_shapes_[i].DebugString(),
+                    ". Actual: ", components[i].shape().DebugString(), "."));
+    ctx->set_output(i, components[i]);
+  }
+}
+
 Status WriteOptionalWithValueToOutput(OpKernelContext* ctx, int output_index,
                                       std::vector<Tensor> value) {
   OptionalVariant v(std::move(value));
@@ -213,6 +155,33 @@ Status WriteOptionalNoneToOutput(OpKernelContext* ctx, int output_index) {
   return Status::OK();
 }
 
+namespace {
+
+REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_CPU).Priority(2),
+                        OptionalNoneOp);
+REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_GPU).Priority(1),
+                        OptionalNoneOp);
+REGISTER_KERNEL_BUILDER(
+    Name("OptionalFromValue").Device(DEVICE_CPU).Priority(2),
+    OptionalFromValueOp);
+REGISTER_KERNEL_BUILDER(
+    Name("OptionalFromValue").Device(DEVICE_GPU).Priority(1),
+    OptionalFromValueOp);
+
+REGISTER_KERNEL_BUILDER(Name("OptionalHasValue").Device(DEVICE_CPU).Priority(2),
+                        OptionalHasValueOp);
+REGISTER_KERNEL_BUILDER(Name("OptionalHasValue")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("has_value")
+                            .Priority(1),
+                        OptionalHasValueOp);
+REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_CPU).Priority(2),
+                        OptionalGetValueOp);
+REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_GPU).Priority(1),
+                        OptionalGetValueOp);
+
+}  // namespace
+
 REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(ZEROS_LIKE_VARIANT_UNARY_OP,
                                          DEVICE_CPU, OptionalVariant,
                                          OptionalZerosLike<CPUDevice>);
diff --git a/tensorflow/core/kernels/data/optional_ops.h b/tensorflow/core/kernels/data/optional_ops.h
index 7089a423d7302decd2e13a6496307e7520e88066..24eb1b81d903b391d413cbfc9b10499c84125a40 100644
--- a/tensorflow/core/kernels/data/optional_ops.h
+++ b/tensorflow/core/kernels/data/optional_ops.h
@@ -152,6 +152,47 @@ Status OptionalBinaryAdd(OpKernelContext* ctx, const OptionalVariant& a,
   return Status::OK();
 }
 
+class OptionalNoneOp : public OpKernel {
+ public:
+  explicit OptionalNoneOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override;
+};
+
+class OptionalFromValueOp : public OpKernel {
+ public:
+  explicit OptionalFromValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override;
+};
+
+class OptionalHasValueOp : public OpKernel {
+ public:
+  explicit OptionalHasValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override;
+};
+
+class OptionalGetValueOp : public OpKernel {
+ public:
+  explicit OptionalGetValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES(
+        ctx, output_shapes_.size() == output_types_.size(),
+        errors::InvalidArgument(
+            "output_types and output_shapes must be same length, got:\n",
+            "output_types: ", output_types_.size(), "\n",
+            "output_shapes: ", output_shapes_.size()));
+  }
+
+  void Compute(OpKernelContext* ctx) override;
+
+ private:
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
 }  // namespace data
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index aa14d27d5c3ebec797174d5aecf89dd217fe8f3b..87390ad512fcbf0481a0f5c4241d864d0c99cee6 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -64,7 +64,7 @@ class RangeDatasetOp : public DatasetOpKernel {
 
     const std::vector<PartialTensorShape>& output_shapes() const override {
       static std::vector<PartialTensorShape>* shapes =
-          new std::vector<PartialTensorShape>({{}});
+          new std::vector<PartialTensorShape>({PartialTensorShape({})});
       return *shapes;
     }
 
diff --git a/tensorflow/core/kernels/data/range_dataset_op_test.cc b/tensorflow/core/kernels/data/range_dataset_op_test.cc
index 28abea8f974efa4773e0812799a6fac94203c02e..0bbc09a212811b2d8209d1862aed289c4d23dff8 100644
--- a/tensorflow/core/kernels/data/range_dataset_op_test.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op_test.cc
@@ -177,10 +177,11 @@ TEST_F(RangeDatasetOpTest, DatasetOutputShapes) {
       CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
   core::ScopedUnref scored_unref(range_dataset);
 
-  std::vector<PartialTensorShape> expected_shapes({{}});
+  std::vector<PartialTensorShape> expected_shapes({PartialTensorShape({})});
   EXPECT_EQ(range_dataset->output_shapes().size(), expected_shapes.size());
   for (int i = 0; i < range_dataset->output_shapes().size(); ++i) {
-    range_dataset->output_shapes()[i].IsIdenticalTo(expected_shapes[i]);
+    EXPECT_TRUE(
+        range_dataset->output_shapes()[i].IsIdenticalTo(expected_shapes[i]));
   }
 }
 
@@ -304,10 +305,10 @@ TEST_F(RangeDatasetOpTest, IteratorOutputShapes) {
   TF_ASSERT_OK(range_dataset->MakeIterator(iterator_context.get(), "Iterator",
                                            &iterator));
 
-  std::vector<PartialTensorShape> expected_shapes({{}});
+  std::vector<PartialTensorShape> expected_shapes({PartialTensorShape({})});
   EXPECT_EQ(iterator->output_shapes().size(), expected_shapes.size());
   for (int i = 0; i < range_dataset->output_shapes().size(); ++i) {
-    iterator->output_shapes()[i].IsIdenticalTo(expected_shapes[i]);
+    EXPECT_TRUE(iterator->output_shapes()[i].IsIdenticalTo(expected_shapes[i]));
   }
 }
 
diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc
index 789f9c859aab2df61d119b9bb6f6ddd88ce24681..cbc987dc161241e267ae680dcb2db71f2b68a159 100644
--- a/tensorflow/core/kernels/data/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -29,6 +30,8 @@ namespace {
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following ops.
 
+constexpr char kTextLineDatasetName[] = "TextLine";
+
 class TextLineDatasetOp : public DatasetOpKernel {
  public:
   using DatasetOpKernel::DatasetOpKernel;
@@ -91,8 +94,8 @@ class TextLineDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return absl::make_unique<Iterator>(
-          Iterator::Params{this, strings::StrCat(prefix, "::TextLine")});
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::", kTextLineDatasetName)});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -142,6 +145,8 @@ class TextLineDatasetOp : public DatasetOpKernel {
 
             if (s.ok()) {
               // Produce the line as output.
+              metrics::RecordTFDataBytesRead(kTextLineDatasetName,
+                                             line_contents.size());
               out_tensors->emplace_back(ctx->allocator({}), DT_STRING,
                                         TensorShape({}));
               out_tensors->back().scalar<string>()() = std::move(line_contents);
@@ -268,9 +273,12 @@ class TextLineDatasetOp : public DatasetOpKernel {
 REGISTER_KERNEL_BUILDER(Name("TextLineDataset").Device(DEVICE_CPU),
                         TextLineDatasetOp);
 
+constexpr char kFixedLengthRecordDatasetName[] = "FixedLengthRecord";
+
 class FixedLengthRecordDatasetOp : public DatasetOpKernel {
  public:
   using DatasetOpKernel::DatasetOpKernel;
+
   explicit FixedLengthRecordDatasetOp(OpKernelConstruction* ctx)
       : DatasetOpKernel(ctx),
         op_version_(ctx->def().op() == "FixedLengthRecordDataset" ? 1 : 2) {}
@@ -346,10 +354,12 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
       if (compression_type_.empty()) {
         return absl::make_unique<UncompressedIterator>(
             UncompressedIterator::Params{
-                this, strings::StrCat(prefix, "::FixedLengthRecord")});
+                this,
+                strings::StrCat(prefix, "::", kFixedLengthRecordDatasetName)});
       } else {
         return absl::make_unique<CompressedIterator>(CompressedIterator::Params{
-            this, strings::StrCat(prefix, "::FixedLengthRecord")});
+            this,
+            strings::StrCat(prefix, "::", kFixedLengthRecordDatasetName)});
       }
     }
 
@@ -411,6 +421,9 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
               string record;
               TF_RETURN_IF_ERROR(
                   input_buffer_->ReadNBytes(dataset()->record_bytes_, &record));
+              metrics::RecordTFDataBytesRead(kFixedLengthRecordDatasetName,
+                                             dataset()->record_bytes_);
+
               // Produce the record as output.
               Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
               record_tensor.scalar<string>()() = record;
@@ -532,6 +545,9 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
                 string record;
                 TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes(
                     dataset()->record_bytes_, &record));
+                metrics::RecordTFDataBytesRead(kFixedLengthRecordDatasetName,
+                                               dataset()->record_bytes_);
+
                 // Produce the record as output.
                 Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
                 record_tensor.scalar<string>()() = std::move(record);
@@ -544,6 +560,8 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
               Status s = buffered_input_stream_->ReadNBytes(
                   dataset()->record_bytes_, &record);
               if (s.ok()) {
+                metrics::RecordTFDataBytesRead(kFixedLengthRecordDatasetName,
+                                               dataset()->record_bytes_);
                 lookahead_cache_.append(record);
                 record = lookahead_cache_.substr(0, dataset()->record_bytes_);
                 lookahead_cache_ =
@@ -717,6 +735,8 @@ REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordDataset").Device(DEVICE_CPU),
 REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordDatasetV2").Device(DEVICE_CPU),
                         FixedLengthRecordDatasetOp);
 
+constexpr char kTFRecordDatasetName[] = "TFRecord";
+
 class TFRecordDatasetOp : public DatasetOpKernel {
  public:
   using DatasetOpKernel::DatasetOpKernel;
@@ -766,8 +786,8 @@ class TFRecordDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return absl::make_unique<Iterator>(
-          Iterator::Params{this, strings::StrCat(prefix, "::TFRecord")});
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::", kTFRecordDatasetName)});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -816,6 +836,9 @@ class TFRecordDatasetOp : public DatasetOpKernel {
             Status s =
                 reader_->ReadRecord(&out_tensors->back().scalar<string>()());
             if (s.ok()) {
+              metrics::RecordTFDataBytesRead(
+                  kTFRecordDatasetName,
+                  out_tensors->back().scalar<string>()().size());
               *end_of_sequence = false;
               return Status::OK();
             }
diff --git a/tensorflow/core/kernels/data/shard_dataset_op.cc b/tensorflow/core/kernels/data/shard_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bb64911aa802c6639229be689237db7296558f4
--- /dev/null
+++ b/tensorflow/core/kernels/data/shard_dataset_op.cc
@@ -0,0 +1,195 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/util/batch_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+// See documentation in ../../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class ShardDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit ShardDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 index = 0;
+    int64 num_shards = 0;
+
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<int64>(ctx, "num_shards", &num_shards));
+    OP_REQUIRES(
+        ctx, num_shards > 0,
+        errors::InvalidArgument("Number of shards must be greater than zero "
+                                "(currently num_shards = ",
+                                num_shards, ")."));
+
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "index", &index));
+    OP_REQUIRES(
+        ctx, index >= 0 && index < num_shards,
+        errors::InvalidArgument("Index must be between 0 and ", num_shards - 1,
+                                " (currently index = ", index, ")."));
+
+    *output = new Dataset(ctx, num_shards, index, input);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, int64 num_shards, int64 index,
+            const DatasetBase* input)
+        : DatasetBase(DatasetContext(ctx)),
+          num_shards_(num_shards),
+          index_(index),
+          input_(input) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Shard")});
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() const override {
+      return strings::StrCat("ShardDatasetOp(", num_shards_, ", ", index_,
+                             ")::Dataset");
+    }
+
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / num_shards_ + (index_ < n % num_shards_ ? 1 : 0);
+    }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+      Node* num_shards = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(num_shards_, &num_shards));
+      Node* index = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(index_, &index));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {input_graph_node, num_shards, index}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params), next_index_(0) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+
+        if (!input_impl_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+
+        std::vector<Tensor> result;
+        do {
+          result.clear();
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, &result, end_of_sequence));
+          if (*end_of_sequence) {
+            input_impl_.reset();
+            return Status::OK();
+          }
+        } while ((next_index_++ % dataset()->num_shards_) != dataset()->index_);
+
+        *out_tensors = std::move(result);
+        return Status::OK();
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         dataset()->num_shards_);
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (!input_impl_) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_empty"), ""));
+        } else {
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("next_index"), next_index_));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("next_index"), &next_index_));
+        } else {
+          input_impl_.reset();
+        }
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      int64 next_index_ GUARDED_BY(mu_);
+    };
+
+    const int64 num_shards_;
+    const int64 index_;
+    const DatasetBase* const input_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("ShardDataset").Device(DEVICE_CPU),
+                        ShardDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op_test.cc b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d1d01f792fd222ac26e294b98d504d837400aa6c
--- /dev/null
+++ b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op_test.cc
@@ -0,0 +1,519 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "sparse_tensor_slice_dataset";
+constexpr char kOpName[] = "SparseTensorSliceDataset";
+
+class SparseTensorSliceDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates a new SparseTensorSliceDataset op kernel.
+  Status CreateSparseTensorSliceDatasetKernel(
+      DataType tvalues, std::unique_ptr<OpKernel> *op_kernel) {
+    node_def_ = test::function::NDef(kNodeName, kOpName,
+                                     {"indices", "values", "dense_shape"},
+                                     {{"Tvalues", tvalues}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def_, op_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new SparseTensorSliceDataset op kernel context.
+  Status CreateSparseTensorSliceDatasetContext(
+      OpKernel *const op_kernel, gtl::InlinedVector<TensorValue, 4> *inputs,
+      std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs));
+    TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context));
+    return Status::OK();
+  }
+
+ private:
+  NodeDef node_def_;
+};
+
+struct SparseTensorParam {
+  Tensor indices;
+  Tensor values;
+  Tensor dense_shape;
+};
+
+struct TestParam {
+  SparseTensorParam input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs;
+  std::vector<int> breakpoints;
+} TestCases[] = {
+    {{{DatasetOpsTestBase::CreateTensor<int64>({2, 2}, {0, 0, 1, 1})},
+      {DatasetOpsTestBase::CreateTensor<int32>({2}, {888, 999})},
+      {DatasetOpsTestBase::CreateTensor<int64>({2}, {2, 2})}},
+     {{{DatasetOpsTestBase::CreateTensor<int64>({1, 1}, {0})},
+       {DatasetOpsTestBase::CreateTensor<int32>({1}, {888})},
+       {DatasetOpsTestBase::CreateTensor<int64>({1}, {2})}},
+      {{DatasetOpsTestBase::CreateTensor<int64>({1, 1}, {1})},
+       {DatasetOpsTestBase::CreateTensor<int32>({1}, {999})},
+       {DatasetOpsTestBase::CreateTensor<int64>({1}, {2})}}},
+     {0, 1, 2}},  // 2-D sparse tensor
+    {{{DatasetOpsTestBase::CreateTensor<int64>({2, 3}, {0, 0, 0, 1, 1, 1})},
+      {DatasetOpsTestBase::CreateTensor<double>({2}, {888.0, 999.0})},
+      {DatasetOpsTestBase::CreateTensor<int64>({3}, {2, 2, 2})}},
+     {{{DatasetOpsTestBase::CreateTensor<int64>({1, 2}, {0, 0})},
+       {DatasetOpsTestBase::CreateTensor<double>({1}, {888.0})},
+       {DatasetOpsTestBase::CreateTensor<int64>({2}, {2, 2})}},
+      {{DatasetOpsTestBase::CreateTensor<int64>({1, 2}, {1, 1})},
+       {DatasetOpsTestBase::CreateTensor<double>({1}, {999.0})},
+       {DatasetOpsTestBase::CreateTensor<int64>({2}, {2, 2})}}},
+     {0, 1, 2}},  // 3-D sparse tensor
+    {{{DatasetOpsTestBase::CreateTensor<int64>({2, 4},
+                                               {0, 0, 0, 0, 1, 1, 1, 1})},
+      {DatasetOpsTestBase::CreateTensor<string>({2}, {"a", "b"})},
+      {DatasetOpsTestBase::CreateTensor<int64>({4}, {3, 2, 2, 2})}},
+     {{{DatasetOpsTestBase::CreateTensor<int64>({1, 3}, {0, 0, 0})},
+       {DatasetOpsTestBase::CreateTensor<string>({1}, {"a"})},
+       {DatasetOpsTestBase::CreateTensor<int64>({3}, {2, 2, 2})}},
+      {{DatasetOpsTestBase::CreateTensor<int64>({1, 3}, {1, 1, 1})},
+       {DatasetOpsTestBase::CreateTensor<string>({1}, {"b"})},
+       {DatasetOpsTestBase::CreateTensor<int64>({3}, {2, 2, 2})}},
+      {{DatasetOpsTestBase::CreateTensor<int64>({0, 3}, {})},
+       {DatasetOpsTestBase::CreateTensor<string>({0}, {})},
+       {DatasetOpsTestBase::CreateTensor<int64>({3}, {2, 2, 2})}}},
+     {0, 1, 3}},  // 4-D sparse tensor
+    {{{DatasetOpsTestBase::CreateTensor<int64>({2, 5},
+                                               {0, 0, 0, 0, 0, 1, 1, 1, 1, 1})},
+      {DatasetOpsTestBase::CreateTensor<int32>({2}, {888, 999})},
+      {DatasetOpsTestBase::CreateTensor<int64>({5}, {3, 2, 2, 2, 2})}},
+     {{{DatasetOpsTestBase::CreateTensor<int64>({1, 4}, {0, 0, 0, 0})},
+       {DatasetOpsTestBase::CreateTensor<int32>({1}, {888})},
+       {DatasetOpsTestBase::CreateTensor<int64>({4}, {2, 2, 2, 2})}},
+      {{DatasetOpsTestBase::CreateTensor<int64>({1, 4}, {1, 1, 1, 1})},
+       {DatasetOpsTestBase::CreateTensor<int32>({1}, {999})},
+       {DatasetOpsTestBase::CreateTensor<int64>({4}, {2, 2, 2, 2})}},
+      {{DatasetOpsTestBase::CreateTensor<int64>({0, 4}, {})},
+       {DatasetOpsTestBase::CreateTensor<int32>({0}, {})},
+       {DatasetOpsTestBase::CreateTensor<int64>({4}, {2, 2, 2, 2})}}},
+     {0, 1, 3}}  // 5-D sparse tensor
+
+};
+
+struct DatasetGetNextTest : SparseTensorSliceDatasetOpTest,
+                            ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetGetNextTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  SparseTensorParam input_sparse_tensor = GetParam().input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs = GetParam().expected_outputs;
+  DataType tvalues = input_sparse_tensor.values.dtype();
+  gtl::InlinedVector<TensorValue, 4> inputs = {
+      &input_sparse_tensor.indices, &input_sparse_tensor.values,
+      &input_sparse_tensor.dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  int cur_slice = 0;
+  while (!end_of_sequence) {
+    TF_EXPECT_OK(
+        iterator->GetNext(iterator_ctx.get(), &out_tensors, &end_of_sequence));
+    if (!end_of_sequence) {
+      TF_EXPECT_OK(
+          ExpectEqual(out_tensors[0], expected_outputs[cur_slice].indices));
+      TF_EXPECT_OK(
+          ExpectEqual(out_tensors[1], expected_outputs[cur_slice].values));
+      TF_EXPECT_OK(
+          ExpectEqual(out_tensors[2], expected_outputs[cur_slice].dense_shape));
+      cur_slice++;
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(SparseTensorSliceDatasetOpTest, DatasetGetNextTest,
+                        ::testing::ValuesIn(TestCases));
+
+TEST_F(SparseTensorSliceDatasetOpTest, DatasetName) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  int N = 2;
+  const int NDIM = 2;
+  Tensor indices = CreateTensor<int64>(TensorShape({N, NDIM}), {0, 0, 1, 1});
+  Tensor values = CreateTensor<int32>(TensorShape({N}), {888, 999});
+  Tensor dense_shape = CreateTensor<int64>(TensorShape({NDIM}), {5, 5});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&indices, &values, &dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(DT_INT32, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  EXPECT_EQ(dataset->name(), kOpName);
+}
+
+struct DatasetOutputDtypesTest : SparseTensorSliceDatasetOpTest,
+                                 ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetOutputDtypesTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  SparseTensorParam input_sparse_tensor = GetParam().input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs = GetParam().expected_outputs;
+  DataType tvalues = input_sparse_tensor.values.dtype();
+  gtl::InlinedVector<TensorValue, 4> inputs = {
+      &input_sparse_tensor.indices, &input_sparse_tensor.values,
+      &input_sparse_tensor.dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  DataTypeVector expected_output_dtypes = {
+      expected_outputs[0].indices.dtype(), expected_outputs[0].values.dtype(),
+      expected_outputs[0].dense_shape.dtype()};
+  TF_EXPECT_OK(
+      VerifyTypesMatch(dataset->output_dtypes(), expected_output_dtypes));
+}
+
+INSTANTIATE_TEST_CASE_P(SparseTensorDatasetSliceOpTest, DatasetOutputDtypesTest,
+                        ::testing::ValuesIn(TestCases));
+
+struct DatasetOutputShapesTest : SparseTensorSliceDatasetOpTest,
+                                 ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetOutputShapesTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  SparseTensorParam input_sparse_tensor = GetParam().input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs = GetParam().expected_outputs;
+  DataType tvalues = input_sparse_tensor.values.dtype();
+  gtl::InlinedVector<TensorValue, 4> inputs = {
+      &input_sparse_tensor.indices, &input_sparse_tensor.values,
+      &input_sparse_tensor.dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::vector<PartialTensorShape> expected_output_shapes = {
+      expected_outputs[0].indices.shape(), expected_outputs[0].values.shape(),
+      expected_outputs[0].dense_shape.shape()};
+  TF_EXPECT_OK(
+      VerifyShapesCompatible(dataset->output_shapes(), expected_output_shapes));
+}
+
+INSTANTIATE_TEST_CASE_P(SparseTensorDatasetSliceOpTest, DatasetOutputShapesTest,
+                        ::testing::ValuesIn(TestCases));
+
+struct DatasetCardinalityTest : SparseTensorSliceDatasetOpTest,
+                                ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetCardinalityTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  SparseTensorParam input_sparse_tensor = GetParam().input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs = GetParam().expected_outputs;
+  DataType tvalues = input_sparse_tensor.values.dtype();
+  gtl::InlinedVector<TensorValue, 4> inputs = {
+      &input_sparse_tensor.indices, &input_sparse_tensor.values,
+      &input_sparse_tensor.dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  EXPECT_EQ(dataset->Cardinality(), expected_outputs.size());
+}
+
+INSTANTIATE_TEST_CASE_P(SparseTensorDatasetSliceOpTest, DatasetCardinalityTest,
+                        ::testing::ValuesIn(TestCases));
+
+TEST_F(SparseTensorSliceDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  int N = 2;
+  const int NDIM = 2;
+  Tensor indices = CreateTensor<int64>(TensorShape({N, NDIM}), {0, 0, 1, 1});
+  Tensor values = CreateTensor<int32>(TensorShape({N}), {888, 999});
+  Tensor dense_shape = CreateTensor<int64>(TensorShape({NDIM}), {5, 5});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&indices, &values, &dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(DT_INT32, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(dataset->Save(serialization_ctx.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+struct IteratorOutputDtypesTest : SparseTensorSliceDatasetOpTest,
+                                  ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(IteratorOutputDtypesTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  SparseTensorParam input_sparse_tensor = GetParam().input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs = GetParam().expected_outputs;
+  DataType tvalues = input_sparse_tensor.values.dtype();
+  gtl::InlinedVector<TensorValue, 4> inputs = {
+      &input_sparse_tensor.indices, &input_sparse_tensor.values,
+      &input_sparse_tensor.dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  DataTypeVector expected_output_dtypes = {
+      expected_outputs[0].indices.dtype(), expected_outputs[0].values.dtype(),
+      expected_outputs[0].dense_shape.dtype()};
+  TF_EXPECT_OK(
+      VerifyTypesMatch(iterator->output_dtypes(), expected_output_dtypes));
+}
+
+INSTANTIATE_TEST_CASE_P(SparseTensorSliceDatasetOpTest,
+                        IteratorOutputDtypesTest,
+                        ::testing::ValuesIn(TestCases));
+
+struct IteratorOutputShapesTest : SparseTensorSliceDatasetOpTest,
+                                  ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(IteratorOutputShapesTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  SparseTensorParam input_sparse_tensor = GetParam().input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs = GetParam().expected_outputs;
+  DataType tvalues = input_sparse_tensor.values.dtype();
+  gtl::InlinedVector<TensorValue, 4> inputs = {
+      &input_sparse_tensor.indices, &input_sparse_tensor.values,
+      &input_sparse_tensor.dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  std::vector<PartialTensorShape> expected_output_shapes = {
+      expected_outputs[0].indices.shape(), expected_outputs[0].values.shape(),
+      expected_outputs[0].dense_shape.shape()};
+  TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(),
+                                      expected_output_shapes));
+}
+
+INSTANTIATE_TEST_CASE_P(SparseTensorSliceDatasetOpTest,
+                        IteratorOutputShapesTest,
+                        ::testing::ValuesIn(TestCases));
+
+TEST_F(SparseTensorSliceDatasetOpTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  int N = 2;
+  const int NDIM = 2;
+  Tensor indices = CreateTensor<int64>(TensorShape({N, NDIM}), {0, 0, 1, 1});
+  Tensor values = CreateTensor<int32>(TensorShape({N}), {888, 999});
+  Tensor dense_shape = CreateTensor<int64>(TensorShape({NDIM}), {5, 5});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&indices, &values, &dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(DT_INT32, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  EXPECT_EQ(iterator->prefix(), strings::StrCat("Iterator::SparseTensorSlice"));
+}
+
+struct IteratorRoundtripTest : SparseTensorSliceDatasetOpTest,
+                               ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(IteratorRoundtripTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  SparseTensorParam input_sparse_tensor = GetParam().input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs = GetParam().expected_outputs;
+  std::vector<int> breakpoints = GetParam().breakpoints;
+  DataType tvalues = input_sparse_tensor.values.dtype();
+  gtl::InlinedVector<TensorValue, 4> inputs = {
+      &input_sparse_tensor.indices, &input_sparse_tensor.values,
+      &input_sparse_tensor.dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+
+  int cur_iteration = 0;
+  bool end_of_sequence = false;
+  int64 num_slices = input_sparse_tensor.dense_shape.dim_size(0);
+  std::vector<Tensor> out_tensors;
+
+  for (int breakpoint : breakpoints) {
+    while (cur_iteration < breakpoint) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_ctx.get(), &out_tensors,
+                                     &end_of_sequence));
+      cur_iteration++;
+    }
+
+    if (breakpoint == 0) {
+      EXPECT_FALSE(end_of_sequence);
+    } else if (breakpoint <= num_slices) {
+      for (int i = 0; i < out_tensors.size(); ++i) {
+        TF_EXPECT_OK(ExpectEqual(out_tensors[0],
+                                 expected_outputs[cur_iteration - 1].indices));
+        TF_EXPECT_OK(ExpectEqual(out_tensors[1],
+                                 expected_outputs[cur_iteration - 1].values));
+        TF_EXPECT_OK(ExpectEqual(
+            out_tensors[2], expected_outputs[cur_iteration - 1].dense_shape));
+      }
+    } else {
+      EXPECT_TRUE(end_of_sequence);
+    }
+
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_ASSERT_OK(iterator->Save(serialization_ctx.get(), &writer));
+    TF_ASSERT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_ASSERT_OK(iterator->Restore(iterator_ctx.get(), &reader));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(SparseTensorSliceDatasetOpTest, IteratorRoundtripTest,
+                        ::testing::ValuesIn(TestCases));
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index 54dcd7eb7d1bd97fa1c58e0d3235670482bafd93..97a1ec402f2abff8627c65e14d2af39e0693afaa 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
@@ -28,7 +29,10 @@ namespace {
 class TensorSliceDatasetOp : public DatasetOpKernel {
  public:
   explicit TensorSliceDatasetOp(OpKernelConstruction* ctx)
-      : DatasetOpKernel(ctx) {}
+      : DatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("Toutput_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
     OpInputList inputs;
@@ -50,6 +54,10 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
               "All components must have the same size in the 0th dimension"));
     }
     *output = new Dataset(ctx, std::move(components));
+    OP_REQUIRES_OK(ctx,
+                   VerifyTypesMatch((*output)->output_dtypes(), output_types_));
+    OP_REQUIRES_OK(ctx, VerifyShapesCompatible((*output)->output_shapes(),
+                                               output_shapes_));
   }
 
  private:
@@ -170,6 +178,9 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
     DataTypeVector dtypes_;
     std::vector<PartialTensorShape> shapes_;
   };
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("TensorSliceDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op_test.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b0e09b45454c258e0e6e500c6bdd2ad580819276
--- /dev/null
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op_test.cc
@@ -0,0 +1,630 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "tensor_slice_dataset";
+constexpr char kOpName[] = "TensorSliceDataset";
+
+class TensorSliceDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates a new TensorSliceDataset op kernel.
+  Status CreateTensorSliceDatasetKernel(
+      DataTypeVector dtypes, std::vector<PartialTensorShape> shapes,
+      std::unique_ptr<OpKernel> *tensor_dataset_kernel) {
+    std::vector<string> components;
+    components.reserve(dtypes.size());
+    for (int i = 0; i < dtypes.size(); i++) {
+      components.emplace_back(strings::StrCat("component_", i));
+    }
+
+    node_def_ = test::function::NDef(
+        kNodeName, kOpName, components,
+        {{"Toutput_types", dtypes}, {"output_shapes", shapes}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def_, tensor_dataset_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new TensorSliceDataset op kernel context.
+  Status CreateTensorSliceDatasetContext(
+      OpKernel *const tensor_dataset_kernel,
+      gtl::InlinedVector<TensorValue, 4> *inputs,
+      std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*tensor_dataset_kernel, *inputs));
+    TF_RETURN_IF_ERROR(
+        CreateOpKernelContext(tensor_dataset_kernel, inputs, context));
+    return Status::OK();
+  }
+
+ private:
+  NodeDef node_def_;
+};
+
+struct TestParam {
+  std::vector<Tensor> components;
+  std::vector<Tensor> expected_outputs;
+  std::vector<int> breakpoints;
+} TestCases[] = {
+    // A single tuple of tensors.
+    {{{DatasetOpsTestBase::CreateTensor<int64>(TensorShape({2}), {1, 2}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({2, 2}),
+                                               {1, 2, 3, 4}),
+       DatasetOpsTestBase::CreateTensor<double>(TensorShape({2, 1}),
+                                                {37.0, 38.0}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape({2, 1}),
+                                                {"a", "b"})}},  // components
+     {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({2}), {1, 2}),
+       DatasetOpsTestBase::CreateTensor<double>(TensorShape({1}), {37.0}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape({1}), {"a"}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({2}), {3, 4}),
+       DatasetOpsTestBase::CreateTensor<double>(TensorShape({1}), {38.0}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape({1}),
+                                                {"b"})}},  // expected_outputs
+     {{0, 1, 3}}},                                         //  breakpoints
+    // Nested tensors
+    {{{DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({2, 1}),
+           {DatasetOpsTestBase::CreateTensor<double>(TensorShape({2, 2}),
+                                                     {1.0, 2.0, 3.0, 4.0}),
+            DatasetOpsTestBase::CreateTensor<double>(TensorShape({2, 2}),
+                                                     {5.0, 6.0, 7.0, 8.0})}),
+       DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({2, 1}), {DatasetOpsTestBase::CreateTensor<string>(
+                                     TensorShape({1, 2}), {"a", "b"}),
+                                 DatasetOpsTestBase::CreateTensor<string>(
+                                     TensorShape({1, 2}), {"c", "d"})}),
+       DatasetOpsTestBase::CreateTensor<int64>(
+           TensorShape({2, 3}), {1, 2, 3, 4, 5, 6})}},  // components
+     {{DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({1}), {DatasetOpsTestBase::CreateTensor<double>(
+                                 TensorShape({2, 2}), {1.0, 2.0, 3.0, 4.0})}),
+       DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({1}), {DatasetOpsTestBase::CreateTensor<string>(
+                                 TensorShape({1, 2}), {"a", "b"})}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({3}), {1, 2, 3}),
+       DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({1}), {DatasetOpsTestBase::CreateTensor<double>(
+                                 TensorShape({2, 2}), {5.0, 6.0, 7.0, 8.0})}),
+       DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({1}), {DatasetOpsTestBase::CreateTensor<string>(
+                                 TensorShape({1, 2}), {"c", "d"})}),
+       DatasetOpsTestBase::CreateTensor<int64>(
+           TensorShape({3}), {4, 5, 6})}},  // expected_outputs
+     {{0, 1, 2}}}                           // breakpoints
+};
+
+struct DatasetGetNextTest : TensorSliceDatasetOpTest,
+                            ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetGetNextTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<Tensor> components = GetParam().components;
+  std::vector<Tensor> expected_outputs = GetParam().expected_outputs;
+  size_t num_tensors_per_slice = components.size();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  DataTypeVector dtypes;
+  std::vector<PartialTensorShape> shapes;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.push_back(&component);
+    dtypes.push_back(component.dtype());
+  }
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    shapes.emplace_back(expected_outputs[i].shape());
+  }
+
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                     &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                  "Iterator", &iterator));
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  int cur_slice = 0;
+
+  while (!end_of_sequence) {
+    TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                   &end_of_sequence));
+    for (int i = 0; i < out_tensors.size(); ++i) {
+      EXPECT_LT(i + num_tensors_per_slice * cur_slice, expected_outputs.size());
+      if (out_tensors[i].dtype() == DT_VARIANT) {
+        // Currently `ExpectEqual()` does not support the variant tensor
+        // yet, so we manually cast the variant to numeric/string tensor.
+        const Tensor *output = out_tensors[i].scalar<Variant>()().get<Tensor>();
+        const Tensor *expected_output =
+            expected_outputs[i + num_tensors_per_slice * cur_slice]
+                .scalar<Variant>()()
+                .get<Tensor>();
+        TF_EXPECT_OK(ExpectEqual(*output, *expected_output));
+      } else {
+        TF_EXPECT_OK(ExpectEqual(
+            out_tensors[i],
+            expected_outputs[i + num_tensors_per_slice * cur_slice]));
+      }
+    }
+    out_tensors.clear();
+    cur_slice++;
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(TensorDatasetSliceOpTest, DatasetGetNextTest,
+                        ::testing::ValuesIn(TestCases));
+
+TEST_F(TensorSliceDatasetOpTest, DatasetName) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  Tensor t1 = CreateTensor<int64>(TensorShape({2, 2}), {1, 2, 3, 4});
+  Tensor t2 = CreateTensor<int64>(TensorShape({2, 2}), {5, 6, 7, 8});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&t1, &t2};
+  DataTypeVector dtypes({DT_INT64, DT_INT64});
+  std::vector<PartialTensorShape> shapes = {PartialTensorShape({2}),
+                                            PartialTensorShape({2})};
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  EXPECT_EQ(tensor_slice_dataset->name(), kOpName);
+}
+
+struct DatasetOutputDtypesTest : TensorSliceDatasetOpTest,
+                                 ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetOutputDtypesTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<Tensor> components = GetParam().components;
+  std::vector<Tensor> expected_outputs = GetParam().expected_outputs;
+  size_t num_tensors_per_slice = components.size();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  DataTypeVector dtypes;
+  std::vector<PartialTensorShape> shapes;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.emplace_back(&component);
+    dtypes.emplace_back(component.dtype());
+  }
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    shapes.emplace_back(expected_outputs[i].shape());
+  }
+
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  const DataTypeVector produced_output_dtypes =
+      tensor_slice_dataset->output_dtypes();
+  EXPECT_EQ(produced_output_dtypes.size(), num_tensors_per_slice);
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    EXPECT_EQ(produced_output_dtypes[i], expected_outputs[i].dtype());
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(TensorDatasetSliceOpTest, DatasetOutputDtypesTest,
+                        ::testing::ValuesIn(TestCases));
+
+struct DatasetOutputShapesTest : TensorSliceDatasetOpTest,
+                                 ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetOutputShapesTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<Tensor> components = GetParam().components;
+  std::vector<Tensor> expected_outputs = GetParam().expected_outputs;
+  size_t num_tensors_per_slice = components.size();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  DataTypeVector dtypes;
+  std::vector<PartialTensorShape> shapes;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.emplace_back(&component);
+    dtypes.emplace_back(component.dtype());
+  }
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    shapes.emplace_back(expected_outputs[i].shape());
+  }
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  const std::vector<PartialTensorShape> produced_output_shapes =
+      tensor_slice_dataset->output_shapes();
+  std::vector<PartialTensorShape> expected_output_shapes;
+  EXPECT_EQ(produced_output_shapes.size(), num_tensors_per_slice);
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    EXPECT_TRUE(
+        produced_output_shapes[i].IsIdenticalTo(expected_outputs[i].shape()));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(TensorDatasetSliceOpTest, DatasetOutputShapesTest,
+                        ::testing::ValuesIn(TestCases));
+
+struct DatasetCardinalityTest : TensorSliceDatasetOpTest,
+                                ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetCardinalityTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<Tensor> components = GetParam().components;
+  std::vector<Tensor> expected_outputs = GetParam().expected_outputs;
+  size_t num_tensors_per_slice = components.size();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  DataTypeVector dtypes;
+  std::vector<PartialTensorShape> shapes;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.emplace_back(&component);
+    dtypes.emplace_back(component.dtype());
+  }
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    shapes.emplace_back(expected_outputs[i].shape());
+  }
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  EXPECT_EQ(tensor_slice_dataset->Cardinality(), inputs[0].tensor->dim_size(0));
+}
+
+INSTANTIATE_TEST_CASE_P(TensorDatasetSliceOpTest, DatasetCardinalityTest,
+                        ::testing::ValuesIn(TestCases));
+
+TEST_F(TensorSliceDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  Tensor t1 = CreateTensor<int64>(TensorShape({2, 2}), {1, 2, 3, 4});
+  Tensor t2 = CreateTensor<int64>(TensorShape({2, 2}), {5, 6, 7, 8});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&t1, &t2};
+  DataTypeVector dtypes({DT_INT64, DT_INT64});
+  std::vector<PartialTensorShape> shapes = {PartialTensorShape({2}),
+                                            PartialTensorShape({2})};
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(
+      tensor_slice_dataset->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+struct IteratorOutputDtypesTest : TensorSliceDatasetOpTest,
+                                  ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(IteratorOutputDtypesTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<Tensor> components = GetParam().components;
+  std::vector<Tensor> expected_outputs = GetParam().expected_outputs;
+  size_t num_tensors_per_slice = components.size();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  DataTypeVector dtypes;
+  std::vector<PartialTensorShape> shapes;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.emplace_back(&component);
+    dtypes.emplace_back(component.dtype());
+  }
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    shapes.emplace_back(expected_outputs[i].shape());
+  }
+
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                     &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                  "Iterator", &iterator));
+  const DataTypeVector produced_output_dtypes = iterator->output_dtypes();
+
+  EXPECT_EQ(produced_output_dtypes.size(), num_tensors_per_slice);
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    EXPECT_EQ(produced_output_dtypes[i], expected_outputs[i].dtype());
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(TensorDatasetSliceOpTest, IteratorOutputDtypesTest,
+                        ::testing::ValuesIn(TestCases));
+
+struct IteratorOutputShapesTest : TensorSliceDatasetOpTest,
+                                  ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(IteratorOutputShapesTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<Tensor> components = GetParam().components;
+  std::vector<Tensor> expected_outputs = GetParam().expected_outputs;
+  size_t num_tensors_per_slice = components.size();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  DataTypeVector dtypes;
+  std::vector<PartialTensorShape> shapes;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.emplace_back(&component);
+    dtypes.emplace_back(component.dtype());
+  }
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    shapes.emplace_back(expected_outputs[i].shape());
+  }
+
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                     &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                  "Iterator", &iterator));
+  const std::vector<PartialTensorShape> produced_output_shapes =
+      iterator->output_shapes();
+  EXPECT_EQ(produced_output_shapes.size(), num_tensors_per_slice);
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    EXPECT_TRUE(
+        produced_output_shapes[i].IsIdenticalTo(expected_outputs[i].shape()));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(TensorDatasetSliceOpTest, IteratorOutputShapesTest,
+                        ::testing::ValuesIn(TestCases));
+
+TEST_F(TensorSliceDatasetOpTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  Tensor t1 = CreateTensor<int64>(TensorShape({2, 2}), {1, 2, 3, 4});
+  Tensor t2 = CreateTensor<int64>(TensorShape({2, 2}), {5, 6, 7, 8});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&t1, &t2};
+  DataTypeVector dtypes({DT_INT64, DT_INT64});
+  std::vector<PartialTensorShape> shapes = {PartialTensorShape({2}),
+                                            PartialTensorShape({2})};
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                     &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                  "Iterator", &iterator));
+  EXPECT_EQ(iterator->prefix(), "Iterator::TensorSlice");
+}
+
+struct IteratorRoundtripTest : TensorSliceDatasetOpTest,
+                               ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(IteratorRoundtripTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<Tensor> components = GetParam().components;
+  std::vector<Tensor> expected_outputs = GetParam().expected_outputs;
+  std::vector<int> breakpoints = GetParam().breakpoints;
+  size_t num_tensors_per_slice = components.size();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  DataTypeVector dtypes;
+  std::vector<PartialTensorShape> shapes;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.emplace_back(&component);
+    dtypes.emplace_back(component.dtype());
+  }
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    shapes.emplace_back(expected_outputs[i].shape());
+  }
+
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                     &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                  "Iterator", &iterator));
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+
+  int cur_iteration = 0;
+  bool end_of_sequence = false;
+  int64 num_slices = inputs[0].tensor->dim_size(0);
+  std::vector<Tensor> out_tensors;
+
+  for (int breakpoint : breakpoints) {
+    while (cur_iteration < breakpoint) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                     &end_of_sequence));
+      cur_iteration++;
+    }
+
+    if (breakpoint == 0) {
+      EXPECT_FALSE(end_of_sequence);
+    } else if (breakpoint <= num_slices) {
+      for (int i = 0; i < out_tensors.size(); ++i) {
+        if (out_tensors[i].dtype() == DT_VARIANT) {
+          const Tensor *output =
+              out_tensors[i].scalar<Variant>()().get<Tensor>();
+          const Tensor *expected_output =
+              expected_outputs[i + num_tensors_per_slice * (cur_iteration - 1)]
+                  .scalar<Variant>()()
+                  .get<Tensor>();
+          TF_EXPECT_OK(ExpectEqual(*output, *expected_output));
+        } else {
+          TF_EXPECT_OK(ExpectEqual(
+              out_tensors[i], expected_outputs[i + num_tensors_per_slice *
+                                                       (cur_iteration - 1)]));
+        }
+      }
+    } else {
+      EXPECT_TRUE(end_of_sequence);
+    }
+
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_ASSERT_OK(iterator->Save(serialization_context.get(), &writer));
+    TF_ASSERT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_ASSERT_OK(iterator->Restore(iterator_context.get(), &reader));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(TensorDatasetSliceOpTest, IteratorRoundtripTest,
+                        ::testing::ValuesIn(TestCases));
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc
index 6bfb5bd5bc0ae50797080ca3540133b0081f0b13..ba6369533adbbde30f3661a3d8577936de1038fa 100644
--- a/tensorflow/core/kernels/decode_csv_op.cc
+++ b/tensorflow/core/kernels/decode_csv_op.cc
@@ -145,7 +145,7 @@ class DecodeCSVOp : public OpKernel {
               output[f]->flat<float>()(i) = record_defaults[f].flat<float>()(0);
             } else {
               float value;
-              OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value),
+              OP_REQUIRES(ctx, strings::safe_strtof(fields[f], &value),
                           errors::InvalidArgument(
                               "Field ", f, " in record ", i,
                               " is not a valid float: ", fields[f]));
@@ -165,7 +165,7 @@ class DecodeCSVOp : public OpKernel {
                   record_defaults[f].flat<double>()(0);
             } else {
               double value;
-              OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value),
+              OP_REQUIRES(ctx, strings::safe_strtod(fields[f], &value),
                           errors::InvalidArgument(
                               "Field ", f, " in record ", i,
                               " is not a valid double: ", fields[f]));
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
index eaef5a6097ff5a7235caba37edf6ef94d5860931..3dd019c3d203c63f055113bb992eb1f542e838ae 100644
--- a/tensorflow/core/kernels/decode_raw_op.cc
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -110,6 +110,8 @@ REGISTER(uint8);
 REGISTER(int16);
 REGISTER(int8);
 REGISTER(int64);
+REGISTER(complex64);
+REGISTER(complex128);
 
 #undef REGISTER
 
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index a3e7b53d55ce8b46a9ac2cf81251f80691b9e139..246a6ce04d97a5dec54f2d0b44da7e278d703908 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
 namespace tensorflow {
@@ -211,6 +212,98 @@ class IfOp : public AsyncOpKernel {
   };
 };
 
+class CaseOp : public AsyncOpKernel {
+ public:
+  explicit CaseOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    auto lib = ctx->function_library();
+    OP_REQUIRES(ctx, lib != nullptr, errors::Internal("No function library"));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("branches", &branch_funcs_));
+  }
+
+  ~CaseOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    auto lib = ctx->function_library();
+    OP_REQUIRES_ASYNC(ctx, lib != nullptr,
+                      errors::Internal("No function library"), done);
+
+    // TODO(b/37549631): Because this op has `SetIsStateful()` in its op
+    // registration, this kernel may be shared by multiple subgraphs, which have
+    // different associated `FunctionLibraryRuntime` objects and hence different
+    // `FHandle` namespaces. So we must call Instantiate() to make sure we get
+    // the correct function handles with respect to `lib`. Note the underlying
+    // `lib->Instantiate()` caches the created function handles, so calling
+    // `Instantiate()` repeatedly on the same `lib` and function is cheap.
+    std::vector<FHandle> branch_handles(branch_funcs_.size());
+    for (int i = 0; i < branch_funcs_.size(); i++) {
+      OP_REQUIRES_OK_ASYNC(
+          ctx, Instantiate(lib, branch_funcs_[i], &branch_handles[i]), done);
+    }
+
+    const Tensor& branch_index = ctx->input(0);
+    OP_REQUIRES_ASYNC(ctx, TensorShapeUtils::IsScalar(branch_index.shape()),
+                      errors::InvalidArgument("branch_index must be scalar"),
+                      done);
+    int32 branch = branch_index.scalar<int32>()();
+    (new State(this, ctx, branch, branch_handles, done))->Start();
+  }
+
+ private:
+  std::vector<NameAttrList> branch_funcs_;
+
+  class State {
+   public:
+    State(CaseOp* kernel, OpKernelContext* ctx, int branch,
+          std::vector<FHandle> branch_handles, DoneCallback done)
+        : kernel_(kernel),
+          ctx_(ctx),
+          branch_(branch),
+          branch_handles_(branch_handles),
+          done_(std::move(done)),
+          lib_(CHECK_NOTNULL(ctx_->function_library())) {
+      SetRunOptions(ctx_, &opts_, true /* always_collect_stats */);
+      for (int i = 1; i < ctx_->num_inputs(); ++i) {
+        args_.push_back(ctx_->input(i));
+      }
+    }
+
+    ~State() {}
+
+    void Start() {
+      int branch = branch_;
+      // The last branch is the default branch.
+      if (branch < 0 || branch >= branch_handles_.size()) {
+        branch = branch_handles_.size() - 1;
+      }
+      rets_.clear();
+      lib_->Run(
+          // Evaluate one of the branch.
+          opts_, branch_handles_[branch], args_, &rets_,
+          // Done callback
+          [this](Status s) {
+            if (s.ok()) {
+              s = SetOutputs(kernel_, ctx_, rets_);
+            }
+            ctx_->SetStatus(s);
+            DoneCallback captured_done(std::move(done_));
+            delete this;
+            captured_done();
+          });
+    }
+
+   private:
+    CaseOp* const kernel_;
+    OpKernelContext* const ctx_;
+    const int branch_;
+    std::vector<FHandle> branch_handles_;
+    DoneCallback done_;
+    FunctionLibraryRuntime* const lib_;
+    FunctionLibraryRuntime::Options opts_;
+    TensorVec args_;
+    TensorVec rets_;
+  };
+};
+
 // TODO(drpng): remove this.
 REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_GPU).HostMemory("cond"),
@@ -219,6 +312,10 @@ REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_GPU).HostMemory("cond"),
 REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_GPU).HostMemory("cond"), IfOp);
 
+REGISTER_KERNEL_BUILDER(Name("Case").Device(DEVICE_CPU), CaseOp);
+REGISTER_KERNEL_BUILDER(
+    Name("Case").Device(DEVICE_GPU).HostMemory("branch_index"), CaseOp);
+
 REGISTER_KERNEL_BUILDER(Name("StatelessIf").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(
     Name("StatelessIf").Device(DEVICE_GPU).HostMemory("cond"), IfOp);
diff --git a/tensorflow/core/kernels/kernel_platform_strings.h b/tensorflow/core/kernels/kernel_platform_strings.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bf40c30a56577ebe21d4a4ba9bf371e30803f79
--- /dev/null
+++ b/tensorflow/core/kernels/kernel_platform_strings.h
@@ -0,0 +1,25 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Generate platform strings for libtfkernel-*
+
+#ifndef TENSORFLOW_CORE_KERNELS_KERNEL_PLATFORM_STRINGS_H_
+#define TENSORFLOW_CORE_KERNELS_KERNEL_PLATFORM_STRINGS_H_
+
+#include "tensorflow/core/platform/platform_strings.h"
+
+TF_PLATFORM_STRINGS()
+
+#endif  // TENSORFLOW_CORE_KERNELS_KERNEL_PLATFORM_STRINGS_H_
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index 12863815fb03a5b9232b6bfe7bfb97fe7ea314f4..9f090524abd05fedbfac26282c65424cff845165 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -79,12 +79,10 @@ static Status TensorListDeviceCopy(
   to->max_num_elements = from.max_num_elements;
   to->tensors.reserve(from.tensors.size());
   for (const Tensor& t : from.tensors) {
-    Tensor tmp(t.dtype());
-    // Do not copy uninitialized tensors.
+    to->tensors.emplace_back(t.dtype());
     if (t.dtype() != DT_INVALID) {
-      TF_RETURN_IF_ERROR(copy(t, &tmp));
+      TF_RETURN_IF_ERROR(copy(t, &to->tensors.back()));
     }
-    to->tensors.push_back(tmp);
   }
   return Status::OK();
 }
@@ -178,6 +176,46 @@ Status GetElementShapeFromInput(OpKernelContext* c,
   return Status::OK();
 }
 
+Status GetInputList(OpKernelContext* c, int index, const TensorList** list) {
+  if (!TensorShapeUtils::IsScalar(c->input(index).shape())) {
+    return errors::InvalidArgument("Input list must be a scalar saw: ",
+                                   c->input(index).shape().DebugString());
+  }
+  const TensorList* l = c->input(index).scalar<Variant>()().get<TensorList>();
+  if (l == nullptr) {
+    return errors::InvalidArgument(
+        "Input handle is not a list. Saw: '",
+        c->input(index).scalar<Variant>()().DebugString(), "'");
+  }
+  *list = l;
+  return Status::OK();
+}
+
+Status ForwardInputOrCreateNewList(OpKernelContext* c, int32 input_index,
+                                   int32 output_index,
+                                   const TensorList& input_list,
+                                   TensorList** output_list) {
+  // Attempt to forward the input tensor to the output if possible.
+  AllocatorAttributes attr;
+  attr.set_on_host(true);
+  std::unique_ptr<Tensor> maybe_output =
+      c->forward_input(input_index, output_index, DT_VARIANT, TensorShape{},
+                       c->input_memory_type(input_index), attr);
+  Tensor* output_tensor;
+  if (maybe_output != nullptr) {
+    // Woohoo, forwarding succeeded!
+    output_tensor = maybe_output.get();
+  } else {
+    // If forwarding is not possible allocate a new output tensor and copy
+    // the `input_list` to it.
+    TF_RETURN_IF_ERROR(
+        c->allocate_output(output_index, {}, &output_tensor, attr));
+    output_tensor->scalar<Variant>()() = input_list;
+  }
+  *output_list = output_tensor->scalar<Variant>()().get<TensorList>();
+  return Status::OK();
+}
+
 class EmptyTensorList : public OpKernel {
  public:
   explicit EmptyTensorList(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -239,11 +277,8 @@ class TensorListPushBack : public OpKernel {
                                         " but tried to append ",
                                         DataTypeString(input.dtype())));
 
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     OP_REQUIRES(c, l->element_shape.IsCompatibleWith(input.shape()),
                 errors::InvalidArgument(
                     "Tried to append a tensor with incompatible shape to a "
@@ -264,21 +299,9 @@ class TensorListPushBack : public OpKernel {
                                   " max_num_elements: ", l->max_num_elements));
     }
 
-    AllocatorAttributes attr;
-    attr.set_on_host(true);
-    std::unique_ptr<Tensor> maybe_result = c->forward_input(
-        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
-    if (maybe_result != nullptr) {
-      maybe_result->scalar<Variant>()().get<TensorList>()->tensors.push_back(
-          input);
-    } else {
-      Tensor* result;
-      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-      TensorList output;
-      output = *l;
-      output.tensors.push_back(input);
-      result->scalar<Variant>()() = std::move(output);
-    }
+    TensorList* output_list = nullptr;
+    OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list));
+    output_list->tensors.push_back(input);
   }
 
  private:
@@ -301,12 +324,8 @@ class TensorListLength : public OpKernel {
   ~TensorListLength() override {}
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(
-        c, l != nullptr,
-        errors::InvalidArgument(
-            "TensorListLength received a variant which is not a list. Saw: '",
-            c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     Tensor* result;
     OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result));
     result->scalar<int32>()() = l->tensors.size();
@@ -329,15 +348,8 @@ class TensorListElementShape : public OpKernel {
   explicit TensorListElementShape(OpKernelConstruction* c) : OpKernel(c) {}
 
   void Compute(OpKernelContext* c) override {
-    OP_REQUIRES(
-        c, c->input(0).shape().num_elements() == 1,
-        errors::InvalidArgument("List tensors are supposed to be scalars."));
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "TensorListElementShape received a variant which is not a "
-                    "list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     Tensor* result;
     if (l->element_shape.unknown_rank()) {
       OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &result));
@@ -405,12 +417,8 @@ class TensorListResize : public OpKernel {
   explicit TensorListResize(OpKernelConstruction* c) : OpKernel(c) {}
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* input_list =
-        c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, input_list != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* input_list = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &input_list));
     int32 size = c->input(1).scalar<int32>()();
     OP_REQUIRES(
         c, size >= 0,
@@ -466,11 +474,8 @@ class TensorListSetItem : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     OP_REQUIRES(c, element_dtype_ == l->element_dtype,
                 errors::InvalidArgument("Invalid data types; op elements ",
                                         DataTypeString(element_dtype_),
@@ -488,21 +493,9 @@ class TensorListSetItem : public OpKernel {
                     "list index. Item element shape: ",
                     value.shape().DebugString(),
                     " list shape: ", l->element_shape.DebugString()));
-    AllocatorAttributes attr;
-    attr.set_on_host(true);
-    std::unique_ptr<Tensor> maybe_result = c->forward_input(
-        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
-    if (maybe_result != nullptr) {
-      maybe_result->scalar<Variant>()().get<TensorList>()->tensors[index] =
-          value;
-    } else {
-      TensorList output;
-      output = *l;
-      output.tensors[index] = value;
-      Tensor* result;
-      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-      result->scalar<Variant>()() = std::move(output);
-    }
+    TensorList* output_list = nullptr;
+    OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list));
+    output_list->tensors[index] = value;
   }
 
  private:
@@ -624,50 +617,54 @@ REGISTER_KERNEL_BUILDER(Name("TensorListConcatLists").Device(DEVICE_GPU),
 
 #endif  // GOOGLE_CUDA
 
-#define REGISTER_TENSOR_LIST_OPS_CPU(T)                           \
-  REGISTER_KERNEL_BUILDER(Name("TensorListStack")                 \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListStack<CPUDevice, T>)          \
-  REGISTER_KERNEL_BUILDER(Name("TensorListGather")                \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListGather<CPUDevice, T>)         \
-  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListConcat<CPUDevice, T>)         \
-  REGISTER_KERNEL_BUILDER(Name("TensorListConcatV2")              \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListConcat<CPUDevice, T>)         \
-  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")               \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListGetItem<CPUDevice, T>)        \
-  REGISTER_KERNEL_BUILDER(Name("TensorListPopBack")               \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListPopBack<CPUDevice, T>)        \
-  REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")            \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListFromTensor<CPUDevice, T>)     \
-  REGISTER_KERNEL_BUILDER(Name("TensorListScatter")               \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListScatter<CPUDevice, T>)        \
-  REGISTER_KERNEL_BUILDER(Name("TensorListScatterV2")             \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListScatter<CPUDevice, T>)        \
-  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                 \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListSplit<CPUDevice, T>)          \
-  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
+#define REGISTER_TENSOR_LIST_OPS_CPU(T)                                    \
+  REGISTER_KERNEL_BUILDER(Name("TensorListStack")                          \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListStack<CPUDevice, T>)                   \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGather")                         \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListGather<CPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                         \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListConcat<CPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcatV2")                       \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListConcat<CPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListGetItem<CPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPopBack")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListPopBack<CPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")                     \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListFromTensor<CPUDevice, T>)              \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatter")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListScatter<CPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatterV2")                      \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListScatter<CPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatterIntoExistingList")        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListScatterIntoExistingList<CPUDevice, T>) \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                          \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListSplit<CPUDevice, T>)                   \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")                  \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
                           TensorListPushBackBatch<CPUDevice, T>)
 
 TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_OPS_CPU);
diff --git a/tensorflow/core/kernels/list_kernels.cu.cc b/tensorflow/core/kernels/list_kernels.cu.cc
index 652ca2b7e2062b6ccc09405ac5718b97fa70db04..9922a92dec39708bff2ef3566b9e264cd5e73f00 100644
--- a/tensorflow/core/kernels/list_kernels.cu.cc
+++ b/tensorflow/core/kernels/list_kernels.cu.cc
@@ -36,68 +36,73 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define REGISTER_TENSOR_LIST_OPS_GPU(T)                           \
-  REGISTER_KERNEL_BUILDER(Name("TensorListStack")                 \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("element_shape"),       \
-                          TensorListStack<GPUDevice, T>)          \
-  REGISTER_KERNEL_BUILDER(Name("TensorListGather")                \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("indices")              \
-                              .HostMemory("element_shape"),       \
-                          TensorListGather<GPUDevice, T>)         \
-  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")               \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("index")                \
-                              .HostMemory("element_shape"),       \
-                          TensorListGetItem<GPUDevice, T>)        \
-  REGISTER_KERNEL_BUILDER(Name("TensorListPopBack")               \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("element_shape"),       \
-                          TensorListPopBack<GPUDevice, T>)        \
-  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("lengths"),             \
-                          TensorListConcat<GPUDevice, T>)         \
-  REGISTER_KERNEL_BUILDER(Name("TensorListConcatV2")              \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("leading_dims")         \
-                              .HostMemory("element_shape")        \
-                              .HostMemory("lengths"),             \
-                          TensorListConcat<GPUDevice, T>)         \
-  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU),                \
-                          TensorListPushBackBatch<GPUDevice, T>)  \
-  REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")            \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("element_shape"),       \
-                          TensorListFromTensor<GPUDevice, T>)     \
-  REGISTER_KERNEL_BUILDER(Name("TensorListScatter")               \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("element_shape")        \
-                              .HostMemory("indices"),             \
-                          TensorListScatter<GPUDevice, T>)        \
-  REGISTER_KERNEL_BUILDER(Name("TensorListScatterV2")             \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("element_shape")        \
-                              .HostMemory("num_elements")         \
-                              .HostMemory("indices"),             \
-                          TensorListScatter<GPUDevice, T>)        \
-  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                 \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("element_shape")        \
-                              .HostMemory("lengths"),             \
+#define REGISTER_TENSOR_LIST_OPS_GPU(T)                                    \
+  REGISTER_KERNEL_BUILDER(Name("TensorListStack")                          \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape"),                \
+                          TensorListStack<GPUDevice, T>)                   \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGather")                         \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("indices")                       \
+                              .HostMemory("element_shape"),                \
+                          TensorListGather<GPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("index")                         \
+                              .HostMemory("element_shape"),                \
+                          TensorListGetItem<GPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPopBack")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape"),                \
+                          TensorListPopBack<GPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                         \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("lengths"),                      \
+                          TensorListConcat<GPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcatV2")                       \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("leading_dims")                  \
+                              .HostMemory("element_shape")                 \
+                              .HostMemory("lengths"),                      \
+                          TensorListConcat<GPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")                  \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU),                         \
+                          TensorListPushBackBatch<GPUDevice, T>)           \
+  REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")                     \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape"),                \
+                          TensorListFromTensor<GPUDevice, T>)              \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatter")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape")                 \
+                              .HostMemory("indices"),                      \
+                          TensorListScatter<GPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatterV2")                      \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape")                 \
+                              .HostMemory("num_elements")                  \
+                              .HostMemory("indices"),                      \
+                          TensorListScatter<GPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatterIntoExistingList")        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("indices"),                      \
+                          TensorListScatterIntoExistingList<GPUDevice, T>) \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                          \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape")                 \
+                              .HostMemory("lengths"),                      \
                           TensorListSplit<GPUDevice, T>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_OPS_GPU);
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index c25e9cecb43d4a3e1ab9fc994f9928ea13a4717c..682ea15caf94ccdca1e49e7721d3c227f0f1a4fc 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -70,6 +70,13 @@ Status GetElementShapeFromInput(OpKernelContext* c,
                                 const TensorList& tensor_list, int index,
                                 PartialTensorShape* element_shape);
 
+Status GetInputList(OpKernelContext* c, int index, const TensorList** list);
+
+Status ForwardInputOrCreateNewList(OpKernelContext* c, int32 input_index,
+                                   int32 output_index,
+                                   const TensorList& input_list,
+                                   TensorList** output_list);
+
 template <typename Device, typename T>
 class TensorListStack : public OpKernel {
  public:
@@ -81,12 +88,8 @@ class TensorListStack : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* tensor_list =
-        c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, tensor_list != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* tensor_list = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &tensor_list));
     OP_REQUIRES(
         c, element_dtype_ == tensor_list->element_dtype,
         errors::InvalidArgument(
@@ -184,14 +187,8 @@ class TensorListGetItem : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    OP_REQUIRES(
-        c, c->input(0).shape().num_elements() == 1,
-        errors::InvalidArgument("List tensors are supposed to be scalars."));
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     OP_REQUIRES(c, element_dtype_ == l->element_dtype,
                 errors::InvalidArgument("Invalid data types; op elements ",
                                         DataTypeString(element_dtype_),
@@ -255,11 +252,8 @@ class TensorListPopBack : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     OP_REQUIRES(c, element_dtype_ == l->element_dtype,
                 errors::InvalidArgument("Invalid data types; op elements ",
                                         DataTypeString(element_dtype_),
@@ -291,20 +285,10 @@ class TensorListPopBack : public OpKernel {
       functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
                                            result->flat<T>());
     }
-    AllocatorAttributes attr;
-    attr.set_on_host(true);
-    std::unique_ptr<Tensor> maybe_result = c->forward_input(
-        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
-    if (maybe_result != nullptr) {
-      maybe_result->scalar<Variant>()().get<TensorList>()->tensors.pop_back();
-    } else {
-      TensorList output;
-      output = *l;
-      output.tensors.pop_back();
-      Tensor* result;
-      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-      result->scalar<Variant>()() = std::move(output);
-    }
+
+    TensorList* output_list = nullptr;
+    OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list));
+    output_list->tensors.pop_back();
   }
 
  private:
@@ -334,12 +318,8 @@ class TensorListConcat : public OpKernel {
   void Compute(OpKernelContext* c) override {
     // Check that the input Variant tensor is indeed a TensorList and has the
     // correct element type.
-    const TensorList* tensor_list =
-        c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, tensor_list != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* tensor_list = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &tensor_list));
     OP_REQUIRES(
         c, element_dtype_ == tensor_list->element_dtype,
         errors::InvalidArgument(
@@ -597,12 +577,8 @@ class TensorListGather : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* tensor_list =
-        c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, tensor_list != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* tensor_list = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &tensor_list));
     OP_REQUIRES(
         c, element_dtype_ == tensor_list->element_dtype,
         errors::InvalidArgument(
@@ -736,6 +712,81 @@ class TensorListFromTensor : public OpKernel {
   }
 };
 
+// Scatters values in `value` into `list`. Assumes that `indices` are valid.
+template <typename Device, typename T>
+Status Scatter(OpKernelContext* c, const Tensor& value, const Tensor& indices,
+               TensorList* list) {
+  for (int index = 0; index < indices.NumElements(); ++index) {
+    const int i = indices.flat<int32>()(index);
+    Tensor tmp = value.Slice(index, index + 1);
+    TensorShape tmp_shape = tmp.shape();
+    tmp_shape.RemoveDim(0);
+    if (!tmp.CopyFrom(tmp, tmp_shape)) {
+      return errors::Unknown("Unexpected shape error.");
+    }
+    // TODO(apassos) maybe not always align; but weird compiler bugs seem to
+    // prevent this.
+    Tensor aligned;
+    TF_RETURN_IF_ERROR(c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
+    // TODO(apassos) do all slices in a single kernel invocation instead of
+    // many small ones.
+    aligned.flat<T>().device(c->eigen_device<Device>()) =
+        tmp.unaligned_flat<T>();
+    std::swap(list->tensors[i], aligned);
+  }
+  return Status::OK();
+}
+
+template <typename Device, typename T>
+class TensorListScatterIntoExistingList : public OpKernel {
+ public:
+  TensorListScatterIntoExistingList(OpKernelConstruction* c) : OpKernel(c) {}
+
+  void Compute(OpKernelContext* c) override {
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
+    const Tensor& input_tensor = c->input(1);
+    const Tensor& indices = c->input(2);
+
+    // Check that inputs are valid.
+    OP_REQUIRES(c, input_tensor.dtype() == l->element_dtype,
+                errors::InvalidArgument(
+                    "Invalid data types; input tensor type: ",
+                    DataTypeString(input_tensor.dtype()),
+                    " list element_type: ", DataTypeString(l->element_dtype)));
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_tensor.shape()),
+                errors::InvalidArgument(
+                    "Tensor must be at least a vector, but saw shape: ",
+                    input_tensor.shape().DebugString()));
+    OP_REQUIRES(c, TensorShapeUtils::IsVector(indices.shape()),
+                errors::InvalidArgument(
+                    "Expected indices to be a vector, but received shape: ",
+                    indices.shape().DebugString()));
+    OP_REQUIRES(
+        c, indices.NumElements() == input_tensor.shape().dim_size(0),
+        errors::InvalidArgument(
+            "Expected len(indices) == tensor.shape[0], but saw: ",
+            indices.NumElements(), " vs. ", input_tensor.shape().dim_size(0)));
+
+    // Resize the list if needed to accommodate all indices.
+    TensorList* output_list = nullptr;
+    OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list));
+    const auto indices_vec = indices.vec<int32>();
+    int32 max_index =
+        (indices.NumElements() == 0)
+            ? -1
+            : *std::max_element(indices_vec.data(),
+                                indices_vec.data() + indices.NumElements());
+    if (max_index + 1 > output_list->tensors.size()) {
+      output_list->tensors.resize(max_index + 1);
+    }
+
+    // Scatter the values.
+    OP_REQUIRES_OK(c,
+                   Scatter<Device, T>(c, input_tensor, indices, output_list));
+  }
+};
+
 template <typename Device, typename T>
 class TensorListScatter : public OpKernel {
  public:
@@ -798,23 +849,8 @@ class TensorListScatter : public OpKernel {
                                  Tensor(DT_INVALID));
     }
 
-    for (int index = 0; index < indices.NumElements(); ++index) {
-      const int i = indices.flat<int32>()(index);
-      Tensor tmp = input_tensor.Slice(index, index + 1);
-      TensorShape tmp_shape = tmp.shape();
-      tmp_shape.RemoveDim(0);
-      OP_REQUIRES(c, tmp.CopyFrom(tmp, tmp_shape),
-                  errors::Unknown("Unexpected shape error."));
-      // TODO(apassos) maybe not always align; but weird compiler bugs seem to
-      // prevent this.
-      Tensor aligned;
-      OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
-      // TODO(apassos) do all slices in a single kernel invocation instead of
-      // many small ondes.
-      aligned.flat<T>().device(c->eigen_device<Device>()) =
-          tmp.unaligned_flat<T>();
-      std::swap(output_list.tensors[i], aligned);
-    }
+    OP_REQUIRES_OK(c,
+                   Scatter<Device, T>(c, input_tensor, indices, &output_list));
     output_tensor->scalar<Variant>()() = std::move(output_list);
   }
 };
diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc
index 2599340d78a5308cbd63338db84e569f12541a4b..e611ae28b9a21d297cac179f24d343a4e5248ec9 100644
--- a/tensorflow/core/kernels/logging_ops.cc
+++ b/tensorflow/core/kernels/logging_ops.cc
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/kernels/logging_ops.h"
+
 #include <iostream>
+
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -48,6 +51,22 @@ Status AppendStringToFile(const std::string& fname, StringPiece data,
 
 }  // namespace
 
+namespace logging {
+
+typedef std::vector<void (*)(const char*)> Listeners;
+
+Listeners* GetListeners() {
+  static Listeners* listeners = new Listeners;
+  return listeners;
+}
+
+bool RegisterListener(void (*listener)(const char*)) {
+  GetListeners()->push_back(listener);
+  return true;
+}
+
+}  // end namespace logging
+
 class AssertOp : public OpKernel {
  public:
   explicit AssertOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -157,7 +176,12 @@ class PrintV2Op : public OpKernel {
       OP_REQUIRES_OK(ctx, AppendStringToFile(file_path_, msg, ctx->env()));
       return;
     }
-    if (output_stream_ == "stdout") {
+    auto listeners = logging::GetListeners();
+    if (!listeners->empty()) {
+      for (auto& listener : *listeners) {
+        listener(msg.c_str());
+      }
+    } else if (output_stream_ == "stdout") {
       std::cout << msg << std::endl;
     } else if (output_stream_ == "stderr") {
       std::cerr << msg << std::endl;
diff --git a/tensorflow/core/kernels/logging_ops.h b/tensorflow/core/kernels/logging_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..92a8d63409478e7a0c162ae84361f7e2215aea46
--- /dev/null
+++ b/tensorflow/core/kernels/logging_ops.h
@@ -0,0 +1,33 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+
+namespace tensorflow {
+
+namespace logging {
+
+// Register a listener method to call on any printed messages.
+// Returns true if it is successfully registered.
+bool RegisterListener(void (*listener)(const char*));
+
+}  // namespace logging
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
diff --git a/tensorflow/core/kernels/lookup_tables/BUILD b/tensorflow/core/kernels/lookup_tables/BUILD
index f4a41a5cfc06a74edd1220b345cef2275bf8f90f..a25660e987ab80de58cee05551a98d0f00ea4268 100644
--- a/tensorflow/core/kernels/lookup_tables/BUILD
+++ b/tensorflow/core/kernels/lookup_tables/BUILD
@@ -13,48 +13,77 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 
 cc_library(
-    name = "lookup_table_interface",
-    hdrs = ["lookup_table_interface.h"],
+    name = "resource_interface_templates",
+    hdrs = ["resource_interface_templates.h"],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "@com_google_absl//absl/types:optional",
-        "@com_google_absl//absl/types:span",
     ],
 )
 
 cc_library(
-    name = "table_resource_utils",
-    hdrs = ["table_resource_utils.h"],
+    name = "op_kernel_templates",
+    hdrs = ["op_kernel_templates.h"],
     deps = [
-        ":lookup_table_interface",
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:tensor_flag_utils",
+        "//third_party/eigen3",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/meta:type_traits",
+        "@com_google_absl//absl/types:span",
     ],
 )
 
-cc_library(
-    name = "table_op_utils",
-    hdrs = ["table_op_utils.h"],
+tf_kernel_library(
+    name = "fingerprint64_map_op_kernels",
+    srcs = [
+        "fingerprint64_map_op_kernels.cc",
+    ],
     deps = [
+        ":op_kernel_templates",
+        ":resource_interface_templates",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_kernel_library(
+    name = "flat_hash_map_op_kernels",
+    srcs = [
+        "flat_hash_map_op_kernels.cc",
+    ],
+    deps = [
+        ":op_kernel_templates",
+        ":resource_interface_templates",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:tensor_flag_utils",
         "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
     ],
 )
 
 tf_kernel_library(
-    name = "fingerprint64_map_ops",
+    name = "generic_table_op_kernels",
     srcs = [
-        "fingerprint64_map_ops.cc",
+        "generic_table_op_kernels.cc",
     ],
     deps = [
-        ":table_op_utils",
-        ":table_resource_utils",
+        ":op_kernel_templates",
+        ":resource_interface_templates",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core/kernels:string_view_variant_wrapper",
         "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/core/kernels/lookup_tables/fingerprint64_map_op_kernels.cc b/tensorflow/core/kernels/lookup_tables/fingerprint64_map_op_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..36274bc6b63c6efd871f360f4234133360cf8fd1
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/fingerprint64_map_op_kernels.cc
@@ -0,0 +1,124 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/lookup_tables/op_kernel_templates.h"
+#include "tensorflow/core/kernels/lookup_tables/resource_interface_templates.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+namespace tables {
+
+// Map x -> (Fingerprint64(x) % num_oov_buckets) + offset.
+// num_oov_buckets and offset are node attributes provided at construction
+// time.
+template <typename KeyType, typename ValueType>
+class Fingerprint64Map final
+    : public virtual LookupInterface<ValueType*, const KeyType&>,
+      public virtual LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                                 absl::Span<const KeyType>> {
+ public:
+  using key_type = KeyType;
+
+  Fingerprint64Map(int64 num_oov_buckets, int64 offset)
+      : num_oov_buckets_(num_oov_buckets), offset_(offset) {}
+
+  Status Lookup(const KeyType& key_to_find, ValueType* value) const override {
+    *value = LookupHelper(key_to_find);
+    return Status::OK();
+  }
+
+  Status Lookup(absl::Span<const KeyType> keys, absl::Span<ValueType> values,
+                int64 prefetch_lookahead) const override {
+    if (ABSL_PREDICT_FALSE(keys.size() != values.size())) {
+      return errors::InvalidArgument(
+          "keys and values do not have the same number of elements (found ",
+          keys.size(), " vs ", values.size(), ").");
+    }
+    for (size_t i = 0; i < keys.size(); ++i) {
+      values[i] = LookupHelper(keys[i]);
+    }
+    return Status::OK();
+  }
+
+  mutex* GetMutex() const override { return nullptr; }
+
+  string DebugString() const override { return __PRETTY_FUNCTION__; }
+
+ private:
+  ABSL_ATTRIBUTE_ALWAYS_INLINE ValueType
+  LookupHelper(const KeyType& key_to_find) const {
+    // This can cause a downcast.
+    return static_cast<ValueType>(Fingerprint64(key_to_find) %
+                                  num_oov_buckets_) +
+           offset_;
+  }
+
+  const int64 num_oov_buckets_;
+  const int64 offset_;
+  TF_DISALLOW_COPY_AND_ASSIGN(Fingerprint64Map);
+};
+
+template <typename Fingerprint64Map>
+struct Fingerprint64MapFactory {
+  struct Functor {
+    using resource_type = Fingerprint64Map;
+
+    static Status AllocateContainer(OpKernelContext* ctx, OpKernel* kernel,
+                                    Fingerprint64Map** container) {
+      int64 num_oov_buckets;
+      int64 offset;
+      TF_RETURN_IF_ERROR(
+          GetNodeAttr(kernel->def(), "num_oov_buckets", &num_oov_buckets));
+      TF_RETURN_IF_ERROR(GetNodeAttr(kernel->def(), "offset", &offset));
+      *container = new Fingerprint64Map(num_oov_buckets, offset);
+      return Status::OK();
+    }
+  };
+};
+
+template <typename KeyType, typename ValueType>
+using ResourceOp = ResourceConstructionOp<
+    typename Fingerprint64MapFactory<
+        Fingerprint64Map<KeyType, ValueType>>::Functor,
+    // These are the aliases.
+    LookupInterface<ValueType*, const KeyType&>,
+    LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                absl::Span<const KeyType>>>;
+
+#define REGISTER_STRING_KERNEL(ValueType)                     \
+  REGISTER_KERNEL_BUILDER(                                    \
+      Name("Fingerprint64Map")                                \
+          .Device(DEVICE_CPU)                                 \
+          .TypeConstraint<Variant>("heterogeneous_key_dtype") \
+          .TypeConstraint<ValueType>("table_value_dtype"),    \
+      ResourceOp<absl::string_view, ValueType>);              \
+  REGISTER_KERNEL_BUILDER(                                    \
+      Name("Fingerprint64Map")                                \
+          .Device(DEVICE_CPU)                                 \
+          .TypeConstraint<string>("heterogeneous_key_dtype")  \
+          .TypeConstraint<ValueType>("table_value_dtype"),    \
+      ResourceOp<string, ValueType>);
+
+REGISTER_STRING_KERNEL(int32);
+REGISTER_STRING_KERNEL(int64);
+
+#undef REGISTER_STRING_KERNEL
+
+}  // namespace tables
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_tables/fingerprint64_map_ops.cc b/tensorflow/core/kernels/lookup_tables/fingerprint64_map_ops.cc
deleted file mode 100644
index a000828c4b01a932b5fa5c518fb18c3b70af54e4..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/lookup_tables/fingerprint64_map_ops.cc
+++ /dev/null
@@ -1,144 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "absl/strings/string_view.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/kernels/lookup_tables/table_op_utils.h"
-#include "tensorflow/core/kernels/lookup_tables/table_resource_utils.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/fingerprint.h"
-#include "tensorflow/core/platform/macros.h"
-
-namespace tensorflow {
-namespace tables {
-
-// Map x -> (Fingerprint64(x) % num_oov_buckets) + offset.
-// num_oov_buckets and offset are node attributes provided at construction
-// time.
-template <class HeterogeneousKeyType, class ValueType>
-class Fingerprint64Map final
-    : public LookupTableInterface<HeterogeneousKeyType, ValueType> {
- public:
-  Fingerprint64Map(int64 num_oov_buckets, int64 offset)
-      : num_oov_buckets_(num_oov_buckets), offset_(offset) {}
-
-  mutex* GetMutex() const override { return nullptr; }
-
-  bool UnsafeInsertOrAssign(const HeterogeneousKeyType& key,
-                            const ValueType& value) override {
-    return true;
-  }
-
-  Status TableUnbatchedInsertStatus() const override {
-    return errors::Unimplemented("Fingerprint64Map does not support inserts.");
-  }
-
-  Status BatchInsertOrAssign(absl::Span<const HeterogeneousKeyType> keys,
-                             absl::Span<const ValueType> values) override {
-    return errors::Unimplemented("Fingerprint64Map does not support inserts.");
-  }
-
-  ValueType UnsafeLookupKey(
-      const HeterogeneousKeyType& key_to_find) const override {
-    // This can cause a downcast.
-    return static_cast<ValueType>(Fingerprint64(key_to_find) %
-                                  num_oov_buckets_) +
-           offset_;
-  }
-
-  Status TableUnbatchedLookupStatus() const override { return Status::OK(); }
-
-  Status BatchLookup(absl::Span<const HeterogeneousKeyType> keys,
-                     absl::Span<ValueType> values,
-                     int64 prefetch_lookahead) const override {
-    if (ABSL_PREDICT_FALSE(keys.size() != values.size())) {
-      return errors::InvalidArgument(
-          "keys and values do not have the same number of elements (found ",
-          keys.size(), " vs ", values.size(), ").");
-    }
-    for (size_t i = 0; i < keys.size(); ++i) {
-      values[i] = Fingerprint64Map::UnsafeLookupKey(keys[i]);
-    }
-    return Status::OK();
-  }
-
-  const absl::optional<const ValueType> DefaultValue() const override {
-    return {};
-  }
-
-  void UnsafePrefetchKey(
-      const HeterogeneousKeyType& key_to_find) const override {}
-
-  size_t UnsafeSize() const override { return 0; }
-
-  Status SizeStatus() const override {
-    return errors::Unimplemented(
-        "Fingerprint64Map does not have a concept of size.");
-  }
-
-  bool UnsafeContainsKey(
-      const HeterogeneousKeyType& key_to_find) const override {
-    return true;
-  }
-
- private:
-  const int64 num_oov_buckets_;
-  const int64 offset_;
-  TF_DISALLOW_COPY_AND_ASSIGN(Fingerprint64Map);
-};
-
-template <typename Fingerprint64Map>
-struct Fingerprint64MapFactory {
-  struct Functor {
-    template <typename ContainerBase>
-    static Status AllocateContainer(OpKernelContext* ctx, OpKernel* kernel,
-                                    ContainerBase** container) {
-      int64 num_oov_buckets;
-      int64 offset;
-      TF_RETURN_IF_ERROR(
-          GetNodeAttr(kernel->def(), "num_oov_buckets", &num_oov_buckets));
-      TF_RETURN_IF_ERROR(GetNodeAttr(kernel->def(), "offset", &offset));
-      *container = new Fingerprint64Map(num_oov_buckets, offset);
-      return Status::OK();
-    }
-  };
-};
-
-#define REGISTER_STRING_KERNEL(table_value_dtype)                             \
-  REGISTER_KERNEL_BUILDER(                                                    \
-      Name("Fingerprint64Map")                                                \
-          .Device(DEVICE_CPU)                                                 \
-          .TypeConstraint<Variant>("heterogeneous_key_dtype")                 \
-          .TypeConstraint<table_value_dtype>("table_value_dtype"),            \
-      ResourceConstructionOp<                                                 \
-          LookupTableInterface<absl::string_view, table_value_dtype>,         \
-          Fingerprint64MapFactory<Fingerprint64Map<                           \
-              absl::string_view, table_value_dtype>>::Functor>);              \
-  REGISTER_KERNEL_BUILDER(                                                    \
-      Name("Fingerprint64Map")                                                \
-          .Device(DEVICE_CPU)                                                 \
-          .TypeConstraint<string>("heterogeneous_key_dtype")                  \
-          .TypeConstraint<table_value_dtype>("table_value_dtype"),            \
-      ResourceConstructionOp<LookupTableInterface<string, table_value_dtype>, \
-                             Fingerprint64MapFactory<Fingerprint64Map<        \
-                                 string, table_value_dtype>>::Functor>);
-
-REGISTER_STRING_KERNEL(int32);
-REGISTER_STRING_KERNEL(int64);
-
-#undef REGISTER_STRING_KERNEL
-
-}  // namespace tables
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_tables/flat_hash_map_op_kernels.cc b/tensorflow/core/kernels/lookup_tables/flat_hash_map_op_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c37ca87cea58d6bd72cc2b71c9fd934eae64081
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/flat_hash_map_op_kernels.cc
@@ -0,0 +1,275 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <type_traits>
+#include "absl/base/attributes.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/lookup_tables/op_kernel_templates.h"
+#include "tensorflow/core/kernels/lookup_tables/resource_interface_templates.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/fingerprint.h"
+
+namespace tensorflow {
+namespace tables {
+
+using errors::InvalidArgument;
+
+// absl::flat_hash_map<HeterogeneousKeyType, ValueType> backed table with inline
+// fallback to x -> (Fingerprint64(x) % num_oov_buckets) + offset when looked
+// up keys are not in the flat_hash_map. Inlining the fallback table turns out
+// to be quite efficient in comparison to virtual dispatch for the fallback
+// lookup.
+template <typename ValueType>
+class StaticStringFlatHashMap final
+    : public virtual LookupInterface<ValueType*, const absl::string_view&>,
+      public virtual LookupInterface<ValueType*, const string&>,
+      public virtual LookupWithPrefetchInterface<
+          absl::Span<ValueType>, absl::Span<const absl::string_view>>,
+      public virtual LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                                 absl::Span<const string>>,
+      public virtual KeyValueTableInitializerInterface<
+          absl::Span<const ValueType>, absl::Span<const absl::string_view>>,
+      public virtual KeyValueTableInitializerInterface<
+          absl::Span<const ValueType>, absl::Span<const string>>,
+      public virtual SizeInterface {
+ public:
+  using value_type = ValueType;
+
+  StaticStringFlatHashMap(bool enable_synchronization, int64 num_oov_buckets)
+      : num_oov_buckets_(num_oov_buckets) {
+    if (enable_synchronization) {
+      mutex_ = absl::make_unique<mutex>();
+    }
+  }
+
+  Status Initialize(absl::Span<const absl::string_view> keys,
+                    absl::Span<const ValueType> values) override {
+    if (ABSL_PREDICT_FALSE(keys.size() != values.size())) {
+      return errors::InvalidArgument(
+          "keys and values do not have the same number of elements (found ",
+          keys.size(), " vs ", values.size(), ").");
+    }
+
+    table_.reserve(table_.size() + keys.size());
+    for (size_t i = 0; i < keys.size(); ++i) {
+      table_.insert_or_assign(string(keys[i]), values[i]);
+    }
+    return Status::OK();
+  }
+
+  Status Initialize(absl::Span<const string> keys,
+                    absl::Span<const ValueType> values) override {
+    if (ABSL_PREDICT_FALSE(keys.size() != values.size())) {
+      return errors::InvalidArgument(
+          "keys and values do not have the same number of elements (found ",
+          keys.size(), " vs ", values.size(), ").");
+    }
+
+    table_.reserve(table_.size() + keys.size());
+    for (size_t i = 0; i < keys.size(); ++i) {
+      table_.insert_or_assign(keys[i], values[i]);
+    }
+    return Status::OK();
+  }
+
+  Status Lookup(const absl::string_view& key, ValueType* value) const override {
+    *value = LookupHelper(key);
+    return Status::OK();
+  }
+
+  Status Lookup(const string& key, ValueType* value) const override {
+    *value = LookupHelper(key);
+    return Status::OK();
+  }
+
+  // keys and values are guaranteed to have the same size by convention.
+  Status Lookup(absl::Span<const absl::string_view> keys,
+                absl::Span<ValueType> values,
+                int64 prefetch_lookahead) const override {
+    const auto keys_size = keys.size();
+    if (prefetch_lookahead <= 0 || prefetch_lookahead >= keys_size) {
+      for (size_t i = 0; i < keys_size; ++i) {
+        values[i] = LookupHelper(keys[i]);
+      }
+    } else {
+      for (size_t i = 0; i < keys_size; ++i) {
+        if (i + prefetch_lookahead < keys.size()) {
+          table_.prefetch(keys[i + prefetch_lookahead]);
+        }
+        values[i] = LookupHelper(keys[i]);
+      }
+    }
+    return Status::OK();
+  }
+
+  // keys and values are guaranteed to have the same size by convention.
+  Status Lookup(absl::Span<const string> keys, absl::Span<ValueType> values,
+                int64 prefetch_lookahead) const override {
+    const auto keys_size = keys.size();
+    if (prefetch_lookahead <= 0 || prefetch_lookahead >= keys_size) {
+      for (size_t i = 0; i < keys_size; ++i) {
+        values[i] = LookupHelper(keys[i]);
+      }
+    } else {
+      for (size_t i = 0; i < keys_size; ++i) {
+        if (i + prefetch_lookahead < keys.size()) {
+          table_.prefetch(keys[i + prefetch_lookahead]);
+        }
+        values[i] = LookupHelper(keys[i]);
+      }
+    }
+    return Status::OK();
+  }
+
+  uint64 Size() const override { return table_.size(); }
+
+  mutex* GetMutex() const override { return mutex_.get(); }
+
+  string DebugString() const override { return __PRETTY_FUNCTION__; }
+
+ private:
+  template <typename T>
+  ABSL_ATTRIBUTE_ALWAYS_INLINE ValueType
+  LookupHelper(const T& key_to_find) const {
+    auto it = table_.find(key_to_find);
+    if (it != table_.end()) {
+      return it->second;
+    } else {
+      return static_cast<ValueType>(Fingerprint64(key_to_find) %
+                                    num_oov_buckets_) +
+             StaticStringFlatHashMap::Size();
+    }
+  }
+
+  const int64 num_oov_buckets_;
+  std::unique_ptr<mutex> mutex_;
+  // The underlying table.
+  absl::flat_hash_map<string, ValueType> table_;
+  TF_DISALLOW_COPY_AND_ASSIGN(StaticStringFlatHashMap);
+};
+
+// Used to allocate StaticStringFlatHashMap objects via the AllocateContainer
+// method.
+template <typename StaticStringFlatHashMap>
+struct StaticStringFlatHashMapFactory {
+  struct Functor {
+    using resource_type = StaticStringFlatHashMap;
+
+    template <typename StaticStringFlatHashMapBase>
+    static Status AllocateContainer(OpKernelContext* ctx, OpKernel* kernel,
+                                    StaticStringFlatHashMapBase** container) {
+      OpInputList table_int64_args;
+      TF_RETURN_IF_ERROR(
+          ctx->input_list("table_int64_args", &table_int64_args));
+      const size_t variadic_arg_size = table_int64_args.size();
+      if (ABSL_PREDICT_FALSE(variadic_arg_size != 2)) {
+        return errors::InvalidArgument(
+            "table_int64_args should have 2 elements (found ",
+            variadic_arg_size,
+            "). Set the first element to 1 to enable synchronized table use "
+            "and to 0 otherwise. The second element should be "
+            "num_oov_buckets.");
+      }
+
+      const bool enable_synchronization = ctx->input(0).scalar<int64>()() != 0;
+      const int64 num_oov_buckets = ctx->input(1).scalar<int64>()();
+      if (ABSL_PREDICT_FALSE(num_oov_buckets <= 0)) {
+        return errors::InvalidArgument(
+            "num_oov_buckets must be positive. Found: ", num_oov_buckets);
+      }
+      auto* non_virtual_container =
+          new StaticStringFlatHashMap(enable_synchronization, num_oov_buckets);
+      *container = non_virtual_container;
+      const Tensor& keys = ctx->input(table_int64_args.size());
+      const Tensor& values = ctx->input(table_int64_args.size() + 1);
+      if (keys.NumElements() == 0) {
+        return Status::OK();
+      } else if (keys.dtype() == DT_STRING) {
+        return Functor::Initialize(
+            keys.flat<string>(),
+            values.flat<typename StaticStringFlatHashMap::value_type>(),
+            non_virtual_container);
+      } else if (keys.dtype() == DT_VARIANT) {
+        auto keys_flat = keys.flat<Variant>();
+        if (keys_flat(0).get<absl::string_view>() == nullptr) {
+          return errors::InvalidArgument(
+              "Variant keys tensor must have subtype absl::string_view.");
+        }
+        return Functor::Initialize(
+            keys.flat<Variant>(),
+            values.flat<typename StaticStringFlatHashMap::value_type>(),
+            non_virtual_container);
+      }
+      return errors::InvalidArgument(
+          "keys tensor must have type DT_STRING or type DT_VARIANT with "
+          "subtype absl::string_view.");
+    }
+
+    static Status Initialize(
+        const absl::Span<const string> keys,
+        const absl::Span<const typename StaticStringFlatHashMap::value_type>
+            values,
+        StaticStringFlatHashMap* container) {
+      return container->Initialize(keys, values);
+    }
+
+    static Status Initialize(
+        const absl::Span<const Variant> keys,
+        const absl::Span<const typename StaticStringFlatHashMap::value_type>
+            values,
+        StaticStringFlatHashMap* container) {
+      std::vector<typename absl::string_view> keys_vec;
+      keys_vec.reserve(keys.size());
+      for (size_t i = 0; i < keys.size(); ++i) {
+        keys_vec.push_back(*keys[i].get<absl::string_view>());
+      }
+      return container->Initialize(keys_vec, values);
+    }
+  };
+};
+
+template <typename ValueType>
+using ResourceOp = ResourceConstructionOp<
+    typename StaticStringFlatHashMapFactory<
+        StaticStringFlatHashMap<ValueType>>::Functor,
+    // These are the aliases.
+    LookupInterface<ValueType*, const absl::string_view&>,
+    LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                absl::Span<const absl::string_view>>,
+    LookupInterface<ValueType*, const string&>,
+    LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                absl::Span<const string>>,
+    SizeInterface>;
+
+#define REGISTER_STRING_KERNEL(table_value_dtype)                  \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("StaticStringFlatHashMap")                              \
+          .Device(DEVICE_CPU)                                      \
+          .TypeConstraint<Variant>("heterogeneous_key_dtype")      \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"), \
+      ResourceOp<table_value_dtype>);
+
+REGISTER_STRING_KERNEL(int32);
+REGISTER_STRING_KERNEL(int64);
+
+#undef REGISTER_STRING_KERNEL
+
+}  // namespace tables
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_tables/generic_table_op_kernels.cc b/tensorflow/core/kernels/lookup_tables/generic_table_op_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bb29afd19a3fd2b03171e6a3d97555e34d3b35b
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/generic_table_op_kernels.cc
@@ -0,0 +1,227 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <type_traits>
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/kernels/lookup_tables/op_kernel_templates.h"
+#include "tensorflow/core/kernels/lookup_tables/resource_interface_templates.h"
+#include "tensorflow/core/kernels/string_view_variant_wrapper.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace tables {
+
+template <typename KeyType, typename ValueType>
+struct TensorInsertFactory {
+  class Functor {
+   public:
+    // If KeyType is not 'valid' then use the value it wraps as the table key
+    // type.
+    using resource_type = InsertOrAssignInterface<
+        absl::Span<const ValueType>,
+        typename absl::conditional_t<
+            IsValidDataType<KeyType>::value, absl::Span<const KeyType>,
+            absl::Span<const typename KeyType::value_type>>>;
+
+    static Status TensorInsert(const Tensor& keys, const Tensor& values,
+                               resource_type* table) {
+      if (keys.NumElements() != values.NumElements()) {
+        return errors::InvalidArgument(
+            "OpKernel tried to map keys vector of size ", keys.NumElements(),
+            " to values vector of size ", values.NumElements());
+      }
+      return TensorInsertHelper(keys, values, table);
+    }
+
+   private:
+    // keys and *values arguments to TensorInsert must have the same number of
+    // elements. This is guaranteed above.
+
+    // 'Simple' types below are types which are natively supported in TF.
+    // Non-variant KeyType which is the same as Container::key_type.
+    // No need to static_cast.
+    template <typename SfinaeArg = KeyType>
+    static absl::enable_if_t<IsValidDataType<SfinaeArg>::value, Status>
+    TensorInsertHelper(const Tensor& keys, const Tensor& values,
+                       resource_type* table) {
+      return table->InsertOrAssign(keys.flat<KeyType>(),
+                                   values.flat<ValueType>());
+    }
+
+    // Variant KeyType; the wrapped type is convertible to
+    // Container::key_type.
+    template <typename VariantSubType = KeyType>
+    static absl::enable_if_t<!IsValidDataType<VariantSubType>::value, Status>
+    TensorInsertHelper(const Tensor& keys, const Tensor& values,
+                       resource_type* table) {
+      const auto keys_flat = keys.flat<Variant>();
+      std::vector<typename VariantSubType::value_type> keys_vec;
+      keys_vec.reserve(keys_flat.size());
+      for (size_t i = 0; i < keys_flat.size(); ++i) {
+        keys_vec.emplace_back(
+            *keys_flat(i).get<typename VariantSubType::value_type>());
+      }
+      return table->InsertOrAssign(keys_vec, values.flat<ValueType>());
+    }
+  };
+};
+
+template <typename KeyType, typename ValueType>
+using InsertOp = LookupTableInsertOp<
+    typename TensorInsertFactory<KeyType, ValueType>::Functor>;
+
+template <typename KeyType, typename ValueType>
+struct TensorLookupFactory {
+  class Functor {
+   public:
+    // If KeyType is not 'valid' then use the value it wraps as the table key
+    // type.
+    using resource_type = LookupWithPrefetchInterface<
+        absl::Span<ValueType>,
+        typename absl::conditional_t<
+            IsValidDataType<KeyType>::value, absl::Span<const KeyType>,
+            absl::Span<const typename KeyType::value_type>>>;
+
+    static Status TensorLookup(const resource_type& table, const Tensor& keys,
+                               const int64 prefetch_lookahead,
+                               const int64 num_keys_per_thread,
+                               thread::ThreadPool* threadpool, Tensor* values) {
+      if (keys.NumElements() != values->NumElements()) {
+        return errors::InvalidArgument(
+            "OpKernel tried to map keys vector of size ", keys.NumElements(),
+            " to values vector of size ", values->NumElements());
+      }
+      return TensorLookupHelper(table, keys, prefetch_lookahead,
+                                num_keys_per_thread, threadpool, values);
+    }
+
+   private:
+    // keys and *values arguments to TensorLookup must have the same number of
+    // elements. This is guaranteed above.
+
+    // 'Simple' types below are types which are natively supported in TF.
+    template <typename SfinaeArg = KeyType>
+    static absl::enable_if_t<IsValidDataType<SfinaeArg>::value, Status>
+    TensorLookupHelper(const resource_type& table, const Tensor& keys,
+                       const int64 prefetch_lookahead,
+                       const int64 num_keys_per_thread,
+                       thread::ThreadPool* threadpool, Tensor* values) {
+      const auto keys_flat = keys.flat<KeyType>();
+      auto key_span = absl::MakeSpan(keys_flat);
+      auto value_span = absl::MakeSpan(values->flat<ValueType>().data(),
+                                       values->NumElements());
+      return MultithreadedTensorLookup(table, prefetch_lookahead,
+                                       num_keys_per_thread, key_span,
+                                       value_span, threadpool);
+    }
+
+    // Non-simple KeyType. We'll try an implicit conversion to
+    // Container::key_type.
+    template <typename VariantSubType = KeyType>
+    static absl::enable_if_t<!IsValidDataType<VariantSubType>::value, Status>
+    TensorLookupHelper(const resource_type& table, const Tensor& keys,
+                       const int64 prefetch_lookahead,
+                       const int64 num_keys_per_thread,
+                       thread::ThreadPool* threadpool, Tensor* values) {
+      const auto keys_flat = keys.flat<Variant>();
+      std::vector<typename VariantSubType::value_type> keys_vec;
+      const auto keys_size = keys_flat.size();
+      keys_vec.reserve(keys_size);
+      for (size_t i = 0; i < keys_size; ++i) {
+        keys_vec.emplace_back(*keys_flat(i).get<VariantSubType>()->get());
+      }
+      absl::Span<const typename VariantSubType::value_type> key_span(keys_vec);
+      auto value_span = absl::MakeSpan(values->flat<ValueType>().data(),
+                                       values->NumElements());
+      return MultithreadedTensorLookup(table, prefetch_lookahead,
+                                       num_keys_per_thread, key_span,
+                                       value_span, threadpool);
+    }
+
+    // Wrapper around table.BatchLookup which permits sharding across cores.
+    template <typename K, typename V>
+    static Status MultithreadedTensorLookup(const resource_type& table,
+                                            int64 prefetch_lookahead,
+                                            int64 num_keys_per_thread, K keys,
+                                            V values,
+                                            thread::ThreadPool* threadpool) {
+      mutex temp_mutex;  // Protect status.
+      Status status;
+      auto lookup_keys = [&](int64 begin, int64 end) {
+        auto temp_status = table.Lookup(keys.subspan(begin, end - begin),
+                                        values.subspan(begin, end - begin),
+                                        prefetch_lookahead);
+        if (ABSL_PREDICT_FALSE(!temp_status.ok())) {
+          mutex_lock lock(temp_mutex);
+          status.Update(temp_status);
+        }
+      };
+      threadpool->TransformRangeConcurrently(
+          num_keys_per_thread /* block_size */, keys.size(), lookup_keys);
+      return status;
+    }
+  };
+};
+
+template <typename KeyType, typename ValueType>
+using LookupOp = LookupTableFindOp<
+    typename TensorLookupFactory<KeyType, ValueType>::Functor>;
+
+struct TableSizeFunctor {
+  using resource_type = SizeInterface;
+
+  static Status Size(const SizeInterface& table, uint64* size) {
+    *size = table.Size();
+    return Status::OK();
+  }
+};
+
+#define REGISTER_STRING_KERNEL(table_value_dtype)                     \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("LookupTableInsertOrAssignOp")                             \
+          .Device(DEVICE_CPU)                                         \
+          .TypeConstraint<string>("insert_key_tensor_dtype")          \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"),    \
+      InsertOp<string, table_value_dtype>);                           \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("LookupTableInsertOrAssignOp")                             \
+          .Device(DEVICE_CPU)                                         \
+          .TypeConstraint<Variant>("insert_key_tensor_dtype")         \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"),    \
+      InsertOp<StringViewVariantWrapper, table_value_dtype>);         \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("LookupTableFindOp")                                       \
+          .Device(DEVICE_CPU)                                         \
+          .TypeConstraint<string>("lookup_key_tensor_dtype")          \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"),    \
+      LookupOp<string, table_value_dtype>);                           \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("LookupTableFindOp")                                       \
+          .Device(DEVICE_CPU)                                         \
+          .TypeConstraint<Variant>("lookup_key_tensor_dtype")         \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"),    \
+      LookupOp<StringViewVariantWrapper, table_value_dtype>);         \
+  REGISTER_KERNEL_BUILDER(Name("ContainerSizeOp").Device(DEVICE_CPU), \
+                          ContainerSizeOp<TableSizeFunctor>);
+
+REGISTER_STRING_KERNEL(int32);
+REGISTER_STRING_KERNEL(int64);
+
+#undef REGISTER_STRING_KERNEL
+
+}  // namespace tables
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_tables/lookup_table_interface.h b/tensorflow/core/kernels/lookup_tables/lookup_table_interface.h
deleted file mode 100644
index 0cfe44eda79170c8b140ba6401c95542cbd0b5b5..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/lookup_tables/lookup_table_interface.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_LOOKUP_TABLE_INTERFACE_H_
-#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_LOOKUP_TABLE_INTERFACE_H_
-
-#include <cstddef>
-#include <string>
-
-#include "absl/types/optional.h"
-#include "absl/types/span.h"
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/mutex.h"
-
-namespace tensorflow {
-namespace tables {
-
-// Interface for key-value pair lookups with support for heterogeneous keys.
-// This class contains two main kinds of methods: methods which operate on
-// a batch of inputs and methods which do not. The latter have the prefix
-// 'Unsafe'. Clients must call the corresponding status methods to determine
-// whether they are safe to call within a code block.
-// Implementations must guarantee thread-safety when GetMutex is used to
-// synchronize method access.
-template <typename HeterogeneousKeyType, typename ValueType>
-class LookupTableInterface : public ResourceBase {
- public:
-  using heterogeneous_key_type = HeterogeneousKeyType;
-  using value_type = ValueType;
-  using key_type = heterogeneous_key_type;
-
-  // Return value should be used to synchronize read/write access to
-  // all public methods. If null, no synchronization is needed.
-  virtual mutex* GetMutex() const = 0;
-
-  // Insert the KV pair into the underlying table. If a key equivalent to key
-  // already exists in the underlying table, its corresponding value is
-  // overridden. Returns true only if the key was inserted for the first time.
-  // Undefined if TableUnbatchedInsertStatus() != OK.
-  virtual bool UnsafeInsertOrAssign(const HeterogeneousKeyType& key,
-                                    const ValueType& value) = 0;
-
-  // Returns OK if it is safe to call InsertOrAssign.
-  // Once OK is returned, it is safe to call InsertOrAssign for the rest of the
-  // program.
-  virtual Status TableUnbatchedInsertStatus() const TF_MUST_USE_RESULT = 0;
-
-  // Stores each KV pair {keys[i], values[i]} in the underlying map, overriding
-  // pre-existing pairs which have equivalent keys.
-  // keys and values should have the same size.
-  virtual Status BatchInsertOrAssign(
-      absl::Span<const HeterogeneousKeyType> keys,
-      absl::Span<const ValueType> values) = 0;
-
-  // Prefetch key_to_find into implementation defined data caches.
-  // Implementations are free to leave this a no-op.
-  // Undefined if TableUnbatchedLookupStatus() != OK.
-  virtual void UnsafePrefetchKey(
-      const HeterogeneousKeyType& key_to_find) const {}
-
-  // Returns true if and only if the table contains key_to_find.
-  // Undefined if TableUnbatchedLookupStatus() != OK.
-  virtual bool UnsafeContainsKey(
-      const HeterogeneousKeyType& key_to_find) const = 0;
-
-  // Lookup the value for key_to_find. This value must always be well-defined,
-  // even when ContainsKey(key_to_find) == false. When
-  // dv = DefaultValue() != absl::nullopt and ContainsKey(key_to_find) == false,
-  // dv is returned.
-  // Undefined if TableUnbatchedLookupStatus() != OK.
-  virtual ValueType UnsafeLookupKey(
-      const HeterogeneousKeyType& key_to_find) const = 0;
-
-  // Returns OK if it is safe to call PrefetchKey, ContainsKey, and
-  // UnsafeLookupKey.
-  // If OK is returned, it is safe to call these methods until the next
-  // non-const method of this class is called.
-  virtual Status TableUnbatchedLookupStatus() const TF_MUST_USE_RESULT = 0;
-
-  // Lookup the values for keys and store them in values.
-  // prefetch_lookahead is used to prefetch the key at index
-  // i + prefetch_lookahead at the ith iteration of the implemented loop.
-  // keys and values must have the same size.
-  virtual Status BatchLookup(absl::Span<const HeterogeneousKeyType> keys,
-                             absl::Span<ValueType> values,
-                             int64 prefetch_lookahead) const = 0;
-
-  // Returns the number of elements in the table.
-  // Undefined if SizeStatus() != OK.
-  virtual size_t UnsafeSize() const = 0;
-
-  // Returns OK if the return value of UnsafeSize() is always well-defined.
-  virtual Status SizeStatus() const TF_MUST_USE_RESULT = 0;
-
-  // If non-null value is returned, LookupKey returns that value only for keys
-  // which satisfy ContainsKey(key_to_find) == false.
-  virtual const absl::optional<const ValueType> DefaultValue() const = 0;
-
-  string DebugString() const override { return "A lookup table"; }
-
-  ~LookupTableInterface() override = default;
-};
-
-}  // namespace tables
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_LOOKUP_TABLE_INTERFACE_H_
diff --git a/tensorflow/core/kernels/lookup_tables/op_kernel_templates.h b/tensorflow/core/kernels/lookup_tables/op_kernel_templates.h
new file mode 100644
index 0000000000000000000000000000000000000000..d767ca0661e1fad285729b6b68683395908b4096
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/op_kernel_templates.h
@@ -0,0 +1,448 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_OP_KERNEL_TEMPLATES_H_
+#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_OP_KERNEL_TEMPLATES_H_
+
+#include <cstddef>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "absl/base/thread_annotations.h"
+#include "absl/meta/type_traits.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/kernels/tensor_flag_utils.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace tables {
+
+// Create resources of type ResourceType and AliasesToRegister using
+// Functor::AllocateContainer(OpKernelConstruction*, OpKernel*,
+// ResourceType**). ResourceType = Functor::resource_type.
+// No-op for resources which have already been created.
+template <typename Functor, typename... AliasesToRegister>
+class ResourceConstructionOp : public OpKernel {
+ public:
+  explicit ResourceConstructionOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), table_handle_set_(false) {
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("use_node_name_sharing", &use_node_name_sharing_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    mutex_lock l(mu_);
+
+    if (!table_handle_set_) {
+      OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
+                                      use_node_name_sharing_));
+    }
+
+    auto creator = [ctx,
+                    this](ResourceType** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      ResourceType* resource = nullptr;
+      auto status = Functor::AllocateContainer(ctx, this, &resource);
+      if (ABSL_PREDICT_FALSE(!status.ok())) {
+        // Ideally resource is non-null only if status is OK but we try
+        // to compensate here.
+        if (resource != nullptr) {
+          resource->Unref();
+        }
+        return status;
+      }
+      if (ctx->track_allocations()) {
+        ctx->record_persistent_memory_allocation(resource->MemoryUsed());
+      }
+      *ret = resource;
+      return Status::OK();
+    };
+
+    // Register the ResourceType alias.
+    ResourceType* resource = nullptr;
+    core::ScopedUnref unref_me(resource);
+    OP_REQUIRES_OK(
+        ctx,
+        cinfo_.resource_manager()->template LookupOrCreate<ResourceType, true>(
+            cinfo_.container(), cinfo_.name(), &resource, creator));
+
+    // Put a handle to resource in the output tensor (the other aliases will
+    // have the same handle).
+    Tensor* handle;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
+    handle->scalar<ResourceHandle>()() = MakeResourceHandle<ResourceType>(
+        ctx, cinfo_.container(), cinfo_.name());
+    table_handle_set_ = true;
+
+    // Create other alias resources.
+    Status status;
+    int dummy[sizeof...(AliasesToRegister)] = {
+        (status.Update(RegisterAlias<AliasesToRegister>(resource)), 0)...};
+    (void)dummy;
+    OP_REQUIRES_OK(ctx, status);
+  }
+
+  ~ResourceConstructionOp() override {
+    // If the table object was not shared, delete it.
+    if (table_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
+      if (!cinfo_.resource_manager()
+               ->template Delete<ResourceType>(cinfo_.container(),
+                                               cinfo_.name())
+               .ok()) {
+        // Do nothing; the resource may have been deleted by session resets.
+      }
+      // Attempt to delete other resource aliases.
+      Status dummy_status;
+      int dummy[sizeof...(AliasesToRegister)] = {
+          (dummy_status.Update(DeleteAlias<AliasesToRegister>()), 0)...};
+      (void)dummy;
+    }
+  }
+
+ private:
+  using ResourceType = typename Functor::resource_type;
+  template <typename T>
+  Status RegisterAlias(ResourceType* resource) {
+    auto creator = [resource](T** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      *ret = resource;
+      return Status::OK();
+    };
+
+    T* alias_resource = nullptr;
+    core::ScopedUnref unref_me(alias_resource);
+    return cinfo_.resource_manager()->template LookupOrCreate<T, true>(
+        cinfo_.container(), cinfo_.name(), &alias_resource, creator);
+  }
+
+  template <typename T>
+  Status DeleteAlias() {
+    return cinfo_.resource_manager()->template Delete<T>(cinfo_.container(),
+                                                         cinfo_.name());
+  }
+
+  mutex mu_;
+  bool table_handle_set_ GUARDED_BY(mu_);
+  ContainerInfo cinfo_;
+  bool use_node_name_sharing_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ResourceConstructionOp);
+};
+
+// Create resources of type ContainerBase using the static method
+// Functor::AllocateContainer(OpKernelConstruction*, OpKernel*,
+// FallbackTableBaseType*, ContainerBase**)
+// If the resource has already been created it will be looked up.
+// Container must decrease the reference count of the FallbackTableBaseType*
+// constructor argument before its destructor completes.
+template <typename Functor, typename... AliasesToRegister>
+class TableWithFallbackConstructionOp : public OpKernel {
+ public:
+  explicit TableWithFallbackConstructionOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), table_handle_set_(false) {
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("use_node_name_sharing", &use_node_name_sharing_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    OpInputList table_int64_args;
+    OP_REQUIRES_OK(ctx, ctx->input_list("table_int64_args", &table_int64_args));
+    if (ctx->num_inputs() == table_int64_args.size()) {
+      ctx->SetStatus(errors::InvalidArgument(
+          "Expected op to have a resource input after the table_int64_args "
+          "input but no such input found."));
+      return;
+    }
+
+    // Look up the fallback table.
+    FallbackTableBaseType* fallback_table = nullptr;
+    {
+      const Tensor& table_handle = ctx->input(table_int64_args.size());
+      ResourceHandle handle(table_handle.scalar<ResourceHandle>()());
+      OP_REQUIRES_OK(
+          ctx, ctx->resource_manager()->Lookup<FallbackTableBaseType, true>(
+                   handle.container(), handle.name(), &fallback_table));
+    }
+    mutex_lock l(mu_);
+
+    if (!table_handle_set_) {
+      OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
+                                      use_node_name_sharing_));
+    }
+
+    auto creator = [ctx, this, fallback_table](
+                       ResourceType** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      // container construction logic can't be merged with
+      // ResourceConstructionOp because Container constructor requires an
+      // input which can only be constructed if the resource manager
+      // internal lock is not already held.
+      ResourceType* resource = nullptr;
+      auto status =
+          Functor::AllocateContainer(ctx, this, fallback_table, &resource);
+      if (ABSL_PREDICT_FALSE(!status.ok())) {
+        // Ideally resource is non-null only if status is OK but we try
+        // to compensate here.
+        if (resource != nullptr) {
+          resource->Unref();
+        }
+        return status;
+      }
+      if (ctx->track_allocations()) {
+        ctx->record_persistent_memory_allocation(resource->MemoryUsed());
+      }
+      *ret = resource;
+      return Status::OK();
+    };
+
+    // Register the ResourceType alias.
+    ResourceType* table = nullptr;
+    core::ScopedUnref unref_me(table);
+    OP_REQUIRES_OK(
+        ctx,
+        cinfo_.resource_manager()->template LookupOrCreate<ResourceType, true>(
+            cinfo_.container(), cinfo_.name(), &table, creator));
+
+    // Put a handle to resource in the output tensor (the other aliases will
+    // have the same handle).
+    Tensor* handle;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
+    handle->scalar<ResourceHandle>()() = MakeResourceHandle<ResourceType>(
+        ctx, cinfo_.container(), cinfo_.name());
+    table_handle_set_ = true;
+
+    // Create other alias resources.
+    Status status;
+    int dummy[sizeof...(AliasesToRegister)] = {
+        (status.Update(RegisterAlias<AliasesToRegister>(table)), 0)...};
+    (void)dummy;
+    OP_REQUIRES_OK(ctx, status);
+  }
+
+  ~TableWithFallbackConstructionOp() override {
+    // If the table object was not shared, delete it.
+    if (table_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
+      if (!cinfo_.resource_manager()
+               ->template Delete<ResourceType>(cinfo_.container(),
+                                               cinfo_.name())
+               .ok()) {
+        // Do nothing; the resource may have been deleted by session resets.
+      }
+      // Attempt to delete other resource aliases.
+      Status dummy_status;
+      int dummy[sizeof...(AliasesToRegister)] = {
+          (dummy_status.Update(DeleteAlias<AliasesToRegister>()), 0)...};
+      (void)dummy;
+    }
+  }
+
+ private:
+  using ResourceType = typename Functor::resource_type;
+  using FallbackTableBaseType = typename Functor::fallback_table_type;
+
+  template <typename T>
+  Status RegisterAlias(ResourceType* resource) {
+    auto creator = [resource](T** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      *ret = resource;
+      return Status::OK();
+    };
+
+    T* alias_resource = nullptr;
+    core::ScopedUnref unref_me(alias_resource);
+    return cinfo_.resource_manager()->template LookupOrCreate<T, true>(
+        cinfo_.container(), cinfo_.name(), &alias_resource, creator);
+  }
+
+  template <typename T>
+  Status DeleteAlias() {
+    return cinfo_.resource_manager()->template Delete<T>(cinfo_.container(),
+                                                         cinfo_.name());
+  }
+
+  mutex mu_;
+  bool table_handle_set_ GUARDED_BY(mu_);
+  ContainerInfo cinfo_;
+  bool use_node_name_sharing_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TableWithFallbackConstructionOp);
+};
+
+// Lookup a table of type ResourceAlias and insert the passed in keys and
+// values tensors using Functor::TensorInsert(keys, values, table).
+template <typename Functor,
+          typename ResourceAlias = typename Functor::resource_type>
+class LookupTableInsertOp : public OpKernel {
+ public:
+  explicit LookupTableInsertOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    OpInputList table_int64_args;
+    OP_REQUIRES_OK(ctx, ctx->input_list("table_int64_args", &table_int64_args));
+    const size_t tensor_index_offset = table_int64_args.size();
+    // Business logic for checking tensor shapes, etc, is delegated to the
+    // Functor.
+    const Tensor& keys = ctx->input(tensor_index_offset + 1);
+    const Tensor& values = ctx->input(tensor_index_offset + 2);
+
+    const Tensor& table_handle = ctx->input(tensor_index_offset);
+    ResourceHandle handle(table_handle.scalar<ResourceHandle>()());
+    ResourceAlias* table;
+    core::ScopedUnref unref_me(table);
+    OP_REQUIRES_OK(ctx, ctx->resource_manager()->Lookup<ResourceAlias, true>(
+                            handle.container(), handle.name(), &table));
+
+    int memory_used_before = 0;
+    if (ctx->track_allocations()) {
+      memory_used_before = table->MemoryUsed();
+    }
+    auto* mutex = table->GetMutex();
+    if (mutex != nullptr) {
+      mutex_lock lock(*mutex);
+      OP_REQUIRES_OK(ctx, Functor::TensorInsert(keys, values, table));
+    } else {
+      OP_REQUIRES_OK(ctx, Functor::TensorInsert(keys, values, table));
+    }
+    if (ctx->track_allocations()) {
+      ctx->record_persistent_memory_allocation(table->MemoryUsed() -
+                                               memory_used_before);
+    }
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(LookupTableInsertOp);
+};
+
+// Lookup a table of type ResourceAlias and look up the passed in keys using
+// Functor::TensorLookup(
+//     table, keys, prefetch_lookahead, num_keys_per_thread, threadpool, out).
+template <typename Functor,
+          typename ResourceAlias = typename Functor::resource_type>
+class LookupTableFindOp : public OpKernel {
+ public:
+  explicit LookupTableFindOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    OpInputList table_int64_args;
+    {
+      auto status = ctx->input_list("table_int64_args", &table_int64_args);
+      if (ABSL_PREDICT_FALSE(!status.ok())) {
+        ctx->SetStatus(status);
+        return;
+      }
+    }
+    // We lookup tensors using positional indices because that's more
+    // efficient than looking up their string names.
+    const Tensor& prefetch_lookahead_t = ctx->input(0);
+    const size_t tensor_index_offset = table_int64_args.size();
+    const Tensor& keys = ctx->input(tensor_index_offset + 1);
+    const Tensor& num_threads = ctx->input(tensor_index_offset + 2);
+
+    TensorShape output_shape = keys.shape();
+    Tensor* out;
+    {
+      auto status = ctx->allocate_output(0, output_shape, &out);
+      if (ABSL_PREDICT_FALSE(!status.ok())) {
+        ctx->SetStatus(status);
+        return;
+      }
+    }
+
+    int64 num_threads_scalar;
+    if (TensorShapeUtils::IsScalar(num_threads.shape())) {
+      num_threads_scalar = num_threads.template scalar<int64>()();
+    } else {
+      // Scans through rows of num_threads and returns second entry of first
+      // row whose first entry is <= the number of keys to process.
+      // This allows the user to control parallelism as a function of
+      // the number of keys to lookup.
+      num_threads_scalar = tensor_flag_utils::FindConfigValueForKey<int64, int>(
+          num_threads.template matrix<int64>(), keys.dim_size(0));
+    }
+    const int64 num_keys_per_thread =
+        num_threads_scalar > 0
+            ? std::max(1ll, keys.dim_size(0) / num_threads_scalar)
+            : keys.dim_size(0);
+
+    const int64 prefetch_lookahead = prefetch_lookahead_t.scalar<int64>()();
+
+    const Tensor& table_handle = ctx->input(tensor_index_offset);
+    ResourceHandle handle(table_handle.scalar<ResourceHandle>()());
+    ResourceAlias* table;
+    core::ScopedUnref unref_me(table);
+    OP_REQUIRES_OK(ctx, ctx->resource_manager()->Lookup<ResourceAlias, true>(
+                            handle.container(), handle.name(), &table));
+
+    auto* mutex = table->GetMutex();
+    auto* threadpool = ctx->device()->tensorflow_cpu_worker_threads()->workers;
+    if (mutex != nullptr) {
+      // There are many subtle problems with using reader locks so we opt for a
+      // writer lock here.
+      mutex_lock lock(*mutex);
+      OP_REQUIRES_OK(
+          ctx, Functor::TensorLookup(*table, keys, prefetch_lookahead,
+                                     num_keys_per_thread, threadpool, out));
+    } else {
+      OP_REQUIRES_OK(
+          ctx, Functor::TensorLookup(*table, keys, prefetch_lookahead,
+                                     num_keys_per_thread, threadpool, out));
+    }
+  }
+};
+
+// Lookup a container of type ResourceAlias and return its size using
+// Functor::Size(container, &size).
+template <typename Functor,
+          typename ResourceAlias = typename Functor::resource_type>
+class ContainerSizeOp : public OpKernel {
+ public:
+  explicit ContainerSizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& container_handle = ctx->input(0);
+    ResourceHandle handle(container_handle.scalar<ResourceHandle>()());
+    ResourceAlias* container;
+    core::ScopedUnref unref_me(container);
+    OP_REQUIRES_OK(ctx, ctx->resource_manager()->Lookup<ResourceAlias, true>(
+                            handle.container(), handle.name(), &container));
+
+    Tensor* out;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
+
+    auto* mutex = container->GetMutex();
+    if (mutex != nullptr) {
+      tf_shared_lock lock(*mutex);
+      OP_REQUIRES_OK(ctx, Functor::Size(*container, &out->scalar<uint64>()()));
+    } else {
+      OP_REQUIRES_OK(ctx, Functor::Size(*container, &out->scalar<uint64>()()));
+    }
+  }
+};
+
+}  // namespace tables
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_OP_KERNEL_TEMPLATES_H_
diff --git a/tensorflow/core/kernels/lookup_tables/resource_interface_templates.h b/tensorflow/core/kernels/lookup_tables/resource_interface_templates.h
new file mode 100644
index 0000000000000000000000000000000000000000..7331fb400a4734db19a262503dffa38fb0f71466
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/resource_interface_templates.h
@@ -0,0 +1,99 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_RESOURCE_INTERFACE_TEMPLATES_H_
+#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_RESOURCE_INTERFACE_TEMPLATES_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace tables {
+
+// Interface for resources with mutable state.
+class SynchronizedInterface : public virtual ResourceBase {
+ public:
+  // Return value should be used to synchronize read/write access to
+  // all public methods. If null, no synchronization is needed.
+  virtual mutex* GetMutex() const = 0;
+};
+
+// Interface for containers which support batch lookups.
+template <typename ValueType, typename... KeyContext>
+class InsertOrAssignInterface : public virtual SynchronizedInterface {
+ public:
+  using value_type = ValueType;
+
+  // Stores each KV pair {keys[i], values[i]} in the underlying map, overriding
+  // pre-existing pairs which have equivalent keys.
+  // keys and values should have the same size.
+  virtual Status InsertOrAssign(KeyContext... key_context,
+                                ValueType values) = 0;
+};
+
+// Interface for containers which support lookups.
+template <typename ValueType, typename... KeyContext>
+class LookupInterface : public virtual SynchronizedInterface {
+ public:
+  using value_type = ValueType;
+
+  // Lookup the values for keys and store them in values.
+  // prefetch_lookahead is used to prefetch the key at index
+  // i + prefetch_lookahead at the ith iteration of the implemented loop.
+  // keys and values must have the same size.
+  virtual Status Lookup(KeyContext... key_context, ValueType values) const = 0;
+};
+
+// Interface for containers which support lookups with prefetching.
+template <typename ValueType, typename... KeyContext>
+class LookupWithPrefetchInterface : public virtual SynchronizedInterface {
+ public:
+  using value_type = ValueType;
+
+  // Lookup the values for keys and store them in values.
+  // prefetch_lookahead is used to prefetch the key at index
+  // i + prefetch_lookahead at the ith iteration of the implemented loop.
+  // keys and values must have the same size.
+  virtual Status Lookup(KeyContext... key_context, ValueType values,
+                        int64 prefetch_lookahead) const = 0;
+};
+
+// Interface for containers with size concepts.
+// Implementations must guarantee thread-safety when GetMutex is used to
+// synchronize method access.
+class SizeInterface : public virtual SynchronizedInterface {
+ public:
+  // Returns the number of elements in the container.
+  virtual uint64 Size() const = 0;
+};
+
+// Interface for tables which can be initialized from key and value arguments.
+template <typename ValueType, typename... KeyContext>
+class KeyValueTableInitializerInterface : public virtual SynchronizedInterface {
+ public:
+  using value_type = ValueType;
+
+  // Lookup the values for keys and store them in values.
+  // prefetch_lookahead is used to prefetch the key at index
+  // i + prefetch_lookahead at the ith iteration of the implemented loop.
+  // keys and values must have the same size.
+  virtual Status Initialize(KeyContext... key_context, ValueType values) = 0;
+};
+
+}  // namespace tables
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_RESOURCE_INTERFACE_TEMPLATES_H_
diff --git a/tensorflow/core/kernels/lookup_tables/table_op_utils.h b/tensorflow/core/kernels/lookup_tables/table_op_utils.h
deleted file mode 100644
index 4bc18c7dc204fa74b6a0f6d044b7f48cd554a52d..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/lookup_tables/table_op_utils.h
+++ /dev/null
@@ -1,234 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_TABLE_OP_UTILS_H_
-#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_TABLE_OP_UTILS_H_
-
-#include <cstddef>
-#include <string>
-#include <type_traits>
-
-#include "absl/base/thread_annotations.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-namespace tables {
-
-// Create resources of type ContainerBase using the static method
-// Functor::AllocateContainer(OpKernelConstruction*, OpKernel*,
-// ContainerBase**)
-// If the resource has already been created it will be looked up.
-template <class ContainerBase, typename Functor>
-class ResourceConstructionOp : public OpKernel {
- public:
-  explicit ResourceConstructionOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx), table_handle_set_(false) {
-    OP_REQUIRES_OK(
-        ctx, ctx->GetAttr("use_node_name_sharing", &use_node_name_sharing_));
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    mutex_lock l(mu_);
-
-    if (!table_handle_set_) {
-      OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
-                                      use_node_name_sharing_));
-    }
-
-    auto creator = [ctx,
-                    this](ContainerBase** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-      ContainerBase* container;
-      auto status = Functor::AllocateContainer(ctx, this, &container);
-      if (TF_PREDICT_FALSE(!status.ok())) {
-        container->Unref();
-        return status;
-      }
-      if (ctx->track_allocations()) {
-        ctx->record_persistent_memory_allocation(container->MemoryUsed());
-      }
-      *ret = container;
-      return Status::OK();
-    };
-
-    ContainerBase* container_base = nullptr;
-    OP_REQUIRES_OK(
-        ctx, cinfo_.resource_manager()->template LookupOrCreate<ContainerBase>(
-                 cinfo_.container(), cinfo_.name(), &container_base, creator));
-    core::ScopedUnref unref_me(container_base);
-
-    Tensor* handle;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
-    handle->scalar<ResourceHandle>()() = MakeResourceHandle<ContainerBase>(
-        ctx, cinfo_.container(), cinfo_.name());
-    table_handle_set_ = true;
-  }
-
-  ~ResourceConstructionOp() override {
-    // If the table object was not shared, delete it.
-    if (table_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
-      if (!cinfo_.resource_manager()
-               ->template Delete<ContainerBase>(cinfo_.container(),
-                                                cinfo_.name())
-               .ok()) {
-        // Do nothing; the resource may have been deleted by session resets.
-      }
-    }
-  }
-
- private:
-  mutex mu_;
-  bool table_handle_set_ GUARDED_BY(mu_);
-  ContainerInfo cinfo_;
-  bool use_node_name_sharing_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ResourceConstructionOp);
-};
-
-// Create resources of type ContainerBase using the static method
-// Functor::AllocateContainer(OpKernelConstruction*, OpKernel*,
-// FallbackTableBaseType*, ContainerBase**)
-// If the resource has already been created it will be looked up.
-// Container must decrease the reference count of the FallbackTableBaseType*
-// constructor argument before its destructor completes.
-template <class ContainerBase, class Functor,
-          class FallbackTableBaseType = ContainerBase>
-class TableWithFallbackConstructionOp : public OpKernel {
- public:
-  explicit TableWithFallbackConstructionOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx), table_handle_set_(false) {
-    OP_REQUIRES_OK(
-        ctx, ctx->GetAttr("use_node_name_sharing", &use_node_name_sharing_));
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    OpInputList table_int64_args;
-    OP_REQUIRES_OK(ctx, ctx->input_list("table_int64_args", &table_int64_args));
-    if (ctx->num_inputs() == table_int64_args.size()) {
-      ctx->SetStatus(errors::InvalidArgument(
-          "Expected op to have a resource input after the table_int64_args "
-          "input but no such input found."));
-      return;
-    }
-
-    FallbackTableBaseType* fallback_table = nullptr;
-    {
-      const Tensor& table_handle = ctx->input(table_int64_args.size());
-      ResourceHandle handle(table_handle.scalar<ResourceHandle>()());
-      OP_REQUIRES_OK(
-          ctx, ctx->resource_manager()->Lookup(handle.container(),
-                                               handle.name(), &fallback_table));
-    }
-    mutex_lock l(mu_);
-
-    if (!table_handle_set_) {
-      OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
-                                      use_node_name_sharing_));
-    }
-
-    auto creator = [ctx, this, fallback_table](
-                       ContainerBase** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-      // container construction logic can't be merged with
-      // ResourceConstructionOp because Container constructor requires an
-      // input which can only be constructed if the resource manager
-      // internal lock is not already held.
-      ContainerBase* container;
-      auto status =
-          Functor::AllocateContainer(ctx, this, fallback_table, &container);
-      if (TF_PREDICT_FALSE(!status.ok())) {
-        container->Unref();
-        return status;
-      }
-      if (ctx->track_allocations()) {
-        ctx->record_persistent_memory_allocation(container->MemoryUsed());
-      }
-      *ret = container;
-      return Status::OK();
-    };
-
-    ContainerBase* table = nullptr;
-    OP_REQUIRES_OK(
-        ctx, cinfo_.resource_manager()->template LookupOrCreate<ContainerBase>(
-                 cinfo_.container(), cinfo_.name(), &table, creator));
-    core::ScopedUnref unref_me(table);
-
-    Tensor* handle;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
-    handle->scalar<ResourceHandle>()() = MakeResourceHandle<ContainerBase>(
-        ctx, cinfo_.container(), cinfo_.name());
-    table_handle_set_ = true;
-  }
-
-  ~TableWithFallbackConstructionOp() override {
-    // If the table object was not shared, delete it.
-    if (table_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
-      if (!cinfo_.resource_manager()
-               ->template Delete<ContainerBase>(cinfo_.container(),
-                                                cinfo_.name())
-               .ok()) {
-        // Do nothing; the resource may have been deleted by session resets.
-      }
-    }
-  }
-
- private:
-  mutex mu_;
-  bool table_handle_set_ GUARDED_BY(mu_);
-  ContainerInfo cinfo_;
-  bool use_node_name_sharing_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(TableWithFallbackConstructionOp);
-};
-
-// Op that returns the size of a container.
-template <class Container>
-class ContainerSizeOp : public OpKernel {
- public:
-  explicit ContainerSizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor& container_handle = ctx->input(0);
-    ResourceHandle handle(container_handle.scalar<ResourceHandle>()());
-    Container* container;
-    OP_REQUIRES_OK(ctx, ctx->resource_manager()->Lookup(
-                            handle.container(), handle.name(), &container));
-    core::ScopedUnref unref_me(container);
-    OP_REQUIRES_OK(ctx, container->SizeStatus());
-
-    Tensor* out;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
-
-    auto* mutex = container->GetMutex();
-    if (mutex != nullptr) {
-      tf_shared_lock lock(*mutex);
-      out->scalar<int64>()() = container->UnsafeSize();
-    } else {
-      out->scalar<int64>()() = container->UnsafeSize();
-    }
-  }
-};
-
-}  // namespace tables
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_TABLE_OP_UTILS_H_
diff --git a/tensorflow/core/kernels/lookup_tables/table_resource_utils.h b/tensorflow/core/kernels/lookup_tables/table_resource_utils.h
deleted file mode 100644
index 2065904d61e63ca598584051a1b05b6c99b51b1f..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/lookup_tables/table_resource_utils.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_TABLE_RESOURCE_UTILS_H_
-#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_TABLE_RESOURCE_UTILS_H_
-
-#include <memory>
-
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/lookup_tables/lookup_table_interface.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/mutex.h"
-
-namespace tensorflow {
-namespace tables {
-
-// Parent class for tables with support for multithreaded synchronization.
-template <typename HeterogeneousKeyType, typename ValueType>
-class LookupTableWithSynchronization
-    : public LookupTableInterface<HeterogeneousKeyType, ValueType> {
- public:
-  // By convention, it is assumed that the OpKernel which creates this
-  // resource is bound to an op whose first input is an in64 tensor list with a
-  // first element whose boolean value indicates whether a mutex should be
-  // exported to synchronize access to state. If the tensor list is empty,
-  // the status in ctx is set to InvalidArgument and this object is in an
-  // undefined state.
-  LookupTableWithSynchronization(OpKernelContext* ctx, OpKernel* kernel) {
-    OpInputList table_int64_args;
-    OP_REQUIRES_OK(ctx, ctx->input_list("table_int64_args", &table_int64_args));
-    if (table_int64_args.size() == 0) {
-      ctx->SetStatus(errors::InvalidArgument(
-          "table_int64_args should not be empty. Set the first element "
-          "to 1 to enable synchronized table use and to 0 otherwise."));
-      return;
-    }
-    if (ctx->input(0).scalar<int64>()() != 0) {
-      mutex_ = absl::make_unique<mutex>();
-    }
-  }
-
-  // Mutex for synchronizing access to unsynchronized methods.
-  mutex* GetMutex() const override { return mutex_.get(); }
-
- private:
-  // Use this for locking.
-  mutable std::unique_ptr<mutex> mutex_;
-};
-
-// Parent class for tables which can be constructed with arbitrary
-// lookup fallbacks.
-// Since LookupTableInterface::LookupKey assumes that all keys can be mapped
-// to values, LookupTableWithFallbackInterface allows clients to implement
-// two-stage lookups. If the first key lookup fails, clients can choose
-// to perform a fallback lookup using an externally supplied table.
-template <typename HeterogeneousKeyType, typename ValueType,
-          typename FallbackTableRegisteredType =
-              LookupTableInterface<HeterogeneousKeyType, ValueType>>
-class LookupTableWithFallbackInterface
-    : public LookupTableWithSynchronization<HeterogeneousKeyType, ValueType> {
- public:
-  LookupTableWithFallbackInterface(OpKernelContext* ctx, OpKernel* kernel,
-                                   FallbackTableRegisteredType* fallback_table)
-      : LookupTableWithSynchronization<HeterogeneousKeyType, ValueType>(ctx,
-                                                                        kernel),
-        fallback_table_(fallback_table) {}
-
-  // Clients are required to fail when ctx is set to a not-OK status in
-  // the constructor so this dereference is safe.
-  const FallbackTableRegisteredType& fallback_table() const {
-    return *fallback_table_;
-  }
-
-  ~LookupTableWithFallbackInterface() override {
-    if (fallback_table_ != nullptr) {
-      fallback_table_->Unref();
-    }
-  }
-
- private:
-  FallbackTableRegisteredType* fallback_table_;
-};
-
-}  // namespace tables
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_TABLE_RESOURCE_UTILS_H_
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
index 30fe4b077a368fe7c272e3ea570100923b104c75..c3b80f04ed2e3dfe71550bfd6ccf87595343b1c1 100644
--- a/tensorflow/core/kernels/lookup_util.cc
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -163,7 +163,7 @@ class TextFileLineIterator
 
   int64 total_size() const override {
     if (vocab_size_ == -1) {
-      int64 new_size;
+      int64 new_size = -1;
       Status status = GetNumLinesInTextFile(env_, filename_, &new_size);
       if (!status.ok()) {
         LOG(WARNING) << "Unable to get line count: " << status;
diff --git a/tensorflow/core/kernels/lrn_op_test.cc b/tensorflow/core/kernels/lrn_op_test.cc
index 5d8c5c21ca21f097cb5030b43e288765ae384eaf..496c697ac3fbbc4c06a4c24f9521eba3c0cfeb23 100644
--- a/tensorflow/core/kernels/lrn_op_test.cc
+++ b/tensorflow/core/kernels/lrn_op_test.cc
@@ -102,7 +102,7 @@ TEST_F(LRNFloatTest, Depth96) {
                    .Finalize(node_def()));
   TF_ASSERT_OK(InitOp());
   AddInput<float>(TensorShape({1, 1, 1, 96}),
-                  [this](int i) -> float { return i + 1; });
+                  [](int i) -> float { return i + 1; });
   TF_ASSERT_OK(RunOpKernel());
   auto actual = GetOutput(0)->tensor<float, 4>();
 
@@ -138,7 +138,7 @@ TEST_F(LRNFloatTest, Depth16) {
                    .Finalize(node_def()));
   TF_ASSERT_OK(InitOp());
   AddInput<float>(TensorShape({1, 1, 1, 16}),
-                  [this](int i) -> float { return i + 1; });
+                  [](int i) -> float { return i + 1; });
   TF_ASSERT_OK(RunOpKernel());
   auto actual = GetOutput(0)->tensor<float, 4>();
 
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index 3a5c87485cc8d49f4faf8dd8d2ef0781db2a7f75..d8fbb83940a08a1f3989bc3debb02585a32f3c01 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -25,12 +25,14 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/kernels/concat_lib_cpu.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/mkl_util.h"
 
 using mkldnn::concat;
 using mkldnn::stream;
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -226,8 +228,50 @@ class MklConcatOp : public OpKernel {
       // format and avoid calling eigen version.
       if (!are_all_tf_inputs && !are_all_mkl_inputs) invoke_eigen = true;
 
+      OpInputList input_mins, input_maxes;
+      if (std::is_same<T, qint8>::value || std::is_same<T, quint8>::value) {
+        // MKL-DNN concat does not support input tensors that have different
+        // ranges. Check if the ranges of the all input tensors are the same.
+        // If not, forward it to Eigen implementation.
+
+        OP_REQUIRES_OK(context, context->input_list("input_mins", &input_mins));
+        OP_REQUIRES(context, (input_mins.size() == N),
+                    errors::InvalidArgument(
+                        "QuantizedConcatOp : Expected mins input list length ",
+                        input_mins.size(), " to equal values length ", N));
+
+        OP_REQUIRES_OK(context,
+                       context->input_list("input_maxes", &input_maxes));
+        OP_REQUIRES(context, (input_maxes.size() == N),
+                    errors::InvalidArgument(
+                        "QuantizedConcatOp : Expected maxes input list length ",
+                        input_maxes.size(), " to equal values length ", N));
+        float input_min = input_mins[0].flat<float>()(0);
+        float input_max = input_maxes[0].flat<float>()(0);
+        const float eps = 1.0e-6;
+        for (int i = 1; i < N; ++i) {
+          float min = input_mins[i].flat<float>()(0);
+          float max = input_maxes[i].flat<float>()(0);
+
+          if (fabs(input_min - min) > eps || fabs(input_max - max) > eps) {
+            invoke_eigen = true;
+            break;
+          }
+        }
+      }
+
       // Call Eigen library
       if (invoke_eigen) {
+        // MKL-DNN quantized concat does not support input tensors with
+        // different ranges.
+        // TODO (mabuzain): Add quantized version of CallEigen() to support
+        // this case.
+        OP_REQUIRES(
+            context,
+            (!std::is_same<T, qint8>::value && !std::is_same<T, quint8>::value),
+            errors::Unimplemented("MKL DNN quantized concat does not "
+                                  "support input tensors that have "
+                                  "different ranges"));
         CallEigenVersion(context, input_tensors, mkl_input_shapes);
         return;
       }
@@ -374,6 +418,23 @@ class MklConcatOp : public OpKernel {
       std::vector<primitive> net;
       net.push_back(concat_op);
       stream(stream::kind::eager).submit(net).wait();
+
+      // For quantized concat, min and max outputs are also computed.
+      if (std::is_same<T, qint8>::value || std::is_same<T, quint8>::value) {
+        Tensor* output_min = nullptr;
+        Tensor* output_max = nullptr;
+        MklDnnShape output_min_mkl_shape, output_max_mkl_shape;
+        output_min_mkl_shape.SetMklTensor(false);
+        output_max_mkl_shape.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, 1, &output_min, {},
+                                  output_min_mkl_shape);
+        AllocateOutputSetMklShape(context, 2, &output_max, {},
+                                  output_max_mkl_shape);
+        // All input tensors should have the same range, just use the
+        // first one
+        output_min->flat<float>()(0) = input_mins[0].flat<float>()(0);
+        output_max->flat<float>()(0) = input_maxes[0].flat<float>()(0);
+      }
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
                          ", message: " + string(e.message) + ", in file " +
@@ -423,7 +484,7 @@ class MklConcatOp : public OpKernel {
         output_tensor->flat<uint8>().size() * sizeof(uint8));
   }
 
-  // This method finds the most commom format accross all MKL inputs
+  // This method finds the most commom format across all MKL inputs
   // Inputs:
   //   1. input_shapes: shapes of input (MKL) tensors.
   //   2. concat_dim: concat dimension.
@@ -490,6 +551,20 @@ class MklConcatOp : public OpKernel {
 
 TF_CALL_float(REGISTER_MKL_CPU);
 
+REGISTER_KERNEL_BUILDER(Name("_MklQuantizedConcatV2")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("T")
+                            .HostMemory("axis")
+                            .Label(mkl_op_registry::kMklQuantizedOpLabel),
+                        MklConcatOp<CPUDevice, quint8, NAME_IS_AXIS>)
+
+REGISTER_KERNEL_BUILDER(Name("_MklQuantizedConcatV2")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<qint8>("T")
+                            .HostMemory("axis")
+                            .Label(mkl_op_registry::kMklQuantizedOpLabel),
+                        MklConcatOp<CPUDevice, qint8, NAME_IS_AXIS>)
+
 #undef REGISTER_CONCAT_MKL
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index b96cc23186558277ac0ff5f2f134e7095fdb8c7a..c12a4ff0f0c48d5b15c03eb9ee98985930463845 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -392,14 +392,24 @@ class MklDnnConvUtil {
     int64 pad_D1, pad_D2;
 
     if (is_conv2d) {
+      Padding padding_type;
+      if (pad_enabled) {
+        padding_type = Padding::EXPLICIT;
+        pad_top = static_cast<int64>((*pad_l)[0]);
+        pad_left = static_cast<int64>((*pad_l)[1]);
+        pad_bottom = static_cast<int64>((*pad_r)[0]);
+        pad_right = static_cast<int64>((*pad_r)[1]);
+      } else {
+        padding_type = padding_;
+      }
       OP_REQUIRES_OK(context_,
                      GetWindowedOutputSizeVerboseV2(
                          input_rows, filter_rows, dilation_rows, stride_rows,
-                         padding_, &out_rows, &pad_top, &pad_bottom));
+                         padding_type, &out_rows, &pad_top, &pad_bottom));
       OP_REQUIRES_OK(context_,
                      GetWindowedOutputSizeVerboseV2(
                          input_cols, filter_cols, dilation_cols, stride_cols,
-                         padding_, &out_cols, &pad_left, &pad_right));
+                         padding_type, &out_cols, &pad_left, &pad_right));
     } else {
       OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
                                    input_planes, filter_planes, stride_planes,
@@ -413,25 +423,11 @@ class MklDnnConvUtil {
     }
 
     if (is_conv2d) {
-      // Conv + pad fusion is enabled only for 2D
+      // Conv + pad fusion is enabled only for 2D.
       // If pad_enabled, i.e., pad and conv op are fused, then
       // all pads are already passed from pad op through
-      // *pad_l and *pad_r
-      if (pad_enabled) {
-        pad_top = static_cast<int64>((*pad_l)[0]);
-        pad_left = static_cast<int64>((*pad_l)[1]);
-        pad_bottom = static_cast<int64>((*pad_r)[0]);
-        pad_right = static_cast<int64>((*pad_r)[1]);
-        // update the out_rows and out_cols based on all
-        // sides of the pads coming from pad op.
-        out_rows = out_rows + (pad_top + pad_bottom) / stride_rows;
-        out_cols = out_cols + (pad_left + pad_right) / stride_cols;
-      }
-      // Handle padding. MKL-DNN uses asymetric padding.
-      // But, if pad_enabled, i.e., pad and conv op are fused,
-      // then, *pad_l and *pad_r are already set from pad op.
-      // In that case they need not set here.
-      else {
+      // *pad_l and *pad_r and they don't need to be set here.
+      if (!pad_enabled) {
         *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
         *pad_r = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
       }
diff --git a/tensorflow/core/kernels/mkl_quantized_concat_op_test.cc b/tensorflow/core/kernels/mkl_quantized_concat_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc68480bbe8b9ed509309a16df2b805fe02e20f1
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_quantized_concat_op_test.cc
@@ -0,0 +1,234 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if defined(INTEL_MKL) && defined(ENABLE_MKL)
+
+#define EIGEN_USE_THREADS
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+using test::graph::Constant;
+
+static const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0};
+static const TensorShape dummy_shape({8});
+
+// Helper class for converting MKL tensors to TF tensors and comparing to
+// expected values
+
+class ConvMklToTF : public OpsTestBase {
+ public:
+  template <typename T>
+  void ConvertMKL2TF(DataType dtype, const Tensor& first, const Tensor& second,
+                     Tensor& output) {
+    // Create an MKL to TF conversion node and execute it
+    TF_EXPECT_OK(NodeDefBuilder("mkl_to_tf_op", "_MklToTf")
+                     .Input(FakeInput(dtype))     // Input
+                     .Input(FakeInput(DT_UINT8))  // MKL second tensor
+                     .Attr("T", dtype)
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    AddInputFromArray<T>(first.shape(), first.flat<T>());
+    AddInputFromArray<uint8>(second.shape(), second.flat<uint8>());
+    TF_ASSERT_OK(RunOpKernel());
+
+    output = *GetOutput(0);
+  }
+  void TestBody(){};
+};
+
+class QuantizedConcatTest : public OpsTestBase {
+ protected:
+  QuantizedConcatTest() {}
+
+  void TestSmall8Bit(float first_min, float first_max, float second_min,
+                     float second_max);
+  void TestSecondDim8Bit(float first_min, float first_max, float second_min,
+                         float second_max);
+};
+
+TEST_F(QuantizedConcatTest, Small8BitSameRange) {
+  // Range for both is the same, so impl can use memcpy.
+  TestSmall8Bit(0.0f, 255.0f, 0.0f, 255.0f);
+}
+
+void QuantizedConcatTest::TestSmall8Bit(float first_min, float first_max,
+                                        float second_min, float second_max) {
+  TF_ASSERT_OK(NodeDefBuilder("quantized_concat_op", "_MklQuantizedConcatV2")
+                   .Input(FakeInput(2, DT_QUINT8))
+                   .Input(FakeInput(DT_INT32))
+                   .Input(FakeInput(2, DT_FLOAT))
+                   .Input(FakeInput(2, DT_FLOAT))
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Input(FakeInput(DT_UINT8))     // MKL second tensor
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Attr("N", 2)
+                   .Attr("T", DataTypeToEnum<quint8>::v())
+                   .Attr("Tidx", DT_INT32)
+                   .Attr("_kernel", "QuantizedMklOp")
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  const int first_batch = 2;
+  const int first_height = 2;
+  const int first_width = 3;
+  const int first_depth = 1;
+  Tensor first_float(DT_FLOAT,
+                     {first_batch, first_height, first_width, first_depth});
+  test::FillValues<float>(&first_float,
+                          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  Tensor first_quantized =
+      FloatTensorToQuantized<quint8>(first_float, first_min, first_max);
+
+  const int second_batch = 2;
+  const int second_height = 2;
+  const int second_width = 3;
+  const int second_depth = 1;
+  Tensor second_float(
+      DT_FLOAT, {second_batch, second_height, second_width, second_depth});
+  test::FillValues<float>(&second_float,
+                          {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  Tensor second_quantized =
+      FloatTensorToQuantized<quint8>(second_float, second_min, second_max);
+
+  const int expected_batch = first_batch + second_batch;
+  Tensor expected_float(
+      DT_FLOAT, {expected_batch, first_height, first_width, first_depth});
+  test::FillValues<float>(&expected_float,
+                          {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                           13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+
+  AddInputFromArray<quint8>(first_quantized.shape(),
+                            first_quantized.flat<quint8>());
+  AddInputFromArray<quint8>(second_quantized.shape(),
+                            second_quantized.flat<quint8>());
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  AddInputFromArray<float>(TensorShape({}), {first_min});
+  AddInputFromArray<float>(TensorShape({}), {second_min});
+  AddInputFromArray<float>(TensorShape({}), {first_max});
+  AddInputFromArray<float>(TensorShape({}), {second_max});
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  TF_ASSERT_OK(RunOpKernel());
+  const Tensor& output_quantized = *GetOutput(0);
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  Tensor output_float =
+      QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+  test::ExpectTensorNear<float>(expected_float, output_float, 0.2);
+}
+
+TEST_F(QuantizedConcatTest, SecondDim8BitSameRange) {
+  TestSecondDim8Bit(-10.0f, 150.0f, -10.0f, 150.0f);
+}
+
+void QuantizedConcatTest::TestSecondDim8Bit(float first_min, float first_max,
+                                            float second_min,
+                                            float second_max) {
+  TF_ASSERT_OK(NodeDefBuilder("quantized_concat_op", "_MklQuantizedConcatV2")
+                   .Input(FakeInput(2, DT_QUINT8))
+                   .Input(FakeInput(DT_INT32))
+                   .Input(FakeInput(2, DT_FLOAT))
+                   .Input(FakeInput(2, DT_FLOAT))
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Input(FakeInput(DT_UINT8))     // MKL second tensor
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Attr("N", 2)
+                   .Attr("T", DataTypeToEnum<quint8>::v())
+                   .Attr("Tidx", DT_INT32)
+                   .Attr("_kernel", "QuantizedMklOp")
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  const int first_batch = 2;
+  const int first_height = 2;
+  const int first_width = 3;
+  const int first_depth = 1;
+  Tensor first_float(DT_FLOAT,
+                     {first_batch, first_height, first_width, first_depth});
+  test::FillValues<float>(&first_float,
+                          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  Tensor first_quantized =
+      FloatTensorToQuantized<quint8>(first_float, first_min, first_max);
+
+  const int second_batch = 2;
+  const int second_height = 2;
+  const int second_width = 3;
+  const int second_depth = 1;
+
+  Tensor second_float(
+      DT_FLOAT, {second_batch, second_height, second_width, second_depth});
+  test::FillValues<float>(&second_float,
+                          {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  Tensor second_quantized =
+      FloatTensorToQuantized<quint8>(second_float, second_min, second_max);
+
+  const int expected_height = first_height + second_height;
+  Tensor expected_float(
+      DT_FLOAT, {first_batch, expected_height, first_width, first_depth});
+  test::FillValues<float>(&expected_float,
+                          {1, 2, 3, 4,  5,  6,  13, 14, 15, 16, 17, 18,
+                           7, 8, 9, 10, 11, 12, 19, 20, 21, 22, 23, 24});
+
+  AddInputFromArray<quint8>(first_quantized.shape(),
+                            first_quantized.flat<quint8>());
+  AddInputFromArray<quint8>(second_quantized.shape(),
+                            second_quantized.flat<quint8>());
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<float>(TensorShape({}), {first_min});
+  AddInputFromArray<float>(TensorShape({}), {second_min});
+  AddInputFromArray<float>(TensorShape({}), {first_max});
+  AddInputFromArray<float>(TensorShape({}), {second_max});
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  TF_ASSERT_OK(RunOpKernel());
+  const Tensor& output_quantized = *GetOutput(0);
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  Tensor output_float =
+      QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+  // Using the same error tolerance as in Eigen QuantizedConcat test
+  test::ExpectTensorNear<float>(expected_float, output_float, 1.0);
+}
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL && ENABLE_MKL
diff --git a/tensorflow/core/kernels/mkl_quantized_conv_ops.h b/tensorflow/core/kernels/mkl_quantized_conv_ops.h
index 10825f696253cc6d38bbdee1e6b660d494c34088..fef2d837cf27a0854ffc34ad3d1b60831a776fbc 100644
--- a/tensorflow/core/kernels/mkl_quantized_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_quantized_conv_ops.h
@@ -24,8 +24,13 @@ limitations under the License.
 namespace tensorflow {
 template <class T>
 float MklFloatForOneQuantizedLevel(float range_min, float range_max) {
-  const int64 highest = static_cast<int64>(Eigen::NumTraits<T>::highest());
-  const int64 lowest = static_cast<int64>(Eigen::NumTraits<T>::lowest());
+  int64 highest = static_cast<int64>(Eigen::NumTraits<T>::highest());
+  int64 lowest = static_cast<int64>(Eigen::NumTraits<T>::lowest());
+
+  // Adjusting for having a symmetric range.
+  // for example: for 8-bit [-127, 127] as opposed to [-128, 127].
+  if (lowest < -highest) ++lowest;
+
   const float float_for_one_quantized_level =
       (range_max - range_min) / (highest - lowest);
   return float_for_one_quantized_level;
@@ -48,6 +53,35 @@ void MklQuantizationRangeForMultiplication(float min_a, float max_a,
   *min_c = c_float_for_one_quant_level * c_lowest;
   *max_c = c_float_for_one_quant_level * c_highest;
 }
+
+template <class T1, class T2, class T3>
+void MklQuantizationRangeForMultiplication(float min_a, float max_a,
+                                           const Tensor& min_b_vector,
+                                           const Tensor& max_b_vector,
+                                           Tensor** min_c_vector,
+                                           Tensor** max_c_vector) {
+  DCHECK(min_b_vector.NumElements() == (*min_c_vector)->NumElements());
+  DCHECK(max_b_vector.NumElements() == (*max_c_vector)->NumElements());
+  size_t n_channel = min_b_vector.NumElements();
+  const int64 c_highest = static_cast<int64>(Eigen::NumTraits<T3>::highest());
+  const int64 c_lowest = static_cast<int64>(Eigen::NumTraits<T3>::lowest());
+  const float* min_b = min_b_vector.flat<float>().data();
+  const float* max_b = max_b_vector.flat<float>().data();
+  float* min_c = (*min_c_vector)->flat<float>().data();
+  float* max_c = (*max_c_vector)->flat<float>().data();
+#pragma omp parallel for
+  for (size_t n = 0; n < n_channel; ++n) {
+    float a_float_for_one_quant_level =
+        MklFloatForOneQuantizedLevel<T1>(min_a, max_a);
+    float b_float_for_one_quant_level =
+        MklFloatForOneQuantizedLevel<T2>(min_b[n], max_b[n]);
+    float c_float_for_one_quant_level =
+        a_float_for_one_quant_level * b_float_for_one_quant_level;
+    min_c[n] = c_float_for_one_quant_level * c_lowest;
+    max_c[n] = c_float_for_one_quant_level * c_highest;
+  }
+}
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..767a6f1c3976d335bfd660f3a6990c03805843ba
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc
@@ -0,0 +1,124 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+
+#include <math.h>
+#include <limits>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/meta_support.h"
+#include "tensorflow/core/kernels/no_op.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+class MklRequantizationRangePerChannelOp : public OpKernel {
+ public:
+  explicit MklRequantizationRangePerChannelOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("clip_value_max", &clip_value_max_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& input = ctx->input(kInputTensorIndex);
+    const Tensor& input_min = ctx->input(kInputMinIndex);
+    const Tensor& input_max = ctx->input(kInputMaxIndex);
+
+    const size_t depth = input_max.NumElements();
+    OP_REQUIRES(
+        ctx, input_min.dim_size(0) == depth,
+        errors::InvalidArgument("input_min has incorrect size, expected ",
+                                depth, " was ", input_min.dim_size(0)));
+    OP_REQUIRES(
+        ctx, input_max.dim_size(0) == depth,
+        errors::InvalidArgument("input_max has incorrect size, expected ",
+                                depth, " was ", input_max.dim_size(0)));
+
+    const float* input_min_data = input_min.flat<float>().data();
+    const float* input_max_data = input_max.flat<float>().data();
+    std::vector<float> ranges(depth);
+    bool is_non_negative = true;
+    Eigen::array<int, 2> shuffling({1, 0});
+    auto input_matrix = input.flat_inner_dims<qint32>();
+
+    // TODO: verify performance of not transposing and finding the min max
+    // directly from input_matrix vs the one presented below of transposing and
+    // using the transposed matrix as the transposing operation in itself might
+    // be more costly.
+    // Note that this operation is a calibration step for quantization and will
+    // cease to exist in the final inference graph(will exist as a const node).
+    auto transposed_input = input_matrix.shuffle(shuffling);
+
+    // Find the ranges of each channel in parallel.
+    float out_min_max = std::numeric_limits<float>::min();
+#pragma omp parallel for reduction(max : out_min_max)
+    for (size_t i = 0; i < depth; ++i) {
+      Eigen::Tensor<qint32, 0, Eigen::RowMajor> min =
+          transposed_input.chip<0>(i).minimum();
+      Eigen::Tensor<qint32, 0, Eigen::RowMajor> max =
+          transposed_input.chip<0>(i).maximum();
+      const int32_t min_per_channel = min();
+      const int32_t max_per_channel = max();
+      const int32_t abs_max =
+          std::max(std::abs(min_per_channel), std::abs(max_per_channel));
+      float scale =
+          std::max(std::abs(input_min_data[i]), std::abs(input_max_data[i]));
+      ranges[i] =
+          scale * static_cast<float>(abs_max) / static_cast<float>(1L << 31);
+      if (min_per_channel < 0) is_non_negative = false;
+
+      // Thread-local out_min_max.
+      out_min_max = std::max(out_min_max, ranges[i]);
+    }
+    // All local out_min_max gets max-reduced into one global out_min_max at
+    // the end of the loop by specifying reduction(max:out_min_max) along with
+    // omp parallel for.
+
+    // Fixing max to clip_value_max_ (example 6.0 to support relu6)
+    if (out_min_max > clip_value_max_) out_min_max = clip_value_max_;
+
+    Tensor* output_min = nullptr;
+    Tensor* output_max = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(kOutputMinIndex, {}, &output_min));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(kOutputMaxIndex, {}, &output_max));
+    output_min->flat<float>()(0) = is_non_negative ? 0.0f : -out_min_max;
+    output_max->flat<float>()(0) = out_min_max;
+  }
+
+ private:
+  float clip_value_max_ = std::numeric_limits<float>::infinity();
+  const int kInputTensorIndex = 0;
+  const int kInputMinIndex = 1;
+  const int kInputMaxIndex = 2;
+  const int kOutputMinIndex = 0;
+  const int kOutputMaxIndex = 1;
+};
+
+REGISTER_KERNEL_BUILDER(Name("RequantizationRangePerChannel")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<qint32>("T"),
+                        MklRequantizationRangePerChannelOp);
+}  // namespace tensorflow
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_requantize_ops_test.cc b/tensorflow/core/kernels/mkl_requantize_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9961462754f4c2378f7e46931d7878ca283278a5
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_requantize_ops_test.cc
@@ -0,0 +1,300 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#if defined(INTEL_MKL) && defined(ENABLE_MKL)
+
+#include <cmath>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+class MklRequantizatedOpsTest : public OpsTestBase {};
+
+class MklRequantizatedOpsTestHelper : public OpsTestBase {
+ public:
+  void Setup(Tensor &input_tensor_qint32, float &range_weights_ch1,
+             float &range_weights_ch2);
+  void TestBody() {}
+};
+
+void MklRequantizatedOpsTestHelper::Setup(Tensor &input_tensor_qint32,
+                                          float &range_weights_ch1,
+                                          float &range_weights_ch2) {
+  // Step 1: Input range assumptions
+  // -------------------------------
+  // Assume input tensor T (NHWC) in FP32 has range [0, 5.0]   size nt*ht*wt*ct
+  // Assume input filter W (NHWC) with 2 output channels of    size nw*ht*wt*2
+  // logically,   filter W has 2 channels W1 and W2 each of    size nw*ht*wt*1
+  // Assume input filter W1(NHWC) in FP32 has range [-2.0, 2.0]size nw*ht*wt*1
+  // Assume input filter W2(NHWC) in FP32 has range [-3.0, 3.0]size nw*ht*wt*1
+
+  // Step 2: Quantization details (per channel)
+  // ------------------------------------------
+  // T and W are quantized using a quantize op.
+  // The input tensor T (NHWC) is quantized to unsigned int8.
+  // Hence T's max value is mapped to ((2^8-1) = 255).
+  // The input filter W (NHWC) is quantized to signed int8.
+  // Hence W's max value is mapped to ((2^7)-1 = 127)).
+
+  // Range of quantized T  in uint8[0  , 255] maps to orig T  in FP32[0   , 5.0]
+  // Range of quantized W1 in int8[-127, 127] maps to orig W1 in FP32[-2.0, 2.0]
+  // Range of quantized W2 in int8[-127, 127] maps to orig W2 in FP32[-3.0, 3.0]
+
+  // Hence the resolution of quantized T will be 5.0/255
+  // Hence the resolution of quantized W1 will be 2.0/127
+  // Hence the resolution of quantized W2 will be 3.0/127
+
+  // Step 3: Assumption of quantizedconv on quantized input&weights(per channel)
+  // ---------------------------------------------------------------------------
+  // The input T and weights W1 (or W2) will be convolved.
+  // The output tensor T is in int32 whose range is [-2^31, 2^31).
+  // For simplicity and symmetry, we truncate the above range to (-2^31, 2^31).
+  // The range of convolved T*W1 is ((2^31)-1) * 5.0/255 * 2.0/127 = 663110.59
+  // So the range of convolved T*W1 in int32(-2^31, 2^31) that maps to
+  // orig T range in FP32[0, 5.0] * [-2.0, 2.0] is [-663110.59, 663110.59].
+
+  // The range of convolved T*W2 is (2^31-1) * 5.0/255 * 3.0/127 = 994665.88
+  // So the range of convolved T*W2 in int32(-2^31, 2^31) that maps to
+  // orig T range in FP32 [0, 5.0] * [-3.0, 3.0]  is [-994665.88, 994665.88]
+
+  // Step 4: Assumption output above is fed to requantization_range_perchannel
+  // --------------------------------------------------------------------------
+  // Here we recalculate the new range for convolved T*W so that we
+  // make good use in int8 quantization from int32 to int8.
+
+  // We assume the above operations are performed and use these values above
+  // as ranges for requantization_range_perchannel_op.
+  range_weights_ch1 = 663110.59;  // For W1 channel
+  range_weights_ch2 = 994665.88;  // For W2 Channel
+
+  // We Fill the input tensor T qint32 with arbitrary int32 values
+  test::FillValues<qint32>(
+      &input_tensor_qint32,
+      {-1000, -2000,  2000,   4000,   -3000,  -6000,  4000,   8000,
+       5000,  10000,  -6000,  -12000, 7000,   14000,  8000,   16000,
+       9000,  -18000, -10000, -20000, 11000,  22000,  -12000, -24000,
+       13000, 26000,  14000,  28000,  -15000, -30000, 16000,  32000});
+
+  // Step 5: Define and run requantization_range_perchannel
+  // -------------------------------------------------------
+  // See test RequantizationRangePerChannelTest_Basic and/or
+  // test RequantizationRangePerChannelTest_ClipMax
+}
+
+// Tests the RequantizationRangePerChannel op wherein the range
+// of the weights is calculated per channel.
+TEST_F(MklRequantizatedOpsTest, RequantizationRangePerChannelTest_Basic) {
+  // Let us set up the tensor and inputs before we run this op.
+  float clip_value_max = static_cast<float>((1L << 31) - 1);
+  float range_weights_ch1 = 0.0;
+  float range_weights_ch2 = 0.0;
+
+  // Create the input tensor
+  const int input_height = 4;
+  const int input_width = 4;
+  const int input_channels = 2;
+
+  // Define the shape of T.
+  Tensor input_tensor_qint32(DT_QINT32,
+                             {1, input_height, input_width, input_channels});
+
+  // Explanation and setup prior to this op. Fill T and populate range values.
+  MklRequantizatedOpsTestHelper helper;
+  helper.Setup(input_tensor_qint32, range_weights_ch1, range_weights_ch2);
+
+  // Step 5: Define and run requantization_range_perchannel
+  // -------------------------------------------------------
+  // Define, create and initialize the op in question.
+  TF_ASSERT_OK(NodeDefBuilder("requantization_range_per_channel",
+                              "RequantizationRangePerChannel")
+                   .Input(FakeInput(DT_QINT32))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Attr("T", DataTypeToEnum<qint32>::v())
+                   .Attr("clip_value_max", clip_value_max)
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+
+  // Add the input nodes to the op.
+  AddInputFromArray<qint32>(input_tensor_qint32.shape(),
+                            input_tensor_qint32.flat<qint32>());
+
+  // Calculate the min and max from the ranges
+  float ch1_min = -range_weights_ch1;
+  float ch1_max = range_weights_ch1;
+  float ch2_min = -range_weights_ch2;
+  float ch2_max = range_weights_ch2;
+
+  // Add the perchannel range Nodes to the op.
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_min, ch2_min});
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_max, ch2_max});
+
+  // Run the kernel
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Step 6: Verify output and store values to test requantize_perchannel
+  // --------------------------------------------------------------------
+
+  // Verify the Expected Outputs
+  const float output_min = GetOutput(0)->flat<float>()(0);
+  const float output_max = GetOutput(1)->flat<float>()(0);
+  EXPECT_NEAR(-14.8217, output_min, 0.002);
+  EXPECT_NEAR(14.8217, output_max, 0.002);
+
+  // Output range is made use in RequantizePerChannelTest_Basic
+}
+
+TEST_F(MklRequantizatedOpsTest, RequantizationRangePerChannelTest_ClipMax) {
+  // Let us setup the tensor and inputs before we run this op.
+  float clip_value_max = 6;  // Can be used as 6 for Relu 6 activations.
+  float range_weights_ch1 = 0.0;
+  float range_weights_ch2 = 0.0;
+
+  // Create the input tensor
+  const int input_height = 4;
+  const int input_width = 4;
+  const int input_channels = 2;
+
+  // define and input tensor T shape.
+  Tensor input_tensor_qint32(DT_QINT32,
+                             {1, input_height, input_width, input_channels});
+
+  // Explanation and setup prior to this op. Fill T and populate range values.
+  MklRequantizatedOpsTestHelper helper;
+  helper.Setup(input_tensor_qint32, range_weights_ch1, range_weights_ch2);
+
+  // Step 5: Define and run requantization_range_perchannel
+  // -------------------------------------------------------
+  // Define, create and initialize the op in question.
+  TF_ASSERT_OK(NodeDefBuilder("requantization_range_per_channel",
+                              "RequantizationRangePerChannel")
+                   .Input(FakeInput(DT_QINT32))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Attr("T", DataTypeToEnum<qint32>::v())
+                   .Attr("clip_value_max", clip_value_max)
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+
+  // Add the input nodes to the op.
+  AddInputFromArray<qint32>(input_tensor_qint32.shape(),
+                            input_tensor_qint32.flat<qint32>());
+
+  // Calculate the min and max from the ranges
+  float ch1_min = -range_weights_ch1;
+  float ch1_max = range_weights_ch1;
+  float ch2_min = -range_weights_ch2;
+  float ch2_max = range_weights_ch2;
+
+  // Add the perchannel range nodes to the op.
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_min, ch2_min});
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_max, ch2_max});
+
+  // Run the kernel
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Step 6: Verify output and store values to test requantize_perchannel
+  // --------------------------------------------------------------------
+
+  // Verify the expected outputs
+  const float output_min = GetOutput(0)->flat<float>()(0);
+  const float output_max = GetOutput(1)->flat<float>()(0);
+  EXPECT_NEAR(-6.0, output_min, 0.002);  // Values are aligned with clip_value.
+  EXPECT_NEAR(6.0, output_max, 0.002);   // Values are aligned with clip_value.
+}
+
+TEST_F(MklRequantizatedOpsTest, RequantizePerChannelTest_Basic) {
+  // Let us setup the tensor and inputs before we run this op.
+  float range_weights_ch1 = 0.0;
+  float range_weights_ch2 = 0.0;
+
+  // Create the input tensor
+  const int input_height = 4;
+  const int input_width = 4;
+  const int input_channels = 2;
+
+  // define an input tensor T shape.
+  Tensor input_tensor_qint32(DT_QINT32,
+                             {1, input_height, input_width, input_channels});
+
+  // Explanation and setup prior to this op. Fill T and populate range values.
+  MklRequantizatedOpsTestHelper helper;
+  helper.Setup(input_tensor_qint32, range_weights_ch1, range_weights_ch2);
+
+  // Step 7: Define and run requantize_perchannel
+  // --------------------------------------------
+  // The output of requantization_range_op_per_channel which calculated the
+  // new ranges of int8 is fed to the requantize per channel op.
+  // Here the values of convolved T*W is converted from int32 to int8.
+
+  TF_ASSERT_OK(NodeDefBuilder("requantize_per_channel", "RequantizePerChannel")
+                   .Input(FakeInput(DT_QINT32))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Attr("T", DataTypeToEnum<qint32>::v())
+                   .Attr("out_type", DataTypeToEnum<qint8>::v())
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+
+  // Add the input Nodes to the op.
+  AddInputFromArray<qint32>(input_tensor_qint32.shape(),
+                            input_tensor_qint32.flat<qint32>());
+
+  // Calculate the min and max from the ranges
+  float ch1_min = -range_weights_ch1;
+  float ch1_max = range_weights_ch1;
+  float ch2_min = -range_weights_ch2;
+  float ch2_max = range_weights_ch2;
+
+  // Add the perchannel range nodes to the op.
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_min, ch2_min});
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_max, ch2_max});
+
+  // Calculate the min and max from Step 6 above
+  // in RequantizationRangePerChannelTest_Basic
+  float range_op_output_min = -14.8217;
+  float range_op_output_max = 14.8217;
+
+  // Add the requested_min and requested_max stored from Step 6.
+  AddInputFromArray<float>(TensorShape({1}), {range_op_output_min});
+  AddInputFromArray<float>(TensorShape({1}), {range_op_output_max});
+
+  // Run the kernel
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Verify the output with the expected output
+  Tensor output = *GetOutput(0);
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  EXPECT_NEAR(range_op_output_min, output_min, 0.002);
+  EXPECT_NEAR(range_op_output_max, output_max, 0.002);
+}
+
+}  // namespace tensorflow
+#endif  // INTEL_MKL && ENABLE_MKL
diff --git a/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc b/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b5c1a01f8311e81fa2f0dd0945569f0b6980b0be
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc
@@ -0,0 +1,172 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+#include <math.h>
+
+#include "mkldnn.hpp"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/meta_support.h"
+#include "tensorflow/core/kernels/no_op.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename Toutput>
+class MklRequantizePerChannelOp : public OpKernel {
+ public:
+  explicit MklRequantizePerChannelOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_type_));
+    OP_REQUIRES(ctx, out_type_ == DT_QINT8 || out_type_ == DT_QUINT8,
+                errors::InvalidArgument(
+                    "out_type must be qint8 or quint8, but got: " + out_type_));
+  }
+  virtual ~MklRequantizePerChannelOp() {}
+  void Compute(OpKernelContext* ctx) override {
+    try {
+      const Tensor& input = ctx->input(kInputTensorIndex);
+      const Tensor& input_min_vec = ctx->input(kInputMinVecIndex);
+      float* input_min_vec_data = (float*)const_cast<void*>(
+          static_cast<const void*>(input_min_vec.flat<float>().data()));
+      const Tensor& input_max_vec = ctx->input(kInputMaxVecIndex);
+      float* input_max_vec_data = (float*)const_cast<void*>(
+          static_cast<const void*>(input_max_vec.flat<float>().data()));
+
+      const Tensor& input_requested_min = ctx->input(this->kRequestMinIndex);
+      const float input_requested_min_float =
+          input_requested_min.flat<float>()(0);
+      const Tensor& input_requested_max = ctx->input(this->kRequestMaxIndex);
+      const float input_requested_max_float =
+          input_requested_max.flat<float>()(0);
+
+      size_t depth = input_min_vec.NumElements();
+      OP_REQUIRES(
+          ctx, input_min_vec.dim_size(0) == depth,
+          errors::InvalidArgument("input_min has incorrect size, expected ",
+                                  depth, " was ", input_min_vec.dim_size(0)));
+      OP_REQUIRES(
+          ctx, input_max_vec.dim_size(0) == depth,
+          errors::InvalidArgument("input_max has incorrect size, expected ",
+                                  depth, " was ", input_max_vec.dim_size(0)));
+
+      if (out_type_ == DT_QINT8) DCHECK(input_requested_min_float < 0.0f);
+
+      const float factor = (out_type_ == DT_QINT8) ? 127.0f : 255.0f;
+      const float requested_min_max =
+          std::max(std::abs(input_requested_min_float),
+                   std::abs(input_requested_max_float));
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(kOutputTensorIndex,
+                                               input.shape(), &output));
+
+      std::vector<float> scales(depth);
+      for (int i = 0; i < depth; ++i) {
+        float min_max_from_vec = std::max(std::abs(input_min_vec_data[i]),
+                                          std::abs(input_max_vec_data[i]));
+        scales[i] = factor * (min_max_from_vec / requested_min_max /
+                              static_cast<float>(1L << 31));
+      }
+
+      mkldnn::primitive_attr reorder_attr;
+      reorder_attr.set_output_scales(2, scales);
+
+      memory::dims dims_mkl_order =
+          TFShapeToMklDnnDimsInNCHW(input.shape(), FORMAT_NHWC);
+      memory::desc input_md = memory::desc(dims_mkl_order, MklDnnType<qint32>(),
+                                           memory::format::nhwc);
+      memory::desc output_md =
+          (out_type_ == DT_QINT8)
+              ? memory::desc(dims_mkl_order, MklDnnType<qint8>(),
+                             memory::format::nhwc)
+              : memory::desc(dims_mkl_order, MklDnnType<quint8>(),
+                             memory::format::nhwc);
+
+      memory::primitive_desc input_pd =
+          memory::primitive_desc(input_md, cpu_engine_);
+      memory::primitive_desc output_pd =
+          memory::primitive_desc(output_md, cpu_engine_);
+
+      void* input_buf =
+          static_cast<void*>(const_cast<qint32*>(input.flat<qint32>().data()));
+      void* output_buf;
+      if (out_type_ == DT_QINT8) {
+        output_buf = static_cast<void*>(
+            const_cast<qint8*>(output->flat<qint8>().data()));
+      } else {
+        output_buf = static_cast<void*>(
+            const_cast<quint8*>(output->flat<quint8>().data()));
+      }
+
+      std::unique_ptr<memory> input_mem_prim_(new memory(input_pd, input_buf));
+      std::unique_ptr<memory> output_mem_prim_(
+          new memory(output_pd, output_buf));
+
+      mkldnn::reorder::primitive_desc reorder_pd =
+          mkldnn::reorder::primitive_desc(input_pd, output_pd, reorder_attr);
+      std::vector<mkldnn::primitive> net;
+      net.push_back(
+          mkldnn::reorder(reorder_pd, *input_mem_prim_, *output_mem_prim_));
+      stream(stream::kind::eager).submit(net).wait();
+
+      Tensor* output_min = nullptr;
+      Tensor* output_max = nullptr;
+      OP_REQUIRES_OK(ctx,
+                     ctx->allocate_output(kOutputMinIndex, {}, &output_min));
+      OP_REQUIRES_OK(ctx,
+                     ctx->allocate_output(kOutputMaxIndex, {}, &output_max));
+
+      output_min->flat<float>()(0) = input_requested_min_float;
+      output_max->flat<float>()(0) = input_requested_max_float;
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + std::string(e.message) + ", in file " +
+                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          ctx, errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+
+ private:
+  const int kInputTensorIndex = 0;
+  const int kInputMinVecIndex = 1;
+  const int kInputMaxVecIndex = 2;
+  const int kRequestMinIndex = 3;
+  const int kRequestMaxIndex = 4;
+  const int kOutputTensorIndex = 0;
+  const int kOutputMinIndex = 1;
+  const int kOutputMaxIndex = 2;
+  DataType out_type_;
+  engine cpu_engine_ = engine(engine::cpu, 0);
+};
+
+REGISTER_KERNEL_BUILDER(Name("RequantizePerChannel")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<qint32>("T")
+                            .TypeConstraint<qint8>("out_type"),
+                        MklRequantizePerChannelOp<CPUDevice, qint8>);
+
+}  // namespace tensorflow
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mutex_ops.cc b/tensorflow/core/kernels/mutex_ops.cc
index 1603a2aa869e4959713741bfb501798193a63d42..b06845f13a4d0f8688618c22d12a223e6264f767 100644
--- a/tensorflow/core/kernels/mutex_ops.cc
+++ b/tensorflow/core/kernels/mutex_ops.cc
@@ -242,10 +242,24 @@ class ConsumeMutexLockOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("MutexLock").Device(DEVICE_CPU), MutexLockOp);
 
-REGISTER_KERNEL_BUILDER(Name("MutexV2").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("MutexLock")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("mutex_lock")
+                            .HostMemory("mutex"),
+                        MutexLockOp);
+
+REGISTER_KERNEL_BUILDER(
+    Name("MutexV2").Device(DEVICE_CPU).HostMemory("resource"),
+    ResourceHandleOp<Mutex>);
+
+REGISTER_KERNEL_BUILDER(Name("MutexV2").Device(DEVICE_GPU),
                         ResourceHandleOp<Mutex>);
 
 REGISTER_KERNEL_BUILDER(Name("ConsumeMutexLock").Device(DEVICE_CPU),
                         ConsumeMutexLockOp);
 
+REGISTER_KERNEL_BUILDER(
+    Name("ConsumeMutexLock").Device(DEVICE_GPU).HostMemory("mutex_lock"),
+    ConsumeMutexLockOp);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 5645275cfa98eb820b7d1e885b18894bfab17e49..18ed1ea26ac8a63c4716bfdc7197641be522ea7c 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -158,7 +158,8 @@ REGISTER_PACK(string);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_bfloat16(REGISTER_GPU);
 TF_CALL_int64(REGISTER_GPU);
-REGISTER_GPU(bool);
+TF_CALL_int16(REGISTER_GPU);
+TF_CALL_bool(REGISTER_GPU);
 #undef REGISTER_GPU
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index d8c38fe40061d10ffa7ac5e0d74295ae1ed2f962..4866efef9dbdd5fe097817f1906620a689fba120 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -173,10 +173,11 @@ class PartitionedCallOp : public AsyncOpKernel {
     FunctionLibraryRuntime::InstantiateOptions opts;
     opts.target = lib->device()->name();
     opts.is_multi_device_function = true;
-    opts.optimize_graph_fn = std::bind(
-        grappler::OptimizeGraph, std::placeholders::_1, std::placeholders::_2,
-        std::placeholders::_3, std::placeholders::_4, config_proto_,
-        func_.name(), optimization_options, std::placeholders::_5);
+    opts.optimize_graph_fn =
+        std::bind(grappler::OptimizeGraph, std::placeholders::_1,
+                  std::placeholders::_2, std::placeholders::_3,
+                  std::placeholders::_4, std::placeholders::_5, config_proto_,
+                  func_.name(), optimization_options, std::placeholders::_6);
     opts.graph_collector = ctx->graph_collector();
     opts.executor_type = executor_type_;
 
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index 69122f467c8fcf3818ab69f3f96d00b9a6b3c245..903cf9313a22cdc6937cdae53afb7063101400f8 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -215,7 +215,7 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
       // NCHW_VECT_C is not supported by cudnnPoolingForward(), but can be
       // emulated via NHWC.
       data_layout = se::dnn::DataLayout::kBatchYXDepth;
-      batch_size *= depth;
+      batch_size *= depth / 4;
       depth = 4;
       break;
     default:
diff --git a/tensorflow/core/kernels/quantized_concat_op.cc b/tensorflow/core/kernels/quantized_concat_op.cc
index b03ac8e87dac8fabe0d45d8685ec4fa5fd642519..ff4e7be1622af8bfd2e19aaff5e1ff3677875f3c 100644
--- a/tensorflow/core/kernels/quantized_concat_op.cc
+++ b/tensorflow/core/kernels/quantized_concat_op.cc
@@ -246,4 +246,16 @@ REGISTER_QUANTIZED_CONCAT(qint32);
 
 #undef REGISTER_QUANTIZED_CONCAT
 
+#ifdef INTEL_MKL
+#define REGISTER_QUANTIZED_CONCATV2(type)                \
+  REGISTER_KERNEL_BUILDER(Name("QuantizedConcatV2")      \
+                              .Device(DEVICE_CPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("axis"),       \
+                          QuantizedConcatOp<type>)
+
+REGISTER_QUANTIZED_CONCATV2(quint8);
+REGISTER_QUANTIZED_CONCATV2(qint32);
+#endif
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc
index edb2b10e3d69b6ac93c13b875d00fa9de7ed5362..a5603885236245de7d8cc3e49c6d836ba6c2af54 100644
--- a/tensorflow/core/kernels/random_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "tensorflow/core/kernels/random_op.h"
+#include "tensorflow/core/kernels/random_op_gpu.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -36,170 +37,6 @@ namespace functor {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-template <class Distribution, bool VariableSamplesPerOutput>
-struct FillPhiloxRandomKernel;
-
-template <typename T, int ElementCount>
-class SampleCopier {
- public:
-  inline __device__ void operator()(
-      T* buf, const tensorflow::random::Array<T, ElementCount>& array) const {
-#pragma unroll
-    for (int i = 0; i < ElementCount; i++) {
-      buf[i] = array[i];
-    }
-  }
-};
-
-template <>
-class SampleCopier<float, 4> {
- public:
-  // Copies the elements from the array to buf. buf must be 128-bit aligned,
-  // which is true for tensor data, and all offsets that are a multiple of the
-  // vector size (because the vectors are 128 bits long).
-  inline __device__ void operator()(
-      float* buf, const tensorflow::random::Array<float, 4>& array) const {
-    // NOTE(ringwalt): It's not safe to cast &array[0] to a float4, because they
-    // have 32-bit alignment vs 128-bit alignment. There seems to be no
-    // performance loss when assigning each element to a vector.
-    float4 vec;
-    vec.x = array[0];
-    vec.y = array[1];
-    vec.z = array[2];
-    vec.w = array[3];
-    float4* buf_vector = reinterpret_cast<float4*>(buf);
-    *buf_vector = vec;
-  }
-};
-
-template <>
-class SampleCopier<int32, 4> {
- public:
-  // Copies the elements from the array to buf. buf must be 128-bit aligned,
-  // which is true for tensor data, and all offsets that are a multiple of the
-  // vector size (because the vectors are 128 bits long).
-  inline __device__ void operator()(
-      int32* buf, const tensorflow::random::Array<int32, 4>& array) const {
-    int4 vec;
-    vec.x = array[0];
-    vec.y = array[1];
-    vec.z = array[2];
-    vec.w = array[3];
-    int4* buf_vector = reinterpret_cast<int4*>(buf);
-    *buf_vector = vec;
-  }
-};
-
-template <>
-class SampleCopier<double, 2> {
- public:
-  // Copies the elements from the array to buf. buf must be 128-bit aligned,
-  // which is true for tensor data, and all offsets that are a multiple of the
-  // vector size (because the vectors are 128 bits long).
-  inline __device__ void operator()(
-      double* buf, const tensorflow::random::Array<double, 2>& array) const {
-    double2 vec;
-    vec.x = array[0];
-    vec.y = array[1];
-    double2* buf_vector = reinterpret_cast<double2*>(buf);
-    *buf_vector = vec;
-  }
-};
-
-template <>
-class SampleCopier<int64, 2> {
- public:
-  // Copies the elements from the array to buf. buf must be 128-bit aligned,
-  // which is true for tensor data, and all offsets that are a multiple of the
-  // vector size (because the vectors are 128 bits long).
-  inline __device__ void operator()(
-      int64* buf, const tensorflow::random::Array<int64, 2>& array) const {
-    longlong2 vec;
-    vec.x = array[0];
-    vec.y = array[1];
-    longlong2* buf_vector = reinterpret_cast<longlong2*>(buf);
-    *buf_vector = vec;
-  }
-};
-
-// A cuda kernel to fill the data with random numbers from the specified
-// distribution. Each output takes a fixed number of samples.
-template <class Distribution>
-struct FillPhiloxRandomKernel<Distribution, false> {
-  typedef typename Distribution::ResultElementType T;
-  PHILOX_DEVICE_FUNC void Run(random::PhiloxRandom gen, T* data, int64 size,
-                              Distribution dist) {
-    const int kGroupSize = Distribution::kResultElementCount;
-
-    const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const int32 total_thread_count = gridDim.x * blockDim.x;
-    int32 offset = thread_id * kGroupSize;
-    gen.Skip(thread_id);
-
-    const SampleCopier<T, kGroupSize> copier;
-    while (offset + kGroupSize <= size) {
-      const typename Distribution::ResultType samples = dist(&gen);
-      copier(&data[offset], samples);
-
-      offset += total_thread_count * kGroupSize;
-      gen.Skip(total_thread_count - 1);
-    }
-
-    typename Distribution::ResultType samples = dist(&gen);
-    for (int i = 0; i < kGroupSize; ++i) {
-      if (offset >= size) {
-        return;
-      }
-      data[offset] = samples[i];
-      ++offset;
-    }
-  }
-};
-
-// A cuda kernel to fill the data with random numbers from the specified
-// distribution. Each output takes a variable number of samples.
-template <class Distribution>
-struct FillPhiloxRandomKernel<Distribution, true> {
-  typedef typename Distribution::ResultElementType T;
-  PHILOX_DEVICE_FUNC void Run(const random::PhiloxRandom& base_gen, T* data,
-                              int64 size, Distribution dist) {
-    using random::PhiloxRandom;
-    using random::SingleSampleAdapter;
-
-    const int kReservedSamplesPerOutput = 256;
-    const int kGroupSize = Distribution::kResultElementCount;
-    const int kGeneratorSkipPerOutputGroup = kGroupSize *
-                                             kReservedSamplesPerOutput /
-                                             PhiloxRandom::kResultElementCount;
-
-    const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const int32 total_thread_count = gridDim.x * blockDim.x;
-    int64 group_index = thread_id;
-    int64 offset = group_index * kGroupSize;
-
-    while (offset < size) {
-      // Since each output takes a variable number of samples, we need to
-      // realign the generator to the beginning for the current output group
-      PhiloxRandom gen = base_gen;
-      gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
-      SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
-
-      typename Distribution::ResultType samples = dist(&single_samples);
-
-      for (int i = 0; i < kGroupSize; ++i) {
-        if (offset >= size) {
-          return;
-        }
-        data[offset] = samples[i];
-        ++offset;
-      }
-
-      offset += (total_thread_count - 1) * kGroupSize;
-      group_index += total_thread_count;
-    }
-  }
-};
-
 // A simple launch pad to call the correct function templates to fill the data
 template <class Distribution>
 __global__ void __launch_bounds__(1024)
@@ -224,7 +61,7 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
 
   FillPhiloxRandomKernelLaunch<Distribution>
       <<<num_blocks, block_size, 0, d.stream()>>>(gen, data, size, dist);
-};
+}
 
 // Explicit instantiation of the GPU distributions functors
 // clang-format off
diff --git a/tensorflow/core/kernels/random_op_gpu.h b/tensorflow/core/kernels/random_op_gpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..e32c755d78259a76c0dbad16efb871e7dfc8216d
--- /dev/null
+++ b/tensorflow/core/kernels/random_op_gpu.h
@@ -0,0 +1,206 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_RANDOM_OP_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_RANDOM_OP_GPU_H_
+
+#if defined(__CUDACC__)
+
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <class Distribution, bool VariableSamplesPerOutput>
+struct FillPhiloxRandomKernel;
+
+template <class Distribution>
+struct FillPhiloxRandomKernel<Distribution, false> {
+  typedef typename Distribution::ResultElementType T;
+  PHILOX_DEVICE_FUNC void Run(random::PhiloxRandom gen, T* data, int64 size,
+                              Distribution dist);
+};
+
+template <class Distribution>
+struct FillPhiloxRandomKernel<Distribution, true> {
+  typedef typename Distribution::ResultElementType T;
+  PHILOX_DEVICE_FUNC void Run(const random::PhiloxRandom& base_gen, T* data,
+                              int64 size, Distribution dist);
+};
+
+template <typename T, int ElementCount>
+class SampleCopier {
+ public:
+  inline __device__ void operator()(
+      T* buf, const tensorflow::random::Array<T, ElementCount>& array) const {
+#pragma unroll
+    for (int i = 0; i < ElementCount; i++) {
+      buf[i] = array[i];
+    }
+  }
+};
+
+template <>
+class SampleCopier<float, 4> {
+ public:
+  // Copies the elements from the array to buf. buf must be 128-bit aligned,
+  // which is true for tensor data, and all offsets that are a multiple of the
+  // vector size (because the vectors are 128 bits long).
+  inline __device__ void operator()(
+      float* buf, const tensorflow::random::Array<float, 4>& array) const {
+    // NOTE(ringwalt): It's not safe to cast &array[0] to a float4, because they
+    // have 32-bit alignment vs 128-bit alignment. There seems to be no
+    // performance loss when assigning each element to a vector.
+    float4 vec;
+    vec.x = array[0];
+    vec.y = array[1];
+    vec.z = array[2];
+    vec.w = array[3];
+    float4* buf_vector = reinterpret_cast<float4*>(buf);
+    *buf_vector = vec;
+  }
+};
+
+template <>
+class SampleCopier<int32, 4> {
+ public:
+  // Copies the elements from the array to buf. buf must be 128-bit aligned,
+  // which is true for tensor data, and all offsets that are a multiple of the
+  // vector size (because the vectors are 128 bits long).
+  inline __device__ void operator()(
+      int32* buf, const tensorflow::random::Array<int32, 4>& array) const {
+    int4 vec;
+    vec.x = array[0];
+    vec.y = array[1];
+    vec.z = array[2];
+    vec.w = array[3];
+    int4* buf_vector = reinterpret_cast<int4*>(buf);
+    *buf_vector = vec;
+  }
+};
+
+template <>
+class SampleCopier<double, 2> {
+ public:
+  // Copies the elements from the array to buf. buf must be 128-bit aligned,
+  // which is true for tensor data, and all offsets that are a multiple of the
+  // vector size (because the vectors are 128 bits long).
+  inline __device__ void operator()(
+      double* buf, const tensorflow::random::Array<double, 2>& array) const {
+    double2 vec;
+    vec.x = array[0];
+    vec.y = array[1];
+    double2* buf_vector = reinterpret_cast<double2*>(buf);
+    *buf_vector = vec;
+  }
+};
+
+template <>
+class SampleCopier<int64, 2> {
+ public:
+  // Copies the elements from the array to buf. buf must be 128-bit aligned,
+  // which is true for tensor data, and all offsets that are a multiple of the
+  // vector size (because the vectors are 128 bits long).
+  inline __device__ void operator()(
+      int64* buf, const tensorflow::random::Array<int64, 2>& array) const {
+    longlong2 vec;
+    vec.x = array[0];
+    vec.y = array[1];
+    longlong2* buf_vector = reinterpret_cast<longlong2*>(buf);
+    *buf_vector = vec;
+  }
+};
+
+// A cuda kernel to fill the data with random numbers from the specified
+// distribution. Each output takes a fixed number of samples.
+template <class Distribution>
+PHILOX_DEVICE_FUNC void FillPhiloxRandomKernel<Distribution, false>::Run(
+    random::PhiloxRandom gen, T* data, int64 size, Distribution dist) {
+  const int kGroupSize = Distribution::kResultElementCount;
+
+  const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const int32 total_thread_count = gridDim.x * blockDim.x;
+  int32 offset = thread_id * kGroupSize;
+  gen.Skip(thread_id);
+
+  const SampleCopier<T, kGroupSize> copier;
+  while (offset + kGroupSize <= size) {
+    const typename Distribution::ResultType samples = dist(&gen);
+    copier(&data[offset], samples);
+
+    offset += total_thread_count * kGroupSize;
+    gen.Skip(total_thread_count - 1);
+  }
+
+  typename Distribution::ResultType samples = dist(&gen);
+  for (int i = 0; i < kGroupSize; ++i) {
+    if (offset >= size) {
+      return;
+    }
+    data[offset] = samples[i];
+    ++offset;
+  }
+}
+
+// A cuda kernel to fill the data with random numbers from the specified
+// distribution. Each output takes a variable number of samples.
+template <class Distribution>
+PHILOX_DEVICE_FUNC void FillPhiloxRandomKernel<Distribution, true>::Run(
+    const random::PhiloxRandom& base_gen, T* data, int64 size,
+    Distribution dist) {
+  using random::PhiloxRandom;
+  using random::SingleSampleAdapter;
+
+  const int kReservedSamplesPerOutput = 256;
+  const int kGroupSize = Distribution::kResultElementCount;
+  const int kGeneratorSkipPerOutputGroup = kGroupSize *
+                                           kReservedSamplesPerOutput /
+                                           PhiloxRandom::kResultElementCount;
+
+  const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const int32 total_thread_count = gridDim.x * blockDim.x;
+  int64 group_index = thread_id;
+  int64 offset = group_index * kGroupSize;
+
+  while (offset < size) {
+    // Since each output takes a variable number of samples, we need to
+    // realign the generator to the beginning for the current output group
+    PhiloxRandom gen = base_gen;
+    gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+    SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+    typename Distribution::ResultType samples = dist(&single_samples);
+
+    for (int i = 0; i < kGroupSize; ++i) {
+      if (offset >= size) {
+        return;
+      }
+      data[offset] = samples[i];
+      ++offset;
+    }
+
+    offset += (total_thread_count - 1) * kGroupSize;
+    group_index += total_thread_count;
+  }
+}
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // defined(__CUDACC__)
+
+#endif  // TENSORFLOW_CORE_KERNELS_RANDOM_OP_GPU_H_
diff --git a/tensorflow/core/kernels/range_sampler.cc b/tensorflow/core/kernels/range_sampler.cc
index d682cd3b52db50575480c749b9b8e2633c4b8f07..c1457b34f1d552edc1fd821e3d7fff53b93df9f1 100644
--- a/tensorflow/core/kernels/range_sampler.cc
+++ b/tensorflow/core/kernels/range_sampler.cc
@@ -294,7 +294,7 @@ Status FixedUnigramSampler::LoadFromFile(Env* env, const string& vocab_file,
     // Skip entries that do not belong to this shard.
     if (word_id % num_shards_ == shard_) {
       float w = 0.0;
-      if (!strings::safe_strtof(cols.at(cols.size() - 1).c_str(), &w)) {
+      if (!strings::safe_strtof(cols.at(cols.size() - 1), &w)) {
         return errors::InvalidArgument("Wrong vocabulary format at line: ",
                                        line);
       }
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index ffa41ece49640a7de5c71e7d3b87f42522021d38..e9900e58684b346aabf75349fdd44b93fbf1d497 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -40,6 +40,20 @@ namespace functor {
 
 typedef Eigen::GpuDevice GPUDevice;
 
+template <typename T>
+struct Square {
+  __host__ __device__ T operator()(const T& a) const {
+    return a * Eigen::numext::conj(a);
+  }
+};
+
+template <typename T>
+struct Sqrt {
+  __host__ __device__ T operator()(const T& a) const {
+    return Eigen::numext::sqrt(a);
+  }
+};
+
 template <typename T>
 struct Sum {
   __host__ __device__ T operator()(const T& a, const T& b) const {
@@ -500,7 +514,7 @@ void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in,
     BlockReduceKernel<IN_T, OUT_T, num_threads>
         <<<num_blocks, num_threads, 0, cu_stream>>>(in, out, in_size, op, init);
     return;
-  } else if (in_size <= 1 << 19) {
+  } else if (in_size <= 1 << 18) {
     const int num_threads = 256;
     const int num_blocks = std::min(32, Eigen::divup(in_size, num_threads));
     // it seems like tailoring this to the GPU
@@ -884,6 +898,31 @@ struct ReduceFunctor<GPUDevice, Eigen::internal::SumReducer<T>> {
   }
 };
 
+// TODO(rmlarsen): Specialize for float16.
+template <typename T>
+struct ReduceFunctor<GPUDevice, functor::EuclideanNormReducer<T>> {
+  template <typename OUT_T, typename IN_T, typename ReductionAxes>
+  static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in,
+                     const ReductionAxes& reduction_axes,
+                     const functor::EuclideanNormReducer<T>& reducer) {
+    typedef cub::TransformInputIterator<T, Square<T>, T*> inputIterType;
+    inputIterType input_itr((T*)in.data(), Square<T>());
+    typedef TransformOutputIterator<T, T, Sqrt<T>> outputIterType;
+    outputIterType output_itr((T*)out.data(), Sqrt<T>());
+    ReduceImpl<T, Sum<T>, outputIterType, inputIterType, ReductionAxes>(
+        ctx, output_itr, input_itr, in.rank(), in.dimension(0),
+        in.rank() >= 2 ? in.dimension(1) : 1,
+        in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes,
+        Sum<T>());
+  }
+
+  template <typename OUT_T>
+  static void FillIdentity(const GPUDevice& d, OUT_T out,
+                           const functor::EuclideanNormReducer<T>& reducer) {
+    FillIdentityEigenImpl(d, To32Bit(out), reducer);
+  }
+};
+
 template <typename T>
 struct ReduceFunctor<GPUDevice, functor::MeanReducer<T>> {
   template <typename OUT_T, typename IN_T, typename ReductionAxes>
diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h
index 2331599b72f46df7a34e9553d5bd41a7613409da..0a1568bdc2521addb954bdd472164922e4f7d0f5 100644
--- a/tensorflow/core/kernels/reduction_ops.h
+++ b/tensorflow/core/kernels/reduction_ops.h
@@ -33,6 +33,12 @@ struct MeanReducer {
   Scalar initialize() const { return Scalar(0); }
 };
 
+// Dummy class used for template specialization for l2-norm reduction.
+template <typename Scalar>
+struct EuclideanNormReducer {
+  Scalar initialize() const { return Scalar(0); }
+};
+
 template <typename Device, typename OUT_T, typename IN_T,
           typename ReductionAxes, typename Reducer>
 struct ReduceEigenImpl {
@@ -56,6 +62,39 @@ struct ReduceEigenImpl<Device, OUT_T, IN_T, ReductionAxes,
   }
 };
 
+// TODO(rmlarsen): Refactor this such that taking the sqrt can be optional
+// controlled by an attribute.
+template <typename Device, typename OUT_T, typename IN_T,
+          typename ReductionAxes, typename Scalar>
+struct ReduceEigenImpl<Device, OUT_T, IN_T, ReductionAxes,
+                       functor::EuclideanNormReducer<Scalar>> {
+  void operator()(const Device& d, OUT_T out, IN_T in,
+                  const ReductionAxes& reduction_axes,
+                  const functor::EuclideanNormReducer<Scalar>& reducer) {
+    static_assert(std::is_same<Scalar, typename OUT_T::Scalar>::value, "");
+    Eigen::internal::SumReducer<Scalar> sum_reducer;
+    out.device(d) =
+        (in * in.conjugate()).reduce(reduction_axes, sum_reducer).sqrt();
+  }
+};
+
+template <typename Device, typename OUT_T, typename IN_T,
+          typename ReductionAxes>
+struct ReduceEigenImpl<Device, OUT_T, IN_T, ReductionAxes,
+                       functor::EuclideanNormReducer<bfloat16>> {
+  void operator()(const Device& d, OUT_T out, IN_T in,
+                  const ReductionAxes& reduction_axes,
+                  const functor::EuclideanNormReducer<bfloat16>& reducer) {
+    static_assert(std::is_same<bfloat16, typename OUT_T::Scalar>::value, "");
+    Eigen::internal::SumReducer<float> sum_reducer;
+    auto in_as_float = in.template cast<float>();
+    out.device(d) = (in_as_float * in_as_float.conjugate())
+                        .reduce(reduction_axes, sum_reducer)
+                        .sqrt()
+                        .template cast<bfloat16>();
+  }
+};
+
 // For most reducers, the identity is Reducer::initialize()
 template <typename Reducer>
 struct Identity {
diff --git a/tensorflow/core/kernels/reduction_ops_euclidean.cc b/tensorflow/core/kernels/reduction_ops_euclidean.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9f4bf50e7ca0ecf8506b260829cae2127305cadb
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_euclidean.cc
@@ -0,0 +1,81 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+#define REGISTER_CPU_KERNELS(type)                                           \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_CPU)                            \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int32>("Tidx"),                \
+                          ReductionOp<CPUDevice, type, int32,                \
+                                      functor::EuclideanNormReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_CPU)                            \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int64>("Tidx"),                \
+                          ReductionOp<CPUDevice, type, int64,                \
+                                      functor::EuclideanNormReducer<type>>);
+TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNELS(type)                                           \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_GPU)                            \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int32>("Tidx")                 \
+                              .HostMemory("reduction_indices"),              \
+                          ReductionOp<GPUDevice, type, int32,                \
+                                      functor::EuclideanNormReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_GPU)                            \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int64>("Tidx")                 \
+                              .HostMemory("reduction_indices"),              \
+                          ReductionOp<GPUDevice, type, int64,                \
+                                      functor::EuclideanNormReducer<type>>);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_complex64(REGISTER_GPU_KERNELS);
+TF_CALL_complex128(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+
+#endif
+
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(type)                                          \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_SYCL)                           \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int32>("Tidx")                 \
+                              .HostMemory("reduction_indices"),              \
+                          ReductionOp<SYCLDevice, type, int32,               \
+                                      functor::EuclideanNormReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_SYCL)                           \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int64>("Tidx")                 \
+                              .HostMemory("reduction_indices"),              \
+                          ReductionOp<SYCLDevice, type, int64,               \
+                                      functor::EuclideanNormReducer<type>>);
+REGISTER_SYCL_KERNELS(float);
+REGISTER_SYCL_KERNELS(double);
+#undef REGISTER_SYCL_KERNELS
+#endif  // TENSORFLOW_USE_SYCL
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc
index c44a40b3b38f5a37574d0d81b7b67adcf27451e1..662f24d9054ab2cfd312ea933f2a7769c6e3983b 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc
@@ -53,6 +53,7 @@ typedef TTypes<float>::Tensor::Index Index;
 
 DEFINE_FOR_TYPE_AND_R(complex128, Eigen::internal::SumReducer<complex128>);
 DEFINE_FOR_TYPE_AND_R(complex128, functor::MeanReducer<complex128>);
+DEFINE_FOR_TYPE_AND_R(complex128, functor::EuclideanNormReducer<complex128>);
 DEFINE_FOR_TYPE_AND_R(complex128, Eigen::internal::ProdReducer<complex128>);
 #undef DEFINE_FOR_TYPE_AND_R
 #undef DEFINE
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc
index 1921130ac043d9d1bfdea415c59aafcedcc31ef3..8ab2a6e13e52b0c92bfde2a2c6acf4423dc5976b 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc
@@ -53,6 +53,7 @@ typedef TTypes<float>::Tensor::Index Index;
 
 DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::SumReducer<complex64>);
 DEFINE_FOR_TYPE_AND_R(complex64, functor::MeanReducer<complex64>);
+DEFINE_FOR_TYPE_AND_R(complex64, functor::EuclideanNormReducer<complex64>);
 DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::ProdReducer<complex64>);
 #undef DEFINE_FOR_TYPE_AND_R
 #undef DEFINE
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc
index 119f726b929bd9c599e26684fede9890efceb2f2..c492308a9162596235e8d07e9b376abbd89c2007 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc
@@ -51,11 +51,12 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                          \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);        \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \
+#define DEFINE_FOR_ALL_REDUCERS(T)                            \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);          \
+  DEFINE_FOR_TYPE_AND_R(T, functor::EuclideanNormReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>);   \
   DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>)
 
 DEFINE_FOR_ALL_REDUCERS(double);
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc
index 70ba4abac48bcfe10d577a120cf08fdd8650f367..b006311c125c1e8e86d499ce125aa7cd817f9d5f 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc
@@ -51,11 +51,12 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                          \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);        \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \
+#define DEFINE_FOR_ALL_REDUCERS(T)                            \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);          \
+  DEFINE_FOR_TYPE_AND_R(T, functor::EuclideanNormReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>);   \
   DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>)
 
 DEFINE_FOR_ALL_REDUCERS(float);
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc
index 82f6d7df952fcd8b0aaa3561efd4a4bca93e4dce..91a33b92cb6663310d6cfee9d20127b960e6a11b 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc
@@ -51,11 +51,12 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                          \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);        \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \
+#define DEFINE_FOR_ALL_REDUCERS(T)                            \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);          \
+  DEFINE_FOR_TYPE_AND_R(T, functor::EuclideanNormReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>);   \
   DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>)
 
 DEFINE_FOR_ALL_REDUCERS(int32);
diff --git a/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc b/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc
index db050fdea38bd6db58424da72ff75e79e9151a09..f33d504e25a202c5ce229276611c0958f97f8eee 100644
--- a/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc
@@ -51,8 +51,9 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                          \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
+#define DEFINE_FOR_ALL_REDUCERS(T)                            \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, functor::EuclideanNormReducer<T>); \
   DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);
 
 DEFINE_FOR_ALL_REDUCERS(Eigen::half);
diff --git a/tensorflow/core/kernels/reduction_ops_test.cc b/tensorflow/core/kernels/reduction_ops_test.cc
index fe8ea59f1be521166d0e42295e79d1bb5a242750..359d7dbeca58be8643e51a1ad2248ccd57f67e79 100644
--- a/tensorflow/core/kernels/reduction_ops_test.cc
+++ b/tensorflow/core/kernels/reduction_ops_test.cc
@@ -164,6 +164,11 @@ static void BM_Mean2DToScalarGPU(int iters, int num_x, int num_y) {
 }
 BENCHMARK(BM_Mean2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
 
+static void BM_EuclideanNorm2DToScalarGPU(int iters, int num_x, int num_y) {
+  ReduceToScalar<float>(iters, "gpu", "EuclideanNorm", num_x, num_y);
+}
+BENCHMARK(BM_EuclideanNorm2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
+
 static void BM_Max2DToScalarGPU(int iters, int num_x, int num_y) {
   ReduceToScalar<float>(iters, "gpu", "Max", num_x, num_y);
 }
diff --git a/tensorflow/core/kernels/save_v2_op_test.cc b/tensorflow/core/kernels/save_v2_op_test.cc
index 82e566d35fefa98f96b00a285af618ff98f3da69..589d9639fb4d17e6f3423b92c7d692a7abc25364 100644
--- a/tensorflow/core/kernels/save_v2_op_test.cc
+++ b/tensorflow/core/kernels/save_v2_op_test.cc
@@ -67,9 +67,8 @@ TEST_F(SaveV2OpTest, Simple) {
                    [&tensornames](int x) -> string { return tensornames[x]; });
 
   // Add the slice specs
-  AddInput<string>(TensorShape({13}), [&tensornames](int x) -> string {
-    return "" /* saves in full */;
-  });
+  AddInput<string>(TensorShape({13}),
+                   [](int x) -> string { return "" /* saves in full */; });
 
   // Add a 1-d bool tensor
   AddInput<bool>(TensorShape({2}), [](int x) -> bool { return x != 0; });
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 2c1f613c18cbe6b1e9215e19842436a4ee9c50bc..9c9e7370ac44bfb704f5491e2c572e961f188e3a 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -53,8 +53,6 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-using Eigen::operator==;
-
 template <typename T>
 using BasicMatrix = Eigen::Tensor<T, 2, Eigen::RowMajor>;
 
diff --git a/tensorflow/core/kernels/spectrogram_test_utils.cc b/tensorflow/core/kernels/spectrogram_test_utils.cc
index 872a6e9d1bcce09765d1531c5f2898b2badc66a7..bb9d18e915a5297a3561be1f3f6f2de338855d1b 100644
--- a/tensorflow/core/kernels/spectrogram_test_utils.cc
+++ b/tensorflow/core/kernels/spectrogram_test_utils.cc
@@ -140,9 +140,9 @@ void ReadCSVFileToComplexVectorOrDie(
       for (std::vector<string>::const_iterator j = parts.begin();
            j != parts.end(); ++j) {
         if (j->find_first_of("ij") != string::npos) {
-          strings::safe_strtod((*j).c_str(), &imaginary_part);
+          strings::safe_strtod(*j, &imaginary_part);
         } else {
-          strings::safe_strtod((*j).c_str(), &real_part);
+          strings::safe_strtod(*j, &real_part);
         }
       }
       data_line.push_back(std::complex<double>(real_part, imaginary_part));
diff --git a/tensorflow/core/kernels/stack.cc b/tensorflow/core/kernels/stack.cc
index 2af6b4b8148807df9e1f7c0de65f664efe6acc79..033b9f34780a9fc8790d5aaa07501dd013f14750 100644
--- a/tensorflow/core/kernels/stack.cc
+++ b/tensorflow/core/kernels/stack.cc
@@ -244,9 +244,9 @@ void StackPushOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
     DeviceContext* device_ctxt = ctx->op_device_context();
     auto device = static_cast<tensorflow::Device*>(ctx->device());
     Allocator* allocator = device->GetAllocator(alloc_attrs);
-    AllocatorStats stats;
-    allocator->GetStats(&stats);
-    if (stats.bytes_in_use > (stats.bytes_limit * kOccupancy)) {
+    absl::optional<AllocatorStats> stats = allocator->GetStats();
+    if (stats && *stats->bytes_limit &&
+        stats->bytes_in_use > (*stats->bytes_limit * kOccupancy)) {
       // Asynchronously copy the tensor from GPU to CPU memory.
       // TODO(yuanbyu): Swap the oldest tensor first.
       AllocatorAttributes host_alloc_attrs;
diff --git a/tensorflow/core/kernels/stateful_random_ops.cc b/tensorflow/core/kernels/stateful_random_ops.cc
index 7cbbfd490e6232c6730779418a4d50d4bb81d2fb..62c4ee3bd69fd7e87dc501c1e8c397f73a31b92c 100644
--- a/tensorflow/core/kernels/stateful_random_ops.cc
+++ b/tensorflow/core/kernels/stateful_random_ops.cc
@@ -15,251 +15,173 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include <algorithm>
-#include <cmath>
-#include <memory>
-
-#include "absl/strings/str_join.h"
-#include "absl/types/variant.h"
-#include "tensorflow/core/common_runtime/device.h"
-#include "tensorflow/core/framework/bounds_check.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/dense_update_functor.h"
-#include "tensorflow/core/kernels/gather_functor.h"
+#include "tensorflow/core/kernels/stateful_random_ops.h"
 #include "tensorflow/core/kernels/random_op.h"
-#include "tensorflow/core/kernels/resource_variable_ops.h"
-#include "tensorflow/core/kernels/scatter_functor.h"
 #include "tensorflow/core/kernels/training_op_helpers.h"
-#include "tensorflow/core/kernels/variable_ops.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/refcount.h"
-#include "tensorflow/core/lib/random/random_distributions.h"
-#include "tensorflow/core/lib/random/simple_philox.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/mem.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/util.h"
-#include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
 
-using CPUDevice = Eigen::ThreadPoolDevice;
-
-using random::PhiloxRandom;
-
-namespace {
-
-// 'Variable' doesn't support uint32 or uint64 yet (due to reasons explained
-// in b/111604096 and cl/171681867), so I use signed int here. I choose int64
-// instead of int32 because `VarHandleOp` doesn't support int32 on GPU.
-using StateElementType = int64;
-static constexpr DataType STATE_ELEMENT_DTYPE = DT_INT64;
-
-using Algorithm = StateElementType;
-static constexpr Algorithm RNG_ALG_PHILOX = 1;
-
-using SkippableRNG = absl::variant<PhiloxRandom>;
-
-// This function is for hiding the implementation detail about the
-// `absl::variant` index of each algorithm.
-Algorithm GetAlgorithm(SkippableRNG const& rng) {
-  auto idx = rng.index();
-  if (idx == 0) {
-    return RNG_ALG_PHILOX;
+template <typename Distribution>
+struct UpdateVariableAndFill_Philox<CPUDevice, Distribution> {
+  void operator()(OpKernelContext* ctx, const CPUDevice& device,
+                  int64 output_size, int64 alg_tag_skip,
+                  ScopedUnlockUnref* state_var_guard, Tensor* state_tensor,
+                  typename Distribution::ResultElementType* output_data) {
+    auto state_tensor_flat = state_tensor->flat<StateElementType>();
+    auto state_data = state_tensor_flat.data();
+    // Delegates to PhiloxRandom to do the actual increasing.
+    auto philox = GetPhiloxRandomFromMem(state_data + alg_tag_skip);
+    UpdateMemWithPhiloxRandom(philox, output_size, state_data + alg_tag_skip);
+    // No longer needs the lock.
+    state_var_guard->Release();
+    functor::FillPhiloxRandom<CPUDevice, Distribution>()(
+        ctx, device, philox, output_data, output_size, Distribution());
   }
-  // unreachable
-  return RNG_ALG_PHILOX;
-}
+};
 
-// Fills a buffer with random numbers sampled from a given distribution.
-template <class Device, class Distribution>
-Status FillRandom(OpKernelContext* ctx, const Device& device,
-                  SkippableRNG const& gen, int64 size, Distribution dist,
-                  typename Distribution::ResultElementType* data) {
-  auto algorithm = GetAlgorithm(gen);
-  if (algorithm == RNG_ALG_PHILOX) {
-    auto philox = absl::get<PhiloxRandom>(gen);
-    functor::FillPhiloxRandom<Device, Distribution>()(ctx, device, philox, data,
-                                                      size, dist);
-    return Status::OK();
-  } else {
-    // return errors::InvalidArgument("Unsupported algorithm id: ", algorithm);
-    return Status::OK();
+template <typename Device, typename Distribution>
+Status UpdateVariableAndFill(
+    OpKernelContext* ctx, int state_input_idx, bool read_alg_from_state,
+    Algorithm alg, int64 output_size,
+    typename Distribution::ResultElementType* output_data) {
+  Var* var = nullptr;
+  TF_RETURN_IF_ERROR(
+      LookupResource(ctx, HandleFromInput(ctx, state_input_idx), &var));
+  // Use `ScopedUnlockUnref` here instead of `mutex_lock` and `ScopedUnref`
+  // because the former supports early releasing which is needed by
+  // `UpdateVariableAndFill_Philox<CPU>` to avoid holding the lock while
+  // filling.
+  ScopedUnlockUnref state_var_guard(var);
+  Tensor* var_tensor = var->tensor();
+  if (var_tensor->dtype() != STATE_ELEMENT_DTYPE) {
+    return errors::InvalidArgument("dtype of RNG state variable must be ",
+                                   DataTypeString(STATE_ELEMENT_DTYPE),
+                                   ", not ",
+                                   DataTypeString(var_tensor->dtype()));
   }
-}
-
-// The following two functions use the contract "lower 32 bits for the first
-// uint32, higher 32 bits for the second". Note that this is endian-neutral,
-// unlike a direct memory copy `memcpy(output, &input, 8)`.
-void Int64ToUint32s(int64 input, uint32* output1, uint32* output2) {
-  auto u64 = static_cast<uint64>(input);
-  *output1 = static_cast<uint32>(u64);
-  *output2 = static_cast<uint32>(u64 >> 32);
-}
-
-int64 Uint32sToInt64(uint32 input1, uint32 input2) {
-  auto u64_1 = static_cast<uint64>(input1);
-  auto u64_2 = static_cast<uint64>(input2);
-  return static_cast<int64>(u64_1 | (u64_2 << 32));
-}
-
-void GetPhiloxStateFromTensor(Tensor const& tensor,
-                              PhiloxRandom::ResultType* counter,
-                              PhiloxRandom::Key* key) {
-  auto tensor_flat = tensor.flat<StateElementType>();
-  auto tensor_ptr = tensor_flat.data();
-  // tensor_ptr's index is added by 1 to skip the algorithm tag.
-  Int64ToUint32s(tensor_ptr[1], &(*counter)[0], &(*counter)[1]);
-  Int64ToUint32s(tensor_ptr[2], &(*counter)[2], &(*counter)[3]);
-  Int64ToUint32s(tensor_ptr[3], &(*key)[0], &(*key)[1]);
-}
-
-void WritePhiloxStateToTensor(PhiloxRandom::ResultType const& counter,
-                              PhiloxRandom::Key const& key, Tensor* tensor) {
-  auto tensor_flat = tensor->flat<StateElementType>();
-  auto tensor_ptr = tensor_flat.data();
-  // tensor_ptr's index is added by 1 to skip the algorithm tag.
-  tensor_ptr[1] = Uint32sToInt64(counter[0], counter[1]);
-  tensor_ptr[2] = Uint32sToInt64(counter[2], counter[3]);
-  tensor_ptr[3] = Uint32sToInt64(key[0], key[1]);
-}
-
-// A helper function that does the actual work for
-// 'MakeRNGCopyAndUpdateVariable'.
-template <typename Device>
-Status GetRNGCopyAndUpdateTensor(Tensor* tensor, int64 delta,
-                                 SkippableRNG* rng_copy);
-
-template <>
-Status GetRNGCopyAndUpdateTensor<CPUDevice>(Tensor* tensor, int64 delta,
-                                            SkippableRNG* rng_copy) {
-  // The dtype of `tensor` should be `StateElementType` and the first element
-  // is the algorithm.
-  if (tensor->dims() != 1) {
+  if (var_tensor->dims() != 1) {
     return errors::InvalidArgument(
-        "RNG state must have one and only one dimension, not ", tensor->dims());
+        "RNG state must have one and only one dimension, not ",
+        var_tensor->dims());
   }
-  auto tensor_flat = tensor->flat<StateElementType>();
-  if (tensor_flat.size() < 1) {
-    return errors::InvalidArgument("Size of tensor must be at least 1");
+  auto var_tensor_flat = var_tensor->flat<StateElementType>();
+  int64 alg_tag_skip = 0;
+  if (read_alg_from_state) {
+    alg_tag_skip = 1;
+    if (var_tensor_flat.size() < 1) {
+      return errors::InvalidArgument("Size of tensor must be at least 1");
+    }
+    alg = var_tensor_flat(0);
   }
-  auto algorithm = tensor_flat.data()[0];
-  if (algorithm == RNG_ALG_PHILOX) {
-    // Delegates to PhiloxRandom to do the actual increasing.
+  if (alg == RNG_ALG_PHILOX) {
     static_assert(std::is_same<StateElementType, int64>::value,
                   "StateElementType must be int64");
     static_assert(std::is_same<PhiloxRandom::ResultElementType, uint32>::value,
                   "PhiloxRandom::ResultElementType must be uint32");
-    auto counter_size = PhiloxRandom::ResultType::kElementCount;
-    auto key_size = PhiloxRandom::Key::kElementCount;
-    auto min_tensor_size = 1 + (counter_size + key_size) / 2;
-    if (tensor_flat.size() < min_tensor_size) {
+    if (var_tensor_flat.size() < alg_tag_skip + PHILOX_MIN_STATE_SIZE) {
       return errors::InvalidArgument(
           "For Philox algorithm, the size of state"
           " must be at least ",
-          min_tensor_size, "; got ", tensor_flat.size());
+          alg_tag_skip + PHILOX_MIN_STATE_SIZE, "; got ",
+          var_tensor_flat.size());
     }
-    PhiloxRandom::ResultType counter;
-    PhiloxRandom::Key key;
-    GetPhiloxStateFromTensor(*tensor, &counter, &key);
-    PhiloxRandom philox(counter, key);
-    auto old_philox = philox;
-    philox.Skip(delta);  // do the actual increasing
-    WritePhiloxStateToTensor(philox.counter(), philox.key(), tensor);
-    *rng_copy = SkippableRNG(old_philox);
+    TF_RETURN_IF_ERROR(PrepareToUpdateVariable<Device, StateElementType>(
+        ctx, var_tensor, var->copy_on_read_mode.load()));
+    UpdateVariableAndFill_Philox<Device, Distribution>()(
+        ctx, ctx->eigen_device<Device>(), output_size, alg_tag_skip,
+        &state_var_guard, var_tensor, output_data);
     return Status::OK();
   } else {
-    // return errors::InvalidArgument("Unsupported algorithm id: ", algorithm);
-    *rng_copy = SkippableRNG(PhiloxRandom());
-    return Status::OK();
+    return errors::InvalidArgument("Unsupported algorithm id: ", alg);
   }
 }
 
-// Gets a copy of the RNG and updates the variable. The copy can be used to
-// generate upto 'samples' random numbers, and the variable is updated as if
-// 'samples' random numbers have been generated (e.g. if the variable is a
-// counnter, the counter is increased by 'samples').
-template <class Device>
-Status MakeRNGCopyAndUpdateVariable(OpKernelContext* ctx, int input_idx,
-                                    int64 samples, SkippableRNG* rng_copy) {
-  Var* var = nullptr;
-  TF_RETURN_IF_ERROR(
-      LookupResource(ctx, HandleFromInput(ctx, input_idx), &var));
-  core::ScopedUnref s(var);
-  mutex_lock ml(*var->mu());
-  Tensor* var_tensor = var->tensor();
-  if (var_tensor->dtype() != STATE_ELEMENT_DTYPE) {
-    return errors::InvalidArgument("dtype of RNG state variable must be ",
-                                   DataTypeString(STATE_ELEMENT_DTYPE),
-                                   ", not ",
-                                   DataTypeString(var_tensor->dtype()));
-  }
-  TF_RETURN_IF_ERROR(PrepareToUpdateVariable<Device, StateElementType>(
-      ctx, var_tensor, var->copy_on_read_mode.load()));
-  TF_RETURN_IF_ERROR(
-      GetRNGCopyAndUpdateTensor<Device>(var_tensor, samples, rng_copy));
-  return Status::OK();
+// Preconditon: input(0) is an existing resource.
+template <typename Device, class Distribution>
+void ComputeImpl(OpKernelContext* ctx, int state_input_idx, int shape_input_idx,
+                 bool read_alg_from_state, Algorithm alg) {
+  using T = typename Distribution::ResultElementType;
+  const Tensor& shape_t = ctx->input(shape_input_idx);
+  TensorShape shape;
+  OP_REQUIRES_OK(ctx, ctx->op_kernel().MakeShape(shape_t, &shape));
+  Tensor* output;
+  OP_REQUIRES_OK(ctx, ctx->allocate_output(0, shape, &output));
+  auto output_flat = output->flat<T>();
+  OP_REQUIRES_OK(ctx, UpdateVariableAndFill<Device, Distribution>(
+                          ctx, state_input_idx, read_alg_from_state, alg,
+                          output_flat.size(), output_flat.data()));
 }
 
 template <typename Device, class Distribution>
 class StatefulRandomOp : public OpKernel {
  public:
-  using T = typename Distribution::ResultElementType;
   explicit StatefulRandomOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
 
-  // Assumes that input(0) is an existing resource.
   void Compute(OpKernelContext* ctx) override {
-    const Tensor& shape_t = ctx->input(1);
-    Tensor* output;
-    TensorShape shape;
-    OP_REQUIRES_OK(ctx, MakeShape(shape_t, &shape));
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, shape, &output));
-    if (shape.num_elements() == 0) return;
-
-    auto output_flat = output->flat<T>();
-    SkippableRNG rng;
-    // Multiplier 256 is the same as in FillPhiloxRandomTask; do not change
-    // it just here.
-    OP_REQUIRES_OK(ctx, MakeRNGCopyAndUpdateVariable<Device>(
-                            ctx, 0, output_flat.size() * 256, &rng));
-    // Fill in the random numbers
-    OP_REQUIRES_OK(ctx, FillRandom(ctx, ctx->eigen_device<Device>(), rng,
-                                   output_flat.size(), Distribution(),
-                                   output_flat.data()));
+    ComputeImpl<Device, Distribution>(ctx, 0, 1, true, 0);
   }
 };
 
-}  // namespace
+template <typename Device, class Distribution>
+class StatefulRandomOpV2 : public OpKernel {
+ public:
+  explicit StatefulRandomOpV2(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& alg_tensor = ctx->input(1);
+    OP_REQUIRES(ctx, alg_tensor.dims() == 0,
+                errors::InvalidArgument("algorithm must be of shape [], not ",
+                                        alg_tensor.shape().DebugString()));
+    auto alg = alg_tensor.flat<Algorithm>()(0);
+    ComputeImpl<Device, Distribution>(ctx, 0, 2, false, alg);
+  }
+};
 
 // So far the 'Distribution' type parameter is only used when the algorithm is
 // philox, so 'NormalDistribution<PhiloxRandom, ...>' is fine for now.
 #define REGISTER(DEVICE, TYPE)            \
   REGISTER_KERNEL_BUILDER(                \
-      Name("StatefulStandardNormal")      \
+      Name("StatefulStandardNormalV2")    \
           .Device(DEVICE_##DEVICE)        \
           .HostMemory("resource")         \
+          .HostMemory("algorithm")        \
+          .HostMemory("shape")            \
+          .TypeConstraint<TYPE>("dtype"), \
+      StatefulRandomOpV2<DEVICE##Device,  \
+                         random::NormalDistribution<PhiloxRandom, TYPE> >);
+
+// CPU also has the old 'StatefulStandardNormal' op for backward compatibility.
+#define REGISTER_CPU(TYPE)                \
+  REGISTER(CPU, TYPE)                     \
+  REGISTER_KERNEL_BUILDER(                \
+      Name("StatefulStandardNormal")      \
+          .Device(DEVICE_CPU)             \
+          .HostMemory("resource")         \
           .HostMemory("shape")            \
           .TypeConstraint<TYPE>("dtype"), \
-      StatefulRandomOp<DEVICE##Device,    \
+      StatefulRandomOp<CPUDevice,         \
                        random::NormalDistribution<PhiloxRandom, TYPE> >);
 
-#define REGISTER_CPU(TYPE) REGISTER(CPU, TYPE)
+#define REGISTER_GPU(TYPE) REGISTER(GPU, TYPE)
 
 TF_CALL_half(REGISTER_CPU);
 TF_CALL_bfloat16(REGISTER_CPU);
 TF_CALL_float(REGISTER_CPU);
 TF_CALL_double(REGISTER_CPU);
 
+#if GOOGLE_CUDA
+
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+
+#endif  // GOOGLE_CUDA
+
+#undef REGISTER_GPU
 #undef REGISTER_CPU
 #undef REGISTER
 
 // TODO(wangpeng): Add RNG ops for other distributions.
-// TODO(wangpeng): Add support for GPU and XLA.
+// TODO(wangpeng): Add support for XLA.
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/stateful_random_ops.h b/tensorflow/core/kernels/stateful_random_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..689c2be6647a575c7a3a25d8eae834c31019a695
--- /dev/null
+++ b/tensorflow/core/kernels/stateful_random_ops.h
@@ -0,0 +1,145 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_H_
+
+#include "tensorflow/core/framework/resource_var.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+
+namespace tensorflow {
+
+// 'Variable' doesn't support uint32 or uint64 yet (due to reasons explained
+// in b/111604096 and cl/171681867), so I use signed int here. I choose int64
+// instead of int32 because `VarHandleOp` doesn't support int32 on GPU.
+using StateElementType = int64;
+static constexpr DataType STATE_ELEMENT_DTYPE = DT_INT64;
+
+using Algorithm = StateElementType;
+static constexpr Algorithm RNG_ALG_PHILOX = 1;
+
+using random::PhiloxRandom;
+
+static constexpr int64 PHILOX_MIN_STATE_SIZE =
+    (PhiloxRandom::ResultType::kElementCount +
+     PhiloxRandom::Key::kElementCount) /
+    2;
+
+// The following 5 functions are made templates to avoid duplicate symbols when
+// linking.
+
+// The following two functions use the contract "lower 32 bits for the first
+// uint32, higher 32 bits for the second". Note that this is endian-neutral,
+// unlike a direct memory copy `memcpy(output, &input, 8)`.
+template <typename INT64>
+PHILOX_DEVICE_FUNC void Int64ToUint32s(INT64 input, uint32* output1,
+                                       uint32* output2) {
+  auto u64 = static_cast<uint64>(input);
+  *output1 = static_cast<uint32>(u64);
+  *output2 = static_cast<uint32>(u64 >> 32);
+}
+
+template <typename UINT32>
+PHILOX_DEVICE_FUNC int64 Uint32sToInt64(UINT32 input1, UINT32 input2) {
+  auto u64_1 = static_cast<uint64>(input1);
+  auto u64_2 = static_cast<uint64>(input2);
+  return static_cast<int64>(u64_1 | (u64_2 << 32));
+}
+
+template <typename STATE_ELEMENT_TYPE>
+PHILOX_DEVICE_FUNC PhiloxRandom
+GetPhiloxRandomFromMem(STATE_ELEMENT_TYPE const* ptr) {
+  PhiloxRandom::ResultType counter;
+  PhiloxRandom::Key key;
+  Int64ToUint32s(ptr[0], &counter[0], &counter[1]);
+  Int64ToUint32s(ptr[1], &counter[2], &counter[3]);
+  Int64ToUint32s(ptr[2], &key[0], &key[1]);
+  return PhiloxRandom(counter, key);
+}
+
+template <typename PHILOX_RANDOM>
+PHILOX_DEVICE_FUNC void WritePhiloxRandomToMem(PHILOX_RANDOM const& philox,
+                                               StateElementType* ptr) {
+  PhiloxRandom::ResultType const& counter = philox.counter();
+  PhiloxRandom::Key const& key = philox.key();
+  ptr[0] = Uint32sToInt64(counter[0], counter[1]);
+  ptr[1] = Uint32sToInt64(counter[2], counter[3]);
+  ptr[2] = Uint32sToInt64(key[0], key[1]);
+}
+
+template <typename PHILOX_RANDOM>
+PHILOX_DEVICE_FUNC void UpdateMemWithPhiloxRandom(PHILOX_RANDOM const& philox,
+                                                  int64 output_size,
+                                                  StateElementType* ptr) {
+  auto new_philox = philox;
+  // Multiplier 256 is the same as in `FillPhiloxRandomTask`; do not change
+  // it just here.
+  auto delta = output_size * 256;
+  new_philox.Skip(delta);  // do the actual increasing
+  WritePhiloxRandomToMem(new_philox, ptr);
+}
+
+// Does unlock and unref automatically when going out of scope, and also
+// supports early manual release.
+class ScopedUnlockUnref {
+ public:
+  explicit ScopedUnlockUnref(Var* var) : var_(var) {
+    if (var_) {
+      var_->mu()->lock();
+    }
+  }
+  void Release() {
+    if (var_) {
+      var_->mu()->unlock();
+      var_->Unref();
+      var_ = nullptr;
+    }
+  }
+  ~ScopedUnlockUnref() { Release(); }
+
+ private:
+  Var* var_;
+
+  ScopedUnlockUnref(const ScopedUnlockUnref&) = delete;
+  void operator=(const ScopedUnlockUnref&) = delete;
+};
+
+// A per-device helper function that does the actual work for
+// `UpdateVariableAndFill`.
+// Reason to use functor: C++ doesn't allow function-template partial
+// specialization.
+template <typename Device, typename Distribution>
+struct UpdateVariableAndFill_Philox;
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+#if GOOGLE_CUDA
+
+using GPUDevice = Eigen::GpuDevice;
+
+// Declares the partially GPU-specialized functor struct.
+template <typename Distribution>
+struct UpdateVariableAndFill_Philox<GPUDevice, Distribution> {
+  void operator()(OpKernelContext* ctx, const GPUDevice& device,
+                  int64 output_size, int64 alg_tag_skip,
+                  ScopedUnlockUnref* not_used, Tensor* state_tensor,
+                  typename Distribution::ResultElementType* output_data);
+};
+
+#endif  // GOOGLE_CUDA
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_H_
diff --git a/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc b/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ce0db7c56c896b1ebe7f384245e1d07d1754831
--- /dev/null
+++ b/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc
@@ -0,0 +1,96 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/random_op_gpu.h"
+#include "tensorflow/core/kernels/stateful_random_ops.h"
+#include "tensorflow/core/util/cuda_launch_config.h"
+
+namespace tensorflow {
+
+using random::PhiloxRandom;
+
+__device__ int thread_counter;
+
+template <typename Distribution>
+__global__ void FillKernel(
+    Distribution dist, int64 state_size, int64 output_size,
+    StateElementType* state_data,
+    typename Distribution::ResultElementType* output_data) {
+  // Threads in this block share `philox`. Thread 0 is responsible for
+  // initializing it.
+  __shared__ char philox_raw[sizeof(PhiloxRandom)];
+  auto philox = reinterpret_cast<PhiloxRandom*>(philox_raw);
+  if (threadIdx.x == 0) {
+    *philox = GetPhiloxRandomFromMem(state_data);
+  }
+  __syncthreads();
+  functor::FillPhiloxRandomKernel<Distribution,
+                                  Distribution::kVariableSamplesPerOutput>()
+      .Run(*philox, output_data, output_size, dist);
+  // The last thread updates the state.
+  auto total_thread_count = gridDim.x * blockDim.x;
+  auto old_counter_value = atomicAdd(&thread_counter, 1);
+  if (old_counter_value == total_thread_count - 1) {
+    UpdateMemWithPhiloxRandom(*philox, output_size, state_data);
+  }
+}
+
+template <typename Distribution>
+void UpdateVariableAndFill_Philox<GPUDevice, Distribution>::operator()(
+    OpKernelContext* ctx, const GPUDevice& d, int64 output_size,
+    int64 alg_tag_skip, ScopedUnlockUnref* not_used, Tensor* state_tensor,
+    typename Distribution::ResultElementType* output_data) {
+  OP_REQUIRES(
+      ctx, alg_tag_skip == 0,
+      errors::InvalidArgument(
+          "GPU kernel doesn't support reading algorithm from state variable, "
+          "so alg_tag_skip must be 0; got",
+          alg_tag_skip));
+  auto state_tensor_flat = state_tensor->flat<StateElementType>();
+  auto state_size = state_tensor_flat.size();
+  auto state_data = state_tensor_flat.data();
+
+  // maximize occupancy
+  const int kGroupSize = Distribution::kResultElementCount;
+  int work_element_count = (output_size + kGroupSize - 1) / kGroupSize;
+  CudaLaunchConfig cfg = GetCudaLaunchConfig(work_element_count, d,
+                                             FillKernel<Distribution>, 0, 0);
+
+  int zero = 0;
+  cudaMemcpyToSymbol(thread_counter, &zero, sizeof(int));
+  FillKernel<Distribution>
+      <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+          Distribution(), state_size, output_size, state_data, output_data);
+}
+
+// Explicit instantiation of the GPU distributions functors.
+
+// clang-format off
+// NVCC cannot handle ">>" properly
+template struct UpdateVariableAndFill_Philox<
+    GPUDevice, random::NormalDistribution<random::PhiloxRandom, Eigen::half> >;
+template struct UpdateVariableAndFill_Philox<
+    GPUDevice, random::NormalDistribution<random::PhiloxRandom, float> >;
+template struct UpdateVariableAndFill_Philox<
+    GPUDevice, random::NormalDistribution<random::PhiloxRandom, double> >;
+// clang-format on
+
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/string_to_number_op.cc b/tensorflow/core/kernels/string_to_number_op.cc
index 70dbd15c46cb341d8ad6ed6013b5b9ff8a5d61da..22742dd38e5d56bf3b9970bf6b01ff734f181169 100644
--- a/tensorflow/core/kernels/string_to_number_op.cc
+++ b/tensorflow/core/kernels/string_to_number_op.cc
@@ -51,7 +51,7 @@ class StringToNumberOp : public OpKernel {
     for (int i = 0; i < input_flat.size(); ++i) {
       OP_REQUIRES(
           context,
-          strings::SafeStringToNumeric<OutputType>(input_flat(i).c_str(),
+          strings::SafeStringToNumeric<OutputType>(input_flat(i),
                                                    &output_flat(i)),
           errors::InvalidArgument(kErrorMessage, input_flat(i).c_str()));
     }
diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc
index 5e3465d1dd6ce24a82525704f5223b6d9f0ac39f..d33c0cdb7f01a4d11204d20fd020941d544c45ee 100644
--- a/tensorflow/core/kernels/summary_kernels.cc
+++ b/tensorflow/core/kernels/summary_kernels.cc
@@ -53,6 +53,7 @@ class CreateSummaryFileWriterOp : public OpKernel {
                                   max_queue, flush_millis, logdir,
                                   filename_suffix, ctx->env(), s);
                             }));
+    core::ScopedUnref unref(s);
   }
 };
 REGISTER_KERNEL_BUILDER(Name("CreateSummaryFileWriter").Device(DEVICE_CPU),
@@ -89,6 +90,7 @@ class CreateSummaryDbWriterOp : public OpKernel {
                   db, experiment_name, run_name, user_name, ctx->env(), s));
               return Status::OK();
             }));
+    core::ScopedUnref unref(s);
   }
 };
 REGISTER_KERNEL_BUILDER(Name("CreateSummaryDbWriter").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/svd_op_gpu.cu.cc b/tensorflow/core/kernels/svd_op_gpu.cu.cc
index 8c3a58b108abe66f2b61b5153923bee192246cd1..9e308cfc0237aeb64754c81595e17ff6a06c16a5 100644
--- a/tensorflow/core/kernels/svd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/svd_op_gpu.cu.cc
@@ -93,9 +93,48 @@ class SvdOpGpu : public AsyncOpKernel {
   }
 
   void RunSVD(OpKernelContext* context, DoneCallback done, int64 m, int64 n,
-              int64 p, int64 batch_size, Scalar* input_ptr,
-              RealScalar* outputS_ptr, Scalar* outputU_ptr,
-              Scalar* outputVT_ptr, int* dev_info_ptr, CudaSolver* solver) {
+              int64 p, Tensor& M_copy, Tensor* S, Tensor* U, Tensor* V,
+              std::unique_ptr<CudaSolver> solver) {
+    // Compute U S V* = M.
+    // 1. cuSolver works in column-major rather than row-major.
+    // 2. Gesvd returns V*.
+    // 3. Hence M should be transposed before input and U (rather than V) should
+    // be transposed on output.
+
+    Tensor u_copy;
+    if (compute_uv_) {
+      TensorShape u_shape;
+      if (full_matrices_) {
+        u_shape = U->shape();
+      } else {
+        TensorShape shapeRaw = M_copy.shape();
+        shapeRaw.RemoveLastDims(2);
+        u_shape = shapeRaw;
+        u_shape.AddDim(p);
+        u_shape.AddDim(m);
+      }
+      OP_REQUIRES_OK_ASYNC(
+          context, solver->allocate_scoped_tensor(U->dtype(), u_shape, &u_copy),
+          done);
+    }
+
+    // get the pointers to the data
+    Scalar* input_ptr;
+    RealScalar* outputS_ptr;
+    Scalar* outputU_ptr = NULL;
+    Scalar* outputV_ptr = NULL;
+    auto input_reshaped = M_copy.template flat_inner_dims<Scalar, 3>();
+    input_ptr = input_reshaped.data();
+    outputS_ptr = S->template flat_inner_dims<RealScalar, 2>().data();
+    if (compute_uv_) {
+      outputU_ptr = u_copy.template flat_inner_dims<Scalar, 3>().data();
+      outputV_ptr = V->template flat_inner_dims<Scalar, 3>().data();
+    }
+    const int64 batch_size = input_reshaped.dimension(0);
+    std::vector<DeviceLapackInfo> dev_info;
+    dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "gesvd"));
+    int* dev_info_ptr = dev_info.back().mutable_data();
+
     // Save the input matrix
     // Needed for the n=1 fix, see below, since SVD destroys the input
     Tensor input_copy;
@@ -121,12 +160,12 @@ class SvdOpGpu : public AsyncOpKernel {
       if (compute_uv_) {
         if (full_matrices_) {
           outputU = outputU_ptr + batch * m * m;
-          outputVT = outputVT_ptr + batch * n * n;
+          outputVT = outputV_ptr + batch * n * n;
           jobu = 'A';
           jobvt = 'A';
         } else {
           outputU = outputU_ptr + batch * m * p;
-          outputVT = outputVT_ptr + batch * n * p;
+          outputVT = outputV_ptr + batch * n * p;
           jobu = 'S';
           jobvt = 'S';
         }
@@ -155,17 +194,24 @@ class SvdOpGpu : public AsyncOpKernel {
     if (compute_uv_ && n == 1) {
       // 1. compute the (batched) sum
       const GPUDevice& d = context->eigen_device<GPUDevice>();
-      d.memset(outputVT_ptr, 0, batch_size * sizeof(Scalar));
+      d.memset(outputV_ptr, 0, batch_size * sizeof(Scalar));
       Cuda2DLaunchConfig cfg2D = GetCuda2DLaunchConfig(batch_size, m, d);
       ComputeValueOfVKernel<<<cfg2D.block_count, cfg2D.thread_per_block, 0,
                               d.stream()>>>(
           cfg2D, m, full_matrices_ ? m : p, input_copy.flat<Scalar>().data(),
-          outputU_ptr, outputS_ptr, outputVT_ptr);
+          outputU_ptr, outputS_ptr, outputV_ptr);
       // 2. clamp V to -1 or +1
       CudaLaunchConfig cfg1D = GetCudaLaunchConfig(batch_size, d);
       ExtractSignOfVKernel<<<cfg1D.block_count, cfg1D.thread_per_block, 0,
-                             d.stream()>>>(cfg1D, outputVT_ptr);
+                             d.stream()>>>(cfg1D, outputV_ptr);
     }
+
+    if (compute_uv_) {
+      auto device = context->eigen_device<GPUDevice>();
+      OP_REQUIRES_OK_ASYNC(context, DoMatrixTranspose(device, u_copy, U), done);
+    }
+
+    CheckResult(context, std::move(done), dev_info, std::move(solver));
   }
 
   void CheckResult(OpKernelContext* context, DoneCallback done,
@@ -192,10 +238,9 @@ class SvdOpGpu : public AsyncOpKernel {
   void PerformSVD_MgeqN(OpKernelContext* context, DoneCallback done, int64 m,
                         int64 n, int64 p, const Tensor& M, Tensor* S, Tensor* U,
                         Tensor* V) {
+    // Transpose M, because cuSolver expects it to be column-major
     TensorShape shapeRaw = M.shape();
     shapeRaw.RemoveLastDims(2);
-
-    // Transpose M, because cuSolver expects it to be column-major
     TensorShape input_shape = shapeRaw;
     input_shape.AddDim(n);
     input_shape.AddDim(m);
@@ -210,58 +255,16 @@ class SvdOpGpu : public AsyncOpKernel {
     OP_REQUIRES_OK_ASYNC(context, DoMatrixTranspose(device, M, &input_copy),
                          done);
 
-    // I need to transpose U at the end
-    // Not V, because cuSolver work column-major
-    Tensor u_copy;
-    if (compute_uv_) {
-      TensorShape u_shape;
-      if (full_matrices_) {
-        u_shape = U->shape();
-      } else {
-        u_shape = shapeRaw;
-        u_shape.AddDim(p);
-        u_shape.AddDim(m);
-      }
-      OP_REQUIRES_OK_ASYNC(
-          context, solver->allocate_scoped_tensor(U->dtype(), u_shape, &u_copy),
-          done);
-    }
-
-    // get the pointers to the data
-    Scalar* input_ptr;
-    RealScalar* outputS_ptr;
-    Scalar* outputU_ptr = NULL;
-    Scalar* outputV_ptr = NULL;
-    auto input_reshaped = input_copy.template flat_inner_dims<Scalar, 3>();
-    input_ptr = input_reshaped.data();
-    outputS_ptr = S->template flat_inner_dims<RealScalar, 2>().data();
-    if (compute_uv_) {
-      outputU_ptr = u_copy.template flat_inner_dims<Scalar, 3>().data();
-      outputV_ptr = V->template flat_inner_dims<Scalar, 3>().data();
-    }
-
-    // call the SVD
-    const int64 batch_size = input_reshaped.dimension(0);
-    std::vector<DeviceLapackInfo> dev_info;
-    dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "gesvd"));
-    RunSVD(context, done, m, n, p, batch_size, input_ptr, outputS_ptr,
-           outputU_ptr, outputV_ptr, dev_info.back().mutable_data(),
-           solver.get());
-
-    // Transpose U
-    if (compute_uv_) {
-      OP_REQUIRES_OK_ASYNC(context, DoMatrixTranspose(device, u_copy, U), done);
-    }
-
-    // now check if the SVD operation succeeded or not
-    CheckResult(context, std::move(done), dev_info, std::move(solver));
+    // Call the SVD: compute U S V* = M.
+    RunSVD(context, done, m, n, p, input_copy, S, U, V, std::move(solver));
   }
 
   // The SVD if m < n
   void PerformSVD_MlessN(OpKernelContext* context, DoneCallback done, int64 m,
                          int64 n, int64 p, const Tensor& M, Tensor* S,
                          Tensor* U, Tensor* V) {
-    // Perform the SVD on M'
+    // Perform the SVD on M'. cuSolver works column major so don't need to
+    // transpose M.
 
     // Reuse the input buffer or make a copy for the SVD depending on whether
     // this op owns the input buffer exclusively. This is needed because the
@@ -281,55 +284,8 @@ class SvdOpGpu : public AsyncOpKernel {
                M.NumElements() * sizeof(Scalar));
     }
 
-    // I need to transpose V at the end
-    Tensor v_copy;
-    if (compute_uv_) {
-      TensorShape v_shape;
-      if (full_matrices_) {
-        v_shape = V->shape();
-      } else {
-        TensorShape shapeRaw = M.shape();
-        shapeRaw.RemoveLastDims(2);
-        v_shape = shapeRaw;
-        v_shape.AddDim(p);
-        v_shape.AddDim(n);
-      }
-      OP_REQUIRES_OK_ASYNC(
-          context, solver->allocate_scoped_tensor(V->dtype(), v_shape, &v_copy),
-          done);
-    }
-
-    // get the pointers to the data
-    Scalar* input_ptr;
-    RealScalar* outputS_ptr;
-    Scalar* outputU_ptr = NULL;
-    Scalar* outputV_ptr = NULL;
-    auto input_reshaped = input_copy.template flat_inner_dims<Scalar, 3>();
-    input_ptr = input_reshaped.data();
-    outputS_ptr = S->template flat_inner_dims<RealScalar, 2>().data();
-    if (compute_uv_) {
-      // Note that U and V are flipped
-      outputU_ptr = v_copy.template flat_inner_dims<Scalar, 3>().data();
-      outputV_ptr = U->template flat_inner_dims<Scalar, 3>().data();
-    }
-
-    // call the SVD
-    const int64 batch_size = input_reshaped.dimension(0);
-    std::vector<DeviceLapackInfo> dev_info;
-    dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "gesvd"));
-    // Note that m and n are flipped
-    RunSVD(context, done, n, m, p, batch_size, input_ptr, outputS_ptr,
-           outputU_ptr, outputV_ptr, dev_info.back().mutable_data(),
-           solver.get());
-
-    // Transpose V
-    if (compute_uv_) {
-      auto device = context->eigen_device<GPUDevice>();
-      OP_REQUIRES_OK_ASYNC(context, DoMatrixTranspose(device, v_copy, V), done);
-    }
-
-    // now check if the SVD operation succeeded or not
-    CheckResult(context, std::move(done), dev_info, std::move(solver));
+    // Call the SVD: compute V S U* = M*.
+    RunSVD(context, done, n, m, p, input_copy, S, V, U, std::move(solver));
   }
 
   void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
diff --git a/tensorflow/core/kernels/tridiagonal_solve_op.cc b/tensorflow/core/kernels/tridiagonal_solve_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5884ffedfbc7b25f59e3c67da4af486ef6239c48
--- /dev/null
+++ b/tensorflow/core/kernels/tridiagonal_solve_op.cc
@@ -0,0 +1,163 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/linalg_ops.cc.
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+static const char kErrMsg[] = "The matrix is not invertible.";
+
+template <class Scalar>
+class TridiagonalSolveOp : public LinearAlgebraOp<Scalar> {
+ public:
+  INHERIT_LINALG_TYPEDEFS(Scalar);
+
+  explicit TridiagonalSolveOp(OpKernelConstruction* context) : Base(context) {}
+
+  void ValidateInputMatrixShapes(
+      OpKernelContext* context,
+      const TensorShapes& input_matrix_shapes) const final {
+    auto num_inputs = input_matrix_shapes.size();
+    OP_REQUIRES(context, num_inputs == 2,
+                errors::InvalidArgument("Expected two input matrices, got ",
+                                        num_inputs, "."));
+
+    auto num_diags = input_matrix_shapes[0].dim_size(0);
+    OP_REQUIRES(
+        context, num_diags == 3,
+        errors::InvalidArgument("Expected diagonals to be provided as a "
+                                "matrix with 3 rows, got ",
+                                num_diags, " rows."));
+
+    auto num_eqs_left = input_matrix_shapes[0].dim_size(1);
+    auto num_eqs_right = input_matrix_shapes[1].dim_size(0);
+    OP_REQUIRES(
+        context, num_eqs_left == num_eqs_right,
+        errors::InvalidArgument("Expected the same number of left-hand sides "
+                                "and right-hand sides, got ",
+                                num_eqs_left, " and ", num_eqs_right, "."));
+  }
+
+  TensorShapes GetOutputMatrixShapes(
+      const TensorShapes& input_matrix_shapes) const final {
+    return TensorShapes({input_matrix_shapes[1]});
+  }
+
+  int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final {
+    const int num_eqs = static_cast<int>(input_matrix_shapes[0].dim_size(1));
+    const int num_rhss = static_cast<int>(input_matrix_shapes[1].dim_size(0));
+
+    const double add_cost = Eigen::TensorOpCost::AddCost<Scalar>();
+    const double mult_cost = Eigen::TensorOpCost::MulCost<Scalar>();
+    const double div_cost = Eigen::TensorOpCost::DivCost<Scalar>();
+
+    // Assuming cases with and without row interchange are equiprobable.
+    const double cost =
+        num_eqs * (div_cost * (num_rhss + 1) +
+                   (add_cost + mult_cost) * (2.5 * num_rhss + 1.5));
+    return cost >= static_cast<double>(kint64max) ? kint64max
+                                                  : static_cast<int64>(cost);
+  }
+
+  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
+                     MatrixMaps* outputs) final {
+    const auto diagonals = inputs[0];
+
+    // Subdiagonal elements, first is ignored.
+    const auto& superdiag = diagonals.row(0);
+    // Diagonal elements.
+    const auto& diag = diagonals.row(1);
+    // Superdiagonal elements, n-th is ignored.
+    const auto& subdiag = diagonals.row(2);
+    // Right-hand sides (transposed - necessary for GPU impl).
+    const auto& rhs = inputs[1];
+
+    const int n = diag.size();
+    MatrixMap& x = outputs->at(0);
+    const Scalar zero(0);
+
+    if (n == 0) {
+      return;
+    }
+    if (n == 1) {
+      OP_REQUIRES(context, diag(0) != zero, errors::InvalidArgument(kErrMsg));
+      x.row(0) = rhs.row(0) / diag(0);
+      return;
+    }
+
+    // The three columns in u are the diagonal, superdiagonal, and second
+    // superdiagonal, respectively, of the U matrix in the LU decomposition of
+    // the input matrix (subject to row exchanges due to pivoting). For pivoted
+    // tridiagonal matrix, the U matrix has at most two non-zero superdiagonals.
+    Eigen::Array<Scalar, Eigen::Dynamic, 3> u(n, 3);
+
+    // The code below roughly follows LAPACK's dgtsv routine, with main
+    // difference being not overwriting the input.
+    u(0, 0) = diag(0);
+    u(0, 1) = superdiag(0);
+    x.row(0) = rhs.row(0);
+    for (int i = 0; i < n - 1; ++i) {
+      if (std::abs(u(i)) >= std::abs(subdiag(i + 1))) {
+        // No row interchange.
+        OP_REQUIRES(context, u(i) != zero, errors::InvalidArgument(kErrMsg));
+        const Scalar factor = subdiag(i + 1) / u(i, 0);
+        u(i + 1, 0) = diag(i + 1) - factor * u(i, 1);
+        x.row(i + 1) = rhs.row(i + 1) - factor * x.row(i);
+        if (i != n - 2) {
+          u(i + 1, 1) = superdiag(i + 1);
+          u(i, 2) = 0;
+        }
+      } else {
+        // Interchange rows i and i + 1.
+        const Scalar factor = u(i, 0) / subdiag(i + 1);
+        u(i, 0) = subdiag(i + 1);
+        u(i + 1, 0) = u(i, 1) - factor * diag(i + 1);
+        u(i, 1) = diag(i + 1);
+        x.row(i + 1) = x.row(i) - factor * rhs.row(i + 1);
+        x.row(i) = rhs.row(i + 1);
+        if (i != n - 2) {
+          u(i, 2) = superdiag(i + 1);
+          u(i + 1, 1) = -factor * superdiag(i + 1);
+        }
+      }
+    }
+    x.row(n - 1) /= u(n - 1, 0);
+    x.row(n - 2) = (x.row(n - 2) - u(n - 2, 1) * x.row(n - 1)) / u(n - 2, 0);
+    for (int i = n - 3; i >= 0; --i) {
+      x.row(i) = (x.row(i) - u(i, 1) * x.row(i + 1) - u(i, 2) * x.row(i + 2)) /
+                 u(i, 0);
+    }
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(TridiagonalSolveOp);
+};
+
+REGISTER_LINALG_OP_CPU("TridiagonalSolve", (TridiagonalSolveOp<float>), float);
+REGISTER_LINALG_OP_CPU("TridiagonalSolve", (TridiagonalSolveOp<double>),
+                       double);
+REGISTER_LINALG_OP_CPU("TridiagonalSolve", (TridiagonalSolveOp<complex64>),
+                       complex64);
+REGISTER_LINALG_OP_CPU("TridiagonalSolve", (TridiagonalSolveOp<complex128>),
+                       complex128);
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.cc b/tensorflow/core/lib/bfloat16/bfloat16.cc
index a591717fd1abfc3d959d219d9ce2bde1272fd8ea..e6e24bc078668b9290f41ce501cea8de2d423779 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.cc
+++ b/tensorflow/core/lib/bfloat16/bfloat16.cc
@@ -19,6 +19,9 @@ limitations under the License.
 
 namespace tensorflow {
 
+const uint16_t bfloat16::NAN_VALUE;
+const uint16_t bfloat16::ZERO_VALUE;
+
 B16_DEVICE_FUNC bfloat16::operator Eigen::half() const {
   return static_cast<Eigen::half>(float(*this));
 }
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index 9a7c635c6efd657d417bd0893209251f90c78b65..1294ccff2676e0cf33585ba4518002457c37e93f 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -372,6 +372,12 @@ struct bfloat16 {
     return x;
   }
 
+  static bfloat16 min_positive_normal() {
+    bfloat16 x;
+    x.value = 0x0080;  // 0x1p-126
+    return x;
+  }
+
   bool IsZero() const { return (value & 0x7FFF) == ZERO_VALUE; }
 
   uint16_t value;
diff --git a/tensorflow/core/lib/monitoring/collection_registry_test.cc b/tensorflow/core/lib/monitoring/collection_registry_test.cc
index ca25f508da9635f02941c99c768947927fd97493..ce87e4dcae65e5a48074e00a6f49f79c1dc76c61 100644
--- a/tensorflow/core/lib/monitoring/collection_registry_test.cc
+++ b/tensorflow/core/lib/monitoring/collection_registry_test.cc
@@ -81,14 +81,6 @@ TEST(CollectionRegistryDeathTest, DuplicateRegistration) {
       "/tensorflow/metric");
 }
 
-TEST(CollectMetricsTest, NoMetrics) {
-  auto* collection_registry = CollectionRegistry::Default();
-  const std::unique_ptr<CollectedMetrics> collected_metrics =
-      collection_registry->CollectMetrics({});
-  EXPECT_EQ(0, collected_metrics->metric_descriptor_map.size());
-  EXPECT_EQ(0, collected_metrics->point_set_map.size());
-}
-
 TEST(CollectMetricsTest, Counter) {
   auto counter_with_labels = std::unique_ptr<Counter<2>>(
       Counter<2>::New("/tensorflow/test/counter_with_labels",
@@ -111,7 +103,7 @@ TEST(CollectMetricsTest, Counter) {
         collection_registry->CollectMetrics(options);
 
     if (collect_metric_descriptors) {
-      ASSERT_EQ(2, collected_metrics->metric_descriptor_map.size());
+      ASSERT_GE(collected_metrics->metric_descriptor_map.size(), 2);
 
       const MetricDescriptor& ld = *collected_metrics->metric_descriptor_map.at(
           "/tensorflow/test/counter_with_labels");
@@ -134,7 +126,7 @@ TEST(CollectMetricsTest, Counter) {
       EXPECT_EQ(0, collected_metrics->metric_descriptor_map.size());
     }
 
-    ASSERT_EQ(2, collected_metrics->point_set_map.size());
+    ASSERT_GE(collected_metrics->point_set_map.size(), 2);
 
     const PointSet& lps = *collected_metrics->point_set_map.at(
         "/tensorflow/test/counter_with_labels");
@@ -201,7 +193,7 @@ TEST(CollectMetricsTest, Gauge) {
         collection_registry->CollectMetrics(options);
 
     if (collect_metric_descriptors) {
-      ASSERT_EQ(2, collected_metrics->metric_descriptor_map.size());
+      ASSERT_GE(collected_metrics->metric_descriptor_map.size(), 2);
 
       const MetricDescriptor& ld = *collected_metrics->metric_descriptor_map.at(
           "/tensorflow/test/string_gauge_with_labels");
@@ -224,7 +216,7 @@ TEST(CollectMetricsTest, Gauge) {
       EXPECT_EQ(0, collected_metrics->metric_descriptor_map.size());
     }
 
-    ASSERT_EQ(2, collected_metrics->point_set_map.size());
+    ASSERT_GE(collected_metrics->point_set_map.size(), 2);
 
     const PointSet& lps = *collected_metrics->point_set_map.at(
         "/tensorflow/test/string_gauge_with_labels");
@@ -307,7 +299,7 @@ TEST(CollectMetricsTest, Sampler) {
         collection_registry->CollectMetrics(options);
 
     if (collect_metric_descriptors) {
-      ASSERT_EQ(2, collected_metrics->metric_descriptor_map.size());
+      ASSERT_GE(collected_metrics->metric_descriptor_map.size(), 2);
 
       const MetricDescriptor& ld = *collected_metrics->metric_descriptor_map.at(
           "/tensorflow/test/sampler_with_labels");
@@ -330,7 +322,7 @@ TEST(CollectMetricsTest, Sampler) {
       EXPECT_EQ(0, collected_metrics->metric_descriptor_map.size());
     }
 
-    ASSERT_EQ(2, collected_metrics->point_set_map.size());
+    ASSERT_GE(collected_metrics->point_set_map.size(), 2);
 
     const PointSet& lps = *collected_metrics->point_set_map.at(
         "/tensorflow/test/sampler_with_labels");
diff --git a/tensorflow/core/lib/strings/proto_serialization.cc b/tensorflow/core/lib/strings/proto_serialization.cc
index 5c1fbda2155492c00049f52ce12ae8da665cbda0..2341d3e341d72fe8c385f3abd441dc7c692d9759 100644
--- a/tensorflow/core/lib/strings/proto_serialization.cc
+++ b/tensorflow/core/lib/strings/proto_serialization.cc
@@ -14,20 +14,65 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 
+#include <cstring>
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
+namespace {
+static const int kInlinedBufferSize = 256;
+}  // namespace
 
 bool SerializeToStringDeterministic(const protobuf::MessageLite& msg,
                                     string* result) {
-  DCHECK_LE(msg.ByteSizeLong(), static_cast<size_t>(INT_MAX));
-  const int size = static_cast<int>(msg.ByteSizeLong());
+  const size_t size = msg.ByteSizeLong();
+  DCHECK_LE(size, static_cast<size_t>(INT_MAX));
   *result = string(size, '\0');
-  protobuf::io::ArrayOutputStream array_stream(&(*result)[0], size);
+  return SerializeToBufferDeterministic(msg, const_cast<char*>(result->data()),
+                                        result->size());
+}
+
+bool SerializeToBufferDeterministic(const protobuf::MessageLite& msg,
+                                    char* buffer, size_t size) {
+  DCHECK(msg.ByteSizeLong() == size && size <= static_cast<size_t>(INT_MAX));
+  protobuf::io::ArrayOutputStream array_stream(buffer, size);
   protobuf::io::CodedOutputStream output_stream(&array_stream);
   output_stream.SetSerializationDeterministic(true);
   msg.SerializeWithCachedSizes(&output_stream);
   return !output_stream.HadError() && size == output_stream.ByteCount();
 }
 
+bool AreSerializedProtosEqual(const protobuf::MessageLite& x,
+                              const protobuf::MessageLite& y) {
+  const size_t size = x.ByteSizeLong();
+  if (size != y.ByteSizeLong()) return false;
+  if (size == 0) return true;
+  gtl::InlinedVector<char, kInlinedBufferSize> x_serialized(size);
+  bool success_x = SerializeToBufferDeterministic(x, x_serialized.data(), size);
+  DCHECK(success_x);
+  gtl::InlinedVector<char, kInlinedBufferSize> y_serialized(size);
+  bool success_y = SerializeToBufferDeterministic(y, y_serialized.data(), size);
+  DCHECK(success_y);
+  return memcmp(x_serialized.data(), y_serialized.data(), size) == 0;
+}
+
+uint64 DeterministicProtoHash64(const protobuf::MessageLite& proto,
+                                uint64 seed) {
+  const size_t size = proto.ByteSizeLong();
+  gtl::InlinedVector<char, kInlinedBufferSize> serialized(size);
+  SerializeToBufferDeterministic(proto, serialized.data(), size);
+  return Hash64(serialized.data(), size, seed);
+}
+
+uint64 DeterministicProtoHash64(const protobuf::MessageLite& proto) {
+  const size_t size = proto.ByteSizeLong();
+  gtl::InlinedVector<char, kInlinedBufferSize> serialized(size);
+  SerializeToBufferDeterministic(proto, serialized.data(), size);
+  return Hash64(serialized.data(), size);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/proto_serialization.h b/tensorflow/core/lib/strings/proto_serialization.h
index 6664928e2818c747268ec1c361acce6bcf6c862e..763bd68c1bf8b8cc709d5a01a308550ffefeb743 100644
--- a/tensorflow/core/lib/strings/proto_serialization.h
+++ b/tensorflow/core/lib/strings/proto_serialization.h
@@ -28,6 +28,21 @@ namespace tensorflow {
 bool SerializeToStringDeterministic(const protobuf::MessageLite& msg,
                                     string* result);
 
+// As above, but takes a pre-allocated buffer wrapped by result.
+// PRECONDITION: size == msg.ByteSizeLong() && size <= INT_MAX.
+bool SerializeToBufferDeterministic(const protobuf::MessageLite& msg,
+                                    char* buffer, size_t size);
+
+// Returns true if serializing x and y using
+// SerializeToBufferDeterministic() yields identical strings.
+bool AreSerializedProtosEqual(const protobuf::MessageLite& x,
+                              const protobuf::MessageLite& y);
+
+// Computes Hash64 of the output of SerializeToBufferDeterministic().
+uint64 DeterministicProtoHash64(const protobuf::MessageLite& proto);
+uint64 DeterministicProtoHash64(const protobuf::MessageLite& proto,
+                                uint64 seed);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_LIB_STRINGS_PROTO_SERIALIZATION_H_
diff --git a/tensorflow/core/lib/strings/proto_serialization_test.cc b/tensorflow/core/lib/strings/proto_serialization_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..81a6f08ae9bf668951103c3f45d5efac527a8a94
--- /dev/null
+++ b/tensorflow/core/lib/strings/proto_serialization_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/strings/proto_serialization.h"
+
+#include <string>
+#include "absl/memory/memory.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+GraphDef MakeGraphDef(int num_nodes) {
+  GraphDef graph_def;
+  for (int i = 0; i < num_nodes; ++i) {
+    NodeDef* node = graph_def.add_node();
+    node->set_name(strings::StrCat("node", i));
+    node->set_op(strings::StrCat("op", i % 10));
+    (*node->mutable_attr())["foo"].set_f(3.14f);
+    (*node->mutable_attr())["bar"].set_s("baz");
+  }
+  return graph_def;
+}
+}  // namespace
+
+static void BM_ProtoSerializationToString(int iters, int num_nodes) {
+  testing::StopTiming();
+  GraphDef graph_def = MakeGraphDef(num_nodes);
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    string serialized;
+    testing::DoNotOptimize(
+        SerializeToStringDeterministic(graph_def, &serialized));
+  }
+  testing::StopTiming();
+}
+BENCHMARK(BM_ProtoSerializationToString)->Range(1, 10000);
+
+static void BM_ProtoSerializationToBuffer(int iters, int num_nodes) {
+  testing::StopTiming();
+  GraphDef graph_def = MakeGraphDef(num_nodes);
+  testing::StartTiming();
+  const size_t size = graph_def.ByteSizeLong();
+  for (int i = 0; i < iters; ++i) {
+    gtl::InlinedVector<char, 1024> buf(size);
+    testing::DoNotOptimize(
+        SerializeToBufferDeterministic(graph_def, buf.data(), size));
+  }
+  testing::StopTiming();
+}
+BENCHMARK(BM_ProtoSerializationToBuffer)->Range(1, 10000);
+
+static void BM_DeterministicProtoHash64(int iters, int num_nodes) {
+  testing::StopTiming();
+  GraphDef graph_def = MakeGraphDef(num_nodes);
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    testing::DoNotOptimize(DeterministicProtoHash64(graph_def));
+  }
+  testing::StopTiming();
+}
+BENCHMARK(BM_DeterministicProtoHash64)->Range(1, 10000);
+
+static void BM_AreSerializedProtosEqual(int iters, int num_nodes) {
+  testing::StopTiming();
+  GraphDef graph_def_a = MakeGraphDef(num_nodes);
+  GraphDef graph_def_b = MakeGraphDef(num_nodes);
+  graph_def_b.mutable_node(0)->mutable_name()[0] = 'l';
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    testing::DoNotOptimize(AreSerializedProtosEqual(graph_def_a, graph_def_a));
+  }
+  testing::StopTiming();
+}
+BENCHMARK(BM_AreSerializedProtosEqual)->Range(1, 10000);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/nccl/nccl_manager.cc b/tensorflow/core/nccl/nccl_manager.cc
index 545284331d7234554fc2fee5543dcf3552c19306..a0b602f301c976acca2c5887de0452210f15acd7 100644
--- a/tensorflow/core/nccl/nccl_manager.cc
+++ b/tensorflow/core/nccl/nccl_manager.cc
@@ -63,7 +63,7 @@ struct NcclManager::NcclStream {
   std::unique_ptr<Thread> thread;
   mutex mu;
   condition_variable cv;
-  // Has collective,rank pairs.
+  // Has collective,participant_idx pairs.
   std::deque<std::pair<Collective*, int>> pending_launches_ GUARDED_BY(mu);
   bool shutdown_requested GUARDED_BY(mu) = false;
 };
@@ -312,8 +312,10 @@ Status NcclManager::GetCommunicator(NcclManager::Collective* collective,
   CUDA_RETURN_IF_ERROR(cudaGetDevice(&saved_device));
   NCCL_RETURN_IF_ERROR(ncclGroupStart());
   for (int i = 0; i < collective->num_local_devices; ++i) {
-    const int rank =
-        collective->single_node ? i : collective->participants[i]->global_rank;
+    // Set rank to `participant->global_rank` if provided, else `i`.
+    const int rank = collective->participants[i]->global_rank >= 0
+                         ? collective->participants[i]->global_rank
+                         : i;
     CUDA_RETURN_IF_ERROR(cudaSetDevice(devices[i]));
     NCCL_RETURN_IF_ERROR(ncclCommInitRank(
         nccl_comms.data() + i, collective->num_global_devices, nccl_id, rank));
@@ -344,6 +346,12 @@ void NcclManager::AddToAllReduce(std::unique_ptr<Participant> participant,
   AddParticipant(std::move(participant), context, kAllReduce, reduction_op);
 }
 
+void NcclManager::AddToAllGather(std::unique_ptr<Participant> participant,
+                                 const Context& context) {
+  AddParticipant(std::move(participant), context, kAllGather,
+                 ncclSum /* unused */);
+}
+
 void NcclManager::AddBroadcastSend(std::unique_ptr<Participant> participant,
                                    const Context& context) {
   participant->root = true;
@@ -492,13 +500,11 @@ void NcclManager::RunCollective(Collective* collective) {
     return;
   }
 
-  for (int local_rank = 0; local_rank < collective->num_local_devices;
-       ++local_rank) {
-    Participant* p = collective->participants[local_rank].get();
-    NcclStream* nccl_stream =
-        collective->communicator->members[local_rank].nccl_stream;
+  for (int i = 0; i < collective->num_local_devices; ++i) {
+    Participant* p = collective->participants[i].get();
+    NcclStream* nccl_stream = collective->communicator->members[i].nccl_stream;
     CHECK(nccl_stream != nullptr);
-    const int rank = collective->single_node ? local_rank : p->global_rank;
+    const int rank = p->global_rank >= 0 ? p->global_rank : i;
 
     if (p->input != nullptr) {
       // Wait to ensure that the kernel that produces the data in the input
@@ -522,13 +528,11 @@ void NcclManager::RunCollective(Collective* collective) {
     // Note that it would be possible to run multiple collectives at once, if
     // they have non-intersecting sets of devices.
     mutex_lock l(collective_mu);
-    for (int local_rank = 0; local_rank < collective->num_local_devices;
-         ++local_rank) {
+    for (int i = 0; i < collective->num_local_devices; ++i) {
       NcclStream* nccl_stream =
-          collective->communicator->members[local_rank].nccl_stream;
+          collective->communicator->members[i].nccl_stream;
       mutex_lock l(nccl_stream->mu);
-      nccl_stream->pending_launches_.push_front(
-          std::make_pair(collective, local_rank));
+      nccl_stream->pending_launches_.push_front(std::make_pair(collective, i));
       nccl_stream->cv.notify_all();
     }
   }
@@ -555,23 +559,22 @@ void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {
       next_launch = nccl_stream->pending_launches_.back();
       nccl_stream->pending_launches_.pop_back();
     }
-    Collective* collective = next_launch.first;
-    int local_rank = next_launch.second;
 
     // Launch the nccl kernel.
+    Collective* collective = next_launch.first;
     ncclDataType_t data_type = ToNcclType(collective->data_type);
-    Participant* p = collective->participants[local_rank].get();
-
-    auto nccl_comm = collective->communicator->members[local_rank].nccl_comm;
+    int p_idx = next_launch.second;
+    Participant* p = collective->participants[p_idx].get();
+    auto nccl_comm = collective->communicator->members[p_idx].nccl_comm;
     ncclResult_t nccl_result = ncclSuccess;
     switch (collective->type) {
       case kAllReduce: {
         const void* sendbuff = p->input->tensor_data().data();
         void* recvbuff = const_cast<char*>(p->output->tensor_data().data());
 
-        VLOG(2) << "call NcclAllReduce participant " << local_rank
-                << " sendbuff " << sendbuff << " recvbuff " << recvbuff
-                << " nccl_comm " << nccl_comm << " comm_stream " << comm_stream
+        VLOG(2) << "call NcclAllReduce participant " << p_idx << " sendbuff "
+                << sendbuff << " recvbuff " << recvbuff << " nccl_comm "
+                << nccl_comm << " comm_stream " << comm_stream
                 << " cuda_stream " << cu_stream;
         nccl_result = ncclAllReduce(sendbuff, recvbuff, p->input->NumElements(),
                                     data_type, collective->reduction_op,
@@ -595,16 +598,30 @@ void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {
                                  collective->root_rank, nccl_comm, *cu_stream);
         break;
       }
+      case kAllGather: {
+        const void* sendbuff = p->input->tensor_data().data();
+        void* recvbuff = const_cast<char*>(p->output->tensor_data().data());
+
+        VLOG(2) << "call NcclAllGather participant " << p_idx << " sendbuff "
+                << sendbuff << " sendcount " << p->input->NumElements()
+                << " recvbuff " << recvbuff << " recvcount "
+                << p->output->NumElements() << " nccl_comm " << nccl_comm
+                << " comm_stream " << comm_stream << " cuda_stream "
+                << cu_stream;
+        nccl_result = ncclAllGather(sendbuff, recvbuff, p->input->NumElements(),
+                                    data_type, nccl_comm, *cu_stream);
+        break;
+      }
     }
 
     // Run the done_callback when the nccl kernel finishes running.
-    auto done_callback = [collective, local_rank, nccl_result]() {
+    auto done_callback = [collective, p_idx, nccl_result]() {
       if (nccl_result == ncclSuccess) {
-        collective->participants[local_rank]->done_callback(Status::OK());
+        collective->participants[p_idx]->done_callback(Status::OK());
       } else {
         // Propagate the error, but note that if other members of the collective
         // did launch their kernels, then they are hanging.
-        collective->participants[local_rank]->done_callback(errors::Unknown(
+        collective->participants[p_idx]->done_callback(errors::Unknown(
             "Error invoking NCCL: ", ncclGetErrorString(nccl_result)));
       }
 
diff --git a/tensorflow/core/nccl/nccl_manager.h b/tensorflow/core/nccl/nccl_manager.h
index 6ac5deb91a4a762d7c31d8cb501efd191906790a..7cf2c85f3e81ce951c408e797a17ab1634f17811 100644
--- a/tensorflow/core/nccl/nccl_manager.h
+++ b/tensorflow/core/nccl/nccl_manager.h
@@ -143,6 +143,10 @@ class NcclManager {
   void AddToAllReduce(std::unique_ptr<Participant> participant,
                       const Context& context, ncclRedOp_t reduction_op);
 
+  // Adds one participant to an all-gather.
+  void AddToAllGather(std::unique_ptr<Participant> participant,
+                      const Context& context);
+
   // AddBroadcastSend and AddBroadcastRecv combine to send data from one sender
   // to all receivers.
   void AddBroadcastSend(std::unique_ptr<Participant> participant,
@@ -170,6 +174,7 @@ class NcclManager {
     kAllReduce = 1,
     kBroadcast = 2,
     kReduce = 3,
+    kAllGather = 4,
   };
   struct Collective;
   struct Communicator;
diff --git a/tensorflow/core/nccl/nccl_manager_test.cc b/tensorflow/core/nccl/nccl_manager_test.cc
index e65af133891b3acdc74ee4316c5f6e35d236eb32..420e143c837f600016e66db6833ea8b58edde49d 100644
--- a/tensorflow/core/nccl/nccl_manager_test.cc
+++ b/tensorflow/core/nccl/nccl_manager_test.cc
@@ -73,9 +73,9 @@ class NcclManagerTest : public ::testing::Test {
 
   static void TearDownTestCase() { delete devices_; }
 
-  TestCase* MakeTestCase(int num_nodes, int num_ranks_per_node,
-                         ncclRedOp_t reduction_op, TensorShape shape,
-                         float value_offset) {
+  TestCase* MakeReductionTestCase(int num_nodes, int num_ranks_per_node,
+                                  ncclRedOp_t reduction_op, TensorShape shape,
+                                  float value_offset) {
     TestCase* test_case = new TestCase();
     test_case->expected = Tensor(data_type_, shape);
     if (reduction_op == ncclProd) {
@@ -134,6 +134,47 @@ class NcclManagerTest : public ::testing::Test {
     return test_case;
   }
 
+  TestCase* MakeGatherTestCase(int num_nodes, int num_ranks_per_node,
+                               TensorShape in_shape, TensorShape out_shape) {
+    TestCase* test_case = new TestCase();
+    test_case->expected = Tensor(data_type_, out_shape);
+    test::FillFn<Scalar>(&test_case->expected,
+                         [](int) { return static_cast<Scalar>(0); });
+
+    float value_scale = 0.01;  // Small scale to avoid fp16 overflow.
+    for (int node = 0; node < num_nodes; ++node) {
+      for (int i = 0; i < num_ranks_per_node; ++i) {
+        auto* device = GetDevice(i);
+        auto* stream = device->tensorflow_gpu_device_info()->stream;
+
+        Tensor in_cpu(data_type_, in_shape);
+        test::FillFn<Scalar>(&in_cpu, [&](int index) {
+          return static_cast<Scalar>((index + 1) * value_scale);
+        });
+        // Starting index for this rank's tensor in the all-gathered output.
+        int32 gather_idx =
+            (node * num_ranks_per_node + i) * in_shape.num_elements();
+        for (int j = 0; j < in_shape.num_elements(); ++j) {
+          auto in_val = in_cpu.flat<Scalar>()(j);
+          auto out_expr = test_case->expected.template flat<Scalar>();
+          out_expr(gather_idx + j) = in_val;
+        }
+
+        value_scale *= 10;
+        test_case->ins.emplace_back(GpuAllocator(device), data_type_, in_shape);
+        test_case->outs.emplace_back(GpuAllocator(device), data_type_,
+                                     out_shape);
+
+        const Tensor& in_gpu = test_case->ins.back();
+        auto in_gpu_mem = AsDeviceMemory(in_gpu.flat<Scalar>().data());
+        stream->ThenMemcpy(&in_gpu_mem, in_cpu.flat<Scalar>().data(),
+                           in_cpu.TotalBytes());
+      }
+    }
+
+    return test_case;
+  }
+
   // Waits for the done callback to be called for each participant.
   void WaitForTestCompletion(TestCase* test_case) {
     test_case->mu.lock();
@@ -158,6 +199,9 @@ class NcclManagerTest : public ::testing::Test {
       stream->ThenMemcpy(out_cpu.flat<Scalar>().data(), out_gpu_mem,
                          out_cpu.TotalBytes());
       SE_ASSERT_OK(stream->BlockHostUntilDone());
+      VLOG(1) << "Verifying rank " << rank << " expected shape "
+              << test_case->expected.shape() << " out shape "
+              << out_cpu.shape();
       test::ExpectClose(test_case->expected, out_cpu);
     }
   }
@@ -218,8 +262,8 @@ TYPED_TEST(NcclManagerTest, BasicSumReduction) {
   for (int op = 0; op < 4; ++op) {
     ncclRedOp_t reduction_op = static_cast<ncclRedOp_t>(op);
     std::unique_ptr<typename TestFixture::TestCase> test_case(
-        this->MakeTestCase(/*num_nodes=*/1, num_ranks, reduction_op,
-                           TensorShape({2, 3}), 0.0f));
+        this->MakeReductionTestCase(/*num_nodes=*/1, num_ranks, reduction_op,
+                                    TensorShape({2, 3}), 0.0f));
     for (int rank = 0; rank < num_ranks; ++rank) {
       auto* device = this->GetDevice(rank);
       VLOG(2) << "rank " << rank << " device " << device->name();
@@ -259,7 +303,7 @@ TYPED_TEST(NcclManagerTest, MultipleCallers) {
     std::vector<std::pair<int, int>> case_and_rank;
     std::vector<std::unique_ptr<typename TestFixture::TestCase>> test_cases;
     for (int i = 0; i < num_collectives_per_iteration; ++i) {
-      test_cases.emplace_back(this->MakeTestCase(
+      test_cases.emplace_back(this->MakeReductionTestCase(
           /*num_nodes=*/1, num_ranks, ncclSum,
           TensorShape({100, i % 5 + 1, i % 3 + 1}), 1.1f * i));
       for (int j = 0; j < num_ranks; ++j) {
@@ -324,6 +368,34 @@ TYPED_TEST(NcclManagerTest, MultipleCallers) {
   }
 }
 
+// Test basic all-gather.
+TYPED_TEST(NcclManagerTest, BasicAllGather) {
+  const int num_ranks = 4;
+  for (int i = 0; i < num_ranks; ++i) {
+    std::unique_ptr<typename TestFixture::TestCase> test_case(
+        this->MakeGatherTestCase(/*num_nodes=*/1, num_ranks,
+                                 TensorShape({2, 3}),
+                                 TensorShape({2 * num_ranks, 3})));
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = this->GetDevice(rank);
+      VLOG(2) << "rank " << rank << " device " << device->name();
+      auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
+      auto* stream = device->tensorflow_gpu_device_info()->stream;
+      auto participant = absl::make_unique<NcclManager::Participant>(
+          device->executor(), stream, event_mgr, device->gpu_id(),
+          &test_case->ins[rank], &test_case->outs[rank], rank,
+          this->CreateDoneCallback(test_case.get()));
+      NcclManager::instance()->AddToAllGather(
+          std::move(participant),
+          {"allgather", /*num_local_devices=*/num_ranks,
+           /*num_global_devices=*/num_ranks, /*communicator_key=*/""});
+    }
+
+    LOG(INFO) << "Verifying results";
+    this->VerifyResults(test_case.get());
+  }
+}
+
 // Multi-node NCCL tests.
 
 TEST(NcclManagerTest, CommunicatorKey) {
@@ -353,8 +425,8 @@ TYPED_TEST(NcclManagerTest, MultiNode) {
   for (int op = 0; op < 4; ++op) {
     ncclRedOp_t reduction_op = static_cast<ncclRedOp_t>(op);
     std::unique_ptr<typename TestFixture::TestCase> test_case(
-        this->MakeTestCase(num_nodes, num_ranks_per_node, reduction_op,
-                           TensorShape({2, 3}), 0.0f));
+        this->MakeReductionTestCase(num_nodes, num_ranks_per_node, reduction_op,
+                                    TensorShape({2, 3}), 0.0f));
     for (int node = 0; node < num_nodes; ++node) {
       auto node_fn = [this, node, &nccl_managers, &communicator_key,
                       &collective_key, reduction_op, &test_case] {
@@ -393,8 +465,9 @@ TYPED_TEST(NcclManagerTest, MultiNode) {
 TYPED_TEST(NcclManagerTest, ConsistentCollectiveType) {
   const int num_ranks = 2;
 
-  std::unique_ptr<typename TestFixture::TestCase> test_case(this->MakeTestCase(
-      1 /* num_nodes */, num_ranks, ncclSum, TensorShape({2, 3}), 0.0f));
+  std::unique_ptr<typename TestFixture::TestCase> test_case(
+      this->MakeReductionTestCase(1 /* num_nodes */, num_ranks, ncclSum,
+                                  TensorShape({2, 3}), 0.0f));
   for (int rank = 0; rank < num_ranks; ++rank) {
     auto* device = this->GetDevice(rank);
     auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
@@ -427,8 +500,9 @@ TYPED_TEST(NcclManagerTest, ConsistentCollectiveType) {
 TYPED_TEST(NcclManagerTest, ConsistentCommunicatorKey) {
   const int num_ranks = 2;
 
-  std::unique_ptr<typename TestFixture::TestCase> test_case(this->MakeTestCase(
-      1 /* num_nodes */, num_ranks, ncclSum, TensorShape({2, 3}), 0.0f));
+  std::unique_ptr<typename TestFixture::TestCase> test_case(
+      this->MakeReductionTestCase(1 /* num_nodes */, num_ranks, ncclSum,
+                                  TensorShape({2, 3}), 0.0f));
   for (int rank = 0; rank < num_ranks; ++rank) {
     auto* device = this->GetDevice(rank);
     auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
@@ -454,8 +528,9 @@ TYPED_TEST(NcclManagerTest, ConsistentCommunicatorKey) {
 TYPED_TEST(NcclManagerTest, ConsistentNumberOfDevices) {
   const int num_ranks = 2;
 
-  std::unique_ptr<typename TestFixture::TestCase> test_case(this->MakeTestCase(
-      1 /* num_nodes */, num_ranks, ncclSum, TensorShape({2, 3}), 0.0f));
+  std::unique_ptr<typename TestFixture::TestCase> test_case(
+      this->MakeReductionTestCase(1 /* num_nodes */, num_ranks, ncclSum,
+                                  TensorShape({2, 3}), 0.0f));
   for (int rank = 0; rank < num_ranks; ++rank) {
     auto* device = this->GetDevice(rank);
     auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 108f187c981932c420b6bfea709b628832ed719c..8b6ee870799f082378033e4535b48407b6ed4a0d 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -466,47 +466,37 @@ REGISTER_OP("BroadcastTo")
     .Attr("T: type")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle in = c->input(0);
+      ShapeHandle shape_in = c->input(1);
+      TF_RETURN_IF_ERROR(c->WithRank(shape_in, 1, &shape_in));
       ShapeHandle out;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
-
       if (!c->RankKnown(out)) {
         // We have no information about the shape of the output.
         c->set_output(0, out);
         return Status::OK();
       }
 
+      ShapeHandle in = c->input(0);
       if (!c->RankKnown(in)) {
         // We have no information about the shape of the input,
         // nothing to do here.
         c->set_output(0, out);
         return Status::OK();
       }
-      if (c->Rank(out) < c->Rank(in)) {
-        return errors::InvalidArgument("Cannot broadcast a tensor with shape ",
-                                       c->DebugString(in), " shape ",
-                                       c->DebugString(out));
-      }
-
-      int32 in_offset = c->Rank(out) - c->Rank(in);
-      for (int32 i = 0; i < c->Rank(out); ++i) {
-        DimensionHandle dim = c->Dim(out, i);
-        if (c->ValueKnown(dim)) {
-          // The first in_offset dimensions for input will be expanded with 1,
-          // so no check needed.
-          if (i >= in_offset) {
-            DimensionHandle in_dim = c->Dim(in, i - in_offset);
-            if (c->ValueKnown(in_dim) && c->Value(in_dim) != 0) {
-              if (c->Value(dim) % c->Value(in_dim) != 0) {
-                return errors::InvalidArgument(
-                    "Cannot broadcast a tensor with shape ", c->DebugString(in),
-                    " shape ", c->DebugString(out));
-              }
-            }
-          }
+      int out_rank = c->Rank(out);
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(in, out_rank, &in));
+      int in_rank = c->Rank(in);
+      for (int i = 0; i < in_rank; ++i) {
+        auto in_dim = c->Dim(in, in_rank - i - 1);
+        if (c->Value(in_dim) > 1) {
+          // If the input dimension is greater than 1 then the output dimension
+          // must be equal to it, since we only broadcast "from left to right".
+          auto out_dim = c->Dim(out, out_rank - i - 1);
+          TF_RETURN_IF_ERROR(c->Merge(in_dim, out_dim, &out_dim));
+          TF_RETURN_IF_ERROR(
+              c->ReplaceDim(out, out_rank - i - 1, out_dim, &out));
         }
       }
-
       c->set_output(0, out);
       return Status::OK();
     });
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index 1c29cd2491fcd8d0e9d773e24e956df8212f2c7f..92648ce18876427b9c19b744f23ba787b4fff217 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -509,6 +509,33 @@ TEST(ArrayOpsTest, BroadcastArgs_ShapeFn) {
   INFER_ERROR("Shape must be rank 1 but is rank 0", op, "?;[]");
 }
 
+TEST(ArrayOpsTest, BroadcastTo_ShapeFn) {
+  ShapeInferenceTestOp op("BroadcastTo");
+  op.input_tensors.resize(2);
+
+  INFER_OK(op, "?;[?]", "?");
+  INFER_OK(op, "[];[1]", "[?]");
+  INFER_OK(op, "[1];[1]", "[?]");
+  INFER_OK(op, "[1];[2]", "[?,?]");
+  INFER_OK(op, "[2,2];[3]", "[?,d0_0,d0_1]");
+
+  // Rank checks
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "?;[?,?]");
+  INFER_ERROR("Shape must be rank 1 but is rank 0", op, "[2];[]");
+  INFER_ERROR("Shape must be at most rank 1 but is rank 2", op, "[2,2];[1]");
+
+  Tensor shape_t(DT_INT64, TensorShape{3});
+  test::FillValues<int64>(&shape_t, {2, 10, 3});
+  op.input_tensors[1] = &shape_t;
+  INFER_OK(op, "[1,?,1];[3]", "[2,10,3]");
+  INFER_OK(op, "[1,1,1];[3]", "[2,10,3]");
+  INFER_OK(op, "[10,1];[3]", "[2,d0_0,3]");
+  INFER_ERROR("Dimensions must be equal, but are 3 and 2 for", op,
+              "[3,1,1];[3]");
+  INFER_ERROR("Dimensions must be equal, but are 2 and 10 for", op,
+              "[2,2,1];[3]");
+}
+
 TEST(ArrayOpsTest, BroadcastGradientArgs_ShapeFn) {
   ShapeInferenceTestOp op("BroadcastGradientArgs");
   // Output is always two unknown vectors.
diff --git a/tensorflow/core/ops/collective_ops.cc b/tensorflow/core/ops/collective_ops.cc
index e45a8a9b361183303d98f36aef25991566c6f267..06e5f14de76315eb54dfa3ad65f49d5393f8ada7 100644
--- a/tensorflow/core/ops/collective_ops.cc
+++ b/tensorflow/core/ops/collective_ops.cc
@@ -32,6 +32,17 @@ REGISTER_OP("CollectiveReduce")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnchangedShape);
 
+REGISTER_OP("CollectiveGather")
+    .Input("input: T")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("shape: shape")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape);
+
 REGISTER_OP("CollectiveBcastSend")
     .Input("input: T")
     .Output("data: T")
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index c73ae0cc16448c74910e6b69d98e0e6abe0f2e73..68bdf49118ea3af5ec72d6bbdacf68187501a6dd 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -1546,6 +1546,96 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "AllToAll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "concat_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_count"
+    type: "int"
+  }
+}
+op {
+  name: "AllToAll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BOOL
+      }
+    }
+  }
+  attr {
+    name: "concat_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_count"
+    type: "int"
+  }
+}
 op {
   name: "Angle"
   input_arg {
@@ -12387,6 +12477,46 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "Case"
+  input_arg {
+    name: "branch_index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "branches"
+    type: "list(func)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Cast"
   input_arg {
@@ -12829,6 +12959,87 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CollectiveGather"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectivePermute"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "source_target_pairs"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
 op {
   name: "CollectiveReduce"
   input_arg {
@@ -13575,6 +13786,35 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ConfigureDistributedTPU"
+  output_arg {
+    name: "topology"
+    type: DT_STRING
+  }
+  attr {
+    name: "embedding_config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tpu_embedding_config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "is_global_init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Conj"
   input_arg {
@@ -16331,6 +16571,58 @@ op {
     }
   }
 }
+op {
+  name: "CrossReplicaSum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "CrossReplicaSum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_UINT32
+      }
+    }
+  }
+}
 op {
   name: "CudnnRNN"
   input_arg {
@@ -18079,6 +18371,30 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "DatasetToSingleElement"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "DebugGradientIdentity"
   input_arg {
@@ -19116,6 +19432,43 @@ op {
     }
   }
 }
+op {
+  name: "DecodeRaw"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT16
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "little_endian"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
 op {
   name: "DecodeWav"
   input_arg {
@@ -21828,6 +22181,124 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "EnqueueTPUEmbeddingIntegerBatch"
+  input_arg {
+    name: "batch"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseBatch"
+  input_arg {
+    name: "sample_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseTensorBatch"
+  input_arg {
+    name: "sample_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "table_ids"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
 op {
   name: "EnsureShape"
   input_arg {
@@ -22135,6 +22606,66 @@ op {
     }
   }
 }
+op {
+  name: "EuclideanNorm"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Exit"
   input_arg {
@@ -22478,6 +23009,22 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "ExperimentalDatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalDenseToSparseBatchDataset"
   input_arg {
@@ -29365,6 +29912,108 @@ op {
     }
   }
 }
+op {
+  name: "InfeedDequeue"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedDequeueTuple"
+  output_arg {
+    name: "outputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedEnqueue"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "layout"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedEnqueueTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "layouts"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "InitializeTable"
   input_arg {
@@ -31710,1072 +32359,1469 @@ op {
   is_stateful: true
 }
 op {
-  name: "Log"
+  name: "LoadTPUEmbeddingADAMParameters"
   input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "parameters"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "Log"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "momenta"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  input_arg {
+    name: "velocities"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "Log"
-  input_arg {
-    name: "x"
-    type_attr: "T"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "Log1p"
+  name: "LoadTPUEmbeddingADAMParametersGradAccumDebug"
   input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+  input_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "Log1p"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "velocities"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "Log1p"
-  input_arg {
-    name: "x"
-    type_attr: "T"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LogMatrixDeterminant"
+  name: "LoadTPUEmbeddingAdadeltaParameters"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "sign"
-    type_attr: "T"
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "log_abs_determinant"
-    type_attr: "T"
+  input_arg {
+    name: "updates"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "LogMatrixDeterminant"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "sign"
-    type_attr: "T"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "log_abs_determinant"
-    type_attr: "T"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LogSoftmax"
+  name: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug"
   input_arg {
-    name: "logits"
-    type_attr: "T"
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "logsoftmax"
-    type_attr: "T"
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  input_arg {
+    name: "updates"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "LogSoftmax"
   input_arg {
-    name: "logits"
-    type_attr: "T"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "logsoftmax"
-    type_attr: "T"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
 }
 op {
-  name: "LogUniformCandidateSampler"
+  name: "LoadTPUEmbeddingAdagradParameters"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "true_expected_count"
+    name: "parameters"
     type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_expected_count"
+  input_arg {
+    name: "accumulators"
     type: DT_FLOAT
   }
   attr {
-    name: "num_true"
+    name: "table_id"
     type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
   attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "unique"
-    type: "bool"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "range_max"
+    name: "shard_id"
     type: "int"
-    has_minimum: true
-    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingAdagradParametersGradAccumDebug"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "seed"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
 }
 op {
-  name: "LogUniformCandidateSampler"
+  name: "LoadTPUEmbeddingCenteredRMSPropParameters"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+  input_arg {
+    name: "ms"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "true_expected_count"
+  input_arg {
+    name: "mom"
     type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_expected_count"
+  input_arg {
+    name: "mg"
     type: DT_FLOAT
   }
   attr {
-    name: "num_true"
+    name: "table_id"
     type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
   attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "unique"
-    type: "bool"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "range_max"
+    name: "shard_id"
     type: "int"
-    has_minimum: true
-    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingFTRLParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "linears"
+    type: DT_FLOAT
   }
   attr {
-    name: "seed"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
   is_stateful: true
 }
 op {
-  name: "LogicalAnd"
+  name: "LoadTPUEmbeddingFTRLParametersGradAccumDebug"
   input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "y"
-    type: DT_BOOL
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+  input_arg {
+    name: "linears"
+    type: DT_FLOAT
   }
-  is_commutative: true
-}
-op {
-  name: "LogicalNot"
   input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type: DT_BOOL
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LogicalOr"
+  name: "LoadTPUEmbeddingMDLAdagradLightParameters"
   input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "y"
-    type: DT_BOOL
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+  input_arg {
+    name: "weights"
+    type: DT_FLOAT
   }
-  is_commutative: true
-}
-op {
-  name: "LookupTableExport"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "benefits"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tkeys"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tvalues"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableExportV2"
+  name: "LoadTPUEmbeddingMomentumParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
+  input_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tkeys"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tvalues"
-    type: "type"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableFind"
+  name: "LoadTPUEmbeddingMomentumParametersGradAccumDebug"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "momenta"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableFindV2"
+  name: "LoadTPUEmbeddingProximalAdagradParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableImport"
+  name: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "accumulators"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableImportV2"
+  name: "LoadTPUEmbeddingRMSPropParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "ms"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "mom"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableInsert"
+  name: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "ms"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "mom"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableInsertV2"
+  name: "LoadTPUEmbeddingStochasticGradientDescentParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableRemoveV2"
+  name: "Log"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "x"
+    type_attr: "T"
   }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "Tin"
+    name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "LookupTableSize"
+  name: "Log"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "size"
-    type: DT_INT64
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "LookupTableSizeV2"
+  name: "Log"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "size"
-    type: DT_INT64
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "LoopCond"
+  name: "Log1p"
   input_arg {
-    name: "input"
-    type: DT_BOOL
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_BOOL
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "LowerBound"
-  input_arg {
-    name: "sorted_inputs"
-    type_attr: "T"
-  }
+  name: "Log1p"
   input_arg {
-    name: "values"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "out_type"
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "out_type"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Lu"
+  name: "Log1p"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "lu"
+    name: "y"
     type_attr: "T"
   }
-  output_arg {
-    name: "p"
-    type_attr: "output_idx_type"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
+        type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "LogMatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "sign"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "log_abs_determinant"
+    type_attr: "T"
+  }
   attr {
-    name: "output_idx_type"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Lu"
+  name: "LogMatrixDeterminant"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "lu"
+    name: "sign"
     type_attr: "T"
   }
   output_arg {
-    name: "p"
-    type_attr: "output_idx_type"
+    name: "log_abs_determinant"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
         type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "LogSoftmax"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "logsoftmax"
+    type_attr: "T"
+  }
   attr {
-    name: "output_idx_type"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "MakeIterator"
+  name: "LogSoftmax"
   input_arg {
-    name: "dataset"
-    type: DT_VARIANT
+    name: "logits"
+    type_attr: "T"
   }
-  input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
+  output_arg {
+    name: "logsoftmax"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "MapClear"
+  name: "LogUniformCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
   attr {
-    name: "capacity"
+    name: "num_true"
     type: "int"
-    default_value {
-      i: 0
-    }
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "memory_limit"
+    name: "num_sampled"
     type: "int"
-    default_value {
-      i: 0
-    }
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "unique"
+    type: "bool"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "seed2"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MapDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "LogUniformCandidateSampler"
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "true_classes"
+    type: DT_INT64
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "sampled_candidates"
+    type: DT_INT64
   }
-  attr {
-    name: "f"
-    type: "func"
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
+    name: "num_true"
+    type: "int"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "num_sampled"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
   is_stateful: true
 }
 op {
-  name: "MapDataset"
+  name: "LogicalAnd"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "x"
+    type: DT_BOOL
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "y"
+    type: DT_BOOL
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "z"
+    type: DT_BOOL
+  }
+  is_commutative: true
+}
+op {
+  name: "LogicalNot"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "LogicalOr"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  is_commutative: true
+}
+op {
+  name: "LookupTableExport"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "Tkeys"
+    type: "type"
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+    name: "Tvalues"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableExportV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "Tkeys"
+    type: "type"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "Tvalues"
+    type: "type"
   }
+  is_stateful: true
 }
 op {
-  name: "MapDataset"
+  name: "LookupTableFind"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+    name: "Tout"
+    type: "type"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+}
+op {
+  name: "LookupTableFindV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "Tout"
+    type: "type"
   }
+  is_stateful: true
 }
 op {
-  name: "MapDataset"
+  name: "LookupTableImport"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "keys"
+    type_attr: "Tin"
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+    name: "Tout"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableImportV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableInsert"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "preserve_cardinality"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "Tout"
+    type: "type"
   }
 }
 op {
-  name: "MapDefun"
+  name: "LookupTableInsertV2"
   input_arg {
-    name: "arguments"
-    type_list_attr: "Targuments"
+    name: "table_handle"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
-    type_list_attr: "output_types"
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableRemoveV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "Tin"
+    type: "type"
   }
+  is_stateful: true
 }
 op {
-  name: "MapDefun"
+  name: "LookupTableSize"
   input_arg {
-    name: "arguments"
-    type_list_attr: "Targuments"
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
   }
+}
+op {
+  name: "LookupTableSizeV2"
   input_arg {
-    name: "captured_inputs"
-    type_list_attr: "Tcaptured"
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  is_stateful: true
+}
+op {
+  name: "LoopCond"
+  input_arg {
+    name: "input"
+    type: DT_BOOL
   }
   output_arg {
     name: "output"
-    type_list_attr: "output_types"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "LowerBound"
+  input_arg {
+    name: "sorted_inputs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "Tcaptured"
-    type: "list(type)"
+    name: "out_type"
+    type: "type"
     default_value {
+      type: DT_INT32
+    }
+    allowed_values {
       list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
-    has_minimum: true
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+}
+op {
+  name: "Lu"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "lu"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "output_idx_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "MapIncompleteSize"
+  name: "Lu"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
   output_arg {
-    name: "size"
-    type: DT_INT32
+    name: "lu"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "output_idx_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "MakeIterator"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
   }
+  is_stateful: true
+}
+op {
+  name: "MapClear"
   attr {
     name: "capacity"
     type: "int"
@@ -32813,21 +33859,308 @@ op {
   is_stateful: true
 }
 op {
-  name: "MapPeek"
+  name: "MapDataset"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "capacity"
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "MapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "MapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "MapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "MapDefun"
+  input_arg {
+    name: "arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "MapDefun"
+  input_arg {
+    name: "arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "captured_inputs"
+    type_list_attr: "Tcaptured"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tcaptured"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "MapIncompleteSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapPeek"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
     type: "int"
     default_value {
       i: 0
@@ -38618,6 +39951,32 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "MulNoNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "MultiDeviceIterator"
   output_arg {
@@ -40708,6 +42067,80 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "OutfeedDequeue"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedDequeueTuple"
+  output_arg {
+    name: "outputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedEnqueue"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedEnqueueTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Pack"
   input_arg {
@@ -49552,6 +50985,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "RecvTPUEmbeddingActivations"
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+    number_attr: "num_outputs"
+  }
+  attr {
+    name: "num_outputs"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "ReduceDataset"
   input_arg {
@@ -49605,6 +51057,60 @@ op {
     }
   }
 }
+op {
+  name: "ReduceDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ReduceJoin"
   input_arg {
@@ -50604,6 +52110,49 @@ op {
     }
   }
 }
+op {
+  name: "RequantizationRangePerChannel"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "clip_value_max"
+    type: "float"
+  }
+}
 op {
   name: "Requantize"
   input_arg {
@@ -50727,74 +52276,103 @@ op {
   }
 }
 op {
-  name: "Reshape"
+  name: "RequantizePerChannel"
   input_arg {
-    name: "tensor"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "shape"
-    type_attr: "Tshape"
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_max"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
-    name: "Tshape"
+    name: "out_type"
     type: "type"
     default_value {
-      type: DT_INT32
+      type: DT_QUINT8
     }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
 }
 op {
-  name: "ResizeArea"
+  name: "Reshape"
   input_arg {
-    name: "images"
+    name: "tensor"
     type_attr: "T"
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "shape"
+    type_attr: "Tshape"
   }
   output_arg {
-    name: "resized_images"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "Tshape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "align_corners"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
   name: "ResizeArea"
@@ -50815,10 +52393,9 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
         type: DT_UINT8
+        type: DT_INT8
         type: DT_INT16
-        type: DT_UINT16
         type: DT_INT32
         type: DT_INT64
         type: DT_HALF
@@ -50836,7 +52413,7 @@ op {
   }
 }
 op {
-  name: "ResizeBicubic"
+  name: "ResizeArea"
   input_arg {
     name: "images"
     type_attr: "T"
@@ -50854,7 +52431,46 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBicubic"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
         type: DT_INT8
         type: DT_INT16
         type: DT_INT32
@@ -59053,259 +60669,943 @@ op {
     }
   }
   attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceStridedSliceAssign"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "begin"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "end"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "strides"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Restore"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "Restore"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreSlice"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "RestoreSlice"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "RestoreV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingADAMParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdadeltaParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdagradParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingCenteredRMSPropParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mg"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingFTRLParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "linears"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "linears"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "ResourceStridedSliceAssign"
-  input_arg {
-    name: "ref"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "begin"
-    type_attr: "Index"
+  name: "RetrieveTPUEmbeddingMDLAdagradLightParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "end"
-    type_attr: "Index"
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "strides"
-    type_attr: "Index"
+  output_arg {
+    name: "weights"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "value"
-    type_attr: "T"
+  output_arg {
+    name: "benefits"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Index"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "begin_mask"
+    name: "num_shards"
     type: "int"
-    default_value {
-      i: 0
-    }
   }
   attr {
-    name: "end_mask"
+    name: "shard_id"
     type: "int"
-    default_value {
-      i: 0
-    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingMomentumParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
   attr {
-    name: "ellipsis_mask"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "new_axis_mask"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
   attr {
-    name: "shrink_axis_mask"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
     type: "int"
-    default_value {
-      i: 0
-    }
   }
   is_stateful: true
 }
 op {
-  name: "Restore"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "momenta"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "Restore"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  attr {
+    name: "num_shards"
+    type: "int"
   }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingProximalAdagradParameters"
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "RestoreSlice"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "shape_and_slice"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "RestoreSlice"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  attr {
+    name: "num_shards"
+    type: "int"
   }
-  input_arg {
-    name: "shape_and_slice"
-    type: DT_STRING
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingRMSPropParameters"
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "RestoreV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
-}
-op {
-  name: "RestoreV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+  attr {
+    name: "num_shards"
+    type: "int"
   }
-  input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingStochasticGradientDescentParameters"
   output_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+    name: "parameters"
+    type: DT_FLOAT
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
@@ -64302,6 +66602,38 @@ op {
     }
   }
 }
+op {
+  name: "SendTPUEmbeddingGradients"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "learning_rates"
+    type: DT_FLOAT
+    number_attr: "NN"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "NN"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "SerializeIterator"
   input_arg {
@@ -64552,6 +66884,37 @@ op {
     }
   }
 }
+op {
+  name: "ShardDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_shards"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ShardedFilename"
   input_arg {
@@ -64738,6 +67101,10 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ShutdownDistributedTPU"
+  is_stateful: true
+}
 op {
   name: "Sigmoid"
   input_arg {
@@ -75414,6 +77781,70 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "StatefulStandardNormal"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulStandardNormalV2"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "algorithm"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "StatelessIf"
   input_arg {
@@ -77288,66 +79719,525 @@ op {
   is_stateful: true
 }
 op {
-  name: "TFRecordReader"
+  name: "TFRecordReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
+op {
+  name: "TFRecordReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUCompilationResult"
   output_arg {
-    name: "reader_handle"
+    name: "output"
     type: DT_STRING
-    is_ref: true
+  }
+}
+op {
+  name: "TPUEmbeddingActivations"
+  input_arg {
+    name: "embedding_variable"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sliced_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "table_id"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "lookup_id"
+    type: "int"
+    has_minimum: true
+  }
+}
+op {
+  name: "TPUOrdinalSelector"
+  output_arg {
+    name: "device_ordinals"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  input_arg {
+    name: "device_ordinal"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "TPUReplicate"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  input_arg {
+    name: "broadcast_inputs"
+    type_list_attr: "Tbroadcast_inputs"
+  }
+  input_arg {
+    name: "variables"
+    type: DT_RESOURCE
+    number_attr: "NumVariables"
+  }
+  input_arg {
+    name: "guaranteed_constants"
+    type_list_attr: "Tguaranteed_constants"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "computation"
+    type: "func"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
     default_value {
-      s: ""
+      i: 1
     }
   }
   attr {
-    name: "shared_name"
+    name: "topology"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "compression_type"
-    type: "string"
+    name: "use_tpu"
+    type: "bool"
     default_value {
-      s: ""
+      b: true
     }
   }
-  deprecation {
-    version: 26
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tbroadcast_inputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "NumVariables"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Tguaranteed_constants"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
   }
   is_stateful: true
 }
 op {
-  name: "TFRecordReaderV2"
-  output_arg {
-    name: "reader_handle"
+  name: "TPUReplicate"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  input_arg {
+    name: "broadcast_inputs"
+    type_list_attr: "Tbroadcast_inputs"
+  }
+  input_arg {
+    name: "variables"
     type: DT_RESOURCE
+    number_attr: "NumVariables"
+  }
+  input_arg {
+    name: "guaranteed_constants"
+    type_list_attr: "Tguaranteed_constants"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "container"
+    name: "computation"
+    type: "func"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tbroadcast_inputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "NumVariables"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Tguaranteed_constants"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "step_marker_location"
+    type: "string"
+    default_value {
+      s: "STEP_MARK_AT_ENTRY"
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUReplicateMetadata"
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "compression_type"
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "computation_shape"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "TPUReplicateMetadata"
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
     type: "string"
     default_value {
       s: ""
     }
   }
-  is_stateful: true
+  attr {
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "computation_shape"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "step_marker_location"
+    type: "string"
+    default_value {
+      s: "STEP_MARK_AT_ENTRY"
+    }
+  }
+}
+op {
+  name: "TPUReplicatedInput"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "TPUReplicatedOutput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "outputs"
+    type_attr: "T"
+    number_attr: "num_replicas"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
 }
 op {
   name: "TakeDataset"
@@ -79428,6 +82318,29 @@ op {
     }
   }
 }
+op {
+  name: "TensorListScatterIntoExistingList"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListScatterV2"
   input_arg {
@@ -80423,6 +83336,33 @@ op {
     }
   }
 }
+op {
+  name: "TridiagonalSolve"
+  input_arg {
+    name: "diagonals"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TruncateDiv"
   input_arg {
@@ -82623,6 +85563,18 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "WorkerHeartbeat"
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "WrapDatasetVariant"
   input_arg {
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 1c117166de029d40b84bbd2335b9315cdc53bcba..cc7ce542579ef6973a6c7bd5266f8a8f44cf3d0c 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -27,7 +27,7 @@ namespace tensorflow {
 // to a stateful "iterator" by passing the "dataset" to the
 // "MakeIterator" op.
 //
-// TODO(b/65524810): DT_VARIANT tensors that represent "dataset" objects are
+// TODO(b/123753214): DT_VARIANT tensors that represent "dataset" objects are
 // not presently serializable. To avoid issues with constant folding, ensure
 // that any "source dataset" ops (i.e. ops that output a dataset and do not
 // take one as input) are marked "stateful".
@@ -37,7 +37,7 @@ REGISTER_OP("TensorDataset")
     .Output("handle: variant")
     .Attr("Toutput_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate that
                                                 // `components` have shapes
@@ -49,7 +49,7 @@ REGISTER_OP("TensorSliceDataset")
     .Output("handle: variant")
     .Attr("Toutput_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate that the
                                                 // dim-0 slices of `components`
@@ -62,7 +62,7 @@ REGISTER_OP("SparseTensorSliceDataset")
     .Input("dense_shape: int64")
     .Output("handle: variant")
     .Attr("Tvalues: type")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -79,7 +79,7 @@ REGISTER_OP("GeneratorDataset")
     .Attr("Tfinalize_func_args: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -275,6 +275,22 @@ REGISTER_OP("BatchDatasetV2")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("ShardDataset")
+    .Input("input_dataset: variant")
+    .Input("num_shards: int64")
+    .Input("index: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // num_shards should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // index should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
 // TODO(mrry): Validate that `padded_shapes` are all vectors, the lengths of
 // `output_types` and `output_shapes` are `N` the `output_shapes` are (as far as
 // possible to tell statically) compatible with `padded_shapes`, and that
@@ -322,7 +338,7 @@ REGISTER_OP("RangeDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -388,7 +404,7 @@ REGISTER_OP("TextLineDataset")
     .Input("compression_type: string")
     .Input("buffer_size: int64")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -408,7 +424,7 @@ REGISTER_OP("FixedLengthRecordDataset")
     .Input("footer_bytes: int64")
     .Input("buffer_size: int64")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -431,7 +447,7 @@ REGISTER_OP("FixedLengthRecordDatasetV2")
     .Input("buffer_size: int64")
     .Input("compression_type: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -451,7 +467,7 @@ REGISTER_OP("TFRecordDataset")
     .Input("compression_type: string")
     .Input("buffer_size: int64")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -538,13 +554,22 @@ REGISTER_OP("IteratorGetNextSync")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(IteratorGetNextShapeFn);
 
+// TODO(b/124308596): Instead of conservatively marking this op as stateful,
+// implement a mechanism to determine whether `dataset` has a side-effect
+// and use it to decide whether to use a stateless or stateful version of this
+// op.
 REGISTER_OP("DatasetToSingleElement")
     .Input("dataset: variant")
     .Output("components: output_types")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()
     .SetShapeFn(IteratorGetNextShapeFn);
 
+// TODO(b/124308596): Instead of conservatively marking this op as stateful,
+// implement a mechanism to determine whether `dataset` has a side-effect
+// and use it to decide whether to use a stateless or stateful version of this
+// op.
 REGISTER_OP("ReduceDataset")
     .Input("input_dataset: variant")
     .Input("initial_state: Tstate")
@@ -556,6 +581,7 @@ REGISTER_OP("ReduceDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("use_inter_op_parallelism: bool = true")
+    .SetIsStateful()
     .SetShapeFn(IteratorGetNextShapeFn);
 
 REGISTER_OP("IteratorToStringHandle")
@@ -636,6 +662,8 @@ REGISTER_OP("ModelDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
+// TODO(b/124308749): Add a stateful version of MapDefun and use it when `f`
+// is stateful.
 REGISTER_OP("MapDefun")
     .Input("arguments: Targuments")
     .Input("captured_inputs: Tcaptured")
diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc
index 41348972d38bab426d476f873a85c410b5d0df97..7b9d95a38d143c5173b9f77da7ac6a38c7061591 100644
--- a/tensorflow/core/ops/experimental_dataset_ops.cc
+++ b/tensorflow/core/ops/experimental_dataset_ops.cc
@@ -42,7 +42,7 @@ REGISTER_OP("ExperimentalCSVDataset")
     .Output("handle: variant")
     .Attr("output_types: list({float,double,int32,int64,string}) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -76,10 +76,15 @@ REGISTER_OP("ExperimentalDatasetCardinality")
     .Output("cardinality: int64")
     .SetShapeFn(shape_inference::ScalarShape);
 
+// TODO(b/124308596): Instead of conservatively marking this op as stateful,
+// implement a mechanism to determine whether `dataset` has a side-effect
+// and use it to decide whether to use a stateless or stateful version of this
+// op.
 REGISTER_OP("ExperimentalDatasetToTFRecord")
     .Input("input_dataset: variant")
     .Input("filename: string")
     .Input("compression_type: string")
+    .SetIsStateful()
     .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("ExperimentalDenseToSparseBatchDataset")
@@ -213,7 +218,7 @@ REGISTER_OP("ExperimentalMapDataset")
 REGISTER_OP("ExperimentalMatchingFilesDataset")
     .Input("patterns: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -267,7 +272,7 @@ REGISTER_OP("ExperimentalRandomDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -338,7 +343,7 @@ REGISTER_OP("ExperimentalSqlDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -467,7 +472,7 @@ REGISTER_OP("ExperimentalLMDBDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index be440ed728129d6553b017fa537e0585d076c35d..4982ec6bd82e2bf221b56a9e75f00ce4f0763f15 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -132,6 +132,35 @@ REGISTER_OP("If")
       return Status::OK();
     });
 
+REGISTER_OP("Case")
+    .Input("branch_index: int32")
+    .Input("input: Tin")
+    .Output("output: Tout")
+    .Attr("Tin: list(type) >= 0")
+    .Attr("Tout: list(type) >= 0")
+    .Attr("branches: list(func) >= 1")
+    .Attr("output_shapes: list(shape) = []")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      std::vector<PartialTensorShape> output_shapes;
+      TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+      // If `output_shapes` attr is set use that as the shapes of the outputs
+      // else return unknown shapes.
+      if (output_shapes.empty()) return shape_inference::UnknownShape(c);
+      if (output_shapes.size() != c->num_outputs()) {
+        return errors::InvalidArgument(
+            "`output_shapes` must be the same length as num outputs (",
+            output_shapes.size(), " vs. ", c->num_outputs());
+      }
+      for (size_t i = 0; i < output_shapes.size(); ++i) {
+        shape_inference::ShapeHandle output_shape_handle;
+        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
+            output_shapes[i], &output_shape_handle));
+        c->set_output(static_cast<int>(i), output_shape_handle);
+      }
+      return Status::OK();
+    });
+
 // TODO(drpng): remove this.
 REGISTER_OP("_While")
     .Input("input: T")
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc
index 8a8419588af40a2de81c94dcfdf22fc66030296c..66594b3576e20a761e26e5b4835571332aaba4f7 100644
--- a/tensorflow/core/ops/linalg_ops.cc
+++ b/tensorflow/core/ops/linalg_ops.cc
@@ -208,6 +208,36 @@ Status SvdShapeFn(InferenceContext* c) {
   return Status::OK();
 }
 
+// The first input is [...,3,M] and second input is [...,M,K].
+// Output is [...,M,K].
+Status TridiagonalSolveShapeFn(InferenceContext* c) {
+  ShapeHandle lhs;
+  ShapeHandle rhs;
+  // Check that rank is at least 2.
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &lhs));
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 2, &rhs));
+
+  // Extract batch dimensions and check they are the same.
+  ShapeHandle lhs_batch_shape;
+  ShapeHandle rhs_batch_shape;
+  TF_RETURN_IF_ERROR(c->Subshape(lhs, 0, -2, &lhs_batch_shape));
+  TF_RETURN_IF_ERROR(c->Subshape(rhs, 0, -2, &rhs_batch_shape));
+  TF_RETURN_IF_ERROR(
+      c->Merge(lhs_batch_shape, rhs_batch_shape, &lhs_batch_shape));
+
+  // Check that "M" is the same in both inputs.
+  DimensionHandle m_lhs = c->Dim(lhs, -1);
+  DimensionHandle m_rhs = c->Dim(rhs, -2);
+  TF_RETURN_IF_ERROR(c->Merge(m_lhs, m_rhs, &m_lhs));
+
+  // Check that next-to-last dimension of the first input is 3.
+  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(lhs, -2), 3, &m_lhs));
+
+  // The output shape is the same as rhs shape.
+  c->set_output(0, rhs);
+  return Status::OK();
+}
+
 }  // namespace
 
 REGISTER_OP("MatrixDeterminant")
@@ -379,6 +409,13 @@ REGISTER_OP("Svd")
     .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(SvdShapeFn);
 
+REGISTER_OP("TridiagonalSolve")
+    .Input("diagonals: T")
+    .Input("rhs: T")
+    .Output("output: T")
+    .Attr("T: {double, float, complex64, complex128}")
+    .SetShapeFn(TridiagonalSolveShapeFn);
+
 // Deprecated op registrations:
 
 // Can be deleted after 3feb2017.
diff --git a/tensorflow/core/ops/linalg_ops_test.cc b/tensorflow/core/ops/linalg_ops_test.cc
index 7d1aed50be7a3e1a64af9fbf369fea2daf5e1729..93732f938a9278f8da322dd8fd98a234695287d9 100644
--- a/tensorflow/core/ops/linalg_ops_test.cc
+++ b/tensorflow/core/ops/linalg_ops_test.cc
@@ -314,4 +314,40 @@ TEST(LinalgOpsTest, Lu_ShapeFn) {
            "[d0_0,d0_1,d0_2,d0_3,d0_5,d0_5];[d0_0,d0_1,d0_2,d0_3,d0_5]");
 }
 
+TEST(LinalgOpsTest, TridiagonalSolve_ShapeFn) {
+  ShapeInferenceTestOp op("TridiagonalSolve");
+  INFER_OK(op, "?;?", "in1");
+  INFER_OK(op, "[3,5];[?,1]", "in1");
+  INFER_OK(op, "[?,5];[5,1]", "in1");
+  INFER_OK(op, "[?,5];[?,?]", "in1");
+  INFER_OK(op, "[?,?];[?,?]", "in1");
+  INFER_OK(op, "[3,5];[5,1]", "in1");
+  INFER_OK(op, "[3,5];[5,2]", "in1");
+
+  INFER_OK(op, "[?,?,?];[?,?,?]", "in1");
+  INFER_OK(op, "[?,3,5];[7,5,2]", "in1");
+  INFER_OK(op, "[7,3,5];[?,5,2]", "in1");
+  INFER_OK(op, "[7,?,5];[?,5,?]", "in1");
+  INFER_OK(op, "[7,3,5];[7,5,2]", "in1");
+
+  INFER_OK(op, "[7,?,3,5];[7,8,5,2]", "in1");
+  INFER_OK(op, "[7,8,3,5];[7,8,5,2]", "in1");
+
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[3];[5,1]");
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[3,5];[5]");
+  INFER_ERROR(
+      "Dimension 1 in both shapes must be equal, but are 4 and 8. "
+      "Shapes are [6,4] and [6,8].",
+      op, "[6,4,3,5];[6,8,5,2]");
+  INFER_ERROR(
+      "Dimension 1 in both shapes must be equal, but are 4 and 8. "
+      "Shapes are [?,4] and [6,8].",
+      op, "[?,4,3,5];[6,8,5,2]");
+  INFER_ERROR("Dimension must be 3 but is 4", op, "[4,5];[5,2]");
+  INFER_ERROR("Dimension must be 3 but is 4", op, "[6,4,5];[6,5,2]");
+  INFER_ERROR("Dimensions must be equal, but are 9 and 5", op, "[3,9];[5,2]");
+  INFER_ERROR("Dimensions must be equal, but are 9 and 5", op,
+              "[6,3,9];[6,5,2]");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index f8fdb10baea9de4593f76d5f34f87c93ff0d538a..123ffc493a929600f940fd41a5645cc39e575ee5 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -20,6 +20,34 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+// Verifies that `shapes_and_types` is a valid list handle and has the right
+// dtype.
+Status VerifyHandleData(
+    shape_inference::InferenceContext* c,
+    const std::vector<shape_inference::ShapeAndType>& shapes_and_types,
+    DataType element_dtype) {
+  if (shapes_and_types.size() != 1) {
+    return errors::InvalidArgument(
+        "Invalid handle_data for input list. Expected length of "
+        "shape_and_types: ",
+        1, " Saw: ", shapes_and_types.size());
+  }
+  const shape_inference::ShapeAndType& list_shape_type = shapes_and_types[0];
+  if (list_shape_type.dtype != element_dtype) {
+    return errors::InvalidArgument("Expected list with element dtype ",
+                                   DataTypeString(element_dtype),
+                                   " but got list with element dtype ",
+                                   DataTypeString(list_shape_type.dtype));
+  }
+  return Status::OK();
+}
+
+// Assumes that the handle_data is valid.
+shape_inference::ShapeHandle GetElementShapeFromHandleData(
+    const std::vector<shape_inference::ShapeAndType>& shapes_and_types) {
+  return shapes_and_types[0].shape;
+}
+
 REGISTER_OP("EmptyTensorList")
     .Input("element_shape: shape_type")
     .Input("max_num_elements: int32")
@@ -503,6 +531,34 @@ REGISTER_OP("TensorListScatterV2")
       return Status::OK();
     });
 
+REGISTER_OP("TensorListScatterIntoExistingList")
+    .Input("input_handle: variant")
+    .Input("tensor: element_dtype")
+    .Input("indices: int32")
+    .Output("output_handle: variant")
+    .Attr("element_dtype: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle ignored;
+      // Check that tensor is at least a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &ignored));
+      // Check that indices is a vector.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &ignored));
+
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
+
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr) {
+        TF_RETURN_IF_ERROR(VerifyHandleData(c, *handle_data, element_dtype));
+        element_shape = GetElementShapeFromHandleData(*handle_data);
+      }
+      c->set_output_handle_shapes_and_types(0,
+                                            {{element_shape, element_dtype}});
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
 REGISTER_OP("TensorListConcatLists")
     .Input("input_a: variant")
     .Input("input_b: variant")
diff --git a/tensorflow/core/ops/lookup_table_ops.cc b/tensorflow/core/ops/lookup_table_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ce08f6f2f9d7eec7cd2222de2456170e4976d6c
--- /dev/null
+++ b/tensorflow/core/ops/lookup_table_ops.cc
@@ -0,0 +1,61 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+
+REGISTER_OP("LookupTableInsertOrAssignOp")
+    .Input("table_int64_args: num_int64_table_args * int64")
+    .Input("table_handle: resource")
+    .Input("keys: insert_key_tensor_dtype")
+    .Input("values: table_value_dtype")
+    .Attr("insert_key_tensor_dtype: type")
+    .Attr("table_value_dtype: type")
+    .Attr("num_int64_table_args: int >= 0")
+    .SetShapeFn([](InferenceContext* c) {
+      // Note that, by design, shape checks are implementation dependent so they
+      // must be deferred until runtime.
+      return Status::OK();
+    });
+
+REGISTER_OP("LookupTableFindOp")
+    .Input("table_int64_args: num_int64_table_args * int64")
+    .Input("table_handle: resource")
+    .Input("keys: lookup_key_tensor_dtype")
+    .Input("num_threads: int64")
+    .Output("values: table_value_dtype")
+    .Attr("table_value_dtype: type")
+    .Attr("lookup_key_tensor_dtype: type")
+    .Attr("num_int64_table_args: int >= 0")
+    .SetShapeFn([](InferenceContext* c) {
+      // The output shape cannot be inferred here because the key size
+      // cannot be inferred from the key tensor in general.
+      c->set_output(0, c->UnknownShape());
+      return Status::OK();
+    });
+
+REGISTER_OP("ContainerSizeOp")
+    .Input("container_handle: resource")
+    .Output("size: int64")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/math_grad.cc b/tensorflow/core/ops/math_grad.cc
index 55dcc50325f600730376c492fa3a2cdde4293ace..a8d454038c9b9d413c9d2cde44fb675da21856e4 100644
--- a/tensorflow/core/ops/math_grad.cc
+++ b/tensorflow/core/ops/math_grad.cc
@@ -469,6 +469,16 @@ Status MulGrad(const AttrSlice& attrs, FunctionDef* g) {
 }
 REGISTER_OP_GRADIENT("Mul", MulGrad);
 
+Status MulNoNanGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  return GradForBinaryCwise(g, {
+      {{"gx"}, "MulNoNan", {"y", "dz"}},  // y * dz
+      {{"gy"}, "MulNoNan", {"x", "dz"}},  // x * dz
+  });
+  // clang-format on
+}
+REGISTER_OP_GRADIENT("MulNoNan", MulGrad);
+
 Status DivGrad(const AttrSlice& attrs, FunctionDef* g) {
   // clang-format off
   return GradForBinaryCwise(g, {
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 747536a019c378328bbdcfff6535d7361ca52cb8..3cab5419abd4072f3655f46b128e45f400af241e 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -431,6 +431,14 @@ Returns x - y element-wise.
 REGISTER_OP("Mul").BINARY_MORE().SetIsCommutative().SetShapeFn(
     shape_inference::BroadcastBinaryOpShapeFn);
 
+REGISTER_OP("MulNoNan")
+    .Input("x: T")
+    .Input("y: T")
+    .Output("z: T")
+    .Attr("T: {float, double}")
+    .SetIsCommutative()
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
+
 REGISTER_OP("_MklMul")
     .BINARY_MORE()
     .Input("mkl_x: uint8")
@@ -850,6 +858,15 @@ REGISTER_OP("Sum")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn(shape_inference::ReductionShape);
 
+REGISTER_OP("EuclideanNorm")
+    .Input("input: T")
+    .Input("reduction_indices: Tidx")
+    .Output("output: T")
+    .Attr("keep_dims: bool = false")
+    .Attr("T: numbertype")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn(shape_inference::ReductionShape);
+
 REGISTER_OP("Mean")
     .Input("input: T")
     .Input("reduction_indices: Tidx")
@@ -1749,6 +1766,45 @@ inputs: Must all be the same size and shape.
 
 #endif  // INTEL_MKL
 
+REGISTER_OP("RequantizePerChannel")
+    .Input("input: T")
+    .Input("input_min: float")
+    .Input("input_max: float")
+    .Input("requested_output_min: float")
+    .Input("requested_output_max: float")
+    .Output("output: out_type")
+    .Output("output_min: float")
+    .Output("output_max: float")
+    .Attr("T: quantizedtype = DT_QINT32")
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+REGISTER_OP("RequantizationRangePerChannel")
+    .Input("input: T")
+    .Input("input_min: float")
+    .Input("input_max: float")
+    .Output("output_min: float")
+    .Output("output_max: float")
+    .Attr("T: quantizedtype = DT_QINT32")
+    .Attr("clip_value_max: float")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      c->set_output(0, c->Scalar());
+      c->set_output(1, c->Scalar());
+      return Status::OK();
+    });
+
 REGISTER_OP("NextAfter")
     .Attr("T: {float64, float32} = DT_FLOAT")
     .Input("x1: T")
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index 05379a7d699629d733cacd71343fc9d912eb0893..1e6dbbfb2f5cc33136ef114b0cf0105a64a53976 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -144,6 +144,7 @@ TEST(MathOpsTest, BroadcastBinaryOps_ShapeFn) {
     INFER_OK(op, "[1];[2]", "[d1_0]");
     INFER_OK(op, "[2];[1]", "[d0_0]");
     INFER_OK(op, "[2];[]", "[d0_0]");
+    INFER_OK(op, "[2];[?]", "[d0_0]");
 
     INFER_OK(op, "[0];[0]", "[d0_0|d1_0]");
     INFER_OK(op, "[];[0]", "[d1_0]");
@@ -151,6 +152,9 @@ TEST(MathOpsTest, BroadcastBinaryOps_ShapeFn) {
     INFER_OK(op, "[0];[1]", "[d0_0]");
     INFER_OK(op, "[0];[]", "[d0_0]");
 
+    INFER_OK(op, "[2];[?,?]", "[d1_0,d0_0]");
+    INFER_OK(op, "[2,2];[?,?,?]", "[d1_0,d0_0,d0_1]");
+
     // Multiple dimension cases (same test cases, switching x and y).
     INFER_OK(op, "[?,1,2,3,4,5];[3,1,?]",
              "[d0_0,d0_1,d0_2,d0_3|d1_0,d0_4,d0_5]");
diff --git a/tensorflow/core/ops/mkl_array_ops.cc b/tensorflow/core/ops/mkl_array_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e7ad3be611218734f769f9b108e0ff85052c2e72
--- /dev/null
+++ b/tensorflow/core/ops/mkl_array_ops.cc
@@ -0,0 +1,92 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+// This file contains the registration of MKL-DNN array ops.
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/util/mirror_pad_mode.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/strided_slice_op.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+
+using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+using shape_inference::UnchangedShape;
+
+// Adding QuantizedConcatV2 op to be able to replace it by
+// _MklQuantizedConcatV2 in the graph rewrite.
+REGISTER_OP("QuantizedConcatV2")
+    .Input("values: N * T")
+    .Input("axis: Tidx")
+    .Input("input_mins: N * float32")
+    .Input("input_maxes: N * float32")
+    .Output("output: T")
+    .Output("output_min: float")
+    .Output("output_max: float")
+    .Attr("N: int >= 2")
+    .Attr("T: type")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      const int n = (c->num_inputs() - 1) / 3;
+      TF_RETURN_IF_ERROR(shape_inference::QuantizedConcatV2Shape(c, n));
+      ShapeHandle unused;
+      for (int i = n + 1; i < c->num_inputs(); ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 0, &unused));
+      }
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConcatV2")
+    .Input("values: N * T")
+    .Input("axis: Tidx")
+    .Input("input_mins:  N * float32")
+    .Input("input_maxes: N * float32")
+    .Input("mkl_values: N * uint8")
+    .Input("mkl_axis: uint8")
+    .Input("mkl_input_mins:  N * uint8")
+    .Input("mkl_input_maxes: N * uint8")
+    .Output("output: T")
+    .Output("output_min: float")
+    .Output("output_max: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_output_min: uint8")
+    .Output("mkl_output_max: uint8")
+    .Attr("N: int >= 2")
+    .Attr("T: type")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      const int n = (c->num_inputs() / 2 - 1) / 3;
+      TF_RETURN_IF_ERROR(shape_inference::QuantizedConcatV2Shape(c, n));
+      ShapeHandle unused;
+      for (int i = n + 1; i < c->num_inputs() / 2; ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 0, &unused));
+      }
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 2b75389be9ae46a76691c0d212385f0747b6245e..b2e7ff1b2e159f00d5b0a0af7cd6e4ba8791981a 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -651,6 +651,59 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "AllToAll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BOOL
+      }
+    }
+  }
+  attr {
+    name: "concat_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_count"
+    type: "int"
+  }
+}
 op {
   name: "Angle"
   input_arg {
@@ -5141,6 +5194,46 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "Case"
+  input_arg {
+    name: "branch_index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "branches"
+    type: "list(func)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Cast"
   input_arg {
@@ -5397,6 +5490,87 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CollectiveGather"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectivePermute"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "source_target_pairs"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
 op {
   name: "CollectiveReduce"
   input_arg {
@@ -5848,6 +6022,35 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ConfigureDistributedTPU"
+  output_arg {
+    name: "topology"
+    type: DT_STRING
+  }
+  attr {
+    name: "embedding_config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tpu_embedding_config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "is_global_init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Conj"
   input_arg {
@@ -6931,6 +7134,33 @@ op {
     }
   }
 }
+op {
+  name: "CrossReplicaSum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_UINT32
+      }
+    }
+  }
+}
 op {
   name: "CudnnRNN"
   input_arg {
@@ -8284,6 +8514,7 @@ op {
     has_minimum: true
     minimum: 1
   }
+  is_stateful: true
 }
 op {
   name: "DebugGradientIdentity"
@@ -8841,6 +9072,8 @@ op {
         type: DT_INT16
         type: DT_INT8
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
@@ -10158,6 +10391,124 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "EnqueueTPUEmbeddingIntegerBatch"
+  input_arg {
+    name: "batch"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseBatch"
+  input_arg {
+    name: "sample_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseTensorBatch"
+  input_arg {
+    name: "sample_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "table_ids"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
 op {
   name: "EnsureShape"
   input_arg {
@@ -10296,6 +10647,66 @@ op {
     }
   }
 }
+op {
+  name: "EuclideanNorm"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Exit"
   input_arg {
@@ -10546,6 +10957,7 @@ op {
     name: "compression_type"
     type: DT_STRING
   }
+  is_stateful: true
 }
 op {
   name: "ExperimentalDenseToSparseBatchDataset"
@@ -14615,6 +15027,108 @@ op {
     }
   }
 }
+op {
+  name: "InfeedDequeue"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedDequeueTuple"
+  output_arg {
+    name: "outputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedEnqueue"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "layout"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedEnqueueTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "layouts"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "InitializeTable"
   input_arg {
@@ -15848,999 +16362,699 @@ op {
   is_stateful: true
 }
 op {
-  name: "Log"
+  name: "LoadTPUEmbeddingADAMParameters"
   input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "parameters"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "Log1p"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "momenta"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  input_arg {
+    name: "velocities"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
-  }
-}
-op {
-  name: "LogMatrixDeterminant"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "sign"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "log_abs_determinant"
-    type_attr: "T"
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
-}
-op {
-  name: "LogSoftmax"
-  input_arg {
-    name: "logits"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "logsoftmax"
-    type_attr: "T"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LogUniformCandidateSampler"
+  name: "LoadTPUEmbeddingADAMParametersGradAccumDebug"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+  input_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "true_expected_count"
+  input_arg {
+    name: "velocities"
     type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_expected_count"
+  input_arg {
+    name: "gradient_accumulators"
     type: DT_FLOAT
   }
   attr {
-    name: "num_true"
+    name: "table_id"
     type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
   attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "unique"
-    type: "bool"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "range_max"
+    name: "shard_id"
     type: "int"
-    has_minimum: true
-    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingAdadeltaParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "updates"
+    type: DT_FLOAT
   }
   attr {
-    name: "seed"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "LogicalAnd"
-  input_arg {
-    name: "x"
-    type: DT_BOOL
-  }
-  input_arg {
-    name: "y"
-    type: DT_BOOL
+  attr {
+    name: "num_shards"
+    type: "int"
   }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+  attr {
+    name: "shard_id"
+    type: "int"
   }
-  is_commutative: true
+  is_stateful: true
 }
 op {
-  name: "LogicalNot"
+  name: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug"
   input_arg {
-    name: "x"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "y"
-    type: DT_BOOL
+    name: "parameters"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "LogicalOr"
   input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "accumulators"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "y"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "updates"
+    type: DT_FLOAT
   }
-  is_commutative: true
-}
-op {
-  name: "LookupTableExport"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tkeys"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tvalues"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableExportV2"
+  name: "LoadTPUEmbeddingAdagradParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tkeys"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tvalues"
-    type: "type"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableFind"
+  name: "LoadTPUEmbeddingAdagradParametersGradAccumDebug"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "accumulators"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableFindV2"
+  name: "LoadTPUEmbeddingCenteredRMSPropParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "ms"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+    name: "mom"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tout"
+  input_arg {
+    name: "mg"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableImport"
+  name: "LoadTPUEmbeddingFTRLParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "accumulators"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "linears"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "type"
-  }
-  attr {
-    name: "Tout"
-    type: "type"
-  }
-}
-op {
-  name: "LookupTableImportV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableInsert"
-  input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
-  }
+  name: "LoadTPUEmbeddingFTRLParametersGradAccumDebug"
   input_arg {
-    name: "values"
-    type_attr: "Tout"
-  }
-  attr {
-    name: "Tin"
-    type: "type"
-  }
-  attr {
-    name: "Tout"
-    type: "type"
+    name: "parameters"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "LookupTableInsertV2"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "accumulators"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "linears"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tout"
-    type: "type"
-  }
-  is_stateful: true
-}
-op {
-  name: "LookupTableRemoveV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableSize"
-  input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "size"
-    type: DT_INT64
-  }
-}
-op {
-  name: "LookupTableSizeV2"
+  name: "LoadTPUEmbeddingMDLAdagradLightParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "size"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
-  is_stateful: true
-}
-op {
-  name: "LoopCond"
   input_arg {
-    name: "input"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "output"
-    type: DT_BOOL
+    name: "accumulators"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "LowerBound"
   input_arg {
-    name: "sorted_inputs"
-    type_attr: "T"
+    name: "weights"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
+    name: "benefits"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "out_type"
-    type: "type"
+    name: "table_name"
+    type: "string"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      s: ""
     }
   }
-}
-op {
-  name: "Lu"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "lu"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "p"
-    type_attr: "output_idx_type"
-  }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_HALF
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "output_idx_type"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "MakeIterator"
+  name: "LoadTPUEmbeddingMomentumParameters"
   input_arg {
-    name: "dataset"
-    type: DT_VARIANT
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
-  }
-  is_stateful: true
-}
-op {
-  name: "MapClear"
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "momenta"
+    type: DT_FLOAT
   }
   attr {
-    name: "memory_limit"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "container"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "MapDataset"
+  name: "LoadTPUEmbeddingMomentumParametersGradAccumDebug"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "momenta"
+    type: DT_FLOAT
   }
-  attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "preserve_cardinality"
-    type: "bool"
+    name: "table_id"
+    type: "int"
     default_value {
-      b: false
+      i: -1
     }
-  }
-}
-op {
-  name: "MapDefun"
-  input_arg {
-    name: "arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "captured_inputs"
-    type_list_attr: "Tcaptured"
-  }
-  output_arg {
-    name: "output"
-    type_list_attr: "output_types"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
   attr {
-    name: "Tcaptured"
-    type: "list(type)"
+    name: "table_name"
+    type: "string"
     default_value {
-      list {
-      }
+      s: ""
     }
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "MapIncompleteSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
+  name: "LoadTPUEmbeddingProximalAdagradParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "memory_limit"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "container"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "MapPeek"
+  name: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "memory_limit"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "MapSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
+  name: "LoadTPUEmbeddingRMSPropParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "mom"
+    type: DT_FLOAT
   }
   attr {
-    name: "memory_limit"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "container"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "MapStage"
+  name: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "ms"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_list_attr: "fake_dtypes"
+    name: "mom"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "capacity"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
-    has_minimum: true
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "fake_dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingStochasticGradientDescentParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "table_id"
+    type: "int"
     default_value {
-      s: ""
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "shared_name"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
   is_stateful: true
 }
 op {
-  name: "MapUnstage"
-  input_arg {
-    name: "key"
-    type: DT_INT64
-  }
+  name: "Log"
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "MapUnstageNoKey"
-  input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "key"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "MatMul"
-  input_arg {
-    name: "a"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "b"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "product"
+    name: "y"
     type_attr: "T"
   }
-  attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "transpose_b"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
@@ -16850,8 +17064,6 @@ op {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -16859,60 +17071,42 @@ op {
   }
 }
 op {
-  name: "MatchingFiles"
-  input_arg {
-    name: "pattern"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "filenames"
-    type: DT_STRING
-  }
-}
-op {
-  name: "MatrixBandPart"
+  name: "Log1p"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "num_lower"
-    type_attr: "Tindex"
-  }
-  input_arg {
-    name: "num_upper"
-    type_attr: "Tindex"
-  }
   output_arg {
-    name: "band"
+    name: "y"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tindex"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MatrixDeterminant"
+  name: "LogMatrixDeterminant"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "sign"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "log_abs_determinant"
     type_attr: "T"
   }
   attr {
@@ -16930,294 +17124,1298 @@ op {
   }
 }
 op {
-  name: "MatrixDiag"
+  name: "LogSoftmax"
   input_arg {
-    name: "diagonal"
+    name: "logits"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "logsoftmax"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
-  name: "MatrixDiagPart"
+  name: "LogUniformCandidateSampler"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "true_classes"
+    type: DT_INT64
   }
   output_arg {
-    name: "diagonal"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
+    name: "sampled_candidates"
+    type: DT_INT64
   }
-}
-op {
-  name: "MatrixExponential"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "sampled_expected_count"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_HALF
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
-  deprecation {
-    version: 27
-    explanation: "Use Python implementation tf.linalg.matrix_exponential instead."
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
-}
-op {
-  name: "MatrixInverse"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+  attr {
+    name: "unique"
+    type: "bool"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "seed"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_HALF
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MatrixLogarithm"
+  name: "LogicalAnd"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "x"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "y"
+    type: DT_BOOL
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "z"
+    type: DT_BOOL
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+  is_commutative: true
+}
+op {
+  name: "LogicalNot"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
   }
 }
 op {
-  name: "MatrixSetDiag"
+  name: "LogicalOr"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "x"
+    type: DT_BOOL
   }
   input_arg {
-    name: "diagonal"
-    type_attr: "T"
+    name: "y"
+    type: DT_BOOL
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "z"
+    type: DT_BOOL
+  }
+  is_commutative: true
+}
+op {
+  name: "LookupTableExport"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
   }
   attr {
-    name: "T"
+    name: "Tkeys"
+    type: "type"
+  }
+  attr {
+    name: "Tvalues"
     type: "type"
   }
 }
 op {
-  name: "MatrixSolve"
+  name: "LookupTableExportV2"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_RESOURCE
   }
-  input_arg {
-    name: "rhs"
-    type_attr: "T"
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_attr: "Tvalues"
   }
   attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "Tkeys"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Tvalues"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_HALF
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
+  is_stateful: true
 }
 op {
-  name: "MatrixSolveLs"
+  name: "LookupTableFind"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "rhs"
-    type_attr: "T"
+    name: "keys"
+    type_attr: "Tin"
   }
   input_arg {
-    name: "l2_regularizer"
-    type: DT_DOUBLE
+    name: "default_value"
+    type_attr: "Tout"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "T"
+    name: "Tin"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_HALF
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
   attr {
-    name: "fast"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "Tout"
+    type: "type"
   }
 }
 op {
-  name: "MatrixSquareRoot"
+  name: "LookupTableFindV2"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "T"
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_HALF
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
+  is_stateful: true
 }
 op {
-  name: "MatrixTriangularSolve"
+  name: "LookupTableImport"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "rhs"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "keys"
+    type_attr: "Tin"
   }
-  attr {
-    name: "lower"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Tout"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_HALF
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
 }
 op {
-  name: "Max"
+  name: "LookupTableImportV2"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "keys"
+    type_attr: "Tin"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableInsert"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableInsertV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableRemoveV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableSize"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+}
+op {
+  name: "LookupTableSizeV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  is_stateful: true
+}
+op {
+  name: "LoopCond"
+  input_arg {
+    name: "input"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "output"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "LowerBound"
+  input_arg {
+    name: "sorted_inputs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Lu"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "lu"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "output_idx_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "MakeIterator"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "MapClear"
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "MapDefun"
+  input_arg {
+    name: "arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "captured_inputs"
+    type_list_attr: "Tcaptured"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tcaptured"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "MapIncompleteSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapPeek"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapStage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "fake_dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "fake_dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapUnstage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapUnstageNoKey"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatchingFiles"
+  input_arg {
+    name: "pattern"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+}
+op {
+  name: "MatrixBandPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_lower"
+    type_attr: "Tindex"
+  }
+  input_arg {
+    name: "num_upper"
+    type_attr: "Tindex"
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindex"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "MatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixDiag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixDiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixExponential"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 27
+    explanation: "Use Python implementation tf.linalg.matrix_exponential instead."
+  }
+}
+op {
+  name: "MatrixInverse"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixLogarithm"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixSetDiag"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixSolveLs"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "MatrixSquareRoot"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixTriangularSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "lower"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
     name: "T"
@@ -18570,6 +19768,32 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "MulNoNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "MultiDeviceIterator"
   output_arg {
@@ -20143,6 +21367,80 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "OutfeedDequeue"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedDequeueTuple"
+  output_arg {
+    name: "outputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedEnqueue"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedEnqueueTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Pack"
   input_arg {
@@ -25937,6 +27235,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "RecvTPUEmbeddingActivations"
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+    number_attr: "num_outputs"
+  }
+  attr {
+    name: "num_outputs"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "ReduceDataset"
   input_arg {
@@ -25989,6 +27306,7 @@ op {
       b: true
     }
   }
+  is_stateful: true
 }
 op {
   name: "ReduceJoin"
@@ -26483,6 +27801,49 @@ op {
     }
   }
 }
+op {
+  name: "RequantizationRangePerChannel"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "clip_value_max"
+    type: "float"
+  }
+}
 op {
   name: "Requantize"
   input_arg {
@@ -26544,6 +27905,73 @@ op {
     }
   }
 }
+op {
+  name: "RequantizePerChannel"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
 op {
   name: "Reshape"
   input_arg {
@@ -29025,7 +30453,158 @@ op {
     }
   }
   attr {
-    name: "use_nesterov"
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
@@ -29034,17 +30613,21 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyMomentum"
+  name: "ResourceSparseApplyProximalGradientDescent"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
-    type: DT_RESOURCE
+    name: "alpha"
+    type_attr: "T"
   }
   input_arg {
-    name: "lr"
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
@@ -29055,10 +30638,6 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "momentum"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
@@ -29101,23 +30680,20 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyProximalAdagrad"
+  name: "ResourceSparseApplyRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
     type: DT_RESOURCE
   }
   input_arg {
@@ -29125,11 +30701,15 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
@@ -29185,311 +30765,843 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyProximalGradientDescent"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "alpha"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
+  name: "ResourceStridedSliceAssign"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "begin"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "end"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "strides"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Restore"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreSlice"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingADAMParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdadeltaParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdagradParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingCenteredRMSPropParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mg"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingFTRLParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "linears"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "linears"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingMDLAdagradLightParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
+  output_arg {
+    name: "weights"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  output_arg {
+    name: "benefits"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyRMSProp"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
+  name: "RetrieveTPUEmbeddingMomentumParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "ms"
-    type: DT_RESOURCE
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "mom"
-    type: DT_RESOURCE
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "rho"
-    type_attr: "T"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
-  input_arg {
-    name: "momentum"
-    type_attr: "T"
+  attr {
+    name: "shard_id"
+    type: "int"
   }
-  input_arg {
-    name: "epsilon"
-    type_attr: "T"
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "ResourceStridedSliceAssign"
-  input_arg {
-    name: "ref"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "begin"
-    type_attr: "Index"
-  }
-  input_arg {
-    name: "end"
-    type_attr: "Index"
-  }
-  input_arg {
-    name: "strides"
-    type_attr: "Index"
+  name: "RetrieveTPUEmbeddingProximalAdagradParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "value"
-    type_attr: "T"
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Index"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "begin_mask"
+    name: "num_shards"
     type: "int"
-    default_value {
-      i: 0
-    }
   }
   attr {
-    name: "end_mask"
+    name: "shard_id"
     type: "int"
-    default_value {
-      i: 0
-    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "ellipsis_mask"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "new_axis_mask"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
   attr {
-    name: "shrink_axis_mask"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
     type: "int"
-    default_value {
-      i: 0
-    }
   }
   is_stateful: true
 }
 op {
-  name: "Restore"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingRMSPropParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "ms"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "RestoreSlice"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "shape_and_slice"
-    type: DT_STRING
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "mom"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "RestoreV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingStochasticGradientDescentParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
@@ -31518,6 +33630,38 @@ op {
     }
   }
 }
+op {
+  name: "SendTPUEmbeddingGradients"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "learning_rates"
+    type: DT_FLOAT
+    number_attr: "NN"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "NN"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "SerializeIterator"
   input_arg {
@@ -31722,6 +33866,37 @@ op {
     }
   }
 }
+op {
+  name: "ShardDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_shards"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ShardedFilename"
   input_arg {
@@ -31837,6 +34012,10 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ShutdownDistributedTPU"
+  is_stateful: true
+}
 op {
   name: "Sigmoid"
   input_arg {
@@ -35832,14 +38011,6 @@ op {
     default_value {
       type: DT_FLOAT
     }
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
   }
   attr {
     name: "shape_dtype"
@@ -35847,11 +38018,39 @@ op {
     default_value {
       type: DT_INT64
     }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulStandardNormalV2"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "algorithm"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
     }
   }
   is_stateful: true
@@ -37029,40 +39228,333 @@ op {
       s: ""
     }
   }
-  deprecation {
-    version: 26
-    explanation: "Use TFRecordReaderV2"
+  deprecation {
+    version: 26
+    explanation: "Use TFRecordReaderV2"
+  }
+  is_stateful: true
+}
+op {
+  name: "TFRecordReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUCompilationResult"
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+}
+op {
+  name: "TPUEmbeddingActivations"
+  input_arg {
+    name: "embedding_variable"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sliced_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "lookup_id"
+    type: "int"
+    has_minimum: true
+  }
+}
+op {
+  name: "TPUOrdinalSelector"
+  output_arg {
+    name: "device_ordinals"
+    type: DT_INT32
   }
   is_stateful: true
 }
 op {
-  name: "TFRecordReaderV2"
+  name: "TPUPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  input_arg {
+    name: "device_ordinal"
+    type: DT_INT32
+  }
   output_arg {
-    name: "reader_handle"
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "TPUReplicate"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  input_arg {
+    name: "broadcast_inputs"
+    type_list_attr: "Tbroadcast_inputs"
+  }
+  input_arg {
+    name: "variables"
     type: DT_RESOURCE
+    number_attr: "NumVariables"
+  }
+  input_arg {
+    name: "guaranteed_constants"
+    type_list_attr: "Tguaranteed_constants"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "container"
+    name: "computation"
+    type: "func"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tbroadcast_inputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "NumVariables"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Tguaranteed_constants"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "step_marker_location"
     type: "string"
     default_value {
-      s: ""
+      s: "STEP_MARK_AT_ENTRY"
     }
   }
+  is_stateful: true
+}
+op {
+  name: "TPUReplicateMetadata"
   attr {
-    name: "compression_type"
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
     type: "string"
     default_value {
       s: ""
     }
   }
-  is_stateful: true
+  attr {
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "computation_shape"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "step_marker_location"
+    type: "string"
+    default_value {
+      s: "STEP_MARK_AT_ENTRY"
+    }
+  }
+}
+op {
+  name: "TPUReplicatedInput"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "TPUReplicatedOutput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "outputs"
+    type_attr: "T"
+    number_attr: "num_replicas"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
 }
 op {
   name: "TakeDataset"
@@ -38656,6 +41148,29 @@ op {
     }
   }
 }
+op {
+  name: "TensorListScatterIntoExistingList"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListScatterV2"
   input_arg {
@@ -39289,6 +41804,33 @@ op {
     }
   }
 }
+op {
+  name: "TridiagonalSolve"
+  input_arg {
+    name: "diagonals"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TruncateDiv"
   input_arg {
@@ -40662,6 +43204,18 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "WorkerHeartbeat"
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "WrapDatasetVariant"
   input_arg {
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc
index eff453241d47c55750b9662e13b8755e2d3a42b9..169076a6f673e4e23a874e6f369575f07fbd5168 100644
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@@ -26,7 +26,10 @@ using shape_inference::ShapeHandle;
 REGISTER_OP("DecodeRaw")
     .Input("bytes: string")
     .Output("output: out_type")
-    .Attr("out_type: {half,float,double,int32,uint16,uint8,int16,int8,int64}")
+    .Attr(
+        "out_type: "
+        "{half,float,double,int32,uint16,uint8,int16,int8,int64,complex64,"
+        "complex128}")
     .Attr("little_endian: bool = true")
     .SetShapeFn([](InferenceContext* c) {
       // Note: last dimension is data dependent.
diff --git a/tensorflow/core/ops/sendrecv_ops.cc b/tensorflow/core/ops/sendrecv_ops.cc
index 7d0fda2f87fc14804486a0edcd35f221c1045917..e84a4796c1f14d79e539b5c16f7e6e6b89421abd 100644
--- a/tensorflow/core/ops/sendrecv_ops.cc
+++ b/tensorflow/core/ops/sendrecv_ops.cc
@@ -106,8 +106,8 @@ REGISTER_OP("_HostRecv")
     .Doc(R"doc(
 Receives the named tensor from send_device on recv_device.
 
-_HostRecv requires its input on host memory whereas _Recv requires its
-input on device memory.
+_HostRecv produces its output on host memory whereas _Recv produces its
+output on device memory.
 
 tensor: The tensor to receive.
 tensor_name: The name of the tensor to receive.
diff --git a/tensorflow/core/ops/stateful_random_ops.cc b/tensorflow/core/ops/stateful_random_ops.cc
index 4c9277eda557bedf8f7e9e4368892ee46810d768..2162107b7e5dba9a26400b494957afb038713240 100644
--- a/tensorflow/core/ops/stateful_random_ops.cc
+++ b/tensorflow/core/ops/stateful_random_ops.cc
@@ -19,18 +19,42 @@ limitations under the License.
 namespace tensorflow {
 
 Status StatefulRandomShape(shape_inference::InferenceContext* c) {
-  shape_inference::ShapeHandle out;
-  TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
+  using shape_inference::ShapeHandle;
+
+  // Check algorithm shape
+  ShapeHandle unused;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+
+  // Set output shape
+  ShapeHandle out;
+  TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(2, &out));
   c->set_output(0, out);
   return Status::OK();
 }
 
-REGISTER_OP("StatefulStandardNormal")
+REGISTER_OP("StatefulStandardNormalV2")
     .Input("resource: resource")
+    .Input("algorithm: int64")
     .Input("shape: shape_dtype")
     .Output("output: dtype")
-    .Attr("dtype: {half,bfloat16,float,double} = DT_FLOAT")
-    .Attr("shape_dtype: {int32, int64} = DT_INT64")
+    .Attr("dtype : type = DT_FLOAT")
+    .Attr("shape_dtype : type = DT_INT64")
     .SetShapeFn(StatefulRandomShape);
 
+// Register the old 'StatefulStandardNormal' op
+REGISTER_OP("StatefulStandardNormal")
+    .Input("resource: resource")
+    .Input("shape: shape_dtype")
+    .Output("output: dtype")
+    .Attr("dtype : type = DT_FLOAT")
+    .Attr("shape_dtype : type = DT_INT64")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      using shape_inference::ShapeHandle;
+      // Set output shape
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    });
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc b/tensorflow/core/ops/tpu_configuration_ops.cc
similarity index 92%
rename from tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
rename to tensorflow/core/ops/tpu_configuration_ops.cc
index d5600eef4a9dc69fcfd931a083f86d7941ba8fb4..febb25096fdbfa006a5353c9719c1e7ce1852504 100644
--- a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
+++ b/tensorflow/core/ops/tpu_configuration_ops.cc
@@ -193,25 +193,10 @@ REGISTER_OP("ConfigureDistributedTPU")
     .Attr("tpu_embedding_config: string = ''")
     .Attr("is_global_init: bool = false")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-An op that sets up the centralized structures for a distributed TPU
-system.
-
-topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
-topology.
-tpu_embedding_config: Serialized tensorflow.tpu.TPUEmbeddingConfiguration that
-describes the embedding lookups of the program.
-embedding_config: Reserved. Do not use.
-is_global_init: Reserved. Do not use.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("ShutdownDistributedTPU")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-An op that shuts down a running distributed TPU system. The Op returns
-an error if no system is running.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 }  // end namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/cross_replica_ops.cc b/tensorflow/core/ops/tpu_cross_replica_ops.cc
similarity index 53%
rename from tensorflow/contrib/tpu/ops/cross_replica_ops.cc
rename to tensorflow/core/ops/tpu_cross_replica_ops.cc
index 87e3a5946c20be8e2c7a24e198d1fb94335a6b86..c26b49eb34b116b5bab5aa1e0154724318c3dbb9 100644
--- a/tensorflow/contrib/tpu/ops/cross_replica_ops.cc
+++ b/tensorflow/core/ops/tpu_cross_replica_ops.cc
@@ -26,7 +26,7 @@ REGISTER_OP("AllToAll")
     .Input("input: T")
     .Input("group_assignment: int32")
     .Output("output: T")
-    .Attr("T: {bfloat16, float}")
+    .Attr("T: {numbertype, bool}")
     .Attr("concat_dimension: int")
     .Attr("split_dimension: int")
     .Attr("split_count: int")
@@ -70,79 +70,19 @@ REGISTER_OP("AllToAll")
 
       c->set_output(0, c->MakeShape(dims));
       return Status::OK();
-    })
-    .Doc(R"doc(
-An Op to exchange data across TPU replicas. On each replica, the input is
-split into `split_count` blocks along `split_dimension` and send to the other
-replicas given group_assignment. After receiving `split_count` - 1 blocks from
-other replicas, we concatenate the blocks along `concat_dimension` as the
-output.
-
-For example, suppose there are 2 TPU replicas:
-replica 0 receives input: `[[A, B]]`
-replica 1 receives input: `[[C, D]]`
-
-group_assignment=`[[0, 1]]`
-concat_dimension=0
-split_dimension=1
-split_count=2
-
-replica 0's output: `[[A], [C]]`
-replica 1's output: `[[B], [D]]`
-
-input: The local input to the sum.
-group_assignment: An int32 tensor with shape
-  [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
-  replica ids in the ith subgroup.
-concat_dimension: The dimension number to concatenate.
-split_dimension: The dimension number to split.
-split_count: The number of splits, this number must equal to the sub-group
-  size(group_assignment.get_shape()[1])
-output: The exchanged result.
-T: The type of elements to be exchanged.
-)doc");
+    });
 
 REGISTER_OP("CrossReplicaSum")
     .Input("input: T")
     .Input("group_assignment: int32")
     .Output("output: T")
-    .Attr("T: {bfloat16, float}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-An Op to sum inputs across replicated TPU instances. Each instance supplies its
-own input.
-
-For example, suppose there are 8 TPU instances: `[A, B, C, D, E, F, G, H]`.
-Passing group_assignment=`[[0,2,4,6],[1,3,5,7]]` sets `A, C, E, G` as group 0,
-and `B, D, F, H` as group 1. Thus we get the outputs:
-`[A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H]`.
-
-input: The local input to the sum.
-group_assignment: An int32 tensor with shape
-  [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
-  replica ids in the ith subgroup.
-output: The sum of all the distributed inputs.
-T: The type of elements to be summed.
-)doc");
+    .Attr("T: {bfloat16, float, int32, uint32}")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("CollectivePermute")
     .Input("input: T")
     .Input("source_target_pairs: int32")
     .Output("output: T")
     .Attr("T: numbertype")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-An Op to permute tensors across replicated TPU instances. Each instance
-supplies its own input.
-
-For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
-source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs:
-`[D, A, B, C]`.
-
-input: The local input to be permuted. Currently only supports float and
-  bfloat16.
-source_target_pairs: A tensor with shape [num_pairs, 2].
-output: The permuted input.
-T: The type of elements to be exchanged.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc b/tensorflow/core/ops/tpu_embedding_ops.cc
similarity index 63%
rename from tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
rename to tensorflow/core/ops/tpu_embedding_ops.cc
index 676aed0b7b651494eda80ff2d7c7c31097529590..79ebc09adc2f8dc65336342d5b4843f35599cdaf 100644
--- a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
+++ b/tensorflow/core/ops/tpu_embedding_ops.cc
@@ -13,9 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/proto/tpu_embedding_configuration.pb.h"
-#include "tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h"
-#include "tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
@@ -23,6 +20,9 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h"
+#include "tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h"
+#include "tensorflow/core/tpu/tpu_embedding_output_layout_utils.h"
 
 namespace tensorflow {
 
@@ -96,10 +96,6 @@ Status RegisterPerTableLoadOpsForAlgorithmBody(
     if (parameter.has_user_defined() || is_debug_op) {
       auto* arg = op_def->add_input_arg();
       arg->set_name(parameter.name());
-      arg->set_description(
-          strings::StrCat("Value of ", parameter.name(), " used in the ",
-                          GetOptimizationAlgorithmFriendlyName(alg),
-                          " optimization algorithm."));
       arg->set_type(DT_FLOAT);
     }
   }
@@ -127,7 +123,6 @@ Status RegisterPerTableLoadOpsForAlgorithmBody(
     shard_id_attr->set_name("shard_id");
     shard_id_attr->set_type("int");
   }
-  op_def->set_summary("Load embedding parameters for a single table.");
   string parameter_descriptions;
   for (const auto& parameter : state_variable_specs) {
     if (parameter.has_user_defined() || is_debug_op) {
@@ -139,21 +134,6 @@ lookups using the %s optimization algorithm.)",
                        GetOptimizationAlgorithmFriendlyName(alg).c_str());
     }
   }
-  op_def->set_description(strings::Printf(R"doc(
-An op that loads optimization parameters into HBM for embedding. Must be
-preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-embedding table configuration. For example, this op is used to install
-parameters that are loaded from a checkpoint before a training loop is
-executed.
-%s
-table_name: Name of this table; must match a name in the
-  TPUEmbeddingConfiguration proto (overrides table_id).
-num_shards: Number of shards into which the embedding tables are divided.
-shard_id: Identifier of shard for this operation.
-table_id: Index of this table in the EmbeddingLayerConfiguration proto
-  (deprecated).
-)doc",
-                                          parameter_descriptions.c_str()));
   op_def->set_is_commutative(false);
   op_def->set_is_aggregate(false);
   op_def->set_is_stateful(true);
@@ -233,10 +213,6 @@ Status RegisterPerTableRetrieveOpsForAlgorithmBody(
     if (parameter.has_user_defined() || is_debug_op) {
       auto* arg = op_def->add_output_arg();
       arg->set_name(parameter.name());
-      arg->set_description(
-          strings::StrCat("Parameter ", parameter.name(), " updated by the ",
-                          tpu::GetOptimizationAlgorithmFriendlyName(alg),
-                          " optimization algorithm."));
       arg->set_type(DT_FLOAT);
     }
   }
@@ -264,7 +240,6 @@ Status RegisterPerTableRetrieveOpsForAlgorithmBody(
     shard_id_attr->set_name("shard_id");
     shard_id_attr->set_type("int");
   }
-  op_def->set_summary("Retrieve embedding parameters for a single table.");
   string parameter_descriptions;
   for (const auto& param : state_variable_specs) {
     if (param.has_user_defined() || is_debug_op) {
@@ -276,20 +251,6 @@ parameters from embedding updates using the %s optimization algorithm.)",
                        tpu::GetOptimizationAlgorithmFriendlyName(alg).c_str());
     }
   }
-  op_def->set_description(strings::Printf(R"doc(
-An op that retrieves optimization parameters from embedding to host
-memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-the correct embedding table configuration. For example, this op is
-used to retrieve updated parameters before saving a checkpoint.
-%s
-table_name: Name of this table; must match a name in the
-  TPUEmbeddingConfiguration proto (overrides table_id).
-num_shards: Number of shards into which the embedding tables are divided.
-shard_id: Identifier of shard for this operation.
-table_id: Index of this table in the EmbeddingLayerConfiguration proto
-  (deprecated).
-)doc",
-                                          parameter_descriptions.c_str()));
   op_def->set_is_commutative(false);
   op_def->set_is_aggregate(false);
   op_def->set_is_stateful(true);
@@ -388,23 +349,7 @@ REGISTER_OP("RecvTPUEmbeddingActivations")
         c->set_output(i, output_shape);
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op that receives embedding activations on the TPU.
-
-The TPU system performs the embedding lookups and aggregations specified by
-the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The
-results of these aggregations are visible to the Tensorflow Graph as the
-outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing
-one Tensor of activations per table specified in the model. There can be at
-most one RecvTPUEmbeddingActivations op in the TPU graph.
-
-outputs: A TensorList of embedding activations containing one Tensor per
-    embedding table in the model.
-num_outputs: The number of output activation tensors, equal to the number of
-    embedding tables in the model.
-config: Serialized TPUEmbeddingConfiguration proto.
-)doc");
+    });
 
 REGISTER_OP("TPUEmbeddingActivations")
     .Input("embedding_variable: float32")
@@ -415,23 +360,7 @@ REGISTER_OP("TPUEmbeddingActivations")
     .SetShapeFn([](shape_inference::InferenceContext *c) {
       c->set_output(0, c->input(1));
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op enabling differentiation of TPU Embeddings.
-
-This op simply returns its first input, which is assumed to have been sliced
-from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of this
-op, and its first argument being a trainable Variable, enables automatic
-differentiation of graphs containing embeddings via the TPU Embedding Python
-libraries.
-
-embedding_variable: A trainable variable, enabling optimizers to find this op.
-sliced_activations: The embedding activations Tensor to return.
-table_id: The id of the table in the embedding layer configuration from which
-    these activations were computed.
-lookup_id: Identifier of the set of embedding indices which produced these
-    activations.
-)doc");
+    });
 
 REGISTER_OP("SendTPUEmbeddingGradients")
     .Input("inputs: N * float32")
@@ -453,25 +382,7 @@ REGISTER_OP("SendTPUEmbeddingGradients")
       }
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op that performs gradient updates of embedding tables using the specified
-learning rates.
-
-inputs: A TensorList of gradients with which to update embedding tables.
-    This argument has the same length and shapes as the return value of
-    RecvTPUEmbeddingActivations, but contains gradients of the model's loss
-    with respect to the embedding activations. The embedding tables are updated
-    from these gradients via the optimizer specified in the TPU embedding
-    configuration given to tpu.initialize_system.
-learning_rates: A TensorList of float32 scalars, one for each dynamic learning
-    rate tag: see the comments in
-    //third_party/tensorflow/contrib/tpu/proto/optimization_parameters.proto.
-    Multiple tables can share the same dynamic learning rate tag as specified
-    in the configuration. If the learning rates for all tables are constant,
-    this list should be empty.
-config: Serialized TPUEmbeddingConfiguration proto.
-)doc");
+    });
 
 REGISTER_OP("EnqueueTPUEmbeddingIntegerBatch")
     .Input("batch: N * int32")
@@ -479,19 +390,7 @@ REGISTER_OP("EnqueueTPUEmbeddingIntegerBatch")
     .Attr("N: int >= 1")
     .Attr("device_ordinal: int = -1")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-An op that enqueues a list of input batch tensors to TPUEmbedding.
-
-batch: A list of 1D tensors, one for each embedding table, containing the
-    indices into the tables.
-mode_override: A string input that overrides the mode specified in the
-    TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
-    'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
-    in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
-device_ordinal: The TPU device to use. Should be >= 0 and less than the number
-    of TPU cores in the task on which the node is placed.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("EnqueueTPUEmbeddingSparseBatch")
     .Input("sample_indices: N * int32")
@@ -514,41 +413,7 @@ REGISTER_OP("EnqueueTPUEmbeddingSparseBatch")
       }
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op that enqueues TPUEmbedding input indices from a SparseTensor.
-
-This Op eases the porting of code that uses embedding_lookup_sparse(),
-although some Python preprocessing of the SparseTensor arguments to
-embedding_lookup_sparse() is required to produce the arguments to this Op,
-since only a single EnqueueTPUEmbeddingSparseBatch Op is allowed per training
-step.
-
-The tensors at corresponding positions in the three input lists
-must have the same shape, i.e. rank 1 with dim_size() equal to the total
-number of lookups into the table described by the corresponding table_id.
-
-sample_indices: A list of rank 1 Tensors specifying the training example and
-    feature to which the corresponding embedding_indices and aggregation_weights
-    values belong. sample_indices[i] must equal b * nf + f, where nf is the
-    number of features from the corresponding table, f is in [0, nf), and
-    b is in [0, batch size).
-embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
-aggregation_weights: A list of rank 1 Tensors containing per sample -- i.e. per
-    (training example, feature) -- aggregation weights.
-mode_override: A string input that overrides the mode specified in the
-    TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
-    'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
-    in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
-device_ordinal: The TPU device to use. Should be >= 0 and less than the number
-    of TPU cores in the task on which the node is placed.
-combiners: A list of string scalars, one for each embedding table that specify
-    how to normalize the embedding activations after weighted summation.
-    Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
-    the sum of the weights be 0 for 'mean' or the sum of the squared weights be
-    0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
-    all tables.
-)doc");
+    });
 
 REGISTER_OP("EnqueueTPUEmbeddingSparseTensorBatch")
     .Input("sample_indices: N * int32")
@@ -560,44 +425,6 @@ REGISTER_OP("EnqueueTPUEmbeddingSparseTensorBatch")
     .Attr("combiners: list(string) = []")
     .Attr("table_ids: list(int)")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-This Op eases the porting of code that uses tf.nn.embedding_lookup_sparse().
-
-sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond
-to the ith feature. table_ids[i] indicates which embedding table to look up ith
-feature.
-
-The tensors at corresponding positions in the three input lists (sample_indices,
-embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1
-with dim_size() equal to the total number of lookups into the table described by
-the corresponding feature.
-
-sample_indices: A list of rank 1 Tensors specifying the training example to
-    which the corresponding embedding_indices and aggregation_weights values
-    belong. It corresponds to sp_ids.indices[:,0] in  embedding_lookup_sparse().
-embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
-    It corresponds to sp_ids.values in embedding_lookup_sparse().
-aggregation_weights: A list of rank 1 Tensors containing per training example
-    aggregation weights. It corresponds to sp_weights.values in
-    embedding_lookup_sparse().
-mode_override: A string input that overrides the mode specified in the
-    TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
-    'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
-    in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
-device_ordinal: The TPU device to use. Should be >= 0 and less than the number
-    of TPU cores in the task on which the node is placed.
-combiners: A list of string scalars, one for each embedding table that specify
-    how to normalize the embedding activations after weighted summation.
-    Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
-    the sum of the weights be 0 for 'mean' or the sum of the squared weights be
-    0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
-    all tables.
-table_ids: A list of integers specifying the identifier of the embedding table
-    (offset of TableDescriptor in the TPUEmbeddingConfiguration) to lookup the
-    corresponding input. The ith input is looked up using table_ids[i]. The size
-    of the table_ids list must be equal to that of sample_indices,
-    embedding_indices and aggregation_weights.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/functional_ops.cc b/tensorflow/core/ops/tpu_functional_ops.cc
similarity index 100%
rename from tensorflow/contrib/tpu/ops/functional_ops.cc
rename to tensorflow/core/ops/tpu_functional_ops.cc
diff --git a/tensorflow/contrib/tpu/ops/heartbeat_ops.cc b/tensorflow/core/ops/tpu_heartbeat_ops.cc
similarity index 72%
rename from tensorflow/contrib/tpu/ops/heartbeat_ops.cc
rename to tensorflow/core/ops/tpu_heartbeat_ops.cc
index ca0f5bc0e562cd9e27b4c456b53fb9f51f1cb1f8..660aa32c8278b27b307e229d427935f36e81e5f5 100644
--- a/tensorflow/contrib/tpu/ops/heartbeat_ops.cc
+++ b/tensorflow/core/ops/tpu_heartbeat_ops.cc
@@ -23,15 +23,6 @@ REGISTER_OP("WorkerHeartbeat")
     .Input("request: string")
     .Output("response: string")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Worker heartbeat op.
-
-Heartbeats may be sent periodically to indicate the coordinator is still active,
-to retrieve the current worker status and to expedite shutdown when necessary.
-
-request: A string tensor containing a serialized WorkerHeartbeatRequest
-response: A string tensor containing a serialized WorkerHeartbeatResponse
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/host_compute_ops.cc b/tensorflow/core/ops/tpu_host_compute_ops.cc
similarity index 100%
rename from tensorflow/contrib/tpu/ops/host_compute_ops.cc
rename to tensorflow/core/ops/tpu_host_compute_ops.cc
diff --git a/tensorflow/contrib/tpu/ops/infeed_ops.cc b/tensorflow/core/ops/tpu_infeed_ops.cc
similarity index 51%
rename from tensorflow/contrib/tpu/ops/infeed_ops.cc
rename to tensorflow/core/ops/tpu_infeed_ops.cc
index 2ed16c2a2270a5399059d7e07f5903e11098bbf9..0090b761c48dc8f953a7ff9a211d99ca29a2f210 100644
--- a/tensorflow/contrib/tpu/ops/infeed_ops.cc
+++ b/tensorflow/core/ops/tpu_infeed_ops.cc
@@ -27,14 +27,7 @@ REGISTER_OP("InfeedDequeue")
     .Attr("dtype: type")
     .Attr("shape: shape")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ExplicitShape)
-    .Doc(R"doc(
-A placeholder op for a value that will be fed into the computation.
-
-output: A tensor that will be provided using the infeed mechanism.
-dtype: The type of elements in the tensor.
-shape: The shape of the tensor.
-)doc");
+    .SetShapeFn(shape_inference::ExplicitShape);
 
 REGISTER_OP("InfeedEnqueue")
     .Input("input: dtype")
@@ -43,20 +36,7 @@ REGISTER_OP("InfeedEnqueue")
     .Attr("layout: list(int) = []")
     .Attr("device_ordinal: int = -1")
     .SetShapeFn(shape_inference::NoOutputs)
-    .SetIsStateful()
-    .Doc(R"doc(
-An op which feeds a single Tensor value into the computation.
-
-input: A tensor that will be provided using the infeed mechanism.
-dtype: The type of elements in the tensor.
-shape: The shape of the tensor.
-layout: A vector holding the requested layout in minor-to-major sequence.
-If a layout attribute is passed, but its values are all -1, the layout will
-be computed by the infeed operation.
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("InfeedEnqueueTuple")
     .Input("inputs: dtypes")
@@ -65,21 +45,7 @@ REGISTER_OP("InfeedEnqueueTuple")
     .Attr("layouts: list(int) = []")
     .Attr("device_ordinal: int = -1")
     .SetShapeFn(shape_inference::NoOutputs)
-    .SetIsStateful()
-    .Doc(R"doc(
-An op which feeds multiple Tensor values into the computation as an XLA tuple.
-
-inputs: A list of tensors that will be provided using the infeed mechanism.
-dtypes: The element types of each element in `inputs`.
-shapes: The shapes of each tensor in `inputs`.
-layouts: A vector holding the requested layout in minor-to-major sequence for
-all the tuple shapes, in the order the shapes appear in the "shapes" input.
-The layout elements for a sub-shape can be set to -1, in which case the
-corresponding layout will be computed by the infeed operation.
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("InfeedDequeueTuple")
     .Output("outputs: dtypes")
@@ -95,14 +61,6 @@ REGISTER_OP("InfeedDequeueTuple")
         c->set_output(i, out);
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-A placeholder op for multiple values that will be fed into the computation
-simultaneously as an XLA tuple.
-
-outputs: A list of tensors that will be provided using the infeed mechanism.
-dtypes: The element types of each element in `outputs`.
-shapes: The shapes of each tensor in `outputs`.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/tpu_ordinal_selector_op.cc b/tensorflow/core/ops/tpu_ordinal_selector_ops.cc
similarity index 80%
rename from tensorflow/contrib/tpu/ops/tpu_ordinal_selector_op.cc
rename to tensorflow/core/ops/tpu_ordinal_selector_ops.cc
index 54e6b20f7f388b67a96ac8acfe814a4202b56a18..72f24c57dbb4be388264da3c15a1e4fa0de9eb1c 100644
--- a/tensorflow/contrib/tpu/ops/tpu_ordinal_selector_op.cc
+++ b/tensorflow/core/ops/tpu_ordinal_selector_ops.cc
@@ -25,15 +25,6 @@ REGISTER_OP("TPUOrdinalSelector")
       c->set_output(0,
                     c->Vector(shape_inference::InferenceContext::kUnknownDim));
       return Status::OK();
-    })
-    .Doc(R"doc(
-A TPU core selector Op.
-
-This Op produces a set of TPU cores (for warm-up) or a single TPU core
-(for regular inference) to execute the TPU program on. The output is
-consumed by TPUPartitionedCall.
-
-device_ordinals: A vector 1 or more TPU cores.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/outfeed_ops.cc b/tensorflow/core/ops/tpu_outfeed_ops.cc
similarity index 59%
rename from tensorflow/contrib/tpu/ops/outfeed_ops.cc
rename to tensorflow/core/ops/tpu_outfeed_ops.cc
index b05c76ca64fbaedc205ab06cc31616787ccc84b8..e170ed05a0cd283f086bd75ac28375f3afa15bae 100644
--- a/tensorflow/contrib/tpu/ops/outfeed_ops.cc
+++ b/tensorflow/core/ops/tpu_outfeed_ops.cc
@@ -26,24 +26,13 @@ REGISTER_OP("OutfeedEnqueue")
     .Input("input: dtype")
     .Attr("dtype: type")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-An op which emits a single Tensor value from an XLA computation.
-
-input: A tensor that will be inserted into the outfeed queue.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("OutfeedEnqueueTuple")
     .Input("inputs: dtypes")
     .Attr("dtypes: list(type)")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-An op which emits multiple Tensor values from an XLA computation.
-
-inputs: A list of tensors that will be inserted into the outfeed queue as an
-XLA tuple.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("OutfeedDequeue")
     .Output("output: dtype")
@@ -51,18 +40,7 @@ REGISTER_OP("OutfeedDequeue")
     .Attr("shape: shape")
     .Attr("device_ordinal: int = -1")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ExplicitShape)
-    .Doc(R"doc(
-Retrieves a single tensor from the computation outfeed.  This operation will
-block indefinitely until data is available.
-
-output: A tensor that will be read from the device outfeed.
-dtype: The type of elements in the tensor.
-shape: The shape of the tensor.
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
-)doc");
+    .SetShapeFn(shape_inference::ExplicitShape);
 
 REGISTER_OP("OutfeedDequeueTuple")
     .Output("outputs: dtypes")
@@ -85,18 +63,6 @@ REGISTER_OP("OutfeedDequeueTuple")
         c->set_output(i, out);
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-Retrieve multiple values that will be emitted by the computation as an XLA
-tuple.  This operations will block indefinitely until data is available.
-Output `i` corresponds to XLA tuple element `i`.
-
-outputs: A list of tensors that will be read from the outfeed.
-dtypes: The element types of each element in `outputs`.
-shapes: The shapes of each tensor in `outputs`.
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/replication_ops.cc b/tensorflow/core/ops/tpu_replication_ops.cc
similarity index 69%
rename from tensorflow/contrib/tpu/ops/replication_ops.cc
rename to tensorflow/core/ops/tpu_replication_ops.cc
index d4180d1a20bc59f3fbb37b2dbc67790ded9d2d90..7c8949260053a6ca7fd02449d9934a02d6d227ea 100644
--- a/tensorflow/contrib/tpu/ops/replication_ops.cc
+++ b/tensorflow/core/ops/tpu_replication_ops.cc
@@ -32,6 +32,7 @@ REGISTER_OP("TPUReplicateMetadata")
     .Attr("computation_shape: list(int) = []")
     .Attr("host_compute_core: list(string) = []")
     .Attr("padding_map: list(string) = []")
+    .Attr("step_marker_location: string = \"STEP_MARK_AT_ENTRY\"")
     .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("TPUReplicatedInput")
@@ -69,10 +70,7 @@ REGISTER_OP("TPUReplicatedInput")
         }
       }
       return Status::OK();
-    })
-    .Doc(
-        "Operator that connects N unreplicated inputs to an N-way "
-        "replicated TPU computation.");
+    });
 
 REGISTER_OP("TPUReplicatedOutput")
     .Input("input: T")
@@ -84,10 +82,7 @@ REGISTER_OP("TPUReplicatedOutput")
         c->set_output(i, c->input(0));
       }
       return Status::OK();
-    })
-    .Doc(
-        "Operator that connects the output of an N-way replicated TPU "
-        "computation to N separate outputs.");
+    });
 
 REGISTER_OP("TPUCompilationResult")
     .Output("output: string")
@@ -107,40 +102,12 @@ REGISTER_OP("TPUReplicate")
     .Attr("Tguaranteed_constants: list(type) >= 0")
     .Attr("output_types: list(type) >= 0")
     .Attr("padding_map: list(string) = []")
+    .Attr("step_marker_location: string = \"STEP_MARK_AT_ENTRY\"")
     .Input("inputs: Tinputs")
     .Input("broadcast_inputs: Tbroadcast_inputs")
     .Input("variables: NumVariables * resource")
     .Input("guaranteed_constants: Tguaranteed_constants")
     .Output("outputs: output_types")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Runs replicated computations on a distributed TPU system.
-
-computation: a function containing the computation to run.
-num_replicas: the number of replicas of the computation to run.
-num_cores_per_replica: the number of logical cores in each replica.
-topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
-topology.
-use_tpu: a bool indicating if this computation will run on TPU or CPU/GPU.
-Currently, only supports a default placement (computation is placed on GPU
-if one is available, and on CPU if not).
-device_assignment: a flattened array with shape
-  [replica, num_cores_per_replica, mesh_dimension] that maps the coordinates
-  of logical cores in each replica of a computation to physical coordinates in
-  the TPU topology.
-Tinputs: the types of the arguments to 'computation'.
-inputs: the inputs to 'computation', flattened, in replica-major order.
-Tbroadcast_inputs: the types of the additional arguments to broadcast to all
-  replicas.
-Tguaranteed_constants: the types of the arguments to 'guaranteed_constants'.
-broadcast_inputs: additional arguments to broadcast to all replicas. The
-  broadcast inputs are appended to the per-replica inputs when calling
-  computation.
-guaranteed_constants: arguments which have been guaranteed to not
-change their values during the session lifetime. These contain tensors marked as
-constant using the GuaranteeConstOp.
-output_types: the types of the outputs of 'computation'.
-outputs: the outputs of 'computation'.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 647a797b82cf30384f7f48611788a62a952d5627..40a808b661cbff48f1c4198bcfca5a2261292a25 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -151,6 +151,7 @@ cc_library(
         ":retrying_utils",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/strings",
         "@jsoncpp_git//:jsoncpp",
     ],
 )
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index d1e5779f023d205bb9595e7dbd322eb9e7e73fe6..10b57df183d370966338a1c2e6a6ab42aed9b75c 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -64,7 +64,7 @@ constexpr int kGetChildrenDefaultPageSize = 1000;
 // The HTTP response code "308 Resume Incomplete".
 constexpr uint64 HTTP_CODE_RESUME_INCOMPLETE = 308;
 // The environment variable that overrides the size of the readahead buffer.
-ABSL_DEPRECATED("Use GCS_BLOCK_SIZE_MB instead.")
+ABSL_DEPRECATED("Use GCS_READ_CACHE_BLOCK_SIZE_MB instead.")
 constexpr char kReadaheadBufferSize[] = "GCS_READAHEAD_BUFFER_SIZE_BYTES";
 // The environment variable that disables the GCS block cache for reads.
 // This is the explicit alternative to setting BLOCK_SIZE or MAX_SIZE to 0, and
@@ -73,11 +73,11 @@ constexpr char kReadCacheDisabled[] = "GCS_READ_CACHE_DISABLED";
 // The environment variable that overrides the block size for aligned reads from
 // GCS. Specified in MB (e.g. "16" = 16 x 1024 x 1024 = 16777216 bytes).
 constexpr char kBlockSize[] = "GCS_READ_CACHE_BLOCK_SIZE_MB";
-constexpr size_t kDefaultBlockSize = 128 * 1024 * 1024;
+constexpr size_t kDefaultBlockSize = 16 * 1024 * 1024;
 // The environment variable that overrides the max size of the LRU cache of
 // blocks read from GCS. Specified in MB.
 constexpr char kMaxCacheSize[] = "GCS_READ_CACHE_MAX_SIZE_MB";
-constexpr size_t kDefaultMaxCacheSize = 2 * kDefaultBlockSize;
+constexpr size_t kDefaultMaxCacheSize = kDefaultBlockSize;
 // The environment variable that overrides the maximum staleness of cached file
 // contents. Once any block of a file reaches this staleness, all cached blocks
 // will be evicted on the next read.
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index dc9eb7796f76aa8eeb5137bd311f4a99940f1388..a998f8e3adf203ad3aad35ab0d3bf4e77fe75936 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -3155,8 +3155,8 @@ TEST(GcsFileSystemTest, AdditionalRequestHeaderTest) {
 TEST(GcsFileSystemTest, OverrideCacheParameters) {
   // Verify defaults are propagated correctly.
   GcsFileSystem fs1;
-  EXPECT_EQ(128 * 1024 * 1024, fs1.block_size());
-  EXPECT_EQ(2 * fs1.block_size(), fs1.max_bytes());
+  EXPECT_EQ(16 * 1024 * 1024, fs1.block_size());
+  EXPECT_EQ(fs1.block_size(), fs1.max_bytes());
   EXPECT_EQ(0, fs1.max_staleness());
   EXPECT_EQ(120, fs1.timeouts().connect);
   EXPECT_EQ(60, fs1.timeouts().idle);
diff --git a/tensorflow/core/platform/cloud/google_auth_provider.cc b/tensorflow/core/platform/cloud/google_auth_provider.cc
index e15400780af0880caadd2f79b7322f39e406ca2b..e91a9f89757ae6f0009ea20120cd98ab25cd1437 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider.cc
+++ b/tensorflow/core/platform/cloud/google_auth_provider.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #endif
 #include <fstream>
 #include <utility>
+#include "absl/strings/match.h"
 #include "include/json/json.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -44,6 +45,11 @@ constexpr char kGoogleAuthTokenForTesting[] = "GOOGLE_AUTH_TOKEN_FOR_TESTING";
 // The environment variable which can override '~/.config/gcloud' if set.
 constexpr char kCloudSdkConfig[] = "CLOUDSDK_CONFIG";
 
+// The environment variable used to skip attempting to fetch GCE credentials:
+// setting this to 'true' (case insensitive) will skip attempting to contact
+// the GCE metadata service.
+constexpr char kNoGceCheck[] = "NO_GCE_CHECK";
+
 // The default path to the gcloud config folder, relative to the home folder.
 constexpr char kGCloudConfigFolder[] = ".config/gcloud/";
 
@@ -146,10 +152,25 @@ Status GoogleAuthProvider::GetToken(string* t) {
   }
 
   auto token_from_files_status = GetTokenFromFiles();
-  auto token_from_gce_status =
-      token_from_files_status.ok() ? Status::OK() : GetTokenFromGce();
+  if (token_from_files_status.ok()) {
+    *t = current_token_;
+    return Status::OK();
+  }
+
+  char* no_gce_check_var = std::getenv(kNoGceCheck);
+  bool skip_gce_check = no_gce_check_var != nullptr &&
+                        absl::EqualsIgnoreCase(no_gce_check_var, "true");
+  Status token_from_gce_status;
+  if (skip_gce_check) {
+    token_from_gce_status =
+        Status(error::CANCELLED,
+               strings::StrCat("GCE check skipped due to presence of $",
+                               kNoGceCheck, " environment variable."));
+  } else {
+    token_from_gce_status = GetTokenFromGce();
+  }
 
-  if (token_from_files_status.ok() || token_from_gce_status.ok()) {
+  if (token_from_gce_status.ok()) {
     *t = current_token_;
     return Status::OK();
   }
@@ -165,8 +186,13 @@ Status GoogleAuthProvider::GetToken(string* t) {
   // so return an empty token instead of failing.
   *t = "";
 
-  // From now on, always return the empty token.
-  expiration_timestamp_sec_ = UINT64_MAX;
+  // We only want to keep returning our empty token if we've tried and failed
+  // the (potentially slow) task of detecting GCE.
+  if (skip_gce_check) {
+    expiration_timestamp_sec_ = 0;
+  } else {
+    expiration_timestamp_sec_ = UINT64_MAX;
+  }
   current_token_ = "";
 
   return Status::OK();
diff --git a/tensorflow/core/platform/cloud/google_auth_provider.h b/tensorflow/core/platform/cloud/google_auth_provider.h
index 3755b124a87fd0003e5a6343b1a07130f5519dd6..4ab816d54c61e99dea1e2db59d4815f5012d5adc 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider.h
+++ b/tensorflow/core/platform/cloud/google_auth_provider.h
@@ -51,7 +51,7 @@ class GoogleAuthProvider : public AuthProvider {
   /// Gets the bearer token from Google Compute Engine environment.
   Status GetTokenFromGce() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-  /// Gets the bearer token from the systen env variable, for testing purposes.
+  /// Gets the bearer token from the system env variable, for testing purposes.
   Status GetTokenForTesting() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   std::unique_ptr<OAuthClient> oauth_client_;
diff --git a/tensorflow/core/platform/cloud/google_auth_provider_test.cc b/tensorflow/core/platform/cloud/google_auth_provider_test.cc
index ec31c5ee8c11645cf9f8a5659538b46d56ce84ca..d2db59200abb4fd5db5a86d9a653729aa518ee63 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider_test.cc
+++ b/tensorflow/core/platform/cloud/google_auth_provider_test.cc
@@ -69,9 +69,10 @@ class GoogleAuthProviderTest : public ::testing::Test {
   void TearDown() override { ClearEnvVars(); }
 
   void ClearEnvVars() {
-    unsetenv("GOOGLE_APPLICATION_CREDENTIALS");
     unsetenv("CLOUDSDK_CONFIG");
+    unsetenv("GOOGLE_APPLICATION_CREDENTIALS");
     unsetenv("GOOGLE_AUTH_TOKEN_FOR_TESTING");
+    unsetenv("NO_GCE_CHECK");
   }
 };
 
@@ -238,4 +239,31 @@ TEST_F(GoogleAuthProviderTest, NothingAvailable) {
   EXPECT_EQ("", token);
 }
 
+TEST_F(GoogleAuthProviderTest, NoGceCheckEnvironmentVariable) {
+  setenv("NO_GCE_CHECK", "True", 1);
+  auto oauth_client = new FakeOAuthClient;
+
+  FakeEnv env;
+  // If the env var above isn't respected, attempting to fetch a token
+  // from GCE will segfault (as the metadata client is null).
+  GoogleAuthProvider provider(std::unique_ptr<OAuthClient>(oauth_client),
+                              nullptr, &env);
+
+  string token;
+  TF_EXPECT_OK(provider.GetToken(&token));
+  EXPECT_EQ("", token);
+
+  // We confirm that our env var is case insensitive.
+  setenv("NO_GCE_CHECK", "true", 1);
+  TF_EXPECT_OK(provider.GetToken(&token));
+  EXPECT_EQ("", token);
+
+  // We also want to confirm that our empty token has a short expiration set: we
+  // now set a testing token, and confirm that it's returned instead of our
+  // empty token.
+  setenv("GOOGLE_AUTH_TOKEN_FOR_TESTING", "newToken", 1);
+  TF_EXPECT_OK(provider.GetToken(&token));
+  EXPECT_EQ("newToken", token);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index e17ce4ebf320486960faade6a43a995da8b19c22..f9ac4ff0bca29acb36fe43dc3d3062eebfbee21d 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -530,19 +530,13 @@ def tf_additional_proto_hdrs():
     return [
         "platform/default/integral_types.h",
         "platform/default/logging.h",
-        "platform/default/protobuf.h",
     ] + if_windows([
         "platform/windows/integral_types.h",
     ])
 
-def tf_additional_proto_compiler_hdrs():
-    return [
-        "platform/default/protobuf_compiler.h",
-    ]
-
 def tf_additional_proto_srcs():
     return [
-        "platform/default/protobuf.cc",
+        "platform/protobuf.cc",
     ]
 
 def tf_additional_human_readable_json_deps():
@@ -560,6 +554,15 @@ def tf_protos_all():
         otherwise = ["//tensorflow/core:protos_all_cc"],
     )
 
+def tf_profiler_all_protos():
+    return ["//tensorflow/core/profiler:protos_all"]
+
+def tf_grpc_service_all():
+    return [
+        "//tensorflow/core/profiler:profiler_analysis_proto_cc",
+        "//tensorflow/core/profiler:profiler_service_proto_cc",
+    ]
+
 def tf_protos_grappler_impl():
     return ["//tensorflow/core/grappler/costs:op_performance_data_cc_impl"]
 
@@ -579,7 +582,10 @@ def tf_additional_device_tracer_cuda_deps():
     return []
 
 def tf_additional_device_tracer_deps():
-    return []
+    return [
+        "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/core/profiler/internal/cpu:host_tracer",
+    ]
 
 def tf_additional_device_tracer_test_flags():
     return []
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index 6faf5c512f393535d089e327925292167406a374..845fe0ec047f64f38a6b36e41d28f8d714384b76 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -7,6 +7,7 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow:tensorflow.bzl", "check_deps")
 load("//tensorflow:tensorflow.bzl", "if_cuda")
 load("//tensorflow:tensorflow.bzl", "if_rocm")
 load("//tensorflow:tensorflow.bzl", "tf_copts")
@@ -283,6 +284,20 @@ cc_library(
     ],
 )
 
+# Check that libtensorflow_framework.so does not depend on cuda shared libraries.
+check_deps(
+    name = "libtensorflow_cuda_check_deps",
+    disallowed_deps = [
+        ":cuda",
+        "@local_config_cuda//cuda:cublas",
+        "@local_config_cuda//cuda:cuda_driver",
+        "@local_config_cuda//cuda:cudnn",
+        "@local_config_cuda//cuda:curand",
+        "@local_config_cuda//cuda:cusolver",
+    ],
+    deps = ["//tensorflow:libtensorflow_framework.so"],
+)
+
 cc_library(
     name = "rocm",
     data = [],
diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc
index 8351362e05699c591b5563f2270928f4408077e8..ffcb38fdcd227840372923325bcb9e3a0733cb55 100644
--- a/tensorflow/core/platform/default/device_tracer.cc
+++ b/tensorflow/core/platform/default/device_tracer.cc
@@ -31,6 +31,8 @@ limitations under the License.
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/profiler/internal/cpu/host_tracer.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 
 namespace {
 
@@ -299,6 +301,14 @@ TF_STATIC_THREAD_LOCAL_POD(const char *, tls_current_annotation);
 
 class TraceCollectorImpl : public tracing::TraceCollector {
  public:
+  class ActivityHandle : public Handle {
+   public:
+    ActivityHandle(string &&name, int level)
+        : trace_me_(std::move(name), level) {}
+
+   private:
+    profiler::TraceMe trace_me_;
+  };
   TraceCollectorImpl() { tracing::SetTraceCollector(this); }
 
   ~TraceCollectorImpl() override {
@@ -318,14 +328,16 @@ class TraceCollectorImpl : public tracing::TraceCollector {
       }
       ~Impl() override { tls_current_annotation.get() = nullptr; }
     };
-    return std::unique_ptr<Handle>(
-        new Impl{ConcatenateNames(name_part1, name_part2)});
+    return absl::make_unique<Impl>(ConcatenateNames(name_part1, name_part2));
   }
 
-  virtual std::unique_ptr<Handle> CreateActivityHandle(StringPiece, StringPiece,
-                                                       bool) const {
-    // We don't do anything with 'Activities' yet.
-    return nullptr;
+  virtual std::unique_ptr<Handle> CreateActivityHandle(
+      StringPiece name_part1, StringPiece name_part2, bool is_expensive) const {
+    if (!IsEnabledForActivities(is_expensive)) {
+      return nullptr;
+    }
+    return absl::make_unique<ActivityHandle>(
+        ConcatenateNames(name_part1, name_part2), GetLevel(is_expensive));
   }
 
   bool IsEnabledForAnnotations() const override {
@@ -333,8 +345,7 @@ class TraceCollectorImpl : public tracing::TraceCollector {
   }
 
   bool IsEnabledForActivities(bool is_expensive) const override {
-    // We don't do anything with 'Activities' so we are never 'enabled'.
-    return false;
+    return profiler::TraceMeRecorder::Active(GetLevel(is_expensive));
   }
 
   void Start() {
@@ -349,6 +360,10 @@ class TraceCollectorImpl : public tracing::TraceCollector {
   }
 
  private:
+  static int GetLevel(bool is_expensive) {
+    return profiler::GetTFTraceMeLevel(is_expensive);
+  }
+
   std::atomic<bool> active_trace_session_;
 };
 
@@ -421,6 +436,7 @@ class DeviceTracerImpl : public DeviceTracer, public CUPTIClient {
   int64 end_walltime_us_ GUARDED_BY(mu_);
   uint64_t start_timestamp_ GUARDED_BY(mu_);
   uint64_t end_timestamp_ GUARDED_BY(mu_);
+  std::unique_ptr<profiler::cpu::HostTracer> host_tracer_ GUARDED_BY(mu_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(DeviceTracerImpl);
 };
@@ -429,6 +445,7 @@ DeviceTracerImpl::DeviceTracerImpl(CUPTIManager *cupti_manager)
     : cupti_manager_(cupti_manager) {
   VLOG(1) << "DeviceTracer created.";
   cupti_wrapper_.reset(new perftools::gputools::profiler::CuptiWrapper());
+  host_tracer_ = profiler::cpu::HostTracer::Create(2);
   enabled_ = false;
 }
 
@@ -493,6 +510,7 @@ Status DeviceTracerImpl::Start() {
 
   CUPTI_CALL(GetTimestamp(&start_timestamp_));
   start_walltime_us_ = NowInUsec();
+  host_tracer_->Start().IgnoreError();
   enabled_ = true;
   return Status::OK();
 }
@@ -510,6 +528,7 @@ Status DeviceTracerImpl::Stop() {
   end_walltime_us_ = NowInUsec();
   CUPTI_CALL(GetTimestamp(&end_timestamp_));
   enabled_ = false;
+  host_tracer_->Stop().IgnoreError();
   return Status::OK();
 }
 
@@ -676,6 +695,8 @@ Status DeviceTracerImpl::Collect(StepStatsCollector *collector) {
     collector->Save(memcpy_device, ns);
     collector->Save(strings::StrCat(stream_device, rec.stream_id), nscopy);
   }
+
+  host_tracer_->CollectDataToCollector(collector).IgnoreError();
   return Status::OK();
 }
 
diff --git a/tensorflow/core/platform/default/gpu/cupti_wrapper.cc b/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
index 7ac5e5c4450708a486be956a5806e31b8dd36fa3..481bbf9bae1b1821571b25f15a6e34d5c6914619 100644
--- a/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
+++ b/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
@@ -28,27 +28,26 @@ namespace profiler {
 
 namespace dynload {
 
-#define LIBCUPTI_WRAP(__name)                                                 \
-  struct DynLoadShim__##__name {                                              \
-    static const char* kName;                                                 \
-    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;          \
-    static void* GetDsoHandle() {                                             \
-      static auto status =                                                    \
-          stream_executor::internal::CachedDsoLoader::GetLibcuptiDsoHandle(); \
-      return status.ValueOrDie();                                             \
-    }                                                                         \
-    static FuncPointerT DynLoad() {                                           \
-      static void* f;                                                         \
-      TF_CHECK_OK(::tensorflow::Env::Default()->GetSymbolFromLibrary(         \
-          GetDsoHandle(), kName, &f))                                         \
-          << "could not find " << kName << "in libcupti DSO";                 \
-      return reinterpret_cast<FuncPointerT>(f);                               \
-    }                                                                         \
-    template <typename... Args>                                               \
-    CUptiResult operator()(Args... args) {                                    \
-      return DynLoad()(args...);                                              \
-    }                                                                         \
-  } __name;                                                                   \
+#define LIBCUPTI_WRAP(__name)                                                \
+  struct DynLoadShim__##__name {                                             \
+    static const char* kName;                                                \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;         \
+    template <typename... Args>                                              \
+    CUptiResult operator()(Args... args) {                                   \
+      static auto fn = []() -> FuncPointerT {                                \
+        auto handle_or =                                                     \
+            stream_executor::internal::CachedDsoLoader::GetCuptiDsoHandle(); \
+        if (!handle_or.ok()) return nullptr;                                 \
+        void* symbol;                                                        \
+        stream_executor::port::Env::Default()                                \
+            ->GetSymbolFromLibrary(handle_or.ValueOrDie(), kName, &symbol)   \
+            .IgnoreError();                                                  \
+        return reinterpret_cast<FuncPointerT>(symbol);                       \
+      }();                                                                   \
+      if (fn == nullptr) return CUPTI_ERROR_UNKNOWN;                         \
+      return fn(args...);                                                    \
+    }                                                                        \
+  } __name;                                                                  \
   const char* DynLoadShim__##__name::kName = #__name;
 
 LIBCUPTI_WRAP(cuptiActivityDisable);
diff --git a/tensorflow/core/platform/default/human_readable_json.cc b/tensorflow/core/platform/default/human_readable_json.cc
index bf9c7b76206b79ad43969a1e3e2de6e6cbdacc46..977ff1272ea2a97e0b52b785b24560e02eb44207 100644
--- a/tensorflow/core/platform/default/human_readable_json.cc
+++ b/tensorflow/core/platform/default/human_readable_json.cc
@@ -46,7 +46,7 @@ Status HumanReadableJsonToProto(const string& str, protobuf::Message* proto) {
   return errors::Internal("Cannot parse JSON protos on Android");
 #else
   proto->Clear();
-  auto status = google::protobuf::util::JsonStringToMessage(str, proto);
+  auto status = protobuf::util::JsonStringToMessage(str, proto);
   if (!status.ok()) {
     // Convert error_msg google::protobuf::StringPiece to
     // tensorflow::StringPiece.
diff --git a/tensorflow/core/platform/default/platform.bzl b/tensorflow/core/platform/default/platform.bzl
index 20ab441bf43e19277c697f98f289ba80d755af48..76bfaa896efa2f8d8f06814d6f69f7bf0b66ed33 100644
--- a/tensorflow/core/platform/default/platform.bzl
+++ b/tensorflow/core/platform/default/platform.bzl
@@ -5,55 +5,52 @@ CUDNN_VERSION = ""
 PLATFORM = ""
 
 def cuda_sdk_version():
-  return CUDA_VERSION
+    return CUDA_VERSION
 
 def cudnn_sdk_version():
-  return CUDNN_VERSION
+    return CUDNN_VERSION
 
 def cuda_library_path(name, version = cuda_sdk_version()):
-  if PLATFORM == "Darwin":
-    if not version:
-      return "lib/lib{}.dylib".format(name)
+    if PLATFORM == "Darwin":
+        if not version:
+            return "lib/lib{}.dylib".format(name)
+        else:
+            return "lib/lib{}.{}.dylib".format(name, version)
+    elif not version:
+        return "lib64/lib{}.so".format(name)
     else:
-      return "lib/lib{}.{}.dylib".format(name, version)
-  else:
-    if not version:
-      return "lib64/lib{}.so".format(name)
-    else:
-      return "lib64/lib{}.so.{}".format(name, version)
+        return "lib64/lib{}.so.{}".format(name, version)
 
 def cuda_static_library_path(name):
-  if PLATFORM == "Darwin":
-    return "lib/lib{}_static.a".format(name)
-  else:
-    return "lib64/lib{}_static.a".format(name)
+    if PLATFORM == "Darwin":
+        return "lib/lib{}_static.a".format(name)
+    else:
+        return "lib64/lib{}_static.a".format(name)
 
 def cudnn_library_path(version = cudnn_sdk_version()):
-  if PLATFORM == "Darwin":
-    if not version:
-      return "lib/libcudnn.dylib"
-    else:
-      return "lib/libcudnn.{}.dylib".format(version)
-  else:
-    if not version:
-      return "lib64/libcudnn.so"
+    if PLATFORM == "Darwin":
+        if not version:
+            return "lib/libcudnn.dylib"
+        else:
+            return "lib/libcudnn.{}.dylib".format(version)
+    elif not version:
+        return "lib64/libcudnn.so"
     else:
-      return "lib64/libcudnn.so.{}".format(version)
+        return "lib64/libcudnn.so.{}".format(version)
 
 def cupti_library_path(version = cuda_sdk_version()):
-  if PLATFORM == "Darwin":
-    if not version:
-      return "extras/CUPTI/lib/libcupti.dylib"
+    if PLATFORM == "Darwin":
+        if not version:
+            return "extras/CUPTI/lib/libcupti.dylib"
+        else:
+            return "extras/CUPTI/lib/libcupti.{}.dylib".format(version)
+    elif not version:
+        return "extras/CUPTI/lib64/libcupti.so"
     else:
-      return "extras/CUPTI/lib/libcupti.{}.dylib".format(version)
-  else:
-    if not version:
-      return "extras/CUPTI/lib64/libcupti.so"
-    else:
-      return "extras/CUPTI/lib64/libcupti.so.{}".format(version)
+        return "extras/CUPTI/lib64/libcupti.so.{}".format(version)
 
 def readlink_command():
-  if PLATFORM == "Darwin":
-    return "greadlink"
-  else:
-    return "readlink"
+    if PLATFORM == "Darwin":
+        return "greadlink"
+    else:
+        return "readlink"
diff --git a/tensorflow/core/platform/default/protobuf.h b/tensorflow/core/platform/default/protobuf.h
deleted file mode 100644
index aeef2d9b882c0a3e2624db2dd194345a373bfe0c..0000000000000000000000000000000000000000
--- a/tensorflow/core/platform/default/protobuf.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_H_
-#define TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_H_
-
-// IWYU pragma: private, include "third_party/tensorflow/core/platform/protobuf.h"
-// IWYU pragma: friend third_party/tensorflow/core/platform/protobuf.h
-
-#ifndef TENSORFLOW_LITE_PROTOS
-#include "google/protobuf/descriptor.h"
-#include "google/protobuf/descriptor.pb.h"
-#include "google/protobuf/dynamic_message.h"
-#include "google/protobuf/io/tokenizer.h"
-#include "google/protobuf/text_format.h"
-#include "google/protobuf/util/json_util.h"
-#include "google/protobuf/util/type_resolver_util.h"
-#endif
-
-#include "google/protobuf/arena.h"
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
-#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
-#include "google/protobuf/map.h"
-#include "google/protobuf/repeated_field.h"
-
-namespace tensorflow {
-namespace protobuf = ::google::protobuf;
-using protobuf_int64 = ::google::protobuf::int64;
-using protobuf_uint64 = ::google::protobuf::uint64;
-extern const char* kProtobufInt64Typename;
-extern const char* kProtobufUint64Typename;
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_H_
diff --git a/tensorflow/core/platform/default/string_coding.h b/tensorflow/core/platform/default/string_coding.h
deleted file mode 100644
index 70b8ab01444a6175f9c037e35fadc8196c781b19..0000000000000000000000000000000000000000
--- a/tensorflow/core/platform/default/string_coding.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_STRING_CODING_H_
-#define TENSORFLOW_CORE_PLATFORM_DEFAULT_STRING_CODING_H_
-
-// IWYU pragma: private, include "third_party/tensorflow/core/platform/tensor_coding.h"
-// IWYU pragma: friend third_party/tensorflow/core/platform/tensor_coding.h
-
-#include "tensorflow/core/lib/core/coding.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/protobuf.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-namespace port {
-
-// Encodes sequences of strings and serialized protocol buffers into a string.
-// Normal usage consists of zero or more calls to Append() and a single call to
-// Finalize().
-class StringListEncoder {
- public:
-  explicit StringListEncoder(string* out) : out_(out) {}
-
-  // Encodes the given protocol buffer. This may not be called after Finalize().
-  void Append(const protobuf::MessageLite& m) {
-    core::PutVarint32(out_, m.ByteSize());
-    m.AppendToString(&rest_);
-  }
-
-  // Encodes the given string. This may not be called after Finalize().
-  void Append(const string& s) {
-    core::PutVarint32(out_, s.length());
-    strings::StrAppend(&rest_, s);
-  }
-
-  // Signals end of the encoding process. No other calls are allowed after this.
-  void Finalize() { strings::StrAppend(out_, rest_); }
-
- private:
-  string* out_;
-  string rest_;
-};
-
-// Decodes a string into sequences of strings (which may represent serialized
-// protocol buffers). Normal usage involves a single call to ReadSizes() in
-// order to retrieve the length of all the strings in the sequence. For each
-// size returned a call to Data() is expected and will return the actual
-// string.
-class StringListDecoder {
- public:
-  explicit StringListDecoder(const string& in) : reader_(in) {}
-
-  // Populates the given vector with the lengths of each string in the sequence
-  // being decoded. Upon returning the vector is guaranteed to contain as many
-  // elements as there are strings in the sequence.
-  bool ReadSizes(std::vector<uint32>* sizes) {
-    int64 total = 0;
-    for (auto& size : *sizes) {
-      if (!core::GetVarint32(&reader_, &size)) return false;
-      total += size;
-    }
-    if (total != static_cast<int64>(reader_.size())) {
-      return false;
-    }
-    return true;
-  }
-
-  // Returns a pointer to the next string in the sequence, then prepares for the
-  // next call by advancing 'size' characters in the sequence.
-  const char* Data(uint32 size) {
-    const char* data = reader_.data();
-    reader_.remove_prefix(size);
-    return data;
-  }
-
- private:
-  StringPiece reader_;
-};
-
-std::unique_ptr<StringListEncoder> NewStringListEncoder(string* out);
-std::unique_ptr<StringListDecoder> NewStringListDecoder(const string& in);
-
-}  // namespace port
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_STRING_CODING_H_
diff --git a/tensorflow/core/platform/grpc_services.h b/tensorflow/core/platform/grpc_services.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd918193dc52881ea396142a7b0a8e3708cb427c
--- /dev/null
+++ b/tensorflow/core/platform/grpc_services.h
@@ -0,0 +1,33 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PLATFORM_GRPC_SERVICES_H_
+#define TENSORFLOW_CORE_PLATFORM_GRPC_SERVICES_H_
+
+#include "tensorflow/core/profiler/profiler_analysis.grpc.pb.h"
+#include "tensorflow/core/profiler/profiler_service.grpc.pb.h"
+
+#if !defined(PLATFORM_GOOGLE)
+
+namespace tensorflow {
+namespace grpc {
+
+// Google internal GRPC generates services under namespace "tensorflow::grpc".
+// Creating aliases here to make sure we can access services under namespace
+// "tensorflow::grpc" both in google internal and open-source.
+using ::tensorflow::ProfileAnalysis;
+using ::tensorflow::ProfilerService;
+
+}  // namespace grpc
+}  // namespace tensorflow
+#endif
+
+#endif  // TENSORFLOW_CORE_PLATFORM_GRPC_SERVICES_H_
diff --git a/tensorflow/core/platform/logging.h b/tensorflow/core/platform/logging.h
index 17a5d5fb5b7099ad01c68d64f5528fa07cc2fa6f..7417ec8aff66de1f393d9c381bbf2b657a85307d 100644
--- a/tensorflow/core/platform/logging.h
+++ b/tensorflow/core/platform/logging.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/core/platform/platform.h"  // To pick up PLATFORM_define
 
 #if defined(PLATFORM_GOOGLE) || defined(PLATFORM_GOOGLE_ANDROID) || \
-    defined(GOOGLE_LOGGING)
+    defined(GOOGLE_LOGGING) || defined(__EMSCRIPTEN__)
 #include "tensorflow/core/platform/google/build_config/logging.h"
 #else
 #include "tensorflow/core/platform/default/logging.h"
diff --git a/tensorflow/core/platform/platform.h b/tensorflow/core/platform/platform.h
index 0481b3687137c8b00fa84d33eb317a1a4f5be9df..671e5dd3c862febe5ef4be912525c7f2043857ed 100644
--- a/tensorflow/core/platform/platform.h
+++ b/tensorflow/core/platform/platform.h
@@ -40,7 +40,7 @@ limitations under the License.
 #elif defined(_WIN32)
 #define PLATFORM_WINDOWS
 
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__EMSCRIPTEN__)
 #define PLATFORM_POSIX
 
 // Require an outside macro to tell us if we're building for Raspberry Pi or
diff --git a/tensorflow/core/platform/posix/env.cc b/tensorflow/core/platform/posix/env.cc
index b02b1f3f480c52ce2a43dd376158fc906c424b33..f2dff5a9b6441c5c39f6251c3b8c46dcd8639c74 100644
--- a/tensorflow/core/platform/posix/env.cc
+++ b/tensorflow/core/platform/posix/env.cc
@@ -102,7 +102,7 @@ class PosixEnv : public Env {
   }
 
   bool GetCurrentThreadName(string* name) override {
-#ifdef __ANDROID__
+#if defined(__ANDROID__) || defined(__EMSCRIPTEN__)
     return false;
 #else
     char buf[100];
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index ea6066ac7bd6f89a6e07cb01b82c09f108f095ab..807e0083229983722182fec8ee2a14e70cf233fb 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -82,7 +82,9 @@ int NumTotalCPUs() {
 }
 
 int GetCurrentCPU() {
-#if defined(__linux__) && !defined(__ANDROID__)
+#if defined(__EMSCRIPTEN__)
+  return sched_getcpu();
+#elif defined(__linux__) && !defined(__ANDROID__)
   return sched_getcpu();
   // Attempt to use cpuid on all other platforms.  If that fails, perform a
   // syscall.
diff --git a/tensorflow/core/platform/default/protobuf.cc b/tensorflow/core/platform/protobuf.cc
similarity index 72%
rename from tensorflow/core/platform/default/protobuf.cc
rename to tensorflow/core/platform/protobuf.cc
index 548d5834e6f74b14a3ad16c00f5d3015f337f90a..c9e6f3bf5c6b498818001c9d6644d52af8b7f5d2 100644
--- a/tensorflow/core/platform/default/protobuf.cc
+++ b/tensorflow/core/platform/protobuf.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/default/protobuf.h"
+#include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 
-const char* kProtobufInt64Typename = "::google::protobuf::int64";
-const char* kProtobufUint64Typename = "::google::protobuf::uint64";
+const char* kProtobufInt64Typename = "::tensorflow::protobuf_int64";
+const char* kProtobufUint64Typename = "::tensorflow::protobuf_uint64";
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/protobuf.h b/tensorflow/core/platform/protobuf.h
index fcbf1fc8c5054e110b9a0fe0217b97cecdd27088..59f4129adf40a5b6892a82ac705f73cdcf02886a 100644
--- a/tensorflow/core/platform/protobuf.h
+++ b/tensorflow/core/platform/protobuf.h
@@ -25,13 +25,31 @@ limitations under the License.
 // TensorFlow code should use the ::tensorflow::protobuf namespace to
 // refer to all protobuf APIs.
 
-#if defined(PLATFORM_GOOGLE) && !defined(USE_DEFAULT_PROTOBUF)
-#include "tensorflow/core/platform/google/protobuf.h"
-#else
-#include "tensorflow/core/platform/default/protobuf.h"
+#ifndef TENSORFLOW_LITE_PROTOS
+#include "google/protobuf/io/tokenizer.h"
+#include "google/protobuf/descriptor.pb.h"
+#include "google/protobuf/descriptor.h"
+#include "google/protobuf/dynamic_message.h"
+#include "google/protobuf/text_format.h"
+#include "google/protobuf/util/json_util.h"
+#include "google/protobuf/util/type_resolver_util.h"
 #endif
 
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
+#include "google/protobuf/arena.h"
+#include "google/protobuf/map.h"
+#include "google/protobuf/repeated_field.h"
+
 namespace tensorflow {
+
+namespace protobuf = ::google::protobuf;
+using protobuf_int64 = ::google::protobuf::int64;
+using protobuf_uint64 = ::google::protobuf::uint64;
+extern const char* kProtobufInt64Typename;
+extern const char* kProtobufUint64Typename;
+
 // Parses a protocol buffer contained in a string in the binary wire format.
 // Returns true on success. Note: Unlike protobuf's builtin ParseFromString,
 // this function has no size restrictions on the total size of the encoded
@@ -47,8 +65,19 @@ inline const string& ProtobufStringToString(const string& s) { return s; }
 // Set <dest> to <src>. Swapping is allowed, as <src> does not need to be
 // preserved.
 inline void SetProtobufStringSwapAllowed(string* src, string* dest) {
-  dest->swap(*src);
+  *dest = std::move(*src);
+}
+
+#if defined(TENSORFLOW_PROTOBUF_USES_CORD)
+// These versions of ProtobufStringToString and SetProtobufString get used by
+// tools/proto_text's generated code.  They have the same name as the versions
+// in core/platform/protobuf.h, so the generation code doesn't need to determine
+// if the type is Cord or string at generation time.
+inline string ProtobufStringToString(const Cord& s) { return s.ToString(); }
+inline void SetProtobufStringSwapAllowed(string* src, Cord* dest) {
+  dest->CopyFrom(*src);
 }
+#endif  // defined(TENSORFLOW_PROTOBUF_USES_CORD)
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/protobuf_compiler.h b/tensorflow/core/platform/protobuf_compiler.h
index 29679e00892fbd11d1e5242f62650f42ecef5577..916637d13a55044873b5309c1ea0acc9ac4eef47 100644
--- a/tensorflow/core/platform/protobuf_compiler.h
+++ b/tensorflow/core/platform/protobuf_compiler.h
@@ -16,10 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
 #define TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
 
-#if defined(PLATFORM_GOOGLE) && !defined(USE_DEFAULT_PROTOBUF)
-#include "tensorflow/core/platform/google/protobuf_compiler.h"
-#else
-#include "tensorflow/core/platform/default/protobuf_compiler.h"
-#endif
+#include "google/protobuf/compiler/importer.h"
 
 #endif  // TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
diff --git a/tensorflow/core/platform/s3/aws_logging.cc b/tensorflow/core/platform/s3/aws_logging.cc
index 44317f1a3e41831b903bd0044d53d1eba80168df..dac569088936b984f2c3167962ff4872e03decc3 100644
--- a/tensorflow/core/platform/s3/aws_logging.cc
+++ b/tensorflow/core/platform/s3/aws_logging.cc
@@ -69,12 +69,32 @@ void AWSLogSystem::LogMessage(Aws::Utils::Logging::LogLevel log_level,
 }
 
 namespace {
+
+// Taken from tensorflow/core/platform/default/logging.cc
+int ParseInteger(const char* str, size_t size) {
+  string integer_str(str, size);
+  std::istringstream ss(integer_str);
+  int level = 0;
+  ss >> level;
+  return level;
+}
+
+// Taken from tensorflow/core/platform/default/logging.cc
+int64 LogLevelStrToInt(const char* tf_env_var_val) {
+  if (tf_env_var_val == nullptr) {
+    return 0;
+  }
+  return ParseInteger(tf_env_var_val, strlen(tf_env_var_val));
+}
+
 static const char* kAWSLoggingTag = "AWSLogging";
 
 Aws::Utils::Logging::LogLevel ParseLogLevelFromEnv() {
   Aws::Utils::Logging::LogLevel log_level = Aws::Utils::Logging::LogLevel::Info;
 
-  const int64_t level = tensorflow::internal::MinLogLevelFromEnv();
+  const int64_t level = getenv("AWS_LOG_LEVEL")
+                            ? LogLevelStrToInt(getenv("AWS_LOG_LEVEL"))
+                            : tensorflow::internal::MinLogLevelFromEnv();
 
   switch (level) {
     case INFO:
diff --git a/tensorflow/core/platform/tensor_coding.cc b/tensorflow/core/platform/tensor_coding.cc
index 84601de39a6547ee78d190764616058b4595dc33..3280802bac42725132ef9ad22cc0439d45fca5ac 100644
--- a/tensorflow/core/platform/tensor_coding.cc
+++ b/tensorflow/core/platform/tensor_coding.cc
@@ -19,6 +19,12 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+#if defined(TENSORFLOW_PROTOBUF_USES_CORD)
+#include "strings/cord_varint.h"
+#endif  // defined(TENSORFLOW_PROTOBUF_USES_CORD)
 
 namespace tensorflow {
 namespace port {
@@ -66,5 +72,174 @@ void CopyFromArray(string* s, const char* base, size_t bytes) {
   s->assign(base, bytes);
 }
 
+class StringListEncoderImpl : public StringListEncoder {
+ public:
+  explicit StringListEncoderImpl(string* out) : out_(out) {}
+  ~StringListEncoderImpl() override = default;
+
+  void Append(const protobuf::MessageLite& m) override {
+    core::PutVarint32(out_, m.ByteSizeLong());
+    tensorflow::string serialized_message;
+    m.AppendToString(&serialized_message);
+    strings::StrAppend(&rest_, serialized_message);
+  }
+
+  void Append(const string& s) override {
+    core::PutVarint32(out_, s.length());
+    strings::StrAppend(&rest_, s);
+  }
+
+  void Finalize() override { strings::StrAppend(out_, rest_); }
+
+ private:
+  string* out_;
+  string rest_;
+};
+
+class StringListDecoderImpl : public StringListDecoder {
+ public:
+  explicit StringListDecoderImpl(const string& in) : reader_(in) {}
+  ~StringListDecoderImpl() override = default;
+
+  bool ReadSizes(std::vector<uint32>* sizes) override {
+    int64 total = 0;
+    for (auto& size : *sizes) {
+      if (!core::GetVarint32(&reader_, &size)) return false;
+      total += size;
+    }
+    if (total != static_cast<int64>(reader_.size())) {
+      return false;
+    }
+    return true;
+  }
+
+  const char* Data(uint32 size) override {
+    const char* data = reader_.data();
+    reader_.remove_prefix(size);
+    return data;
+  }
+
+ private:
+  StringPiece reader_;
+};
+
+std::unique_ptr<StringListEncoder> NewStringListEncoder(string* out) {
+  return std::unique_ptr<StringListEncoder>(new StringListEncoderImpl(out));
+}
+
+std::unique_ptr<StringListDecoder> NewStringListDecoder(const string& in) {
+  return std::unique_ptr<StringListDecoder>(new StringListDecoderImpl(in));
+}
+
+#if defined(TENSORFLOW_PROTOBUF_USES_CORD)
+void AssignRefCounted(StringPiece src, core::RefCounted* obj, Cord* out) {
+  obj->Ref();
+  out->Clear();
+  // Defines a lambda to unref "obj" when Cord deletes this piece of
+  // memory. +[] converts the lambda to a C style function pointer.
+  auto cleanup = +[](absl::string_view donotcare, void* obj) {
+    reinterpret_cast<core::RefCounted*>(obj)->Unref();
+  };
+  out->AppendExternalMemory(absl::string_view(src.data(), src.size()), obj,
+                            cleanup);
+}
+
+void EncodeStringList(const string* strings, int64 n, Cord* out) {
+  out->Clear();
+  for (int i = 0; i < n; ++i) {
+    ::strings::CordAppendVarint(strings[i].size(), out);
+  }
+  for (int i = 0; i < n; ++i) {
+    out->Append(strings[i]);
+  }
+}
+
+bool DecodeStringList(const Cord& src, string* strings, int64 n) {
+  std::vector<uint32> sizes(n);
+  CordReader reader(src);
+  int64 tot = 0;
+  for (auto& v : sizes) {
+    if (!::strings::CordReaderReadVarint(&reader, &v)) return false;
+    tot += v;
+  }
+  if (tot != reader.Available()) {
+    return false;
+  }
+  string* data = strings;
+  for (int i = 0; i < n; ++i, ++data) {
+    auto size = sizes[i];
+    if (size > reader.Available()) {
+      return false;
+    }
+    gtl::STLStringResizeUninitialized(data, size);
+    reader.ReadN(size, gtl::string_as_array(data));
+  }
+  return true;
+}
+
+void CopyFromArray(Cord* c, const char* base, size_t bytes) {
+  c->CopyFrom(base, bytes);
+}
+
+class CordStringListEncoderImpl : public StringListEncoder {
+ public:
+  explicit CordStringListEncoderImpl(Cord* out) : out_(out) {}
+  ~CordStringListEncoderImpl() override = default;
+
+  void Append(const protobuf::MessageLite& m) override {
+    ::strings::CordAppendVarint(m.ByteSizeLong(), out_);
+    m.AppendToString(&rest_);
+  }
+
+  void Append(const string& s) override {
+    ::strings::CordAppendVarint(s.length(), out_);
+    rest_.append(s.data(), s.size());
+  }
+
+  void Finalize() override { out_->Append(rest_); }
+
+ private:
+  Cord* out_;
+  string rest_;
+};
+
+class CordStringListDecoderImpl : public StringListDecoder {
+ public:
+  explicit CordStringListDecoderImpl(const Cord& in) : reader_(in) {}
+  ~CordStringListDecoderImpl() override = default;
+
+  bool ReadSizes(std::vector<uint32>* sizes) override {
+    int64 total = 0;
+    for (auto& size : *sizes) {
+      if (!::strings::CordReaderReadVarint(&reader_, &size)) return false;
+      total += size;
+    }
+    if (total != static_cast<int64>(reader_.Available())) {
+      return false;
+    }
+    return true;
+  }
+
+  const char* Data(uint32 size) override {
+    tmp_.resize(size);
+    reader_.ReadN(size, tmp_.data());
+    return tmp_.data();
+  }
+
+ private:
+  CordReader reader_;
+  std::vector<char> tmp_;
+};
+
+std::unique_ptr<StringListEncoder> NewStringListEncoder(Cord* out) {
+  return std::unique_ptr<StringListEncoder>(new CordStringListEncoderImpl(out));
+}
+
+std::unique_ptr<StringListDecoder> NewStringListDecoder(const Cord& in) {
+  return std::unique_ptr<StringListDecoder>(new CordStringListDecoderImpl(in));
+}
+
+#endif  // defined(TENSORFLOW_PROTOBUF_USES_CORD)
+
 }  // namespace port
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/tensor_coding.h b/tensorflow/core/platform/tensor_coding.h
index 6c6d75830de743b3e24676c1f57b6988aad11a0f..993ce537ffcd3884cfbb32d1edbdbfbe89f72658 100644
--- a/tensorflow/core/platform/tensor_coding.h
+++ b/tensorflow/core/platform/tensor_coding.h
@@ -21,14 +21,9 @@ limitations under the License.
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/platform.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
-#ifdef PLATFORM_GOOGLE
-#include "tensorflow/core/platform/google/cord_coding.h"
-#else
-#include "tensorflow/core/platform/default/string_coding.h"
-#endif
-
 namespace tensorflow {
 namespace port {
 
@@ -42,6 +37,15 @@ inline void CopyToArray(const string& src, char* dst) {
   memcpy(dst, src.data(), src.size());
 }
 
+// Copy subrange [pos:(pos + n)) from src to dst. If pos >= src.size() the
+// result is empty. If pos + n > src.size() the subrange [pos, size()) is
+// copied.
+inline void CopySubrangeToArray(const string& src, size_t pos, size_t n,
+                                char* dst) {
+  if (pos >= src.size()) return;
+  memcpy(dst, src.data() + pos, std::min(n, src.size() - pos));
+}
+
 // Store encoding of strings[0..n-1] in *out.
 void EncodeStringList(const string* strings, int64 n, string* out);
 
@@ -52,6 +56,75 @@ bool DecodeStringList(const string& src, string* strings, int64 n);
 // Assigns base[0..bytes-1] to *s
 void CopyFromArray(string* s, const char* base, size_t bytes);
 
+// Encodes sequences of strings and serialized protocol buffers into a string.
+// Normal usage consists of zero or more calls to Append() and a single call to
+// Finalize().
+class StringListEncoder {
+ public:
+  virtual ~StringListEncoder() = default;
+
+  // Encodes the given protocol buffer. This may not be called after Finalize().
+  virtual void Append(const protobuf::MessageLite& m) = 0;
+
+  // Encodes the given string. This may not be called after Finalize().
+  virtual void Append(const string& s) = 0;
+
+  // Signals end of the encoding process. No other calls are allowed after this.
+  virtual void Finalize() = 0;
+};
+
+// Decodes a string into sequences of strings (which may represent serialized
+// protocol buffers). Normal usage involves a single call to ReadSizes() in
+// order to retrieve the length of all the strings in the sequence. For each
+// size returned a call to Data() is expected and will return the actual
+// string.
+class StringListDecoder {
+ public:
+  virtual ~StringListDecoder() = default;
+
+  // Populates the given vector with the lengths of each string in the sequence
+  // being decoded. Upon returning the vector is guaranteed to contain as many
+  // elements as there are strings in the sequence.
+  virtual bool ReadSizes(std::vector<uint32>* sizes) = 0;
+
+  // Returns a pointer to the next string in the sequence, then prepares for the
+  // next call by advancing 'size' characters in the sequence.
+  virtual const char* Data(uint32 size) = 0;
+};
+
+std::unique_ptr<StringListEncoder> NewStringListEncoder(string* out);
+std::unique_ptr<StringListDecoder> NewStringListDecoder(const string& in);
+
+#if defined(TENSORFLOW_PROTOBUF_USES_CORD)
+// Store src contents in *out.  If backing memory for src is shared with *out,
+// will ref obj during the call and will arrange to unref obj when no
+// longer needed.
+void AssignRefCounted(StringPiece src, core::RefCounted* obj, Cord* out);
+
+// TODO(kmensah): Macro guard this with a check for Cord support.
+inline void CopyToArray(const Cord& src, char* dst) { src.CopyToArray(dst); }
+
+// Copy n bytes of src to dst. If pos >= src.size() the result is empty.
+// If pos + n > src.size() the subrange [pos, size()) is copied.
+inline void CopySubrangeToArray(const Cord& src, int64 pos, int64 n,
+                                char* dst) {
+  src.Subcord(pos, n).CopyToArray(dst);
+}
+
+// Store encoding of strings[0..n-1] in *out.
+void EncodeStringList(const string* strings, int64 n, Cord* out);
+
+// Decode n strings from src and store in strings[0..n-1].
+// Returns true if successful, false on parse error.
+bool DecodeStringList(const Cord& src, string* strings, int64 n);
+
+// Assigns base[0..bytes-1] to *c
+void CopyFromArray(Cord* c, const char* base, size_t bytes);
+
+std::unique_ptr<StringListEncoder> NewStringListEncoder(Cord* out);
+std::unique_ptr<StringListDecoder> NewStringListDecoder(const Cord& in);
+#endif  // defined(TENSORFLOW_PROTOBUF_USES_CORD)
+
 }  // namespace port
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/profiler/BUILD b/tensorflow/core/profiler/BUILD
index 2bf371276ef6013ac9f8e3c44623f9c7720cffb3..4efc15b7e5ff65085137d348e57f7311dd01db14 100644
--- a/tensorflow/core/profiler/BUILD
+++ b/tensorflow/core/profiler/BUILD
@@ -5,6 +5,7 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_profiler_all_protos")
 
 tf_cc_binary(
     name = "profiler",
@@ -36,9 +37,35 @@ cc_library(
     ],
 )
 
+tf_proto_library(
+    name = "profiler_service_proto",
+    srcs = ["profiler_service.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    cc_grpc_version = 1,
+    protodeps = tf_profiler_all_protos() + tf_additional_all_protos(),
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library(
+    name = "profiler_analysis_proto",
+    srcs = ["profiler_analysis.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    cc_grpc_version = 1,
+    protodeps = [":profiler_service_proto"] + tf_additional_all_protos(),
+    visibility = ["//visibility:public"],
+)
+
 tf_proto_library(
     name = "protos_all",
-    srcs = glob(["**/*.proto"]),
+    srcs = glob(
+        ["**/*.proto"],
+        exclude = [
+            "profiler_service.proto",
+            "profiler_analysis.proto",
+        ],
+    ),
     cc_api_version = 2,
     protodeps = tf_additional_all_protos(),
     visibility = ["//visibility:public"],
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index 57d76eb4cb9382790c80a0d55ee94b64e7b9dcdc..341421738e618e7406de05a126a49f4e1e336b93 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -35,10 +35,10 @@ bazel-bin/tensorflow/core/profiler/profiler \
     --profile_path=/tmp/train_dir/profile_xx
 tfprof> op -select micros,bytes,occurrence -order_by micros
 
-# To be open sourced...
-bazel-bin/tensorflow/python/profiler/profiler_ui \
-    --profile_path=/tmp/profiles/profile_1
+# Profiler ui available at: https://github.com/tensorflow/profiler-ui
+python ui.py --profile_context_path=/tmp/train_dir/profile_xx
 ```
+
 ![ProfilerUI](g3doc/profiler_ui.jpg)
 
 ```python
diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD
index bae2148bb09de7feb1b84a96c9b1ed4a28200135..da3039ae3ceba103882d1315c6293af5560e1862 100644
--- a/tensorflow/core/profiler/internal/BUILD
+++ b/tensorflow/core/profiler/internal/BUILD
@@ -375,6 +375,7 @@ tf_cuda_library(
     visibility = [
         "//learning/brain/runtime:__pkg__",  # xprof_bridge
         "//perftools/accelerators/xprof/xprofilez:__pkg__",  # alias xprof::TraceMeRecorder
+        "//tensorflow/core/profiler/internal/cpu:__pkg__",  # host_tracer
         "//tensorflow/core/profiler/lib:__pkg__",  # traceme
     ],
     deps = [
@@ -392,7 +393,6 @@ tf_cuda_cc_test(
         ":traceme_recorder",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest_main",
     ],
 )
diff --git a/tensorflow/core/profiler/internal/cpu/BUILD b/tensorflow/core/profiler/internal/cpu/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b94453c0a4be5e3c886277356b23ef0c5df5b1c9
--- /dev/null
+++ b/tensorflow/core/profiler/internal/cpu/BUILD
@@ -0,0 +1,44 @@
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cuda_library",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+
+tf_cuda_library(
+    name = "host_tracer",
+    srcs = [
+        "host_tracer.cc",
+    ],
+    hdrs = [
+        "host_tracer.h",
+    ],
+    deps = [
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/profiler/internal:profiler_interface",
+        "//tensorflow/core/profiler/internal:traceme_recorder",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "host_tracer_test",
+    srcs = ["host_tracer_test.cc"],
+    deps = [
+        ":host_tracer",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core/profiler/lib:traceme",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.cc b/tensorflow/core/profiler/internal/cpu/host_tracer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3fb296646883cf2215d8df8240219ddce04fb7d0
--- /dev/null
+++ b/tensorflow/core/profiler/internal/cpu/host_tracer.cc
@@ -0,0 +1,120 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/internal/cpu/host_tracer.h"
+
+#include <utility>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/str_split.h"
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
+#include "tensorflow/core/platform/env_time.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace cpu {
+
+/* static */ std::unique_ptr<HostTracer> HostTracer::Create(
+    int host_trace_level) {
+  return absl::WrapUnique(new HostTracer(host_trace_level));
+}
+HostTracer::HostTracer(int host_trace_level)
+    : host_trace_level_(host_trace_level) {}
+
+HostTracer::~HostTracer() { Stop().IgnoreError(); }
+
+Status HostTracer::Start() {
+  if (recording_) {
+    return Status(error::INTERNAL, "TraceMeRecorder already started");
+  }
+  recording_ = TraceMeRecorder::Start(host_trace_level_);
+  if (!recording_) {
+    return Status(error::INTERNAL, "Failed to start TraceMeRecorder");
+  }
+  return Status::OK();
+}
+
+Status HostTracer::Stop() {
+  if (!recording_) {
+    return Status(error::INTERNAL, "TraceMeRecorder not started");
+  }
+  events_ = TraceMeRecorder::Stop();
+  recording_ = false;
+  return Status::OK();
+}
+
+constexpr char kUserMetadataMarker = '#';
+
+Status HostTracer::CollectData(RunMetadata* run_metadata) {
+  auto step_stats_collector =
+      absl::make_unique<StepStatsCollector>(run_metadata->mutable_step_stats());
+  return CollectDataToCollector(step_stats_collector.get());
+}
+
+Status HostTracer::CollectDataToCollector(
+    StepStatsCollector* step_stats_collector) {
+  if (events_.empty() && recording_) {
+    events_ = TraceMeRecorder::Collect();
+  }
+  // Pair up start and end events, and add complete events to trace_entries.
+  absl::flat_hash_map<uint64, uint64> end_times;
+  for (const auto& thread : events_) {
+    for (const auto& event : thread.events) {
+      if (event.end_time && !event.start_time) {
+        end_times.emplace(event.activity_id, event.end_time);
+      }
+    }
+  }
+
+  const string cpu_name = "/host:CPU";
+  for (auto& thread : events_) {
+    step_stats_collector->SaveThreadName(cpu_name, thread.thread.tid,
+                                         thread.thread.name);
+    for (auto& event : thread.events) {
+      if (!event.end_time) {
+        auto it = end_times.find(event.activity_id);
+        if (it != end_times.end()) event.end_time = it->second;
+      }
+      if (event.start_time && event.end_time) {
+        NodeExecStats* ns = new NodeExecStats;
+        if (event.name.back() != kUserMetadataMarker) {
+          ns->set_node_name(std::move(event.name));
+        } else {
+          // Expect the format will be "<name>#<metadata>#"
+          std::vector<absl::string_view> parts =
+              absl::StrSplit(event.name, kUserMetadataMarker);
+          if (parts.size() >= 2) {
+            ns->set_node_name(string(parts[0]));
+            ns->set_timeline_label(string(parts[1]));
+          } else {
+            ns->set_node_name(std::move(event.name));
+          }
+        }
+        ns->set_all_start_micros(event.start_time / EnvTime::kMicrosToNanos);
+        ns->set_all_end_rel_micros((event.end_time - event.start_time) /
+                                   EnvTime::kMicrosToNanos);
+        ns->set_thread_id(thread.thread.tid);
+        // TODO(fishx): Add thread name to RunMetadata
+        step_stats_collector->Save(cpu_name, ns);
+      }
+    }
+  }
+  events_.clear();
+  step_stats_collector->Finalize();
+  return Status::OK();
+}
+
+}  // namespace cpu
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.h b/tensorflow/core/profiler/internal/cpu/host_tracer.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6340c2eddc8ee66d4ffb2ad2829e15f34cc38ec
--- /dev/null
+++ b/tensorflow/core/profiler/internal/cpu/host_tracer.h
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_CPU_HOST_TRACER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_CPU_HOST_TRACER_H_
+
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/profiler/internal/profiler_interface.h"
+#include "tensorflow/core/profiler/internal/traceme_recorder.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace cpu {
+
+// Controls TraceMeRecorder and converts TraceMeRecorder::Events into
+// RunMetadata messages.
+//
+// Thread-safety: This class is go/thread-compatible.
+class HostTracer : public ProfilerInterface {
+ public:
+  static std::unique_ptr<HostTracer> Create(int host_trace_level);
+
+  ~HostTracer();
+
+  // Starts recording TraceMes.
+  Status Start() override;
+
+  // Stops recording TraceMes.
+  Status Stop() override;
+
+  // Populates user traces and thread names in response.
+  // The user traces and thread names are in no particular order.
+  Status CollectData(RunMetadata* run_metadata) override;
+
+  Status CollectDataToCollector(StepStatsCollector* step_stats_collector);
+
+ private:
+  explicit HostTracer(int host_trace_level);
+
+  // Level of host tracing.
+  const int host_trace_level_;
+
+  // True if currently recording.
+  bool recording_ = false;
+
+  // Container of all traced events.
+  TraceMeRecorder::Events events_;
+};
+
+}  // namespace cpu
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_CPU_HOST_TRACER_H_
diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..51f9c6a8ca6e52b21d0335d83f321cc4bbc331dc
--- /dev/null
+++ b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc
@@ -0,0 +1,133 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/internal/cpu/host_tracer.h"
+
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/types/optional.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace cpu {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Pair;
+using ::testing::UnorderedElementsAre;
+
+NodeExecStats MakeNodeStats(const string& name, uint64 thread_id,
+                            const string& label = "") {
+  NodeExecStats ns;
+  ns.set_node_name(name);
+  ns.set_thread_id(thread_id);
+  if (!label.empty()) {
+    ns.set_timeline_label(label);
+  }
+  return ns;
+}
+
+class NodeStatsMatcher {
+ public:
+  explicit NodeStatsMatcher(const NodeExecStats& expected)
+      : expected_(expected) {}
+
+  bool MatchAndExplain(const NodeExecStats& p,
+                       ::testing::MatchResultListener* /* listener */) const {
+    return p.node_name() == expected_.node_name() &&
+           p.thread_id() == expected_.thread_id() &&
+           p.timeline_label() == expected_.timeline_label();
+  }
+
+  void DescribeTo(::std::ostream* os) const { *os << expected_.DebugString(); }
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "not equal to expected message: " << expected_.DebugString();
+  }
+
+ private:
+  const NodeExecStats expected_;
+};
+
+inline ::testing::PolymorphicMatcher<NodeStatsMatcher> EqualsNodeStats(
+    const NodeExecStats& expected) {
+  return ::testing::MakePolymorphicMatcher(NodeStatsMatcher(expected));
+}
+
+TEST(HostTracerTest, CollectsTraceMeEvents) {
+  uint32 thread_id = Env::Default()->GetCurrentThreadId();
+
+  auto tracer = HostTracer::Create(/*host_trace_level=*/1);
+
+  TF_ASSERT_OK(tracer->Start());
+  { TraceMe traceme("hello"); }
+  { TraceMe traceme("world"); }
+  { TraceMe traceme("contains#inside"); }
+  { TraceMe traceme("good#key1=value1#"); }
+  { TraceMe traceme("morning#key1=value1,key2=value2#"); }
+  { TraceMe traceme("incomplete#key1=value1,key2#"); }
+  TF_ASSERT_OK(tracer->Stop());
+
+  RunMetadata run_metadata;
+  TF_ASSERT_OK(tracer->CollectData(&run_metadata));
+
+  EXPECT_EQ(run_metadata.step_stats().dev_stats_size(), 1);
+  EXPECT_EQ(run_metadata.step_stats().dev_stats(0).node_stats_size(), 6);
+  EXPECT_THAT(
+      run_metadata.step_stats().dev_stats(0).node_stats(),
+      UnorderedElementsAre(
+          EqualsNodeStats(MakeNodeStats("hello", thread_id)),
+          EqualsNodeStats(MakeNodeStats("world", thread_id)),
+          EqualsNodeStats(MakeNodeStats("contains#inside", thread_id)),
+          EqualsNodeStats(MakeNodeStats("good", thread_id, "key1=value1")),
+          EqualsNodeStats(
+              MakeNodeStats("morning", thread_id, "key1=value1,key2=value2")),
+          EqualsNodeStats(
+              MakeNodeStats("incomplete", thread_id, "key1=value1,key2"))));
+}
+
+void ValidateResult(const RunMetadata& run_metadata, const string& trace_name) {
+  uint32 thread_id = Env::Default()->GetCurrentThreadId();
+
+  EXPECT_THAT(
+      run_metadata.step_stats().dev_stats(0).node_stats(),
+      ElementsAre(EqualsNodeStats(MakeNodeStats(trace_name, thread_id))));
+}
+
+TEST(HostTracerTest, CollectsTraceMeEventsBetweenTracing) {
+  auto tracer = HostTracer::Create(/*host_trace_level=*/1);
+  RunMetadata run_metadata;
+  RunMetadata run_metadata2;
+
+  TF_ASSERT_OK(tracer->Start());
+  { TraceMe traceme("hello"); }
+  TF_ASSERT_OK(tracer->CollectData(&run_metadata));
+  { TraceMe traceme("world"); }
+  TF_ASSERT_OK(tracer->CollectData(&run_metadata2));
+  TF_ASSERT_OK(tracer->Stop());
+
+  ValidateResult(run_metadata, "hello");
+  ValidateResult(run_metadata2, "world");
+}
+
+}  // namespace
+}  // namespace cpu
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD
index c4b5b5243c204e1105004589ef14e09d7e097d9d..35f90e9bfc01f7bba0d0cc6d65cc23ea549469a1 100644
--- a/tensorflow/core/profiler/internal/gpu/BUILD
+++ b/tensorflow/core/profiler/internal/gpu/BUILD
@@ -21,6 +21,5 @@ tf_cuda_library(
         "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:device_tracer",
         "//tensorflow/core/profiler/internal:profiler_interface",
-        "//tensorflow/core/profiler/internal/runtime:eager_profiler",
     ],
 )
diff --git a/tensorflow/core/profiler/internal/traceme_recorder_test.cc b/tensorflow/core/profiler/internal/traceme_recorder_test.cc
index bea3f7c9bddd7994133cdc194f20ccd6a1883913..ec588af1d6048fa709d85e86ea2e5e546f8300d1 100644
--- a/tensorflow/core/profiler/internal/traceme_recorder_test.cc
+++ b/tensorflow/core/profiler/internal/traceme_recorder_test.cc
@@ -18,9 +18,8 @@ limitations under the License.
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/synchronization/notification.h"
-#include "absl/time/clock.h"
-#include "absl/time/time.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/env_time.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -32,7 +31,7 @@ MATCHER_P(Named, name, "") { return arg.name == name; }
 constexpr static uint64 kNanosInSec = 1000000000;
 
 TEST(RecorderTest, SingleThreaded) {
-  uint64 start_time = static_cast<uint64>(absl::GetCurrentTimeNanos());
+  uint64 start_time = Env::Default()->NowNanos();
   uint64 end_time = start_time + kNanosInSec;
 
   TraceMeRecorder::Record({1, "before", start_time, end_time});
@@ -48,7 +47,7 @@ TEST(RecorderTest, SingleThreaded) {
 }
 
 TEST(RecorderTest, CollectionBeforeStop) {
-  uint64 start_time = static_cast<uint64>(absl::GetCurrentTimeNanos());
+  uint64 start_time = Env::Default()->NowNanos();
   uint64 end_time = start_time + kNanosInSec;
 
   TraceMeRecorder::Record({1, "ignored", start_time, end_time});
@@ -73,8 +72,8 @@ TEST(RecorderTest, CollectionBeforeStop) {
 }
 
 void SpinNanos(int nanos) {
-  uint64 deadline = absl::GetCurrentTimeNanos() + nanos;
-  while (absl::GetCurrentTimeNanos() < deadline) {
+  uint64 deadline = Env::Default()->NowNanos() + nanos;
+  while (Env::Default()->NowNanos() < deadline) {
   }
 }
 
@@ -103,7 +102,7 @@ TEST(RecorderTest, Multithreaded) {
       uint64 j = 0;
       bool was_active = false;
       auto record_event = [&j, i]() {
-        uint64 start_time = static_cast<uint64>(absl::GetCurrentTimeNanos());
+        uint64 start_time = Env::Default()->NowNanos();
         uint64 end_time = start_time + kNanosInSec;
         TraceMeRecorder::Record({/*activity_id=*/j++,
                                  /*name=*/strings::StrCat(i), start_time,
@@ -154,7 +153,7 @@ TEST(RecorderTest, Multithreaded) {
   // Wait while all the threads are spun up.
   while (thread_count.load(std::memory_order_relaxed) < kNumThreads) {
     LOG(INFO) << "Waiting for all threads to spin up...";
-    absl::SleepFor(absl::Milliseconds(1));
+    Env::Default()->SleepForMicroseconds(1 * EnvTime::kMillisToMicros);
   }
 
   // We will probably be done after two iterations (with each thread getting
@@ -165,7 +164,7 @@ TEST(RecorderTest, Multithreaded) {
   for (int iters = 0; iters < kMaxIters && !done(); ++iters) {
     LOG(INFO) << "Looping until convergence, iteration: " << iters;
     TraceMeRecorder::Start(/*level=*/1);
-    absl::SleepFor(absl::Milliseconds(100));
+    Env::Default()->SleepForMicroseconds(100 * EnvTime::kMillisToMicros);
     auto results = TraceMeRecorder::Stop();
     for (const auto& thread : results) {
       if (thread.events.empty()) continue;
@@ -194,7 +193,7 @@ TEST(RecorderTest, Multithreaded) {
         }
       }
     }
-    absl::SleepFor(absl::Milliseconds(1));
+    Env::Default()->SleepForMicroseconds(1 * EnvTime::kMillisToMicros);
   }
   stop.Notify();
 
diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD
index 697076c56a7ce3357ab9280d91068c0d413b4b50..f078099321ec7a2f38e25e5dfe006f2ab49da2ac 100644
--- a/tensorflow/core/profiler/lib/BUILD
+++ b/tensorflow/core/profiler/lib/BUILD
@@ -23,10 +23,10 @@ tf_cuda_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/core/common_runtime/eager:context",
-        "//tensorflow/contrib/tpu/profiler:trace_events_proto_cc",
         "//tensorflow/core/profiler/internal/gpu:tracer",
         "//tensorflow/core/profiler/internal/runtime:eager_profiler",
         "//tensorflow/core/profiler/internal:profiler_interface",
+        "//tensorflow/core/profiler:protos_all_cc",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:android_tensorflow_lib_lite",
@@ -53,7 +53,6 @@ tf_cuda_library(
         "//tensorflow/core:lib",
         "//tensorflow/core/profiler/internal:traceme_recorder",
         "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:optional",
     ],
 )
diff --git a/tensorflow/core/profiler/lib/profiler_session.cc b/tensorflow/core/profiler/lib/profiler_session.cc
index 31070b76c78ee848c265451ebe1e734aa518f8ce..1910ea89509f0c21d3271eefb8707dedda733f9c 100644
--- a/tensorflow/core/profiler/lib/profiler_session.cc
+++ b/tensorflow/core/profiler/lib/profiler_session.cc
@@ -15,22 +15,28 @@ limitations under the License.
 
 #include "tensorflow/core/profiler/lib/profiler_session.h"
 #include <string>
-#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
-#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/profiler/internal/gpu/tracer.h"
 #include "tensorflow/core/profiler/internal/runtime/eager_profiler.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 
 namespace {
 
+// Track whether there's an active ProfilerSession.
+// Prevents another ProfilerSession from creating ProfilerInterface(s), as they
+// use singletons that do not allow concurrent profiling request (e.g.,
+// DeviceTracer).
+std::atomic<bool> session_active = ATOMIC_VAR_INIT(false);
+
 void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata,
-                                    tpu::Trace* trace,
+                                    profiler::Trace* trace,
                                     const uint64 profile_start_time_micros) {
   auto trace_devices = trace->mutable_devices();
   // TODO(fishx): use a lighter representation instead of GraphDef to insert
@@ -41,22 +47,35 @@ void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata,
     // Create device
     auto* device_stats =
         run_metadata->mutable_step_stats()->mutable_dev_stats(device_id);
-    tensorflow::tpu::Device device;
+    profiler::Device device;
     device.set_name(device_stats->device());
     device.set_device_id(device_id);
-    tensorflow::tpu::Resource resource;
+    profiler::Resource resource;
     resource.set_name("0");
     resource.set_resource_id(0);
     (*device.mutable_resources())[0] = resource;
+    for (const auto& thread_name : device_stats->thread_names()) {
+      profiler::Resource resource;
+      resource.set_resource_id(thread_name.first);
+      resource.set_name(thread_name.second);
+      (*device.mutable_resources())[thread_name.first] = resource;
+    }
     (*trace_devices)[device_id] = device;
 
     // Emit events.
     for (auto node :
          run_metadata->step_stats().dev_stats(device_id).node_stats()) {
+      if (node.all_start_micros() < profile_start_time_micros) {
+        continue;
+      }
       auto* event = trace->add_trace_events();
       auto* args = event->mutable_args();
       event->set_device_id(device_id);
-      event->set_resource_id(0);
+      if (device_stats->device().find("host:CPU") != string::npos) {
+        event->set_resource_id(node.thread_id());
+      } else {
+        event->set_resource_id(0);
+      }
       event->set_name(node.node_name());
       event->set_timestamp_ps(
           (node.all_start_micros() - profile_start_time_micros) *
@@ -73,7 +92,7 @@ void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata,
 }  // namespace
 
 /*static*/ std::unique_ptr<ProfilerSession> ProfilerSession::Create(
-    EagerContext* const context) {
+    ProfilerContext* const context) {
   return absl::WrapUnique(new ProfilerSession(context));
 }
 
@@ -93,7 +112,13 @@ Status ProfilerSession::SerializeToString(string* content) {
     profiler->CollectData(&run_metadata).IgnoreError();
   }
 
-  tpu::Trace trace;
+  if (active_) {
+    // Allow another session to start.
+    session_active.store(false);
+    active_ = false;
+  }
+
+  profiler::Trace trace;
 
   ConvertRunMetadataToTraceEvent(&run_metadata, &trace, start_time_micros_);
 
@@ -101,13 +126,20 @@ Status ProfilerSession::SerializeToString(string* content) {
   return Status::OK();
 }
 
-ProfilerSession::ProfilerSession(EagerContext* const context)
-    : start_time_micros_(Env::Default()->NowNanos() / EnvTime::kMicrosToNanos) {
+ProfilerSession::ProfilerSession(ProfilerContext* const context)
+    : active_(!session_active.exchange(true)),
+      start_time_micros_(Env::Default()->NowNanos() / EnvTime::kMicrosToNanos) {
+  if (!active_) {
+    status_ = tensorflow::Status(tensorflow::error::Code::UNAVAILABLE,
+                                 "Another profiling session is active.");
+    return;
+  }
+
   LOG(INFO) << "Profile Session started.";
 
-  if (context != nullptr) {
-    profilers_.push_back(
-        tensorflow::profiler::runtime::EagerProfiler::Create(context));
+  if (context->eager_context != nullptr) {
+    profilers_.push_back(tensorflow::profiler::runtime::EagerProfiler::Create(
+        context->eager_context));
   }
   profilers_.push_back(tensorflow::profiler::gpu::Tracer::Create());
 
@@ -122,6 +154,11 @@ ProfilerSession::~ProfilerSession() {
   for (auto& profiler : profilers_) {
     profiler->Stop().IgnoreError();
   }
+
+  if (active_) {
+    // Allow another session to start.
+    session_active.store(false);
+  }
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/profiler_session.h b/tensorflow/core/profiler/lib/profiler_session.h
index 1ab48250553b3f78fd2bc6db17282f9514003620..07276571244b876c8b6635a9d39347c3c1d89a55 100644
--- a/tensorflow/core/profiler/lib/profiler_session.h
+++ b/tensorflow/core/profiler/lib/profiler_session.h
@@ -22,6 +22,10 @@ limitations under the License.
 
 namespace tensorflow {
 
+struct ProfilerContext {
+  EagerContext* eager_context = nullptr;
+};
+
 // A profiler which will start profiling when creating the object and will stop
 // when either the object is destroyed or SerializedToString is called. It will
 // profile all operations run under the given EagerContext.
@@ -32,7 +36,8 @@ namespace tensorflow {
 class ProfilerSession {
  public:
   // Creates and ProfilerSession and starts profiling.
-  static std::unique_ptr<ProfilerSession> Create(EagerContext* const context);
+  static std::unique_ptr<ProfilerSession> Create(
+      ProfilerContext* const context);
 
   // Deletes an exsiting Profiler and enables starting a new one.
   ~ProfilerSession();
@@ -43,7 +48,7 @@ class ProfilerSession {
 
  private:
   // Constructs an instance of the class and starts profiling
-  explicit ProfilerSession(EagerContext* const context);
+  explicit ProfilerSession(ProfilerContext* const context);
 
   // Profiler is neither copyable or movable.
   ProfilerSession(const ProfilerSession&) = delete;
@@ -52,6 +57,9 @@ class ProfilerSession {
   std::vector<std::unique_ptr<tensorflow::profiler::ProfilerInterface>>
       profilers_ GUARDED_BY(mutex_);
 
+  // True if the session is active.
+  bool active_ GUARDED_BY(mutex_);
+
   tensorflow::Status status_ GUARDED_BY(mutex_);
   const uint64 start_time_micros_;
   mutex mutex_;
diff --git a/tensorflow/core/profiler/lib/traceme.cc b/tensorflow/core/profiler/lib/traceme.cc
index ae1c9215e2d0c291bf7541d37300efd2d00a8c70..90272b8bf584891075de050c7468376abbaed856 100644
--- a/tensorflow/core/profiler/lib/traceme.cc
+++ b/tensorflow/core/profiler/lib/traceme.cc
@@ -31,17 +31,15 @@ uint64 NewActivityId() {
 /* static */ uint64 TraceMe::ActivityStartImpl(
     absl::string_view activity_name) {
   uint64 activity_id = NewActivityId();
-  TraceMeRecorder::Record(
-      {activity_id, string(activity_name),
-       /*start_time=*/static_cast<uint64>(absl::GetCurrentTimeNanos()),
-       /*end_time=*/0});
+  TraceMeRecorder::Record({activity_id, string(activity_name),
+                           /*start_time=*/Env::Default()->NowNanos(),
+                           /*end_time=*/0});
   return activity_id;
 }
 
 /* static */ void TraceMe::ActivityEndImpl(uint64 activity_id) {
-  TraceMeRecorder::Record(
-      {activity_id, /*name=*/"", /*start_time=*/0,
-       /*end_time=*/static_cast<uint64>(absl::GetCurrentTimeNanos())});
+  TraceMeRecorder::Record({activity_id, /*name=*/"", /*start_time=*/0,
+                           /*end_time=*/Env::Default()->NowNanos()});
 }
 
 }  // namespace profiler
diff --git a/tensorflow/core/profiler/lib/traceme.h b/tensorflow/core/profiler/lib/traceme.h
index a6bff3adc571fb29d862039ea5ea70dbb193a6b6..b9fae3d37f0f2dd53a85f4d0d17aeaa626c849f1 100644
--- a/tensorflow/core/profiler/lib/traceme.h
+++ b/tensorflow/core/profiler/lib/traceme.h
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/profiler/internal/traceme_recorder.h"
@@ -73,7 +74,7 @@ class TraceMe {
     DCHECK_GE(level, 1);
     if (TraceMeRecorder::Active(level)) {
       new (&no_init_.name) string(activity_name);
-      start_time_ = absl::GetCurrentTimeNanos();
+      start_time_ = Env::Default()->NowNanos();
     } else {
       start_time_ = kUntracedActivity;
     }
@@ -88,7 +89,7 @@ class TraceMe {
     DCHECK_GE(level, 1);
     if (TraceMeRecorder::Active(level)) {
       new (&no_init_.name) string(std::move(activity_name));
-      start_time_ = absl::GetCurrentTimeNanos();
+      start_time_ = Env::Default()->NowNanos();
     } else {
       start_time_ = kUntracedActivity;
     }
@@ -118,7 +119,7 @@ class TraceMe {
     DCHECK_GE(level, 1);
     if (TraceMeRecorder::Active(level)) {
       new (&no_init_.name) string(name_generator());
-      start_time_ = absl::GetCurrentTimeNanos();
+      start_time_ = Env::Default()->NowNanos();
     } else {
       start_time_ = kUntracedActivity;
     }
@@ -135,9 +136,8 @@ class TraceMe {
     //   start/stop session timestamp (recorded in XprofResponse).
     if (start_time_ != kUntracedActivity) {
       if (TraceMeRecorder::Active()) {
-        TraceMeRecorder::Record(
-            {kCompleteActivity, std::move(no_init_.name), start_time_,
-             static_cast<uint64>(absl::GetCurrentTimeNanos())});
+        TraceMeRecorder::Record({kCompleteActivity, std::move(no_init_.name),
+                                 start_time_, Env::Default()->NowNanos()});
       }
       no_init_.name.~string();
     }
diff --git a/tensorflow/contrib/tpu/profiler/op_profile.proto b/tensorflow/core/profiler/op_profile.proto
similarity index 98%
rename from tensorflow/contrib/tpu/profiler/op_profile.proto
rename to tensorflow/core/profiler/op_profile.proto
index 292108f949d705762a826d0276a517f1f741fb39..0adca5544a6f579ef64bbf804ff8098e28b37da0 100644
--- a/tensorflow/contrib/tpu/profiler/op_profile.proto
+++ b/tensorflow/core/profiler/op_profile.proto
@@ -1,6 +1,6 @@
 syntax = "proto3";
 
-package tensorflow.tpu.op_profile;
+package tensorflow.profiler.op_profile;
 
 // Profile is the top-level data that summarizes a program.
 message Profile {
diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc
index 808e3c853bec0efb9523ee413f3d5272a833358d..cdcb8dddf6d0b3b54fb29156559caed2ba216ca2 100644
--- a/tensorflow/core/profiler/profiler.cc
+++ b/tensorflow/core/profiler/profiler.cc
@@ -114,7 +114,7 @@ int Run(int argc, char** argv) {
       Flag("min_output_bytes", &FLAGS_min_output_bytes, "min_output_bytes"),
       Flag("min_micros", &FLAGS_min_micros, "min micros"),
       Flag("min_accelerator_micros", &FLAGS_min_accelerator_micros,
-           "min acclerator_micros"),
+           "min accelerator_micros"),
       Flag("min_cpu_micros", &FLAGS_min_cpu_micros, "min_cpu_micros"),
       Flag("min_params", &FLAGS_min_params, "min params"),
       Flag("min_float_ops", &FLAGS_min_float_ops, "min float ops"),
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto b/tensorflow/core/profiler/profiler_analysis.proto
similarity index 93%
rename from tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
rename to tensorflow/core/profiler/profiler_analysis.proto
index d3c34bfd490080b86cf3d8b893c550f3a87bbbed..4be75de8bb46a23d26b116f306bad6f107d786ef 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
+++ b/tensorflow/core/profiler/profiler_analysis.proto
@@ -1,7 +1,7 @@
 syntax = "proto3";
 package tensorflow;
 
-import "tensorflow/contrib/tpu/profiler/tpu_profiler.proto";
+import "tensorflow/core/profiler/profiler_service.proto";
 
 message NewProfileSessionRequest {
   ProfileRequest request = 1;
@@ -58,10 +58,10 @@ message ProfileSessionDataResponse {
   bytes output = 3;
 }
 ////////////////////////////////////////////////////////////////////////////////
-// TPUProfileAnalysis service provide entry point for profiling TPU and for
+// ProfileAnalysis service provide entry point for profiling TPU and for
 // serving profiled data to Tensorboard through GRPC
 ////////////////////////////////////////////////////////////////////////////////
-service TPUProfileAnalysis {
+service ProfileAnalysis {
   // Starts a profiling session, blocks until it completes.
   // TPUProfileAnalysis service delegate this to TPUProfiler service.
   // Populate the profiled data in repository, then return status to caller.
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/core/profiler/profiler_service.proto
similarity index 92%
rename from tensorflow/contrib/tpu/profiler/tpu_profiler.proto
rename to tensorflow/core/profiler/profiler_service.proto
index da4a95e0450a9d0c20593ca60b69f3ad467d455d..77702c3c900e5a7391ea09ad93383b4f9c9fb2b2 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/core/profiler/profiler_service.proto
@@ -3,11 +3,11 @@ package tensorflow;
 
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/protobuf/config.proto";
-import "tensorflow/contrib/tpu/profiler/op_profile.proto";
+import "tensorflow/core/profiler/op_profile.proto";
 
-// The TPUProfiler service retrieves performance information about
-// the programs running on connected TPUs over a period of time.
-service TPUProfiler {
+// The ProfilerService service retrieves performance information about
+// the programs running on connected devices over a period of time.
+service ProfilerService {
   // Starts a profiling session, blocks until it completes, and returns data.
   rpc Profile(ProfileRequest) returns (ProfileResponse) {
   }
@@ -81,7 +81,7 @@ message ProfileToolData {
 
 message ProfileResponse {
   reserved 1;  // was uint64 placeholder for returning something meaningful.
-  // Graphs of programs executed on TPUs during the profiling period.
+  // Graphs of programs executed on devices during the profiling period.
   repeated GraphDef computation_graph = 2;
 
   // Performance profile that can be used to annotate HLO operations in the
@@ -96,7 +96,7 @@ message ProfileResponse {
   // Assembles a hierarchical performance profile based on HLOs in trace events.
   // If the trace covers multiple programs, the longest-running one is analyzed.
   // See op_profile.proto for the detailed semantics of the returned profile.
-  tpu.op_profile.Profile op_profile = 4;
+  profiler.op_profile.Profile op_profile = 4;
 
   // Data payload for each required tools.
   repeated ProfileToolData tool_data = 6;
diff --git a/tensorflow/core/profiler/rpc/BUILD b/tensorflow/core/profiler/rpc/BUILD
index 6b4576ec9ba5eec8154d47c90b1b4b272fcbf390..3e5cdaa4984d4ddfb4d4af8e23ab81c2645814d2 100644
--- a/tensorflow/core/profiler/rpc/BUILD
+++ b/tensorflow/core/profiler/rpc/BUILD
@@ -11,9 +11,10 @@ tf_cuda_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow:grpc++",
-        "//tensorflow/contrib/tpu/profiler:tpu_profiler_proto_cc",
         "//tensorflow/core:framework",
+        "//tensorflow/core:grpc_services",
         "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/profiler:protos_all_cc",
         "//tensorflow/core/profiler/lib:profiler_session",
     ],
     alwayslink = 1,
@@ -27,9 +28,11 @@ tf_cuda_library(
     deps = [
         ":profiler_service_impl",
         "//tensorflow:grpc++",
-        "//tensorflow/contrib/tpu/profiler:tpu_profiler_proto_cc",
         "//tensorflow/core:framework",
+        "//tensorflow/core:grpc_services",
         "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler/lib:profiler_session",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/core/profiler/rpc/client/BUILD b/tensorflow/core/profiler/rpc/client/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..14dd865a174da7d8d2d118e3e291c2896f243df6
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/client/BUILD
@@ -0,0 +1,63 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+tf_cuda_library(
+    name = "capture_profile",
+    srcs = [
+        "capture_profile.cc",
+    ],
+    hdrs = [
+        "capture_profile.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":dump_tpu_profile",
+        "//tensorflow:grpc++",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:grpc_services",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+        "//tensorflow/core/platform/cloud:gcs_file_system",
+        "//tensorflow/core/profiler:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "dump_tpu_profile",
+    srcs = ["dump_tpu_profile.cc"],
+    hdrs = ["dump_tpu_profile.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":trace_events_to_json",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:grpc_services",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/profiler:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "trace_events_to_json",
+    srcs = ["trace_events_to_json.cc"],
+    hdrs = ["trace_events_to_json.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "@jsoncpp_git//:jsoncpp",
+    ],
+)
+
+tf_cc_test(
+    name = "trace_events_to_json_test",
+    srcs = ["trace_events_to_json_test.cc"],
+    deps = [
+        ":trace_events_to_json",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "@jsoncpp_git//:jsoncpp",
+    ],
+)
diff --git a/tensorflow/core/profiler/rpc/client/capture_profile.cc b/tensorflow/core/profiler/rpc/client/capture_profile.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a24e0faa41f4eface697495be741b1eb4757b84f
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/client/capture_profile.cc
@@ -0,0 +1,251 @@
+/* Copyright 2017 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/rpc/client/capture_profile.h"
+
+#include "grpcpp/grpcpp.h"
+
+#include <cstdio>
+#include <ctime>
+#include <vector>
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/grpc_services.h"
+#include "tensorflow/core/profiler/rpc/client/dump_tpu_profile.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace client {
+
+constexpr uint64 kMaxEvents = 1000000;
+
+string GetCurrentTimeStampAsString() {
+  char s[128];
+  std::time_t t = std::time(nullptr);
+  auto result = std::strftime(s, sizeof(s), "%F_%T", std::localtime(&t));
+  DCHECK_NE(result, 0);
+  return s;
+}
+
+Status ValidateHostPortPair(const string& host_port) {
+  uint32 port;
+  std::vector<string> parts = str_util::Split(host_port, ':');
+  // Must be host:port, port must be a number, host must not contain a '/',
+  // host also must not be empty.
+  if (parts.size() != 2 || !strings::safe_strtou32(parts[1], &port) ||
+      parts[0].find("/") != string::npos || parts[0].empty()) {
+    return errors::InvalidArgument("Could not interpret \"", host_port,
+                                   "\" as a host-port pair.");
+  }
+  return Status::OK();
+}
+
+ProfileRequest PopulateProfileRequest(int duration_ms,
+                                      const string& repository_root,
+                                      const string& session_id,
+                                      const ProfileOptions& opts) {
+  ProfileRequest request;
+  request.set_duration_ms(duration_ms);
+  request.set_max_events(kMaxEvents);
+  if (tensorflow::str_util::StartsWith(repository_root, "gs://")) {
+    // For backward compatibilities, only generate tracetable etc when the
+    // user provide a GCS path for model directory.
+    request.set_repository_root(repository_root);
+    request.set_session_id(session_id);
+  }
+  request.add_tools("op_profile");
+  request.add_tools("input_pipeline");
+  request.add_tools("memory_viewer");
+  request.add_tools("overview_page");
+  *request.mutable_opts() = opts;
+  return request;
+}
+
+// Returns whether the returned trace is empty.
+// Failure are handled by CHECK, i.e. abort()
+Status Profile(const string& service_addr, const string& logdir,
+               int duration_ms, const string& repository_root,
+               const string& session_id, const ProfileOptions& opts) {
+  ProfileRequest request =
+      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
+
+  ::grpc::ClientContext context;
+  ::grpc::ChannelArguments channel_args;
+  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
+  // `ValidateHostPortPair` checks for empty host string case.
+  channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
+                      std::numeric_limits<int32>::max());
+  std::unique_ptr<grpc::ProfilerService::Stub> stub =
+      grpc::ProfilerService::NewStub(::grpc::CreateCustomChannel(
+          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+          channel_args));
+  ProfileResponse response;
+  TF_RETURN_IF_ERROR(
+      FromGrpcStatus(stub->Profile(&context, request, &response)));
+
+  if (!response.encoded_trace().empty()) {
+    TF_CHECK_OK(WriteTensorboardTPUProfile(logdir, session_id, "", response,
+                                           &std::cout));
+    // Print this at the end so that it's not buried in irrelevant LOG messages.
+    std::cout
+        << "NOTE: using the trace duration " << duration_ms << "ms."
+        << std::endl
+        << "Set an appropriate duration (with --duration_ms) if you "
+           "don't see a full step in your trace or the captured trace is too "
+           "large."
+        << std::endl;
+  }
+
+  if (response.encoded_trace().empty()) {
+    return Status(tensorflow::error::Code::UNAVAILABLE,
+                  "No trace event is collected");
+  }
+  return Status::OK();
+}
+
+// Start a new profiling session that include all the hosts included in
+// hostnames, for the time interval of duration_ms. Possibly save the profiling
+// result in the directory specified by repository_root and session_id.
+Status NewSession(const string& service_addr,
+                  const std::vector<tensorflow::string>& hostnames,
+                  int duration_ms, const string& repository_root,
+                  const string& session_id, const ProfileOptions& opts) {
+  NewProfileSessionRequest new_session_request;
+  *new_session_request.mutable_request() =
+      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
+  new_session_request.set_repository_root(repository_root);
+  new_session_request.set_session_id(session_id);
+  for (const auto& hostname : hostnames) {
+    new_session_request.add_hosts(hostname);
+  }
+
+  ::grpc::ClientContext context;
+  ::grpc::ChannelArguments channel_args;
+  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
+  // `ValidateHostPortPair` checks for empty host string case.
+  channel_args.SetMaxReceiveMessageSize(std::numeric_limits<int32>::max());
+  // TODO(jiesun): GRPC support following relevant naming scheme:
+  // 1. dns:///host:port
+  // 2. ipv4:host:port or ipv6:[host]:port
+  // We might need to change the prefix which depends on what TPU name resolver
+  // will give us.
+  std::unique_ptr<grpc::ProfileAnalysis::Stub> stub =
+      grpc::ProfileAnalysis::NewStub(::grpc::CreateCustomChannel(
+          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+          channel_args));
+  NewProfileSessionResponse new_session_response;
+  TF_RETURN_IF_ERROR(FromGrpcStatus(
+      stub->NewSession(&context, new_session_request, &new_session_response)));
+
+  std::cout << "Profile session succeed for host(s):"
+            << str_util::Join(hostnames, ",") << std::endl;
+  if (new_session_response.empty_trace()) {
+    return Status(tensorflow::error::Code::UNAVAILABLE,
+                  "No trace event is collected");
+  }
+  return Status::OK();
+}
+
+// Starts tracing on a single or multiple TPU hosts and saves the result in the
+// given logdir. If no trace was collected, retries tracing for
+// num_tracing_attempts.
+Status StartTracing(const tensorflow::string& service_addr,
+                    const tensorflow::string& logdir,
+                    const tensorflow::string& workers_list,
+                    bool include_dataset_ops, int duration_ms,
+                    int num_tracing_attempts) {
+  // Use the current timestamp as the run name.
+  tensorflow::string session_id = GetCurrentTimeStampAsString();
+  constexpr char kProfilePluginDirectory[] = "plugins/profile/";
+  tensorflow::string repository_root =
+      io::JoinPath(logdir, kProfilePluginDirectory);
+  std::vector<tensorflow::string> hostnames =
+      tensorflow::str_util::Split(workers_list, ",");
+
+  Status status = Status::OK();
+  int remaining_attempts = num_tracing_attempts;
+  tensorflow::ProfileOptions opts;
+  opts.set_include_dataset_ops(include_dataset_ops);
+  while (true) {
+    std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. "
+              << "Remaining attempt(s): " << remaining_attempts-- << std::endl;
+    if (hostnames.empty()) {
+      status = Profile(service_addr, logdir, duration_ms, repository_root,
+                       session_id, opts);
+    } else {
+      tensorflow::string tpu_master = service_addr;
+      status = NewSession(tpu_master, hostnames, duration_ms, repository_root,
+                          session_id, opts);
+    }
+    if (remaining_attempts <= 0 || status.ok() ||
+        status.code() != tensorflow::error::Code::UNAVAILABLE)
+      break;
+    std::cout << "No trace event is collected. Automatically retrying."
+              << std::endl
+              << std::endl;
+  }
+
+  if (status.code() == tensorflow::error::Code::UNAVAILABLE) {
+    std::cout << "No trace event is collected after " << num_tracing_attempts
+              << " attempt(s). "
+              << "Perhaps, you want to try again (with more attempts?)."
+              << std::endl
+              << "Tip: increase number of attempts with --num_tracing_attempts."
+              << std::endl;
+    return status;
+  }
+  return Status::OK();
+}
+
+MonitorRequest PopulateMonitorRequest(int duration_ms, int monitoring_level) {
+  MonitorRequest request;
+  request.set_duration_ms(duration_ms);
+  request.set_monitoring_level(monitoring_level);
+  return request;
+}
+
+// Repeatedly collects profiles and shows user-friendly metrics for
+// 'num_queries' time(s).
+void StartMonitoring(const tensorflow::string& service_addr, int duration_ms,
+                     int monitoring_level, int num_queries) {
+  for (int query = 0; query < num_queries; ++query) {
+    MonitorRequest request =
+        PopulateMonitorRequest(duration_ms, monitoring_level);
+
+    ::grpc::ClientContext context;
+    ::grpc::ChannelArguments channel_args;
+    channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
+                        std::numeric_limits<int32>::max());
+    std::unique_ptr<grpc::ProfilerService::Stub> stub =
+        grpc::ProfilerService::NewStub(::grpc::CreateCustomChannel(
+            "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+            channel_args));
+    MonitorResponse response;
+    TF_QCHECK_OK(FromGrpcStatus(stub->Monitor(&context, request, &response)));
+
+    std::cout << "Cloud TPU Monitoring Results (Sample " << query + 1
+              << "):\n\n"
+              << response.data() << std::flush;
+  }
+}
+
+}  // namespace client
+}  // namespace profiler
+}  // namespace tensorflow
+
diff --git a/tensorflow/core/profiler/rpc/client/capture_profile.h b/tensorflow/core/profiler/rpc/client/capture_profile.h
new file mode 100644
index 0000000000000000000000000000000000000000..988036724791cd171f11a7a3666aca4267286646
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/client/capture_profile.h
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// GRPC client to perform on-demand profiling
+
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_CAPTURE_PROFILE_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_CAPTURE_PROFILE_H_
+
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace client {
+
+Status ValidateHostPortPair(const string& host_port);
+
+// Repeatedly collects profiles and shows user-friendly metrics for
+// 'num_queries' time(s).
+void StartMonitoring(const tensorflow::string& service_addr, int duration_ms,
+                     int monitoring_level, int num_queries);
+
+// Starts tracing on a single or multiple hosts and saves the result in the
+// given logdir. If no trace was collected, retries tracing for
+// num_tracing_attempts.
+Status StartTracing(const tensorflow::string& service_addr,
+                    const tensorflow::string& logdir,
+                    const tensorflow::string& workers_list,
+                    bool include_dataset_ops, int duration_ms,
+                    int num_tracing_attempts);
+
+}  // namespace client
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_CLIENT_CAPTURE_PROFILE_H_
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/core/profiler/rpc/client/dump_tpu_profile.cc
similarity index 92%
rename from tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
rename to tensorflow/core/profiler/rpc/client/dump_tpu_profile.cc
index b4b06a40a2c8aaa97ff82baf93c8f2d55a587e37..f0675c4cf40435fed8f9b0cfbfe41f873304a88e 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/core/profiler/rpc/client/dump_tpu_profile.cc
@@ -13,15 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h"
+#include "tensorflow/core/profiler/rpc/client/dump_tpu_profile.h"
 
 #include <cstdio>
 #include <ctime>
 #include <vector>
 
-#include "tensorflow/contrib/tpu/profiler/op_profile.pb.h"
-#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
-#include "tensorflow/contrib/tpu/profiler/trace_events_to_json.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/compression.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -29,10 +26,15 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/profiler/op_profile.pb.h"
+#include "tensorflow/core/profiler/rpc/client/trace_events_to_json.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 #include "tensorflow/core/util/events_writer.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 namespace {
 
 using ::tensorflow::io::JoinPath;
@@ -88,7 +90,7 @@ Status DumpTraceToLogDirectory(StringPiece run_dir, const string& host_prefix,
 
 Status DumpOpProfileToLogDirectory(StringPiece run_dir,
                                    const string& host_prefix,
-                                   const tpu::op_profile::Profile& profile,
+                                   const op_profile::Profile& profile,
                                    std::ostream* os) {
   string path = JoinPath(run_dir, StrCat(host_prefix, kJsonOpProfileFileName));
   string json;
@@ -109,7 +111,7 @@ Status DumpOpProfileToLogDirectory(StringPiece run_dir,
 
 Status DumpToolDataToLogDirectory(StringPiece run_dir,
                                   const string& host_prefix,
-                                  const tensorflow::ProfileToolData& tool,
+                                  const ProfileToolData& tool,
                                   std::ostream* os) {
   // Don't save the intermediate results for combining the per host tool data.
   if (EndsWith(tool.name(), kFlatProfilerFileName) ||
@@ -155,5 +157,6 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
   return Status::OK();
 }
 
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/core/profiler/rpc/client/dump_tpu_profile.h
similarity index 80%
rename from tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
rename to tensorflow/core/profiler/rpc/client/dump_tpu_profile.h
index ecf21b1de2219e8896d5e8b79325a193de0b0fa1..961f4e9498d91a6c0d75b82ad87860963360ddb3 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
+++ b/tensorflow/core/profiler/rpc/client/dump_tpu_profile.h
@@ -13,14 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
-#define TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_DUMP_TPU_PROFILE_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_DUMP_TPU_PROFILE_H_
 
-#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/grpc_services.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 
 // Dumps all profiling tool data in a TPU profile to a TensorBoard log directory
 // with the given run name. This writes user-facing log messages to `os`.
@@ -36,7 +38,8 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
                                   const ProfileResponse& response,
                                   std::ostream* os);
 
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_CLIENT_DUMP_TPU_PROFILE_H_
diff --git a/tensorflow/contrib/tpu/profiler/trace_events_to_json.cc b/tensorflow/core/profiler/rpc/client/trace_events_to_json.cc
similarity index 91%
rename from tensorflow/contrib/tpu/profiler/trace_events_to_json.cc
rename to tensorflow/core/profiler/rpc/client/trace_events_to_json.cc
index 3f7e67dec88918009a2a9856d9c7a182338f748d..6adaec5546052a5d82a54e4ae1ca78eb10a4a103 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events_to_json.cc
+++ b/tensorflow/core/profiler/rpc/client/trace_events_to_json.cc
@@ -13,13 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/profiler/trace_events_to_json.h"
+#include "tensorflow/core/profiler/rpc/client/trace_events_to_json.h"
 #include "include/json/json.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 namespace {
 
 using ::tensorflow::strings::Appendf;
@@ -96,10 +99,9 @@ inline void AddTraceEvent(const TraceEvent &event, string *json) {
 
 string TraceEventsToJson(const Trace &trace) {
   string json;
-  Appendf(&json,
-          R"({"displayTimeUnit":"ns","metadata":{"highres-ticks":true},)");
-  Appendf(&json,
-          R"("traceEvents":[)");
+  Appendf(
+      &json, R"({"displayTimeUnit":"ns","metadata":{"highres-ticks":true},)");
+  Appendf(&json, R"("traceEvents":[)");
   // Convert to a std::map so that devices are sorted by the device id.
   std::map<uint32, const Device *> sorted_devices;
   for (const auto &pair : trace.devices()) {
@@ -114,5 +116,6 @@ string TraceEventsToJson(const Trace &trace) {
   return json;
 }
 
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/profiler/trace_events_to_json.h b/tensorflow/core/profiler/rpc/client/trace_events_to_json.h
similarity index 72%
rename from tensorflow/contrib/tpu/profiler/trace_events_to_json.h
rename to tensorflow/core/profiler/rpc/client/trace_events_to_json.h
index 3bd76dd01c7d0f35bad9386c11811743e1709fca..d54cc3c619e234b82452a54f613a57cabfa7d5d3 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events_to_json.h
+++ b/tensorflow/core/profiler/rpc/client/trace_events_to_json.h
@@ -13,20 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
-#define TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_TRACE_EVENTS_TO_JSON_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_TRACE_EVENTS_TO_JSON_H_
 
-#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 
 // Converts trace events in the trace proto to a JSON string that can be
 // consumed by catapult trace viewer.
 string TraceEventsToJson(const Trace &trace);
 
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_CLIENT_TRACE_EVENTS_TO_JSON_H_
diff --git a/tensorflow/contrib/tpu/profiler/trace_events_to_json_test.cc b/tensorflow/core/profiler/rpc/client/trace_events_to_json_test.cc
similarity index 93%
rename from tensorflow/contrib/tpu/profiler/trace_events_to_json_test.cc
rename to tensorflow/core/profiler/rpc/client/trace_events_to_json_test.cc
index e97989cc7be961b2a812e46bb07b189bd6cda897..0f883b04dc869218329fd944d45918b0836d1a44 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events_to_json_test.cc
+++ b/tensorflow/core/profiler/rpc/client/trace_events_to_json_test.cc
@@ -13,13 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/profiler/trace_events_to_json.h"
+#include "tensorflow/core/profiler/rpc/client/trace_events_to_json.h"
 #include "include/json/json.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 namespace {
 
 string ConvertTextFormattedTraceToJson(const string& trace_str) {
@@ -109,5 +112,6 @@ TEST(TraceEventsToJson, JsonConversion) {
 }
 
 }  // namespace
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/rpc/profiler_server.cc b/tensorflow/core/profiler/rpc/profiler_server.cc
index 835aa1e3f5cd89c4d82c7ffcb84e37fd57e1679a..257e4e0bf5fa320c499a40065021b0030564bc45 100644
--- a/tensorflow/core/profiler/rpc/profiler_server.cc
+++ b/tensorflow/core/profiler/rpc/profiler_server.cc
@@ -17,27 +17,32 @@ limitations under the License.
 #include <memory>
 #include <utility>
 #include "grpcpp/grpcpp.h"
-#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
+#include "tensorflow/core/platform/grpc_services.h"
 #include "tensorflow/core/profiler/rpc/profiler_service_impl.h"
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 
-std::unique_ptr<Thread> StartProfilerServer(EagerContext* const eager_context,
-                                            int32 port) {
-  return WrapUnique(eager_context->TFEnv()->StartThread(
-      {}, "profiler_server", [eager_context, port]() {
-        string server_address = strings::StrCat("0.0.0.0:", port);
-        std::unique_ptr<TPUProfiler::Service> service =
-            CreateProfilerService(eager_context);
-        ::grpc::ServerBuilder builder;
-        builder.AddListeningPort(server_address,
-                                 ::grpc::InsecureServerCredentials());
-        builder.RegisterService(service.get());
-        std::unique_ptr<::grpc::Server> server(builder.BuildAndStart());
-        LOG(INFO) << "Profiling Server listening on " << server_address;
-        server->Wait();
-      }));
+std::unique_ptr<Thread> StartProfilerServer(
+    ProfilerContext* const profiler_context, int32 port) {
+  Env* env = profiler_context->eager_context != nullptr
+                 ? profiler_context->eager_context->TFEnv()
+                 : Env::Default();
+  // Starting the server in the child thread may be delay and user may already
+  // delete the profiler context at that point. So we need to make a copy.
+  ProfilerContext ctx = *profiler_context;
+  return WrapUnique(env->StartThread({}, "profiler server", [ctx, port]() {
+    string server_address = strings::StrCat("0.0.0.0:", port);
+    std::unique_ptr<grpc::ProfilerService::Service> service =
+        CreateProfilerService(ctx);
+    ::grpc::ServerBuilder builder;
+    builder.AddListeningPort(server_address,
+                             ::grpc::InsecureServerCredentials());
+    builder.RegisterService(service.get());
+    std::unique_ptr<::grpc::Server> server(builder.BuildAndStart());
+    LOG(INFO) << "Profiling Server listening on " << server_address;
+    server->Wait();
+  }));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/rpc/profiler_server.h b/tensorflow/core/profiler/rpc/profiler_server.h
index ebefdd6cb08289e8a43b0f1525f3176657530095..4e8c715ac753d57add26de28a2524d4f737567ec 100644
--- a/tensorflow/core/profiler/rpc/profiler_server.h
+++ b/tensorflow/core/profiler/rpc/profiler_server.h
@@ -15,10 +15,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_
 #define TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_
 
-#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/profiler/lib/profiler_session.h"
 
 namespace tensorflow {
-std::unique_ptr<Thread> StartProfilerServer(EagerContext* const eager_context,
-                                            int32 port);
+
+std::unique_ptr<Thread> StartProfilerServer(
+    ProfilerContext* const profiler_context, int32 port);
 }  // namespace tensorflow
 #endif  // TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_
diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.cc b/tensorflow/core/profiler/rpc/profiler_service_impl.cc
index 872ef2ada973aa365fc2b9d431f85b4934f2b992..f25ee66833604882309679615e02bf4b6125d9ed 100644
--- a/tensorflow/core/profiler/rpc/profiler_service_impl.cc
+++ b/tensorflow/core/profiler/rpc/profiler_service_impl.cc
@@ -15,19 +15,18 @@ limitations under the License.
 
 #include "tensorflow/core/profiler/rpc/profiler_service_impl.h"
 #include "grpcpp/support/status.h"
-#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/platform/grpc_services.h"
 #include "tensorflow/core/profiler/lib/profiler_session.h"
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace {
 
-// TODO(fishx): Rename TPUProfiler to something more generic.
-class ProfilerServiceImpl : public TPUProfiler::Service {
+class ProfilerServiceImpl : public grpc::ProfilerService::Service {
  public:
-  explicit ProfilerServiceImpl(EagerContext* const eager_context)
-      : eager_context_(eager_context) {}
+  explicit ProfilerServiceImpl(const ProfilerContext& profiler_context)
+      : profiler_context_(profiler_context) {}
   ~ProfilerServiceImpl() override {}
 
   ::grpc::Status Monitor(::grpc::ServerContext* ctx, const MonitorRequest* req,
@@ -39,13 +38,15 @@ class ProfilerServiceImpl : public TPUProfiler::Service {
                          ProfileResponse* response) override {
     LOG(INFO) << "Received a profile request.";
     std::unique_ptr<ProfilerSession> profiler =
-        ProfilerSession::Create(eager_context_);
+        ProfilerSession::Create(&profiler_context_);
     if (!profiler->Status().ok()) {
       return ::grpc::Status(::grpc::StatusCode::INTERNAL,
                             profiler->Status().error_message());
     }
 
-    Env* env = eager_context_->TFEnv();
+    Env* env = profiler_context_.eager_context != nullptr
+                   ? profiler_context_.eager_context->TFEnv()
+                   : Env::Default();
     for (size_t i = 0; i < req->duration_ms(); ++i) {
       env->SleepForMicroseconds(1000);
       if (ctx->IsCancelled()) {
@@ -62,13 +63,13 @@ class ProfilerServiceImpl : public TPUProfiler::Service {
   }
 
  private:
-  EagerContext* const eager_context_;
+  ProfilerContext profiler_context_;
 };
 }  // namespace
 
-std::unique_ptr<TPUProfiler::Service> CreateProfilerService(
-    EagerContext* const eager_context) {
-  return MakeUnique<ProfilerServiceImpl>(eager_context);
+std::unique_ptr<grpc::ProfilerService::Service> CreateProfilerService(
+    const ProfilerContext& profiler_context) {
+  return MakeUnique<ProfilerServiceImpl>(profiler_context);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.h b/tensorflow/core/profiler/rpc/profiler_service_impl.h
index 311a26745184703d6da836c31016c79cb1b92b72..64ae01d58377c751945e05417528118026b1614e 100644
--- a/tensorflow/core/profiler/rpc/profiler_service_impl.h
+++ b/tensorflow/core/profiler/rpc/profiler_service_impl.h
@@ -18,12 +18,14 @@ limitations under the License.
 #include "grpcpp/grpcpp.h"
 #include "grpcpp/server_context.h"
 #include "grpcpp/support/status.h"
-#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/platform/grpc_services.h"
+#include "tensorflow/core/profiler/lib/profiler_session.h"
 
 namespace tensorflow {
-std::unique_ptr<TPUProfiler::Service> CreateProfilerService(
-    EagerContext* const eager_context);
+
+std::unique_ptr<grpc::ProfilerService::Service> CreateProfilerService(
+    const ProfilerContext& profiler_context);
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_
diff --git a/tensorflow/contrib/tpu/profiler/trace_events.proto b/tensorflow/core/profiler/trace_events.proto
similarity index 98%
rename from tensorflow/contrib/tpu/profiler/trace_events.proto
rename to tensorflow/core/profiler/trace_events.proto
index 96c4784c691d8f34cf8715cdc0ed9886412f5f90..69ec88ca9a798a0faf1864ce9cf5c3f8bb7df0ca 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events.proto
+++ b/tensorflow/core/profiler/trace_events.proto
@@ -1,6 +1,6 @@
 syntax = "proto3";
 
-package tensorflow.tpu;
+package tensorflow.profiler;
 
 // A 'Trace' contains metadata for the individual traces of a system.
 message Trace {
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index a2cc1bc9353bf434438ec9d21ff3995e0806f1d0..3e24235369a6bd06d3c8cf0df66e1ee3ead2b9b2 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -156,6 +156,16 @@ message GPUOptions {
     // CollectiveReduce, and serves as an override to automatic ring order
     // generation in OrderTaskDeviceMap() during CollectiveParam resolution.
     string collective_ring_order = 4;
+
+    // If true then extra work is done by GPUDevice and GPUBFCAllocator to
+    // keep track of when GPU memory is freed and when kernels actually
+    // complete so that we can know when a nominally free memory chunk
+    // is really not subject to pending use.
+    bool timestamped_allocator = 5;
+
+    // If > 0 limit the number of pending kernels on any compute
+    // stream to this number.
+    int32 pending_cap = 6;
   }
 
   // Everything inside experimental is subject to change and is not subject
@@ -429,6 +439,10 @@ message ConfigProto {
     // If true, make collective op execution order sequential and deterministic
     // for potentially concurrent collective instances.
     bool collective_deterministic_sequential_execution = 6;
+
+    // If true, use NCCL for CollectiveOps.  This feature is highly
+    // experimental.
+    bool collective_nccl = 7;
   };
 
   Experimental experimental = 16;
@@ -506,6 +520,25 @@ message RunMetadata {
 
   // Graphs of the partitions executed by executors.
   repeated GraphDef partition_graphs = 3;
+
+  message FunctionGraphs {
+    // TODO(nareshmodi): Include some sort of function/cache-key identifier?
+    repeated GraphDef partition_graphs = 1;
+
+    GraphDef pre_optimization_graph = 2;
+    GraphDef post_optimization_graph = 3;
+  }
+  // This is only populated for graphs that are run as functions in TensorFlow
+  // V2. There will be an entry below for each function that is traced.
+  // The main use cases of the post_optimization_graph and the partition_graphs
+  // is to give the caller insight into the graphs that were actually run by the
+  // runtime. Additional information (such as those in step_stats) will match
+  // these graphs.
+  // We also include the pre_optimization_graph since it is usually easier to
+  // read, and is helpful in situations where the caller wants to get a high
+  // level idea of what the built graph looks like (since the various graph
+  // optimization passes might change the structure of the graph significantly).
+  repeated FunctionGraphs function_graphs = 4;
 }
 
 // Defines a connection between two tensors in a `GraphDef`.
diff --git a/tensorflow/core/protobuf/graph_debug_info.proto b/tensorflow/core/protobuf/graph_debug_info.proto
index cd3cdeba422713ffd9bfe20a895714ee94dd3a71..a123d3cf496a8d653abe652468ce22dda47bc18f 100644
--- a/tensorflow/core/protobuf/graph_debug_info.proto
+++ b/tensorflow/core/protobuf/graph_debug_info.proto
@@ -18,6 +18,12 @@ message GraphDebugInfo {
 
     // Col number in the file line.
     int32 col = 3;
+
+    // Name of function contains the file line.
+    string func = 4;
+
+    // Source code contained in this file line.
+    string code = 5;
   }
 
   // This represents a stack trace which is a ordered list of `FileLineCol`.
diff --git a/tensorflow/core/protobuf/master.proto b/tensorflow/core/protobuf/master.proto
index c104463c51c7e7be02430c7750ebacee60ed50e4..4a998c5bfcd29a23df01aca6feca827afebd3258 100644
--- a/tensorflow/core/protobuf/master.proto
+++ b/tensorflow/core/protobuf/master.proto
@@ -16,11 +16,13 @@ limitations under the License.
 syntax = "proto3";
 
 package tensorflow;
+
 option cc_enable_arenas = true;
 option java_outer_classname = "DistributedRuntimeProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.distruntime";
-option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
+
+// add go_package externally with copybara
 import "tensorflow/core/framework/device_attributes.proto";
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/tensor.proto";
@@ -138,6 +140,11 @@ message RunStepRequest {
   // response body. This is a workaround since the RPC subsystem may
   // truncate long metadata messages.
   bool store_errors_in_response_body = 7;
+
+  // Unique identifier for this request. Every RunStepRequest must
+  // have a unique request_id, and retried RunStepRequest must have
+  // the same request_id. If request_id is zero, retry detection is disabled.
+  int64 request_id = 8;
 }
 
 message RunStepResponse {
@@ -183,6 +190,11 @@ message PartialRunSetupRequest {
   // Target Nodes. A list of node names. The named nodes will be run in future
   // steps, but their outputs will not be fetched.
   repeated string target = 4;
+
+  // Unique identifier for this request. Every PartialRunSetupRequest must
+  // have a unique request_id, and retried PartialRunSetupRequest must have
+  // the same request_id. If request_id is zero, retry detection is disabled.
+  int64 request_id = 5;
 }
 
 message PartialRunSetupResponse {
@@ -204,8 +216,7 @@ message CloseSessionRequest {
   string session_handle = 1;
 }
 
-message CloseSessionResponse {
-}
+message CloseSessionResponse {}
 
 // Reset() allows misbehaving or slow sessions to be aborted and closed, and
 // causes their resources eventually to be released.  Reset() does not wait
@@ -237,8 +248,7 @@ message ResetRequest {
   repeated string device_filters = 2;
 }
 
-message ResetResponse {
-}
+message ResetResponse {}
 
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -279,6 +289,11 @@ message MakeCallableRequest {
 
   // Options that define the behavior of the created callable.
   CallableOptions options = 2;
+
+  // Unique identifier for this request. Every MakeCallableRequest must
+  // have a unique request_id, and retried MakeCallableRequest must have
+  // the same request_id. If request_id is zero, retry detection is disabled.
+  int64 request_id = 3;
 }
 
 message MakeCallableResponse {
@@ -303,6 +318,11 @@ message RunCallableRequest {
   // Values of the tensors passed as arguments to the callable, in the order
   // defined in the CallableOptions.feed field passed to MakeCallable.
   repeated TensorProto feed = 3;
+
+  // Unique identifier for this request. Every RunCallableRequest must
+  // have a unique request_id, and retried RunCallableRequest must have
+  // the same request_id. If request_id is zero, retry detection is disabled.
+  int64 request_id = 4;
 }
 
 message RunCallableResponse {
@@ -330,5 +350,4 @@ message ReleaseCallableRequest {
   int64 handle = 2;
 }
 
-message ReleaseCallableResponse {
-}
+message ReleaseCallableResponse {}
diff --git a/tensorflow/core/protobuf/meta_graph.proto b/tensorflow/core/protobuf/meta_graph.proto
index 75a2a88ed72cd909f607286b574b0c343c6268f6..fa0192cf67c500994e5dd976c414c248b3a321a2 100644
--- a/tensorflow/core/protobuf/meta_graph.proto
+++ b/tensorflow/core/protobuf/meta_graph.proto
@@ -12,6 +12,7 @@ import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/op_def.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
+import "tensorflow/core/protobuf/saved_object_graph.proto";
 import "tensorflow/core/protobuf/saver.proto";
 
 // NOTE: This protocol buffer is evolving, and will go through revisions in the
@@ -84,6 +85,9 @@ message MetaGraphDef {
 
   // Asset file def to be used with the defined graph.
   repeated AssetFileDef asset_file_def = 6;
+
+  // Extra information about the structure of functions and stateful objects.
+  SavedObjectGraph object_graph_def = 7;
 }
 
 // CollectionDef should cover most collections.
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index b5c9599872b0bd4d086af0836f6e9f42750223cf..7a62c6ee1f3373a6d300c2677a8cddc3e1ed01b3 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -78,6 +78,9 @@ message RewriterConfig {
   Toggle scoped_allocator_optimization = 15;
   // Force small ops onto the CPU (default is ON).
   Toggle pin_to_host_optimization = 18;
+  // Enable the swap of kernel implementations based on the device placement
+  // (default is ON).
+  Toggle implementation_selector = 22;
   // Disable the entire meta optimizer (off by default).
   bool disable_meta_optimizer = 19;
 
diff --git a/tensorflow/python/saved_model/saved_object_graph.proto b/tensorflow/core/protobuf/saved_object_graph.proto
similarity index 78%
rename from tensorflow/python/saved_model/saved_object_graph.proto
rename to tensorflow/core/protobuf/saved_object_graph.proto
index c57bf15efad229f182cf564e8ded3433634c8116..48060b33dc42ca74c0464dbce20b0e88ac4a30e6 100644
--- a/tensorflow/python/saved_model/saved_object_graph.proto
+++ b/tensorflow/core/protobuf/saved_object_graph.proto
@@ -1,10 +1,10 @@
 syntax = "proto3";
 
-import "tensorflow/core/protobuf/checkpointable_object_graph.proto";
+import "tensorflow/core/protobuf/trackable_object_graph.proto";
+import "tensorflow/core/protobuf/struct.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
 import "tensorflow/core/framework/versions.proto";
-import "tensorflow/python/saved_model/struct.proto";
 
 option cc_enable_arenas = true;
 
@@ -14,15 +14,13 @@ package tensorflow;
 // describes the directed graph of Python objects (or equivalent in other
 // languages) that make up a model, with nodes[0] at the root.
 
-// SavedObjectGraph shares some structure with CheckpointableObjectGraph, but
-// ObjectGraph belongs to the SavedModel and contains pointers to functions and
-// type information, while CheckpointableObjectGraph lives in the checkpoint and
-// contains pointers only to variable values.
-
-// NOTE: This protocol buffer format is experimental and subject to change.
+// SavedObjectGraph shares some structure with TrackableObjectGraph, but
+// SavedObjectGraph belongs to the MetaGraph and contains pointers to functions
+// and type information, while TrackableObjectGraph lives in the checkpoint
+// and contains pointers only to variable values.
 
 message SavedObjectGraph {
-  // List of objects in the SavedModel.
+  // Flattened list of objects in the object graph.
   //
   // The position of the object in this list indicates its id.
   // Nodes[0] is considered the root node.
@@ -37,11 +35,11 @@ message SavedObject {
   // Objects which this object depends on: named edges in the dependency
   // graph.
   //
-  // Note: only valid if kind == "object".
-  repeated CheckpointableObjectGraph.CheckpointableObject.ObjectReference
+  // Note: currently only valid if kind == "user_object".
+  repeated TrackableObjectGraph.TrackableObject.ObjectReference
       children = 1;
 
-  // Removed when forking from CheckpointableObjectGraph.
+  // Removed when forking SavedObject from TrackableObjectGraph.
   reserved "attributes";
   reserved 2;
 
@@ -49,8 +47,8 @@ message SavedObject {
   // (optimizer, variable, slot variable) relationship; none of the three
   // depend on the others directly.
   //
-  // Note: only valid if kind == "object".
-  repeated CheckpointableObjectGraph.CheckpointableObject.SlotVariableReference
+  // Note: currently only valid if kind == "user_object".
+  repeated TrackableObjectGraph.TrackableObject.SlotVariableReference
       slot_variables = 3;
 
   oneof kind {
@@ -60,6 +58,7 @@ message SavedObject {
     SavedVariable variable = 7;
     SavedBareConcreteFunction bare_concrete_function = 8;
     SavedConstant constant = 9;
+    SavedResource resource = 10;
   }
 }
 
@@ -76,11 +75,11 @@ message SavedUserObject {
   VersionDef version = 2;
 }
 
-// A SavedAsset represents a file in a SavedModel.
+// A SavedAsset points to an asset in the MetaGraph.
 //
-// When bound to a function this object evaluates to a Variable from which the
-// absolute filename can be read. Users should not expect the filename to be
-// maintained.
+// When bound to a function this object evaluates to a tensor with the absolute
+// filename. Users should not depend on a particular part of the filename to
+// remain stable (e.g. basename could be changed).
 message SavedAsset {
   // Index into `MetaGraphDef.asset_file_def[]` that describes the Asset.
   //
@@ -128,13 +127,11 @@ message SavedConstant {
 }
 
 // Represents a Variable that is initialized by loading the contents from the
-// SavedModel checkpoint.
+// checkpoint.
 message SavedVariable {
   DataType dtype = 1;
   TensorShapeProto shape = 2;
   bool trainable = 3;
-
-  // TODO(andresp): Add save_slice_info_def?
 }
 
 // Represents `FunctionSpec` used in `Function`. This represents a
@@ -153,3 +150,9 @@ message FunctionSpec {
   // The input signature, if specified.
   StructuredValue input_signature = 5;
 }
+
+// A SavedResource represents a TF object that holds state during its lifetime.
+message SavedResource {
+  // An object of this type can have a reference to a:
+  // create_resource() and an initialize() function.
+}
diff --git a/tensorflow/python/saved_model/struct.proto b/tensorflow/core/protobuf/struct.proto
similarity index 64%
rename from tensorflow/python/saved_model/struct.proto
rename to tensorflow/core/protobuf/struct.proto
index fd7db84e05d60b1b45df1960b253e1a6661aa186..55b9b520a89a41b066fa2958a4aedf5914dc247a 100644
--- a/tensorflow/python/saved_model/struct.proto
+++ b/tensorflow/core/protobuf/struct.proto
@@ -8,6 +8,27 @@ package tensorflow;
 // `StructuredValue` represents a dynamically typed value representing various
 // data structures that are inspired by Python data structures typically used in
 // TensorFlow functions as inputs and outputs.
+//
+// For example when saving a Layer there may be a `training` argument. If the
+// user passes a boolean True/False, that switches between two concrete
+// TensorFlow functions. In order to switch between them in the same way after
+// loading the SavedModel, we need to represent "True" and "False".
+//
+// A more advanced example might be a function which takes a list of
+// dictionaries mapping from strings to Tensors. In order to map from
+// user-specified arguments `[{"a": tf.constant(1.)}, {"q": tf.constant(3.)}]`
+// after load to the right saved TensorFlow function, we need to represent the
+// nested structure and the strings, recording that we have a trace for anything
+// matching `[{"a": tf.TensorSpec(None, tf.float32)}, {"q": tf.TensorSpec([],
+// tf.float64)}]` as an example.
+//
+// Likewise functions may return nested structures of Tensors, for example
+// returning a dictionary mapping from strings to Tensors. In order for the
+// loaded function to return the same structure we need to serialize it.
+//
+// This is an ergonomic aid for working with loaded SavedModels, not a promise
+// to serialize all possible function signatures. For example we do not expect
+// to pickle generic Python objects, and ideally we'd stay language-agnostic.
 message StructuredValue {
   // The kind of value.
   oneof kind {
@@ -29,11 +50,11 @@ message StructuredValue {
     // Represents a boolean value.
     bool bool_value = 14;
 
-    // Represents a tf.TensorShape.
+    // Represents a TensorShape.
     tensorflow.TensorShapeProto tensor_shape_value = 31;
-    // Represents an enum value for tf.DType.
+    // Represents an enum value for dtype.
     tensorflow.DataType tensor_dtype_value = 32;
-    // Represents a value for tf.TensorShape.
+    // Represents a value for tf.TensorSpec.
     TensorSpecProto tensor_spec_value = 33;
 
     // Represents a list of `Value`.
diff --git a/tensorflow/contrib/tpu/proto/BUILD b/tensorflow/core/protobuf/tpu/BUILD
similarity index 100%
rename from tensorflow/contrib/tpu/proto/BUILD
rename to tensorflow/core/protobuf/tpu/BUILD
diff --git a/tensorflow/contrib/tpu/proto/compilation_result.proto b/tensorflow/core/protobuf/tpu/compilation_result.proto
similarity index 100%
rename from tensorflow/contrib/tpu/proto/compilation_result.proto
rename to tensorflow/core/protobuf/tpu/compilation_result.proto
diff --git a/tensorflow/contrib/tpu/proto/dynamic_padding.proto b/tensorflow/core/protobuf/tpu/dynamic_padding.proto
similarity index 100%
rename from tensorflow/contrib/tpu/proto/dynamic_padding.proto
rename to tensorflow/core/protobuf/tpu/dynamic_padding.proto
diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/core/protobuf/tpu/optimization_parameters.proto
similarity index 90%
rename from tensorflow/contrib/tpu/proto/optimization_parameters.proto
rename to tensorflow/core/protobuf/tpu/optimization_parameters.proto
index bc50c613f3d2a09f9e51353fab4938055549a4cd..752f0dbc9cf1d026a11191efb2e9bf6a2686f424 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/core/protobuf/tpu/optimization_parameters.proto
@@ -89,11 +89,11 @@ message FtrlParameters {
 // the normal version of Adam that updates all parameters in the embedding
 // table, even for entries that are not used in the current minibatch
 // (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/AdamOptimizer). If
-// use_non_lazy_adam is enabled, use_gradient_accumulation is also required in
-// order to get correct results; a warning will be printed otherwise (which may
-// change to an error in the future). If use_sum_inside_sqrt is set, the Adam
-// variable update formula will be changed from m / (sqrt(v) + epsilon) to
-// m / sqrt(v + epsilon**2); this option improves the performance of TPU
+// use_non_lazy_adam is enabled, gradient accumulation is also required to be
+// enabled in order to get correct results; a warning will be printed otherwise
+// (which may change to an error in the future). If use_sum_inside_sqrt is set,
+// the Adam variable update formula will be changed from m / (sqrt(v) + epsilon)
+// to m / sqrt(v + epsilon**2); this option improves the performance of TPU
 // training and is not expected to harm model quality.
 message AdamParameters {
   float beta1 = 3;
@@ -170,6 +170,19 @@ message ProximalAdagradParameters {
   float initial_accumulator = 3;
 }
 
+// Status of using gradient accumulation (doing two passes over the input
+// gradients: one to accumulate them into a temporary array and another to apply
+// them using the actual optimization algorithm). The extra message is to wrap
+// the enum for scoping.
+message GradientAccumulationStatus {
+  // Defaults to ENABLED.
+  enum Status {
+    UNSPECIFIED = 0;
+    ENABLED = 1;
+    DISABLED = 2;
+  }
+};
+
 message OptimizationParameters {
   // Learning rate used for updating the embedding layer parameters.
   LearningRate learning_rate = 13;
@@ -191,12 +204,10 @@ message OptimizationParameters {
   // once per minibatch.
   float weight_decay_factor = 16;
 
-  // Whether to use gradient accumulation (do two passes over the input
+  // Status of using gradient accumulation (doing two passes over the input
   // gradients: one to accumulate them into a temporary array and another to
-  // apply them using the actual optimization algorithm). This feature is
-  // experimental -- it has not been fully verified and may cause training
-  // crashes and/or failures.
-  bool use_gradient_accumulation = 15;
+  // apply them using the actual optimization algorithm).
+  GradientAccumulationStatus.Status gradient_accumulation_status = 17;
 
   // Optimization algorithm parameters; which field is selected determines which
   // algorithm to use.
@@ -212,6 +223,8 @@ message OptimizationParameters {
     AdadeltaParameters adadelta = 12;
     ProximalAdagradParameters proximal_adagrad = 14;
   }
+
+  reserved 15;  // Old use_gradient_accumulation.
 }
 
 // Specification of an optimization algorithm's state variables (both the main
diff --git a/tensorflow/contrib/tpu/proto/topology.proto b/tensorflow/core/protobuf/tpu/topology.proto
similarity index 100%
rename from tensorflow/contrib/tpu/proto/topology.proto
rename to tensorflow/core/protobuf/tpu/topology.proto
diff --git a/tensorflow/contrib/tpu/proto/tpu_embedding_configuration.proto b/tensorflow/core/protobuf/tpu/tpu_embedding_configuration.proto
similarity index 97%
rename from tensorflow/contrib/tpu/proto/tpu_embedding_configuration.proto
rename to tensorflow/core/protobuf/tpu/tpu_embedding_configuration.proto
index da19b135d7497d1bd5d2e212cab97db78c756cad..53280edfa6df9fc24d1a38dd6458d79fc29600d8 100644
--- a/tensorflow/contrib/tpu/proto/tpu_embedding_configuration.proto
+++ b/tensorflow/core/protobuf/tpu/tpu_embedding_configuration.proto
@@ -2,8 +2,8 @@ syntax = "proto3";
 
 package tensorflow.tpu;
 
-import "tensorflow/contrib/tpu/proto/optimization_parameters.proto";
-import "tensorflow/contrib/tpu/proto/tpu_embedding_output_layout.proto";
+import "tensorflow/core/protobuf/tpu/optimization_parameters.proto";
+import "tensorflow/core/protobuf/tpu/tpu_embedding_output_layout.proto";
 
 message TPUEmbeddingConfiguration {
   // Description of the various embedding tables.
diff --git a/tensorflow/contrib/tpu/proto/tpu_embedding_output_layout.proto b/tensorflow/core/protobuf/tpu/tpu_embedding_output_layout.proto
similarity index 100%
rename from tensorflow/contrib/tpu/proto/tpu_embedding_output_layout.proto
rename to tensorflow/core/protobuf/tpu/tpu_embedding_output_layout.proto
diff --git a/tensorflow/core/protobuf/checkpointable_object_graph.proto b/tensorflow/core/protobuf/trackable_object_graph.proto
similarity index 84%
rename from tensorflow/core/protobuf/checkpointable_object_graph.proto
rename to tensorflow/core/protobuf/trackable_object_graph.proto
index f2956404b5e0d384f8fcec391ac0ac6c8b583a5e..02d852e6f3df024fa35bf9e4d05af5f2f8d568a5 100644
--- a/tensorflow/core/protobuf/checkpointable_object_graph.proto
+++ b/tensorflow/core/protobuf/trackable_object_graph.proto
@@ -8,10 +8,10 @@ package tensorflow;
 // own variables, allowing for more robust checkpoint loading into modified
 // programs.
 
-message CheckpointableObjectGraph {
-  message CheckpointableObject {
+message TrackableObjectGraph {
+  message TrackableObject {
     message ObjectReference {
-      // An index into `CheckpointableObjectGraph.nodes`, indicating the object
+      // An index into `TrackableObjectGraph.nodes`, indicating the object
       // being referenced.
       int32 node_id = 1;
       // A user-provided name for the edge.
@@ -37,12 +37,12 @@ message CheckpointableObjectGraph {
     }
 
     message SlotVariableReference {
-      // An index into `CheckpointableObjectGraph.nodes`, indicating the
+      // An index into `TrackableObjectGraph.nodes`, indicating the
       // variable object this slot was created for.
       int32 original_variable_node_id = 1;
       // The name of the slot (e.g. "m"/"v").
       string slot_name = 2;
-      // An index into `CheckpointableObjectGraph.nodes`, indicating the
+      // An index into `TrackableObjectGraph.nodes`, indicating the
       // `Object` with the value of the slot variable.
       int32 slot_variable_node_id = 3;
     }
@@ -55,5 +55,5 @@ message CheckpointableObjectGraph {
     repeated SlotVariableReference slot_variables = 3;
   }
 
-  repeated CheckpointableObject nodes = 1;
+  repeated TrackableObject nodes = 1;
 }
diff --git a/tensorflow/contrib/tpu/utils/BUILD b/tensorflow/core/tpu/BUILD
similarity index 82%
rename from tensorflow/contrib/tpu/utils/BUILD
rename to tensorflow/core/tpu/BUILD
index c27b73728702dcb1c84a82d3a07d15978ed2710f..5cbed402f75bf7ecf67ea06a2ad8d89260d7c1d1 100644
--- a/tensorflow/contrib/tpu/utils/BUILD
+++ b/tensorflow/core/tpu/BUILD
@@ -8,9 +8,9 @@ cc_library(
     hdrs = ["tpu_embedding_optimization_parameters_utils.h"],
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/tpu/proto:optimization_parameters_proto_cc",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core/protobuf/tpu:optimization_parameters_proto_cc",
         "@com_google_absl//absl/base",
     ],
 )
@@ -21,10 +21,10 @@ cc_library(
     hdrs = ["tpu_embedding_output_layout_utils.h"],
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_cc",
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_output_layout_proto_cc",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_configuration_proto_cc",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_output_layout_proto_cc",
     ],
 )
diff --git a/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.cc b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc
similarity index 98%
rename from tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.cc
rename to tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc
index d98e0b7a5ed52c00a8cf2b1a1bbc53f1b1cd28c7..2c179b5f83cbec888d1425e91ea7455e60acdb40 100644
--- a/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.cc
+++ b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h"
+#include "tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
@@ -135,7 +135,7 @@ Status GetGradientAccumulationSupport(OptimizationAlgorithm alg,
 }
 namespace {
 // Make a normal state variable specification. Please refer to
-// //third_party/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+// //tensorflow/core/protobuf/tpu/optimization_parameters.proto
 // (StateVariableSpecification message) for instructions on how to set the
 // padding_initial_value field.
 StateVariableSpecification MakeStandardStateVariableSpecification(
diff --git a/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h
similarity index 91%
rename from tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h
rename to tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h
index 81d50264edb93e889d736c62a493b058e2f1bd56..ceb07ff35510ae3b034ad391456e5a8a21fa4240 100644
--- a/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h
+++ b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
-#define TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
+#ifndef TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
+#define TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
 
 #include <string>
 #include "absl/base/casts.h"
-#include "tensorflow/contrib/tpu/proto/optimization_parameters.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/tpu/optimization_parameters.pb.h"
 
 namespace tensorflow {
 namespace tpu {
@@ -87,4 +87,4 @@ const float kGradientAccumulatorInitialValue = absl::bit_cast<float, uint32>(1);
 }  // namespace tpu
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
+#endif  // TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
diff --git a/tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.cc b/tensorflow/core/tpu/tpu_embedding_output_layout_utils.cc
similarity index 96%
rename from tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.cc
rename to tensorflow/core/tpu/tpu_embedding_output_layout_utils.cc
index 8480ec4b8bb98e867db3e4e4ed14d4cc529efe49..3a027757af7cb90d465e230b9934a4214888c4f1 100644
--- a/tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.cc
+++ b/tensorflow/core/tpu/tpu_embedding_output_layout_utils.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h"
-#include "tensorflow/contrib/tpu/proto/tpu_embedding_output_layout.pb.h"
+#include "tensorflow/core/tpu/tpu_embedding_output_layout_utils.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/protobuf/tpu/tpu_embedding_output_layout.pb.h"
 
 namespace tensorflow {
 namespace tpu {
diff --git a/tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h b/tensorflow/core/tpu/tpu_embedding_output_layout_utils.h
similarity index 81%
rename from tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h
rename to tensorflow/core/tpu/tpu_embedding_output_layout_utils.h
index c10fbeeff2b5af93a118902c0afb3b59cc1a9d60..5bff401b9d2d37f35086fb7c8a39c62d79d7daa9 100644
--- a/tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h
+++ b/tensorflow/core/tpu/tpu_embedding_output_layout_utils.h
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
-#define TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
+#ifndef TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
+#define TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
 
-#include "tensorflow/contrib/tpu/proto/tpu_embedding_configuration.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h"
 
 namespace tensorflow {
 namespace tpu {
@@ -35,4 +35,4 @@ Status ComputeOutputTensorShapes(
 }  // namespace tpu
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
+#endif  // TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
diff --git a/tensorflow/core/util/bcast.h b/tensorflow/core/util/bcast.h
index 6d73c38e3c904458e7438915d5fe35db9f4c8fc8..2d647fd8d86866c93f2a3890e3e40e7f70f670f5 100644
--- a/tensorflow/core/util/bcast.h
+++ b/tensorflow/core/util/bcast.h
@@ -105,15 +105,21 @@ class BCast {
   static Vec FromShape(const TensorShape& shape);
   static TensorShape ToShape(const BCast::Vec& vec);
 
-  template <int NDIMS>
-  static Eigen::array<Eigen::DenseIndex, NDIMS> ToIndexArray(
+  template <typename IndexType, int NDIMS>
+  static Eigen::array<IndexType, NDIMS> ToIndexArrayType(
       const BCast::Vec& vec) {
     CHECK_EQ(vec.size(), NDIMS);
-    Eigen::array<Eigen::DenseIndex, NDIMS> ret;
+    Eigen::array<IndexType, NDIMS> ret;
     for (int i = 0; i < NDIMS; ++i) ret[i] = vec[i];
     return ret;
   }
 
+  template <int NDIMS>
+  static Eigen::array<Eigen::DenseIndex, NDIMS> ToIndexArray(
+      const BCast::Vec& vec) {
+    return ToIndexArrayType<Eigen::DenseIndex, NDIMS>(vec);
+  }
+
  private:
   bool valid_ = true;
   Vec x_reshape_;
diff --git a/tensorflow/core/util/device_name_utils.cc b/tensorflow/core/util/device_name_utils.cc
index cb088faec1ece7cffde4499df900be9d8dd16bc5..56e618872a71e190cbec9c1cd33e1b246a1c9e08 100644
--- a/tensorflow/core/util/device_name_utils.cc
+++ b/tensorflow/core/util/device_name_utils.cc
@@ -289,6 +289,30 @@ bool DeviceNameUtils::IsSpecification(const ParsedName& less_specific,
   return true;
 }
 
+void DeviceNameUtils::EnsureSpecification(ParsedName* more_specific,
+                                          const ParsedName& less_specific) {
+  if (less_specific.has_job) {
+    more_specific->has_job = true;
+    more_specific->job = less_specific.job;
+  }
+  if (less_specific.has_replica) {
+    more_specific->has_replica = true;
+    more_specific->replica = less_specific.replica;
+  }
+  if (less_specific.has_task) {
+    more_specific->has_task = true;
+    more_specific->task = less_specific.task;
+  }
+  if (less_specific.has_type) {
+    more_specific->has_type = true;
+    more_specific->type = less_specific.type;
+  }
+  if (less_specific.has_id) {
+    more_specific->has_id = true;
+    more_specific->id = less_specific.id;
+  }
+}
+
 /* static */
 bool DeviceNameUtils::IsCompleteSpecification(const ParsedName& pattern,
                                               const ParsedName& name) {
diff --git a/tensorflow/core/util/device_name_utils.h b/tensorflow/core/util/device_name_utils.h
index bb5e2b3f0c42b321bc7ab45cdad2ec951671be96..b047e814bd694a775af0487365c85ce02ad573fd 100644
--- a/tensorflow/core/util/device_name_utils.h
+++ b/tensorflow/core/util/device_name_utils.h
@@ -110,6 +110,11 @@ class DeviceNameUtils {
   static bool IsSpecification(const ParsedName& less_specific,
                               const ParsedName& more_specific);
 
+  // Makes minimal changes to more_specific so that it becomes a
+  // specification of less_specific.
+  static void EnsureSpecification(ParsedName* more_specific,
+                                  const ParsedName& less_specific);
+
   // Like IsSpecification, but the second argument "name" must have a
   // non-wildcard value for all of its components.
   static bool IsCompleteSpecification(const ParsedName& pattern,
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index cd168f6597347a9a19cd4486ef437302c5ab735d..4e53c59ba364cc1daf7d8db7cd0529986a8e3094 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -63,7 +63,7 @@ class SparseTensor {
                                     ix.shape().dim_size(0), ", values = ",
                                     vals.shape().dim_size(0), ")"));
     }
-    int dims;
+    int dims = 0;
     TF_RETURN_IF_ERROR(GetDimsFromIx(ix, &dims));
     if (order.size() != dims) {
       return Status(error::INVALID_ARGUMENT,
diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index f5f0d7c3c852390ead414bf37260e531119e100b..5f99f0a9c90122ebe194b734af4950c0241156cd 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -37,8 +37,7 @@ cc_binary(
         "-lm",
         "-z defs",
         "-s",
-        "-Wl,--version-script",  # This line must be directly followed by LINKER_SCRIPT.
-        "$(location {})".format(LINKER_SCRIPT),
+        "-Wl,--version-script,$(location {})".format(LINKER_SCRIPT),
     ],
     linkshared = 1,
     linkstatic = 1,
diff --git a/tensorflow/examples/autograph/integration_tests/keras_test.py b/tensorflow/examples/autograph/integration_tests/keras_test.py
index 3fe33df920d008845bfd1002075fd6b5dc25b31f..72b62f1ad4d7094f97b43841251c824a558da82f 100644
--- a/tensorflow/examples/autograph/integration_tests/keras_test.py
+++ b/tensorflow/examples/autograph/integration_tests/keras_test.py
@@ -87,18 +87,16 @@ class KerasTest(tf.test.TestCase):
 
   @test_util.run_deprecated_v1
   def test_recursive_true(self):
-    with self.assertRaisesRegexp(NotImplementedError,
-                                 'Object conversion is not yet supported.'):
-      with tf.Graph().as_default():
-        model = CompoundModel()
-        model.build(tf.TensorShape((None, 10, 10, 1)))
-        init = tf.global_variables_initializer()
-
-        with tf.Session() as sess:
-          self.evaluate(init)
-          sample_input = tf.random_uniform((1, 10, 10, 1))
-          output = model(sample_input)  # pylint: disable=not-callable
-          self.assertEqual(self.evaluate(output).shape, (1, 3))
+    with tf.Graph().as_default():
+      model = CompoundModel()
+      model.build(tf.TensorShape((None, 10, 10, 1)))
+      init = tf.global_variables_initializer()
+
+      with tf.Session() as sess:
+        self.evaluate(init)
+        sample_input = tf.random_uniform((1, 10, 10, 1))
+        output = model(sample_input)  # pylint: disable=not-callable
+        self.assertEqual(self.evaluate(output).shape, (1, 3))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/ios/benchmark/ios_image_load.h b/tensorflow/examples/ios/benchmark/ios_image_load.h
index 3f94984692341b2d7ae975597ecdd1893486afb4..22ee785dc341bf117d458eef2f0e7dd5e170181b 100644
--- a/tensorflow/examples/ios/benchmark/ios_image_load.h
+++ b/tensorflow/examples/ios/benchmark/ios_image_load.h
@@ -17,7 +17,7 @@
 
 #include <vector>
 
-#include "tensorflow/core/framework/types.h"
+#include "third_party/tensorflow/core/framework/types.h"
 
 std::vector<tensorflow::uint8> LoadImageFromFile(const char* file_name,
                                                  int* out_width,
diff --git a/tensorflow/examples/ios/camera/CameraExampleViewController.h b/tensorflow/examples/ios/camera/CameraExampleViewController.h
index 0aefbc6eedb0f140f7c162512cf60027bbec7501..277b6e272dc34b429021abba4a3e2381a2459060 100644
--- a/tensorflow/examples/ios/camera/CameraExampleViewController.h
+++ b/tensorflow/examples/ios/camera/CameraExampleViewController.h
@@ -16,8 +16,8 @@
 #import <UIKit/UIKit.h>
 
 #include <memory>
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/memmapped_file_system.h"
+#include "third_party/tensorflow/core/public/session.h"
+#include "third_party/tensorflow/core/util/memmapped_file_system.h"
 
 @interface CameraExampleViewController
     : UIViewController<UIGestureRecognizerDelegate,
diff --git a/tensorflow/examples/ios/camera/ios_image_load.h b/tensorflow/examples/ios/camera/ios_image_load.h
index f10b0b983a957bd52d5bd6dc0841d899a3196beb..991568751e9bb6acdaaf1da3f217438392575aa2 100644
--- a/tensorflow/examples/ios/camera/ios_image_load.h
+++ b/tensorflow/examples/ios/camera/ios_image_load.h
@@ -17,7 +17,7 @@
 
 #include <vector>
 
-#include "tensorflow/core/framework/types.h"
+#include "third_party/tensorflow/core/framework/types.h"
 
 std::vector<tensorflow::uint8> LoadImageFromFile(const char* file_name,
 						 int* out_width,
diff --git a/tensorflow/examples/ios/camera/tensorflow_utils.h b/tensorflow/examples/ios/camera/tensorflow_utils.h
index 78bdb82aae63d14835b99021ed6686b50777577b..33e95b185c74a7fb026ebf1495dca98a12d4e2ae 100644
--- a/tensorflow/examples/ios/camera/tensorflow_utils.h
+++ b/tensorflow/examples/ios/camera/tensorflow_utils.h
@@ -18,8 +18,8 @@
 #include <memory>
 #include <vector>
 
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/memmapped_file_system.h"
+#include "third_party/tensorflow/core/public/session.h"
+#include "third_party/tensorflow/core/util/memmapped_file_system.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 // Reads a serialized GraphDef protobuf file from the bundle, typically
diff --git a/tensorflow/examples/ios/simple/ios_image_load.h b/tensorflow/examples/ios/simple/ios_image_load.h
index 0e0b771118b9eb5b33dcf7b9bea1a33b4873ac6d..2d2ee78e991e42fa1e21ae697c2c76606fc7639c 100644
--- a/tensorflow/examples/ios/simple/ios_image_load.h
+++ b/tensorflow/examples/ios/simple/ios_image_load.h
@@ -17,7 +17,7 @@
 
 #include <vector>
 
-#include "tensorflow/core/framework/types.h"
+#include "third_party/tensorflow/core/framework/types.h"
 
 std::vector<tensorflow::uint8> LoadImageFromFile(const char* file_name,
 						 int* out_width,
diff --git a/tensorflow/examples/saved_model/integration_tests/BUILD b/tensorflow/examples/saved_model/integration_tests/BUILD
index f40c6d9d28f677d9beafa83d81927224049ac6df..d7d8d95f66594212aaf4a9ad0655c2302643bbe4 100644
--- a/tensorflow/examples/saved_model/integration_tests/BUILD
+++ b/tensorflow/examples/saved_model/integration_tests/BUILD
@@ -36,18 +36,80 @@ py_binary(
     ],
 )
 
+py_binary(
+    name = "export_simple_text_embedding",
+    srcs = ["export_simple_text_embedding.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "use_model_in_sequential_keras",
+    srcs = ["use_model_in_sequential_keras.py"],
+    deps = [
+        ":util",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_library(
+    name = "util",
+    srcs = ["util.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_library(
+    name = "mnist_util",
+    srcs = ["mnist_util.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "export_mnist_cnn",
+    srcs = ["export_mnist_cnn.py"],
+    deps = [
+        ":mnist_util",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "use_mnist_cnn",
+    srcs = ["use_mnist_cnn.py"],
+    deps = [
+        ":mnist_util",
+        ":util",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
 py_test(
     name = "saved_model_test",
     srcs = [
         "saved_model_test.py",
     ],
     data = [
+        ":export_mnist_cnn",
         ":export_rnn_cell",
+        ":export_simple_text_embedding",
         ":export_text_rnn_model",
+        ":use_mnist_cnn",
+        ":use_model_in_sequential_keras",
         ":use_rnn_cell",
         ":use_text_rnn_model",
     ],
+    shard_count = 4,
     srcs_version = "PY2AND3",
+    tags = [
+        "noasan",  # forge input size exceeded
+        "nomsan",  # forge input size exceeded
+        "notsan",  # forge input size exceeded
+    ],
     deps = [
         "//tensorflow:tensorflow_py",
     ],
diff --git a/tensorflow/examples/saved_model/integration_tests/export_mnist_cnn.py b/tensorflow/examples/saved_model/integration_tests/export_mnist_cnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..68acfb481795331ba6ad87587ec55bfbfb9c2b00
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/export_mnist_cnn.py
@@ -0,0 +1,189 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Exports a convolutional feature extractor for MNIST in SavedModel format.
+
+The feature extractor is a convolutional neural network plus a hidden layer
+that gets trained as part of an MNIST classifier and then written to a
+SavedModel (without the classification layer). From there, use_mnist_cnn.py
+picks it up for transfer learning.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import tensorflow as tf
+
+from tensorflow.examples.saved_model.integration_tests import mnist_util
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string(
+    'export_dir', None,
+    'Directory of exported SavedModel.')
+flags.DEFINE_integer(
+    'epochs', 10,
+    'Number of epochs to train.')
+flags.DEFINE_bool(
+    'fast_test_mode', False,
+    'Shortcut training for running in unit tests.')
+flags.DEFINE_bool(
+    'export_print_hparams', False,
+    'If true, the exported function will print its effective hparams.')
+
+
+def make_feature_extractor(l2_strength, dropout_rate):
+  """Returns a Keras Model to compute a feature vector from MNIST images."""
+  regularizer = lambda: tf.keras.regularizers.l2(l2_strength)
+  net = inp = tf.keras.Input(mnist_util.INPUT_SHAPE)
+  net = tf.keras.layers.Conv2D(32, (3, 3), activation='relu', name='conv1',
+                               kernel_regularizer=regularizer())(net)
+  net = tf.keras.layers.Conv2D(64, (3, 3), activation='relu', name='conv2',
+                               kernel_regularizer=regularizer())(net)
+  net = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), name='pool1')(net)
+  net = tf.keras.layers.Dropout(dropout_rate, name='dropout1')(net)
+  net = tf.keras.layers.Flatten(name='flatten')(net)
+  net = tf.keras.layers.Dense(128, activation='relu', name='dense1',
+                              kernel_regularizer=regularizer())(net)
+  return tf.keras.Model(inputs=inp, outputs=net)
+
+
+def set_feature_extractor_hparams(model, dropout_rate):
+  model.get_layer('dropout1').rate = dropout_rate
+
+
+def make_classifier(feature_extractor, l2_strength, dropout_rate=0.5):
+  """Returns a Keras Model to classify MNIST using feature_extractor."""
+  regularizer = lambda: tf.keras.regularizers.l2(l2_strength)
+  net = inp = tf.keras.Input(mnist_util.INPUT_SHAPE)
+  net = feature_extractor(net)
+  net = tf.keras.layers.Dropout(dropout_rate)(net)
+  net = tf.keras.layers.Dense(mnist_util.NUM_CLASSES, activation='softmax',
+                              kernel_regularizer=regularizer())(net)
+  return tf.keras.Model(inputs=inp, outputs=net)
+
+
+def wrap_keras_model_for_export(model, batch_input_shape,
+                                set_hparams, default_hparams):
+  """Wraps `model` for saving and loading as SavedModel."""
+  if default_hparams is None: default_hparams = {}
+  hparam_keys = list(default_hparams.keys())
+  hparam_defaults = tuple(default_hparams.values())
+  # The goal is to save a function with this argspec...
+  argspec = tf_inspect.FullArgSpec(
+      args=(['inputs', 'training'] + hparam_keys),
+      defaults=((False,) + hparam_defaults),
+      varargs=None, varkw=None,
+      kwonlyargs=[], kwonlydefaults=None,
+      annotations={})
+  # ...and this behavior:
+  def call_fn(inputs, training, *args):
+    if FLAGS.export_print_hparams:
+      args = [tf.keras.backend.print_tensor(args[i], 'training=%s and %s='
+                                            % (training, hparam_keys[i]))
+              for i in range(len(args))]
+    kwargs = dict(zip(hparam_keys, args))
+    if kwargs: set_hparams(model, **kwargs)
+    return model(inputs, training=training)
+  # We cannot spell out `args` in def statement for call_fn, but since
+  # tf.function uses tf_inspect, we can use tf_decorator to wrap it with
+  # the desired argspec.
+  def wrapped(*args, **kwargs):  # TODO(arnoegw): Can we use call_fn itself?
+    return call_fn(*args, **kwargs)
+  traced_call_fn = tf.function(autograph=False)(
+      tf_decorator.make_decorator(call_fn, wrapped, decorator_argspec=argspec))
+  # Now we need to trigger traces for
+  # - training set to Python values True or False (hence two traces),
+  # - tensor inputs of the expected nesting, shape and dtype,
+  # - tensor-valued kwargs for hparams, with caller-side defaults.
+  # Tracing with partially determined shapes requires an input signature,
+  # so we initiate tracing from a helper function with only tensor inputs.
+  @tf.function(autograph=False)
+  def trigger_traces(inputs, **kwargs):
+    return tuple(traced_call_fn(inputs, training=training, **kwargs)
+                 for training in (True, False))
+  inputs_spec = tf.TensorSpec(shape=batch_input_shape, dtype=tf.float32)
+  hparams_spec = {name: tf.TensorSpec.from_tensor(tf.constant(value))
+                  for name, value in default_hparams.items()}
+  _ = trigger_traces.get_concrete_function(inputs_spec, **hparams_spec)
+
+  # Assemble the output object.
+  obj = tf.train.Checkpoint()
+  obj.__call__ = traced_call_fn
+  obj.trainable_variables = model.trainable_variables
+  obj.variables = model.trainable_variables + model.non_trainable_variables
+  obj.regularization_losses = [_get_traced_loss(model, i)
+                               for i in range(len(model.losses))]
+  return obj
+
+
+def _get_traced_loss(model, i):
+  """Returns tf.function for model.losses[i] with a trace for zero args.
+
+  The intended usage is
+    [_get_traced_loss(model, i) for i in range(len(model.losses))]
+  This is better than
+    [tf.function(lambda: model.losses[i], input_signature=[]) for i ...]
+  because it avoids capturing a loop index in a lambda, and removes any
+  chance of deferring the trace.
+
+  Args:
+    model: a Keras Model.
+    i: an integer between from 0 up to but to len(model.losses).
+  """
+  f = tf.function(lambda: model.losses[i])
+  _ = f.get_concrete_function()
+  return f
+
+
+def main(argv):
+  del argv
+
+  # Build a complete classifier model using a feature extractor.
+  default_hparams = dict(dropout_rate=0.25)
+  l2_strength = 0.01  # Not a hparam for inputs -> outputs.
+  feature_extractor = make_feature_extractor(l2_strength=l2_strength,
+                                             **default_hparams)
+  classifier = make_classifier(feature_extractor, l2_strength=l2_strength)
+
+  # Train the complete model.
+  (x_train, y_train), (x_test, y_test) = mnist_util.load_reshaped_data(
+      fake_tiny_data=FLAGS.fast_test_mode)
+  classifier.compile(loss=tf.keras.losses.categorical_crossentropy,
+                     optimizer=tf.keras.optimizers.SGD(),
+                     metrics=['accuracy'])
+  classifier.fit(x_train, y_train,
+                 batch_size=128,
+                 epochs=FLAGS.epochs,
+                 verbose=1,
+                 validation_data=(x_test, y_test))
+
+  # Save the feature extractor to a framework-agnostic SavedModel for reuse.
+  # Note that the feature_extractor object has not been compiled or fitted,
+  # so it does not contain an optimizer and related state.
+  exportable = wrap_keras_model_for_export(feature_extractor,
+                                           (None,) + mnist_util.INPUT_SHAPE,
+                                           set_feature_extractor_hparams,
+                                           default_hparams)
+  tf.saved_model.save(exportable, FLAGS.export_dir)
+
+
+if __name__ == '__main__':
+  # tf.enable_v2_behavior()
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py b/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py
index f1ae38b8139ba17f06d58593527e5463bec94c31..3660ed2a8a5c07355e9bd0702d800d8fe7a18510 100644
--- a/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py
+++ b/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py
@@ -33,7 +33,7 @@ def main(argv):
   del argv
 
   root = tf.train.Checkpoint()
-  # Create a cell and attach to our checkpointable.
+  # Create a cell and attach to our trackable.
   root.rnn_cell = tf.keras.layers.LSTMCell(units=10, recurrent_initializer=None)
 
   # Wrap the rnn_cell.__call__ function and assign to next_state.
diff --git a/tensorflow/examples/saved_model/integration_tests/export_simple_text_embedding.py b/tensorflow/examples/saved_model/integration_tests/export_simple_text_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..af61fa37add3be5c6f6ee628313bf7d96ed74d43
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/export_simple_text_embedding.py
@@ -0,0 +1,105 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Text embedding model stored as a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+from absl import app
+from absl import flags
+
+import tensorflow as tf
+
+# TODO(vbardiovsky): remove these when symbols are public.
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.training.tracking import tracking
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("export_dir", None, "Directory to export SavedModel.")
+
+
+def write_vocabulary_file(vocabulary):
+  """Write temporary vocab file for module construction."""
+  tmpdir = tempfile.mkdtemp()
+  vocabulary_file = os.path.join(tmpdir, "tokens.txt")
+  with tf.io.gfile.GFile(vocabulary_file, "w") as f:
+    for entry in vocabulary:
+      f.write(entry + "\n")
+  return vocabulary_file
+
+
+class TextEmbeddingModel(tf.train.Checkpoint):
+  """Text embedding model.
+
+  A text embeddings model that takes a sentences on input and outputs the
+  sentence embedding.
+  """
+
+  def __init__(self, vocabulary, emb_dim, oov_buckets):
+    super(TextEmbeddingModel, self).__init__()
+    self._oov_buckets = oov_buckets
+    self._vocabulary_file = tracking.TrackableAsset(
+        write_vocabulary_file(vocabulary))
+    self._total_size = len(vocabulary) + oov_buckets
+    self._table = lookup_ops.index_table_from_file(
+        vocabulary_file=self._vocabulary_file,
+        num_oov_buckets=self._oov_buckets,
+        hasher_spec=lookup_ops.FastHashSpec)
+    self.embeddings = tf.Variable(
+        tf.random.uniform(shape=[self._total_size, emb_dim]))
+    self.variables = [self.embeddings]
+    self.trainable_variables = self.variables
+
+  def _tokenize(self, sentences):
+    # Perform a minimalistic text preprocessing by removing punctuation and
+    # splitting on spaces.
+    normalized_sentences = tf.strings.regex_replace(
+        input=sentences, pattern=r"\pP", rewrite="")
+    normalized_sentences = tf.reshape(normalized_sentences, [-1])
+    sparse_tokens = tf.string_split(normalized_sentences, " ")
+
+    # Deal with a corner case: there is one empty sentence.
+    sparse_tokens, _ = tf.sparse.fill_empty_rows(sparse_tokens, tf.constant(""))
+    # Deal with a corner case: all sentences are empty.
+    sparse_tokens = tf.sparse.reset_shape(sparse_tokens)
+    sparse_token_ids = self._table.lookup(sparse_tokens.values)
+
+    return (sparse_tokens.indices, sparse_token_ids, sparse_tokens.dense_shape)
+
+  @tf.function(input_signature=[tf.TensorSpec([None], tf.dtypes.string)])
+  def __call__(self, sentences):
+    token_ids, token_values, token_dense_shape = self._tokenize(sentences)
+
+    return tf.nn.safe_embedding_lookup_sparse(
+        embedding_weights=self.embeddings,
+        sparse_ids=tf.SparseTensor(token_ids, token_values, token_dense_shape),
+        sparse_weights=None,
+        combiner="sqrtn")
+
+
+def main(argv):
+  del argv
+
+  vocabulary = ["cat", "is", "on", "the", "mat"]
+  module = TextEmbeddingModel(vocabulary=vocabulary, emb_dim=10, oov_buckets=10)
+  tf.saved_model.save(module, FLAGS.export_dir)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/mnist_util.py b/tensorflow/examples/saved_model/integration_tests/mnist_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e4ffda19e7bf5a3cf8c0418c8e23709d9dc69c7
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/mnist_util.py
@@ -0,0 +1,49 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Convenience wrapper around Keras' MNIST and Fashion MNIST data."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+
+INPUT_SHAPE = (28, 28, 1)
+NUM_CLASSES = 10
+
+
+def load_reshaped_data(use_fashion_mnist=False, fake_tiny_data=False):
+  """Returns MNIST or Fashion MNIST train and test data."""
+  if fake_tiny_data:
+    num_fakes = 10
+    x_train = x_test = np.zeros((num_fakes, 28, 28), dtype=np.uint8)
+    y_train = y_test = np.zeros((num_fakes,), dtype=np.int64)
+  else:
+    mnist = (tf.keras.datasets.fashion_mnist if use_fashion_mnist else
+             tf.keras.datasets.mnist)
+    (x_train, y_train), (x_test, y_test) = mnist.load_data()
+  return ((_prepare_image(x_train), _prepare_label(y_train)),
+          (_prepare_image(x_test), _prepare_label(y_test)))
+
+
+def _prepare_image(x):
+  """Converts images to [n,h,w,c] format in range [0,1]."""
+  return x[..., None].astype('float32') / 255.
+
+
+def _prepare_label(y):
+  """Conerts labels to one-hot encoding."""
+  return tf.keras.utils.to_categorical(y, NUM_CLASSES)
diff --git a/tensorflow/examples/saved_model/integration_tests/saved_model_test.py b/tensorflow/examples/saved_model/integration_tests/saved_model_test.py
index 770e809ffcd0241aec3c62339265e3e6c13f3e3c..78d2d1e2ade580dff6dc3e39bc3f6e4d47b056bd 100644
--- a/tensorflow/examples/saved_model/integration_tests/saved_model_test.py
+++ b/tensorflow/examples/saved_model/integration_tests/saved_model_test.py
@@ -59,6 +59,28 @@ class SavedModelTest(tf.test.TestCase):
     use_binary = resource_loader.get_path_to_datafile("use_rnn_cell")
     self.assertCommandSucceeded(use_binary, model_dir=export_dir)
 
+  @test_util.run_v2_only
+  def test_text_embedding_in_sequential_keras(self):
+    export_dir = self.get_temp_dir()
+    export_binary = resource_loader.get_path_to_datafile(
+        "export_simple_text_embedding")
+    self.assertCommandSucceeded(export_binary, export_dir=export_dir)
+
+    use_binary = resource_loader.get_path_to_datafile(
+        "use_model_in_sequential_keras")
+    self.assertCommandSucceeded(use_binary, model_dir=export_dir)
+
+  @test_util.run_v2_only
+  def test_mnist_cnn(self):
+    export_dir = self.get_temp_dir()
+    export_binary = resource_loader.get_path_to_datafile("export_mnist_cnn")
+    self.assertCommandSucceeded(export_binary, export_dir=export_dir,
+                                fast_test_mode="true")
+
+    use_binary = resource_loader.get_path_to_datafile("use_mnist_cnn")
+    self.assertCommandSucceeded(use_binary, export_dir=export_dir,
+                                fast_test_mode="true")
+
 if __name__ == "__main__":
   # tf.enable_v2_behavior()
   tf.test.main()
diff --git a/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py b/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e1ca33029a928e8f6a8bad1f068d9e56e71be54
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py
@@ -0,0 +1,124 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Imports a convolutional feature extractor for MNIST in SavedModel format.
+
+This program picks up the SavedModel written by export_mnist_cnn.py and
+uses the feature extractor contained in it to classification on either
+classic MNIST (digits) or Fashion MNIST (thumbnails of apparel). Optionally,
+it trains the feature extractor further as part of the new classifier.
+As expected, that makes training slower but does not help much for the
+original training dataset but helps a lot for transfer to the other dataset.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import tensorflow as tf
+
+from tensorflow.examples.saved_model.integration_tests import mnist_util
+from tensorflow.examples.saved_model.integration_tests import util
+from tensorflow.python.saved_model import load as svmd_load
+tf.saved_model.load = svmd_load.load
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string(
+    'export_dir', None,
+    'Directory of exported SavedModel.')
+flags.DEFINE_integer(
+    'epochs', 5,
+    'Number of epochs to train.')
+flags.DEFINE_bool(
+    'retrain', False,
+    'If set, the imported SavedModel is trained further.')
+flags.DEFINE_float(
+    'dropout_rate', None,
+    'If set, dropout rate passed to the SavedModel.')
+flags.DEFINE_float(
+    'regularization_loss_multiplier', None,
+    'If set, multiplier for the regularization losses in the SavedModel.')
+flags.DEFINE_bool(
+    'use_fashion_mnist', False,
+    'Use Fashion MNIST (products) instead of the real MNIST (digits). '
+    'With this, --retrain gains a lot.')
+flags.DEFINE_bool(
+    'fast_test_mode', False,
+    'Shortcut training for running in unit tests.')
+
+
+def make_classifier(feature_extractor, l2_strength=0.01, dropout_rate=0.5):
+  """Returns a Keras Model to classify MNIST using feature_extractor."""
+  regularizer = lambda: tf.keras.regularizers.l2(l2_strength)
+  net = inp = tf.keras.Input(mnist_util.INPUT_SHAPE)
+  net = feature_extractor(net)
+  net = tf.keras.layers.Dropout(dropout_rate)(net)
+  net = tf.keras.layers.Dense(mnist_util.NUM_CLASSES, activation='softmax',
+                              kernel_regularizer=regularizer())(net)
+  return tf.keras.Model(inputs=inp, outputs=net)
+
+
+def scale_regularization_losses(obj, multiplier):
+  """Scales obj.regularization_losses by multiplier if not None."""
+  if multiplier is None: return
+  def _scale_one_loss(l):  # Separate def avoids lambda capture of loop var.
+    f = tf.function(lambda: tf.multiply(multiplier, l()))
+    _ = f.get_concrete_function()
+    return f
+  obj.regularization_losses = [_scale_one_loss(l)
+                               for l in obj.regularization_losses]
+
+
+def main(argv):
+  del argv
+
+  # Load a pre-trained feature extractor and wrap it for use in Keras.
+  obj = tf.saved_model.load(FLAGS.export_dir)
+  scale_regularization_losses(obj, FLAGS.regularization_loss_multiplier)
+  arguments = {}
+  if FLAGS.dropout_rate is not None:
+    arguments['dropout_rate'] = FLAGS.dropout_rate
+  feature_extractor = util.CustomLayer(obj, output_shape=[128],
+                                       trainable=FLAGS.retrain,
+                                       arguments=arguments)
+
+  # Build a classifier with it.
+  model = make_classifier(feature_extractor)
+
+  # Train the classifier (possibly on a different dataset).
+  (x_train, y_train), (x_test, y_test) = mnist_util.load_reshaped_data(
+      use_fashion_mnist=FLAGS.use_fashion_mnist,
+      fake_tiny_data=FLAGS.fast_test_mode)
+  model.compile(loss=tf.keras.losses.categorical_crossentropy,
+                optimizer=tf.keras.optimizers.SGD(),
+                metrics=['accuracy'],
+                # TODO(arnoegw): Remove after investigating huge allocs.
+                run_eagerly=True)
+  print('Training on %s with %d trainable and %d untrainable variables.' %
+        ('Fashion MNIST' if FLAGS.use_fashion_mnist else 'MNIST',
+         len(model.trainable_variables), len(model.non_trainable_variables)))
+  model.fit(x_train, y_train,
+            batch_size=128,
+            epochs=FLAGS.epochs,
+            steps_per_epoch=3,
+            verbose=1,
+            validation_data=(x_test, y_test))
+
+
+if __name__ == '__main__':
+  # tf.enable_v2_behavior()
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py b/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c8aab8e2373fbd4ce13bb6ff22184dfefadbed7
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py
@@ -0,0 +1,73 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Load and use text embedding module in sequential Keras."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+
+import numpy as np
+
+import tensorflow as tf
+# TODO(vbardiovsky): Remove when load symbol is public.
+from tensorflow.examples.saved_model.integration_tests import util
+from tensorflow.python.saved_model.load import load
+
+tf.saved_model.load = load
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("model_dir", None, "Directory to load SavedModel from.")
+
+
+def train(fine_tuning):
+  """Build a Keras model and train with mock data."""
+  features = np.array(["my first sentence", "my second sentence"])
+  labels = np.array([1, 0])
+  dataset = tf.data.Dataset.from_tensor_slices((features, labels))
+
+  module = tf.saved_model.load(FLAGS.model_dir)
+
+  # Create the sequential keras model.
+  l = tf.keras.layers
+  model = tf.keras.Sequential()
+  model.add(l.Reshape((), batch_input_shape=[None, 1], dtype=tf.string))
+  model.add(util.CustomLayer(module, output_shape=[10], trainable=fine_tuning))
+  model.add(l.Dense(100, activation="relu"))
+  model.add(l.Dense(50, activation="relu"))
+  model.add(l.Dense(1, activation="sigmoid"))
+
+  model.compile(
+      optimizer="adam",
+      loss="binary_crossentropy",
+      metrics=["accuracy"],
+      # TODO(b/124446120): Remove after fixed.
+      run_eagerly=True)
+
+  model.fit_generator(generator=dataset.batch(1), epochs=5)
+
+
+def main(argv):
+  del argv
+
+  train(fine_tuning=False)
+  train(fine_tuning=True)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/util.py b/tensorflow/examples/saved_model/integration_tests/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4b5b62b7e0bf10fa61d32dd0f06903a2edf9e96
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/util.py
@@ -0,0 +1,104 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for integration tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+import tensorflow as tf
+
+from tensorflow.python.framework import smart_cond
+from tensorflow.python.util import tf_inspect
+
+
+# TODO(vbardiovsky): We should just reuse Keras's Lambda layer, when that
+# enables to get trainable variables.
+class CustomLayer(tf.keras.layers.Layer):
+  """Wraps callable object as a `Layer` object.
+
+  Args:
+    func: The callable object to wrap. Layer inputs are passed as the first
+      positional argument. If `func` accepts a `training` argument, a Python
+      boolean is passed for it.
+      If present, the following attributes of `func` have a special meaning:
+        * variables: a list of all tf.Variable objects that `func` depends on.
+        * trainable_variables: those elements of `variables` that are reported
+          as trainable variables of this Keras Layer.
+        * regularization_losses: a list of callables to be added as losses
+          of this Keras layer. Each one must accept zero arguments and return
+          a scalare tensor.
+    output_shape: A tuple with the (possibly partial) output shape of `func`
+      *without* leading batch size (by analogy to Dense(..., input_shape=...)).
+    trainable: Boolean controlling whether the trainable variables of `func`
+      are reported as trainable variables of this layer.
+    arguments: optionally, a dict with additional keyword arguments passed
+      to `func`.
+  """
+
+  def __init__(self, func, output_shape, trainable=False, arguments=None,
+               **kwargs):
+    # Set self._{non,}_trainable_weights before calling Layer.__init__.
+    if hasattr(func, 'trainable_variables'):
+      self._trainable_weights = [v for v in func.trainable_variables]
+      trainable_variables_set = set(func.trainable_variables)
+    else:
+      self._trainable_weights = []
+      trainable_variables_set = set()
+    if hasattr(func, 'variables'):
+      self._non_trainable_weights = [v for v in func.variables
+                                     if v not in trainable_variables_set]
+    else:
+      self._non_trainable_weights = []  # TODO(arnoegw): Infer from `func`.
+    super(CustomLayer, self).__init__(trainable=trainable, **kwargs)
+    # Prepare to call `func`.
+    self._func = func
+    self._func_fullargspec = tf_inspect.getfullargspec(func.__call__)
+    self._func_wants_training = (
+        'training' in self._func_fullargspec.args or
+        'training' in self._func_fullargspec.kwonlyargs)
+    self._arguments = arguments or {}
+    # TODO(vbardiovsky): We should be able to get the embedding dimension from
+    # the restored model.
+    self._output_shape = tuple(output_shape)
+    # Forward the callable's regularization losses (if any).
+    if hasattr(func, 'regularization_losses'):
+      for l in func.regularization_losses:
+        if not callable(l):
+          raise ValueError(
+              'CustomLayer(func) expects func.regularization_losses to be an '
+              'iterable of callables, each returning a scalar loss term.')
+        self.add_loss(l)  # Supports callables.
+
+  def call(self, x, training=None):
+    # We basically want to call this...
+    f = functools.partial(self._func, x, **self._arguments)
+    # ...but we may also have to pass a Python boolean for `training`.
+    if not self._func_wants_training:
+      result = f()
+    else:
+      if training is None:
+        training = tf.keras.backend.learning_phase()  # Could be a tensor.
+      result = smart_cond.smart_cond(training,
+                                     lambda: f(training=True),
+                                     lambda: f(training=False))
+    # TODO(vbardiovsky): Polymorphic function should return shaped tensor.
+    result.set_shape(self.compute_output_shape(x.shape))
+    return result
+
+  def compute_output_shape(self, input_shape):
+    return (input_shape[0],) + self._output_shape
diff --git a/tensorflow/examples/speech_commands/BUILD b/tensorflow/examples/speech_commands/BUILD
index ca044e57ede8c1ac95beede7c282afbf5d130609..88f7fe7faa635339f2b0ef314a71236365902d7f 100644
--- a/tensorflow/examples/speech_commands/BUILD
+++ b/tensorflow/examples/speech_commands/BUILD
@@ -63,6 +63,13 @@ tf_py_test(
 
 py_binary(
     name = "train",
+    srcs = ["train.py"],
+    srcs_version = "PY2AND3",
+    deps = [":train_main_lib"],
+)
+
+py_library(
+    name = "train_main_lib",
     srcs = [
         "train.py",
     ],
@@ -90,6 +97,13 @@ py_binary(
     name = "freeze",
     srcs = ["freeze.py"],
     srcs_version = "PY2AND3",
+    deps = [":freeze_main_lib"],
+)
+
+py_library(
+    name = "freeze_main_lib",
+    srcs = ["freeze.py"],
+    srcs_version = "PY2AND3",
     deps = [":freeze_lib"],
 )
 
@@ -122,6 +136,13 @@ py_binary(
     name = "wav_to_features",
     srcs = ["wav_to_features.py"],
     srcs_version = "PY2AND3",
+    deps = [":wav_to_features_main_lib"],
+)
+
+py_library(
+    name = "wav_to_features_main_lib",
+    srcs = ["wav_to_features.py"],
+    srcs_version = "PY2AND3",
     deps = [":wav_to_features_lib"],
 )
 
@@ -154,6 +175,13 @@ py_binary(
     name = "generate_streaming_test_wav",
     srcs = ["generate_streaming_test_wav.py"],
     srcs_version = "PY2AND3",
+    deps = [":generate_streaming_test_wav_main_lib"],
+)
+
+py_library(
+    name = "generate_streaming_test_wav_main_lib",
+    srcs = ["generate_streaming_test_wav.py"],
+    srcs_version = "PY2AND3",
     deps = [":generate_streaming_test_wav_lib"],
 )
 
@@ -201,6 +229,13 @@ py_binary(
     name = "label_wav",
     srcs = ["label_wav.py"],
     srcs_version = "PY2AND3",
+    deps = [":label_wav_main_lib"],
+)
+
+py_library(
+    name = "label_wav_main_lib",
+    srcs = ["label_wav.py"],
+    srcs_version = "PY2AND3",
     deps = [":label_wav_lib"],
 )
 
diff --git a/tensorflow/examples/udacity/README.md b/tensorflow/examples/udacity/README.md
index c8ab24871c4168eb69363a2cc99492e542ca5bec..b3bd73a08b28c10dc66a3b0019411b82709a4264 100644
--- a/tensorflow/examples/udacity/README.md
+++ b/tensorflow/examples/udacity/README.md
@@ -121,7 +121,7 @@ History
 * 0.1.0: Initial release.
 * 0.2.0: Many fixes, including lower memory footprint and support for Python 3.
 * 0.3.0: Use 0.7.1 release.
-* 0.4.0: Move notMMNIST data for Google Cloud.
+* 0.4.0: Move notMNIST data for Google Cloud.
 * 0.5.0: Actually use 0.7.1 release.
 * 0.6.0: Update to TF 0.10.0, add libjpeg (for Pillow).
 * 1.0.0: Update to TF 1.0.0 release.
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 05b8e6fd19a68ab103dee6fd533c80d0f078386a..5f07b534fa7c41b6c88627b0c20994154b64b594 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -619,139 +619,6 @@ func ScatterNd(scope *Scope, indices tf.Output, updates tf.Output, shape tf.Outp
 	return op.Output(0)
 }
 
-// QuantizeAndDequantizeV2Attr is an optional argument to QuantizeAndDequantizeV2.
-type QuantizeAndDequantizeV2Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV2SignedInput sets the optional signed_input attribute to value.
-//
-// value: Whether the quantization is signed or unsigned. (actually this parameter should
-// have been called <b>`signed_output`</b>)
-// If not specified, defaults to true
-func QuantizeAndDequantizeV2SignedInput(value bool) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
-
-// QuantizeAndDequantizeV2NumBits sets the optional num_bits attribute to value.
-//
-// value: The bitwidth of the quantization.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeV2NumBits(value int64) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// QuantizeAndDequantizeV2RangeGiven sets the optional range_given attribute to value.
-//
-// value: Whether the range is given or should be determined from the `input` tensor.
-// If not specified, defaults to false
-func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
-	}
-}
-
-// QuantizeAndDequantizeV2RoundMode sets the optional round_mode attribute to value.
-//
-// value: The 'round_mode' attribute controls which rounding tie-breaking algorithm is
-// used when rounding float values to their quantized equivalents. The following
-// rounding modes are currently supported:
-//
-// *   HALF_TO_EVEN: this is the default round_mode.
-// *   HALF_UP: round towards positive. In this mode 7.5 rounds up to 8 and -7.5
-//     rounds up to -7.
-//
-// If not specified, defaults to "HALF_TO_EVEN"
-func QuantizeAndDequantizeV2RoundMode(value string) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["round_mode"] = value
-	}
-}
-
-// Quantizes then dequantizes a tensor.
-//
-// This op simulates the precision loss from the quantized forward pass by:
-//
-// 1. Quantizing the tensor to fixed point numbers, which should match the target
-//    quantization method when it is used in inference.
-// 2. Dequantizing it back to floating point numbers for the following ops, most
-//    likely matmul.
-//
-// There are different ways to quantize. This version uses only scaling, so 0.0
-// maps to 0.
-//
-// From the specified 'num_bits' in the quantized output type, it determines
-// minimum and maximum representable quantized values.
-//
-// e.g.
-//
-// *   [-128, 127] for signed, num_bits = 8, or
-// *   [0, 255] for unsigned, num_bits = 8.
-//
-// If range_given == False, the initial input_min, input_max will be determined
-// automatically as the minimum and maximum values in the input tensor, otherwise
-// the specified values of input_min, input_max are used.
-//
-// Note: If the input_min, input_max are specified, they do not need to equal the
-// actual minimum and maximum values in the tensor. e.g. in some cases it may be
-// beneficial to specify these values such that the low probability extremes of the
-// input distribution are clipped.
-//
-// This op determines the maximum scale_factor that would map the initial
-// [input_min, input_max] range to a range that lies within the representable
-// quantized range.
-//
-// It determines the scale from one of input_min and input_max, then updates the
-// other one to maximize the respresentable range.
-//
-// e.g.
-//
-// *   if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
-//     5.0]: it would use a scale_factor of -128 / -10.0 = 12.8 In this case, it
-//     would update input_max to be 127 / 12.8 = 9.921875
-// *   if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
-//     10.0]: it would use a scale_factor of 127 / 10.0 = 12.7 In this case, it
-//     would update input_min to be 128.0 / 12.7 = -10.07874
-// *   if the output is unsigned, input_min is forced to be 0, and only the
-//     specified input_max is used.
-//
-// After determining the scale_factor and updating the input range, it applies the
-// following to each value in the 'input' tensor.
-//
-// output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor.
-//
-// The above round function rounds the value based on the given round_mode.
-//
-//
-// Arguments:
-//	input: Tensor to quantize and then dequantize.
-//	input_min: If `range_given == True`, this specifies the minimum input value that needs to
-// be represented, otherwise it is determined from the min value of the `input`
-// tensor.
-//	input_max: If `range_given == True`, this specifies the maximum input value that needs to
-// be represented, otherwise it is determined from the max value of the `input`
-// tensor.
-func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, optional ...QuantizeAndDequantizeV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV2",
-		Input: []tf.Input{
-			input, input_min, input_max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Bitcasts a tensor from one type to another without copying data.
 //
 // Given a tensor `input`, this operation returns a tensor that has the same buffer
@@ -782,49 +649,6 @@ func Bitcast(scope *Scope, input tf.Output, type_ tf.DataType) (output tf.Output
 	return op.Output(0)
 }
 
-// Extract `patches` from `images` and put them in the "depth" output dimension.
-//
-// Arguments:
-//	images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
-//	ksizes: The size of the sliding window for each dimension of `images`.
-//	strides: 1-D of length 4. How far the centers of two consecutive patches are in
-// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
-//	rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
-// input stride, specifying how far two consecutive patch samples are in the
-// input. Equivalent to extracting patches with
-// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
-// subsampling them spatially by a factor of `rates`. This is equivalent to
-// `rate` in dilated (a.k.a. Atrous) convolutions.
-//	padding: The type of padding algorithm to use.
-//
-// We specify the size-related attributes as:
-//
-// ```python
-//       ksizes = [1, ksize_rows, ksize_cols, 1]
-//       strides = [1, strides_rows, strides_cols, 1]
-//       rates = [1, rates_rows, rates_cols, 1]
-// ```
-//
-// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
-// ksize_cols * depth]` containing image patches with size
-// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
-// `out_rows` and `out_cols` are the dimensions of the output patches.
-func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "ExtractImagePatches",
-		Input: []tf.Input{
-			images,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // SpaceToDepthAttr is an optional argument to SpaceToDepth.
 type SpaceToDepthAttr func(optionalAttr)
 
@@ -1195,65 +1019,6 @@ func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddin
 	return op.Output(0)
 }
 
-// ListDiffAttr is an optional argument to ListDiff.
-type ListDiffAttr func(optionalAttr)
-
-// ListDiffOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Computes the difference between two lists of numbers or strings.
-//
-// Given a list `x` and a list `y`, this operation returns a list `out` that
-// represents all values that are in `x` but not in `y`. The returned list `out`
-// is sorted in the same order that the numbers appear in `x` (duplicates are
-// preserved). This operation also returns a list `idx` that represents the
-// position of each `out` element in `x`. In other words:
-//
-// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
-//
-// For example, given this input:
-//
-// ```
-// x = [1, 2, 3, 4, 5, 6]
-// y = [1, 3, 5]
-// ```
-//
-// This operation would return:
-//
-// ```
-// out ==> [2, 4, 6]
-// idx ==> [1, 3, 5]
-// ```
-//
-// Arguments:
-//	x: 1-D. Values to keep.
-//	y: 1-D. Values to remove.
-//
-// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
-func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ListDiff",
-		Input: []tf.Input{
-			x, y,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // Inserts a dimension of 1 into a tensor's shape.
 //
 // Given a tensor `input`, this operation inserts a dimension of 1 at the
@@ -1603,78 +1368,6 @@ func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output)
 	return op.Output(0)
 }
 
-// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
-type StridedSliceGradAttr func(optionalAttr)
-
-// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// StridedSliceGradEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
-
-// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
-
-// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
-	}
-}
-
-// Returns the gradient of `StridedSlice`.
-//
-// Since `StridedSlice` cuts out pieces of its `input` which is size
-// `shape`, its gradient will have the same shape (which is passed here
-// as `shape`). The gradient will be zero in any element that the slice
-// does not select.
-//
-// Arguments are the same as StridedSliceGrad with the exception that
-// `dy` is the input gradient to be propagated and `shape` is the
-// shape of `StridedSlice`'s `input`.
-func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StridedSliceGrad",
-		Input: []tf.Input{
-			shape, begin, end, strides, dy,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // StridedSliceAttr is an optional argument to StridedSlice.
 type StridedSliceAttr func(optionalAttr)
 
@@ -1868,37 +1561,6 @@ func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output,
 	return op.Output(0)
 }
 
-// Return a slice from 'input'.
-//
-// The output tensor is a tensor with dimensions described by 'size'
-// whose values are extracted from 'input' starting at the offsets in
-// 'begin'.
-//
-// *Requirements*:
-//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
-//
-// Arguments:
-//
-//	begin: begin[i] specifies the offset into the 'i'th dimension of
-// 'input' to slice from.
-//	size: size[i] specifies the number of elements of the 'i'th dimension
-// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
-// i are included in the slice (i.e. this is equivalent to setting
-// size[i] = input.dim_size(i) - begin[i]).
-func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Slice",
-		Input: []tf.Input{
-			input, begin, size,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // SizeAttr is an optional argument to Size.
 type SizeAttr func(optionalAttr)
 
@@ -3290,30 +2952,6 @@ func InplaceSub(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Outpu
 	return op.Output(0)
 }
 
-//     Updates specified rows with values in `v`.
-//
-//     Computes `x[i, :] = v; return x`.
-//
-// Arguments:
-//	x: A tensor of type `T`.
-//	i: A vector. Indices into the left-most dimension of `x`.
-//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
-//
-// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
-func InplaceUpdate(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InplaceUpdate",
-		Input: []tf.Input{
-			x, i, v,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Makes a copy of `x`.
 //
 // Arguments:
@@ -4434,89 +4072,31 @@ func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int6
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
-type GenerateVocabRemappingAttr func(optionalAttr)
-
-// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
-//
-// value: Number of entries in the old vocab file to consider.  If -1,
-// use the entire old vocabulary.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
-	return func(m optionalAttr) {
-		m["old_vocab_size"] = value
-	}
-}
-
-// Given a path to new and old vocabulary files, returns a remapping Tensor of
-//
-// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
-// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
-// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
-// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
-// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
-// default value of -1.
-//
-// `num_vocab_offset` enables
-// use in the partitioned variable case, and should generally be set through
-// examining partitioning info.  The format of the files should be a text file,
-// with each line containing a single entity within the vocabulary.
-//
-// For example, with `new_vocab_file` a text file containing each of the following
-// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
-// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
-// `[0, -1, 2]`.
-//
-// The op also returns a count of how many entries in the new vocabulary
-// were present in the old vocabulary, which is used to calculate the number of
-// values to initialize in a weight matrix remapping
-//
-// This functionality can be used to remap both row vocabularies (typically,
-// features) and column vocabularies (typically, classes) from TensorFlow
-// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
-// corresponding to div-partitioned variables.  Moreover, the underlying remapping
-// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
-// use the corresponding index_table_from_file() as the FeatureColumn framework
-// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
-//
-// Arguments:
-//	new_vocab_file: Path to the new vocab file.
-//	old_vocab_file: Path to the old vocab file.
-//	new_vocab_offset: How many entries into the new vocab file to start reading.
-//	num_new_vocab: Number of entries in the new vocab file to remap.
-//
-// Returns A Tensor of length num_new_vocab where the element at index i
-// is equal to the old ID that maps to the new ID i.  This element is -1 for any
-// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
-func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
+// Broadcasts a tensor value to one or more other devices.
+func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "GenerateVocabRemapping",
+		Type: "CollectiveBcastSend",
 		Input: []tf.Input{
-			new_vocab_file, old_vocab_file,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Broadcasts a tensor value to one or more other devices.
-func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+// Mutually accumulates multiple tensors of identical type and shape.
+func CollectiveGather(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "CollectiveBcastSend",
+		Type: "CollectiveGather",
 		Input: []tf.Input{
 			input,
 		},
@@ -6036,77 +5616,6 @@ func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size
 	return op.Output(0)
 }
 
-// MapUnstageAttr is an optional argument to MapUnstage.
-type MapUnstageAttr func(optionalAttr)
-
-// MapUnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageCapacity(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapUnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageContainer(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapUnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageSharedName(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns the values associated with the key
-//
-// from the underlying container.   If the underlying container
-// does not contain this key, the op will block until it does.
-func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapUnstage",
-		Input: []tf.Input{
-			key, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstage", err)
-		return
-	}
-	return values
-}
-
 // MapPeekAttr is an optional argument to MapPeek.
 type MapPeekAttr func(optionalAttr)
 
@@ -6675,170 +6184,92 @@ func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtyp
 	return op.Output(0), op.Output(1)
 }
 
-// EditDistanceAttr is an optional argument to EditDistance.
-type EditDistanceAttr func(optionalAttr)
-
-// EditDistanceNormalize sets the optional normalize attribute to value.
-//
-// value: boolean (if true, edit distances are normalized by length of truth).
-//
-// The output is:
-// If not specified, defaults to true
-func EditDistanceNormalize(value bool) EditDistanceAttr {
-	return func(m optionalAttr) {
-		m["normalize"] = value
-	}
-}
-
-// Computes the (possibly normalized) Levenshtein Edit Distance.
-//
-// The inputs are variable-length sequences provided by SparseTensors
-//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
-// and
-//   (truth_indices, truth_values, truth_shape).
-//
-// The inputs are:
-//
-// Arguments:
-//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
-// This is an N x R int64 matrix.
-//	hypothesis_values: The values of the hypothesis list SparseTensor.
-// This is an N-length vector.
-//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
-// This is an R-length vector.
-//	truth_indices: The indices of the truth list SparseTensor.
-// This is an M x R int64 matrix.
-//	truth_values: The values of the truth list SparseTensor.
-// This is an M-length vector.
-//	truth_shape: truth indices, vector.
-//
-// Returns A dense float tensor with rank R - 1.
-//
-// For the example input:
-//
-//     // hypothesis represents a 2x1 matrix with variable-length values:
-//     //   (0,0) = ["a"]
-//     //   (1,0) = ["b"]
-//     hypothesis_indices = [[0, 0, 0],
-//                           [1, 0, 0]]
-//     hypothesis_values = ["a", "b"]
-//     hypothesis_shape = [2, 1, 1]
-//
-//     // truth represents a 2x2 matrix with variable-length values:
-//     //   (0,0) = []
-//     //   (0,1) = ["a"]
-//     //   (1,0) = ["b", "c"]
-//     //   (1,1) = ["a"]
-//     truth_indices = [[0, 1, 0],
-//                      [1, 0, 0],
-//                      [1, 0, 1],
-//                      [1, 1, 0]]
-//     truth_values = ["a", "b", "c", "a"]
-//     truth_shape = [2, 2, 2]
-//     normalize = true
-//
-// The output will be:
+// Deprecated. Use TensorArrayGradV3
 //
-//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
-//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
-//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
-func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
+func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "EditDistance",
+		Type: "TensorArrayWriteV2",
 		Input: []tf.Input{
-			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
+			handle, index, value, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns 0 if x == 0, and x * log(y) otherwise, elementwise.
-func Xlogy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Deprecated. Use TensorArrayGradV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
+func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "Xlogy",
+		Type: "TensorArrayGradV2",
 		Input: []tf.Input{
-			x, y,
+			handle, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
-type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
+// TensorArrayV2Attr is an optional argument to TensorArrayV2.
+type TensorArrayV2Attr func(optionalAttr)
 
-// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
+// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["element_shape"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// If not specified, defaults to false
+func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["dynamic_size"] = value
 	}
 }
 
-// Computes the gradients of depthwise convolution with respect to the input.
-//
-// Arguments:
-//	input_sizes: An integer vector representing the shape of `input`, based
-// on `data_format`.  For example, if `data_format` is 'NHWC' then
-//  `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// If not specified, defaults to true
+func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// If not specified, defaults to ""
+func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayV3
 //
-// Returns 4-D with shape according to `data_format`.  For example, if
-// `data_format` is 'NHWC', output shape is `[batch, in_height,
-// in_width, in_channels]`.  Gradient w.r.t. the input of the
-// convolution.
-func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayV3
+func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropInput",
+		Type: "TensorArrayV2",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			size,
 		},
 		Attrs: attrs,
 	}
@@ -6846,112 +6277,82 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil
 	return op.Output(0)
 }
 
-// Returns x / y element-wise.
+// Split the data from the input value into TensorArray elements.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Div",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Selects the k nearest centers for each point.
+// Assuming that `lengths` takes on values
 //
-// Rows of points are assumed to be input points. Rows of centers are assumed to be
-// the list of candidate centers. For each point, the k centers that have least L2
-// distance to it are computed.
+//   ```(n0, n1, ..., n(T-1))```
 //
-// Arguments:
-//	points: Matrix of shape (n, d). Rows are assumed to be input points.
-//	centers: Matrix of shape (m, d). Rows are assumed to be centers.
-//	k: Number of nearest centers to return for each point. If k is larger than m, then
-// only m centers are returned.
+// and that `value` has shape
 //
-// Returns Matrix of shape (n, min(m, k)). Each row contains the indices of the centers
-// closest to the corresponding point, ordered by increasing distance.Matrix of shape (n, min(m, k)). Each row contains the squared L2 distance to the
-// corresponding center in nearest_center_indices.
-func NearestNeighbors(scope *Scope, points tf.Output, centers tf.Output, k tf.Output) (nearest_center_indices tf.Output, nearest_center_distances tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NearestNeighbors",
-		Input: []tf.Input{
-			points, centers, k,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Returns x * y element-wise.
+//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
 //
-// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// this splits values into a TensorArray with T tensors.
+//
+// TensorArray index t will be the subtensor of values with starting position
+//
+//   ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
+//
+// and having size
+//
+//   ```nt x d0 x d1 x ...```
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	value: The concatenated tensor to write to the TensorArray.
+//	lengths: The vector of lengths, how to split the rows of value into the
+// TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Mul",
+		Type: "TensorArraySplitV3",
 		Input: []tf.Input{
-			x, y,
+			handle, value, lengths, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BiasAddAttr is an optional argument to BiasAdd.
-type BiasAddAttr func(optionalAttr)
+// EmptyAttr is an optional argument to Empty.
+type EmptyAttr func(optionalAttr)
 
-// BiasAddDataFormat sets the optional data_format attribute to value.
+// EmptyInit sets the optional init attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddDataFormat(value string) BiasAddAttr {
+// value: If True, initialize the returned tensor with the default value of dtype.  Otherwise, the implementation is free not to initializethe tensor's content.
+// If not specified, defaults to false
+func EmptyInit(value bool) EmptyAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["init"] = value
 	}
 }
 
-// Adds `bias` to `value`.
+// Creates a tensor with the given shape.
 //
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+// This operation creates a tensor of `shape` and `dtype`.
 //
 // Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+//	shape: 1-D. Represents the shape of the output tensor.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
+//
+// Returns A `Tensor` of type `T`.
+func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAdd",
+		Type: "Empty",
 		Input: []tf.Input{
-			value, bias,
+			shape,
 		},
 		Attrs: attrs,
 	}
@@ -6959,269 +6360,360 @@ func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddA
 	return op.Output(0)
 }
 
-// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
-type SparseReduceSumSparseAttr func(optionalAttr)
+// TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
+type TensorArrayConcatV3Attr func(optionalAttr)
 
-// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
+// TensorArrayConcatV3ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+// value: The expected shape of an element, if known,
+// excluding the first dimension. Used to validate the shapes of
+// TensorArray elements. If this shape is not fully specified, concatenating
+// zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["element_shape_except0"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
+// Concat the elements from the TensorArray into value `value`.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-// SparseTensor.
+// Takes `T` elements of shapes
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+//   ```
+//   (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
+//   ```
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// and concatenates them into a Tensor of shape:
+//
+//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
+//
+// All elements must have the same shape (excepting the first dimension).
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	handle: The handle to a TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns All of the elements in the TensorArray, concatenated along the first
+// axis.A vector of the row sizes of the original T elements in the
+// value output.  In the example above, this would be the values:
+// `(n1, n2, ..., n(T-1))`.
+func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV3Attr) (value tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSumSparse",
+		Type: "TensorArrayConcatV3",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
-type AllCandidateSamplerAttr func(optionalAttr)
-
-// AllCandidateSamplerSeed sets the optional seed attribute to value.
+// Scatter the data from the input value into specific TensorArray elements.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// `indices` must be a vector, its length must match the first dim of `value`.
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	indices: The locations at which to write the tensor elements.
+//	value: The concatenated tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayScatterV3",
+		Input: []tf.Input{
+			handle, indices, value, flow_in,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
+type TensorArrayGatherV3Attr func(optionalAttr)
+
+// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
+// Gather specific elements from the TensorArray into output `value`.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// All elements selected by `indices` must have the same shape.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
+//	handle: The handle to a TensorArray.
+//	indices: The locations in the TensorArray from which to read tensor elements.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns All of the elements in the TensorArray, concatenated along a new
+// axis (the new dimension 0).
+func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AllCandidateSampler",
+		Type: "TensorArrayGatherV3",
 		Input: []tf.Input{
-			true_classes,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns x + y element-wise.
-//
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AddV2",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns an element-wise indication of the sign of a number.
+// Creates a TensorArray for storing multiple gradients of values in the given handle.
 //
-// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+// Similar to TensorArrayGradV3. However it creates an accumulator with an
+// expanded shape compared to the input TensorArray whose gradient is being
+// computed. This enables multiple gradients for the same TensorArray to be
+// calculated using the same accumulator.
 //
-// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-func Sign(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	shape_to_prepend: An int32 vector representing a shape. Elements in the gradient accumulator will
+// have shape which is this shape_to_prepend value concatenated with shape of the
+// elements in the TensorArray corresponding to the input handle.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradWithShape(scope *Scope, handle tf.Output, flow_in tf.Output, shape_to_prepend tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "Sign",
+		Type: "TensorArrayGradWithShape",
 		Input: []tf.Input{
-			x,
+			handle, flow_in, shape_to_prepend,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset that passes a sliding window over `input_dataset`.
+// Creates a TensorArray for storing the gradients of values in the given handle.
 //
-// Arguments:
+// If the given TensorArray gradient already exists, returns a reference to it.
 //
-//	window_size: A scalar representing the number of elements in the
-// sliding window.
-//	window_shift: A scalar representing the steps moving the sliding window
-// forward in one iteration. It must be positive.
-//	window_stride: A scalar representing the stride of the input elements of the sliding window.
-// It must be positive.
+// Locks the size of the original TensorArray by disabling its dynamic size flag.
 //
+// **A note about the input flow_in:**
 //
-func ExperimentalSlidingWindowDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// The handle flow_in forces the execution of the gradient lookup to occur
+// only after certain other operations have occurred.  For example, when
+// the forward TensorArray is dynamically sized, writes to this TensorArray
+// may resize the object.  The gradient TensorArray is statically sized based
+// on the size of the forward TensorArray when this operation executes.
+// Furthermore, the size of the forward TensorArray is frozen by this call.
+// As a result, the flow is used to ensure that the call to generate the gradient
+// TensorArray only happens after all writes are executed.
+//
+// In the case of dynamically sized TensorArrays, gradient computation should
+// only be performed on read operations that have themselves been chained via
+// flow to occur only after all writes have executed. That way the final size
+// of the forward TensorArray is known when this operation is called.
+//
+// **A note about the source attribute:**
+//
+// TensorArray gradient calls use an accumulator TensorArray object.  If
+// multiple gradients are calculated and run in the same session, the multiple
+// gradient nodes may accidentally flow through the same accumulator TensorArray.
+// This double counts and generally breaks the TensorArray gradient flow.
+//
+// The solution is to identify which gradient call this particular
+// TensorArray gradient is being called in.  This is performed by identifying
+// a unique string (e.g. "gradients", "gradients_1", ...) from the input
+// gradient Tensor's name.  This string is used as a suffix when creating
+// the TensorArray gradient object here (the attribute `source`).
+//
+// The attribute `source` is added as a suffix to the forward TensorArray's
+// name when performing the creation / lookup, so that each separate gradient
+// calculation gets its own TensorArray accumulator.
+//
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalSlidingWindowDataset",
+		Type: "TensorArrayGradV3",
 		Input: []tf.Input{
-			input_dataset, window_size, window_shift, window_stride,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns which elements of x are finite.
+// Pop the element at the top of the stack.
 //
-// @compatibility(numpy)
-// Equivalent to np.isfinite
-// @end_compatibility
-func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	handle: The handle to a stack.
+//	elem_type: The type of the elem that is popped.
+//
+// Returns The tensor that is popped from the top of the stack.
+func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	opspec := tf.OpSpec{
-		Type: "IsFinite",
+		Type: "StackPopV2",
 		Input: []tf.Input{
-			x,
+			handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
-type ResourceStridedSliceAssignAttr func(optionalAttr)
-
-// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
+// OneHotAttr is an optional argument to OneHot.
+type OneHotAttr func(optionalAttr)
 
-// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+// OneHotAxis sets the optional axis attribute to value.
+//
+// value: The axis to fill (default: -1, a new inner-most axis).
+// If not specified, defaults to -1
+func OneHotAxis(value int64) OneHotAttr {
 	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+		m["axis"] = value
 	}
 }
 
-// Assign `value` to the sliced l-value reference of `ref`.
+// Returns a one-hot tensor.
 //
-// The values of `value` are assigned to the positions in the variable
-// `ref` that are selected by the slice parameters. The slice parameters
-// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+// The locations represented by indices in `indices` take value `on_value`,
+// while all other locations take value `off_value`.
 //
-// NOTE this op currently does not support broadcasting and so `value`'s
-// shape must be exactly the shape produced by the slice of `ref`.
+// If the input `indices` is rank `N`, the output will have rank `N+1`,
+// The new axis is created at dimension `axis` (default: the new axis is
+// appended at the end).
 //
-// Returns the created operation.
-func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
+// If `indices` is a scalar the output shape will be a vector of length `depth`.
+//
+// If `indices` is a vector of length `features`, the output shape will be:
+// ```
+//   features x depth if axis == -1
+//   depth x features if axis == 0
+// ```
+//
+// If `indices` is a matrix (batch) with shape `[batch, features]`,
+// the output shape will be:
+// ```
+//   batch x features x depth if axis == -1
+//   batch x depth x features if axis == 1
+//   depth x batch x features if axis == 0
+// ```
+//
+//
+// Examples
+// =========
+//
+// Suppose that
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 5.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[4 x 3]`:
+// ```
+// output =
+//   [5.0 0.0 0.0]  // one_hot(0)
+//   [0.0 0.0 5.0]  // one_hot(2)
+//   [0.0 0.0 0.0]  // one_hot(-1)
+//   [0.0 5.0 0.0]  // one_hot(1)
+// ```
+//
+// Suppose that
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 0.0
+//   off_value = 3.0
+//   axis = 0
+// ```
+//
+// Then output is `[3 x 4]`:
+// ```
+// output =
+//   [0.0 3.0 3.0 3.0]
+//   [3.0 3.0 3.0 0.0]
+//   [3.0 3.0 3.0 3.0]
+//   [3.0 0.0 3.0 3.0]
+// //  ^                one_hot(0)
+// //      ^            one_hot(2)
+// //          ^        one_hot(-1)
+// //              ^    one_hot(1)
+// ```
+//
+// Suppose that
+// ```
+//   indices = [[0, 2], [1, -1]]
+//   depth = 3
+//   on_value = 1.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[2 x 2 x 3]`:
+// ```
+// output =
+//   [
+//     [1.0, 0.0, 0.0]  // one_hot(0)
+//     [0.0, 0.0, 1.0]  // one_hot(2)
+//   ][
+//     [0.0, 1.0, 0.0]  // one_hot(1)
+//     [0.0, 0.0, 0.0]  // one_hot(-1)
+//   ]
+// ```
+//
+// Arguments:
+//	indices: A tensor of indices.
+//	depth: A scalar defining the depth of the one hot dimension.
+//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
+//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
+//
+// Returns The one-hot tensor.
+func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7230,163 +6722,148 @@ func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, en
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceStridedSliceAssign",
+		Type: "OneHot",
 		Input: []tf.Input{
-			ref, begin, end, strides, value,
+			indices, depth, on_value, off_value,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// ArgMaxAttr is an optional argument to ArgMax.
-type ArgMaxAttr func(optionalAttr)
-
-// ArgMaxOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the index with the largest value across dimensions of a tensor.
-//
-// Note that in case of ties the identity of the return value is not guaranteed.
+// Computes the number of elements in the given queue.
 //
 // Arguments:
+//	handle: The handle to a queue.
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
+// Returns The number of elements in the given queue.
+func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ArgMax",
+		Type: "QueueSizeV2",
 		Input: []tf.Input{
-			input, dimension,
+			handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// PreventGradientAttr is an optional argument to PreventGradient.
-type PreventGradientAttr func(optionalAttr)
+// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
+type QueueDequeueManyV2Attr func(optionalAttr)
 
-// PreventGradientMessage sets the optional message attribute to value.
+// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: Will be printed in the error when anyone tries to differentiate
-// this operation.
-// If not specified, defaults to ""
-func PreventGradientMessage(value string) PreventGradientAttr {
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
 	return func(m optionalAttr) {
-		m["message"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// An identity op that triggers an error if a gradient is requested.
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// When executed in a graph, this op outputs its input tensor as-is.
+// If the queue is closed and there are fewer than `n` elements, then an
+// OutOfRange error is returned.
 //
-// When building ops to compute gradients, the TensorFlow gradient system
-// will return an error when trying to lookup the gradient of this op,
-// because no gradient must ever be registered for this function.  This
-// op exists to prevent subtle bugs from silently returning unimplemented
-// gradients in some corner cases.
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size `n` in the 0th dimension.
+//
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
+//
+// N.B. If the queue is empty, this operation will block until `n` elements
+// have been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	input: any tensor.
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns the same input tensor.
-func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PreventGradient",
+		Type: "QueueDequeueManyV2",
 		Input: []tf.Input{
-			input,
+			handle, n,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Asin",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueManyV2", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return components
 }
 
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
+// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
+type QuantizeAndDequantizeAttr func(optionalAttr)
 
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
-//
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
+// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
 // If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["signed_input"] = value
 	}
 }
 
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
-//
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
-//
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
-//
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
-//
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
-//
-// Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
+// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to false
+func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_min"] = value
+	}
+}
+
+// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_max"] = value
+	}
+}
+
+// Use QuantizeAndDequantizeV2 instead.
 //
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
+func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7395,9 +6872,9 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToDense",
+		Type: "QuantizeAndDequantize",
 		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -7405,195 +6882,254 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// Returns locations of nonzero / true values in a tensor.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// This operation returns the coordinates of true elements in `condition`. The
+// coordinates are returned in a 2-D tensor where the first dimension (rows)
+// represents the number of true elements, and the second dimension (columns)
+// represents the coordinates of the true elements. Keep in mind, the shape of
+// the output tensor can vary depending on how many true values there are in
+// `condition`. Indices are output in row-major order.
 //
 // For example:
 //
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// tf.sparse_segment_sum_with_num_segments(
-//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
-// # => [[0 0 0 0]
-// #     [0 0 0 0]
-// #     [0 0 0 0]]
-//
-// tf.sparse_segment_sum_with_num_segments(c,
-//                                         tf.constant([0, 1]),
-//                                         tf.constant([0, 2],
-//                                         num_segments=4))
-// # => [[ 1  2  3  4]
-// #     [ 0  0  0  0]
-// #     [-1 -2 -3 -4]
-// #     [ 0  0  0  0]]
 // ```
+// # 'input' tensor is [[True, False]
+// #                    [True, False]]
+// # 'input' has two true values, so output has two coordinates.
+// # 'input' has rank of 2, so coordinates have two indices.
+// where(input) ==> [[0, 0],
+//                   [1, 0]]
 //
-// Arguments:
+// # `condition` tensor is [[[True, False]
+// #                     [True, False]]
+// #                    [[False, True]
+// #                     [False, True]]
+// #                    [[False, False]
+// #                     [False, True]]]
+// # 'input' has 5 true values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
+// # `condition` tensor is [[[1.5,  0.0]
+// #                     [-0.5, 0.0]]
+// #                    [[0.0,  0.25]
+// #                     [0.0,  0.75]]
+// #                    [[0.0,  0.0]
+// #                     [0.0,  0.01]]]
+// # 'input' has 5 nonzero values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
+// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
+// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+// ```
+func Where(scope *Scope, condition tf.Output) (index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSumWithNumSegments",
+		Type: "Where",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			condition,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the determinant of one or more square matrices.
+// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
+type QueueDequeueV2Attr func(optionalAttr)
+
+// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor containing the determinants
-// for all input submatrices `[..., :, :]`.
+// value: If the queue is empty, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Dequeues a tuple of one or more tensors from the given queue.
+//
+// This operation has k outputs, where k is the number of components
+// in the tuples stored in the given queue, and output i is the ith
+// component of the dequeued tuple.
+//
+// N.B. If the queue is empty, this operation will block until an element
+// has been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	handle: The handle to a queue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns Shape is `[...]`.
-func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixDeterminant",
+		Type: "QueueDequeueV2",
 		Input: []tf.Input{
-			input,
+			handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes sin of x element-wise.
-func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Sin",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueV2", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return components
 }
 
-// Computes Psi, the derivative of Lgamma (the log of the absolute value of
+// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
+type QueueEnqueueV2Attr func(optionalAttr)
+
+// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// `Gamma(x)`), element-wise.
-func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
+// value: If the queue is full, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Enqueues a tuple of one or more tensors in the given queue.
+//
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
+//
+// N.B. If the queue is full, this operation will block until the given
+// element has been enqueued (or 'timeout_ms' elapses, if specified).
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should be taken.
+//
+// Returns the created operation.
+func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Digamma",
+		Type: "QueueEnqueueV2",
 		Input: []tf.Input{
-			x,
+			handle, tf.OutputList(components),
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
-type Conv2DBackpropFilterAttr func(optionalAttr)
+// MfccAttr is an optional argument to Mfcc.
+type MfccAttr func(optionalAttr)
 
-// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
+// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+//
+// value: The highest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 4000
+func MfccUpperFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+		m["upper_frequency_limit"] = value
 	}
 }
 
-// Conv2DBackpropFilterExplicitPaddings sets the optional explicit_paddings attribute to value.
+// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
 //
-// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
-// dimension, the amount of padding inserted before and after the dimension is
-// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
-// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
-// If not specified, defaults to <>
-func Conv2DBackpropFilterExplicitPaddings(value []int64) Conv2DBackpropFilterAttr {
+// value: The lowest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 20
+func MfccLowerFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["explicit_paddings"] = value
+		m["lower_frequency_limit"] = value
 	}
 }
 
-// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
+// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+// value: Resolution of the Mel bank used internally.
+// If not specified, defaults to 40
+func MfccFilterbankChannelCount(value int64) MfccAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["filterbank_channel_count"] = value
 	}
 }
 
-// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
+// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
+// value: How many output channels to produce per time slice.
+// If not specified, defaults to 13
+func MfccDctCoefficientCount(value int64) MfccAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["dct_coefficient_count"] = value
 	}
 }
 
-// Computes the gradients of convolution with respect to the filter.
+// Transforms a spectrogram into a form that's useful for speech recognition.
 //
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, out_channels]` tensor.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+// been effective as an input feature for machine learning. They are created by
+// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+// higher frequencies that are less significant to the human ear. They have a long
+// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+// is a good resource to learn more.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+// Arguments:
+//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
+// set to true.
+//	sample_rate: How many samples per second the source audio used.
+func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropFilter",
+		Type: "Mfcc",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			spectrogram, sample_rate,
 		},
 		Attrs: attrs,
 	}
@@ -7601,561 +7137,437 @@ func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output,
 	return op.Output(0)
 }
 
-// Returns the number of work units this Reader has finished processing.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderNumWorkUnitsCompletedV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
+type PaddingFIFOQueueV2Attr func(optionalAttr)
 
-// Creates a dataset that contains the elements of `input_dataset` ignoring errors.
-func ExperimentalIgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalIgnoreErrorsDataset",
-		Input: []tf.Input{
-			input_dataset,
-		},
-		Attrs: attrs,
+// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
+//
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types.
+// Shapes of fixed rank but variable size are allowed by setting
+// any shape dimension to -1.  In this case, the inputs' shape may vary along
+// the given dimension, and DequeueMany will pad the given dimension with
+// zeros up to the maximum shape of all elements in the given batch.
+// If the length of this attr is 0, different queue elements may have
+// different ranks and shapes, but only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shapes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the log of the absolute value of `Gamma(x)` element-wise.
-func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Lgamma",
-		Input: []tf.Input{
-			x,
-		},
+// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
+//
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
-//
-// For an explanation see "Differentiation of the Cholesky algorithm" by
-// Iain Murray http://arxiv.org/abs/1602.07527.
-//
-// Arguments:
-//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
+// PaddingFIFOQueueV2Container sets the optional container attribute to value.
 //
-// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
-func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "CholeskyGrad",
-		Input: []tf.Input{
-			l, grad,
-		},
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that emits each dim-0 slice of `components` once.
-func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TensorSliceDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-		Attrs: attrs,
+// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the index of a data point that should be added to the seed set.
+// A queue that produces elements in first-in first-out order.
 //
-// Entries in distances are assumed to be squared distances of candidate points to
-// the already sampled centers in the seed set. The op constructs one Markov chain
-// of the k-MC^2 algorithm and returns the index of one candidate point to be added
-// as an additional cluster center.
+// Variable-size shapes are allowed by setting the corresponding shape dimensions
+// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
+// size of any given element in the minibatch.  See below for details.
 //
 // Arguments:
-//	distances: Vector with squared distances to the closest previously sampled cluster center
-// for each candidate point.
-//	seed: Scalar. Seed for initializing the random number generator.
+//	component_types: The type of each component in a value.
 //
-// Returns Scalar with the index of the sampled point.
-func KMC2ChainInitialization(scope *Scope, distances tf.Output, seed tf.Output) (index tf.Output) {
+// Returns The handle to the queue.
+func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "KMC2ChainInitialization",
-		Input: []tf.Input{
-			distances, seed,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes hyperbolic sine of x element-wise.
-func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Sinh",
-		Input: []tf.Input{
-			x,
-		},
+		Type: "PaddingFIFOQueueV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
+// Interleave the values from the `data` tensors into a single tensor.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// Builds a merged tensor such that
 //
-// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
+// ```python
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// ```
 //
-// For example:
+// For example, if each `indices[m]` is scalar or vector, we have
 //
 // ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
 //
-// # Select two rows, one segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-// # => [[0 0 0 0]]
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
+// ```
 //
-// # Select two rows, two segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-// # => [[ 1  2  3  4]
-// #     [-1 -2 -3 -4]]
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
 //
-// # Select all rows, two segments.
-// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-// # => [[0 0 0 0]
-// #     [5 6 7 8]]
+//     merged.shape = [max(indices)] + constant
 //
-// # Which is equivalent to:
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
+// and `indices[n][j]`, the result may be invalid. This differs from the normal
+// DynamicStitch operator that defines the behavior in that case.
+//
+// For example:
+//
+// ```python
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
 // ```
 //
-// Arguments:
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSum",
+		Type: "ParallelDynamicStitch",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			tf.OutputList(indices), tf.OutputList(data),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes natural logarithm of x element-wise.
+// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
 //
-// I.e., \\(y = \log_e x\\).
-func Log(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Log",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Rounds the values of a tensor to the nearest integer, element-wise.
+// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
+// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
+// are placed in `outputs[i]` in lexicographic order of `js`, and the first
+// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
+// In detail,
 //
-// Rounds half to even.  Also known as bankers rounding. If you want to round
-// according to the current system rounding mode use std::cint.
-func Round(scope *Scope, x tf.Output) (y tf.Output) {
+// ```python
+//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
+//
+//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
+// ```
+//
+// `data.shape` must start with `partitions.shape`.
+//
+// For example:
+//
+// ```python
+//     # Scalar partitions.
+//     partitions = 1
+//     num_partitions = 2
+//     data = [10, 20]
+//     outputs[0] = []  # Empty with shape [0, 2]
+//     outputs[1] = [[10, 20]]
+//
+//     # Vector partitions.
+//     partitions = [0, 0, 1, 1, 0]
+//     num_partitions = 2
+//     data = [10, 20, 30, 40, 50]
+//     outputs[0] = [10, 20, 50]
+//     outputs[1] = [30, 40]
+// ```
+//
+// See `dynamic_stitch` for an example on how to merge partitions back.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
+//	num_partitions: The number of partitions to output.
+func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_partitions": num_partitions}
 	opspec := tf.OpSpec{
-		Type: "Round",
+		Type: "DynamicPartition",
 		Input: []tf.Input{
-			x,
+			data, partitions,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes reciprocal of square root of x element-wise.
-//
-// I.e., \\(y = 1 / \sqrt{x}\\).
-func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Rsqrt",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("DynamicPartition", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return outputs
 }
 
-// Get the value of the tensor specified by its handle.
+// Produces a string handle for the given MultiDeviceIterator.
 //
 // Arguments:
-//	handle: The handle for a tensor stored in the session state.
-//	dtype: The type of the output value.
+//	multi_device_iterator: A MultiDeviceIterator resource.
 //
-// Returns The tensor for the given handle.
-func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
+// Returns A string representing the resource.
+func MultiDeviceIteratorToStringHandle(scope *Scope, multi_device_iterator tf.Output) (string_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "GetSessionTensor",
+		Type: "MultiDeviceIteratorToStringHandle",
 		Input: []tf.Input{
-			handle,
+			multi_device_iterator,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient for the sqrt of `x` wrt its input.
+// Checks whether a tree has been initialized.
 //
-// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Arguments:
+//	tree_handle: Handle to the tree.
+//
+// Returns Whether the tree is initialized.
+func TensorForestTreeIsInitializedOp(scope *Scope, tree_handle tf.Output) (is_initialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SqrtGrad",
+		Type: "TensorForestTreeIsInitializedOp",
 		Input: []tf.Input{
-			y, dy,
+			tree_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixInverseAttr is an optional argument to MatrixInverse.
-type MatrixInverseAttr func(optionalAttr)
-
-// MatrixInverseAdjoint sets the optional adjoint attribute to value.
-// If not specified, defaults to false
-func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
-	return func(m optionalAttr) {
-		m["adjoint"] = value
-	}
-}
-
-// Computes the inverse of one or more square invertible matrices or their
-//
-// adjoints (conjugate transposes).
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the inverse for all input submatrices `[..., :, :]`.
-//
-// The op uses LU decomposition with partial pivoting to compute the inverses.
-//
-// If a matrix is not invertible there is no guarantee what the op does. It
-// may detect the condition and raise an exception or it may simply return a
-// garbage result.
+// Gets next element for the provided shard number.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
+//	multi_device_iterator: A MultiDeviceIterator resource.
+//	shard_num: Integer representing which shard to fetch data for.
+//	incarnation_id: Which incarnation of the MultiDeviceIterator is running.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.inv
-// @end_compatibility
-func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
+// Returns Result of the get_next on the dataset.
+func MultiDeviceIteratorGetNextFromShard(scope *Scope, multi_device_iterator tf.Output, shard_num tf.Output, incarnation_id tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "MatrixInverse",
+		Type: "MultiDeviceIteratorGetNextFromShard",
 		Input: []tf.Input{
-			input,
+			multi_device_iterator, shard_num, incarnation_id,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns x + y element-wise.
-//
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Add",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the derivative of a Gamma random sample w.r.t. `alpha`.
-func RandomGammaGrad(scope *Scope, alpha tf.Output, sample tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("MultiDeviceIteratorGetNextFromShard", err)
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "RandomGammaGrad",
-		Input: []tf.Input{
-			alpha, sample,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return components
 }
 
-// Computes square of x element-wise.
+// Initializes the multi device iterator with the given dataset.
 //
-// I.e., \\(y = x * x = x^2\\).
-func Square(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Square",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
+// Arguments:
+//	dataset: Dataset to be iterated upon.
+//	multi_device_iterator: A MultiDeviceIteratorResource.
+//	max_buffer_size: The maximum size of the host side per device buffer to keep.
 //
-// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-// ](http://arxiv.org/abs/1511.07289)
-func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
+// Returns An int64 indicating which incarnation of the MultiDeviceIterator
+// is running.
+func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_iterator tf.Output, max_buffer_size tf.Output) (incarnation_id tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Elu",
+		Type: "MultiDeviceIteratorInit",
 		Input: []tf.Input{
-			features,
+			dataset, multi_device_iterator, max_buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
+// Copy a tensor setting everything outside a central band in each innermost matrix
 //
-// I.e., \\(y = 1 / x\\).
-func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Reciprocal",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns a batched matrix tensor with new batched diagonal values.
+// to zero.
 //
-// Given `input` and `diagonal`, this operation returns a tensor with the
-// same shape and values as `input`, except for the main diagonal of the
-// innermost matrices.  These will be overwritten by the values in `diagonal`.
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
 //
-// The output is computed as follows:
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
 //
-// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
-// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
-// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
+// The indicator function
 //
-//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
-//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
 //
-// Arguments:
-//	input: Rank `k+1`, where `k >= 1`.
-//	diagonal: Rank `k`, where `k >= 1`.
+// For example:
 //
-// Returns Rank `k+1`, with `output.shape = input.shape`.
-func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixSetDiag",
-		Input: []tf.Input{
-			input, diagonal,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the element-wise max of two SparseTensors.
+// ```
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
 //
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
+//
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
+//
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+// ```
 //
 // Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMaximum",
+		Type: "MatrixBandPart",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			input, num_lower, num_upper,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
-//
-// I.e., \\(y = 1 / x\\).
-func Inv(scope *Scope, x tf.Output) (y tf.Output) {
+// Gets the next output from the given iterator as an Optional variant.
+func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (optional tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Inv",
+		Type: "IteratorGetNextAsOptional",
 		Input: []tf.Input{
-			x,
+			iterator,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that batches input elements into a SparseTensor.
-//
-// Arguments:
-//	input_dataset: A handle to an input dataset. Must have a single component.
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	row_shape: A vector representing the dense shape of each row in the produced
-// SparseTensor. The shape may be partially specified, using `-1` to indicate
-// that a particular dimension should use the maximum size of all batch elements.
-//
-//
-func ExperimentalDenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the value stored in an Optional variant or raises an error if none exists.
+func OptionalGetValue(scope *Scope, optional tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalDenseToSparseBatchDataset",
+		Type: "OptionalGetValue",
 		Input: []tf.Input{
-			input_dataset, batch_size, row_shape,
+			optional,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CastAttr is an optional argument to Cast.
-type CastAttr func(optionalAttr)
-
-// CastTruncate sets the optional Truncate attribute to value.
-// If not specified, defaults to false
-func CastTruncate(value bool) CastAttr {
-	return func(m optionalAttr) {
-		m["Truncate"] = value
-	}
-}
-
-// Cast x of type SrcT to y of DstT.
-func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"DstT": DstT}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Cast",
-		Input: []tf.Input{
-			x,
-		},
-		Attrs: attrs,
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("OptionalGetValue", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return components
 }
 
 // Outputs a tensor containing the reduction across all input tensors.
@@ -8231,61 +7643,7 @@ func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.O
 	return op.Output(0)
 }
 
-// ComplexAbsAttr is an optional argument to ComplexAbs.
-type ComplexAbsAttr func(optionalAttr)
-
-// ComplexAbsTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Computes the complex absolute value of a tensor.
-//
-// Given a tensor `x` of complex numbers, this operation returns a tensor of type
-// `float` or `double` that is the absolute value of each element in `x`. All
-// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-// value is computed as \\( \sqrt{a^2 + b^2}\\).
-func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ComplexAbs",
-		Input: []tf.Input{
-			x,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the truth value of x AND y element-wise.
-//
-// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogicalAnd",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Quantized Batch normalization.
+// Quantized Batch normalization.
 //
 // This op is deprecated and will be removed in the future. Prefer
 // `tf.nn.batch_normalization`.
@@ -8723,24 +8081,6 @@ func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
 	return op.Output(0)
 }
 
-// Returns the truth value of (x <= y) element-wise.
-//
-// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LessEqual",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes softmax activations.
 //
 // For each batch `i` and class `j` we have
@@ -8810,85 +8150,33 @@ func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (ima
 	return op.Output(0)
 }
 
-// BatchMatMulAttr is an optional argument to BatchMatMul.
-type BatchMatMulAttr func(optionalAttr)
-
-// BatchMatMulAdjX sets the optional adj_x attribute to value.
-//
-// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjX(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_x"] = value
-	}
-}
-
-// BatchMatMulAdjY sets the optional adj_y attribute to value.
-//
-// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjY(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_y"] = value
-	}
-}
-
-// Multiplies slices of two tensors in batches.
-//
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be adjointed (to adjoint a matrix
-// means to transpose and conjugate it) before multiplication by setting
-// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
-//
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
-//
-//     r_o = c_x if adj_x else r_x
-//     c_o = r_y if adj_y else c_y
-//
-// It is computed as:
-//
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
-//
-// Arguments:
-//	x: 2-D or higher with shape `[..., r_x, c_x]`.
-//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
 //
-// Returns 3-D or higher with shape `[..., r_o, c_o]`
-func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
+// ](http://arxiv.org/abs/1511.07289)
+func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "BatchMatMul",
+		Type: "Elu",
 		Input: []tf.Input{
-			x, y,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns which elements of x are NaN.
+// Computes square of x element-wise.
 //
-// @compatibility(numpy)
-// Equivalent to np.isnan
-// @end_compatibility
-func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
+// I.e., \\(y = x * x = x^2\\).
+func Square(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsNan",
+		Type: "Square",
 		Input: []tf.Input{
 			x,
 		},
@@ -8897,125 +8185,6 @@ func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Identity op for gradient debugging.
-//
-// This op is hidden from public in Python. It is used by TensorFlow Debugger to
-// register gradient tensors for gradient debugging.
-// This op operates on non-reference-type tensors.
-func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DebugGradientIdentity",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
-type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
-
-// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// var: Should be from a Variable().
-//
-// Arguments:
-//
-//	accum: Should be from a Variable().
-//	accum_update: : Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//
-// Returns the created operation.
-func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdadelta",
-		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Checks whether a tree has been initialized.
-//
-// Arguments:
-//	tree_handle: Handle to the tree.
-//
-// Returns Whether the tree is initialized.
-func TensorForestTreeIsInitializedOp(scope *Scope, tree_handle tf.Output) (is_initialized tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorForestTreeIsInitializedOp",
-		Input: []tf.Input{
-			tree_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gets next element for the provided shard number.
-//
-// Arguments:
-//	multi_device_iterator: A MultiDeviceIterator resource.
-//	shard_num: Integer representing which shard to fetch data for.
-//	incarnation_id: Which incarnation of the MultiDeviceIterator is running.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
-//
-// Returns Result of the get_next on the dataset.
-func MultiDeviceIteratorGetNextFromShard(scope *Scope, multi_device_iterator tf.Output, shard_num tf.Output, incarnation_id tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorGetNextFromShard",
-		Input: []tf.Input{
-			multi_device_iterator, shard_num, incarnation_id,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("MultiDeviceIteratorGetNextFromShard", err)
-		return
-	}
-	return components
-}
-
 // LeakyReluGradAttr is an optional argument to LeakyReluGrad.
 type LeakyReluGradAttr func(optionalAttr)
 
@@ -9054,23 +8223,6 @@ func LeakyReluGrad(scope *Scope, gradients tf.Output, features tf.Output, option
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayGradV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
-func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV2",
-		Input: []tf.Input{
-			handle, index, value, flow_in,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // LeakyReluAttr is an optional argument to LeakyRelu.
 type LeakyReluAttr func(optionalAttr)
 
@@ -9215,7 +8367,7 @@ func SdcaOptimizerV2(scope *Scope, sparse_example_indices []tf.Output, sparse_fe
 // Computes the minimum along segments of a tensor.
 //
 // Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 // for an explanation of segments.
 //
 // This operator is similar to the unsorted segment sum operator found
@@ -9229,6 +8381,15 @@ func SdcaOptimizerV2(scope *Scope, sparse_example_indices []tf.Output, sparse_fe
 // possible value for the specific numeric type,
 // `output[i] = numeric_limits<T>::max()`.
 //
+// For example:
+//
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_min(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 1,  2, 2, 1],
+// #       [5,  6, 7, 8]]
+// ```
+//
 // If the given segment ID `i` is negative, then the corresponding value is
 // dropped, and will not be included in the result.
 //
@@ -9276,28 +8437,32 @@ func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops
 	return op.Output(0)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the input.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
+type TensorArrayGatherV2Attr func(optionalAttr)
+
+// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayGatherV3
 //
-// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
-func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
+func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropInput",
+		Type: "TensorArrayGatherV2",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -9305,6 +8470,24 @@ func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, ou
 	return op.Output(0)
 }
 
+// Returns the truth value of (x == y) element-wise.
+//
+// *NOTE*: `Equal` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Equal",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Compute the polygamma function \\(\psi^{(n)}(x)\\).
 //
 // The polygamma function is defined as:
@@ -9327,35 +8510,6 @@ func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Computes second-order gradients of the maxpooling function.
-//
-// Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// input of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients of gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradWithArgmax",
-		Input: []tf.Input{
-			input, grad, argmax,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2.
 type MaxPoolGradGradV2Attr func(optionalAttr)
 
@@ -9478,6 +8632,21 @@ func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
 	return op.Output(0)
 }
 
+// Connects N inputs to an N-way replicated TPU computation.
+func TPUReplicatedInput(scope *Scope, inputs []tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TPUReplicatedInput",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AvgPool3DAttr is an optional argument to AvgPool3D.
 type AvgPool3DAttr func(optionalAttr)
 
@@ -9525,27 +8694,6 @@ func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
-//
-// the result here is consistent with a truncating divide. E.g.
-// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
-//
-// *NOTE*: `Mod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Mod",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // DepthToSpaceAttr is an optional argument to DepthToSpace.
 type DepthToSpaceAttr func(optionalAttr)
 
@@ -9734,37 +8882,20 @@ func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output
 	return op.Output(0)
 }
 
-// Computes square root of x element-wise.
-//
-// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sqrt",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
-type Conv3DBackpropFilterAttr func(optionalAttr)
+// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
+type Conv3DBackpropInputAttr func(optionalAttr)
 
-// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
+// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
 // If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
+func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
 	return func(m optionalAttr) {
 		m["dilations"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
+// Computes the gradients of 3-D convolution with respect to the input.
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
 //
 // Arguments:
 //	input: Shape `[batch, depth, rows, cols, in_channels]`.
@@ -9775,7 +8906,7 @@ func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
 //	strides: 1-D tensor of length 5. The stride of the sliding window for each
 // dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9784,7 +8915,7 @@ func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_b
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilter",
+		Type: "Conv3DBackpropInput",
 		Input: []tf.Input{
 			input, filter, out_backprop,
 		},
@@ -9794,24 +8925,6 @@ func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_b
 	return op.Output(0)
 }
 
-// Computes the gradient for the rsqrt of `x` wrt its input.
-//
-// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RsqrtGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
 type DepthwiseConv2dNativeAttr func(optionalAttr)
 
@@ -9889,78 +9002,6 @@ func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, stri
 	return op.Output(0)
 }
 
-// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
-type MaxPoolGradV2Attr func(optionalAttr)
-
-// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradV2",
-		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Restore a reader to a previously saved state.
-//
-// Not all Readers support being restored, so this can produce an
-// Unimplemented error.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	state: Result of a ReaderSerializeState of a Reader with type
-// matching reader_handle.
-//
-// Returns the created operation.
-func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderRestoreStateV2",
-		Input: []tf.Input{
-			reader_handle, state,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // MaxPoolGradAttr is an optional argument to MaxPoolGrad.
 type MaxPoolGradAttr func(optionalAttr)
 
@@ -10092,6 +9133,129 @@ func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Ou
 	return op.Output(0)
 }
 
+// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
+type Conv2DBackpropFilterAttr func(optionalAttr)
+
+// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DBackpropFilterExplicitPaddings sets the optional explicit_paddings attribute to value.
+//
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DBackpropFilterExplicitPaddings(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
+// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the filter.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, out_channels]` tensor.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv2DBackpropFilter",
+		Input: []tf.Input{
+			input, filter_sizes, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes Psi, the derivative of Lgamma (the log of the absolute value of
+//
+// `Gamma(x)`), element-wise.
+func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Digamma",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the number of work units this Reader has finished processing.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderNumWorkUnitsCompletedV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Conv2DAttr is an optional argument to Conv2D.
 type Conv2DAttr func(optionalAttr)
 
@@ -10262,36 +9426,66 @@ func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dens
 	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Reduces `input` from `num_devices` using `reduction` to a single device.
+// LoadTPUEmbeddingADAMParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingADAMParametersGradAccumDebug.
+type LoadTPUEmbeddingADAMParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingADAMParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// Reduces `input` from `num_devices` using `reduction` to a single device.
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingADAMParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingADAMParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingADAMParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingADAMParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingADAMParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load ADAM embedding parameters with debug support.
 //
-// The graph should be constructed so that all inputs have a valid device
-// assignment, and the op itself is assigned one of these devices.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
-// input: The input to the reduction.
-// data: the value of the reduction across all `num_devices` devices.
-// reduction: the reduction operation to perform.
-func NcclReduce(scope *Scope, input []tf.Output, reduction string) (data tf.Output) {
+// Arguments:
+//	parameters: Value of parameters used in the ADAM optimization algorithm.
+//	momenta: Value of momenta used in the ADAM optimization algorithm.
+//	velocities: Value of velocities used in the ADAM optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the ADAM optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingADAMParametersGradAccumDebug(scope *Scope, parameters tf.Output, momenta tf.Output, velocities tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingADAMParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"reduction": reduction}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NcclReduce",
+		Type: "LoadTPUEmbeddingADAMParametersGradAccumDebug",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			parameters, momenta, velocities, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// BiasAddGradAttr is an optional argument to BiasAddGrad.
-type BiasAddGradAttr func(optionalAttr)
+// BiasAddAttr is an optional argument to BiasAdd.
+type BiasAddAttr func(optionalAttr)
 
-// BiasAddGradDataFormat sets the optional data_format attribute to value.
+// BiasAddDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
 // default format "NHWC", the bias tensor will be added to the last dimension
@@ -10301,23 +9495,23 @@ type BiasAddGradAttr func(optionalAttr)
 // The tensor will be added to "in_channels", the third-to-the-last
 //     dimension.
 // If not specified, defaults to "NHWC"
-func BiasAddGradDataFormat(value string) BiasAddGradAttr {
+func BiasAddDataFormat(value string) BiasAddAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// The backward operation for "BiasAdd" on the "bias" tensor.
+// Adds `bias` to `value`.
 //
-// It accumulates all the values from out_backprop into the feature dimension.
-// For NHWC data format, the feature dimension is the last. For NCHW data format,
-// the feature dimension is the third-to-last.
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
 // Arguments:
-//	out_backprop: Any number of dimensions.
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
 //
-// Returns 1-D with size the feature dimension of `out_backprop`.
-func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10326,9 +9520,9 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddGrad",
+		Type: "BiasAdd",
 		Input: []tf.Input{
-			out_backprop,
+			value, bias,
 		},
 		Attrs: attrs,
 	}
@@ -10336,13 +9530,151 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 	return op.Output(0)
 }
 
-// Returns 0 if x == 0, and x / y otherwise, elementwise.
-func Xdivy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
+type SparseReduceSumSparseAttr func(optionalAttr)
+
+// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
+// SparseTensor.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Xdivy",
+		Type: "SparseReduceSumSparse",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// LoadTPUEmbeddingStochasticGradientDescentParametersAttr is an optional argument to LoadTPUEmbeddingStochasticGradientDescentParameters.
+type LoadTPUEmbeddingStochasticGradientDescentParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingStochasticGradientDescentParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingStochasticGradientDescentParametersTableId(value int64) LoadTPUEmbeddingStochasticGradientDescentParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingStochasticGradientDescentParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingStochasticGradientDescentParametersTableName(value string) LoadTPUEmbeddingStochasticGradientDescentParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load SGD embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the stochastic gradient descent optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, parameters tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingStochasticGradientDescentParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingStochasticGradientDescentParameters",
+		Input: []tf.Input{
+			parameters,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Selects the k nearest centers for each point.
+//
+// Rows of points are assumed to be input points. Rows of centers are assumed to be
+// the list of candidate centers. For each point, the k centers that have least L2
+// distance to it are computed.
+//
+// Arguments:
+//	points: Matrix of shape (n, d). Rows are assumed to be input points.
+//	centers: Matrix of shape (m, d). Rows are assumed to be centers.
+//	k: Number of nearest centers to return for each point. If k is larger than m, then
+// only m centers are returned.
+//
+// Returns Matrix of shape (n, min(m, k)). Each row contains the indices of the centers
+// closest to the corresponding point, ordered by increasing distance.Matrix of shape (n, min(m, k)). Each row contains the squared L2 distance to the
+// corresponding center in nearest_center_indices.
+func NearestNeighbors(scope *Scope, points tf.Output, centers tf.Output, k tf.Output) (nearest_center_indices tf.Output, nearest_center_distances tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NearestNeighbors",
+		Input: []tf.Input{
+			points, centers, k,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Returns x * y element-wise.
+//
+// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Mul",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -10549,261 +9881,6 @@ func KmeansPlusPlusInitialization(scope *Scope, points tf.Output, num_to_sample
 	return op.Output(0)
 }
 
-// Shuffle dimensions of x according to a permutation.
-//
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Transpose",
-		Input: []tf.Input{
-			x, perm,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
-
-// MinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the minimum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Min",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the Bessel i1e function of `x` element-wise.
-//
-// Exponentially scaled modified Bessel function of order 0 defined as
-// `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
-//
-// This function is faster and numerically stabler than `bessel_i1(x)`.
-func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BesselI1e",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MapClearAttr is an optional argument to MapClear.
-type MapClearAttr func(optionalAttr)
-
-// MapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapClearCapacity(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapClearMemoryLimit(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapClearContainer(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapClearSharedName(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
-//
-// Returns the created operation.
-func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapClear",
-
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
-
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
-//
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["field_delim"] = value
-	}
-}
-
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
-//
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
-// If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
-	}
-}
-
-// DecodeCSVNaValue sets the optional na_value attribute to value.
-//
-// value: Additional string to recognize as NA/NaN.
-// If not specified, defaults to ""
-func DecodeCSVNaValue(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["na_value"] = value
-	}
-}
-
-// DecodeCSVSelectCols sets the optional select_cols attribute to value.
-// If not specified, defaults to <>
-func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["select_cols"] = value
-	}
-}
-
-// Convert CSV records to tensors. Each column maps to one tensor.
-//
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
-//
-// Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or an empty vector if the column is
-// required.
-//
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
-		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
-	}
-	return output
-}
-
-// Convert JSON-encoded Example records to binary protocol buffer strings.
-//
-// This op translates a tensor containing Example records, encoded using
-// the [standard JSON
-// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-// into a tensor containing the same records encoded as binary protocol
-// buffers. The resulting tensor can then be fed to any of the other
-// Example-parsing ops.
-//
-// Arguments:
-//	json_examples: Each string is a JSON object serialized according to the JSON
-// mapping of the Example proto.
-//
-// Returns Each string is a binary Example protocol buffer corresponding
-// to the respective element of `json_examples`.
-func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeJSONExample",
-		Input: []tf.Input{
-			json_examples,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Transforms a Tensor into a serialized TensorProto proto.
 //
 // Arguments:
@@ -10824,21 +9901,6 @@ func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
 	return op.Output(0)
 }
 
-// Computes acos of x element-wise.
-func Acos(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Acos",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // UnbatchGradAttr is an optional argument to UnbatchGrad.
 type UnbatchGradAttr func(optionalAttr)
 
@@ -11105,409 +10167,183 @@ func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list
 	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
 }
 
-// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
-type QuantizeAndDequantizeAttr func(optionalAttr)
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
 
-// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+//
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
 // If not specified, defaults to true
-func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
-
-// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to false
-func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
-	}
-}
-
-// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["input_min"] = value
-	}
-}
-
-// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
 	return func(m optionalAttr) {
-		m["input_max"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Use QuantizeAndDequantizeV2 instead.
+// Converts a sparse representation into a dense tensor.
 //
-// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
-func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantize",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns locations of nonzero / true values in a tensor.
+// Builds an array `dense` with shape `output_shape` such that
 //
-// This operation returns the coordinates of true elements in `condition`. The
-// coordinates are returned in a 2-D tensor where the first dimension (rows)
-// represents the number of true elements, and the second dimension (columns)
-// represents the coordinates of the true elements. Keep in mind, the shape of
-// the output tensor can vary depending on how many true values there are in
-// `condition`. Indices are output in row-major order.
+// ```
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
 //
-// For example:
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
 //
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
 // ```
-// # 'input' tensor is [[True, False]
-// #                    [True, False]]
-// # 'input' has two true values, so output has two coordinates.
-// # 'input' has rank of 2, so coordinates have two indices.
-// where(input) ==> [[0, 0],
-//                   [1, 0]]
 //
-// # `condition` tensor is [[[True, False]
-// #                     [True, False]]
-// #                    [[False, True]
-// #                     [False, True]]
-// #                    [[False, False]
-// #                     [False, True]]]
-// # 'input' has 5 true values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
 //
-// # `condition` tensor is [[[1.5,  0.0]
-// #                     [-0.5, 0.0]]
-// #                    [[0.0,  0.25]
-// #                     [0.0,  0.75]]
-// #                    [[0.0,  0.0]
-// #                     [0.0,  0.01]]]
-// # 'input' has 5 nonzero values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
 //
-// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
-// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
-// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-// ```
-func Where(scope *Scope, condition tf.Output) (index tf.Output) {
+// Arguments:
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
+//
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Where",
+		Type: "SparseToDense",
 		Input: []tf.Input{
-			condition,
+			sparse_indices, output_shape, sparse_values, default_value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
-type QueueDequeueV2Attr func(optionalAttr)
+// PreventGradientAttr is an optional argument to PreventGradient.
+type PreventGradientAttr func(optionalAttr)
 
-// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// PreventGradientMessage sets the optional message attribute to value.
 //
-// value: If the queue is empty, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+// value: Will be printed in the error when anyone tries to differentiate
+// this operation.
+// If not specified, defaults to ""
+func PreventGradientMessage(value string) PreventGradientAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["message"] = value
 	}
 }
 
-// Dequeues a tuple of one or more tensors from the given queue.
+// An identity op that triggers an error if a gradient is requested.
 //
-// This operation has k outputs, where k is the number of components
-// in the tuples stored in the given queue, and output i is the ith
-// component of the dequeued tuple.
+// When executed in a graph, this op outputs its input tensor as-is.
 //
-// N.B. If the queue is empty, this operation will block until an element
-// has been dequeued (or 'timeout_ms' elapses, if specified).
+// When building ops to compute gradients, the TensorFlow gradient system
+// will return an error when trying to lookup the gradient of this op,
+// because no gradient must ever be registered for this function.  This
+// op exists to prevent subtle bugs from silently returning unimplemented
+// gradients in some corner cases.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	component_types: The type of each component in a tuple.
+//	input: any tensor.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
+// Returns the same input tensor.
+func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueV2",
+		Type: "PreventGradient",
 		Input: []tf.Input{
-			handle,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueV2", err)
-		return
-	}
-	return components
-}
-
-// ParseSequenceExampleAttr is an optional argument to ParseSequenceExample.
-type ParseSequenceExampleAttr func(optionalAttr)
-
-// ParseSequenceExampleNcontextSparse sets the optional Ncontext_sparse attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func ParseSequenceExampleNcontextSparse(value int64) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["Ncontext_sparse"] = value
-	}
-}
-
-// ParseSequenceExampleNcontextDense sets the optional Ncontext_dense attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func ParseSequenceExampleNcontextDense(value int64) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["Ncontext_dense"] = value
-	}
+	return op.Output(0)
 }
 
-// ParseSequenceExampleNfeatureListSparse sets the optional Nfeature_list_sparse attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func ParseSequenceExampleNfeatureListSparse(value int64) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["Nfeature_list_sparse"] = value
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// ParseSequenceExampleNfeatureListDense sets the optional Nfeature_list_dense attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["Nfeature_list_dense"] = value
+	opspec := tf.OpSpec{
+		Type: "Asin",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ParseSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
-//
-// value: A list of Ncontext_sparse types; the data types of data in
-// each context Feature given in context_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// Computes the sum along sparse segments of a tensor.
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_sparse_types"] = value
-	}
-}
-
-// ParseSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
-// If not specified, defaults to <>
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_types"] = value
-	}
-}
-
-// ParseSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/sparse#Segmentation)
+// for an explanation of segments.
 //
-// value: A list of Ncontext_dense shapes; the shapes of data in
-// each context Feature given in context_dense_keys.
-// The number of elements in the Feature corresponding to context_dense_key[j]
-// must always equal context_dense_shapes[j].NumEntries().
-// The shape of context_dense_values[j] will match context_dense_shapes[j].
-// If not specified, defaults to <>
+// For example:
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_dense_shapes"] = value
-	}
-}
-
-// ParseSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
 //
-// value: A list of Nfeature_list_sparse types; the data types
-// of data in each FeatureList given in feature_list_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+// # => [[0 0 0 0]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_sparse_types"] = value
-	}
-}
-
-// ParseSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2],
+//                                         num_segments=4))
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
+// #     [ 0  0  0  0]]
+// ```
 //
-// value: A list of Nfeature_list_dense shapes; the shapes of
-// data in each FeatureList given in feature_list_dense_keys.
-// The shape of each Feature in the FeatureList corresponding to
-// feature_list_dense_key[j] must always equal
-// feature_list_dense_shapes[j].NumEntries().
-// If not specified, defaults to <>
+// Arguments:
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_shapes"] = value
-	}
-}
-
-// Transforms a vector of brain.SequenceExample protos (as strings) into typed tensors.
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
 //
-// Arguments:
-//	serialized: A vector containing binary serialized SequenceExample protos.
-//	debug_name: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) name for the
-// corresponding serialized proto.  This is purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no name is available.
-//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
-// context_dense_defaults[j] provides default values
-// when the SequenceExample's context map lacks context_dense_key[j].
-// If an empty Tensor is provided for context_dense_defaults[j],
-// then the Feature context_dense_keys[j] is required.
-// The input type is inferred from context_dense_defaults[j], even when it's
-// empty.  If context_dense_defaults[j] is not empty, its shape must match
-// context_dense_shapes[j].
-//	feature_list_dense_missing_assumed_empty: A vector listing the
-// FeatureList keys which may be missing from the SequenceExamples.  If the
-// associated FeatureList is missing, it is treated as empty.  By default,
-// any FeatureList not listed in this vector must exist in the SequenceExamples.
-//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with context_sparse
-// values.
-//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' context features associated with
-// dense values.
-//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
-// (scalars).  The keys expected in the FeatureLists associated with sparse
-// values.
-//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' feature_lists associated
-// with lists of dense values.
-func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Output, context_dense_defaults []tf.Output, feature_list_dense_missing_assumed_empty []string, context_sparse_keys []string, context_dense_keys []string, feature_list_sparse_keys []string, feature_list_dense_keys []string, optional ...ParseSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output, feature_list_dense_lengths []tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"feature_list_dense_missing_assumed_empty": feature_list_dense_missing_assumed_empty, "context_sparse_keys": context_sparse_keys, "context_dense_keys": context_dense_keys, "feature_list_sparse_keys": feature_list_sparse_keys, "feature_list_dense_keys": feature_list_dense_keys}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ParseSequenceExample",
+		Type: "SparseSegmentSumWithNumSegments",
 		Input: []tf.Input{
-			serialized, debug_name, tf.OutputList(context_dense_defaults),
+			data, indices, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if feature_list_dense_lengths, idx, err = makeOutputList(op, idx, "feature_list_dense_lengths"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths
+	return op.Output(0)
 }
 
 // SparseReduceMaxAttr is an optional argument to SparseReduceMax.
@@ -11565,147 +10401,149 @@ func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Outp
 	return op.Output(0)
 }
 
-// Computes the Gauss error function of `x` element-wise.
-func Erf(scope *Scope, x tf.Output) (y tf.Output) {
+// DecodeRawAttr is an optional argument to DecodeRaw.
+type DecodeRawAttr func(optionalAttr)
+
+// DecodeRawLittleEndian sets the optional little_endian attribute to value.
+//
+// value: Whether the input `bytes` are in little-endian order.
+// Ignored for `out_type` values that are stored in a single byte like
+// `uint8`.
+// If not specified, defaults to true
+func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+	return func(m optionalAttr) {
+		m["little_endian"] = value
+	}
+}
+
+// Reinterpret the bytes of a string as a vector of numbers.
+//
+// Arguments:
+//	bytes: All the elements must have the same length.
+//
+//
+// Returns A Tensor with one more dimension than the input `bytes`.  The
+// added dimension will have size equal to the length of the elements
+// of `bytes` divided by the number of bytes to represent `out_type`.
+func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Erf",
+		Type: "DecodeRaw",
 		Input: []tf.Input{
-			x,
+			bytes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise largest integer not greater than x.
-func Floor(scope *Scope, x tf.Output) (y tf.Output) {
+// RetrieveTPUEmbeddingADAMParametersAttr is an optional argument to RetrieveTPUEmbeddingADAMParameters.
+type RetrieveTPUEmbeddingADAMParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingADAMParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingADAMParametersTableId(value int64) RetrieveTPUEmbeddingADAMParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingADAMParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingADAMParametersTableName(value string) RetrieveTPUEmbeddingADAMParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve ADAM embedding parameters.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the ADAM optimization algorithm.Parameter momenta updated by the ADAM optimization algorithm.Parameter velocities updated by the ADAM optimization algorithm.
+func RetrieveTPUEmbeddingADAMParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingADAMParametersAttr) (parameters tf.Output, momenta tf.Output, velocities tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Floor",
-		Input: []tf.Input{
-			x,
-		},
+		Type: "RetrieveTPUEmbeddingADAMParameters",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// OneHotAttr is an optional argument to OneHot.
-type OneHotAttr func(optionalAttr)
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
 
-// OneHotAxis sets the optional axis attribute to value.
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
 //
-// value: The axis to fill (default: -1, a new inner-most axis).
-// If not specified, defaults to -1
-func OneHotAxis(value int64) OneHotAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["epsilon"] = value
 	}
 }
 
-// Returns a one-hot tensor.
-//
-// The locations represented by indices in `indices` take value `on_value`,
-// while all other locations take value `off_value`.
-//
-// If the input `indices` is rank `N`, the output will have rank `N+1`,
-// The new axis is created at dimension `axis` (default: the new axis is
-// appended at the end).
-//
-// If `indices` is a scalar the output shape will be a vector of length `depth`.
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
 //
-// If `indices` is a vector of length `features`, the output shape will be:
-// ```
-//   features x depth if axis == -1
-//   depth x features if axis == 0
-// ```
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
 //
-// If `indices` is a matrix (batch) with shape `[batch, features]`,
-// the output shape will be:
-// ```
-//   batch x features x depth if axis == -1
-//   batch x depth x features if axis == 1
-//   depth x batch x features if axis == 0
-// ```
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Batch normalization.
 //
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
-// Examples
-// =========
+// Arguments:
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
 //
-// Suppose that
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 5.0
-//   off_value = 0.0
-//   axis = -1
-// ```
-//
-// Then output is `[4 x 3]`:
-// ```
-// output =
-//   [5.0 0.0 0.0]  // one_hot(0)
-//   [0.0 0.0 5.0]  // one_hot(2)
-//   [0.0 0.0 0.0]  // one_hot(-1)
-//   [0.0 5.0 0.0]  // one_hot(1)
-// ```
-//
-// Suppose that
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 0.0
-//   off_value = 3.0
-//   axis = 0
-// ```
-//
-// Then output is `[3 x 4]`:
-// ```
-// output =
-//   [0.0 3.0 3.0 3.0]
-//   [3.0 3.0 3.0 0.0]
-//   [3.0 3.0 3.0 3.0]
-//   [3.0 0.0 3.0 3.0]
-// //  ^                one_hot(0)
-// //      ^            one_hot(2)
-// //          ^        one_hot(-1)
-// //              ^    one_hot(1)
-// ```
-//
-// Suppose that
-// ```
-//   indices = [[0, 2], [1, -1]]
-//   depth = 3
-//   on_value = 1.0
-//   off_value = 0.0
-//   axis = -1
-// ```
-//
-// Then output is `[2 x 2 x 3]`:
-// ```
-// output =
-//   [
-//     [1.0, 0.0, 0.0]  // one_hot(0)
-//     [0.0, 0.0, 1.0]  // one_hot(2)
-//   ][
-//     [0.0, 1.0, 0.0]  // one_hot(1)
-//     [0.0, 0.0, 0.0]  // one_hot(-1)
-//   ]
-// ```
-//
-// Arguments:
-//	indices: A tensor of indices.
-//	depth: A scalar defining the depth of the one hot dimension.
-//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
-//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
-//
-// Returns The one-hot tensor.
-func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11714,166 +10552,174 @@ func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OneHot",
+		Type: "FusedBatchNorm",
 		Input: []tf.Input{
-			indices, depth, on_value, off_value,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// CudnnRNNAttr is an optional argument to CudnnRNN.
-type CudnnRNNAttr func(optionalAttr)
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
 
-// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNRnnMode(value string) CudnnRNNAttr {
+// RandomStandardNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
 	return func(m optionalAttr) {
-		m["rnn_mode"] = value
+		m["seed"] = value
 	}
 }
 
-// CudnnRNNInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNInputMode(value string) CudnnRNNAttr {
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
 	return func(m optionalAttr) {
-		m["input_mode"] = value
+		m["seed2"] = value
 	}
 }
 
-// CudnnRNNDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNDirection(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
+// Outputs random values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// CudnnRNNDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNDropout(value float32) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
 	}
-}
-
-// CudnnRNNSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNSeed(value int64) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+	opspec := tf.OpSpec{
+		Type: "RandomStandardNormal",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CudnnRNNSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNSeed2(value int64) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
+// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
+type FusedResizeAndPadConv2DAttr func(optionalAttr)
 
-// CudnnRNNIsTraining sets the optional is_training attribute to value.
-// If not specified, defaults to true
-func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
+// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["resize_align_corners"] = value
 	}
 }
 
-// A RNN backed by cuDNN.
+// Performs a resize and padding as a preprocess during a convolution.
 //
-// Computes the RNN from the input and initial states, with respect to the params
-// buffer.
+// It's often possible to do spatial transformations more efficiently as part of
+// the packing stage of a convolution, so this op allows for an optimized
+// implementation where these stages are fused together. This prevents the need to
+// write out the intermediate results as whole tensors, reducing memory pressure,
+// and we can get some latency gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and defaults to
+// 'NHWC' order.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//   the actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// is_training: Indicates whether this operation is used for inferenece or
-//   training.
-// reserve_space: An opaque tensor that can be used in backprop calculation. It
-//   is only produced if is_training is false.
-func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNN",
+		Type: "FusedResizeAndPadConv2D",
 		Input: []tf.Input{
-			input, input_h, input_c, params,
+			input, size, paddings, filter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// DecodeCompressedAttr is an optional argument to DecodeCompressed.
-type DecodeCompressedAttr func(optionalAttr)
+// RandomUniformAttr is an optional argument to RandomUniform.
+type RandomUniformAttr func(optionalAttr)
 
-// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
+// RandomUniformSeed sets the optional seed attribute to value.
 //
-// value: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-// If not specified, defaults to ""
-func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformSeed(value int64) RandomUniformAttr {
 	return func(m optionalAttr) {
-		m["compression_type"] = value
+		m["seed"] = value
 	}
 }
 
-// Decompress strings.
+// RandomUniformSeed2 sets the optional seed2 attribute to value.
 //
-// This op decompresses each element of the `bytes` input `Tensor`, which
-// is assumed to be compressed using the given `compression_type`.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformSeed2(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a uniform distribution.
 //
-// The `output` is a string `Tensor` of the same shape as `bytes`,
-// each element containing the decompressed data from the corresponding
-// element in `bytes`.
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
 // Arguments:
-//	bytes: A Tensor of string which is compressed.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns A Tensor with the same shape as input `bytes`, uncompressed
-// from bytes.
-func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random values.
+func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCompressed",
+		Type: "RandomUniform",
 		Input: []tf.Input{
-			bytes,
+			shape,
 		},
 		Attrs: attrs,
 	}
@@ -11881,115 +10727,186 @@ func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompresse
 	return op.Output(0)
 }
 
-// DecodeRawAttr is an optional argument to DecodeRaw.
-type DecodeRawAttr func(optionalAttr)
+// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
+type ResourceApplyFtrlAttr func(optionalAttr)
 
-// DecodeRawLittleEndian sets the optional little_endian attribute to value.
+// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
-// value: Whether the input `bytes` are in little-endian order.
-// Ignored for `out_type` values that are stored in a single byte like
-// `uint8`.
-// If not specified, defaults to true
-func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
 	return func(m optionalAttr) {
-		m["little_endian"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Reinterpret the bytes of a string as a vector of numbers.
+// Update '*var' according to the Ftrl-proximal scheme.
 //
-// Arguments:
-//	bytes: All the elements must have the same length.
+// accum_new = accum + grad * grad
+// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 regulariation. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
-// Returns A Tensor with one more dimension than the input `bytes`.  The
-// added dimension will have size equal to the length of the elements
-// of `bytes` divided by the number of bytes to represent `out_type`.
-func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeRaw",
+		Type: "ResourceApplyFtrl",
 		Input: []tf.Input{
-			bytes,
+			var_, accum, linear, grad, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes natural logarithm of (1 + x) element-wise.
+// Transforms a vector of brain.Example protos (as strings) into typed tensors.
 //
-// I.e., \\(y = \log_e (1 + x)\\).
-func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	names: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) names for the
+// corresponding serialized protos.  These are purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no names are available.
+// If non-empty, this vector must be the same length as "serialized".
+//	sparse_keys: A list of Nsparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples' features associated with dense values.
+//	dense_defaults: A list of Ndense Tensors (some may be empty).
+// dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	sparse_types: A list of Nsparse types; the data types of data in each Feature
+// given in sparse_keys.
+// Currently the ParseExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
+// given in dense_keys.
+// The number of elements in the Feature corresponding to dense_key[j]
+// must always equal dense_shapes[j].NumEntries().
+// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+// The dense outputs are just the inputs row-stacked by batch.
+// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+// the shape of the output Tensor dense_values[j] will be
+// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+// of elements of length D1 * .... * DN, across all minibatch entries
+// in the input.  Any minibatch entry with less than M blocks of elements of
+// length D1 * ... * DN will be padded with the corresponding default_value
+// scalar element along the second dimension.
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "Log1p",
+		Type: "ParseExample",
 		Input: []tf.Input{
-			x,
+			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// Computes rectified linear 6 gradients for a Relu6 operation.
+// Compute the pairwise cross product.
+//
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
-//	features: The features passed as input to the corresponding Relu6 operation, or
-// its output; using either one produces the same result.
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
 //
-// Returns The gradients:
-// `gradients * (features > 0) * (features < 6)`.
-func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu6Grad",
+		Type: "Cross",
 		Input: []tf.Input{
-			gradients, features,
+			a, b,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBicubicAttr is an optional argument to ResizeBicubic.
-type ResizeBicubicAttr func(optionalAttr)
+// StatefulStandardNormalV2Attr is an optional argument to StatefulStandardNormalV2.
+type StatefulStandardNormalV2Attr func(optionalAttr)
 
-// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
+// StatefulStandardNormalV2Dtype sets the optional dtype attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatefulStandardNormalV2Dtype(value tf.DataType) StatefulStandardNormalV2Attr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["dtype"] = value
 	}
 }
 
-// Resize `images` to `size` using bicubic interpolation.
+// Outputs random values from a normal distribution.
 //
-// Input images can be of different types but output images are always float.
+// The generated values will have mean 0 and standard deviation 1.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	algorithm: The RNG algorithm.
+//	shape: The shape of the output tensor.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+// Returns A tensor of the specified shape filled with random normal values.
+func StatefulStandardNormalV2(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulStandardNormalV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11998,9 +10915,9 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubic",
+		Type: "StatefulStandardNormalV2",
 		Input: []tf.Input{
-			images, size,
+			resource, algorithm, shape,
 		},
 		Attrs: attrs,
 	}
@@ -12008,303 +10925,231 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R
 	return op.Output(0)
 }
 
-// Gather ragged slices from `params` axis `0` according to `indices`.
+// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
 //
-// Outputs a `RaggedTensor` output composed from `output_dense_values` and
-// `output_nested_splits`, such that:
+// is alive, any other request to use `MutexLock` with this mutex will wait.
+//
+// This is particularly useful for creating a critical section when used in
+// conjunction with `MutexLockIdentity`:
 //
 // ```python
-// output.shape = indices.shape + params.shape[1:]
-// output.ragged_rank = indices.shape.ndims + params.ragged_rank
-// output[i...j, d0...dn] = params[indices[i...j], d0...dn]
-// ```
 //
-// where
+// mutex = mutex_v2(
+//   shared_name=handle_name, container=container, name=name)
 //
-// * `params =
-//    ragged.from_nested_row_splits(params_dense_values, params_nested_splits)`
-//    provides the values that should be gathered.
-// * `indices` ia a dense tensor with dtype `int32` or `int64`, indicating which
-//    values should be gathered.
-// * `output =
-//    ragged.from_nested_row_splits(output_dense_values, output_nested_splits)`
-//    is the output tensor.
+// def execute_in_critical_section(fn, *args, **kwargs):
+//   lock = gen_resource_variable_ops.mutex_lock(mutex)
 //
-// (Note: This c++ op is used to implement the higher-level python
-// `tf.ragged.gather` op, which also supports ragged indices.)
+//   with ops.control_dependencies([lock]):
+//     r = fn(*args, **kwargs)
 //
+//   with ops.control_dependencies(nest.flatten(r)):
+//     with ops.colocate_with(mutex):
+//       ensure_lock_exists = mutex_lock_identity(lock)
 //
-// Arguments:
-//	params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the
-// `params` RaggedTensor input.
-//	params_dense_values: The `flat_values` for the `params` RaggedTensor. There was a terminology change
-// at the python level from dense_values to flat_values, so dense_values is the
-// deprecated name.
-//	indices: Indices in the outermost dimension of `params` of the values that should be
-// gathered.
-//	OUTPUT_RAGGED_RANK: The ragged rank of the output RaggedTensor. `output_nested_splits` will contain
-// this number of `row_splits` tensors. This value should equal
-// `indices.shape.ndims + params.ragged_rank - 1`.
+//     # Make sure that if any element of r is accessed, all of
+//     # them are executed together.
+//     r = nest.map_structure(tf.identity, r)
 //
-// Returns The `nested_row_splits` tensors that define the row-partitioning for the
-// returned RaggedTensor.The `flat_values` for the returned RaggedTensor.
-func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"OUTPUT_RAGGED_RANK": OUTPUT_RAGGED_RANK}
-	opspec := tf.OpSpec{
-		Type: "RaggedGather",
-		Input: []tf.Input{
-			tf.OutputList(params_nested_splits), params_dense_values, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil {
-		scope.UpdateErr("RaggedGather", err)
-		return
-	}
-	output_dense_values = op.Output(idx)
-	return output_nested_splits, output_dense_values
-}
-
-// Greedily selects a subset of bounding boxes in descending order of score,
+//   with ops.control_dependencies([ensure_lock_exists]):
+//     return nest.map_structure(tf.identity, r)
+// ```
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
+// While `fn` is running in the critical section, no other functions which wish to
+// use this critical section may run.
 //
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
+// Often the use case is that two executions of the same graph, in parallel,
+// wish to run `fn`; and we wish to ensure that only one of them executes
+// at a time.  This is especially important if `fn` modifies one or more
+// variables at a time.
 //
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// It is also useful if two separate functions must share a resource, but we
+// wish to ensure the usage is exclusive.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
+//	mutex: The mutex resource to lock.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
+// Returns A tensor that keeps a shared pointer to a lock on the mutex;
+// when the Tensor is destroyed, the use count on the shared pointer is decreased
+// by 1.  When it reaches 0, the lock is released.
+func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV2",
+		Type: "MutexLock",
 		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold,
+			mutex,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Subtracts sparse updates from the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] -= updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] -= updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	serialized: A scalar string containing a serialized TensorProto proto.
+//	out_type: The type of the serialized tensor.  The provided type must match the
+// type of the serialized tensor and no implicit conversion will take place.
 //
-// Returns the created operation.
-func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// Returns A Tensor of type `out_type`.
+func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterSub",
+		Type: "ParseTensor",
 		Input: []tf.Input{
-			resource, indices, updates,
+			serialized,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Converts a `RaggedTensor` into a `SparseTensor` with the same values.
+// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
+type MaxPoolWithArgmaxAttr func(optionalAttr)
+
+// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
+// If not specified, defaults to DT_INT64
+func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+	return func(m optionalAttr) {
+		m["Targmax"] = value
+	}
+}
+
+// Performs max pooling on the input and outputs both max values and indices.
 //
-// input=ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
-// output=SparseTensor(indices=sparse_indices, values=sparse_values,
-//                     dense_shape=sparse_dense_shape)
+// The indices in `argmax` are flattened, so that a maximum value at position
+// `[b, y, x, c]` becomes flattened index
+// `((b * height + y) * width + x) * channels + c`.
+//
+// The indices returned are always in `[0, height) x [0, width)` before flattening,
+// even if padding is involved and the mathematically correct answer is outside
+// (either negative or too large).  This is a bug, but fixing it is difficult to do
+// in a safe backwards compatible way, especially due to flattening.
 //
 // Arguments:
-//	rt_nested_splits: The `row_splits` for the `RaggedTensor`.
-//	rt_dense_values: The `flat_values` for the `RaggedTensor`.
+//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The indices for the `SparseTensor`.The values of the `SparseTensor`.`sparse_dense_shape` is a tight bounding box of the input `RaggedTensor`.
-func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output) (sparse_indices tf.Output, sparse_values tf.Output, sparse_dense_shape tf.Output) {
+// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
+func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RaggedTensorToSparse",
+		Type: "MaxPoolWithArgmax",
 		Input: []tf.Input{
-			tf.OutputList(rt_nested_splits), rt_dense_values,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// Check if the input matches the regex pattern.
-//
-// The input is a string tensor of any shape. The pattern is a scalar
-// string tensor which is applied to every element of the input tensor.
-// The boolean values (True or False) of the output tensor indicate
-// if the input matches the regex pattern provided.
+// Identity transformation that models performance.
 //
-// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+// Identity transformation that models performance.
 //
 // Arguments:
-//	input: A string tensor of the text to be processed.
-//	pattern: A scalar string tensor containing the regular expression to match the input.
+//	input_dataset: A variant tensor representing the input dataset.
 //
-// Returns A bool tensor with the same shape as `input`.
-func RegexFullMatch(scope *Scope, input tf.Output, pattern tf.Output) (output tf.Output) {
+//
+func ModelDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RegexFullMatch",
+		Type: "ModelDataset",
 		Input: []tf.Input{
-			input, pattern,
+			input_dataset,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
-//
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// Fast Fourier transform.
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// Computes the 1-dimensional discrete Fourier transform over the inner-most
+// dimension of `input`.
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+//	input: A complex tensor.
 //
-// Returns Computed precision at `k` as a `bool Tensor`.
-func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
+// Returns A complex tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft
+// @end_compatibility
+func FFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "InTopKV2",
+		Type: "FFT",
 		Input: []tf.Input{
-			predictions, targets, k,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomShuffleAttr is an optional argument to RandomShuffle.
-type RandomShuffleAttr func(optionalAttr)
-
-// RandomShuffleSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomShuffleSeed(value int64) RandomShuffleAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// MaxPoolAttr is an optional argument to MaxPool.
+type MaxPoolAttr func(optionalAttr)
 
-// RandomShuffleSeed2 sets the optional seed2 attribute to value.
+// MaxPoolDataFormat sets the optional data_format attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolDataFormat(value string) MaxPoolAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Randomly shuffles a tensor along its first dimension.
-//
-//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-//   to one and only one `output[i]`. For example, a mapping that might occur for a
-//   3x2 tensor is:
-//
-// ```
-// [[1, 2],       [[5, 6],
-//  [3, 4],  ==>   [1, 2],
-//  [5, 6]]        [3, 4]]
-// ```
+// Performs max pooling on the input.
 //
 // Arguments:
-//	value: The tensor to be shuffled.
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A tensor of same shape and type as `value`, shuffled along its first
-// dimension.
-func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffle",
+		Type: "MaxPool",
 		Input: []tf.Input{
-			value,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -12312,253 +11157,315 @@ func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr)
 	return op.Output(0)
 }
 
-// Computes sigmoid of `x` element-wise.
+// Multiplies sparse updates into the variable referenced by `resource`.
 //
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] *= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] *= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sigmoid",
+		Type: "ResourceScatterMul",
 		Input: []tf.Input{
-			x,
+			resource, indices, updates,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
-
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
+// Subtracts sparse updates from the variable referenced by `resource`.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+// This operation computes
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+//     # Scalar indices
+//     ref[indices, ...] -= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] -= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterSub",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+// Adds sparse updates to the variable referenced by `resource`.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] += updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] += updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterAdd",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Batch normalization.
+// Reads the value of a variable.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// The tensor returned by this operation is immutable.
 //
-// Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	dtype: the dtype of the value.
+func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
+		Type: "ReadVariableOp",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			resource,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
-
-// RandomStandardNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
+type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
 
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Outputs random values from a normal distribution.
+// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// prox_v = var
+// prox_v -= lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
+		Type: "ResourceSparseApplyProximalAdagrad",
 		Input: []tf.Input{
-			shape,
+			var_, accum, lr, l1, l2, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
-type FusedResizeAndPadConv2DAttr func(optionalAttr)
+// DecodeJpegAttr is an optional argument to DecodeJpeg.
+type DecodeJpegAttr func(optionalAttr)
 
-// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
+// DecodeJpegChannels sets the optional channels attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeJpegChannels(value int64) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["resize_align_corners"] = value
+		m["channels"] = value
 	}
 }
 
-// Performs a resize and padding as a preprocess during a convolution.
+// DecodeJpegRatio sets the optional ratio attribute to value.
 //
-// It's often possible to do spatial transformations more efficiently as part of
-// the packing stage of a convolution, so this op allows for an optimized
-// implementation where these stages are fused together. This prevents the need to
-// write out the intermediate results as whole tensors, reducing memory pressure,
-// and we can get some latency gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and defaults to
-// 'NHWC' order.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeJpegRatio(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "FusedResizeAndPadConv2D",
-		Input: []tf.Input{
-			input, size, paddings, filter,
-		},
-		Attrs: attrs,
+}
+
+// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// RandomUniformAttr is an optional argument to RandomUniform.
-type RandomUniformAttr func(optionalAttr)
+// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
 
-// RandomUniformSeed sets the optional seed attribute to value.
+// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformSeed(value int64) RandomUniformAttr {
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["acceptable_fraction"] = value
 	}
 }
 
-// RandomUniformSeed2 sets the optional seed2 attribute to value.
+// DecodeJpegDctMethod sets the optional dct_method attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformSeed2(value int64) RandomUniformAttr {
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeJpegDctMethod(value string) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["dct_method"] = value
 	}
 }
 
-// Outputs random values from a uniform distribution.
+// Decode a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
 //
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+//
+// This op also supports decoding PNGs and non-animated GIFs since the interface is
+// the same, though it is cleaner to use `tf.image.decode_image`.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	contents: 0-D.  The JPEG-encoded image.
 //
-// Returns A tensor of the specified shape filled with uniform random values.
-func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniform",
+		Type: "DecodeJpeg",
 		Input: []tf.Input{
-			shape,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -12566,104 +11473,144 @@ func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ..
 	return op.Output(0)
 }
 
-// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
-type ResourceApplyFtrlAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
+type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
 
-// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["data_format"] = value
 	}
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
+// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
 //
-// accum_new = accum + grad * grad
-// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of depthwise convolution with respect to the input.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 regulariation. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+//	input_sizes: An integer vector representing the shape of `input`, based
+// on `data_format`.  For example, if `data_format` is 'NHWC' then
+//  `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+// Returns 4-D with shape according to `data_format`.  For example, if
+// `data_format` is 'NHWC', output shape is `[batch, in_height,
+// in_width, in_channels]`.  Gradient w.r.t. the input of the
+// convolution.
+func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrl",
+		Type: "DepthwiseConv2dNativeBackpropInput",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, lr_power,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes exponential of x element-wise.  \\(y = e^x\\).
-func Exp(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Exp",
-		Input: []tf.Input{
-			x,
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// NthElementAttr is an optional argument to NthElement.
-type NthElementAttr func(optionalAttr)
+// EditDistanceAttr is an optional argument to EditDistance.
+type EditDistanceAttr func(optionalAttr)
 
-// NthElementReverse sets the optional reverse attribute to value.
+// EditDistanceNormalize sets the optional normalize attribute to value.
 //
-// value: When set to True, find the nth-largest value in the vector and vice
-// versa.
-// If not specified, defaults to false
-func NthElementReverse(value bool) NthElementAttr {
+// value: boolean (if true, edit distances are normalized by length of truth).
+//
+// The output is:
+// If not specified, defaults to true
+func EditDistanceNormalize(value bool) EditDistanceAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["normalize"] = value
 	}
 }
 
-// Finds values of the `n`-th order statistic for the last dimension.
-//
-// If the input is a vector (rank-1), finds the entries which is the nth-smallest
-// value in the vector and outputs their values as scalar tensor.
+// Computes the (possibly normalized) Levenshtein Edit Distance.
 //
-// For matrices (resp. higher rank input), computes the entries which is the
-// nth-smallest value in each row (resp. vector along the last dimension). Thus,
+// The inputs are variable-length sequences provided by SparseTensors
+//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
+// and
+//   (truth_indices, truth_values, truth_shape).
 //
-//     values.shape = input.shape[:-1]
+// The inputs are:
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `n+1`.
-//	n: 0-D. Position of sorted vector to select along the last dimension (along
-// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
+//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
+// This is an N x R int64 matrix.
+//	hypothesis_values: The values of the hypothesis list SparseTensor.
+// This is an N-length vector.
+//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
+// This is an R-length vector.
+//	truth_indices: The indices of the truth list SparseTensor.
+// This is an M x R int64 matrix.
+//	truth_values: The values of the truth list SparseTensor.
+// This is an M-length vector.
+//	truth_shape: truth indices, vector.
 //
-// Returns The `n`-th order statistic along each last dimensional slice.
-func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
+// Returns A dense float tensor with rank R - 1.
+//
+// For the example input:
+//
+//     // hypothesis represents a 2x1 matrix with variable-length values:
+//     //   (0,0) = ["a"]
+//     //   (1,0) = ["b"]
+//     hypothesis_indices = [[0, 0, 0],
+//                           [1, 0, 0]]
+//     hypothesis_values = ["a", "b"]
+//     hypothesis_shape = [2, 1, 1]
+//
+//     // truth represents a 2x2 matrix with variable-length values:
+//     //   (0,0) = []
+//     //   (0,1) = ["a"]
+//     //   (1,0) = ["b", "c"]
+//     //   (1,1) = ["a"]
+//     truth_indices = [[0, 1, 0],
+//                      [1, 0, 0],
+//                      [1, 0, 1],
+//                      [1, 1, 0]]
+//     truth_values = ["a", "b", "c", "a"]
+//     truth_shape = [2, 2, 2]
+//     normalize = true
+//
+// The output will be:
+//
+//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
+//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
+//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
+func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12672,9 +11619,9 @@ func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthEleme
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NthElement",
+		Type: "EditDistance",
 		Input: []tf.Input{
-			input, n,
+			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
 		},
 		Attrs: attrs,
 	}
@@ -12682,103 +11629,69 @@ func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthEleme
 	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// This operator is similar to the unsorted segment sum operator found
-// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum such that:
-//
-// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such
-// that `segment_ids[j...] == i`.
-//
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest
-// possible value for the specific numeric type,
-// `output[i] = numeric_limits<T>::lowest()`.
-//
-// If the given segment ID `i` is negative, then the corresponding value is
-// dropped, and will not be included in the result.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
-// </div>
+// Returns 0 if x == 0, and x * log(y) otherwise, elementwise.
+func Xlogy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Xlogy",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Stops gradient computation.
 //
-// Arguments:
+// When executed in a graph, this op outputs its input tensor as-is.
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+// When building ops to compute gradients, this op prevents the contribution of
+// its inputs to be taken into account.  Normally, the gradient generator adds ops
+// to a graph to compute the derivatives of a specified 'loss' by recursively
+// finding out inputs that contributed to its computation.  If you insert this op
+// in the graph it inputs are masked from the gradient generator.  They are not
+// taken into account for computing gradients.
 //
+// This is useful any time you want to compute a value with TensorFlow but need
+// to pretend that the value was a constant. Some examples include:
 //
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// *  The *EM* algorithm where the *M-step* should not involve backpropagation
+//    through the output of the *E-step*.
+// *  Contrastive divergence training of Boltzmann machines where, when
+//    differentiating the energy function, the training must not backpropagate
+//    through the graph that generated the samples from the model.
+// *  Adversarial training, where no backprop should happen through the adversarial
+//    example generation process.
+func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMax",
+		Type: "StopGradient",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Transforms a vector of brain.Example protos (as strings) into typed tensors.
+// Eagerly executes a python function to compute func(input)->output. The
 //
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	names: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) names for the
-// corresponding serialized protos.  These are purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no names are available.
-// If non-empty, this vector must be the same length as "serialized".
-//	sparse_keys: A list of Nsparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples' features associated with dense values.
-//	dense_defaults: A list of Ndense Tensors (some may be empty).
-// dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	sparse_types: A list of Nsparse types; the data types of data in each Feature
-// given in sparse_keys.
-// Currently the ParseExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-// given in dense_keys.
-// The number of elements in the Feature corresponding to dense_key[j]
-// must always equal dense_shapes[j].NumEntries().
-// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-// The dense outputs are just the inputs row-stacked by batch.
-// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-// the shape of the output Tensor dense_values[j] will be
-// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-// of elements of length D1 * .... * DN, across all minibatch entries
-// in the input.  Any minibatch entry with less than M blocks of elements of
-// length D1 * ... * DN will be padded with the corresponding default_value
-// scalar element along the second dimension.
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+// semantics of the input, output, and attributes are the same as those for
+// PyFunc.
+func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	attrs := map[string]interface{}{"token": token, "Tout": Tout}
 	opspec := tf.OpSpec{
-		Type: "ParseExample",
+		Type: "EagerPyFunc",
 		Input: []tf.Input{
-			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
+			tf.OutputList(input),
 		},
 		Attrs: attrs,
 	}
@@ -12788,169 +11701,107 @@ func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_ke
 	}
 	var idx int
 	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("EagerPyFunc", err)
 		return
 	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
+	return output
 }
 
-// Compute the pairwise cross product.
+// Concats all tensors in the list along the 0th dimension.
 //
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
+// Requires that all tensors have the same shape except the first dimension.
 //
-// Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
+// input_handle: The input list.
+// element_shape: The shape of the uninitialized elements in the list. If the first
+//   dimension is not -1, it is assumed that all list elements have the same
+//   leading dim.
+// leading_dims: The list of leading dims of uninitialized list elements. Used if
+//   the leading dim of input_handle.element_shape or the element_shape input arg
+//   is not already set.
+// tensor: The concated result.
+// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
 //
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+func TensorListConcatV2(scope *Scope, input_handle tf.Output, element_shape tf.Output, leading_dims tf.Output, element_dtype tf.DataType) (tensor tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "Cross",
+		Type: "TensorListConcatV2",
 		Input: []tf.Input{
-			a, b,
+			input_handle, element_shape, leading_dims,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// StatefulStandardNormalAttr is an optional argument to StatefulStandardNormal.
-type StatefulStandardNormalAttr func(optionalAttr)
+// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
+type MatrixTriangularSolveAttr func(optionalAttr)
 
-// StatefulStandardNormalDtype sets the optional dtype attribute to value.
+// MatrixTriangularSolveLower sets the optional lower attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatefulStandardNormalDtype(value tf.DataType) StatefulStandardNormalAttr {
+// value: Boolean indicating whether the innermost matrices in `matrix` are
+// lower or upper triangular.
+// If not specified, defaults to true
+func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["lower"] = value
 	}
 }
 
-// Outputs random values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
+// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
 //
-// Arguments:
-//	resource: The handle of the resource variable that stores the state of the RNG.
-//	shape: The shape of the output tensor.
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+//          adjoint.
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func StatefulStandardNormal(scope *Scope, resource tf.Output, shape tf.Output, optional ...StatefulStandardNormalAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StatefulStandardNormal",
-		Input: []tf.Input{
-			resource, shape,
-		},
-		Attrs: attrs,
+// @compatibility(numpy)
+// Equivalent to scipy.linalg.solve_triangular
+// @end_compatibility
+// If not specified, defaults to false
+func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
+// Solves systems of linear equations with upper or lower triangular matrices by
 //
-// is alive, any other request to use `MutexLock` with this mutex will wait.
+// backsubstitution.
 //
-// This is particularly useful for creating a critical section when used in
-// conjunction with `MutexLockIdentity`:
-//
-// ```python
-//
-// mutex = mutex_v2(
-//   shared_name=handle_name, container=container, name=name)
-//
-// def execute_in_critical_section(fn, *args, **kwargs):
-//   lock = gen_resource_variable_ops.mutex_lock(mutex)
-//
-//   with ops.control_dependencies([lock]):
-//     r = fn(*args, **kwargs)
-//
-//   with ops.control_dependencies(nest.flatten(r)):
-//     with ops.colocate_with(mutex):
-//       ensure_lock_exists = mutex_lock_identity(lock)
-//
-//     # Make sure that if any element of r is accessed, all of
-//     # them are executed together.
-//     r = nest.map_structure(tf.identity, r)
-//
-//   with ops.control_dependencies([ensure_lock_exists]):
-//     return nest.map_structure(tf.identity, r)
-// ```
-//
-// While `fn` is running in the critical section, no other functions which wish to
-// use this critical section may run.
-//
-// Often the use case is that two executions of the same graph, in parallel,
-// wish to run `fn`; and we wish to ensure that only one of them executes
-// at a time.  This is especially important if `fn` modifies one or more
-// variables at a time.
+// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+// square matrices. If `lower` is `True` then the strictly upper triangular part
+// of each inner-most matrix is assumed to be zero and not accessed.
+// If `lower` is False then the strictly lower triangular part of each inner-most
+// matrix is assumed to be zero and not accessed.
+// `rhs` is a tensor of shape `[..., M, K]`.
 //
-// It is also useful if two separate functions must share a resource, but we
-// wish to ensure the usage is exclusive.
+// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
+// `True` then the innermost matrices in `output` satisfy matrix equations
+// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `False` then the strictly then the  innermost matrices in
+// `output` satisfy matrix equations
+// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
 //
 // Arguments:
-//	mutex: The mutex resource to lock.
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-// Returns A tensor that keeps a shared pointer to a lock on the mutex;
-// when the Tensor is destroyed, the use count on the shared pointer is decreased
-// by 1.  When it reaches 0, the lock is released.
-func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
+// Returns Shape is `[..., M, K]`.
+func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "MutexLock",
-		Input: []tf.Input{
-			mutex,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
-//
-// Arguments:
-//	serialized: A scalar string containing a serialized TensorProto proto.
-//	out_type: The type of the serialized tensor.  The provided type must match the
-// type of the serialized tensor and no implicit conversion will take place.
-//
-// Returns A Tensor of type `out_type`.
-func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ParseTensor",
+		Type: "MatrixTriangularSolve",
 		Input: []tf.Input{
-			serialized,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
@@ -12958,146 +11809,127 @@ func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (outp
 	return op.Output(0)
 }
 
-// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
-type MaxPoolWithArgmaxAttr func(optionalAttr)
-
-// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
-// If not specified, defaults to DT_INT64
-func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
-	return func(m optionalAttr) {
-		m["Targmax"] = value
-	}
-}
-
-// Performs max pooling on the input and outputs both max values and indices.
-//
-// The indices in `argmax` are flattened, so that a maximum value at position
-// `[b, y, x, c]` becomes flattened index
-// `((b * height + y) * width + x) * channels + c`.
+// Saves tensors in V2 checkpoint format.
 //
-// The indices returned are always in `[0, height) x [0, width)` before flattening,
-// even if padding is involved and the mathematically correct answer is outside
-// (either negative or too large).  This is a bug, but fixing it is difficult to do
-// in a safe backwards compatible way, especially due to flattening.
+// By default, saves the named tensors in full.  If the caller wishes to save
+// specific slices of full tensors, "shape_and_slices" should be non-empty strings
+// and correspondingly well-formed.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
+// write the tensors.
+//	tensor_names: shape {N}. The names of the tensors to be saved.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
+// Empty strings indicate that they are non-partitioned tensors.
+//	tensors: `N` tensors to save.
 //
-// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
-func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
+// Returns the created operation.
+func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolWithArgmax",
+		Type: "SaveV2",
 		Input: []tf.Input{
-			input,
+			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a TensorList which, when stacked, has the value of `tensor`.
+// Concatenates quantized tensors along one dimension.
 //
-// Each tensor in the result list corresponds to one row of the input tensor.
+// Arguments:
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	input_mins: The minimum scalar values for each of the input tensors.
+//	input_maxes: The maximum scalar values for each of the input tensors.
 //
-// tensor: The input tensor.
-// output_handle: The list.
-func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListFromTensor",
+		Type: "QuantizedConcat",
 		Input: []tf.Input{
-			tensor, element_shape,
+			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Assigns sparse updates to the variable referenced by `resource`.
+// Slice a `SparseTensor` based on the `start` and `size`.
 //
-// This operation computes
+// For example, if the input is
 //
-//     # Scalar indices
-//     ref[indices, ...] = updates[...]
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
 //
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = updates[i, ...]
+// Graphically the output tensors are:
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+//	start: 1-D. tensor represents the start of the slice.
+//	size: 1-D. tensor represents the size of the slice.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
 //
-// Returns the created operation.
-func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterUpdate",
+		Type: "SparseSlice",
 		Input: []tf.Input{
-			resource, indices, updates,
+			indices, values, shape, start, size,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MaxPoolAttr is an optional argument to MaxPool.
-type MaxPoolAttr func(optionalAttr)
-
-// MaxPoolDataFormat sets the optional data_format attribute to value.
+// Runs multiple additive regression ensemble predictors on input instances and
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolDataFormat(value string) MaxPoolAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs max pooling on the input.
+// computes the logits. It is designed to be used during prediction.
+// It traverses all the trees and calculates the final score for each instance.
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
 //
-// Returns The max pooled output tensor.
-func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
+//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
+// shape.
+//
+// Returns Output rank 2 Tensor containing logits for each example.
+func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
 	opspec := tf.OpSpec{
-		Type: "MaxPool",
+		Type: "BoostedTreesPredict",
 		Input: []tf.Input{
-			input,
+			tree_ensemble_handle, tf.OutputList(bucketized_features),
 		},
 		Attrs: attrs,
 	}
@@ -13105,239 +11937,136 @@ func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padd
 	return op.Output(0)
 }
 
-// Multiplies sparse updates into the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] *= updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] *= updates[i, ...]
+// Pads a tensor with zeros.
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
 //
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions multiply.
+// The padded size of each dimension D of the output is:
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// For example:
 //
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
 //
-// Returns the created operation.
-func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterMul",
+		Type: "Pad",
 		Input: []tf.Input{
-			resource, indices, updates,
+			input, paddings,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Adds sparse updates to the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] += updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] += updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// Checks whether a resource handle-based variable has been initialized.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	resource: the input resource handle.
 //
-// Returns the created operation.
-func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// Returns a scalar boolean which is true if the variable has been
+// initialized.
+func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterAdd",
+		Type: "VarIsInitializedOp",
 		Input: []tf.Input{
-			resource, indices, updates,
+			resource,
 		},
 	}
-	return scope.AddOperation(opspec)
-}
-
-// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
-type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
-//
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
 //
-// Returns the created operation.
-func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrlV2",
+		Type: "Minimum",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
+			x, y,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Calculates gains for each feature and returns the best possible split information for the feature.
-//
-// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
-//
-// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
-//
-// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
+// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
 //
-// The length of output lists are all of the same length, `num_features`.
-// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
+// if < 0, `scale * features` otherwise.
 //
-// Arguments:
-//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
-//	stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
-//	l1: l1 regularization factor on leaf weights, per instance based.
-//	l2: l2 regularization factor on leaf weights, per instance based.
-//	tree_complexity: adjustment to the gain, per leaf based.
-//	min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting.
-//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+// To be used together with
+// `initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
+// For correct dropout, use `tf.contrib.nn.alpha_dropout`.
 //
-// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
-func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
+// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"max_splits": max_splits}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesCalculateBestGainsPerFeature",
+		Type: "Selu",
 		Input: []tf.Input{
-			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list
+	return op.Output(0)
 }
 
-// EncodePngAttr is an optional argument to EncodePng.
-type EncodePngAttr func(optionalAttr)
+// SetSizeAttr is an optional argument to SetSize.
+type SetSizeAttr func(optionalAttr)
 
-// EncodePngCompression sets the optional compression attribute to value.
-//
-// value: Compression level.
-// If not specified, defaults to -1
-func EncodePngCompression(value int64) EncodePngAttr {
+// SetSizeValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SetSizeValidateIndices(value bool) SetSizeAttr {
 	return func(m optionalAttr) {
-		m["compression"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// PNG-encode an image.
-//
-// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
-// where `channels` is:
+// Number of unique elements along last dimension of input `set`.
 //
-// *   1: for grayscale.
-// *   2: for grayscale + alpha.
-// *   3: for RGB.
-// *   4: for RGBA.
+// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
+// and `set_shape`. The last dimension contains values in a set, duplicates are
+// allowed but ignored.
 //
-// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
-// default or a value from 0 to 9.  9 is the highest compression level, generating
-// the smallest output, but is slower.
+// If `validate_indices` is `True`, this op validates the order and range of `set`
+// indices.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
+//	set_values: 1D `Tensor`, values of a `SparseTensor`.
+//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
 //
-// Returns 0-D. PNG-encoded image.
-func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
+// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
+// `n-1` dimensions as `set`. Each value is the number of unique elements in
+// the corresponding `[0...n-1]` dimension of `set`.
+func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13346,9 +12075,9 @@ func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (conten
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodePng",
+		Type: "SetSize",
 		Input: []tf.Input{
-			image,
+			set_indices, set_values, set_shape,
 		},
 		Attrs: attrs,
 	}
@@ -13356,194 +12085,271 @@ func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (conten
 	return op.Output(0)
 }
 
-// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
-type DataFormatVecPermuteAttr func(optionalAttr)
-
-// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
+// Adds sparse `updates` to an existing tensor according to `indices`.
 //
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
-	return func(m optionalAttr) {
-		m["src_format"] = value
-	}
-}
-
-// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
+// This operation creates a new tensor by adding sparse `updates` to the passed
+// in `tensor`.
+// This operation is very similar to `tf.scatter_nd_add`, except that the updates
+// are added onto an existing tensor (as opposed to a variable). If the memory
+// for the existing tensor cannot be re-used, a copy is made and updated.
 //
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
-	return func(m optionalAttr) {
-		m["dst_format"] = value
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of tensor_scatter_add is to add individual elements to a
+// tensor by index. For example, say we want to add 4 elements in a rank-1
+// tensor with 8 elements.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     tensor = tf.ones([8], dtype=tf.int32)
+//     updated = tf.tensor_scatter_add(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [1, 12, 1, 11, 10, 1, 1, 13]
+//
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     tensor = tf.ones([4, 4, 4])
+//     updated = tf.tensor_scatter_add(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+//      [[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
+// Arguments:
+//	tensor: Tensor to copy/update.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//
+// Returns A new tensor copied from tensor and updates added according to the indices.
+func TensorScatterAdd(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorScatterAdd",
+		Input: []tf.Input{
+			tensor, indices, updates,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the permuted vector/tensor in the destination data format given the
+// Computes the sign and the log of the absolute value of the determinant of
 //
-// one in the source data format.
+// one or more square matrices.
+//
+// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
+// form square matrices. The outputs are two tensors containing the signs and
+// absolute values of the log determinants for all N input submatrices
+// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
+// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
+// is the LU decomposition of the input and P is the corresponding
+// permutation matrix.
 //
 // Arguments:
-//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
+//	input: Shape is `[N, M, M]`.
 //
-// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
-func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
+// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
+// of the N input matrices.  Shape is `[N]`.
+func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatVecPermute",
+		Type: "LogMatrixDeterminant",
 		Input: []tf.Input{
-			x,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Initializes the multi device iterator with the given dataset.
+// Says whether the targets are in the top `K` predictions.
+//
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
+//
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
 //
 // Arguments:
-//	dataset: Dataset to be iterated upon.
-//	multi_device_iterator: A MultiDeviceIteratorResource.
-//	max_buffer_size: The maximum size of the host side per device buffer to keep.
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
 //
-// Returns An int64 indicating which incarnation of the MultiDeviceIterator
-// is running.
-func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_iterator tf.Output, max_buffer_size tf.Output) (incarnation_id tf.Output) {
+// Returns Computed precision at `k` as a `bool Tensor`.
+func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorInit",
+		Type: "InTopKV2",
 		Input: []tf.Input{
-			dataset, multi_device_iterator, max_buffer_size,
+			predictions, targets, k,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reads the value of a variable.
+// Check if the input matches the regex pattern.
 //
-// The tensor returned by this operation is immutable.
+// The input is a string tensor of any shape. The pattern is a scalar
+// string tensor which is applied to every element of the input tensor.
+// The boolean values (True or False) of the output tensor indicate
+// if the input matches the regex pattern provided.
 //
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
+// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	dtype: the dtype of the value.
-func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
+//	input: A string tensor of the text to be processed.
+//	pattern: A scalar string tensor containing the regular expression to match the input.
+//
+// Returns A bool tensor with the same shape as `input`.
+func RegexFullMatch(scope *Scope, input tf.Output, pattern tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "ReadVariableOp",
+		Type: "RegexFullMatch",
 		Input: []tf.Input{
-			resource,
+			input, pattern,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// This op consumes a lock created by `MutexLock`.
-//
-// This op exists to consume a tensor created by `MutexLock` (other than
-// direct control dependencies).  It should be the only that consumes the tensor,
-// and will raise an error if it is not.  Its only purpose is to keep the
-// mutex lock tensor alive until it is consumed by this op.
+// Converts a `RaggedTensor` into a `SparseTensor` with the same values.
 //
-// **NOTE**: This operation must run on the same device as its input.  This may
-// be enforced via the `colocate_with` mechanism.
+// input=ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+// output=SparseTensor(indices=sparse_indices, values=sparse_values,
+//                     dense_shape=sparse_dense_shape)
 //
 // Arguments:
-//	mutex_lock: A tensor returned by `MutexLock`.
+//	rt_nested_splits: The `row_splits` for the `RaggedTensor`.
+//	rt_dense_values: The `flat_values` for the `RaggedTensor`.
 //
-// Returns the created operation.
-func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
+// Returns The indices for the `SparseTensor`.The values of the `SparseTensor`.`sparse_dense_shape` is a tight bounding box of the input `RaggedTensor`.
+func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output) (sparse_indices tf.Output, sparse_values tf.Output, sparse_dense_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ConsumeMutexLock",
+		Type: "RaggedTensorToSparse",
 		Input: []tf.Input{
-			mutex_lock,
+			tf.OutputList(rt_nested_splits), rt_dense_values,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd.
-type ResourceScatterNdAddAttr func(optionalAttr)
+// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
+type FusedBatchNormGradV2Attr func(optionalAttr)
 
-// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value.
+// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
 //
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["epsilon"] = value
 	}
 }
 
-// Applies sparse addition to individual values or slices in a Variable.
-//
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
-// ```
-//
-// For example, say we want to add 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that addition would look like this:
-//
-// ```python
-// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-// indices = tf.constant([[4], [3], [1], [7]])
-// updates = tf.constant([9, 10, 11, 12])
-// add = tf.scatter_nd_add(ref, indices, updates)
-// with tf.Session() as sess:
-//   print sess.run(add)
-// ```
+// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
 //
-// The resulting update to ref would look like this:
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
 //
-//     [1, 13, 3, 14, 14, 6, 7, 20]
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Gradient for batch normalization.
 //
-// See `tf.scatter_nd` for more details about how to make updates to
-// slices.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of
-// values to add to ref.
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
 //
-// Returns the created operation.
-func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) {
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13552,194 +12358,146 @@ func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, update
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdAdd",
+		Type: "FusedBatchNormGradV2",
 		Input: []tf.Input{
-			ref, indices, updates,
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Updates the tree ensemble by either adding a layer to the last tree being grown
+// Component-wise multiplies a SparseTensor by a dense Tensor.
 //
-// or by starting a new tree.
+// The output locations corresponding to the implicitly zero elements in the sparse
+// tensor will be zero (i.e., will not take up storage space), regardless of the
+// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+//
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
 // Arguments:
-//	tree_ensemble_handle: Handle to the ensemble variable.
-//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
-// the feature that will be used in the split.
-//	node_ids: List of rank 1 tensors representing the nodes for which this feature
-// has a split.
-//	gains: List of rank 1 tensors representing the gains for each of the feature's
-// split.
-//	thresholds: List of rank 1 tensors representing the thesholds for each of the
-// feature's split.
-//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
-// the feature's splits. Will be added to the previous node values to constitute
-// the values of the left nodes.
-//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
-// of the feature's splits. Will be added to the previous node values to constitute
-// the values of the right nodes.
-//	max_depth: Max depth of the tree to build.
-//	learning_rate: shrinkage const for each new tree.
-//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// Returns the created operation.
-func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesUpdateEnsemble",
-		Input: []tf.Input{
-			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes tan of x element-wise.
-func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Tan",
+		Type: "SparseDenseCwiseMul",
 		Input: []tf.Input{
-			x,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArraySplitV3
+// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
+type MaxPool3DGradAttr func(optionalAttr)
+
+// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
-func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV2",
-		Input: []tf.Input{
-			handle, value, lengths, flow_in,
-		},
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Reshapes a SparseTensor to represent values in a new dense shape.
-//
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
-//
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
-//
-// Reshaping does not affect the order of values in the SparseTensor.
-//
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
+// Computes gradients of max pooling function.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
-//
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseReshape",
+		Type: "MaxPool3DGrad",
 		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
+			orig_input, orig_output, grad,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Bucketizes 'input' based on 'boundaries'.
-//
-// For example, if the inputs are
-//     boundaries = [0, 10, 100]
-//     input = [[-5, 10000]
-//              [150,   10]
-//              [5,    100]]
-//
-// then the output will be
-//     output = [[0, 3]
-//               [3, 2]
-//               [1, 3]]
-//
-// Arguments:
-//	input: Any shape of Tensor contains with int or float type.
-//	boundaries: A sorted list of floats gives the boundary of the buckets.
-//
-// Returns Same shape with 'input', each value of input replaced with bucket index.
-//
-// @compatibility(numpy)
-// Equivalent to np.digitize.
-// @end_compatibility
-func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
+// Returns the name of the device on which `resource` has been placed.
+func ExperimentalIteratorGetDevice(scope *Scope, resource tf.Output) (device tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "Bucketize",
+		Type: "ExperimentalIteratorGetDevice",
 		Input: []tf.Input{
-			input,
+			resource,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
-type StatelessTruncatedNormalAttr func(optionalAttr)
+// SparseReduceSumAttr is an optional argument to SparseReduceSum.
+type SparseReduceSumAttr func(optionalAttr)
 
-// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
+// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a truncated normal distribution.
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
 //
-// Returns Random values with specified shape.
-func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13748,9 +12506,9 @@ func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, opt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessTruncatedNormal",
+		Type: "SparseReduceSum",
 		Input: []tf.Input{
-			shape, seed,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
@@ -13758,51 +12516,16 @@ func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, opt
 	return op.Output(0)
 }
 
-// RestoreSliceAttr is an optional argument to RestoreSlice.
-type RestoreSliceAttr func(optionalAttr)
-
-// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
-//
-// value: Index of file to open first if multiple files match
-// `file_pattern`. See the documentation for `Restore`.
-// If not specified, defaults to -1
-func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
-	return func(m optionalAttr) {
-		m["preferred_shard"] = value
-	}
-}
-
-// Restores a tensor from checkpoint files.
-//
-// This is like `Restore` except that restored tensor can be listed as filling
-// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-// larger tensor and the slice that the restored tensor covers.
-//
-// The `shape_and_slice` input has the same format as the
-// elements of the `shapes_and_slices` input of the `SaveSlices` op.
-//
-// Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	shape_and_slice: Scalar. The shapes and slice specifications to use when
-// restoring a tensors.
-//	dt: The type of the tensor to be restored.
-//
-// Returns The restored tensor.
-func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
+// Records the latency of producing `input_dataset` elements in a StatsAggregator.
+func ExperimentalLatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RestoreSlice",
+		Type: "ExperimentalLatencyStatsDataset",
 		Input: []tf.Input{
-			file_pattern, tensor_name, shape_and_slice,
+			input_dataset, tag,
 		},
 		Attrs: attrs,
 	}
@@ -13810,72 +12533,49 @@ func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, s
 	return op.Output(0)
 }
 
-// Divides sparse updates into the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] /= updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] /= updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions multiply.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
-//
-// Returns the created operation.
-func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterDiv",
+		Type: "SparseTensorDenseAdd",
 		Input: []tf.Input{
-			resource, indices, updates,
+			a_indices, a_values, a_shape, b,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
+// QuantizedReluAttr is an optional argument to QuantizedRelu.
+type QuantizedReluAttr func(optionalAttr)
 
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+// QuantizedReluOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["out_type"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
-//
-// The outputs are a deterministic function of `shape` and `seed`.
+// Computes Quantized Rectified Linear: `max(features, 0)`
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
 //
-// Returns Random values with specified shape.
-func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13884,806 +12584,5548 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomNormal",
+		Type: "QuantizedRelu",
 		Input: []tf.Input{
-			shape, seed,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UnicodeDecodeAttr is an optional argument to UnicodeDecode.
-type UnicodeDecodeAttr func(optionalAttr)
-
-// UnicodeDecodeErrors sets the optional errors attribute to value.
-//
-// value: Error handling policy when there is invalid formatting found in the input.
-// The value of 'strict' will cause the operation to produce a InvalidArgument
-// error on any invalid input formatting. A value of 'replace' (the default) will
-// cause the operation to replace any invalid formatting in the input with the
-// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
-// skip any invalid formatting in the input and produce no corresponding output
-// character.
-// If not specified, defaults to "replace"
-func UnicodeDecodeErrors(value string) UnicodeDecodeAttr {
-	return func(m optionalAttr) {
-		m["errors"] = value
-	}
-}
-
-// UnicodeDecodeReplacementChar sets the optional replacement_char attribute to value.
-//
-// value: The replacement character codepoint to be used in place of any invalid
-// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
-// be used. The default value is the default unicode replacement character is
-// 0xFFFD or U+65533.)
-// If not specified, defaults to 65533
-func UnicodeDecodeReplacementChar(value int64) UnicodeDecodeAttr {
-	return func(m optionalAttr) {
-		m["replacement_char"] = value
-	}
-}
-
-// UnicodeDecodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
-//
-// value: Whether to replace the C0 control characters (00-1F) with the
-// `replacement_char`. Default is false.
-// If not specified, defaults to false
-func UnicodeDecodeReplaceControlCharacters(value bool) UnicodeDecodeAttr {
-	return func(m optionalAttr) {
-		m["replace_control_characters"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Decodes each string in `input` into a sequence of Unicode code points.
+// Reorders a SparseTensor into the canonical, row-major ordering.
 //
-// The character codepoints for all strings are returned using a single vector
-// `char_values`, with strings expanded to characters in row-major order.
+// Note that by convention, all sparse ops preserve the canonical ordering along
+// increasing dimension number. The only time ordering can be violated is during
+// manual manipulation of the indices and values vectors to add entries.
 //
-// The `row_splits` tensor indicates where the codepoints for
-// each input string begin and end within the `char_values` tensor.
-// In particular, the values for the `i`th
-// string (in row-major order) are stored in the slice
-// `[row_splits[i]:row_splits[i+1]]`. Thus:
+// Reordering does not affect the shape of the SparseTensor.
 //
-// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
-//   character in the `i`th string (in row-major order).
-// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
-//   string (in row-major order).
+// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
+// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
 //
 // Arguments:
-//	input: The text to be decoded. Can have any shape. Note that the output is flattened
-// to a vector of char values.
-//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
-// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
 //
-// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.
-func UnicodeDecode(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeAttr) (row_splits tf.Output, char_values tf.Output) {
+// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
+// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
+func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"input_encoding": input_encoding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "UnicodeDecode",
+		Type: "SparseReorder",
 		Input: []tf.Input{
-			input,
+			input_indices, input_values, input_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// Adds up a SparseTensor and a dense Tensor, using these special rules:
+// Split a `SparseTensor` into `num_split` tensors along one dimension.
 //
-// (1) Broadcasts the dense side to have the same shape as the sparse side, if
-//     eligible;
-// (2) Then, only the dense values pointed to by the indices of the SparseTensor
-//     participate in the cwise addition.
+// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
+// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
+// For example, if `split_dim = 1` and `num_split = 2` and the input is
 //
-// By these rules, the result is a logical SparseTensor with exactly the same
-// indices and shape, but possibly with different non-zero values.  The output of
-// this Op is the resultant non-zero values.
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
+//
+// Graphically the output tensors are:
+//
+//     output_tensor[0] = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     output_tensor[1] = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
+// `[0, rank(shape))`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//	num_split: The number of ways to split.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseAdd",
+		Type: "SparseSplit",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			split_dim, indices, values, shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the complementary error function of `x` element-wise.
-func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Erfc",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UnicodeEncodeAttr is an optional argument to UnicodeEncode.
-type UnicodeEncodeAttr func(optionalAttr)
-
-// UnicodeEncodeErrors sets the optional errors attribute to value.
-//
-// value: Error handling policy when there is invalid formatting found in the input.
-// The value of 'strict' will cause the operation to produce a InvalidArgument
-// error on any invalid input formatting. A value of 'replace' (the default) will
-// cause the operation to replace any invalid formatting in the input with the
-// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
-// skip any invalid formatting in the input and produce no corresponding output
-// character.
-// If not specified, defaults to "replace"
-func UnicodeEncodeErrors(value string) UnicodeEncodeAttr {
-	return func(m optionalAttr) {
-		m["errors"] = value
+	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
 	}
-}
-
-// UnicodeEncodeReplacementChar sets the optional replacement_char attribute to value.
-//
-// value: The replacement character codepoint to be used in place of any invalid
-// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
-// be used. The default value is the default unicode replacement character is
-// 0xFFFD (U+65533).
-// If not specified, defaults to 65533
-func UnicodeEncodeReplacementChar(value int64) UnicodeEncodeAttr {
-	return func(m optionalAttr) {
-		m["replacement_char"] = value
+	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
 	}
+	return output_indices, output_values, output_shape
 }
 
-// Encode a tensor of ints into unicode strings.
+// Applies sparse addition to `input` using individual values or slices
 //
-// Returns a vector of strings, where `output[i]` is constructed by encoding the
-// Unicode codepoints in `input_values[input_splits[i]:input_splits[i+1]]`
-// using `output_encoding`.
+// from `updates` according to indices `indices`.  The updates are non-aliasing:
+// `input` is only modified in-place if no other operations will use it.
+// Otherwise, a copy of `input` is made.  This operation has a gradient with
+// respect to both `input` and `updates`.
 //
-// ---
+// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
-// Example:
+// `indices` must be integer tensor, containing indices into `input`.
+// It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`.
 //
-// ```
-// input_values = [72, 101, 108, 108, 111, 87, 111, 114, 108, 100]
-// input_splits = [0, 5, 10]
-// output_encoding = 'UTF-8'
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
+// (if `K < P`) along the `K`th dimension of `input`.
 //
-// output = ['Hello', 'World']
-// ```
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 //
-// Arguments:
-//	input_values: A 1D tensor containing the unicode codepoints that should be encoded.
-//	input_splits: A 1D tensor specifying how the unicode codepoints should be split into strings.
-// In particular, `output[i]` is constructed by encoding the codepoints in the
-// slice `input_values[input_splits[i]:input_splits[i+1]]`.
-//	output_encoding: Unicode encoding of the output strings. Valid encodings are: `"UTF-8",
-// "UTF-16-BE", and "UTF-32-BE"`.
+// $$[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].$$
 //
-// Returns The 1-D Tensor of strings encoded from the provided unicode codepoints.
-func UnicodeEncode(scope *Scope, input_values tf.Output, input_splits tf.Output, output_encoding string, optional ...UnicodeEncodeAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_encoding": output_encoding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UnicodeEncode",
-		Input: []tf.Input{
-			input_values, input_splits,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the number of tensors in the input tensor list.
+// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
+// elements. In Python, that addition would look like this:
 //
-// input_handle: the input list
-// length: the number of tensors in the list
-func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListLength",
-		Input: []tf.Input{
-			input_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Determine the script codes of a given tensor of Unicode integer code points.
+//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(output))
 //
-// This operation converts Unicode code points to script codes corresponding to
-// each code point. Script codes correspond to International Components for
-// Unicode (ICU) UScriptCode values. See http://icu-project.org/apiref/icu4c/uscript_8h.html.
-// Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints. Output shape will
-// match input shape.
+// The resulting value `output` would look like this:
+//
+//     [1, 13, 3, 14, 14, 6, 7, 20]
+//
+// See `tf.scatter_nd` for more details about how to make updates to slices.
 //
 // Arguments:
-//	input: A Tensor of int32 Unicode code points.
+//	input: A Tensor.
+//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
+// A tensor of indices into `input`.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
+// to add to `input`.
 //
-// Returns A Tensor of int32 script codes corresponding to each input code point.
-func UnicodeScript(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A `Tensor` with the same shape as `input`, containing values of `input`
+// updated with `updates`.
+func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnicodeScript",
+		Type: "ScatterNdNonAliasingAdd",
 		Input: []tf.Input{
-			input,
+			input, indices, updates,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a sequence of numbers.
-//
-// This operation creates a sequence of numbers that begins at `start` and
-// extends by increments of `delta` up to but not including `limit`.
-//
-// For example:
-//
-// ```
-// # 'start' is 3
-// # 'limit' is 18
-// # 'delta' is 3
-// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
-// ```
+// Creates a MultiDeviceIterator resource.
 //
 // Arguments:
-//	start: 0-D (scalar). First entry in the sequence.
-//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
-//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
+//	devices: A list of devices the iterator works across.
+//	shared_name: If non-empty, this resource will be shared under the given name
+// across multiple sessions.
+//	container: If non-empty, this resource is placed in the given container.
+// Otherwise, a default container is used.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
 //
-// Returns 1-D.
-func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+// Returns Handle to the resource created.
+func MultiDeviceIterator(scope *Scope, devices []string, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"devices": devices, "shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Range",
-		Input: []tf.Input{
-			start, limit, delta,
-		},
+		Type: "MultiDeviceIterator",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
-type OrderedMapUnstageNoKeyAttr func(optionalAttr)
+// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
+type FractionalMaxPoolAttr func(optionalAttr)
 
-// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["pseudo_random"] = value
 	}
 }
 
-// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["overlapping"] = value
 	}
 }
 
-// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
+// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
+//
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalMaxPool node in the computation graph. Mainly used
+// in unit test to make FractionalMaxPool deterministic.
+// If not specified, defaults to false
+func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["deterministic"] = value
 	}
 }
 
-// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
+// FractionalMaxPoolSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["seed"] = value
 	}
 }
 
-// Op removes and returns the (key, value) element with the smallest
+// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
 //
-// key from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Performs fractional max pooling on the input.
+//
+// Fractional max pooling is slightly different than regular max pooling.  In
+// regular max pooling, you downsize an input set by taking the maximum value of
+// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+// a factor of N, where N is an integer.  Fractional max pooling, as you might
+// expect from the word "fractional", means that the overall reduction ratio N
+// does not have to be an integer.
+//
+// The sizes of the pooling regions are generated randomly but are fairly uniform.
+// For example, let's look at the height dimension, and the constraints on the
+// list of rows that will be pool boundaries.
+//
+// First we define the following:
+//
+// 1.  input_row_length : the number of rows from the input set
+// 2.  output_row_length : which will be smaller than the input
+// 3.  alpha = input_row_length / output_row_length : our reduction ratio
+// 4.  K = floor(alpha)
+// 5.  row_pooling_sequence : this is the result list of pool boundary rows
+//
+// Then, row_pooling_sequence should satisfy:
+//
+// 1.  a[0] = 0 : the first value of the sequence is 0
+// 2.  a[end] = input_row_length : the last value of the sequence is the size
+// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+// 4.  length(row_pooling_sequence) = output_row_length+1
+//
+// For more details on fractional max pooling, see this paper:
+// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+//
+// Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
+//
+// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FractionalMaxPool",
+		Input: []tf.Input{
+			value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Generates sparse cross from a list of sparse and dense tensors.
+//
+// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+// representing features of one feature column. It outputs a 2D `SparseTensor` with
+// the batchwise crosses of these features.
+//
+// For example, if the inputs are
+//
+//     inputs[0]: SparseTensor with shape = [2, 2]
+//     [0, 0]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     inputs[1]: SparseTensor with shape = [2, 1]
+//     [0, 0]: "d"
+//     [1, 0]: "e"
+//
+//     inputs[2]: Tensor [["f"], ["g"]]
+//
+// then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: "a_X_d_X_f"
+//     [1, 0]: "b_X_e_X_g"
+//     [1, 1]: "c_X_e_X_g"
+//
+// if hashed_output=true then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: FingerprintCat64(
+//                 Fingerprint64("f"), FingerprintCat64(
+//                     Fingerprint64("d"), Fingerprint64("a")))
+//     [1, 0]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("b")))
+//     [1, 1]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("c")))
+//
+// Arguments:
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.   values of each `SparseTensor`.
+//	shapes: 1-D.   Shapes of each `SparseTensor`.
+//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+//	hashed_output: If true, returns the hash of the cross instead of the string.
+// This will allow us avoiding string manipulations.
+//	num_buckets: It is used if hashed_output is true.
+// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+// function to combine the crosses fingerprints.
+//
+//
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
+// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
+	opspec := tf.OpSpec{
+		Type: "SparseCross",
+		Input: []tf.Input{
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Inverse real-valued fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
+//
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
+// than the corresponding dimension of `input`, the dimension is cropped. If it is
+// larger, the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft
+// @end_compatibility
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IRFFT",
+		Input: []tf.Input{
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Concatenates a list of `SparseTensor` along the specified dimension.
+//
+// Concatenation is with respect to the dense versions of these sparse tensors.
+// It is assumed that each input is a `SparseTensor` whose elements are ordered
+// along increasing dimension number.
+//
+// All inputs' shapes must match, except for the concat dimension.  The
+// `indices`, `values`, and `shapes` lists must have the same length.
+//
+// The output shape is identical to the inputs', except along the concat
+// dimension, where it is the sum of the inputs' sizes along that dimension.
+//
+// The output elements will be resorted to preserve the sort order along
+// increasing dimension number.
+//
+// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+// values across all inputs. This is due to the need for an internal sort in
+// order to concatenate efficiently across an arbitrary dimension.
+//
+// For example, if `concat_dim = 1` and the inputs are
+//
+//     sp_inputs[0]: shape = [2, 3]
+//     [0, 2]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     sp_inputs[1]: shape = [2, 4]
+//     [0, 1]: "d"
+//     [0, 2]: "e"
+//
+// then the output will be
+//
+//     shape = [2, 7]
+//     [0, 2]: "a"
+//     [0, 4]: "d"
+//     [0, 5]: "e"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+// Graphically this is equivalent to doing
+//
+//     [    a] concat [  d e  ] = [    a   d e  ]
+//     [b c  ]        [       ]   [b c          ]
+//
+// Arguments:
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.  Non-empty values of each `SparseTensor`.
+//	shapes: 1-D.  Shapes of each `SparseTensor`.
+//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+// where rank is the number of dimensions in each input `SparseTensor`.
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"concat_dim": concat_dim}
+	opspec := tf.OpSpec{
+		Type: "SparseConcat",
+		Input: []tf.Input{
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Elementwise computes the bitwise AND of `x` and `y`.
+//
+// The result will have those bits set, that are set in both `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseAnd",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
+//
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension).
+//
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "DeserializeManySparse",
+		Input: []tf.Input{
+			serialized_sparse,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Deserialize `SparseTensor` objects.
+//
+// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+// the last dimension stores serialized `SparseTensor` objects and the other N
+// dimensions (N >= 0) correspond to a batch. The ranks of the original
+// `SparseTensor` objects must all match. When the final `SparseTensor` is
+// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+// the sparse tensors have been concatenated along new dimensions, one for each
+// batch.
+//
+// The output `SparseTensor` object's shape values for the original dimensions
+// are the max across the input `SparseTensor` objects' shape values for the
+// corresponding dimensions. The new dimensions match the size of the batch.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
+// must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "DeserializeSparse",
+		Input: []tf.Input{
+			serialized_sparse,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
+type MaxPool3DGradGradAttr func(optionalAttr)
+
+// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes second-order gradients of the maxpooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPool3DGradGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
+type Conv3DBackpropFilterV2Attr func(optionalAttr)
+
+// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the filter.
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 5-D
+// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
+// tensor.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv3DBackpropFilterV2",
+		Input: []tf.Input{
+			input, filter_sizes, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Execute a sub graph on a remote processor.
+//
+// The graph specifications(such as graph itself, input tensors and output names)
+// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
+// as serialized_remote_fused_graph_execute_info.
+// The specifications will be passed to a dedicated registered
+// remote fused graph executor.  The executor will send the graph specifications
+// to a remote processor and execute that graph.  The execution results
+// will be passed to consumer nodes as outputs of this node.
+//
+// Arguments:
+//	inputs: Arbitrary number of tensors with arbitrary data types
+//
+//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
+// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+//
+// Returns Arbitrary number of tensors with arbitrary data types
+func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
+	opspec := tf.OpSpec{
+		Type: "RemoteFusedGraphExecute",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RemoteFusedGraphExecute", err)
+		return
+	}
+	return outputs
+}
+
+// SerializeManySparseAttr is an optional argument to SerializeManySparse.
+type SerializeManySparseAttr func(optionalAttr)
+
+// SerializeManySparseOutType sets the optional out_type attribute to value.
+//
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
+//
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
+//
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SerializeManySparse",
+		Input: []tf.Input{
+			sparse_indices, sparse_values, sparse_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes inverse hyperbolic cosine of x element-wise.
+func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Acosh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes rectified linear 6 gradients for a Relu6 operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
+//	features: The features passed as input to the corresponding Relu6 operation, or
+// its output; using either one produces the same result.
+//
+// Returns The gradients:
+// `gradients * (features > 0) * (features < 6)`.
+func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Relu6Grad",
+		Input: []tf.Input{
+			gradients, features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes natural logarithm of (1 + x) element-wise.
+//
+// I.e., \\(y = \log_e (1 + x)\\).
+func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Log1p",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResizeBicubicAttr is an optional argument to ResizeBicubic.
+type ResizeBicubicAttr func(optionalAttr)
+
+// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using bicubic interpolation.
+//
+// Input images can be of different types but output images are always float.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeBicubic",
+		Input: []tf.Input{
+			images, size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
+
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+//
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_a"] = value
+	}
+}
+
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+//
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_b"] = value
+	}
+}
+
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+//
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
+//
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseMatMul",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
+type DecodeAndCropJpegAttr func(optionalAttr)
+
+// DecodeAndCropJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
+}
+
+// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
+	}
+}
+
+// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
+
+// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
+
+// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+//
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+//
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode and Crop a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// It is equivalent to a combination of decode and crop, but much faster by only
+// decoding partial jpeg image.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeAndCropJpeg",
+		Input: []tf.Input{
+			contents, crop_window,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds two `SparseTensor` objects to produce another `SparseTensor`.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in standard
+// lexicographic order.  If this is not the case, before this step run
+// `SparseReorder` to restore index ordering.
+//
+// By default, if two values sum to zero at some index, the output `SparseTensor`
+// would still include that particular location in its index, storing a zero in the
+// corresponding value slot.  To override this, callers can specify `thresh`,
+// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+// corresponding value and index would then not be included.  In particular,
+// `thresh == 0` (default) means everything is kept and actual thresholding happens
+// only for a positive value.
+//
+// In the following shapes, `nnz` is the count after taking `thresh` into account.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
+// pair takes space.
+func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseAdd",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// EnqueueTPUEmbeddingSparseTensorBatchAttr is an optional argument to EnqueueTPUEmbeddingSparseTensorBatch.
+type EnqueueTPUEmbeddingSparseTensorBatchAttr func(optionalAttr)
+
+// EnqueueTPUEmbeddingSparseTensorBatchDeviceOrdinal sets the optional device_ordinal attribute to value.
+//
+// value: The TPU device to use. Should be >= 0 and less than the number
+// of TPU cores in the task on which the node is placed.
+// If not specified, defaults to -1
+func EnqueueTPUEmbeddingSparseTensorBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingSparseTensorBatchAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// EnqueueTPUEmbeddingSparseTensorBatchCombiners sets the optional combiners attribute to value.
+//
+// value: A list of string scalars, one for each embedding table that specify
+// how to normalize the embedding activations after weighted summation.
+// Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+// the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+// 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+// all tables.
+// If not specified, defaults to <>
+func EnqueueTPUEmbeddingSparseTensorBatchCombiners(value []string) EnqueueTPUEmbeddingSparseTensorBatchAttr {
+	return func(m optionalAttr) {
+		m["combiners"] = value
+	}
+}
+
+// Eases the porting of code that uses tf.nn.embedding_lookup_sparse().
+//
+// sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond
+// to the ith feature. table_ids[i] indicates which embedding table to look up ith
+// feature.
+//
+// The tensors at corresponding positions in the three input lists (sample_indices,
+// embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1
+// with dim_size() equal to the total number of lookups into the table described by
+// the corresponding feature.
+//
+// Arguments:
+//	sample_indices: A list of rank 1 Tensors specifying the training example to
+// which the corresponding embedding_indices and aggregation_weights values
+// belong. It corresponds to sp_ids.indices[:,0] in  embedding_lookup_sparse().
+//	embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
+// It corresponds to sp_ids.values in embedding_lookup_sparse().
+//	aggregation_weights: A list of rank 1 Tensors containing per training example
+// aggregation weights. It corresponds to sp_weights.values in
+// embedding_lookup_sparse().
+//	mode_override: A string input that overrides the mode specified in the
+// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+// in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+//	table_ids: A list of integers specifying the identifier of the embedding table
+// (offset of TableDescriptor in the TPUEmbeddingConfiguration) to lookup the
+// corresponding input. The ith input is looked up using table_ids[i]. The size
+// of the table_ids list must be equal to that of sample_indices,
+// embedding_indices and aggregation_weights.
+//
+// Returns the created operation.
+func EnqueueTPUEmbeddingSparseTensorBatch(scope *Scope, sample_indices []tf.Output, embedding_indices []tf.Output, aggregation_weights []tf.Output, mode_override tf.Output, table_ids []int64, optional ...EnqueueTPUEmbeddingSparseTensorBatchAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"table_ids": table_ids}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EnqueueTPUEmbeddingSparseTensorBatch",
+		Input: []tf.Input{
+			tf.OutputList(sample_indices), tf.OutputList(embedding_indices), tf.OutputList(aggregation_weights), mode_override,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// The gradient operator for the SparseAdd op.
+//
+// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
+// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
+// values of A and B.
+//
+// Arguments:
+//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
+// the non-empty values of the sum.
+//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
+// `[nnz(sum), ndims]`.
+//
+// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
+// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
+// non-empty values of B.
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseAddGrad",
+		Input: []tf.Input{
+			backprop_val_grad, a_indices, b_indices, sum_indices,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// This op consumes a lock created by `MutexLock`.
+//
+// This op exists to consume a tensor created by `MutexLock` (other than
+// direct control dependencies).  It should be the only that consumes the tensor,
+// and will raise an error if it is not.  Its only purpose is to keep the
+// mutex lock tensor alive until it is consumed by this op.
+//
+// **NOTE**: This operation must run on the same device as its input.  This may
+// be enforced via the `colocate_with` mechanism.
+//
+// Arguments:
+//	mutex_lock: A tensor returned by `MutexLock`.
+//
+// Returns the created operation.
+func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConsumeMutexLock",
+		Input: []tf.Input{
+			mutex_lock,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd.
+type ResourceScatterNdAddAttr func(optionalAttr)
+
+// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value.
+//
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Applies sparse addition to individual values or slices in a Variable.
+//
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+// ```
+//
+// For example, say we want to add 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that addition would look like this:
+//
+// ```python
+// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+// indices = tf.constant([[4], [3], [1], [7]])
+// updates = tf.constant([9, 10, 11, 12])
+// add = tf.scatter_nd_add(ref, indices, updates)
+// with tf.Session() as sess:
+//   print sess.run(add)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, 13, 3, 14, 14, 6, 7, 20]
+//
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
+//
+// Arguments:
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of
+// values to add to ref.
+//
+// Returns the created operation.
+func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterNdAdd",
+		Input: []tf.Input{
+			ref, indices, updates,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Replaces the contents of the table with the specified keys and values.
+//
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LookupTableImportV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Extract `patches` from `images` and put them in the "depth" output dimension.
+//
+// Arguments:
+//	images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
+//	ksizes: The size of the sliding window for each dimension of `images`.
+//	strides: 1-D of length 4. How far the centers of two consecutive patches are in
+// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
+//	rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
+// input stride, specifying how far two consecutive patch samples are in the
+// input. Equivalent to extracting patches with
+// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
+// subsampling them spatially by a factor of `rates`. This is equivalent to
+// `rate` in dilated (a.k.a. Atrous) convolutions.
+//	padding: The type of padding algorithm to use.
+//
+// We specify the size-related attributes as:
+//
+// ```python
+//       ksizes = [1, ksize_rows, ksize_cols, 1]
+//       strides = [1, strides_rows, strides_cols, 1]
+//       rates = [1, rates_rows, rates_cols, 1]
+// ```
+//
+// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
+// ksize_cols * depth]` containing image patches with size
+// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
+// `out_rows` and `out_cols` are the dimensions of the output patches.
+func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "ExtractImagePatches",
+		Input: []tf.Input{
+			images,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the mean along sparse segments of a tensor.
+//
+// See `tf.sparse.segment_sum` for usage examples.
+//
+// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentMean",
+		Input: []tf.Input{
+			data, indices, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deserializes a serialized tree ensemble config and replaces current tree
+//
+// ensemble.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//	stamp_token: Token to use as the new value of the resource stamp.
+//	tree_ensemble_serialized: Serialized proto of the ensemble.
+//
+// Returns the created operation.
+func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesDeserializeEnsemble",
+		Input: []tf.Input{
+			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Transforms a tf.Example proto (as a string) into typed tensors.
+//
+// Arguments:
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	dense_defaults: A list of Tensors (some may be empty), whose length matches
+// the length of `dense_keys`. dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	num_sparse: The number of sparse features to be parsed from the example. This
+// must match the lengths of `sparse_keys` and `sparse_types`.
+//	sparse_keys: A list of `num_sparse` strings.
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: The keys expected in the Examples' features associated with dense
+// values.
+//	sparse_types: A list of `num_sparse` types; the data types of data in each
+// Feature given in sparse_keys.
+// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: The shapes of data in each Feature given in dense_keys.
+// The length of this list must match the length of `dense_keys`.  The
+// number of elements in the Feature corresponding to dense_key[j] must
+// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
+// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
+// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
+// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
+// D1, .., DN), where M is the number of blocks of elements of length
+// D1 * .... * DN, in the input.
+func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	opspec := tf.OpSpec{
+		Type: "ParseSingleExample",
+		Input: []tf.Input{
+			serialized, tf.OutputList(dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
+}
+
+// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
+type WholeFileReaderV2Attr func(optionalAttr)
+
+// WholeFileReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the entire contents of a file as a value.
+//
+// To use, enqueue filenames in a Queue.  The output of ReaderRead will
+// be a filename (key) and the contents of that file (value).
+//
+// Returns The handle to reference the Reader.
+func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "WholeFileReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Says whether the targets are in the top `K` predictions.
+//
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
+//
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+//
+// Arguments:
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
+//
+// Returns Computed Precision at `k` as a `bool Tensor`.
+func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"k": k}
+	opspec := tf.OpSpec{
+		Type: "InTopK",
+		Input: []tf.Input{
+			predictions, targets,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingAdagradParametersGradAccumDebug.
+type RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve Adagrad embedding parameters with debug support.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the Adagrad optimization algorithm.Parameter accumulators updated by the Adagrad optimization algorithm.Parameter gradient_accumulators updated by the Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingAdagradParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Serializes the tree handle to a proto
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource to be serialized.
+//
+// Returns Serialied proto string of the tree resource.
+func TensorForestTreeSerialize(scope *Scope, tree_handle tf.Output) (tree_config tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeSerialize",
+		Input: []tf.Input{
+			tree_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseMatMulAttr is an optional argument to SparseMatMul.
+type SparseMatMulAttr func(optionalAttr)
+
+// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["a_is_sparse"] = value
+	}
+}
+
+// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["b_is_sparse"] = value
+	}
+}
+
+// Multiply matrix "a" by matrix "b".
+//
+// The inputs must be two-dimensional matrices and the inner dimension of "a" must
+// match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not
+// `SparseTensor`s.  This op is optimized for the case where at least one of "a" or
+// "b" is sparse, in the sense that they have a large proportion of zero values.
+// The breakeven for using this versus a dense matrix multiply on one platform was
+// 30% zero values in the sparse matrix.
+//
+// The gradient computation of this operation will only take advantage of sparsity
+// in the input gradient when that gradient comes from a Relu.
+func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseMatMul",
+		Input: []tf.Input{
+			a, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ExperimentalThreadPoolHandleAttr is an optional argument to ExperimentalThreadPoolHandle.
+type ExperimentalThreadPoolHandleAttr func(optionalAttr)
+
+// ExperimentalThreadPoolHandleMaxIntraOpParallelism sets the optional max_intra_op_parallelism attribute to value.
+//
+// value: The maximum degree of parallelism to use within operations that execute on this
+// threadpool.
+// If not specified, defaults to 1
+func ExperimentalThreadPoolHandleMaxIntraOpParallelism(value int64) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["max_intra_op_parallelism"] = value
+	}
+}
+
+// ExperimentalThreadPoolHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func ExperimentalThreadPoolHandleContainer(value string) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// ExperimentalThreadPoolHandleSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func ExperimentalThreadPoolHandleSharedName(value string) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+//
+// Arguments:
+//	num_threads: The number of threads in the thread pool.
+//	display_name: A human-readable name for the threads that may be visible in some
+// visualizations.
+// threadpool.
+//
+// Returns A resource that can be consumed by one or more ExperimentalThreadPoolDataset
+// ops.
+func ExperimentalThreadPoolHandle(scope *Scope, num_threads int64, display_name string, optional ...ExperimentalThreadPoolHandleAttr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_threads": num_threads, "display_name": display_name}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalThreadPoolHandle",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug.
+type LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load proximal Adagrad embedding parameters with debug support.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the proximal Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the proximal Adagrad optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the proximal Adagrad optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug",
+		Input: []tf.Input{
+			parameters, accumulators, gradient_accumulators,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// LoadTPUEmbeddingProximalAdagradParametersAttr is an optional argument to LoadTPUEmbeddingProximalAdagradParameters.
+type LoadTPUEmbeddingProximalAdagradParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingProximalAdagradParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingProximalAdagradParametersTableId(value int64) LoadTPUEmbeddingProximalAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingProximalAdagradParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingProximalAdagradParametersTableName(value string) LoadTPUEmbeddingProximalAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load proximal Adagrad embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the proximal Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the proximal Adagrad optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingProximalAdagradParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingProximalAdagradParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingProximalAdagradParameters",
+		Input: []tf.Input{
+			parameters, accumulators,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Get the current size of the TensorArray.
+//
+// Arguments:
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns The current size of the TensorArray.
+func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArraySizeV3",
+		Input: []tf.Input{
+			handle, flow_in,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes gradients for the scaled exponential linear (Selu) operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Selu operation.
+//	outputs: The outputs of the corresponding Selu operation.
+//
+// Returns The gradients: `gradients * (outputs + scale * alpha)`
+// if outputs < 0, `scale * gradients` otherwise.
+func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SeluGrad",
+		Input: []tf.Input{
+			gradients, outputs,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
+type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+//
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
+//
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyFtrlV2",
+		Input: []tf.Input{
+			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// SumAttr is an optional argument to Sum.
+type SumAttr func(optionalAttr)
+
+// SumKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SumKeepDims(value bool) SumAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Sum",
+		Input: []tf.Input{
+			input, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
+
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
+//
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
+//
+// Arguments:
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseToSparseSetOperation",
+		Input: []tf.Input{
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Computes softmax cross entropy cost and gradients to backpropagate.
+//
+// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
+// a matrix of label probabilities, but rather a single label per row
+// of features.  This label is considered to have probability 1.0 for the
+// given row.
+//
+// Inputs are the logits, not probabilities.
+//
+// Arguments:
+//	features: batch_size x num_classes matrix
+//	labels: batch_size vector with values in [0, num_classes).
+// This is the label for the given minibatch entry.
+//
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSoftmaxCrossEntropyWithLogits",
+		Input: []tf.Input{
+			features, labels,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
+type StridedSliceGradAttr func(optionalAttr)
+
+// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
+	}
+}
+
+// StridedSliceGradEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Returns the gradient of `StridedSlice`.
+//
+// Since `StridedSlice` cuts out pieces of its `input` which is size
+// `shape`, its gradient will have the same shape (which is passed here
+// as `shape`). The gradient will be zero in any element that the slice
+// does not select.
+//
+// Arguments are the same as StridedSliceGrad with the exception that
+// `dy` is the input gradient to be propagated and `shape` is the
+// shape of `StridedSlice`'s `input`.
+func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StridedSliceGrad",
+		Input: []tf.Input{
+			shape, begin, end, strides, dy,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LoadTPUEmbeddingRMSPropParametersAttr is an optional argument to LoadTPUEmbeddingRMSPropParameters.
+type LoadTPUEmbeddingRMSPropParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingRMSPropParametersTableId(value int64) LoadTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingRMSPropParametersTableName(value string) LoadTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load RMSProp embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the RMSProp optimization algorithm.
+//	ms: Value of ms used in the RMSProp optimization algorithm.
+//	mom: Value of mom used in the RMSProp optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingRMSPropParameters(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingRMSPropParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingRMSPropParameters",
+		Input: []tf.Input{
+			parameters, ms, mom,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes the gradient for the inverse of `x` wrt its input.
+//
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReciprocalGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// EuclideanNormAttr is an optional argument to EuclideanNorm.
+type EuclideanNormAttr func(optionalAttr)
+
+// EuclideanNormKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func EuclideanNormKeepDims(value bool) EuclideanNormAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the euclidean norm of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func EuclideanNorm(scope *Scope, input tf.Output, axis tf.Output, optional ...EuclideanNormAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EuclideanNorm",
+		Input: []tf.Input{
+			input, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the element-wise min of two SparseTensors.
+//
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+//
+// Arguments:
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSparseMinimum",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
+type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyAdagradDA",
+		Input: []tf.Input{
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
+
+// EncodeJpegFormat sets the optional format attribute to value.
+//
+// value: Per pixel image format.
+// If not specified, defaults to ""
+func EncodeJpegFormat(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["format"] = value
+	}
+}
+
+// EncodeJpegQuality sets the optional quality attribute to value.
+//
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["quality"] = value
+	}
+}
+
+// EncodeJpegProgressive sets the optional progressive attribute to value.
+//
+// value: If True, create a JPEG that loads progressively (coarse to fine).
+// If not specified, defaults to false
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["progressive"] = value
+	}
+}
+
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+//
+// value: If True, spend CPU/RAM to reduce size with no quality change.
+// If not specified, defaults to false
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["optimize_size"] = value
+	}
+}
+
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+//
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["chroma_downsampling"] = value
+	}
+}
+
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+//
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["density_unit"] = value
+	}
+}
+
+// EncodeJpegXDensity sets the optional x_density attribute to value.
+//
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
+	}
+}
+
+// EncodeJpegYDensity sets the optional y_density attribute to value.
+//
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["y_density"] = value
+	}
+}
+
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+//
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["xmp_metadata"] = value
+	}
+}
+
+// JPEG-encode an image.
+//
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
+//
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
+//
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
+//
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
+//
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
+//
+// Arguments:
+//	image: 3-D with shape `[height, width, channels]`.
+//
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EncodeJpeg",
+		Input: []tf.Input{
+			image,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MultinomialAttr is an optional argument to Multinomial.
+type MultinomialAttr func(optionalAttr)
+
+// MultinomialSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 is set to be non-zero, the internal random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func MultinomialSeed(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// MultinomialSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func MultinomialSeed2(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// MultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["output_dtype"] = value
+	}
+}
+
+// Draws samples from a multinomial distribution.
+//
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Multinomial",
+		Input: []tf.Input{
+			logits, num_samples,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingRMSPropParametersAttr is an optional argument to RetrieveTPUEmbeddingRMSPropParameters.
+type RetrieveTPUEmbeddingRMSPropParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingRMSPropParametersTableId(value int64) RetrieveTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingRMSPropParametersTableName(value string) RetrieveTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve RMSProp embedding parameters.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the RMSProp optimization algorithm.Parameter ms updated by the RMSProp optimization algorithm.Parameter mom updated by the RMSProp optimization algorithm.
+func RetrieveTPUEmbeddingRMSPropParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingRMSPropParametersAttr) (parameters tf.Output, ms tf.Output, mom tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingRMSPropParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
+type QuantizedRelu6Attr func(optionalAttr)
+
+// QuantizedRelu6OutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
+//
+// Arguments:
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedRelu6",
+		Input: []tf.Input{
+			features, min_features, max_features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// BatchMatMulAttr is an optional argument to BatchMatMul.
+type BatchMatMulAttr func(optionalAttr)
+
+// BatchMatMulAdjX sets the optional adj_x attribute to value.
+//
+// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjX(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_x"] = value
+	}
+}
+
+// BatchMatMulAdjY sets the optional adj_y attribute to value.
+//
+// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_y"] = value
+	}
+}
+
+// Multiplies slices of two tensors in batches.
+//
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be adjointed (to adjoint a matrix
+// means to transpose and conjugate it) before multiplication by setting
+// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
+//
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
+//
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+//
+//     r_o = c_x if adj_x else r_x
+//     c_o = r_y if adj_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+//
+// Arguments:
+//	x: 2-D or higher with shape `[..., r_x, c_x]`.
+//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//
+// Returns 3-D or higher with shape `[..., r_o, c_o]`
+func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "BatchMatMul",
+		Input: []tf.Input{
+			x, y,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ParseSequenceExampleAttr is an optional argument to ParseSequenceExample.
+type ParseSequenceExampleAttr func(optionalAttr)
+
+// ParseSequenceExampleNcontextSparse sets the optional Ncontext_sparse attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func ParseSequenceExampleNcontextSparse(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Ncontext_sparse"] = value
+	}
+}
+
+// ParseSequenceExampleNcontextDense sets the optional Ncontext_dense attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func ParseSequenceExampleNcontextDense(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Ncontext_dense"] = value
+	}
+}
+
+// ParseSequenceExampleNfeatureListSparse sets the optional Nfeature_list_sparse attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func ParseSequenceExampleNfeatureListSparse(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Nfeature_list_sparse"] = value
+	}
+}
+
+// ParseSequenceExampleNfeatureListDense sets the optional Nfeature_list_dense attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Nfeature_list_dense"] = value
+	}
+}
+
+// ParseSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+//
+// value: A list of Ncontext_sparse types; the data types of data in
+// each context Feature given in context_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_sparse_types"] = value
+	}
+}
+
+// ParseSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_types"] = value
+	}
+}
+
+// ParseSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
+//
+// value: A list of Ncontext_dense shapes; the shapes of data in
+// each context Feature given in context_dense_keys.
+// The number of elements in the Feature corresponding to context_dense_key[j]
+// must always equal context_dense_shapes[j].NumEntries().
+// The shape of context_dense_values[j] will match context_dense_shapes[j].
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_dense_shapes"] = value
+	}
+}
+
+// ParseSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
+//
+// value: A list of Nfeature_list_sparse types; the data types
+// of data in each FeatureList given in feature_list_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_sparse_types"] = value
+	}
+}
+
+// ParseSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+//
+// value: A list of Nfeature_list_dense shapes; the shapes of
+// data in each FeatureList given in feature_list_dense_keys.
+// The shape of each Feature in the FeatureList corresponding to
+// feature_list_dense_key[j] must always equal
+// feature_list_dense_shapes[j].NumEntries().
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_shapes"] = value
+	}
+}
+
+// Transforms a vector of brain.SequenceExample protos (as strings) into typed tensors.
+//
+// Arguments:
+//	serialized: A vector containing binary serialized SequenceExample protos.
+//	debug_name: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) name for the
+// corresponding serialized proto.  This is purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no name is available.
+//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
+// context_dense_defaults[j] provides default values
+// when the SequenceExample's context map lacks context_dense_key[j].
+// If an empty Tensor is provided for context_dense_defaults[j],
+// then the Feature context_dense_keys[j] is required.
+// The input type is inferred from context_dense_defaults[j], even when it's
+// empty.  If context_dense_defaults[j] is not empty, its shape must match
+// context_dense_shapes[j].
+//	feature_list_dense_missing_assumed_empty: A vector listing the
+// FeatureList keys which may be missing from the SequenceExamples.  If the
+// associated FeatureList is missing, it is treated as empty.  By default,
+// any FeatureList not listed in this vector must exist in the SequenceExamples.
+//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with context_sparse
+// values.
+//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' context features associated with
+// dense values.
+//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
+// (scalars).  The keys expected in the FeatureLists associated with sparse
+// values.
+//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' feature_lists associated
+// with lists of dense values.
+func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Output, context_dense_defaults []tf.Output, feature_list_dense_missing_assumed_empty []string, context_sparse_keys []string, context_dense_keys []string, feature_list_sparse_keys []string, feature_list_dense_keys []string, optional ...ParseSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output, feature_list_dense_lengths []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"feature_list_dense_missing_assumed_empty": feature_list_dense_missing_assumed_empty, "context_sparse_keys": context_sparse_keys, "context_dense_keys": context_dense_keys, "feature_list_sparse_keys": feature_list_sparse_keys, "feature_list_dense_keys": feature_list_dense_keys}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ParseSequenceExample",
+		Input: []tf.Input{
+			serialized, debug_name, tf.OutputList(context_dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_dense_lengths, idx, err = makeOutputList(op, idx, "feature_list_dense_lengths"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths
+}
+
+// LoadTPUEmbeddingADAMParametersAttr is an optional argument to LoadTPUEmbeddingADAMParameters.
+type LoadTPUEmbeddingADAMParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingADAMParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingADAMParametersTableId(value int64) LoadTPUEmbeddingADAMParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingADAMParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingADAMParametersTableName(value string) LoadTPUEmbeddingADAMParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load ADAM embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the ADAM optimization algorithm.
+//	momenta: Value of momenta used in the ADAM optimization algorithm.
+//	velocities: Value of velocities used in the ADAM optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingADAMParameters(scope *Scope, parameters tf.Output, momenta tf.Output, velocities tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingADAMParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingADAMParameters",
+		Input: []tf.Input{
+			parameters, momenta, velocities,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Inverse 2D real-valued fast Fourier transform.
+//
+// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 2 dimensions of `input`.
+//
+// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft2
+// @end_compatibility
+func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IRFFT2D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// InfeedEnqueueTupleAttr is an optional argument to InfeedEnqueueTuple.
+type InfeedEnqueueTupleAttr func(optionalAttr)
+
+// InfeedEnqueueTupleLayouts sets the optional layouts attribute to value.
+//
+// value: A vector holding the requested layout in minor-to-major sequence for
+// all the tuple shapes, in the order the shapes appear in the "shapes" input.
+// The layout elements for a sub-shape can be set to -1, in which case the
+// corresponding layout will be computed by the infeed operation.
+// If not specified, defaults to <>
+func InfeedEnqueueTupleLayouts(value []int64) InfeedEnqueueTupleAttr {
+	return func(m optionalAttr) {
+		m["layouts"] = value
+	}
+}
+
+// InfeedEnqueueTupleDeviceOrdinal sets the optional device_ordinal attribute to value.
+//
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func InfeedEnqueueTupleDeviceOrdinal(value int64) InfeedEnqueueTupleAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// Feeds multiple Tensor values into the computation as an XLA tuple.
+//
+// Arguments:
+//	inputs: A list of tensors that will be provided using the infeed mechanism.
+//	shapes: The shapes of each tensor in `inputs`.
+//
+// Returns the created operation.
+func InfeedEnqueueTuple(scope *Scope, inputs []tf.Output, shapes []tf.Shape, optional ...InfeedEnqueueTupleAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shapes": shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "InfeedEnqueueTuple",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns which elements of x are finite.
+//
+// @compatibility(numpy)
+// Equivalent to np.isfinite
+// @end_compatibility
+func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsFinite",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
+type ResourceStridedSliceAssignAttr func(optionalAttr)
+
+// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Assign `value` to the sliced l-value reference of `ref`.
+//
+// The values of `value` are assigned to the positions in the variable
+// `ref` that are selected by the slice parameters. The slice parameters
+// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+//
+// NOTE this op currently does not support broadcasting and so `value`'s
+// shape must be exactly the shape produced by the slice of `ref`.
+//
+// Returns the created operation.
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceStridedSliceAssign",
+		Input: []tf.Input{
+			ref, begin, end, strides, value,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ArgMaxAttr is an optional argument to ArgMax.
+type ArgMaxAttr func(optionalAttr)
+
+// ArgMaxOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
+	}
+}
+
+// Returns the index with the largest value across dimensions of a tensor.
+//
+// Note that in case of ties the identity of the return value is not guaranteed.
+//
+// Arguments:
+//
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ArgMax",
+		Input: []tf.Input{
+			input, dimension,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Fetches multiple values from infeed as an XLA tuple.
+//
+// Arguments:
+//	dtypes: The element types of each element in `outputs`.
+//	shapes: The shapes of each tensor in `outputs`.
+//
+// Returns A list of tensors that will be provided using the infeed mechanism.
+func InfeedDequeueTuple(scope *Scope, dtypes []tf.DataType, shapes []tf.Shape) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes, "shapes": shapes}
+	opspec := tf.OpSpec{
+		Type: "InfeedDequeueTuple",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("InfeedDequeueTuple", err)
+		return
+	}
+	return outputs
+}
+
+// Enqueue multiple Tensor values on the computation outfeed.
+//
+// Arguments:
+//	inputs: A list of tensors that will be inserted into the outfeed queue as an
+// XLA tuple.
+//
+// Returns the created operation.
+func OutfeedEnqueueTuple(scope *Scope, inputs []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "OutfeedEnqueueTuple",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
+type ResourceApplyAdagradAttr func(optionalAttr)
+
+// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["update_slots"] = value
+	}
+}
+
+// Update '*var' according to the adagrad scheme.
+//
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAdagrad",
+		Input: []tf.Input{
+			var_, accum, lr, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// CudnnRNNV3Attr is an optional argument to CudnnRNNV3.
+type CudnnRNNV3Attr func(optionalAttr)
+
+// CudnnRNNV3RnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNV3RnnMode(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNV3InputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNV3InputMode(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNV3Direction sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNV3Direction(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNV3Dropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Dropout(value float32) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNV3Seed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Seed(value int64) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNV3Seed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Seed2(value int64) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// CudnnRNNV3IsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNV3IsTraining(value bool) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// A RNN backed by cuDNN.
+//
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer. Accepts one extra input "sequence_lengths" than CudnnRNN.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// sequence_lengths: a vector of lengths of each input sequence.
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inferenece or
+//   training.
+// reserve_space: An opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is true.
+func CudnnRNNV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, optional ...CudnnRNNV3Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNV3",
+		Input: []tf.Input{
+			input, input_h, input_c, params, sequence_lengths,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// Applies softmax to a batched N-D `SparseTensor`.
+//
+// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
+// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
+//
+// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
+// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
+// zero elements do not participate*.  Specifically, the algorithm is equivalent
+// to the following:
+//
+//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
+//       with shape `[B, C]`, along the size-C dimension;
+//   (2) Masks out the original implicitly-zero locations;
+//   (3) Renormalizes the remaining elements.
+//
+// Hence, the `SparseTensor` result has exactly the same non-zero indices and
+// shape.
+//
+// Arguments:
+//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
+// SparseTensor, in canonical ordering.
+//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//
+// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
+func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSoftmax",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a Tensor by indexing into the TensorList.
+//
+// Each row in the produced Tensor corresponds to the element in the TensorList
+// specified by the given index (see `tf.gather`).
+//
+// input_handle: The input tensor list.
+// indices: The indices used to index into the list.
+// values: The tensor.
+func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_shape tf.Output, element_dtype tf.DataType) (values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListGather",
+		Input: []tf.Input{
+			input_handle, indices, element_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
+type FixedLengthRecordReaderV2Attr func(optionalAttr)
+
+// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
+//
+// value: Number of bytes in the header, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["header_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
+//
+// value: Number of bytes in the footer, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["footer_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
+//
+// value: Number of bytes to hop before each read. Default of 0 means using
+// record_bytes.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["hop_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
+//
+// value: The type of encoding for the file. Currently ZLIB and GZIP
+// are supported. Defaults to none.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["encoding"] = value
+	}
+}
+
+// A Reader that outputs fixed-length records from a file.
+//
+// Arguments:
+//	record_bytes: Number of bytes in the record.
+//
+// Returns The handle to reference the Reader.
+func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"record_bytes": record_bytes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FixedLengthRecordReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// CompilationResultProto indicating the status of the TPU compilation.
+func TPUCompilationResult(scope *Scope) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TPUCompilationResult",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//
+// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
+// layer.
+func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesGetEnsembleStates",
+		Input: []tf.Input{
+			tree_ensemble_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
+type ResourceApplyPowerSignAttr func(optionalAttr)
+
+// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AddSign update.
+//
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+// variable <- variable - lr_t * update
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	logbase: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyPowerSign",
+		Input: []tf.Input{
+			var_, m, lr, logbase, sign_decay, beta, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Deprecated. Use TensorArraySplitV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
+func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArraySplitV2",
+		Input: []tf.Input{
+			handle, value, lengths, flow_in,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reshapes a SparseTensor to represent values in a new dense shape.
+//
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+//
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
+//
+// Reshaping does not affect the order of values in the SparseTensor.
+//
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//
+// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReshape",
+		Input: []tf.Input{
+			input_indices, input_shape, new_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Computes the product along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_prod(c, tf.constant([0, 0, 1]))
+// # ==> [[4, 6, 6, 4],
+// #      [5, 6, 7, 8]]
+// ```
+//
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentProd",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingFTRLParametersAttr is an optional argument to RetrieveTPUEmbeddingFTRLParameters.
+type RetrieveTPUEmbeddingFTRLParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingFTRLParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingFTRLParametersTableId(value int64) RetrieveTPUEmbeddingFTRLParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingFTRLParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingFTRLParametersTableName(value string) RetrieveTPUEmbeddingFTRLParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve FTRL embedding parameters.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the FTRL optimization algorithm.Parameter accumulators updated by the FTRL optimization algorithm.Parameter linears updated by the FTRL optimization algorithm.
+func RetrieveTPUEmbeddingFTRLParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingFTRLParametersAttr) (parameters tf.Output, accumulators tf.Output, linears tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingFTRLParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Connects outputs of an N-way replicated computation to N outputs.
+func TPUReplicatedOutput(scope *Scope, input tf.Output, num_replicas int64) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_replicas": num_replicas}
+	opspec := tf.OpSpec{
+		Type: "TPUReplicatedOutput",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("TPUReplicatedOutput", err)
+		return
+	}
+	return outputs
+}
+
+// LoadTPUEmbeddingFTRLParametersAttr is an optional argument to LoadTPUEmbeddingFTRLParameters.
+type LoadTPUEmbeddingFTRLParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingFTRLParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingFTRLParametersTableId(value int64) LoadTPUEmbeddingFTRLParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingFTRLParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingFTRLParametersTableName(value string) LoadTPUEmbeddingFTRLParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load FTRL embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the FTRL optimization algorithm.
+//	accumulators: Value of accumulators used in the FTRL optimization algorithm.
+//	linears: Value of linears used in the FTRL optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, linears tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingFTRLParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingFTRLParameters",
+		Input: []tf.Input{
+			parameters, accumulators, linears,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns (x - y)(x - y) element-wise.
+//
+// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SquaredDifference",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Push an element onto the tensor_array.
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	index: The position to write to inside the TensorArray.
+//	value: The tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayWriteV3",
+		Input: []tf.Input{
+			handle, index, value, flow_in,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingAdagradParametersAttr is an optional argument to RetrieveTPUEmbeddingAdagradParameters.
+type RetrieveTPUEmbeddingAdagradParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingAdagradParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdagradParametersTableId(value int64) RetrieveTPUEmbeddingAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingAdagradParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdagradParametersTableName(value string) RetrieveTPUEmbeddingAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve Adagrad embedding parameters.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the Adagrad optimization algorithm.Parameter accumulators updated by the Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingAdagradParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdagradParametersAttr) (parameters tf.Output, accumulators tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingAdagradParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
+//
+// Each comparison returns a boolean `true` (if `input_value > threshold`)
+// or and `false` otherwise.
+//
+// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
+// algorithms that use hashing approximations of cosine and `L2` distances;
+// codes can be generated from an input via:
+//
+// ```python
+// codebook_size = 50
+// codebook_bits = codebook_size * 32
+// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
+//                            dtype=x.dtype,
+//                            initializer=tf.orthogonal_initializer())
+// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
+// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
+// # now codes has shape x.shape[:-1] + [codebook_size]
+// ```
+//
+// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
+// by 8.
+//
+// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
+// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
+//
+// Arguments:
+//	input: Values to compare against `threshold` and bitpack.
+//	threshold: Threshold to compare against.
+//
+// Returns The bitpacked comparisons.
+func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "CompareAndBitpack",
+		Input: []tf.Input{
+			input, threshold,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QuantizeAndDequantizeV2Attr is an optional argument to QuantizeAndDequantizeV2.
+type QuantizeAndDequantizeV2Attr func(optionalAttr)
+
+// QuantizeAndDequantizeV2SignedInput sets the optional signed_input attribute to value.
+//
+// value: Whether the quantization is signed or unsigned. (actually this parameter should
+// have been called <b>`signed_output`</b>)
+// If not specified, defaults to true
+func QuantizeAndDequantizeV2SignedInput(value bool) QuantizeAndDequantizeV2Attr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
+	}
+}
+
+// QuantizeAndDequantizeV2NumBits sets the optional num_bits attribute to value.
+//
+// value: The bitwidth of the quantization.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeV2NumBits(value int64) QuantizeAndDequantizeV2Attr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// QuantizeAndDequantizeV2RangeGiven sets the optional range_given attribute to value.
+//
+// value: Whether the range is given or should be determined from the `input` tensor.
+// If not specified, defaults to false
+func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// QuantizeAndDequantizeV2RoundMode sets the optional round_mode attribute to value.
+//
+// value: The 'round_mode' attribute controls which rounding tie-breaking algorithm is
+// used when rounding float values to their quantized equivalents. The following
+// rounding modes are currently supported:
+//
+// *   HALF_TO_EVEN: this is the default round_mode.
+// *   HALF_UP: round towards positive. In this mode 7.5 rounds up to 8 and -7.5
+//     rounds up to -7.
+//
+// If not specified, defaults to "HALF_TO_EVEN"
+func QuantizeAndDequantizeV2RoundMode(value string) QuantizeAndDequantizeV2Attr {
+	return func(m optionalAttr) {
+		m["round_mode"] = value
+	}
+}
+
+// Quantizes then dequantizes a tensor.
+//
+// This op simulates the precision loss from the quantized forward pass by:
+//
+// 1. Quantizing the tensor to fixed point numbers, which should match the target
+//    quantization method when it is used in inference.
+// 2. Dequantizing it back to floating point numbers for the following ops, most
+//    likely matmul.
+//
+// There are different ways to quantize. This version uses only scaling, so 0.0
+// maps to 0.
+//
+// From the specified 'num_bits' in the quantized output type, it determines
+// minimum and maximum representable quantized values.
+//
+// e.g.
+//
+// *   [-128, 127] for signed, num_bits = 8, or
+// *   [0, 255] for unsigned, num_bits = 8.
+//
+// If range_given == False, the initial input_min, input_max will be determined
+// automatically as the minimum and maximum values in the input tensor, otherwise
+// the specified values of input_min, input_max are used.
+//
+// Note: If the input_min, input_max are specified, they do not need to equal the
+// actual minimum and maximum values in the tensor. e.g. in some cases it may be
+// beneficial to specify these values such that the low probability extremes of the
+// input distribution are clipped.
+//
+// This op determines the maximum scale_factor that would map the initial
+// [input_min, input_max] range to a range that lies within the representable
+// quantized range.
+//
+// It determines the scale from one of input_min and input_max, then updates the
+// other one to maximize the respresentable range.
+//
+// e.g.
+//
+// *   if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
+//     5.0]: it would use a scale_factor of -128 / -10.0 = 12.8 In this case, it
+//     would update input_max to be 127 / 12.8 = 9.921875
+// *   if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
+//     10.0]: it would use a scale_factor of 127 / 10.0 = 12.7 In this case, it
+//     would update input_min to be 128.0 / 12.7 = -10.07874
+// *   if the output is unsigned, input_min is forced to be 0, and only the
+//     specified input_max is used.
+//
+// After determining the scale_factor and updating the input range, it applies the
+// following to each value in the 'input' tensor.
+//
+// output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor.
+//
+// The above round function rounds the value based on the given round_mode.
+//
+//
+// Arguments:
+//	input: Tensor to quantize and then dequantize.
+//	input_min: If `range_given == True`, this specifies the minimum input value that needs to
+// be represented, otherwise it is determined from the min value of the `input`
+// tensor.
+//	input_max: If `range_given == True`, this specifies the maximum input value that needs to
+// be represented, otherwise it is determined from the max value of the `input`
+// tensor.
+func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, optional ...QuantizeAndDequantizeV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizeAndDequantizeV2",
+		Input: []tf.Input{
+			input, input_min, input_max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// A TPU core selector Op.
+//
+// This Op produces a set of TPU cores (for warm-up) or a single TPU core
+// (for regular inference) to execute the TPU program on. The output is
+// consumed by TPUPartitionedCall.
+//
+// Returns A vector 1 or more TPU cores.
+func TPUOrdinalSelector(scope *Scope) (device_ordinals tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TPUOrdinalSelector",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Looks up keys in a table, outputs the corresponding values.
+//
+// The tensor `keys` must of the same type as the keys of the table.
+// The output `values` is of the type of the table values.
+//
+// The scalar `default_value` is the value output for keys not present in the
+// table. It must also be of the same type as the table values.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//
+//
+// Returns Same shape as `keys`.  Values found in the table, or `default_values`
+// for missing keys.
+func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LookupTableFindV2",
+		Input: []tf.Input{
+			table_handle, keys, default_value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the RMSProp algorithm.
+//
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyRMSProp",
+		Input: []tf.Input{
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns the truth value of (x > y) element-wise.
+//
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Greater",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a TensorList by indexing into a Tensor.
+//
+// Each member of the TensorList corresponds to one row of the input tensor,
+// specified by the given index (see `tf.gather`).
+//
+// tensor: The input tensor.
+// indices: The indices used to index into the list.
+// element_shape: The shape of the elements in the list (can be less specified than
+//   the shape of the tensor).
+// num_elements: The size of the output list. Must be large enough to accommodate
+//   the largest index in indices. If -1, the list is just large enough to include
+//   the largest index in indices.
+// output_handle: The TensorList.
+func TensorListScatterV2(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output, num_elements tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListScatterV2",
+		Input: []tf.Input{
+			tensor, indices, element_shape, num_elements,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
+
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+//
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["min_object_covered"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+//
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
+	}
+}
+
+// Generate a single randomly distorted bounding box for an image.
+//
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
+//
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
+//
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
+//
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SampleDistortedBoundingBox",
+		Input: []tf.Input{
+			image_size, bounding_boxes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
+type ResourceScatterNdUpdateAttr func(optionalAttr)
+
+// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
+//
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Applies sparse `updates` to individual values or slices within a given
+//
+// variable according to `indices`.
+//
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// ```
+//
+// For example, say we want to update 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that update would look like this:
+//
+// ```python
+//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_update(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, 11, 3, 10, 9, 6, 7, 12]
+//
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
+//
+// Arguments:
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated
+// values to add to ref.
+//
+// Returns the created operation.
+func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterNdUpdate",
+		Input: []tf.Input{
+			ref, indices, updates,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// UnicodeDecodeWithOffsetsAttr is an optional argument to UnicodeDecodeWithOffsets.
+type UnicodeDecodeWithOffsetsAttr func(optionalAttr)
+
+// UnicodeDecodeWithOffsetsErrors sets the optional errors attribute to value.
+//
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeDecodeWithOffsetsErrors(value string) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeDecodeWithOffsetsReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
+// If not specified, defaults to 65533
+func UnicodeDecodeWithOffsetsReplacementChar(value int64) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// UnicodeDecodeWithOffsetsReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeDecodeWithOffsetsReplaceControlCharacters(value bool) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// Decodes each string in `input` into a sequence of Unicode code points.
+//
+// The character codepoints for all strings are returned using a single vector
+// `char_values`, with strings expanded to characters in row-major order.
+// Similarly, the character start byte offsets are returned using a single vector
+// `char_to_byte_starts`, with strings expanded in row-major order.
+//
+// The `row_splits` tensor indicates where the codepoints and start offsets for
+// each input string begin and end within the `char_values` and
+// `char_to_byte_starts` tensors.  In particular, the values for the `i`th
+// string (in row-major order) are stored in the slice
+// `[row_splits[i]:row_splits[i+1]]`. Thus:
+//
+// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `char_to_bytes_starts[row_splits[i]+j]` is the start byte offset for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+//   string (in row-major order).
+//
+// Arguments:
+//	input: The text to be decoded. Can have any shape. Note that the output is flattened
+// to a vector of char values.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//
+// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.A 1D int32 Tensor containing the byte index in the input string where each
+// character in `char_values` starts.
+func UnicodeDecodeWithOffsets(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeWithOffsetsAttr) (row_splits tf.Output, char_values tf.Output, char_to_byte_starts tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"input_encoding": input_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UnicodeDecodeWithOffsets",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns x - y element-wise.
+//
+// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sub",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
+
+// LRNDepthRadius sets the optional depth_radius attribute to value.
+//
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
+	}
+}
+
+// LRNBias sets the optional bias attribute to value.
+//
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNAlpha sets the optional alpha attribute to value.
+//
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNBeta(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Local Response Normalization.
+//
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
+//
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
+//
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+//
+// Arguments:
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LRN",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug.
+type RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve proximal Adagrad embedding parameters with debug support.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the proximal Adagrad optimization algorithm.Parameter accumulators updated by the proximal Adagrad optimization algorithm.Parameter gradient_accumulators updated by the proximal Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["update_slots"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyAdagrad",
+		Input: []tf.Input{
+			var_, accum, lr, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// LoadTPUEmbeddingMomentumParametersAttr is an optional argument to LoadTPUEmbeddingMomentumParameters.
+type LoadTPUEmbeddingMomentumParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingMomentumParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingMomentumParametersTableId(value int64) LoadTPUEmbeddingMomentumParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingMomentumParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingMomentumParametersTableName(value string) LoadTPUEmbeddingMomentumParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Momentum embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the Momentum optimization algorithm.
+//	momenta: Value of momenta used in the Momentum optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingMomentumParameters(scope *Scope, parameters tf.Output, momenta tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMomentumParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingMomentumParameters",
+		Input: []tf.Input{
+			parameters, momenta,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Assigns sparse updates to the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstageNoKey",
+		Type: "ResourceScatterUpdate",
 		Input: []tf.Input{
-			indices,
+			resource, indices, updates,
 		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
 	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstageNoKey", err)
-		return
+	return scope.AddOperation(opspec)
+}
+
+// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
+type HistogramFixedWidthAttr func(optionalAttr)
+
+// HistogramFixedWidthDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT32
+func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
 	}
-	return key, values
 }
 
-// Returns element-wise integer closest to x.
+// Return histogram of values.
 //
-// If the result is midway between two representable values,
-// the even representable is chosen.
-// For example:
+// Given the tensor `values`, this operation returns a rank 1 histogram counting
+// the number of entries in `values` that fall into every bin.  The bins are
+// equal width and determined by the arguments `value_range` and `nbins`.
 //
+// ```python
+// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+// nbins = 5
+// value_range = [0.0, 5.0]
+// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+//
+// with tf.get_default_session() as sess:
+//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
+//   variables.global_variables_initializer().run()
+//   sess.run(hist) => [2, 1, 1, 0, 2]
 // ```
-// rint(-1.5) ==> -2.0
-// rint(0.5000001) ==> 1.0
-// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
-// ```
-func Rint(scope *Scope, x tf.Output) (y tf.Output) {
+//
+// Arguments:
+//	values: Numeric `Tensor`.
+//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
+// values <= value_range[0] will be mapped to hist[0],
+// values >= value_range[1] will be mapped to hist[-1].
+//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
+//
+// Returns A 1-D `Tensor` holding histogram of values.
+func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Rint",
+		Type: "HistogramFixedWidth",
 		Input: []tf.Input{
-			x,
+			values, value_range, nbins,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
-type ResourceApplyMomentumAttr func(optionalAttr)
-
-// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
+// Elementwise computes the bitwise right-shift of `x` and `y`.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// Performs a logical shift for unsigned integer types, and an arithmetic shift
+// for signed integer types.
+//
+// If `y` is negative, or greater than or equal to than the width of `x` in bits
+// the result is implementation defined.
+func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RightShift",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
+// TensorListStackAttr is an optional argument to TensorListStack.
+type TensorListStackAttr func(optionalAttr)
+
+// TensorListStackNumElements sets the optional num_elements attribute to value.
+// If not specified, defaults to -1
+func TensorListStackNumElements(value int64) TensorListStackAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["num_elements"] = value
 	}
 }
 
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
-//
-// want to use Nesterov momentum.
+// Stacks all tensors in the list.
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// Requires that all tensors have the same shape.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
+// input_handle: the input list
+// tensor: the gathered result
+// num_elements: optional. If not -1, the number of elements in the list.
 //
-// Returns the created operation.
-func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
+func TensorListStack(scope *Scope, input_handle tf.Output, element_shape tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyMomentum",
+		Type: "TensorListStack",
 		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
+			input_handle, element_shape,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SubstrAttr is an optional argument to Substr.
-type SubstrAttr func(optionalAttr)
+// A placeholder op for a value that will be fed into the computation.
+//
+// Arguments:
+//	dtype: The type of elements in the tensor.
+//	shape: The shape of the tensor.
+//
+// Returns A tensor that will be provided using the infeed mechanism.
+func InfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "InfeedDequeue",
 
-// SubstrUnit sets the optional unit attribute to value.
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
+type StatelessRandomUniformAttr func(optionalAttr)
+
+// StatelessRandomUniformDtype sets the optional dtype attribute to value.
 //
-// value: The unit that is used to create the substring.  One of: `"BYTE"` (for
-// defining position and length by bytes) or `"UTF8_CHAR"` (for the UTF-8
-// encoded Unicode code points).  The default is `"BYTE"`. Results are undefined if
-// `unit=UTF8_CHAR` and the `input` strings do not contain structurally valid
-// UTF-8.
-// If not specified, defaults to "BYTE"
-func SubstrUnit(value string) SubstrAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
 	return func(m optionalAttr) {
-		m["unit"] = value
+		m["dtype"] = value
 	}
 }
 
-// Return substrings from `Tensor` of strings.
-//
-// For each string in the input `Tensor`, creates a substring starting at index
-// `pos` with a total length of `len`.
-//
-// If `len` defines a substring that would extend beyond the length of the input
-// string, then as many characters as possible are used.
-//
-// A negative `pos` indicates distance within the string backwards from the end.
-//
-// If `pos` specifies an index which is out of range for any of the input strings,
-// then an `InvalidArgumentError` is thrown.
-//
-// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
-// Op creation.
-//
-// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
-// broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-//
-// ---
+// Outputs deterministic pseudorandom random values from a uniform distribution.
 //
-// Examples
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
-// Using scalar `pos` and `len`:
+// The outputs are a deterministic function of `shape` and `seed`.
 //
-// ```python
-// input = [b'Hello', b'World']
-// position = 1
-// length = 3
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// output = [b'ell', b'orl']
-// ```
+// Returns Random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessRandomUniform",
+		Input: []tf.Input{
+			shape, seed,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Makes its input available to the next iteration.
 //
-// Using `pos` and `len` with same shape as `input`:
+// Arguments:
+//	data: The tensor to be made available to the next iteration.
 //
-// ```python
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen']]
-// position = [[1, 2, 3],
-//             [1, 2, 3],
-//             [1, 2, 3]]
-// length =   [[2, 3, 4],
-//             [4, 3, 2],
-//             [5, 5, 5]]
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NextIteration",
+		Input: []tf.Input{
+			data,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Output a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Fact",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
+type GenerateVocabRemappingAttr func(optionalAttr)
+
+// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
 //
-// output = [[b'en', b'eve', b'lve'],
-//           [b'hirt', b'urt', b'te'],
-//           [b'ixtee', b'vente', b'hteen']]
-// ```
+// value: Number of entries in the old vocab file to consider.  If -1,
+// use the entire old vocabulary.
+// If not specified, defaults to -1
 //
-// Broadcasting `pos` and `len` onto `input`:
+// REQUIRES: value >= -1
+func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
+	return func(m optionalAttr) {
+		m["old_vocab_size"] = value
+	}
+}
+
+// Given a path to new and old vocabulary files, returns a remapping Tensor of
 //
-// ```
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen'],
-//          [b'nineteen', b'twenty', b'twentyone']]
-// position = [1, 2, 3]
-// length =   [1, 2, 3]
+// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
+// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
+// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
+// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
+// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
+// default value of -1.
 //
-// output = [[b'e', b'ev', b'lve'],
-//           [b'h', b'ur', b'tee'],
-//           [b'i', b've', b'hte'],
-//           [b'i', b'en', b'nty']]
-// ```
+// `num_vocab_offset` enables
+// use in the partitioned variable case, and should generally be set through
+// examining partitioning info.  The format of the files should be a text file,
+// with each line containing a single entity within the vocabulary.
 //
-// Broadcasting `input` onto `pos` and `len`:
+// For example, with `new_vocab_file` a text file containing each of the following
+// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
+// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
+// `[0, -1, 2]`.
 //
-// ```
-// input = b'thirteen'
-// position = [1, 5, 7]
-// length =   [3, 2, 1]
+// The op also returns a count of how many entries in the new vocabulary
+// were present in the old vocabulary, which is used to calculate the number of
+// values to initialize in a weight matrix remapping
 //
-// output = [b'hir', b'ee', b'n']
-// ```
+// This functionality can be used to remap both row vocabularies (typically,
+// features) and column vocabularies (typically, classes) from TensorFlow
+// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
+// corresponding to div-partitioned variables.  Moreover, the underlying remapping
+// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
+// use the corresponding index_table_from_file() as the FeatureColumn framework
+// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
 //
 // Arguments:
-//	input: Tensor of strings
-//	pos: Scalar defining the position of first character in each substring
-//	len: Scalar defining the number of characters to include in each substring
+//	new_vocab_file: Path to the new vocab file.
+//	old_vocab_file: Path to the old vocab file.
+//	new_vocab_offset: How many entries into the new vocab file to start reading.
+//	num_new_vocab: Number of entries in the new vocab file to remap.
 //
-// Returns Tensor of substrings
-func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) {
+// Returns A Tensor of length num_new_vocab where the element at index i
+// is equal to the old ID that maps to the new ID i.  This element is -1 for any
+// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
+func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Substr",
+		Type: "GenerateVocabRemapping",
 		Input: []tf.Input{
-			input, pos, len,
+			new_vocab_file, old_vocab_file,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Exits the current frame to its parent frame.
+// Worker heartbeat op.
 //
-// Exit makes its input `data` available to the parent frame.
+// Heartbeats may be sent periodically to indicate the coordinator is still active,
+// to retrieve the current worker status and to expedite shutdown when necessary.
 //
 // Arguments:
-//	data: The tensor to be made available to the parent frame.
+//	request: A string tensor containing a serialized WorkerHeartbeatRequest
 //
-// Returns The same tensor as `data`.
-func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns A string tensor containing a serialized WorkerHeartbeatResponse
+func WorkerHeartbeat(scope *Scope, request tf.Output) (response tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Exit",
+		Type: "WorkerHeartbeat",
 		Input: []tf.Input{
-			data,
+			request,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Produce a string tensor that encodes the state of a Reader.
-//
-// Not all Readers support being serialized, so this can produce an
-// Unimplemented error.
+// Returns the truth value of (x <= y) element-wise.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderSerializeStateV2",
+		Type: "LessEqual",
 		Input: []tf.Input{
-			reader_handle,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates quantized tensors along one dimension.
+// EnqueueTPUEmbeddingIntegerBatchAttr is an optional argument to EnqueueTPUEmbeddingIntegerBatch.
+type EnqueueTPUEmbeddingIntegerBatchAttr func(optionalAttr)
+
+// EnqueueTPUEmbeddingIntegerBatchDeviceOrdinal sets the optional device_ordinal attribute to value.
+//
+// value: The TPU device to use. Should be >= 0 and less than the number
+// of TPU cores in the task on which the node is placed.
+// If not specified, defaults to -1
+func EnqueueTPUEmbeddingIntegerBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingIntegerBatchAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// An op that enqueues a list of input batch tensors to TPUEmbedding.
 //
 // Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	input_mins: The minimum scalar values for each of the input tensors.
-//	input_maxes: The maximum scalar values for each of the input tensors.
+//	batch: A list of 1D tensors, one for each embedding table, containing the
+// indices into the tables.
+//	mode_override: A string input that overrides the mode specified in the
+// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+// in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns the created operation.
+func EnqueueTPUEmbeddingIntegerBatch(scope *Scope, batch []tf.Output, mode_override tf.Output, optional ...EnqueueTPUEmbeddingIntegerBatchAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConcat",
+		Type: "EnqueueTPUEmbeddingIntegerBatch",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
+			tf.OutputList(batch), mode_override,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Slice a `SparseTensor` based on the `start` and `size`.
-//
-// For example, if the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
+// An op that receives embedding activations on the TPU.
 //
-//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
-//
-//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// The TPU system performs the embedding lookups and aggregations specified by
+// the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The
+// results of these aggregations are visible to the Tensorflow Graph as the
+// outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing
+// one Tensor of activations per table specified in the model. There can be at
+// most one RecvTPUEmbeddingActivations op in the TPU graph.
 //
 // Arguments:
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-//	start: 1-D. tensor represents the start of the slice.
-//	size: 1-D. tensor represents the size of the slice.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
+//	num_outputs: The number of output activation tensors, equal to the number of
+// embedding tables in the model.
+//	config: Serialized TPUEmbeddingConfiguration proto.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns A TensorList of embedding activations containing one Tensor per
+// embedding table in the model.
+func RecvTPUEmbeddingActivations(scope *Scope, num_outputs int64, config string) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_outputs": num_outputs, "config": config}
 	opspec := tf.OpSpec{
-		Type: "SparseSlice",
-		Input: []tf.Input{
-			indices, values, shape, start, size,
-		},
+		Type: "RecvTPUEmbeddingActivations",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RecvTPUEmbeddingActivations", err)
+		return
+	}
+	return outputs
 }
 
-// Reduces sparse updates into the variable referenced by `resource` using the `min` operation.
+// Selects elements from `x` or `y`, depending on `condition`.
 //
-// This operation computes
+// The `x`, and `y` tensors must all have the same shape, and the
+// output will also have that shape.
 //
-//     # Scalar indices
-//     ref[indices, ...] = min(ref[indices, ...], updates[...])
+// The `condition` tensor must be a scalar if `x` and `y` are scalars.
+// If `x` and `y` are vectors or higher rank, then `condition` must be either a
+// scalar, a vector with size matching the first dimension of `x`, or must have
+// the same shape as `x`.
 //
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+// The `condition` tensor acts as a mask that chooses, based on the value at each
+// element, whether the corresponding element / row in the output should be
+// taken from `x` (if true) or `y` (if false).
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+// If `condition` is a vector and `x` and `y` are higher rank matrices, then
+// it chooses which row (outer dimension) to copy from `x` and `y`.
+// If `condition` has the same shape as `x` and `y`, then it chooses which
+// element to copy from `x` and `y`.
 //
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions are combined.
+// For example:
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// ```python
+// # 'condition' tensor is [[True,  False]
+// #                        [False, True]]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e)  # => [[1, 6], [7, 4]]
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+//
+// # 'condition' tensor is [True, False]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 2],
+//                              [7, 8]]
+//
+// ```
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns the created operation.
-func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+//	x: = A `Tensor` which may have the same shape as `condition`.
+// If `condition` is rank 1, `x` may have higher rank,
+// but its first dimension must match the size of `condition`.
+//	y: = A `Tensor` with the same type and shape as `x`.
+//
+// Returns = A `Tensor` with the same type and shape as `x` and `y`.
+func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterMin",
+		Type: "Select",
 		Input: []tf.Input{
-			resource, indices, updates,
+			condition, x, y,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Reshapes a quantized tensor as per the Reshape op.
+// Returns the set of files matching one or more glob patterns.
 //
-// ```
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
+// Note also that the order of filenames returned can be non-deterministic.
 //
 // Arguments:
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
 //
-//	shape: Defines the shape of the output tensor.
-//	input_min: The minimum value of the input.
-//	input_max: The maximum value of the input.
-//
-// Returns This value is copied from input_min.This value is copied from input_max.
-func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns A vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReshape",
+		Type: "MatchingFiles",
 		Input: []tf.Input{
-			tensor, shape, input_min, input_max,
+			pattern,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// StringSplitAttr is an optional argument to StringSplit.
-type StringSplitAttr func(optionalAttr)
+// SqueezeAttr is an optional argument to Squeeze.
+type SqueezeAttr func(optionalAttr)
 
-// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
+// SqueezeAxis sets the optional axis attribute to value.
 //
-// value: A `bool`. If `True`, skip the empty strings from the result.
-// If not specified, defaults to true
-func StringSplitSkipEmpty(value bool) StringSplitAttr {
+// value: If specified, only squeezes the dimensions listed. The dimension
+// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+// be in the range `[-rank(input), rank(input))`.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func SqueezeAxis(value []int64) SqueezeAttr {
 	return func(m optionalAttr) {
-		m["skip_empty"] = value
+		m["squeeze_dims"] = value
 	}
 }
 
-// Split elements of `input` based on `delimiter` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `input` based on `delimiter` and return a `SparseTensor`
-// containing the splitted tokens. Empty tokens are ignored.
+// Removes dimensions of size 1 from the shape of a tensor.
 //
-// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
-//  empty string, each element of `input` is split into individual single-byte
-//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
-//  every character of `delimiter` is a potential split point.
+// Given a tensor `input`, this operation returns a tensor of the same type with
+// all dimensions of size 1 removed. If you don't want to remove all size 1
+// dimensions, you can remove specific size 1 dimensions by specifying
+// `axis`.
 //
 // For example:
-//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
-//   will be
 //
-//   indices = [0, 0;
-//              0, 1;
-//              1, 0;
-//              1, 1;
-//              1, 2]
-//   shape = [2, 3]
-//   values = ['hello', 'world', 'a', 'b', 'c']
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t)) ==> [2, 3]
+// ```
+//
+// Or, to remove specific size 1 dimensions:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+// ```
 //
 // Arguments:
-//	input: 1-D. Strings to split.
-//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//	input: The `input` to squeeze.
 //
-// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
-// tensor, where the first value is N and the second value is the maximum number
-// of tokens in a single input entry.
-func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
+// Returns Contains the same data as `input`, but has one or more dimensions of
+// size 1 removed.
+func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14692,62 +18134,48 @@ func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringSplit",
+		Type: "Squeeze",
 		Input: []tf.Input{
-			input, delimiter,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
-type ResourceSparseApplyMomentumAttr func(optionalAttr)
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
 
-// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
-//
-// Set use_nesterov = True if you want to use Nesterov momentum.
-//
-// That is for rows we have grad for, we update var and accum as follows:
+// Update '*var' according to the adadelta scheme.
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
+//	accum_update: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14756,137 +18184,115 @@ func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyMomentum",
+		Type: "ResourceApplyAdadelta",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
+			var_, accum, accum_update, lr, rho, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns the complex conjugate of a complex number.
+// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
+type NonMaxSuppressionAttr func(optionalAttr)
+
+// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// complex numbers that are the complex conjugate of each element in `input`. The
-// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-// real part and *b* is the imaginary part.
+// value: A float representing the threshold for deciding whether boxes
+// overlap too much with respect to IOU.
+// If not specified, defaults to 0.5
+func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+	return func(m optionalAttr) {
+		m["iou_threshold"] = value
+	}
+}
+
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
-// For example:
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
-// ```
-func Conj(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Conj",
+		Type: "NonMaxSuppression",
 		Input: []tf.Input{
-			input,
+			boxes, scores, max_output_size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop.
-type CudnnRNNBackpropAttr func(optionalAttr)
-
-// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNBackpropDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
+// Creates a dataset that emits `components` as a tuple of tensors once.
+func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// CudnnRNNBackpropDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "TensorDataset",
+		Input: []tf.Input{
+			tf.OutputList(components),
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CudnnRNNBackpropSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// VariableShapeAttr is an optional argument to VariableShape.
+type VariableShapeAttr func(optionalAttr)
 
-// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr {
+// VariableShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["out_type"] = value
 	}
 }
 
-// Backprop step of CudnnRNN.
+// Returns the shape of the variable pointed to by `resource`.
 //
-// Compute the backprop of both data and weights in a RNN.
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//     the actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
-// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
-//     pass.
-// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
-//     pass.
-// reserve_space: The same reserve_space produced in for forward operation.
-// input_backprop: The backprop to input in the forward pass. Has the same shape
-//     as input.
-// input_h_backprop: The backprop to input_h in the forward pass. Has the same
-//     shape as input_h.
-// input_c_backprop: The backprop to input_c in the forward pass. Has the same
-//     shape as input_c.
-// params_backprop: The backprop to the params buffer in the forward pass. Has the
-//     same shape as params.
-func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14895,86 +18301,75 @@ func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNBackprop",
+		Type: "VariableShape",
 		Input: []tf.Input{
-			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// Encode audio data using the WAV file format.
-//
-// This operation will generate a string suitable to be saved out to create a .wav
-// audio file. It will be encoded in the 16-bit PCM format. It takes in float
-// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-// that range.
-//
-// `audio` is a 2-D float Tensor of shape `[length, channels]`.
-// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
+// Store the input tensor in the state of the current session.
 //
 // Arguments:
-//	audio: 2-D with shape `[length, channels]`.
-//	sample_rate: Scalar containing the sample frequency.
+//	value: The tensor to be stored.
 //
-// Returns 0-D. WAV-encoded file contents.
-func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "EncodeWav",
-		Input: []tf.Input{
-			audio, sample_rate,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes atan of x element-wise.
-func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns The handle for the tensor stored in the session state, represented
+// as a ResourceHandle object.
+func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Atan",
+		Type: "GetSessionHandleV2",
 		Input: []tf.Input{
-			x,
+			value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
-type ResourceApplyAdaMaxAttr func(optionalAttr)
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
 
-// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
 //
 // value: If `True`, updating of the var, m, and v tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
+func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the AdaMax algorithm.
+// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// v_t <- max(beta2 * v_{t-1}, abs(g))
-// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+// value: If `True`, uses the nesterov update.
+// If not specified, defaults to false
+func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update '*var' according to the Adam algorithm.
+//
+// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	m: Should be from a Variable().
 //	v: Should be from a Variable().
 //	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
 //	lr: Scaling factor. Must be a scalar.
 //	beta1: Momentum factor. Must be a scalar.
 //	beta2: Momentum factor. Must be a scalar.
@@ -14982,48 +18377,7 @@ func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
 //	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdaMax",
-		Input: []tf.Input{
-			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
-
-// AssertSummarize sets the optional summarize attribute to value.
-//
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
-	}
-}
-
-// Asserts that the given condition is true.
-//
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
-//
-// Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
-//
-// Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15032,124 +18386,89 @@ func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...Ass
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Assert",
+		Type: "ResourceApplyAdam",
 		Input: []tf.Input{
-			condition, tf.OutputList(data),
+			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Split a `SparseTensor` into `num_split` tensors along one dimension.
-//
-// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
-// For example, if `split_dim = 1` and `num_split = 2` and the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
-//
-//     output_tensor[0] = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
-//
-//     output_tensor[1] = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
-//
-// Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[0, rank(shape))`.
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//	num_split: The number of ways to split.
-//
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_split": num_split}
-	opspec := tf.OpSpec{
-		Type: "SparseSplit",
-		Input: []tf.Input{
-			split_dim, indices, values, shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	return output_indices, output_values, output_shape
-}
+// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
+type SdcaOptimizerAttr func(optionalAttr)
 
-// Computes numerical negative value element-wise.
+// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
 //
-// I.e., \\(y = -x\\).
-func Neg(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Neg",
-		Input: []tf.Input{
-			x,
-		},
+// value: Whether to use Adaptive SDCA for the inner loop.
+// If not specified, defaults to true
+func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+	return func(m optionalAttr) {
+		m["adaptative"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Execute a sub graph on a remote processor.
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
 //
-// The graph specifications(such as graph itself, input tensors and output names)
-// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
-// as serialized_remote_fused_graph_execute_info.
-// The specifications will be passed to a dedicated registered
-// remote fused graph executor.  The executor will send the graph specifications
-// to a remote processor and execute that graph.  The execution results
-// will be passed to consumer nodes as outputs of this node.
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
 //
-// Arguments:
-//	inputs: Arbitrary number of tensors with arbitrary data types
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
 //
-//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
-// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
 //
-// Returns Arbitrary number of tensors with arbitrary data types
-func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
+//
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+//
+// Arguments:
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contains feature value
+// associated with each feature group.
+//	dense_features: a list of matrices which contains the dense feature values.
+//	example_weights: a vector which contains the weight associated with each
+// example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which has
+// corresponding weights in sparse_weights. This field maybe omitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
+//
+// Returns a list of vectors containing the updated example state
+// data.a list of vectors where each value is the delta
+// weights associated with a sparse feature group.a list of vectors where the values are the delta
+// weights associated with a dense feature group.
+func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RemoteFusedGraphExecute",
+		Type: "SdcaOptimizer",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
 		},
 		Attrs: attrs,
 	}
@@ -15159,55 +18478,66 @@ func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.Dat
 	}
 	var idx int
 	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("RemoteFusedGraphExecute", err)
+	out_example_state_data = op.Output(idx)
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
 		return
 	}
-	return outputs
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
-type MaxPool3DGradGradAttr func(optionalAttr)
+// ExperimentalParseExampleDatasetAttr is an optional argument to ExperimentalParseExampleDataset.
+type ExperimentalParseExampleDatasetAttr func(optionalAttr)
 
-// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
+// ExperimentalParseExampleDatasetSloppy sets the optional sloppy attribute to value.
+// If not specified, defaults to false
+func ExperimentalParseExampleDatasetSloppy(value bool) ExperimentalParseExampleDatasetAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["sloppy"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+//
+//	dense_defaults: A dict mapping string keys to `Tensor`s.
+// The keys of the dict must match the dense_keys of the feature.
+//	sparse_keys: A list of string keys in the examples features.
+// The results for these keys will be returned as `SparseTensor` objects.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples features associated with dense values.
+//	sparse_types: A list of `DTypes` of the same length as `sparse_keys`.
+// Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+// and `tf.string` (`BytesList`) are supported.
+//	dense_shapes: List of tuples with the same length as `dense_keys`.
+// The shape of the data for each dense feature referenced by `dense_keys`.
+// Required for any input tensors identified by `dense_keys`.  Must be
+// either fully defined, or may contain an unknown first dimension.
+// An unknown first dimension means the feature is treated as having
+// a variable number of blocks, and the output shape along this dimension
+// is considered unknown at graph build time.  Padding is applied for
+// minibatch elements smaller than the maximum number of blocks for the
+// given feature along this dimension.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
+func ExperimentalParseExampleDataset(scope *Scope, input_dataset tf.Output, num_parallel_calls tf.Output, dense_defaults []tf.Output, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ExperimentalParseExampleDatasetAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes, "output_types": output_types, "output_shapes": output_shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGradGrad",
+		Type: "ExperimentalParseExampleDataset",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			input_dataset, num_parallel_calls, tf.OutputList(dense_defaults),
 		},
 		Attrs: attrs,
 	}
@@ -15215,101 +18545,83 @@ func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output
 	return op.Output(0)
 }
 
-// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
-type Conv3DBackpropFilterV2Attr func(optionalAttr)
-
-// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
+// 2D real-valued fast Fourier transform.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
+// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 2 dimensions of `input`.
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the filter.
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 5-D
-// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
-// tensor.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft2
+// @end_compatibility
+func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilterV2",
+		Type: "RFFT2D",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
-type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
+// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
+type ResourceSparseApplyFtrlAttr func(optionalAttr)
 
-// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
+// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
+func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
 	return func(m optionalAttr) {
-		m["narrow_range"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
 //
-// and `max` to 'outputs' tensor of same shape as `inputs`.
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
-// `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
-// This operation has a gradient and thus allows for training `min` and `max`
-// values.
-func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15318,370 +18630,278 @@ func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVars",
+		Type: "ResourceSparseApplyFtrl",
 		Input: []tf.Input{
-			inputs, min, max,
+			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
-type ResourceScatterNdUpdateAttr func(optionalAttr)
-
-// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
+// Returns which elements of x are Inf.
 //
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// @compatibility(numpy)
+// Equivalent to np.isinf
+// @end_compatibility
+func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsInf",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Applies sparse `updates` to individual values or slices within a given
-//
-// variable according to `indices`.
-//
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-// ```
+// Gather ragged slices from `params` axis `0` according to `indices`.
 //
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
+// Outputs a `RaggedTensor` output composed from `output_dense_values` and
+// `output_nested_splits`, such that:
 //
 // ```python
-//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_update(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
+// output.shape = indices.shape + params.shape[1:]
+// output.ragged_rank = indices.shape.ndims + params.ragged_rank
+// output[i...j, d0...dn] = params[indices[i...j], d0...dn]
 // ```
 //
-// The resulting update to ref would look like this:
+// where
 //
-//     [1, 11, 3, 10, 9, 6, 7, 12]
+// * `params =
+//    ragged.from_nested_row_splits(params_dense_values, params_nested_splits)`
+//    provides the values that should be gathered.
+// * `indices` ia a dense tensor with dtype `int32` or `int64`, indicating which
+//    values should be gathered.
+// * `output =
+//    ragged.from_nested_row_splits(output_dense_values, output_nested_splits)`
+//    is the output tensor.
+//
+// (Note: This c++ op is used to implement the higher-level python
+// `tf.ragged.gather` op, which also supports ragged indices.)
 //
-// See `tf.scatter_nd` for more details about how to make updates to
-// slices.
 //
 // Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated
-// values to add to ref.
+//	params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the
+// `params` RaggedTensor input.
+//	params_dense_values: The `flat_values` for the `params` RaggedTensor. There was a terminology change
+// at the python level from dense_values to flat_values, so dense_values is the
+// deprecated name.
+//	indices: Indices in the outermost dimension of `params` of the values that should be
+// gathered.
+//	OUTPUT_RAGGED_RANK: The ragged rank of the output RaggedTensor. `output_nested_splits` will contain
+// this number of `row_splits` tensors. This value should equal
+// `indices.shape.ndims + params.ragged_rank - 1`.
 //
-// Returns the created operation.
-func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
+// Returns The `nested_row_splits` tensors that define the row-partitioning for the
+// returned RaggedTensor.The `flat_values` for the returned RaggedTensor.
+func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"OUTPUT_RAGGED_RANK": OUTPUT_RAGGED_RANK}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdUpdate",
+		Type: "RaggedGather",
 		Input: []tf.Input{
-			ref, indices, updates,
+			tf.OutputList(params_nested_splits), params_dense_values, indices,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Produces a string handle for the given MultiDeviceIterator.
-//
-// Arguments:
-//	multi_device_iterator: A MultiDeviceIterator resource.
-//
-// Returns A string representing the resource.
-func MultiDeviceIteratorToStringHandle(scope *Scope, multi_device_iterator tf.Output) (string_handle tf.Output) {
+	op := scope.AddOperation(opspec)
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorToStringHandle",
-		Input: []tf.Input{
-			multi_device_iterator,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CudnnRNNV3Attr is an optional argument to CudnnRNNV3.
-type CudnnRNNV3Attr func(optionalAttr)
-
-// CudnnRNNV3RnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNV3RnnMode(value string) CudnnRNNV3Attr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNV3InputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNV3InputMode(value string) CudnnRNNV3Attr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNV3Direction sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNV3Direction(value string) CudnnRNNV3Attr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNV3Dropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNV3Dropout(value float32) CudnnRNNV3Attr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNV3Seed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNV3Seed(value int64) CudnnRNNV3Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// CudnnRNNV3Seed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNV3Seed2(value int64) CudnnRNNV3Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// CudnnRNNV3IsTraining sets the optional is_training attribute to value.
-// If not specified, defaults to true
-func CudnnRNNV3IsTraining(value bool) CudnnRNNV3Attr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
+	var idx int
+	var err error
+	if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil {
+		scope.UpdateErr("RaggedGather", err)
+		return
 	}
+	output_dense_values = op.Output(idx)
+	return output_nested_splits, output_dense_values
 }
 
-// A RNN backed by cuDNN.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// Computes the RNN from the input and initial states, with respect to the params
-// buffer. Accepts one extra input "sequence_lengths" than CudnnRNN.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
 //
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicates whether there is a linear projection between the input and
-//   the actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// sequence_lengths: a vector of lengths of each input sequence.
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// is_training: Indicates whether this operation is used for inferenece or
-//   training.
-// reserve_space: An opaque tensor that can be used in backprop calculation. It
-//   is only produced if is_training is true.
-func CudnnRNNV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, optional ...CudnnRNNV3Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) {
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
+//
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNV3",
+		Type: "NonMaxSuppressionV2",
 		Input: []tf.Input{
-			input, input_h, input_c, params, sequence_lengths,
+			boxes, scores, max_output_size, iou_threshold,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Applies softmax to a batched N-D `SparseTensor`.
-//
-// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
+
+// TruncatedNormalSeed sets the optional seed attribute to value.
 //
-// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-// zero elements do not participate*.  Specifically, the algorithm is equivalent
-// to the following:
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
-//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-//       with shape `[B, C]`, along the size-C dimension;
-//   (2) Masks out the original implicitly-zero locations;
-//   (3) Renormalizes the remaining elements.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a truncated normal distribution.
 //
-// Hence, the `SparseTensor` result has exactly the same non-zero indices and
-// shape.
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
 // Arguments:
-//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
-// SparseTensor, in canonical ordering.
-//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
-func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmax",
+		Type: "TruncatedNormal",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape,
+			shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
-//
-// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
-// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
-// are placed in `outputs[i]` in lexicographic order of `js`, and the first
-// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
-// In detail,
-//
-// ```python
-//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
-//
-//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
-// ```
-//
-// `data.shape` must start with `partitions.shape`.
-//
-// For example:
-//
-// ```python
-//     # Scalar partitions.
-//     partitions = 1
-//     num_partitions = 2
-//     data = [10, 20]
-//     outputs[0] = []  # Empty with shape [0, 2]
-//     outputs[1] = [[10, 20]]
-//
-//     # Vector partitions.
-//     partitions = [0, 0, 1, 1, 0]
-//     num_partitions = 2
-//     data = [10, 20, 30, 40, 50]
-//     outputs[0] = [10, 20, 50]
-//     outputs[1] = [30, 40]
-// ```
-//
-// See `dynamic_stitch` for an example on how to merge partitions back.
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
+
+// StringToNumberOutType sets the optional out_type attribute to value.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
-// </div>
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Converts each string in the input Tensor to the specified numeric type.
 //
-// Arguments:
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
 //
-//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
-//	num_partitions: The number of partitions to output.
-func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_partitions": num_partitions}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DynamicPartition",
+		Type: "StringToNumber",
 		Input: []tf.Input{
-			data, partitions,
+			string_tensor,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("DynamicPartition", err)
-		return
-	}
-	return outputs
+	return op.Output(0)
 }
 
-// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
-type ResourceApplyAdagradAttr func(optionalAttr)
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
 
-// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
 //
 // value: If `True`, updating of the var and accum tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
-// If not specified, defaults to true
-func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["update_slots"] = value
-	}
-}
-
-// Update '*var' according to the adagrad scheme.
+// Update '*var' according to the Ftrl-proximal scheme.
 //
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
+//	linear: Should be from a Variable().
 //	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
+//
+//	lr_power: Scaling factor. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15690,138 +18910,260 @@ func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.O
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagrad",
+		Type: "ResourceApplyFtrlV2",
 		Input: []tf.Input{
-			var_, accum, lr, grad,
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
+
+// SkipgramWindowSize sets the optional window_size attribute to value.
+//
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["window_size"] = value
+	}
+}
+
+// SkipgramMinCount sets the optional min_count attribute to value.
+//
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["min_count"] = value
+	}
+}
+
+// SkipgramSubsample sets the optional subsample attribute to value.
+//
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["subsample"] = value
+	}
+}
+
+// Parses a text file and creates a batch of examples.
+//
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
 //
 // Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
 //
-// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
-// layer.
-func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
+// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesGetEnsembleStates",
-		Input: []tf.Input{
-			tree_ensemble_handle,
-		},
+		Type: "Skipgram",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
 }
 
-// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
-type ResourceApplyPowerSignAttr func(optionalAttr)
+// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
+type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
 
-// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
+// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
+// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-// variable <- variable - lr_t * update
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	logbase: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-// Returns the created operation.
-func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyPowerSign",
+		Type: "ThreadUnsafeUnigramCandidateSampler",
 		Input: []tf.Input{
-			var_, m, lr, logbase, sign_decay, beta, grad,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// StringFormatAttr is an optional argument to StringFormat.
-type StringFormatAttr func(optionalAttr)
+// MaxPoolV2Attr is an optional argument to MaxPoolV2.
+type MaxPoolV2Attr func(optionalAttr)
 
-// StringFormatTemplate sets the optional template attribute to value.
+// MaxPoolV2DataFormat sets the optional data_format attribute to value.
 //
-// value: A string, the template to format tensor summaries into.
-// If not specified, defaults to "%s"
-func StringFormatTemplate(value string) StringFormatAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
 	return func(m optionalAttr) {
-		m["template"] = value
+		m["data_format"] = value
 	}
 }
 
-// StringFormatPlaceholder sets the optional placeholder attribute to value.
+// Performs max pooling on the input.
 //
-// value: A string, at each placeholder in the template a subsequent tensor summary will be inserted.
-// If not specified, defaults to "%s"
-func StringFormatPlaceholder(value string) StringFormatAttr {
-	return func(m optionalAttr) {
-		m["placeholder"] = value
+// Arguments:
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.
+func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolV2",
+		Input: []tf.Input{
+			input, ksize, strides,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StringFormatSummarize sets the optional summarize attribute to value.
+// Does nothing. Serves as a control trigger for scheduling.
 //
-// value: When formatting the tensor summaries print the first and last summarize entries of each tensor dimension.
-// If not specified, defaults to 3
-func StringFormatSummarize(value int64) StringFormatAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
+// Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func ControlTrigger(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ControlTrigger",
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Formats a string template using a list of tensors.
+// Deprecated. Use TensorArrayReadV3
 //
-// Formats a string template using a list of tensors, pretty-printing tensor summaries.
+// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
+func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayReadV2",
+		Input: []tf.Input{
+			handle, index, flow_in,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Batch normalization.
 //
-// Arguments:
-//	inputs: The list of tensors to format into the placeholder string.
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-// Returns = The resulting string scalar.
-func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr) (output tf.Output) {
+// This op is deprecated. Prefer `tf.nn.batch_normalization`.
+//
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "StringFormat",
+		Type: "BatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			t, m, v, beta, gamma,
 		},
 		Attrs: attrs,
 	}
@@ -15829,47 +19171,65 @@ func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr
 	return op.Output(0)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
+type AddManySparseToTensorsMapAttr func(optionalAttr)
 
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["container"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
+// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`, where
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
+//
+// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+// having a first `sparse_indices` column taking values between `[0, N)`, where
+// the minibatch size `N == sparse_shape[0]`.
+//
+// The input `SparseTensor` must have rank `R` greater than 1, and the first
+// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The stored
+// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
+// will have rank `R-1`.
+//
+// The `SparseTensor` values can then be read out as part of a minibatch by passing
+// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddManySparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+// The minibatch size `N == sparse_shape[0]`.
 //
-// Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+// Returns 1-D.  The handles of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.  Shape: `[N]`.
+func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15878,397 +19238,354 @@ func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
+		Type: "AddManySparseToTensorsMap",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Creates a TensorList by indexing into a Tensor.
-//
-// Each member of the TensorList corresponds to one row of the input tensor,
-// specified by the given index (see `tf.gather`).
-//
-// tensor: The input tensor.
-// indices: The indices used to index into the list.
-// element_shape: The shape of the elements in the list (can be less specified than
-//   the shape of the tensor).
-// num_elements: The size of the output list. Must be large enough to accommodate
-//   the largest index in indices. If -1, the list is just large enough to include
-//   the largest index in indices.
-// output_handle: The TensorList.
-func TensorListScatterV2(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output, num_elements tf.Output) (output_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListScatterV2",
-		Input: []tf.Input{
-			tensor, indices, element_shape, num_elements,
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
+// TPUReplicateMetadataAttr is an optional argument to TPUReplicateMetadata.
+type TPUReplicateMetadataAttr func(optionalAttr)
 
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
+// TPUReplicateMetadataNumCoresPerReplica sets the optional num_cores_per_replica attribute to value.
 //
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+// value: Number of cores per replica. Used for model parallelism.
+// If not specified, defaults to 1
+func TPUReplicateMetadataNumCoresPerReplica(value int64) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["num_cores_per_replica"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+// TPUReplicateMetadataTopology sets the optional topology attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+// value: TopologyProto indicating the topology of the TPU pod slice.
+// If not specified, defaults to ""
+func TPUReplicateMetadataTopology(value string) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["topology"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+// TPUReplicateMetadataUseTpu sets the optional use_tpu attribute to value.
 //
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+// value: Whether to place the computation on the TPU.
+// If not specified, defaults to true
+func TPUReplicateMetadataUseTpu(value bool) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["min_object_covered"] = value
+		m["use_tpu"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// TPUReplicateMetadataDeviceAssignment sets the optional device_assignment attribute to value.
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+// value: The assignment of devices for the computation.
+// If not specified, defaults to <>
+func TPUReplicateMetadataDeviceAssignment(value []int64) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
+		m["device_assignment"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+// TPUReplicateMetadataComputationShape sets the optional computation_shape attribute to value.
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+// value: DEPRECATED. Use num_cores_per_replica instead.
+// If not specified, defaults to <>
+func TPUReplicateMetadataComputationShape(value []int64) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["area_range"] = value
+		m["computation_shape"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
-//
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+// TPUReplicateMetadataHostComputeCore sets the optional host_compute_core attribute to value.
+// If not specified, defaults to <>
+func TPUReplicateMetadataHostComputeCore(value []string) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["max_attempts"] = value
+		m["host_compute_core"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
-//
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+// TPUReplicateMetadataPaddingMap sets the optional padding_map attribute to value.
+// If not specified, defaults to <>
+func TPUReplicateMetadataPaddingMap(value []string) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+		m["padding_map"] = value
 	}
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
-//
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
-//
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// TPUReplicateMetadataStepMarkerLocation sets the optional step_marker_location attribute to value.
+// If not specified, defaults to "STEP_MARK_AT_ENTRY"
+func TPUReplicateMetadataStepMarkerLocation(value string) TPUReplicateMetadataAttr {
+	return func(m optionalAttr) {
+		m["step_marker_location"] = value
+	}
+}
+
+// Metadata indicaitng how the TPU computation should be replicated.
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
+//	num_replicas: Number of replicas of the computation
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Returns the created operation.
+func TPUReplicateMetadata(scope *Scope, num_replicas int64, optional ...TPUReplicateMetadataAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_replicas": num_replicas}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
-		Input: []tf.Input{
-			image_size, bounding_boxes,
-		},
+		Type: "TPUReplicateMetadata",
+
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
-
-// LRNDepthRadius sets the optional depth_radius attribute to value.
-//
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
+// LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingFTRLParametersGradAccumDebug.
+type LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr func(optionalAttr)
 
-// LRNBias sets the optional bias attribute to value.
+// LoadTPUEmbeddingFTRLParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingFTRLParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["bias"] = value
+		m["table_id"] = value
 	}
 }
 
-// LRNAlpha sets the optional alpha attribute to value.
-//
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
+// LoadTPUEmbeddingFTRLParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingFTRLParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["alpha"] = value
+		m["table_name"] = value
 	}
 }
 
-// LRNBeta sets the optional beta attribute to value.
+// Load FTRL embedding parameters with debug support.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Local Response Normalization.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
+// Arguments:
+//	parameters: Value of parameters used in the FTRL optimization algorithm.
+//	accumulators: Value of accumulators used in the FTRL optimization algorithm.
+//	linears: Value of linears used in the FTRL optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the FTRL optimization algorithm.
 //
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
 //
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
 //
-// Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+// Returns the created operation.
+func LoadTPUEmbeddingFTRLParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, linears tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRN",
+		Type: "LoadTPUEmbeddingFTRLParametersGradAccumDebug",
 		Input: []tf.Input{
-			input,
+			parameters, accumulators, linears, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Concatenates tensors along one dimension.
+//
+// Arguments:
+//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [-rank(values), rank(values)).
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConcatV2",
+		Input: []tf.Input{
+			tf.OutputList(values), axis,
+		},
+	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that zips together `input_datasets`.
-func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ZipDataset",
+		Type: "ReadFile",
 		Input: []tf.Input{
-			tf.OutputList(input_datasets),
+			filename,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
 
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
-// If not specified, defaults to true
-func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["update_slots"] = value
+		m["data_format"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// Computes gradients of the average pooling function.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
+		Type: "AvgPoolGrad",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Elementwise computes the bitwise right-shift of `x` and `y`.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// Performs a logical shift for unsigned integer types, and an arithmetic shift
-// for signed integer types.
+// pruning away boxes that have high overlaps
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix,
+// which allows for defining a custom overlap criterium (eg. intersection over union,
+// intersection over area, etc.).
 //
-// If `y` is negative, or greater than or equal to than the width of `x` in bits
-// the result is implementation defined.
-func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//
+//   selected_indices = tf.image.non_max_suppression_with_overlaps(
+//       overlaps, scores, max_output_size, overlap_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
+//
+// Arguments:
+//	overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing
+// the n-by-n box overlap values.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	overlap_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RightShift",
+		Type: "NonMaxSuppressionWithOverlaps",
 		Input: []tf.Input{
-			x, y,
+			overlaps, scores, max_output_size, overlap_threshold, score_threshold,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorListStackAttr is an optional argument to TensorListStack.
-type TensorListStackAttr func(optionalAttr)
+// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
+type FractionalAvgPoolGradAttr func(optionalAttr)
 
-// TensorListStackNumElements sets the optional num_elements attribute to value.
-// If not specified, defaults to -1
-func TensorListStackNumElements(value int64) TensorListStackAttr {
+// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
+//
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["num_elements"] = value
+		m["overlapping"] = value
 	}
 }
 
-// Stacks all tensors in the list.
+// Computes gradient of the FractionalAvgPool function.
 //
-// Requires that all tensors have the same shape.
+// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+// out_backprop to those indices that form the same pooling cell. Therefore, we
+// just need to know the shape of original input tensor, instead of the whole
+// tensor.
 //
-// input_handle: the input list
-// tensor: the gathered result
-// num_elements: optional. If not -1, the number of elements in the list.
+// Arguments:
+//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_avg_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-func TensorListStack(scope *Scope, input_handle tf.Output, element_shape tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListStack",
+		Type: "FractionalAvgPoolGrad",
 		Input: []tf.Input{
-			input_handle, element_shape,
+			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
@@ -16276,43 +19593,42 @@ func TensorListStack(scope *Scope, input_handle tf.Output, element_shape tf.Outp
 	return op.Output(0)
 }
 
-// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
-type StatelessRandomUniformAttr func(optionalAttr)
+// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace.
+type StaticRegexReplaceAttr func(optionalAttr)
 
-// StatelessRandomUniformDtype sets the optional dtype attribute to value.
+// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+// value: If True, the replacement is global, otherwise the replacement
+// is done only on the first match.
+// If not specified, defaults to true
+func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["replace_global"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// Replaces the match of pattern in input with rewrite.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	input: The text to be processed.
+//	pattern: The regular expression to match the input.
+//	rewrite: The rewrite to be applied to the matched expression.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+// Returns The text after applying pattern and rewrite.
+func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniform",
+		Type: "StaticRegexReplace",
 		Input: []tf.Input{
-			shape, seed,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -16320,146 +19636,155 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 	return op.Output(0)
 }
 
-// Makes its input available to the next iteration.
+// Computes gradients for the exponential linear (Elu) operation.
 //
 // Arguments:
-//	data: The tensor to be made available to the next iteration.
+//	gradients: The backpropagated gradients to the corresponding Elu operation.
+//	outputs: The outputs of the corresponding Elu operation.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NextIteration",
+		Type: "EluGrad",
 		Input: []tf.Input{
-			data,
+			gradients, outputs,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
+//
+// The hash function is deterministic on the content of the string within the
+// process.
+//
+// Note that the hash function may change from time to time.
+// This functionality will be deprecated and it's recommended to use
+// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+//
+// Arguments:
+//
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	opspec := tf.OpSpec{
+		Type: "StringToHashBucket",
+		Input: []tf.Input{
+			string_tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+//
+// Arguments:
+//
+//	batch_size: A scalar representing the number of elements to accumulate in a batch.
+//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
+// is smaller than desired.
+//
+//
+func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "BatchDatasetV2",
+		Input: []tf.Input{
+			input_dataset, batch_size, drop_remainder,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
+// Computes the gradient of `igamma(a, x)` wrt `a`.
+func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Fact",
+		Type: "IgammaGradA",
+		Input: []tf.Input{
+			a, x,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserialize `SparseTensor` objects.
-//
-// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
-// the last dimension stores serialized `SparseTensor` objects and the other N
-// dimensions (N >= 0) correspond to a batch. The ranks of the original
-// `SparseTensor` objects must all match. When the final `SparseTensor` is
-// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
-// the sparse tensors have been concatenated along new dimensions, one for each
-// batch.
-//
-// The output `SparseTensor` object's shape values for the original dimensions
-// are the max across the input `SparseTensor` objects' shape values for the
-// corresponding dimensions. The new dimensions match the size of the batch.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
+// Creates a dataset that contains `count` elements from the `input_dataset`.
 //
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
+// Arguments:
 //
-// then the final deserialized `SparseTensor` will be:
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be taken. A value of `-1` indicates that all of `input_dataset`
+// is taken.
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
 //
-// Arguments:
-//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
-// must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "DeserializeSparse",
+		Type: "TakeDataset",
 		Input: []tf.Input{
-			serialized_sparse,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// SqueezeAttr is an optional argument to Squeeze.
-type SqueezeAttr func(optionalAttr)
+// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
+type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
 
-// SqueezeAxis sets the optional axis attribute to value.
-//
-// value: If specified, only squeezes the dimensions listed. The dimension
-// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
-// be in the range `[-rank(input), rank(input))`.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func SqueezeAxis(value []int64) SqueezeAttr {
+// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
 	return func(m optionalAttr) {
-		m["squeeze_dims"] = value
+		m["num_bits"] = value
 	}
 }
 
-// Removes dimensions of size 1 from the shape of a tensor.
-//
-// Given a tensor `input`, this operation returns a tensor of the same type with
-// all dimensions of size 1 removed. If you don't want to remove all size 1
-// dimensions, you can remove specific size 1 dimensions by specifying
-// `axis`.
-//
-// For example:
-//
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t)) ==> [2, 3]
-// ```
-//
-// Or, to remove specific size 1 dimensions:
+// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
 //
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
-// ```
+// and `max` to 'outputs' tensor of same shape as `inputs`.
 //
-// Arguments:
-//	input: The `input` to squeeze.
+// `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 //
-// Returns Contains the same data as `input`, but has one or more dimensions of
-// size 1 removed.
-func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
+// This operation has a gradient and thus allows for training `min` and `max`
+// values.
+func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16468,9 +19793,9 @@ func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Squeeze",
+		Type: "FakeQuantWithMinMaxVars",
 		Input: []tf.Input{
-			input,
+			inputs, min, max,
 		},
 		Attrs: attrs,
 	}
@@ -16478,98 +19803,105 @@ func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.
 	return op.Output(0)
 }
 
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
+// RetrieveTPUEmbeddingMomentumParametersAttr is an optional argument to RetrieveTPUEmbeddingMomentumParameters.
+type RetrieveTPUEmbeddingMomentumParametersAttr func(optionalAttr)
 
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// RetrieveTPUEmbeddingMomentumParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingMomentumParametersTableId(value int64) RetrieveTPUEmbeddingMomentumParametersAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["table_id"] = value
 	}
 }
 
-// Update '*var' according to the adadelta scheme.
-//
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
+// RetrieveTPUEmbeddingMomentumParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingMomentumParametersTableName(value string) RetrieveTPUEmbeddingMomentumParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve Momentum embedding parameters.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+// Returns Parameter parameters updated by the Momentum optimization algorithm.Parameter momenta updated by the Momentum optimization algorithm.
+func RetrieveTPUEmbeddingMomentumParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMomentumParametersAttr) (parameters tf.Output, momenta tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
+		Type: "RetrieveTPUEmbeddingMomentumParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Forwards the value of an available tensor from `inputs` to `output`.
+//
+// `Merge` waits for at least one of the tensors in `inputs` to become available.
+// It is usually combined with `Switch` to implement branching.
+//
+// `Merge` forwards the first tensor to become available to `output`, and sets
+// `value_index` to its index in `inputs`.
+//
+// Arguments:
+//	inputs: The input tensors, exactly one of which will become available.
+//
+// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
+func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Merge",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
+			tf.OutputList(inputs),
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
-type NonMaxSuppressionAttr func(optionalAttr)
+// QueueCloseV2Attr is an optional argument to QueueCloseV2.
+type QueueCloseV2Attr func(optionalAttr)
 
-// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
+// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
 //
-// value: A float representing the threshold for deciding whether boxes
-// overlap too much with respect to IOU.
-// If not specified, defaults to 0.5
-func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+// value: If true, all pending enqueue requests that are
+// blocked on the given queue will be canceled.
+// If not specified, defaults to false
+func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
 	return func(m optionalAttr) {
-		m["iou_threshold"] = value
+		m["cancel_pending_enqueues"] = value
 	}
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// Closes the given queue.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// This operation signals that no more elements will be enqueued in the
+// given queue. Subsequent Enqueue(Many) operations will fail.
+// Subsequent Dequeue(Many) operations will continue to succeed if
+// sufficient elements remain in the queue. Subsequent Dequeue(Many)
+// operations that would block will fail immediately.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
+//	handle: The handle to a queue.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
+// Returns the created operation.
+func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16578,201 +19910,216 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppression",
+		Type: "QueueCloseV2",
 		Input: []tf.Input{
-			boxes, scores, max_output_size,
+			handle,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that emits `components` as a tuple of tensors once.
-func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// Writes the given dataset to the given file using the TFRecord format.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the dataset to write.
+//	filename: A scalar string tensor representing the filename to use.
+//	compression_type: A scalar string tensor containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//
+// Returns the created operation.
+func ExperimentalDatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorDataset",
+		Type: "ExperimentalDatasetToTFRecord",
 		Input: []tf.Input{
-			tf.OutputList(components),
+			input_dataset, filename, compression_type,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Component-wise multiplies a SparseTensor by a dense Tensor.
+// BiasAddGradAttr is an optional argument to BiasAddGrad.
+type BiasAddGradAttr func(optionalAttr)
+
+// BiasAddGradDataFormat sets the optional data_format attribute to value.
 //
-// The output locations corresponding to the implicitly zero elements in the sparse
-// tensor will be zero (i.e., will not take up storage space), regardless of the
-// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddGradDataFormat(value string) BiasAddGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// The backward operation for "BiasAdd" on the "bias" tensor.
 //
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// It accumulates all the values from out_backprop into the feature dimension.
+// For NHWC data format, the feature dimension is the last. For NCHW data format,
+// the feature dimension is the third-to-last.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	out_backprop: Any number of dimensions.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns 1-D with size the feature dimension of `out_backprop`.
+func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseMul",
+		Type: "BiasAddGrad",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			out_backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// 2D real-valued fast Fourier transform.
-//
-// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 2 dimensions of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
-//
-// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// Reduces `input` from `num_devices` using `reduction` to a single device.
 //
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+// Reduces `input` from `num_devices` using `reduction` to a single device.
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+// The graph should be constructed so that all inputs have a valid device
+// assignment, and the op itself is assigned one of these devices.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft2
-// @end_compatibility
-func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// input: The input to the reduction.
+// data: the value of the reduction across all `num_devices` devices.
+// reduction: the reduction operation to perform.
+func NcclReduce(scope *Scope, input []tf.Output, reduction string) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"reduction": reduction}
 	opspec := tf.OpSpec{
-		Type: "RFFT2D",
+		Type: "NcclReduce",
 		Input: []tf.Input{
-			input, fft_length,
+			tf.OutputList(input),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-//
-// For example:
+// Computes the gradient of morphological 2-D dilation with respect to the input.
 //
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
+func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Pad",
+		Type: "Dilation2DBackpropInput",
 		Input: []tf.Input{
-			input, paddings,
+			input, filter, out_backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Checks whether a resource handle-based variable has been initialized.
+// An Op to sum inputs across replicated TPU instances.
+//
+// Each instance supplies its own input.
+//
+// For example, suppose there are 8 TPU instances: `[A, B, C, D, E, F, G, H]`.
+// Passing group_assignment=`[[0,2,4,6],[1,3,5,7]]` sets `A, C, E, G` as group 0,
+// and `B, D, F, H` as group 1. Thus we get the outputs:
+// `[A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H]`.
 //
 // Arguments:
-//	resource: the input resource handle.
+//	input: The local input to the sum.
+//	group_assignment: An int32 tensor with shape
+// [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
+// replica ids in the ith subgroup.
 //
-// Returns a scalar boolean which is true if the variable has been
-// initialized.
-func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
+// Returns The sum of all the distributed inputs.
+func CrossReplicaSum(scope *Scope, input tf.Output, group_assignment tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "VarIsInitializedOp",
+		Type: "CrossReplicaSum",
 		Input: []tf.Input{
-			resource,
+			input, group_assignment,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
-type ResourceSparseApplyFtrlAttr func(optionalAttr)
+// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
+type ResourceSparseApplyMomentumAttr func(optionalAttr)
 
-// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
 // value: If `True`, updating of the var and accum tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+//
+// Set use_nesterov = True if you want to use Nesterov momentum.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+//
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
 //	grad: The gradient.
 //	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+//	momentum: Momentum. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
+func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16781,171 +20128,162 @@ func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, line
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrl",
+		Type: "ResourceSparseApplyMomentum",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
+			var_, accum, lr, grad, indices, momentum,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns which elements of x are Inf.
+// An Op to permute tensors across replicated TPU instances.
 //
-// @compatibility(numpy)
-// Equivalent to np.isinf
-// @end_compatibility
-func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+// Each instance supplies its own input.
+//
+// For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
+// source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs:
+// `[D, A, B, C]`.
+//
+// Arguments:
+//	input: The local input to be permuted. Currently only supports float and
+// bfloat16.
+//	source_target_pairs: A tensor with shape [num_pairs, 2].
+//
+// Returns The permuted input.
+func CollectivePermute(scope *Scope, input tf.Output, source_target_pairs tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsInf",
+		Type: "CollectivePermute",
 		Input: []tf.Input{
-			x,
+			input, source_target_pairs,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
-
-// TruncatedNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// Returns the complex conjugate of a complex number.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a truncated normal distribution.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// complex numbers that are the complex conjugate of each element in `input`. The
+// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
+// real part and *b* is the imaginary part.
 //
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// The complex conjugate returned by this operation is of the form \\(a - bj\\).
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+// For example:
 //
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+// ```
+func Conj(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
+		Type: "Conj",
 		Input: []tf.Input{
-			shape,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
-
-// SkipgramWindowSize sets the optional window_size attribute to value.
-//
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["window_size"] = value
-	}
-}
+// RetrieveTPUEmbeddingCenteredRMSPropParametersAttr is an optional argument to RetrieveTPUEmbeddingCenteredRMSPropParameters.
+type RetrieveTPUEmbeddingCenteredRMSPropParametersAttr func(optionalAttr)
 
-// SkipgramMinCount sets the optional min_count attribute to value.
+// RetrieveTPUEmbeddingCenteredRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingCenteredRMSPropParametersTableId(value int64) RetrieveTPUEmbeddingCenteredRMSPropParametersAttr {
 	return func(m optionalAttr) {
-		m["min_count"] = value
+		m["table_id"] = value
 	}
 }
 
-// SkipgramSubsample sets the optional subsample attribute to value.
-//
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
+// RetrieveTPUEmbeddingCenteredRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingCenteredRMSPropParametersTableName(value string) RetrieveTPUEmbeddingCenteredRMSPropParametersAttr {
 	return func(m optionalAttr) {
-		m["subsample"] = value
+		m["table_name"] = value
 	}
 }
 
-// Parses a text file and creates a batch of examples.
-//
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+// Retrieve centered RMSProp embedding parameters.
 //
-// Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+// Returns Parameter parameters updated by the centered RMSProp optimization algorithm.Parameter ms updated by the centered RMSProp optimization algorithm.Parameter mom updated by the centered RMSProp optimization algorithm.Parameter mg updated by the centered RMSProp optimization algorithm.
+func RetrieveTPUEmbeddingCenteredRMSPropParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingCenteredRMSPropParametersAttr) (parameters tf.Output, ms tf.Output, mom tf.Output, mg tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Skipgram",
+		Type: "RetrieveTPUEmbeddingCenteredRMSPropParameters",
 
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
+// StringSplitAttr is an optional argument to StringSplit.
+type StringSplitAttr func(optionalAttr)
 
-// StringToNumberOutType sets the optional out_type attribute to value.
+// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
 //
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+// value: A `bool`. If `True`, skip the empty strings from the result.
+// If not specified, defaults to true
+func StringSplitSkipEmpty(value bool) StringSplitAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["skip_empty"] = value
 	}
 }
 
-// Converts each string in the input Tensor to the specified numeric type.
+// Split elements of `input` based on `delimiter` into a `SparseTensor`.
 //
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `input` based on `delimiter` and return a `SparseTensor`
+// containing the splitted tokens. Empty tokens are ignored.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
+//  empty string, each element of `input` is split into individual single-byte
+//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
+//  every character of `delimiter` is a potential split point.
+//
+// For example:
+//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
+//   will be
+//
+//   indices = [0, 0;
+//              0, 1;
+//              1, 0;
+//              1, 1;
+//              1, 2]
+//   shape = [2, 3]
+//   values = ['hello', 'world', 'a', 'b', 'c']
+//
+// Arguments:
+//	input: 1-D. Strings to split.
+//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//
+// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
+// tensor, where the first value is N and the second value is the maximum number
+// of tokens in a single input entry.
+func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16954,189 +20292,165 @@ func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToN
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringToNumber",
+		Type: "StringSplit",
 		Input: []tf.Input{
-			string_tensor,
+			input, delimiter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
+// MaxPool3DAttr is an optional argument to MaxPool3D.
+type MaxPool3DAttr func(optionalAttr)
 
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// MaxPool3DDataFormat sets the optional data_format attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DDataFormat(value string) MaxPool3DAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["data_format"] = value
 	}
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
-//
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Performs 3D max pooling on the input.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+// Returns The max pooled output tensor.
+func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
+		Type: "MaxPool3D",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
-
-// EncodeJpegFormat sets the optional format attribute to value.
+// Convert JSON-encoded Example records to binary protocol buffer strings.
 //
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["format"] = value
-	}
-}
-
-// EncodeJpegQuality sets the optional quality attribute to value.
+// This op translates a tensor containing Example records, encoded using
+// the [standard JSON
+// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+// into a tensor containing the same records encoded as binary protocol
+// buffers. The resulting tensor can then be fed to any of the other
+// Example-parsing ops.
 //
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["quality"] = value
-	}
-}
-
-// EncodeJpegProgressive sets the optional progressive attribute to value.
+// Arguments:
+//	json_examples: Each string is a JSON object serialized according to the JSON
+// mapping of the Example proto.
 //
-// value: If True, create a JPEG that loads progressively (coarse to fine).
-// If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["progressive"] = value
+// Returns Each string is a binary Example protocol buffer corresponding
+// to the respective element of `json_examples`.
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
-//
-// value: If True, spend CPU/RAM to reduce size with no quality change.
-// If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["optimize_size"] = value
+	opspec := tf.OpSpec{
+		Type: "DecodeJSONExample",
+		Input: []tf.Input{
+			json_examples,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
-//
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
-	}
-}
+// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
+type QueueEnqueueManyV2Attr func(optionalAttr)
 
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+// value: If the queue is too full, this operation will block for up
+// to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
 	return func(m optionalAttr) {
-		m["density_unit"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// EncodeJpegXDensity sets the optional x_density attribute to value.
+// Enqueues zero or more tuples of one or more tensors in the given queue.
 //
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["x_density"] = value
-	}
-}
-
-// EncodeJpegYDensity sets the optional y_density attribute to value.
+// This operation slices each component tensor along the 0th dimension to
+// make multiple queue elements. All of the tuple components must have the
+// same size in the 0th dimension.
 //
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["y_density"] = value
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
+//
+// N.B. If the queue is full, this operation will block until the given
+// elements have been enqueued (or 'timeout_ms' elapses, if specified).
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should
+// be taken.
+//
+// Returns the created operation.
+func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueEnqueueManyV2",
+		Input: []tf.Input{
+			handle, tf.OutputList(components),
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+// PrintV2Attr is an optional argument to PrintV2.
+type PrintV2Attr func(optionalAttr)
+
+// PrintV2OutputStream sets the optional output_stream attribute to value.
 //
-// value: If not empty, embed this XMP metadata in the image header.
-// If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+// value: A string specifying the output stream or logging level to print to.
+// If not specified, defaults to "stderr"
+func PrintV2OutputStream(value string) PrintV2Attr {
 	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
+		m["output_stream"] = value
 	}
 }
 
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
-//
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
-//
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
+// Prints a string scalar.
 //
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
+// Prints a string scalar to the desired output_stream.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	input: The string scalar to print.
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+// Returns the created operation.
+func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17145,69 +20459,61 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
+		Type: "PrintV2",
 		Input: []tf.Input{
-			image,
+			input,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MultinomialAttr is an optional argument to Multinomial.
-type MultinomialAttr func(optionalAttr)
-
-// MultinomialSeed sets the optional seed attribute to value.
+// The gradient operator for the SparseSlice op.
 //
-// value: If either seed or seed2 is set to be non-zero, the internal random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func MultinomialSeed(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// MultinomialSeed2 sets the optional seed2 attribute to value.
+// This op takes in the upstream gradient w.r.t. non-empty values of
+// the sliced `SparseTensor`, and outputs the gradients w.r.t.
+// the non-empty values of input `SparseTensor`.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func MultinomialSeed2(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// Arguments:
+//	backprop_val_grad: 1-D. The gradient with respect to
+// the non-empty values of the sliced `SparseTensor`.
+//	input_indices: 2-D.  The `indices` of the input `SparseTensor`.
+//	input_start: 1-D. tensor represents the start of the slice.
+//	output_indices: 2-D.  The `indices` of the sliced `SparseTensor`.
+//
+// Returns 1-D. The gradient with respect to the non-empty values of input `SparseTensor`.
+func SparseSliceGrad(scope *Scope, backprop_val_grad tf.Output, input_indices tf.Output, input_start tf.Output, output_indices tf.Output) (val_grad tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
+	opspec := tf.OpSpec{
+		Type: "SparseSliceGrad",
+		Input: []tf.Input{
+			backprop_val_grad, input_indices, input_start, output_indices,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Draws samples from a multinomial distribution.
+// Creates a dataset by applying optimizations to `input_dataset`.
+//
+// Creates a dataset by applying optimizations to `input_dataset`.
 //
 // Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	input_dataset: A variant tensor representing the input dataset.
+//	optimizations: A `tf.string` vector `tf.Tensor` identifying optimizations to use.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+//
+func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Multinomial",
+		Type: "OptimizeDataset",
 		Input: []tf.Input{
-			logits, num_samples,
+			input_dataset, optimizations,
 		},
 		Attrs: attrs,
 	}
@@ -17215,35 +20521,36 @@ func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
-type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
+type ResourceApplyProximalAdagradAttr func(optionalAttr)
 
-// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
 // value: If True, updating of the var and accum tensors will be protected by
 // a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
+//
+// accum += grad * grad
+// prox_v = var - lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Learning rate. Must be a scalar.
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
 //	l1: L1 regularization. Must be a scalar.
 //	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17252,178 +20559,140 @@ func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumul
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagradDA",
+		Type: "ResourceApplyProximalAdagrad",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+			var_, accum, lr, l1, l2, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process and will never change. However, it is not suitable for cryptography.
-// This function may be used when CPU time is scarce and inputs are trusted or
-// unimportant. There is a risk of adversaries constructing inputs that all hash
-// to the same bucket. To prevent this problem, use a strong hash function with
-// `tf.string_to_hash_bucket_strong`.
-//
-// Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
+// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
+type MutableHashTableOfTensorsV2Attr func(optionalAttr)
+
+// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
-	opspec := tf.OpSpec{
-		Type: "StringToHashBucketFast",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the last element of the input list as well as a list with all but that element.
-//
-// Fails if the list is empty.
+// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
 //
-// input_handle: the input list
-// tensor: the withdrawn last element of the list
-// element_dtype: the type of elements in the list
-// element_shape: the shape of the output tensor
-func TensorListPopBack(scope *Scope, input_handle tf.Output, element_shape tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorListPopBack",
-		Input: []tf.Input{
-			input_handle, element_shape,
-		},
-		Attrs: attrs,
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
-type MaxPoolGradGradAttr func(optionalAttr)
-
-// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
+// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
+// If not specified, defaults to <>
+func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
+
+// Creates an empty hash table.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a vector. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
+// Returns Handle to a table.
+func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGrad",
-		Input: []tf.Input{
-			orig_input, orig_output, grad,
-		},
+		Type: "MutableHashTableOfTensorsV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
-type TensorArrayGatherV3Attr func(optionalAttr)
+// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
+type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
 
-// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Gather specific elements from the TensorArray into output `value`.
+// Update '*var' as FOBOS algorithm with fixed learning rate.
 //
-// All elements selected by `indices` must have the same shape.
+// prox_v = var - alpha * delta
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations in the TensorArray from which to read tensor elements.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	delta: The change.
 //
-// Returns All of the elements in the TensorArray, concatenated along a new
-// axis (the new dimension 0).
-func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
+// Returns the created operation.
+func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV3",
+		Type: "ResourceApplyProximalGradientDescent",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			var_, alpha, l1, l2, delta,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns x / y element-wise for integer types.
+// Returns 0 if the denominator is zero.
 //
-// Truncation designates that negative numbers will round fractional quantities
-// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
-// than Python semantics. See `FloorDiv` for a division function that matches
-// Python Semantics.
 //
-// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
+// *NOTE*: `DivNoNan` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TruncateDiv",
+		Type: "DivNoNan",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -17432,128 +20701,117 @@ func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Restores tensors from a V2 checkpoint.
-//
-// For backward compatibility with the V1 format, this Op currently allows
-// restoring from a V1 checkpoint as well:
-//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
-//     if found proceed to read it as a V2 checkpoint;
-//   - Otherwise the V1 read path is invoked.
-// Relying on this behavior is not recommended, as the ability to fall back to read
-// V1 might be deprecated and eventually removed.
-//
-// By default, restores the named tensors in full.  If the caller wishes to restore
-// specific slices of stored tensors, "shape_and_slices" should be non-empty
-// strings and correspondingly well-formed.
+// Subtracts a value from the current value of a variable.
 //
-// Callers must ensure all the named tensors are indeed stored in the checkpoint.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the decremented value or a subsequent newer one.
 //
 // Arguments:
-//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
-//	tensor_names: shape {N}.  The names of the tensors to be restored.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
-// Empty strings indicate that they are non-partitioned tensors.
-//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
-// those stored in the checkpoint.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
 //
-// Returns shape {N}.  The restored tensors, whose shapes are read from the
-// checkpoint directly.
-func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
+// Returns the created operation.
+func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
 	opspec := tf.OpSpec{
-		Type: "RestoreV2",
+		Type: "AssignSubVariableOp",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices,
+			resource, value,
 		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
-		scope.UpdateErr("RestoreV2", err)
-		return
 	}
-	return tensors
+	return scope.AddOperation(opspec)
 }
 
-// Receives a tensor value broadcast from another device.
-func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
-	opspec := tf.OpSpec{
-		Type: "CollectiveBcastRecv",
+// RestoreAttr is an optional argument to Restore.
+type RestoreAttr func(optionalAttr)
 
-		Attrs: attrs,
+// RestorePreferredShard sets the optional preferred_shard attribute to value.
+//
+// value: Index of file to open first if multiple files match
+// `file_pattern`.
+// If not specified, defaults to -1
+func RestorePreferredShard(value int64) RestoreAttr {
+	return func(m optionalAttr) {
+		m["preferred_shard"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Decode web-safe base64-encoded strings.
+// Restores a tensor from checkpoint files.
 //
-// Input may or may not have padding at the end. See EncodeBase64 for padding.
-// Web-safe means that input must use - and _ instead of + and /.
+// Reads a tensor stored in one or several files. If there are several files (for
+// instance because a tensor was saved as slices), `file_pattern` may contain
+// wildcard symbols (`*` and `?`) in the filename portion only, not in the
+// directory portion.
+//
+// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
+// in which file the requested tensor is likely to be found. This op will first
+// open the file at index `preferred_shard` in the list of matching files and try
+// to restore tensors from that file.  Only if some tensors or tensor slices are
+// not found in that first file, then the Op opens all the files. Setting
+// `preferred_shard` to match the value passed as the `shard` input
+// of a matching `Save` Op may speed up Restore.  This attribute only affects
+// performance, not correctness.  The default value -1 means files are processed in
+// order.
+//
+// See also `RestoreSlice`.
 //
 // Arguments:
-//	input: Base64 strings to decode.
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	dt: The type of the tensor to be restored.
 //
-// Returns Decoded strings.
-func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns The restored tensor.
+func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dt": dt}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBase64",
+		Type: "Restore",
 		Input: []tf.Input{
-			input,
+			file_pattern, tensor_name,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
-type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
+type QuantizedResizeBilinearAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
-func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+// Resize quantized `images` to `size` using quantized bilinear interpolation.
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// prox_v = var
-// prox_v -= lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// Input images and output images must be quantized types.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns the created operation.
-func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+//
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17562,125 +20820,160 @@ func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.O
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalAdagrad",
+		Type: "QuantizedResizeBilinear",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad, indices,
+			images, size, min, max,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
-type MaxPool3DGradAttr func(optionalAttr)
-
-// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+// Arguments:
+//
+//	num_threads: Identifies the number of threads to use for the private threadpool.
+//
+//
+func ExperimentalPrivateThreadPoolDataset(scope *Scope, input_dataset tf.Output, num_threads tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalPrivateThreadPoolDataset",
+		Input: []tf.Input{
+			input_dataset, num_threads,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
+type DenseToSparseSetOperationAttr func(optionalAttr)
+
+// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Computes gradients of max pooling function.
+// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set2`
+// indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
+// max set size across `n-1` dimensions.
+//
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGrad",
+		Type: "DenseToSparseSetOperation",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			set1, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the name of the device on which `resource` has been placed.
-func ExperimentalIteratorGetDevice(scope *Scope, resource tf.Output) (device tf.Output) {
+// L2 Loss.
+//
+// Computes half the L2 norm of a tensor without the `sqrt`:
+//
+//     output = sum(t ** 2) / 2
+//
+// Arguments:
+//	t: Typically 2-D, but may have any dimensions.
+//
+// Returns 0-D.
+func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalIteratorGetDevice",
+		Type: "L2Loss",
 		Input: []tf.Input{
-			resource,
+			t,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseReduceSumAttr is an optional argument to SparseReduceSum.
-type SparseReduceSumAttr func(optionalAttr)
+// StackV2Attr is an optional argument to StackV2.
+type StackV2Attr func(optionalAttr)
 
-// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+// StackV2StackName sets the optional stack_name attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+// value: Overrides the name used for the temporary stack resource. Default
+// value is the name of the 'Stack' op (which is guaranteed unique).
+// If not specified, defaults to ""
+func StackV2StackName(value string) StackV2Attr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["stack_name"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// A stack that produces elements in first-in last-out order.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//	max_size: The maximum size of the stack if non-negative. If negative, the stack
+// size is unlimited.
+//	elem_type: The type of the elements on the stack.
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+// Returns The handle to the stack.
+func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSum",
+		Type: "StackV2",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			max_size,
 		},
 		Attrs: attrs,
 	}
@@ -17688,66 +20981,99 @@ func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Outp
 	return op.Output(0)
 }
 
-// Records the latency of producing `input_dataset` elements in a StatsAggregator.
-func ExperimentalLatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop.
+type CudnnRNNBackpropAttr func(optionalAttr)
+
+// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalLatencyStatsDataset",
-		Input: []tf.Input{
-			input_dataset, tag,
-		},
-		Attrs: attrs,
+}
+
+// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
+// CudnnRNNBackpropDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
 
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
-//
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+// CudnnRNNBackpropDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["adjoint_a"] = value
+		m["dropout"] = value
 	}
 }
 
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
-//
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+// CudnnRNNBackpropSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["adjoint_b"] = value
+		m["seed"] = value
 	}
 }
 
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
-//
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
+// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Backprop step of CudnnRNN.
 //
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
+// Compute the backprop of both data and weights in a RNN.
 //
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+//     pass.
+// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+//     pass.
+// reserve_space: The same reserve_space produced in for forward operation.
+// input_backprop: The backprop to input in the forward pass. Has the same shape
+//     as input.
+// input_h_backprop: The backprop to input_h in the forward pass. Has the same
+//     shape as input_h.
+// input_c_backprop: The backprop to input_c in the forward pass. Has the same
+//     shape as input_c.
+// params_backprop: The backprop to the params buffer in the forward pass. Has the
+//     same shape as params.
+func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17756,56 +21082,60 @@ func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
+		Type: "CudnnRNNBackprop",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
+// InfeedEnqueueAttr is an optional argument to InfeedEnqueue.
+type InfeedEnqueueAttr func(optionalAttr)
 
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// InfeedEnqueueShape sets the optional shape attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+// value: The shape of the tensor.
+// If not specified, defaults to <>
+func InfeedEnqueueShape(value tf.Shape) InfeedEnqueueAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["shape"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
+// InfeedEnqueueLayout sets the optional layout attribute to value.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// value: A vector holding the requested layout in minor-to-major sequence.
+// If a layout attribute is passed, but its values are all -1, the layout will
+// be computed by the infeed operation.
+// If not specified, defaults to <>
+func InfeedEnqueueLayout(value []int64) InfeedEnqueueAttr {
+	return func(m optionalAttr) {
+		m["layout"] = value
+	}
+}
+
+// InfeedEnqueueDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func InfeedEnqueueDeviceOrdinal(value int64) InfeedEnqueueAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// An op which feeds a single Tensor value into the computation.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	input: A tensor that will be provided using the infeed mechanism.
 //
 // Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
+func InfeedEnqueue(scope *Scope, input tf.Output, optional ...InfeedEnqueueAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17814,82 +21144,98 @@ func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
+		Type: "InfeedEnqueue",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
+			input,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Store the input tensor in the state of the current session.
+// Computes softmax cross entropy cost and gradients to backpropagate.
+//
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	value: The tensor to be stored.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size x num_classes matrix
+// The caller must ensure that each batch of labels represents a valid
+// probability distribution.
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a ResourceHandle object.
-func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandleV2",
+		Type: "SoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			value,
+			features, labels,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
+// ReduceJoinAttr is an optional argument to ReduceJoin.
+type ReduceJoinAttr func(optionalAttr)
 
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If `True`, retain reduced dimensions with length `1`.
 // If not specified, defaults to false
-func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+// ReduceJoinSeparator sets the optional separator attribute to value.
 //
-// value: If `True`, uses the nesterov update.
-// If not specified, defaults to false
-func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
+// value: The separator to use when joining.
+// If not specified, defaults to ""
+func ReduceJoinSeparator(value string) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["separator"] = value
 	}
 }
 
-// Update '*var' according to the Adam algorithm.
+// Joins a string Tensor across the given dimensions.
 //
-// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
-// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+// Computes the string join across dimensions in the given string Tensor of shape
+// `[\\(d_0, d_1, ..., d_{n-1}\\)]`.  Returns a new Tensor created by joining the input
+// strings with the given separator (default: empty string).  Negative indices are
+// counted backwards from the end, with `-1` being equivalent to `n - 1`.  If
+// indices are not specified, joins across all dimensions beginning from `n - 1`
+// through `0`.
+//
+// For example:
+//
+// ```python
+// # tensor `a` is [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
+// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
+// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
+// tf.reduce_join(a, [0, 1]) ==> "acbd"
+// tf.reduce_join(a, [1, 0]) ==> "abcd"
+// tf.reduce_join(a, []) ==> [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a) = tf.reduce_join(a, [1, 0]) ==> "abcd"
+// ```
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	inputs: The input to be joined.  All reduced indices must have non-zero size.
+//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
+// order specified.  Omitting `reduction_indices` is equivalent to passing
+// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
 //
-// Returns the created operation.
-func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+// Returns Has shape equal to that of the input with reduced dimensions removed or
+// set to `1` depending on `keep_dims`.
+func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17898,323 +21244,308 @@ func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, b
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdam",
+		Type: "ReduceJoin",
 		Input: []tf.Input{
-			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+			inputs, reduction_indices,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SerializeManySparseAttr is an optional argument to SerializeManySparse.
-type SerializeManySparseAttr func(optionalAttr)
+// TopKAttr is an optional argument to TopK.
+type TopKAttr func(optionalAttr)
 
-// SerializeManySparseOutType sets the optional out_type attribute to value.
+// TopKSorted sets the optional sorted attribute to value.
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKSorted(value bool) TopKAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["sorted"] = value
 	}
 }
 
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
+// Finds values and indices of the `k` largest elements for the last dimension.
 //
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
+// DEPRECATED at GraphDef version 7: Use TopKV2 instead
 //
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
+//
+// If `k` varies dynamically, use `TopKV2` below.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
-		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes inverse hyperbolic cosine of x element-wise.
-func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Acosh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TensorArrayV2Attr is an optional argument to TensorArrayV2.
-type TensorArrayV2Attr func(optionalAttr)
-
-// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
-
-// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
-// If not specified, defaults to false
-func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["dynamic_size"] = value
-	}
-}
-
-// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
-// If not specified, defaults to true
-func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["clear_after_read"] = value
-	}
-}
-
-// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
-// If not specified, defaults to ""
-func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
-	}
-}
-
-// Deprecated. Use TensorArrayV3
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: Number of top elements to look for along the last dimension (along each
+// row for matrices).
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayV3
-func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"k": k}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV2",
+		Type: "TopK",
 		Input: []tf.Input{
-			size,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
-type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
-
-// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+// BatchToSpace for N-D tensors of type T.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
+// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
+// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
+// the input.  The spatial dimensions of this intermediate result are then
+// optionally cropped according to `crops` to produce the output.  This is the
+// reverse of SpaceToBatch.  See below for a precise description.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a learned unigram distribution.
+// Arguments:
+//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+// where spatial_shape has M dimensions.
+//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
+//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
+//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
+//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
+//   required that
+//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// This operation is equivalent to the following steps:
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// 1. Reshape `input` to `reshaped` of shape:
+//      [block_shape[0], ..., block_shape[M-1],
+//       batch / prod(block_shape),
+//       input_shape[1], ..., input_shape[N-1]]
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
+//      [batch / prod(block_shape),
 //
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//       input_shape[1], block_shape[0],
+//       ...,
+//       input_shape[M], block_shape[M-1],
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0],
+//       ...,
+//       input_shape[M] * block_shape[M-1],
+//
+//       input_shape[M+1],
+//       ...,
+//       input_shape[N-1]]
+//
+// 4. Crop the start and end of dimensions `[1, ..., M]` of
+//    `reshaped_permuted` according to `crops` to produce the output of shape:
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+//       ...,
+//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+//
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [2, 0]]`:
+//
+// ```
+// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+//      [[[0], [2], [4]]], [[[0], [10], [12]]],
+//      [[[0], [5], [7]]], [[[0], [13], [15]]],
+//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Type: "BatchToSpaceND",
 		Input: []tf.Input{
-			true_classes,
+			input, block_shape, crops,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// MaxPoolV2Attr is an optional argument to MaxPoolV2.
-type MaxPoolV2Attr func(optionalAttr)
+// UnpackAttr is an optional argument to Unpack.
+type UnpackAttr func(optionalAttr)
 
-// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+// UnpackAxis sets the optional axis attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
+// value: Dimension along which to unpack.  Negative values wrap around, so the
+// valid range is `[-R, R)`.
+// If not specified, defaults to 0
+func UnpackAxis(value int64) UnpackAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["axis"] = value
 	}
 }
 
-// Performs max pooling on the input.
+// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
+//
+// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
+// For example, given a tensor of shape `(A, B, C, D)`;
+//
+// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
+//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
+//   dimension unpacked along is gone, unlike `split`).
+//
+// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
+//   and each tensor in `output` will have shape `(A, C, D)`.
+// Etc.
+//
+// This is the opposite of `pack`.
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	value: 1-D or higher, with `axis` dimension size equal to `num`.
 //
-// Returns The max pooled output tensor.
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+//
+// Returns The list of tensors unpacked from `value`.
+func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
+	attrs := map[string]interface{}{"num": num}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolV2",
+		Type: "Unpack",
 		Input: []tf.Input{
-			input, ksize, strides,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
-type AddManySparseToTensorsMapAttr func(optionalAttr)
-
-// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
-//
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Unpack", err)
+		return
 	}
+	return output
 }
 
-// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// Delete the stack from its resource container.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Arguments:
+//	handle: The handle to a stack.
+//
+// Returns the created operation.
+func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StackCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
-//
-// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`, where
-//
-// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
-//
-// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-// having a first `sparse_indices` column taking values between `[0, N)`, where
-// the minibatch size `N == sparse_shape[0]`.
-//
-// The input `SparseTensor` must have rank `R` greater than 1, and the first
-// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The stored
-// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
-// will have rank `R-1`.
-//
-// The `SparseTensor` values can then be read out as part of a minibatch by passing
-// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddManySparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// Increments variable pointed to by 'resource' until it reaches 'limit'.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-// The minibatch size `N == sparse_shape[0]`.
+//	resource: Should be from a scalar `Variable` node.
+//	limit: If incrementing ref would bring it above limit, instead generates an
+// 'OutOfRange' error.
 //
-// Returns 1-D.  The handles of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.  Shape: `[N]`.
-func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
+//
+// Returns A copy of the input before increment. If nothing else modifies the
+// input, the values produced will all be distinct.
+func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"limit": limit, "T": T}
 	opspec := tf.OpSpec{
-		Type: "AddManySparseToTensorsMap",
+		Type: "ResourceCountUpTo",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			resource,
 		},
 		Attrs: attrs,
 	}
@@ -18222,41 +21553,39 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 	return op.Output(0)
 }
 
-// Concatenates tensors along one dimension.
+// Computes softsign gradients for a softsign operation.
 //
 // Arguments:
-//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [-rank(values), rank(values)).
+//	gradients: The backpropagated gradients to the corresponding softsign operation.
+//	features: The features passed as input to the corresponding softsign operation.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
+func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ConcatV2",
+		Type: "SoftsignGrad",
 		Input: []tf.Input{
-			tf.OutputList(values), axis,
+			gradients, features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+// Provides the time since epoch in seconds.
+//
+// Returns the timestamp as a `float64` for seconds since the Unix epoch.
+//
+// Note: the timestamp is computed when the op is executed, not when it is added
+// to the graph.
+func Timestamp(scope *Scope) (ts tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReadFile",
-		Input: []tf.Input{
-			filename,
-		},
+		Type: "Timestamp",
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -18349,66 +21678,160 @@ func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.
 	return op.Output(0)
 }
 
-// Computes softsign gradients for a softsign operation.
+// Returns a list of tensors with the same shapes and contents as the input
 //
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softsign operation.
-//	features: The features passed as input to the corresponding softsign operation.
+// tensors.
 //
-// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
-func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// This op can be used to override the gradient for complicated functions. For
+// example, suppose y = f(x) and we wish to apply a custom function g for backprop
+// such that dx = g(dy). In Python,
+//
+// ```python
+// with tf.get_default_graph().gradient_override_map(
+//     {'IdentityN': 'OverrideGradientWithG'}):
+//   y, _ = identity_n([f(x), x])
+//
+// @tf.RegisterGradient('OverrideGradientWithG')
+// def ApplyG(op, dy, _):
+//   return [None, g(dy)]  # Do not backprop to f(x).
+// ```
+func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SoftsignGrad",
+		Type: "IdentityN",
 		Input: []tf.Input{
-			gradients, features,
+			tf.OutputList(input),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("IdentityN", err)
+		return
+	}
+	return output
 }
 
-// Provides the time since epoch in seconds.
+// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
+type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
+
+// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// Returns the timestamp as a `float64` for seconds since the Unix epoch.
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// Note: the timestamp is computed when the op is executed, not when it is added
-// to the graph.
-func Timestamp(scope *Scope) (ts tf.Output) {
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+//
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// mg <- rho * mg_{t-1} + (1-rho) * grad
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Timestamp",
+		Type: "ResourceApplyCenteredRMSProp",
+		Input: []tf.Input{
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
+		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// VariableShapeAttr is an optional argument to VariableShape.
-type VariableShapeAttr func(optionalAttr)
+// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
+type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
 
-// VariableShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
+// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Returns the shape of the variable pointed to by `resource`.
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
 //
-// For example:
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18417,9 +21840,32 @@ func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "VariableShape",
+		Type: "ResourceSparseApplyCenteredRMSProp",
 		Input: []tf.Input{
-			input,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+//
+// Arguments:
+//
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//
+//
+func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "BatchDataset",
+		Input: []tf.Input{
+			input_dataset, batch_size,
 		},
 		Attrs: attrs,
 	}
@@ -18427,180 +21873,242 @@ func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr)
 	return op.Output(0)
 }
 
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
+// LoadTPUEmbeddingAdadeltaParametersAttr is an optional argument to LoadTPUEmbeddingAdadeltaParameters.
+type LoadTPUEmbeddingAdadeltaParametersAttr func(optionalAttr)
 
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+// LoadTPUEmbeddingAdadeltaParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdadeltaParametersTableId(value int64) LoadTPUEmbeddingAdadeltaParametersAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["table_id"] = value
 	}
 }
 
-// Computes gradients of the average pooling function.
+// LoadTPUEmbeddingAdadeltaParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingAdadeltaParametersTableName(value string) LoadTPUEmbeddingAdadeltaParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Adadelta embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
+//	parameters: Value of parameters used in the Adadelta optimization algorithm.
+//	accumulators: Value of accumulators used in the Adadelta optimization algorithm.
+//	updates: Value of updates used in the Adadelta optimization algorithm.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingAdadeltaParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, updates tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdadeltaParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
+		Type: "LoadTPUEmbeddingAdadeltaParameters",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			parameters, accumulators, updates,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high overlaps
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix,
-// which allows for defining a custom overlap criterium (eg. intersection over union,
-// intersection over area, etc.).
-//
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-//   selected_indices = tf.image.non_max_suppression_with_overlaps(
-//       overlaps, scores, max_output_size, overlap_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// The hash function is deterministic on the content of the string within the
+// process and will never change. However, it is not suitable for cryptography.
+// This function may be used when CPU time is scarce and inputs are trusted or
+// unimportant. There is a risk of adversaries constructing inputs that all hash
+// to the same bucket. To prevent this problem, use a strong hash function with
+// `tf.string_to_hash_bucket_strong`.
 //
 // Arguments:
-//	overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing
-// the n-by-n box overlap values.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	overlap_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionWithOverlaps",
+		Type: "StringToHashBucketFast",
 		Input: []tf.Input{
-			overlaps, scores, max_output_size, overlap_threshold, score_threshold,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
-//
-// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-// a matrix of label probabilities, but rather a single label per row
-// of features.  This label is considered to have probability 1.0 for the
-// given row.
+// RealAttr is an optional argument to Real.
+type RealAttr func(optionalAttr)
+
+// RealTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func RealTout(value tf.DataType) RealAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Returns the real part of a complex number.
 //
-// Inputs are the logits, not probabilities.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the real part of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
+//  part returned by this operation and *b* is the imaginary part.
 //
-// Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size vector with values in [0, num_classes).
-// This is the label for the given minibatch entry.
+// For example:
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.real(input) ==> [-2.25, 3.25]
+// ```
+func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmaxCrossEntropyWithLogits",
+		Type: "Real",
 		Input: []tf.Input{
-			features, labels,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns the truth value of NOT x element-wise.
-func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
+
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
+//
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_outputs"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with audio.
+//
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LogicalNot",
+		Type: "AudioSummary",
 		Input: []tf.Input{
-			x,
+			tag, tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// 3D real-valued fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 3 dimensions of `input`.
+// QrAttr is an optional argument to Qr.
+type QrAttr func(optionalAttr)
+
+// QrFullMatrices sets the optional full_matrices attribute to value.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// value: If true, compute full-sized `q` and `r`. If false
+// (the default), compute only the leading `P` columns of `q`.
+// If not specified, defaults to false
+func QrFullMatrices(value bool) QrAttr {
+	return func(m optionalAttr) {
+		m["full_matrices"] = value
+	}
+}
+
+// Computes the QR decompositions of one or more matrices.
 //
-// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// Computes the QR decomposition of each inner matrix in `tensor` such that
+// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
 //
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+// ```python
+// # a is a tensor.
+// # q is a tensor of orthonormal matrices.
+// # r is a tensor of upper triangular matrices.
+// q, r = qr(a)
+// q_full, r_full = qr(a, full_matrices=True)
+// ```
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the their 3D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+// Arguments:
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfftn with 3 dimensions.
-// @end_compatibility
-func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
+// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RFFT3D",
+		Type: "Qr",
 		Input: []tf.Input{
-			input, fft_length,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
 // TensorArrayV3Attr is an optional argument to TensorArrayV3.
@@ -18696,399 +22204,207 @@ func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...
 	return op.Output(0), op.Output(1)
 }
 
-// Runs multiple additive regression ensemble predictors on input instances and
-//
-// computes the logits. It is designed to be used during prediction.
-// It traverses all the trees and calculates the final score for each instance.
-//
-// Arguments:
-//
-//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
-// feature.
-//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
-// shape.
-//
-// Returns Output rank 2 Tensor containing logits for each example.
-func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesPredict",
-		Input: []tf.Input{
-			tree_ensemble_handle, tf.OutputList(bucketized_features),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Elementwise computes the bitwise OR of `x` and `y`.
-//
-// The result will have those bits set, that are set in `x`, `y` or both. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns the truth value of NOT x element-wise.
+func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BitwiseOr",
+		Type: "LogicalNot",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
-
-// MatrixSolveLsFast sets the optional fast attribute to value.
-// If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
-	return func(m optionalAttr) {
-		m["fast"] = value
-	}
-}
-
-// Solves one or more linear least-squares problems.
-//
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
-// type as `matrix` and shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations
-// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
-// in the least squares sense.
-//
-// We use the following notation for (complex) matrix and right-hand sides
-// in the batch:
+// 3D real-valued fast Fourier transform.
 //
-// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
-// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
-// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 3 dimensions of `input`.
 //
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
-// If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
-// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
-// when \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
-// sufficiently large.
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
 //
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
+// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the their 3D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
 //
 // @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
+// Equivalent to np.fft.rfftn with 3 dimensions.
 // @end_compatibility
-//
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
+func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
+		Type: "RFFT3D",
 		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPool3DAttr is an optional argument to MaxPool3D.
-type MaxPool3DAttr func(optionalAttr)
-
-// MaxPool3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DDataFormat(value string) MaxPool3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs 3D max pooling on the input.
-//
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The max pooled output tensor.
-func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
+// Computes rectified linear: `max(features, 0)`.
+func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3D",
+		Type: "Relu",
 		Input: []tf.Input{
-			input,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
-type Conv3DBackpropInputAttr func(optionalAttr)
+// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
+type ResourceApplyAddSignAttr func(optionalAttr)
 
-// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
+// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the input.
+// Update '*var' according to the AddSign update.
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	alpha: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInput",
+		Type: "ResourceApplyAddSign",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			var_, m, lr, alpha, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+// Divides sparse updates into the variable referenced by `resource`.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
+// This operation computes
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of depthwise convolution with respect to the filter.
+//     # Scalar indices
+//     ref[indices, ...] /= updates[...]
 //
-// Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] /= updates[i, ...]
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
-		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
 //
-// The hash function is deterministic on the content of the string within the
-// process. The hash function is a keyed hash function, where attribute `key`
-// defines the key of the hash function. `key` is an array of 2 elements.
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
 //
-// A strong hash is important when inputs may be malicious, e.g. URLs with
-// additional components. Adversaries could try to make their inputs hash to the
-// same bucket for a denial-of-service attack or to skew the results. A strong
-// hash prevents this by making it difficult, if not infeasible, to compute inputs
-// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
-// time than `tf.string_to_hash_bucket_fast`.
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
 //
 // Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
-//	key: The key for the keyed hash function passed as a list of two uint64
-// elements.
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
+// Returns the created operation.
+func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketStrong",
+		Type: "ResourceScatterDiv",
 		Input: []tf.Input{
-			input,
+			resource, indices, updates,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// StringLengthAttr is an optional argument to StringLength.
-type StringLengthAttr func(optionalAttr)
+// ListDiffAttr is an optional argument to ListDiff.
+type ListDiffAttr func(optionalAttr)
 
-// StringLengthUnit sets the optional unit attribute to value.
-//
-// value: The unit that is counted to compute string length.  One of: `"BYTE"` (for
-// the number of bytes in each string) or `"UTF8_CHAR"` (for the number of UTF-8
-// encoded Unicode code points in each string).  Results are undefined
-// if `unit=UTF8_CHAR` and the `input` strings do not contain structurally
-// valid UTF-8.
-// If not specified, defaults to "BYTE"
-func StringLengthUnit(value string) StringLengthAttr {
+// ListDiffOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
 	return func(m optionalAttr) {
-		m["unit"] = value
+		m["out_idx"] = value
 	}
 }
 
-// String lengths of `input`.
+// Computes the difference between two lists of numbers or strings.
 //
-// Computes the length of each string given in the input tensor.
+// Given a list `x` and a list `y`, this operation returns a list `out` that
+// represents all values that are in `x` but not in `y`. The returned list `out`
+// is sorted in the same order that the numbers appear in `x` (duplicates are
+// preserved). This operation also returns a list `idx` that represents the
+// position of each `out` element in `x`. In other words:
 //
-// Arguments:
-//	input: The string for which to compute the length.
+// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
 //
-// Returns Integer tensor that has the same shape as `input`. The output contains the
-// element-wise string lengths of `input`.
-func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringLength",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
-type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
-
-// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// For example, given this input:
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' as FOBOS algorithm with fixed learning rate.
+// ```
+// x = [1, 2, 3, 4, 5, 6]
+// y = [1, 3, 5]
+// ```
 //
-// prox_v = var - alpha * delta
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// This operation would return:
+//
+// ```
+// out ==> [2, 4, 6]
+// idx ==> [1, 3, 5]
+// ```
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	delta: The change.
+//	x: 1-D. Values to keep.
+//	y: 1-D. Values to remove.
 //
-// Returns the created operation.
-func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
+// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
+func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19097,355 +22413,258 @@ func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalGradientDescent",
-		Input: []tf.Input{
-			var_, alpha, l1, l2, delta,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns 0 if the denominator is zero.
-//
-//
-// *NOTE*: `DivNoNan` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DivNoNan",
+		Type: "ListDiff",
 		Input: []tf.Input{
 			x, y,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UnicodeDecodeWithOffsetsAttr is an optional argument to UnicodeDecodeWithOffsets.
-type UnicodeDecodeWithOffsetsAttr func(optionalAttr)
-
-// UnicodeDecodeWithOffsetsErrors sets the optional errors attribute to value.
-//
-// value: Error handling policy when there is invalid formatting found in the input.
-// The value of 'strict' will cause the operation to produce a InvalidArgument
-// error on any invalid input formatting. A value of 'replace' (the default) will
-// cause the operation to replace any invalid formatting in the input with the
-// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
-// skip any invalid formatting in the input and produce no corresponding output
-// character.
-// If not specified, defaults to "replace"
-func UnicodeDecodeWithOffsetsErrors(value string) UnicodeDecodeWithOffsetsAttr {
-	return func(m optionalAttr) {
-		m["errors"] = value
-	}
+	return op.Output(0), op.Output(1)
 }
 
-// UnicodeDecodeWithOffsetsReplacementChar sets the optional replacement_char attribute to value.
-//
-// value: The replacement character codepoint to be used in place of any invalid
-// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
-// be used. The default value is the default unicode replacement character is
-// 0xFFFD or U+65533.)
-// If not specified, defaults to 65533
-func UnicodeDecodeWithOffsetsReplacementChar(value int64) UnicodeDecodeWithOffsetsAttr {
-	return func(m optionalAttr) {
-		m["replacement_char"] = value
-	}
-}
+// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingAdadeltaParametersGradAccumDebug.
+type LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr func(optionalAttr)
 
-// UnicodeDecodeWithOffsetsReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: Whether to replace the C0 control characters (00-1F) with the
-// `replacement_char`. Default is false.
-// If not specified, defaults to false
-func UnicodeDecodeWithOffsetsReplaceControlCharacters(value bool) UnicodeDecodeWithOffsetsAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["replace_control_characters"] = value
+		m["table_id"] = value
 	}
 }
 
-// Decodes each string in `input` into a sequence of Unicode code points.
+// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Adadelta parameters with debug support.
 //
-// The character codepoints for all strings are returned using a single vector
-// `char_values`, with strings expanded to characters in row-major order.
-// Similarly, the character start byte offsets are returned using a single vector
-// `char_to_byte_starts`, with strings expanded in row-major order.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
-// The `row_splits` tensor indicates where the codepoints and start offsets for
-// each input string begin and end within the `char_values` and
-// `char_to_byte_starts` tensors.  In particular, the values for the `i`th
-// string (in row-major order) are stored in the slice
-// `[row_splits[i]:row_splits[i+1]]`. Thus:
+// Arguments:
+//	parameters: Value of parameters used in the Adadelta optimization algorithm.
+//	accumulators: Value of accumulators used in the Adadelta optimization algorithm.
+//	updates: Value of updates used in the Adadelta optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the Adadelta optimization algorithm.
 //
-// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
-//   character in the `i`th string (in row-major order).
-// * `char_to_bytes_starts[row_splits[i]+j]` is the start byte offset for the `j`th
-//   character in the `i`th string (in row-major order).
-// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
-//   string (in row-major order).
 //
-// Arguments:
-//	input: The text to be decoded. Can have any shape. Note that the output is flattened
-// to a vector of char values.
-//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
-// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
 //
-// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.A 1D int32 Tensor containing the byte index in the input string where each
-// character in `char_values` starts.
-func UnicodeDecodeWithOffsets(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeWithOffsetsAttr) (row_splits tf.Output, char_values tf.Output, char_to_byte_starts tf.Output) {
+// Returns the created operation.
+func LoadTPUEmbeddingAdadeltaParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, updates tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"input_encoding": input_encoding}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UnicodeDecodeWithOffsets",
+		Type: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug",
 		Input: []tf.Input{
-			input,
+			parameters, accumulators, updates, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Returns x - y element-wise.
-//
-// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Return a tensor with the same shape and contents as the input tensor or value.
+func Identity(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sub",
+		Type: "Identity",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
 //
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt(x^2 + y^2) \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Maximum",
+		Type: "Atan2",
 		Input: []tf.Input{
-			x, y,
+			y, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+//     Updates specified rows with values in `v`.
 //
-// Inputs are the logits, not probabilities.
+//     Computes `x[i, :] = v; return x`.
 //
 // Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size x num_classes matrix
-// The caller must ensure that each batch of labels represents a valid
-// probability distribution.
+//	x: A tensor of type `T`.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceUpdate(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SoftmaxCrossEntropyWithLogits",
+		Type: "InplaceUpdate",
 		Input: []tf.Input{
-			features, labels,
+			x, i, v,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// ReduceJoinAttr is an optional argument to ReduceJoin.
-type ReduceJoinAttr func(optionalAttr)
-
-// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If `True`, retain reduced dimensions with length `1`.
-// If not specified, defaults to false
-func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
+// OutfeedDequeueTupleAttr is an optional argument to OutfeedDequeueTuple.
+type OutfeedDequeueTupleAttr func(optionalAttr)
 
-// ReduceJoinSeparator sets the optional separator attribute to value.
+// OutfeedDequeueTupleDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// value: The separator to use when joining.
-// If not specified, defaults to ""
-func ReduceJoinSeparator(value string) ReduceJoinAttr {
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func OutfeedDequeueTupleDeviceOrdinal(value int64) OutfeedDequeueTupleAttr {
 	return func(m optionalAttr) {
-		m["separator"] = value
+		m["device_ordinal"] = value
 	}
 }
 
-// Joins a string Tensor across the given dimensions.
-//
-// Computes the string join across dimensions in the given string Tensor of shape
-// `[\\(d_0, d_1, ..., d_{n-1}\\)]`.  Returns a new Tensor created by joining the input
-// strings with the given separator (default: empty string).  Negative indices are
-// counted backwards from the end, with `-1` being equivalent to `n - 1`.  If
-// indices are not specified, joins across all dimensions beginning from `n - 1`
-// through `0`.
-//
-// For example:
+// Retrieve multiple values from the computation outfeed.
 //
-// ```python
-// # tensor `a` is [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-// tf.reduce_join(a, [0, 1]) ==> "acbd"
-// tf.reduce_join(a, [1, 0]) ==> "abcd"
-// tf.reduce_join(a, []) ==> [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a) = tf.reduce_join(a, [1, 0]) ==> "abcd"
-// ```
+// This operation will block indefinitely until data is available. Output `i`
+// corresponds to XLA tuple element `i`.
 //
 // Arguments:
-//	inputs: The input to be joined.  All reduced indices must have non-zero size.
-//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
-// order specified.  Omitting `reduction_indices` is equivalent to passing
-// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+//	dtypes: The element types of each element in `outputs`.
+//	shapes: The shapes of each tensor in `outputs`.
 //
-// Returns Has shape equal to that of the input with reduced dimensions removed or
-// set to `1` depending on `keep_dims`.
-func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
+// Returns A list of tensors that will be read from the outfeed.
+func OutfeedDequeueTuple(scope *Scope, dtypes []tf.DataType, shapes []tf.Shape, optional ...OutfeedDequeueTupleAttr) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes, "shapes": shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ReduceJoin",
-		Input: []tf.Input{
-			inputs, reduction_indices,
-		},
+		Type: "OutfeedDequeueTuple",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
-type DenseToSparseSetOperationAttr func(optionalAttr)
-
-// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("OutfeedDequeueTuple", err)
+		return
 	}
+	return outputs
 }
 
-// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// If `validate_indices` is `True`, this op validates the order and range of `set2`
-// indices.
-//
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
-//
-// Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
-// max set size across `n-1` dimensions.
-//
+// Identity op for gradient debugging.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// This op is hidden from public in Python. It is used by TensorFlow Debugger to
+// register gradient tensors for gradient debugging.
+// This op operates on non-reference-type tensors.
+func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseSetOperation",
+		Type: "DebugGradientIdentity",
 		Input: []tf.Input{
-			set1, set2_indices, set2_values, set2_shape,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// L2 Loss.
-//
-// Computes half the L2 norm of a tensor without the `sqrt`:
+// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
+type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
+
+// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-//     output = sum(t ** 2) / 2
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// var: Should be from a Variable().
 //
 // Arguments:
-//	t: Typically 2-D, but may have any dimensions.
 //
-// Returns 0-D.
-func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
+//	accum: Should be from a Variable().
+//	accum_update: : Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "L2Loss",
+		Type: "ResourceSparseApplyAdadelta",
 		Input: []tf.Input{
-			t,
+			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes cos of x element-wise.
-func Cos(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns which elements of x are NaN.
+//
+// @compatibility(numpy)
+// Equivalent to np.isnan
+// @end_compatibility
+func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cos",
+		Type: "IsNan",
 		Input: []tf.Input{
 			x,
 		},
@@ -19454,160 +22673,171 @@ func Cos(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
-type FusedBatchNormGradAttr func(optionalAttr)
-
-// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
-// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
 // If not specified, defaults to "NHWC"
-func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
+// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["dilations"] = value
 	}
 }
 
-// Gradient for batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Computes the gradients of depthwise convolution with respect to the filter.
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGrad",
+		Type: "DepthwiseConv2dNativeBackpropFilter",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// TopKAttr is an optional argument to TopK.
-type TopKAttr func(optionalAttr)
+// MapUnstageAttr is an optional argument to MapUnstage.
+type MapUnstageAttr func(optionalAttr)
 
-// TopKSorted sets the optional sorted attribute to value.
+// MapUnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKSorted(value bool) TopKAttr {
+// REQUIRES: value >= 0
+func MapUnstageCapacity(value int64) MapUnstageAttr {
 	return func(m optionalAttr) {
-		m["sorted"] = value
+		m["capacity"] = value
 	}
 }
 
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// DEPRECATED at GraphDef version 7: Use TopKV2 instead
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
-//
-// If two elements are equal, the lower-index element appears first.
-//
-// If `k` varies dynamically, use `TopKV2` below.
+// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: Number of top elements to look for along the last dimension (along each
-// row for matrices).
+// REQUIRES: value >= 0
+func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapUnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageContainer(value string) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapUnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageSharedName(value string) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns the values associated with the key
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
+// from the underlying container.   If the underlying container
+// does not contain this key, the op will block until it does.
+func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TopK",
+		Type: "MapUnstage",
 		Input: []tf.Input{
-			input,
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstage", err)
+		return
+	}
+	return values
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process.
+// An op enabling differentiation of TPU Embeddings.
 //
-// Note that the hash function may change from time to time.
-// This functionality will be deprecated and it's recommended to use
-// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+// This op simply returns its first input, which is assumed to have been sliced
+// from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of
+// this op, and its first argument being a trainable Variable, enables automatic
+// differentiation of graphs containing embeddings via the TPU Embedding Python
+// libraries.
 //
 // Arguments:
-//
-//	num_buckets: The number of buckets.
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+//	embedding_variable: A trainable variable, enabling optimizers to find this op.
+//	sliced_activations: The embedding activations Tensor to return.
+//	table_id: The id of the table in the embedding layer configuration from which
+// these activations were computed.
+//	lookup_id: Identifier of the set of embedding indices which produced these
+// activations.
+func TPUEmbeddingActivations(scope *Scope, embedding_variable tf.Output, sliced_activations tf.Output, table_id int64, lookup_id int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	attrs := map[string]interface{}{"table_id": table_id, "lookup_id": lookup_id}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucket",
+		Type: "TPUEmbeddingActivations",
 		Input: []tf.Input{
-			string_tensor,
+			embedding_variable, sliced_activations,
 		},
 		Attrs: attrs,
 	}
@@ -19615,232 +22845,244 @@ func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64
 	return op.Output(0)
 }
 
-// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace.
-type StaticRegexReplaceAttr func(optionalAttr)
-
-// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
+// BatchToSpace for 4-D tensors of type T.
+//
+// This is a legacy version of the more general BatchToSpaceND.
+//
+// Rearranges (permutes) data from batch into blocks of spatial data, followed by
+// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
+// this op outputs a copy of the input tensor where values from the `batch`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions,
+// followed by cropping along the `height` and `width` dimensions.
+//
+// Arguments:
+//	input: 4-D tensor with shape
+// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//   depth]`. Note that the batch size of the input tensor must be divisible by
+// `block_size * block_size`.
+//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+// how many elements to crop from the intermediate result across the spatial
+// dimensions as follows:
+//
+//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
+//
+//
+// Returns 4-D with shape `[batch, height, width, depth]`, where:
+//
+//       height = height_pad - crop_top - crop_bottom
+//       width = width_pad - crop_left - crop_right
+//
+// The attr `block_size` must be greater than one. It indicates the block size.
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
 //
-// value: If True, the replacement is global, otherwise the replacement
-// is done only on the first match.
-// If not specified, defaults to true
-func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr {
-	return func(m optionalAttr) {
-		m["replace_global"] = value
-	}
-}
-
-// Replaces the match of pattern in input with rewrite.
+// The output tensor has shape `[1, 4, 4, 1]` and value:
 //
-// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
 //
-// Arguments:
-//	input: The text to be processed.
-//	pattern: The regular expression to match the input.
-//	rewrite: The rewrite to be applied to the matched expression.
+// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
 //
-// Returns The text after applying pattern and rewrite.
-func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StaticRegexReplace",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes gradients for the exponential linear (Elu) operation.
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// ```
 //
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Elu operation.
-//	outputs: The outputs of the corresponding Elu operation.
+// The output tensor has shape `[2, 2, 4, 1]` and value:
 //
-// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
-// `gradients` otherwise.
-func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+// ```
+// x = [[[[1], [3]], [[5], [7]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "EluGrad",
+		Type: "BatchToSpace",
 		Input: []tf.Input{
-			gradients, outputs,
+			input, crops,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient of `igamma(a, x)` wrt `a`.
-func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Produces a summary of any statistics recorded by the given statistics manager.
+func ExperimentalStatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IgammaGradA",
+		Type: "ExperimentalStatsAggregatorSummary",
 		Input: []tf.Input{
-			a, x,
+			iterator,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that contains `count` elements from the `input_dataset`.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be taken. A value of `-1` indicates that all of `input_dataset`
-// is taken.
+// Makes a new iterator from the given `dataset` and stores it in `iterator`.
 //
+// This operation may be executed multiple times. Each execution will reset the
+// iterator in `iterator` to the first element of `dataset`.
 //
-func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the created operation.
+func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TakeDataset",
+		Type: "MakeIterator",
 		Input: []tf.Input{
-			input_dataset, count,
+			dataset, iterator,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// The gradient operator for the SparseAdd op.
+// Component-wise divides a SparseTensor by a dense Tensor.
 //
-// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-// values of A and B.
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
 // Arguments:
-//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-// the non-empty values of the sum.
-//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-// `[nnz(sum), ndims]`.
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
-// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
-// non-empty values of B.
-func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseAddGrad",
+		Type: "SparseDenseCwiseDiv",
 		Input: []tf.Input{
-			backprop_val_grad, a_indices, b_indices, sum_indices,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns a list of tensors with the same shapes and contents as the input
-//
-// tensors.
+// Creates a dataset that batches and pads `batch_size` elements from the input.
 //
-// This op can be used to override the gradient for complicated functions. For
-// example, suppose y = f(x) and we wish to apply a custom function g for backprop
-// such that dx = g(dy). In Python,
+// Arguments:
 //
-// ```python
-// with tf.get_default_graph().gradient_override_map(
-//     {'IdentityN': 'OverrideGradientWithG'}):
-//   y, _ = identity_n([f(x), x])
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors representing the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
 //
-// @tf.RegisterGradient('OverrideGradientWithG')
-// def ApplyG(op, dy, _):
-//   return [None, g(dy)]  # Do not backprop to f(x).
-// ```
-func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
+func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "IdentityN",
+		Type: "PaddedBatchDataset",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("IdentityN", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
-type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
+// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
+type ResourceApplyMomentumAttr func(optionalAttr)
 
-// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
+func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
+// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
 //
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// want to use Nesterov momentum.
 //
-// mg <- rho * mg_{t-1} + (1-rho) * grad
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-// var <- var - mom
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
+//	accum: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
+func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19849,286 +23091,169 @@ func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyCenteredRMSProp",
+		Type: "ResourceApplyMomentum",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
+			var_, accum, lr, grad, momentum,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
-type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
+// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
+type MaxPoolGradGradAttr func(optionalAttr)
 
-// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["data_format"] = value
 	}
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyCenteredRMSProp",
+		Type: "MaxPoolGradGrad",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
-//
-// Arguments:
-//
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
+// Returns the last element of the input list as well as a list with all but that element.
 //
+// Fails if the list is empty.
 //
-func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// input_handle: the input list
+// tensor: the withdrawn last element of the list
+// element_dtype: the type of elements in the list
+// element_shape: the shape of the output tensor
+func TensorListPopBack(scope *Scope, input_handle tf.Output, element_shape tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "BatchDataset",
+		Type: "TensorListPopBack",
 		Input: []tf.Input{
-			input_dataset, batch_size,
+			input_handle, element_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
-type RandomPoissonV2Attr func(optionalAttr)
-
-// RandomPoissonV2Seed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// RandomPoissonV2Dtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
+	return op.Output(0), op.Output(1)
 }
 
-// Outputs random values from the Poisson distribution(s) described by rate.
-//
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+// Determine the script codes of a given tensor of Unicode integer code points.
 //
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
+// This operation converts Unicode code points to script codes corresponding to
+// each code point. Script codes correspond to International Components for
+// Unicode (ICU) UScriptCode values. See http://icu-project.org/apiref/icu4c/uscript_8h.html.
+// Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints. Output shape will
+// match input shape.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
+//	input: A Tensor of int32 Unicode code points.
 //
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`.
-func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
+// Returns A Tensor of int32 script codes corresponding to each input code point.
+func UnicodeScript(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoissonV2",
+		Type: "UnicodeScript",
 		Input: []tf.Input{
-			shape, rate,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
-type DecodeAndCropJpegAttr func(optionalAttr)
-
-// DecodeAndCropJpegChannels sets the optional channels attribute to value.
+// Creates a sequence of numbers.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
+// This operation creates a sequence of numbers that begins at `start` and
+// extends by increments of `delta` up to but not including `limit`.
 //
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
-	}
-}
-
-// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+// For example:
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
-}
-
-// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+// ```
+// # 'start' is 3
+// # 'limit' is 18
+// # 'delta' is 3
+// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// ```
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
-	}
-}
-
-// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// Arguments:
+//	start: 0-D (scalar). First entry in the sequence.
+//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
+//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+// Returns 1-D.
+func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
-//
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["dct_method"] = value
+	opspec := tf.OpSpec{
+		Type: "Range",
+		Input: []tf.Input{
+			start, limit, delta,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Decode and Crop a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
-//
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
-//
-//
-// It is equivalent to a combination of decode and crop, but much faster by only
-// decoding partial jpeg image.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
-//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// input of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+// Returns Gradients of gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "DecodeAndCropJpeg",
+		Type: "MaxPoolGradGradWithArgmax",
 		Input: []tf.Input{
-			contents, crop_window,
+			input, grad, argmax,
 		},
 		Attrs: attrs,
 	}
@@ -20136,301 +23261,283 @@ func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output,
 	return op.Output(0)
 }
 
-// Adds two `SparseTensor` objects to produce another `SparseTensor`.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in standard
-// lexicographic order.  If this is not the case, before this step run
-// `SparseReorder` to restore index ordering.
+// Return a slice from 'input'.
 //
-// By default, if two values sum to zero at some index, the output `SparseTensor`
-// would still include that particular location in its index, storing a zero in the
-// corresponding value slot.  To override this, callers can specify `thresh`,
-// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-// corresponding value and index would then not be included.  In particular,
-// `thresh == 0` (default) means everything is kept and actual thresholding happens
-// only for a positive value.
+// The output tensor is a tensor with dimensions described by 'size'
+// whose values are extracted from 'input' starting at the offsets in
+// 'begin'.
 //
-// In the following shapes, `nnz` is the count after taking `thresh` into account.
+// *Requirements*:
+//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
-// pair takes space.
-func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+//
+//	begin: begin[i] specifies the offset into the 'i'th dimension of
+// 'input' to slice from.
+//	size: size[i] specifies the number of elements of the 'i'th dimension
+// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
+// i are included in the slice (i.e. this is equivalent to setting
+// size[i] = input.dim_size(i) - begin[i]).
+func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseAdd",
+		Type: "Slice",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+			input, begin, size,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
-
-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
+	return op.Output(0)
 }
 
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
 //
-// Arguments:
+// The Hurwitz zeta function is defined as:
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
+		Type: "Zeta",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			x, q,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
-type FixedLengthRecordReaderV2Attr func(optionalAttr)
-
-// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
+// Returns the cardinality of `input_dataset`.
 //
-// value: Number of bytes in the header, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["header_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
+// Returns the cardinality of `input_dataset`.
 //
-// value: Number of bytes in the footer, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["footer_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
+// Arguments:
+//	input_dataset: A variant tensor representing the dataset to return cardinality for.
 //
-// value: Number of bytes to hop before each read. Default of 0 means using
-// record_bytes.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["hop_bytes"] = value
+// Returns The cardinality of `input_dataset`. Named constants are used to represent
+// infinite and unknown cardinality.
+func ExperimentalDatasetCardinality(scope *Scope, input_dataset tf.Output) (cardinality tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalDatasetCardinality",
+		Input: []tf.Input{
+			input_dataset,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
+// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
+type TakeManySparseFromTensorsMapAttr func(optionalAttr)
+
+// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
+// value: The container name for the `SparseTensorsMap` read by this op.
 // If not specified, defaults to ""
-func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
+func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
+// value: The shared name for the `SparseTensorsMap` read by this op.
+// It should not be blank; rather the `shared_name` or unique Operation name
+// of the Op that created the original `SparseTensorsMap` should be used.
 // If not specified, defaults to ""
-func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
+func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
+// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
 //
-// value: The type of encoding for the file. Currently ZLIB and GZIP
-// are supported. Defaults to none.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["encoding"] = value
-	}
-}
-
-// A Reader that outputs fixed-length records from a file.
+// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
+// `N` is the minibatch size and the rows correspond to the output handles of
+// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
+// original `SparseTensor` objects that went into the given input ops must all
+// match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension on the left).
+//
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the handles represent an input, which is a `[2, 3]` matrix
+// representing two original `SparseTensor` objects:
+//
+// ```
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+// ```
+//
+// and
+//
+// ```
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+// ```
+//
+// then the final `SparseTensor` will be:
+//
+// ```
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+// ```
 //
 // Arguments:
-//	record_bytes: Number of bytes in the record.
+//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
+// Shape: `[N]`.
+//	dtype: The `dtype` of the `SparseTensor` objects stored in the
+// `SparseTensorsMap`.
 //
-// Returns The handle to reference the Reader.
-func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
+// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
+func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"record_bytes": record_bytes}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordReaderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Return a tensor with the same shape and contents as the input tensor or value.
-func Identity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Identity",
+		Type: "TakeManySparseFromTensorsMap",
 		Input: []tf.Input{
-			input,
+			sparse_handles,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
+// ResourceSparseApplyKerasMomentumAttr is an optional argument to ResourceSparseApplyKerasMomentum.
+type ResourceSparseApplyKerasMomentumAttr func(optionalAttr)
+
+// ResourceSparseApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// This is the angle \( \theta \in [-\pi, \pi] \) such that
-// \[ x = r \cos(\theta) \]
-// and
-// \[ y = r \sin(\theta) \]
-// where \(r = \sqrt(x^2 + y^2) \).
-func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Atan2",
-		Input: []tf.Input{
-			y, x,
-		},
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyKerasMomentumUseLocking(value bool) ResourceSparseApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
-
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// ResourceSparseApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
+// value: If `True`, the tensor passed to compute grad will be
+// var + momentum * accum, so in the end, the var you get is actually
+// var + momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyKerasMomentumUseNesterov(value bool) ResourceSparseApplyKerasMomentumAttr {
 	return func(m optionalAttr) {
-		m["max_outputs"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
 //
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+// Set use_nesterov = True if you want to use Nesterov momentum.
 //
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// That is for rows we have grad for, we update var and accum as follows:
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// accum = accum * momentum - lr * grad
+// var += accum
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyKerasMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummary",
+		Type: "ResourceSparseApplyKerasMomentum",
 		Input: []tf.Input{
-			tag, tensor,
+			var_, accum, lr, grad, indices, momentum,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// QrAttr is an optional argument to Qr.
-type QrAttr func(optionalAttr)
+// ResourceApplyAdamWithAmsgradAttr is an optional argument to ResourceApplyAdamWithAmsgrad.
+type ResourceApplyAdamWithAmsgradAttr func(optionalAttr)
 
-// QrFullMatrices sets the optional full_matrices attribute to value.
+// ResourceApplyAdamWithAmsgradUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, compute full-sized `q` and `r`. If false
-// (the default), compute only the leading `P` columns of `q`.
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func QrFullMatrices(value bool) QrAttr {
+func ResourceApplyAdamWithAmsgradUseLocking(value bool) ResourceApplyAdamWithAmsgradAttr {
 	return func(m optionalAttr) {
-		m["full_matrices"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes the QR decompositions of one or more matrices.
-//
-// Computes the QR decomposition of each inner matrix in `tensor` such that
-// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+// Update '*var' according to the Adam algorithm.
 //
-// ```python
-// # a is a tensor.
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
-// ```
+// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+// $$vhat_t := max{vhat_{t-1}, v_t}$$
+// $$variable := variable - lr_t * m_t / (\sqrt{vhat_t} + \epsilon)$$
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	vhat: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdamWithAmsgrad(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, vhat tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamWithAmsgradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20439,367 +23546,336 @@ func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Qr",
+		Type: "ResourceApplyAdamWithAmsgrad",
 		Input: []tf.Input{
-			input,
+			var_, m, v, vhat, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Check if the input matches the regex pattern.
-//
-// The input is a string tensor of any shape. The pattern is the
-// regular expression to be matched with every element of the input tensor.
-// The boolean values (True or False) of the output tensor indicate
-// if the input matches the regex pattern provided.
+// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
+type MapUnstageNoKeyAttr func(optionalAttr)
+
+// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+// REQUIRES: value >= 0
+func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	input: A string tensor of the text to be processed.
-//	pattern: The regular expression to match the input.
+// REQUIRES: value >= 0
+func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns a random (key, value)
 //
-// Returns A bool tensor with the same shape as `input`.
-func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output tf.Output) {
+// from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pattern": pattern}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StaticRegexFullMatch",
+		Type: "MapUnstageNoKey",
 		Input: []tf.Input{
-			input,
+			indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstageNoKey", err)
+		return
+	}
+	return key, values
 }
 
-// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
-type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
+// HashTableV2Attr is an optional argument to HashTableV2.
+type HashTableV2Attr func(optionalAttr)
 
-// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// HashTableV2Container sets the optional container attribute to value.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func HashTableV2Container(value string) HashTableV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["container"] = value
 	}
 }
 
-// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
-//
-// That is for rows we have grad for, we update var as follows:
-// prox_v = var - alpha * grad
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// HashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// Returns the created operation.
-func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func HashTableV2SharedName(value string) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalGradientDescent",
-		Input: []tf.Input{
-			var_, alpha, l1, l2, grad, indices,
-		},
-		Attrs: attrs,
+}
+
+// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+//
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Real-valued fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
+// Creates a non-initialized hash table.
 //
-// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// This op creates a hash table, specifying the type of its keys and values.
+// Before using the table you will have to initialize it.  After initialization the
+// table will be immutable.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft
-// @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Handle to a table.
+func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RFFT",
-		Input: []tf.Input{
-			input, fft_length,
-		},
+		Type: "HashTableV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds a value to the current value of a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the incremented value or a subsequent newer one.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.
+type RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AssignAddVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// QuantizedReluAttr is an optional argument to QuantizedRelu.
-type QuantizedReluAttr func(optionalAttr)
-
-// QuantizedReluOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["table_name"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear: `max(features, 0)`
-//
-// Arguments:
+// Retrieve Momentum embedding parameters with debug support.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns Parameter parameters updated by the Momentum optimization algorithm.Parameter momenta updated by the Momentum optimization algorithm.Parameter gradient_accumulators updated by the Momentum optimization algorithm.
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr) (parameters tf.Output, momenta tf.Output, gradient_accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu",
-		Input: []tf.Input{
-			features, min_features, max_features,
-		},
+		Type: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Reorders a SparseTensor into the canonical, row-major ordering.
-//
-// Note that by convention, all sparse ops preserve the canonical ordering along
-// increasing dimension number. The only time ordering can be violated is during
-// manual manipulation of the indices and values vectors to add entries.
-//
-// Reordering does not affect the shape of the SparseTensor.
-//
-// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
-// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
+// Enqueue a Tensor on the computation outfeed.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	input: A tensor that will be inserted into the outfeed queue.
 //
-// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
-// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
-func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns the created operation.
+func OutfeedEnqueue(scope *Scope, input tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReorder",
+		Type: "OutfeedEnqueue",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape,
+			input,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Computes rectified linear: `max(features, 0)`.
-func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
+// Outputs a `Summary` protocol buffer with a histogram.
+//
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
+//
+// This op reports an `InvalidArgument` error if any value is not finite.
+//
+// Arguments:
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu",
+		Type: "HistogramSummary",
 		Input: []tf.Input{
-			features,
+			tag, values,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
-type ResourceApplyAddSignAttr func(optionalAttr)
+// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
+type MutableDenseHashTableV2Attr func(optionalAttr)
 
-// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
+// MutableDenseHashTableV2Container sets the optional container attribute to value.
 //
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["container"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
-//
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
-// variable <- variable - lr_t * update
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	alpha: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
+// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// Returns the created operation.
-func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAddSign",
-		Input: []tf.Input{
-			var_, m, lr, alpha, sign_decay, beta, grad,
-		},
-		Attrs: attrs,
+}
+
+// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Component-wise divides a SparseTensor by a dense Tensor.
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
-		},
+// value: The shape of each value.
+// If not specified, defaults to <>
+func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
-type FractionalAvgPoolGradAttr func(optionalAttr)
-
-// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
+// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
 //
-// `value  20 5  16 3  7`
+// value: The initial number of hash table buckets. Must be a power
+// to 2.
+// If not specified, defaults to 131072
+func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["initial_num_buckets"] = value
+	}
+}
+
+// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
+// value: The maximum ratio between number of entries and number of
+// buckets before growing the table. Must be between 0 and 1.
+// If not specified, defaults to 0.8
+func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["max_load_factor"] = value
 	}
 }
 
-// Computes gradient of the FractionalAvgPool function.
+// Creates an empty hash table that uses tensors as the backing store.
 //
-// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-// out_backprop to those indices that form the same pooling cell. Therefore, we
-// just need to know the shape of original input tensor, instead of the whole
-// tensor.
+// It uses "open addressing" with quadratic reprobing to resolve
+// collisions.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_avg_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	empty_key: The key used to represent empty key buckets internally. Must not
+// be used in insert or lookup operations.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPoolGrad",
+		Type: "MutableDenseHashTableV2",
 		Input: []tf.Input{
-			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			empty_key, deleted_key,
 		},
 		Attrs: attrs,
 	}
@@ -20807,158 +23883,144 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 	return op.Output(0)
 }
 
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
+// RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingADAMParametersGradAccumDebug.
+type RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr func(optionalAttr)
 
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+// RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["table_id"] = value
 	}
 }
 
-// QuantizedConv2DDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+// RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["table_name"] = value
 	}
 }
 
-// Computes a 2D convolution given quantized 4D input and filter tensors.
-//
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
-//
-// Arguments:
+// Retrieve ADAM embedding parameters with debug support.
 //
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns Parameter parameters updated by the ADAM optimization algorithm.Parameter momenta updated by the ADAM optimization algorithm.Parameter velocities updated by the ADAM optimization algorithm.Parameter gradient_accumulators updated by the ADAM optimization algorithm.
+func RetrieveTPUEmbeddingADAMParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr) (parameters tf.Output, momenta tf.Output, velocities tf.Output, gradient_accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
-		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
-		},
+		Type: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
+// CudnnRNNAttr is an optional argument to CudnnRNN.
+type CudnnRNNAttr func(optionalAttr)
 
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNRnnMode(value string) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["rnn_mode"] = value
 	}
 }
 
-// Gather slices from the variable pointed to by `resource` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-//
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
-//
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
-//
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
-func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
+// CudnnRNNInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNInputMode(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ResourceGather",
-		Input: []tf.Input{
-			resource, indices,
-		},
-		Attrs: attrs,
+}
+
+// CudnnRNNDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNDirection(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Delete the TensorArray from its resource container.
-//
-// This enables the user to close and release the resource in the middle
-// of a step/run.
-//
-// Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
-//
-// Returns the created operation.
-func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// CudnnRNNDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNDropout(value float32) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV3",
-		Input: []tf.Input{
-			handle,
-		},
+}
+
+// CudnnRNNSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
-type StatelessMultinomialAttr func(optionalAttr)
+// CudnnRNNSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed2(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
 
-// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
+// CudnnRNNIsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["output_dtype"] = value
+		m["is_training"] = value
 	}
 }
 
-// Draws samples from a multinomial distribution.
+// A RNN backed by cuDNN.
 //
-// Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
-//	seed: 2 seeds (shape [2]).
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inferenece or
+//   training.
+// reserve_space: An opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is false.
+func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20967,543 +24029,565 @@ func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessMultinomial",
+		Type: "CudnnRNN",
 		Input: []tf.Input{
-			logits, num_samples, seed,
+			input, input_h, input_c, params,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+// DecodeCompressedAttr is an optional argument to DecodeCompressed.
+type DecodeCompressedAttr func(optionalAttr)
+
+// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
 //
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
+// value: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+// If not specified, defaults to ""
+func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// Decompress strings.
+//
+// This op decompresses each element of the `bytes` input `Tensor`, which
+// is assumed to be compressed using the given `compression_type`.
+//
+// The `output` is a string `Tensor` of the same shape as `bytes`,
+// each element containing the decompressed data from the corresponding
+// element in `bytes`.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+//	bytes: A Tensor of string which is compressed.
+//
+// Returns A Tensor with the same shape as input `bytes`, uncompressed
+// from bytes.
+func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
+		Type: "DecodeCompressed",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			bytes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
+// EnterAttr is an optional argument to Enter.
+type EnterAttr func(optionalAttr)
 
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+// EnterIsConstant sets the optional is_constant attribute to value.
+//
+// value: If true, the output is constant within the child frame.
+// If not specified, defaults to false
+func EnterIsConstant(value bool) EnterAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["is_constant"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
-//
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// EnterParallelIterations sets the optional parallel_iterations attribute to value.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
+// value: The number of iterations allowed to run in parallel.
+// If not specified, defaults to 10
+func EnterParallelIterations(value int64) EnterAttr {
+	return func(m optionalAttr) {
+		m["parallel_iterations"] = value
+	}
+}
+
+// Creates or finds a child frame, and makes `data` available to the child frame.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// This op is used together with `Exit` to create loops in the graph.
+// The unique `frame_name` is used by the `Executor` to identify frames. If
+// `is_constant` is true, `output` is a constant in the child frame; otherwise
+// it may be changed in the child frame. At most `parallel_iterations` iterations
+// are run in parallel in the child frame.
 //
 // Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//
+//	data: The tensor to be made available to the child frame.
+//	frame_name: The name of the child frame.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns The same tensor as `data`.
+func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{"frame_name": frame_name}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
+		Type: "Enter",
 		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+			data,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
-type MutableDenseHashTableV2Attr func(optionalAttr)
+// TryRpcAttr is an optional argument to TryRpc.
+type TryRpcAttr func(optionalAttr)
 
-// MutableDenseHashTableV2Container sets the optional container attribute to value.
+// TryRpcProtocol sets the optional protocol attribute to value.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
 // If not specified, defaults to ""
-func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
+func TryRpcProtocol(value string) TryRpcAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["protocol"] = value
 	}
 }
 
-// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+// TryRpcFailFast sets the optional fail_fast attribute to value.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func TryRpcFailFast(value bool) TryRpcAttr {
 	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		m["fail_fast"] = value
 	}
 }
 
-// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
 //
-// value: The shape of each value.
-// If not specified, defaults to <>
-func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func TryRpcTimeoutInMs(value int64) TryRpcAttr {
 	return func(m optionalAttr) {
-		m["value_shape"] = value
+		m["timeout_in_ms"] = value
 	}
 }
 
-// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+// Perform batches of RPC requests.
 //
-// value: The initial number of hash table buckets. Must be a power
-// to 2.
-// If not specified, defaults to 131072
-func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["initial_num_buckets"] = value
-	}
-}
-
-// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
 //
-// value: The maximum ratio between number of entries and number of
-// buckets before growing the table. Must be between 0 and 1.
-// If not specified, defaults to 0.8
-func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["max_load_factor"] = value
-	}
-}
-
-// Creates an empty hash table that uses tensors as the backing store.
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
 //
-// It uses "open addressing" with quadratic reprobing to resolve
-// collisions.
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
 //
-// Arguments:
-//	empty_key: The key used to represent empty key buckets internally. Must not
-// be used in insert or lookup operations.
+// then call this op with arguments:
 //
-//	value_dtype: Type of the table values.
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
 //
-// Returns Handle to a table.
-func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// Unlike the standard `Rpc` op, if the connection fails or the remote worker
+// returns an error status, this op does **not** reraise the exception.
+// Instead, the `status_code` and `status_message` entry for the corresponding RPC
+// call is set with the error returned from the RPC call.  The `response` tensor
+// will contain valid response values for those minibatch entries whose RPCs did
+// not fail; the rest of the entries will have empty strings.
+//
+// Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`.  Values correspond to tensorflow Status enum codes.Same shape as `request`.  Values correspond to Status messages
+// returned from the RPC calls.
+func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableDenseHashTableV2",
+		Type: "TryRpc",
 		Input: []tf.Input{
-			empty_key, deleted_key,
+			address, method, request,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UpperBoundAttr is an optional argument to UpperBound.
-type UpperBoundAttr func(optionalAttr)
-
-// UpperBoundOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func UpperBoundOutType(value tf.DataType) UpperBoundAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Applies upper_bound(sorted_search_values, values) along each row.
-//
-// Each set of rows with the same index in (sorted_inputs, values) is treated
-// independently.  The resulting row is the equivalent of calling
-// `np.searchsorted(sorted_inputs, values, side='right')`.
-//
-// The result is not a global index to the entire
-// `Tensor`, but rather just the index in the last dimension.
-//
-// A 2-D example:
-//   sorted_sequence = [[0, 3, 9, 9, 10],
-//                      [1, 2, 3, 4, 5]]
-//   values = [[2, 4, 9],
-//             [0, 2, 6]]
-//
-//   result = UpperBound(sorted_sequence, values)
-//
-//   result == [[1, 2, 4],
-//              [0, 2, 5]]
+// Add all input tensors element wise.
 //
 // Arguments:
-//	sorted_inputs: 2-D Tensor where each row is ordered.
-//	values: 2-D Tensor with the same numbers of rows as `sorted_search_values`. Contains
-// the values that will be searched for in `sorted_search_values`.
-//
-// Returns A `Tensor` with the same shape as `values`.  It contains the last scalar index
-// into the last dimension where values can be inserted without changing the
-// ordered property.
-func UpperBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...UpperBoundAttr) (output tf.Output) {
+//	inputs: Must all be the same size and shape.
+func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "UpperBound",
+		Type: "AddN",
 		Input: []tf.Input{
-			sorted_inputs, values,
+			tf.OutputList(inputs),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
-type FractionalMaxPoolGradAttr func(optionalAttr)
+// RetrieveTPUEmbeddingMDLAdagradLightParametersAttr is an optional argument to RetrieveTPUEmbeddingMDLAdagradLightParameters.
+type RetrieveTPUEmbeddingMDLAdagradLightParametersAttr func(optionalAttr)
 
-// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
+// RetrieveTPUEmbeddingMDLAdagradLightParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingMDLAdagradLightParametersTableId(value int64) RetrieveTPUEmbeddingMDLAdagradLightParametersAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["table_id"] = value
 	}
 }
 
-// Computes gradient of the FractionalMaxPool function.
+// RetrieveTPUEmbeddingMDLAdagradLightParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingMDLAdagradLightParametersTableName(value string) RetrieveTPUEmbeddingMDLAdagradLightParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve MDL Adagrad Light embedding parameters.
 //
-// Arguments:
-//	orig_input: Original input for `fractional_max_pool`
-//	orig_output: Original output for `fractional_max_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_max_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+// Returns Parameter parameters updated by the MDL Adagrad Light optimization algorithm.Parameter accumulators updated by the MDL Adagrad Light optimization algorithm.Parameter weights updated by the MDL Adagrad Light optimization algorithm.Parameter benefits updated by the MDL Adagrad Light optimization algorithm.
+func RetrieveTPUEmbeddingMDLAdagradLightParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMDLAdagradLightParametersAttr) (parameters tf.Output, accumulators tf.Output, weights tf.Output, benefits tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPoolGrad",
-		Input: []tf.Input{
-			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
-		},
+		Type: "RetrieveTPUEmbeddingMDLAdagradLightParameters",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
-type ResourceApplyAdagradDAAttr func(optionalAttr)
+// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug.
+type RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr func(optionalAttr)
 
-// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["table_id"] = value
 	}
 }
 
-// Update '*var' according to the proximal adagrad scheme.
+// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve Adadelta embedding parameters with debug support.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns the created operation.
-func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
+// Returns Parameter parameters updated by the Adadelta optimization algorithm.Parameter accumulators updated by the Adadelta optimization algorithm.Parameter updates updated by the Adadelta optimization algorithm.Parameter gradient_accumulators updated by the Adadelta optimization algorithm.
+func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, updates tf.Output, gradient_accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagradDA",
-		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
-		},
+		Type: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
-type SparseReduceMaxSparseAttr func(optionalAttr)
+// MapClearAttr is an optional argument to MapClear.
+type MapClearAttr func(optionalAttr)
 
-// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
+// MapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
+// REQUIRES: value >= 0
+func MapClearCapacity(value int64) MapClearAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["capacity"] = value
 	}
 }
 
-// Computes the max of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
-// SparseTensor.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// REQUIRES: value >= 0
+func MapClearMemoryLimit(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapClearContainer(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapClearSharedName(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
 //
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns the created operation.
+func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceMaxSparse",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
-		},
+		Type: "MapClear",
+
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that emits the outputs of `input_dataset` `count` times.
-//
-// Arguments:
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
+
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
 //
-//	count: A scalar representing the number of times that `input_dataset` should
-// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["field_delim"] = value
+	}
+}
+
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
 //
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
+// If not specified, defaults to true
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["use_quote_delim"] = value
+	}
+}
+
+// DecodeCSVNaValue sets the optional na_value attribute to value.
 //
-func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: Additional string to recognize as NA/NaN.
+// If not specified, defaults to ""
+func DecodeCSVNaValue(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["na_value"] = value
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "RepeatDataset",
-		Input: []tf.Input{
-			input_dataset, count,
-		},
-		Attrs: attrs,
+}
+
+// DecodeCSVSelectCols sets the optional select_cols attribute to value.
+// If not specified, defaults to <>
+func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["select_cols"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// Convert CSV records to tensors. Each column maps to one tensor.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
+//
+// Arguments:
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or an empty vector if the column is
+// required.
+//
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReciprocalGrad",
+		Type: "DecodeCSV",
 		Input: []tf.Input{
-			y, dy,
+			records, tf.OutputList(record_defaults),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
+		return
+	}
+	return output
 }
 
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+// Produces the max pool of the input tensor for quantized types.
 //
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Minimum",
+		Type: "QuantizedMaxPool",
 		Input: []tf.Input{
-			x, y,
+			input, min_input, max_input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MfccAttr is an optional argument to Mfcc.
-type MfccAttr func(optionalAttr)
-
-// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
-//
-// value: The highest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 4000
-func MfccUpperFrequencyLimit(value float32) MfccAttr {
-	return func(m optionalAttr) {
-		m["upper_frequency_limit"] = value
-	}
-}
+// RandomShuffleAttr is an optional argument to RandomShuffle.
+type RandomShuffleAttr func(optionalAttr)
 
-// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+// RandomShuffleSeed sets the optional seed attribute to value.
 //
-// value: The lowest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 20
-func MfccLowerFrequencyLimit(value float32) MfccAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomShuffleSeed(value int64) RandomShuffleAttr {
 	return func(m optionalAttr) {
-		m["lower_frequency_limit"] = value
+		m["seed"] = value
 	}
 }
 
-// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
+// RandomShuffleSeed2 sets the optional seed2 attribute to value.
 //
-// value: Resolution of the Mel bank used internally.
-// If not specified, defaults to 40
-func MfccFilterbankChannelCount(value int64) MfccAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleSeed2(value int64) RandomShuffleAttr {
 	return func(m optionalAttr) {
-		m["filterbank_channel_count"] = value
+		m["seed2"] = value
 	}
 }
 
-// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
+// Randomly shuffles a tensor along its first dimension.
 //
-// value: How many output channels to produce per time slice.
-// If not specified, defaults to 13
-func MfccDctCoefficientCount(value int64) MfccAttr {
-	return func(m optionalAttr) {
-		m["dct_coefficient_count"] = value
-	}
-}
-
-// Transforms a spectrogram into a form that's useful for speech recognition.
+//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
+//   to one and only one `output[i]`. For example, a mapping that might occur for a
+//   3x2 tensor is:
 //
-// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
-// been effective as an input feature for machine learning. They are created by
-// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
-// higher frequencies that are less significant to the human ear. They have a long
-// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-// is a good resource to learn more.
+// ```
+// [[1, 2],       [[5, 6],
+//  [3, 4],  ==>   [1, 2],
+//  [5, 6]]        [3, 4]]
+// ```
 //
 // Arguments:
-//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
-// set to true.
-//	sample_rate: How many samples per second the source audio used.
-func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
+//	value: The tensor to be shuffled.
+//
+// Returns A tensor of same shape and type as `value`, shuffled along its first
+// dimension.
+func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -21512,9 +24596,9 @@ func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional .
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Mfcc",
+		Type: "RandomShuffle",
 		Input: []tf.Input{
-			spectrogram, sample_rate,
+			value,
 		},
 		Attrs: attrs,
 	}
@@ -21522,181 +24606,222 @@ func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional .
 	return op.Output(0)
 }
 
-// Returns the element-wise sum of a list of tensors.
+// EnqueueTPUEmbeddingSparseBatchAttr is an optional argument to EnqueueTPUEmbeddingSparseBatch.
+type EnqueueTPUEmbeddingSparseBatchAttr func(optionalAttr)
+
+// EnqueueTPUEmbeddingSparseBatchDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-// wait for all of its inputs to be ready before beginning to sum. This can
-// save memory if inputs are ready at different times, since minimum temporary
-// storage is proportional to the output size rather than the inputs size.
+// value: The TPU device to use. Should be >= 0 and less than the number
+// of TPU cores in the task on which the node is placed.
+// If not specified, defaults to -1
+func EnqueueTPUEmbeddingSparseBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingSparseBatchAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// EnqueueTPUEmbeddingSparseBatchCombiners sets the optional combiners attribute to value.
 //
-// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+// value: A list of string scalars, one for each embedding table that specify
+// how to normalize the embedding activations after weighted summation.
+// Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+// the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+// 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+// all tables.
+// If not specified, defaults to <>
+func EnqueueTPUEmbeddingSparseBatchCombiners(value []string) EnqueueTPUEmbeddingSparseBatchAttr {
+	return func(m optionalAttr) {
+		m["combiners"] = value
+	}
+}
+
+// An op that enqueues TPUEmbedding input indices from a SparseTensor.
 //
-// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+// This Op eases the porting of code that uses embedding_lookup_sparse(),
+// although some Python preprocessing of the SparseTensor arguments to
+// embedding_lookup_sparse() is required to produce the arguments to this Op,
+// since only a single EnqueueTPUEmbeddingSparseBatch Op is allowed per training
+// step.
+//
+// The tensors at corresponding positions in the three input lists
+// must have the same shape, i.e. rank 1 with dim_size() equal to the total
+// number of lookups into the table described by the corresponding table_id.
 //
 // Arguments:
-//	inputs: A list of `Tensor` objects, each with same shape and type.
-//	shape: Shape of elements of `inputs`.
-func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
+//	sample_indices: A list of rank 1 Tensors specifying the training example and
+// feature to which the corresponding embedding_indices and aggregation_weights
+// values belong. sample_indices[i] must equal b * nf + f, where nf is the
+// number of features from the corresponding table, f is in [0, nf), and
+// b is in [0, batch size).
+//	embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
+//	aggregation_weights: A list of rank 1 Tensors containing per sample -- i.e. per
+// (training example, feature) -- aggregation weights.
+//	mode_override: A string input that overrides the mode specified in the
+// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+// in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+//
+// Returns the created operation.
+func EnqueueTPUEmbeddingSparseBatch(scope *Scope, sample_indices []tf.Output, embedding_indices []tf.Output, aggregation_weights []tf.Output, mode_override tf.Output, optional ...EnqueueTPUEmbeddingSparseBatchAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AccumulateNV2",
+		Type: "EnqueueTPUEmbeddingSparseBatch",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			tf.OutputList(sample_indices), tf.OutputList(embedding_indices), tf.OutputList(aggregation_weights), mode_override,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Outputs deterministic pseudorandom random integers from a uniform distribution.
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
+
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
 //
-// The generated values follow a uniform distribution in the range `[minval, maxval)`.
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom values from a normal distribution.
 //
-// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
+// The generated values will have mean 0 and standard deviation 1.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
 //	shape: The shape of the output tensor.
 //	seed: 2 seeds (shape [2]).
-//	minval: Minimum value (inclusive, scalar).
-//	maxval: Maximum value (exclusive, scalar).
 //
 // Returns Random values with specified shape.
-func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniformInt",
+		Type: "StatelessRandomNormal",
 		Input: []tf.Input{
-			shape, seed, minval, maxval,
+			shape, seed,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
+// An Op to exchange data across TPU replicas.
 //
-// Arguments:
-//	input: A complex tensor.
+// On each replica, the input is split into `split_count` blocks along
+// `split_dimension` and send to the other replicas given group_assignment. After
+// receiving `split_count` - 1 blocks from other replicas, we concatenate the
+// blocks along `concat_dimension` as the output.
 //
-// Returns A complex tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+// For example, suppose there are 2 TPU replicas:
+// replica 0 receives input: `[[A, B]]`
+// replica 1 receives input: `[[C, D]]`
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IFFT",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// 2D fast Fourier transform.
+// group_assignment=`[[0, 1]]`
+// concat_dimension=0
+// split_dimension=1
+// split_count=2
 //
-// Computes the 2-dimensional discrete Fourier transform over the inner-most
-// 2 dimensions of `input`.
+// replica 0's output: `[[A], [C]]`
+// replica 1's output: `[[B], [D]]`
 //
 // Arguments:
-//	input: A complex tensor.
-//
-// Returns A complex tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform.
+//	input: The local input to the sum.
+//	group_assignment: An int32 tensor with shape
+// [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
+// replica ids in the ith subgroup.
+//	concat_dimension: The dimension number to concatenate.
+//	split_dimension: The dimension number to split.
+//	split_count: The number of splits, this number must equal to the sub-group
+// size(group_assignment.get_shape()[1])
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft2
-// @end_compatibility
-func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns The exchanged result.
+func AllToAll(scope *Scope, input tf.Output, group_assignment tf.Output, concat_dimension int64, split_dimension int64, split_count int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"concat_dimension": concat_dimension, "split_dimension": split_dimension, "split_count": split_count}
 	opspec := tf.OpSpec{
-		Type: "FFT2D",
+		Type: "AllToAll",
 		Input: []tf.Input{
-			input,
+			input, group_assignment,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse 2D fast Fourier transform.
+// Adds a value to the current value of a variable.
 //
-// Computes the inverse 2-dimensional discrete Fourier transform over the
-// inner-most 2 dimensions of `input`.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the incremented value or a subsequent newer one.
 //
 // Arguments:
-//	input: A complex tensor.
-//
-// Returns A complex tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft2
-// @end_compatibility
-func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns the created operation.
+func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT2D",
+		Type: "AssignAddVariableOp",
 		Input: []tf.Input{
-			input,
+			resource, value,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Inverse 2D real-valued fast Fourier transform.
+// Real-valued fast Fourier transform.
 //
-// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 2 dimensions of `input`.
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
 //
-// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
 //
-// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
 // corresponding dimension of `input`, the dimension is cropped. If it is larger,
 // the dimension is padded with zeros.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 2D Fourier transform.
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
 //
 // @compatibility(numpy)
-// Equivalent to np.fft.irfft2
+// Equivalent to np.fft.rfft
 // @end_compatibility
-func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT2D",
+		Type: "RFFT",
 		Input: []tf.Input{
 			input, fft_length,
 		},
@@ -21705,103 +24830,92 @@ func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Out
 	return op.Output(0)
 }
 
-// DecodeJpegAttr is an optional argument to DecodeJpeg.
-type DecodeJpegAttr func(optionalAttr)
+// RetrieveTPUEmbeddingAdadeltaParametersAttr is an optional argument to RetrieveTPUEmbeddingAdadeltaParameters.
+type RetrieveTPUEmbeddingAdadeltaParametersAttr func(optionalAttr)
 
-// DecodeJpegChannels sets the optional channels attribute to value.
+// RetrieveTPUEmbeddingAdadeltaParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeJpegChannels(value int64) DecodeJpegAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdadeltaParametersTableId(value int64) RetrieveTPUEmbeddingAdadeltaParametersAttr {
 	return func(m optionalAttr) {
-		m["channels"] = value
+		m["table_id"] = value
 	}
 }
 
-// DecodeJpegRatio sets the optional ratio attribute to value.
-//
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeJpegRatio(value int64) DecodeJpegAttr {
+// RetrieveTPUEmbeddingAdadeltaParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdadeltaParametersTableName(value string) RetrieveTPUEmbeddingAdadeltaParametersAttr {
 	return func(m optionalAttr) {
-		m["ratio"] = value
+		m["table_name"] = value
 	}
 }
 
-// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+// Retrieve Adadelta embedding parameters.
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
-}
-
-// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
+// Returns Parameter parameters updated by the Adadelta optimization algorithm.Parameter accumulators updated by the Adadelta optimization algorithm.Parameter updates updated by the Adadelta optimization algorithm.
+func RetrieveTPUEmbeddingAdadeltaParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdadeltaParametersAttr) (parameters tf.Output, accumulators tf.Output, updates tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingAdadeltaParameters",
 
-// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
-//
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// DecodeJpegDctMethod sets the optional dct_method attribute to value.
-//
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+// UpperBoundAttr is an optional argument to UpperBound.
+type UpperBoundAttr func(optionalAttr)
+
+// UpperBoundOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func UpperBoundOutType(value tf.DataType) UpperBoundAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["out_type"] = value
 	}
 }
 
-// Decode a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
+// Applies upper_bound(sorted_search_values, values) along each row.
 //
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
+// Each set of rows with the same index in (sorted_inputs, values) is treated
+// independently.  The resulting row is the equivalent of calling
+// `np.searchsorted(sorted_inputs, values, side='right')`.
 //
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
+// The result is not a global index to the entire
+// `Tensor`, but rather just the index in the last dimension.
 //
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
+// A 2-D example:
+//   sorted_sequence = [[0, 3, 9, 9, 10],
+//                      [1, 2, 3, 4, 5]]
+//   values = [[2, 4, 9],
+//             [0, 2, 6]]
 //
+//   result = UpperBound(sorted_sequence, values)
 //
-// This op also supports decoding PNGs and non-animated GIFs since the interface is
-// the same, though it is cleaner to use `tf.image.decode_image`.
+//   result == [[1, 2, 4],
+//              [0, 2, 5]]
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
+//	sorted_inputs: 2-D Tensor where each row is ordered.
+//	values: 2-D Tensor with the same numbers of rows as `sorted_search_values`. Contains
+// the values that will be searched for in `sorted_search_values`.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+// Returns A `Tensor` with the same shape as `values`.  It contains the last scalar index
+// into the last dimension where values can be inserted without changing the
+// ordered property.
+func UpperBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...UpperBoundAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -21810,9 +24924,9 @@ func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (i
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJpeg",
+		Type: "UpperBound",
 		Input: []tf.Input{
-			contents,
+			sorted_inputs, values,
 		},
 		Attrs: attrs,
 	}
@@ -21820,248 +24934,312 @@ func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (i
 	return op.Output(0)
 }
 
-// Inverse 3D real-valued fast Fourier transform.
+// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
+type FractionalMaxPoolGradAttr func(optionalAttr)
+
+// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 3 dimensions of `input`.
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// `index  0  1  2  3  4`
 //
-// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// `value  20 5  16 3  7`
 //
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// Computes gradient of the FractionalMaxPool function.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 3D real Fourier transform.
+// Arguments:
+//	orig_input: Original input for `fractional_max_pool`
+//	orig_output: Original output for `fractional_max_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_max_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-// @compatibility(numpy)
-// Equivalent to np.irfftn with 3 dimensions.
-// @end_compatibility
-func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT3D",
+		Type: "FractionalMaxPoolGrad",
 		Input: []tf.Input{
-			input, fft_length,
+			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x != y) element-wise.
+// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
+type SparseReduceMaxSparseAttr func(optionalAttr)
+
+// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
 //
-// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the max of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
+// SparseTensor.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NotEqual",
+		Type: "SparseReduceMaxSparse",
 		Input: []tf.Input{
-			x, y,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Produces the max pool of the input tensor for quantized types.
+// Convert one or more images from HSV to RGB.
+//
+// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
+//
+// See `rgb_to_hsv` for a description of the HSV encoding.
 //
 // Arguments:
-//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns `images` converted to RGB.
+func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMaxPool",
+		Type: "HSVToRGB",
 		Input: []tf.Input{
-			input, min_input, max_input,
+			images,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes softplus: `log(exp(features) + 1)`.
-func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
+// Computes the gradient of the sigmoid of `x` wrt its input.
+//
+// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
+// `dy` is the corresponding input gradient.
+func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Softplus",
+		Type: "SigmoidGrad",
 		Input: []tf.Input{
-			features,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential of x - 1 element-wise.
+// Creates a dataset that changes the batch size.
 //
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+// Creates a dataset that changes the batch size of the dataset to current batch
+// size // num_workers.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the input dataset.
+//	num_workers: A scalar representing the number of workers to distribute this batch across. As
+// a result of this transformation the current batch size would end up being
+// divided  by this parameter.
+//
+//
+func ExperimentalRebatchDataset(scope *Scope, input_dataset tf.Output, num_workers tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Expm1",
+		Type: "ExperimentalRebatchDataset",
 		Input: []tf.Input{
-			x,
+			input_dataset, num_workers,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the number of records this Reader has produced.
-//
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
+// Creates a dataset that emits the outputs of `input_dataset` `count` times.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+//
+//	count: A scalar representing the number of times that `input_dataset` should
+// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+//
+//
+func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
+		Type: "RepeatDataset",
 		Input: []tf.Input{
-			reader_handle,
+			input_dataset, count,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorListConcatAttr is an optional argument to TensorListConcat.
-type TensorListConcatAttr func(optionalAttr)
+// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
+type ResourceApplyAdagradDAAttr func(optionalAttr)
 
-// TensorListConcatElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorListConcatElementShape(value tf.Shape) TensorListConcatAttr {
+// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Concats all tensors in the list along the 0th dimension.
-//
-// Requires that all tensors have the same shape except the first dimension.
+// Update '*var' according to the proximal adagrad scheme.
 //
-// input_handle: The input list.
-// tensor: The concated result.
-// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+// Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-func TensorListConcat(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListConcatAttr) (tensor tf.Output, lengths tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListConcat",
+		Type: "ResourceApplyAdagradDA",
 		Input: []tf.Input{
-			input_handle,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the set of files matching one or more glob patterns.
-//
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
-// Note also that the order of filenames returned can be non-deterministic.
+// Creates a TensorList which, when stacked, has the value of `tensor`.
 //
-// Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+// Each tensor in the result list corresponds to one row of the input tensor.
 //
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+// tensor: The input tensor.
+// output_handle: The list.
+func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
+		Type: "TensorListFromTensor",
 		Input: []tf.Input{
-			pattern,
+			tensor, element_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
-type HistogramFixedWidthAttr func(optionalAttr)
+// ConfigureDistributedTPUAttr is an optional argument to ConfigureDistributedTPU.
+type ConfigureDistributedTPUAttr func(optionalAttr)
 
-// HistogramFixedWidthDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT32
-func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
+// ConfigureDistributedTPUEmbeddingConfig sets the optional embedding_config attribute to value.
+//
+// value: Reserved. Do not use.
+// If not specified, defaults to ""
+func ConfigureDistributedTPUEmbeddingConfig(value string) ConfigureDistributedTPUAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["embedding_config"] = value
 	}
 }
 
-// Return histogram of values.
+// ConfigureDistributedTPUTpuEmbeddingConfig sets the optional tpu_embedding_config attribute to value.
 //
-// Given the tensor `values`, this operation returns a rank 1 histogram counting
-// the number of entries in `values` that fall into every bin.  The bins are
-// equal width and determined by the arguments `value_range` and `nbins`.
-//
-// ```python
-// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
-// nbins = 5
-// value_range = [0.0, 5.0]
-// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
-//
-// with tf.get_default_session() as sess:
-//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
-//   variables.global_variables_initializer().run()
-//   sess.run(hist) => [2, 1, 1, 0, 2]
-// ```
+// value: Serialized tensorflow.tpu.TPUEmbeddingConfiguration that
+// describes the embedding lookups of the program.
+// If not specified, defaults to ""
+func ConfigureDistributedTPUTpuEmbeddingConfig(value string) ConfigureDistributedTPUAttr {
+	return func(m optionalAttr) {
+		m["tpu_embedding_config"] = value
+	}
+}
+
+// ConfigureDistributedTPUIsGlobalInit sets the optional is_global_init attribute to value.
 //
-// Arguments:
-//	values: Numeric `Tensor`.
-//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
-// values <= value_range[0] will be mapped to hist[0],
-// values >= value_range[1] will be mapped to hist[-1].
-//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
+// value: Reserved. Do not use.
+// If not specified, defaults to false
+func ConfigureDistributedTPUIsGlobalInit(value bool) ConfigureDistributedTPUAttr {
+	return func(m optionalAttr) {
+		m["is_global_init"] = value
+	}
+}
+
+// Sets up the centralized structures for a distributed TPU system.
 //
-// Returns A 1-D `Tensor` holding histogram of values.
-func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
+// Returns A serialized tensorflow.tpu.TopologyProto that describes the TPU
+// topology.
+func ConfigureDistributedTPU(scope *Scope, optional ...ConfigureDistributedTPUAttr) (topology tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22070,322 +25248,189 @@ func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "HistogramFixedWidth",
-		Input: []tf.Input{
-			values, value_range, nbins,
-		},
+		Type: "ConfigureDistributedTPU",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DAttr is an optional argument to Conv3D.
-type Conv3DAttr func(optionalAttr)
-
-// Conv3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DDataFormat(value string) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DDilations sets the optional dilations attribute to value.
+// Reshapes a quantized tensor as per the Reshape op.
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DDilations(value []int64) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
+// ```
 //
-// In signal processing, cross-correlation is a measure of similarity of
-// two waveforms as a function of a time-lag applied to one of them. This
-// is also known as a sliding dot product or sliding inner-product.
+// Arguments:
 //
-// Our Conv3D implements a form of cross-correlation.
+//	shape: Defines the shape of the output tensor.
+//	input_min: The minimum value of the input.
+//	input_max: The maximum value of the input.
 //
-// Arguments:
-//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
-//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
-// out_channels]`. `in_channels` must match between `input` and `filter`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+// Returns This value is copied from input_min.This value is copied from input_max.
+func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Conv3D",
+		Type: "QuantizedReshape",
 		Input: []tf.Input{
-			input, filter,
+			tensor, shape, input_min, input_max,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
+// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
+type PriorityQueueV2Attr func(optionalAttr)
 
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
+// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
+//
+// value: The type of each component in a value.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["component_types"] = value
 	}
 }
 
-// QuantizeV2RoundMode sets the optional round_mode attribute to value.
-// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
-func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+// PriorityQueueV2Capacity sets the optional capacity attribute to value.
+//
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
 	return func(m optionalAttr) {
-		m["round_mode"] = value
+		m["capacity"] = value
 	}
 }
 
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.  The
-// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
-// when rounding float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8: out[i] -= (range(T) + 1) / 2.0
-// ```
-//
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
-//
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = num_discrete_values / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-//
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-//
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-//
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
-//
-// From this we compute our scaling factor, s:
-//
-// ```c++
-//   s = (max_fixed - min_fixed) / (2 * m)
-// ```
+// PriorityQueueV2Container sets the optional container attribute to value.
 //
-// Now we can quantize the elements of our tensor:
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// ```c++
-// result = round(input * s)
-// ```
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that produces elements sorted by the first component value.
 //
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
+// Note that the PriorityQueue requires the first component of any element
+// to be a scalar int64, in addition to the other elements declared by
+// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
+// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
+// entry in their input (resp. output) lists.
 //
 // Arguments:
+//	shapes: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-//
-//
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns The handle to the queue.
+func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{"shapes": shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
-		Input: []tf.Input{
-			input, min_range, max_range,
-		},
+		Type: "PriorityQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
-//
-// Arguments:
-//
-//	batch_size: A scalar representing the number of elements to accumulate in a batch.
-//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
-// is smaller than desired.
-//
+// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
+type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "BatchDatasetV2",
-		Input: []tf.Input{
-			input_dataset, batch_size, drop_remainder,
-		},
-		Attrs: attrs,
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Saves the input tensors to disk.
-//
-// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-// is written to `filename` with name `tensor_names[i]`.
+// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
 //
-// See also `SaveSlices`.
+// That is for rows we have grad for, we update var as follows:
+// prox_v = var - alpha * grad
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
 // Arguments:
-//	filename: Must have a single element. The name of the file to which we write
-// the tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	data: `N` tensors to save.
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
+func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Save",
+		Type: "ResourceSparseApplyProximalGradientDescent",
 		Input: []tf.Input{
-			filename, tensor_names, tf.OutputList(data),
+			var_, alpha, l1, l2, grad, indices,
 		},
+		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
+// Check if the input matches the regex pattern.
 //
-// true, this follows Python semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+// The input is a string tensor of any shape. The pattern is the
+// regular expression to be matched with every element of the input tensor.
+// The boolean values (True or False) of the output tensor indicate
+// if the input matches the regex pattern provided.
 //
-// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FloorMod",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient of morphological 2-D dilation with respect to the filter.
+// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//	input: A string tensor of the text to be processed.
+//	pattern: The regular expression to match the input.
 //
-// Returns 3-D with shape `[filter_height, filter_width, depth]`.
-func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
+// Returns A bool tensor with the same shape as `input`.
+func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{"pattern": pattern}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropFilter",
+		Type: "StaticRegexFullMatch",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -22393,76 +25438,102 @@ func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, o
 	return op.Output(0)
 }
 
-// Returns a list list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
+// OutfeedDequeueAttr is an optional argument to OutfeedDequeue.
+type OutfeedDequeueAttr func(optionalAttr)
+
+// OutfeedDequeueDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// tensor: The tensor to put on the list.
-// input_handle: The old list.
-// output_handle: A list with the elements of the old list followed by tensor.
-// element_dtype: the type of elements in the list.
-// element_shape: a shape compatible with that of elements in the list.
-func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) {
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func OutfeedDequeueDeviceOrdinal(value int64) OutfeedDequeueAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// Retrieves a single tensor from the computation outfeed.
+//
+// This operation will block indefinitely until data is available.
+//
+// Arguments:
+//	dtype: The type of elements in the tensor.
+//	shape: The shape of the tensor.
+//
+// Returns A tensor that will be read from the device outfeed.
+func OutfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...OutfeedDequeueAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorListPushBack",
-		Input: []tf.Input{
-			input_handle, tensor,
-		},
+		Type: "OutfeedDequeue",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
-type AddSparseToTensorsMapAttr func(optionalAttr)
+// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
+type RandomPoissonV2Attr func(optionalAttr)
 
-// AddSparseToTensorsMapContainer sets the optional container attribute to value.
+// RandomPoissonV2Seed sets the optional seed attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["seed"] = value
 	}
 }
 
-// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["seed2"] = value
 	}
 }
 
-// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
-//
-// A `SparseTensor` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`.
+// RandomPoissonV2Dtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from the Poisson distribution(s) described by rate.
 //
-// This operator takes the given `SparseTensor` and adds it to a container
-// object (a `SparseTensorsMap`).  A unique key within this container is generated
-// in the form of an `int64`, and this is the value that is returned.
+// This op uses two algorithms, depending on rate. If rate >= 10, then
+// the algorithm by Hormann is used to acquire samples via
+// transformation-rejection.
+// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
 //
-// The `SparseTensor` can then be read out as part of a minibatch by passing
-// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddSparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+// random variables.
+// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+// Programming, Volume 2. Addison Wesley
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in rate.
+//	rate: A tensor in which each scalar is a "rate" parameter describing the
+// associated poisson distribution.
 //
-// Returns 0-D.  The handle of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.
-func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
+// Returns A tensor with shape `shape + shape(rate)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `rate[i0, i1, ...iN]`.
+func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22471,9 +25542,9 @@ func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddSparseToTensorsMap",
+		Type: "RandomPoissonV2",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
@@ -22481,1049 +25552,1142 @@ func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values
 	return op.Output(0)
 }
 
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
-//
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
+// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug.
+type RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// then the final deserialized `SparseTensor` will be:
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve RMSProp embedding parameters with debug support.
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+// Returns Parameter parameters updated by the RMSProp optimization algorithm.Parameter ms updated by the RMSProp optimization algorithm.Parameter mom updated by the RMSProp optimization algorithm.Parameter gradient_accumulators updated by the RMSProp optimization algorithm.
+func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr) (parameters tf.Output, ms tf.Output, mom tf.Output, gradient_accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
-		Input: []tf.Input{
-			serialized_sparse,
-		},
+		Type: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Elementwise computes the bitwise AND of `x` and `y`.
+// Computes the gradient for the rsqrt of `x` wrt its input.
 //
-// The result will have those bits set, that are set in both `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BitwiseAnd",
+		Type: "RsqrtGrad",
 		Input: []tf.Input{
-			x, y,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse real-valued fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most dimension of `input`.
+// Encode audio data using the WAV file format.
 //
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// This operation will generate a string suitable to be saved out to create a .wav
+// audio file. It will be encoded in the 16-bit PCM format. It takes in float
+// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
+// that range.
 //
-// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-// than the corresponding dimension of `input`, the dimension is cropped. If it is
-// larger, the dimension is padded with zeros.
+// `audio` is a 2-D float Tensor of shape `[length, channels]`.
+// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier transform.
+//	audio: 2-D with shape `[length, channels]`.
+//	sample_rate: Scalar containing the sample frequency.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft
-// @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns 0-D. WAV-encoded file contents.
+func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT",
+		Type: "EncodeWav",
 		Input: []tf.Input{
-			input, fft_length,
+			audio, sample_rate,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates a list of `SparseTensor` along the specified dimension.
-//
-// Concatenation is with respect to the dense versions of these sparse tensors.
-// It is assumed that each input is a `SparseTensor` whose elements are ordered
-// along increasing dimension number.
-//
-// All inputs' shapes must match, except for the concat dimension.  The
-// `indices`, `values`, and `shapes` lists must have the same length.
-//
-// The output shape is identical to the inputs', except along the concat
-// dimension, where it is the sum of the inputs' sizes along that dimension.
-//
-// The output elements will be resorted to preserve the sort order along
-// increasing dimension number.
-//
-// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-// values across all inputs. This is due to the need for an internal sort in
-// order to concatenate efficiently across an arbitrary dimension.
-//
-// For example, if `concat_dim = 1` and the inputs are
-//
-//     sp_inputs[0]: shape = [2, 3]
-//     [0, 2]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     sp_inputs[1]: shape = [2, 4]
-//     [0, 1]: "d"
-//     [0, 2]: "e"
-//
-// then the output will be
-//
-//     shape = [2, 7]
-//     [0, 2]: "a"
-//     [0, 4]: "d"
-//     [0, 5]: "e"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
+type ResourceApplyAdaMaxAttr func(optionalAttr)
+
+// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
 //
-// Graphically this is equivalent to doing
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AdaMax algorithm.
 //
-//     [    a] concat [  d e  ] = [    a   d e  ]
-//     [b c  ]        [       ]   [b c          ]
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// v_t <- max(beta2 * v_{t-1}, abs(g))
+// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.  Non-empty values of each `SparseTensor`.
-//	shapes: 1-D.  Shapes of each `SparseTensor`.
-//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-// where rank is the number of dimensions in each input `SparseTensor`.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"concat_dim": concat_dim}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseConcat",
+		Type: "ResourceApplyAdaMax",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes atan of x element-wise.
+func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atan",
+		Input: []tf.Input{
+			x,
+		},
+	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Generates sparse cross from a list of sparse and dense tensors.
-//
-// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
-// representing features of one feature column. It outputs a 2D `SparseTensor` with
-// the batchwise crosses of these features.
-//
-// For example, if the inputs are
-//
-//     inputs[0]: SparseTensor with shape = [2, 2]
-//     [0, 0]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     inputs[1]: SparseTensor with shape = [2, 1]
-//     [0, 0]: "d"
-//     [1, 0]: "e"
-//
-//     inputs[2]: Tensor [["f"], ["g"]]
-//
-// then the output will be
-//
-//     shape = [2, 2]
-//     [0, 0]: "a_X_d_X_f"
-//     [1, 0]: "b_X_e_X_g"
-//     [1, 1]: "c_X_e_X_g"
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
+
+// AssertSummarize sets the optional summarize attribute to value.
 //
-// if hashed_output=true then the output will be
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
+	}
+}
+
+// Asserts that the given condition is true.
 //
-//     shape = [2, 2]
-//     [0, 0]: FingerprintCat64(
-//                 Fingerprint64("f"), FingerprintCat64(
-//                     Fingerprint64("d"), Fingerprint64("a")))
-//     [1, 0]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("b")))
-//     [1, 1]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("c")))
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.   values of each `SparseTensor`.
-//	shapes: 1-D.   Shapes of each `SparseTensor`.
-//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
-//	hashed_output: If true, returns the hash of the cross instead of the string.
-// This will allow us avoiding string manipulations.
-//	num_buckets: It is used if hashed_output is true.
-// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
-//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
-// function to combine the crosses fingerprints.
-//
-//
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
-// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns the created operation.
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseCross",
+		Type: "Assert",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
+			condition, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
-type ResourceApplyProximalAdagradAttr func(optionalAttr)
+// LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingAdagradParametersGradAccumDebug.
+type LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr func(optionalAttr)
 
-// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// LoadTPUEmbeddingAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdagradParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["table_id"] = value
 	}
 }
 
-// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
+// LoadTPUEmbeddingAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingAdagradParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Adagrad embedding parameters with debug support.
 //
-// accum += grad * grad
-// prox_v = var - lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
+//	parameters: Value of parameters used in the Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the Adagrad optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the Adagrad optimization algorithm.
+//
+//
 //
 // Returns the created operation.
-func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
+func LoadTPUEmbeddingAdagradParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalAdagrad",
+		Type: "LoadTPUEmbeddingAdagradParametersGradAccumDebug",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad,
+			parameters, accumulators, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
-type MutableHashTableOfTensorsV2Attr func(optionalAttr)
+// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingFTRLParametersGradAccumDebug.
+type RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr func(optionalAttr)
 
-// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
+// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableName sets the optional table_name attribute to value.
 // If not specified, defaults to ""
-func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
+func RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["table_name"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
+// Retrieve FTRL embedding parameters with debug support.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the FTRL optimization algorithm.Parameter accumulators updated by the FTRL optimization algorithm.Parameter linears updated by the FTRL optimization algorithm.Parameter gradient_accumulators updated by the FTRL optimization algorithm.
+func RetrieveTPUEmbeddingFTRLParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, linears tf.Output, gradient_accumulators tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
+// A dataset that splits the elements of its input into multiple elements.
+func ExperimentalUnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalUnbatchDataset",
+		Input: []tf.Input{
+			input_dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StringFormatAttr is an optional argument to StringFormat.
+type StringFormatAttr func(optionalAttr)
+
+// StringFormatTemplate sets the optional template attribute to value.
+//
+// value: A string, the template to format tensor summaries into.
+// If not specified, defaults to "%s"
+func StringFormatTemplate(value string) StringFormatAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["template"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
+// StringFormatPlaceholder sets the optional placeholder attribute to value.
+//
+// value: A string, at each placeholder in the template a subsequent tensor summary will be inserted.
+// If not specified, defaults to "%s"
+func StringFormatPlaceholder(value string) StringFormatAttr {
 	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		m["placeholder"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
-// If not specified, defaults to <>
-func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+// StringFormatSummarize sets the optional summarize attribute to value.
+//
+// value: When formatting the tensor summaries print the first and last summarize entries of each tensor dimension.
+// If not specified, defaults to 3
+func StringFormatSummarize(value int64) StringFormatAttr {
 	return func(m optionalAttr) {
-		m["value_shape"] = value
+		m["summarize"] = value
 	}
 }
 
-// Creates an empty hash table.
+// Formats a string template using a list of tensors.
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a vector. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// Formats a string template using a list of tensors, pretty-printing tensor summaries.
 //
 // Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+//	inputs: The list of tensors to format into the placeholder string.
 //
-// Returns Handle to a table.
-func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
+// Returns = The resulting string scalar.
+func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableHashTableOfTensorsV2",
-
+		Type: "StringFormat",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// The gradient operator for the SparseSlice op.
+// Returns true if queue is closed.
 //
-// This op takes in the upstream gradient w.r.t. non-empty values of
-// the sliced `SparseTensor`, and outputs the gradients w.r.t.
-// the non-empty values of input `SparseTensor`.
+// This operation returns true if the queue is closed and false if the queue
+// is open.
 //
 // Arguments:
-//	backprop_val_grad: 1-D. The gradient with respect to
-// the non-empty values of the sliced `SparseTensor`.
-//	input_indices: 2-D.  The `indices` of the input `SparseTensor`.
-//	input_start: 1-D. tensor represents the start of the slice.
-//	output_indices: 2-D.  The `indices` of the sliced `SparseTensor`.
-//
-// Returns 1-D. The gradient with respect to the non-empty values of input `SparseTensor`.
-func SparseSliceGrad(scope *Scope, backprop_val_grad tf.Output, input_indices tf.Output, input_start tf.Output, output_indices tf.Output) (val_grad tf.Output) {
+//	handle: The handle to a queue.
+func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSliceGrad",
+		Type: "QueueIsClosedV2",
 		Input: []tf.Input{
-			backprop_val_grad, input_indices, input_start, output_indices,
+			handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that changes the batch size.
-//
-// Creates a dataset that changes the batch size of the dataset to current batch
-// size // num_workers.
-//
-// Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
-//	num_workers: A scalar representing the number of workers to distribute this batch across. As
-// a result of this transformation the current batch size would end up being
-// divided  by this parameter.
-//
-//
-func ExperimentalRebatchDataset(scope *Scope, input_dataset tf.Output, num_workers tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Computes inverse hyperbolic tangent of x element-wise.
+func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalRebatchDataset",
+		Type: "Atanh",
 		Input: []tf.Input{
-			input_dataset, num_workers,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient of the sigmoid of `x` wrt its input.
+// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
 //
-// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-// `dy` is the corresponding input gradient.
-func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// For an explanation see "Differentiation of the Cholesky algorithm" by
+// Iain Murray http://arxiv.org/abs/1602.07527.
+//
+// Arguments:
+//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//
+// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
+func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SigmoidGrad",
+		Type: "CholeskyGrad",
 		Input: []tf.Input{
-			y, dy,
+			l, grad,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert one or more images from HSV to RGB.
-//
-// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// Assigns a new value to a variable.
 //
-// See `rgb_to_hsv` for a description of the HSV encoding.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to return
+// this value or a subsequent newer value of the variable.
 //
 // Arguments:
-//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value to set the new tensor to use.
 //
-// Returns `images` converted to RGB.
-func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns the created operation.
+func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "HSVToRGB",
+		Type: "AssignVariableOp",
 		Input: []tf.Input{
-			images,
+			resource, value,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset by applying optimizations to `input_dataset`.
-//
-// Creates a dataset by applying optimizations to `input_dataset`.
+// Returns a tensor of ones with the same shape and type as x.
 //
 // Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
-//	optimizations: A `tf.string` vector `tf.Tensor` identifying optimizations to use.
-//
+//	x: a tensor of type T.
 //
-func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns a tensor of the same shape and type as x but filled with ones.
+func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "OptimizeDataset",
+		Type: "OnesLike",
 		Input: []tf.Input{
-			input_dataset, optimizations,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the element-wise min of two SparseTensors.
+// The gradient of SparseFillEmptyRows.
 //
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
+// shaped `[N_full]`, where `N_full >= N` and copies data into either
+// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
+// `d_default_value` is a scalar.
+//
+//   d_values[j] = grad_values[reverse_index_map[j]]
+//   d_default_value = sum_{k : 0 .. N_full - 1} (
+//      grad_values[k] * 1{k not in reverse_index_map})
 //
 // Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
+//	grad_values: 1-D.  The gradients from backprop.
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
+func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMinimum",
+		Type: "SparseFillEmptyRowsGrad",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			reverse_index_map, grad_values,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// ResourceApplyAdamWithAmsgradAttr is an optional argument to ResourceApplyAdamWithAmsgrad.
-type ResourceApplyAdamWithAmsgradAttr func(optionalAttr)
+// Creates a dataset that zips together `input_datasets`.
+func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ZipDataset",
+		Input: []tf.Input{
+			tf.OutputList(input_datasets),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// ResourceApplyAdamWithAmsgradUseLocking sets the optional use_locking attribute to value.
+// LoadTPUEmbeddingAdagradParametersAttr is an optional argument to LoadTPUEmbeddingAdagradParameters.
+type LoadTPUEmbeddingAdagradParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingAdagradParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdamWithAmsgradUseLocking(value bool) ResourceApplyAdamWithAmsgradAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdagradParametersTableId(value int64) LoadTPUEmbeddingAdagradParametersAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["table_id"] = value
 	}
 }
 
-// Update '*var' according to the Adam algorithm.
+// LoadTPUEmbeddingAdagradParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingAdagradParametersTableName(value string) LoadTPUEmbeddingAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Adagrad embedding parameters.
 //
-// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
-// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-// $$vhat_t := max{vhat_{t-1}, v_t}$$
-// $$variable := variable - lr_t * m_t / (\sqrt{vhat_t} + \epsilon)$$
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	vhat: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	parameters: Value of parameters used in the Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the Adagrad optimization algorithm.
+//
+//
 //
 // Returns the created operation.
-func ResourceApplyAdamWithAmsgrad(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, vhat tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamWithAmsgradAttr) (o *tf.Operation) {
+func LoadTPUEmbeddingAdagradParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdagradParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdamWithAmsgrad",
+		Type: "LoadTPUEmbeddingAdagradParameters",
 		Input: []tf.Input{
-			var_, m, v, vhat, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+			parameters, accumulators,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
-type MapUnstageNoKeyAttr func(optionalAttr)
-
-// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Strip leading and trailing whitespaces from the Tensor.
 //
-// REQUIRES: value >= 0
-func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// Arguments:
+//	input: A string `Tensor` of any shape.
+//
+// Returns A string `Tensor` of the same shape as the input.
+func StringStrip(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StringStrip",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// REQUIRES: value >= 0
-func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// The hash function is deterministic on the content of the string within the
+// process. The hash function is a keyed hash function, where attribute `key`
+// defines the key of the hash function. `key` is an array of 2 elements.
+//
+// A strong hash is important when inputs may be malicious, e.g. URLs with
+// additional components. Adversaries could try to make their inputs hash to the
+// same bucket for a denial-of-service attack or to skew the results. A strong
+// hash prevents this by making it difficult, if not infeasible, to compute inputs
+// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
+// time than `tf.string_to_hash_bucket_fast`.
+//
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//	key: The key for the keyed hash function passed as a list of two uint64
+// elements.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
+	opspec := tf.OpSpec{
+		Type: "StringToHashBucketStrong",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
+// StringLengthAttr is an optional argument to StringLength.
+type StringLengthAttr func(optionalAttr)
+
+// StringLengthUnit sets the optional unit attribute to value.
+//
+// value: The unit that is counted to compute string length.  One of: `"BYTE"` (for
+// the number of bytes in each string) or `"UTF8_CHAR"` (for the number of UTF-8
+// encoded Unicode code points in each string).  Results are undefined
+// if `unit=UTF8_CHAR` and the `input` strings do not contain structurally
+// valid UTF-8.
+// If not specified, defaults to "BYTE"
+func StringLengthUnit(value string) StringLengthAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["unit"] = value
 	}
 }
 
-// Op removes and returns a random (key, value)
+// String lengths of `input`.
 //
-// from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// Computes the length of each string given in the input tensor.
+//
+// Arguments:
+//	input: The string for which to compute the length.
+//
+// Returns Integer tensor that has the same shape as `input`. The output contains the
+// element-wise string lengths of `input`.
+func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapUnstageNoKey",
+		Type: "StringLength",
 		Input: []tf.Input{
-			indices,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Performs gradient updates of embedding tables.
+//
+// Arguments:
+//	inputs: A TensorList of gradients with which to update embedding tables.
+// This argument has the same length and shapes as the return value of
+// RecvTPUEmbeddingActivations, but contains gradients of the model's loss
+// with respect to the embedding activations. The embedding tables are updated
+// from these gradients via the optimizer specified in the TPU embedding
+// configuration given to tpu.initialize_system.
+//	learning_rates: A TensorList of float32 scalars, one for each dynamic learning
+// rate tag: see the comments in
+// //third_party/tensorflow/core/protobuf/tpu/optimization_parameters.proto.
+// Multiple tables can share the same dynamic learning rate tag as specified
+// in the configuration. If the learning rates for all tables are constant,
+// this list should be empty.
+//	config: Serialized TPUEmbeddingConfiguration proto.
+//
+// Returns the created operation.
+func SendTPUEmbeddingGradients(scope *Scope, inputs []tf.Output, learning_rates []tf.Output, config string) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstageNoKey", err)
-		return
+	attrs := map[string]interface{}{"config": config}
+	opspec := tf.OpSpec{
+		Type: "SendTPUEmbeddingGradients",
+		Input: []tf.Input{
+			tf.OutputList(inputs), tf.OutputList(learning_rates),
+		},
+		Attrs: attrs,
 	}
-	return key, values
+	return scope.AddOperation(opspec)
 }
 
-// HashTableV2Attr is an optional argument to HashTableV2.
-type HashTableV2Attr func(optionalAttr)
-
-// HashTableV2Container sets the optional container attribute to value.
+// Computes numerical negative value element-wise.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func HashTableV2Container(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// I.e., \\(y = -x\\).
+func Neg(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Neg",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// HashTableV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func HashTableV2SharedName(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Receives a tensor value broadcast from another device.
+func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
+	attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "CollectiveBcastRecv",
 
-// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-//
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a non-initialized hash table.
+// Decode web-safe base64-encoded strings.
 //
-// This op creates a hash table, specifying the type of its keys and values.
-// Before using the table you will have to initialize it.  After initialization the
-// table will be immutable.
+// Input may or may not have padding at the end. See EncodeBase64 for padding.
+// Web-safe means that input must use - and _ instead of + and /.
 //
 // Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+//	input: Base64 strings to decode.
 //
-// Returns Handle to a table.
-func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
+// Returns Decoded strings.
+func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "HashTableV2",
-
-		Attrs: attrs,
+		Type: "DecodeBase64",
+		Input: []tf.Input{
+			input,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
-type TakeManySparseFromTensorsMapAttr func(optionalAttr)
+// SubstrAttr is an optional argument to Substr.
+type SubstrAttr func(optionalAttr)
 
-// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
+// SubstrUnit sets the optional unit attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` read by this op.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
+// value: The unit that is used to create the substring.  One of: `"BYTE"` (for
+// defining position and length by bytes) or `"UTF8_CHAR"` (for the UTF-8
+// encoded Unicode code points).  The default is `"BYTE"`. Results are undefined if
+// `unit=UTF8_CHAR` and the `input` strings do not contain structurally valid
+// UTF-8.
+// If not specified, defaults to "BYTE"
+func SubstrUnit(value string) SubstrAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["unit"] = value
 	}
 }
 
-// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
+// Return substrings from `Tensor` of strings.
 //
-// value: The shared name for the `SparseTensorsMap` read by this op.
-// It should not be blank; rather the `shared_name` or unique Operation name
-// of the Op that created the original `SparseTensorsMap` should be used.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
+// For each string in the input `Tensor`, creates a substring starting at index
+// `pos` with a total length of `len`.
 //
-// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
-// `N` is the minibatch size and the rows correspond to the output handles of
-// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
-// original `SparseTensor` objects that went into the given input ops must all
-// match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension on the left).
+// If `len` defines a substring that would extend beyond the length of the input
+// string, then as many characters as possible are used.
 //
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
+// A negative `pos` indicates distance within the string backwards from the end.
 //
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
+// If `pos` specifies an index which is out of range for any of the input strings,
+// then an `InvalidArgumentError` is thrown.
 //
-// For example, if the handles represent an input, which is a `[2, 3]` matrix
-// representing two original `SparseTensor` objects:
+// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
+// Op creation.
+//
+// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
+// broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+//
+// ---
+//
+// Examples
+//
+// Using scalar `pos` and `len`:
+//
+// ```python
+// input = [b'Hello', b'World']
+// position = 1
+// length = 3
 //
+// output = [b'ell', b'orl']
 // ```
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
+//
+// Using `pos` and `len` with same shape as `input`:
+//
+// ```python
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen']]
+// position = [[1, 2, 3],
+//             [1, 2, 3],
+//             [1, 2, 3]]
+// length =   [[2, 3, 4],
+//             [4, 3, 2],
+//             [5, 5, 5]]
+//
+// output = [[b'en', b'eve', b'lve'],
+//           [b'hirt', b'urt', b'te'],
+//           [b'ixtee', b'vente', b'hteen']]
 // ```
 //
-// and
+// Broadcasting `pos` and `len` onto `input`:
 //
 // ```
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen'],
+//          [b'nineteen', b'twenty', b'twentyone']]
+// position = [1, 2, 3]
+// length =   [1, 2, 3]
+//
+// output = [[b'e', b'ev', b'lve'],
+//           [b'h', b'ur', b'tee'],
+//           [b'i', b've', b'hte'],
+//           [b'i', b'en', b'nty']]
 // ```
 //
-// then the final `SparseTensor` will be:
+// Broadcasting `input` onto `pos` and `len`:
 //
 // ```
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// input = b'thirteen'
+// position = [1, 5, 7]
+// length =   [3, 2, 1]
+//
+// output = [b'hir', b'ee', b'n']
 // ```
 //
 // Arguments:
-//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
-// Shape: `[N]`.
-//	dtype: The `dtype` of the `SparseTensor` objects stored in the
-// `SparseTensorsMap`.
+//	input: Tensor of strings
+//	pos: Scalar defining the position of first character in each substring
+//	len: Scalar defining the number of characters to include in each substring
 //
-// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
-func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+// Returns Tensor of substrings
+func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TakeManySparseFromTensorsMap",
+		Type: "Substr",
 		Input: []tf.Input{
-			sparse_handles,
+			input, pos, len,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyKerasMomentumAttr is an optional argument to ResourceSparseApplyKerasMomentum.
-type ResourceSparseApplyKerasMomentumAttr func(optionalAttr)
+// Exits the current frame to its parent frame.
+//
+// Exit makes its input `data` available to the parent frame.
+//
+// Arguments:
+//	data: The tensor to be made available to the parent frame.
+//
+// Returns The same tensor as `data`.
+func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Exit",
+		Input: []tf.Input{
+			data,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// ResourceSparseApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
+// RetrieveTPUEmbeddingProximalAdagradParametersAttr is an optional argument to RetrieveTPUEmbeddingProximalAdagradParameters.
+type RetrieveTPUEmbeddingProximalAdagradParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingProximalAdagradParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyKerasMomentumUseLocking(value bool) ResourceSparseApplyKerasMomentumAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingProximalAdagradParametersTableId(value int64) RetrieveTPUEmbeddingProximalAdagradParametersAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["table_id"] = value
 	}
 }
 
-// ResourceSparseApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var + momentum * accum, so in the end, the var you get is actually
-// var + momentum * accum.
-// If not specified, defaults to false
-func ResourceSparseApplyKerasMomentumUseNesterov(value bool) ResourceSparseApplyKerasMomentumAttr {
+// RetrieveTPUEmbeddingProximalAdagradParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingProximalAdagradParametersTableName(value string) RetrieveTPUEmbeddingProximalAdagradParametersAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["table_name"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
-//
-// Set use_nesterov = True if you want to use Nesterov momentum.
-//
-// That is for rows we have grad for, we update var and accum as follows:
+// Retrieve proximal Adagrad embedding parameters.
 //
-// accum = accum * momentum - lr * grad
-// var += accum
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns the created operation.
-func ResourceSparseApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyKerasMomentumAttr) (o *tf.Operation) {
+// Returns Parameter parameters updated by the proximal Adagrad optimization algorithm.Parameter accumulators updated by the proximal Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingProximalAdagradParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingProximalAdagradParametersAttr) (parameters tf.Output, accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyKerasMomentum",
-		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
-		},
+		Type: "RetrieveTPUEmbeddingProximalAdagradParameters",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Assigns a new value to a variable.
+// Produce a string tensor that encodes the state of a Reader.
 //
-// Any ReadVariableOp with a control dependency on this op is guaranteed to return
-// this value or a subsequent newer value of the variable.
+// Not all Readers support being serialized, so this can produce an
+// Unimplemented error.
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value to set the new tensor to use.
-//
-// Returns the created operation.
-func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+//	reader_handle: Handle to a Reader.
+func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AssignVariableOp",
+		Type: "ReaderSerializeStateV2",
 		Input: []tf.Input{
-			resource, value,
+			reader_handle,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Strip leading and trailing whitespaces from the Tensor.
-//
-// Arguments:
-//	input: A string `Tensor` of any shape.
+// Returns the number of tensors in the input tensor list.
 //
-// Returns A string `Tensor` of the same shape as the input.
-func StringStrip(scope *Scope, input tf.Output) (output tf.Output) {
+// input_handle: the input list
+// length: the number of tensors in the list
+func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "StringStrip",
+		Type: "TensorListLength",
 		Input: []tf.Input{
-			input,
+			input_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a tensor of ones with the same shape and type as x.
+// Creates a dataset with a range of values. Corresponds to python's xrange.
 //
 // Arguments:
-//	x: a tensor of type T.
+//	start: corresponds to start in python's xrange().
+//	stop: corresponds to stop in python's xrange().
+//	step: corresponds to step in python's xrange().
 //
-// Returns a tensor of the same shape and type as x but filled with ones.
-func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+//
+func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "OnesLike",
+		Type: "RangeDataset",
 		Input: []tf.Input{
-			x,
+			start, stop, step,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// The gradient of SparseFillEmptyRows.
-//
-// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
-// shaped `[N_full]`, where `N_full >= N` and copies data into either
-// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
-// `d_default_value` is a scalar.
-//
-//   d_values[j] = grad_values[reverse_index_map[j]]
-//   d_default_value = sum_{k : 0 .. N_full - 1} (
-//      grad_values[k] * 1{k not in reverse_index_map})
-//
-// Arguments:
-//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
-//	grad_values: 1-D.  The gradients from backprop.
-//
-// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
-func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
+// Computes inverse hyperbolic sine of x element-wise.
+func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRowsGrad",
+		Type: "Asinh",
 		Input: []tf.Input{
-			reverse_index_map, grad_values,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
+// UnicodeTranscodeAttr is an optional argument to UnicodeTranscode.
+type UnicodeTranscodeAttr func(optionalAttr)
+
+// UnicodeTranscodeErrors sets the optional errors attribute to value.
 //
-// if < 0, `scale * features` otherwise.
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeTranscodeErrors(value string) UnicodeTranscodeAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeTranscodeReplacementChar sets the optional replacement_char attribute to value.
 //
-// To be used together with
-// `initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
-// For correct dropout, use `tf.contrib.nn.alpha_dropout`.
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
 //
-// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Selu",
-		Input: []tf.Input{
-			features,
-		},
+// Note that for UTF-8, passing a replacement character expressible in 1 byte, such
+// as ' ', will preserve string alignment to the source since invalid bytes will be
+// replaced with a 1-byte replacement. For UTF-16-BE and UTF-16-LE, any 1 or 2 byte
+// replacement character will preserve byte alignment to the source.
+// If not specified, defaults to 65533
+func UnicodeTranscodeReplacementChar(value int64) UnicodeTranscodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// SetSizeAttr is an optional argument to SetSize.
-type SetSizeAttr func(optionalAttr)
-
-// SetSizeValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SetSizeValidateIndices(value bool) SetSizeAttr {
+// UnicodeTranscodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeTranscodeReplaceControlCharacters(value bool) UnicodeTranscodeAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["replace_control_characters"] = value
 	}
 }
 
-// Number of unique elements along last dimension of input `set`.
+// Transcode the input text from a source encoding to a destination encoding.
 //
-// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
-// and `set_shape`. The last dimension contains values in a set, duplicates are
-// allowed but ignored.
+// The input is a string tensor of any shape. The output is a string tensor of
+// the same shape containing the transcoded strings. Output strings are always
+// valid unicode. If the input contains invalid encoding positions, the
+// `errors` attribute sets the policy for how to deal with them. If the default
+// error-handling policy is used, invalid formatting will be substituted in the
+// output by the `replacement_char`. If the errors policy is to `ignore`, any
+// invalid encoding positions in the input are skipped and not included in the
+// output. If it set to `strict` then any invalid formatting will result in an
+// InvalidArgument error.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set`
-// indices.
+// This operation can be used with `output_encoding = input_encoding` to enforce
+// correct formatting for inputs even if they are already in the desired encoding.
+//
+// If the input is prefixed by a Byte Order Mark needed to determine encoding
+// (e.g. if the encoding is UTF-16 and the BOM indicates big-endian), then that
+// BOM will be consumed and not emitted into the output. If the input encoding
+// is marked with an explicit endianness (e.g. UTF-16-BE), then the BOM is
+// interpreted as a non-breaking-space and is preserved in the output (including
+// always for UTF-8).
+//
+// The end result is that if the input is marked as an explicit endianness the
+// transcoding is faithful to all codepoints in the source. If it is not marked
+// with an explicit endianness, the BOM is not considered part of the string itself
+// but as metadata, and so is not preserved in the output.
 //
 // Arguments:
-//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
-//	set_values: 1D `Tensor`, values of a `SparseTensor`.
-//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+//	input: The text to be processed. Can have any shape.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//	output_encoding: The unicode encoding to use in the output. Must be one of
+// `"UTF-8", "UTF-16-BE", "UTF-32-BE"`. Multi-byte encodings will be big-endian.
 //
-// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
-// `n-1` dimensions as `set`. Each value is the number of unique elements in
-// the corresponding `[0...n-1]` dimension of `set`.
-func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
+// Returns A string tensor containing unicode text encoded using `output_encoding`.
+func UnicodeTranscode(scope *Scope, input tf.Output, input_encoding string, output_encoding string, optional ...UnicodeTranscodeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"input_encoding": input_encoding, "output_encoding": output_encoding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SetSize",
+		Type: "UnicodeTranscode",
 		Input: []tf.Input{
-			set_indices, set_values, set_shape,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -23531,220 +26695,235 @@ func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shap
 	return op.Output(0)
 }
 
-// Adds sparse `updates` to an existing tensor according to `indices`.
-//
-// This operation creates a new tensor by adding sparse `updates` to the passed
-// in `tensor`.
-// This operation is very similar to `tf.scatter_nd_add`, except that the updates
-// are added onto an existing tensor (as opposed to a variable). If the memory
-// for the existing tensor cannot be re-used, a copy is made and updated.
-//
-// `indices` is an integer tensor containing indices into a new tensor of shape
-// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
-//
-//     indices.shape[-1] <= shape.rank
-//
-// The last dimension of `indices` corresponds to indices into elements
-// (if `indices.shape[-1] = shape.rank`) or slices
-// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
-// `shape`.  `updates` is a tensor with shape
-//
-//     indices.shape[:-1] + shape[indices.shape[-1]:]
-//
-// The simplest form of tensor_scatter_add is to add individual elements to a
-// tensor by index. For example, say we want to add 4 elements in a rank-1
-// tensor with 8 elements.
-//
-// In Python, this scatter add operation would look like this:
-//
-// ```python
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     tensor = tf.ones([8], dtype=tf.int32)
-//     updated = tf.tensor_scatter_add(tensor, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
-// ```
-//
-// The resulting tensor would look like this:
-//
-//     [1, 12, 1, 11, 10, 1, 1, 13]
+// UnicodeDecodeAttr is an optional argument to UnicodeDecode.
+type UnicodeDecodeAttr func(optionalAttr)
+
+// UnicodeDecodeErrors sets the optional errors attribute to value.
 //
-// We can also, insert entire slices of a higher rank tensor all at once. For
-// example, if we wanted to insert two slices in the first dimension of a
-// rank-3 tensor with two matrices of new values.
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeDecodeErrors(value string) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeDecodeReplacementChar sets the optional replacement_char attribute to value.
 //
-// In Python, this scatter add operation would look like this:
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
+// If not specified, defaults to 65533
+func UnicodeDecodeReplacementChar(value int64) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// UnicodeDecodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
 //
-// ```python
-//     indices = tf.constant([[0], [2]])
-//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]],
-//                            [[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
-//     tensor = tf.ones([4, 4, 4])
-//     updated = tf.tensor_scatter_add(tensor, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
-// ```
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeDecodeReplaceControlCharacters(value bool) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// Decodes each string in `input` into a sequence of Unicode code points.
 //
-// The resulting tensor would look like this:
+// The character codepoints for all strings are returned using a single vector
+// `char_values`, with strings expanded to characters in row-major order.
 //
-//     [[[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
-//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
-//      [[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
-//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+// The `row_splits` tensor indicates where the codepoints for
+// each input string begin and end within the `char_values` tensor.
+// In particular, the values for the `i`th
+// string (in row-major order) are stored in the slice
+// `[row_splits[i]:row_splits[i+1]]`. Thus:
 //
-// Note that on CPU, if an out of bound index is found, an error is returned.
-// On GPU, if an out of bound index is found, the index is ignored.
+// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+//   string (in row-major order).
 //
 // Arguments:
-//	tensor: Tensor to copy/update.
-//	indices: Index tensor.
-//	updates: Updates to scatter into output.
+//	input: The text to be decoded. Can have any shape. Note that the output is flattened
+// to a vector of char values.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
 //
-// Returns A new tensor copied from tensor and updates added according to the indices.
-func TensorScatterAdd(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.
+func UnicodeDecode(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeAttr) (row_splits tf.Output, char_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"input_encoding": input_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorScatterAdd",
+		Type: "UnicodeDecode",
 		Input: []tf.Input{
-			tensor, indices, updates,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes the sign and the log of the absolute value of the determinant of
+// Adds up a SparseTensor and a dense Tensor, using these special rules:
 //
-// one or more square matrices.
+// (1) Broadcasts the dense side to have the same shape as the sparse side, if
+//     eligible;
+// (2) Then, only the dense values pointed to by the indices of the SparseTensor
+//     participate in the cwise addition.
 //
-// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
-// form square matrices. The outputs are two tensors containing the signs and
-// absolute values of the log determinants for all N input submatrices
-// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
-// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
-// is the LU decomposition of the input and P is the corresponding
-// permutation matrix.
+// By these rules, the result is a logical SparseTensor with exactly the same
+// indices and shape, but possibly with different non-zero values.  The output of
+// this Op is the resultant non-zero values.
 //
 // Arguments:
-//	input: Shape is `[N, M, M]`.
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
-// of the N input matrices.  Shape is `[N]`.
-func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogMatrixDeterminant",
+		Type: "SparseDenseCwiseAdd",
 		Input: []tf.Input{
-			input,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Copy a tensor setting everything outside a central band in each innermost matrix
-//
-// to zero.
-//
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-//
-// The indicator function
-//
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
-//
-// For example:
-//
-// ```
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
+
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the RMSProp algorithm.
 //
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
 //
-// Useful special cases:
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
 //
-// ```
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
+		Type: "ResourceApplyRMSProp",
 		Input: []tf.Input{
-			input, num_lower, num_upper,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Subtracts a value from the current value of a variable.
+// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
+type StatelessTruncatedNormalAttr func(optionalAttr)
+
+// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
 //
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the decremented value or a subsequent newer one.
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns the created operation.
-func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+// Returns Random values with specified shape.
+func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AssignSubVariableOp",
+		Type: "StatelessTruncatedNormal",
 		Input: []tf.Input{
-			resource, value,
+			shape, seed,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RestoreAttr is an optional argument to Restore.
-type RestoreAttr func(optionalAttr)
+// RestoreSliceAttr is an optional argument to RestoreSlice.
+type RestoreSliceAttr func(optionalAttr)
 
-// RestorePreferredShard sets the optional preferred_shard attribute to value.
+// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
 //
 // value: Index of file to open first if multiple files match
-// `file_pattern`.
+// `file_pattern`. See the documentation for `Restore`.
 // If not specified, defaults to -1
-func RestorePreferredShard(value int64) RestoreAttr {
+func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
 	return func(m optionalAttr) {
 		m["preferred_shard"] = value
 	}
@@ -23752,32 +26931,24 @@ func RestorePreferredShard(value int64) RestoreAttr {
 
 // Restores a tensor from checkpoint files.
 //
-// Reads a tensor stored in one or several files. If there are several files (for
-// instance because a tensor was saved as slices), `file_pattern` may contain
-// wildcard symbols (`*` and `?`) in the filename portion only, not in the
-// directory portion.
-//
-// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
-// in which file the requested tensor is likely to be found. This op will first
-// open the file at index `preferred_shard` in the list of matching files and try
-// to restore tensors from that file.  Only if some tensors or tensor slices are
-// not found in that first file, then the Op opens all the files. Setting
-// `preferred_shard` to match the value passed as the `shard` input
-// of a matching `Save` Op may speed up Restore.  This attribute only affects
-// performance, not correctness.  The default value -1 means files are processed in
-// order.
+// This is like `Restore` except that restored tensor can be listed as filling
+// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
+// larger tensor and the slice that the restored tensor covers.
 //
-// See also `RestoreSlice`.
+// The `shape_and_slice` input has the same format as the
+// elements of the `shapes_and_slices` input of the `SaveSlices` op.
 //
 // Arguments:
 //	file_pattern: Must have a single element. The pattern of the files from
 // which we read the tensor.
 //	tensor_name: Must have a single element. The name of the tensor to be
 // restored.
+//	shape_and_slice: Scalar. The shapes and slice specifications to use when
+// restoring a tensors.
 //	dt: The type of the tensor to be restored.
 //
 // Returns The restored tensor.
-func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
+func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23786,9 +26957,9 @@ func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Restore",
+		Type: "RestoreSlice",
 		Input: []tf.Input{
-			file_pattern, tensor_name,
+			file_pattern, tensor_name, shape_and_slice,
 		},
 		Attrs: attrs,
 	}
@@ -23796,123 +26967,136 @@ func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.
 	return op.Output(0)
 }
 
-// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
-type QuantizedResizeBilinearAttr func(optionalAttr)
-
-// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize quantized `images` to `size` using quantized bilinear interpolation.
-//
-// Input images and output images must be quantized types.
+// Returns the element-wise sum of a list of tensors.
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
+// wait for all of its inputs to be ready before beginning to sum. This can
+// save memory if inputs are ready at different times, since minimum temporary
+// storage is proportional to the output size rather than the inputs size.
 //
+// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
 //
+// Returns a `Tensor` of same shape and type as the elements of `inputs`.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
+// Arguments:
+//	inputs: A list of `Tensor` objects, each with same shape and type.
+//	shape: Shape of elements of `inputs`.
+func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "QuantizedResizeBilinear",
+		Type: "AccumulateNV2",
 		Input: []tf.Input{
-			images, size, min, max,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// Arguments:
+// actual distribution of the values to maximize the usage of the lower bit depth
+// and adjusting the output min and max ranges accordingly.
 //
-//	num_threads: Identifies the number of threads to use for the private threadpool.
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// This operator tries to squeeze as much precision as possible into an output with
+// a lower bit depth by calculating the actual min and max values found in the
+// data. For example, maybe that quint16 input has no values lower than 16,384 and
+// none higher than 49,152. That means only half the range is actually needed, all
+// the float interpretations are between -0.5f and 0.5f, so if we want to compress
+// the data into a quint8 output, we can use that range rather than the theoretical
+// -1.0f to 1.0f that is suggested by the input min and max.
 //
+// In practice, this is most useful for taking output from operations like
+// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
+// may have large potential output ranges, but in practice have a distribution of
+// input values that only uses a small fraction of the possible range. By feeding
+// that output into this operator, we can reduce it from 32 bits down to 8 with
+// minimal loss of accuracy.
 //
-func ExperimentalPrivateThreadPoolDataset(scope *Scope, input_dataset tf.Output, num_threads tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalPrivateThreadPoolDataset",
+		Type: "QuantizeDownAndShrinkRange",
 		Input: []tf.Input{
-			input_dataset, num_threads,
+			input, input_min, input_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ExperimentalParseExampleDatasetAttr is an optional argument to ExperimentalParseExampleDataset.
-type ExperimentalParseExampleDatasetAttr func(optionalAttr)
+// RandomGammaAttr is an optional argument to RandomGamma.
+type RandomGammaAttr func(optionalAttr)
 
-// ExperimentalParseExampleDatasetSloppy sets the optional sloppy attribute to value.
-// If not specified, defaults to false
-func ExperimentalParseExampleDatasetSloppy(value bool) ExperimentalParseExampleDatasetAttr {
+// RandomGammaSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomGammaSeed(value int64) RandomGammaAttr {
 	return func(m optionalAttr) {
-		m["sloppy"] = value
+		m["seed"] = value
 	}
 }
 
-// Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features.
+// RandomGammaSeed2 sets the optional seed2 attribute to value.
 //
-// Arguments:
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomGammaSeed2(value int64) RandomGammaAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from the Gamma distribution(s) described by alpha.
 //
+// This op uses the algorithm by Marsaglia et al. to acquire samples via
+// transformation-rejection from pairs of uniform and normal random variables.
+// See http://dl.acm.org/citation.cfm?id=358414
 //
-//	dense_defaults: A dict mapping string keys to `Tensor`s.
-// The keys of the dict must match the dense_keys of the feature.
-//	sparse_keys: A list of string keys in the examples features.
-// The results for these keys will be returned as `SparseTensor` objects.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples features associated with dense values.
-//	sparse_types: A list of `DTypes` of the same length as `sparse_keys`.
-// Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
-// and `tf.string` (`BytesList`) are supported.
-//	dense_shapes: List of tuples with the same length as `dense_keys`.
-// The shape of the data for each dense feature referenced by `dense_keys`.
-// Required for any input tensors identified by `dense_keys`.  Must be
-// either fully defined, or may contain an unknown first dimension.
-// An unknown first dimension means the feature is treated as having
-// a variable number of blocks, and the output shape along this dimension
-// is considered unknown at graph build time.  Padding is applied for
-// minibatch elements smaller than the maximum number of blocks for the
-// given feature along this dimension.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
-func ExperimentalParseExampleDataset(scope *Scope, input_dataset tf.Output, num_parallel_calls tf.Output, dense_defaults []tf.Output, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ExperimentalParseExampleDatasetAttr) (handle tf.Output) {
+// Arguments:
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in alpha.
+//	alpha: A tensor in which each scalar is a "shape" parameter describing the
+// associated gamma distribution.
+//
+// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes, "output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalParseExampleDataset",
+		Type: "RandomGamma",
 		Input: []tf.Input{
-			input_dataset, num_parallel_calls, tf.OutputList(dense_defaults),
+			shape, alpha,
 		},
 		Attrs: attrs,
 	}
@@ -23920,467 +27104,663 @@ func ExperimentalParseExampleDataset(scope *Scope, input_dataset tf.Output, num_
 	return op.Output(0)
 }
 
-// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
-type SdcaOptimizerAttr func(optionalAttr)
+// ResourceScatterNdSubAttr is an optional argument to ResourceScatterNdSub.
+type ResourceScatterNdSubAttr func(optionalAttr)
 
-// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
+// ResourceScatterNdSubUseLocking sets the optional use_locking attribute to value.
 //
-// value: Whether to use Adaptive SDCA for the inner loop.
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
 // If not specified, defaults to true
-func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+func ResourceScatterNdSubUseLocking(value bool) ResourceScatterNdSubAttr {
 	return func(m optionalAttr) {
-		m["adaptative"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
+// Applies sparse subtraction to individual values or slices in a Variable.
 //
-// linear models with L1 + L2 regularization. As global optimization objective is
-// strongly-convex, the optimizer optimizes the dual objective at each step. The
-// optimizer applies each update one example at a time. Examples are sampled
-// uniformly, and the optimizer is learning rate free and enjoys linear convergence
-// rate.
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
-// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
-// Shai Shalev-Shwartz, Tong Zhang. 2012
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 //
-// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
 //
-// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
-// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
-// Peter Richtarik, Martin Takac. 2015
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 //
-// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
-// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+// ```
+//
+// For example, say we want to subtract 4 scattered elements from a rank-1 tensor
+// with 8 elements. In Python, that subtraction would look like this:
+//
+// ```python
+// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+// indices = tf.constant([[4], [3], [1], [7]])
+// updates = tf.constant([9, 10, 11, 12])
+// sub = tf.scatter_nd_sub(ref, indices, updates)
+// with tf.Session() as sess:
+//   print sess.run(sub)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, -9, 3, -6, -4, 6, 7, -4]
+//
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
 //
 // Arguments:
-//	sparse_example_indices: a list of vectors which contain example indices.
-//	sparse_feature_indices: a list of vectors which contain feature indices.
-//	sparse_feature_values: a list of vectors which contains feature value
-// associated with each feature group.
-//	dense_features: a list of matrices which contains the dense feature values.
-//	example_weights: a vector which contains the weight associated with each
-// example.
-//	example_labels: a vector which contains the label/target associated with each
-// example.
-//	sparse_indices: a list of vectors where each value is the indices which has
-// corresponding weights in sparse_weights. This field maybe omitted for the
-// dense approach.
-//	sparse_weights: a list of vectors where each value is the weight associated with
-// a sparse feature group.
-//	dense_weights: a list of vectors where the values are the weights associated
-// with a dense feature group.
-//	example_state_data: a list of vectors containing the example state data.
-//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-// squared and hinge losses.
-//	l1: Symmetric l1 regularization strength.
-//	l2: Symmetric l2 regularization strength.
-//	num_loss_partitions: Number of partitions of the global loss function.
-//	num_inner_iterations: Number of iterations per mini-batch.
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of
+// values to add to ref.
 //
-// Returns a list of vectors containing the updated example state
-// data.a list of vectors where each value is the delta
-// weights associated with a sparse feature group.a list of vectors where the values are the delta
-// weights associated with a dense feature group.
-func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+// Returns the created operation.
+func ResourceScatterNdSub(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdSubAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaOptimizer",
+		Type: "ResourceScatterNdSub",
 		Input: []tf.Input{
-			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+			ref, indices, updates,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
+	return scope.AddOperation(opspec)
+}
+
+// Outputs deterministic pseudorandom random integers from a uniform distribution.
+//
+// The generated values follow a uniform distribution in the range `[minval, maxval)`.
+//
+// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//	minval: Minimum value (inclusive, scalar).
+//	maxval: Maximum value (exclusive, scalar).
+//
+// Returns Random values with specified shape.
+func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	out_example_state_data = op.Output(idx)
-	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "StatelessRandomUniformInt",
+		Input: []tf.Input{
+			shape, seed, minval, maxval,
+		},
 	}
-	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
+
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
 	}
-	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// Concats all tensors in the list along the 0th dimension.
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
 //
-// Requires that all tensors have the same shape except the first dimension.
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2D convolution given quantized 4D input and filter tensors.
 //
-// input_handle: The input list.
-// element_shape: The shape of the uninitialized elements in the list. If the first
-//   dimension is not -1, it is assumed that all list elements have the same
-//   leading dim.
-// leading_dims: The list of leading dims of uninitialized list elements. Used if
-//   the leading dim of input_handle.element_shape or the element_shape input arg
-//   is not already set.
-// tensor: The concated result.
-// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
+//
+// Arguments:
+//
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedConv2D",
+		Input: []tf.Input{
+			input, filter, min_input, max_input, min_filter, max_filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
+
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Gather slices from the variable pointed to by `resource` according to `indices`.
+//
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-func TensorListConcatV2(scope *Scope, input_handle tf.Output, element_shape tf.Output, leading_dims tf.Output, element_dtype tf.DataType) (tensor tf.Output, lengths tf.Output) {
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
+//
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
+//
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorListConcatV2",
+		Type: "ResourceGather",
 		Input: []tf.Input{
-			input_handle, element_shape, leading_dims,
+			resource, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
-type MatrixTriangularSolveAttr func(optionalAttr)
+// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
+type StatelessMultinomialAttr func(optionalAttr)
 
-// MatrixTriangularSolveLower sets the optional lower attribute to value.
-//
-// value: Boolean indicating whether the innermost matrices in `matrix` are
-// lower or upper triangular.
-// If not specified, defaults to true
-func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
 	return func(m optionalAttr) {
-		m["lower"] = value
+		m["output_dtype"] = value
 	}
 }
 
-// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+// Draws samples from a multinomial distribution.
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-//          adjoint.
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	seed: 2 seeds (shape [2]).
 //
-// @compatibility(numpy)
-// Equivalent to scipy.linalg.solve_triangular
-// @end_compatibility
-// If not specified, defaults to false
-func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
-	return func(m optionalAttr) {
-		m["adjoint"] = value
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessMultinomial",
+		Input: []tf.Input{
+			logits, num_samples, seed,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Solves systems of linear equations with upper or lower triangular matrices by
+// Returns a batched matrix tensor with new batched diagonal values.
 //
-// backsubstitution.
+// Given `input` and `diagonal`, this operation returns a tensor with the
+// same shape and values as `input`, except for the main diagonal of the
+// innermost matrices.  These will be overwritten by the values in `diagonal`.
 //
-// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. If `lower` is `True` then the strictly upper triangular part
-// of each inner-most matrix is assumed to be zero and not accessed.
-// If `lower` is False then the strictly lower triangular part of each inner-most
-// matrix is assumed to be zero and not accessed.
-// `rhs` is a tensor of shape `[..., M, K]`.
+// The output is computed as follows:
 //
-// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-// `True` then the innermost matrices in `output` satisfy matrix equations
-// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `False` then the strictly then the  innermost matrices in
-// `output` satisfy matrix equations
-// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
+// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
+// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
+//
+//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
+//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	input: Rank `k+1`, where `k >= 1`.
+//	diagonal: Rank `k`, where `k >= 1`.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+// Returns Rank `k+1`, with `output.shape = input.shape`.
+func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MatrixTriangularSolve",
+		Type: "MatrixSetDiag",
 		Input: []tf.Input{
-			matrix, rhs,
+			input, diagonal,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Saves tensors in V2 checkpoint format.
+// Returns the element-wise max of two SparseTensors.
 //
-// By default, saves the named tensors in full.  If the caller wishes to save
-// specific slices of full tensors, "shape_and_slices" should be non-empty strings
-// and correspondingly well-formed.
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
 // Arguments:
-//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
-// write the tensors.
-//	tensor_names: shape {N}. The names of the tensors to be saved.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
-// Empty strings indicate that they are non-partitioned tensors.
-//	tensors: `N` tensors to save.
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-// Returns the created operation.
-func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SaveV2",
+		Type: "SparseSparseMaximum",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// UnicodeTranscodeAttr is an optional argument to UnicodeTranscode.
-type UnicodeTranscodeAttr func(optionalAttr)
-
-// UnicodeTranscodeErrors sets the optional errors attribute to value.
+// List of the given size with empty elements.
 //
-// value: Error handling policy when there is invalid formatting found in the input.
-// The value of 'strict' will cause the operation to produce a InvalidArgument
-// error on any invalid input formatting. A value of 'replace' (the default) will
-// cause the operation to replace any invalid formatting in the input with the
-// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
-// skip any invalid formatting in the input and produce no corresponding output
-// character.
-// If not specified, defaults to "replace"
-func UnicodeTranscodeErrors(value string) UnicodeTranscodeAttr {
-	return func(m optionalAttr) {
-		m["errors"] = value
+// element_shape: the shape of the future elements of the list
+// num_elements: the number of elements to reserve
+// handle: the output list
+// element_dtype: the desired type of elements in the list.
+func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListReserve",
+		Input: []tf.Input{
+			element_shape, num_elements,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// UnicodeTranscodeReplacementChar sets the optional replacement_char attribute to value.
-//
-// value: The replacement character codepoint to be used in place of any invalid
-// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
-// be used. The default value is the default unicode replacement character is
-// 0xFFFD or U+65533.)
+// LoadTPUEmbeddingMDLAdagradLightParametersAttr is an optional argument to LoadTPUEmbeddingMDLAdagradLightParameters.
+type LoadTPUEmbeddingMDLAdagradLightParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingMDLAdagradLightParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// Note that for UTF-8, passing a replacement character expressible in 1 byte, such
-// as ' ', will preserve string alignment to the source since invalid bytes will be
-// replaced with a 1-byte replacement. For UTF-16-BE and UTF-16-LE, any 1 or 2 byte
-// replacement character will preserve byte alignment to the source.
-// If not specified, defaults to 65533
-func UnicodeTranscodeReplacementChar(value int64) UnicodeTranscodeAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingMDLAdagradLightParametersTableId(value int64) LoadTPUEmbeddingMDLAdagradLightParametersAttr {
 	return func(m optionalAttr) {
-		m["replacement_char"] = value
+		m["table_id"] = value
 	}
 }
 
-// UnicodeTranscodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
-//
-// value: Whether to replace the C0 control characters (00-1F) with the
-// `replacement_char`. Default is false.
-// If not specified, defaults to false
-func UnicodeTranscodeReplaceControlCharacters(value bool) UnicodeTranscodeAttr {
+// LoadTPUEmbeddingMDLAdagradLightParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingMDLAdagradLightParametersTableName(value string) LoadTPUEmbeddingMDLAdagradLightParametersAttr {
 	return func(m optionalAttr) {
-		m["replace_control_characters"] = value
+		m["table_name"] = value
 	}
 }
 
-// Transcode the input text from a source encoding to a destination encoding.
-//
-// The input is a string tensor of any shape. The output is a string tensor of
-// the same shape containing the transcoded strings. Output strings are always
-// valid unicode. If the input contains invalid encoding positions, the
-// `errors` attribute sets the policy for how to deal with them. If the default
-// error-handling policy is used, invalid formatting will be substituted in the
-// output by the `replacement_char`. If the errors policy is to `ignore`, any
-// invalid encoding positions in the input are skipped and not included in the
-// output. If it set to `strict` then any invalid formatting will result in an
-// InvalidArgument error.
+// Load MDL Adagrad Light embedding parameters.
 //
-// This operation can be used with `output_encoding = input_encoding` to enforce
-// correct formatting for inputs even if they are already in the desired encoding.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
-// If the input is prefixed by a Byte Order Mark needed to determine encoding
-// (e.g. if the encoding is UTF-16 and the BOM indicates big-endian), then that
-// BOM will be consumed and not emitted into the output. If the input encoding
-// is marked with an explicit endianness (e.g. UTF-16-BE), then the BOM is
-// interpreted as a non-breaking-space and is preserved in the output (including
-// always for UTF-8).
+// Arguments:
+//	parameters: Value of parameters used in the MDL Adagrad Light optimization algorithm.
+//	accumulators: Value of accumulators used in the MDL Adagrad Light optimization algorithm.
+//	weights: Value of weights used in the MDL Adagrad Light optimization algorithm.
+//	benefits: Value of benefits used in the MDL Adagrad Light optimization algorithm.
 //
-// The end result is that if the input is marked as an explicit endianness the
-// transcoding is faithful to all codepoints in the source. If it is not marked
-// with an explicit endianness, the BOM is not considered part of the string itself
-// but as metadata, and so is not preserved in the output.
 //
-// Arguments:
-//	input: The text to be processed. Can have any shape.
-//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
-// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
-//	output_encoding: The unicode encoding to use in the output. Must be one of
-// `"UTF-8", "UTF-16-BE", "UTF-32-BE"`. Multi-byte encodings will be big-endian.
 //
-// Returns A string tensor containing unicode text encoded using `output_encoding`.
-func UnicodeTranscode(scope *Scope, input tf.Output, input_encoding string, output_encoding string, optional ...UnicodeTranscodeAttr) (output tf.Output) {
+// Returns the created operation.
+func LoadTPUEmbeddingMDLAdagradLightParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, weights tf.Output, benefits tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMDLAdagradLightParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"input_encoding": input_encoding, "output_encoding": output_encoding}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UnicodeTranscode",
+		Type: "LoadTPUEmbeddingMDLAdagradLightParameters",
 		Input: []tf.Input{
-			input,
+			parameters, accumulators, weights, benefits,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes inverse hyperbolic sine of x element-wise.
-func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the gradient for the inverse of `x` wrt its input.
+//
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Asinh",
+		Type: "InvGrad",
 		Input: []tf.Input{
-			x,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset with a range of values. Corresponds to python's xrange.
+// Reduces sparse updates into the variable referenced by `resource` using the `min` operation.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = min(ref[indices, ...], updates[...])
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
 //
 // Arguments:
-//	start: corresponds to start in python's xrange().
-//	stop: corresponds to stop in python's xrange().
-//	step: corresponds to step in python's xrange().
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
+// Returns the created operation.
+func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMin",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Elementwise computes the bitwise OR of `x` and `y`.
 //
-func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// The result will have those bits set, that are set in `x`, `y` or both. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RangeDataset",
+		Type: "BitwiseOr",
 		Input: []tf.Input{
-			start, stop, step,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Stops gradient computation.
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
+
+// MatrixSolveLsFast sets the optional fast attribute to value.
+// If not specified, defaults to true
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
+	return func(m optionalAttr) {
+		m["fast"] = value
+	}
+}
+
+// Solves one or more linear least-squares problems.
 //
-// When executed in a graph, this op outputs its input tensor as-is.
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
+// type as `matrix` and shape `[..., M, K]`.
+// The output is a tensor shape `[..., N, K]` where each output matrix solves
+// each of the equations
+// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
+// in the least squares sense.
 //
-// When building ops to compute gradients, this op prevents the contribution of
-// its inputs to be taken into account.  Normally, the gradient generator adds ops
-// to a graph to compute the derivatives of a specified 'loss' by recursively
-// finding out inputs that contributed to its computation.  If you insert this op
-// in the graph it inputs are masked from the gradient generator.  They are not
-// taken into account for computing gradients.
+// We use the following notation for (complex) matrix and right-hand sides
+// in the batch:
 //
-// This is useful any time you want to compute a value with TensorFlow but need
-// to pretend that the value was a constant. Some examples include:
+// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
+// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
+// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
 //
-// *  The *EM* algorithm where the *M-step* should not involve backpropagation
-//    through the output of the *E-step*.
-// *  Contrastive divergence training of Boltzmann machines where, when
-//    differentiating the energy function, the training must not backpropagate
-//    through the graph that generated the samples from the model.
-// *  Adversarial training, where no backprop should happen through the adversarial
-//    example generation process.
-func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
+// If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
+// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
+// when \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
+// sufficiently large.
+//
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
+//
+// Arguments:
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.lstsq
+// @end_compatibility
+//
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StopGradient",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			input,
+			matrix, rhs, l2_regularizer,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Eagerly executes a python function to compute func(input)->output. The
+// Interleave the values from the `data` tensors into a single tensor.
 //
-// semantics of the input, output, and attributes are the same as those for
-// PyFunc.
-func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
+// Builds a merged tensor such that
+//
+// ```python
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// ```
+//
+// For example, if each `indices[m]` is scalar or vector, we have
+//
+// ```python
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
+//
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
+// ```
+//
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
+//
+//     merged.shape = [max(indices)] + constant
+//
+// Values are merged in order, so if an index appears in both `indices[m][i]` and
+// `indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
+// merged result. If you do not need this guarantee, ParallelDynamicStitch might
+// perform better on some devices.
+//
+// For example:
+//
+// ```python
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
+// ```
+//
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
+//
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"token": token, "Tout": Tout}
 	opspec := tf.OpSpec{
-		Type: "EagerPyFunc",
+		Type: "DynamicStitch",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			tf.OutputList(indices), tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("EagerPyFunc", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
-//
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// Performs a padding as a preprocess during a convolution.
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
+// implementation where the spatial padding transformation stage is fused with the
+// im2col lookup, but in this case without the bilinear filtering required for
+// resizing. Fusing the padding prevents the need to write out the intermediate
+// results as whole tensors, reducing memory pressure, and we can get some latency
+// gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
+// order is used instead.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
 //
-// Returns Computed Precision at `k` as a `bool Tensor`.
-func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "InTopK",
+		Type: "FusedPadConv2D",
 		Input: []tf.Input{
-			predictions, targets,
+			input, paddings, filter,
 		},
 		Attrs: attrs,
 	}
@@ -24388,76 +27768,86 @@ func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (pr
 	return op.Output(0)
 }
 
-// Returns (x - y)(x - y) element-wise.
-//
-// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SquaredDifference",
-		Input: []tf.Input{
-			x, y,
-		},
+// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
+type Conv2DBackpropInputAttr func(optionalAttr)
+
+// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// RandomGammaAttr is an optional argument to RandomGamma.
-type RandomGammaAttr func(optionalAttr)
-
-// RandomGammaSeed sets the optional seed attribute to value.
+// Conv2DBackpropInputExplicitPaddings sets the optional explicit_paddings attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomGammaSeed(value int64) RandomGammaAttr {
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DBackpropInputExplicitPaddings(value []int64) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["explicit_paddings"] = value
 	}
 }
 
-// RandomGammaSeed2 sets the optional seed2 attribute to value.
+// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomGammaSeed2(value int64) RandomGammaAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Outputs random values from the Gamma distribution(s) described by alpha.
+// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
 //
-// This op uses the algorithm by Marsaglia et al. to acquire samples via
-// transformation-rejection from pairs of uniform and normal random variables.
-// See http://dl.acm.org/citation.cfm?id=358414
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the input.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in alpha.
-//	alpha: A tensor in which each scalar is a "shape" parameter describing the
-// associated gamma distribution.
+//	input_sizes: An integer vector representing the shape of `input`,
+// where `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A tensor with shape `shape + shape(alpha)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
+// w.r.t. the input of the convolution.
+func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomGamma",
+		Type: "Conv2DBackpropInput",
 		Input: []tf.Input{
-			shape, alpha,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -24465,206 +27855,228 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
-//
-// actual distribution of the values to maximize the usage of the lower bit depth
-// and adjusting the output min and max ranges accordingly.
-//
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-//
-// This operator tries to squeeze as much precision as possible into an output with
-// a lower bit depth by calculating the actual min and max values found in the
-// data. For example, maybe that quint16 input has no values lower than 16,384 and
-// none higher than 49,152. That means only half the range is actually needed, all
-// the float interpretations are between -0.5f and 0.5f, so if we want to compress
-// the data into a quint8 output, we can use that range rather than the theoretical
-// -1.0f to 1.0f that is suggested by the input min and max.
-//
-// In practice, this is most useful for taking output from operations like
-// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
-// may have large potential output ranges, but in practice have a distribution of
-// input values that only uses a small fraction of the possible range. By feeding
-// that output into this operator, we can reduce it from 32 bits down to 8 with
-// minimal loss of accuracy.
+// Creates a dataset that executes a SQL query and emits rows of the result set.
 //
 // Arguments:
+//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
+//	data_source_name: A connection string to connect to the database.
+//	query: A SQL query to execute.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
 //
-// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+func ExperimentalSqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "QuantizeDownAndShrinkRange",
+		Type: "ExperimentalSqlDataset",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			driver_name, data_source_name, query,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
-//
-// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-// y + truncate_mod(x, y) = x`.
+// LoadTPUEmbeddingCenteredRMSPropParametersAttr is an optional argument to LoadTPUEmbeddingCenteredRMSPropParameters.
+type LoadTPUEmbeddingCenteredRMSPropParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingCenteredRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingCenteredRMSPropParametersTableId(value int64) LoadTPUEmbeddingCenteredRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TruncateMod",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// LoadTPUEmbeddingCenteredRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingCenteredRMSPropParametersTableName(value string) LoadTPUEmbeddingCenteredRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes offsets of concat inputs within its output.
+// Load centered RMSProp embedding parameters.
 //
-// For example:
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
-// ```
-// # 'x' is [2, 2, 7]
-// # 'y' is [2, 3, 7]
-// # 'z' is [2, 5, 7]
-// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
-// ```
+// Arguments:
+//	parameters: Value of parameters used in the centered RMSProp optimization algorithm.
+//	ms: Value of ms used in the centered RMSProp optimization algorithm.
+//	mom: Value of mom used in the centered RMSProp optimization algorithm.
+//	mg: Value of mg used in the centered RMSProp optimization algorithm.
 //
-// This is typically used by gradient computations for a concat operation.
 //
-// Arguments:
-//	concat_dim: The dimension along which to concatenate.
-//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
 //
-// Returns The `N` int32 vectors representing the starting offset
-// of input tensors within the concatenated output.
-func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
+// Returns the created operation.
+func LoadTPUEmbeddingCenteredRMSPropParameters(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, mg tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingCenteredRMSPropParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ConcatOffset",
+		Type: "LoadTPUEmbeddingCenteredRMSPropParameters",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(shape),
+			parameters, ms, mom, mg,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
-		scope.UpdateErr("ConcatOffset", err)
-		return
-	}
-	return offset
+	return scope.AddOperation(opspec)
 }
 
-// Compute the lower regularized incomplete Gamma function `P(a, x)`.
-//
-// The lower regularized incomplete Gamma function is defined as:
-//
+// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
+type DataFormatVecPermuteAttr func(optionalAttr)
+
+// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
 //
-// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
+	return func(m optionalAttr) {
+		m["src_format"] = value
+	}
+}
+
+// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
 //
-// where
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
+}
+
+// Returns the permuted vector/tensor in the destination data format given the
 //
-// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\)
+// one in the source data format.
 //
-// is the lower incomplete Gamma function.
+// Arguments:
+//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
 //
-// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-// Gamma function.
-func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Igamma",
+		Type: "DataFormatVecPermute",
 		Input: []tf.Input{
-			a, x,
+			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
-//
-// The Hurwitz zeta function is defined as:
-//
+// Returns x / y element-wise.
 //
-// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Zeta",
+		Type: "Div",
 		Input: []tf.Input{
-			x, q,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the cardinality of `input_dataset`.
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
+
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// Returns the cardinality of `input_dataset`.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using area interpolation.
+//
+// Input images can be of different types but output images are always float.
+//
+// The range of pixel values for the output image might be slightly different
+// from the range for the input image because of limited numerical precision.
+// To guarantee an output range, for example `[0.0, 1.0]`, apply
+// `tf.clip_by_value` to the output.
+//
+// Each output pixel is computed by first transforming the pixel's footprint into
+// the input tensor and then averaging the pixels that intersect the footprint. An
+// input pixel's contribution to the average is weighted by the fraction of its
+// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
 //
 // Arguments:
-//	input_dataset: A variant tensor representing the dataset to return cardinality for.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns The cardinality of `input_dataset`. Named constants are used to represent
-// infinite and unknown cardinality.
-func ExperimentalDatasetCardinality(scope *Scope, input_dataset tf.Output) (cardinality tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalDatasetCardinality",
+		Type: "ResizeArea",
 		Input: []tf.Input{
-			input_dataset,
+			images, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that executes a SQL query and emits rows of the result set.
+// Sends `input` to all devices that are connected to the output.
 //
-// Arguments:
-//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
-//	data_source_name: A connection string to connect to the database.
-//	query: A SQL query to execute.
+// Sends `input` to all devices that are connected to the output.
 //
+// The graph should be constructed so that all ops connected to the output have a
+// valid device assignment, and the op itself is assigned one of these devices.
 //
-func ExperimentalSqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// input: The input to the broadcast.
+// output: The same as input.
+// shape: The shape of the input tensor.
+//
+func NcclBroadcast(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalSqlDataset",
+		Type: "NcclBroadcast",
 		Input: []tf.Input{
-			driver_name, data_source_name, query,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -24672,57 +28084,84 @@ func ExperimentalSqlDataset(scope *Scope, driver_name tf.Output, data_source_nam
 	return op.Output(0)
 }
 
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
-//
-// The regularized incomplete beta integral is defined as:
-//
-//
-// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
-//
-// where
-//
-//
-// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+// Computes the gradient of morphological 2-D dilation with respect to the filter.
 //
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+// Returns 3-D with shape `[filter_height, filter_width, depth]`.
+func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Betainc",
+		Type: "Dilation2DBackpropFilter",
 		Input: []tf.Input{
-			a, b, x,
+			input, filter, out_backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ShapeAttr is an optional argument to Shape.
-type ShapeAttr func(optionalAttr)
+// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
+type AddSparseToTensorsMapAttr func(optionalAttr)
 
-// ShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeOutType(value tf.DataType) ShapeAttr {
+// AddSparseToTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["container"] = value
 	}
 }
 
-// Returns the shape of a tensor.
+// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
 //
-// For example:
+// A `SparseTensor` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`.
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+// This operator takes the given `SparseTensor` and adds it to a container
+// object (a `SparseTensorsMap`).  A unique key within this container is generated
+// in the form of an `int64`, and this is the value that is returned.
+//
+// The `SparseTensor` can then be read out as part of a minibatch by passing
+// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddSparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+//
+// Returns 0-D.  The handle of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.
+func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24731,9 +28170,9 @@ func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Shape",
+		Type: "AddSparseToTensorsMap",
 		Input: []tf.Input{
-			input,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -24741,131 +28180,110 @@ func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Outp
 	return op.Output(0)
 }
 
-// Computes fingerprints of the input strings.
-//
-// Arguments:
-//	input: vector of strings to compute fingerprints on.
+// Returns a list list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
 //
-// Returns a (N,2) shaped matrix where N is the number of elements in the input
-// vector. Each row contains the low and high parts of the fingerprint.
-func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+// tensor: The tensor to put on the list.
+// input_handle: The old list.
+// output_handle: A list with the elements of the old list followed by tensor.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaFprint",
+		Type: "TensorListPushBack",
 		Input: []tf.Input{
-			input,
+			input_handle, tensor,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the power of one value to another.
-//
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
-//
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Pow",
-		Input: []tf.Input{
-			x, y,
-		},
+// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams.
+type CudnnRNNCanonicalToParamsAttr func(optionalAttr)
+
+// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
-
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
+// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["input_mode"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
-//
-// Arguments:
-//
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
-		Input: []tf.Input{
-			features, max_value, min_features, max_features,
-		},
-		Attrs: attrs,
+// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the truth value of (x < y) element-wise.
-//
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Less",
-		Input: []tf.Input{
-			x, y,
-		},
+// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// RandomPoissonAttr is an optional argument to RandomPoisson.
-type RandomPoissonAttr func(optionalAttr)
-
-// RandomPoissonSeed sets the optional seed attribute to value.
+// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value.
 // If not specified, defaults to 0
-func RandomPoissonSeed(value int64) RandomPoissonAttr {
+func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// RandomPoissonSeed2 sets the optional seed2 attribute to value.
+// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value.
 // If not specified, defaults to 0
-func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// Use RandomPoissonV2 instead.
+// Converts CudnnRNN params from canonical form to usable form.
 //
-// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
-func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
+// Writes a set of weights into the opaque params buffer so they can be used in
+// upcoming training or inferences.
+//
+// Note that the params buffer may not be compatible across different GPUs. So any
+// save and restoration should be converted to and from the canonical weights and
+// biases.
+//
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// weights: the canonical form of weights that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// biases: the canonical form of biases that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// num_params: number of parameter sets for all layers.
+//     Each layer may contain multiple parameter sets, with each set consisting of
+//     a weight matrix and a bias vector.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     The actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24874,9 +28292,9 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoisson",
+		Type: "CudnnRNNCanonicalToParams",
 		Input: []tf.Input{
-			shape, rate,
+			num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases),
 		},
 		Attrs: attrs,
 	}
@@ -24884,48 +28302,83 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 	return op.Output(0)
 }
 
-// Returns the truth value of (x >= y) element-wise.
+// Creates a dataset containing elements of first component of `input_dataset` having true in the last component.
+func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "FilterByLastComponentDataset",
+		Input: []tf.Input{
+			input_dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the absolute value of a tensor.
 //
-// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Given a tensor `x`, this operation returns a tensor containing the absolute
+// value of each element in `x`. For example, if x is an input element and y is
+// an output element, this operation computes \\(y = |x|\\).
+func Abs(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GreaterEqual",
+		Type: "Abs",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ApproximateEqualAttr is an optional argument to ApproximateEqual.
-type ApproximateEqualAttr func(optionalAttr)
+// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
+type MaxPoolGradV2Attr func(optionalAttr)
 
-// ApproximateEqualTolerance sets the optional tolerance attribute to value.
-// If not specified, defaults to 1e-05
-func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
 	return func(m optionalAttr) {
-		m["tolerance"] = value
+		m["data_format"] = value
 	}
 }
 
-// Returns the truth value of abs(x-y) < tolerance element-wise.
-func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+// Computes gradients of the maxpooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ApproximateEqual",
+		Type: "MaxPoolGradV2",
 		Input: []tf.Input{
-			x, y,
+			orig_input, orig_output, grad, ksize, strides,
 		},
 		Attrs: attrs,
 	}
@@ -24933,363 +28386,252 @@ func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...Approx
 	return op.Output(0)
 }
 
-// Returns the truth value of x OR y element-wise.
+// Restore a reader to a previously saved state.
 //
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Not all Readers support being restored, so this can produce an
+// Unimplemented error.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//	state: Result of a ReaderSerializeState of a Reader with type
+// matching reader_handle.
+//
+// Returns the created operation.
+func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalOr",
+		Type: "ReaderRestoreStateV2",
 		Input: []tf.Input{
-			x, y,
+			reader_handle, state,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Selects elements from `x` or `y`, depending on `condition`.
-//
-// The `x`, and `y` tensors must all have the same shape, and the
-// output will also have that shape.
-//
-// The `condition` tensor must be a scalar if `x` and `y` are scalars.
-// If `x` and `y` are vectors or higher rank, then `condition` must be either a
-// scalar, a vector with size matching the first dimension of `x`, or must have
-// the same shape as `x`.
-//
-// The `condition` tensor acts as a mask that chooses, based on the value at each
-// element, whether the corresponding element / row in the output should be
-// taken from `x` (if true) or `y` (if false).
-//
-// If `condition` is a vector and `x` and `y` are higher rank matrices, then
-// it chooses which row (outer dimension) to copy from `x` and `y`.
-// If `condition` has the same shape as `x` and `y`, then it chooses which
-// element to copy from `x` and `y`.
-//
-// For example:
-//
-// ```python
-// # 'condition' tensor is [[True,  False]
-// #                        [False, True]]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e)  # => [[1, 6], [7, 4]]
-//
-//
-// # 'condition' tensor is [True, False]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 2],
-//                              [7, 8]]
+// Inverse fast Fourier transform.
 //
-// ```
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
 //
 // Arguments:
+//	input: A complex tensor.
 //
-//	x: = A `Tensor` which may have the same shape as `condition`.
-// If `condition` is rank 1, `x` may have higher rank,
-// but its first dimension must match the size of `condition`.
-//	y: = A `Tensor` with the same type and shape as `x`.
+// Returns A complex tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
 //
-// Returns = A `Tensor` with the same type and shape as `x` and `y`.
-func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Select",
+		Type: "IFFT",
 		Input: []tf.Input{
-			condition, x, y,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatMulAttr is an optional argument to MatMul.
-type MatMulAttr func(optionalAttr)
-
-// MatMulTransposeA sets the optional transpose_a attribute to value.
+// 2D fast Fourier transform.
 //
-// value: If true, "a" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeA(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// MatMulTransposeB sets the optional transpose_b attribute to value.
+// Computes the 2-dimensional discrete Fourier transform over the inner-most
+// 2 dimensions of `input`.
 //
-// value: If true, "b" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeB(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// Multiply the matrix "a" by the matrix "b".
+// Arguments:
+//	input: A complex tensor.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// "a" (after being transposed if transpose_a is true) must match the
-// outer dimension of "b" (after being transposed if transposed_b is
-// true).
+// Returns A complex tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform.
 //
-// *Note*: The default kernel implementation for MatMul on GPUs uses
-// cublas.
-func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.fft2
+// @end_compatibility
+func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MatMul",
+		Type: "FFT2D",
 		Input: []tf.Input{
-			a, b,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Serializes the tree handle to a proto
+// Inverse 2D fast Fourier transform.
+//
+// Computes the inverse 2-dimensional discrete Fourier transform over the
+// inner-most 2 dimensions of `input`.
 //
 // Arguments:
-//	tree_handle: Handle to the tree resource to be serialized.
+//	input: A complex tensor.
 //
-// Returns Serialied proto string of the tree resource.
-func TensorForestTreeSerialize(scope *Scope, tree_handle tf.Output) (tree_config tf.Output) {
+// Returns A complex tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft2
+// @end_compatibility
+func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorForestTreeSerialize",
+		Type: "IFFT2D",
 		Input: []tf.Input{
-			tree_handle,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseMatMulAttr is an optional argument to SparseMatMul.
-type SparseMatMulAttr func(optionalAttr)
-
-// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["a_is_sparse"] = value
+// Inverse 3D real-valued fast Fourier transform.
+//
+// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 3 dimensions of `input`.
+//
+// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 3D real Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.irfftn with 3 dimensions.
+// @end_compatibility
+func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["b_is_sparse"] = value
+	opspec := tf.OpSpec{
+		Type: "IRFFT3D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Multiply matrix "a" by matrix "b".
-//
-// The inputs must be two-dimensional matrices and the inner dimension of "a" must
-// match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not
-// `SparseTensor`s.  This op is optimized for the case where at least one of "a" or
-// "b" is sparse, in the sense that they have a large proportion of zero values.
-// The breakeven for using this versus a dense matrix multiply on one platform was
-// 30% zero values in the sparse matrix.
+// Returns the truth value of (x != y) element-wise.
 //
-// The gradient computation of this operation will only take advantage of sparsity
-// in the input gradient when that gradient comes from a Relu.
-func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SparseMatMul",
+		Type: "NotEqual",
 		Input: []tf.Input{
-			a, b,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ExperimentalThreadPoolHandleAttr is an optional argument to ExperimentalThreadPoolHandle.
-type ExperimentalThreadPoolHandleAttr func(optionalAttr)
+// LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingMomentumParametersGradAccumDebug.
+type LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr func(optionalAttr)
 
-// ExperimentalThreadPoolHandleMaxIntraOpParallelism sets the optional max_intra_op_parallelism attribute to value.
+// LoadTPUEmbeddingMomentumParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: The maximum degree of parallelism to use within operations that execute on this
-// threadpool.
-// If not specified, defaults to 1
-func ExperimentalThreadPoolHandleMaxIntraOpParallelism(value int64) ExperimentalThreadPoolHandleAttr {
-	return func(m optionalAttr) {
-		m["max_intra_op_parallelism"] = value
-	}
-}
-
-// ExperimentalThreadPoolHandleContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func ExperimentalThreadPoolHandleContainer(value string) ExperimentalThreadPoolHandleAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["table_id"] = value
 	}
 }
 
-// ExperimentalThreadPoolHandleSharedName sets the optional shared_name attribute to value.
+// LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName sets the optional table_name attribute to value.
 // If not specified, defaults to ""
-func ExperimentalThreadPoolHandleSharedName(value string) ExperimentalThreadPoolHandleAttr {
+func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["table_name"] = value
 	}
 }
 
-// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+// Load Momentum embedding parameters with debug support.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	num_threads: The number of threads in the thread pool.
-//	display_name: A human-readable name for the threads that may be visible in some
-// visualizations.
-// threadpool.
+//	parameters: Value of parameters used in the Momentum optimization algorithm.
+//	momenta: Value of momenta used in the Momentum optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the Momentum optimization algorithm.
 //
-// Returns A resource that can be consumed by one or more ExperimentalThreadPoolDataset
-// ops.
-func ExperimentalThreadPoolHandle(scope *Scope, num_threads int64, display_name string, optional ...ExperimentalThreadPoolHandleAttr) (handle tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, parameters tf.Output, momenta tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_threads": num_threads, "display_name": display_name}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalThreadPoolHandle",
-
+		Type: "LoadTPUEmbeddingMomentumParametersGradAccumDebug",
+		Input: []tf.Input{
+			parameters, momenta, gradient_accumulators,
+		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams.
-type CudnnRNNCanonicalToParamsAttr func(optionalAttr)
-
-// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// StatefulStandardNormalAttr is an optional argument to StatefulStandardNormal.
+type StatefulStandardNormalAttr func(optionalAttr)
 
-// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr {
+// StatefulStandardNormalDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatefulStandardNormalDtype(value tf.DataType) StatefulStandardNormalAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["dtype"] = value
 	}
 }
-
-// Converts CudnnRNN params from canonical form to usable form.
-//
-// Writes a set of weights into the opaque params buffer so they can be used in
-// upcoming training or inferences.
-//
-// Note that the params buffer may not be compatible across different GPUs. So any
-// save and restoration should be converted to and from the canonical weights and
-// biases.
-//
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// weights: the canonical form of weights that can be used for saving
-//     and restoration. They are more likely to be compatible across different
-//     generations.
-// biases: the canonical form of biases that can be used for saving
-//     and restoration. They are more likely to be compatible across different
-//     generations.
-// num_params: number of parameter sets for all layers.
-//     Each layer may contain multiple parameter sets, with each set consisting of
-//     a weight matrix and a bias vector.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//     The actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//     dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) {
+
+// Outputs random values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// Arguments:
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	shape: The shape of the output tensor.
+//
+// Returns A tensor of the specified shape filled with random normal values.
+func StatefulStandardNormal(scope *Scope, resource tf.Output, shape tf.Output, optional ...StatefulStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25298,9 +28640,9 @@ func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNCanonicalToParams",
+		Type: "StatefulStandardNormal",
 		Input: []tf.Input{
-			num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases),
+			resource, shape,
 		},
 		Attrs: attrs,
 	}
@@ -25308,256 +28650,299 @@ func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.
 	return op.Output(0)
 }
 
-// Creates a dataset containing elements of first component of `input_dataset` having true in the last component.
-func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (output tf.Output) {
+// Computes the Gauss error function of `x` element-wise.
+func Erf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "FilterByLastComponentDataset",
+		Type: "Erf",
 		Input: []tf.Input{
-			input_dataset,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SumAttr is an optional argument to Sum.
-type SumAttr func(optionalAttr)
-
-// SumKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SumKeepDims(value bool) SumAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
+// Returns element-wise largest integer not greater than x.
+func Floor(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Floor",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the sum of elements across dimensions of a tensor.
+// Returns the number of records this Reader has produced.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Sum",
+		Type: "ReaderNumRecordsProducedV2",
 		Input: []tf.Input{
-			input, axis,
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EnterAttr is an optional argument to Enter.
-type EnterAttr func(optionalAttr)
-
-// EnterIsConstant sets the optional is_constant attribute to value.
-//
-// value: If true, the output is constant within the child frame.
-// If not specified, defaults to false
-func EnterIsConstant(value bool) EnterAttr {
-	return func(m optionalAttr) {
-		m["is_constant"] = value
-	}
-}
+// TensorListConcatAttr is an optional argument to TensorListConcat.
+type TensorListConcatAttr func(optionalAttr)
 
-// EnterParallelIterations sets the optional parallel_iterations attribute to value.
-//
-// value: The number of iterations allowed to run in parallel.
-// If not specified, defaults to 10
-func EnterParallelIterations(value int64) EnterAttr {
+// TensorListConcatElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorListConcatElementShape(value tf.Shape) TensorListConcatAttr {
 	return func(m optionalAttr) {
-		m["parallel_iterations"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Creates or finds a child frame, and makes `data` available to the child frame.
+// Concats all tensors in the list along the 0th dimension.
 //
-// This op is used together with `Exit` to create loops in the graph.
-// The unique `frame_name` is used by the `Executor` to identify frames. If
-// `is_constant` is true, `output` is a constant in the child frame; otherwise
-// it may be changed in the child frame. At most `parallel_iterations` iterations
-// are run in parallel in the child frame.
+// Requires that all tensors have the same shape except the first dimension.
 //
-// Arguments:
-//	data: The tensor to be made available to the child frame.
-//	frame_name: The name of the child frame.
+// input_handle: The input list.
+// tensor: The concated result.
+// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
 //
-// Returns The same tensor as `data`.
-func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
+func TensorListConcat(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListConcatAttr) (tensor tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"frame_name": frame_name}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Enter",
+		Type: "TensorListConcat",
 		Input: []tf.Input{
-			data,
+			input_handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Add all input tensors element wise.
+// Conv3DAttr is an optional argument to Conv3D.
+type Conv3DAttr func(optionalAttr)
+
+// Conv3DDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DDataFormat(value string) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv3DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DDilations(value []int64) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
+//
+// In signal processing, cross-correlation is a measure of similarity of
+// two waveforms as a function of a time-lag applied to one of them. This
+// is also known as a sliding dot product or sliding inner-product.
+//
+// Our Conv3D implements a form of cross-correlation.
 //
 // Arguments:
-//	inputs: Must all be the same size and shape.
-func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
+//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
+//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
+// out_channels]`. `in_channels` must match between `input` and `filter`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AddN",
+		Type: "Conv3D",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			input, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TryRpcAttr is an optional argument to TryRpc.
-type TryRpcAttr func(optionalAttr)
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
 
-// TryRpcProtocol sets the optional protocol attribute to value.
-//
-// value: RPC protocol to use.  Empty string means use the default protocol.
-// Options include 'grpc'.
-// If not specified, defaults to ""
-func TryRpcProtocol(value string) TryRpcAttr {
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["protocol"] = value
+		m["mode"] = value
 	}
 }
 
-// TryRpcFailFast sets the optional fail_fast attribute to value.
-//
-// value: `boolean`. If `true` (default), then failures to connect
-// (i.e., the server does not immediately respond) cause an RPC failure.
-// If not specified, defaults to true
-func TryRpcFailFast(value bool) TryRpcAttr {
+// QuantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
+func QuantizeV2RoundMode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["fail_fast"] = value
+		m["round_mode"] = value
 	}
 }
 
-// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
 //
-// value: `int`. If `0` (default), then the kernel will run the RPC
-// request and only time out if the RPC deadline passes or the session times out.
-// If this value is greater than `0`, then the op will raise an exception if
-// the RPC takes longer than `timeout_in_ms`.
-// If not specified, defaults to 0
-func TryRpcTimeoutInMs(value int64) TryRpcAttr {
-	return func(m optionalAttr) {
-		m["timeout_in_ms"] = value
-	}
-}
-
-// Perform batches of RPC requests.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.  The
+// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
+// when rounding float values to their quantized equivalents.
 //
-// This op asynchronously performs either a single RPC request, or a batch
-// of requests.  RPC requests are defined by three main parameters:
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-//   - `address` (the host+port or BNS address of the request)
-//   - `method` (the method name for the request)
-//   - `request` (the serialized proto string, or vector of strings,
-//      of the RPC request argument).
+// ```
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8: out[i] -= (range(T) + 1) / 2.0
+// ```
 //
-// For example, if you have an RPC service running on port localhost:2345,
-// and its interface is configured with the following proto declaration:
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+//
+// *MIN_COMBINED Mode Example*
+//
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
+//
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
 //
 // ```
-// service MyService {
-//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
-//   }
-// };
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = num_discrete_values / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
 // ```
 //
-// then call this op with arguments:
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
 //
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+//
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
 // ```
-// address = "localhost:2345"
-// method = "MyService/MyMethod"
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+//
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
 // ```
 //
-// The `request` tensor is a string tensor representing serialized `MyRequestProto`
-// strings; and the output string tensor `response` will have the same shape
-// and contain (upon successful completion) corresponding serialized
-// `MyResponseProto` strings.
+// Otherwise, if T is unsigned, the fixed-point range is
 //
-// For example, to send a single, empty, `MyRequestProto`, call
-// this op with `request = ""`.  To send 5 **parallel** empty requests,
-// call this op with `request = ["", "", "", "", ""]`.
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
 //
-// More generally, one can create a batch of `MyRequestProto` serialized protos
-// from regular batched tensors using the `encode_proto` op, and convert
-// the response `MyResponseProto` serialized protos to batched tensors
-// using the `decode_proto` op.
+// From this we compute our scaling factor, s:
 //
-// **NOTE** Working with serialized proto strings is faster than instantiating
-// actual proto objects in memory, so no performance degradation is expected
-// compared to writing custom kernels for this workflow.
+// ```c++
+//   s = (max_fixed - min_fixed) / (2 * m)
+// ```
 //
-// Unlike the standard `Rpc` op, if the connection fails or the remote worker
-// returns an error status, this op does **not** reraise the exception.
-// Instead, the `status_code` and `status_message` entry for the corresponding RPC
-// call is set with the error returned from the RPC call.  The `response` tensor
-// will contain valid response values for those minibatch entries whose RPCs did
-// not fail; the rest of the entries will have empty strings.
+// Now we can quantize the elements of our tensor:
+//
+// ```c++
+// result = round(input * s)
+// ```
+//
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
 //
 // Arguments:
-//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `method` and `request`.
-//	method: `0-D` or `1-D`.  The method address on the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `request`.
-//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `method`.
 //
-// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`.  Values correspond to tensorflow Status enum codes.Same shape as `request`.  Values correspond to Status messages
-// returned from the RPC calls.
-func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+//
+//
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TryRpc",
+		Type: "QuantizeV2",
 		Input: []tf.Input{
-			address, method, request,
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
@@ -25565,197 +28950,157 @@ func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
-type InitializeTableFromTextFileV2Attr func(optionalAttr)
-
-// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
-//
-// value: Number of elements of the file, use -1 if unknown.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
-	return func(m optionalAttr) {
-		m["vocab_size"] = value
-	}
-}
+// ComplexAbsAttr is an optional argument to ComplexAbs.
+type ComplexAbsAttr func(optionalAttr)
 
-// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
-//
-// value: Delimiter to separate fields in a line.
-// If not specified, defaults to "\t"
-func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
+// ComplexAbsTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
 	return func(m optionalAttr) {
-		m["delimiter"] = value
+		m["Tout"] = value
 	}
 }
 
-// Initializes a table from a text file.
-//
-// It inserts one key-value pair into the table for each line of the file.
-// The key and value is extracted from the whole line content, elements from the
-// split line based on `delimiter` or the line number (starting from zero).
-// Where to extract the key and value from a line is specified by `key_index` and
-// `value_index`.
-//
-// - A value of -1 means use the line number(starting from zero), expects `int64`.
-// - A value of -2 means use the whole line content, expects `string`.
-// - A value >= 0 means use the index (starting at zero) of the split line based
-//   on `delimiter`.
-//
-// Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	filename: Filename of a vocabulary text file.
-//	key_index: Column index in a line to get the table `key` values from.
-//	value_index: Column index that represents information of a line to get the table
-// `value` values from.
+// Computes the complex absolute value of a tensor.
 //
-// Returns the created operation.
-func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
+// Given a tensor `x` of complex numbers, this operation returns a tensor of type
+// `float` or `double` that is the absolute value of each element in `x`. All
+// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
+// value is computed as \\( \sqrt{a^2 + b^2}\\).
+func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "InitializeTableFromTextFileV2",
+		Type: "ComplexAbs",
 		Input: []tf.Input{
-			table_handle, filename,
+			x,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MeanAttr is an optional argument to Mean.
-type MeanAttr func(optionalAttr)
-
-// MeanKeepDims sets the optional keep_dims attribute to value.
+// Returns the truth value of x AND y element-wise.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MeanKeepDims(value bool) MeanAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
+// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalAnd",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the mean of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+// Computes the reciprocal of x element-wise.
 //
-// Returns The reduced tensor.
-func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Inv(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Mean",
+		Type: "Inv",
 		Input: []tf.Input{
-			input, axis,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ProdAttr is an optional argument to Prod.
-type ProdAttr func(optionalAttr)
-
-// ProdKeepDims sets the optional keep_dims attribute to value.
+// Creates a dataset that batches input elements into a SparseTensor.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func ProdKeepDims(value bool) ProdAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the product of elements across dimensions of a tensor.
+// Arguments:
+//	input_dataset: A handle to an input dataset. Must have a single component.
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	row_shape: A vector representing the dense shape of each row in the produced
+// SparseTensor. The shape may be partially specified, using `-1` to indicate
+// that a particular dimension should use the maximum size of all batch elements.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
 //
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+func ExperimentalDenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalDenseToSparseBatchDataset",
+		Input: []tf.Input{
+			input_dataset, batch_size, row_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the reciprocal of x element-wise.
 //
-// Returns The reduced tensor.
-func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Prod",
+		Type: "Reciprocal",
 		Input: []tf.Input{
-			input, axis,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBilinearAttr is an optional argument to ResizeBilinear.
-type ResizeBilinearAttr func(optionalAttr)
+// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
+type Conv3DBackpropFilterAttr func(optionalAttr)
 
-// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["dilations"] = value
 	}
 }
 
-// Resize `images` to `size` using bilinear interpolation.
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
-// Input images can be of different types but output images are always float.
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinear",
+		Type: "Conv3DBackpropFilter",
 		Input: []tf.Input{
-			images, size,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -25763,44 +29108,39 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 	return op.Output(0)
 }
 
-// MaxAttr is an optional argument to Max.
-type MaxAttr func(optionalAttr)
-
-// MaxKeepDims sets the optional keep_dims attribute to value.
+// Computes square root of x element-wise.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MaxKeepDims(value bool) MaxAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
+// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
+func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "Sqrt",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the maximum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Get the value of the tensor specified by its handle.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	handle: The handle for a tensor stored in the session state.
+//	dtype: The type of the output value.
 //
-// Returns The reduced tensor.
-func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
+// Returns The tensor for the given handle.
+func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "Max",
+		Type: "GetSessionTensor",
 		Input: []tf.Input{
-			input, axis,
+			handle,
 		},
 		Attrs: attrs,
 	}
@@ -25808,44 +29148,58 @@ func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (ou
 	return op.Output(0)
 }
 
-// Creates a dataset that contains the unique elements of `input_dataset`.
-func ExperimentalUniqueDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Computes the gradient for the sqrt of `x` wrt its input.
+//
+// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalUniqueDataset",
+		Type: "SqrtGrad",
 		Input: []tf.Input{
-			input_dataset,
+			y, dy,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ArgMinAttr is an optional argument to ArgMin.
-type ArgMinAttr func(optionalAttr)
+// MatrixInverseAttr is an optional argument to MatrixInverse.
+type MatrixInverseAttr func(optionalAttr)
 
-// ArgMinOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMinOutputType(value tf.DataType) ArgMinAttr {
+// MatrixInverseAdjoint sets the optional adjoint attribute to value.
+// If not specified, defaults to false
+func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Returns the index with the smallest value across dimensions of a tensor.
+// Computes the inverse of one or more square invertible matrices or their
 //
-// Note that in case of ties the identity of the return value is not guaranteed.
+// adjoints (conjugate transposes).
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the inverse for all input submatrices `[..., :, :]`.
+//
+// The op uses LU decomposition with partial pivoting to compute the inverses.
+//
+// If a matrix is not invertible there is no guarantee what the op does. It
+// may detect the condition and raise an exception or it may simply return a
+// garbage result.
 //
 // Arguments:
+//	input: Shape is `[..., M, M]`.
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.inv
+// @end_compatibility
+func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25854,9 +29208,9 @@ func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMin",
+		Type: "MatrixInverse",
 		Input: []tf.Input{
-			input, dimension,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -25864,237 +29218,161 @@ func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
-//
-// output range specified with 'requested_output_min' and 'requested_output_max'.
-//
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-//
-// Arguments:
-//
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	requested_output_min: The float value that the minimum quantized output value represents.
-//	requested_output_max: The float value that the maximum quantized output value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+// Computes reciprocal of square root of x element-wise.
 //
-// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
-func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// I.e., \\(y = 1 / \sqrt{x}\\).
+func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "Requantize",
+		Type: "Rsqrt",
 		Input: []tf.Input{
-			input, input_min, input_max, requested_output_min, requested_output_max,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that emits the lines of one or more text files.
+// Rounds the values of a tensor to the nearest integer, element-wise.
 //
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar containing the number of bytes to buffer.
-func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+// Rounds half to even.  Also known as bankers rounding. If you want to round
+// according to the current system rounding mode use std::cint.
+func Round(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TextLineDataset",
+		Type: "Round",
 		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \sum_j data_j\\) where sum is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// Delete the TensorArray from its resource container.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
-// </div>
+// This enables the user to close and release the resource in the middle
+// of a step/run.
 //
 // Arguments:
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns the created operation.
+func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentSum",
+		Type: "TensorArrayCloseV3",
 		Input: []tf.Input{
-			data, segment_ids,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the mean along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-// over `j` such that `segment_ids[j] == i` and `N` is the total number of
-// values summed.
-//
-// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Computes exponential of x element-wise.  \\(y = e^x\\).
+func Exp(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMean",
+		Type: "Exp",
 		Input: []tf.Input{
-			data, segment_ids,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the minimum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the min is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// NthElementAttr is an optional argument to NthElement.
+type NthElementAttr func(optionalAttr)
+
+// NthElementReverse sets the optional reverse attribute to value.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SegmentMin",
-		Input: []tf.Input{
-			data, segment_ids,
-		},
+// value: When set to True, find the nth-largest value in the vector and vice
+// versa.
+// If not specified, defaults to false
+func NthElementReverse(value bool) NthElementAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
+// Finds values of the `n`-th order statistic for the last dimension.
 //
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-// If the given segment ID `i` is negative, the value is dropped and will not be
-// added to the sum of the segment.
+// If the input is a vector (rank-1), finds the entries which is the nth-smallest
+// value in the vector and outputs their values as scalar tensor.
 //
-// `num_segments` should equal the number of distinct segment IDs.
+// For matrices (resp. higher rank input), computes the entries which is the
+// nth-smallest value in each row (resp. vector along the last dimension). Thus,
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
+//     values.shape = input.shape[:-1]
 //
 // Arguments:
+//	input: 1-D or higher with last dimension at least `n+1`.
+//	n: 0-D. Position of sorted vector to select along the last dimension (along
+// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
-//
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns The `n`-th order statistic along each last dimensional slice.
+func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
+		Type: "NthElement",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			input, n,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the product along segments of a tensor.
+// Computes the maximum along segments of a tensor.
 //
 // Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 // for an explanation of segments.
 //
 // This operator is similar to the unsorted segment sum operator found
 // [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the product of all
-// entries belonging to a segment such that:
+// Instead of computing the sum over segments, it computes the maximum such that:
 //
-// \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
-// `j...` such that `segment_ids[j...] == i`.
+// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such
+// that `segment_ids[j...] == i`.
 //
-// If there is no entry for a given segment ID `i`, it outputs 1.
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::lowest()`.
 //
 // If the given segment ID `i` is negative, then the corresponding value is
 // dropped, and will not be included in the result.
 //
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
+// </div>
+//
+// For example:
+//
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_max(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 4,  3, 3, 4],
+// #       [5,  6, 7, 8]]
+// ```
+//
+//
 // Arguments:
 //
 //	segment_ids: A tensor whose shape is a prefix of `data.shape`.
@@ -26103,12 +29381,12 @@ func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num
 // Returns Has same shape as data, except for the first `segment_ids.rank`
 // dimensions, which are replaced with a single dimension which has size
 // `num_segments`.
-func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentProd",
+		Type: "UnsortedSegmentMax",
 		Input: []tf.Input{
 			data, segment_ids, num_segments,
 		},
@@ -26117,207 +29395,89 @@ func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, nu
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Computes softplus: `log(exp(features) + 1)`.
+func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMean",
+		Type: "Softplus",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserializes a serialized tree ensemble config and replaces current tree
-//
-// ensemble.
-//
-// Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
-//	stamp_token: Token to use as the new value of the resource stamp.
-//	tree_ensemble_serialized: Serialized proto of the ensemble.
-//
-// Returns the created operation.
-func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesDeserializeEnsemble",
-		Input: []tf.Input{
-			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Transforms a tf.Example proto (as a string) into typed tensors.
+// Computes exponential of x - 1 element-wise.
 //
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	dense_defaults: A list of Tensors (some may be empty), whose length matches
-// the length of `dense_keys`. dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	num_sparse: The number of sparse features to be parsed from the example. This
-// must match the lengths of `sparse_keys` and `sparse_types`.
-//	sparse_keys: A list of `num_sparse` strings.
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: The keys expected in the Examples' features associated with dense
-// values.
-//	sparse_types: A list of `num_sparse` types; the data types of data in each
-// Feature given in sparse_keys.
-// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: The shapes of data in each Feature given in dense_keys.
-// The length of this list must match the length of `dense_keys`.  The
-// number of elements in the Feature corresponding to dense_key[j] must
-// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
-// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
-// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
-// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
-// D1, .., DN), where M is the number of blocks of elements of length
-// D1 * .... * DN, in the input.
-func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "ParseSingleExample",
+		Type: "Expm1",
 		Input: []tf.Input{
-			serialized, tf.OutputList(dense_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
-}
-
-// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
-type WholeFileReaderV2Attr func(optionalAttr)
-
-// WholeFileReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// A Reader that outputs the entire contents of a file as a value.
-//
-// To use, enqueue filenames in a Queue.  The output of ReaderRead will
-// be a filename (key) and the contents of that file (value).
+// Computes natural logarithm of x element-wise.
 //
-// Returns The handle to reference the Reader.
-func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+// I.e., \\(y = \log_e x\\).
+func Log(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "WholeFileReaderV2",
-
-		Attrs: attrs,
+		Type: "Log",
+		Input: []tf.Input{
+			x,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pop the element at the top of the stack.
+// Returns the index of a data point that should be added to the seed set.
+//
+// Entries in distances are assumed to be squared distances of candidate points to
+// the already sampled centers in the seed set. The op constructs one Markov chain
+// of the k-MC^2 algorithm and returns the index of one candidate point to be added
+// as an additional cluster center.
 //
 // Arguments:
-//	handle: The handle to a stack.
-//	elem_type: The type of the elem that is popped.
+//	distances: Vector with squared distances to the closest previously sampled cluster center
+// for each candidate point.
+//	seed: Scalar. Seed for initializing the random number generator.
 //
-// Returns The tensor that is popped from the top of the stack.
-func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
+// Returns Scalar with the index of the sampled point.
+func KMC2ChainInitialization(scope *Scope, distances tf.Output, seed tf.Output) (index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
 	opspec := tf.OpSpec{
-		Type: "StackPopV2",
+		Type: "KMC2ChainInitialization",
 		Input: []tf.Input{
-			handle,
+			distances, seed,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes hyperbolic cosine of x element-wise.
-func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes hyperbolic sine of x element-wise.
+func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cosh",
+		Type: "Sinh",
 		Input: []tf.Input{
 			x,
 		},
@@ -26326,123 +29486,83 @@ func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
-//
-// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// Computes the sum along sparse segments of a tensor.
 //
 // Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 // for an explanation of segments.
 //
+// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
+//
+// For example:
+//
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// # Select two rows, one segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+// # => [[0 0 0 0]]
+//
+// # Select two rows, two segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+// # => [[ 1  2  3  4]
+// #     [-1 -2 -3 -4]]
+//
+// # Select all rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+// # => [[0 0 0 0]
+// #     [5 6 7 8]]
+//
+// # Which is equivalent to:
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// ```
+//
 // Arguments:
 //
 //	indices: A 1-D tensor. Has same rank as `segment_ids`.
 //	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
 //
-// Returns Has same shape as data, except for dimension 0 which has size
-// `num_segments`.
-func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanWithNumSegments",
+		Type: "SparseSegmentSum",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
-type CudnnRNNParamsSizeAttr func(optionalAttr)
-
-// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// CastAttr is an optional argument to Cast.
+type CastAttr func(optionalAttr)
 
-// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
+// CastTruncate sets the optional Truncate attribute to value.
+// If not specified, defaults to false
+func CastTruncate(value bool) CastAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["Truncate"] = value
 	}
 }
 
-// Computes size of weights that can be used by a Cudnn RNN model.
-//
-// Return the params size that can be used by the Cudnn RNN model. Subsequent
-// weight allocation and initialization should use this size.
-//
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//   The actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//   dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-// params_size: The size of the params buffer that should be allocated and
-//   initialized for this RNN model. Note that this params buffer may not be
-//   compatible across GPUs. Please use CudnnRNNParamsWeights and
-//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
-//   across different runs.
-func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
+// Cast x of type SrcT to y of DstT.
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T, "S": S}
+	attrs := map[string]interface{}{"DstT": DstT}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNParamsSize",
+		Type: "Cast",
 		Input: []tf.Input{
-			num_layers, num_units, input_size,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -26450,196 +29570,203 @@ func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output,
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentMean.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
-//
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+// Computes the log of the absolute value of `Gamma(x)` element-wise.
+func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
+		Type: "Lgamma",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+// UnicodeEncodeAttr is an optional argument to UnicodeEncode.
+type UnicodeEncodeAttr func(optionalAttr)
+
+// UnicodeEncodeErrors sets the optional errors attribute to value.
 //
-// N is the size of the segment being reduced.
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeEncodeErrors(value string) UnicodeEncodeAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeEncodeReplacementChar sets the optional replacement_char attribute to value.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD (U+65533).
+// If not specified, defaults to 65533
+func UnicodeEncodeReplacementChar(value int64) UnicodeEncodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// Encode a tensor of ints into unicode strings.
 //
-// Arguments:
+// Returns a vector of strings, where `output[i]` is constructed by encoding the
+// Unicode codepoints in `input_values[input_splits[i]:input_splits[i+1]]`
+// using `output_encoding`.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// ---
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Example:
+//
+// ```
+// input_values = [72, 101, 108, 108, 111, 87, 111, 114, 108, 100]
+// input_splits = [0, 5, 10]
+// output_encoding = 'UTF-8'
+//
+// output = ['Hello', 'World']
+// ```
+//
+// Arguments:
+//	input_values: A 1D tensor containing the unicode codepoints that should be encoded.
+//	input_splits: A 1D tensor specifying how the unicode codepoints should be split into strings.
+// In particular, `output[i]` is constructed by encoding the codepoints in the
+// slice `input_values[input_splits[i]:input_splits[i+1]]`.
+//	output_encoding: Unicode encoding of the output strings. Valid encodings are: `"UTF-8",
+// "UTF-16-BE", and "UTF-32-BE"`.
+//
+// Returns The 1-D Tensor of strings encoded from the provided unicode codepoints.
+func UnicodeEncode(scope *Scope, input_values tf.Output, input_splits tf.Output, output_encoding string, optional ...UnicodeEncodeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_encoding": output_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
+		Type: "UnicodeEncode",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			input_values, input_splits,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
-//
-// The upper regularized incomplete Gamma function is defined as:
-//
-// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
-//
-// where
-//
-// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
-//
-// is the upper incomplete Gama function.
-//
-// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-// Gamma function.
-func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Computes the complementary error function of `x` element-wise.
+func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Igammac",
+		Type: "Erfc",
 		Input: []tf.Input{
-			a, x,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-//
-// N is the size of the segment being reduced.
-//
-// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
+// Computes sigmoid of `x` element-wise.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Specifically, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNWithNumSegments",
+		Type: "Sigmoid",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentSqrtN.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
-//
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentSqrtN op.
-//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
-func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+// Computes sin of x element-wise.
+func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNGrad",
+		Type: "Sin",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LRNGradAttr is an optional argument to LRNGrad.
-type LRNGradAttr func(optionalAttr)
+// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
+type FusedBatchNormGradAttr func(optionalAttr)
 
-// LRNGradDepthRadius sets the optional depth_radius attribute to value.
+// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
 //
-// value: A depth radius.
-// If not specified, defaults to 5
-func LRNGradDepthRadius(value int64) LRNGradAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["depth_radius"] = value
+		m["epsilon"] = value
 	}
 }
 
-// LRNGradBias sets the optional bias attribute to value.
+// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
 //
-// value: An offset (usually > 0 to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNGradBias(value float32) LRNGradAttr {
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["bias"] = value
+		m["data_format"] = value
 	}
 }
 
-// LRNGradAlpha sets the optional alpha attribute to value.
+// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNGradAlpha(value float32) LRNGradAttr {
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["alpha"] = value
+		m["is_training"] = value
 	}
 }
 
-// LRNGradBeta sets the optional beta attribute to value.
+// Gradient for batch normalization.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNGradBeta(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Gradients for Local Response Normalization.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	input_grads: 4-D with shape `[batch, height, width, channels]`.
-//	input_image: 4-D with shape `[batch, height, width, channels]`.
-//	output_image: 4-D with shape `[batch, height, width, channels]`.
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
 //
-// Returns The gradients for LRN.
-func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26648,216 +29775,120 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRNGrad",
+		Type: "FusedBatchNormGrad",
 		Input: []tf.Input{
-			input_grads, input_image, output_image,
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AnyAttr is an optional argument to Any.
-type AnyAttr func(optionalAttr)
-
-// AnyKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AnyKeepDims(value bool) AnyAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Computes the "logical or" of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
+// Computes cos of x element-wise.
+func Cos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Any",
+		Type: "Cos",
 		Input: []tf.Input{
-			input, axis,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
-
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
-//
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
-	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
-	}
-}
-
-// Deletes the resource specified by the handle.
-//
-// All subsequent operations using the resource will result in a NotFound
-// error status.
-//
-// Arguments:
-//	resource: handle to the resource to delete.
-//
-// Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
-		Input: []tf.Input{
-			resource,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Generates values in an interval.
-//
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
-//
-// For example:
+// Computes the determinant of one or more square matrices.
 //
-// ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-// ```
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor containing the determinants
+// for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	start: 0-D tensor. First entry in the range.
-//	stop: 0-D tensor. Last entry in the range.
-//	num: 0-D tensor. Number of values to generate.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
+// Returns Shape is `[...]`.
+func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LinSpace",
+		Type: "MatrixDeterminant",
 		Input: []tf.Input{
-			start, stop, num,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
-
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
+// Updates the tree ensemble by either adding a layer to the last tree being grown
 //
-// The input tensors `real` and `imag` must have the same shape.
+// or by starting a new tree.
 //
-// For example:
+// Arguments:
+//	tree_ensemble_handle: Handle to the ensemble variable.
+//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
+// the feature that will be used in the split.
+//	node_ids: List of rank 1 tensors representing the nodes for which this feature
+// has a split.
+//	gains: List of rank 1 tensors representing the gains for each of the feature's
+// split.
+//	thresholds: List of rank 1 tensors representing the thesholds for each of the
+// feature's split.
+//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
+// the feature's splits. Will be added to the previous node values to constitute
+// the values of the left nodes.
+//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
+// of the feature's splits. Will be added to the previous node values to constitute
+// the values of the right nodes.
+//	max_depth: Max depth of the tree to build.
+//	learning_rate: shrinkage const for each new tree.
+//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
 //
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+// Returns the created operation.
+func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
 	opspec := tf.OpSpec{
-		Type: "Complex",
+		Type: "BoostedTreesUpdateEnsemble",
 		Input: []tf.Input{
-			real, imag,
+			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
-
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
+// Computes tan of x element-wise.
+func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "Tan",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the imaginary part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
-//
-// For example:
-//
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+// Creates a dataset that emits each dim-0 slice of `components` once.
+func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "TensorSliceDataset",
 		Input: []tf.Input{
-			input,
+			tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
@@ -26865,13 +29896,13 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes acos of x element-wise.
+func Acos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Tanh",
+		Type: "Acos",
 		Input: []tf.Input{
 			x,
 		},
@@ -26880,92 +29911,71 @@ func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
-//
-// Arguments:
+// Computes the Bessel i0e function of `x` element-wise.
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Exponentially scaled modified Bessel function of order 0 defined as
+// `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// This function is faster and numerically stabler than `bessel_i0(x)`.
+func BesselI0e(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMax",
+		Type: "BesselI0e",
 		Input: []tf.Input{
-			data, segment_ids,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that skips `count` elements from the `input_dataset`.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
-//
+// Shuffle dimensions of x according to a permutation.
 //
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SkipDataset",
+		Type: "Transpose",
 		Input: []tf.Input{
-			input_dataset, count,
+			x, perm,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RealAttr is an optional argument to Real.
-type RealAttr func(optionalAttr)
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
 
-// RealTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func RealTout(value tf.DataType) RealAttr {
+// MinKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MinKeepDims(value bool) MinAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Returns the real part of a complex number.
+// Computes the minimum of elements across dimensions of a tensor.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the real part of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
-//  part returned by this operation and *b* is the imaginary part.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
-// For example:
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.real(input) ==> [-2.25, 3.25]
-// ```
-func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
+// Returns The reduced tensor.
+func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26974,9 +29984,9 @@ func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Real",
+		Type: "Min",
 		Input: []tf.Input{
-			input,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -26984,80 +29994,66 @@ func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// Sends `input` to all devices that are connected to the output.
-//
-// Sends `input` to all devices that are connected to the output.
-//
-// The graph should be constructed so that all ops connected to the output have a
-// valid device assignment, and the op itself is assigned one of these devices.
+// Computes the Bessel i1e function of `x` element-wise.
 //
-// input: The input to the broadcast.
-// output: The same as input.
-// shape: The shape of the input tensor.
+// Exponentially scaled modified Bessel function of order 0 defined as
+// `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
 //
-func NcclBroadcast(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
+// This function is faster and numerically stabler than `bessel_i1(x)`.
+func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "NcclBroadcast",
+		Type: "BesselI1e",
 		Input: []tf.Input{
-			input,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
-
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
+// Returns an element-wise indication of the sign of a number.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
+// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+//
+// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
+func Sign(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sign",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Resize `images` to `size` using area interpolation.
-//
-// Input images can be of different types but output images are always float.
+// Creates a dataset that passes a sliding window over `input_dataset`.
 //
-// The range of pixel values for the output image might be slightly different
-// from the range for the input image because of limited numerical precision.
-// To guarantee an output range, for example `[0.0, 1.0]`, apply
-// `tf.clip_by_value` to the output.
+// Arguments:
 //
-// Each output pixel is computed by first transforming the pixel's footprint into
-// the input tensor and then averaging the pixels that intersect the footprint. An
-// input pixel's contribution to the average is weighted by the fraction of its
-// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
+//	window_size: A scalar representing the number of elements in the
+// sliding window.
+//	window_shift: A scalar representing the steps moving the sliding window
+// forward in one iteration. It must be positive.
+//	window_stride: A scalar representing the stride of the input elements of the sliding window.
+// It must be positive.
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+func ExperimentalSlidingWindowDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ResizeArea",
+		Type: "ExperimentalSlidingWindowDataset",
 		Input: []tf.Input{
-			images, size,
+			input_dataset, window_size, window_shift, window_stride,
 		},
 		Attrs: attrs,
 	}
@@ -27065,513 +30061,492 @@ func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...Resi
 	return op.Output(0)
 }
 
-// VarHandleOpAttr is an optional argument to VarHandleOp.
-type VarHandleOpAttr func(optionalAttr)
+// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
+type OrderedMapUnstageNoKeyAttr func(optionalAttr)
 
-// VarHandleOpContainer sets the optional container attribute to value.
+// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: the container this variable is placed in.
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func VarHandleOpContainer(value string) VarHandleOpAttr {
+func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// VarHandleOpSharedName sets the optional shared_name attribute to value.
-//
-// value: the name by which this variable is referred to.
+// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func VarHandleOpSharedName(value string) VarHandleOpAttr {
+func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Creates a handle to a Variable resource.
+// Op removes and returns the (key, value) element with the smallest
 //
-// Arguments:
-//	dtype: the type of this variable. Must agree with the dtypes
-// of all ops using this variable.
-//	shape: The (possibly partially specified) shape of this variable.
-func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
+// key from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "VarHandleOp",
-
+		Type: "OrderedMapUnstageNoKey",
+		Input: []tf.Input{
+			indices,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AngleAttr is an optional argument to Angle.
-type AngleAttr func(optionalAttr)
-
-// AngleTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func AngleTout(value tf.DataType) AngleAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapUnstageNoKey", err)
+		return
 	}
+	return key, values
 }
 
-// Returns the argument of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the argument of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part.
-//
-// The argument returned by this operation is of the form \\(atan2(b, a)\\).
+// Returns element-wise integer closest to x.
 //
+// If the result is midway between two representable values,
+// the even representable is chosen.
 // For example:
 //
 // ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.angle(input) ==> [2.0132, 1.056]
+// rint(-1.5) ==> -2.0
+// rint(0.5000001) ==> 1.0
+// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
 // ```
-//
-// @compatibility(numpy)
-// Equivalent to np.angle.
-// @end_compatibility
-func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
+func Rint(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "Rint",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the derivative of a Gamma random sample w.r.t. `alpha`.
+func RandomGammaGrad(scope *Scope, alpha tf.Output, sample tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Angle",
+		Type: "RandomGammaGrad",
 		Input: []tf.Input{
-			input,
+			alpha, sample,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Clips tensor values to a specified min and max.
-//
-// Given a tensor `t`, this operation returns a tensor of the same type and
-// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
-// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
-// greater than `clip_value_max` are set to `clip_value_max`.
-//
-// Arguments:
-//	t: A `Tensor`.
-//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
-// as `t`. The minimum value to clip by.
-//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
-// as `t`. The maximum value to clip by.
+// Returns x + y element-wise.
 //
-// Returns A clipped `Tensor` with the same shape as input 't'.
-func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ClipByValue",
+		Type: "Add",
 		Input: []tf.Input{
-			t, clip_value_min, clip_value_max,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Counts the number of occurrences of each value in an integer array.
-//
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
-//
-// Values in `arr` outside of the range [0, size) are ignored.
-//
-// Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
+// Returns x + y element-wise.
 //
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Bincount",
+		Type: "AddV2",
 		Input: []tf.Input{
-			arr, size, weights,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
+// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
+type AllCandidateSamplerAttr func(optionalAttr)
 
-// CumsumExclusive sets the optional exclusive attribute to value.
+// AllCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: If `True`, perform exclusive cumsum.
-// If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["exclusive"] = value
+		m["seed"] = value
 	}
 }
 
-// CumsumReverse sets the optional reverse attribute to value.
+// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["seed2"] = value
 	}
 }
 
-// Compute the cumulative sum of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumsum, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-// performed instead:
-//
-// ```python
-// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-// opposite direction:
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// ```python
-// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
-// ```
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// This is more efficient than using separate `tf.reverse` ops.
+// For each batch, this op picks a single set of sampled candidate labels.
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
-// ```python
-// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
-// ```
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to produce.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
 //
-// Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumsum",
+		Type: "AllCandidateSampler",
 		Input: []tf.Input{
-			x, axis,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Return the shape of s0 op s1 with broadcast.
+// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
 //
-// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
+// true, this follows Python semantics in that the result here is consistent
+// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+//
+// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BroadcastArgs",
+		Type: "FloorMod",
 		Input: []tf.Input{
-			s0, s1,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
-type DataFormatDimMapAttr func(optionalAttr)
-
-// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
+// Saves the input tensors to disk.
 //
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
-	return func(m optionalAttr) {
-		m["src_format"] = value
+// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+// is written to `filename` with name `tensor_names[i]`.
+//
+// See also `SaveSlices`.
+//
+// Arguments:
+//	filename: Must have a single element. The name of the file to which we write
+// the tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	data: `N` tensors to save.
+//
+// Returns the created operation.
+func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Save",
+		Input: []tf.Input{
+			filename, tensor_names, tf.OutputList(data),
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
+// Returns x * y element-wise. Returns zero if y is zero, even if x if infinite or NaN.
 //
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
-	return func(m optionalAttr) {
-		m["dst_format"] = value
+// *NOTE*: `Mul` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func MulNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MulNoNan",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the dimension index in the destination data format given the one in
-//
-// the source data format.
+// Returns x / y element-wise for integer types.
 //
-// Arguments:
-//	x: A Tensor with each element as a dimension index in source data format.
-// Must be in the range [-4, 4).
+// Truncation designates that negative numbers will round fractional quantities
+// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
+// than Python semantics. See `FloorDiv` for a division function that matches
+// Python Semantics.
 //
-// Returns A Tensor with each element as a dimension index in destination data format.
-func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
+// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatDimMap",
+		Type: "TruncateDiv",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CumprodAttr is an optional argument to Cumprod.
-type CumprodAttr func(optionalAttr)
+// RequantizePerChannelAttr is an optional argument to RequantizePerChannel.
+type RequantizePerChannelAttr func(optionalAttr)
 
-// CumprodExclusive sets the optional exclusive attribute to value.
+// RequantizePerChannelOutType sets the optional out_type attribute to value.
 //
-// value: If `True`, perform exclusive cumprod.
-// If not specified, defaults to false
-func CumprodExclusive(value bool) CumprodAttr {
+// value: The quantized type of output tensor that needs to be converted.
+// If not specified, defaults to DT_QUINT8
+func RequantizePerChannelOutType(value tf.DataType) RequantizePerChannelAttr {
 	return func(m optionalAttr) {
-		m["exclusive"] = value
+		m["out_type"] = value
 	}
 }
 
-// CumprodReverse sets the optional reverse attribute to value.
+// Requantizes input with min and max values known per channel.
 //
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumprodReverse(value bool) CumprodAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
+// Arguments:
+//	input: The original input tensor.
+//	input_min: The minimum value of the input tensor
+//	input_max: The maximum value of the input tensor.
+//	requested_output_min: The minimum value of the output tensor requested.
+//	requested_output_max: The maximum value of the output tensor requested.
+//
+// Returns Output tensor.The minimum value of the final output tensorThe maximum value of the final output tensor.
+func RequantizePerChannel(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, optional ...RequantizePerChannelAttr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RequantizePerChannel",
+		Input: []tf.Input{
+			input, input_min, input_max, requested_output_min, requested_output_max,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Compute the cumulative product of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumprod, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-// performed instead:
-//
-// ```python
-// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
-// ```
+// Restores tensors from a V2 checkpoint.
 //
-// This is more efficient than using separate `tf.reverse` ops.
+// For backward compatibility with the V1 format, this Op currently allows
+// restoring from a V1 checkpoint as well:
+//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
+//     if found proceed to read it as a V2 checkpoint;
+//   - Otherwise the V1 read path is invoked.
+// Relying on this behavior is not recommended, as the ability to fall back to read
+// V1 might be deprecated and eventually removed.
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
+// By default, restores the named tensors in full.  If the caller wishes to restore
+// specific slices of stored tensors, "shape_and_slices" should be non-empty
+// strings and correspondingly well-formed.
 //
-// ```python
-// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
-// ```
+// Callers must ensure all the named tensors are indeed stored in the checkpoint.
 //
 // Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
+//	tensor_names: shape {N}.  The names of the tensors to be restored.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
+// Empty strings indicate that they are non-partitioned tensors.
+//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
+// those stored in the checkpoint.
+//
+// Returns shape {N}.  The restored tensors, whose shapes are read from the
+// checkpoint directly.
+func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	opspec := tf.OpSpec{
-		Type: "Cumprod",
+		Type: "RestoreV2",
 		Input: []tf.Input{
-			x, axis,
+			prefix, tensor_names, shape_and_slices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
+		scope.UpdateErr("RestoreV2", err)
+		return
+	}
+	return tensors
 }
 
-// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
-type QuantizedMatMulAttr func(optionalAttr)
+// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
+type FIFOQueueV2Attr func(optionalAttr)
 
-// QuantizedMatMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
+// FIFOQueueV2Shapes sets the optional shapes attribute to value.
+//
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["Toutput"] = value
+		m["shapes"] = value
 	}
 }
 
-// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
+// FIFOQueueV2Capacity sets the optional capacity attribute to value.
 //
-// value: If true, `a` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["capacity"] = value
 	}
 }
 
-// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
+// FIFOQueueV2Container sets the optional container attribute to value.
 //
-// value: If true, `b` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["transpose_b"] = value
+		m["container"] = value
 	}
 }
 
-// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
+// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// value: The type of output produced by activation function
-// following this operation.
-// If not specified, defaults to DT_QUINT8
-func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["Tactivation"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
-//
-// The inputs must be two-dimensional matrices and the inner dimension of
-// `a` (after being transposed if `transpose_a` is non-zero) must match the
-// outer dimension of `b` (after being transposed if `transposed_b` is
-// non-zero).
+// A queue that produces elements in first-in first-out order.
 //
 // Arguments:
-//	a: Must be a two-dimensional tensor.
-//	b: Must be a two-dimensional tensor.
-//	min_a: The float value that the lowest quantized `a` value represents.
-//	max_a: The float value that the highest quantized `a` value represents.
-//	min_b: The float value that the lowest quantized `b` value represents.
-//	max_b: The float value that the highest quantized `b` value represents.
+//	component_types: The type of each component in a value.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
+// Returns The handle to the queue.
+func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMatMul",
-		Input: []tf.Input{
-			a, b, min_a, max_a, min_b, max_b,
-		},
+		Type: "FIFOQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Does nothing. Serves as a control trigger for scheduling.
-//
-// Only useful as a placeholder for control edges.
-//
-// Returns the created operation.
-func ControlTrigger(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ControlTrigger",
-	}
-	return scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
-//
-// This op is deprecated. Prefer `tf.nn.batch_normalization`.
-//
-// Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
+// Creates a dataset that contains the elements of `input_dataset` ignoring errors.
+func ExperimentalIgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalization",
+		Type: "ExperimentalIgnoreErrorsDataset",
 		Input: []tf.Input{
-			t, m, v, beta, gamma,
+			input_dataset,
 		},
 		Attrs: attrs,
 	}
@@ -27579,434 +30554,478 @@ func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayReadV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
-func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// Returns 0 if x == 0, and x / y otherwise, elementwise.
+func Xdivy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV2",
+		Type: "Xdivy",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedMulAttr is an optional argument to QuantizedMul.
-type QuantizedMulAttr func(optionalAttr)
-
-// QuantizedMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
-
-// Returns x * y element-wise, working on quantized buffers.
+// Bucketizes 'input' based on 'boundaries'.
 //
-// Arguments:
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
 //
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// Arguments:
+//	input: Any shape of Tensor contains with int or float type.
+//	boundaries: A sorted list of floats gives the boundary of the buckets.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// Returns Same shape with 'input', each value of input replaced with bucket index.
 //
-// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMul",
+		Type: "Bucketize",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// QuantizedAddAttr is an optional argument to QuantizedAdd.
-type QuantizedAddAttr func(optionalAttr)
-
-// QuantizedAddToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
+	return op.Output(0)
 }
 
-// Returns x + y element-wise, working on quantized buffers.
+// Calculates gains for each feature and returns the best possible split information for the feature.
 //
-// Arguments:
+// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
 //
+// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// The length of output lists are all of the same length, `num_features`.
+// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
 //
-// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// Arguments:
+//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
+//	stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+//	l1: l1 regularization factor on leaf weights, per instance based.
+//	l2: l2 regularization factor on leaf weights, per instance based.
+//	tree_complexity: adjustment to the gain, per leaf based.
+//	min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting.
+//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+//
+// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
+func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"max_splits": max_splits}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAdd",
+		Type: "BoostedTreesCalculateBestGainsPerFeature",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Given a quantized tensor described by (input, input_min, input_max), outputs a
-//
-// range that covers the actual values present in that tensor.  This op is
-// typically used to produce the requested_output_min and requested_output_max for
-// Requantize.
-//
-// Arguments:
-//
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//
-// Returns The computed min output.the computed max output.
-func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "RequantizationRange",
-		Input: []tf.Input{
-			input, input_min, input_max,
-		},
+	var idx int
+	var err error
+	if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list
 }
 
-// Rolls the elements of a tensor along an axis.
-//
-// The elements are shifted positively (towards larger indices) by the offset of
-// `shift` along the dimension of `axis`. Negative `shift` values will shift
-// elements in the opposite direction. Elements that roll passed the last position
-// will wrap around to the first and vice versa. Multiple shifts along multiple
-// axes may be specified.
+// EncodePngAttr is an optional argument to EncodePng.
+type EncodePngAttr func(optionalAttr)
+
+// EncodePngCompression sets the optional compression attribute to value.
 //
-// For example:
+// value: Compression level.
+// If not specified, defaults to -1
+func EncodePngCompression(value int64) EncodePngAttr {
+	return func(m optionalAttr) {
+		m["compression"] = value
+	}
+}
+
+// PNG-encode an image.
 //
-// ```
-// # 't' is [0, 1, 2, 3, 4]
-// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
+// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
+// where `channels` is:
 //
-// # shifting along multiple dimensions
-// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
-// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+// *   1: for grayscale.
+// *   2: for grayscale + alpha.
+// *   3: for RGB.
+// *   4: for RGBA.
 //
-// # shifting along the same axis multiple times
-// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
-// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
-// ```
+// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
+// default or a value from 0 to 9.  9 is the highest compression level, generating
+// the smallest output, but is slower.
 //
 // Arguments:
+//	image: 3-D with shape `[height, width, channels]`.
 //
-//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
-// elements are shifted positively (towards larger indices) along the dimension
-// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
-// direction.
-//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
-// `shift[i]` should occur. If the same axis is referenced more than once, the
-// total shift for that axis will be the sum of all the shifts that belong to that
-// axis.
-//
-// Returns Has the same shape and size as the input. The elements are shifted
-// positively (towards larger indices) by the offsets of `shift` along the
-// dimensions of `axis`.
-func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
+// Returns 0-D. PNG-encoded image.
+func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Roll",
+		Type: "EncodePng",
 		Input: []tf.Input{
-			input, shift, axis,
+			image,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Looks up keys in a table, outputs the corresponding values.
+// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
+type QueueDequeueUpToV2Attr func(optionalAttr)
+
+// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// The tensor `keys` must of the same type as the keys of the table.
-// The output `values` is of the type of the table values.
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// The scalar `default_value` is the value output for keys not present in the
-// table. It must also be of the same type as the table values.
+// This operation is not supported by all queues.  If a queue does not support
+// DequeueUpTo, then an Unimplemented error is returned.
 //
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
+// If the queue is closed and there are more than 0 but less than `n`
+// elements remaining, then instead of returning an OutOfRange error like
+// QueueDequeueMany, less than `n` elements are returned immediately.  If
+// the queue is closed and there are 0 elements left in the queue, then
+// an OutOfRange error is returned just like in QueueDequeueMany.
+// Otherwise the behavior is identical to QueueDequeueMany:
 //
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size n in the 0th dimension.
 //
-// Returns Same shape as `keys`.  Values found in the table, or `default_values`
-// for missing keys.
-func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
+//
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableFindV2",
+		Type: "QueueDequeueUpToV2",
 		Input: []tf.Input{
-			table_handle, keys, default_value,
+			handle, n,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueUpToV2", err)
+		return
+	}
+	return components
 }
 
-// Updates the table to associates keys with values.
-//
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
 //
-// Returns the created operation.
-func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableInsertV2",
+		Type: "Maximum",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			x, y,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a dataset that batches and pads `batch_size` elements from the input.
-//
-// Arguments:
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	padded_shapes: A list of int64 tensors representing the desired padded shapes
-// of the corresponding output components. These shapes may be partially
-// specified, using `-1` to indicate that a particular dimension should be
-// padded to the maximum size of all batch elements.
-//	padding_values: A list of scalars containing the padding value to use for
-// each of the outputs.
-//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
-// is smaller than desired.
+// the result here is consistent with a truncating divide. E.g.
+// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
 //
-func PaddedBatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, drop_remainder tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// *NOTE*: `Mod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "PaddedBatchDatasetV2",
+		Type: "Mod",
 		Input: []tf.Input{
-			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values), drop_remainder,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise smallest integer not less than x.
-func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns element-wise remainder of division. This emulates C semantics in that
+//
+// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+// y + truncate_mod(x, y) = x`.
+//
+// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Ceil",
+		Type: "TruncateMod",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the number of elements in the given table.
+// Computes offsets of concat inputs within its output.
+//
+// For example:
+//
+// ```
+// # 'x' is [2, 2, 7]
+// # 'y' is [2, 3, 7]
+// # 'z' is [2, 5, 7]
+// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
+// ```
+//
+// This is typically used by gradient computations for a concat operation.
 //
 // Arguments:
-//	table_handle: Handle to the table.
+//	concat_dim: The dimension along which to concatenate.
+//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
 //
-// Returns Scalar that contains number of elements in the table.
-func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
+// Returns The `N` int32 vectors representing the starting offset
+// of input tensors within the concatenated output.
+func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableSizeV2",
+		Type: "ConcatOffset",
 		Input: []tf.Input{
-			table_handle,
+			concat_dim, tf.OutputList(shape),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
+		scope.UpdateErr("ConcatOffset", err)
+		return
+	}
+	return offset
 }
 
-// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
-type ResizeBilinearGradAttr func(optionalAttr)
+// LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingRMSPropParametersGradAccumDebug.
+type LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr func(optionalAttr)
 
-// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
+// LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["table_id"] = value
 	}
 }
 
-// Computes the gradient of bilinear interpolation.
+// LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load RMSProp embedding parameters with debug support.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+//	parameters: Value of parameters used in the RMSProp optimization algorithm.
+//	ms: Value of ms used in the RMSProp optimization algorithm.
+//	mom: Value of mom used in the RMSProp optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the RMSProp optimization algorithm.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingRMSPropParametersGradAccumDebug(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinearGrad",
+		Type: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug",
 		Input: []tf.Input{
-			grads, original_image,
+			parameters, ms, mom, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Outputs all keys and values in the table.
+// Compute the lower regularized incomplete Gamma function `P(a, x)`.
+//
+// The lower regularized incomplete Gamma function is defined as:
 //
-// Arguments:
-//	table_handle: Handle to the table.
 //
+// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
 //
+// where
 //
-// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
-func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
+// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\)
+//
+// is the lower incomplete Gamma function.
+//
+// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
+// Gamma function.
+func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "LookupTableExportV2",
+		Type: "Igamma",
 		Input: []tf.Input{
-			table_handle,
+			a, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Replaces the contents of the table with the specified keys and values.
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
 //
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
+// The regularized incomplete beta integral is defined as:
 //
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
 //
-// Returns the created operation.
-func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+//
+// where
+//
+//
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+//
+//
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableImportV2",
+		Type: "Betainc",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			a, b, x,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MultiDeviceIteratorFromStringHandleAttr is an optional argument to MultiDeviceIteratorFromStringHandle.
-type MultiDeviceIteratorFromStringHandleAttr func(optionalAttr)
+// ShapeAttr is an optional argument to Shape.
+type ShapeAttr func(optionalAttr)
 
-// MultiDeviceIteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
-//
-// value: The type list for the return values.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func MultiDeviceIteratorFromStringHandleOutputTypes(value []tf.DataType) MultiDeviceIteratorFromStringHandleAttr {
+// ShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeOutType(value tf.DataType) ShapeAttr {
 	return func(m optionalAttr) {
-		m["output_types"] = value
+		m["out_type"] = value
 	}
 }
 
-// MultiDeviceIteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
-//
-// value: The list of shapes being produced.
-// If not specified, defaults to <>
+// Returns the shape of a tensor.
 //
-// REQUIRES: len(value) >= 0
-func MultiDeviceIteratorFromStringHandleOutputShapes(value []tf.Shape) MultiDeviceIteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_shapes"] = value
-	}
-}
-
-// Generates a MultiDeviceIterator resource from its provided string handle.
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-// Arguments:
-//	string_handle: String representing the resource.
+// For example:
 //
-// Returns A MultiDeviceIterator resource.
-func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...MultiDeviceIteratorFromStringHandleAttr) (multi_device_iterator tf.Output) {
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28015,9 +31034,9 @@ func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorFromStringHandle",
+		Type: "Shape",
 		Input: []tf.Input{
-			string_handle,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -28025,162 +31044,72 @@ func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output,
 	return op.Output(0)
 }
 
-// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
-type MutableHashTableV2Attr func(optionalAttr)
-
-// MutableHashTableV2Container sets the optional container attribute to value.
+// Computes fingerprints of the input strings.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
+// Arguments:
+//	input: vector of strings to compute fingerprints on.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Returns a (N,2) shaped matrix where N is the number of elements in the input
+// vector. Each row contains the low and high parts of the fingerprint.
+func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-//
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+	opspec := tf.OpSpec{
+		Type: "SdcaFprint",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates an empty hash table.
-//
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// Computes the power of one value to another.
 //
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
 //
-// Returns Handle to a table.
-func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
+// ```
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MutableHashTableV2",
-
-		Attrs: attrs,
+		Type: "Pow",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DequantizeAttr is an optional argument to Dequantize.
-type DequantizeAttr func(optionalAttr)
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
 
-// DequantizeMode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func DequantizeMode(value string) DequantizeAttr {
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["out_type"] = value
 	}
 }
 
-// Dequantize the 'input' tensor into a float Tensor.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// if T == qint8: in[i] += (range(T) + 1)/ 2.0
-// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// If the input comes from a QuantizedRelu6, the output type is
-// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
-// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
-// Dequantize on quint8 will take each value, cast to float, and multiply
-// by 6 / 255.
-// Note that if quantizedtype is qint8, the operation will additionally add
-// each value by 128 prior to casting.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```c++
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = range / num_discrete_values
-// const double offset_input = static_cast<double>(input) - lowest_quantized;
-// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
-// ```
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (2 * m) / (max_fixed - min_fixed)
-// ```
+// Arguments:
 //
-// Now we can dequantize the elements of our tensor:
-// ```c++
-// result = input * s
-// ```
 //
-// Arguments:
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28189,165 +31118,206 @@ func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Dequantize",
+		Type: "QuantizedReluX",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			features, max_value, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Flips all bits elementwise.
+// Returns the truth value of (x < y) element-wise.
 //
-// The result will have exactly those bits set, that are not set in `x`. The
-// computation is performed on the underlying representation of x.
-func Invert(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Invert",
+		Type: "Less",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserialize bucket boundaries and ready flag into current QuantileAccumulator.
-//
-// An op that deserializes bucket boundaries and are boundaries ready flag into current QuantileAccumulator.
-//
-// Arguments:
-//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
-//	bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
+// RandomPoissonAttr is an optional argument to RandomPoisson.
+type RandomPoissonAttr func(optionalAttr)
+
+// RandomPoissonSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed(value int64) RandomPoissonAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomPoissonSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Use RandomPoissonV2 instead.
 //
-// Returns the created operation.
-func BoostedTreesQuantileStreamResourceDeserialize(scope *Scope, quantile_stream_resource_handle tf.Output, bucket_boundaries []tf.Output) (o *tf.Operation) {
+// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
+func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesQuantileStreamResourceDeserialize",
+		Type: "RandomPoisson",
 		Input: []tf.Input{
-			quantile_stream_resource_handle, tf.OutputList(bucket_boundaries),
+			shape, rate,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Inverse 3D fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform over the
-// inner-most 3 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+// Gets the next output from the given iterator.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifftn with 3 dimensions.
-// @end_compatibility
-func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+// This operation is a synchronous version IteratorGetNext. It should only be used
+// in situations where the iterator does not block the calling thread, or where
+// the calling thread is not a member of the thread pool used to execute parallel
+// operations (e.g. in eager mode).
+func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "IFFT3D",
+		Type: "IteratorGetNextSync",
 		Input: []tf.Input{
-			input,
+			iterator,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNextSync", err)
+		return
+	}
+	return components
 }
 
-// Deprecated. Disallowed in GraphDef version >= 2.
+// Returns the truth value of (x >= y) element-wise.
 //
-// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
-func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrast",
+		Type: "GreaterEqual",
 		Input: []tf.Input{
-			images, contrast_factor, min_value, max_value,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Table initializer that takes two tensors for keys and values respectively.
-//
-// Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	keys: Keys of type Tkey.
-//	values: Values of type Tval.
-//
-// Returns the created operation.
-func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// ApproximateEqualAttr is an optional argument to ApproximateEqual.
+type ApproximateEqualAttr func(optionalAttr)
+
+// ApproximateEqualTolerance sets the optional tolerance attribute to value.
+// If not specified, defaults to 1e-05
+func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+	return func(m optionalAttr) {
+		m["tolerance"] = value
+	}
+}
+
+// Returns the truth value of abs(x-y) < tolerance element-wise.
+func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "InitializeTableV2",
+		Type: "ApproximateEqual",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			x, y,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// PrintAttr is an optional argument to Print.
-type PrintAttr func(optionalAttr)
-
-// PrintMessage sets the optional message attribute to value.
+// Returns the truth value of x OR y element-wise.
 //
-// value: A string, prefix of the error message.
-// If not specified, defaults to ""
-func PrintMessage(value string) PrintAttr {
-	return func(m optionalAttr) {
-		m["message"] = value
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalOr",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// PrintFirstN sets the optional first_n attribute to value.
+// MatMulAttr is an optional argument to MatMul.
+type MatMulAttr func(optionalAttr)
+
+// MatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// value: Only log `first_n` number of times. -1 disables logging.
-// If not specified, defaults to -1
-func PrintFirstN(value int64) PrintAttr {
+// value: If true, "a" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeA(value bool) MatMulAttr {
 	return func(m optionalAttr) {
-		m["first_n"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// PrintSummarize sets the optional summarize attribute to value.
+// MatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// value: Only print this many entries of each tensor.
-// If not specified, defaults to 3
-func PrintSummarize(value int64) PrintAttr {
+// value: If true, "b" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeB(value bool) MatMulAttr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["transpose_b"] = value
 	}
 }
 
-// Prints a list of tensors.
-//
-// Passes `input` through to `output` and prints `data` when evaluating.
+// Multiply the matrix "a" by the matrix "b".
 //
-// Arguments:
-//	input: The tensor passed to `output`
-//	data: A list of tensors to print out when op is evaluated.
+// The inputs must be two-dimensional matrices and the inner dimension of
+// "a" (after being transposed if transpose_a is true) must match the
+// outer dimension of "b" (after being transposed if transposed_b is
+// true).
 //
-// Returns = The unmodified `input` tensor
-func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
+// *Note*: The default kernel implementation for MatMul on GPUs uses
+// cublas.
+func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28356,9 +31326,9 @@ func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Print",
+		Type: "MatMul",
 		Input: []tf.Input{
-			input, tf.OutputList(data),
+			a, b,
 		},
 		Attrs: attrs,
 	}
@@ -28366,79 +31336,97 @@ func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAtt
 	return op.Output(0)
 }
 
-// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
-type QueueEnqueueManyV2Attr func(optionalAttr)
+// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
+type InitializeTableFromTextFileV2Attr func(optionalAttr)
 
-// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
 //
-// value: If the queue is too full, this operation will block for up
-// to timeout_ms milliseconds.
-// Note: This option is not supported yet.
+// value: Number of elements of the file, use -1 if unknown.
 // If not specified, defaults to -1
-func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
+//
+// REQUIRES: value >= -1
+func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["vocab_size"] = value
 	}
 }
 
-// Enqueues zero or more tuples of one or more tensors in the given queue.
+// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
 //
-// This operation slices each component tensor along the 0th dimension to
-// make multiple queue elements. All of the tuple components must have the
-// same size in the 0th dimension.
+// value: Delimiter to separate fields in a line.
+// If not specified, defaults to "\t"
+func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["delimiter"] = value
+	}
+}
+
+// Initializes a table from a text file.
 //
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
+// It inserts one key-value pair into the table for each line of the file.
+// The key and value is extracted from the whole line content, elements from the
+// split line based on `delimiter` or the line number (starting from zero).
+// Where to extract the key and value from a line is specified by `key_index` and
+// `value_index`.
 //
-// N.B. If the queue is full, this operation will block until the given
-// elements have been enqueued (or 'timeout_ms' elapses, if specified).
+// - A value of -1 means use the line number(starting from zero), expects `int64`.
+// - A value of -2 means use the whole line content, expects `string`.
+// - A value >= 0 means use the index (starting at zero) of the split line based
+//   on `delimiter`.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should
-// be taken.
+//	table_handle: Handle to a table which will be initialized.
+//	filename: Filename of a vocabulary text file.
+//	key_index: Column index in a line to get the table `key` values from.
+//	value_index: Column index that represents information of a line to get the table
+// `value` values from.
 //
 // Returns the created operation.
-func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
+func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueManyV2",
+		Type: "InitializeTableFromTextFileV2",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			table_handle, filename,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// PrintV2Attr is an optional argument to PrintV2.
-type PrintV2Attr func(optionalAttr)
+// MeanAttr is an optional argument to Mean.
+type MeanAttr func(optionalAttr)
 
-// PrintV2OutputStream sets the optional output_stream attribute to value.
+// MeanKeepDims sets the optional keep_dims attribute to value.
 //
-// value: A string specifying the output stream or logging level to print to.
-// If not specified, defaults to "stderr"
-func PrintV2OutputStream(value string) PrintV2Attr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MeanKeepDims(value bool) MeanAttr {
 	return func(m optionalAttr) {
-		m["output_stream"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Prints a string scalar.
+// Computes the mean of elements across dimensions of a tensor.
 //
-// Prints a string scalar to the desired output_stream.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	input: The string scalar to print.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns the created operation.
-func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Operation) {
+// Returns The reduced tensor.
+func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28447,53 +31435,98 @@ func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Oper
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PrintV2",
+		Type: "Mean",
 		Input: []tf.Input{
-			input,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
+// ProdAttr is an optional argument to Prod.
+type ProdAttr func(optionalAttr)
+
+// ProdKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func ProdKeepDims(value bool) ProdAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the product of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	tag: A string attached to this summary. Used for organization in TensorBoard.
-//	tensor: A tensor to serialize.
-//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
-// data.
-func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorSummaryV2",
+		Type: "Prod",
 		Input: []tf.Input{
-			tag, tensor, serialized_summary_metadata,
+			input, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
+// ResizeBilinearAttr is an optional argument to ResizeBilinear.
+type ResizeBilinearAttr func(optionalAttr)
+
+// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// Arguments:
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using bilinear interpolation.
 //
-//	buffer_size: The maximum number of elements to buffer in an iterator over
-// this dataset.
+// Input images can be of different types but output images are always float.
 //
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "PrefetchDataset",
+		Type: "ResizeBilinear",
 		Input: []tf.Input{
-			input_dataset, buffer_size,
+			images, size,
 		},
 		Attrs: attrs,
 	}
@@ -28501,48 +31534,33 @@ func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Outpu
 	return op.Output(0)
 }
 
-// TensorSummaryAttr is an optional argument to TensorSummary.
-type TensorSummaryAttr func(optionalAttr)
-
-// TensorSummaryDescription sets the optional description attribute to value.
-//
-// value: A json-encoded SummaryDescription proto.
-// If not specified, defaults to ""
-func TensorSummaryDescription(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["description"] = value
-	}
-}
-
-// TensorSummaryLabels sets the optional labels attribute to value.
-//
-// value: An unused list of strings.
-// If not specified, defaults to <>
-func TensorSummaryLabels(value []string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["labels"] = value
-	}
-}
+// MaxAttr is an optional argument to Max.
+type MaxAttr func(optionalAttr)
 
-// TensorSummaryDisplayName sets the optional display_name attribute to value.
+// MaxKeepDims sets the optional keep_dims attribute to value.
 //
-// value: An unused string.
-// If not specified, defaults to ""
-func TensorSummaryDisplayName(value string) TensorSummaryAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MaxKeepDims(value bool) MaxAttr {
 	return func(m optionalAttr) {
-		m["display_name"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with a tensor.
+// Computes the maximum of elements across dimensions of a tensor.
 //
-// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
-// a tag as well as a serialized SummaryMetadata proto string that contains
-// plugin-specific data. We will keep this op to maintain backwards compatibility.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	tensor: A tensor to serialize.
-func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28551,9 +31569,9 @@ func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorSummary",
+		Type: "Max",
 		Input: []tf.Input{
-			tensor,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -28561,24 +31579,16 @@ func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr
 	return op.Output(0)
 }
 
-// Read an element from the TensorArray into output `value`.
-//
-// Arguments:
-//	handle: The handle to a TensorArray.
-//
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
-//
-// Returns The tensor that is read from the TensorArray.
-func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// Creates a dataset that contains the unique elements of `input_dataset`.
+func ExperimentalUniqueDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV3",
+		Type: "ExperimentalUniqueDataset",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			input_dataset,
 		},
 		Attrs: attrs,
 	}
@@ -28586,340 +31596,477 @@ func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in
 	return op.Output(0)
 }
 
-// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] = max(ref[indices, ...], updates[...])
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
+// ArgMinAttr is an optional argument to ArgMin.
+type ArgMinAttr func(optionalAttr)
+
+// ArgMinOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMinOutputType(value tf.DataType) ArgMinAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
+	}
+}
+
+// Returns the index with the smallest value across dimensions of a tensor.
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions are combined.
+// Arguments:
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ArgMin",
+		Input: []tf.Input{
+			input, dimension,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Converts the quantized `input` tensor into a lower-precision `output`.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// Converts the quantized `input` tensor into a lower-precision `output`, using the
+// output range specified with `requested_output_min` and `requested_output_max`.
+//
+// `[input_min, input_max]` are scalar floats that specify the range for the float
+// interpretation of the `input` data. For example, if `input_min` is -1.0f and
+// `input_max` is 1.0f, and we are dealing with `quint16` quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns the created operation.
-func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	requested_output_min: The float value that the minimum quantized output value represents.
+//	requested_output_max: The float value that the maximum quantized output value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
+func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterMax",
+		Type: "Requantize",
 		Input: []tf.Input{
-			resource, indices, updates,
+			input, input_min, input_max, requested_output_min, requested_output_max,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the gradient for the tanh of `x` wrt its input.
+// Creates a dataset that emits the lines of one or more text files.
 //
-// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-// is the corresponding input gradient.
-func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Arguments:
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar containing the number of bytes to buffer.
+func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TanhGrad",
+		Type: "TextLineDataset",
 		Input: []tf.Input{
-			y, dy,
+			filenames, compression_type, buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with scalar values.
+// Computes the sum along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \sum_j data_j\\) where sum is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// # ==> [[5, 5, 5, 5],
+// #      [5, 6, 7, 8]]
+// ```
 //
-// The input `tags` and `values` must have the same shape.  The generated summary
-// has a summary value for each tag-value pair in `tags` and `values`.
 //
 // Arguments:
-//	tags: Tags for the summary.
-//	values: Same shape as `tags.  Values for the summary.
 //
-// Returns Scalar.  Serialized `Summary` protocol buffer.
-func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ScalarSummary",
+		Type: "SegmentSum",
 		Input: []tf.Input{
-			tags, values,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with a histogram.
+// Computes the mean along segments of a tensor.
 //
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
+// over `j` such that `segment_ids[j] == i` and `N` is the total number of
+// values summed.
+//
+// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1.0,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_mean(c, tf.constant([0, 0, 1]))
+// # ==> [[2.5, 2.5, 2.5, 2.5],
+// #      [5, 6, 7, 8]]
+// ```
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
 //
 // Arguments:
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "HistogramSummary",
+		Type: "SegmentMean",
 		Input: []tf.Input{
-			tag, values,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the number of elements in the given queue.
+// Computes the minimum along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_min(c, tf.constant([0, 0, 1]))
+// # ==> [[1, 2, 2, 1],
+// #      [5, 6, 7, 8]]
+// ```
 //
 // Arguments:
-//	handle: The handle to a queue.
 //
-// Returns The number of elements in the given queue.
-func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueSizeV2",
+		Type: "SegmentMin",
 		Input: []tf.Input{
-			handle,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImageSummaryAttr is an optional argument to ImageSummary.
-type ImageSummaryAttr func(optionalAttr)
-
-// ImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_images"] = value
-	}
-}
-
-// ImageSummaryBadColor sets the optional bad_color attribute to value.
+// Computes the sum along segments of a tensor.
 //
-// value: Color to use for pixels with non-finite values.
-// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
-func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["bad_color"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with images.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
+// Computes a tensor such that
+// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
 //
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// If the given segment ID `i` is negative, the value is dropped and will not be
+// added to the sum of the segment.
 //
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
+// `num_segments` should equal the number of distinct segment IDs.
 //
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
 //
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 5,  5, 5, 5],
+// #       [5,  6, 7, 8]]
+// ```
 //
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
 //
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+// Arguments:
 //
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
 //
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ImageSummary",
+		Type: "UnsortedSegmentSum",
 		Input: []tf.Input{
-			tag, tensor,
+			data, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
-type AudioSummaryV2Attr func(optionalAttr)
-
-// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
+// Computes the product along segments of a tensor.
 //
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
-// REQUIRES: value >= 1
-func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with audio.
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the product of all
+// entries belonging to a segment such that:
 //
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+// \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
+// `j...` such that `segment_ids[j...] == i`.
 //
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// For example:
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_prod(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 4,  6, 6, 4],
+// #       [5,  6, 7, 8]]
+// ```
+//
+// If there is no entry for a given segment ID `i`, it outputs 1.
+//
+// If the given segment ID `i` is negative, then the corresponding value is
+// dropped, and will not be included in the result.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentProd",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes hyperbolic cosine of x element-wise.
+func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummaryV2",
+		Type: "Cosh",
 		Input: []tf.Input{
-			tag, tensor, sample_rate,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Splits a tensor into a list.
+// Computes the mean along sparse segments of a tensor.
 //
-// list[i] corresponds to lengths[i] tensors from the input tensor.
-// The tensor must have rank at least 1 and contain exactly sum(lengths) elements.
+// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
-// tensor: The input tensor.
-// element_shape: A shape compatible with that of elements in the tensor.
-// lengths: Vector of sizes of the 0th dimension of tensors in the list.
-// output_handle: The list.
-func TensorListSplit(scope *Scope, tensor tf.Output, element_shape tf.Output, lengths tf.Output) (output_handle tf.Output) {
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which has size
+// `num_segments`.
+func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListSplit",
+		Type: "SparseSegmentMeanWithNumSegments",
 		Input: []tf.Input{
-			tensor, element_shape, lengths,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AvgPoolAttr is an optional argument to AvgPool.
-type AvgPoolAttr func(optionalAttr)
+// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
+type CudnnRNNParamsSizeAttr func(optionalAttr)
 
-// AvgPoolDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolDataFormat(value string) AvgPoolAttr {
+// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["rnn_mode"] = value
 	}
 }
 
-// Performs average pooling on the input.
-//
-// Each entry in `output` is the mean of the corresponding size `ksize`
-// window in `value`.
+// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes size of weights that can be used by a Cudnn RNN model.
 //
-// Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	ksize: The size of the sliding window for each dimension of `value`.
-//	strides: The stride of the sliding window for each dimension of `value`.
-//	padding: The type of padding algorithm to use.
+// Return the params size that can be used by the Cudnn RNN model. Subsequent
+// weight allocation and initialization should use this size.
 //
-// Returns The average pooled output tensor.
-func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   The actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//   dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// params_size: The size of the params buffer that should be allocated and
+//   initialized for this RNN model. Note that this params buffer may not be
+//   compatible across GPUs. Please use CudnnRNNParamsWeights and
+//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
+//   across different runs.
+func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"T": T, "S": S}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool",
+		Type: "CudnnRNNParamsSize",
 		Input: []tf.Input{
-			value,
+			num_layers, num_units, input_size,
 		},
 		Attrs: attrs,
 	}
@@ -28927,175 +32074,195 @@ func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padd
 	return op.Output(0)
 }
 
-// Merges summaries.
-//
-// This op creates a
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// protocol buffer that contains the union of all the values in the input
-// summaries.
+// Computes gradients for SparseSegmentMean.
 //
-// When the Op is run, it reports an `InvalidArgument` error if multiple values
-// in the summaries to merge use the same tag.
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
 // Arguments:
-//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
-// buffers.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MergeSummary",
+		Type: "SparseSegmentMeanGrad",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			grad, indices, segment_ids, output_dim0,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// The shape of the elements of the given list, as a tensor.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-//   input_handle: the list
-//   element_shape: the shape of elements of the list
-func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
+// N is the size of the segment being reduced.
+//
+// See `tf.sparse.segment_sum` for usage examples.
+//
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape_type": shape_type}
 	opspec := tf.OpSpec{
-		Type: "TensorListElementShape",
+		Type: "SparseSegmentSqrtN",
 		Input: []tf.Input{
-			input_handle,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the item in the list with the given index.
+// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
 //
-// input_handle: the list
-// index: the position in the list from which an element will be retrieved
-// item: the element at that position
+// The upper regularized incomplete Gamma function is defined as:
+//
+// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
 //
+// where
 //
-func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_shape tf.Output, element_dtype tf.DataType) (item tf.Output) {
+// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+//
+// is the upper incomplete Gama function.
+//
+// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
+// Gamma function.
+func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorListGetItem",
+		Type: "Igammac",
 		Input: []tf.Input{
-			input_handle, index, element_shape,
+			a, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Resizes the list.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+//
+// N is the size of the segment being reduced.
+//
+// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
+// Arguments:
 //
-// input_handle: the input list
-// size: size of the output list
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
 //
-func TensorListResize(scope *Scope, input_handle tf.Output, size tf.Output) (output_handle tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListResize",
+		Type: "SparseSegmentSqrtNWithNumSegments",
 		Input: []tf.Input{
-			input_handle, size,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a diagonal tensor with a given diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
-//
-// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
-// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
-//
-// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
-//
-// For example:
+// Computes gradients for SparseSegmentSqrtN.
 //
-// ```
-// # 'diagonal' is [1, 2, 3, 4]
-// tf.diag(diagonal) ==> [[1, 0, 0, 0]
-//                        [0, 2, 0, 0]
-//                        [0, 0, 3, 0]
-//                        [0, 0, 0, 4]]
-// ```
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
 // Arguments:
-//	diagonal: Rank k tensor where k is at most 1.
-func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+//	grad: gradient propagated to the SparseSegmentSqrtN op.
+//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
+func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Diag",
+		Type: "SparseSegmentSqrtNGrad",
 		Input: []tf.Input{
-			diagonal,
+			grad, indices, segment_ids, output_dim0,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
-type ParameterizedTruncatedNormalAttr func(optionalAttr)
+// LRNGradAttr is an optional argument to LRNGrad.
+type LRNGradAttr func(optionalAttr)
 
-// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
+// LRNGradDepthRadius sets the optional depth_radius attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
+// value: A depth radius.
+// If not specified, defaults to 5
+func LRNGradDepthRadius(value int64) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["depth_radius"] = value
 	}
 }
 
-// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// LRNGradBias sets the optional bias attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
+// value: An offset (usually > 0 to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNGradBias(value float32) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["bias"] = value
 	}
 }
 
-// Outputs random values from a normal distribution. The parameters may each be a
+// LRNGradAlpha sets the optional alpha attribute to value.
 //
-// scalar which applies to the entire output, or a vector of length shape[0] which
-// stores the parameters for each batch.
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNGradAlpha(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNGradBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNGradBeta(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Gradients for Local Response Normalization.
 //
 // Arguments:
-//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
-//	means: The mean parameter of each batch.
-//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
-//	minvals: The minimum cutoff. May be -infinity.
-//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
-// for each batch.
+//	input_grads: 4-D with shape `[batch, height, width, channels]`.
+//	input_image: 4-D with shape `[batch, height, width, channels]`.
+//	output_image: 4-D with shape `[batch, height, width, channels]`.
 //
-// Returns A matrix of shape num_batches x samples_per_batch, filled with random
-// truncated normal values using the parameters for each row.
-func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
+// Returns The gradients for LRN.
+func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29104,9 +32271,9 @@ func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParameterizedTruncatedNormal",
+		Type: "LRNGrad",
 		Input: []tf.Input{
-			shape, means, stdevs, minvals, maxvals,
+			input_grads, input_image, output_image,
 		},
 		Attrs: attrs,
 	}
@@ -29114,44 +32281,44 @@ func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output
 	return op.Output(0)
 }
 
-// Sets the index-th position of the list to contain the given tensor.
-//
-// input_handle: the list
-// index: the position in the list to which the tensor will be assigned
-// item: the element to be assigned to that position
-// output_handle: the new list, with the element in the proper position
+// AnyAttr is an optional argument to Any.
+type AnyAttr func(optionalAttr)
+
+// AnyKeepDims sets the optional keep_dims attribute to value.
 //
-func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListSetItem",
-		Input: []tf.Input{
-			input_handle, index, item,
-		},
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AnyKeepDims(value bool) AnyAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a Tensor by indexing into the TensorList.
+// Computes the "logical or" of elements across dimensions of a tensor.
 //
-// Each row in the produced Tensor corresponds to the element in the TensorList
-// specified by the given index (see `tf.gather`).
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
-// input_handle: The input tensor list.
-// indices: The indices used to index into the list.
-// values: The tensor.
-func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_shape tf.Output, element_dtype tf.DataType) (values tf.Output) {
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorListGather",
+		Type: "Any",
 		Input: []tf.Input{
-			input_handle, indices, element_shape,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -29159,109 +32326,107 @@ func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, e
 	return op.Output(0)
 }
 
-// Creates a TensorList by indexing into a Tensor.
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
+type DestroyResourceOpAttr func(optionalAttr)
+
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
 //
-// Each member of the TensorList corresponds to one row of the input tensor,
-// specified by the given index (see `tf.gather`).
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
+	return func(m optionalAttr) {
+		m["ignore_lookup_error"] = value
+	}
+}
+
+// Deletes the resource specified by the handle.
 //
-// tensor: The input tensor.
-// indices: The indices used to index into the list.
-// element_shape: The shape of the elements in the list (can be less specified than
-//   the shape of the tensor).
-// output_handle: The TensorList.
-func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+// All subsequent operations using the resource will result in a NotFound
+// error status.
+//
+// Arguments:
+//	resource: handle to the resource to delete.
+//
+// Returns the created operation.
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorListScatter",
+		Type: "DestroyResourceOp",
 		Input: []tf.Input{
-			tensor, indices, element_shape,
+			resource,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Deprecated. Use TensorArrayScatterV3
+// Generates values in an interval.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
-func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
+// so that the last one is exactly `stop`.
+//
+// For example:
+//
+// ```
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```
+//
+// Arguments:
+//	start: 0-D tensor. First entry in the range.
+//	stop: 0-D tensor. Last entry in the range.
+//	num: 0-D tensor. Number of values to generate.
+//
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV2",
+		Type: "LinSpace",
 		Input: []tf.Input{
-			handle, indices, value, flow_in,
+			start, stop, num,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AsStringAttr is an optional argument to AsString.
-type AsStringAttr func(optionalAttr)
-
-// AsStringPrecision sets the optional precision attribute to value.
-//
-// value: The post-decimal precision to use for floating point numbers.
-// Only used if precision > -1.
-// If not specified, defaults to -1
-func AsStringPrecision(value int64) AsStringAttr {
-	return func(m optionalAttr) {
-		m["precision"] = value
-	}
-}
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
 
-// AsStringScientific sets the optional scientific attribute to value.
-//
-// value: Use scientific notation for floating point numbers.
-// If not specified, defaults to false
-func AsStringScientific(value bool) AsStringAttr {
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
 	return func(m optionalAttr) {
-		m["scientific"] = value
+		m["Tout"] = value
 	}
 }
 
-// AsStringShortest sets the optional shortest attribute to value.
+// Converts two real numbers to a complex number.
 //
-// value: Use shortest representation (either scientific or standard) for
-// floating point numbers.
-// If not specified, defaults to false
-func AsStringShortest(value bool) AsStringAttr {
-	return func(m optionalAttr) {
-		m["shortest"] = value
-	}
-}
-
-// AsStringWidth sets the optional width attribute to value.
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
 //
-// value: Pad pre-decimal numbers to this width.
-// Applies to both floating point and integer numbers.
-// Only used if width > -1.
-// If not specified, defaults to -1
-func AsStringWidth(value int64) AsStringAttr {
-	return func(m optionalAttr) {
-		m["width"] = value
-	}
-}
-
-// AsStringFill sets the optional fill attribute to value.
+// The input tensors `real` and `imag` must have the same shape.
 //
-// value: The value to pad if width > -1.  If empty, pads with spaces.
-// Another typical value is '0'.  String cannot be longer than 1 character.
-// If not specified, defaults to ""
-func AsStringFill(value string) AsStringAttr {
-	return func(m optionalAttr) {
-		m["fill"] = value
-	}
-}
-
-// Converts each entry in the given tensor to strings.  Supports many numeric
+// For example:
 //
-// types and boolean.
-func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29270,9 +32435,9 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AsString",
+		Type: "Complex",
 		Input: []tf.Input{
-			input,
+			real, imag,
 		},
 		Attrs: attrs,
 	}
@@ -29280,216 +32445,212 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 	return op.Output(0)
 }
 
-// Returns a `RaggedTensor` containing the specified sequences of numbers.
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
+
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Returns the imaginary part of a complex number.
 //
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
 //
-// Returns a `RaggedTensor` `result` composed from `rt_dense_values` and
-// `rt_nested_splits`, such that
-// `result[i] = range(starts[i], limits[i], deltas[i])`.
+// For example:
 //
-// ```python
-// >>> (rt_nested_splits, rt_dense_values) = gen_ragged_ops.ragged_range(
-// ...     starts=[2, 5, 8], limits=[3, 5, 12], deltas=1)
-// >>> result = ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
-// >>> print result.eval().tolist()
-// [[2],               # result[0] = range(2, 3)
-//  [],                # result[1] = range(5, 5)
-//  [8, 9, 10, 11]]    # result[2] = range(8, 12)
 // ```
-//
-// The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors.
-// The vector inputs must all have the same size.  Scalar inputs are broadcast
-// to match the size of the vector inputs.
-//
-// Arguments:
-//	starts: The starts of each range.
-//	limits: The limits of each range.
-//	deltas: The deltas of each range.
-//
-// Returns The `row_splits` for the returned `RaggedTensor`.The `flat_values` for the returned `RaggedTensor`.
-func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output) (rt_nested_splits tf.Output, rt_dense_values tf.Output) {
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RaggedRange",
+		Type: "Imag",
 		Input: []tf.Input{
-			starts, limits, deltas,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Deprecated, use python implementation tf.linalg.matrix_exponential.
-//
-// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead.
-func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixExponential",
+		Type: "Tanh",
 		Input: []tf.Input{
-			input,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
-type QueueDequeueUpToV2Attr func(optionalAttr)
-
-// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
+// Computes the maximum along segments of a tensor.
 //
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Dequeues `n` tuples of one or more tensors from the given queue.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
-// This operation is not supported by all queues.  If a queue does not support
-// DequeueUpTo, then an Unimplemented error is returned.
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
 //
-// If the queue is closed and there are more than 0 but less than `n`
-// elements remaining, then instead of returning an OutOfRange error like
-// QueueDequeueMany, less than `n` elements are returned immediately.  If
-// the queue is closed and there are 0 elements left in the queue, then
-// an OutOfRange error is returned just like in QueueDequeueMany.
-// Otherwise the behavior is identical to QueueDequeueMany:
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
 //
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size n in the 0th dimension.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_max(c, tf.constant([0, 0, 1]))
+// # ==> [[4, 3, 3, 4],
+// #      [5, 6, 7, 8]]
+// ```
 //
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueUpToV2",
+		Type: "SegmentMax",
 		Input: []tf.Input{
-			handle, n,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueUpToV2", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// Computes the Cholesky decomposition of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices.
-//
-// The input has to be symmetric and positive definite. Only the lower-triangular
-// part of the input will be used for this operation. The upper-triangular part
-// will not be read.
+// Creates a dataset that skips `count` elements from the `input_dataset`.
 //
-// The output is a tensor of the same shape as the input
-// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+// Arguments:
 //
-// **Note**: The gradient computation on GPU is faster for large matrices but
-// not for large batch dimensions when the submatrices are small. In this
-// case it might be faster to use the CPU.
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
 //
-// Arguments:
-//	input: Shape is `[..., M, M]`.
 //
-// Returns Shape is `[..., M, M]`.
-func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Cholesky",
+		Type: "SkipDataset",
 		Input: []tf.Input{
-			input,
+			input_dataset, count,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Writes contents to the file at input filename. Creates file and recursively
-//
-// creates directory if not existing.
+// VarHandleOpAttr is an optional argument to VarHandleOp.
+type VarHandleOpAttr func(optionalAttr)
+
+// VarHandleOpContainer sets the optional container attribute to value.
 //
-// Arguments:
-//	filename: scalar. The name of the file to which we write the contents.
-//	contents: scalar. The content to be written to the output file.
+// value: the container this variable is placed in.
+// If not specified, defaults to ""
+func VarHandleOpContainer(value string) VarHandleOpAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// VarHandleOpSharedName sets the optional shared_name attribute to value.
 //
-// Returns the created operation.
-func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
+// value: the name by which this variable is referred to.
+// If not specified, defaults to ""
+func VarHandleOpSharedName(value string) VarHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a Variable resource.
+//
+// Arguments:
+//	dtype: the type of this variable. Must agree with the dtypes
+// of all ops using this variable.
+//	shape: The (possibly partially specified) shape of this variable.
+func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "WriteFile",
-		Input: []tf.Input{
-			filename, contents,
-		},
+		Type: "VarHandleOp",
+
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AllAttr is an optional argument to All.
-type AllAttr func(optionalAttr)
+// AngleAttr is an optional argument to Angle.
+type AngleAttr func(optionalAttr)
 
-// AllKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AllKeepDims(value bool) AllAttr {
+// AngleTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func AngleTout(value tf.DataType) AngleAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["Tout"] = value
 	}
 }
 
-// Computes the "logical and" of elements across dimensions of a tensor.
+// Returns the argument of a complex number.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the argument of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part.
 //
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+// The argument returned by this operation is of the form \\(atan2(b, a)\\).
 //
-// Returns The reduced tensor.
-func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.angle(input) ==> [2.0132, 1.056]
+// ```
+//
+// @compatibility(numpy)
+// Equivalent to np.angle.
+// @end_compatibility
+func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29498,9 +32659,9 @@ func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "All",
+		Type: "Angle",
 		Input: []tf.Input{
-			input, axis,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -29508,90 +32669,129 @@ func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (ou
 	return op.Output(0)
 }
 
-// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
-//
-// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices, with the same constraints as the single matrix
-// SelfAdjointEig.
+// Clips tensor values to a specified min and max.
 //
-// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
-// are sorted in non-decreasing order.
+// Given a tensor `t`, this operation returns a tensor of the same type and
+// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+// greater than `clip_value_max` are set to `clip_value_max`.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	t: A `Tensor`.
+//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The minimum value to clip by.
+//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The maximum value to clip by.
 //
-// Returns Shape is `[..., M+1, M]`.
-func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A clipped `Tensor` with the same shape as input 't'.
+func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEig",
+		Type: "ClipByValue",
 		Input: []tf.Input{
-			input,
+			t, clip_value_min, clip_value_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softplus gradients for a softplus operation.
+// Counts the number of occurrences of each value in an integer array.
+//
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
+//
+// Values in `arr` outside of the range [0, size) are ignored.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding softplus operation.
-//	features: The features passed as input to the corresponding softplus operation.
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
 //
-// Returns The gradients: `gradients / (1 + exp(-features))`.
-func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SoftplusGrad",
+		Type: "Bincount",
 		Input: []tf.Input{
-			gradients, features,
+			arr, size, weights,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
-type SelfAdjointEigV2Attr func(optionalAttr)
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
 
-// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
+// CumsumExclusive sets the optional exclusive attribute to value.
 //
-// value: If `True` then eigenvectors will be computed and returned in `v`.
-// Otherwise, only the eigenvalues will be computed.
-// If not specified, defaults to true
-func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
+// value: If `True`, perform exclusive cumsum.
+// If not specified, defaults to false
+func CumsumExclusive(value bool) CumsumAttr {
 	return func(m optionalAttr) {
-		m["compute_v"] = value
+		m["exclusive"] = value
 	}
 }
 
-// Computes the eigen decomposition of one or more square self-adjoint matrices.
+// CumsumReverse sets the optional reverse attribute to value.
 //
-// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
-// are sorted in non-decreasing order.
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative sum of the tensor `x` along `axis`.
+//
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
 //
 // ```python
-// # a is a tensor.
-// # e is a tensor of eigenvalues.
-// # v is a tensor of eigenvectors.
-// e, v = self_adjoint_eig(a)
-// e = self_adjoint_eig(a, compute_v=False)
+// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
 // ```
 //
-// Arguments:
-//	input: `Tensor` input of shape `[N, N]`.
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
 //
-// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
-func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
+// ```python
+// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29600,73 +32800,67 @@ func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEigV2",
+		Type: "Cumsum",
 		Input: []tf.Input{
-			input,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Adjust the saturation of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
-//
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A scale is then applied all the saturation
-// values, and then remapped back to RGB colorspace.
-//
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	scale: A float scale to add to the saturation.
+// Return the shape of s0 op s1 with broadcast.
 //
-// Returns The hue-adjusted image or images.
-func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
+// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
+// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
+func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustSaturation",
+		Type: "BroadcastArgs",
 		Input: []tf.Input{
-			images, scale,
+			s0, s1,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveAttr is an optional argument to MatrixSolve.
-type MatrixSolveAttr func(optionalAttr)
+// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
+type DataFormatDimMapAttr func(optionalAttr)
 
-// MatrixSolveAdjoint sets the optional adjoint attribute to value.
+// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-// adjoint.
-// If not specified, defaults to false
-func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["src_format"] = value
 	}
 }
 
-// Solves systems of linear equations.
+// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
 //
-// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `True` then each output matrix satisfies
-// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
+}
+
+// Returns the dimension index in the destination data format given the one in
+//
+// the source data format.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	x: A Tensor with each element as a dimension index in source data format.
+// Must be in the range [-4, 4).
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
+// Returns A Tensor with each element as a dimension index in destination data format.
+func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29675,9 +32869,9 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolve",
+		Type: "DataFormatDimMap",
 		Input: []tf.Input{
-			matrix, rhs,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -29685,49 +32879,67 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 	return op.Output(0)
 }
 
-// ResourceApplyKerasMomentumAttr is an optional argument to ResourceApplyKerasMomentum.
-type ResourceApplyKerasMomentumAttr func(optionalAttr)
+// CumprodAttr is an optional argument to Cumprod.
+type CumprodAttr func(optionalAttr)
 
-// ResourceApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
+// CumprodExclusive sets the optional exclusive attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If `True`, perform exclusive cumprod.
 // If not specified, defaults to false
-func ResourceApplyKerasMomentumUseLocking(value bool) ResourceApplyKerasMomentumAttr {
+func CumprodExclusive(value bool) CumprodAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["exclusive"] = value
 	}
 }
 
-// ResourceApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
+// CumprodReverse sets the optional reverse attribute to value.
 //
-// value: If `True`, the tensor passed to compute grad will be
-// var + momentum * accum, so in the end, the var you get is actually
-// var + momentum * accum.
+// value: A `bool` (default: False).
 // If not specified, defaults to false
-func ResourceApplyKerasMomentumUseNesterov(value bool) ResourceApplyKerasMomentumAttr {
+func CumprodReverse(value bool) CumprodAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["reverse"] = value
 	}
 }
 
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
+// Compute the cumulative product of the tensor `x` along `axis`.
 //
-// want to use Nesterov momentum.
+// By default, this op performs an inclusive cumprod, which means that the first
+// element of the input is identical to the first element of the output:
 //
-// accum = accum * momentum - lr * grad
-// var += accum
+// ```python
+// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
+// ```
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
+// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+// performed instead:
 //
-// Returns the created operation.
-func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyKerasMomentumAttr) (o *tf.Operation) {
+// ```python
+// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29736,83 +32948,121 @@ func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, l
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyKerasMomentum",
+		Type: "Cumprod",
 		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns a serialized GraphDef representing `input_dataset`.
+// RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr is an optional argument to RetrieveTPUEmbeddingStochasticGradientDescentParameters.
+type RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingStochasticGradientDescentParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// Returns a graph representation for `input_dataset`.
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingStochasticGradientDescentParametersTableId(value int64) RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingStochasticGradientDescentParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingStochasticGradientDescentParametersTableName(value string) RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve SGD embedding parameters.
 //
-// Arguments:
-//	input_dataset: A variant tensor representing the dataset to return the graph representation for.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns The graph representation of the dataset (as serialized GraphDef).
-func DatasetToGraph(scope *Scope, input_dataset tf.Output) (graph tf.Output) {
+// Returns Parameter parameters updated by the stochastic gradient descent optimization algorithm.
+func RetrieveTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr) (parameters tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DatasetToGraph",
-		Input: []tf.Input{
-			input_dataset,
-		},
+		Type: "RetrieveTPUEmbeddingStochasticGradientDescentParameters",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LuAttr is an optional argument to Lu.
-type LuAttr func(optionalAttr)
+// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
+type QuantizedMatMulAttr func(optionalAttr)
 
-// LuOutputIdxType sets the optional output_idx_type attribute to value.
-// If not specified, defaults to DT_INT32
-func LuOutputIdxType(value tf.DataType) LuAttr {
+// QuantizedMatMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
 	return func(m optionalAttr) {
-		m["output_idx_type"] = value
+		m["Toutput"] = value
 	}
 }
 
-// Computes the LU decomposition of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices.
+// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// The input has to be invertible.
+// value: If true, `a` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// The output consists of two tensors LU and P containing the LU decomposition
-// of all input submatrices `[..., :, :]`. LU encodes the lower triangular and
-// upper triangular factors.
+// value: If true, `b` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
 //
-// For each input submatrix of shape `[M, M]`, L is a lower triangular matrix of
-// shape `[M, M]` with unit diagonal whose entries correspond to the strictly lower
-// triangular part of LU. U is a upper triangular matrix of shape `[M, M]` whose
-// entries correspond to the upper triangular part, including the diagonal, of LU.
+// value: The type of output produced by activation function
+// following this operation.
+// If not specified, defaults to DT_QUINT8
+func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Tactivation"] = value
+	}
+}
+
+// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
 //
-// P represents a permutation matrix encoded as a list of indices each between `0`
-// and `M-1`, inclusive. If P_mat denotes the permutation matrix corresponding to
-// P, then the L, U and P satisfies P_mat * input = L * U.
+// The inputs must be two-dimensional matrices and the inner dimension of
+// `a` (after being transposed if `transpose_a` is non-zero) must match the
+// outer dimension of `b` (after being transposed if `transposed_b` is
+// non-zero).
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form matrices of
-// size `[M, M]`.
+//	a: Must be a two-dimensional tensor.
+//	b: Must be a two-dimensional tensor.
+//	min_a: The float value that the lowest quantized `a` value represents.
+//	max_a: The float value that the highest quantized `a` value represents.
+//	min_b: The float value that the lowest quantized `b` value represents.
+//	max_b: The float value that the highest quantized `b` value represents.
 //
-// Returns A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the
-// lower triangular factor `L` with unit diagonal, and whose upper triangular part
-// denotes the upper triangular factor `U`.Permutation of the rows encoded as a list of indices in `0..M-1`. Shape is
-// `[..., M]`.
-// @compatibility(scipy)
-// Similar to `scipy.linalg.lu`, except the triangular factors `L` and `U` are
-// packed into a single tensor, the permutation is applied to `input` instead of
-// the right hand side and the permutation `P` is returned as a list of indices
-// instead of a permutation matrix.
-// @end_compatibility
-func Lu(scope *Scope, input tf.Output, optional ...LuAttr) (lu tf.Output, p tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29821,61 +33071,86 @@ func Lu(scope *Scope, input tf.Output, optional ...LuAttr) (lu tf.Output, p tf.O
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Lu",
+		Type: "QuantizedMatMul",
 		Input: []tf.Input{
-			input,
+			a, b, min_a, max_a, min_b, max_b,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Deprecated. Use TensorArrayCloseV3
+// QuantizedMulAttr is an optional argument to QuantizedMul.
+type QuantizedMulAttr func(optionalAttr)
+
+// QuantizedMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
+	}
+}
+
+// Returns x * y element-wise, working on quantized buffers.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
+// Arguments:
 //
-// Returns the created operation.
-func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+//
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV2",
+		Type: "QuantizedMul",
 		Input: []tf.Input{
-			handle,
+			x, y, min_x, max_x, min_y, max_y,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// EncodeBase64Attr is an optional argument to EncodeBase64.
-type EncodeBase64Attr func(optionalAttr)
+// QuantizedAddAttr is an optional argument to QuantizedAdd.
+type QuantizedAddAttr func(optionalAttr)
 
-// EncodeBase64Pad sets the optional pad attribute to value.
-//
-// value: Bool whether padding is applied at the ends.
-// If not specified, defaults to false
-func EncodeBase64Pad(value bool) EncodeBase64Attr {
+// QuantizedAddToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
 	return func(m optionalAttr) {
-		m["pad"] = value
+		m["Toutput"] = value
 	}
 }
 
-// Encode strings into web-safe base64 format.
+// Returns x + y element-wise, working on quantized buffers.
 //
-// Refer to the following article for more information on base64 format:
-// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-// end so that the encoded has length multiple of 4. See Padding section of the
-// link above.
+// Arguments:
 //
-// Web-safe means that the encoder uses - and _ instead of + and /.
 //
-// Arguments:
-//	input: Strings to be encoded.
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
 //
-// Returns Input strings encoded in base64.
-func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29884,263 +33159,256 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeBase64",
+		Type: "QuantizedAdd",
 		Input: []tf.Input{
-			input,
+			x, y, min_x, max_x, min_y, max_y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Scatters tensor at indices in an input list.
+//
+// Each member of the TensorList corresponds to one row of the input tensor,
+// specified by the given index (see `tf.gather`).
+//
+// input_handle: The list to scatter into.
+// tensor: The input tensor.
+// indices: The indices used to index into the list.
+// output_handle: The TensorList.
+func TensorListScatterIntoExistingList(scope *Scope, input_handle tf.Output, tensor tf.Output, indices tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListScatterIntoExistingList",
+		Input: []tf.Input{
+			input_handle, tensor, indices,
+		},
+	}
+	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// A dataset that creates window datasets from the input dataset.
+// Computes a range that covers the actual values present in a quantized tensor.
 //
-// Arguments:
+// Given a quantized tensor described by `(input, input_min, input_max)`, outputs a
+// range that covers the actual values present in that tensor. This op is typically
+// used to produce the `requested_output_min` and `requested_output_max` for
+// `Requantize`.
 //
-//	size: A scalar representing the number of elements to accumulate in a window.
-//	shift: A scalar representing the steps moving the sliding window forward in one
-// iteration. It must be positive.
-//	stride: A scalar representing the stride of the input elements of the sliding window.
-// It must be positive.
-//	drop_remainder: A scalar representing whether a window should be dropped in case its size is
-// smaller than desired.
+// Arguments:
 //
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
 //
-func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns The computed min output.the computed max output.
+func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "WindowDataset",
+		Type: "RequantizationRange",
 		Input: []tf.Input{
-			input_dataset, size, shift, stride, drop_remainder,
+			input, input_min, input_max,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes the matrix square root of one or more square matrices:
+// Rolls the elements of a tensor along an axis.
 //
-// matmul(sqrtm(A), sqrtm(A)) = A
+// The elements are shifted positively (towards larger indices) by the offset of
+// `shift` along the dimension of `axis`. Negative `shift` values will shift
+// elements in the opposite direction. Elements that roll passed the last position
+// will wrap around to the first and vice versa. Multiple shifts along multiple
+// axes may be specified.
 //
-// The input matrix should be invertible. If the input matrix is real, it should
-// have no eigenvalues which are real and negative (pairs of complex conjugate
-// eigenvalues are allowed).
+// For example:
 //
-// The matrix square root is computed by first reducing the matrix to
-// quasi-triangular form with the real Schur decomposition. The square root
-// of the quasi-triangular matrix is then computed directly. Details of
-// the algorithm can be found in: Nicholas J. Higham, "Computing real
-// square roots of a real matrix", Linear Algebra Appl., 1987.
+// ```
+// # 't' is [0, 1, 2, 3, 4]
+// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the matrix square root for all input submatrices `[..., :, :]`.
+// # shifting along multiple dimensions
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+//
+// # shifting along the same axis multiple times
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
+// ```
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
 //
-// Returns Shape is `[..., M, M]`.
+//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
+// elements are shifted positively (towards larger indices) along the dimension
+// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
+// direction.
+//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
+// `shift[i]` should occur. If the same axis is referenced more than once, the
+// total shift for that axis will be the sum of all the shifts that belong to that
+// axis.
 //
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.sqrtm
-// @end_compatibility
-func MatrixSquareRoot(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Has the same shape and size as the input. The elements are shifted
+// positively (towards larger indices) by the offsets of `shift` along the
+// dimensions of `axis`.
+func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSquareRoot",
+		Type: "Roll",
 		Input: []tf.Input{
-			input,
+			input, shift, axis,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SvdAttr is an optional argument to Svd.
-type SvdAttr func(optionalAttr)
-
-// SvdComputeUv sets the optional compute_uv attribute to value.
+// Updates the table to associates keys with values.
 //
-// value: If true, left and right singular vectors will be
-// computed and returned in `u` and `v`, respectively.
-// If false, `u` and `v` are not set and should never referenced.
-// If not specified, defaults to true
-func SvdComputeUv(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["compute_uv"] = value
-	}
-}
-
-// SvdFullMatrices sets the optional full_matrices attribute to value.
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
 //
-// value: If true, compute full-sized `u` and `v`. If false
-// (the default), compute only the leading `P` singular vectors.
-// Ignored if `compute_uv` is `False`.
-// If not specified, defaults to false
-func SvdFullMatrices(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["full_matrices"] = value
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LookupTableInsertV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Computes the singular value decompositions of one or more matrices.
+// Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
 //
-// Computes the SVD of each inner matrix in `input` such that
-// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+// Arguments:
 //
-// ```python
-// # a is a tensor containing a batch of matrices.
-// # s is a tensor of singular values for each matrix.
-// # u is the tensor containing of left singular vectors for each matrix.
-// # v is the tensor containing of right singular vectors for each matrix.
-// s, u, v = svd(a)
-// s, _, _ = svd(a, compute_uv=False)
-// ```
+//	num_shards: An integer representing the number of shards operating in parallel.
+//	index: An integer representing the current worker index.
 //
-// Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
-// Undefined if `compute_uv` is false.
-func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
+func ShardDataset(scope *Scope, input_dataset tf.Output, num_shards tf.Output, index tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Svd",
+		Type: "ShardDataset",
 		Input: []tf.Input{
-			input,
+			input_dataset, num_shards, index,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes the product along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \prod_j data_j\\) where the product is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the product is empty for a given segment ID `i`, `output[i] = 1`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
-// </div>
+// Creates a dataset that batches and pads `batch_size` elements from the input.
 //
 // Arguments:
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors representing the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
+//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
+// is smaller than desired.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+func PaddedBatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, drop_remainder tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SegmentProd",
+		Type: "PaddedBatchDatasetV2",
 		Input: []tf.Input{
-			data, segment_ids,
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values), drop_remainder,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts one or more images from RGB to HSV.
-//
-// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
-//
-// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
-// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
-// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
-//
-// Arguments:
-//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
-//
-// Returns `images` converted to HSV.
-func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns element-wise smallest integer not less than x.
+func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RGBToHSV",
+		Type: "Ceil",
 		Input: []tf.Input{
-			images,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Does nothing. Only useful as a placeholder for control edges.
+// Computes the number of elements in the given table.
 //
-// Returns the created operation.
-func NoOp(scope *Scope) (o *tf.Operation) {
+// Arguments:
+//	table_handle: Handle to the table.
+//
+// Returns Scalar that contains number of elements in the table.
+func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NoOp",
+		Type: "LookupTableSizeV2",
+		Input: []tf.Input{
+			table_handle,
+		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
-type MergeV2CheckpointsAttr func(optionalAttr)
+// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
+type ResizeBilinearGradAttr func(optionalAttr)
 
-// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
+// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: see above.
-// If not specified, defaults to true
-func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
 	return func(m optionalAttr) {
-		m["delete_old_dirs"] = value
+		m["align_corners"] = value
 	}
 }
 
-// V2 format specific: merges the metadata files of sharded checkpoints.  The
-//
-// result is one logical checkpoint, with one physical metadata file and renamed
-// data files.
-//
-// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
-//
-// If delete_old_dirs is true, attempts to delete recursively the dirname of each
-// path in the input checkpoint_prefixes.  This is useful when those paths are non
-// user-facing temporary locations.
+// Computes the gradient of bilinear interpolation.
 //
 // Arguments:
-//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
-//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
-// as one of the checkpoint_prefixes.
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
 //
-// Returns the created operation.
-func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -30149,425 +33417,430 @@ func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MergeV2Checkpoints",
+		Type: "ResizeBilinearGrad",
 		Input: []tf.Input{
-			checkpoint_prefixes, destination_prefix,
+			grads, original_image,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Saves input tensors slices to disk.
-//
-// This is like `Save` except that tensors can be listed in the saved file as being
-// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
-// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
-// have as many elements as `tensor_names`.
-//
-// Elements of the `shapes_and_slices` input must either be:
-//
-// *  The empty string, in which case the corresponding tensor is
-//    saved normally.
-// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
-//    `dimI` are the dimensions of the larger tensor and `slice-spec`
-//    specifies what part is covered by the tensor to save.
-//
-// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
-// where each `sliceI` is either:
+// Outputs all keys and values in the table.
 //
-// *  The string `-` meaning that the slice covers all indices of this dimension
-// *  `start,length` where `start` and `length` are integers.  In that
-//    case the slice covers `length` indices starting at `start`.
+// Arguments:
+//	table_handle: Handle to the table.
 //
-// See also `Save`.
 //
-// Arguments:
-//	filename: Must have a single element. The name of the file to which we write the
-// tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
-// saving the tensors.
-//	data: `N` tensors to save.
 //
-// Returns the created operation.
-func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
+// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
+func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "SaveSlices",
+		Type: "LookupTableExportV2",
 		Input: []tf.Input{
-			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
+			table_handle,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
-type DenseToDenseSetOperationAttr func(optionalAttr)
+// MultiDeviceIteratorFromStringHandleAttr is an optional argument to MultiDeviceIteratorFromStringHandle.
+type MultiDeviceIteratorFromStringHandleAttr func(optionalAttr)
 
-// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
+// MultiDeviceIteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
+//
+// value: The type list for the return values.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func MultiDeviceIteratorFromStringHandleOutputTypes(value []tf.DataType) MultiDeviceIteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["output_types"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `Tensor` inputs.
+// MultiDeviceIteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// value: The list of shapes being produced.
+// If not specified, defaults to <>
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// REQUIRES: len(value) >= 0
+func MultiDeviceIteratorFromStringHandleOutputShapes(value []tf.Shape) MultiDeviceIteratorFromStringHandleAttr {
+	return func(m optionalAttr) {
+		m["output_shapes"] = value
+	}
+}
+
+// Generates a MultiDeviceIterator resource from its provided string handle.
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//
+//	string_handle: String representing the resource.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns A MultiDeviceIterator resource.
+func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...MultiDeviceIteratorFromStringHandleAttr) (multi_device_iterator tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToDenseSetOperation",
+		Type: "MultiDeviceIteratorFromStringHandle",
 		Input: []tf.Input{
-			set1, set2,
+			string_handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Generate a sharded filename. The filename is printf formatted as
+// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
+type MutableHashTableV2Attr func(optionalAttr)
+
+// MutableHashTableV2Container sets the optional container attribute to value.
 //
-//    %s-%05d-of-%05d, basename, shard, num_shards.
-func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+//
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// Creates an empty hash table.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
+//
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ShardedFilename",
-		Input: []tf.Input{
-			basename, shard, num_shards,
-		},
+		Type: "MutableHashTableV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BatchToSpace for N-D tensors of type T.
-//
-// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
-// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
-// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
-// the input.  The spatial dimensions of this intermediate result are then
-// optionally cropped according to `crops` to produce the output.  This is the
-// reverse of SpaceToBatch.  See below for a precise description.
-//
-// Arguments:
-//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-// where spatial_shape has M dimensions.
-//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
-//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
-//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
-//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
-//   required that
-//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
-//
-// This operation is equivalent to the following steps:
-//
-// 1. Reshape `input` to `reshaped` of shape:
-//      [block_shape[0], ..., block_shape[M-1],
-//       batch / prod(block_shape),
-//       input_shape[1], ..., input_shape[N-1]]
-//
-// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1], block_shape[0],
-//       ...,
-//       input_shape[M], block_shape[M-1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0],
-//       ...,
-//       input_shape[M] * block_shape[M-1],
+// DequantizeAttr is an optional argument to Dequantize.
+type DequantizeAttr func(optionalAttr)
+
+// DequantizeMode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func DequantizeMode(value string) DequantizeAttr {
+	return func(m optionalAttr) {
+		m["mode"] = value
+	}
+}
+
+// Dequantize the 'input' tensor into a float Tensor.
 //
-//       input_shape[M+1],
-//       ...,
-//       input_shape[N-1]]
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.
 //
-// 4. Crop the start and end of dimensions `[1, ..., M]` of
-//    `reshaped_permuted` according to `crops` to produce the output of shape:
-//      [batch / prod(block_shape),
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
-//       ...,
-//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+// ```
+// if T == qint8: in[i] += (range(T) + 1)/ 2.0
+// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 //
-//       input_shape[M+1], ..., input_shape[N-1]]
+// *MIN_COMBINED Mode Example*
 //
-// Some examples:
+// If the input comes from a QuantizedRelu6, the output type is
+// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
+// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
+// Dequantize on quint8 will take each value, cast to float, and multiply
+// by 6 / 255.
+// Note that if quantizedtype is qint8, the operation will additionally add
+// each value by 128 prior to casting.
 //
-// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
+// If the mode is 'MIN_FIRST', then this approach is used:
 //
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```c++
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = range / num_discrete_values
+// const double offset_input = static_cast<double>(input) - lowest_quantized;
+// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
 // ```
 //
-// The output tensor has shape `[1, 2, 2, 1]` and value:
+// *SCALED mode Example*
 //
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
 //
-// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
 //
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
 // ```
 //
-// The output tensor has shape `[1, 2, 2, 3]` and value:
+// Our input tensor range is then `[-m, m]`.
 //
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
 // ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
 // ```
 //
-// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
+// Otherwise, if T is unsigned, the fixed-point range is
 // ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
 // ```
 //
-// The output tensor has shape `[1, 4, 4, 1]` and value:
-//
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (2 * m) / (max_fixed - min_fixed)
 // ```
 //
-// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [2, 0]]`:
-//
-// ```
-// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-//      [[[0], [2], [4]]], [[[0], [10], [12]]],
-//      [[[0], [5], [7]]], [[[0], [13], [15]]],
-//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
+// Now we can dequantize the elements of our tensor:
+// ```c++
+// result = input * s
 // ```
 //
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+// Arguments:
 //
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpaceND",
+		Type: "Dequantize",
 		Input: []tf.Input{
-			input, block_shape, crops,
+			input, min_range, max_range,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnpackAttr is an optional argument to Unpack.
-type UnpackAttr func(optionalAttr)
-
-// UnpackAxis sets the optional axis attribute to value.
+// Flips all bits elementwise.
 //
-// value: Dimension along which to unpack.  Negative values wrap around, so the
-// valid range is `[-R, R)`.
-// If not specified, defaults to 0
-func UnpackAxis(value int64) UnpackAttr {
-	return func(m optionalAttr) {
-		m["axis"] = value
+// The result will have exactly those bits set, that are not set in `x`. The
+// computation is performed on the underlying representation of x.
+func Invert(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Invert",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
-//
-// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
-// For example, given a tensor of shape `(A, B, C, D)`;
-//
-// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
-//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
-//   dimension unpacked along is gone, unlike `split`).
-//
-// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
-//   and each tensor in `output` will have shape `(A, C, D)`.
-// Etc.
+// Deserialize bucket boundaries and ready flag into current QuantileAccumulator.
 //
-// This is the opposite of `pack`.
+// An op that deserializes bucket boundaries and are boundaries ready flag into current QuantileAccumulator.
 //
 // Arguments:
-//	value: 1-D or higher, with `axis` dimension size equal to `num`.
-//
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
 //
-// Returns The list of tensors unpacked from `value`.
-func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
+// Returns the created operation.
+func BoostedTreesQuantileStreamResourceDeserialize(scope *Scope, quantile_stream_resource_handle tf.Output, bucket_boundaries []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num": num}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Unpack",
+		Type: "BoostedTreesQuantileStreamResourceDeserialize",
 		Input: []tf.Input{
-			value,
+			quantile_stream_resource_handle, tf.OutputList(bucket_boundaries),
 		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Unpack", err)
-		return
 	}
-	return output
+	return scope.AddOperation(opspec)
 }
 
-// Increments variable pointed to by 'resource' until it reaches 'limit'.
+// Inverse 3D fast Fourier transform.
+//
+// Computes the inverse 3-dimensional discrete Fourier transform over the
+// inner-most 3 dimensions of `input`.
 //
 // Arguments:
-//	resource: Should be from a scalar `Variable` node.
-//	limit: If incrementing ref would bring it above limit, instead generates an
-// 'OutOfRange' error.
+//	input: A complex64 tensor.
 //
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
 //
-// Returns A copy of the input before increment. If nothing else modifies the
-// input, the values produced will all be distinct.
-func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.ifftn with 3 dimensions.
+// @end_compatibility
+func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"limit": limit, "T": T}
 	opspec := tf.OpSpec{
-		Type: "ResourceCountUpTo",
+		Type: "IFFT3D",
 		Input: []tf.Input{
-			resource,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Delete the stack from its resource container.
+// Shuts down a running distributed TPU system.
 //
-// Arguments:
-//	handle: The handle to a stack.
+// The op returns an error if no system is running.
 //
 // Returns the created operation.
-func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+func ShutdownDistributedTPU(scope *Scope) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "StackCloseV2",
-		Input: []tf.Input{
-			handle,
-		},
+		Type: "ShutdownDistributedTPU",
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Generate a glob pattern matching all sharded file names.
-func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
+// Deprecated. Disallowed in GraphDef version >= 2.
+//
+// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
+func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ShardedFilespec",
+		Type: "AdjustContrast",
 		Input: []tf.Input{
-			basename, num_shards,
+			images, contrast_factor, min_value, max_value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
-type TextLineReaderV2Attr func(optionalAttr)
+// Table initializer that takes two tensors for keys and values respectively.
+//
+// Arguments:
+//	table_handle: Handle to a table which will be initialized.
+//	keys: Keys of type Tkey.
+//	values: Values of type Tval.
+//
+// Returns the created operation.
+func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InitializeTableV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
 
-// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
+// PrintAttr is an optional argument to Print.
+type PrintAttr func(optionalAttr)
+
+// PrintMessage sets the optional message attribute to value.
 //
-// value: Number of lines to skip from the beginning of every file.
-// If not specified, defaults to 0
-func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
+// value: A string, prefix of the error message.
+// If not specified, defaults to ""
+func PrintMessage(value string) PrintAttr {
 	return func(m optionalAttr) {
-		m["skip_header_lines"] = value
+		m["message"] = value
 	}
 }
 
-// TextLineReaderV2Container sets the optional container attribute to value.
+// PrintFirstN sets the optional first_n attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
+// value: Only log `first_n` number of times. -1 disables logging.
+// If not specified, defaults to -1
+func PrintFirstN(value int64) PrintAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["first_n"] = value
 	}
 }
 
-// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
+// PrintSummarize sets the optional summarize attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+// value: Only print this many entries of each tensor.
+// If not specified, defaults to 3
+func PrintSummarize(value int64) PrintAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["summarize"] = value
 	}
 }
 
-// A Reader that outputs the lines of a file delimited by '\n'.
+// Prints a list of tensors.
 //
-// Returns The handle to reference the Reader.
-func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
+// Passes `input` through to `output` and prints `data` when evaluating.
+//
+// Arguments:
+//	input: The tensor passed to `output`
+//	data: A list of tensors to print out when op is evaluated.
+//
+// Returns = The unmodified `input` tensor
+func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -30576,97 +33849,114 @@ func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TextLineReaderV2",
-
+		Type: "Print",
+		Input: []tf.Input{
+			input, tf.OutputList(data),
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
-type LoadAndRemapMatrixAttr func(optionalAttr)
-
-// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
+// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
 //
-// value: The maximum number of rows to load from the checkpoint at
-// once. If less than or equal to 0, the entire matrix will be loaded into
-// memory. Setting this arg trades increased disk reads for lower memory usage.
-// If not specified, defaults to -1
-func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
-	return func(m optionalAttr) {
-		m["max_rows_in_memory"] = value
+// Arguments:
+//	tag: A string attached to this summary. Used for organization in TensorBoard.
+//	tensor: A tensor to serialize.
+//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
+// data.
+func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorSummaryV2",
+		Input: []tf.Input{
+			tag, tensor, serialized_summary_metadata,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
+// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
 //
-// at `ckpt_path` and potentially reorders its rows and columns using the
-// specified remappings.
+// Arguments:
 //
-// Most users should use one of the wrapper initializers (such as
-// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
-// function directly.
+//	buffer_size: The maximum number of elements to buffer in an iterator over
+// this dataset.
 //
-// The remappings are 1-D tensors with the following properties:
 //
-// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
-//   matrix will be initialized from the row corresponding to index
-//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
-// * `col_remapping` must have either 0 entries (indicating that no column
-//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
-//   output matrix will be initialized from the column corresponding to index
-//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
-// * A value of -1 in either of the remappings signifies a "missing" entry. In that
-//   case, values from the `initializing_values` tensor will be used to fill that
-//   missing row or column. If `row_remapping` has `r` missing entries and
-//   `col_remapping` has `c` missing entries, then the following condition must be
-//   true:
+func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "PrefetchDataset",
+		Input: []tf.Input{
+			input_dataset, buffer_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TensorSummaryAttr is an optional argument to TensorSummary.
+type TensorSummaryAttr func(optionalAttr)
+
+// TensorSummaryDescription sets the optional description attribute to value.
 //
-// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
+// value: A json-encoded SummaryDescription proto.
+// If not specified, defaults to ""
+func TensorSummaryDescription(value string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["description"] = value
+	}
+}
+
+// TensorSummaryLabels sets the optional labels attribute to value.
 //
-// The remapping tensors can be generated using the GenerateVocabRemapping op.
+// value: An unused list of strings.
+// If not specified, defaults to <>
+func TensorSummaryLabels(value []string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["labels"] = value
+	}
+}
+
+// TensorSummaryDisplayName sets the optional display_name attribute to value.
 //
-// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
-// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
-// the value from row i, column j of the old tensor in the checkpoint, the output
-// matrix will look like the following:
+// value: An unused string.
+// If not specified, defaults to ""
+func TensorSummaryDisplayName(value string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["display_name"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with a tensor.
 //
-// [[w(1, 0),  w(1, 2),  0.5],
-//  [w(0, 0),  w(0, 2), -0.5],
-//  [0.25,    -0.25,      42]]
+// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
+// a tag as well as a serialized SummaryMetadata proto string that contains
+// plugin-specific data. We will keep this op to maintain backwards compatibility.
 //
 // Arguments:
-//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
-// which the old matrix `Tensor` will be loaded.
-//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
-//	row_remapping: An int `Tensor` of row remappings (generally created by
-// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
-// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
-// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
-//	col_remapping: An int `Tensor` of column remappings (generally created by
-// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
-// is to be done (e.g. column ordering is the same).
-//	initializing_values: A float `Tensor` containing  values to fill in for cells
-// in the output matrix that are not loaded from the checkpoint. Length must be
-// exactly the same as the number of missing / new cells.
-//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
-//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
-//
-// Returns Output matrix containing existing values loaded from the
-// checkpoint, and with any missing values filled in from initializing_values.
-func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
+//	tensor: A tensor to serialize.
+func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LoadAndRemapMatrix",
+		Type: "TensorSummary",
 		Input: []tf.Input{
-			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
+			tensor,
 		},
 		Attrs: attrs,
 	}
@@ -30674,133 +33964,182 @@ func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Ou
 	return op.Output(0)
 }
 
-// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
-type TFRecordReaderV2Attr func(optionalAttr)
-
-// TFRecordReaderV2Container sets the optional container attribute to value.
+// Read an element from the TensorArray into output `value`.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Arguments:
+//	handle: The handle to a TensorArray.
+//
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns The tensor that is read from the TensorArray.
+func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayReadV3",
+		Input: []tf.Input{
+			handle, index, flow_in,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = max(ref[indices, ...], updates[...])
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
-// If not specified, defaults to ""
-func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["compression_type"] = value
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMax",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// A Reader that outputs the records from a TensorFlow Records file.
+// Computes the gradient for the tanh of `x` wrt its input.
 //
-// Returns The handle to reference the Reader.
-func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
+// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+// is the corresponding input gradient.
+func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TFRecordReaderV2",
-
-		Attrs: attrs,
+		Type: "TanhGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
-type QuantizeAndDequantizeV3Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
-
-// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
-	}
-}
-
-// Quantizes then dequantizes a tensor.
+// Outputs a `Summary` protocol buffer with scalar values.
 //
-// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
-// tensor, so its value can change during training.
-func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
+// The input `tags` and `values` must have the same shape.  The generated summary
+// has a summary value for each tag-value pair in `tags` and `values`.
+//
+// Arguments:
+//	tags: Tags for the summary.
+//	values: Same shape as `tags.  Values for the summary.
+//
+// Returns Scalar.  Serialized `Summary` protocol buffer.
+func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV3",
+		Type: "ScalarSummary",
 		Input: []tf.Input{
-			input, input_min, input_max, num_bits,
+			tags, values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
-type IdentityReaderV2Attr func(optionalAttr)
+// ImageSummaryAttr is an optional argument to ImageSummary.
+type ImageSummaryAttr func(optionalAttr)
 
-// IdentityReaderV2Container sets the optional container attribute to value.
+// ImageSummaryMaxImages sets the optional max_images attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["max_images"] = value
 	}
 }
 
-// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
+// ImageSummaryBadColor sets the optional bad_color attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+// value: Color to use for pixels with non-finite values.
+// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
+func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["bad_color"] = value
 	}
 }
 
-// A Reader that outputs the queued work as both the key and value.
+// Outputs a `Summary` protocol buffer with images.
 //
-// To use, enqueue strings in a Queue.  ReaderRead will take the front
-// work string and output (work, work).
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
 //
-// Returns The handle to reference the Reader.
-func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
+//
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
+//
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
+//
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -30809,37 +34148,52 @@ func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "IdentityReaderV2",
-
+		Type: "ImageSummary",
+		Input: []tf.Input{
+			tag, tensor,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
-type ResourceApplyGradientDescentAttr func(optionalAttr)
+// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
+type AudioSummaryV2Attr func(optionalAttr)
 
-// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
+// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
 //
-// value: If `True`, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// Update '*var' by subtracting 'alpha' * 'delta' from it.
+// Outputs a `Summary` protocol buffer with audio.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	delta: The change.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns the created operation.
-func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -30848,316 +34202,256 @@ func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyGradientDescent",
+		Type: "AudioSummaryV2",
 		Input: []tf.Input{
-			var_, alpha, delta,
+			tag, tensor, sample_rate,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the next record (key, value pair) produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
+// Splits a tensor into a list.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	queue_handle: Handle to a Queue, with string work items.
+// list[i] corresponds to lengths[i] tensors from the input tensor.
+// The tensor must have rank at least 1 and contain exactly sum(lengths) elements.
 //
-// Returns A scalar.A scalar.
-func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+// tensor: The input tensor.
+// element_shape: A shape compatible with that of elements in the tensor.
+// lengths: Vector of sizes of the 0th dimension of tensors in the list.
+// output_handle: The list.
+func TensorListSplit(scope *Scope, tensor tf.Output, element_shape tf.Output, lengths tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadV2",
+		Type: "TensorListSplit",
 		Input: []tf.Input{
-			reader_handle, queue_handle,
+			tensor, element_shape, lengths,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns up to `num_records` (key, value) pairs produced by a Reader.
+// AvgPoolAttr is an optional argument to AvgPool.
+type AvgPoolAttr func(optionalAttr)
+
+// AvgPoolDataFormat sets the optional data_format attribute to value.
 //
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-// It may return less than `num_records` even before the last batch.
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolDataFormat(value string) AvgPoolAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs average pooling on the input.
+//
+// Each entry in `output` is the mean of the corresponding size `ksize`
+// window in `value`.
 //
 // Arguments:
-//	reader_handle: Handle to a `Reader`.
-//	queue_handle: Handle to a `Queue`, with string work items.
-//	num_records: number of records to read from `Reader`.
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	ksize: The size of the sliding window for each dimension of `value`.
+//	strides: The stride of the sliding window for each dimension of `value`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A 1-D tensor.A 1-D tensor.
-func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadUpToV2",
+		Type: "AvgPool",
 		Input: []tf.Input{
-			reader_handle, queue_handle, num_records,
+			value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-//     Adds v into specified rows of x.
+// Merges summaries.
 //
-//     Computes y = x; y[i, :] += v; return y.
+// This op creates a
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// protocol buffer that contains the union of all the values in the input
+// summaries.
+//
+// When the Op is run, it reports an `InvalidArgument` error if multiple values
+// in the summaries to merge use the same tag.
 //
 // Arguments:
-//	x: A `Tensor` of type T.
-//	i: A vector. Indices into the left-most dimension of `x`.
-//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
+// buffers.
 //
-// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
-func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "InplaceAdd",
+		Type: "MergeSummary",
 		Input: []tf.Input{
-			x, i, v,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restore a Reader to its initial clean state.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
+// The shape of the elements of the given list, as a tensor.
 //
-// Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+//   input_handle: the list
+//   element_shape: the shape of elements of the list
+func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"shape_type": shape_type}
 	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
+		Type: "TensorListElementShape",
 		Input: []tf.Input{
-			reader_handle,
+			input_handle,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// BatchAttr is an optional argument to Batch.
-type BatchAttr func(optionalAttr)
-
-// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value.
-// If not specified, defaults to 10
-func BatchMaxEnqueuedBatches(value int64) BatchAttr {
-	return func(m optionalAttr) {
-		m["max_enqueued_batches"] = value
-	}
-}
-
-// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value.
-// If not specified, defaults to <>
-func BatchAllowedBatchSizes(value []int64) BatchAttr {
-	return func(m optionalAttr) {
-		m["allowed_batch_sizes"] = value
-	}
-}
-
-// BatchContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func BatchContainer(value string) BatchAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// BatchSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func BatchSharedName(value string) BatchAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// BatchBatchingQueue sets the optional batching_queue attribute to value.
-// If not specified, defaults to ""
-func BatchBatchingQueue(value string) BatchAttr {
-	return func(m optionalAttr) {
-		m["batching_queue"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Batches all input tensors nondeterministically.
-//
-// When many instances of this Op are being run concurrently with the same
-// container/shared_name in the same device, some will output zero-shaped Tensors
-// and others will output Tensors of size up to max_batch_size.
-//
-// All Tensors in in_tensors are batched together (so, for example, labels and
-// features should be batched with a single instance of this operation.
-//
-// Each invocation of batch emits an `id` scalar which will be used to identify
-// this particular invocation when doing unbatch or its gradient.
+// Returns the item in the list with the given index.
 //
-// Each op which emits a non-empty batch will also emit a non-empty batch_index
-// Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
-// start, and length of elements of each set of Tensors present in batched_tensors.
+// input_handle: the list
+// index: the position in the list from which an element will be retrieved
+// item: the element at that position
 //
-// Batched tensors are concatenated along the first dimension, and all tensors in
-// in_tensors must have the first dimension of the same size.
 //
-// in_tensors: The tensors to be batched.
-// num_batch_threads: Number of scheduling threads for processing batches of work.
-//  Determines the number of batches processed in parallel.
-// max_batch_size: Batch sizes will never be bigger than this.
-// batch_timeout_micros: Maximum number of microseconds to wait before outputting
-//  an incomplete batch.
-// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
-//  nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
-//  batches up to one of those sizes. The entries must increase monotonically, and
-//  the final entry must equal max_batch_size.
-// grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
-// batched_tensors: Either empty tensors or a batch of concatenated Tensors.
-// batch_index: If out_tensors is non-empty, has information to invert it.
-// container: Controls the scope of sharing of this batch.
-// id: always contains a scalar with a unique ID for this invocation of Batch.
-// shared_name: Concurrently running instances of batch in the same device with the
-//  same container and shared_name will batch their elements together. If left
-//  empty, the op name will be used as the shared name.
-// T: the types of tensors to be batched.
-func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) {
+func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_shape tf.Output, element_dtype tf.DataType) (item tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "Batch",
+		Type: "TensorListGetItem",
 		Input: []tf.Input{
-			tf.OutputList(in_tensors),
+			input_handle, index, element_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil {
-		scope.UpdateErr("Batch", err)
-		return
-	}
-	batch_index = op.Output(idx)
-	id = op.Output(idx)
-	return batched_tensors, batch_index, id
+	return op.Output(0)
 }
 
-// Adjust the hue of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
+// Resizes the list.
 //
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A delta is then applied all the hue values,
-// and then remapped back to RGB colorspace.
 //
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	delta: A float delta to add to the hue.
+// input_handle: the input list
+// size: size of the output list
 //
-// Returns The hue-adjusted image or images.
-func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+func TensorListResize(scope *Scope, input_handle tf.Output, size tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustHue",
+		Type: "TensorListResize",
 		Input: []tf.Input{
-			images, delta,
+			input_handle, size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
-type ResizeBicubicGradAttr func(optionalAttr)
-
-// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
+// Returns a diagonal tensor with a given diagonal values.
 //
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Computes the gradient of bicubic interpolation.
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
 //
-// Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
+// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
+// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
+//
+// For example:
+//
+// ```
+// # 'diagonal' is [1, 2, 3, 4]
+// tf.diag(diagonal) ==> [[1, 0, 0, 0]
+//                        [0, 2, 0, 0]
+//                        [0, 0, 3, 0]
+//                        [0, 0, 0, 4]]
+// ```
+//
+// Arguments:
+//	diagonal: Rank k tensor where k is at most 1.
+func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubicGrad",
+		Type: "Diag",
 		Input: []tf.Input{
-			grads, original_image,
+			diagonal,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
+type ParameterizedTruncatedNormalAttr func(optionalAttr)
+
+// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
 }
 
-// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
-type ResizeNearestNeighborAttr func(optionalAttr)
-
-// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
+// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["seed2"] = value
 	}
 }
 
-// Resize `images` to `size` using nearest neighbor interpolation.
+// Outputs random values from a normal distribution. The parameters may each be a
+//
+// scalar which applies to the entire output, or a vector of length shape[0] which
+// stores the parameters for each batch.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
+//	means: The mean parameter of each batch.
+//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
+//	minvals: The minimum cutoff. May be -infinity.
+//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
+// for each batch.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
+// Returns A matrix of shape num_batches x samples_per_batch, filled with random
+// truncated normal values using the parameters for each row.
+func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -31166,9 +34460,9 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighbor",
+		Type: "ParameterizedTruncatedNormal",
 		Input: []tf.Input{
-			images, size,
+			shape, means, stdevs, minvals, maxvals,
 		},
 		Attrs: attrs,
 	}
@@ -31176,672 +34470,495 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
-type ResizeNearestNeighborGradAttr func(optionalAttr)
-
-// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Computes the gradient of nearest neighbor interpolation.
+// Sets the index-th position of the list to contain the given tensor.
 //
-// Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
-// original input size.
+// input_handle: the list
+// index: the position in the list to which the tensor will be assigned
+// item: the element to be assigned to that position
+// output_handle: the new list, with the element in the proper position
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
-// with respect to the input image.
-func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
+func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighborGrad",
+		Type: "TensorListSetItem",
 		Input: []tf.Input{
-			grads, size,
+			input_handle, index, item,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
-type ExtractJpegShapeAttr func(optionalAttr)
-
-// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
+// Creates a TensorList by indexing into a Tensor.
 //
-// value: (Optional) The output type of the operation (int32 or int64).
-// Defaults to int32.
-// If not specified, defaults to DT_INT32
-func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
+// Each member of the TensorList corresponds to one row of the input tensor,
+// specified by the given index (see `tf.gather`).
+//
+// tensor: The input tensor.
+// indices: The indices used to index into the list.
+// element_shape: The shape of the elements in the list (can be less specified than
+//   the shape of the tensor).
+// output_handle: The TensorList.
+func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListScatter",
+		Input: []tf.Input{
+			tensor, indices, element_shape,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Extract the shape information of a JPEG-encoded image.
-//
-// This op only parses the image header, so it is much faster than DecodeJpeg.
-//
-// Arguments:
-//	contents: 0-D. The JPEG-encoded image.
+// Deprecated. Use TensorArrayScatterV3
 //
-// Returns 1-D. The image shape with format [height, width, channels].
-func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
+func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ExtractJpegShape",
+		Type: "TensorArrayScatterV2",
 		Input: []tf.Input{
-			contents,
+			handle, indices, value, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
-type PaddingFIFOQueueV2Attr func(optionalAttr)
+// AsStringAttr is an optional argument to AsString.
+type AsStringAttr func(optionalAttr)
 
-// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
+// AsStringPrecision sets the optional precision attribute to value.
 //
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types.
-// Shapes of fixed rank but variable size are allowed by setting
-// any shape dimension to -1.  In this case, the inputs' shape may vary along
-// the given dimension, and DequeueMany will pad the given dimension with
-// zeros up to the maximum shape of all elements in the given batch.
-// If the length of this attr is 0, different queue elements may have
-// different ranks and shapes, but only one element may be dequeued at a time.
-// If not specified, defaults to <>
+// value: The post-decimal precision to use for floating point numbers.
+// Only used if precision > -1.
+// If not specified, defaults to -1
+func AsStringPrecision(value int64) AsStringAttr {
+	return func(m optionalAttr) {
+		m["precision"] = value
+	}
+}
+
+// AsStringScientific sets the optional scientific attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
+// value: Use scientific notation for floating point numbers.
+// If not specified, defaults to false
+func AsStringScientific(value bool) AsStringAttr {
 	return func(m optionalAttr) {
-		m["shapes"] = value
+		m["scientific"] = value
 	}
 }
 
-// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
+// AsStringShortest sets the optional shortest attribute to value.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
+// value: Use shortest representation (either scientific or standard) for
+// floating point numbers.
+// If not specified, defaults to false
+func AsStringShortest(value bool) AsStringAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["shortest"] = value
 	}
 }
 
-// PaddingFIFOQueueV2Container sets the optional container attribute to value.
+// AsStringWidth sets the optional width attribute to value.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
+// value: Pad pre-decimal numbers to this width.
+// Applies to both floating point and integer numbers.
+// Only used if width > -1.
+// If not specified, defaults to -1
+func AsStringWidth(value int64) AsStringAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["width"] = value
 	}
 }
 
-// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
+// AsStringFill sets the optional fill attribute to value.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
+// value: The value to pad if width > -1.  If empty, pads with spaces.
+// Another typical value is '0'.  String cannot be longer than 1 character.
 // If not specified, defaults to ""
-func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
+func AsStringFill(value string) AsStringAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["fill"] = value
 	}
 }
 
-// A queue that produces elements in first-in first-out order.
-//
-// Variable-size shapes are allowed by setting the corresponding shape dimensions
-// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
-// size of any given element in the minibatch.  See below for details.
-//
-// Arguments:
-//	component_types: The type of each component in a value.
+// Converts each entry in the given tensor to strings.  Supports many numeric
 //
-// Returns The handle to the queue.
-func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
+// types and boolean.
+func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PaddingFIFOQueueV2",
-
+		Type: "AsString",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodePngAttr is an optional argument to DecodePng.
-type DecodePngAttr func(optionalAttr)
-
-// DecodePngChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodePngChannels(value int64) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodePngDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_UINT8
-func DecodePngDtype(value tf.DataType) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Decode a PNG-encoded image to a uint8 or uint16 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// Returns a `RaggedTensor` containing the specified sequences of numbers.
 //
-// Accepted values are:
 //
-// *   0: Use the number of channels in the PNG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
+// Returns a `RaggedTensor` `result` composed from `rt_dense_values` and
+// `rt_nested_splits`, such that
+// `result[i] = range(starts[i], limits[i], deltas[i])`.
 //
-// If needed, the PNG-encoded image is transformed to match the requested number
-// of color channels.
+// ```python
+// >>> (rt_nested_splits, rt_dense_values) = gen_ragged_ops.ragged_range(
+// ...     starts=[2, 5, 8], limits=[3, 5, 12], deltas=1)
+// >>> result = ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+// >>> print result.eval().tolist()
+// [[2],               # result[0] = range(2, 3)
+//  [],                # result[1] = range(5, 5)
+//  [8, 9, 10, 11]]    # result[2] = range(8, 12)
+// ```
 //
-// This op also supports decoding JPEGs and non-animated GIFs since the interface
-// is the same, though it is cleaner to use `tf.image.decode_image`.
+// The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors.
+// The vector inputs must all have the same size.  Scalar inputs are broadcast
+// to match the size of the vector inputs.
 //
 // Arguments:
-//	contents: 0-D.  The PNG-encoded image.
+//	starts: The starts of each range.
+//	limits: The limits of each range.
+//	deltas: The deltas of each range.
 //
-// Returns 3-D with shape `[height, width, channels]`.
-func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
+// Returns The `row_splits` for the returned `RaggedTensor`.The `flat_values` for the returned `RaggedTensor`.
+func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output) (rt_nested_splits tf.Output, rt_dense_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DecodePng",
+		Type: "RaggedRange",
 		Input: []tf.Input{
-			contents,
+			starts, limits, deltas,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Decode the first frame of a GIF-encoded image to a uint8 tensor.
-//
-// GIF with frame or transparency compression are not supported
-// convert animated GIF from compressed to uncompressed by:
-//
-//     convert $src.gif -coalesce $dst.gif
-//
-// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
-// `tf.image.decode_image`.
-//
-// Arguments:
-//	contents: 0-D.  The GIF-encoded image.
+// Deprecated, use python implementation tf.linalg.matrix_exponential.
 //
-// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
-func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
+// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead.
+func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeGif",
+		Type: "MatrixExponential",
 		Input: []tf.Input{
-			contents,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
-type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
-
-// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// Computes the Cholesky decomposition of one or more square matrices.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a learned unigram distribution.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// The input has to be symmetric and positive definite. Only the lower-triangular
+// part of the input will be used for this operation. The upper-triangular part
+// will not be read.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// The output is a tensor of the same shape as the input
+// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// **Note**: The gradient computation on GPU is faster for large matrices but
+// not for large batch dimensions when the submatrices are small. In this
+// case it might be faster to use the CPU.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	input: Shape is `[..., M, M]`.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns Shape is `[..., M, M]`.
+func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "LearnedUnigramCandidateSampler",
+		Type: "Cholesky",
 		Input: []tf.Input{
-			true_classes,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// SerializeSparseAttr is an optional argument to SerializeSparse.
-type SerializeSparseAttr func(optionalAttr)
-
-// SerializeSparseOutType sets the optional out_type attribute to value.
+// Writes contents to the file at input filename. Creates file and recursively
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
+// creates directory if not existing.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
+//	filename: scalar. The name of the file to which we write the contents.
+//	contents: scalar. The content to be written to the output file.
+//
+// Returns the created operation.
+func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SerializeSparse",
+		Type: "WriteFile",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			filename, contents,
 		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
-type RandomShuffleQueueV2Attr func(optionalAttr)
-
-// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
-	}
-}
-
-// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
-//
-// value: Dequeue will block unless there would be this
-// many elements after the dequeue or the queue is closed. This
-// ensures a minimum level of mixing of elements.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["min_after_dequeue"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 is set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
 	}
+	return scope.AddOperation(opspec)
 }
 
-// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
+// AllAttr is an optional argument to All.
+type AllAttr func(optionalAttr)
 
-// RandomShuffleQueueV2Container sets the optional container attribute to value.
+// AllKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AllKeepDims(value bool) AllAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+// Computes the "logical and" of elements across dimensions of a tensor.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that randomizes the order of elements.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns The handle to the queue.
-func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+// Returns The reduced tensor.
+func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffleQueueV2",
-
+		Type: "All",
+		Input: []tf.Input{
+			input, axis,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Draw bounding boxes on a batch of images.
+// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
 //
-// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-// boxes specified by the locations in `boxes`. The coordinates of the each
-// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
+// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
 //
-// For example, if an image is 100 x 200 pixels (height x width) and the bounding
-// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
-// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices, with the same constraints as the single matrix
+// SelfAdjointEig.
 //
-// Parts of the bounding box may fall outside the image.
+// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
+// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
+// are sorted in non-decreasing order.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
-//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
-// boxes.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns 4-D with the same shape as `images`. The batch of input images with
-// bounding boxes drawn on the images.
-func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
+// Returns Shape is `[..., M+1, M]`.
+func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DrawBoundingBoxes",
+		Type: "SelfAdjointEig",
 		Input: []tf.Input{
-			images, boxes,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gets the next output from the given iterator.
+// Computes softplus gradients for a softplus operation.
 //
-// This operation is a synchronous version IteratorGetNext. It should only be used
-// in situations where the iterator does not block the calling thread, or where
-// the calling thread is not a member of the thread pool used to execute parallel
-// operations (e.g. in eager mode).
-func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding softplus operation.
+//	features: The features passed as input to the corresponding softplus operation.
+//
+// Returns The gradients: `gradients / (1 + exp(-features))`.
+func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNextSync",
+		Type: "SoftplusGrad",
 		Input: []tf.Input{
-			iterator,
+			gradients, features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Solves tridiagonal systems of equations.
+//
+// `diagonals` is a tensor of shape `[..., 3, M]` whose inner-most 2 dimensions
+// represent matrices with three rows being the superdiagonal, diagonals, and
+// subdiagonals, in order. The last element of the superdiagonal and the first
+// element of the subdiagonal is ignored.
+// `rhs` is a tensor of shape `[..., M, K]`, representing K right-hand sides per
+// each left-hand side.
+// The output is a tensor of shape `[..., M, K]` containing the solutions.
+//
+// Arguments:
+//	diagonals: Shape is `[..., 3, M]`.
+//	rhs: Shape is `[..., M, K]`.
+//
+// Returns Shape is `[..., M, K]`.
+func TridiagonalSolve(scope *Scope, diagonals tf.Output, rhs tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("IteratorGetNextSync", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "TridiagonalSolve",
+		Input: []tf.Input{
+			diagonals, rhs,
+		},
 	}
-	return components
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
-type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
+// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
+type SelfAdjointEigV2Attr func(optionalAttr)
 
-// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
+// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
 //
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
+// value: If `True` then eigenvectors will be computed and returned in `v`.
+// Otherwise, only the eigenvalues will be computed.
+// If not specified, defaults to true
+func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["compute_v"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
+// Computes the eigen decomposition of one or more square self-adjoint matrices.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
+// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
+// are sorted in non-decreasing order.
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
+// ```python
+// # a is a tensor.
+// # e is a tensor of eigenvalues.
+// # v is a tensor of eigenvectors.
+// e, v = self_adjoint_eig(a)
+// e = self_adjoint_eig(a, compute_v=False)
+// ```
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
+// Arguments:
+//	input: `Tensor` input of shape `[N, N]`.
+//
+// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
+func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SelfAdjointEigV2",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
+// Adjust the saturation of one or more images.
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["max_attempts"] = value
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
+//
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A scale is then applied all the saturation
+// values, and then remapped back to RGB colorspace.
+//
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	scale: A float scale to add to the saturation.
+//
+// Returns The hue-adjusted image or images.
+func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AdjustSaturation",
+		Input: []tf.Input{
+			images, scale,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+// MatrixSolveAttr is an optional argument to MatrixSolve.
+type MatrixSolveAttr func(optionalAttr)
+
+// MatrixSolveAdjoint sets the optional adjoint attribute to value.
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+// adjoint.
 // If not specified, defaults to false
-func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
+func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
 	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
-//
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
+// Solves systems of linear equations.
 //
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
+// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
+// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `True` then each output matrix satisfies
+// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
-//	min_object_covered: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Returns Shape is `[..., M, K]`.
+func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -31850,100 +34967,144 @@ func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_b
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBoxV2",
+		Type: "MatrixSolve",
 		Input: []tf.Input{
-			image_size, bounding_boxes, min_object_covered,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
-type ExtractGlimpseAttr func(optionalAttr)
+// ResourceApplyKerasMomentumAttr is an optional argument to ResourceApplyKerasMomentum.
+type ResourceApplyKerasMomentumAttr func(optionalAttr)
 
-// ExtractGlimpseCentered sets the optional centered attribute to value.
+// ResourceApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: indicates if the offset coordinates are centered relative to
-// the image, in which case the (0, 0) offset is relative to the center
-// of the input images. If false, the (0,0) offset corresponds to the
-// upper left corner of the input images.
-// If not specified, defaults to true
-func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyKerasMomentumUseLocking(value bool) ResourceApplyKerasMomentumAttr {
 	return func(m optionalAttr) {
-		m["centered"] = value
+		m["use_locking"] = value
 	}
 }
 
-// ExtractGlimpseNormalized sets the optional normalized attribute to value.
+// ResourceApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: indicates if the offset coordinates are normalized.
-// If not specified, defaults to true
-func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
+// value: If `True`, the tensor passed to compute grad will be
+// var + momentum * accum, so in the end, the var you get is actually
+// var + momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyKerasMomentumUseNesterov(value bool) ResourceApplyKerasMomentumAttr {
 	return func(m optionalAttr) {
-		m["normalized"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
 //
-// value: indicates if the noise should be generated using a
-// uniform distribution or a Gaussian distribution.
-// If not specified, defaults to true
-func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["uniform_noise"] = value
+// want to use Nesterov momentum.
+//
+// accum = accum * momentum - lr * grad
+// var += accum
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyKerasMomentumAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyKerasMomentum",
+		Input: []tf.Input{
+			var_, accum, lr, grad, momentum,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// ExtractGlimpseNoise sets the optional noise attribute to value.
+// Returns a serialized GraphDef representing `input_dataset`.
 //
-// value: indicates if the noise should `uniform`, `gaussian`, or
-// `zero`. The default is `uniform` which means the the noise type
-// will be decided by `uniform_noise`.
-// If not specified, defaults to "uniform"
-func ExtractGlimpseNoise(value string) ExtractGlimpseAttr {
+// Returns a graph representation for `input_dataset`.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the dataset to return the graph representation for.
+//
+// Returns The graph representation of the dataset (as serialized GraphDef).
+func DatasetToGraph(scope *Scope, input_dataset tf.Output) (graph tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DatasetToGraph",
+		Input: []tf.Input{
+			input_dataset,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LuAttr is an optional argument to Lu.
+type LuAttr func(optionalAttr)
+
+// LuOutputIdxType sets the optional output_idx_type attribute to value.
+// If not specified, defaults to DT_INT32
+func LuOutputIdxType(value tf.DataType) LuAttr {
 	return func(m optionalAttr) {
-		m["noise"] = value
+		m["output_idx_type"] = value
 	}
 }
 
-// Extracts a glimpse from the input tensor.
+// Computes the LU decomposition of one or more square matrices.
 //
-// Returns a set of windows called glimpses extracted at location
-// `offsets` from the input tensor. If the windows only partially
-// overlaps the inputs, the non overlapping areas will be filled with
-// random noise.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
 //
-// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
-// glimpse_width, channels]`. The channels and batch dimensions are the
-// same as that of the input tensor. The height and width of the output
-// windows are specified in the `size` parameter.
+// The input has to be invertible.
 //
-// The argument `normalized` and `centered` controls how the windows are built:
+// The output consists of two tensors LU and P containing the LU decomposition
+// of all input submatrices `[..., :, :]`. LU encodes the lower triangular and
+// upper triangular factors.
 //
-// * If the coordinates are normalized but not centered, 0.0 and 1.0
-//   correspond to the minimum and maximum of each height and width
-//   dimension.
-// * If the coordinates are both normalized and centered, they range from
-//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
-//   left corner, the lower right corner is located at (1.0, 1.0) and the
-//   center is at (0, 0).
-// * If the coordinates are not normalized they are interpreted as
-//   numbers of pixels.
+// For each input submatrix of shape `[M, M]`, L is a lower triangular matrix of
+// shape `[M, M]` with unit diagonal whose entries correspond to the strictly lower
+// triangular part of LU. U is a upper triangular matrix of shape `[M, M]` whose
+// entries correspond to the upper triangular part, including the diagonal, of LU.
+//
+// P represents a permutation matrix encoded as a list of indices each between `0`
+// and `M-1`, inclusive. If P_mat denotes the permutation matrix corresponding to
+// P, then the L, U and P satisfies P_mat * input = L * U.
 //
 // Arguments:
-//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
-//	size: A 1-D tensor of 2 elements containing the size of the glimpses
-// to extract.  The glimpse height must be specified first, following
-// by the glimpse width.
-//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
-// the y, x locations of the center of each window.
+//	input: A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form matrices of
+// size `[M, M]`.
 //
-// Returns A tensor representing the glimpses `[batch_size,
-// glimpse_height, glimpse_width, channels]`.
-func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
+// Returns A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the
+// lower triangular factor `L` with unit diagonal, and whose upper triangular part
+// denotes the upper triangular factor `U`.Permutation of the rows encoded as a list of indices in `0..M-1`. Shape is
+// `[..., M]`.
+// @compatibility(scipy)
+// Similar to `scipy.linalg.lu`, except the triangular factors `L` and `U` are
+// packed into a single tensor, the permutation is applied to `input` instead of
+// the right hand side and the permutation `P` is returned as a list of indices
+// instead of a permutation matrix.
+// @end_compatibility
+func Lu(scope *Scope, input tf.Output, optional ...LuAttr) (lu tf.Output, p tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -31952,238 +35113,289 @@ func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExtractGlimpse",
+		Type: "Lu",
 		Input: []tf.Input{
-			input, size, offsets,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// A container for an iterator resource.
+// Deprecated. Use TensorArrayCloseV3
 //
-// Returns A handle to the iterator that can be passed to a "MakeIterator"
-// or "IteratorGetNext" op.
-func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
+//
+// Returns the created operation.
+func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Iterator",
-
-		Attrs: attrs,
+		Type: "TensorArrayCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// TensorForestTreeResourceHandleOpAttr is an optional argument to TensorForestTreeResourceHandleOp.
-type TensorForestTreeResourceHandleOpAttr func(optionalAttr)
+// EncodeBase64Attr is an optional argument to EncodeBase64.
+type EncodeBase64Attr func(optionalAttr)
 
-// TensorForestTreeResourceHandleOpContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func TensorForestTreeResourceHandleOpContainer(value string) TensorForestTreeResourceHandleOpAttr {
+// EncodeBase64Pad sets the optional pad attribute to value.
+//
+// value: Bool whether padding is applied at the ends.
+// If not specified, defaults to false
+func EncodeBase64Pad(value bool) EncodeBase64Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["pad"] = value
 	}
 }
 
-// TensorForestTreeResourceHandleOpSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func TensorForestTreeResourceHandleOpSharedName(value string) TensorForestTreeResourceHandleOpAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Encode strings into web-safe base64 format.
+//
+// Refer to the following article for more information on base64 format:
+// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
+// end so that the encoded has length multiple of 4. See Padding section of the
+// link above.
+//
+// Web-safe means that the encoder uses - and _ instead of + and /.
+//
+// Arguments:
+//	input: Strings to be encoded.
+//
+// Returns Input strings encoded in base64.
+func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EncodeBase64",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a handle to a TensorForestTreeResource
-func TensorForestTreeResourceHandleOp(scope *Scope, optional ...TensorForestTreeResourceHandleOpAttr) (resource tf.Output) {
+// A dataset that creates window datasets from the input dataset.
+//
+// Arguments:
+//
+//	size: A scalar representing the number of elements to accumulate in a window.
+//	shift: A scalar representing the steps moving the sliding window forward in one
+// iteration. It must be positive.
+//	stride: A scalar representing the stride of the input elements of the sliding window.
+// It must be positive.
+//	drop_remainder: A scalar representing whether a window should be dropped in case its size is
+// smaller than desired.
+//
+//
+func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorForestTreeResourceHandleOp",
-
+		Type: "WindowDataset",
+		Input: []tf.Input{
+			input_dataset, size, shift, stride, drop_remainder,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
-type CropAndResizeGradImageAttr func(optionalAttr)
-
-// CropAndResizeGradImageMethod sets the optional method attribute to value.
+// Computes the matrix square root of one or more square matrices:
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
-	}
-}
-
-// Computes the gradient of the crop_and_resize op wrt the input image tensor.
+// matmul(sqrtm(A), sqrtm(A)) = A
+//
+// The input matrix should be invertible. If the input matrix is real, it should
+// have no eigenvalues which are real and negative (pairs of complex conjugate
+// eigenvalues are allowed).
+//
+// The matrix square root is computed by first reducing the matrix to
+// quasi-triangular form with the real Schur decomposition. The square root
+// of the quasi-triangular matrix is then computed directly. Details of
+// the algorithm can be found in: Nicholas J. Higham, "Computing real
+// square roots of a real matrix", Linear Algebra Appl., 1987.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the matrix square root for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
-// containing the original image size. Both `image_height` and `image_width` need
-// to be positive.
+//	input: Shape is `[..., M, M]`.
 //
+// Returns Shape is `[..., M, M]`.
 //
-// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.sqrtm
+// @end_compatibility
+func MatrixSquareRoot(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradImage",
+		Type: "MatrixSquareRoot",
 		Input: []tf.Input{
-			grads, boxes, box_ind, image_size,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
-type ShuffleDatasetAttr func(optionalAttr)
+// SvdAttr is an optional argument to Svd.
+type SvdAttr func(optionalAttr)
 
-// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
+// SvdComputeUv sets the optional compute_uv attribute to value.
 //
-// value: If true, each iterator over this dataset will be given
-// a different pseudorandomly generated seed, based on a sequence seeded by the
-// `seed` and `seed2` inputs. If false, each iterator will be given the same
-// seed, and repeated iteration over this dataset will yield the exact same
-// sequence of results.
+// value: If true, left and right singular vectors will be
+// computed and returned in `u` and `v`, respectively.
+// If false, `u` and `v` are not set and should never referenced.
 // If not specified, defaults to true
-func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
+func SvdComputeUv(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["reshuffle_each_iteration"] = value
+		m["compute_uv"] = value
 	}
 }
 
-// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
+// SvdFullMatrices sets the optional full_matrices attribute to value.
 //
-// Arguments:
+// value: If true, compute full-sized `u` and `v`. If false
+// (the default), compute only the leading `P` singular vectors.
+// Ignored if `compute_uv` is `False`.
+// If not specified, defaults to false
+func SvdFullMatrices(value bool) SvdAttr {
+	return func(m optionalAttr) {
+		m["full_matrices"] = value
+	}
+}
+
+// Computes the singular value decompositions of one or more matrices.
 //
-//	buffer_size: The number of output elements to buffer in an iterator over
-// this dataset. Compare with the `min_after_dequeue` attr when creating a
-// `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either `seed` or
-// `seed2` is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
+// Computes the SVD of each inner matrix in `input` such that
+// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
 //
+// ```python
+// # a is a tensor containing a batch of matrices.
+// # s is a tensor of singular values for each matrix.
+// # u is the tensor containing of left singular vectors for each matrix.
+// # v is the tensor containing of right singular vectors for each matrix.
+// s, u, v = svd(a)
+// s, _, _ = svd(a, compute_uv=False)
+// ```
 //
-func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
+// Arguments:
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//
+// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
+// Undefined if `compute_uv` is false.
+func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ShuffleDataset",
+		Type: "Svd",
 		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// 3D fast Fourier transform.
+// Converts one or more images from RGB to HSV.
 //
-// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
-// dimensions of `input`.
+// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
 //
-// Arguments:
-//	input: A complex64 tensor.
+// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
+// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
+// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their 3D Fourier transform.
+// Arguments:
+//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fftn with 3 dimensions.
-// @end_compatibility
-func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns `images` converted to HSV.
+func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FFT3D",
+		Type: "RGBToHSV",
 		Input: []tf.Input{
-			input,
+			images,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
-type CropAndResizeGradBoxesAttr func(optionalAttr)
+// Does nothing. Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func NoOp(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NoOp",
+	}
+	return scope.AddOperation(opspec)
+}
 
-// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
+// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
+type MergeV2CheckpointsAttr func(optionalAttr)
+
+// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
+// value: see above.
+// If not specified, defaults to true
+func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["delete_old_dirs"] = value
 	}
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
+// V2 format specific: merges the metadata files of sharded checkpoints.  The
+//
+// result is one logical checkpoint, with one physical metadata file and renamed
+// data files.
+//
+// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+//
+// If delete_old_dirs is true, attempts to delete recursively the dirname of each
+// path in the input checkpoint_prefixes.  This is useful when those paths are non
+// user-facing temporary locations.
 //
 // Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
+//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
+// as one of the checkpoint_prefixes.
 //
-// Returns A 2-D tensor of shape `[num_boxes, 4]`.
-func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
+// Returns the created operation.
+func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -32192,208 +35404,182 @@ func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxe
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradBoxes",
+		Type: "MergeV2Checkpoints",
 		Input: []tf.Input{
-			grads, image, boxes, box_ind,
+			checkpoint_prefixes, destination_prefix,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// Saves input tensors slices to disk.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system and more
-// generally is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// This is like `Save` except that tensors can be listed in the saved file as being
+// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
+// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
+// have as many elements as `tensor_names`.
+//
+// Elements of the `shapes_and_slices` input must either be:
+//
+// *  The empty string, in which case the corresponding tensor is
+//    saved normally.
+// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
+//    `dimI` are the dimensions of the larger tensor and `slice-spec`
+//    specifies what part is covered by the tensor to save.
+//
+// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
+// where each `sliceI` is either:
+//
+// *  The string `-` meaning that the slice covers all indices of this dimension
+// *  `start,length` where `start` and `length` are integers.  In that
+//    case the slice covers `length` indices starting at `start`.
+//
+// See also `Save`.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+//	filename: Must have a single element. The name of the file to which we write the
+// tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
+// saving the tensors.
+//	data: `N` tensors to save.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV3(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+// Returns the created operation.
+func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV3",
+		Type: "SaveSlices",
 		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold, score_threshold,
+			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// NonMaxSuppressionV4Attr is an optional argument to NonMaxSuppressionV4.
-type NonMaxSuppressionV4Attr func(optionalAttr)
+// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
+type DenseToDenseSetOperationAttr func(optionalAttr)
 
-// NonMaxSuppressionV4PadToMaxOutputSize sets the optional pad_to_max_output_size attribute to value.
-//
-// value: If true, the output `selected_indices` is padded to be of length
-// `max_output_size`. Defaults to false.
-// If not specified, defaults to false
-func NonMaxSuppressionV4PadToMaxOutputSize(value bool) NonMaxSuppressionV4Attr {
+// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["pad_to_max_output_size"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// Applies set operation along last dimension of 2 `Tensor` inputs.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system and more
-// generally is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.A 0-D integer tensor representing the number of valid elements in
-// `selected_indices`, with the valid elements appearing first.
-func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...NonMaxSuppressionV4Attr) (selected_indices tf.Output, valid_outputs tf.Output) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV4",
+		Type: "DenseToDenseSetOperation",
 		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold, score_threshold,
+			set1, set2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Removes keys and its associated values from a table.
-//
-// The tensor `keys` must of the same type as the keys of the table. Keys not
-// already in the table are silently ignored.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys of the elements to remove.
+// Generate a sharded filename. The filename is printf formatted as
 //
-// Returns the created operation.
-func LookupTableRemoveV2(scope *Scope, table_handle tf.Output, keys tf.Output) (o *tf.Operation) {
+//    %s-%05d-of-%05d, basename, shard, num_shards.
+func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableRemoveV2",
+		Type: "ShardedFilename",
 		Input: []tf.Input{
-			table_handle, keys,
+			basename, shard, num_shards,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CombinedNonMaxSuppressionAttr is an optional argument to CombinedNonMaxSuppression.
-type CombinedNonMaxSuppressionAttr func(optionalAttr)
+// Generate a glob pattern matching all sharded file names.
+func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ShardedFilespec",
+		Input: []tf.Input{
+			basename, num_shards,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// CombinedNonMaxSuppressionPadPerClass sets the optional pad_per_class attribute to value.
+// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
+type TextLineReaderV2Attr func(optionalAttr)
+
+// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
 //
-// value: If false, the output nmsed boxes, scores and classes
-// are padded/clipped to `max_total_size`. If true, the
-// output nmsed boxes, scores and classes are padded to be of length
-// `max_size_per_class`*`num_classes`, unless it exceeds `max_total_size` in
-// which case it is clipped to `max_total_size`. Defaults to false.
-// If not specified, defaults to false
-func CombinedNonMaxSuppressionPadPerClass(value bool) CombinedNonMaxSuppressionAttr {
+// value: Number of lines to skip from the beginning of every file.
+// If not specified, defaults to 0
+func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
 	return func(m optionalAttr) {
-		m["pad_per_class"] = value
+		m["skip_header_lines"] = value
 	}
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// TextLineReaderV2Container sets the optional container attribute to value.
 //
-// This operation performs non_max_suppression on the inputs per batch, across
-// all classes.
-// Prunes away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system. Also note that
-// this algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is the final boxes, scores and classes tensor
-// returned after performing non_max_suppression.
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
-//	boxes: A 4-D float tensor of shape `[batch_size, num_boxes, q, 4]`. If `q` is 1 then
-// same boxes are used for all classes otherwise, if `q` is equal to number of
-// classes, class-specific boxes are used.
-//	scores: A 3-D float tensor of shape `[batch_size, num_boxes, num_classes]`
-// representing a single score corresponding to each box (each row of boxes).
-//	max_output_size_per_class: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression per class
-//	max_total_size: A scalar representing maximum number of boxes retained over all classes.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the lines of a file delimited by '\n'.
 //
-// Returns A [batch_size, max_detections, 4] float32 tensor
-// containing the non-max suppressed boxes.A [batch_size, max_detections] float32 tensor
-// containing the scores for the boxes.A [batch_size, max_detections] float32 tensor
-// containing the classes for the boxes.A [batch_size] int32 tensor indicating the number of
-// valid detections per batch item. Only the top num_detections[i] entries in
-// nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the
-// entries are zero paddings.
-func CombinedNonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size_per_class tf.Output, max_total_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...CombinedNonMaxSuppressionAttr) (nmsed_boxes tf.Output, nmsed_scores tf.Output, nmsed_classes tf.Output, valid_detections tf.Output) {
+// Returns The handle to reference the Reader.
+func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -32402,137 +35588,192 @@ func CombinedNonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CombinedNonMaxSuppression",
-		Input: []tf.Input{
-			boxes, scores, max_output_size_per_class, max_total_size, iou_threshold, score_threshold,
-		},
+		Type: "TextLineReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
+}
+
+// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
+type LoadAndRemapMatrixAttr func(optionalAttr)
+
+// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
+//
+// value: The maximum number of rows to load from the checkpoint at
+// once. If less than or equal to 0, the entire matrix will be loaded into
+// memory. Setting this arg trades increased disk reads for lower memory usage.
+// If not specified, defaults to -1
+func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
+	return func(m optionalAttr) {
+		m["max_rows_in_memory"] = value
+	}
 }
 
-// Computes the matrix logarithm of one or more square matrices:
+// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
 //
+// at `ckpt_path` and potentially reorders its rows and columns using the
+// specified remappings.
 //
-// \\(log(exp(A)) = A\\)
+// Most users should use one of the wrapper initializers (such as
+// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
+// function directly.
 //
-// This op is only defined for complex matrices. If A is positive-definite and
-// real, then casting to a complex matrix, taking the logarithm and casting back
-// to a real matrix will give the correct result.
+// The remappings are 1-D tensors with the following properties:
 //
-// This function computes the matrix logarithm using the Schur-Parlett algorithm.
-// Details of the algorithm can be found in Section 11.6.2 of:
-// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
-// ISBN 978-0-898716-46-7.
+// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
+//   matrix will be initialized from the row corresponding to index
+//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
+// * `col_remapping` must have either 0 entries (indicating that no column
+//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
+//   output matrix will be initialized from the column corresponding to index
+//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
+// * A value of -1 in either of the remappings signifies a "missing" entry. In that
+//   case, values from the `initializing_values` tensor will be used to fill that
+//   missing row or column. If `row_remapping` has `r` missing entries and
+//   `col_remapping` has `c` missing entries, then the following condition must be
+//   true:
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the exponential for all input submatrices `[..., :, :]`.
+// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
 //
-// Arguments:
-//	input: Shape is `[..., M, M]`.
+// The remapping tensors can be generated using the GenerateVocabRemapping op.
 //
-// Returns Shape is `[..., M, M]`.
+// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
+// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
+// the value from row i, column j of the old tensor in the checkpoint, the output
+// matrix will look like the following:
 //
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.logm
-// @end_compatibility
-func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
+// [[w(1, 0),  w(1, 2),  0.5],
+//  [w(0, 0),  w(0, 2), -0.5],
+//  [0.25,    -0.25,      42]]
+//
+// Arguments:
+//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
+// which the old matrix `Tensor` will be loaded.
+//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
+//	row_remapping: An int `Tensor` of row remappings (generally created by
+// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
+// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
+// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
+//	col_remapping: An int `Tensor` of column remappings (generally created by
+// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
+// is to be done (e.g. column ordering is the same).
+//	initializing_values: A float `Tensor` containing  values to fill in for cells
+// in the output matrix that are not loaded from the checkpoint. Length must be
+// exactly the same as the number of missing / new cells.
+//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
+//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
+//
+// Returns Output matrix containing existing values loaded from the
+// checkpoint, and with any missing values filled in from initializing_values.
+func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixLogarithm",
+		Type: "LoadAndRemapMatrix",
 		Input: []tf.Input{
-			input,
+			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-//   This op is used as a placeholder in If branch functions. It doesn't provide a
-//   valid output when run, so must either be removed (e.g. replaced with a
-//   function input) or guaranteed not to be used (e.g. if mirroring an
-//   intermediate output needed for the gradient computation of the other branch).
+// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
+type TFRecordReaderV2Attr func(optionalAttr)
+
+// TFRecordReaderV2Container sets the optional container attribute to value.
 //
-// Arguments:
-//	dtype: The type of the output.
-//	shape:     The purported shape of the output. This is only used for shape inference;
-//     the output will not necessarily have this shape. Can be a partial shape.
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Returns     \"Fake\" output value. This should not be consumed by another op.
-func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
-	opspec := tf.OpSpec{
-		Type: "FakeParam",
+}
 
-		Attrs: attrs,
+// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
+// If not specified, defaults to ""
+func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the next representable value of `x1` in the direction of `x2`, element-wise.
-//
-// This operation returns the same result as the C++ std::nextafter function.
-//
-// It can also return a subnormal number.
+// A Reader that outputs the records from a TensorFlow Records file.
 //
-// @compatibility(cpp)
-// Equivalent to C++ std::nextafter function.
-// @end_compatibility
-func NextAfter(scope *Scope, x1 tf.Output, x2 tf.Output) (output tf.Output) {
+// Returns The handle to reference the Reader.
+func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NextAfter",
-		Input: []tf.Input{
-			x1, x2,
-		},
+		Type: "TFRecordReaderV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
-//
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
+type QuantizeAndDequantizeV3Attr func(optionalAttr)
+
+// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "InvGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+}
+
+// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// List of the given size with empty elements.
+// Quantizes then dequantizes a tensor.
 //
-// element_shape: the shape of the future elements of the list
-// num_elements: the number of elements to reserve
-// handle: the output list
-// element_dtype: the desired type of elements in the list.
-func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
+// tensor, so its value can change during training.
+func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorListReserve",
+		Type: "QuantizeAndDequantizeV3",
 		Input: []tf.Input{
-			element_shape, num_elements,
+			input, input_min, input_max, num_bits,
 		},
 		Attrs: attrs,
 	}
@@ -32540,73 +35781,77 @@ func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Ou
 	return op.Output(0)
 }
 
-// A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
+// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
+type IdentityReaderV2Attr func(optionalAttr)
+
+// IdentityReaderV2Container sets the optional container attribute to value.
 //
-// Arguments:
-//	selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines which of the
-// `N` data inputs should produce the next output element.
-//	data_input_datasets: `N` datasets with the same type that will be interleaved according to
-// the values of `selector_input_dataset`.
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
 //
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the queued work as both the key and value.
 //
-func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset tf.Output, data_input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// To use, enqueue strings in a Queue.  ReaderRead will take the front
+// work string and output (work, work).
+//
+// Returns The handle to reference the Reader.
+func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalDirectedInterleaveDataset",
-		Input: []tf.Input{
-			selector_input_dataset, tf.OutputList(data_input_datasets),
-		},
+		Type: "IdentityReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomUniformIntAttr is an optional argument to RandomUniformInt.
-type RandomUniformIntAttr func(optionalAttr)
-
-// RandomUniformIntSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
+type ResourceApplyGradientDescentAttr func(optionalAttr)
 
-// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
+// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
+// value: If `True`, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Outputs random integers from a uniform distribution.
-//
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
-//
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
+// Update '*var' by subtracting 'alpha' * 'delta' from it.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	delta: The change.
 //
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -32615,275 +35860,327 @@ func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
+		Type: "ResourceApplyGradientDescent",
 		Input: []tf.Input{
-			shape, minval, maxval,
+			var_, alpha, delta,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Add the quantile summaries to each quantile stream resource.
+// Returns the next record (key, value pair) produced by a Reader.
 //
-// An op that adds a list of quantile summaries to a quantile stream resource. Each
-// summary Tensor is rank 2, containing summaries (value, weight, min_rank, max_rank)
-// for a single feature.
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
 //
 // Arguments:
-//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
-//	summaries: string; List of Rank 2 Tensor each containing the summaries for a single feature.
+//	reader_handle: Handle to a Reader.
+//	queue_handle: Handle to a Queue, with string work items.
 //
-// Returns the created operation.
-func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_stream_resource_handle tf.Output, summaries []tf.Output) (o *tf.Operation) {
+// Returns A scalar.A scalar.
+func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesQuantileStreamResourceAddSummaries",
+		Type: "ReaderReadV2",
 		Input: []tf.Input{
-			quantile_stream_resource_handle, tf.OutputList(summaries),
+			reader_handle, queue_handle,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a Dataset that returns pseudorandom numbers.
+// Returns up to `num_records` (key, value) pairs produced by a Reader.
 //
-// Arguments:
-//	seed: A scalar seed for the random number generator. If either seed or
-// seed2 is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+// It may return less than `num_records` even before the last batch.
 //
+// Arguments:
+//	reader_handle: Handle to a `Reader`.
+//	queue_handle: Handle to a `Queue`, with string work items.
+//	num_records: number of records to read from `Reader`.
 //
-func ExperimentalRandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A 1-D tensor.A 1-D tensor.
+func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalRandomDataset",
+		Type: "ReaderReadUpToV2",
 		Input: []tf.Input{
-			seed, seed2,
+			reader_handle, queue_handle, num_records,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// A dataset that splits the elements of its input into multiple elements.
-func ExperimentalUnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//     Adds v into specified rows of x.
+//
+//     Computes y = x; y[i, :] += v; return y.
+//
+// Arguments:
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalUnbatchDataset",
+		Type: "InplaceAdd",
 		Input: []tf.Input{
-			input_dataset,
+			x, i, v,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that overrides the maximum intra-op parallelism.
+// Restore a Reader to its initial clean state.
 //
 // Arguments:
+//	reader_handle: Handle to a Reader.
 //
-//	max_intra_op_parallelism: Identifies the maximum intra-op parallelism to use.
-//
-//
-func ExperimentalMaxIntraOpParallelismDataset(scope *Scope, input_dataset tf.Output, max_intra_op_parallelism tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalMaxIntraOpParallelismDataset",
+		Type: "ReaderResetV2",
 		Input: []tf.Input{
-			input_dataset, max_intra_op_parallelism,
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// StringSplitV2Attr is an optional argument to StringSplitV2.
-type StringSplitV2Attr func(optionalAttr)
+// BatchAttr is an optional argument to Batch.
+type BatchAttr func(optionalAttr)
 
-// StringSplitV2Maxsplit sets the optional maxsplit attribute to value.
-//
-// value: An `int`. If `maxsplit > 0`, limit of the split of the result.
-// If not specified, defaults to -1
-func StringSplitV2Maxsplit(value int64) StringSplitV2Attr {
+// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value.
+// If not specified, defaults to 10
+func BatchMaxEnqueuedBatches(value int64) BatchAttr {
 	return func(m optionalAttr) {
-		m["maxsplit"] = value
+		m["max_enqueued_batches"] = value
 	}
 }
 
-// Split elements of `source` based on `sep` into a `SparseTensor`.
+// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value.
+// If not specified, defaults to <>
+func BatchAllowedBatchSizes(value []int64) BatchAttr {
+	return func(m optionalAttr) {
+		m["allowed_batch_sizes"] = value
+	}
+}
+
+// BatchContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func BatchContainer(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// BatchSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func BatchSharedName(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// BatchBatchingQueue sets the optional batching_queue attribute to value.
+// If not specified, defaults to ""
+func BatchBatchingQueue(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["batching_queue"] = value
+	}
+}
+
+// Batches all input tensors nondeterministically.
 //
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `source` based on `sep` and return a `SparseTensor`
-// containing the split tokens. Empty tokens are ignored.
+// When many instances of this Op are being run concurrently with the same
+// container/shared_name in the same device, some will output zero-shaped Tensors
+// and others will output Tensors of size up to max_batch_size.
 //
-// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
-// then the output will be
-// ```
-// st.indices = [0, 0;
-//               0, 1;
-//               1, 0;
-//               1, 1;
-//               1, 2]
-// st.shape = [2, 3]
-// st.values = ['hello', 'world', 'a', 'b', 'c']
-// ```
+// All Tensors in in_tensors are batched together (so, for example, labels and
+// features should be batched with a single instance of this operation.
 //
-// If `sep` is given, consecutive delimiters are not grouped together and are
-// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
-// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
-// string, consecutive whitespace are regarded as a single separator, and the
-// result will contain no empty strings at the startor end if the string has
-// leading or trailing whitespace.
+// Each invocation of batch emits an `id` scalar which will be used to identify
+// this particular invocation when doing unbatch or its gradient.
 //
-// Note that the above mentioned behavior matches python's str.split.
+// Each op which emits a non-empty batch will also emit a non-empty batch_index
+// Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
+// start, and length of elements of each set of Tensors present in batched_tensors.
 //
-// Arguments:
-//	input: `1-D` string `Tensor`, the strings to split.
-//	sep: `0-D` string `Tensor`, the delimiter character.
-func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) {
+// Batched tensors are concatenated along the first dimension, and all tensors in
+// in_tensors must have the first dimension of the same size.
+//
+// in_tensors: The tensors to be batched.
+// num_batch_threads: Number of scheduling threads for processing batches of work.
+//  Determines the number of batches processed in parallel.
+// max_batch_size: Batch sizes will never be bigger than this.
+// batch_timeout_micros: Maximum number of microseconds to wait before outputting
+//  an incomplete batch.
+// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
+//  nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
+//  batches up to one of those sizes. The entries must increase monotonically, and
+//  the final entry must equal max_batch_size.
+// grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
+// batched_tensors: Either empty tensors or a batch of concatenated Tensors.
+// batch_index: If out_tensors is non-empty, has information to invert it.
+// container: Controls the scope of sharing of this batch.
+// id: always contains a scalar with a unique ID for this invocation of Batch.
+// shared_name: Concurrently running instances of batch in the same device with the
+//  same container and shared_name will batch their elements together. If left
+//  empty, the op name will be used as the shared name.
+// T: the types of tensors to be batched.
+func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringSplitV2",
+		Type: "Batch",
 		Input: []tf.Input{
-			input, sep,
+			tf.OutputList(in_tensors),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil {
+		scope.UpdateErr("Batch", err)
+		return
+	}
+	batch_index = op.Output(idx)
+	id = op.Output(idx)
+	return batched_tensors, batch_index, id
 }
 
-// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+// Adjust the hue of one or more images.
+//
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
+//
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A delta is then applied all the hue values,
+// and then remapped back to RGB colorspace.
 //
 // Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	delta: A float delta to add to the hue.
 //
-//	thread_pool: A resource produced by the ThreadPoolHandle op.
-//
-//
-func ExperimentalThreadPoolDataset(scope *Scope, input_dataset tf.Output, thread_pool tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns The hue-adjusted image or images.
+func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalThreadPoolDataset",
+		Type: "AdjustHue",
 		Input: []tf.Input{
-			input_dataset, thread_pool,
+			images, delta,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softsign: `features / (abs(features) + 1)`.
-func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
+// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
+type ResizeBicubicGradAttr func(optionalAttr)
+
+// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Computes the gradient of bicubic interpolation.
+//
+// Arguments:
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
+//
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Softsign",
+		Type: "ResizeBicubicGrad",
 		Input: []tf.Input{
-			features,
+			grads, original_image,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EncodeProtoAttr is an optional argument to EncodeProto.
-type EncodeProtoAttr func(optionalAttr)
+// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
+type ResizeNearestNeighborAttr func(optionalAttr)
 
-// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
-// If not specified, defaults to "local://"
-func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
+// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
 	return func(m optionalAttr) {
-		m["descriptor_source"] = value
+		m["align_corners"] = value
 	}
 }
 
-// The op serializes protobuf messages provided in the input tensors.
-//
-// The types of the tensors in `values` must match the schema for the
-// fields specified in `field_names`. All the tensors in `values` must
-// have a common shape prefix, *batch_shape*.
-//
-// The `sizes` tensor specifies repeat counts for each field.  The repeat
-// count (last dimension) of a each tensor in `values` must be greater
-// than or equal to corresponding repeat count in `sizes`.
-//
-// A `message_type` name must be provided to give context for the field
-// names. The actual message descriptor can be looked up either in the
-// linked-in descriptor pool or a filename provided by the caller using
-// the `descriptor_source` attribute.
-//
-// The `descriptor_source` attribute selects a source of protocol
-// descriptors to consult when looking up `message_type`. This may be a
-// filename containing a serialized `FileDescriptorSet` message,
-// or the special value `local://`, in which case only descriptors linked
-// into the code will be searched; the filename can be on any filesystem
-// accessible to TensorFlow.
-//
-// You can build a `descriptor_source` file using the `--descriptor_set_out`
-// and `--include_imports` options to the protocol compiler `protoc`.
-//
-// The `local://` database only covers descriptors linked into the
-// code via C++ libraries, not Python imports. You can link in a proto descriptor
-// by creating a cc_library target with alwayslink=1.
-//
-// There are a few special cases in the value mapping:
-//
-// Submessage and group fields must be pre-serialized as TensorFlow strings.
-//
-// TensorFlow lacks support for unsigned int64s, so they must be
-// represented as `tf.int64` with the same twos-complement bit pattern
-// (the obvious way).
-//
-// Unsigned int32 values can be represented exactly with `tf.int64`, or
-// with sign wrapping if the input is of type `tf.int32`.
+// Resize `images` to `size` using nearest neighbor interpolation.
 //
 // Arguments:
-//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
-//	values: List of tensors containing values for the corresponding field.
-//	field_names: List of strings containing proto field names.
-//	message_type: Name of the proto message type to decode.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns Tensor of serialized protos with shape `batch_shape`.
-func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeProto",
+		Type: "ResizeNearestNeighbor",
 		Input: []tf.Input{
-			sizes, tf.OutputList(values),
+			images, size,
 		},
 		Attrs: attrs,
 	}
@@ -32891,163 +36188,144 @@ func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names
 	return op.Output(0)
 }
 
-// Creates a TensorArray for storing the gradients of values in the given handle.
-//
-// If the given TensorArray gradient already exists, returns a reference to it.
-//
-// Locks the size of the original TensorArray by disabling its dynamic size flag.
-//
-// **A note about the input flow_in:**
-//
-// The handle flow_in forces the execution of the gradient lookup to occur
-// only after certain other operations have occurred.  For example, when
-// the forward TensorArray is dynamically sized, writes to this TensorArray
-// may resize the object.  The gradient TensorArray is statically sized based
-// on the size of the forward TensorArray when this operation executes.
-// Furthermore, the size of the forward TensorArray is frozen by this call.
-// As a result, the flow is used to ensure that the call to generate the gradient
-// TensorArray only happens after all writes are executed.
-//
-// In the case of dynamically sized TensorArrays, gradient computation should
-// only be performed on read operations that have themselves been chained via
-// flow to occur only after all writes have executed. That way the final size
-// of the forward TensorArray is known when this operation is called.
-//
-// **A note about the source attribute:**
-//
-// TensorArray gradient calls use an accumulator TensorArray object.  If
-// multiple gradients are calculated and run in the same session, the multiple
-// gradient nodes may accidentally flow through the same accumulator TensorArray.
-// This double counts and generally breaks the TensorArray gradient flow.
-//
-// The solution is to identify which gradient call this particular
-// TensorArray gradient is being called in.  This is performed by identifying
-// a unique string (e.g. "gradients", "gradients_1", ...) from the input
-// gradient Tensor's name.  This string is used as a suffix when creating
-// the TensorArray gradient object here (the attribute `source`).
+// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
+type ResizeNearestNeighborGradAttr func(optionalAttr)
+
+// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
 //
-// The attribute `source` is added as a suffix to the forward TensorArray's
-// name when performing the creation / lookup, so that each separate gradient
-// calculation gets its own TensorArray accumulator.
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Computes the gradient of nearest neighbor interpolation.
 //
 // Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
+// original input size.
+//
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
+// with respect to the input image.
+func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV3",
+		Type: "ResizeNearestNeighborGrad",
 		Input: []tf.Input{
-			handle, flow_in,
+			grads, size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Creates a dataset that splits a SparseTensor into elements row-wise.
-func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseTensorSliceDataset",
-		Input: []tf.Input{
-			indices, values, dense_shape,
-		},
+// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
+type ExtractJpegShapeAttr func(optionalAttr)
+
+// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
+//
+// value: (Optional) The output type of the operation (int32 or int64).
+// Defaults to int32.
+// If not specified, defaults to DT_INT32
+func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns x / y element-wise for real types.
+// Extract the shape information of a JPEG-encoded image.
 //
-// If `x` and `y` are reals, this will return the floating-point division.
+// This op only parses the image header, so it is much faster than DecodeJpeg.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	contents: 0-D. The JPEG-encoded image.
+//
+// Returns 1-D. The image shape with format [height, width, channels].
+func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RealDiv",
+		Type: "ExtractJpegShape",
 		Input: []tf.Input{
-			x, y,
+			contents,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
-func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// DecodePngAttr is an optional argument to DecodePng.
+type DecodePngAttr func(optionalAttr)
+
+// DecodePngChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodePngChannels(value int64) DecodePngAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ConcatenateDataset",
-		Input: []tf.Input{
-			input_dataset, another_dataset,
-		},
-		Attrs: attrs,
+}
+
+// DecodePngDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_UINT8
+func DecodePngDtype(value tf.DataType) DecodePngAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
+// Decode a PNG-encoded image to a uint8 or uint16 tensor.
 //
-// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
-// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
-// input channel is processed independently of the others with its own structuring
-// function. The `output` tensor has shape
-// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
-// tensor depend on the `padding` algorithm. We currently only support the default
-// "NHWC" `data_format`.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
-// (for consistency with `conv2d`, we use unmirrored filters):
+// Accepted values are:
 //
-//     output[b, y, x, c] =
-//        max_{dy, dx} input[b,
-//                           strides[1] * y + rates[1] * dy,
-//                           strides[2] * x + rates[2] * dx,
-//                           c] +
-//                     filter[dy, dx, c]
+// *   0: Use the number of channels in the PNG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
 //
-// Max-pooling is a special case when the filter has size equal to the pooling
-// kernel size and contains all zeros.
+// If needed, the PNG-encoded image is transformed to match the requested number
+// of color channels.
 //
-// Note on duality: The dilation of `input` by the `filter` is equal to the
-// negation of the erosion of `-input` by the reflected `filter`.
+// This op also supports decoding JPEGs and non-animated GIFs since the interface
+// is the same, though it is cleaner to use `tf.image.decode_image`.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: The input stride for atrous morphological dilation. Must be:
-// `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//	contents: 0-D.  The PNG-encoded image.
 //
-// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
-func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`.
+func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Dilation2D",
+		Type: "DecodePng",
 		Input: []tf.Input{
-			input, filter,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -33055,635 +36333,663 @@ func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64
 	return op.Output(0)
 }
 
-// Converts the given variant tensor to an iterator and stores it in the given resource.
+// Decode the first frame of a GIF-encoded image to a uint8 tensor.
+//
+// GIF with frame or transparency compression are not supported
+// convert animated GIF from compressed to uncompressed by:
+//
+//     convert $src.gif -coalesce $dst.gif
+//
+// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+// `tf.image.decode_image`.
 //
 // Arguments:
-//	resource_handle: A handle to an iterator resource.
-//	serialized: A variant tensor storing the state of the iterator contained in the
-// resource.
+//	contents: 0-D.  The GIF-encoded image.
 //
-// Returns the created operation.
-func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
+// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
+func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DeserializeIterator",
+		Type: "DecodeGif",
 		Input: []tf.Input{
-			resource_handle, serialized,
+			contents,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceScatterNdSubAttr is an optional argument to ResourceScatterNdSub.
-type ResourceScatterNdSubAttr func(optionalAttr)
+// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
+type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
 
-// ResourceScatterNdSubUseLocking sets the optional use_locking attribute to value.
+// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdSubUseLocking(value bool) ResourceScatterNdSubAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed"] = value
 	}
 }
 
-// Applies sparse subtraction to individual values or slices in a Variable.
-//
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
-// ```
-//
-// For example, say we want to subtract 4 scattered elements from a rank-1 tensor
-// with 8 elements. In Python, that subtraction would look like this:
+// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// ```python
-// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-// indices = tf.constant([[4], [3], [1], [7]])
-// updates = tf.constant([9, 10, 11, 12])
-// sub = tf.scatter_nd_sub(ref, indices, updates)
-// with tf.Session() as sess:
-//   print sess.run(sub)
-// ```
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// The resulting update to ref would look like this:
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-//     [1, -9, 3, -6, -4, 6, 7, -4]
+// For each batch, this op picks a single set of sampled candidate labels.
 //
-// See `tf.scatter_nd` for more details about how to make updates to
-// slices.
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of
-// values to add to ref.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-// Returns the created operation.
-func ResourceScatterNdSub(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdSubAttr) (o *tf.Operation) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdSub",
+		Type: "LearnedUnigramCandidateSampler",
 		Input: []tf.Input{
-			ref, indices, updates,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Creates a dataset that batches and pads `batch_size` elements from the input.
+// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
+type RandomShuffleQueueV2Attr func(optionalAttr)
+
+// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
 //
-// Arguments:
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	padded_shapes: A list of int64 tensors representing the desired padded shapes
-// of the corresponding output components. These shapes may be partially
-// specified, using `-1` to indicate that a particular dimension should be
-// padded to the maximum size of all batch elements.
-//	padding_values: A list of scalars containing the padding value to use for
-// each of the outputs.
+// REQUIRES: len(value) >= 0
+func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shapes"] = value
+	}
+}
+
+// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
 //
-func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "PaddedBatchDataset",
-		Input: []tf.Input{
-			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
-		},
-		Attrs: attrs,
+}
+
+// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
+//
+// value: Dequeue will block unless there would be this
+// many elements after the dequeue or the queue is closed. This
+// ensures a minimum level of mixing of elements.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["min_after_dequeue"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that shuffles and repeats elements from `input_dataset`
-//
-// pseudorandomly.
-//
-// Arguments:
-//
-//	buffer_size: The number of output elements to buffer in an iterator over
-// this dataset. Compare with the `min_after_dequeue` attr when creating a
-// `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either `seed` or
-// `seed2` is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
-//	count: A scalar representing the number of times the underlying dataset
-// should be repeated. The default is `-1`, which results in infinite repetition.
-//
+// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
 //
-func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ShuffleAndRepeatDataset",
-		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2, count,
-		},
-		Attrs: attrs,
+}
+
+// RandomShuffleQueueV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that caches elements from `input_dataset`.
+// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// A CacheDataset will iterate over the input_dataset, and store tensors. If the
-// cache already exists, the cache will be used. If the cache is inappropriate
-// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
-// will the returned when used.
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that randomizes the order of elements.
 //
 // Arguments:
+//	component_types: The type of each component in a value.
 //
-//	filename: A path on the filesystem where we should cache the dataset. Note: this
-// will be a directory.
-//
-//
-func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns The handle to the queue.
+func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "CacheDataset",
-		Input: []tf.Input{
-			input_dataset, filename,
-		},
+		Type: "RandomShuffleQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that emits the records from one or more binary files.
+// SerializeSparseAttr is an optional argument to SerializeSparse.
+type SerializeSparseAttr func(optionalAttr)
+
+// SerializeSparseOutType sets the optional out_type attribute to value.
 //
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	header_bytes: A scalar representing the number of bytes to skip at the
-// beginning of a file.
-//	record_bytes: A scalar representing the number of bytes in each record.
-//	footer_bytes: A scalar representing the number of bytes to skip at the end
-// of a file.
-//	buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
-func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output, buffer_size tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordDataset",
-		Input: []tf.Input{
-			filenames, header_bytes, record_bytes, footer_bytes, buffer_size,
-		},
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Gradients for batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
-//
-// This op is deprecated. See `tf.nn.batch_normalization`.
+// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this Tensor will be multiplied
-// with the normalized Tensor.
-//	backprop: 4D backprop Tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-//
-// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
-func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalizationGrad",
+		Type: "SerializeSparse",
 		Input: []tf.Input{
-			t, m, v, gamma, backprop,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Creates a dataset that emits the records from one or more TFRecord files.
+// Draw bounding boxes on a batch of images.
+//
+// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+// boxes specified by the locations in `boxes`. The coordinates of the each
+// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example, if an image is 100 x 200 pixels (height x width) and the bounding
+// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
+// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
+//
+// Parts of the bounding box may fall outside the image.
 //
 // Arguments:
-//	filenames: A scalar or vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar representing the number of bytes to buffer. A value of
-// 0 means no buffering will be performed.
-func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
+//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
+// boxes.
+//
+// Returns 4-D with the same shape as `images`. The batch of input images with
+// bounding boxes drawn on the images.
+func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TFRecordDataset",
+		Type: "DrawBoundingBoxes",
 		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
+			images, boxes,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ExperimentalStatsAggregatorHandleAttr is an optional argument to ExperimentalStatsAggregatorHandle.
-type ExperimentalStatsAggregatorHandleAttr func(optionalAttr)
+// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
+type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
 
-// ExperimentalStatsAggregatorHandleContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func ExperimentalStatsAggregatorHandleContainer(value string) ExperimentalStatsAggregatorHandleAttr {
+// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["seed"] = value
 	}
 }
 
-// ExperimentalStatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func ExperimentalStatsAggregatorHandleSharedName(value string) ExperimentalStatsAggregatorHandleAttr {
+// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["seed2"] = value
 	}
 }
 
-// Creates a statistics manager resource.
-func ExperimentalStatsAggregatorHandle(scope *Scope, optional ...ExperimentalStatsAggregatorHandleAttr) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalStatsAggregatorHandle",
+}
 
-		Attrs: attrs,
+// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// A container for an iterator resource.
+// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
 //
-// Returns A handle to the iterator that can be passed to a "MakeIterator" or
-// "IteratorGetNext" op. In contrast to Iterator, AnonymousIterator prevents
-// resource sharing by name, and does not keep a reference to the resource
-// container.
-func AnonymousIterator(scope *Scope, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "AnonymousIterator",
+}
 
-		Attrs: attrs,
+// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+//
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// BatchToSpace for 4-D tensors of type T.
-//
-// This is a legacy version of the more general BatchToSpaceND.
-//
-// Rearranges (permutes) data from batch into blocks of spatial data, followed by
-// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
-// this op outputs a copy of the input tensor where values from the `batch`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions,
-// followed by cropping along the `height` and `width` dimensions.
-//
-// Arguments:
-//	input: 4-D tensor with shape
-// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//   depth]`. Note that the batch size of the input tensor must be divisible by
-// `block_size * block_size`.
-//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-// how many elements to crop from the intermediate result across the spatial
-// dimensions as follows:
-//
-//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
-//
-//
-// Returns 4-D with shape `[batch, height, width, depth]`, where:
-//
-//       height = height_pad - crop_top - crop_bottom
-//       width = width_pad - crop_left - crop_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
+// Generate a single randomly distorted bounding box for an image.
 //
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
 //
-// The output tensor has shape `[1, 2, 2, 3]` and value:
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
 //
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
-// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
+// For example,
 //
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
 //
-// The output tensor has shape `[1, 4, 4, 1]` and value:
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
 //
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
 // ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
-//
-// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
 //
-// ```
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-// ```
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
 //
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//	min_object_covered: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
 //
-// ```
-// x = [[[[1], [3]], [[5], [7]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpace",
+		Type: "SampleDistortedBoundingBoxV2",
 		Input: []tf.Input{
-			input, crops,
+			image_size, bounding_boxes, min_object_covered,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Produces a summary of any statistics recorded by the given statistics manager.
-func ExperimentalStatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
+// Computes requantization range per channel.
+//
+// Arguments:
+//	input: The original input tensor.
+//	input_min: The minimum value of the input tensor
+//	input_max: The maximum value of the input tensor.
+//	clip_value_max: The maximum value of the output that needs to be clipped.
+// Example: set this to 6 for Relu6.
+//
+// Returns The minimum value of the final output tensorThe maximum value of the final output tensor.
+func RequantizationRangePerChannel(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, clip_value_max float32) (output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"clip_value_max": clip_value_max}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalStatsAggregatorSummary",
+		Type: "RequantizationRangePerChannel",
 		Input: []tf.Input{
-			iterator,
+			input, input_min, input_max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Makes a new iterator from the given `dataset` and stores it in `iterator`.
+// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
+type ExtractGlimpseAttr func(optionalAttr)
+
+// ExtractGlimpseCentered sets the optional centered attribute to value.
 //
-// This operation may be executed multiple times. Each execution will reset the
-// iterator in `iterator` to the first element of `dataset`.
+// value: indicates if the offset coordinates are centered relative to
+// the image, in which case the (0, 0) offset is relative to the center
+// of the input images. If false, the (0,0) offset corresponds to the
+// upper left corner of the input images.
+// If not specified, defaults to true
+func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["centered"] = value
+	}
+}
+
+// ExtractGlimpseNormalized sets the optional normalized attribute to value.
 //
-// Returns the created operation.
-func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: indicates if the offset coordinates are normalized.
+// If not specified, defaults to true
+func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["normalized"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "MakeIterator",
-		Input: []tf.Input{
-			dataset, iterator,
-		},
+}
+
+// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
+//
+// value: indicates if the noise should be generated using a
+// uniform distribution or a Gaussian distribution.
+// If not specified, defaults to true
+func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["uniform_noise"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Adjust the contrast of one or more images.
+// ExtractGlimpseNoise sets the optional noise attribute to value.
 //
-// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
-// interpreted as `[height, width, channels]`.  The other dimensions only
-// represent a collection of images, such as `[batch, height, width, channels].`
+// value: indicates if the noise should `uniform`, `gaussian`, or
+// `zero`. The default is `uniform` which means the the noise type
+// will be decided by `uniform_noise`.
+// If not specified, defaults to "uniform"
+func ExtractGlimpseNoise(value string) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["noise"] = value
+	}
+}
+
+// Extracts a glimpse from the input tensor.
 //
-// Contrast is adjusted independently for each channel of each image.
+// Returns a set of windows called glimpses extracted at location
+// `offsets` from the input tensor. If the windows only partially
+// overlaps the inputs, the non overlapping areas will be filled with
+// random noise.
 //
-// For each channel, the Op first computes the mean of the image pixels in the
-// channel and then adjusts each component of each pixel to
-// `(x - mean) * contrast_factor + mean`.
+// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+// glimpse_width, channels]`. The channels and batch dimensions are the
+// same as that of the input tensor. The height and width of the output
+// windows are specified in the `size` parameter.
+//
+// The argument `normalized` and `centered` controls how the windows are built:
+//
+// * If the coordinates are normalized but not centered, 0.0 and 1.0
+//   correspond to the minimum and maximum of each height and width
+//   dimension.
+// * If the coordinates are both normalized and centered, they range from
+//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+//   left corner, the lower right corner is located at (1.0, 1.0) and the
+//   center is at (0, 0).
+// * If the coordinates are not normalized they are interpreted as
+//   numbers of pixels.
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	contrast_factor: A float multiplier for adjusting contrast.
+//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
+//	size: A 1-D tensor of 2 elements containing the size of the glimpses
+// to extract.  The glimpse height must be specified first, following
+// by the glimpse width.
+//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
+// the y, x locations of the center of each window.
 //
-// Returns The contrast-adjusted image or images.
-func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
+// Returns A tensor representing the glimpses `[batch_size,
+// glimpse_height, glimpse_width, channels]`.
+func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "AdjustContrastv2",
-		Input: []tf.Input{
-			images, contrast_factor,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gets the next output from the given iterator .
-func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNext",
+		Type: "ExtractGlimpse",
 		Input: []tf.Input{
-			iterator,
+			input, size, offsets,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("IteratorGetNext", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// Outputs the single element from the given dataset.
-//
-// Arguments:
-//	dataset: A handle to a dataset that contains a single element.
-//
-//
+// A container for an iterator resource.
 //
-// Returns The components of the single element of `input`.
-func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// Returns A handle to the iterator that can be passed to a "MakeIterator"
+// or "IteratorGetNext" op.
+func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "DatasetToSingleElement",
-		Input: []tf.Input{
-			dataset,
-		},
+		Type: "Iterator",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
+	return op.Output(0)
+}
+
+// TensorForestTreeResourceHandleOpAttr is an optional argument to TensorForestTreeResourceHandleOp.
+type TensorForestTreeResourceHandleOpAttr func(optionalAttr)
+
+// TensorForestTreeResourceHandleOpContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func TensorForestTreeResourceHandleOpContainer(value string) TensorForestTreeResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("DatasetToSingleElement", err)
-		return
+}
+
+// TensorForestTreeResourceHandleOpSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func TensorForestTreeResourceHandleOpSharedName(value string) TensorForestTreeResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	return components
 }
 
-// Converts the given `resource_handle` representing an iterator to a string.
-//
-// Arguments:
-//	resource_handle: A handle to an iterator resource.
-//
-// Returns A string representation of the given handle.
-func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
+// Creates a handle to a TensorForestTreeResource
+func TensorForestTreeResourceHandleOp(scope *Scope, optional ...TensorForestTreeResourceHandleOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IteratorToStringHandle",
-		Input: []tf.Input{
-			resource_handle,
-		},
+		Type: "TensorForestTreeResourceHandleOp",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
-type IteratorFromStringHandleAttr func(optionalAttr)
-
-// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
-//
-// value: If specified, defines the type of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_types"] = value
-	}
-}
+// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
+type CropAndResizeGradImageAttr func(optionalAttr)
 
-// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
-//
-// value: If specified, defines the shape of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
+// CropAndResizeGradImageMethod sets the optional method attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
 	return func(m optionalAttr) {
-		m["output_shapes"] = value
+		m["method"] = value
 	}
 }
 
-// Converts the given string representing a handle to an iterator to a resource.
+// Computes the gradient of the crop_and_resize op wrt the input image tensor.
 //
 // Arguments:
-//	string_handle: A string representation of the given handle.
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
+// containing the original image size. Both `image_height` and `image_width` need
+// to be positive.
 //
-// Returns A handle to an iterator resource.
-func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
+//
+// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "IteratorFromStringHandle",
+		Type: "CropAndResizeGradImage",
 		Input: []tf.Input{
-			string_handle,
+			grads, boxes, box_ind, image_size,
 		},
 		Attrs: attrs,
 	}
@@ -33691,406 +36997,378 @@ func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ..
 	return op.Output(0)
 }
 
-// Gather slices from `params` axis `axis` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
-// params.shape[axis + 1:]` where:
-//
-// ```python
-//     # Scalar indices (output is rank(params) - 1).
-//     output[a_0, ..., a_n, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
-//
-//     # Vector indices (output is rank(params)).
-//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
-//
-//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
-//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
-//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
-// ```
+// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
+type ShuffleDatasetAttr func(optionalAttr)
+
+// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
+// value: If true, each iterator over this dataset will be given
+// a different pseudorandomly generated seed, based on a sequence seeded by the
+// `seed` and `seed2` inputs. If false, each iterator will be given the same
+// seed, and repeated iteration over this dataset will yield the exact same
+// sequence of results.
+// If not specified, defaults to true
+func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
+	return func(m optionalAttr) {
+		m["reshuffle_each_iteration"] = value
+	}
+}
+
+// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
 //
-// Note that on CPU, if an out of bound index is found, an error is returned.
-// On GPU, if an out of bound index is found, a 0 is stored in the
-// corresponding output value.
+// Arguments:
 //
-// See also `tf.batch_gather` and `tf.gather_nd`.
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
 //
-// Arguments:
-//	params: The tensor from which to gather values. Must be at least rank
-// `axis + 1`.
-//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
-//	axis: The axis in `params` to gather `indices` from. Defaults to the first
-// dimension. Supports negative indexes.
 //
-// Returns Values from `params` gathered from indices given by `indices`, with
-// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
-func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
+func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "GatherV2",
+		Type: "ShuffleDataset",
 		Input: []tf.Input{
-			params, indices, axis,
+			input_dataset, buffer_size, seed, seed2,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts the given `resource_handle` representing an iterator to a variant tensor.
+// 3D fast Fourier transform.
+//
+// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
+// dimensions of `input`.
 //
 // Arguments:
-//	resource_handle: A handle to an iterator resource.
+//	input: A complex64 tensor.
 //
-// Returns A variant tensor storing the state of the iterator contained in the
-// resource.
-func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fftn with 3 dimensions.
+// @end_compatibility
+func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeIterator",
+		Type: "FFT3D",
 		Input: []tf.Input{
-			resource_handle,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
-type FIFOQueueV2Attr func(optionalAttr)
-
-// FIFOQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
-	}
-}
-
-// FIFOQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// FIFOQueueV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
+// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
+type CropAndResizeGradBoxesAttr func(optionalAttr)
 
-// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
+// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["method"] = value
 	}
 }
 
-// A queue that produces elements in first-in first-out order.
+// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
 //
-// Returns The handle to the queue.
-func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
+// Returns A 2-D tensor of shape `[num_boxes, 4]`.
+func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FIFOQueueV2",
-
+		Type: "CropAndResizeGradBoxes",
+		Input: []tf.Input{
+			grads, image, boxes, box_ind,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserializes a proto into the tree handle
+// Greedily selects a subset of bounding boxes in descending order of score,
+//
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system and more
+// generally is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	tree_handle: Handle to the tree resource to be restored.
-//	tree_config: Serialied proto string of the boosted_trees.Tree proto.
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
 //
-// Returns the created operation.
-func TensorForestTreeDeserialize(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorForestTreeDeserialize",
-		Input: []tf.Input{
-			tree_handle, tree_config,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Constructs an Optional variant from a tuple of tensors.
-func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV3(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "OptionalFromValue",
+		Type: "NonMaxSuppressionV3",
 		Input: []tf.Input{
-			tf.OutputList(components),
+			boxes, scores, max_output_size, iou_threshold, score_threshold,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeProtoV2Attr is an optional argument to DecodeProtoV2.
-type DecodeProtoV2Attr func(optionalAttr)
-
-// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value.
-//
-// value: Either the special value `local://` or a path to a file containing
-// a serialized `FileDescriptorSet`.
-// If not specified, defaults to "local://"
-func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr {
-	return func(m optionalAttr) {
-		m["descriptor_source"] = value
-	}
-}
-
-// DecodeProtoV2MessageFormat sets the optional message_format attribute to value.
-//
-// value: Either `binary` or `text`.
-// If not specified, defaults to "binary"
-func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr {
-	return func(m optionalAttr) {
-		m["message_format"] = value
-	}
-}
+// NonMaxSuppressionV4Attr is an optional argument to NonMaxSuppressionV4.
+type NonMaxSuppressionV4Attr func(optionalAttr)
 
-// DecodeProtoV2Sanitize sets the optional sanitize attribute to value.
+// NonMaxSuppressionV4PadToMaxOutputSize sets the optional pad_to_max_output_size attribute to value.
 //
-// value: Whether to sanitize the result or not.
+// value: If true, the output `selected_indices` is padded to be of length
+// `max_output_size`. Defaults to false.
 // If not specified, defaults to false
-func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr {
+func NonMaxSuppressionV4PadToMaxOutputSize(value bool) NonMaxSuppressionV4Attr {
 	return func(m optionalAttr) {
-		m["sanitize"] = value
+		m["pad_to_max_output_size"] = value
 	}
 }
 
-// The op extracts fields from a serialized protocol buffers message into tensors.
-//
-// The `decode_proto` op extracts fields from a serialized protocol buffers
-// message into tensors.  The fields in `field_names` are decoded and converted
-// to the corresponding `output_types` if possible.
-//
-// A `message_type` name must be provided to give context for the field
-// names. The actual message descriptor can be looked up either in the
-// linked-in descriptor pool or a filename provided by the caller using
-// the `descriptor_source` attribute.
-//
-// Each output tensor is a dense tensor. This means that it is padded to
-// hold the largest number of repeated elements seen in the input
-// minibatch. (The shape is also padded by one to prevent zero-sized
-// dimensions). The actual repeat counts for each example in the
-// minibatch can be found in the `sizes` output. In many cases the output
-// of `decode_proto` is fed immediately into tf.squeeze if missing values
-// are not a concern. When using tf.squeeze, always pass the squeeze
-// dimension explicitly to avoid surprises.
-//
-// For the most part, the mapping between Proto field types and
-// TensorFlow dtypes is straightforward. However, there are a few
-// special cases:
-//
-// - A proto field that contains a submessage or group can only be converted
-// to `DT_STRING` (the serialized submessage). This is to reduce the
-// complexity of the API. The resulting string can be used as input
-// to another instance of the decode_proto op.
-//
-// - TensorFlow lacks support for unsigned integers. The ops represent uint64
-// types as a `DT_INT64` with the same twos-complement bit pattern
-// (the obvious way). Unsigned int32 values can be represented exactly by
-// specifying type `DT_INT64`, or using twos-complement if the caller
-// specifies `DT_INT32` in the `output_types` attribute.
-//
-// The `descriptor_source` attribute selects a source of protocol
-// descriptors to consult when looking up `message_type`. This may be a
-// filename containing a serialized `FileDescriptorSet` message,
-// or the special value `local://`, in which case only descriptors linked
-// into the code will be searched; the filename can be on any filesystem
-// accessible to TensorFlow.
-//
-// You can build a `descriptor_source` file using the `--descriptor_set_out`
-// and `--include_imports` options to the protocol compiler `protoc`.
-//
-// The `local://` database only covers descriptors linked into the
-// code via C++ libraries, not Python imports. You can link in a proto descriptor
-// by creating a cc_library target with alwayslink=1.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// Both binary and text proto serializations are supported, and can be
-// chosen using the `format` attribute.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system and more
+// generally is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	bytes: Tensor of serialized protos with shape `batch_shape`.
-//	message_type: Name of the proto message type to decode.
-//	field_names: List of strings containing proto field names. An extension field can be decoded
-// by using its full name, e.g. EXT_PACKAGE.EXT_FIELD_NAME.
-//	output_types: List of TF types to use for the respective field in field_names.
-//
-// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`.
-// Each entry is the number of values found for the corresponding field.
-// Optional fields may have 0 or 1 values.List of tensors containing values for the corresponding field.
-// `values[i]` has datatype `output_types[i]`
-// and shape `[batch_shape, max(sizes[...,i])]`.
-func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) {
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.A 0-D integer tensor representing the number of valid elements in
+// `selected_indices`, with the valid elements appearing first.
+func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...NonMaxSuppressionV4Attr) (selected_indices tf.Output, valid_outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeProtoV2",
+		Type: "NonMaxSuppressionV4",
 		Input: []tf.Input{
-			bytes,
+			boxes, scores, max_output_size, iou_threshold, score_threshold,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	sizes = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("DecodeProtoV2", err)
-		return
-	}
-	return sizes, values
-}
-
-// Creates an Optional variant with no value.
-func OptionalNone(scope *Scope) (optional tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "OptionalNone",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns true if and only if the given Optional variant has a value.
-func OptionalHasValue(scope *Scope, optional tf.Output) (has_value tf.Output) {
+// Removes keys and its associated values from a table.
+//
+// The tensor `keys` must of the same type as the keys of the table. Keys not
+// already in the table are silently ignored.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys of the elements to remove.
+//
+// Returns the created operation.
+func LookupTableRemoveV2(scope *Scope, table_handle tf.Output, keys tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "OptionalHasValue",
+		Type: "LookupTableRemoveV2",
 		Input: []tf.Input{
-			optional,
+			table_handle, keys,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the value stored in an Optional variant or raises an error if none exists.
-func OptionalGetValue(scope *Scope, optional tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "OptionalGetValue",
-		Input: []tf.Input{
-			optional,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("OptionalGetValue", err)
-		return
+// CombinedNonMaxSuppressionAttr is an optional argument to CombinedNonMaxSuppression.
+type CombinedNonMaxSuppressionAttr func(optionalAttr)
+
+// CombinedNonMaxSuppressionPadPerClass sets the optional pad_per_class attribute to value.
+//
+// value: If false, the output nmsed boxes, scores and classes
+// are padded/clipped to `max_total_size`. If true, the
+// output nmsed boxes, scores and classes are padded to be of length
+// `max_size_per_class`*`num_classes`, unless it exceeds `max_total_size` in
+// which case it is clipped to `max_total_size`. Defaults to false.
+// If not specified, defaults to false
+func CombinedNonMaxSuppressionPadPerClass(value bool) CombinedNonMaxSuppressionAttr {
+	return func(m optionalAttr) {
+		m["pad_per_class"] = value
 	}
-	return components
 }
 
-// Gets the next output from the given iterator as an Optional variant.
-func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (optional tf.Output) {
+// Greedily selects a subset of bounding boxes in descending order of score,
+//
+// This operation performs non_max_suppression on the inputs per batch, across
+// all classes.
+// Prunes away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system. Also note that
+// this algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is the final boxes, scores and classes tensor
+// returned after performing non_max_suppression.
+//
+// Arguments:
+//	boxes: A 4-D float tensor of shape `[batch_size, num_boxes, q, 4]`. If `q` is 1 then
+// same boxes are used for all classes otherwise, if `q` is equal to number of
+// classes, class-specific boxes are used.
+//	scores: A 3-D float tensor of shape `[batch_size, num_boxes, num_classes]`
+// representing a single score corresponding to each box (each row of boxes).
+//	max_output_size_per_class: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression per class
+//	max_total_size: A scalar representing maximum number of boxes retained over all classes.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
+//
+// Returns A [batch_size, max_detections, 4] float32 tensor
+// containing the non-max suppressed boxes.A [batch_size, max_detections] float32 tensor
+// containing the scores for the boxes.A [batch_size, max_detections] float32 tensor
+// containing the classes for the boxes.A [batch_size] int32 tensor indicating the number of
+// valid detections per batch item. Only the top num_detections[i] entries in
+// nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the
+// entries are zero paddings.
+func CombinedNonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size_per_class tf.Output, max_total_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...CombinedNonMaxSuppressionAttr) (nmsed_boxes tf.Output, nmsed_scores tf.Output, nmsed_classes tf.Output, valid_detections tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNextAsOptional",
+		Type: "CombinedNonMaxSuppression",
 		Input: []tf.Input{
-			iterator,
+			boxes, scores, max_output_size_per_class, max_total_size, iou_threshold, score_threshold,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Fast Fourier transform.
+// Computes the matrix logarithm of one or more square matrices:
 //
-// Computes the 1-dimensional discrete Fourier transform over the inner-most
-// dimension of `input`.
+//
+// \\(log(exp(A)) = A\\)
+//
+// This op is only defined for complex matrices. If A is positive-definite and
+// real, then casting to a complex matrix, taking the logarithm and casting back
+// to a real matrix will give the correct result.
+//
+// This function computes the matrix logarithm using the Schur-Parlett algorithm.
+// Details of the algorithm can be found in Section 11.6.2 of:
+// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
+// ISBN 978-0-898716-46-7.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the exponential for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	input: A complex tensor.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns A complex tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its 1D Fourier transform.
+// Returns Shape is `[..., M, M]`.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.logm
 // @end_compatibility
-func FFT(scope *Scope, input tf.Output) (output tf.Output) {
+func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FFT",
+		Type: "MatrixLogarithm",
 		Input: []tf.Input{
 			input,
 		},
@@ -34099,294 +37377,324 @@ func FFT(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Identity transformation that models performance.
-//
-// Identity transformation that models performance.
+//   This op is used as a placeholder in If branch functions. It doesn't provide a
+//   valid output when run, so must either be removed (e.g. replaced with a
+//   function input) or guaranteed not to be used (e.g. if mirroring an
+//   intermediate output needed for the gradient computation of the other branch).
 //
 // Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
-//
+//	dtype: The type of the output.
+//	shape:     The purported shape of the output. This is only used for shape inference;
+//     the output will not necessarily have this shape. Can be a partial shape.
 //
-func ModelDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns     \"Fake\" output value. This should not be consumed by another op.
+func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "ModelDataset",
-		Input: []tf.Input{
-			input_dataset,
-		},
+		Type: "FakeParam",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x > y) element-wise.
+// Returns the next representable value of `x1` in the direction of `x2`, element-wise.
 //
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// This operation returns the same result as the C++ std::nextafter function.
+//
+// It can also return a subnormal number.
+//
+// @compatibility(cpp)
+// Equivalent to C++ std::nextafter function.
+// @end_compatibility
+func NextAfter(scope *Scope, x1 tf.Output, x2 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Greater",
+		Type: "NextAfter",
 		Input: []tf.Input{
-			x, y,
+			x1, x2,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Applies sparse addition to `input` using individual values or slices
-//
-// from `updates` according to indices `indices`.  The updates are non-aliasing:
-// `input` is only modified in-place if no other operations will use it.
-// Otherwise, a copy of `input` is made.  This operation has a gradient with
-// respect to both `input` and `updates`.
-//
-// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `input`.
-// It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`.
+// OrderedMapStageAttr is an optional argument to OrderedMapStage.
+type OrderedMapStageAttr func(optionalAttr)
+
+// OrderedMapStageCapacity sets the optional capacity attribute to value.
 //
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
-// (if `K < P`) along the `K`th dimension of `input`.
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
 //
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+// REQUIRES: value >= 0
+func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// $$[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].$$
+// REQUIRES: value >= 0
+func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapStageContainer sets the optional container attribute to value.
 //
-// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-// elements. In Python, that addition would look like this:
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func OrderedMapStageContainer(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapStageSharedName sets the optional shared_name attribute to value.
 //
-//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(output))
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage (key, values) in the underlying container which behaves like a ordered
 //
-// The resulting value `output` would look like this:
+// associative container.   Elements are ordered by key.
 //
-//     [1, 13, 3, 14, 14, 6, 7, 20]
+// Arguments:
+//	key: int64
 //
-// See `tf.scatter_nd` for more details about how to make updates to slices.
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
 //
-// Arguments:
-//	input: A Tensor.
-//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
-// A tensor of indices into `input`.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
-// to add to `input`.
 //
-// Returns A `Tensor` with the same shape as `input`, containing values of `input`
-// updated with `updates`.
-func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+// Returns the created operation.
+func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ScatterNdNonAliasingAdd",
+		Type: "OrderedMapStage",
 		Input: []tf.Input{
-			input, indices, updates,
+			key, indices, tf.OutputList(values),
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
-type FractionalMaxPoolAttr func(optionalAttr)
+// StackPushV2Attr is an optional argument to StackPushV2.
+type StackPushV2Attr func(optionalAttr)
 
-// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
+// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
 //
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
+// value: Swap `elem` to CPU. Default to false.
 // If not specified, defaults to false
-func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
+func StackPushV2SwapMemory(value bool) StackPushV2Attr {
 	return func(m optionalAttr) {
-		m["pseudo_random"] = value
+		m["swap_memory"] = value
 	}
 }
 
-// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
+// Push an element onto the stack.
 //
-// `value  20 5  16 3  7`
+// Arguments:
+//	handle: The handle to a stack.
+//	elem: The tensor to be pushed onto the stack.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
+// Returns The same tensor as the input 'elem'.
+func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StackPushV2",
+		Input: []tf.Input{
+			handle, elem,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
+// RpcAttr is an optional argument to Rpc.
+type RpcAttr func(optionalAttr)
+
+// RpcProtocol sets the optional protocol attribute to value.
 //
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalMaxPool node in the computation graph. Mainly used
-// in unit test to make FractionalMaxPool deterministic.
-// If not specified, defaults to false
-func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func RpcProtocol(value string) RpcAttr {
 	return func(m optionalAttr) {
-		m["deterministic"] = value
+		m["protocol"] = value
 	}
 }
 
-// FractionalMaxPoolSeed sets the optional seed attribute to value.
+// RpcFailFast sets the optional fail_fast attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func RpcFailFast(value bool) RpcAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["fail_fast"] = value
 	}
 }
 
-// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
+// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
 //
-// value: An second seed to avoid seed collision.
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
 // If not specified, defaults to 0
-func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
+func RpcTimeoutInMs(value int64) RpcAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["timeout_in_ms"] = value
 	}
 }
 
-// Performs fractional max pooling on the input.
+// Perform batches of RPC requests.
 //
-// Fractional max pooling is slightly different than regular max pooling.  In
-// regular max pooling, you downsize an input set by taking the maximum value of
-// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
-// a factor of N, where N is an integer.  Fractional max pooling, as you might
-// expect from the word "fractional", means that the overall reduction ratio N
-// does not have to be an integer.
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
 //
-// The sizes of the pooling regions are generated randomly but are fairly uniform.
-// For example, let's look at the height dimension, and the constraints on the
-// list of rows that will be pool boundaries.
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the RPC method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
 //
-// First we define the following:
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
 //
-// 1.  input_row_length : the number of rows from the input set
-// 2.  output_row_length : which will be smaller than the input
-// 3.  alpha = input_row_length / output_row_length : our reduction ratio
-// 4.  K = floor(alpha)
-// 5.  row_pooling_sequence : this is the result list of pool boundary rows
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
 //
-// Then, row_pooling_sequence should satisfy:
+// then call this op with arguments:
 //
-// 1.  a[0] = 0 : the first value of the sequence is 0
-// 2.  a[end] = input_row_length : the last value of the sequence is the size
-// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
-// 4.  length(row_pooling_sequence) = output_row_length+1
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
 //
-// For more details on fractional max pooling, see this paper:
-// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// If the connection fails or the remote worker returns an error
+// status, the op reraises this exception locally.
+//
+// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
 //
-// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
+func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPool",
+		Type: "Rpc",
 		Input: []tf.Input{
-			value,
+			address, method, request,
 		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Creates a MultiDeviceIterator resource.
-//
-// Arguments:
-//	devices: A list of devices the iterator works across.
-//	shared_name: If non-empty, this resource will be shared under the given name
-// across multiple sessions.
-//	container: If non-empty, this resource is placed in the given container.
-// Otherwise, a default container is used.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
-//
-// Returns Handle to the resource created.
-func MultiDeviceIterator(scope *Scope, devices []string, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
+func ExperimentalBytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"devices": devices, "shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "MultiDeviceIterator",
-
+		Type: "ExperimentalBytesProducedStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Performs a padding as a preprocess during a convolution.
-//
-// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
-// implementation where the spatial padding transformation stage is fused with the
-// im2col lookup, but in this case without the bilinear filtering required for
-// resizing. Fusing the padding prevents the need to write out the intermediate
-// results as whole tensors, reducing memory pressure, and we can get some latency
-// gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
-// order is used instead.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+//	selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines which of the
+// `N` data inputs should produce the next output element.
+//	data_input_datasets: `N` datasets with the same type that will be interleaved according to
+// the values of `selector_input_dataset`.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
+//
+func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset tf.Output, data_input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "FusedPadConv2D",
+		Type: "ExperimentalDirectedInterleaveDataset",
 		Input: []tf.Input{
-			input, paddings, filter,
+			selector_input_dataset, tf.OutputList(data_input_datasets),
 		},
 		Attrs: attrs,
 	}
@@ -34394,86 +37702,59 @@ func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf
 	return op.Output(0)
 }
 
-// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
-type Conv2DBackpropInputAttr func(optionalAttr)
-
-// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
+// RandomUniformIntAttr is an optional argument to RandomUniformInt.
+type RandomUniformIntAttr func(optionalAttr)
 
-// Conv2DBackpropInputExplicitPaddings sets the optional explicit_paddings attribute to value.
+// RandomUniformIntSeed sets the optional seed attribute to value.
 //
-// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
-// dimension, the amount of padding inserted before and after the dimension is
-// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
-// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
-// If not specified, defaults to <>
-func Conv2DBackpropInputExplicitPaddings(value []int64) Conv2DBackpropInputAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["explicit_paddings"] = value
+		m["seed"] = value
 	}
 }
 
-// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["seed2"] = value
 	}
 }
 
-// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
+// Outputs random integers from a uniform distribution.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of convolution with respect to the input.
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
+//
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
 //
 // Arguments:
-//	input_sizes: An integer vector representing the shape of `input`,
-// where `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+//	shape: The shape of the output tensor.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
-// w.r.t. the input of the convolution.
-func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropInput",
+		Type: "RandomUniformInt",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			shape, minval, maxval,
 		},
 		Attrs: attrs,
 	}
@@ -34481,920 +37762,732 @@ func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output,
 	return op.Output(0)
 }
 
-// Interleave the values from the `data` tensors into a single tensor.
-//
-// Builds a merged tensor such that
-//
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-// ```
-//
-// For example, if each `indices[m]` is scalar or vector, we have
-//
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
-//
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
-// ```
-//
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
-//
-//     merged.shape = [max(indices)] + constant
+// Add the quantile summaries to each quantile stream resource.
 //
-// Values are merged in order, so if an index appears in both `indices[m][i]` and
-// `indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
-// merged result. If you do not need this guarantee, ParallelDynamicStitch might
-// perform better on some devices.
+// An op that adds a list of quantile summaries to a quantile stream resource. Each
+// summary Tensor is rank 2, containing summaries (value, weight, min_rank, max_rank)
+// for a single feature.
 //
-// For example:
+// Arguments:
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	summaries: string; List of Rank 2 Tensor each containing the summaries for a single feature.
 //
-// ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
-// ```
+// Returns the created operation.
+func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_stream_resource_handle tf.Output, summaries []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesQuantileStreamResourceAddSummaries",
+		Input: []tf.Input{
+			quantile_stream_resource_handle, tf.OutputList(summaries),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Creates a Dataset that returns pseudorandom numbers.
 //
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
+// Arguments:
+//	seed: A scalar seed for the random number generator. If either seed or
+// seed2 is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
 //
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
-// ```
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+func ExperimentalRandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "DynamicStitch",
+		Type: "ExperimentalRandomDataset",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
+			seed, seed2,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x == y) element-wise.
+// Creates a dataset that overrides the maximum intra-op parallelism.
 //
-// *NOTE*: `Equal` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//
+//	max_intra_op_parallelism: Identifies the maximum intra-op parallelism to use.
+//
+//
+func ExperimentalMaxIntraOpParallelismDataset(scope *Scope, input_dataset tf.Output, max_intra_op_parallelism tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Equal",
+		Type: "ExperimentalMaxIntraOpParallelismDataset",
 		Input: []tf.Input{
-			x, y,
+			input_dataset, max_intra_op_parallelism,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
-type TensorArrayGatherV2Attr func(optionalAttr)
+// StringSplitV2Attr is an optional argument to StringSplitV2.
+type StringSplitV2Attr func(optionalAttr)
 
-// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
+// StringSplitV2Maxsplit sets the optional maxsplit attribute to value.
+//
+// value: An `int`. If `maxsplit > 0`, limit of the split of the result.
+// If not specified, defaults to -1
+func StringSplitV2Maxsplit(value int64) StringSplitV2Attr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["maxsplit"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayGatherV3
+// Split elements of `source` based on `sep` into a `SparseTensor`.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
-func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `source` based on `sep` and return a `SparseTensor`
+// containing the split tokens. Empty tokens are ignored.
+//
+// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
+// then the output will be
+// ```
+// st.indices = [0, 0;
+//               0, 1;
+//               1, 0;
+//               1, 1;
+//               1, 2]
+// st.shape = [2, 3]
+// st.values = ['hello', 'world', 'a', 'b', 'c']
+// ```
+//
+// If `sep` is given, consecutive delimiters are not grouped together and are
+// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
+// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
+// string, consecutive whitespace are regarded as a single separator, and the
+// result will contain no empty strings at the startor end if the string has
+// leading or trailing whitespace.
+//
+// Note that the above mentioned behavior matches python's str.split.
+//
+// Arguments:
+//	input: `1-D` string `Tensor`, the strings to split.
+//	sep: `0-D` string `Tensor`, the delimiter character.
+func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV2",
+		Type: "StringSplitV2",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			input, sep,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Interleave the values from the `data` tensors into a single tensor.
-//
-// Builds a merged tensor such that
-//
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-// ```
-//
-// For example, if each `indices[m]` is scalar or vector, we have
-//
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
-//
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
-// ```
-//
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
-//
-//     merged.shape = [max(indices)] + constant
-//
-// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
-// and `indices[n][j]`, the result may be invalid. This differs from the normal
-// DynamicStitch operator that defines the behavior in that case.
-//
-// For example:
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
 //
-// ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
-// ```
+// Arguments:
 //
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
+//	thread_pool: A resource produced by the ThreadPoolHandle op.
 //
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
-// ```
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+func ExperimentalThreadPoolDataset(scope *Scope, input_dataset tf.Output, thread_pool tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ParallelDynamicStitch",
+		Type: "ExperimentalThreadPoolDataset",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
+			input_dataset, thread_pool,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
-type PriorityQueueV2Attr func(optionalAttr)
-
-// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
-//
-// value: The type of each component in a value.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["component_types"] = value
-	}
-}
-
-// PriorityQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// PriorityQueueV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that produces elements sorted by the first component value.
-//
-// Note that the PriorityQueue requires the first component of any element
-// to be a scalar int64, in addition to the other elements declared by
-// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
-// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
-// entry in their input (resp. output) lists.
-//
-// Arguments:
-//	shapes: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-//
-// Returns The handle to the queue.
-func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
+// Computes softsign: `features / (abs(features) + 1)`.
+func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shapes": shapes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "PriorityQueueV2",
-
-		Attrs: attrs,
+		Type: "Softsign",
+		Input: []tf.Input{
+			features,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
-type QueueEnqueueV2Attr func(optionalAttr)
+// EncodeProtoAttr is an optional argument to EncodeProto.
+type EncodeProtoAttr func(optionalAttr)
 
-// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue is full, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
+// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
+// If not specified, defaults to "local://"
+func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["descriptor_source"] = value
 	}
 }
 
-// Enqueues a tuple of one or more tensors in the given queue.
+// The op serializes protobuf messages provided in the input tensors.
 //
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
+// The types of the tensors in `values` must match the schema for the
+// fields specified in `field_names`. All the tensors in `values` must
+// have a common shape prefix, *batch_shape*.
 //
-// N.B. If the queue is full, this operation will block until the given
-// element has been enqueued (or 'timeout_ms' elapses, if specified).
+// The `sizes` tensor specifies repeat counts for each field.  The repeat
+// count (last dimension) of a each tensor in `values` must be greater
+// than or equal to corresponding repeat count in `sizes`.
+//
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
+//
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// There are a few special cases in the value mapping:
+//
+// Submessage and group fields must be pre-serialized as TensorFlow strings.
+//
+// TensorFlow lacks support for unsigned int64s, so they must be
+// represented as `tf.int64` with the same twos-complement bit pattern
+// (the obvious way).
+//
+// Unsigned int32 values can be represented exactly with `tf.int64`, or
+// with sign wrapping if the input is of type `tf.int32`.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should be taken.
+//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+//	values: List of tensors containing values for the corresponding field.
+//	field_names: List of strings containing proto field names.
+//	message_type: Name of the proto message type to decode.
 //
-// Returns the created operation.
-func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
+// Returns Tensor of serialized protos with shape `batch_shape`.
+func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueV2",
+		Type: "EncodeProto",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			sizes, tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the Bessel i0e function of `x` element-wise.
-//
-// Exponentially scaled modified Bessel function of order 0 defined as
-// `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
-//
-// This function is faster and numerically stabler than `bessel_i0(x)`.
-func BesselI0e(scope *Scope, x tf.Output) (y tf.Output) {
+// Creates a dataset that splits a SparseTensor into elements row-wise.
+func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BesselI0e",
+		Type: "SparseTensorSliceDataset",
 		Input: []tf.Input{
-			x,
+			indices, values, dense_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
-type QueueDequeueManyV2Attr func(optionalAttr)
-
-// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Dequeues `n` tuples of one or more tensors from the given queue.
-//
-// If the queue is closed and there are fewer than `n` elements, then an
-// OutOfRange error is returned.
-//
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size `n` in the 0th dimension.
-//
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
-//
-// N.B. If the queue is empty, this operation will block until `n` elements
-// have been dequeued (or 'timeout_ms' elapses, if specified).
+// Returns x / y element-wise for real types.
 //
-// Arguments:
-//	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
+// If `x` and `y` are reals, this will return the floating-point division.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueManyV2",
+		Type: "RealDiv",
 		Input: []tf.Input{
-			handle, n,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
+func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueManyV2", err)
-		return
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ConcatenateDataset",
+		Input: []tf.Input{
+			input_dataset, another_dataset,
+		},
+		Attrs: attrs,
 	}
-	return components
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Forwards the value of an available tensor from `inputs` to `output`.
+// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
 //
-// `Merge` waits for at least one of the tensors in `inputs` to become available.
-// It is usually combined with `Switch` to implement branching.
+// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
+// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
+// input channel is processed independently of the others with its own structuring
+// function. The `output` tensor has shape
+// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
+// tensor depend on the `padding` algorithm. We currently only support the default
+// "NHWC" `data_format`.
 //
-// `Merge` forwards the first tensor to become available to `output`, and sets
-// `value_index` to its index in `inputs`.
+// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
+// (for consistency with `conv2d`, we use unmirrored filters):
+//
+//     output[b, y, x, c] =
+//        max_{dy, dx} input[b,
+//                           strides[1] * y + rates[1] * dy,
+//                           strides[2] * x + rates[2] * dx,
+//                           c] +
+//                     filter[dy, dx, c]
+//
+// Max-pooling is a special case when the filter has size equal to the pooling
+// kernel size and contains all zeros.
+//
+// Note on duality: The dilation of `input` by the `filter` is equal to the
+// negation of the erosion of `-input` by the reflected `filter`.
 //
 // Arguments:
-//	inputs: The input tensors, exactly one of which will become available.
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: The input stride for atrous morphological dilation. Must be:
+// `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
-func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
+// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
+func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Merge",
+		Type: "Dilation2D",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			input, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Writes the given dataset to the given file using the TFRecord format.
+// Converts the given variant tensor to an iterator and stores it in the given resource.
 //
 // Arguments:
-//	input_dataset: A variant tensor representing the dataset to write.
-//	filename: A scalar string tensor representing the filename to use.
-//	compression_type: A scalar string tensor containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	resource_handle: A handle to an iterator resource.
+//	serialized: A variant tensor storing the state of the iterator contained in the
+// resource.
 //
 // Returns the created operation.
-func ExperimentalDatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) {
+func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalDatasetToTFRecord",
+		Type: "DeserializeIterator",
 		Input: []tf.Input{
-			input_dataset, filename, compression_type,
+			resource_handle, serialized,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// QueueCloseV2Attr is an optional argument to QueueCloseV2.
-type QueueCloseV2Attr func(optionalAttr)
-
-// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
-//
-// value: If true, all pending enqueue requests that are
-// blocked on the given queue will be canceled.
-// If not specified, defaults to false
-func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
-	return func(m optionalAttr) {
-		m["cancel_pending_enqueues"] = value
-	}
-}
-
-// Closes the given queue.
+// Creates a dataset that shuffles and repeats elements from `input_dataset`
 //
-// This operation signals that no more elements will be enqueued in the
-// given queue. Subsequent Enqueue(Many) operations will fail.
-// Subsequent Dequeue(Many) operations will continue to succeed if
-// sufficient elements remain in the queue. Subsequent Dequeue(Many)
-// operations that would block will fail immediately.
+// pseudorandomly.
 //
 // Arguments:
-//	handle: The handle to a queue.
 //
-// Returns the created operation.
-func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
+//	count: A scalar representing the number of times the underlying dataset
+// should be repeated. The default is `-1`, which results in infinite repetition.
+//
+//
+func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "QueueCloseV2",
+		Type: "ShuffleAndRepeatDataset",
 		Input: []tf.Input{
-			handle,
+			input_dataset, buffer_size, seed, seed2, count,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes inverse hyperbolic tangent of x element-wise.
-func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
+// Creates a dataset that caches elements from `input_dataset`.
+//
+// A CacheDataset will iterate over the input_dataset, and store tensors. If the
+// cache already exists, the cache will be used. If the cache is inappropriate
+// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
+// will the returned when used.
+//
+// Arguments:
+//
+//	filename: A path on the filesystem where we should cache the dataset. Note: this
+// will be a directory.
+//
+//
+func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Atanh",
+		Type: "CacheDataset",
 		Input: []tf.Input{
-			x,
+			input_dataset, filename,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns true if queue is closed.
-//
-// This operation returns true if the queue is closed and false if the queue
-// is open.
+// Creates a dataset that emits the records from one or more binary files.
 //
 // Arguments:
-//	handle: The handle to a queue.
-func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	header_bytes: A scalar representing the number of bytes to skip at the
+// beginning of a file.
+//	record_bytes: A scalar representing the number of bytes in each record.
+//	footer_bytes: A scalar representing the number of bytes to skip at the end
+// of a file.
+//	buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
+func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueIsClosedV2",
+		Type: "FixedLengthRecordDataset",
 		Input: []tf.Input{
-			handle,
+			filenames, header_bytes, record_bytes, footer_bytes, buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the absolute value of a tensor.
+// Gradients for batch normalization.
 //
-// Given a tensor `x`, this operation returns a tensor containing the absolute
-// value of each element in `x`. For example, if x is an input element and y is
-// an output element, this operation computes \\(y = |x|\\).
-func Abs(scope *Scope, x tf.Output) (y tf.Output) {
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+//
+// This op is deprecated. See `tf.nn.batch_normalization`.
+//
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this Tensor will be multiplied
+// with the normalized Tensor.
+//	backprop: 4D backprop Tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+//
+// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
+func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "Abs",
+		Type: "BatchNormWithGlobalNormalizationGrad",
 		Input: []tf.Input{
-			x,
+			t, m, v, gamma, backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StackV2Attr is an optional argument to StackV2.
-type StackV2Attr func(optionalAttr)
-
-// StackV2StackName sets the optional stack_name attribute to value.
-//
-// value: Overrides the name used for the temporary stack resource. Default
-// value is the name of the 'Stack' op (which is guaranteed unique).
-// If not specified, defaults to ""
-func StackV2StackName(value string) StackV2Attr {
-	return func(m optionalAttr) {
-		m["stack_name"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// A stack that produces elements in first-in last-out order.
+// Creates a dataset that emits the records from one or more TFRecord files.
 //
 // Arguments:
-//	max_size: The maximum size of the stack if non-negative. If negative, the stack
-// size is unlimited.
-//	elem_type: The type of the elements on the stack.
-//
-// Returns The handle to the stack.
-func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
+//	filenames: A scalar or vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar representing the number of bytes to buffer. A value of
+// 0 means no buffering will be performed.
+func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StackV2",
+		Type: "TFRecordDataset",
 		Input: []tf.Input{
-			max_size,
+			filenames, compression_type, buffer_size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OrderedMapStageAttr is an optional argument to OrderedMapStage.
-type OrderedMapStageAttr func(optionalAttr)
-
-// OrderedMapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
+// ExperimentalStatsAggregatorHandleAttr is an optional argument to ExperimentalStatsAggregatorHandle.
+type ExperimentalStatsAggregatorHandleAttr func(optionalAttr)
 
-// OrderedMapStageContainer sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
+// ExperimentalStatsAggregatorHandleContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func OrderedMapStageContainer(value string) OrderedMapStageAttr {
+func ExperimentalStatsAggregatorHandleContainer(value string) ExperimentalStatsAggregatorHandleAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// OrderedMapStageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
+// ExperimentalStatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+func ExperimentalStatsAggregatorHandleSharedName(value string) ExperimentalStatsAggregatorHandleAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Stage (key, values) in the underlying container which behaves like a ordered
-//
-// associative container.   Elements are ordered by key.
-//
-// Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
-//
-// Returns the created operation.
-func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
+// Creates a statistics manager resource.
+func ExperimentalStatsAggregatorHandle(scope *Scope, optional ...ExperimentalStatsAggregatorHandleAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapStage",
-		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// RpcAttr is an optional argument to Rpc.
-type RpcAttr func(optionalAttr)
+		Type: "ExperimentalStatsAggregatorHandle",
 
-// RpcProtocol sets the optional protocol attribute to value.
-//
-// value: RPC protocol to use.  Empty string means use the default protocol.
-// Options include 'grpc'.
-// If not specified, defaults to ""
-func RpcProtocol(value string) RpcAttr {
-	return func(m optionalAttr) {
-		m["protocol"] = value
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RpcFailFast sets the optional fail_fast attribute to value.
+// A container for an iterator resource.
 //
-// value: `boolean`. If `true` (default), then failures to connect
-// (i.e., the server does not immediately respond) cause an RPC failure.
-// If not specified, defaults to true
-func RpcFailFast(value bool) RpcAttr {
-	return func(m optionalAttr) {
-		m["fail_fast"] = value
+// Returns A handle to the iterator that can be passed to a "MakeIterator" or
+// "IteratorGetNext" op. In contrast to Iterator, AnonymousIterator prevents
+// resource sharing by name, and does not keep a reference to the resource
+// container.
+func AnonymousIterator(scope *Scope, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "AnonymousIterator",
 
-// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
-//
-// value: `int`. If `0` (default), then the kernel will run the RPC
-// request and only time out if the RPC deadline passes or the session times out.
-// If this value is greater than `0`, then the op will raise an exception if
-// the RPC takes longer than `timeout_in_ms`.
-// If not specified, defaults to 0
-func RpcTimeoutInMs(value int64) RpcAttr {
-	return func(m optionalAttr) {
-		m["timeout_in_ms"] = value
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Perform batches of RPC requests.
-//
-// This op asynchronously performs either a single RPC request, or a batch
-// of requests.  RPC requests are defined by three main parameters:
-//
-//   - `address` (the host+port or BNS address of the request)
-//   - `method` (the RPC method name for the request)
-//   - `request` (the serialized proto string, or vector of strings,
-//      of the RPC request argument).
-//
-// For example, if you have an RPC service running on port localhost:2345,
-// and its interface is configured with the following proto declaration:
-//
-// ```
-// service MyService {
-//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
-//   }
-// };
-// ```
-//
-// then call this op with arguments:
-//
-// ```
-// address = "localhost:2345"
-// method = "MyService/MyMethod"
-// ```
-//
-// The `request` tensor is a string tensor representing serialized `MyRequestProto`
-// strings; and the output string tensor `response` will have the same shape
-// and contain (upon successful completion) corresponding serialized
-// `MyResponseProto` strings.
-//
-// For example, to send a single, empty, `MyRequestProto`, call
-// this op with `request = ""`.  To send 5 **parallel** empty requests,
-// call this op with `request = ["", "", "", "", ""]`.
-//
-// More generally, one can create a batch of `MyRequestProto` serialized protos
-// from regular batched tensors using the `encode_proto` op, and convert
-// the response `MyResponseProto` serialized protos to batched tensors
-// using the `decode_proto` op.
+// Adjust the contrast of one or more images.
 //
-// **NOTE** Working with serialized proto strings is faster than instantiating
-// actual proto objects in memory, so no performance degradation is expected
-// compared to writing custom kernels for this workflow.
+// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
+// interpreted as `[height, width, channels]`.  The other dimensions only
+// represent a collection of images, such as `[batch, height, width, channels].`
 //
-// If the connection fails or the remote worker returns an error
-// status, the op reraises this exception locally.
+// Contrast is adjusted independently for each channel of each image.
 //
-// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+// For each channel, the Op first computes the mean of the image pixels in the
+// channel and then adjusts each component of each pixel to
+// `(x - mean) * contrast_factor + mean`.
 //
 // Arguments:
-//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `method` and `request`.
-//	method: `0-D` or `1-D`.  The method address on the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `request`.
-//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `method`.
+//	images: Images to adjust.  At least 3-D.
+//	contrast_factor: A float multiplier for adjusting contrast.
 //
-// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
-func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// Returns The contrast-adjusted image or images.
+func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rpc",
+		Type: "AdjustContrastv2",
 		Input: []tf.Input{
-			address, method, request,
+			images, contrast_factor,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
-func ExperimentalBytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Gets the next output from the given iterator .
+func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalBytesProducedStatsDataset",
+		Type: "IteratorGetNext",
 		Input: []tf.Input{
-			input_dataset, tag,
+			iterator,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNext", err)
+		return
+	}
+	return components
 }
 
-// StackPushV2Attr is an optional argument to StackPushV2.
-type StackPushV2Attr func(optionalAttr)
-
-// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
+// Outputs the single element from the given dataset.
 //
-// value: Swap `elem` to CPU. Default to false.
-// If not specified, defaults to false
-func StackPushV2SwapMemory(value bool) StackPushV2Attr {
-	return func(m optionalAttr) {
-		m["swap_memory"] = value
+// Arguments:
+//	dataset: A handle to a dataset that contains a single element.
+//
+//
+//
+// Returns The components of the single element of `input`.
+func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "DatasetToSingleElement",
+		Input: []tf.Input{
+			dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("DatasetToSingleElement", err)
+		return
 	}
+	return components
 }
 
-// Push an element onto the stack.
+// Converts the given `resource_handle` representing an iterator to a string.
 //
 // Arguments:
-//	handle: The handle to a stack.
-//	elem: The tensor to be pushed onto the stack.
+//	resource_handle: A handle to an iterator resource.
 //
-// Returns The same tensor as the input 'elem'.
-func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+// Returns A string representation of the given handle.
+func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StackPushV2",
+		Type: "IteratorToStringHandle",
 		Input: []tf.Input{
-			handle, elem,
+			resource_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
-type FusedBatchNormGradV2Attr func(optionalAttr)
+// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
+type IteratorFromStringHandleAttr func(optionalAttr)
 
-// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
+// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
+// value: If specified, defines the type of each tuple component in an
+// element produced by the resulting iterator.
+// If not specified, defaults to <>
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["output_types"] = value
 	}
 }
 
-// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
+// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
+// value: If specified, defines the shape of each tuple component in an
+// element produced by the resulting iterator.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["output_shapes"] = value
 	}
 }
 
-// Gradient for batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Converts the given string representing a handle to an iterator to a resource.
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+//	string_handle: A string representation of the given handle.
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// Returns A handle to an iterator resource.
+func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -35403,338 +38496,274 @@ func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGradV2",
+		Type: "IteratorFromStringHandle",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			string_handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Creates a TensorArray for storing multiple gradients of values in the given handle.
-//
-// Similar to TensorArrayGradV3. However it creates an accumulator with an
-// expanded shape compared to the input TensorArray whose gradient is being
-// computed. This enables multiple gradients for the same TensorArray to be
-// calculated using the same accumulator.
+// Gather slices from `params` axis `axis` according to `indices`.
 //
-// Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	shape_to_prepend: An int32 vector representing a shape. Elements in the gradient accumulator will
-// have shape which is this shape_to_prepend value concatenated with shape of the
-// elements in the TensorArray corresponding to the input handle.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradWithShape(scope *Scope, handle tf.Output, flow_in tf.Output, shape_to_prepend tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"source": source}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayGradWithShape",
-		Input: []tf.Input{
-			handle, flow_in, shape_to_prepend,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
+// params.shape[axis + 1:]` where:
 //
-// Each comparison returns a boolean `true` (if `input_value > threshold`)
-// or and `false` otherwise.
+// ```python
+//     # Scalar indices (output is rank(params) - 1).
+//     output[a_0, ..., a_n, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
 //
-// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
-// algorithms that use hashing approximations of cosine and `L2` distances;
-// codes can be generated from an input via:
+//     # Vector indices (output is rank(params)).
+//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
 //
-// ```python
-// codebook_size = 50
-// codebook_bits = codebook_size * 32
-// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
-//                            dtype=x.dtype,
-//                            initializer=tf.orthogonal_initializer())
-// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
-// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
-// # now codes has shape x.shape[:-1] + [codebook_size]
+//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
+//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
+//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
 // ```
 //
-// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
-// by 8.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
 //
-// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
-// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, a 0 is stored in the
+// corresponding output value.
+//
+// See also `tf.batch_gather` and `tf.gather_nd`.
 //
 // Arguments:
-//	input: Values to compare against `threshold` and bitpack.
-//	threshold: Threshold to compare against.
+//	params: The tensor from which to gather values. Must be at least rank
+// `axis + 1`.
+//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
+//	axis: The axis in `params` to gather `indices` from. Defaults to the first
+// dimension. Supports negative indexes.
 //
-// Returns The bitpacked comparisons.
-func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
+func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "CompareAndBitpack",
+		Type: "GatherV2",
 		Input: []tf.Input{
-			input, threshold,
+			params, indices, axis,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Push an element onto the tensor_array.
+// Converts the given `resource_handle` representing an iterator to a variant tensor.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	index: The position to write to inside the TensorArray.
-//	value: The tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
+//	resource_handle: A handle to an iterator resource.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Returns A variant tensor storing the state of the iterator contained in the
+// resource.
+func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV3",
+		Type: "SerializeIterator",
 		Input: []tf.Input{
-			handle, index, value, flow_in,
+			resource_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Scatter the data from the input value into specific TensorArray elements.
-//
-// `indices` must be a vector, its length must match the first dim of `value`.
+// Deserializes a proto into the tree handle
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations at which to write the tensor elements.
-//	value: The concatenated tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
+//	tree_handle: Handle to the tree resource to be restored.
+//	tree_config: Serialied proto string of the boosted_trees.Tree proto.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Returns the created operation.
+func TensorForestTreeDeserialize(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV3",
+		Type: "TensorForestTreeDeserialize",
 		Input: []tf.Input{
-			handle, indices, value, flow_in,
+			tree_handle, tree_config,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// EmptyAttr is an optional argument to Empty.
-type EmptyAttr func(optionalAttr)
-
-// EmptyInit sets the optional init attribute to value.
-//
-// value: If True, initialize the returned tensor with the default value of dtype.  Otherwise, the implementation is free not to initializethe tensor's content.
-// If not specified, defaults to false
-func EmptyInit(value bool) EmptyAttr {
-	return func(m optionalAttr) {
-		m["init"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// Creates a tensor with the given shape.
-//
-// This operation creates a tensor of `shape` and `dtype`.
-//
-// Arguments:
-//	shape: 1-D. Represents the shape of the output tensor.
-//
-//
-// Returns A `Tensor` of type `T`.
-func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) {
+// Constructs an Optional variant from a tuple of tensors.
+func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Empty",
+		Type: "OptionalFromValue",
 		Input: []tf.Input{
-			shape,
+			tf.OutputList(components),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
-type TensorArrayConcatV3Attr func(optionalAttr)
+// DecodeProtoV2Attr is an optional argument to DecodeProtoV2.
+type DecodeProtoV2Attr func(optionalAttr)
 
-// TensorArrayConcatV3ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
+// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value.
 //
-// value: The expected shape of an element, if known,
-// excluding the first dimension. Used to validate the shapes of
-// TensorArray elements. If this shape is not fully specified, concatenating
-// zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr {
+// value: Either the special value `local://` or a path to a file containing
+// a serialized `FileDescriptorSet`.
+// If not specified, defaults to "local://"
+func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr {
 	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
+		m["descriptor_source"] = value
 	}
 }
 
-// Concat the elements from the TensorArray into value `value`.
-//
-// Takes `T` elements of shapes
-//
-//   ```
-//   (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
-//   ```
-//
-// and concatenates them into a Tensor of shape:
-//
-//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
-//
-// All elements must have the same shape (excepting the first dimension).
-//
-// Arguments:
-//	handle: The handle to a TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+// DecodeProtoV2MessageFormat sets the optional message_format attribute to value.
 //
-// Returns All of the elements in the TensorArray, concatenated along the first
-// axis.A vector of the row sizes of the original T elements in the
-// value output.  In the example above, this would be the values:
-// `(n1, n2, ..., n(T-1))`.
-func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV3Attr) (value tf.Output, lengths tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
+// value: Either `binary` or `text`.
+// If not specified, defaults to "binary"
+func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["message_format"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV3",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
-		Attrs: attrs,
+}
+
+// DecodeProtoV2Sanitize sets the optional sanitize attribute to value.
+//
+// value: Whether to sanitize the result or not.
+// If not specified, defaults to false
+func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["sanitize"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Split the data from the input value into TensorArray elements.
+// The op extracts fields from a serialized protocol buffers message into tensors.
 //
-// Assuming that `lengths` takes on values
+// The `decode_proto` op extracts fields from a serialized protocol buffers
+// message into tensors.  The fields in `field_names` are decoded and converted
+// to the corresponding `output_types` if possible.
 //
-//   ```(n0, n1, ..., n(T-1))```
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
 //
-// and that `value` has shape
+// Each output tensor is a dense tensor. This means that it is padded to
+// hold the largest number of repeated elements seen in the input
+// minibatch. (The shape is also padded by one to prevent zero-sized
+// dimensions). The actual repeat counts for each example in the
+// minibatch can be found in the `sizes` output. In many cases the output
+// of `decode_proto` is fed immediately into tf.squeeze if missing values
+// are not a concern. When using tf.squeeze, always pass the squeeze
+// dimension explicitly to avoid surprises.
 //
-//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
+// For the most part, the mapping between Proto field types and
+// TensorFlow dtypes is straightforward. However, there are a few
+// special cases:
 //
-// this splits values into a TensorArray with T tensors.
+// - A proto field that contains a submessage or group can only be converted
+// to `DT_STRING` (the serialized submessage). This is to reduce the
+// complexity of the API. The resulting string can be used as input
+// to another instance of the decode_proto op.
 //
-// TensorArray index t will be the subtensor of values with starting position
+// - TensorFlow lacks support for unsigned integers. The ops represent uint64
+// types as a `DT_INT64` with the same twos-complement bit pattern
+// (the obvious way). Unsigned int32 values can be represented exactly by
+// specifying type `DT_INT64`, or using twos-complement if the caller
+// specifies `DT_INT32` in the `output_types` attribute.
 //
-//   ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
 //
-// and having size
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
 //
-//   ```nt x d0 x d1 x ...```
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// Both binary and text proto serializations are supported, and can be
+// chosen using the `format` attribute.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	value: The concatenated tensor to write to the TensorArray.
-//	lengths: The vector of lengths, how to split the rows of value into the
-// TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
+//	bytes: Tensor of serialized protos with shape `batch_shape`.
+//	message_type: Name of the proto message type to decode.
+//	field_names: List of strings containing proto field names. An extension field can be decoded
+// by using its full name, e.g. EXT_PACKAGE.EXT_FIELD_NAME.
+//	output_types: List of TF types to use for the respective field in field_names.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+// Each entry is the number of values found for the corresponding field.
+// Optional fields may have 0 or 1 values.List of tensors containing values for the corresponding field.
+// `values[i]` has datatype `output_types[i]`
+// and shape `[batch_shape, max(sizes[...,i])]`.
+func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV3",
+		Type: "DecodeProtoV2",
 		Input: []tf.Input{
-			handle, value, lengths, flow_in,
+			bytes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes gradients for the scaled exponential linear (Selu) operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Selu operation.
-//	outputs: The outputs of the corresponding Selu operation.
-//
-// Returns The gradients: `gradients * (outputs + scale * alpha)`
-// if outputs < 0, `scale * gradients` otherwise.
-func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "SeluGrad",
-		Input: []tf.Input{
-			gradients, outputs,
-		},
+	var idx int
+	var err error
+	sizes = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("DecodeProtoV2", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return sizes, values
 }
 
-// Get the current size of the TensorArray.
-//
-// Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//
-// Returns The current size of the TensorArray.
-func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+// Creates an Optional variant with no value.
+func OptionalNone(scope *Scope) (optional tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV3",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
+		Type: "OptionalNone",
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayGradV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
-func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
+// Returns true if and only if the given Optional variant has a value.
+func OptionalHasValue(scope *Scope, optional tf.Output) (has_value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV2",
+		Type: "OptionalHasValue",
 		Input: []tf.Input{
-			handle, flow_in,
+			optional,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index af5503f2ad308fffb03d2ebd5964eec273896c72..54b10cd184f1862329c361d4bcb4b0d736522360 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -380,15 +380,13 @@ tf_cc_binary(
     linkopts = select({
         "//tensorflow:debug": [],  # Disable all custom linker options in debug mode
         "//tensorflow:darwin": [
-            "-Wl,-exported_symbols_list",  # This line must be directly followed by LINKER_EXPORTED_SYMBOLS
-            "$(location {})".format(LINKER_EXPORTED_SYMBOLS),
+            "-Wl,-exported_symbols_list,$(location {})".format(LINKER_EXPORTED_SYMBOLS),
         ],
         "//tensorflow:windows": [],
         "//conditions:default": [
             "-z defs",
             "-s",
-            "-Wl,--version-script",  #  This line must be directly followed by LINKER_VERSION_SCRIPT
-            "$(location {})".format(LINKER_VERSION_SCRIPT),
+            "-Wl,--version-script,$(location {})".format(LINKER_VERSION_SCRIPT),
         ],
     }),
     linkshared = 1,
diff --git a/tensorflow/java/build_defs.bzl b/tensorflow/java/build_defs.bzl
index e1916ca4d9d6aa179e1a69451a5e981783560026..f423cc4d8277509d45aa8344e322f71b7f1306a8 100644
--- a/tensorflow/java/build_defs.bzl
+++ b/tensorflow/java/build_defs.bzl
@@ -18,7 +18,7 @@ XLINT_OPTS = [
     "-Xlint:-processing",
     "-Xlint:-serial",
     "-Xlint:-try",
-    "-Xlint:-classfile", # see b/32750402, go/javac-warnings#classfile
+    "-Xlint:-classfile",  # see b/32750402, go/javac-warnings#classfile
 ]
 
 # The bazel errorprone plugin currently only enables default errorChecks
diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc
index 5d6387e88e96802e9226774abd391ac2dd673143..db6116bd5c843c2846d6b9f67e253e87db6daffc 100644
--- a/tensorflow/java/src/gen/cc/op_generator.cc
+++ b/tensorflow/java/src/gen/cc/op_generator.cc
@@ -516,7 +516,7 @@ bool CanGenerateOp(const OpDef& op_def, const ApiDef& api_def) {
     return false;
   }
   for (const auto& attr : op_def.attr()) {
-    if (attr.type() == "func") {
+    if (attr.type() == "func" || attr.type() == "list(func)") {
       return false;  // TODO(karllessard) add support for function attributes
     }
   }
diff --git a/tensorflow/java/src/gen/gen_ops.bzl b/tensorflow/java/src/gen/gen_ops.bzl
index f4ff34ea0361fba5528126b93f3f6e45289d8df2..b46721a93dcbd105dea7c52e8ea615cbd00af1c8 100644
--- a/tensorflow/java/src/gen/gen_ops.bzl
+++ b/tensorflow/java/src/gen/gen_ops.bzl
@@ -17,46 +17,48 @@ load(
 # and then archive those source files into
 #     ops/gen_sources.srcjar
 #
-def tf_java_op_gen_srcjar(name,
-                          gen_tool,
-                          base_package,
-                          api_def_srcs=[],
-                          out_dir="ops/",
-                          out_src_dir="src/main/java/",
-                          visibility=["//tensorflow/java:__pkg__"]):
+def tf_java_op_gen_srcjar(
+        name,
+        gen_tool,
+        base_package,
+        api_def_srcs = [],
+        out_dir = "ops/",
+        out_src_dir = "src/main/java/",
+        visibility = ["//tensorflow/java:__pkg__"]):
+    gen_cmds = ["rm -rf $(@D)"]  # Always start from fresh when generating source files
+    srcs = api_def_srcs[:]
 
-  gen_cmds = ["rm -rf $(@D)"]  # Always start from fresh when generating source files
-  srcs = api_def_srcs[:]
+    if not api_def_srcs:
+        api_def_args_str = ","
+    else:
+        api_def_args = []
+        for api_def_src in api_def_srcs:
+            # Add directory of the first ApiDef source to args.
+            # We are assuming all ApiDefs in a single api_def_src are in the
+            # same directory.
+            api_def_args.append(
+                "$$(dirname $$(echo $(locations " + api_def_src +
+                ") | cut -d\" \" -f1))",
+            )
+        api_def_args_str = ",".join(api_def_args)
 
-  if not api_def_srcs:
-    api_def_args_str = ","
-  else:
-    api_def_args = []
-    for api_def_src in api_def_srcs:
-      # Add directory of the first ApiDef source to args.
-      # We are assuming all ApiDefs in a single api_def_src are in the
-      # same directory.
-      api_def_args.append(
-          "$$(dirname $$(echo $(locations " + api_def_src +
-          ") | cut -d\" \" -f1))")
-    api_def_args_str = ",".join(api_def_args)
+    gen_cmds += ["$(location " + gen_tool + ")" +
+                 " --output_dir=$(@D)/" + out_src_dir +
+                 " --base_package=" + base_package +
+                 " --api_dirs=" + api_def_args_str]
 
-  gen_cmds += ["$(location " + gen_tool + ")" +
-               " --output_dir=$(@D)/" + out_src_dir +
-               " --base_package=" + base_package +
-               " --api_dirs=" + api_def_args_str]
+    # Generate a source archive containing generated code for these ops.
+    gen_srcjar = out_dir + name + ".srcjar"
+    gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) src"]
 
-  # Generate a source archive containing generated code for these ops.
-  gen_srcjar = out_dir + name + ".srcjar"
-  gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) src"]
-
-  native.genrule(
-      name=name,
-      srcs=srcs,
-      outs=[gen_srcjar],
-      tools=[
-          "@local_jdk//:jar",
-          "@local_jdk//:jdk",
-          gen_tool
-      ] + tf_binary_additional_srcs(),
-      cmd=" && ".join(gen_cmds))
+    native.genrule(
+        name = name,
+        srcs = srcs,
+        outs = [gen_srcjar],
+        tools = [
+            "@local_jdk//:jar",
+            "@local_jdk//:jdk",
+            gen_tool,
+        ] + tf_binary_additional_srcs(),
+        cmd = " && ".join(gen_cmds),
+    )
diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD
index 7d71ca32d501ef4cbbcdce73158233a1a1b8154d..46800a89bb773368549e739f59e41828626d1dfa 100644
--- a/tensorflow/lite/BUILD
+++ b/tensorflow/lite/BUILD
@@ -4,13 +4,15 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test", "if_not_windows")
-load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+load("//tensorflow:tensorflow.bzl", "if_not_windows", "tf_cc_test")
+load("//tensorflow/lite:build_def.bzl", "tflite_cc_shared_object", "tflite_copts")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 exports_files(glob([
     "testdata/*.bin",
     "testdata/*.pb",
+    "testdata/*.tflite",
+    "testdata/*.csv",
     "models/testdata/*",
 ]))
 
@@ -173,24 +175,17 @@ cc_library(
         "stderr_reporter.h",
     ],
     copts = tflite_copts() + TFLITE_DEFAULT_COPTS,
-    linkopts = [
-    ] + select({
-        "//tensorflow:android": [
-            "-llog",
-        ],
-        "//conditions:default": [
-        ],
-    }),
     deps = [
         ":arena_planner",
         ":graph_info",
         ":memory_planner",
+        ":minimal_logging",
         ":schema_fbs_version",
         ":simple_memory_arena",
         ":string",
         ":util",
         "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/core/api:api",
+        "//tensorflow/lite/core/api",
         "//tensorflow/lite/nnapi:nnapi_implementation",
         "//tensorflow/lite/profiling:profiler",
         "//tensorflow/lite/schema:schema_fbs",
@@ -368,4 +363,65 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "minimal_logging",
+    srcs = [
+        "minimal_logging.cc",
+    ] + select({
+        "//tensorflow:android": [
+            "minimal_logging_android.cc",
+        ],
+        "//tensorflow:ios": [
+            "minimal_logging_ios.cc",
+        ],
+        "//conditions:default": [
+            "minimal_logging_default.cc",
+        ],
+    }),
+    hdrs = ["minimal_logging.h"],
+    copts = TFLITE_DEFAULT_COPTS + tflite_copts(),
+    linkopts = select({
+        "//tensorflow:android": ["-llog"],
+        "//conditions:default": [],
+    }),
+    visibility = ["//visibility:private"],
+)
+
+cc_test(
+    name = "minimal_logging_test",
+    size = "small",
+    srcs = ["minimal_logging_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
+    deps = [
+        ":minimal_logging",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Shared lib target for convenience, pulls in the core runtime and builtin ops.
+# Note: This target is not yet finalized, and the exact set of exported (C/C++)
+# APIs is subject to change.
+tflite_cc_shared_object(
+    name = "libtensorflowlite.so",
+    linkopts = select({
+        "//tensorflow:darwin": [
+            "-Wl,-exported_symbols_list,$(location //tensorflow/lite:tflite_exported_symbols.lds)",
+            "-Wl,-install_name,@rpath/libtensorflowlite.so",
+        ],
+        "//tensorflow:windows": [],
+        "//conditions:default": [
+            "-z defs",
+            "-Wl,--version-script,$(location //tensorflow/lite:tflite_version_script.lds)",
+        ],
+    }),
+    deps = [
+        ":framework",
+        ":tflite_exported_symbols.lds",
+        ":tflite_version_script.lds",
+        "//tensorflow/lite/kernels:builtin_ops",
+    ],
+)
+
 tflite_portable_test_suite()
diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl
index 181590e5a946dd5aa536530ef66739ea54fd50d6..4fc3d2e2380375e7714e7114bbf882c7e6f1e75d 100644
--- a/tensorflow/lite/build_def.bzl
+++ b/tensorflow/lite/build_def.bzl
@@ -226,6 +226,7 @@ def generated_test_models():
     return [
         "abs",
         "add",
+        "add_n",
         "arg_min_max",
         "avg_pool",
         "batch_to_space_nd",
@@ -237,8 +238,10 @@ def generated_test_models():
         "conv2d_transpose",
         "conv_with_shared_weights",
         "conv_to_depthwiseconv_with_shared_weights",
+        "cos",
         "depthwiseconv",
         "div",
+        "elu",
         "equal",
         "exp",
         "expand_dims",
@@ -249,6 +252,7 @@ def generated_test_models():
         "fully_connected",
         "fused_batch_norm",
         "gather",
+        "gather_nd",
         "gather_with_constant",
         "global_batch_norm",
         "greater",
@@ -283,6 +287,7 @@ def generated_test_models():
         "prelu",
         "pow",
         "range",
+        "rank",
         "reduce_any",
         "reduce_max",
         "reduce_min",
@@ -292,6 +297,7 @@ def generated_test_models():
         "relu6",
         "reshape",
         "resize_bilinear",
+        "resolve_constant_strided_slice",
         "reverse_v2",
         "rsqrt",
         "shape",
@@ -310,7 +316,6 @@ def generated_test_models():
         "squeeze",
         "strided_slice",
         "strided_slice_1d_exhaustive",
-        "strided_slice_buggy",
         "sub",
         "tile",
         "topk",
diff --git a/tensorflow/lite/builtin_ops.h b/tensorflow/lite/builtin_ops.h
index fc871d359243109df8f9e1c69972a425b474534d..e7fb59282ce470b27ebd5f58550d537753fda374 100644
--- a/tensorflow/lite/builtin_ops.h
+++ b/tensorflow/lite/builtin_ops.h
@@ -132,6 +132,11 @@ typedef enum {
   kTfLiteBuiltinCeil = 104,
   kTfLiteBuiltinReverseV2 = 105,
   kTfLiteBuiltinAddN = 106,
+  kTfLiteBuiltinGatherNd = 107,
+  kTfLiteBuiltinCos = 108,
+  kTfLiteBuiltinWhere = 109,
+  kTfLiteBuiltinRank = 110,
+  kTfLiteBuiltinElu = 111,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/lite/c/BUILD b/tensorflow/lite/c/BUILD
index 680addfa2faa7afd3c7032e147f3be2c600c7668..661b648550c9a3fc64b8bc2fb97a2f3b16e7aac1 100644
--- a/tensorflow/lite/c/BUILD
+++ b/tensorflow/lite/c/BUILD
@@ -12,6 +12,7 @@ cc_library(
         "c_api_internal.h",
     ],
     visibility = [
+        "//learning/brain/mobile/kernel_test:__subpackages__",
         "//tensorflow/lite:__subpackages__",
     ],
 )
diff --git a/tensorflow/lite/c/builtin_op_data.h b/tensorflow/lite/c/builtin_op_data.h
index 332c2db14511af18a8e3d99fc93891ce92d1792a..40fea17fefdbc06bb01d8ae950cb2233f5c16cc3 100644
--- a/tensorflow/lite/c/builtin_op_data.h
+++ b/tensorflow/lite/c/builtin_op_data.h
@@ -333,6 +333,9 @@ typedef struct {
   TfLiteType out_type;
 } TfLiteShapeParams;
 
+typedef struct {
+} TfLiteRankParams;
+
 typedef struct {
   // Parameters supported by version 1:
   float min;
diff --git a/tensorflow/lite/c/builtin_op_data_test.cc b/tensorflow/lite/c/builtin_op_data_test.cc
index 4ce7c481e1c26e6fcfdaa680e9ca666b82968d53..4967183dd56df64b75c719869d16d052ae976081 100644
--- a/tensorflow/lite/c/builtin_op_data_test.cc
+++ b/tensorflow/lite/c/builtin_op_data_test.cc
@@ -71,6 +71,7 @@ TEST(IntArray, CanCompileStructs) {
   TfLiteTransposeConvParams transpose_conv_params;
   TfLiteSparseToDenseParams sparse_to_dense_params;
   TfLiteShapeParams shape_params;
+  TfLiteRankParams rank_params;
   TfLiteFakeQuantParams fake_quant_params;
   TfLitePackParams pack_params;
   TfLiteOneHotParams one_hot_params;
diff --git a/tensorflow/lite/context_util.h b/tensorflow/lite/context_util.h
index 68b91ea0b93e602c20d1db3284a523e9f55dfd5b..2f846cc259e34b1f750ba0787dffa93db597cbe0 100644
--- a/tensorflow/lite/context_util.h
+++ b/tensorflow/lite/context_util.h
@@ -38,6 +38,7 @@ class TfLiteIntArrayView {
   const_iterator begin() const { return int_array_->data; }
   const_iterator end() const { return &int_array_->data[int_array_->size]; }
   size_t size() const { return end() - begin(); }
+  int operator[](size_t pos) const { return int_array_->data[pos]; }
 
  private:
   const TfLiteIntArray* int_array_;
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc
index 175caa1afa6dd4ac365a2a3800a37ee6083666e4..7ca0fc2deb4d4fb713e1eb98c572dfb3542c26fa 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc
@@ -680,8 +680,10 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     // ok for now, since there is no call implementation either.
     case BuiltinOperator_CALL:
     case BuiltinOperator_CONCAT_EMBEDDINGS:
+    case BuiltinOperator_COS:
     case BuiltinOperator_CUSTOM:
     case BuiltinOperator_DEQUANTIZE:
+    case BuiltinOperator_ELU:
     case BuiltinOperator_EMBEDDING_LOOKUP:
     case BuiltinOperator_EQUAL:
     case BuiltinOperator_EXP:
@@ -728,6 +730,9 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_SQUARED_DIFFERENCE:
     case BuiltinOperator_REVERSE_V2:
     case BuiltinOperator_ADD_N:
+    case BuiltinOperator_GATHER_ND:
+    case BuiltinOperator_WHERE:
+    case BuiltinOperator_RANK:
       break;
   }
   return kTfLiteOk;
diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc
index 2fdafa356c8d13304e19ecb0b0126147ad88d78a..ec6762b16c95e86fb65ec187d5e92f91eff1cbc5 100644
--- a/tensorflow/lite/core/subgraph.cc
+++ b/tensorflow/lite/core/subgraph.cc
@@ -592,7 +592,12 @@ TfLiteStatus Subgraph::ResizeInputTensor(int tensor_index,
 
   // Short-circuit the state change if the dimensions don't change, avoiding
   // unnecessary (re)allocations.
-  if (EqualArrayAndTfLiteIntArray(tensor->dims, dims.size(), dims.data())) {
+  //
+  // Note that it's required to check `tensor->data.raw != nullptr`. Otherwise
+  // the subgraph won't allocate memory for a dynamic tensor when its size
+  // is equal to the original tensor size.
+  if (tensor->data.raw != nullptr &&
+      EqualArrayAndTfLiteIntArray(tensor->dims, dims.size(), dims.data())) {
     return kTfLiteOk;
   }
 
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index 8345eb7497dd45b84c0f28530e9fe50bdeff1888..86fe7c5140ec7261ca2bcb136000231e2f2437f3 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -51,8 +51,47 @@ namespace {
   } while (0)
 
 namespace {
+
+bool IsFloat(TfLiteType type) {
+  switch (type) {
+    case kTfLiteFloat32:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool IsQuantized(TfLiteType type) {
+  switch (type) {
+    case kTfLiteUInt8:
+    case kTfLiteInt8:
+    case kTfLiteInt16:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool IsHybridOperator(const TfLiteContext* context, int builtin_code,
+                      const TfLiteNode* node) {
+  switch (builtin_code) {
+    case kTfLiteBuiltinConv2d:
+    case kTfLiteBuiltinFullyConnected: {
+      const int input_id = node->inputs->data[0];
+      const int filter_id = node->inputs->data[1];
+      const TfLiteType input_type = context->tensors[input_id].type;
+      const TfLiteType filter_type = context->tensors[filter_id].type;
+      return IsFloat(input_type) && IsQuantized(filter_type);
+    }
+    default:
+      return false;
+  }
+}
+
 constexpr int32_t kMinSdkVersionForNNAPI = 27;
 constexpr int32_t kMinSdkVersionForNNAPI11 = 28;
+constexpr int32_t kMinSdkVersionForNNAPI12 = 29;
+
 }  // namespace
 
 // RAII NN API Model Destructor for use with std::unique_ptr
@@ -147,16 +186,42 @@ class OperandMapping {
   std::vector<int> lite_tensor_to_ann_tensor_;
 };
 
+class DequantizeMapping {
+ public:
+  int DequantizedAnnIndex(int ann_index, TfLiteType type) const {
+    for (const auto& element : mapping_) {
+      if (ann_index == std::get<0>(element) && type == std::get<1>(element)) {
+        return std::get<2>(element);
+      }
+    }
+    return -1;
+  }
+
+  void Add(int ann_index, TfLiteType type, int dequantized_ann_index) {
+    // This assumes it is not already mapped.
+    mapping_.emplace_back(ann_index, type, dequantized_ann_index);
+  }
+
+ private:
+  // Each tuple specifies the ANN (quantized) tensor index, the desired
+  // floating-point type and the matching ANN (dequantized) tensor index. This
+  // could use a map but instead std::vector is used to keep code size lower.
+  std::vector<std::tuple<int, TfLiteType, int>> mapping_;
+};
+
 // Abstract builder for building an op in the NN API graph. This handles
 // the disparity between TFLite and NN API operand types. NN API has singular
 // operands for both tensors and parameters, and TFLite separates the two.
 class NNAPIOpBuilder {
  public:
   NNAPIOpBuilder(const NnApi* nnapi, TfLiteContext* context,
-                 OperandMapping* tensor_mapping, ANeuralNetworksModel* nn_model)
+                 OperandMapping* tensor_mapping,
+                 DequantizeMapping* dequantize_mapping,
+                 ANeuralNetworksModel* nn_model)
       : nnapi_(nnapi),
         context_(context),
         operand_mapping_(tensor_mapping),
+        dequantize_mapping_(dequantize_mapping),
         nn_model_(nn_model) {}
 
   TfLiteStatus AddScalarInt32Operand(int32_t value) {
@@ -190,50 +255,129 @@ class NNAPIOpBuilder {
     return kTfLiteOk;
   }
 
-  TfLiteStatus AddTensorInput(int tensor_index) {
-    int ann_index;
-    TF_LITE_ENSURE_STATUS(AddTensor(tensor_index, &ann_index));
-    augmented_inputs_.push_back(ann_index);
-    return kTfLiteOk;
+  TfLiteStatus AddTensorInput(int tensor_index, bool hybrid_op) {
+    return AddTensor(tensor_index, hybrid_op, &augmented_inputs_);
   }
 
   TfLiteStatus AddTensorOutput(int tensor_index) {
-    int ann_index;
-    TF_LITE_ENSURE_STATUS(AddTensor(tensor_index, &ann_index));
-    augmented_outputs_.push_back(ann_index);
-    return kTfLiteOk;
+    return AddTensor(tensor_index, /*hybrid_op=*/false, &augmented_outputs_);
   }
 
   TfLiteStatus AddAdditionalFloat32OutputTensor(uint32_t dimension_count) {
     std::vector<uint32_t> dims(dimension_count, 0);
-    ANeuralNetworksOperandType operand_type{
-        .type = ANEURALNETWORKS_TENSOR_FLOAT32,
-        .dimensionCount = dimension_count,
-        .dimensions = dims.data()};
+    return AddFloat32OutputTensor(dimension_count, dims.data(), nullptr);
+  }
+
+  TfLiteStatus AddStateFloat32Tensor(int tensor_index,
+                                     int* ann_tensor_index_out) {
+    TfLiteTensor* tensor = &context_->tensors[tensor_index];
+    return AddFloat32OutputTensor(
+        tensor->dims->size, reinterpret_cast<uint32_t*>(tensor->dims->data),
+        ann_tensor_index_out);
+  }
+
+  // Adds a Dequantize operator and replaces the input tensor index with the
+  // dequantized version. If the dequantized version of the operator already
+  // exists then it is not added again.
+  TfLiteStatus AddDequantize(int nn_input_index, int lite_index,
+                             TfLiteType dequantized_type) {
+    const int ann_index = operand_mapping_->lite_index_to_ann(lite_index);
+    int dequantized_ann_index =
+        dequantize_mapping_->DequantizedAnnIndex(ann_index, dequantized_type);
+
+    if (dequantized_ann_index == -1) {
+      // The dequantized version does not exist yet, it has to be added: a new
+      // Dequantize operation is added, yielding a new tensor.
+      const TfLiteTensor& tensor = context_->tensors[lite_index];
+      ANeuralNetworksOperandType operand_type{
+          dequantized_type, static_cast<uint32_t>(tensor.dims->size),
+          reinterpret_cast<uint32_t*>(tensor.dims->data), 0.f, 0};
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context_,
+          nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+      dequantized_ann_index = operand_mapping_->add_new_non_tensor_operand();
+
+      // Add Dequantize operation.
+      const uint32_t dequantize_input[1] = {static_cast<uint32_t>(ann_index)};
+      const uint32_t dequantize_output[1] = {
+          static_cast<uint32_t>(dequantized_ann_index)};
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context_, nnapi_->ANeuralNetworksModel_addOperation(
+                        nn_model_, ANEURALNETWORKS_DEQUANTIZE, 1,
+                        dequantize_input, 1, dequantize_output));
+      dequantize_mapping_->Add(ann_index, dequantized_type,
+                               dequantized_ann_index);
+    }
+
+    // The input for the original operation is modified so that the operation
+    // now uses the dequantized tensor as input.
+    augmented_inputs_[nn_input_index] = dequantized_ann_index;
+
+    return kTfLiteOk;
+  }
+
+  // Finish emitting the op (of type `type`) into the NN API.
+  TfLiteStatus FinalizeAddOperation(ANeuralNetworksOperationType type) {
+    // Actually add a NN API operation
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperation(
+            nn_model_, type, static_cast<uint32_t>(augmented_inputs_.size()),
+            augmented_inputs_.data(),
+            static_cast<uint32_t>(augmented_outputs_.size()),
+            augmented_outputs_.data()));
+    augmented_inputs_.clear();
+    augmented_outputs_.clear();
+    return kTfLiteOk;
+  }
+
+ private:
+  template <typename T>
+  TfLiteStatus AddScalarOperand(T value, int32_t nn_type) {
+    ANeuralNetworksOperandType operand_type{.type = nn_type};
     RETURN_TFLITE_ERROR_IF_NN_ERROR(
         context_,
         nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    augmented_outputs_.push_back(ann_operand);
+    const int ann_index = operand_mapping_->add_new_non_tensor_operand();
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, nnapi_->ANeuralNetworksModel_setOperandValue(
+                      nn_model_, ann_index, &value, sizeof(T)));
+    augmented_inputs_.push_back(ann_index);
     return kTfLiteOk;
   }
 
-  TfLiteStatus AddStateFloat32Tensor(int tensor_index,
-                                     int* ann_tensor_index_out) {
-    TfLiteTensor* tensor = &context_->tensors[tensor_index];
-    int ann_index = operand_mapping_->add_new_non_tensor_operand();
+  template <typename T>
+  TfLiteStatus AddVectorOperand(const T* values, uint32_t num_values,
+                                int32_t nn_type) {
+    ANeuralNetworksOperandType operand_type{
+        .type = nn_type, .dimensionCount = 1, .dimensions = &num_values};
 
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+
+    const int ann_index = operand_mapping_->add_new_non_tensor_operand();
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, nnapi_->ANeuralNetworksModel_setOperandValue(
+                      nn_model_, ann_index, values, sizeof(T) * num_values));
+    augmented_inputs_.push_back(ann_index);
+    return kTfLiteOk;
+  }
+
+  TfLiteStatus AddFloat32OutputTensor(uint32_t dimension_count,
+                                      const uint32_t* dimension_data,
+                                      int* ann_index_out) {
     ANeuralNetworksOperandType operand_type{
-        ANEURALNETWORKS_TENSOR_FLOAT32,
-        static_cast<uint32_t>(tensor->dims->size),
-        reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
-        tensor->params.zero_point};
+        .type = ANEURALNETWORKS_TENSOR_FLOAT32,
+        .dimensionCount = dimension_count,
+        .dimensions = dimension_data,
+    };
     RETURN_TFLITE_ERROR_IF_NN_ERROR(
         context_,
         nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    const int ann_index = operand_mapping_->add_new_non_tensor_operand();
     augmented_outputs_.push_back(ann_index);
-
-    *ann_tensor_index_out = ann_index;
+    if (ann_index_out) *ann_index_out = ann_index;
     return kTfLiteOk;
   }
 
@@ -241,10 +385,11 @@ class NNAPIOpBuilder {
   // This returns the NN API tensor index corresponding to the created tensor.
   // If another caller previously created a NN API tensor for `tensor_index`
   // then the existing one is returned.
-  TfLiteStatus AddTensor(int tensor_index, int* ann_tensor_index_out) {
+  TfLiteStatus AddTensor(int tensor_index, bool hybrid_op,
+                         std::vector<uint32_t>* indices) {
     int ann_tensor_index = operand_mapping_->lite_index_to_ann(tensor_index);
     if (ann_tensor_index != -1) {
-      *ann_tensor_index_out = ann_tensor_index;
+      indices->push_back(ann_tensor_index);
       return kTfLiteOk;
     }
     // Allocate a new tensor index
@@ -255,11 +400,17 @@ class NNAPIOpBuilder {
     float scale = 0.0f;
     int32_t zeroPoint = 0;
     TfLiteTensor* tensor = &context_->tensors[tensor_index];
-    switch (tensor->type) {
+    TfLiteType tensor_type = tensor->type;
+    if (hybrid_op && (tensor_type == kTfLiteUInt8)) {
+      // For legacy reason, UINT8 weights in hybrid operators are actually INT8
+      // values and should be interpreted as such.
+      tensor_type = kTfLiteInt8;
+    }
+    switch (tensor_type) {
       case kTfLiteNoType:
         // Tensors added during initialization of Ops don't have a type yet and
         // should not be registered with the NNAPI.
-        *ann_tensor_index_out = -1;
+        indices->push_back(-1);
         return kTfLiteOk;
       case kTfLiteFloat32:
         nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
@@ -273,6 +424,10 @@ class NNAPIOpBuilder {
           scale = 1;
         }
         break;
+      case kTfLiteInt8:
+        nn_type = ANEURALNETWORKS_TENSOR_QUANT8_SYMM;
+        scale = tensor->params.scale;
+        break;
       case kTfLiteInt32:
         nn_type = ANEURALNETWORKS_TENSOR_INT32;
         scale = tensor->params.scale;
@@ -298,53 +453,7 @@ class NNAPIOpBuilder {
               nn_model_, ann_tensor_index, tensor->data.raw, tensor->bytes));
     }
 
-    *ann_tensor_index_out = ann_tensor_index;
-    return kTfLiteOk;
-  }
-
-  // Finish emitting the op (of type `type`) into the NN API.
-  TfLiteStatus FinalizeAddOperation(ANeuralNetworksOperationType type) {
-    // Actually add a NN API operation
-    RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context_,
-        nnapi_->ANeuralNetworksModel_addOperation(
-            nn_model_, type, static_cast<uint32_t>(augmented_inputs_.size()),
-            augmented_inputs_.data(),
-            static_cast<uint32_t>(augmented_outputs_.size()),
-            augmented_outputs_.data()));
-    augmented_inputs_.clear();
-    augmented_outputs_.clear();
-    return kTfLiteOk;
-  }
-
- private:
-  template <typename T>
-  TfLiteStatus AddScalarOperand(T value, int32_t nn_type) {
-    ANeuralNetworksOperandType operand_type{.type = nn_type};
-    RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context_,
-        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context_, nnapi_->ANeuralNetworksModel_setOperandValue(
-                      nn_model_, ann_operand, &value, sizeof(T)));
-    augmented_inputs_.push_back(ann_operand);
-    return kTfLiteOk;
-  }
-
-  template <typename T>
-  TfLiteStatus AddVectorOperand(const T* values, uint32_t num_values,
-                                int32_t nn_type) {
-    ANeuralNetworksOperandType operand_type{
-        .type = nn_type, .dimensionCount = 1, .dimensions = &num_values};
-    RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context_,
-        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context_, nnapi_->ANeuralNetworksModel_setOperandValue(
-                      nn_model_, ann_operand, values, sizeof(T) * num_values));
-    augmented_inputs_.push_back(ann_operand);
+    indices->push_back(ann_tensor_index);
     return kTfLiteOk;
   }
 
@@ -355,7 +464,13 @@ class NNAPIOpBuilder {
   TfLiteContext* const context_;
 
   // Tracks relationship between indices.
-  OperandMapping* operand_mapping_;
+  OperandMapping* const operand_mapping_;
+
+  // Keeps mapping of ANN quantized tensor and float data type to equivalent
+  // dequantized ANN tensor. For example, tensor #4 (UINT8) + FLOAT32 could map
+  // to tensor #10 (FLOAT32) because a DEQUANTIZE operator was added to convert
+  // tensor #4 to a FLOAT32 tensor.
+  DequantizeMapping* const dequantize_mapping_;
 
   // The NNAPI model.
   ANeuralNetworksModel* const nn_model_;
@@ -394,8 +509,9 @@ class NNAPIDelegateKernel {
   // Return a function that knows how to translate a node into its operands
   // when called. You can use this function to see if a node is supported
   // (i.e. that MappingFn is not nullptr).
-  static MappingFn Map(TfLiteContext* context, int builtin_code, int version,
-                       int android_sdk_version, TfLiteNode* node) {
+  static MappingFn Map(const TfLiteContext* context, int builtin_code,
+                       int version, int android_sdk_version,
+                       const TfLiteNode* node) {
     switch (builtin_code) {
       case kTfLiteBuiltinAdd:
         if (version == 1) {
@@ -451,6 +567,11 @@ class NNAPIDelegateKernel {
         break;
       case kTfLiteBuiltinConv2d:
         if (version == 1) {
+          if (android_sdk_version < kMinSdkVersionForNNAPI12 &&
+              IsHybridOperator(context, builtin_code, node)) {
+            // Hybrid operators not supported before NNAPI 1.2.
+            return nullptr;
+          }
           auto builtin =
               reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
           if (builtin->dilation_width_factor != 1 ||
@@ -488,6 +609,11 @@ class NNAPIDelegateKernel {
         break;
       case kTfLiteBuiltinFullyConnected:
         if (version == 1) {
+          if (android_sdk_version < kMinSdkVersionForNNAPI12 &&
+              IsHybridOperator(context, builtin_code, node)) {
+            // Hybrid operators not supported before NNAPI 1.2.
+            return nullptr;
+          }
           return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(
@@ -957,18 +1083,69 @@ class NNAPIDelegateKernel {
   std::unique_ptr<NNMemory> nn_input_memory_;
   std::unique_ptr<NNMemory> nn_output_memory_;
 
+  void AddDequantizeOperatorsWhereNeeded(const TfLiteContext* context,
+                                         int builtin_code,
+                                         const TfLiteNode* node,
+                                         NNAPIOpBuilder* builder) {
+    // Depending on the operator and the input data format, Dequantize
+    // operators may need to be added. For example when the input is
+    // floating-point but weights are quantized then the weights will first be
+    // dequantized to the same format as the input before being passed to the
+    // operator.
+
+    // The tensor determining whether the inputs should be floating-point.
+    int input_tensor_index = -1;
+    std::vector<int> inputs_to_potentially_dequantize;
+
+    switch (builtin_code) {
+      case kTfLiteBuiltinConv2d:
+      case kTfLiteBuiltinFullyConnected: {
+        input_tensor_index = 0;
+        // Weights and bias are inputs #1 and #2 respectively and may require
+        // dequantization.
+        inputs_to_potentially_dequantize = {1, 2};
+        break;
+      }
+      default:
+        return;
+    }
+
+    int tensor_id = node->inputs->data[input_tensor_index];
+    if (tensor_id < 0) return;
+
+    // Nothing to do if the input is not floating-point.
+    if (!IsFloat(context->tensors[tensor_id].type)) return;
+
+    for (int i : inputs_to_potentially_dequantize) {
+      tensor_id = node->inputs->data[i];
+      if (tensor_id < 0) continue;  // Ignore optional input.
+
+      const TfLiteType type = context->tensors[tensor_id].type;
+      // Nothing to do for this tensor if it's not quantized.
+      if (type != kTfLiteUInt8) continue;
+
+      // Insert Dequantize operator if it hasn't been done already and change
+      // the node's input accordingly.
+      builder->AddDequantize(i, node->inputs->data[i], type);
+    }
+  }
+
   TfLiteStatus AddOpsAndTensors(TfLiteContext* context) {
-    // The operand builder allows creating a single op. We create it at this
-    // reduced power position rather than in the for loop to avoid reallocating
-    // the vectors.
-    NNAPIOpBuilder builder(nnapi_, context, &operand_mapping_, nn_model_.get());
-    // Add Tensors
-    // allocate outside to avoid realloc
+    DequantizeMapping dequantize_mapping;
+    // The operand builder allows creating a single op. It is created outside
+    // the for loop to avoid reallocating the vectors.
+    NNAPIOpBuilder builder(nnapi_, context, &operand_mapping_,
+                           &dequantize_mapping, nn_model_.get());
+    // Add Tensors.
     for (auto node_index : nodes_) {
       // Obtain the op and registration.
       TfLiteNode* node;
       TfLiteRegistration* reg;
-      context->GetNodeAndRegistration(context, node_index, &node, &reg);
+      TF_LITE_ENSURE_STATUS(
+          context->GetNodeAndRegistration(context, node_index, &node, &reg));
+
+      const bool hybrid_op = IsHybridOperator(context, reg->builtin_code, node);
+
       // Map inputs to NN API tensor indices.
       for (auto input_index : TfLiteIntArrayView(node->inputs)) {
         if (input_index == kOptionalTensor &&
@@ -980,7 +1157,7 @@ class NNAPIDelegateKernel {
           // tensor when supported by NNAPI.
           TF_LITE_ENSURE_STATUS(builder.AddVectorFloat32Operand(nullptr, 0));
         } else {
-          TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index));
+          TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index, hybrid_op));
         }
       }
       // Get op type and operands
@@ -993,6 +1170,11 @@ class NNAPIDelegateKernel {
         TF_LITE_ENSURE_STATUS(builder.AddTensorOutput(output_index));
       }
 
+      // Dequantize operators may have to be added in case inputs are to be
+      // floating-point.
+      AddDequantizeOperatorsWhereNeeded(context, reg->builtin_code, node,
+                                        &builder);
+
       builder.FinalizeAddOperation(nn_op_type);
     }
     return kTfLiteOk;
@@ -1026,7 +1208,7 @@ class NNAPIDelegateKernel {
       total_output_byte_size += context->tensors[i].bytes;
     }
 
-    // Add state output tensors as model inputs
+    // Add state output tensors as model outputs.
     for (int i : model_state_outputs_) {
       outputs.push_back(i);
     }
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
index 5da052eb42275d684bfbf83e7b52227ccbb97a06..37512386389aef811bd1c9c3d03621e0a188a7e7 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -49,6 +49,24 @@ class SingleOpModelWithNNAPI : public SingleOpModel {
                                  const std::vector<int>& dims) {
     return interpreter_->ResizeInputTensor(tensor_index, dims);
   }
+
+ protected:
+  void SetData(int index, TensorType type, std::initializer_list<float> data) {
+    switch (type) {
+      case TensorType_FLOAT32:
+        PopulateTensor(index, data);
+        break;
+      case TensorType_INT32:
+        QuantizeAndPopulate<int32_t>(index, data);
+        break;
+      case TensorType_UINT8:
+        QuantizeAndPopulate<uint8_t>(index, data);
+        break;
+      default:
+        FAIL() << "Type not supported: " << type;
+        break;
+    }
+  }
 };
 
 class FloatAddOpModel : public SingleOpModelWithNNAPI {
@@ -225,14 +243,15 @@ TEST(NNAPIDelegate, L2PoolWithNoActivation) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({3.5, 6.5}));
 }
 
-class BaseConvolutionOpModel : public SingleOpModelWithNNAPI {
+class ConvolutionOpModel : public SingleOpModelWithNNAPI {
  public:
-  BaseConvolutionOpModel(
+  ConvolutionOpModel(
       const TensorData& input, const TensorData& filter,
       const TensorData& output, int stride_width = 2, int stride_height = 2,
       enum Padding padding = Padding_VALID,
       enum ActivationFunctionType activation = ActivationFunctionType_NONE,
-      int dilation_width_factor = 1, int dilation_height_factor = 1) {
+      int dilation_width_factor = 1, int dilation_height_factor = 1)
+      : input_type_(input.type), filter_type_(filter.type) {
     input_ = AddInput(input);
     filter_ = AddInput(filter);
 
@@ -249,7 +268,8 @@ class BaseConvolutionOpModel : public SingleOpModelWithNNAPI {
     }
 
     output_ = AddOutput(output);
-    if (input.type != TensorType_FLOAT32) {
+
+    if (input_type_ != TensorType_FLOAT32) {
       // The following is required by quantized inference. It is the unittest's
       // responsibility to make sure the output scale falls into the correct
       // range.
@@ -265,56 +285,53 @@ class BaseConvolutionOpModel : public SingleOpModelWithNNAPI {
     BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)});
   }
 
- protected:
-  int input_;
-  int filter_;
-  int bias_;
-  int output_;
-};
-
-class ConvolutionOpModel : public BaseConvolutionOpModel {
- public:
-  using BaseConvolutionOpModel::BaseConvolutionOpModel;
-
-  void SetFilter(std::initializer_list<float> f) { PopulateTensor(filter_, f); }
-
-  void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
-
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor(input_, data);
-  }
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-};
-
-class QuantizedConvolutionOpModel : public BaseConvolutionOpModel {
- public:
-  using BaseConvolutionOpModel::BaseConvolutionOpModel;
-
   void SetInput(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
+    SetData(input_, input_type_, data);
   }
 
   void SetFilter(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(filter_, data);
+    SetData(filter_, filter_type_, data);
   }
 
   void SetBias(std::initializer_list<float> data) {
-    QuantizeAndPopulate<int32_t>(bias_, data);
+    const auto bias_type =
+        (input_type_ == TensorType_FLOAT32) ? input_type_ : TensorType_INT32;
+    SetData(bias_, bias_type, data);
   }
 
-  std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
-  std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+  std::vector<float> GetOutput() {
+    if (input_type_ == TensorType_FLOAT32) {
+      return ExtractVector<float>(output_);
+    } else {
+      return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                                 GetScale(output_), GetZeroPoint(output_));
+    }
   }
+
+  std::vector<uint8_t> GetQuantizedOutput() {
+    if (input_type_ == TensorType_FLOAT32) {
+      return {};  // Not supported.
+    } else {
+      return ExtractVector<uint8_t>(output_);
+    }
+  }
+
+ protected:
+  int input_;
+  int filter_;
+  int bias_;
+  int output_;
+
+  const TensorType input_type_;
+  const TensorType filter_type_;
 };
 
 // In this tests we set the input and output scales so that the results
 // match exactly the 'non-quantized' version.
-TEST(NNAPIDelegate, SimpleTestQuantized) {
-  QuantizedConvolutionOpModel m({TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64},
-                                {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64},
-                                {TensorType_UINT8, {}, -127, 128});
+TEST(ConvolutionOpTest, SimpleTestQuantized) {
+  ConvolutionOpModel m({TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64},
+                       {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64},
+                       {TensorType_UINT8, {}, -127, 128});
   m.SetInput({
       // First batch
       1, 1, 1, 1,  // row = 1
@@ -332,25 +349,55 @@ TEST(NNAPIDelegate, SimpleTestQuantized) {
 
   m.Invoke();
 
-  EXPECT_THAT(m.GetDequantizedOutput(),
-              ElementsAreArray(ArrayFloatNear(
-                  {
-                      18, 2, 5,  // first batch, left
-                      18, 2, 5,  // first batch, right
-                      17, 4, 3,  // second batch, left
-                      37, 4, 3,  // second batch, right
-                  },
-                  1e-5)));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {
+                                     18, 2, 5,  // first batch, left
+                                     18, 2, 5,  // first batch, right
+                                     17, 4, 3,  // second batch, left
+                                     37, 4, 3,  // second batch, right
+                                 },
+                                 1e-5)));
   // For good  measure, let's also verify the quantized values:
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
-                                 145, 129, 132,  //
-                                 145, 129, 132,  //
-                                 144, 131, 130,  //
-                                 164, 131, 130,  //
-                             }));
+  EXPECT_THAT(m.GetQuantizedOutput(), ElementsAreArray({
+                                          145, 129, 132,  //
+                                          145, 129, 132,  //
+                                          144, 131, 130,  //
+                                          164, 131, 130,  //
+                                      }));
 }
 
-TEST(NNAPIDelegate, Conv2DWithNoActivation) {
+TEST(ConvolutionOpTest, FloatInputQuantizedWeights) {
+  ConvolutionOpModel m({TensorType_FLOAT32, {2, 2, 4, 1}},
+                       {TensorType_UINT8, {3, 2, 2, 1}, 0, 64},
+                       {TensorType_FLOAT32, {}});
+  m.SetInput({
+      // First batch
+      1, 1, 1, 2,  // row = 1
+      2, 2, 2, 1,  // row = 2
+      // Second batch
+      1, 2, 3, 4,  // row = 1
+      1, 2, 3, 4,  // row = 2
+  });
+  m.SetFilter({
+      1, 2, 3, 4,  // first 2x2 filter
+      0, 1, 0, 1,  // second 2x2 filter
+      0, 0, 1, 1,  // third 2x2 filter
+  });
+  m.SetBias({1, 2, 3});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {
+                                     18, 5, 7,    // first batch, left
+                                     16, 5, 6,    // first batch, right
+                                     17, 6, 6,    // second batch, left
+                                     37, 10, 10,  // second batch, right
+                                 },
+                                 0.2)));
+}
+
+TEST(ConvolutionOpTest, NoActivation) {
   ConvolutionOpModel m({TensorType_FLOAT32, {2, 2, 4, 1}},
                        {TensorType_FLOAT32, {3, 2, 2, 1}},
                        {TensorType_FLOAT32, {}});
@@ -458,56 +505,48 @@ TEST(NNAPIDelegate, DepthwiseConv2DWithNoActivation) {
                              }));
 }
 
-class FloatFullyConnectedOpModel : public SingleOpModelWithNNAPI {
+class FullyConnectedOpModel : public SingleOpModelWithNNAPI {
  public:
-  FloatFullyConnectedOpModel(int units, int batches, const TensorData& input,
-                             const TensorData& output = {TensorType_FLOAT32})
-      : batches_(batches), units_(units) {
-    int total_input_size = 1;
-    for (int i = 0; i < input.shape.size(); ++i) {
-      total_input_size *= input.shape[i];
-    }
-    input_size_ = total_input_size / batches_;
-
+  FullyConnectedOpModel(
+      const TensorData& input, const TensorData& weights,
+      const TensorData& output,
+      enum ActivationFunctionType activation = ActivationFunctionType_NONE)
+      : input_type_(input.type), weights_type_(weights.type) {
     input_ = AddInput(input);
-    weights_ =
-        AddInput({input.type, {units_, input_size_}, input.min, input.max});
+    weights_ = AddInput(weights);
 
+    const int units = weights.shape[0];
     if (input.type == TensorType_FLOAT32) {
-      bias_ = AddInput({TensorType_FLOAT32, {units_}});
+      bias_ = AddInput({TensorType_FLOAT32, {units}});
     } else {
       // This is a quantized version. The scale of 'bias' depends on the scales
       // of input and filter. Supposedly this is correctly set during quantized
       // training.
       auto bias_scale = GetScale(input_) * GetScale(weights_);
-      TensorData bias{TensorType_INT32, {units_}, 0, 0, bias_scale};
+      TensorData bias{TensorType_INT32, {units}, 0, 0, bias_scale};
       bias_ = AddInput(bias);
     }
 
     output_ = AddOutput(output);
 
-    SetBuiltinOp(
-        BuiltinOperator_FULLY_CONNECTED, BuiltinOptions_FullyConnectedOptions,
-        CreateFullyConnectedOptions(builder_, ActivationFunctionType_RELU)
-            .Union());
+    SetBuiltinOp(BuiltinOperator_FULLY_CONNECTED,
+                 BuiltinOptions_FullyConnectedOptions,
+                 CreateFullyConnectedOptions(builder_, activation).Union());
     BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
   }
 
-  int input_size() { return input_size_; }
-  int num_units() { return units_; }
-  int num_batches() { return batches_; }
-
-  void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
-
-  void SetWeights(std::initializer_list<float> f) {
-    PopulateTensor(weights_, f);
+  void SetInput(std::initializer_list<float> data) {
+    SetData(input_, input_type_, data);
   }
 
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor(input_, data);
+  void SetWeights(std::initializer_list<float> data) {
+    SetData(weights_, weights_type_, data);
   }
-  void SetInput(int offset, float* begin, float* end) {
-    PopulateTensor(input_, offset, begin, end);
+
+  void SetBias(std::initializer_list<float> data) {
+    const auto bias_type =
+        (input_type_ == TensorType_FLOAT32) ? input_type_ : TensorType_INT32;
+    SetData(bias_, bias_type, data);
   }
 
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
@@ -518,14 +557,14 @@ class FloatFullyConnectedOpModel : public SingleOpModelWithNNAPI {
   int bias_;
   int output_;
 
-  int batches_;
-  int units_;
-  int input_size_;
+  const TensorType input_type_;
+  const TensorType weights_type_;
 };
 
-TEST(NNAPIDelegate, FullyConnectedSimpleTest) {
-  FloatFullyConnectedOpModel m(/*units=*/3, /*batches=*/2,
-                               /*input=*/{TensorType_FLOAT32, {2, 10}});
+TEST(FullyConnectedOpTest, SimpleTest) {
+  FullyConnectedOpModel m(/*input=*/{TensorType_FLOAT32, {2, 10}},
+                          /*weights=*/{TensorType_FLOAT32, {3, 10}},
+                          /*output=*/{TensorType_FLOAT32});
   m.SetWeights({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
@@ -543,6 +582,28 @@ TEST(NNAPIDelegate, FullyConnectedSimpleTest) {
   EXPECT_THAT(m.GetOutput(), ElementsAre(24, 25, 26, 58, 59, 60));
 }
 
+TEST(FullyConnectedOpTest, FloatInputQuantizedWeights) {
+  FullyConnectedOpModel m(/*input=*/{TensorType_FLOAT32, {2, 10}},
+                          /*weights=*/{TensorType_UINT8, {3, 10}, 0, 64},
+                          /*output=*/{TensorType_FLOAT32});
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({24, 25, 26, 58, 59, 60}, 1.3)));
+}
+
 class SoftmaxOpModel : public SingleOpModelWithNNAPI {
  public:
   SoftmaxOpModel(int batches, int size, float beta)
diff --git a/tensorflow/lite/experimental/c/BUILD b/tensorflow/lite/experimental/c/BUILD
index 2f0f4327b7028feaa19719352c0d9c67f51b9f39..ac71c9bd34e524c41e70ec2e724ed30680b63932 100644
--- a/tensorflow/lite/experimental/c/BUILD
+++ b/tensorflow/lite/experimental/c/BUILD
@@ -20,15 +20,13 @@ tflite_cc_shared_object(
     name = "libtensorflowlite_c.so",
     linkopts = select({
         "//tensorflow:darwin": [
-            "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
-            "$(location //tensorflow/lite/experimental/c:exported_symbols.lds)",
+            "-Wl,-exported_symbols_list,$(location //tensorflow/lite/experimental/c:exported_symbols.lds)",
             "-Wl,-install_name,@rpath/libtensorflowlite_c.so",
         ],
         "//tensorflow:windows": [],
         "//conditions:default": [
             "-z defs",
-            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "$(location //tensorflow/lite/experimental/c:version_script.lds)",
+            "-Wl,--version-script,$(location //tensorflow/lite/experimental/c:version_script.lds)",
         ],
     }),
     deps = [
diff --git a/tensorflow/lite/experimental/examples/lstm/BUILD b/tensorflow/lite/experimental/examples/lstm/BUILD
index 827b10449b4dc06bfb84226fd44221255f29f987..61587a0ba42ed742cfd4203a481dbb596fa0502e 100644
--- a/tensorflow/lite/experimental/examples/lstm/BUILD
+++ b/tensorflow/lite/experimental/examples/lstm/BUILD
@@ -5,27 +5,29 @@ package(default_visibility = ["//tensorflow:internal"])
 load("//tensorflow:tensorflow.bzl", "py_test")
 
 py_library(
-    name = "tflite_lstm",
-    srcs = ["tflite_lstm.py"],
+    name = "rnn",
+    srcs = ["rnn.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/lite/python:lite",
+        "//tensorflow/lite/python:op_hint",
         "//tensorflow/python:framework",
+        "//tensorflow/python:layers_base",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
         "@six_archive//:six",
     ],
 )
 
 py_library(
-    name = "tflite_rnn",
-    srcs = ["tflite_rnn.py"],
+    name = "rnn_cell",
+    srcs = ["rnn_cell.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/lite/python:lite",
+        "//tensorflow/lite/python:op_hint",
         "//tensorflow/python:framework",
+        "//tensorflow/python:rnn_cell",
         "@six_archive//:six",
     ],
 )
@@ -40,7 +42,8 @@ py_test(
         "no_pip",
     ],
     deps = [
-        ":tflite_lstm",
+        ":rnn",
+        ":rnn_cell",
         "//tensorflow:tensorflow_py",
         "//tensorflow/examples/tutorials/mnist:input_data",
         "//tensorflow/lite/python:lite",
@@ -62,7 +65,8 @@ py_test(
         "no_pip",
     ],
     deps = [
-        ":tflite_rnn",
+        ":rnn",
+        ":rnn_cell",
         "//tensorflow:tensorflow_py",
         "//tensorflow/examples/tutorials/mnist:input_data",
         "//tensorflow/lite/python:lite",
@@ -84,7 +88,8 @@ py_test(
         "no_pip",
     ],
     deps = [
-        ":tflite_lstm",
+        ":rnn",
+        ":rnn_cell",
         "//tensorflow:tensorflow_py",
         "//tensorflow/examples/tutorials/mnist:input_data",
         "//tensorflow/lite/python:lite",
@@ -106,7 +111,8 @@ py_test(
         "no_pip",
     ],
     deps = [
-        ":tflite_rnn",
+        ":rnn",
+        ":rnn_cell",
         "//tensorflow:tensorflow_py",
         "//tensorflow/examples/tutorials/mnist:input_data",
         "//tensorflow/lite/python:lite",
@@ -117,3 +123,13 @@ py_test(
         "@six_archive//:six",
     ],
 )
+
+py_library(
+    name = "tflite_lstm_ops",
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":rnn",
+        ":rnn_cell",
+    ],
+)
diff --git a/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_lstm_test.py b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_lstm_test.py
index 9dc81096bcd5bd64f9989662970831105edbbe84..867932e9153dc4b5eb751c49e8a5e350a2263183 100644
--- a/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_lstm_test.py
+++ b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_lstm_test.py
@@ -20,7 +20,7 @@ import numpy as np
 import tensorflow as tf
 
 from tensorflow.examples.tutorials.mnist import input_data
-from tensorflow.lite.experimental.examples.lstm.tflite_lstm import TFLiteLSTMCell
+from tensorflow.lite.experimental.examples.lstm.rnn import bidirectional_dynamic_rnn
 from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
 from tensorflow.lite.python.op_hint import find_all_hinted_output_nodes
 from tensorflow.python.framework import test_util
@@ -56,19 +56,21 @@ class BidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
 
   def buildLstmLayer(self):
     return tf.nn.rnn_cell.MultiRNNCell([
-        TFLiteLSTMCell(
+        tf.lite.experimental.nn.TFLiteLSTMCell(
             self.num_units, use_peepholes=True, forget_bias=0, name="rnn1"),
-        TFLiteLSTMCell(self.num_units, num_proj=8, forget_bias=0, name="rnn2"),
-        TFLiteLSTMCell(
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, num_proj=8, forget_bias=0, name="rnn2"),
+        tf.lite.experimental.nn.TFLiteLSTMCell(
             self.num_units // 2,
             use_peepholes=True,
             num_proj=8,
             forget_bias=0,
             name="rnn3"),
-        TFLiteLSTMCell(self.num_units, forget_bias=0, name="rnn4")
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, forget_bias=0, name="rnn4")
     ])
 
-  def buildModel(self, fw_lstm_layer, bw_lstm_layer):
+  def buildModel(self, fw_lstm_layer, bw_lstm_layer, is_dynamic_rnn):
     # Weights and biases for output softmax layer.
     out_weights = tf.Variable(
         tf.random_normal([self.num_units * 2, self.n_classes]))
@@ -78,14 +80,28 @@ class BidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
     x = tf.placeholder(
         "float", [None, self.time_steps, self.n_input], name="INPUT_IMAGE")
 
-    lstm_input = tf.unstack(x, self.time_steps, 1)
-    outputs, _, _ = tf.nn.static_bidirectional_rnn(
-        fw_lstm_layer, bw_lstm_layer, lstm_input, dtype="float32")
-
-    # Compute logits by multiplying outputs[-1] of shape [batch_size,num_units]
-    # by the softmax layer's out_weight of shape [num_units,n_classes]
+    if is_dynamic_rnn:
+      lstm_inputs = tf.transpose(x, [1, 0, 2])
+      outputs, _ = bidirectional_dynamic_rnn(
+          fw_lstm_layer,
+          bw_lstm_layer,
+          lstm_inputs,
+          dtype="float32",
+          time_major=True)
+      fw_outputs, bw_outputs = outputs
+      output = tf.concat([fw_outputs, bw_outputs], 2)
+      output = tf.unstack(output, axis=0)
+      output = output[-1]
+    else:
+      lstm_input = tf.unstack(x, self.time_steps, 1)
+      outputs, _, _ = tf.nn.static_bidirectional_rnn(
+          fw_lstm_layer, bw_lstm_layer, lstm_input, dtype="float32")
+      output = outputs[-1]
+
+    # Compute logits by multiplying output of shape [batch_size,num_units*2]
+    # by the softmax layer's out_weight of shape [num_units*2,n_classes]
     # plus out_bias
-    prediction = tf.matmul(outputs[-1], out_weights) + out_bias
+    prediction = tf.matmul(output, out_weights) + out_bias
     output_class = tf.nn.softmax(prediction, name="OUTPUT_CLASS")
 
     return x, prediction, output_class
@@ -111,13 +127,15 @@ class BidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
                                  self.n_input))
       sess.run(opt, feed_dict={x: batch_x, y: batch_y})
 
-  def saveAndRestoreModel(self, fw_lstm_layer, bw_lstm_layer, sess, saver):
+  def saveAndRestoreModel(self, fw_lstm_layer, bw_lstm_layer, sess, saver,
+                          is_dynamic_rnn):
     model_dir = tempfile.mkdtemp()
     saver.save(sess, model_dir)
 
     # Reset the graph.
     tf.reset_default_graph()
-    x, prediction, output_class = self.buildModel(fw_lstm_layer, bw_lstm_layer)
+    x, prediction, output_class = self.buildModel(fw_lstm_layer, bw_lstm_layer,
+                                                  is_dynamic_rnn)
 
     new_sess = tf.Session(config=CONFIG)
     saver = tf.train.Saver()
@@ -173,12 +191,34 @@ class BidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
     sess = tf.Session(config=CONFIG)
 
     x, prediction, output_class = self.buildModel(self.buildLstmLayer(),
-                                                  self.buildLstmLayer())
+                                                  self.buildLstmLayer(), False)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildLstmLayer(), self.buildLstmLayer(), sess, saver, False)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+  @test_util.enable_control_flow_v2
+  def testDynamicRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(self.buildLstmLayer(),
+                                                  self.buildLstmLayer(), True)
     self.trainModel(x, prediction, output_class, sess)
 
     saver = tf.train.Saver()
     x, prediction, output_class, new_sess = self.saveAndRestoreModel(
-        self.buildLstmLayer(), self.buildLstmLayer(), sess, saver)
+        self.buildLstmLayer(),
+        self.buildLstmLayer(),
+        sess,
+        saver,
+        is_dynamic_rnn=True)
 
     test_inputs, expected_output, frozen_graph = self.getInferenceResult(
         x, output_class, new_sess)
diff --git a/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_rnn_test.py b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_rnn_test.py
index 7a937ce47f7fb049f50307c049f33f8e0060b986..0a05af41be2e77904992b93ecb7395ad21918714 100644
--- a/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_rnn_test.py
+++ b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_rnn_test.py
@@ -22,7 +22,7 @@ import tensorflow as tf
 from tensorflow import flags
 
 from tensorflow.examples.tutorials.mnist import input_data
-from tensorflow.lite.experimental.examples.lstm.tflite_rnn import TfLiteRNNCell
+from tensorflow.lite.experimental.examples.lstm.rnn import bidirectional_dynamic_rnn
 from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
 from tensorflow.lite.python.op_hint import find_all_hinted_output_nodes
 from tensorflow.python.framework import test_util
@@ -63,11 +63,11 @@ class BidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
 
   def buildRnnLayer(self):
     return tf.nn.rnn_cell.MultiRNNCell([
-        TfLiteRNNCell(self.num_units, name="rnn1"),
-        TfLiteRNNCell(self.num_units, name="rnn2")
+        tf.lite.experimental.nn.TfLiteRNNCell(self.num_units, name="rnn1"),
+        tf.lite.experimental.nn.TfLiteRNNCell(self.num_units, name="rnn2")
     ])
 
-  def buildModel(self, fw_rnn_layer, bw_rnn_layer):
+  def buildModel(self, fw_rnn_layer, bw_rnn_layer, is_dynamic_rnn):
     # Weights and biases for output softmax layer.
     out_weights = tf.Variable(
         tf.random_normal([self.num_units * 2, self.n_classes]))
@@ -77,14 +77,28 @@ class BidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
     x = tf.placeholder(
         "float", [None, self.time_steps, self.n_input], name="INPUT_IMAGE")
 
-    rnn_input = tf.unstack(x, self.time_steps, 1)
-    outputs, _, _ = tf.nn.static_bidirectional_rnn(
-        fw_rnn_layer, bw_rnn_layer, rnn_input, dtype="float32")
-
-    # Compute logits by multiplying outputs[-1] of shape [batch_size,num_units]
-    # by the softmax layer's out_weight of shape [num_units,n_classes]
+    if is_dynamic_rnn:
+      rnn_inputs = tf.transpose(x, [1, 0, 2])
+      outputs, _ = bidirectional_dynamic_rnn(
+          fw_rnn_layer,
+          bw_rnn_layer,
+          rnn_inputs,
+          dtype="float32",
+          time_major=True)
+      fw_outputs, bw_outputs = outputs
+      output = tf.concat([fw_outputs, bw_outputs], 2)
+      output = tf.unstack(output, axis=0)
+      output = output[-1]
+    else:
+      rnn_inputs = tf.unstack(x, self.time_steps, 1)
+      outputs, _, _ = tf.nn.static_bidirectional_rnn(
+          fw_rnn_layer, bw_rnn_layer, rnn_inputs, dtype="float32")
+      output = outputs[-1]
+
+    # Compute logits by multiplying output of shape [batch_size,num_units*2]
+    # by the softmax layer's out_weight of shape [num_units*2,n_classes]
     # plus out_bias
-    prediction = tf.matmul(outputs[-1], out_weights) + out_bias
+    prediction = tf.matmul(output, out_weights) + out_bias
     output_class = tf.nn.softmax(prediction, name="OUTPUT_CLASS")
 
     return x, prediction, output_class
@@ -110,13 +124,15 @@ class BidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
                                  self.n_input))
       sess.run(opt, feed_dict={x: batch_x, y: batch_y})
 
-  def saveAndRestoreModel(self, fw_rnn_layer, bw_rnn_layer, sess, saver):
+  def saveAndRestoreModel(self, fw_rnn_layer, bw_rnn_layer, sess, saver,
+                          is_dynamic_rnn):
     model_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
     saver.save(sess, model_dir)
 
     # Reset the graph.
     tf.reset_default_graph()
-    x, prediction, output_class = self.buildModel(fw_rnn_layer, bw_rnn_layer)
+    x, prediction, output_class = self.buildModel(fw_rnn_layer, bw_rnn_layer,
+                                                  is_dynamic_rnn)
 
     new_sess = tf.Session(config=CONFIG)
     saver = tf.train.Saver()
@@ -169,12 +185,34 @@ class BidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
     sess = tf.Session(config=CONFIG)
 
     x, prediction, output_class = self.buildModel(self.buildRnnLayer(),
-                                                  self.buildRnnLayer())
+                                                  self.buildRnnLayer(), False)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildRnnLayer(), self.buildRnnLayer(), sess, saver, False)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+  @test_util.enable_control_flow_v2
+  def testDynamicRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(self.buildRnnLayer(),
+                                                  self.buildRnnLayer(), True)
     self.trainModel(x, prediction, output_class, sess)
 
     saver = tf.train.Saver()
     x, prediction, output_class, new_sess = self.saveAndRestoreModel(
-        self.buildRnnLayer(), self.buildRnnLayer(), sess, saver)
+        self.buildRnnLayer(),
+        self.buildRnnLayer(),
+        sess,
+        saver,
+        is_dynamic_rnn=True)
 
     test_inputs, expected_output, frozen_graph = self.getInferenceResult(
         x, output_class, new_sess)
diff --git a/tensorflow/lite/experimental/examples/lstm/rnn.py b/tensorflow/lite/experimental/examples/lstm/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..b92a350fa3679cd70e90206ca495ec4d246ef834
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/rnn.py
@@ -0,0 +1,428 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TfLite LSTMCell wrapper.
+
+TODO(renjieliu): Find a better home for this one.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.lite.python.op_hint as op_hint
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops.rnn import _best_effort_input_batch_size
+from tensorflow.python.ops.rnn import _dynamic_rnn_loop
+from tensorflow.python.ops.rnn import _should_cache
+from tensorflow.python.ops.rnn import _transpose_batch_time
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("lite.experimental.nn.dynamic_rnn")
+def dynamic_rnn(cell,
+                inputs,
+                sequence_length=None,
+                initial_state=None,
+                dtype=None,
+                parallel_iterations=None,
+                swap_memory=False,
+                time_major=True,
+                scope=None):
+  """Creates a recurrent neural network specified by RNNCell `cell`.
+
+  Performs fully dynamic unrolling of `inputs`.
+
+  Example:
+
+  ```python
+  # create a BasicRNNCell
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+
+  # 'outputs' is a tensor of shape [batch_size, max_time, cell_state_size]
+
+  # defining initial state
+  initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32)
+
+  # 'state' is a tensor of shape [batch_size, cell_state_size]
+  outputs, state = tf.nn.dynamic_rnn(rnn_cell, input_data,
+                                     initial_state=initial_state,
+                                     dtype=tf.float32)
+  ```
+
+  ```python
+  # create 2 LSTMCells
+  rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [128, 256]]
+
+  # create a RNN cell composed sequentially of a number of RNNCells
+  multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
+
+  # 'outputs' is a tensor of shape [batch_size, max_time, 256]
+  # 'state' is a N-tuple where N is the number of LSTMCells containing a
+  # tf.contrib.rnn.LSTMStateTuple for each cell
+  outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
+                                     inputs=data,
+                                     dtype=tf.float32)
+  ```
+
+
+  Args:
+    cell: An instance of RNNCell.
+    inputs: The RNN inputs.
+      If `time_major == False` (default), this must be a `Tensor` of shape:
+        `[batch_size, max_time, ...]`, or a nested tuple of such elements.
+      If `time_major == True`, this must be a `Tensor` of shape: `[max_time,
+        batch_size, ...]`, or a nested tuple of such elements. This may also be
+        a (possibly nested) tuple of Tensors satisfying this property.  The
+        first two dimensions must match across all the inputs, but otherwise the
+        ranks and other shape components may differ. In this case, input to
+        `cell` at each time-step will replicate the structure of these tuples,
+        except for the time dimension (from which the time is taken). The input
+        to `cell` at each time step will be a `Tensor` or (possibly nested)
+        tuple of Tensors each with dimensions `[batch_size, ...]`.
+    sequence_length: (optional) An int32/int64 vector sized `[batch_size]`. Used
+      to copy-through state and zero-out outputs when past a batch element's
+      sequence length.  So it's more for performance than correctness.
+    initial_state: (optional) An initial state for the RNN. If `cell.state_size`
+      is an integer, this must be a `Tensor` of appropriate type and shape
+      `[batch_size, cell.state_size]`. If `cell.state_size` is a tuple, this
+      should be a tuple of tensors having shapes `[batch_size, s] for s in
+      cell.state_size`.
+    dtype: (optional) The data type for the initial state and expected output.
+      Required if initial_state is not provided or RNN state has a heterogeneous
+      dtype.
+    parallel_iterations: (Default: 32).  The number of iterations to run in
+      parallel.  Those operations which do not have any temporal dependency and
+      can be run in parallel, will be.  This parameter trades off time for
+      space.  Values >> 1 use more memory but take less time, while smaller
+      values use less memory but computations take longer.
+    swap_memory: Transparently swap the tensors produced in forward inference
+      but needed for back prop from GPU to CPU.  This allows training RNNs which
+      would typically not fit on a single GPU, with very minimal (or no)
+      performance penalty.
+    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
+      these `Tensors` must be shaped `[max_time, batch_size, depth]`. If false,
+      these `Tensors` must be shaped `[batch_size, max_time, depth]`. Using
+      `time_major = True` is a bit more efficient because it avoids transposes
+      at the beginning and end of the RNN calculation.  However, most TensorFlow
+      data is batch-major, so by default this function accepts input and emits
+      output in batch-major form.
+    scope: VariableScope for the created subgraph; defaults to "rnn".
+
+  Returns:
+    A pair (outputs, state) where:
+
+    outputs: The RNN output `Tensor`.
+
+      If time_major == False (default), this will be a `Tensor` shaped:
+        `[batch_size, max_time, cell.output_size]`.
+
+      If time_major == True, this will be a `Tensor` shaped:
+        `[max_time, batch_size, cell.output_size]`.
+
+      Note, if `cell.output_size` is a (possibly nested) tuple of integers
+      or `TensorShape` objects, then `outputs` will be a tuple having the
+      same structure as `cell.output_size`, containing Tensors having shapes
+      corresponding to the shape data in `cell.output_size`.
+
+    state: The final state.  If `cell.state_size` is an int, this
+      will be shaped `[batch_size, cell.state_size]`.  If it is a
+      `TensorShape`, this will be shaped `[batch_size] + cell.state_size`.
+      If it is a (possibly nested) tuple of ints or `TensorShape`, this will
+      be a tuple having the corresponding shapes. If cells are `LSTMCells`
+      `state` will be a tuple containing a `LSTMStateTuple` for each cell.
+
+  Raises:
+    TypeError: If `cell` is not an instance of RNNCell.
+    ValueError: If inputs is None or an empty list.
+    RuntimeError: If not using control flow v2.
+  """
+
+  # Currently only support time_major == True case.
+  assert time_major
+
+  # TODO(b/123051275): We need to check if the cells are TfLiteLSTMCells or
+  # TfLiteRNNCells.
+  rnn_cell_impl.assert_like_rnncell("cell", cell)
+
+  if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
+    raise RuntimeError("OpHint dynamic rnn only supports control flow v2.")
+
+  parent_first_child_input = [{
+      "parent_ophint_input_index": 0,
+      "first_child_ophint_input_index": 0
+  }]
+  parent_last_child_output = [{
+      "parent_output_index": 0,
+      # For LstmCell, the index is 2.
+      # For RnnCell, the index is 1.
+      # So we use -1 meaning it's the last one.
+      "child_output_index": -1
+  }]
+  internal_children_input_output = [{
+      "child_input_index": 0,
+      # For LstmCell, the index is 2.
+      # For RnnCell, the index is 1.
+      # So we use -1 meaning it's the last one.
+      "child_output_index": -1
+  }]
+  inputs_outputs_mappings = {
+      "parent_first_child_input": parent_first_child_input,
+      "parent_last_child_output": parent_last_child_output,
+      "internal_children_input_output": internal_children_input_output
+  }
+  tflite_wrapper = op_hint.OpHint(
+      "TfLiteDynamicRnn",
+      level=2,
+      children_inputs_mappings=inputs_outputs_mappings)
+  with vs.variable_scope(scope or "rnn") as varscope:
+    # Create a new scope in which the caching device is either
+    # determined by the parent scope, or is set to place the cached
+    # Variable using the same placement as for the rest of the RNN.
+    if _should_cache():
+      if varscope.caching_device is None:
+        varscope.set_caching_device(lambda op: op.device)
+
+    inputs = tflite_wrapper.add_input(inputs, name="input", index_override=0)
+
+    # By default, time_major==False and inputs are batch-major: shaped
+    #   [batch, time, depth]
+    # For internal calculations, we transpose to [time, batch, depth]
+    flat_input = nest.flatten(inputs)
+
+    if not time_major:
+      # (batch, time, depth) => (time, batch, depth)
+      flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input]
+      flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input)
+
+    parallel_iterations = parallel_iterations or 32
+    if sequence_length is not None:
+      sequence_length = math_ops.to_int32(sequence_length)
+      if sequence_length.get_shape().rank not in (None, 1):
+        raise ValueError(
+            "sequence_length must be a vector of length batch_size, "
+            "but saw shape: %s" % sequence_length.get_shape())
+      sequence_length = array_ops.identity(  # Just to find it in the graph.
+          sequence_length,
+          name="sequence_length")
+
+    batch_size = _best_effort_input_batch_size(flat_input)
+
+    if initial_state is not None:
+      state = initial_state
+    else:
+      if not dtype:
+        raise ValueError("If there is no initial_state, you must give a dtype.")
+      if getattr(cell, "get_initial_state", None) is not None:
+        state = cell.get_initial_state(
+            inputs=None, batch_size=batch_size, dtype=dtype)
+      else:
+        state = cell.zero_state(batch_size, dtype)
+
+    def _assert_has_shape(x, shape):
+      x_shape = array_ops.shape(x)
+      packed_shape = array_ops.stack(shape)
+      return control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)), [
+              "Expected shape for Tensor %s is " % x.name, packed_shape,
+              " but saw shape: ", x_shape
+          ])
+
+    if not context.executing_eagerly() and sequence_length is not None:
+      # Perform some shape validation
+      with ops.control_dependencies(
+          [_assert_has_shape(sequence_length, [batch_size])]):
+        sequence_length = array_ops.identity(
+            sequence_length, name="CheckSeqLen")
+
+    inputs = nest.pack_sequence_as(structure=inputs, flat_sequence=flat_input)
+
+    outputs, final_state = _dynamic_rnn_loop(
+        cell,
+        inputs,
+        state,
+        parallel_iterations=parallel_iterations,
+        swap_memory=swap_memory,
+        sequence_length=sequence_length,
+        dtype=dtype)
+
+    # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth].
+    # If we are performing batch-major calculations, transpose output back
+    # to shape [batch, time, depth]
+    if not time_major:
+      # (time, batch, depth) => (batch, time, depth)
+      outputs = nest.map_structure(_transpose_batch_time, outputs)
+    outputs = tflite_wrapper.add_output(outputs, name="outputs")
+
+    return outputs, final_state
+
+
+def bidirectional_dynamic_rnn(cell_fw,
+                              cell_bw,
+                              inputs,
+                              sequence_length=None,
+                              initial_state_fw=None,
+                              initial_state_bw=None,
+                              dtype=None,
+                              parallel_iterations=None,
+                              swap_memory=False,
+                              time_major=False,
+                              scope=None):
+  """Creates a dynamic version of bidirectional recurrent neural network.
+
+  Takes input and builds independent forward and backward RNNs. The input_size
+  of forward and backward cell must match. The initial state for both directions
+  is zero by default (but can be set optionally) and no intermediate states are
+  ever returned -- the network is fully unrolled for the given (passed in)
+  length(s) of the sequence(s) or completely unrolled if length(s) is not
+  given.
+
+  Args:
+    cell_fw: An instance of RNNCell, to be used for forward direction.
+    cell_bw: An instance of RNNCell, to be used for backward direction.
+    inputs: The RNN inputs.
+      If time_major == False (default), this must be a tensor of shape:
+        `[batch_size, max_time, ...]`, or a nested tuple of such elements.
+      If time_major == True, this must be a tensor of shape: `[max_time,
+        batch_size, ...]`, or a nested tuple of such elements.
+    sequence_length: (optional) An int32/int64 vector, size `[batch_size]`,
+      containing the actual lengths for each of the sequences in the batch. If
+      not provided, all batch entries are assumed to be full sequences; and time
+      reversal is applied from time `0` to `max_time` for each sequence.
+    initial_state_fw: (optional) An initial state for the forward RNN. This must
+      be a tensor of appropriate type and shape `[batch_size,
+      cell_fw.state_size]`. If `cell_fw.state_size` is a tuple, this should be a
+      tuple of tensors having shapes `[batch_size, s] for s in
+      cell_fw.state_size`.
+    initial_state_bw: (optional) Same as for `initial_state_fw`, but using the
+      corresponding properties of `cell_bw`.
+    dtype: (optional) The data type for the initial states and expected output.
+      Required if initial_states are not provided or RNN states have a
+      heterogeneous dtype.
+    parallel_iterations: (Default: 32).  The number of iterations to run in
+      parallel.  Those operations which do not have any temporal dependency and
+      can be run in parallel, will be.  This parameter trades off time for
+      space.  Values >> 1 use more memory but take less time, while smaller
+      values use less memory but computations take longer.
+    swap_memory: Transparently swap the tensors produced in forward inference
+      but needed for back prop from GPU to CPU.  This allows training RNNs which
+      would typically not fit on a single GPU, with very minimal (or no)
+      performance penalty.
+    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
+      these `Tensors` must be shaped `[max_time, batch_size, depth]`. If false,
+      these `Tensors` must be shaped `[batch_size, max_time, depth]`. Using
+      `time_major = True` is a bit more efficient because it avoids transposes
+      at the beginning and end of the RNN calculation.  However, most TensorFlow
+      data is batch-major, so by default this function accepts input and emits
+      output in batch-major form.
+    scope: VariableScope for the created subgraph; defaults to
+      "bidirectional_rnn"
+
+  Returns:
+    A tuple (outputs, output_states) where:
+      outputs: A tuple (output_fw, output_bw) containing the forward and
+        the backward rnn output `Tensor`.
+        If time_major == False (default),
+          output_fw will be a `Tensor` shaped:
+          `[batch_size, max_time, cell_fw.output_size]`
+          and output_bw will be a `Tensor` shaped:
+          `[batch_size, max_time, cell_bw.output_size]`.
+        If time_major == True,
+          output_fw will be a `Tensor` shaped:
+          `[max_time, batch_size, cell_fw.output_size]`
+          and output_bw will be a `Tensor` shaped:
+          `[max_time, batch_size, cell_bw.output_size]`.
+        It returns a tuple instead of a single concatenated `Tensor`, unlike
+        in the `bidirectional_rnn`. If the concatenated one is preferred,
+        the forward and backward outputs can be concatenated as
+        `tf.concat(outputs, 2)`.
+      output_states: A tuple (output_state_fw, output_state_bw) containing
+        the forward and the backward final states of bidirectional rnn.
+
+  Raises:
+    TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`.
+  """
+  rnn_cell_impl.assert_like_rnncell("cell_fw", cell_fw)
+  rnn_cell_impl.assert_like_rnncell("cell_bw", cell_bw)
+
+  with vs.variable_scope(scope or "bidirectional_rnn"):
+    # Forward direction
+    with vs.variable_scope("fw") as fw_scope:
+      output_fw, output_state_fw = dynamic_rnn(
+          cell=cell_fw,
+          inputs=inputs,
+          sequence_length=sequence_length,
+          initial_state=initial_state_fw,
+          dtype=dtype,
+          parallel_iterations=parallel_iterations,
+          swap_memory=swap_memory,
+          time_major=time_major,
+          scope=fw_scope)
+
+    # Backward direction
+    if not time_major:
+      time_axis = 1
+      batch_axis = 0
+    else:
+      time_axis = 0
+      batch_axis = 1
+
+    def _reverse(input_, seq_lengths, seq_axis, batch_axis):
+      if seq_lengths is not None:
+        return array_ops.reverse_sequence(
+            input=input_,
+            seq_lengths=seq_lengths,
+            seq_axis=seq_axis,
+            batch_axis=batch_axis)
+      else:
+        return array_ops.reverse(input_, axis=[seq_axis])
+
+    with vs.variable_scope("bw") as bw_scope:
+
+      def _map_reverse(inp):
+        return _reverse(
+            inp,
+            seq_lengths=sequence_length,
+            seq_axis=time_axis,
+            batch_axis=batch_axis)
+
+      inputs_reverse = nest.map_structure(_map_reverse, inputs)
+      tmp, output_state_bw = dynamic_rnn(
+          cell=cell_bw,
+          inputs=inputs_reverse,
+          sequence_length=sequence_length,
+          initial_state=initial_state_bw,
+          dtype=dtype,
+          parallel_iterations=parallel_iterations,
+          swap_memory=swap_memory,
+          time_major=time_major,
+          scope=bw_scope)
+
+  output_bw = _reverse(
+      tmp,
+      seq_lengths=sequence_length,
+      seq_axis=time_axis,
+      batch_axis=batch_axis)
+
+  outputs = (output_fw, output_bw)
+  output_states = (output_state_fw, output_state_bw)
+
+  return (outputs, output_states)
diff --git a/tensorflow/lite/experimental/examples/lstm/tflite_lstm.py b/tensorflow/lite/experimental/examples/lstm/rnn_cell.py
similarity index 57%
rename from tensorflow/lite/experimental/examples/lstm/tflite_lstm.py
rename to tensorflow/lite/experimental/examples/lstm/rnn_cell.py
index e6d329ffd1d6261a75d08038887c48f80fff699c..eeec76444db248883d7d69fcc951d55d0c4aed33 100644
--- a/tensorflow/lite/experimental/examples/lstm/tflite_lstm.py
+++ b/tensorflow/lite/experimental/examples/lstm/rnn_cell.py
@@ -12,39 +12,151 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TfLite LSTMCell wrapper.
+"""TfLite BasicRnnCell wrapper.
 
 TODO(renjieliu): Find a better home for this one.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import tensorflow as tf
+import itertools
 
-from tensorflow.lite.python import lite
-from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
+import tensorflow.lite.python.op_hint as op_hint
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import initializers
 from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import rnn_cell_impl
-from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.ops.rnn import _best_effort_input_batch_size
-from tensorflow.python.ops.rnn import _dynamic_rnn_loop
-from tensorflow.python.ops.rnn import _should_cache
-from tensorflow.python.ops.rnn import _transpose_batch_time
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("lite.experimental.nn.TfLiteRNNCell")
+class TfLiteRNNCell(rnn_cell_impl.LayerRNNCell):
+  """The most basic RNN cell.
+
+  This is used only for TfLite, it provides hints and it also makes the
+  variables in the desired for the tflite ops.
+  """
+
+  def __init__(self,
+               num_units,
+               activation=None,
+               reuse=None,
+               name=None,
+               dtype=None,
+               **kwargs):
+    """Initializes the parameters for an RNN cell.
+
+    Args:
+      num_units: int, The number of units in the RNN cell.
+      activation: Nonlinearity to use.  Default: `tanh`. It could also be string
+        that is within Keras activation function names.
+      reuse: (optional) Python boolean describing whether to reuse variables in
+        an existing scope. Raises an error if not `True` and the existing scope
+        already has the given variables.
+      name: String, the name of the layer. Layers with the same name will share
+        weights, but to avoid mistakes we require reuse=True in such cases.
+      dtype: Default dtype of the layer (default of `None` means use the type of
+        the first input). Required when `build` is called before `call`.
+      **kwargs: Dict, keyword named properties for common layer attributes, like
+        `trainable` etc when constructing the cell from configs of get_config().
+
+    Raises:
+      ValueError: If the existing scope already has the given variables.
+    """
+    super(TfLiteRNNCell, self).__init__(
+        _reuse=reuse, name=name, dtype=dtype, **kwargs)
+
+    # Inputs must be Rank-2.
+    self.input_spec = base_layer.InputSpec(ndim=2)
+
+    self._tflite_wrapper = op_hint.OpHint("UnidirectionalSequenceRnn")
+    self._num_units = num_units
+    if activation:
+      self._activation = activations.get(activation)
+    else:
+      self._activation = math_ops.tanh
+
+  @property
+  def state_size(self):
+    return self._num_units
+
+  @property
+  def output_size(self):
+    return self._num_units
+
+  def build(self, inputs_shape):
+    """Builds the RNN cell.
+
+    Args:
+      inputs_shape: Rnn input tensor shape.
+
+    Raises:
+      ValueError: If last dimension of the input shape is not known.
+    """
+    if inputs_shape[-1] is None:
+      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" %
+                       (inputs_shape,))
+
+    input_depth = inputs_shape[-1]
+
+    def add_variable_wrapped(name, shape, initializer, index):
+      var = self.add_variable(name, shape=shape, initializer=initializer)
+      return self._tflite_wrapper.add_input(
+          var, name=name, index_override=index)
+
+    self._input_weights = add_variable_wrapped(
+        "input_weights", [self._num_units, input_depth], None, 1)
+    self._recurrent_weights = add_variable_wrapped(
+        "recurrent_weights", [self._num_units, self._num_units], None, 2)
+    self._bias = add_variable_wrapped(
+        "bias",
+        shape=[self._num_units],
+        initializer=init_ops.zeros_initializer(dtype=self.dtype),
+        index=3)
+
+    self.built = True
+
+  def call(self, inputs, state):
+    """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
+    inputs = self._tflite_wrapper.add_input(
+        inputs, tag="input", name="input", aggregate="stack", index_override=0)
+    state = self._tflite_wrapper.add_input(
+        state,
+        tag="hidden_state",
+        name="hidden_state",
+        aggregate="first",
+        index_override=4)
+    weights = array_ops.transpose(
+        array_ops.concat([self._input_weights, self._recurrent_weights], 1))
+    gate_inputs = math_ops.matmul(array_ops.concat([inputs, state], 1), weights)
+    gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
+    output = self._activation(gate_inputs)
+    output = self._tflite_wrapper.add_output(
+        output,
+        tag="output",
+        name="output",
+        index_override=1,
+        aggregate="stack")
+    return output, output
+
+  def get_config(self):
+    config = {
+        "num_units": self._num_units,
+        "activation": activations.serialize(self._activation),
+        "reuse": self._reuse,
+    }
+    base_config = super(TfLiteRNNCell, self).get_config()
+    return dict(itertools.chain(base_config.items(), config.items()))
+
+
+@tf_export("lite.experimental.nn.TFLiteLSTMCell")
 class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
   """Long short-term memory unit (LSTM) recurrent network cell.
 
@@ -142,7 +254,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
     # TODO(raziel): layers stuff -- chop if un-layerizing Op.
     self.input_spec = base_layer.InputSpec(ndim=2)
 
-    self._tflite_wrapper = lite.OpHint("UnidirectionalSequenceLstm")
+    self._tflite_wrapper = op_hint.OpHint("UnidirectionalSequenceLstm")
 
     self._num_units = num_units
     self._use_peepholes = use_peepholes
@@ -158,7 +270,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
 
     self._output_size = num_proj if num_proj else num_units
     self._state_size = (
-        tf.nn.rnn_cell.LSTMStateTuple(num_units, self._output_size)
+        rnn_cell_impl.LSTMStateTuple(num_units, self._output_size)
         if state_is_tuple else num_units + self._output_size)
 
   @property
@@ -329,24 +441,28 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
     # c is the final state.
     # m is the output.
     i = nn_ops.bias_add(
-        tf.matmul(
+        math_ops.matmul(
             inputs_and_m_prev,
-            tf.concat([self.input_to_input_w, self.cell_to_input_w], axis=1),
+            array_ops.concat([self.input_to_input_w, self.cell_to_input_w],
+                             axis=1),
             transpose_b=True), self.input_bias)
     f = nn_ops.bias_add(
-        tf.matmul(
+        math_ops.matmul(
             inputs_and_m_prev,
-            tf.concat([self.input_to_forget_w, self.cell_to_forget_w], axis=1),
+            array_ops.concat([self.input_to_forget_w, self.cell_to_forget_w],
+                             axis=1),
             transpose_b=True), self.forget_bias)
     o = nn_ops.bias_add(
-        tf.matmul(
+        math_ops.matmul(
             inputs_and_m_prev,
-            tf.concat([self.input_to_output_w, self.cell_to_output_w], axis=1),
+            array_ops.concat([self.input_to_output_w, self.cell_to_output_w],
+                             axis=1),
             transpose_b=True), self.output_bias)
     j = nn_ops.bias_add(
-        tf.matmul(
+        math_ops.matmul(
             inputs_and_m_prev,
-            tf.concat([self.input_to_cell_w, self.cell_to_cell_w], axis=1),
+            array_ops.concat([self.input_to_cell_w, self.cell_to_cell_w],
+                             axis=1),
             transpose_b=True), self.cell_bias)
 
     # Diagonal connections
@@ -369,7 +485,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
       m = sigmoid(o) * self._activation(c)
 
     if self._num_proj is not None:
-      transposed_proj_kernel = tf.transpose(self._proj_kernel)
+      transposed_proj_kernel = array_ops.transpose(self._proj_kernel)
       m = math_ops.matmul(m, transposed_proj_kernel)
 
       if self._proj_clip is not None:
@@ -383,7 +499,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
         m, tag="m", name="m", index_override=2, aggregate="stack")
 
     new_state = (
-        tf.nn.rnn_cell.LSTMStateTuple(c, m)
+        rnn_cell_impl.LSTMStateTuple(c, m)
         if self._state_is_tuple else array_ops.concat([c, m], 1))
     return m, new_state
 
@@ -404,240 +520,3 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
     }
     base_config = super(TFLiteLSTMCell, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
-
-
-def dynamic_rnn(cell,
-                inputs,
-                sequence_length=None,
-                initial_state=None,
-                dtype=None,
-                parallel_iterations=None,
-                swap_memory=False,
-                time_major=True,
-                scope=None):
-  """Creates a recurrent neural network specified by RNNCell `cell`.
-
-  Performs fully dynamic unrolling of `inputs`.
-
-  Example:
-
-  ```python
-  # create a BasicRNNCell
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-
-  # 'outputs' is a tensor of shape [batch_size, max_time, cell_state_size]
-
-  # defining initial state
-  initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32)
-
-  # 'state' is a tensor of shape [batch_size, cell_state_size]
-  outputs, state = tf.nn.dynamic_rnn(rnn_cell, input_data,
-                                     initial_state=initial_state,
-                                     dtype=tf.float32)
-  ```
-
-  ```python
-  # create 2 LSTMCells
-  rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [128, 256]]
-
-  # create a RNN cell composed sequentially of a number of RNNCells
-  multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
-
-  # 'outputs' is a tensor of shape [batch_size, max_time, 256]
-  # 'state' is a N-tuple where N is the number of LSTMCells containing a
-  # tf.contrib.rnn.LSTMStateTuple for each cell
-  outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
-                                     inputs=data,
-                                     dtype=tf.float32)
-  ```
-
-
-  Args:
-    cell: An instance of RNNCell.
-    inputs: The RNN inputs.
-      If `time_major == False` (default), this must be a `Tensor` of shape:
-        `[batch_size, max_time, ...]`, or a nested tuple of such elements.
-      If `time_major == True`, this must be a `Tensor` of shape: `[max_time,
-        batch_size, ...]`, or a nested tuple of such elements. This may also be
-        a (possibly nested) tuple of Tensors satisfying this property.  The
-        first two dimensions must match across all the inputs, but otherwise the
-        ranks and other shape components may differ. In this case, input to
-        `cell` at each time-step will replicate the structure of these tuples,
-        except for the time dimension (from which the time is taken). The input
-        to `cell` at each time step will be a `Tensor` or (possibly nested)
-        tuple of Tensors each with dimensions `[batch_size, ...]`.
-    sequence_length: (optional) An int32/int64 vector sized `[batch_size]`. Used
-      to copy-through state and zero-out outputs when past a batch element's
-      sequence length.  So it's more for performance than correctness.
-    initial_state: (optional) An initial state for the RNN. If `cell.state_size`
-      is an integer, this must be a `Tensor` of appropriate type and shape
-      `[batch_size, cell.state_size]`. If `cell.state_size` is a tuple, this
-      should be a tuple of tensors having shapes `[batch_size, s] for s in
-      cell.state_size`.
-    dtype: (optional) The data type for the initial state and expected output.
-      Required if initial_state is not provided or RNN state has a heterogeneous
-      dtype.
-    parallel_iterations: (Default: 32).  The number of iterations to run in
-      parallel.  Those operations which do not have any temporal dependency and
-      can be run in parallel, will be.  This parameter trades off time for
-      space.  Values >> 1 use more memory but take less time, while smaller
-      values use less memory but computations take longer.
-    swap_memory: Transparently swap the tensors produced in forward inference
-      but needed for back prop from GPU to CPU.  This allows training RNNs which
-      would typically not fit on a single GPU, with very minimal (or no)
-      performance penalty.
-    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
-      these `Tensors` must be shaped `[max_time, batch_size, depth]`. If false,
-      these `Tensors` must be shaped `[batch_size, max_time, depth]`. Using
-      `time_major = True` is a bit more efficient because it avoids transposes
-      at the beginning and end of the RNN calculation.  However, most TensorFlow
-      data is batch-major, so by default this function accepts input and emits
-      output in batch-major form.
-    scope: VariableScope for the created subgraph; defaults to "rnn".
-
-  Returns:
-    A pair (outputs, state) where:
-
-    outputs: The RNN output `Tensor`.
-
-      If time_major == False (default), this will be a `Tensor` shaped:
-        `[batch_size, max_time, cell.output_size]`.
-
-      If time_major == True, this will be a `Tensor` shaped:
-        `[max_time, batch_size, cell.output_size]`.
-
-      Note, if `cell.output_size` is a (possibly nested) tuple of integers
-      or `TensorShape` objects, then `outputs` will be a tuple having the
-      same structure as `cell.output_size`, containing Tensors having shapes
-      corresponding to the shape data in `cell.output_size`.
-
-    state: The final state.  If `cell.state_size` is an int, this
-      will be shaped `[batch_size, cell.state_size]`.  If it is a
-      `TensorShape`, this will be shaped `[batch_size] + cell.state_size`.
-      If it is a (possibly nested) tuple of ints or `TensorShape`, this will
-      be a tuple having the corresponding shapes. If cells are `LSTMCells`
-      `state` will be a tuple containing a `LSTMStateTuple` for each cell.
-
-  Raises:
-    TypeError: If `cell` is not an instance of RNNCell.
-    ValueError: If inputs is None or an empty list.
-    RuntimeError: If not using control flow v2.
-  """
-
-  # Currently only support time_major == True case.
-  assert time_major
-
-  # TODO(b/123051275): We need to check if the cells are TfLiteLSTMCells or
-  # TfLiteRNNCells.
-  rnn_cell_impl.assert_like_rnncell("cell", cell)
-
-  if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
-    raise RuntimeError("OpHint dynamic rnn only supports control flow v2.")
-
-  parent_first_child_input = [{
-      "parent_ophint_input_index": 0,
-      "first_child_ophint_input_index": 0
-  }]
-  parent_last_child_output = [{
-      "parent_output_index": 0,
-      # For LstmCell, the index is 2.
-      # For RnnCell, the index is 1.
-      # So we use -1 meaning it's the last one.
-      "child_output_index": -1
-  }]
-  internal_children_input_output = [{
-      "child_input_index": 0,
-      # For LstmCell, the index is 2.
-      # For RnnCell, the index is 1.
-      # So we use -1 meaning it's the last one.
-      "child_output_index": -1
-  }]
-  inputs_outputs_mappings = {
-      "parent_first_child_input": parent_first_child_input,
-      "parent_last_child_output": parent_last_child_output,
-      "internal_children_input_output": internal_children_input_output
-  }
-  tflite_wrapper = lite.OpHint(
-      "TfLiteDynamicRnn",
-      level=2,
-      children_inputs_mappings=inputs_outputs_mappings)
-  with vs.variable_scope(scope or "rnn") as varscope:
-    # Create a new scope in which the caching device is either
-    # determined by the parent scope, or is set to place the cached
-    # Variable using the same placement as for the rest of the RNN.
-    if _should_cache():
-      if varscope.caching_device is None:
-        varscope.set_caching_device(lambda op: op.device)
-
-    inputs = tflite_wrapper.add_input(inputs, name="input", index_override=0)
-
-    # By default, time_major==False and inputs are batch-major: shaped
-    #   [batch, time, depth]
-    # For internal calculations, we transpose to [time, batch, depth]
-    flat_input = nest.flatten(inputs)
-
-    if not time_major:
-      # (batch, time, depth) => (time, batch, depth)
-      flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input]
-      flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input)
-
-    parallel_iterations = parallel_iterations or 32
-    if sequence_length is not None:
-      sequence_length = math_ops.to_int32(sequence_length)
-      if sequence_length.get_shape().rank not in (None, 1):
-        raise ValueError(
-            "sequence_length must be a vector of length batch_size, "
-            "but saw shape: %s" % sequence_length.get_shape())
-      sequence_length = array_ops.identity(  # Just to find it in the graph.
-          sequence_length,
-          name="sequence_length")
-
-    batch_size = _best_effort_input_batch_size(flat_input)
-
-    if initial_state is not None:
-      state = initial_state
-    else:
-      if not dtype:
-        raise ValueError("If there is no initial_state, you must give a dtype.")
-      if getattr(cell, "get_initial_state", None) is not None:
-        state = cell.get_initial_state(
-            inputs=None, batch_size=batch_size, dtype=dtype)
-      else:
-        state = cell.zero_state(batch_size, dtype)
-
-    def _assert_has_shape(x, shape):
-      x_shape = array_ops.shape(x)
-      packed_shape = array_ops.stack(shape)
-      return control_flow_ops.Assert(
-          math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)), [
-              "Expected shape for Tensor %s is " % x.name, packed_shape,
-              " but saw shape: ", x_shape
-          ])
-
-    if not context.executing_eagerly() and sequence_length is not None:
-      # Perform some shape validation
-      with ops.control_dependencies(
-          [_assert_has_shape(sequence_length, [batch_size])]):
-        sequence_length = array_ops.identity(
-            sequence_length, name="CheckSeqLen")
-
-    inputs = nest.pack_sequence_as(structure=inputs, flat_sequence=flat_input)
-
-    outputs, final_state = _dynamic_rnn_loop(
-        cell,
-        inputs,
-        state,
-        parallel_iterations=parallel_iterations,
-        swap_memory=swap_memory,
-        sequence_length=sequence_length,
-        dtype=dtype)
-
-    # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth].
-    # If we are performing batch-major calculations, transpose output back
-    # to shape [batch, time, depth]
-    if not time_major:
-      # (time, batch, depth) => (batch, time, depth)
-      outputs = nest.map_structure(_transpose_batch_time, outputs)
-    outputs = tflite_wrapper.add_output(outputs, name="outputs")
-
-    return outputs, final_state
diff --git a/tensorflow/lite/experimental/examples/lstm/tflite_rnn.py b/tensorflow/lite/experimental/examples/lstm/tflite_rnn.py
deleted file mode 100644
index e4aad18367e6c8bf9669e928dff8d7c9376043b7..0000000000000000000000000000000000000000
--- a/tensorflow/lite/experimental/examples/lstm/tflite_rnn.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""TfLite BasicRnnCell wrapper.
-
-TODO(renjieliu): Find a better home for this one.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import itertools
-
-from tensorflow.lite.python import lite
-from tensorflow.python.keras import activations
-from tensorflow.python.layers import base as base_layer
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import rnn_cell_impl
-
-
-class TfLiteRNNCell(rnn_cell_impl.LayerRNNCell):
-  """The most basic RNN cell.
-
-  This is used only for TfLite, it provides hints and it also makes the
-  variables in the desired for the tflite ops.
-  """
-
-  def __init__(self,
-               num_units,
-               activation=None,
-               reuse=None,
-               name=None,
-               dtype=None,
-               **kwargs):
-    """Initializes the parameters for an RNN cell.
-
-    Args:
-      num_units: int, The number of units in the RNN cell.
-      activation: Nonlinearity to use.  Default: `tanh`. It could also be string
-        that is within Keras activation function names.
-      reuse: (optional) Python boolean describing whether to reuse variables in
-        an existing scope. Raises an error if not `True` and the existing scope
-        already has the given variables.
-      name: String, the name of the layer. Layers with the same name will share
-        weights, but to avoid mistakes we require reuse=True in such cases.
-      dtype: Default dtype of the layer (default of `None` means use the type of
-        the first input). Required when `build` is called before `call`.
-      **kwargs: Dict, keyword named properties for common layer attributes, like
-        `trainable` etc when constructing the cell from configs of get_config().
-
-    Raises:
-      ValueError: If the existing scope already has the given variables.
-    """
-    super(TfLiteRNNCell, self).__init__(
-        _reuse=reuse, name=name, dtype=dtype, **kwargs)
-
-    # Inputs must be Rank-2.
-    self.input_spec = base_layer.InputSpec(ndim=2)
-
-    self._tflite_wrapper = lite.OpHint("UnidirectionalSequenceRnn")
-    self._num_units = num_units
-    if activation:
-      self._activation = activations.get(activation)
-    else:
-      self._activation = math_ops.tanh
-
-  @property
-  def state_size(self):
-    return self._num_units
-
-  @property
-  def output_size(self):
-    return self._num_units
-
-  def build(self, inputs_shape):
-    """Builds the RNN cell.
-
-    Args:
-      inputs_shape: Rnn input tensor shape.
-
-    Raises:
-      ValueError: If last dimension of the input shape is not known.
-    """
-    if inputs_shape[-1] is None:
-      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" %
-                       (inputs_shape,))
-
-    input_depth = inputs_shape[-1]
-
-    def add_variable_wrapped(name, shape, initializer, index):
-      var = self.add_variable(name, shape=shape, initializer=initializer)
-      return self._tflite_wrapper.add_input(
-          var, name=name, index_override=index)
-
-    self._input_weights = add_variable_wrapped(
-        "input_weights", [self._num_units, input_depth], None, 1)
-    self._recurrent_weights = add_variable_wrapped(
-        "recurrent_weights", [self._num_units, self._num_units], None, 2)
-    self._bias = add_variable_wrapped(
-        "bias",
-        shape=[self._num_units],
-        initializer=init_ops.zeros_initializer(dtype=self.dtype),
-        index=3)
-
-    self.built = True
-
-  def call(self, inputs, state):
-    """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
-    inputs = self._tflite_wrapper.add_input(
-        inputs, tag="input", name="input", aggregate="stack", index_override=0)
-    state = self._tflite_wrapper.add_input(
-        state,
-        tag="hidden_state",
-        name="hidden_state",
-        aggregate="first",
-        index_override=4)
-    weights = array_ops.transpose(
-        array_ops.concat([self._input_weights, self._recurrent_weights], 1))
-    gate_inputs = math_ops.matmul(array_ops.concat([inputs, state], 1), weights)
-    gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
-    output = self._activation(gate_inputs)
-    output = self._tflite_wrapper.add_output(
-        output,
-        tag="output",
-        name="output",
-        index_override=1,
-        aggregate="stack")
-    return output, output
-
-  def get_config(self):
-    config = {
-        "num_units": self._num_units,
-        "activation": activations.serialize(self._activation),
-        "reuse": self._reuse,
-    }
-    base_config = super(TfLiteRNNCell, self).get_config()
-    return dict(itertools.chain(base_config.items(), config.items()))
diff --git a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
index ed02d6c906c47af3a0a98fcc8494d8d47b6a69f0..5915978db8539bc2eed69ffd29a700ca9d90669d 100644
--- a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
+++ b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
@@ -20,8 +20,6 @@ import numpy as np
 import tensorflow as tf
 
 from tensorflow.examples.tutorials.mnist import input_data
-from tensorflow.lite.experimental.examples.lstm.tflite_lstm import dynamic_rnn
-from tensorflow.lite.experimental.examples.lstm.tflite_lstm import TFLiteLSTMCell
 from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
@@ -57,16 +55,18 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
 
   def buildLstmLayer(self):
     return tf.nn.rnn_cell.MultiRNNCell([
-        TFLiteLSTMCell(
+        tf.lite.experimental.nn.TFLiteLSTMCell(
             self.num_units, use_peepholes=True, forget_bias=0, name="rnn1"),
-        TFLiteLSTMCell(self.num_units, num_proj=8, forget_bias=0, name="rnn2"),
-        TFLiteLSTMCell(
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, num_proj=8, forget_bias=0, name="rnn2"),
+        tf.lite.experimental.nn.TFLiteLSTMCell(
             self.num_units // 2,
             use_peepholes=True,
             num_proj=8,
             forget_bias=0,
             name="rnn3"),
-        TFLiteLSTMCell(self.num_units, forget_bias=0, name="rnn4")
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, forget_bias=0, name="rnn4")
     ])
 
   def buildModel(self, lstm_layer, is_dynamic_rnn):
@@ -82,7 +82,8 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
     # x is shaped [batch_size,time_steps,num_inputs]
     if is_dynamic_rnn:
       lstm_input = tf.transpose(x, perm=[1, 0, 2])
-      outputs, _ = dynamic_rnn(lstm_layer, lstm_input, dtype="float32")
+      outputs, _ = tf.lite.experimental.nn.dynamic_rnn(
+          lstm_layer, lstm_input, dtype="float32")
       outputs = tf.unstack(outputs, axis=0)
     else:
       lstm_input = tf.unstack(x, self.time_steps, 1)
diff --git a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py
index af7ac4c4d8d700716eb25ab77d25a0927dd16d66..3b44c13877e5c2d9edc112c9e6d3f94e5755d5a0 100644
--- a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py
+++ b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py
@@ -22,7 +22,6 @@ import tensorflow as tf
 from tensorflow import flags
 
 from tensorflow.examples.tutorials.mnist import input_data
-from tensorflow.lite.experimental.examples.lstm.tflite_rnn import TfLiteRNNCell
 from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
@@ -62,11 +61,11 @@ class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
 
   def buildRnnLayer(self):
     return tf.nn.rnn_cell.MultiRNNCell([
-        TfLiteRNNCell(self.num_units, name="rnn1"),
-        TfLiteRNNCell(self.num_units, name="rnn2")
+        tf.lite.experimental.nn.TfLiteRNNCell(self.num_units, name="rnn1"),
+        tf.lite.experimental.nn.TfLiteRNNCell(self.num_units, name="rnn2")
     ])
 
-  def buildModel(self, rnn_layer):
+  def buildModel(self, rnn_layer, is_dynamic_rnn):
     # Weights and biases for output softmax layer.
     out_weights = tf.Variable(
         tf.random_normal([self.num_units, self.n_classes]))
@@ -77,8 +76,14 @@ class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
         "float", [None, self.time_steps, self.n_input], name="INPUT_IMAGE")
 
     # x is shaped [batch_size,time_steps,num_inputs]
-    rnn_input = tf.unstack(x, self.time_steps, 1)
-    outputs, _ = tf.nn.static_rnn(rnn_layer, rnn_input, dtype="float32")
+    if is_dynamic_rnn:
+      rnn_input = tf.transpose(x, perm=[1, 0, 2])
+      outputs, _ = tf.lite.experimental.nn.dynamic_rnn(
+          rnn_layer, rnn_input, dtype="float32")
+      outputs = tf.unstack(outputs, axis=0)
+    else:
+      rnn_input = tf.unstack(x, self.time_steps, 1)
+      outputs, _ = tf.nn.static_rnn(rnn_layer, rnn_input, dtype="float32")
 
     # Compute logits by multiplying outputs[-1] of shape [batch_size,num_units]
     # by the softmax layer's out_weight of shape [num_units,n_classes]
@@ -108,13 +113,14 @@ class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
                                  self.n_input))
       sess.run(opt, feed_dict={x: batch_x, y: batch_y})
 
-  def saveAndRestoreModel(self, rnn_layer, sess, saver):
+  def saveAndRestoreModel(self, rnn_layer, sess, saver, is_dynamic_rnn):
     """Saves and restores the model to mimic the most common use case.
 
     Args:
       rnn_layer: The rnn layer either a single rnn cell or a multi rnn cell.
       sess: Old session.
       saver: saver created by tf.train.Saver()
+      is_dynamic_rnn: use dynamic_rnn or not.
 
     Returns:
       A tuple containing:
@@ -130,7 +136,7 @@ class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
 
     # Reset the graph.
     tf.reset_default_graph()
-    x, prediction, output_class = self.buildModel(rnn_layer)
+    x, prediction, output_class = self.buildModel(rnn_layer, is_dynamic_rnn)
 
     new_sess = tf.Session(config=CONFIG)
     saver = tf.train.Saver()
@@ -177,12 +183,32 @@ class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
   def testStaticRnnMultiRnnCell(self):
     sess = tf.Session(config=CONFIG)
 
-    x, prediction, output_class = self.buildModel(self.buildRnnLayer())
+    x, prediction, output_class = self.buildModel(
+        self.buildRnnLayer(), is_dynamic_rnn=False)
     self.trainModel(x, prediction, output_class, sess)
 
     saver = tf.train.Saver()
     x, prediction, output_class, new_sess = self.saveAndRestoreModel(
-        self.buildRnnLayer(), sess, saver)
+        self.buildRnnLayer(), sess, saver, is_dynamic_rnn=False)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+  @test_util.enable_control_flow_v2
+  def testDynamicRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(
+        self.buildRnnLayer(), is_dynamic_rnn=True)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildRnnLayer(), sess, saver, is_dynamic_rnn=True)
 
     test_inputs, expected_output, frozen_graph = self.getInferenceResult(
         x, output_class, new_sess)
diff --git a/tensorflow/lite/experimental/micro/README.md b/tensorflow/lite/experimental/micro/README.md
index 30070bf0088e287ffed46f844e18ed9eda8caaa3..4f7cfae5c5b196c64152b8f37dc3d6a4306d4d7c 100644
--- a/tensorflow/lite/experimental/micro/README.md
+++ b/tensorflow/lite/experimental/micro/README.md
@@ -10,15 +10,21 @@ detection model, takes up a total of 22KB.
 ## Table of Contents
 
 -   [Getting Started](#getting-started)
+
     *   [Getting Started with Portable Reference Code](#getting-started-with-portable-reference-code)
     *   [Building Portable Reference Code using Make](#building-portable-reference-code-using-make)
     *   [Building for the "Blue Pill" STM32F103 using Make](#building-for-the-blue-pill-stm32f103-using-make)
     *   [Building for "Hifive1" SiFive FE310 development board using Make](#building-for-hifive1-sifive-fe310-development-board-using-make)
     *   [Building for Ambiq Micro Apollo3Blue EVB using Make](#building-for-ambiq-micro-apollo3blue-evb-using-make)
         *   [Additional Apollo3 Instructions](#additional-apollo3-instructions)
+    *   [Building for the Eta Compute ECM3531 EVB using Make](#Building-for-the-Eta-Compute-ECM3531-EVB-using-Make)
+
 -   [Goals](#goals)
+
 -   [Generating Project Files](#generating-project-#files)
+
 -   [How to Port TensorFlow Lite Micro to a New Platform](#how-to-port-tensorflow-lite-micro-to-a-new-platform)
+
     *   [Requirements](#requirements)
     *   [Getting Started](getting-started)
     *   [Troubleshooting](#troubleshooting)
@@ -39,12 +45,13 @@ easy as possible to add support for others.
 If you're a product developer, we have build instructions or pre-generated
 project files that you can download for the following platforms:
 
-| Device | Mbed | Keil | Make/GCC
--------- | ---- | ---- | --------
-[STM32F746G Discovery Board](https://www.st.com/en/evaluation-tools/32f746gdiscovery.html) | [Download](https://drive.google.com/open?id=1OtgVkytQBrEYIpJPsE8F6GUKHPBS3Xeb) | - | [Download](https://drive.google.com/open?id=1u46mTtAMZ7Y1aD-He1u3R8AE4ZyEpnOl) | -
-["Blue Pill" STM32F103-compatible development board](https://github.com/google/stm32_bare_lib) | - | - | [Instructions](#building-for-the-blue-pill-stm32f103-using-make)
-[Ambiq Micro Apollo3Blue EVB using Make](https://ambiqmicro.com/apollo-ultra-low-power-mcus/) | - | - | [Instructions](#building-for-ambiq-micro-apollo3blue-evb-using-make)
-[Generic Keil uVision Projects](http://www2.keil.com/mdk5/uvision/) | - | [Download](https://drive.google.com/open?id=1Lw9rsdquNKObozClLPoE5CTJLuhfh5mV) | -
+Device                                                                                         | Mbed                                                                           | Keil                                                                           | Make/GCC
+---------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------ | ------------------------------------------------------------------------------ | --------
+[STM32F746G Discovery Board](https://www.st.com/en/evaluation-tools/32f746gdiscovery.html)     | [Download](https://drive.google.com/open?id=1OtgVkytQBrEYIpJPsE8F6GUKHPBS3Xeb) | -                                                                              | [Download](https://drive.google.com/open?id=1u46mTtAMZ7Y1aD-He1u3R8AE4ZyEpnOl)
+["Blue Pill" STM32F103-compatible development board](https://github.com/google/stm32_bare_lib) | -                                                                              | -                                                                              | [Instructions](#building-for-the-blue-pill-stm32f103-using-make)
+[Ambiq Micro Apollo3Blue EVB using Make](https://ambiqmicro.com/apollo-ultra-low-power-mcus/)  | -                                                                              | -                                                                              | [Instructions](#building-for-ambiq-micro-apollo3blue-evb-using-make)
+[Generic Keil uVision Projects](http://www2.keil.com/mdk5/uvision/)                            | -                                                                              | [Download](https://drive.google.com/open?id=1Lw9rsdquNKObozClLPoE5CTJLuhfh5mV) | -
+[Eta Compute ECM3531 EVB](https://etacompute.com/)                                             | -                                                                              | -                                                                              | [Instructions](#Building-for-the-Eta-Compute-ECM3531-EVB-using-Make)
 
 If your device is not yet supported, it may not be too hard to add support. You
 can learn about that process
@@ -231,7 +238,7 @@ Successfully tagged riscv_build:latest
 
 Building micro_speech_test binary
 
--   Lauch the Docker that we just created using: `docker run -it-v
+-   Launch the Docker that we just created using: `docker run -it-v
     /tmp/copybara_out:/workspace riscv_build:latest bash`
 -   Enter the source root directory by running `cd /workspace`
 -   Download the dependencies by running
@@ -242,7 +249,7 @@ Building micro_speech_test binary
 -   Build the binary: `make -f
     tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=riscv32_mcu`
 
-Lauching Renode to test the binary, currently this set up is not automated.
+Launching Renode to test the binary, currently this set up is not automated.
 
 -   Execute the binary on Renode: `renode -P 5000 --disable-xwt -e 's
     @/workspace/tensorflow/lite/experimental/micro/testing/sifive_fe310.resc'`
@@ -265,12 +272,11 @@ You should see the following log with the magic string `~~~ALL TEST PASSED~~~`:
 
 Follow these steps to get the pushbutton yes/no example working on Apollo 3:
 
-1.  Make sure to run the "Getting Started" section before performing the
-    following steps
-2.  Download Apollo3-SDK-2018.08.13 and place in
-    `tensorflow/lite/experimental/micro/tools/make/downloads`. This is not yet
-    publicly released, but you can contact ashah@ambiqmicro.com to request a
-    copy.
+1.  Make sure to run the "Building Portable Reference Code using Make" section
+    before performing the following steps
+2.  The Ambiq Micro SDK is downloaded into
+    `tensorflow/lite/experimental/micro/tools/make/downloads` by
+    'download_dependencies.sh'.
 3.  Compile the project with the following command: make -f
     tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=apollo3evb
     pushbutton_cmsis_speech_test_bin
@@ -297,7 +303,10 @@ Follow these steps to get the pushbutton yes/no example working on Apollo 3:
     4.  Press BTN2. An LED will flash for 1 second. Speak your utterance during
         this one second
     5.  The debugger will print out four numbers. They are the probabilites for
-        1) no speech, 2) unknown speech, 3) yes, 4) no
+        1.  no speech
+        2.  unkown speech
+        3.  yes
+        4.  no
     6.  The EVB LEDs will indicate detection.
         1.  LED0 (rightmost LED) - ON when capturing 1sec of audio
         2.  LED1 - ON when detecting silence
@@ -315,6 +324,53 @@ To flash a part with JFlash Lite, do the following:
 4. Data file = `tensorflow/lite/experimental/micro/tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test.bin`
 5. Prog Addr = 0x0000C000
 
+## Building for the Eta Compute ECM3531 EVB using Make
+
+1.  Follow the instructions at
+    [Tensorflow Micro Speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/examples/micro_speech#getting-started)
+    to down load the Tensorflow source code and the support libraries \(but do
+    not run the make command shown there.\)
+2.  Download the Eta Compute SDK, version 0.0.17. Contact info@etacompute.com
+3.  You will need the the Arm compiler arm-none-eabi-gcc, version 7.3.1
+    20180622, release ARM/embedded-7-branch revision 261907, 7-2018-q2-update.
+    This compiler is downloaded when you run the
+    tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+    script.
+4.  Edit the file
+    tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
+    so that the variables ETA_SDK and GCC_ARM point to the correct directories.
+5.  Compile the code with the command \
+    &nbsp;&nbsp;&nbsp;&nbsp;make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=ecm3531
+    TAGS="CMSIS" test \
+    This will produce a set of executables in the
+    tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin
+    directory.
+6.  To load an executable into SRAM \
+    &nbsp;&nbsp;&nbsp;&nbsp;Start ocd \
+    &nbsp;&nbsp;&nbsp;&nbsp;cd
+    tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 \
+    &nbsp;&nbsp;&nbsp;&nbsp;./load_program name_of_executable, for e.g.,
+    ./load_program audio_provider_test \
+    &nbsp;&nbsp;&nbsp;&nbsp;Start PuTTY \(Connection type = Serial, Speed =
+    11520, Data bits = 8, Stop bits = 1, Parity = None\) \
+    The following output should appear: \
+    Testing TestAudioProvider \
+    Testing TestTimer \
+    2/2 tests passed \
+    \~\~\~ALL TESTS PASSED\~\~\~ \
+    Execution time \(msec\) = 7
+7.  To load into flash \
+    &nbsp;&nbsp;&nbsp;&nbsp;Edit the variable ETA_LDS_FILE in
+    tensorflow/lite/experimental/micro/tools/&nbsp;&nbsp;make/targets/ecm3531_makefile.inc
+    to point to the ecm3531_flash.lds file \
+    &nbsp;&nbsp;&nbsp;&nbsp;Recompile \( make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=ecm3531
+    TAGS="CMSIS" test\) \
+    &nbsp;&nbsp;&nbsp;&nbsp;cd
+    tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 \
+    &nbsp;&nbsp;&nbsp;&nbsp;./flash_program executable_name to load into flash.
+
 ## Goals
 
 The design goals are for the framework to be:
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
index 51ba2976a2eeca01a280fc8e9abb14289bf22adc..29d40e702d96b2eb6d4a85c2841f4d2829b8d764 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
@@ -11,34 +11,34 @@ load(
 )
 
 cc_library(
-    name = "model_settings",
+    name = "simple_model_settings",
     srcs = [
-        "model_settings.cc",
+        "simple_features/simple_model_settings.cc",
     ],
     hdrs = [
-        "model_settings.h",
+        "simple_features/simple_model_settings.h",
     ],
 )
 
 cc_library(
-    name = "tiny_conv_model_data",
+    name = "tiny_conv_simple_features_model_data",
     srcs = [
-        "tiny_conv_model_data.cc",
+        "simple_features/tiny_conv_simple_features_model_data.cc",
     ],
     hdrs = [
-        "tiny_conv_model_data.h",
+        "simple_features/tiny_conv_simple_features_model_data.h",
     ],
 )
 
 cc_library(
-    name = "features_test_data",
+    name = "simple_features_test_data",
     srcs = [
-        "no_features_data.cc",
-        "yes_features_data.cc",
+        "simple_features/no_simple_features_data.cc",
+        "simple_features/yes_simple_features_data.cc",
     ],
     hdrs = [
-        "no_features_data.h",
-        "yes_features_data.h",
+        "simple_features/no_simple_features_data.h",
+        "simple_features/yes_simple_features_data.h",
     ],
 )
 
@@ -48,10 +48,10 @@ tflite_micro_cc_test(
         "micro_speech_test.cc",
     ],
     deps = [
-        ":features_test_data",
-        ":tiny_conv_model_data",
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_features_test_data",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:tiny_conv_micro_features_model_data",
         "//tensorflow/lite/experimental/micro/kernels:all_ops_resolver",
         "//tensorflow/lite/experimental/micro/kernels:micro_ops",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -60,45 +60,66 @@ tflite_micro_cc_test(
 )
 
 cc_library(
-    name = "preprocessor_test_data",
+    name = "audio_sample_test_data",
     srcs = [
         "no_30ms_sample_data.cc",
-        "no_power_spectrum_data.cc",
         "yes_30ms_sample_data.cc",
-        "yes_power_spectrum_data.cc",
     ],
     hdrs = [
         "no_30ms_sample_data.h",
-        "no_power_spectrum_data.h",
         "yes_30ms_sample_data.h",
-        "yes_power_spectrum_data.h",
     ],
 )
 
 cc_library(
-    name = "preprocessor_reference",
+    name = "audio_large_sample_test_data",
     srcs = [
-        "preprocessor.cc",
+        "no_1000ms_sample_data.cc",
+        "yes_1000ms_sample_data.cc",
     ],
     hdrs = [
-        "preprocessor.h",
+        "no_1000ms_sample_data.h",
+        "yes_1000ms_sample_data.h",
+    ],
+)
+
+cc_library(
+    name = "simple_features_generator_test_data",
+    srcs = [
+        "simple_features/no_power_spectrum_data.cc",
+        "simple_features/yes_power_spectrum_data.cc",
+    ],
+    hdrs = [
+        "simple_features/no_power_spectrum_data.h",
+        "simple_features/yes_power_spectrum_data.h",
+    ],
+)
+
+cc_library(
+    name = "simple_features_generator_reference",
+    srcs = [
+        "simple_features/simple_features_generator.cc",
+    ],
+    hdrs = [
+        "simple_features/simple_features_generator.h",
     ],
     deps = [
-        ":model_settings",
+        ":simple_model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
     ],
 )
 
 tflite_micro_cc_test(
-    name = "preprocessor_reference_test",
+    name = "simple_features_generator_reference_test",
     srcs = [
-        "preprocessor_test.cc",
+        "simple_features/simple_features_generator_test.cc",
     ],
     deps = [
-        ":model_settings",
-        ":preprocessor_reference",
-        ":preprocessor_test_data",
+        ":audio_sample_test_data",
+        ":simple_features_generator_reference",
+        ":simple_features_generator_test_data",
+        ":simple_model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -106,29 +127,30 @@ tflite_micro_cc_test(
 )
 
 cc_library(
-    name = "preprocessor_fixed",
+    name = "simple_features_generator_fixed",
     srcs = [
-        "fixed_point/preprocessor.cc",
+        "simple_features/fixed_point/simple_features_generator.cc",
     ],
     hdrs = [
-        "preprocessor.h",
+        "simple_features/simple_features_generator.h",
     ],
     deps = [
-        ":model_settings",
+        ":simple_model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
     ],
 )
 
 tflite_micro_cc_test(
-    name = "preprocessor_fixed_test",
+    name = "simple_features_generator_fixed_test",
     srcs = [
-        "preprocessor_test.cc",
+        "simple_features/simple_features_generator_test.cc",
     ],
     deps = [
-        ":model_settings",
-        ":preprocessor_fixed",
-        ":preprocessor_test_data",
+        ":audio_sample_test_data",
+        ":simple_features_generator_fixed",
+        ":simple_features_generator_test_data",
+        ":simple_model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -144,9 +166,25 @@ cc_library(
         "audio_provider.h",
     ],
     deps = [
-        ":model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+    ],
+)
+
+cc_library(
+    name = "audio_provider_mock",
+    srcs = [
+        "audio_provider_mock.cc",
+    ],
+    hdrs = [
+        "audio_provider.h",
+    ],
+    deps = [
+        ":audio_large_sample_test_data",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
     ],
 )
 
@@ -157,9 +195,24 @@ tflite_micro_cc_test(
     ],
     deps = [
         ":audio_provider",
-        ":model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "audio_provider_mock_test",
+    srcs = [
+        "audio_provider_mock_test.cc",
+    ],
+    deps = [
+        ":audio_large_sample_test_data",
+        ":audio_provider_mock",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
@@ -174,10 +227,10 @@ cc_library(
     ],
     deps = [
         ":audio_provider",
-        ":model_settings",
-        ":preprocessor_reference",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_features_generator",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
     ],
 )
 
@@ -189,9 +242,41 @@ tflite_micro_cc_test(
     deps = [
         ":audio_provider",
         ":feature_provider",
-        ":model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+cc_library(
+    name = "feature_provider_mock",
+    srcs = [
+        "feature_provider.cc",
+    ],
+    hdrs = [
+        "feature_provider.h",
+    ],
+    deps = [
+        ":audio_provider_mock",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_features_generator",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "feature_provider_mock_test",
+    srcs = [
+        "feature_provider_mock_test.cc",
+    ],
+    deps = [
+        ":feature_provider_mock",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_features_test_data",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
@@ -205,9 +290,9 @@ cc_library(
         "recognize_commands.h",
     ],
     deps = [
-        ":model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
     ],
 )
 
@@ -235,11 +320,29 @@ cc_binary(
     deps = [
         ":audio_provider",
         ":feature_provider",
-        ":model_settings",
         ":recognize_commands",
-        ":tiny_conv_model_data",
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:tiny_conv_micro_features_model_data",
+        "//tensorflow/lite/experimental/micro/kernels:all_ops_resolver",
+        "//tensorflow/lite/schema:schema_fbs",
+    ],
+)
+
+cc_binary(
+    name = "micro_speech_mock",
+    srcs = [
+        "main.cc",
+    ],
+    deps = [
+        ":audio_provider_mock",
+        ":feature_provider",
+        ":recognize_commands",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:tiny_conv_micro_features_model_data",
         "//tensorflow/lite/experimental/micro/kernels:all_ops_resolver",
         "//tensorflow/lite/schema:schema_fbs",
     ],
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc
index 3d560510ad140ff0bba84ebcf790a0fda90e72fa..73b884f29de30bbfbdc8487f620f6990a53ef92c 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc
@@ -5,6 +5,11 @@ ifneq ($(filter CMSIS,$(ALL_TAGS)),)
     -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Include/ \
     -I$(MAKEFILE_DIR)/downloads/CMSIS_ext/
 
+  GENERATED_PROJECT_INCLUDES += \
+    -isystemthird_party/cmsis/CMSIS/Core/Include/ \
+    -isystemthird_party/cmsis/CMSIS/DSP/Include/ \
+    -Ithird_party/CMSIS_ext/
+
   CMSIS_PREPROCESSOR_SRCS := \
     tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.cc \
     tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc \
@@ -12,6 +17,7 @@ ifneq ($(filter CMSIS,$(ALL_TAGS)),)
   CMSIS_PREPROCESSOR_HDRS := \
     tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h \
     tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h \
+    third_party/CMSIS_ext/README.md \
     third_party/CMSIS_ext/arm_cmplx_mag_squared_q10p6.h
 
   PREPROCESSOR_TEST_SRCS += $(CMSIS_PREPROCESSOR_SRCS)
@@ -24,20 +30,27 @@ ifneq ($(filter CMSIS,$(ALL_TAGS)),)
   MICRO_SPEECH_HDRS += $(CMSIS_PREPROCESSOR_HDRS)
 
   THIRD_PARTY_CC_SRCS += \
-    third_party/CMSIS_ext/arm_cmplx_mag_squared_q10p6.c \
-    third_party/cmsis/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q15.c \
-    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c \
-    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c \
-    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c \
-    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c \
-    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S \
-    third_party/cmsis/CMSIS/DSP/Source/CommonTables/arm_const_structs.c \
-    third_party/cmsis/CMSIS/DSP/Source/CommonTables/arm_common_tables.c \
-    third_party/cmsis/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q15.c \
-    third_party/cmsis/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c
+    $(MAKEFILE_DIR)/downloads/CMSIS_ext/arm_cmplx_mag_squared_q10p6.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/CommonTables/arm_const_structs.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/CommonTables/arm_common_tables.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c
 
   THIRD_PARTY_CC_HDRS += \
+    third_party/cmsis/LICENSE.txt \
+    third_party/cmsis/CMSIS/Core/Include/cmsis_compiler.h \
+    third_party/cmsis/CMSIS/Core/Include/cmsis_gcc.h \
+    third_party/cmsis/CMSIS/Core/Include/cmsis_version.h \
+    third_party/cmsis/CMSIS/Core/Include/core_cm3.h \
     third_party/cmsis/CMSIS/DSP/Include/arm_common_tables.h \
-    third_party/cmsis/CMSIS/DSP/Include/arm_const_structs.h
+    third_party/cmsis/CMSIS/DSP/Include/arm_const_structs.h \
+    third_party/cmsis/CMSIS/DSP/Include/arm_math.h
 
 endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
index 49aace3d7d05ba1d7010d3d834c66dc13e488c96..c4e0f0e6ca12feca0f6e9638c3f36b81b2dcbd77 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
@@ -1,106 +1,349 @@
 
+INCLUDES += \
+ -I$(MAKEFILE_DIR)/downloads/kissfft
+
+PROJECT_INCLUDES += \
+third_party/kissfft
+
+KISSFFT_LIB_SRCS := \
+$(MAKEFILE_DIR)/downloads/kissfft/kiss_fft.c \
+$(MAKEFILE_DIR)/downloads/kissfft/tools/kiss_fftr.c
+
+KISSFFT_LIB_HDRS := \
+$(MAKEFILE_DIR)/downloads/kissfft/COPYING \
+$(MAKEFILE_DIR)/downloads/kissfft/kiss_fft.h \
+$(MAKEFILE_DIR)/downloads/kissfft/_kiss_fft_guts.h \
+$(MAKEFILE_DIR)/downloads/kissfft/tools/kiss_fftr.h
+
 MICRO_SPEECH_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc
 
 MICRO_SPEECH_TEST_HDRS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h \
 
-PREPROCESSOR_TEST_SRCS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc \
+SIMPLE_FEATURES_GENERATOR_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator_test.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.cc
+
+SIMPLE_FEATURES_GENERATOR_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_30ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_30ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h
+
+MICRO_FEATURES_LIB_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc \
+$(KISSFFT_LIB_SRCS)
+
+MICRO_FEATURES_LIB_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h \
+$(KISSFFT_LIB_HDRS)
+
+MICRO_FEATURES_FFT_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc \
+$(KISSFFT_LIB_SRCS)
+
+MICRO_FEATURES_FFT_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h \
+$(KISSFFT_LIB_HDRS)
+
+MICRO_FEATURES_FILTERBANK_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc
+
+MICRO_FEATURES_FILTERBANK_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
+
+MICRO_FEATURES_FRONTEND_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_test.cc \
+$(MICRO_FEATURES_LIB_SRCS)
+
+MICRO_FEATURES_FRONTEND_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h \
+$(MICRO_FEATURES_LIB_HDRS)
+
+MICRO_FEATURES_LOG_SCALE_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc
+
+MICRO_FEATURES_LOG_SCALE_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
 
-PREPROCESSOR_TEST_HDRS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+MICRO_FEATURES_NOISE_REDUCTION_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc
+
+MICRO_FEATURES_NOISE_REDUCTION_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
+
+MICRO_FEATURES_PCAN_GAIN_CONTROL_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc
+
+MICRO_FEATURES_PCAN_GAIN_CONTROL_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
+
+MICRO_FEATURES_WINDOW_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc
+
+MICRO_FEATURES_WINDOW_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
+
+MICRO_FEATURES_GENERATOR_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc \
+$(MICRO_FEATURES_LIB_SRCS)
+
+MICRO_FEATURES_GENERATOR_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h \
+$(MICRO_FEATURES_LIB_HDRS)
+
+MICRO_FEATURES_GENERATOR_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
+
+MICRO_FEATURES_GENERATOR_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h \
 tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h \
 tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h
+$(MICRO_FEATURES_GENERATOR_HDRS)
 
 AUDIO_PROVIDER_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
 
 AUDIO_PROVIDER_TEST_HDRS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+
+AUDIO_PROVIDER_MOCK_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc
+
+AUDIO_PROVIDER_MOCK_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
 
 FEATURE_PROVIDER_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
 
 FEATURE_PROVIDER_TEST_HDRS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h \
+$(MICRO_FEATURES_GENERATOR_HDRS)
+
+FEATURE_PROVIDER_MOCK_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
+
+FEATURE_PROVIDER_MOCK_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h \
+$(MICRO_FEATURES_GENERATOR_HDRS)
 
 RECOGNIZE_COMMANDS_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
 
 RECOGNIZE_COMMANDS_TEST_HDRS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h \
 tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
 
 MICRO_SPEECH_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/main.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
 
 MICRO_SPEECH_HDRS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
 tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h \
+$(MICRO_FEATURES_GENERATOR_HDRS)
+
+MICRO_SPEECH_MOCK_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/main.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
+
+MICRO_SPEECH_MOCK_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h \
+$(MICRO_FEATURES_GENERATOR_HDRS)
 
 # Find any platform-specific rules for this example.
 include $(wildcard tensorflow/lite/experimental/micro/examples/micro_speech/*/Makefile.inc)
 
+$(eval $(call microlite_test,micro_features_fft_test,\
+$(MICRO_FEATURES_FFT_TEST_SRCS),$(MICRO_FEATURES_FFT_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_filterbank_test,\
+$(MICRO_FEATURES_FILTERBANK_TEST_SRCS),$(MICRO_FEATURES_FILTERBANK_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_frontend_test,\
+$(MICRO_FEATURES_FRONTEND_TEST_SRCS),$(MICRO_FEATURES_FRONTEND_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_log_scale_test,\
+$(MICRO_FEATURES_LOG_SCALE_TEST_SRCS),$(MICRO_FEATURES_LOG_SCALE_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_noise_reduction_test,\
+$(MICRO_FEATURES_NOISE_REDUCTION_TEST_SRCS),$(MICRO_FEATURES_NOISE_REDUCTION_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_pcan_gain_control_test,\
+$(MICRO_FEATURES_PCAN_GAIN_CONTROL_TEST_SRCS),$(MICRO_FEATURES_PCAN_GAIN_CONTROL_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_window_test,\
+$(MICRO_FEATURES_WINDOW_TEST_SRCS),$(MICRO_FEATURES_WINDOW_TEST_HDRS)))
+
+# Test the code for feature generation.
+$(eval $(call microlite_test,micro_features_generator_test,\
+$(MICRO_FEATURES_GENERATOR_TEST_SRCS), $(MICRO_FEATURES_GENERATOR_TEST_HDRS)))
+
 # Tests loading and running a speech model.
 $(eval $(call microlite_test,micro_speech_test,\
 $(MICRO_SPEECH_TEST_SRCS),$(MICRO_SPEECH_TEST_HDRS)))
 
 # Test the code for feature generation.
-$(eval $(call microlite_test,preprocessor_test,\
-$(PREPROCESSOR_TEST_SRCS), $(PREPROCESSOR_TEST_HDRS)))
+$(eval $(call microlite_test,simple_features_generator_test,\
+$(SIMPLE_FEATURES_GENERATOR_TEST_SRCS), $(SIMPLE_FEATURES_GENERATOR_TEST_HDRS)))
 
 # Tests the audio provider module.
 $(eval $(call microlite_test,audio_provider_test,\
 $(AUDIO_PROVIDER_TEST_SRCS),$(AUDIO_PROVIDER_TEST_HDRS)))
 
+# Tests the audio provider mock module.
+$(eval $(call microlite_test,audio_provider_mock_test,\
+$(AUDIO_PROVIDER_MOCK_TEST_SRCS),$(AUDIO_PROVIDER_MOCK_TEST_HDRS)))
+
 # Tests the feature provider module.
 $(eval $(call microlite_test,feature_provider_test,\
 $(FEATURE_PROVIDER_TEST_SRCS),$(FEATURE_PROVIDER_TEST_HDRS)))
 
-# Tests the feature provider module.
+# Tests the feature provider module using the mock audio provider.
+$(eval $(call microlite_test,feature_provider_mock_test,\
+$(FEATURE_PROVIDER_MOCK_TEST_SRCS),$(FEATURE_PROVIDER_MOCK_TEST_HDRS)))
+
+# Tests the command recognizer module.
 $(eval $(call microlite_test,recognize_commands_test,\
 $(RECOGNIZE_COMMANDS_TEST_SRCS),$(RECOGNIZE_COMMANDS_TEST_HDRS)))
 
 # Builds a standalone speech command recognizer binary.
 $(eval $(call microlite_test,micro_speech,\
 $(MICRO_SPEECH_SRCS),$(MICRO_SPEECH_HDRS)))
+
+# Builds a standalone speech command recognizer binary using fake audio input.
+$(eval $(call microlite_test,micro_speech_mock,\
+$(MICRO_SPEECH_MOCK_SRCS),$(MICRO_SPEECH_MOCK_HDRS)))
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/README.md b/tensorflow/lite/experimental/micro/examples/micro_speech/README.md
index 500eed33bab0187f9b2cf9647c046f4a541b9e2c..3cc81c4b5b493f8b624a92960e80d36087f146da 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/README.md
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/README.md
@@ -83,7 +83,8 @@ If you see a compiling error on older machines, try leaving out the `--copt` arg
 ```
 bazel run tensorflow/examples/speech_commands:freeze -- \
 --model_architecture=tiny_conv --window_stride=20 --preprocess=average \
---wanted_words="yes,no" --quantize=1 --output_file=/tmp/tiny_conv.pb
+--wanted_words="yes,no" --quantize=1 --output_file=/tmp/tiny_conv.pb \
+--start_checkpoint=/tmp/speech_commands_train/tiny_conv.ckpt-18000
 ```
 
 The next step is to create a TensorFlow Lite file from the frozen graph:
@@ -99,5 +100,5 @@ bazel run tensorflow/lite/toco:toco -- \
 Finally, convert the file into a C source file that can be compiled into an embedded system:
 
 ```
-xxd -i /tmp/tiny_conv.tflite > /tmp/tiny_conv_model_data.cc
+xxd -i /tmp/tiny_conv.tflite > /tmp/tiny_conv_simple_features_model_data.cc
 ```
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc
index 0aa362be0038f8757387a6311021e183dc19dabd..c83090344ba0d82e9f774897577b1eb924e92329 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc
@@ -5,7 +5,7 @@ ifeq ($(TARGET), apollo3evb)
     $(AP3_MICRO_DIR)/../preprocessor.cc \
     $(AP3_MICRO_DIR)/pushbutton_main.c \
     $(AP3_MICRO_DIR)/pushbutton_test.cc \
-    $(AP3_MICRO_DIR)/../tiny_conv_model_data.cc \
+    $(AP3_MICRO_DIR)/../simple_features/tiny_conv_simple_features_model_data.cc \
     $(APOLLO3_SDK)/devices/am_devices_led.c
   ALL_SRCS += $(PUSHBUTTON_MICRO_SPEECH_TEST_SRCS)
   PUSHBUTTON_MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
@@ -24,8 +24,8 @@ ifeq ($(TARGET), apollo3evb)
   PUSHBUTTON_CMSIS_SPEECH_TEST_SRCS := \
     $(AP3_MICRO_DIR)/pushbutton_main.c \
     $(AP3_MICRO_DIR)/pushbutton_test.cc \
-    $(AP3_MICRO_DIR)/../tiny_conv_model_data.cc \
-    $(CMSIS_DIR)/preprocessor.cc \
+    $(AP3_MICRO_DIR)/../simple_features/tiny_conv_simple_features_model_data.cc \
+    $(CMSIS_DIR)/simple_features_generator.cc \
     $(CMSIS_EXT_DIR)/arm_cmplx_mag_squared_q10p6.c \
     $(CMSIS_DIR)/hanning.c \
     $(APOLLO3_SDK)/devices/am_devices_led.c \
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c
index afee38343b3fac81de945dcd01b53ad35e8be270..4f70d47c3ea9b6f7df884ceabeca245a2a5e55ce 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c
@@ -157,7 +157,7 @@ void pdm_data_get(void) {
 // PDM interrupt handler.
 //
 //*****************************************************************************
-void am_pdm_isr(void) {
+void am_pdm0_isr(void) {
   uint32_t ui32Status;
   //
   // Read the interrupt status.
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc
index 95043f857b34b953c91a762bc1a54e9489431bff..d4583dbf4a6dcb083e4d9cd2818e63a116debd7f 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc
@@ -17,8 +17,8 @@ limitations under the License.
  * micro_speech_test.cc */
 
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/micro_interpreter.h"
@@ -32,20 +32,36 @@ uint8_t g_unknown_score = 0;
 uint8_t g_yes_score = 0;
 uint8_t g_no_score = 0;
 
+namespace {
+
+TfLiteStatus GenerateSimpleFeatures_1sec(tflite::ErrorReporter* error_reporter,
+                                         const int16_t* input,
+                                         uint8_t* output) {
+  int i;
+  for (i = 0; i < 49; i++) {
+    GenerateSimpleFeatures(error_reporter, input + i * 320, 480, 43,
+                           output + i * 43);
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
 TF_LITE_MICRO_TESTS_BEGIN
 
-TF_LITE_MICRO_TEST(TestPreprocessor) {
+TF_LITE_MICRO_TEST(TestSimpleFeaturesGenerator) {
   tflite::MicroErrorReporter micro_error_reporter;
   tflite::ErrorReporter* error_reporter = &micro_error_reporter;
 
   uint8_t preprocessed_data[43 * 49];
-  TfLiteStatus preprocess_1sec_status =
-      Preprocess_1sec(error_reporter, captured_data, preprocessed_data);
-  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, preprocess_1sec_status);
+  TfLiteStatus generate_1sec_status = GenerateSimpleFeatures_1sec(
+      error_reporter, captured_data, preprocessed_data);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, generate_1sec_status);
 
   // Map the model into a usable data structure. This doesn't involve any
   // copying or parsing, it's a very lightweight operation.
-  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
+  const tflite::Model* model =
+      ::tflite::GetModel(g_tiny_conv_simple_features_model_data);
   if (model->version() != TFLITE_SCHEMA_VERSION) {
     error_reporter->Report(
         "Model provided is schema version %d not equal "
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
index 52db18e6868371afc0b7cd39f6f41d0d60b91689..08811c83b437e66bf1e77a1a1f32d1cb5be02c43 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 
 namespace {
 int16_t g_dummy_audio_data[kMaxAudioSampleSize];
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c9792510b055e243ab4f6e804717647afa0b418
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h"
+
+namespace {
+int16_t g_dummy_audio_data[kMaxAudioSampleSize];
+int32_t g_latest_audio_timestamp = 0;
+}  // namespace
+
+TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
+                             int start_ms, int duration_ms,
+                             int* audio_samples_size, int16_t** audio_samples) {
+  const int yes_start = (0 * kAudioSampleFrequency) / 1000;
+  const int yes_end = (1000 * kAudioSampleFrequency) / 1000;
+  const int no_start = (4000 * kAudioSampleFrequency) / 1000;
+  const int no_end = (5000 * kAudioSampleFrequency) / 1000;
+  const int wraparound = (8000 * kAudioSampleFrequency) / 1000;
+  const int start_sample = (start_ms * kAudioSampleFrequency) / 1000;
+  for (int i = 0; i < kMaxAudioSampleSize; ++i) {
+    const int sample_index = (start_sample + i) % wraparound;
+    int16_t sample;
+    if ((sample_index >= yes_start) && (sample_index < yes_end)) {
+      sample = g_yes_1000ms_sample_data[sample_index - yes_start];
+    } else if ((sample_index >= no_start) && (sample_index < no_end)) {
+      sample = g_no_1000ms_sample_data[sample_index - no_start];
+    } else {
+      sample = 0;
+    }
+    g_dummy_audio_data[i] = sample;
+  }
+  *audio_samples_size = kMaxAudioSampleSize;
+  *audio_samples = g_dummy_audio_data;
+  return kTfLiteOk;
+}
+
+int32_t LatestAudioTimestamp() {
+  g_latest_audio_timestamp += 100;
+  return g_latest_audio_timestamp;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b73d436ad638277d3f052715c506668e7f163f17
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock_test.cc
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include <limits>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestAudioProviderMock) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  int audio_samples_size = 0;
+  int16_t* audio_samples = nullptr;
+  TfLiteStatus get_status =
+      GetAudioSamples(error_reporter, 0, kFeatureSliceDurationMs,
+                      &audio_samples_size, &audio_samples);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, get_status);
+  TF_LITE_MICRO_EXPECT_LE(audio_samples_size, kMaxAudioSampleSize);
+  TF_LITE_MICRO_EXPECT_NE(audio_samples, nullptr);
+  for (int i = 0; i < audio_samples_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_yes_1000ms_sample_data[i], audio_samples[i]);
+  }
+
+  get_status = GetAudioSamples(error_reporter, 500, kFeatureSliceDurationMs,
+                               &audio_samples_size, &audio_samples);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, get_status);
+  TF_LITE_MICRO_EXPECT_LE(audio_samples_size, kMaxAudioSampleSize);
+  TF_LITE_MICRO_EXPECT_NE(audio_samples, nullptr);
+  for (int i = 0; i < audio_samples_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_yes_1000ms_sample_data[i + 8000],
+                            audio_samples[i]);
+  }
+
+  get_status = GetAudioSamples(error_reporter, 1500, kFeatureSliceDurationMs,
+                               &audio_samples_size, &audio_samples);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, get_status);
+  TF_LITE_MICRO_EXPECT_LE(audio_samples_size, kMaxAudioSampleSize);
+  TF_LITE_MICRO_EXPECT_NE(audio_samples, nullptr);
+  for (int i = 0; i < audio_samples_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(0, audio_samples[i]);
+  }
+
+  get_status = GetAudioSamples(error_reporter, 12250, kFeatureSliceDurationMs,
+                               &audio_samples_size, &audio_samples);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, get_status);
+  TF_LITE_MICRO_EXPECT_LE(audio_samples_size, kMaxAudioSampleSize);
+  TF_LITE_MICRO_EXPECT_NE(audio_samples, nullptr);
+  for (int i = 0; i < audio_samples_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_no_1000ms_sample_data[i + 4000],
+                            audio_samples[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
index 85fbbb80a6c5b330230c1d1d0186de795edc4754..f9212aa3491e99104c2a3f1f5e315e9e96481345 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <limits>
 
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc
index 06647d0c536564c26d72cb73396ca36efb3aeb25..49fea826759956d479e9171e2ba7a41331e31023 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 
 #include "AUDIO_DISCO_F746NG.h"
 #include "SDRAM_DISCO_F746NG.h"
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
index 7f9ece41dd3f013ae328ffd1bdc98f197855a131..b5dfa3d944076a21cde2dfafc6ce1ed39f15164d 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 
 FeatureProvider::FeatureProvider(int feature_size, uint8_t* feature_data)
     : feature_size_(feature_size),
@@ -48,6 +48,10 @@ TfLiteStatus FeatureProvider::PopulateFeatureData(
   int slices_needed = current_step - last_step;
   // If this is the first call, make sure we don't use any cached information.
   if (is_first_run_) {
+    TfLiteStatus init_status = InitializeMicroFeatures(error_reporter);
+    if (init_status != kTfLiteOk) {
+      return init_status;
+    }
     is_first_run_ = false;
     slices_needed = kFeatureSliceCount;
   }
@@ -94,16 +98,17 @@ TfLiteStatus FeatureProvider::PopulateFeatureData(
       GetAudioSamples(error_reporter, slice_start_ms, kFeatureSliceDurationMs,
                       &audio_samples_size, &audio_samples);
       if (audio_samples_size < kMaxAudioSampleSize) {
-        error_reporter->Report("Audio data size %d  too small, want %d",
+        error_reporter->Report("Audio data size %d too small, want %d",
                                audio_samples_size, kMaxAudioSampleSize);
         return kTfLiteError;
       }
       uint8_t* new_slice_data = feature_data_ + (new_slice * kFeatureSliceSize);
-      TfLiteStatus preprocess_status =
-          Preprocess(error_reporter, audio_samples, audio_samples_size,
-                     kFeatureSliceSize, new_slice_data);
-      if (preprocess_status != kTfLiteOk) {
-        return preprocess_status;
+      size_t num_samples_read;
+      TfLiteStatus generate_status = GenerateMicroFeatures(
+          error_reporter, audio_samples, audio_samples_size, kFeatureSliceSize,
+          new_slice_data, &num_samples_read);
+      if (generate_status != kTfLiteOk) {
+        return generate_status;
       }
     }
   }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_mock_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_mock_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b05912e26f8a7f5e89b9f45766adf4270c033ed5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_mock_test.cc
@@ -0,0 +1,66 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestFeatureProviderMockYes) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t feature_data[kFeatureElementCount];
+  FeatureProvider feature_provider(kFeatureElementCount, feature_data);
+
+  int how_many_new_slices = 0;
+  TfLiteStatus populate_status = feature_provider.PopulateFeatureData(
+      error_reporter, /* last_time_in_ms= */ 0, /* time_in_ms= */ 970,
+      &how_many_new_slices);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, populate_status);
+  TF_LITE_MICRO_EXPECT_EQ(kFeatureSliceCount, how_many_new_slices);
+
+  for (int i = 0; i < kFeatureElementCount; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_yes_micro_f2e59fea_nohash_1_data[i],
+                            feature_data[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(TestFeatureProviderMockNo) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t feature_data[kFeatureElementCount];
+  FeatureProvider feature_provider(kFeatureElementCount, feature_data);
+
+  int how_many_new_slices = 0;
+  TfLiteStatus populate_status = feature_provider.PopulateFeatureData(
+      error_reporter, /* last_time_in_ms= */ 4000, /* time_in_ms= */ 4970,
+      &how_many_new_slices);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, populate_status);
+  TF_LITE_MICRO_EXPECT_EQ(kFeatureSliceCount, how_many_new_slices);
+
+  for (int i = 0; i < kFeatureElementCount; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_no_micro_f9643d42_nohash_4_data[i],
+                            feature_data[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
index 556cbfe799bd9adf2df8f584a4f10b4a1c834bd4..e7655a3be53ae6a032195dd4ca991f740bb19537 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
index 3a9a5a4df1bf8239950dd2c79a1048706004e1f5..e71e62170e442e4139acbadb97268b6f74db6459 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
@@ -15,9 +15,9 @@ limitations under the License.
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/micro_interpreter.h"
@@ -31,7 +31,8 @@ int main(int argc, char* argv[]) {
 
   // Map the model into a usable data structure. This doesn't involve any
   // copying or parsing, it's a very lightweight operation.
-  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
+  const tflite::Model* model =
+      ::tflite::GetModel(g_tiny_conv_micro_features_model_data);
   if (model->version() != TFLITE_SCHEMA_VERSION) {
     error_reporter->Report(
         "Model provided is schema version %d not equal "
@@ -123,7 +124,8 @@ int main(int argc, char* argv[]) {
       return 1;
     }
     if (is_new_command) {
-      error_reporter->Report("Heard %s (%d)", found_command, score);
+      error_reporter->Report("Heard %s (%d) @%dms", found_command, score,
+                             current_time);
     }
   }
 
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/BUILD b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..1e684e1efd0bfbc676635e8c3233ef6284e6954d
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/BUILD
@@ -0,0 +1,300 @@
+# Library for generating feature vectors from audio data
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/lite/experimental/micro/testing:micro_test.bzl",
+    "tflite_micro_cc_test",
+)
+
+cc_library(
+    name = "micro_model_settings",
+    srcs = [
+        "micro_model_settings.cc",
+    ],
+    hdrs = [
+        "micro_model_settings.h",
+    ],
+)
+
+cc_library(
+    name = "tiny_conv_micro_features_model_data",
+    srcs = [
+        "tiny_conv_micro_features_model_data.cc",
+    ],
+    hdrs = [
+        "tiny_conv_micro_features_model_data.h",
+    ],
+)
+
+cc_library(
+    name = "micro_features_test_data",
+    srcs = [
+        "no_micro_features_data.cc",
+        "yes_micro_features_data.cc",
+    ],
+    hdrs = [
+        "no_micro_features_data.h",
+        "yes_micro_features_data.h",
+    ],
+)
+
+cc_library(
+    name = "bits",
+    hdrs = ["bits.h"],
+)
+
+cc_library(
+    name = "static_alloc",
+    hdrs = ["static_alloc.h"],
+)
+
+cc_library(
+    name = "fft",
+    srcs = [
+        "fft.cc",
+        "fft_util.cc",
+    ],
+    hdrs = [
+        "fft.h",
+        "fft_util.h",
+    ],
+    deps = [
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "@kissfft//:kiss_fftr_16",
+    ],
+)
+
+cc_library(
+    name = "filterbank",
+    srcs = [
+        "filterbank.cc",
+        "filterbank_util.cc",
+    ],
+    hdrs = [
+        "filterbank.h",
+        "filterbank_util.h",
+    ],
+    deps = [
+        ":bits",
+        ":fft",
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "frontend",
+    srcs = [
+        "frontend.cc",
+        "frontend_util.cc",
+    ],
+    hdrs = [
+        "frontend.h",
+        "frontend_util.h",
+    ],
+    deps = [
+        ":bits",
+        ":fft",
+        ":filterbank",
+        ":log_scale",
+        ":micro_model_settings",
+        ":noise_reduction",
+        ":pcan_gain_control",
+        ":window",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "log_scale",
+    srcs = [
+        "log_lut.cc",
+        "log_scale.cc",
+        "log_scale_util.cc",
+    ],
+    hdrs = [
+        "log_lut.h",
+        "log_scale.h",
+        "log_scale_util.h",
+    ],
+    deps = [
+        ":bits",
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "noise_reduction",
+    srcs = [
+        "noise_reduction.cc",
+        "noise_reduction_util.cc",
+    ],
+    hdrs = [
+        "noise_reduction.h",
+        "noise_reduction_util.h",
+    ],
+    deps = [
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "pcan_gain_control",
+    srcs = [
+        "pcan_gain_control.cc",
+        "pcan_gain_control_util.cc",
+    ],
+    hdrs = [
+        "pcan_gain_control.h",
+        "pcan_gain_control_util.h",
+    ],
+    deps = [
+        ":bits",
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "window",
+    srcs = [
+        "window.cc",
+        "window_util.cc",
+    ],
+    hdrs = [
+        "window.h",
+        "window_util.h",
+    ],
+    deps = [
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "micro_features_generator",
+    srcs = [
+        "micro_features_generator.cc",
+    ],
+    hdrs = [
+        "micro_features_generator.h",
+    ],
+    deps = [
+        ":frontend",
+        ":micro_model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "micro_features_generator_test_data",
+    srcs = [
+        "no_feature_data_slice.cc",
+        "yes_feature_data_slice.cc",
+    ],
+    hdrs = [
+        "no_feature_data_slice.h",
+        "yes_feature_data_slice.h",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "fft_test",
+    srcs = ["fft_test.cc"],
+    deps = [
+        ":fft",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "filterbank_test",
+    srcs = ["filterbank_test.cc"],
+    deps = [
+        ":filterbank",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "frontend_test",
+    srcs = ["frontend_test.cc"],
+    deps = [
+        ":frontend",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "log_scale_test",
+    srcs = ["log_scale_test.cc"],
+    deps = [
+        ":log_scale",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "noise_reduction_test",
+    srcs = ["noise_reduction_test.cc"],
+    deps = [
+        ":noise_reduction",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "pcan_gain_control_test",
+    srcs = ["pcan_gain_control_test.cc"],
+    deps = [
+        ":pcan_gain_control",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "window_test",
+    srcs = ["window_test.cc"],
+    deps = [
+        ":window",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "micro_features_generator_test",
+    srcs = [
+        "micro_features_generator_test.cc",
+    ],
+    deps = [
+        ":micro_features_generator",
+        ":micro_features_generator_test_data",
+        ":micro_model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech:audio_sample_test_data",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b19ee6f030ae9fa8a931c6693cfe490747e336a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h
@@ -0,0 +1,94 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_BITS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_BITS_H_
+
+#include <cstdint>
+
+static inline int CountLeadingZeros32Slow(uint64_t n) {
+  int zeroes = 28;
+  if (n >> 16) zeroes -= 16, n >>= 16;
+  if (n >> 8) zeroes -= 8, n >>= 8;
+  if (n >> 4) zeroes -= 4, n >>= 4;
+  return "\4\3\2\2\1\1\1\1\0\0\0\0\0\0\0"[n] + zeroes;
+}
+
+static inline int CountLeadingZeros32(uint32_t n) {
+#if defined(_MSC_VER)
+  unsigned long result = 0;  // NOLINT(runtime/int)
+  if (_BitScanReverse(&result, n)) {
+    return 31 - result;
+  }
+  return 32;
+#elif defined(__GNUC__)
+
+  // Handle 0 as a special case because __builtin_clz(0) is undefined.
+  if (n == 0) {
+    return 32;
+  }
+  return __builtin_clz(n);
+#else
+  return CountLeadingZeros32Slow(n);
+#endif
+}
+
+static inline int MostSignificantBit32(uint32_t n) {
+  return 32 - CountLeadingZeros32(n);
+}
+
+static inline int CountLeadingZeros64Slow(uint64_t n) {
+  int zeroes = 60;
+  if (n >> 32) zeroes -= 32, n >>= 32;
+  if (n >> 16) zeroes -= 16, n >>= 16;
+  if (n >> 8) zeroes -= 8, n >>= 8;
+  if (n >> 4) zeroes -= 4, n >>= 4;
+  return "\4\3\2\2\1\1\1\1\0\0\0\0\0\0\0"[n] + zeroes;
+}
+
+static inline int CountLeadingZeros64(uint64_t n) {
+#if defined(_MSC_VER) && defined(_M_X64)
+  // MSVC does not have __buitin_clzll. Use _BitScanReverse64.
+  unsigned long result = 0;  // NOLINT(runtime/int)
+  if (_BitScanReverse64(&result, n)) {
+    return 63 - result;
+  }
+  return 64;
+#elif defined(_MSC_VER)
+  // MSVC does not have __buitin_clzll. Compose two calls to _BitScanReverse
+  unsigned long result = 0;  // NOLINT(runtime/int)
+  if ((n >> 32) && _BitScanReverse(&result, n >> 32)) {
+    return 31 - result;
+  }
+  if (_BitScanReverse(&result, n)) {
+    return 63 - result;
+  }
+  return 64;
+#elif defined(__GNUC__)
+
+  // Handle 0 as a special case because __builtin_clzll(0) is undefined.
+  if (n == 0) {
+    return 64;
+  }
+  return __builtin_clzll(n);
+#else
+  return CountLeadingZeros64Slow(n);
+#endif
+}
+
+static inline int MostSignificantBit64(uint64_t n) {
+  return 64 - CountLeadingZeros64(n);
+}
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_BITS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cde4e38740e65cf56cd179d577528263177a649e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+
+#include <string.h>
+
+#define FIXED_POINT 16
+#include "kiss_fft.h"
+// Internal test dependency placeholder1
+// Internal test dependency placeholder2
+#include "tools/kiss_fftr.h"
+// Internal test dependency placeholder3
+
+void FftCompute(struct FftState* state, const int16_t* input,
+                int input_scale_shift) {
+  const size_t input_size = state->input_size;
+  const size_t fft_size = state->fft_size;
+
+  int16_t* fft_input = state->input;
+  // First, scale the input by the given shift.
+  int i;
+  for (i = 0; i < input_size; ++i) {
+    *fft_input++ = (*input++) << input_scale_shift;
+  }
+  // Zero out whatever else remains in the top part of the input.
+  for (; i < fft_size; ++i) {
+    *fft_input++ = 0;
+  }
+
+  // Apply the FFT.
+  kiss_fftr(reinterpret_cast<const kiss_fftr_cfg>(state->scratch), state->input,
+            reinterpret_cast<kiss_fft_cpx*>(state->output));
+}
+
+void FftInit(struct FftState* state) {
+  // All the initialization is done in FftPopulateState()
+}
+
+void FftReset(struct FftState* state) {
+  memset(state->input, 0, state->fft_size * sizeof(*state->input));
+  memset(state->output, 0, (state->fft_size / 2 + 1) * sizeof(*state->output));
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5d29f68a2cc5688f9644a2b556abb4787e3bb93
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+struct complex_int16_t {
+  int16_t real;
+  int16_t imag;
+};
+
+struct FftState {
+  int16_t input[kMaxAudioSampleSize];
+  struct complex_int16_t output[kMaxAudioSampleSize + 2];
+  size_t fft_size;
+  size_t input_size;
+  // This magic number was derived from KissFFT's estimate of how much space it
+  // will need to process the particular lengths and datatypes we need to for
+  // these model settings. This size will need to be recalculated for different
+  // models, but you will see a runtime error if it's not large enough.
+  char scratch[2848];
+  size_t scratch_size;
+};
+
+void FftCompute(struct FftState* state, const int16_t* input,
+                int input_scale_shift);
+
+void FftInit(struct FftState* state);
+
+void FftReset(struct FftState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b89b01445a641c8152aaff8165495688ab6861b2
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_test.cc
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int16_t kFakeWindow[] = {
+    0, 1151,   0, -5944, 0, 13311,  0, -21448, 0, 28327, 0, -32256, 0, 32255,
+    0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
+const int kScaleShift = 0;
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FftTest_CheckOutputValues) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  struct FftState state;
+  TF_LITE_MICRO_EXPECT(FftPopulateState(
+      error_reporter, &state, sizeof(kFakeWindow) / sizeof(kFakeWindow[0])));
+
+  FftInit(&state);
+  FftCompute(&state, kFakeWindow, kScaleShift);
+
+  const struct complex_int16_t expected[] = {
+      {0, 0},    {-10, 9},     {-20, 0},   {-9, -10},     {0, 25},  {-119, 119},
+      {-887, 0}, {3000, 3000}, {0, -6401}, {-3000, 3000}, {886, 0}, {118, 119},
+      {0, 25},   {9, -10},     {19, 0},    {9, 9},        {0, 0}};
+  TF_LITE_MICRO_EXPECT_EQ(state.fft_size / 2 + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.fft_size / 2; ++i) {
+    TF_LITE_MICRO_EXPECT_NEAR(state.output[i].real, expected[i].real, 2);
+    TF_LITE_MICRO_EXPECT_NEAR(state.output[i].imag, expected[i].imag, 2);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ab742893197e6fda9ec2266e85997b555e0a4fc0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h"
+
+#define FIXED_POINT 16
+#include "kiss_fft.h"
+#include "tools/kiss_fftr.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+int FftPopulateState(tflite::ErrorReporter* error_reporter,
+                     struct FftState* state, size_t input_size) {
+  state->input_size = input_size;
+  state->fft_size = 1;
+  while (state->fft_size < state->input_size) {
+    state->fft_size <<= 1;
+  }
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->input,
+                                 (state->fft_size * sizeof(*state->input)));
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->output, ((state->fft_size / 2 + 1) * sizeof(*state->output) * 2));
+
+  // Ask kissfft how much memory it wants.
+  size_t scratch_size = 0;
+  kiss_fftr_cfg kfft_cfg =
+      kiss_fftr_alloc(state->fft_size, 0, nullptr, &scratch_size);
+  if (kfft_cfg != nullptr) {
+    error_reporter->Report("Kiss memory sizing failed.");
+    return 0;
+  }
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->scratch, scratch_size);
+  state->scratch_size = scratch_size;
+  // Let kissfft configure the scratch space we just allocated
+  kfft_cfg = kiss_fftr_alloc(state->fft_size, 0, state->scratch, &scratch_size);
+  if (reinterpret_cast<char*>(kfft_cfg) != state->scratch) {
+    error_reporter->Report("Kiss memory preallocation strategy failed.");
+    return 0;
+  }
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..1dea097bc996e194cef7987431c67be3c976ed2b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h
@@ -0,0 +1,26 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// Prepares and FFT for the given input size.
+int FftPopulateState(tflite::ErrorReporter* error_reporter,
+                     struct FftState* state, size_t input_size);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc
new file mode 100644
index 0000000000000000000000000000000000000000..67f69dd67581ff6c15063b2467810f6c212ed1e5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc
@@ -0,0 +1,135 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h"
+
+#include <string.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+
+void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
+                                         struct complex_int16_t* fft_output,
+                                         int32_t* energy) {
+  const int end_index = state->end_index;
+  int i;
+  energy += state->start_index;
+  fft_output += state->start_index;
+  for (i = state->start_index; i < end_index; ++i) {
+    const int32_t real = fft_output->real;
+    const int32_t imag = fft_output->imag;
+    fft_output++;
+    const uint32_t mag_squared = (real * real) + (imag * imag);
+    *energy++ = mag_squared;
+  }
+}
+
+void FilterbankAccumulateChannels(struct FilterbankState* state,
+                                  const int32_t* energy) {
+  uint64_t* work = state->work;
+  uint64_t weight_accumulator = 0;
+  uint64_t unweight_accumulator = 0;
+
+  const int16_t* channel_frequency_starts = state->channel_frequency_starts;
+  const int16_t* channel_weight_starts = state->channel_weight_starts;
+  const int16_t* channel_widths = state->channel_widths;
+
+  int num_channels_plus_1 = state->num_channels + 1;
+  int i;
+  for (i = 0; i < num_channels_plus_1; ++i) {
+    const int32_t* magnitudes = energy + *channel_frequency_starts++;
+    const int16_t* weights = state->weights + *channel_weight_starts;
+    const int16_t* unweights = state->unweights + *channel_weight_starts++;
+    const int width = *channel_widths++;
+    int j;
+    for (j = 0; j < width; ++j) {
+      weight_accumulator += *weights++ * (static_cast<uint64_t>(*magnitudes));
+      unweight_accumulator +=
+          *unweights++ * (static_cast<uint64_t>(*magnitudes));
+      ++magnitudes;
+    }
+    *work++ = weight_accumulator;
+    weight_accumulator = unweight_accumulator;
+    unweight_accumulator = 0;
+  }
+}
+
+static uint16_t Sqrt32(uint32_t num) {
+  if (num == 0) {
+    return 0;
+  }
+  uint32_t res = 0;
+  int max_bit_number = 32 - MostSignificantBit32(num);
+  max_bit_number |= 1;
+  uint32_t bit = 1U << (31 - max_bit_number);
+  int iterations = (31 - max_bit_number) / 2 + 1;
+  while (iterations--) {
+    if (num >= res + bit) {
+      num -= res + bit;
+      res = (res >> 1U) + bit;
+    } else {
+      res >>= 1U;
+    }
+    bit >>= 2U;
+  }
+  // Do rounding - if we have the bits.
+  if (num > res && res != 0xFFFF) {
+    ++res;
+  }
+  return res;
+}
+
+static uint32_t Sqrt64(uint64_t num) {
+  // Take a shortcut and just use 32 bit operations if the upper word is all
+  // clear. This will cause a slight off by one issue for numbers close to 2^32,
+  // but it probably isn't going to matter (and gives us a big performance win).
+  if ((num >> 32) == 0) {
+    return Sqrt32(static_cast<uint32_t>(num));
+  }
+  uint64_t res = 0;
+  int max_bit_number = 64 - MostSignificantBit64(num);
+  max_bit_number |= 1;
+  uint64_t bit = 1ULL << (63 - max_bit_number);
+  int iterations = (63 - max_bit_number) / 2 + 1;
+  while (iterations--) {
+    if (num >= res + bit) {
+      num -= res + bit;
+      res = (res >> 1U) + bit;
+    } else {
+      res >>= 1U;
+    }
+    bit >>= 2U;
+  }
+  // Do rounding - if we have the bits.
+  if (num > res && res != 0xFFFFFFFFLL) {
+    ++res;
+  }
+  return res;
+}
+
+uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift) {
+  const int num_channels = state->num_channels;
+  const int64_t* work = reinterpret_cast<int64_t*>(state->work + 1);
+  // Reuse the work buffer since we're fine clobbering it at this point to hold
+  // the output.
+  uint32_t* output = reinterpret_cast<uint32_t*>(state->work);
+  int i;
+  for (i = 0; i < num_channels; ++i) {
+    *output++ = Sqrt64(*work++) >> scale_down_shift;
+  }
+  return reinterpret_cast<uint32_t*>(state->work);
+}
+
+void FilterbankReset(struct FilterbankState* state) {
+  memset(state->work, 0, (state->num_channels + 1) * sizeof(*state->work));
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7b479d4899a985482710dfcb12b908f50ec1690
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+#define kFilterbankBits 12
+
+struct FilterbankState {
+  int num_channels;
+  int start_index;
+  int end_index;
+  int16_t channel_frequency_starts[kFeatureSliceSize + 1];
+  int16_t channel_weight_starts[kFeatureSliceSize + 1];
+  int16_t channel_widths[kFeatureSliceSize + 1];
+  int16_t weights[316];
+  int16_t unweights[316];
+  uint64_t work[kFeatureSliceSize + 1];
+};
+
+// Converts the relevant complex values of an FFT output into energy (the
+// square magnitude).
+void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
+                                         struct complex_int16_t* fft_output,
+                                         int32_t* energy);
+
+// Computes the mel-scale filterbank on the given energy array. Output is cached
+// internally - to fetch it, you need to call FilterbankSqrt.
+void FilterbankAccumulateChannels(struct FilterbankState* state,
+                                  const int32_t* energy);
+
+// Applies an integer square root to the 64 bit intermediate values of the
+// filterbank, and returns a pointer to them. Memory will be invalidated the
+// next time FilterbankAccumulateChannels is called.
+uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift);
+
+void FilterbankReset(struct FilterbankState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..682b216ace37fa0a809db3c06386b5b4b3ca94c8
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_test.cc
@@ -0,0 +1,228 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h"
+
+#include <cstring>
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kSampleRate = 1000;
+const int kSpectrumSize = 17;
+const int kStartIndex = 1;
+const int kEndIndex = 15;
+const int32_t kEnergy[] = {-1,     181,      400,      181,      625,    28322,
+                           786769, 18000000, 40972801, 18000000, 784996, 28085,
+                           625,    181,      361,      -1,       -1};
+const uint64_t kWork[] = {1835887, 61162970173, 258694800000};
+const int kScaleShift = 0;
+
+// Test filterbank generation using scaled-down defaults.
+class FilterbankTestConfig {
+ public:
+  FilterbankTestConfig() {
+    config_.num_channels = 2;
+    config_.lower_band_limit = 8.0;
+    config_.upper_band_limit = 450.0;
+  }
+
+  struct FilterbankConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckStartIndex) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  TF_LITE_MICRO_EXPECT_EQ(state.start_index, kStartIndex);
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckEndIndex) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  TF_LITE_MICRO_EXPECT_EQ(state.end_index, kEndIndex);
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelFrequencyStarts) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 4, 8};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_frequency_starts[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelWeightStarts) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 8, 16};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelWidths) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {8, 8, 8};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_widths[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckWeights) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 3277, 2217, 1200, 222,  0,   0,   0,
+                              0, 3376, 2468, 1591, 744,  0,   0,   0,
+                              0, 4020, 3226, 2456, 1708, 983, 277, 0};
+  TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[state.num_channels] +
+                              state.channel_widths[state.num_channels],
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.weights[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckUnweights) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 819, 1879, 2896, 3874, 0,    0,    0,
+                              0, 720, 1628, 2505, 3352, 0,    0,    0,
+                              0, 76,  870,  1640, 2388, 3113, 3819, 0};
+  TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[state.num_channels] +
+                              state.channel_widths[state.num_channels],
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.unweights[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckConvertFftComplexToEnergy) {
+  struct FilterbankState state;
+  state.start_index = kStartIndex;
+  state.end_index = kEndIndex;
+
+  struct complex_int16_t fake_fft[] = {
+      {0, 0},    {-10, 9},     {-20, 0},   {-9, -10},     {0, 25},  {-119, 119},
+      {-887, 0}, {3000, 3000}, {0, -6401}, {-3000, 3000}, {886, 0}, {118, 119},
+      {0, 25},   {9, -10},     {19, 0},    {9, 9},        {0, 0}};
+  int32_t* energy = reinterpret_cast<int32_t*>(fake_fft);
+  FilterbankConvertFftComplexToEnergy(&state, fake_fft, energy);
+
+  int i;
+  for (i = state.start_index; i < state.end_index; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(energy[i], kEnergy[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckAccumulateChannels) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  FilterbankAccumulateChannels(&state, kEnergy);
+
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(kWork) / sizeof(kWork[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.work[i], kWork[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckSqrt) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+  std::memcpy(state.work, kWork, sizeof(kWork));
+
+  uint32_t* scaled_filterbank = FilterbankSqrt(&state, kScaleShift);
+
+  const uint32_t expected[] = {247311, 508620};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(scaled_filterbank[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea4aa5179bc6e1d875127152b596f220a076191e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc
@@ -0,0 +1,212 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h"
+
+#include <assert.h>
+#include <math.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+#define kFilterbankIndexAlignment 4
+#define kFilterbankChannelBlockSize 4
+
+void FilterbankFillConfigWithDefaults(struct FilterbankConfig* config) {
+  config->num_channels = 32;
+  config->lower_band_limit = 125.0f;
+  config->upper_band_limit = 7500.0f;
+  config->output_scale_shift = 7;
+}
+
+static float FreqToMel(float freq) {
+  return 1127.0 * log(1.0 + (freq / 700.0));
+}
+
+static void CalculateCenterFrequencies(const int num_channels,
+                                       const float lower_frequency_limit,
+                                       const float upper_frequency_limit,
+                                       float* center_frequencies) {
+  assert(lower_frequency_limit >= 0.0f);
+  assert(upper_frequency_limit > lower_frequency_limit);
+
+  const float mel_low = FreqToMel(lower_frequency_limit);
+  const float mel_hi = FreqToMel(upper_frequency_limit);
+  const float mel_span = mel_hi - mel_low;
+  const float mel_spacing = mel_span / (static_cast<float>(num_channels));
+  int i;
+  for (i = 0; i < num_channels; ++i) {
+    center_frequencies[i] = mel_low + (mel_spacing * (i + 1));
+  }
+}
+
+static void QuantizeFilterbankWeights(const float float_weight, int16_t* weight,
+                                      int16_t* unweight) {
+  *weight = floor(float_weight * (1 << kFilterbankBits) + 0.5);
+  *unweight = floor((1.0 - float_weight) * (1 << kFilterbankBits) + 0.5);
+}
+
+int FilterbankPopulateState(tflite::ErrorReporter* error_reporter,
+                            const struct FilterbankConfig* config,
+                            struct FilterbankState* state, int sample_rate,
+                            int spectrum_size) {
+  state->num_channels = config->num_channels;
+  const int num_channels_plus_1 = config->num_channels + 1;
+
+  // How should we align things to index counts given the byte alignment?
+  const int index_alignment =
+      (kFilterbankIndexAlignment < sizeof(int16_t)
+           ? 1
+           : kFilterbankIndexAlignment / sizeof(int16_t));
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->channel_frequency_starts,
+      (num_channels_plus_1 * sizeof(*state->channel_frequency_starts)));
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->channel_weight_starts,
+      (num_channels_plus_1 * sizeof(*state->channel_weight_starts)));
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->channel_widths,
+      (num_channels_plus_1 * sizeof(*state->channel_widths)));
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->work,
+                                 (num_channels_plus_1 * sizeof(*state->work)));
+
+  float center_mel_freqs[kFeatureSliceSize + 1];
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      center_mel_freqs, (num_channels_plus_1 * sizeof(*center_mel_freqs)));
+
+  int16_t actual_channel_starts[kFeatureSliceSize + 1];
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      actual_channel_starts,
+      (num_channels_plus_1 * sizeof(*actual_channel_starts)));
+
+  int16_t actual_channel_widths[kFeatureSliceSize + 1];
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      actual_channel_widths,
+      (num_channels_plus_1 * sizeof(*actual_channel_widths)));
+
+  CalculateCenterFrequencies(num_channels_plus_1, config->lower_band_limit,
+                             config->upper_band_limit, center_mel_freqs);
+
+  // Always exclude DC.
+  const float hz_per_sbin =
+      0.5 * sample_rate / (static_cast<float>(spectrum_size) - 1);
+  state->start_index = 1.5 + config->lower_band_limit / hz_per_sbin;
+  state->end_index = 0;  // Initialized to zero here, but actually set below.
+
+  // For each channel, we need to figure out what frequencies belong to it, and
+  // how much padding we need to add so that we can efficiently multiply the
+  // weights and unweights for accumulation. To simplify the multiplication
+  // logic, all channels will have some multiplication to do (even if there are
+  // no frequencies that accumulate to that channel) - they will be directed to
+  // a set of zero weights.
+  int chan_freq_index_start = state->start_index;
+  int weight_index_start = 0;
+  int needs_zeros = 0;
+
+  int chan;
+  for (chan = 0; chan < num_channels_plus_1; ++chan) {
+    // Keep jumping frequencies until we overshoot the bound on this channel.
+    int freq_index = chan_freq_index_start;
+    while (FreqToMel((freq_index)*hz_per_sbin) <= center_mel_freqs[chan]) {
+      ++freq_index;
+    }
+
+    const int width = freq_index - chan_freq_index_start;
+    actual_channel_starts[chan] = chan_freq_index_start;
+    actual_channel_widths[chan] = width;
+
+    if (width == 0) {
+      // This channel doesn't actually get anything from the frequencies, it's
+      // always zero. We need then to insert some 'zero' weights into the
+      // output, and just redirect this channel to do a single multiplication at
+      // this point. For simplicity, the zeros are placed at the beginning of
+      // the weights arrays, so we have to go and update all the other
+      // weight_starts to reflect this shift (but only once).
+      state->channel_frequency_starts[chan] = 0;
+      state->channel_weight_starts[chan] = 0;
+      state->channel_widths[chan] = kFilterbankChannelBlockSize;
+      if (!needs_zeros) {
+        needs_zeros = 1;
+        int j;
+        for (j = 0; j < chan; ++j) {
+          state->channel_weight_starts[j] += kFilterbankChannelBlockSize;
+        }
+        weight_index_start += kFilterbankChannelBlockSize;
+      }
+    } else {
+      // How far back do we need to go to ensure that we have the proper
+      // alignment?
+      const int aligned_start =
+          (chan_freq_index_start / index_alignment) * index_alignment;
+      const int aligned_width = (chan_freq_index_start - aligned_start + width);
+      const int padded_width =
+          (((aligned_width - 1) / kFilterbankChannelBlockSize) + 1) *
+          kFilterbankChannelBlockSize;
+
+      state->channel_frequency_starts[chan] = aligned_start;
+      state->channel_weight_starts[chan] = weight_index_start;
+      state->channel_widths[chan] = padded_width;
+      weight_index_start += padded_width;
+    }
+    chan_freq_index_start = freq_index;
+  }
+
+  // Allocate the two arrays to store the weights - weight_index_start contains
+  // the index of what would be the next set of weights that we would need to
+  // add, so that's how many weights we need to allocate.
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->weights, (weight_index_start * sizeof(*state->weights)));
+  for (int i = 0; i < weight_index_start; ++i) {
+    state->weights[i] = 0;
+  }
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->unweights, (weight_index_start * sizeof(*state->unweights)));
+  for (int i = 0; i < weight_index_start; ++i) {
+    state->unweights[i] = 0;
+  }
+
+  // Next pass, compute all the weights. Since everything has been memset to
+  // zero, we only need to fill in the weights that correspond to some frequency
+  // for a channel.
+  const float mel_low = FreqToMel(config->lower_band_limit);
+  for (chan = 0; chan < num_channels_plus_1; ++chan) {
+    int frequency = actual_channel_starts[chan];
+    const int num_frequencies = actual_channel_widths[chan];
+    const int frequency_offset =
+        frequency - state->channel_frequency_starts[chan];
+    const int weight_start = state->channel_weight_starts[chan];
+    const float denom_val = (chan == 0) ? mel_low : center_mel_freqs[chan - 1];
+
+    int j;
+    for (j = 0; j < num_frequencies; ++j, ++frequency) {
+      const float weight =
+          (center_mel_freqs[chan] - FreqToMel(frequency * hz_per_sbin)) /
+          (center_mel_freqs[chan] - denom_val);
+
+      // Make the float into an integer for the weights (and unweights).
+      const int weight_index = weight_start + frequency_offset + j;
+      QuantizeFilterbankWeights(weight, state->weights + weight_index,
+                                state->unweights + weight_index);
+    }
+    if (frequency > state->end_index) {
+      state->end_index = frequency;
+    }
+  }
+
+  if (state->end_index >= spectrum_size) {
+    error_reporter->Report("Filterbank end_index is above spectrum size.");
+    return 0;
+  }
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..0bf0c8494ce11753c2f2c2185e6c1141d0adbc74
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct FilterbankConfig {
+  // number of frequency channel buckets for filterbank
+  int num_channels;
+  // maximum frequency to include
+  float upper_band_limit;
+  // minimum frequency to include
+  float lower_band_limit;
+  // unused
+  int output_scale_shift;
+};
+
+// Fills the frontendConfig with "sane" defaults.
+void FilterbankFillConfigWithDefaults(struct FilterbankConfig* config);
+
+// Allocates any buffers.
+int FilterbankPopulateState(tflite::ErrorReporter* error_reporter,
+                            const struct FilterbankConfig* config,
+                            struct FilterbankState* state, int sample_rate,
+                            int spectrum_size);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c609190e4db90adb3ec79e2a794b61923cea0978
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.cc
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+
+struct FrontendOutput FrontendProcessSamples(struct FrontendState* state,
+                                             const int16_t* samples,
+                                             size_t num_samples,
+                                             size_t* num_samples_read) {
+  struct FrontendOutput output;
+  output.values = nullptr;
+  output.size = 0;
+
+  // Try to apply the window - if it fails, return and wait for more data.
+  if (!WindowProcessSamples(&state->window, samples, num_samples,
+                            num_samples_read)) {
+    return output;
+  }
+
+  // Apply the FFT to the window's output (and scale it so that the fixed point
+  // FFT can have as much resolution as possible).
+  int input_shift =
+      15 - MostSignificantBit32(state->window.max_abs_output_value);
+  FftCompute(&state->fft, state->window.output, input_shift);
+
+  // We can re-ruse the fft's output buffer to hold the energy.
+  int32_t* energy = reinterpret_cast<int32_t*>(state->fft.output);
+  FilterbankConvertFftComplexToEnergy(&state->filterbank, state->fft.output,
+                                      energy);
+  FilterbankAccumulateChannels(&state->filterbank, energy);
+  uint32_t* scaled_filterbank = FilterbankSqrt(&state->filterbank, input_shift);
+
+  // Apply noise reduction.
+  NoiseReductionApply(&state->noise_reduction, scaled_filterbank);
+
+  if (state->pcan_gain_control.enable_pcan) {
+    PcanGainControlApply(&state->pcan_gain_control, scaled_filterbank);
+  }
+
+  // Apply the log and scale.
+  int correction_bits =
+      MostSignificantBit32(state->fft.fft_size) - 1 - (kFilterbankBits / 2);
+  uint16_t* logged_filterbank =
+      LogScaleApply(&state->log_scale, scaled_filterbank,
+                    state->filterbank.num_channels, correction_bits);
+
+  output.size = state->filterbank.num_channels;
+  output.values = logged_filterbank;
+  return output;
+}
+
+void FrontendReset(struct FrontendState* state) {
+  WindowReset(&state->window);
+  FftReset(&state->fft);
+  FilterbankReset(&state->filterbank);
+  NoiseReductionReset(&state->noise_reduction);
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h
new file mode 100644
index 0000000000000000000000000000000000000000..3221d283e8740cd95fa39ece4ad9533d059018d7
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h"
+
+struct FrontendState {
+  struct WindowState window;
+  struct FftState fft;
+  struct FilterbankState filterbank;
+  struct NoiseReductionState noise_reduction;
+  struct PcanGainControlState pcan_gain_control;
+  struct LogScaleState log_scale;
+};
+
+struct FrontendOutput {
+  const uint16_t* values;
+  size_t size;
+};
+
+// Main entry point to processing frontend samples. Updates num_samples_read to
+// contain the number of samples that have been consumed from the input array.
+// Returns a struct containing the generated output. If not enough samples were
+// added to generate a feature vector, the returned size will be 0 and the
+// values pointer will be NULL. Note that the output pointer will be invalidated
+// as soon as FrontendProcessSamples is called again, so copy the contents
+// elsewhere if you need to use them later.
+struct FrontendOutput FrontendProcessSamples(struct FrontendState* state,
+                                             const int16_t* samples,
+                                             size_t num_samples,
+                                             size_t* num_samples_read);
+
+void FrontendReset(struct FrontendState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4d9f86d48d86b1795fa45b9bf24db4dd75fb0a20
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_test.cc
@@ -0,0 +1,134 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kSampleRate = 1000;
+const int kWindowSamples = 25;
+const int kStepSamples = 10;
+const int16_t kFakeAudioData[] = {
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
+
+// Test end-to-end frontend behaviors.
+class FrontendTestConfig {
+ public:
+  FrontendTestConfig() {
+    config_.window.size_ms = 25;
+    config_.window.step_size_ms = 10;
+    config_.noise_reduction.smoothing_bits = 10;
+    config_.filterbank.num_channels = 2;
+    config_.filterbank.lower_band_limit = 8.0;
+    config_.filterbank.upper_band_limit = 450.0;
+    config_.noise_reduction.smoothing_bits = 10;
+    config_.noise_reduction.even_smoothing = 0.025;
+    config_.noise_reduction.odd_smoothing = 0.06;
+    config_.noise_reduction.min_signal_remaining = 0.05;
+    config_.pcan_gain_control.enable_pcan = true;
+    config_.pcan_gain_control.strength = 0.95;
+    config_.pcan_gain_control.offset = 80.0;
+    config_.pcan_gain_control.gain_bits = 21;
+    config_.log_scale.enable_log = true;
+    config_.log_scale.scale_shift = 6;
+  }
+
+  struct FrontendConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FrontendTest_CheckOutputValues) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FrontendTestConfig config;
+  struct FrontendState state;
+  TF_LITE_MICRO_EXPECT(FrontendPopulateState(error_reporter, &config.config_,
+                                             &state, kSampleRate));
+  size_t num_samples_read;
+
+  struct FrontendOutput output = FrontendProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read);
+
+  const uint16_t expected[] = {479, 425};
+  TF_LITE_MICRO_EXPECT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < output.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output.values[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FrontendTest_CheckConsecutiveWindow) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FrontendTestConfig config;
+  struct FrontendState state;
+  TF_LITE_MICRO_EXPECT(FrontendPopulateState(error_reporter, &config.config_,
+                                             &state, kSampleRate));
+  size_t num_samples_read;
+
+  FrontendProcessSamples(&state, kFakeAudioData,
+                         sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]),
+                         &num_samples_read);
+  struct FrontendOutput output = FrontendProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read);
+
+  const int16_t expected[] = {436, 378};
+  TF_LITE_MICRO_EXPECT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < output.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output.values[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FrontendTest_CheckNotEnoughSamples) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FrontendTestConfig config;
+  struct FrontendState state;
+  TF_LITE_MICRO_EXPECT(FrontendPopulateState(error_reporter, &config.config_,
+                                             &state, kSampleRate));
+  size_t num_samples_read;
+
+  FrontendProcessSamples(&state, kFakeAudioData,
+                         sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]),
+                         &num_samples_read);
+  FrontendProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read);
+  struct FrontendOutput output = FrontendProcessSamples(
+      &state, kFakeAudioData + kWindowSamples + kStepSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples -
+          kStepSamples,
+      &num_samples_read);
+
+  TF_LITE_MICRO_EXPECT_EQ(output.size, 0);
+  TF_LITE_MICRO_EXPECT_EQ(output.values, nullptr);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..220bc130fb9332e4afbe02a4432b61c8a4bcd544
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.cc
@@ -0,0 +1,80 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h"
+
+#include <string.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+
+void FrontendFillConfigWithDefaults(struct FrontendConfig* config) {
+  WindowFillConfigWithDefaults(&config->window);
+  FilterbankFillConfigWithDefaults(&config->filterbank);
+  NoiseReductionFillConfigWithDefaults(&config->noise_reduction);
+  PcanGainControlFillConfigWithDefaults(&config->pcan_gain_control);
+  LogScaleFillConfigWithDefaults(&config->log_scale);
+}
+
+int FrontendPopulateState(tflite::ErrorReporter* error_reporter,
+                          const struct FrontendConfig* config,
+                          struct FrontendState* state, int sample_rate) {
+  memset(state, 0, sizeof(*state));
+
+  if (!WindowPopulateState(error_reporter, &config->window, &state->window,
+                           sample_rate)) {
+    error_reporter->Report("Failed to populate window state");
+    return 0;
+  }
+
+  if (!FftPopulateState(error_reporter, &state->fft, state->window.size)) {
+    error_reporter->Report("Failed to populate fft state");
+    return 0;
+  }
+  FftInit(&state->fft);
+
+  if (!FilterbankPopulateState(error_reporter, &config->filterbank,
+                               &state->filterbank, sample_rate,
+                               state->fft.fft_size / 2 + 1)) {
+    error_reporter->Report("Failed to populate filterbank state");
+    return 0;
+  }
+
+  if (!NoiseReductionPopulateState(error_reporter, &config->noise_reduction,
+                                   &state->noise_reduction,
+                                   state->filterbank.num_channels)) {
+    error_reporter->Report("Failed to populate noise reduction state");
+    return 0;
+  }
+
+  int input_correction_bits =
+      MostSignificantBit32(state->fft.fft_size) - 1 - (kFilterbankBits / 2);
+  if (!PcanGainControlPopulateState(
+          error_reporter, &config->pcan_gain_control, &state->pcan_gain_control,
+          state->noise_reduction.estimate, state->filterbank.num_channels,
+          state->noise_reduction.smoothing_bits, input_correction_bits)) {
+    error_reporter->Report("Failed to populate pcan gain control state");
+    return 0;
+  }
+
+  if (!LogScalePopulateState(error_reporter, &config->log_scale,
+                             &state->log_scale)) {
+    error_reporter->Report("Failed to populate log scale state");
+    return 0;
+  }
+
+  FrontendReset(state);
+
+  // All good, return a true value.
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7267644ae053e1f816cb22bb5e0ecd04e4de0ef
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct FrontendConfig {
+  struct WindowConfig window;
+  struct FilterbankConfig filterbank;
+  struct NoiseReductionConfig noise_reduction;
+  struct PcanGainControlConfig pcan_gain_control;
+  struct LogScaleConfig log_scale;
+};
+
+// Fills the frontendConfig with "sane" defaults.
+void FrontendFillConfigWithDefaults(struct FrontendConfig* config);
+
+// Prepares any buffers.
+int FrontendPopulateState(tflite::ErrorReporter* error_reporter,
+                          const struct FrontendConfig* config,
+                          struct FrontendState* state, int sample_rate);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c651caad8c67773f20a485eea5519f286b1b0253
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h"
+const uint16_t kLogLut[]
+#ifndef _MSC_VER
+    __attribute__((aligned(4)))
+#endif  // _MSV_VER
+    = {0,    224,  442,  654,  861,  1063, 1259, 1450, 1636, 1817, 1992, 2163,
+       2329, 2490, 2646, 2797, 2944, 3087, 3224, 3358, 3487, 3611, 3732, 3848,
+       3960, 4068, 4172, 4272, 4368, 4460, 4549, 4633, 4714, 4791, 4864, 4934,
+       5001, 5063, 5123, 5178, 5231, 5280, 5326, 5368, 5408, 5444, 5477, 5507,
+       5533, 5557, 5578, 5595, 5610, 5622, 5631, 5637, 5640, 5641, 5638, 5633,
+       5626, 5615, 5602, 5586, 5568, 5547, 5524, 5498, 5470, 5439, 5406, 5370,
+       5332, 5291, 5249, 5203, 5156, 5106, 5054, 5000, 4944, 4885, 4825, 4762,
+       4697, 4630, 4561, 4490, 4416, 4341, 4264, 4184, 4103, 4020, 3935, 3848,
+       3759, 3668, 3575, 3481, 3384, 3286, 3186, 3084, 2981, 2875, 2768, 2659,
+       2549, 2437, 2323, 2207, 2090, 1971, 1851, 1729, 1605, 1480, 1353, 1224,
+       1094, 963,  830,  695,  559,  421,  282,  142,  0,    0};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5ed9339bd02e23cc134992badce5cdb72a74771
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h
@@ -0,0 +1,32 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_LUT_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_LUT_H_
+
+#include <stdint.h>
+
+// Number of segments in the log lookup table. The table will be kLogSegments+1
+// in length (with some padding).
+#define kLogSegments 128
+#define kLogSegmentsLog2 7
+
+// Scale used by lookup table.
+#define kLogScale 65536
+#define kLogScaleLog2 16
+#define kLogCoeff 45426
+
+extern const uint16_t kLogLut[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_LUT_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f85e9c1a2f5a0056deaffad7b99b774c772ce562
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h"
+
+#define kuint16max 0x0000FFFF
+
+// The following functions implement integer logarithms of various sizes. The
+// approximation is calculated according to method described in
+//       www.inti.gob.ar/electronicaeinformatica/instrumentacion/utic/
+//       publicaciones/SPL2007/Log10-spl07.pdf
+// It first calculates log2 of the input and then converts it to natural
+// logarithm.
+
+static uint32_t Log2FractionPart(const uint32_t x, const uint32_t log2x) {
+  // Part 1
+  int32_t frac = x - (1LL << log2x);
+  if (log2x < kLogScaleLog2) {
+    frac <<= kLogScaleLog2 - log2x;
+  } else {
+    frac >>= log2x - kLogScaleLog2;
+  }
+  // Part 2
+  const uint32_t base_seg = frac >> (kLogScaleLog2 - kLogSegmentsLog2);
+  const uint32_t seg_unit =
+      ((static_cast<uint32_t>(1)) << kLogScaleLog2) >> kLogSegmentsLog2;
+
+  const int32_t c0 = kLogLut[base_seg];
+  const int32_t c1 = kLogLut[base_seg + 1];
+  const int32_t seg_base = seg_unit * base_seg;
+  const int32_t rel_pos = ((c1 - c0) * (frac - seg_base)) >> kLogScaleLog2;
+  return frac + c0 + rel_pos;
+}
+
+static uint32_t Log(const uint32_t x, const uint32_t scale_shift) {
+  const uint32_t integer = MostSignificantBit32(x) - 1;
+  const uint32_t fraction = Log2FractionPart(x, integer);
+  const uint32_t log2 = (integer << kLogScaleLog2) + fraction;
+  const uint32_t round = kLogScale / 2;
+  const uint32_t loge =
+      ((static_cast<uint64_t>(kLogCoeff)) * log2 + round) >> kLogScaleLog2;
+  // Finally scale to our output scale
+  const uint32_t loge_scaled = ((loge << scale_shift) + round) >> kLogScaleLog2;
+  return loge_scaled;
+}
+
+uint16_t* LogScaleApply(struct LogScaleState* state, uint32_t* signal,
+                        int signal_size, int correction_bits) {
+  const int scale_shift = state->scale_shift;
+  uint16_t* output = reinterpret_cast<uint16_t*>(signal);
+  uint16_t* ret = output;
+  int i;
+  for (i = 0; i < signal_size; ++i) {
+    uint32_t value = *signal++;
+    if (state->enable_log) {
+      if (correction_bits < 0) {
+        value >>= -correction_bits;
+      } else {
+        value <<= correction_bits;
+      }
+      if (value > 1) {
+        value = Log(value, scale_shift);
+      } else {
+        value = 0;
+      }
+    }
+    *output++ = (value < kuint16max) ? value : kuint16max;
+  }
+  return ret;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h
new file mode 100644
index 0000000000000000000000000000000000000000..d90b87fb6d6fe181158b209a87a42f6d075ba457
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+struct LogScaleState {
+  int enable_log;
+  int scale_shift;
+};
+
+// Applies a fixed point logarithm to the signal and converts it to 16 bit. Note
+// that the signal array will be modified.
+uint16_t* LogScaleApply(struct LogScaleState* state, uint32_t* signal,
+                        int signal_size, int correction_bits);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d5b23323a273314a347f25e691d538781558980a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_test.cc
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kScaleShift = 6;
+const int kCorrectionBits = -1;
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(LogScaleTest_CheckOutputValues) {
+  struct LogScaleState state;
+  state.enable_log = true;
+  state.scale_shift = kScaleShift;
+
+  uint32_t fake_signal[] = {3578, 1533};
+  uint16_t* output = LogScaleApply(&state, fake_signal,
+                                   sizeof(fake_signal) / sizeof(fake_signal[0]),
+                                   kCorrectionBits);
+
+  const uint16_t expected[] = {479, 425};
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(LogScaleTest_CheckOutputValuesNoLog) {
+  struct LogScaleState state;
+  state.enable_log = false;
+  state.scale_shift = kScaleShift;
+
+  uint32_t fake_signal[] = {85964, 45998};
+  uint16_t* output = LogScaleApply(&state, fake_signal,
+                                   sizeof(fake_signal) / sizeof(fake_signal[0]),
+                                   kCorrectionBits);
+
+  const uint16_t expected[] = {65535, 45998};
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..09adc09c3511fdedeb7246b6717fa4bfb4c83ba1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h"
+
+void LogScaleFillConfigWithDefaults(struct LogScaleConfig* config) {
+  config->enable_log = 1;
+  config->scale_shift = 6;
+}
+
+int LogScalePopulateState(tflite::ErrorReporter* error_reporter,
+                          const struct LogScaleConfig* config,
+                          struct LogScaleState* state) {
+  state->enable_log = config->enable_log;
+  state->scale_shift = config->scale_shift;
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..3caf207f2693756783b6c1dc64246d2522388d3b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_UTIL_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct LogScaleConfig {
+  // set to false (0) to disable this module
+  int enable_log;
+  // scale results by 2^(scale_shift)
+  int scale_shift;
+};
+
+// Populates the LogScaleConfig with "sane" default values.
+void LogScaleFillConfigWithDefaults(struct LogScaleConfig* config);
+
+// Allocates any buffers.
+int LogScalePopulateState(tflite::ErrorReporter* error_reporter,
+                          const struct LogScaleConfig* config,
+                          struct LogScaleState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6657c6f3205903c178b2aa4314551f5b4fee1101
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.cc
@@ -0,0 +1,99 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h"
+
+#include <cmath>
+#include <cstring>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+namespace {
+
+FrontendState g_micro_features_state;
+bool g_is_first_time = true;
+
+}  // namespace
+
+TfLiteStatus InitializeMicroFeatures(tflite::ErrorReporter* error_reporter) {
+  FrontendConfig config;
+  config.window.size_ms = kFeatureSliceDurationMs;
+  config.window.step_size_ms = kFeatureSliceStrideMs;
+  config.noise_reduction.smoothing_bits = 10;
+  config.filterbank.num_channels = kFeatureSliceSize;
+  config.filterbank.lower_band_limit = 125.0;
+  config.filterbank.upper_band_limit = 7500.0;
+  config.noise_reduction.smoothing_bits = 10;
+  config.noise_reduction.even_smoothing = 0.025;
+  config.noise_reduction.odd_smoothing = 0.06;
+  config.noise_reduction.min_signal_remaining = 0.05;
+  config.pcan_gain_control.enable_pcan = 1;
+  config.pcan_gain_control.strength = 0.95;
+  config.pcan_gain_control.offset = 80.0;
+  config.pcan_gain_control.gain_bits = 21;
+  config.log_scale.enable_log = 1;
+  config.log_scale.scale_shift = 6;
+  if (!FrontendPopulateState(error_reporter, &config, &g_micro_features_state,
+                             kAudioSampleFrequency)) {
+    error_reporter->Report("FrontendPopulateState() failed");
+    return kTfLiteError;
+  }
+  g_is_first_time = true;
+  return kTfLiteOk;
+}
+
+// This is not exposed in any header, and is only used for testing, to ensure
+// that the state is correctly set up before generating results.
+void SetMicroFeaturesNoiseEstimates(const uint32_t* estimate_presets) {
+  for (int i = 0; i < g_micro_features_state.filterbank.num_channels; ++i) {
+    g_micro_features_state.noise_reduction.estimate[i] = estimate_presets[i];
+  }
+}
+
+TfLiteStatus GenerateMicroFeatures(tflite::ErrorReporter* error_reporter,
+                                   const int16_t* input, int input_size,
+                                   int output_size, uint8_t* output,
+                                   size_t* num_samples_read) {
+  const int16_t* frontend_input;
+  if (g_is_first_time) {
+    frontend_input = input;
+    g_is_first_time = false;
+  } else {
+    frontend_input = input + 160;
+  }
+  FrontendOutput frontend_output = FrontendProcessSamples(
+      &g_micro_features_state, frontend_input, input_size, num_samples_read);
+
+  for (int i = 0; i < frontend_output.size; ++i) {
+    // These scaling values are derived from those used in input_data.py in the
+    // training pipeline.
+    constexpr int32_t value_scale = (10 * 255);
+    constexpr int32_t value_div = (256 * 26);
+    int32_t value =
+        ((frontend_output.values[i] * value_scale) + (value_div / 2)) /
+        value_div;
+    if (value < 0) {
+      value = 0;
+    }
+    if (value > 255) {
+      value = 255;
+    }
+    output[i] = value;
+  }
+
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h
new file mode 100644
index 0000000000000000000000000000000000000000..46fa55d62ff7a8032cb94e512d4e856fb5960276
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h
@@ -0,0 +1,32 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_FEATURES_GENERATOR_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_FEATURES_GENERATOR_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// Sets up any resources needed for the feature generation pipeline.
+TfLiteStatus InitializeMicroFeatures(tflite::ErrorReporter* error_reporter);
+
+// Converts audio sample data into a more compact form that's appropriate for
+// feeding into a neural network.
+TfLiteStatus GenerateMicroFeatures(tflite::ErrorReporter* error_reporter,
+                                   const int16_t* input, int input_size,
+                                   int output_size, uint8_t* output,
+                                   size_t* num_samples_read);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_FEATURES_GENERATOR_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c2da7a799ce398ba7faf31d577d79bb96b2072a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc
@@ -0,0 +1,100 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+// This is a test-only API, not exposed in any public headers, so declare it.
+void SetMicroFeaturesNoiseEstimates(const uint32_t* estimate_presets);
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestMicroFeaturesGeneratorYes) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, InitializeMicroFeatures(error_reporter));
+
+  // The micro features pipeline retains state from previous calls to help
+  // estimate the background noise. Unfortunately this makes it harder to
+  // exactly reproduce results in a test environment, so use a known snapshot
+  // of the parameters at the point that the golden feature values were
+  // created.
+  const uint32_t yes_estimate_presets[] = {
+      1062898, 2644477, 1257642, 1864718, 412722, 725703, 395721, 474082,
+      173046,  255856,  158966,  153736,  69181,  199100, 144493, 227740,
+      110573,  164330,  79666,   144650,  122947, 476799, 398553, 497493,
+      322152,  1140005, 566716,  690605,  308902, 347481, 109891, 170457,
+      73901,   100975,  42963,   72325,   34183,  20207,  6640,   9468,
+  };
+  SetMicroFeaturesNoiseEstimates(yes_estimate_presets);
+
+  uint8_t yes_calculated_data[g_yes_feature_data_slice_size];
+  size_t num_samples_read;
+  TfLiteStatus yes_status = GenerateMicroFeatures(
+      error_reporter, g_yes_30ms_sample_data, g_yes_30ms_sample_data_size,
+      g_yes_feature_data_slice_size, yes_calculated_data, &num_samples_read);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, yes_status);
+
+  for (int i = 0; i < g_yes_feature_data_slice_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_yes_feature_data_slice[i],
+                            yes_calculated_data[i]);
+    if (g_yes_feature_data_slice[i] != yes_calculated_data[i]) {
+      error_reporter->Report("Expected value %d but found %d",
+                             g_yes_feature_data_slice[i],
+                             yes_calculated_data[i]);
+    }
+  }
+}
+
+TF_LITE_MICRO_TEST(TestMicroFeaturesGeneratorNo) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, InitializeMicroFeatures(error_reporter));
+  // As we did for the previous features, set known good noise state
+  // parameters.
+  const uint32_t no_estimate_presets[] = {
+      2563964, 1909393, 559801, 538670, 203643, 175959, 75088, 139491,
+      59691,   95307,   43865,  129263, 52517,  80058,  51330, 100731,
+      76674,   76262,   15497,  22598,  13778,  21460,  8946,  17806,
+      10023,   18810,   8002,   10842,  7578,   9983,   6267,  10759,
+      8946,    18488,   9691,   39785,  9939,   17835,  9671,  18512,
+  };
+  SetMicroFeaturesNoiseEstimates(no_estimate_presets);
+
+  uint8_t no_calculated_data[g_no_feature_data_slice_size];
+  size_t num_samples_read;
+  TfLiteStatus no_status = GenerateMicroFeatures(
+      error_reporter, g_no_30ms_sample_data, g_no_30ms_sample_data_size,
+      g_no_feature_data_slice_size, no_calculated_data, &num_samples_read);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, no_status);
+
+  for (int i = 0; i < g_no_feature_data_slice_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_no_feature_data_slice[i], no_calculated_data[i]);
+    if (g_no_feature_data_slice[i] != no_calculated_data[i]) {
+      error_reporter->Report("Expected value %d but found %d",
+                             g_no_feature_data_slice[i], no_calculated_data[i]);
+    }
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc
similarity index 95%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc
index b9b8fb37b19d384fe92edf8ce2292aee19b99b7f..09f65ca24b3cd03485a5a79599dc0143ca83329c 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 
 const char* kCategoryLabels[kCategoryCount] = {
     "silence",
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h
new file mode 100644
index 0000000000000000000000000000000000000000..b74a4d01ca49d37d62daf3710c878cfc6d9940f0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_MODEL_SETTINGS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_MODEL_SETTINGS_H_
+
+// Keeping these as constant expressions allow us to allocate fixed-sized arrays
+// on the stack for our working memory.
+
+// The size of the input time series data we pass to the FFT to produce the
+// frequency information. This has to be a power of two, and since we're dealing
+// with 30ms of 16KHz inputs, which means 480 samples, this is the next value.
+constexpr int kMaxAudioSampleSize = 512;
+constexpr int kAudioSampleFrequency = 16000;
+
+// All of these values are derived from the values used during model training,
+// if you change your model you'll need to update these constants.
+constexpr int kFeatureSliceSize = 40;
+constexpr int kFeatureSliceCount = 49;
+constexpr int kFeatureElementCount = (kFeatureSliceSize * kFeatureSliceCount);
+constexpr int kFeatureSliceStrideMs = 20;
+constexpr int kFeatureSliceDurationMs = 30;
+
+constexpr int kCategoryCount = 4;
+constexpr int kSilenceIndex = 0;
+constexpr int kUnknownIndex = 1;
+extern const char* kCategoryLabels[kCategoryCount];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_MODEL_SETTINGS_H_
diff --git a/tensorflow/core/platform/default/protobuf_compiler.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc
similarity index 53%
rename from tensorflow/core/platform/default/protobuf_compiler.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc
index a93d7a184b21a1111764e0a7fc0765ebe877ce32..1dbb606e184e70e0fa97d417bcbab6010b8a88a5 100644
--- a/tensorflow/core/platform/default/protobuf_compiler.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,13 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_COMPILER_H_
-#define TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_COMPILER_H_
+// See the header for documentation on the meaning of this data.
 
-// IWYU pragma: private, include "third_party/tensorflow/core/platform/protobuf_compiler.h"
-// IWYU pragma: friend third_party/tensorflow/core/platform/protobuf_compiler.h
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h"
 
-#include "google/protobuf/compiler/importer.h"
-#include "tensorflow/core/platform/default/protobuf.h"
-
-#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_H_
+const uint8_t g_no_feature_data_slice[g_no_feature_data_slice_size] = {
+    216, 195, 223, 211, 238, 223, 243, 215, 226, 204, 232, 211, 232, 213,
+    240, 218, 235, 214, 238, 205, 207, 173, 149, 201, 215, 200, 230, 213,
+    208, 195, 175, 151, 195, 175, 182, 163, 235, 217, 218, 190,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h
new file mode 100644
index 0000000000000000000000000000000000000000..72ea2bf6a23e83bff5dea771931e585d74c757ec
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was extracted from the larger feature data held in
+// no_features_data.cc and consists of the 29th spectrogram slice of 43 values.
+// This is the expected result of running the sample data in
+// no_30ms_sample_data.cc through through the preprocessing pipeline.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_FEATURE_DATA_SLICE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_FEATURE_DATA_SLICE_H_
+
+#include <cstdint>
+
+constexpr int g_no_feature_data_slice_size = 40;
+extern const uint8_t g_no_feature_data_slice[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_FEATURE_DATA_SLICE_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..865209b01df7e8f77139bcd5b6a37537a6f674f4
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc
@@ -0,0 +1,165 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h"
+
+/* File automatically created by
+ * tensorflow/examples/speech_commands/wav_to_features.py \
+ * --sample_rate=16000 \
+ * --clip_duration_ms=1000 \
+ * --window_size_ms=30 \
+ * --window_stride_ms=20 \
+ * --feature_bin_count=40 \
+ * --quantize=1 \
+ * --preprocess="micro" \
+ * --input_wav="speech_commands_test_set_v0.02/no/f9643d42_nohash_4.wav" \
+ * --output_c_file="/tmp/no_micro_features_data.cc" \
+ */
+
+const int g_no_micro_f9643d42_nohash_4_width = 40;
+const int g_no_micro_f9643d42_nohash_4_height = 49;
+const unsigned char g_no_micro_f9643d42_nohash_4_data[] = {
+    230, 205, 191, 203, 202, 181, 180, 194, 205, 187, 183, 197, 203, 198, 196,
+    186, 202, 159, 151, 126, 110, 138, 141, 142, 137, 148, 133, 120, 110, 126,
+    117, 110, 117, 116, 137, 134, 95,  116, 123, 110, 184, 144, 183, 189, 197,
+    172, 188, 164, 194, 179, 175, 174, 182, 173, 184, 174, 200, 145, 154, 148,
+    147, 135, 143, 122, 127, 138, 116, 99,  122, 105, 110, 125, 127, 133, 131,
+    123, 116, 119, 127, 114, 193, 176, 185, 170, 175, 146, 166, 167, 185, 185,
+    185, 183, 195, 185, 176, 178, 197, 155, 137, 144, 164, 132, 153, 132, 138,
+    137, 134, 95,  120, 116, 131, 122, 99,  120, 120, 110, 116, 110, 126, 127,
+    128, 159, 187, 119, 178, 187, 197, 167, 199, 184, 180, 165, 194, 176, 144,
+    134, 187, 136, 142, 134, 145, 132, 145, 105, 119, 123, 125, 116, 125, 102,
+    129, 138, 130, 99,  99,  90,  120, 123, 134, 95,  194, 172, 187, 123, 191,
+    179, 195, 182, 201, 137, 167, 142, 185, 161, 187, 146, 167, 152, 154, 107,
+    152, 112, 134, 144, 117, 116, 105, 85,  105, 105, 99,  90,  123, 112, 112,
+    68,  107, 105, 117, 99,  116, 143, 139, 90,  154, 142, 188, 172, 178, 135,
+    175, 149, 177, 110, 173, 160, 169, 162, 173, 119, 132, 110, 85,  85,  117,
+    129, 117, 112, 117, 51,  112, 95,  139, 102, 105, 90,  128, 119, 112, 99,
+    170, 168, 195, 152, 174, 173, 180, 0,   157, 130, 169, 149, 149, 123, 170,
+    130, 170, 133, 159, 102, 134, 90,  85,  105, 126, 119, 130, 90,  78,  68,
+    127, 120, 95,  51,  122, 110, 112, 78,  116, 95,  180, 135, 179, 146, 179,
+    162, 197, 153, 172, 135, 154, 0,   149, 95,  145, 114, 166, 0,   114, 110,
+    145, 107, 114, 90,  136, 68,  95,  95,  95,  85,  116, 99,  116, 0,   95,
+    68,  102, 51,  102, 78,  185, 157, 138, 158, 180, 117, 173, 142, 145, 117,
+    169, 130, 159, 99,  138, 123, 169, 90,  78,  0,   123, 85,  107, 51,  114,
+    102, 95,  0,   116, 85,  119, 95,  95,  68,  85,  51,  116, 68,  102, 78,
+    167, 105, 164, 163, 178, 126, 164, 154, 154, 51,  177, 120, 156, 85,  134,
+    139, 168, 90,  161, 102, 114, 116, 122, 95,  112, 102, 107, 51,  114, 85,
+    119, 78,  114, 90,  102, 51,  102, 51,  114, 99,  177, 68,  152, 102, 184,
+    166, 179, 129, 177, 129, 180, 110, 158, 105, 139, 0,   145, 85,  148, 102,
+    117, 102, 116, 0,   78,  68,  90,  51,  107, 85,  78,  0,   51,  0,   51,
+    0,   95,  51,  107, 68,  180, 117, 90,  0,   138, 0,   187, 146, 119, 140,
+    164, 90,  136, 0,   131, 51,  159, 99,  141, 138, 116, 51,  90,  51,  90,
+    68,  105, 0,   85,  78,  112, 51,  122, 95,  128, 68,  85,  0,   112, 68,
+    147, 126, 178, 146, 171, 130, 190, 147, 188, 123, 170, 78,  132, 0,   130,
+    125, 159, 95,  102, 0,   110, 0,   95,  85,  120, 68,  78,  51,  99,  51,
+    105, 0,   112, 102, 105, 68,  90,  51,  90,  0,   127, 95,  166, 175, 187,
+    133, 135, 0,   171, 139, 132, 128, 140, 51,  126, 107, 161, 0,   95,  51,
+    119, 0,   114, 0,   95,  110, 116, 51,  112, 0,   90,  0,   116, 51,  68,
+    0,   105, 68,  105, 0,   164, 78,  173, 0,   194, 166, 145, 114, 116, 51,
+    107, 122, 151, 0,   156, 102, 148, 51,  122, 95,  129, 0,   85,  0,   127,
+    78,  90,  0,   78,  0,   95,  0,   110, 0,   68,  119, 120, 68,  68,  0,
+    122, 99,  147, 127, 200, 167, 85,  114, 161, 85,  161, 125, 143, 99,  156,
+    85,  147, 68,  99,  0,   107, 102, 132, 51,  112, 68,  95,  78,  99,  0,
+    68,  0,   51,  0,   90,  78,  128, 51,  95,  0,   166, 136, 174, 138, 189,
+    144, 130, 129, 138, 134, 132, 120, 134, 0,   51,  78,  147, 51,  51,  0,
+    51,  0,   78,  0,   68,  68,  95,  78,  90,  0,   0,   0,   68,  0,   90,
+    68,  110, 0,   95,  51,  165, 151, 157, 0,   0,   0,   112, 0,   112, 95,
+    149, 107, 119, 68,  126, 68,  138, 0,   78,  0,   78,  0,   99,  51,  112,
+    0,   102, 0,   78,  51,  85,  0,   0,   0,   78,  0,   95,  0,   95,  78,
+    105, 0,   152, 0,   0,   51,  132, 105, 159, 0,   129, 102, 114, 0,   138,
+    51,  123, 0,   129, 78,  119, 51,  51,  51,  105, 0,   78,  85,  95,  0,
+    85,  0,   0,   0,   85,  0,   78,  0,   0,   0,   172, 142, 141, 0,   137,
+    0,   148, 128, 157, 120, 146, 120, 120, 0,   95,  78,  141, 68,  68,  0,
+    68,  0,   90,  0,   85,  0,   107, 0,   78,  0,   85,  51,  102, 0,   68,
+    78,  68,  0,   51,  0,   125, 0,   141, 51,  102, 138, 175, 51,  120, 51,
+    173, 85,  116, 141, 164, 68,  150, 123, 133, 51,  114, 0,   117, 68,  150,
+    51,  116, 68,  78,  0,   68,  0,   68,  0,   85,  0,   78,  0,   51,  78,
+    155, 90,  161, 0,   132, 99,  123, 78,  107, 0,   134, 90,  95,  0,   78,
+    0,   162, 143, 85,  0,   107, 78,  125, 90,  90,  51,  51,  0,   85,  0,
+    0,   0,   132, 102, 102, 154, 128, 0,   99,  68,  162, 102, 151, 0,   99,
+    51,  147, 141, 156, 0,   112, 120, 158, 127, 145, 139, 187, 171, 135, 138,
+    146, 0,   95,  68,  127, 0,   85,  0,   105, 0,   0,   0,   187, 170, 162,
+    188, 165, 51,  51,  78,  243, 215, 225, 196, 205, 181, 205, 168, 176, 134,
+    157, 110, 126, 114, 133, 139, 193, 163, 159, 116, 160, 126, 122, 127, 171,
+    99,  114, 68,  123, 85,  90,  0,   157, 146, 166, 179, 136, 0,   116, 90,
+    242, 219, 240, 204, 216, 164, 188, 171, 176, 164, 154, 158, 190, 157, 190,
+    141, 182, 177, 169, 128, 172, 145, 105, 129, 157, 90,  78,  51,  119, 68,
+    137, 68,  116, 78,  141, 132, 151, 122, 156, 140, 234, 206, 229, 201, 216,
+    174, 191, 144, 162, 85,  122, 157, 194, 167, 204, 149, 180, 166, 166, 139,
+    122, 133, 156, 126, 145, 85,  128, 0,   99,  51,  145, 0,   126, 51,  166,
+    162, 166, 162, 177, 157, 228, 198, 221, 197, 214, 177, 173, 166, 173, 139,
+    185, 191, 202, 163, 205, 172, 206, 189, 135, 68,  166, 134, 149, 134, 135,
+    90,  127, 107, 175, 90,  136, 117, 135, 140, 172, 167, 166, 149, 177, 152,
+    221, 191, 215, 194, 211, 0,   156, 147, 182, 178, 208, 163, 190, 157, 208,
+    200, 195, 164, 179, 154, 181, 150, 143, 99,  132, 137, 185, 143, 163, 85,
+    51,  107, 132, 134, 164, 127, 167, 159, 175, 141, 216, 195, 223, 211, 238,
+    223, 243, 215, 226, 204, 232, 211, 232, 213, 240, 218, 235, 214, 238, 205,
+    207, 173, 149, 201, 215, 200, 230, 213, 208, 195, 175, 151, 195, 175, 182,
+    163, 235, 217, 218, 190, 211, 191, 215, 191, 217, 220, 241, 215, 229, 206,
+    236, 210, 227, 216, 236, 188, 183, 149, 202, 189, 208, 172, 191, 201, 220,
+    193, 221, 207, 216, 208, 201, 131, 170, 187, 229, 197, 211, 194, 226, 201,
+    205, 184, 206, 177, 221, 210, 226, 184, 204, 197, 218, 198, 212, 209, 213,
+    141, 172, 110, 175, 167, 180, 156, 213, 188, 192, 179, 213, 205, 204, 174,
+    200, 147, 162, 181, 203, 167, 198, 187, 210, 164, 196, 169, 189, 168, 224,
+    198, 213, 204, 198, 195, 230, 211, 221, 197, 208, 0,   0,   0,   85,  90,
+    167, 130, 175, 173, 203, 164, 193, 144, 170, 145, 185, 148, 154, 139, 198,
+    159, 180, 171, 216, 174, 178, 161, 166, 136, 216, 184, 215, 197, 199, 190,
+    228, 195, 208, 51,  117, 0,   0,   0,   0,   0,   140, 51,  135, 154, 188,
+    155, 168, 0,   90,  0,   156, 85,  110, 0,   174, 90,  172, 154, 179, 99,
+    142, 166, 179, 157, 177, 95,  192, 142, 204, 198, 217, 147, 173, 0,   112,
+    0,   0,   0,   0,   0,   0,   0,   110, 0,   107, 0,   160, 0,   148, 95,
+    172, 0,   0,   0,   116, 0,   122, 114, 170, 0,   0,   0,   0,   0,   179,
+    110, 196, 85,  205, 183, 169, 0,   99,  0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   141, 0,   112, 0,   0,   0,   134, 0,   0,   0,   0,
+    0,   0,   0,   139, 0,   0,   0,   0,   112, 186, 78,  163, 0,   169, 128,
+    174, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   95,
+    0,   105, 0,   0,   0,   105, 0,   0,   0,   0,   0,   0,   0,   95,  0,
+    0,   0,   0,   0,   0,   0,   119, 0,   164, 78,  0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   90,  0,   0,   68,
+    117, 0,   0,   0,   0,   0,   0,   0,   148, 0,   0,   0,   0,   0,   0,
+    0,   0,   0,   116, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,
+    0,   0,   0,   99,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   99,  0,   0,   0,   0,   0,   0,   0,   0,   0,   78,  0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..178323eeba6669d247edfe9cb675b37fe5c7d526
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h
@@ -0,0 +1,23 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_MICRO_FEATURES_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_MICRO_FEATURES_DATA_H_
+
+extern const int g_no_micro_f9643d42_nohash_4_width;
+extern const int g_no_micro_f9643d42_nohash_4_height;
+extern const unsigned char g_no_micro_f9643d42_nohash_4_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_MICRO_FEATURES_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b3aa19cda5d1c8151de5e8bf5aad45df09259a0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h"
+
+#include <string.h>
+
+void NoiseReductionApply(struct NoiseReductionState* state, uint32_t* signal) {
+  int i;
+  for (i = 0; i < state->num_channels; ++i) {
+    const uint32_t smoothing =
+        ((i & 1) == 0) ? state->even_smoothing : state->odd_smoothing;
+    const uint32_t one_minus_smoothing = (1 << kNoiseReductionBits) - smoothing;
+
+    // Update the estimate of the noise.
+    const uint32_t signal_scaled_up = signal[i] << state->smoothing_bits;
+    uint32_t estimate =
+        ((static_cast<uint64_t>(signal_scaled_up) * smoothing) +
+         (static_cast<uint64_t>(state->estimate[i]) * one_minus_smoothing)) >>
+        kNoiseReductionBits;
+    state->estimate[i] = estimate;
+
+    // Make sure that we can't get a negative value for the signal - estimate.
+    if (estimate > signal_scaled_up) {
+      estimate = signal_scaled_up;
+    }
+
+    const uint32_t floor =
+        (static_cast<uint64_t>(signal[i]) * state->min_signal_remaining) >>
+        kNoiseReductionBits;
+    const uint32_t subtracted =
+        (signal_scaled_up - estimate) >> state->smoothing_bits;
+    const uint32_t output = subtracted > floor ? subtracted : floor;
+    signal[i] = output;
+  }
+}
+
+void NoiseReductionReset(struct NoiseReductionState* state) {
+  memset(state->estimate, 0, sizeof(*state->estimate) * state->num_channels);
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h
new file mode 100644
index 0000000000000000000000000000000000000000..699144345d5751f27f7adcafec551180f82725d1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_H_
+
+#define kNoiseReductionBits 14
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+struct NoiseReductionState {
+  int smoothing_bits;
+  uint16_t even_smoothing;
+  uint16_t odd_smoothing;
+  uint16_t min_signal_remaining;
+  int num_channels;
+  uint32_t estimate[kFeatureSliceSize];
+};
+
+// Removes stationary noise from each channel of the signal using a low pass
+// filter.
+void NoiseReductionApply(struct NoiseReductionState* state, uint32_t* signal);
+
+void NoiseReductionReset(struct NoiseReductionState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de7181d710bc00938e411869bf071b91e22f2044
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_test.cc
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kNumChannels = 2;
+
+// Test noise reduction using default config values.
+class NoiseReductionTestConfig {
+ public:
+  NoiseReductionTestConfig() {
+    config_.smoothing_bits = 10;
+    config_.even_smoothing = 0.025;
+    config_.odd_smoothing = 0.06;
+    config_.min_signal_remaining = 0.05;
+  }
+
+  struct NoiseReductionConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(NoiseReductionTest_TestNoiseReductionEstimate) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  NoiseReductionTestConfig config;
+  struct NoiseReductionState state;
+  TF_LITE_MICRO_EXPECT(NoiseReductionPopulateState(
+      error_reporter, &config.config_, &state, kNumChannels));
+
+  uint32_t signal[] = {247311, 508620};
+  NoiseReductionApply(&state, signal);
+
+  const uint32_t expected[] = {6321887, 31248341};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.estimate[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(NoiseReductionTest_TestNoiseReduction) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  NoiseReductionTestConfig config;
+  struct NoiseReductionState state;
+  TF_LITE_MICRO_EXPECT(NoiseReductionPopulateState(
+      error_reporter, &config.config_, &state, kNumChannels));
+
+  uint32_t signal[] = {247311, 508620};
+  NoiseReductionApply(&state, signal);
+
+  const uint32_t expected[] = {241137, 478104};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(signal[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..42a5c2136f2a85b0ddd7e3a620bb879d13eeb258
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+void NoiseReductionFillConfigWithDefaults(struct NoiseReductionConfig* config) {
+  config->smoothing_bits = 10;
+  config->even_smoothing = 0.025;
+  config->odd_smoothing = 0.06;
+  config->min_signal_remaining = 0.05;
+}
+
+int NoiseReductionPopulateState(tflite::ErrorReporter* error_reporter,
+                                const struct NoiseReductionConfig* config,
+                                struct NoiseReductionState* state,
+                                int num_channels) {
+  state->smoothing_bits = config->smoothing_bits;
+  state->odd_smoothing = config->odd_smoothing * (1 << kNoiseReductionBits);
+  state->even_smoothing = config->even_smoothing * (1 << kNoiseReductionBits);
+  state->min_signal_remaining =
+      config->min_signal_remaining * (1 << kNoiseReductionBits);
+  state->num_channels = num_channels;
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->estimate, (state->num_channels * sizeof(*state->estimate)));
+  for (int i = 0; i < state->num_channels; ++i) {
+    state->estimate[i] = 0;
+  }
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..60f9de5067c606158bc0e29771d1e83a495cd4c1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct NoiseReductionConfig {
+  // scale the signal up by 2^(smoothing_bits) before reduction
+  int smoothing_bits;
+  // smoothing coefficient for even-numbered channels
+  float even_smoothing;
+  // smoothing coefficient for odd-numbered channels
+  float odd_smoothing;
+  // fraction of signal to preserve (1.0 disables this module)
+  float min_signal_remaining;
+};
+
+// Populates the NoiseReductionConfig with "sane" default values.
+void NoiseReductionFillConfigWithDefaults(struct NoiseReductionConfig* config);
+
+// Prepares any buffers.
+int NoiseReductionPopulateState(tflite::ErrorReporter* error_reporter,
+                                const struct NoiseReductionConfig* config,
+                                struct NoiseReductionState* state,
+                                int num_channels);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc
new file mode 100644
index 0000000000000000000000000000000000000000..50656758d722844b8aeb6a32c04d3df36f0e5242
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+
+int16_t WideDynamicFunction(const uint32_t x, const int16_t* lut) {
+  if (x <= 2) {
+    return lut[x];
+  }
+
+  const int16_t interval = MostSignificantBit32(x);
+  lut += 4 * interval - 6;
+
+  const int16_t frac =
+      ((interval < 11) ? (x << (11 - interval)) : (x >> (interval - 11))) &
+      0x3FF;
+
+  int32_t result = (static_cast<int32_t>(lut[2]) * frac) >> 5;
+  result += (static_cast<int32_t>(lut[1])) << 5;
+  result *= frac;
+  result = (result + (1 << 14)) >> 15;
+  result += lut[0];
+  return static_cast<int16_t>(result);
+}
+
+uint32_t PcanShrink(const uint32_t x) {
+  if (x < (2 << kPcanSnrBits)) {
+    return (x * x) >> (2 + 2 * kPcanSnrBits - kPcanOutputBits);
+  } else {
+    return (x >> (kPcanSnrBits - kPcanOutputBits)) - (1 << kPcanOutputBits);
+  }
+}
+
+void PcanGainControlApply(struct PcanGainControlState* state,
+                          uint32_t* signal) {
+  int i;
+  for (i = 0; i < state->num_channels; ++i) {
+    const uint32_t gain =
+        WideDynamicFunction(state->noise_estimate[i], state->gain_lut);
+    const uint32_t snr =
+        (static_cast<uint64_t>(signal[i]) * gain) >> state->snr_shift;
+    signal[i] = PcanShrink(snr);
+  }
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h
new file mode 100644
index 0000000000000000000000000000000000000000..06d6fc990385cd74ccba8510765eb7ad8da4eeca
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define kPcanSnrBits 12
+#define kPcanOutputBits 6
+
+#define kWideDynamicFunctionBits 32
+#define kWideDynamicFunctionLUTSize (4 * kWideDynamicFunctionBits - 3)
+
+struct PcanGainControlState {
+  int enable_pcan;
+  uint32_t* noise_estimate;
+  int num_channels;
+  int16_t gain_lut[kWideDynamicFunctionLUTSize];
+  int32_t snr_shift;
+};
+
+int16_t WideDynamicFunction(const uint32_t x, const int16_t* lut);
+
+uint32_t PcanShrink(const uint32_t x);
+
+void PcanGainControlApply(struct PcanGainControlState* state, uint32_t* signal);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7dee66746f381ea50127e416fe90f063353eca89
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_test.cc
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kNumChannels = 2;
+const int kSmoothingBits = 10;
+const int kCorrectionBits = -1;
+
+// Test pcan auto gain control using default config values.
+class PcanGainControlTestConfig {
+ public:
+  PcanGainControlTestConfig() {
+    config_.enable_pcan = 1;
+    config_.strength = 0.95;
+    config_.offset = 80.0;
+    config_.gain_bits = 21;
+  }
+
+  struct PcanGainControlConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(PcanGainControlTest_TestPcanGainControl) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint32_t estimate[] = {6321887, 31248341};
+  PcanGainControlTestConfig config;
+  struct PcanGainControlState state;
+  TF_LITE_MICRO_EXPECT(PcanGainControlPopulateState(
+      error_reporter, &config.config_, &state, estimate, kNumChannels,
+      kSmoothingBits, kCorrectionBits));
+
+  uint32_t signal[] = {241137, 478104};
+  PcanGainControlApply(&state, signal);
+
+  const uint32_t expected[] = {3578, 1533};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(signal[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e7867ac6284d519ba6dd35f601bc3cb40e2f95fe
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc
@@ -0,0 +1,87 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h"
+
+#include <math.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+#define kint16max 0x00007FFF
+
+void PcanGainControlFillConfigWithDefaults(
+    struct PcanGainControlConfig* config) {
+  config->enable_pcan = 0;
+  config->strength = 0.95;
+  config->offset = 80.0;
+  config->gain_bits = 21;
+}
+
+int16_t PcanGainLookupFunction(const struct PcanGainControlConfig* config,
+                               int32_t input_bits, uint32_t x) {
+  const float x_as_float =
+      (static_cast<float>(x)) / (static_cast<uint32_t>(1) << input_bits);
+  const float gain_as_float =
+      (static_cast<uint32_t>(1) << config->gain_bits) *
+      powf(x_as_float + config->offset, -config->strength);
+
+  if (gain_as_float > kint16max) {
+    return kint16max;
+  }
+  return static_cast<int16_t>(gain_as_float + 0.5f);
+}
+
+int PcanGainControlPopulateState(tflite::ErrorReporter* error_reporter,
+                                 const struct PcanGainControlConfig* config,
+                                 struct PcanGainControlState* state,
+                                 uint32_t* noise_estimate,
+                                 const int num_channels,
+                                 const uint16_t smoothing_bits,
+                                 const int32_t input_correction_bits) {
+  state->enable_pcan = config->enable_pcan;
+  if (!state->enable_pcan) {
+    return 1;
+  }
+  state->noise_estimate = noise_estimate;
+  state->num_channels = num_channels;
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->gain_lut, (kWideDynamicFunctionLUTSize * sizeof(int16_t)));
+  state->snr_shift = config->gain_bits - input_correction_bits - kPcanSnrBits;
+
+  const int32_t input_bits = smoothing_bits - input_correction_bits;
+  state->gain_lut[0] = PcanGainLookupFunction(config, input_bits, 0);
+  state->gain_lut[1] = PcanGainLookupFunction(config, input_bits, 1);
+  int16_t* temp_gain_lut = state->gain_lut - 6;
+  int interval;
+  for (interval = 2; interval <= kWideDynamicFunctionBits; ++interval) {
+    const uint32_t x0 = static_cast<uint32_t>(1) << (interval - 1);
+    const uint32_t x1 = x0 + (x0 >> 1);
+    const uint32_t x2 =
+        (interval == kWideDynamicFunctionBits) ? x0 + (x0 - 1) : 2 * x0;
+
+    const int16_t y0 = PcanGainLookupFunction(config, input_bits, x0);
+    const int16_t y1 = PcanGainLookupFunction(config, input_bits, x1);
+    const int16_t y2 = PcanGainLookupFunction(config, input_bits, x2);
+
+    const int32_t diff1 = static_cast<int32_t>(y1) - y0;
+    const int32_t diff2 = static_cast<int32_t>(y2) - y0;
+    const int32_t a1 = 4 * diff1 - diff2;
+    const int32_t a2 = diff2 - a1;
+
+    temp_gain_lut[4 * interval] = y0;
+    temp_gain_lut[4 * interval + 1] = static_cast<int16_t>(a1);
+    temp_gain_lut[4 * interval + 2] = static_cast<int16_t>(a2);
+  }
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..4cc1de7bb25db509f8271d12f053e61554d07680
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct PcanGainControlConfig {
+  // set to false (0) to disable this module
+  int enable_pcan;
+  // gain normalization exponent (0.0 disables, 1.0 full strength)
+  float strength;
+  // positive value added in the normalization denominator
+  float offset;
+  // number of fractional bits in the gain
+  int gain_bits;
+};
+
+void PcanGainControlFillConfigWithDefaults(
+    struct PcanGainControlConfig* config);
+
+int16_t PcanGainLookupFunction(const struct PcanGainControlConfig* config,
+                               int32_t input_bits, uint32_t x);
+
+int PcanGainControlPopulateState(tflite::ErrorReporter* error_reporter,
+                                 const struct PcanGainControlConfig* config,
+                                 struct PcanGainControlState* state,
+                                 uint32_t* noise_estimate,
+                                 const int num_channels,
+                                 const uint16_t smoothing_bits,
+                                 const int32_t input_correction_bits);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
new file mode 100644
index 0000000000000000000000000000000000000000..e2af862de7590323819c99de3a6702d1bd046681
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
@@ -0,0 +1,32 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_STATIC_ALLOC_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_STATIC_ALLOC_H_
+
+// Checks to ensure that the C-style array passed in has a compile-time size of
+// at least the number of bytes requested. This doesn't work with raw pointers
+// since sizeof() doesn't know their actual length, so only use this to check
+// statically-allocated arrays with known sizes.
+#define STATIC_ALLOC_ENSURE_ARRAY_SIZE(A, N)                                 \
+  do {                                                                       \
+    if (sizeof(A) < (N)) {                                                   \
+      error_reporter->Report(#A " too small (%d bytes, wanted %d) at %s:%d", \
+                             sizeof(A), (N), __FILE__, __LINE__);            \
+      return 0;                                                              \
+    }                                                                        \
+  } while (0)
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_STATIC_ALLOC_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..57a32c3595da7ae17c2328bb4c98fb005fd253ef
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc
@@ -0,0 +1,1541 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Automatically created from a TensorFlow Lite flatbuffer using the command:
+// xxd -i tiny_conv.tflite > tiny_conv_simple_features_model_data.cc
+// See the README for a full description of the creation process.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h"
+
+const unsigned char g_tiny_conv_micro_features_model_data[] = {
+    0x18, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x0e, 0x00,
+    0x18, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00,
+    0x0e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xd0, 0x46, 0x00, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0xb4, 0x41, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00,
+    0x54, 0x4f, 0x43, 0x4f, 0x20, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74,
+    0x65, 0x64, 0x2e, 0x00, 0x09, 0x00, 0x00, 0x00, 0x94, 0x41, 0x00, 0x00,
+    0x74, 0x41, 0x00, 0x00, 0x44, 0x41, 0x00, 0x00, 0xb4, 0x3e, 0x00, 0x00,
+    0xac, 0x3e, 0x00, 0x00, 0xa4, 0x3e, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xf0, 0xb9, 0xff, 0xff,
+    0xf4, 0xb9, 0xff, 0xff, 0x52, 0xba, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+    0x80, 0x3e, 0x00, 0x00, 0x68, 0x95, 0x91, 0x7d, 0x9b, 0x85, 0x85, 0x81,
+    0x77, 0x85, 0x99, 0x89, 0x7e, 0x8a, 0x85, 0x92, 0xa5, 0x7e, 0x93, 0x97,
+    0x97, 0x91, 0xa3, 0x97, 0x88, 0x8b, 0xa6, 0x71, 0x77, 0x85, 0x95, 0x86,
+    0x6b, 0x93, 0xcb, 0x96, 0x7a, 0x9a, 0x7f, 0x85, 0x7a, 0x8e, 0xac, 0x98,
+    0x6d, 0x9d, 0x9b, 0x70, 0x9a, 0x90, 0xba, 0x99, 0x7b, 0x93, 0x6e, 0x68,
+    0x75, 0x86, 0xc4, 0x8b, 0x66, 0x5d, 0x96, 0x7f, 0x92, 0x91, 0xb6, 0x7b,
+    0x96, 0x95, 0x9a, 0x77, 0x9a, 0x96, 0xce, 0x80, 0x88, 0x65, 0x8e, 0x80,
+    0x88, 0x85, 0xb7, 0x9c, 0x7b, 0x93, 0x9d, 0x95, 0x83, 0x92, 0xd0, 0x7e,
+    0x68, 0x88, 0x6c, 0x78, 0x98, 0x81, 0xac, 0x95, 0x9e, 0x98, 0xa2, 0x99,
+    0x8d, 0x7d, 0xb8, 0x81, 0x6e, 0x68, 0xa1, 0x81, 0x9d, 0x99, 0xb4, 0x7d,
+    0x92, 0x86, 0x9d, 0x93, 0xa3, 0xb0, 0xd6, 0x79, 0x93, 0x76, 0x8d, 0x84,
+    0x91, 0x9d, 0xbe, 0x94, 0xb0, 0x70, 0x84, 0x80, 0x85, 0x99, 0x9e, 0xa2,
+    0x86, 0x8a, 0x7a, 0x76, 0x91, 0x8d, 0xa6, 0x76, 0x8d, 0x82, 0x98, 0x8c,
+    0x92, 0x8f, 0x8c, 0xb3, 0x78, 0x75, 0xa5, 0x88, 0x73, 0x8c, 0x91, 0x7c,
+    0x82, 0x7d, 0x93, 0x9e, 0x8b, 0x97, 0x7c, 0x90, 0x84, 0x95, 0x7e, 0x9e,
+    0xa4, 0x52, 0x8a, 0xb4, 0x97, 0x65, 0x7d, 0xb6, 0x83, 0x7d, 0x99, 0x80,
+    0x97, 0x85, 0x96, 0x5f, 0x8e, 0x87, 0x95, 0x6d, 0x76, 0x84, 0x97, 0x8c,
+    0x66, 0x97, 0xae, 0x6b, 0x93, 0xb3, 0xa8, 0x8b, 0xa1, 0x79, 0xa3, 0x94,
+    0x7e, 0xa8, 0x8d, 0xad, 0x78, 0x82, 0xa2, 0x7b, 0x90, 0xa4, 0x7d, 0xb3,
+    0xa0, 0x7b, 0x94, 0x85, 0x9a, 0x8d, 0x76, 0x82, 0x65, 0x73, 0xab, 0xa4,
+    0xaa, 0x74, 0x93, 0x9c, 0x83, 0x66, 0xbf, 0x7a, 0xaa, 0x81, 0x92, 0x89,
+    0x7e, 0x88, 0xa6, 0x66, 0xaf, 0x92, 0x9f, 0x97, 0x6c, 0x89, 0x9c, 0x74,
+    0x7e, 0x82, 0x8e, 0x88, 0xb2, 0x85, 0xba, 0x96, 0x90, 0x78, 0x8d, 0xa7,
+    0x9e, 0x87, 0xbc, 0x7f, 0xb2, 0x8b, 0x77, 0x9b, 0xab, 0x8f, 0xa4, 0x7d,
+    0x6f, 0x77, 0x8c, 0x98, 0x6f, 0x89, 0xb1, 0x9f, 0xa7, 0x94, 0x7d, 0xae,
+    0x88, 0x8a, 0xa9, 0x75, 0x7d, 0x7c, 0x88, 0x99, 0x90, 0x9d, 0x97, 0xa7,
+    0x8d, 0x7f, 0x73, 0xa1, 0xa3, 0x87, 0xa9, 0x92, 0x98, 0x7e, 0x9c, 0x88,
+    0x73, 0x6b, 0x78, 0x8e, 0x7d, 0x86, 0x6c, 0x7c, 0x92, 0x40, 0x86, 0xa7,
+    0x65, 0x93, 0x67, 0x91, 0x67, 0x71, 0x6c, 0xa8, 0x81, 0x70, 0x8e, 0xa8,
+    0x7b, 0x63, 0x89, 0x76, 0x69, 0x90, 0x73, 0x5e, 0x92, 0x78, 0x7e, 0x9d,
+    0x87, 0x86, 0x89, 0x64, 0x66, 0xa9, 0x92, 0x8d, 0x72, 0x7c, 0x63, 0x7f,
+    0x94, 0x5c, 0x92, 0x89, 0x87, 0x9d, 0x8b, 0x75, 0x93, 0x8c, 0x94, 0x68,
+    0x97, 0x87, 0x78, 0x7d, 0x7f, 0x84, 0x84, 0x77, 0x6b, 0x8e, 0x83, 0xab,
+    0x7e, 0x62, 0x90, 0x83, 0x8e, 0x71, 0x7e, 0x9b, 0x96, 0x6d, 0x83, 0x6a,
+    0x76, 0x68, 0x71, 0x90, 0x98, 0x90, 0x9b, 0x68, 0x89, 0x89, 0x95, 0x85,
+    0x6e, 0x75, 0x8e, 0x95, 0x83, 0x7a, 0x91, 0x7f, 0x8b, 0x71, 0x90, 0x7d,
+    0xad, 0x91, 0x6f, 0x74, 0x98, 0x8a, 0xb0, 0xa8, 0x80, 0xa3, 0x8e, 0x7c,
+    0xa5, 0x67, 0xa4, 0x66, 0xa9, 0x7b, 0x85, 0x9d, 0x88, 0xab, 0x7d, 0x81,
+    0x6e, 0x7f, 0x8f, 0x97, 0x97, 0x84, 0x89, 0x74, 0x9d, 0x5f, 0x9c, 0x88,
+    0x6f, 0x74, 0x96, 0x9e, 0x7e, 0x7e, 0xa4, 0x85, 0x94, 0x91, 0xaf, 0x99,
+    0x7a, 0xaa, 0x8c, 0x92, 0x85, 0x9d, 0x6c, 0x79, 0x57, 0x7a, 0x80, 0x84,
+    0x79, 0x79, 0x74, 0xa5, 0x55, 0xab, 0x73, 0x8c, 0x72, 0x9d, 0x72, 0xa9,
+    0x90, 0x73, 0x8f, 0xa0, 0x89, 0x6d, 0x68, 0x66, 0x61, 0x6f, 0x57, 0x7d,
+    0x66, 0x8c, 0x65, 0x87, 0x62, 0x76, 0x83, 0x77, 0x89, 0xa4, 0x73, 0x89,
+    0x7f, 0x70, 0x79, 0x6b, 0x86, 0x6f, 0x8d, 0x96, 0x65, 0x89, 0x66, 0x53,
+    0x73, 0xae, 0x6a, 0x72, 0x88, 0x97, 0x7a, 0x7f, 0x5d, 0xa1, 0x86, 0x88,
+    0x5f, 0x9f, 0x9b, 0x8a, 0x74, 0x9a, 0x7a, 0x7e, 0x8b, 0x71, 0x58, 0x74,
+    0x8f, 0x9b, 0x9b, 0x8d, 0x6b, 0x83, 0x60, 0x7f, 0x75, 0x91, 0x79, 0x93,
+    0x7a, 0x92, 0x8c, 0x7e, 0x7a, 0x95, 0x84, 0x69, 0x8f, 0x8c, 0x7c, 0x6e,
+    0x8b, 0x87, 0x82, 0x62, 0xa6, 0x97, 0x91, 0x65, 0xa2, 0xa4, 0x9b, 0x8b,
+    0x85, 0xa4, 0x84, 0x7b, 0x67, 0x93, 0x96, 0x84, 0x85, 0x75, 0x6d, 0x9e,
+    0x80, 0x80, 0x73, 0x8c, 0x81, 0x70, 0x8a, 0x68, 0x9c, 0x8e, 0x63, 0x91,
+    0x89, 0x79, 0x8d, 0x79, 0xa4, 0x9a, 0x96, 0xa0, 0x83, 0x63, 0x88, 0x8f,
+    0x76, 0xb4, 0xa8, 0x8e, 0x68, 0x8d, 0x8e, 0x95, 0x78, 0xae, 0x5d, 0x89,
+    0x66, 0x7e, 0x7b, 0x8a, 0x75, 0x86, 0x71, 0x97, 0x6d, 0xb3, 0x67, 0x76,
+    0x82, 0x7d, 0x70, 0x79, 0x8a, 0x9c, 0x82, 0xa7, 0x82, 0xab, 0x58, 0x86,
+    0x5c, 0x70, 0x8c, 0x71, 0x61, 0xa6, 0x74, 0xa8, 0x65, 0x78, 0x72, 0x9d,
+    0x6c, 0x92, 0x70, 0x88, 0x88, 0x79, 0x96, 0x6f, 0x68, 0xa4, 0x7a, 0x7b,
+    0x96, 0xac, 0x6d, 0x76, 0x6a, 0xab, 0x82, 0x7d, 0x71, 0x8d, 0x6b, 0x81,
+    0x6c, 0x9d, 0x71, 0x59, 0x5c, 0x71, 0x77, 0x6d, 0x6a, 0x96, 0x76, 0x69,
+    0x80, 0x83, 0x88, 0x70, 0x97, 0xb4, 0x8a, 0x6c, 0xa5, 0x6e, 0x64, 0x75,
+    0x73, 0xa2, 0x7f, 0x97, 0x9e, 0x75, 0x8f, 0x86, 0x68, 0xbb, 0x6b, 0x86,
+    0x8d, 0x80, 0x8e, 0x58, 0x6d, 0xb2, 0x76, 0x99, 0x8f, 0x70, 0x6c, 0x86,
+    0x78, 0x9e, 0x91, 0x90, 0xa2, 0x7c, 0x8c, 0x81, 0x80, 0xb4, 0x77, 0x7a,
+    0x8c, 0x5f, 0x85, 0x56, 0x7a, 0x93, 0x6b, 0x5c, 0x74, 0x59, 0x7e, 0x86,
+    0x8c, 0xae, 0x76, 0x7d, 0x76, 0x7e, 0x81, 0x5f, 0x81, 0x8e, 0x7b, 0x90,
+    0xaa, 0x99, 0x79, 0x89, 0x93, 0xbc, 0x86, 0x91, 0xa2, 0x88, 0x79, 0x82,
+    0x80, 0xb6, 0x4a, 0x93, 0x7b, 0x89, 0x75, 0x8d, 0x7a, 0x8d, 0x66, 0x7c,
+    0x81, 0x9f, 0x6e, 0x86, 0x4d, 0x82, 0x66, 0x88, 0x73, 0x89, 0x7d, 0xac,
+    0x89, 0x9f, 0x58, 0x7f, 0x6b, 0x8c, 0x6a, 0x82, 0x59, 0xb8, 0x83, 0x67,
+    0x8b, 0x8a, 0x84, 0x7b, 0x7f, 0xb5, 0x44, 0x57, 0x5a, 0x73, 0x8b, 0x6d,
+    0x7c, 0x9e, 0x71, 0x72, 0x8d, 0x93, 0x80, 0x60, 0x7f, 0xc5, 0x69, 0x5c,
+    0x67, 0x92, 0x6c, 0x75, 0x66, 0x8f, 0x91, 0x5a, 0x6c, 0x70, 0x90, 0x84,
+    0x88, 0xab, 0x90, 0x66, 0x9c, 0x64, 0x6e, 0x68, 0x92, 0x9e, 0x89, 0x8d,
+    0x82, 0x97, 0x77, 0x75, 0x7f, 0xa7, 0x91, 0x75, 0x8c, 0x89, 0xa4, 0x6b,
+    0x98, 0x99, 0x80, 0x7d, 0x6b, 0x7f, 0x7d, 0x88, 0x79, 0xa1, 0x87, 0x90,
+    0x81, 0x8e, 0x94, 0x96, 0x7d, 0xa8, 0x86, 0x84, 0x86, 0x79, 0x97, 0x6e,
+    0xaa, 0x95, 0x8a, 0x9f, 0x8c, 0x72, 0x99, 0x77, 0x81, 0x94, 0x91, 0x9f,
+    0x6e, 0x67, 0x87, 0x70, 0x7d, 0xad, 0x58, 0x7f, 0x6d, 0x96, 0x8e, 0x82,
+    0x7d, 0xa6, 0x77, 0x99, 0x87, 0x95, 0x89, 0x7e, 0xa6, 0x9e, 0x86, 0xac,
+    0x78, 0x9f, 0x9b, 0x85, 0x76, 0x99, 0x6a, 0x92, 0x66, 0x7b, 0x9a, 0x99,
+    0x83, 0x8b, 0x57, 0x65, 0x75, 0x9f, 0xa6, 0x8a, 0x8d, 0x96, 0x6f, 0x80,
+    0x65, 0x8f, 0x80, 0x9f, 0x82, 0x85, 0x55, 0x75, 0x5c, 0x84, 0x91, 0x86,
+    0x76, 0x96, 0x5a, 0x6c, 0x62, 0x7b, 0x92, 0x88, 0x61, 0xca, 0x75, 0x66,
+    0x70, 0x70, 0x8e, 0x7a, 0x75, 0xb2, 0x66, 0x81, 0x5b, 0x79, 0x92, 0x97,
+    0x94, 0xaf, 0x72, 0x8a, 0x9b, 0x5f, 0x65, 0x96, 0x81, 0xb6, 0x8a, 0x6f,
+    0x94, 0x7a, 0x96, 0x92, 0x79, 0x94, 0x8e, 0x53, 0x9a, 0x73, 0x6a, 0x9d,
+    0xa1, 0xa3, 0xa4, 0x8f, 0x6b, 0xa4, 0x8b, 0x82, 0x96, 0xb1, 0x8c, 0x92,
+    0x7f, 0x91, 0x5f, 0x98, 0x8a, 0xa4, 0x7e, 0x80, 0x97, 0x86, 0x86, 0x86,
+    0x8f, 0xa6, 0x77, 0x9a, 0x82, 0x80, 0x6e, 0x73, 0x83, 0xaf, 0x87, 0x6d,
+    0x77, 0x9a, 0x83, 0x9f, 0x7c, 0xa4, 0x71, 0x6f, 0x7d, 0x75, 0x9d, 0x82,
+    0x83, 0xaf, 0x85, 0x80, 0x8d, 0x7f, 0xa4, 0xa2, 0x88, 0xba, 0x76, 0x76,
+    0x94, 0x6b, 0x76, 0x83, 0x77, 0x96, 0x78, 0x8c, 0xb0, 0x8e, 0x83, 0x87,
+    0xa0, 0xcc, 0x7f, 0xa4, 0x8c, 0x77, 0x84, 0x8c, 0x80, 0xa0, 0x57, 0x76,
+    0x76, 0x71, 0x86, 0x9c, 0x7f, 0x88, 0x57, 0x95, 0x4d, 0x8c, 0x7f, 0x80,
+    0x66, 0x9e, 0x42, 0x8d, 0x6a, 0x8e, 0x8c, 0x80, 0x89, 0x9d, 0x4f, 0x83,
+    0x54, 0x8a, 0x5e, 0x64, 0x70, 0x94, 0x78, 0x90, 0x7d, 0x78, 0x8d, 0x71,
+    0x56, 0x9a, 0x8c, 0x65, 0x8b, 0x62, 0x88, 0x9a, 0x6c, 0x8e, 0x7b, 0x78,
+    0x68, 0x86, 0x64, 0x6b, 0x67, 0xaa, 0x8c, 0x7b, 0x67, 0x75, 0x58, 0x7e,
+    0x6b, 0x97, 0x92, 0x87, 0x9c, 0x79, 0x71, 0x76, 0x7d, 0xbb, 0x89, 0x75,
+    0x83, 0x57, 0x74, 0x98, 0xa1, 0x8f, 0xb0, 0x89, 0x76, 0x88, 0x69, 0x9c,
+    0x74, 0xb0, 0x86, 0x9c, 0x79, 0x6f, 0x84, 0x70, 0x94, 0xa1, 0x6e, 0x7a,
+    0xa3, 0x88, 0xa0, 0x7a, 0x94, 0xa1, 0x82, 0x93, 0x99, 0x95, 0x7f, 0xab,
+    0x97, 0x9d, 0x6e, 0x68, 0x79, 0x73, 0x76, 0x83, 0x76, 0xbd, 0x87, 0x87,
+    0x86, 0x74, 0x8f, 0x6e, 0x65, 0xba, 0x6a, 0x78, 0x91, 0x62, 0x72, 0x67,
+    0x75, 0xbd, 0x8c, 0x5e, 0x85, 0x6d, 0x72, 0x85, 0x7d, 0x96, 0x8f, 0xb9,
+    0x9f, 0x97, 0xa2, 0x8a, 0xa1, 0xc1, 0x8d, 0xbc, 0x85, 0x78, 0x93, 0x97,
+    0x99, 0x9f, 0x3a, 0x98, 0x65, 0x8d, 0x6a, 0x6c, 0x92, 0x85, 0x49, 0x7e,
+    0x6a, 0xaa, 0x8a, 0x94, 0x6b, 0x93, 0x40, 0x8a, 0x8c, 0x9c, 0x6f, 0xad,
+    0x72, 0xb0, 0x58, 0x88, 0x60, 0x8c, 0x86, 0x84, 0x74, 0x96, 0x8f, 0x97,
+    0x5e, 0x6c, 0x79, 0x92, 0x51, 0xa8, 0x92, 0x58, 0x62, 0x6f, 0x6c, 0x76,
+    0x5f, 0x9e, 0x86, 0x71, 0x9c, 0x69, 0x7e, 0x80, 0x8a, 0x97, 0x6f, 0x79,
+    0x8b, 0x6f, 0x6c, 0x88, 0x73, 0x9c, 0x6d, 0x91, 0x77, 0x73, 0x7f, 0x97,
+    0x86, 0xa9, 0xac, 0x71, 0x82, 0x90, 0x83, 0x8a, 0x80, 0x9d, 0xa8, 0x85,
+    0x78, 0x7f, 0x94, 0x99, 0x8e, 0xa3, 0x89, 0x70, 0x87, 0x62, 0x82, 0x87,
+    0x8c, 0x98, 0x7a, 0x88, 0x72, 0x7e, 0x78, 0xa0, 0x78, 0x95, 0x97, 0x8f,
+    0x7b, 0x7c, 0x83, 0x94, 0x93, 0xa7, 0x77, 0x97, 0x90, 0x5e, 0x76, 0x7c,
+    0x68, 0xaa, 0x69, 0x67, 0x76, 0x84, 0x7e, 0x64, 0xa3, 0xbe, 0x7e, 0x8b,
+    0x82, 0x50, 0x8a, 0x82, 0x89, 0xc0, 0x79, 0x78, 0x68, 0x7c, 0x6b, 0x77,
+    0x82, 0x99, 0x7b, 0x83, 0x80, 0x90, 0x96, 0x96, 0x87, 0xb7, 0xa5, 0x94,
+    0x82, 0x99, 0x95, 0x91, 0x7e, 0xa2, 0x49, 0x95, 0x6d, 0x8e, 0xa9, 0x89,
+    0x8e, 0x8f, 0x3d, 0x95, 0x6a, 0x8c, 0x8b, 0x8c, 0x7e, 0x88, 0x63, 0x94,
+    0x69, 0x94, 0x88, 0x92, 0x79, 0xa7, 0x68, 0x60, 0x76, 0x85, 0xa1, 0x6f,
+    0x54, 0x96, 0x63, 0x7a, 0x5c, 0x73, 0x74, 0x6e, 0x53, 0x99, 0x69, 0x76,
+    0x69, 0x57, 0x6a, 0x82, 0x55, 0x93, 0x82, 0x80, 0x65, 0x7f, 0x7b, 0x76,
+    0x72, 0x87, 0x8d, 0x97, 0x98, 0x78, 0x7e, 0x6d, 0x7a, 0x95, 0x78, 0x70,
+    0x90, 0x83, 0x89, 0x80, 0x7f, 0x9d, 0x73, 0x73, 0x84, 0x77, 0x8e, 0x77,
+    0x8e, 0x75, 0x9e, 0xa5, 0x86, 0x68, 0x89, 0x7d, 0x8d, 0x99, 0x79, 0x8f,
+    0x8e, 0x87, 0x87, 0x97, 0x8c, 0x91, 0xa1, 0x96, 0x83, 0x73, 0x87, 0xa9,
+    0x8c, 0xa6, 0x85, 0x8c, 0x96, 0x7d, 0x7f, 0x8e, 0x7e, 0xb0, 0x85, 0x8f,
+    0x7f, 0x7d, 0x95, 0x7d, 0x9c, 0xb3, 0x71, 0x86, 0x81, 0x69, 0x7b, 0x69,
+    0x76, 0xb6, 0x5d, 0x67, 0x8a, 0x68, 0x9c, 0xa6, 0x70, 0xbf, 0x79, 0x60,
+    0x8b, 0x7f, 0x7a, 0x7b, 0x8b, 0xaf, 0x8c, 0xa1, 0x86, 0x92, 0x76, 0x8d,
+    0x89, 0xa2, 0xa8, 0xa3, 0xa0, 0xa2, 0x96, 0x9d, 0x7c, 0x92, 0x3f, 0x9b,
+    0x6d, 0x8a, 0x80, 0x81, 0xa0, 0x92, 0x50, 0x7c, 0x82, 0x99, 0x80, 0xa6,
+    0x8e, 0x8d, 0x4f, 0x8d, 0x65, 0x71, 0x77, 0x81, 0x51, 0xa6, 0x3f, 0x5c,
+    0x63, 0x6f, 0x61, 0x93, 0x5c, 0xaa, 0x77, 0x8f, 0x5d, 0x53, 0x79, 0x74,
+    0x6b, 0x94, 0x86, 0x81, 0x85, 0x48, 0x81, 0x80, 0x6b, 0x85, 0x6c, 0x91,
+    0x92, 0x6a, 0x74, 0x78, 0x72, 0x87, 0x6c, 0x82, 0x88, 0x7b, 0x93, 0x71,
+    0x91, 0x8d, 0x67, 0x83, 0x86, 0x5b, 0x86, 0x79, 0x81, 0x9f, 0x95, 0x8a,
+    0x70, 0x66, 0x9e, 0x6b, 0x72, 0x98, 0x97, 0x95, 0x72, 0x93, 0x84, 0x92,
+    0x8c, 0x96, 0xa2, 0x65, 0x80, 0x75, 0xa2, 0xa7, 0x7d, 0x97, 0x71, 0x8f,
+    0x69, 0x65, 0x8f, 0xae, 0x9c, 0x97, 0x5d, 0xb3, 0x98, 0x83, 0x98, 0xa0,
+    0x5f, 0x7e, 0x7a, 0x7a, 0x87, 0x7c, 0x92, 0xa0, 0x81, 0xa6, 0x71, 0x8e,
+    0x88, 0x52, 0xa3, 0x88, 0x6a, 0x9d, 0x84, 0x82, 0x7c, 0x78, 0x9f, 0x92,
+    0x66, 0xa4, 0x53, 0x6a, 0x7e, 0x84, 0x60, 0x84, 0x92, 0xb0, 0x93, 0x9d,
+    0xa0, 0x5f, 0x95, 0x8c, 0x77, 0xa1, 0x8c, 0x90, 0xa0, 0x9c, 0x9a, 0x95,
+    0x85, 0xa1, 0x22, 0x8f, 0x57, 0x80, 0x96, 0x7d, 0x92, 0x8b, 0x41, 0xa6,
+    0x61, 0xa2, 0x6f, 0x80, 0x5d, 0x91, 0x66, 0xab, 0x6d, 0x7e, 0x88, 0x93,
+    0x5c, 0xa5, 0x75, 0x6e, 0x6c, 0x86, 0x69, 0x73, 0x4e, 0x8e, 0x77, 0x6b,
+    0x6c, 0x60, 0x67, 0x91, 0x75, 0x91, 0x6c, 0x7c, 0x53, 0x6e, 0x75, 0x8e,
+    0x79, 0x8c, 0x8b, 0x74, 0x6b, 0x57, 0x71, 0xa1, 0x7f, 0x83, 0x6c, 0x6b,
+    0x93, 0x99, 0x7a, 0x78, 0x71, 0x8c, 0x78, 0x88, 0x9f, 0x85, 0x77, 0x7b,
+    0x86, 0x85, 0xa1, 0x61, 0x78, 0x65, 0x61, 0x75, 0x82, 0x7d, 0xa9, 0xa2,
+    0x84, 0x82, 0x94, 0x95, 0x90, 0x9f, 0x83, 0x97, 0x76, 0x95, 0x8a, 0x83,
+    0x9b, 0x87, 0x8b, 0x7a, 0x6c, 0x6e, 0x75, 0x95, 0x85, 0x95, 0x84, 0x9e,
+    0x96, 0x74, 0x7d, 0xa5, 0x85, 0x8e, 0x7e, 0x73, 0x85, 0x8d, 0x87, 0x80,
+    0x8a, 0x96, 0x65, 0x87, 0x7c, 0x73, 0x80, 0x96, 0x73, 0x8d, 0x5e, 0x79,
+    0x7e, 0x8d, 0x79, 0x85, 0x63, 0xa0, 0x62, 0x89, 0x9d, 0x8c, 0x74, 0x7b,
+    0x9c, 0xa5, 0x71, 0x8c, 0x83, 0x91, 0x8e, 0x8d, 0x89, 0x8b, 0x8b, 0xa4,
+    0x78, 0x88, 0x9e, 0x85, 0x8b, 0x94, 0x38, 0x84, 0x7b, 0x86, 0x7d, 0xa2,
+    0x73, 0x8f, 0x47, 0x7b, 0x69, 0xb4, 0x85, 0x71, 0x61, 0x9d, 0x59, 0x95,
+    0x74, 0x93, 0x6a, 0x88, 0x62, 0xa2, 0x56, 0x93, 0x8d, 0x68, 0x7e, 0x80,
+    0x6b, 0xb7, 0x63, 0x90, 0x5d, 0x54, 0x6c, 0x90, 0x5a, 0x8e, 0x7e, 0x7d,
+    0x82, 0x73, 0x7f, 0x89, 0x94, 0x8e, 0x7a, 0x70, 0x6c, 0x79, 0x88, 0x88,
+    0x9b, 0x8b, 0x70, 0x81, 0x83, 0x83, 0x8b, 0x86, 0x64, 0x93, 0x82, 0x66,
+    0x66, 0x79, 0x74, 0x91, 0x92, 0x94, 0x7c, 0x87, 0x72, 0x79, 0x8d, 0xaa,
+    0xa2, 0x9e, 0xaf, 0x95, 0xb1, 0x8a, 0x95, 0x8b, 0x94, 0x7e, 0x79, 0x8e,
+    0x99, 0x98, 0x97, 0x9e, 0x94, 0x87, 0x74, 0x72, 0x63, 0x92, 0x92, 0x95,
+    0xb0, 0x94, 0x86, 0x91, 0x77, 0x8f, 0x91, 0x7e, 0x83, 0x88, 0x90, 0xa5,
+    0x79, 0x70, 0x85, 0x8f, 0x67, 0x90, 0x98, 0x8d, 0x8a, 0x5d, 0x8c, 0x9c,
+    0x94, 0x91, 0x80, 0x95, 0x6e, 0x95, 0x73, 0x8d, 0x63, 0x8e, 0x53, 0x8a,
+    0x77, 0x88, 0x8f, 0x6f, 0x87, 0x9e, 0x8b, 0xb7, 0x99, 0xb2, 0x85, 0x82,
+    0xa1, 0x89, 0x9b, 0xa7, 0x80, 0x81, 0xa0, 0x8e, 0x84, 0xa9, 0x27, 0x73,
+    0x5e, 0x85, 0x5f, 0x92, 0x8c, 0xa2, 0x34, 0x8e, 0x6e, 0xb2, 0x7b, 0x8c,
+    0x69, 0x93, 0x47, 0x9e, 0x58, 0x7e, 0x94, 0x86, 0x47, 0xa3, 0x53, 0x6b,
+    0x6e, 0x6a, 0x7f, 0x73, 0x5b, 0x8c, 0x7a, 0x99, 0x6c, 0x5d, 0x82, 0x82,
+    0x62, 0x8a, 0x7a, 0x8e, 0x88, 0x62, 0xa0, 0x8e, 0x5c, 0x9a, 0x72, 0x79,
+    0x66, 0x6b, 0x75, 0x78, 0x82, 0x8a, 0x59, 0x91, 0x93, 0x68, 0x78, 0xb4,
+    0x86, 0x7e, 0x8c, 0x6e, 0x88, 0x7f, 0x96, 0x8e, 0x6e, 0x8b, 0x8c, 0x73,
+    0xab, 0x79, 0x88, 0xa6, 0x86, 0x81, 0x9a, 0x80, 0x9a, 0x9e, 0x8b, 0x6d,
+    0x9a, 0x70, 0x8e, 0x8a, 0x84, 0x7a, 0xaf, 0xb8, 0x9e, 0x90, 0x89, 0xb3,
+    0x9b, 0x85, 0x94, 0xb6, 0x87, 0x8c, 0x6e, 0xa3, 0xac, 0x9e, 0x8c, 0x7c,
+    0x81, 0x83, 0x70, 0x8d, 0x7c, 0x81, 0x77, 0x82, 0x69, 0x8e, 0x5e, 0x80,
+    0x8a, 0x8e, 0x7c, 0x8a, 0x89, 0x90, 0x58, 0x59, 0x85, 0x88, 0x7a, 0x86,
+    0x73, 0x9c, 0x4a, 0x81, 0x8d, 0x89, 0x91, 0x95, 0x72, 0x83, 0x9d, 0x99,
+    0x8d, 0x6b, 0x95, 0x7e, 0x70, 0x94, 0x8c, 0x9f, 0x8a, 0x8f, 0xa7, 0x84,
+    0x87, 0xb6, 0x42, 0x81, 0x63, 0x8a, 0x79, 0x77, 0x74, 0x90, 0x23, 0x85,
+    0x74, 0x8f, 0x87, 0x80, 0x50, 0xa1, 0x4d, 0x9b, 0x55, 0x82, 0x74, 0x8e,
+    0x4a, 0xa7, 0x52, 0x4d, 0x77, 0x67, 0x77, 0x9e, 0x62, 0xa5, 0x7d, 0x96,
+    0x6f, 0x45, 0x80, 0x8c, 0x6c, 0x92, 0x99, 0x6f, 0x5d, 0x56, 0x93, 0xac,
+    0x94, 0x9c, 0x95, 0x92, 0x6e, 0x71, 0x87, 0x8c, 0x7b, 0xa9, 0x7f, 0x7a,
+    0x69, 0x6b, 0x7d, 0x90, 0x6f, 0x81, 0x9f, 0x80, 0x83, 0x67, 0x78, 0x85,
+    0x85, 0x91, 0x8a, 0x80, 0xaa, 0x86, 0x8c, 0x88, 0x8c, 0x8f, 0x9b, 0x85,
+    0x8b, 0x7e, 0x83, 0x82, 0x95, 0x75, 0x6b, 0x8f, 0x85, 0x8b, 0xb0, 0x9f,
+    0xa7, 0x8e, 0x61, 0x9d, 0x72, 0xac, 0x92, 0x87, 0x94, 0x96, 0x68, 0x8f,
+    0x63, 0x85, 0x9c, 0xa8, 0x82, 0x9b, 0x85, 0x9b, 0x6b, 0x72, 0x83, 0x85,
+    0x90, 0x87, 0x74, 0xa4, 0x88, 0x57, 0x63, 0x90, 0x8e, 0x7b, 0x80, 0x81,
+    0x94, 0x74, 0x68, 0x8a, 0x7f, 0x86, 0x78, 0x72, 0x75, 0x67, 0x7a, 0x8a,
+    0x7a, 0x74, 0x8c, 0xad, 0x75, 0xa2, 0x7d, 0x9a, 0x9e, 0x83, 0x92, 0xa2,
+    0xa3, 0x98, 0xa5, 0x91, 0x84, 0xb0, 0x21, 0x9a, 0x5f, 0x8c, 0x7e, 0x86,
+    0x80, 0xa0, 0x16, 0x9b, 0x5b, 0x9c, 0x76, 0x8d, 0x77, 0x9f, 0x62, 0x86,
+    0x6a, 0x6c, 0x6e, 0x8f, 0x4e, 0xc1, 0x61, 0x6f, 0x74, 0x79, 0x80, 0x5f,
+    0x59, 0x9e, 0x7c, 0x87, 0x7f, 0x4b, 0x6c, 0x8b, 0x5a, 0x8f, 0x65, 0x8a,
+    0x62, 0x58, 0x66, 0x8d, 0x83, 0x97, 0x8a, 0x7a, 0x77, 0x79, 0x6c, 0x83,
+    0x8c, 0x93, 0x82, 0x5e, 0x61, 0x8c, 0x82, 0x80, 0x88, 0x88, 0x85, 0x87,
+    0x77, 0x70, 0x8d, 0x7f, 0x7a, 0x89, 0x72, 0x7e, 0xa3, 0x99, 0x6b, 0xaa,
+    0x81, 0x87, 0x90, 0x6f, 0x7f, 0x77, 0x96, 0x83, 0x89, 0x89, 0x6a, 0x77,
+    0xa4, 0x6c, 0x97, 0x7e, 0x95, 0xa4, 0x63, 0x8d, 0x71, 0x96, 0x8a, 0xa4,
+    0x9f, 0x7c, 0x54, 0x94, 0x7a, 0x89, 0x8a, 0x90, 0x7e, 0x9d, 0x53, 0x7c,
+    0x9d, 0x83, 0x90, 0x84, 0xa1, 0x8e, 0x80, 0x74, 0x69, 0x7a, 0x69, 0x93,
+    0x8a, 0x90, 0x83, 0x76, 0x8b, 0x6f, 0x8e, 0x93, 0x82, 0x84, 0x7d, 0x94,
+    0xa1, 0x78, 0x7d, 0x68, 0x79, 0x83, 0x85, 0x9d, 0x89, 0xa0, 0x8a, 0x93,
+    0x90, 0x8c, 0x82, 0x86, 0x80, 0x71, 0xb3, 0xa1, 0x90, 0xb2, 0x27, 0xa3,
+    0x5e, 0xa3, 0xa6, 0x64, 0x75, 0xa0, 0x23, 0x8c, 0x7c, 0xc4, 0x7a, 0x8c,
+    0x4d, 0xa3, 0x4c, 0x93, 0x71, 0x7b, 0x71, 0x8b, 0x34, 0xa5, 0x47, 0x7f,
+    0x4e, 0x73, 0x51, 0x8a, 0x67, 0xa0, 0x9d, 0x7f, 0x65, 0x38, 0x61, 0x70,
+    0x71, 0x8d, 0x6a, 0x7e, 0x7e, 0x4c, 0x7d, 0x8d, 0x81, 0x80, 0xa5, 0x84,
+    0x6f, 0x57, 0x70, 0x91, 0x8b, 0x99, 0x9d, 0x84, 0x77, 0x7f, 0x6b, 0x7f,
+    0x76, 0x8f, 0x90, 0x72, 0x6c, 0x58, 0x6b, 0x85, 0xa6, 0x8a, 0xa2, 0x6d,
+    0x8a, 0x71, 0x71, 0x95, 0x92, 0x7c, 0x88, 0x67, 0x86, 0x6d, 0x8d, 0x95,
+    0x79, 0x8e, 0x65, 0x71, 0x71, 0x91, 0x85, 0x99, 0xa9, 0x87, 0x80, 0x88,
+    0x74, 0x86, 0x75, 0x83, 0x8b, 0x7f, 0x78, 0xb1, 0x90, 0xa8, 0x7b, 0x98,
+    0x8a, 0x7b, 0x5b, 0x99, 0x6f, 0x7f, 0xa0, 0x79, 0xa5, 0x93, 0x8b, 0x7b,
+    0x7e, 0x7a, 0x61, 0x9d, 0x98, 0x8b, 0x82, 0x7c, 0x76, 0x73, 0x81, 0x8a,
+    0x7e, 0x8d, 0x6e, 0x71, 0xa0, 0x65, 0x80, 0x62, 0x7d, 0x8d, 0x5e, 0x9b,
+    0x8f, 0x85, 0x89, 0xad, 0x71, 0x73, 0x7f, 0x89, 0x8d, 0x89, 0xb3, 0xa1,
+    0x7c, 0xaf, 0x43, 0x82, 0x49, 0x92, 0x62, 0x7f, 0x79, 0xa6, 0x23, 0x99,
+    0x6c, 0x9a, 0x8a, 0x90, 0x6c, 0xb9, 0x6f, 0x8a, 0x61, 0x7f, 0x8f, 0x8a,
+    0x57, 0xb9, 0x55, 0x65, 0x4b, 0x51, 0x66, 0x6e, 0x4a, 0xa1, 0x83, 0x8a,
+    0x73, 0x23, 0x8a, 0x6d, 0x46, 0xa7, 0x87, 0x64, 0x84, 0x5f, 0x6f, 0x6f,
+    0x9b, 0x9d, 0x76, 0x83, 0x60, 0x6e, 0x76, 0x8a, 0x9a, 0xa6, 0x75, 0x73,
+    0x86, 0x5b, 0x97, 0x88, 0x7b, 0x8e, 0x82, 0x5c, 0x97, 0x71, 0x74, 0x85,
+    0x83, 0x91, 0x89, 0x6f, 0x93, 0x94, 0x8b, 0xa9, 0x7d, 0x84, 0x80, 0x89,
+    0x97, 0x80, 0x65, 0x92, 0x9a, 0x85, 0x5a, 0x6a, 0x6b, 0x58, 0x6f, 0x8c,
+    0x9a, 0x8b, 0x6e, 0x81, 0x9d, 0xae, 0x8c, 0x86, 0x8d, 0x90, 0x6c, 0xb8,
+    0x91, 0x89, 0x98, 0xbd, 0x8b, 0x78, 0x7d, 0x87, 0x9c, 0x72, 0x73, 0x80,
+    0x9e, 0x92, 0x5d, 0x77, 0x78, 0x4f, 0x87, 0x7b, 0x7a, 0x9e, 0x74, 0x67,
+    0x6a, 0x58, 0x95, 0x80, 0x75, 0x97, 0x81, 0x75, 0x94, 0x75, 0x73, 0x92,
+    0x83, 0x7b, 0x6b, 0x8e, 0x82, 0x6e, 0x7d, 0x9b, 0x91, 0x7f, 0x9e, 0xaa,
+    0x8c, 0xa3, 0xa8, 0x8c, 0x9a, 0xc1, 0x28, 0xac, 0x49, 0x9b, 0x59, 0x8a,
+    0x60, 0xa7, 0x39, 0xa7, 0x75, 0x9b, 0x95, 0x94, 0x76, 0xb3, 0x4a, 0x6b,
+    0x60, 0x6c, 0xa5, 0x71, 0x40, 0xc4, 0x4c, 0x7c, 0x76, 0x7b, 0x67, 0x76,
+    0x76, 0xa4, 0x7b, 0x83, 0x67, 0x4d, 0x87, 0x87, 0x6e, 0x93, 0x84, 0x70,
+    0x78, 0x41, 0x87, 0x9f, 0x7a, 0x8c, 0x87, 0x69, 0x73, 0x6c, 0x93, 0x73,
+    0x77, 0xa2, 0x52, 0x72, 0x5c, 0x75, 0x6c, 0x8f, 0x65, 0x92, 0x87, 0x52,
+    0x67, 0x54, 0x54, 0x75, 0x90, 0x9c, 0x91, 0x6f, 0xa3, 0x86, 0x87, 0x9c,
+    0x99, 0x86, 0x9f, 0x71, 0x8a, 0x7a, 0x7a, 0x97, 0x7a, 0x86, 0x6c, 0x99,
+    0x89, 0x7e, 0x9c, 0x83, 0x98, 0x78, 0x73, 0x7f, 0x91, 0x96, 0x9a, 0x8d,
+    0xb0, 0x9e, 0x6a, 0x80, 0x92, 0x86, 0x95, 0x83, 0x94, 0x92, 0x6f, 0x86,
+    0x8a, 0x52, 0x6e, 0x82, 0x84, 0x8b, 0x77, 0x88, 0x70, 0x54, 0x8f, 0x7f,
+    0x7d, 0x7e, 0x57, 0x89, 0x6d, 0x6f, 0x9c, 0x93, 0x90, 0x93, 0x52, 0x70,
+    0x75, 0x92, 0x73, 0x88, 0x93, 0x77, 0x77, 0x91, 0x89, 0xa2, 0x9d, 0xa6,
+    0xae, 0x84, 0x7d, 0xab, 0x92, 0x7e, 0x9c, 0x98, 0x7b, 0xc3, 0x38, 0x98,
+    0x4f, 0x97, 0x8f, 0x93, 0x62, 0xb8, 0x23, 0xa4, 0x6d, 0x9c, 0x81, 0x8e,
+    0x6f, 0x9d, 0x56, 0x89, 0x50, 0x94, 0x70, 0x77, 0x5d, 0xb7, 0x60, 0x5b,
+    0x72, 0x45, 0x81, 0x8c, 0x66, 0xbc, 0x8f, 0x7f, 0x57, 0x43, 0x85, 0x96,
+    0x5a, 0xb2, 0x91, 0x7d, 0x6c, 0x3a, 0x73, 0x92, 0x63, 0x93, 0x89, 0x90,
+    0x7f, 0x52, 0x7f, 0x7b, 0xa1, 0xa6, 0x8f, 0x60, 0x78, 0x51, 0x5f, 0xac,
+    0x7b, 0x89, 0x88, 0x97, 0x7e, 0x64, 0x57, 0x72, 0x6c, 0x96, 0x74, 0x78,
+    0xab, 0x66, 0x62, 0x8d, 0x6f, 0x86, 0x91, 0x93, 0x7d, 0x74, 0x82, 0x80,
+    0x73, 0x84, 0x9c, 0x8e, 0x68, 0x69, 0x9e, 0xa1, 0x8a, 0x83, 0x7a, 0x87,
+    0x94, 0x8c, 0x83, 0x7e, 0x91, 0x92, 0x82, 0x7b, 0xa0, 0x8e, 0x73, 0x86,
+    0xa9, 0x95, 0x7c, 0xa5, 0x6c, 0x6f, 0x8c, 0x87, 0xa6, 0x8a, 0x77, 0x86,
+    0x7d, 0x79, 0x89, 0x75, 0x8f, 0x82, 0x54, 0x61, 0x82, 0x8e, 0x80, 0x84,
+    0x7b, 0x8e, 0x61, 0x82, 0x86, 0x77, 0x7d, 0x7c, 0x7e, 0x6c, 0x7b, 0xad,
+    0x7b, 0x90, 0x88, 0x80, 0x64, 0x83, 0x7e, 0xa7, 0x83, 0x7e, 0xb5, 0xbb,
+    0x88, 0xd9, 0x21, 0x9a, 0x4d, 0x9f, 0x91, 0x97, 0x64, 0xb5, 0x1c, 0x8a,
+    0x5f, 0xaf, 0x7e, 0x7b, 0x67, 0xad, 0x48, 0x7f, 0x4e, 0x87, 0x8f, 0x7c,
+    0x46, 0xab, 0x70, 0x7f, 0x4b, 0x4e, 0x48, 0x8c, 0x63, 0xc5, 0xa2, 0x7f,
+    0x68, 0x3b, 0x59, 0x7f, 0x53, 0xa1, 0x8e, 0x6e, 0x7a, 0x4a, 0x5f, 0x62,
+    0x5b, 0xa1, 0x62, 0x78, 0x74, 0x57, 0x78, 0x91, 0x7b, 0x9b, 0x75, 0x73,
+    0x73, 0x72, 0x94, 0x92, 0x79, 0xaa, 0x94, 0x75, 0x86, 0x58, 0x8c, 0x71,
+    0x77, 0x91, 0xa5, 0x74, 0x8f, 0x73, 0x89, 0x77, 0x68, 0x8e, 0x90, 0x96,
+    0x9f, 0x79, 0x77, 0x7d, 0x89, 0x9b, 0x8c, 0x94, 0x81, 0x88, 0x91, 0x8f,
+    0x9b, 0x91, 0x78, 0x87, 0x82, 0x72, 0xa7, 0xa2, 0x85, 0x98, 0xa3, 0x91,
+    0x83, 0x75, 0x72, 0x93, 0x80, 0x8f, 0x85, 0x70, 0x97, 0x58, 0x9f, 0x72,
+    0x91, 0x8e, 0x93, 0x74, 0x97, 0x73, 0x74, 0x91, 0x80, 0x84, 0x96, 0x94,
+    0x76, 0x69, 0x66, 0x9e, 0x81, 0x8a, 0x8b, 0x63, 0x65, 0x7c, 0xa1, 0x9a,
+    0x72, 0x84, 0x9e, 0x89, 0x9a, 0x86, 0x98, 0x7f, 0x77, 0x85, 0x82, 0xaa,
+    0xa3, 0x88, 0xac, 0x9e, 0x76, 0xca, 0x2b, 0xa0, 0x40, 0xad, 0x6f, 0x6c,
+    0x66, 0xc8, 0x07, 0x9e, 0x3e, 0x9f, 0x85, 0x9f, 0x5e, 0xb7, 0x53, 0x91,
+    0x56, 0x6d, 0x62, 0x95, 0x4c, 0xc7, 0x46, 0x56, 0x4b, 0x5d, 0x6f, 0x52,
+    0x4d, 0xa3, 0x8c, 0x90, 0x78, 0x4d, 0x58, 0x8d, 0x53, 0x93, 0x8e, 0x68,
+    0x6f, 0x3b, 0x49, 0x86, 0x6e, 0x9d, 0x76, 0x74, 0x5b, 0x44, 0x7b, 0x8c,
+    0x89, 0xb0, 0x64, 0x62, 0x6a, 0x6d, 0x7a, 0xae, 0x84, 0x95, 0x8c, 0x71,
+    0x8b, 0x60, 0x82, 0x9e, 0x8c, 0xa8, 0x90, 0x66, 0xa1, 0x7b, 0x65, 0x82,
+    0x8f, 0x7d, 0x8d, 0x78, 0x8e, 0x5f, 0x75, 0x88, 0x5d, 0x93, 0xa1, 0x93,
+    0x6b, 0x67, 0x7a, 0xa7, 0x92, 0x8c, 0x65, 0x88, 0x95, 0x93, 0x87, 0x81,
+    0x9c, 0x97, 0x62, 0x9d, 0x90, 0x62, 0xa1, 0x9f, 0x87, 0x94, 0x94, 0x99,
+    0x92, 0x8f, 0x71, 0x80, 0x77, 0x82, 0x92, 0x78, 0x67, 0x69, 0x7e, 0x81,
+    0x93, 0x89, 0x80, 0x9b, 0x71, 0x57, 0x63, 0x83, 0x7b, 0x9f, 0x5d, 0x92,
+    0x85, 0x96, 0x7e, 0x92, 0x84, 0x7f, 0x81, 0xa3, 0xa8, 0x96, 0x91, 0x8e,
+    0x8c, 0x8e, 0x7d, 0xb0, 0x86, 0x72, 0x9d, 0x8e, 0x8e, 0xd0, 0x05, 0x77,
+    0x45, 0xad, 0x91, 0x95, 0x71, 0xb8, 0x01, 0x9a, 0x41, 0xb8, 0x94, 0x6e,
+    0x63, 0xd3, 0x58, 0x8c, 0x5a, 0x89, 0x85, 0x83, 0x52, 0xc1, 0x7b, 0x6a,
+    0x65, 0x6e, 0x73, 0x63, 0x68, 0xba, 0x67, 0x78, 0x79, 0x4a, 0x73, 0x8f,
+    0x51, 0xc9, 0x85, 0x8a, 0x6b, 0x45, 0x6a, 0x8f, 0x6c, 0xad, 0x8a, 0x8d,
+    0x6a, 0x6e, 0x6b, 0x7f, 0x86, 0xb4, 0x88, 0x7d, 0xaa, 0x71, 0x5c, 0x69,
+    0x5d, 0xa8, 0x62, 0x7d, 0x6c, 0x6e, 0x6f, 0x6a, 0x7c, 0x9d, 0x7a, 0x83,
+    0x7d, 0x79, 0x7b, 0x9c, 0x73, 0x93, 0x7f, 0x9d, 0x8c, 0x75, 0x78, 0x83,
+    0x85, 0x88, 0x81, 0x81, 0x98, 0x79, 0xa3, 0xae, 0x5b, 0x90, 0x89, 0x9d,
+    0x6d, 0x90, 0xa3, 0x8e, 0x87, 0x96, 0x60, 0xa7, 0x76, 0x82, 0x81, 0x84,
+    0x84, 0x9c, 0x73, 0x8a, 0x6c, 0x58, 0x64, 0x96, 0x89, 0x8b, 0x76, 0x60,
+    0x91, 0x72, 0x7f, 0x86, 0x9a, 0x89, 0x67, 0x7d, 0x77, 0x84, 0x73, 0x5c,
+    0x67, 0x8a, 0x82, 0x8c, 0x8c, 0x94, 0x8a, 0xa2, 0xaa, 0x7e, 0x5f, 0x7f,
+    0x86, 0x90, 0x96, 0xab, 0x8d, 0x91, 0x7c, 0xb6, 0x82, 0x8d, 0xb8, 0xa9,
+    0x92, 0xea, 0x1b, 0x74, 0x25, 0xab, 0x8d, 0x61, 0x81, 0xd8, 0x2c, 0x86,
+    0x2f, 0xcf, 0xa2, 0x84, 0x7f, 0xa4, 0x36, 0x86, 0x47, 0x8d, 0x60, 0x8a,
+    0x62, 0xb1, 0x4a, 0x54, 0x48, 0x73, 0x64, 0x9d, 0x72, 0xb2, 0x76, 0x4c,
+    0x8e, 0x4e, 0x76, 0x94, 0x7c, 0xad, 0x74, 0x6c, 0x6c, 0x54, 0x7f, 0x63,
+    0x97, 0xb3, 0x74, 0x6c, 0x99, 0x5f, 0x86, 0x6a, 0xa3, 0x94, 0x7c, 0x83,
+    0x8d, 0x81, 0x79, 0xac, 0x61, 0x9b, 0x65, 0x7b, 0x66, 0x89, 0x60, 0x76,
+    0x8d, 0x93, 0x8d, 0x84, 0x71, 0x65, 0x82, 0x8c, 0x94, 0xa7, 0x59, 0xa1,
+    0x8b, 0x72, 0x84, 0x65, 0x75, 0x95, 0x62, 0x71, 0x71, 0x7e, 0x7b, 0x97,
+    0x9b, 0x9a, 0x80, 0xb1, 0x77, 0x7a, 0x73, 0x8e, 0x9c, 0x8c, 0x7d, 0x96,
+    0x89, 0x7d, 0x7e, 0x80, 0x8e, 0x93, 0x63, 0x72, 0x6b, 0x57, 0x78, 0x8f,
+    0x90, 0x86, 0x62, 0x75, 0x7e, 0x54, 0x7d, 0x95, 0x85, 0x84, 0x73, 0x7b,
+    0x8f, 0x9e, 0x72, 0x8c, 0x90, 0x96, 0x8e, 0x6c, 0x80, 0x8b, 0x9e, 0x8c,
+    0x87, 0x8e, 0x9b, 0x97, 0x8f, 0x94, 0xa3, 0x6b, 0xad, 0x93, 0x8a, 0x96,
+    0x8d, 0x91, 0xa6, 0x8a, 0x9e, 0xce, 0x6b, 0x98, 0x6d, 0xa9, 0x92, 0x92,
+    0x7c, 0xe2, 0x63, 0x97, 0x42, 0xc8, 0xa3, 0xa0, 0x88, 0xdc, 0x75, 0x9b,
+    0x51, 0x7d, 0x5c, 0x80, 0x89, 0xc0, 0x83, 0x5e, 0x5e, 0xa4, 0x3e, 0x74,
+    0x9b, 0xb6, 0x7f, 0x63, 0x78, 0x7d, 0x74, 0x57, 0x93, 0xa2, 0x83, 0x70,
+    0x5e, 0x7d, 0x60, 0x69, 0x93, 0x9e, 0x79, 0x86, 0x91, 0x67, 0x86, 0x95,
+    0xa2, 0xad, 0x62, 0x74, 0x68, 0x7e, 0x7e, 0x82, 0x8c, 0xb0, 0xa0, 0x63,
+    0x8b, 0x82, 0x8f, 0x8c, 0xa4, 0xa3, 0x76, 0x6c, 0x8e, 0x87, 0x72, 0x85,
+    0xaa, 0xa4, 0x7f, 0x7b, 0x8e, 0x9a, 0x69, 0x91, 0x9d, 0xa0, 0x81, 0x92,
+    0x90, 0x85, 0x66, 0x82, 0xa3, 0xa9, 0x7f, 0x8f, 0x83, 0x9d, 0x8b, 0x8d,
+    0x96, 0xa3, 0x8f, 0x7a, 0x6d, 0x89, 0x74, 0x8a, 0xa9, 0xa9, 0x7b, 0x77,
+    0x93, 0x8b, 0x63, 0x92, 0x99, 0x8b, 0x88, 0x4f, 0x87, 0x7c, 0x67, 0x78,
+    0x83, 0xa5, 0xa5, 0x58, 0x8d, 0x70, 0x86, 0x82, 0x9e, 0xa7, 0xa5, 0x96,
+    0x8d, 0x7b, 0x96, 0x8c, 0x95, 0xa3, 0x8d, 0x9c, 0x92, 0x95, 0x98, 0x94,
+    0x87, 0x90, 0x92, 0x92, 0x95, 0x96, 0xad, 0x6e, 0x97, 0x8c, 0x92, 0x7f,
+    0x95, 0x8b, 0x8a, 0x90, 0x9b, 0x87, 0x9e, 0x86, 0x91, 0xa0, 0x68, 0x82,
+    0x85, 0x8e, 0x82, 0xa8, 0x9f, 0x68, 0x87, 0x75, 0x9b, 0x70, 0x95, 0x91,
+    0x6c, 0x77, 0x8b, 0x7b, 0x95, 0x80, 0x99, 0x65, 0x95, 0x82, 0x92, 0x9a,
+    0x8a, 0x65, 0x70, 0x8c, 0x98, 0x9e, 0x80, 0x7b, 0xa5, 0x9b, 0x93, 0x94,
+    0x84, 0x6a, 0x69, 0x82, 0x80, 0x7a, 0x75, 0x72, 0x94, 0x79, 0xad, 0xb2,
+    0x81, 0x8b, 0x85, 0x6c, 0x86, 0x88, 0x9e, 0x79, 0x86, 0x9e, 0x7e, 0x91,
+    0x7b, 0x6d, 0x93, 0x91, 0x82, 0x97, 0x6b, 0xa6, 0xaa, 0x9f, 0xa8, 0x74,
+    0x94, 0x7f, 0x63, 0x98, 0x90, 0xa1, 0x8c, 0x7f, 0x71, 0x86, 0x89, 0x95,
+    0x88, 0x80, 0x77, 0x67, 0x85, 0x7d, 0x89, 0x6d, 0x9c, 0x76, 0x72, 0x8d,
+    0x96, 0x94, 0x88, 0x98, 0x9f, 0x94, 0x8e, 0x84, 0x7a, 0x88, 0x79, 0x9f,
+    0x81, 0xa1, 0x7c, 0x8b, 0x71, 0x79, 0x7d, 0x9d, 0x7b, 0x6a, 0x8c, 0x66,
+    0x9e, 0x7b, 0x77, 0x7a, 0xb0, 0x74, 0x7f, 0x8d, 0x8d, 0x71, 0x72, 0x84,
+    0x90, 0x98, 0x7b, 0x89, 0x9b, 0x8e, 0x85, 0x7a, 0x67, 0x8a, 0x72, 0x84,
+    0x82, 0x91, 0x91, 0x7a, 0x85, 0x8a, 0xae, 0x8a, 0x9a, 0x9a, 0x7f, 0x85,
+    0x8a, 0x90, 0x69, 0x7b, 0x76, 0x78, 0x98, 0x54, 0x94, 0x7e, 0x6c, 0x72,
+    0x89, 0x88, 0x82, 0x96, 0x59, 0x95, 0x76, 0x91, 0x94, 0x96, 0x83, 0x84,
+    0x72, 0x8d, 0x97, 0x71, 0x68, 0x8e, 0x88, 0x8b, 0x7c, 0xa9, 0x73, 0x8a,
+    0x95, 0x86, 0x87, 0x96, 0x91, 0x77, 0xb1, 0x88, 0x6e, 0x7d, 0x7c, 0x9f,
+    0x8f, 0x82, 0x79, 0x83, 0xa6, 0x81, 0x89, 0x83, 0x85, 0x9b, 0x7c, 0x68,
+    0x6f, 0x84, 0x7c, 0xa1, 0x8e, 0x80, 0x78, 0x8f, 0x96, 0x77, 0x7e, 0x7b,
+    0x8f, 0x81, 0xa5, 0x84, 0x86, 0x91, 0x7b, 0x73, 0x92, 0x85, 0xa3, 0x7e,
+    0x80, 0x95, 0x7d, 0x5f, 0x8c, 0x94, 0x95, 0x73, 0x95, 0x78, 0x87, 0xa1,
+    0x94, 0x6c, 0xac, 0x6c, 0x77, 0x89, 0x86, 0x9c, 0x82, 0x76, 0x99, 0x93,
+    0x92, 0x88, 0x80, 0x80, 0x85, 0x8a, 0xa8, 0x8f, 0x7a, 0x89, 0x9a, 0x7a,
+    0x8f, 0x91, 0x86, 0x82, 0x7f, 0x82, 0x91, 0x95, 0x85, 0x71, 0x7d, 0x8f,
+    0x83, 0x8c, 0x79, 0x97, 0x7a, 0x9b, 0x91, 0x88, 0xa2, 0x86, 0x8a, 0x80,
+    0xa0, 0x96, 0x8b, 0x7d, 0x76, 0x96, 0x9f, 0x8d, 0x95, 0x8a, 0x94, 0xa0,
+    0x80, 0x95, 0x9b, 0x96, 0x81, 0xa8, 0x59, 0x89, 0x92, 0xb2, 0x83, 0x89,
+    0x85, 0x81, 0x7e, 0x64, 0x77, 0x82, 0x90, 0x96, 0x7e, 0x9f, 0xab, 0x8a,
+    0x6e, 0x9b, 0x90, 0x89, 0x6e, 0x7d, 0x81, 0x65, 0x81, 0x86, 0xa1, 0x93,
+    0x8b, 0x83, 0x81, 0x89, 0x8b, 0x90, 0x7e, 0x97, 0x8e, 0x75, 0x7e, 0x7e,
+    0x7b, 0x81, 0x9a, 0x64, 0x90, 0xab, 0x90, 0x82, 0x8a, 0x82, 0x8d, 0xad,
+    0x90, 0x74, 0x7f, 0x9a, 0x88, 0x92, 0x83, 0x97, 0xa6, 0x6e, 0x9d, 0x81,
+    0xa2, 0x98, 0x74, 0x84, 0x93, 0x85, 0x84, 0x7d, 0xa2, 0x92, 0x92, 0x87,
+    0x73, 0x8b, 0x92, 0x74, 0x96, 0x70, 0x83, 0x86, 0x8a, 0x89, 0x86, 0x88,
+    0x87, 0x7c, 0x7d, 0x81, 0x8d, 0x71, 0x8c, 0x89, 0x70, 0x94, 0x8f, 0x9a,
+    0x83, 0x9d, 0x99, 0x78, 0x74, 0x88, 0x84, 0x9a, 0x95, 0x8b, 0x8e, 0x7f,
+    0xa2, 0xa0, 0x76, 0x93, 0x9b, 0x7c, 0x97, 0x81, 0x83, 0x8c, 0xa1, 0x99,
+    0x9d, 0x7f, 0x87, 0x75, 0xa7, 0x75, 0x89, 0x7e, 0x88, 0x80, 0x8f, 0x84,
+    0x9a, 0x77, 0x8d, 0x90, 0x9d, 0x6c, 0x88, 0x8d, 0x8e, 0x81, 0x97, 0x6d,
+    0x81, 0x88, 0x64, 0x8c, 0x77, 0x8e, 0x91, 0x8a, 0x7f, 0x8a, 0x94, 0x7a,
+    0x89, 0x93, 0x8c, 0x69, 0x85, 0x8c, 0x93, 0x61, 0x7e, 0x89, 0x7e, 0x8a,
+    0x65, 0x8a, 0xa9, 0x7f, 0x80, 0x86, 0x82, 0x90, 0x66, 0x7a, 0x99, 0x71,
+    0x7f, 0x73, 0x8d, 0x94, 0x7d, 0x73, 0x7a, 0x7d, 0x87, 0x7a, 0x97, 0x70,
+    0x81, 0x60, 0x61, 0x7a, 0x91, 0x88, 0x93, 0x7a, 0x9e, 0xa6, 0x92, 0x9d,
+    0x92, 0x67, 0x99, 0x9a, 0xae, 0x71, 0x89, 0xa5, 0x9f, 0xa6, 0x98, 0x89,
+    0x97, 0x90, 0x9b, 0x9a, 0xc0, 0x95, 0x8f, 0x9c, 0x95, 0x93, 0x88, 0x95,
+    0x95, 0xa0, 0x8e, 0x8c, 0xa8, 0x94, 0x6e, 0x9e, 0x6f, 0x7b, 0xa5, 0x96,
+    0x98, 0x90, 0x91, 0x89, 0x93, 0x8f, 0x84, 0xb2, 0x7f, 0x5e, 0xc2, 0x75,
+    0x8f, 0x90, 0x9c, 0xbf, 0x8a, 0x84, 0xa6, 0x85, 0x7d, 0x84, 0x8a, 0xad,
+    0x6f, 0x88, 0xac, 0x77, 0x91, 0x8d, 0x94, 0xac, 0x8f, 0x7f, 0xa1, 0xa5,
+    0x8e, 0x6d, 0x8a, 0x82, 0x85, 0x80, 0x9b, 0x7a, 0x9f, 0x60, 0x95, 0x97,
+    0x90, 0x67, 0x8f, 0x91, 0x86, 0x89, 0x88, 0x89, 0x96, 0x6c, 0x8b, 0x94,
+    0x8a, 0x75, 0x84, 0x96, 0x8a, 0x86, 0x7c, 0x91, 0x74, 0x8f, 0x97, 0x89,
+    0x8f, 0x8e, 0x6b, 0x97, 0x93, 0x89, 0x6b, 0x7e, 0x65, 0xa4, 0xa5, 0x63,
+    0x85, 0x88, 0x81, 0xa3, 0x70, 0x9b, 0x9e, 0x8c, 0x62, 0x73, 0x85, 0xb4,
+    0x88, 0x6e, 0x92, 0x6f, 0x91, 0x88, 0x79, 0x91, 0x7f, 0x7d, 0x9a, 0x6b,
+    0x78, 0x93, 0x7e, 0x79, 0x93, 0x7a, 0x74, 0x91, 0x8d, 0x92, 0xb3, 0x61,
+    0xa3, 0x76, 0x81, 0x99, 0x96, 0x8b, 0x93, 0x8f, 0xa7, 0x6f, 0x8f, 0xa6,
+    0xb2, 0x76, 0xa1, 0x83, 0xa8, 0x8b, 0xae, 0x99, 0x90, 0x6a, 0x97, 0x97,
+    0xaa, 0x95, 0x85, 0x7d, 0x97, 0x94, 0x86, 0x94, 0x89, 0xa4, 0xa9, 0x81,
+    0x89, 0x7c, 0x96, 0xb3, 0x92, 0x7d, 0xa4, 0x6f, 0x6d, 0x92, 0x83, 0xb4,
+    0x7b, 0x94, 0x8c, 0x79, 0x61, 0x6f, 0x8f, 0xb7, 0x88, 0x66, 0xaa, 0x7d,
+    0x89, 0x7f, 0x90, 0xbd, 0x99, 0xac, 0xb1, 0x96, 0x9c, 0x7c, 0x92, 0xb7,
+    0x73, 0x94, 0xad, 0x9d, 0x7c, 0x80, 0x87, 0x96, 0x73, 0x8d, 0xa8, 0x88,
+    0xa9, 0x83, 0x7b, 0x84, 0x9d, 0x99, 0x83, 0x89, 0x9d, 0x7f, 0x7e, 0x86,
+    0x75, 0x83, 0x77, 0x7d, 0x8b, 0x7d, 0x80, 0x9d, 0xa2, 0x94, 0x72, 0x92,
+    0x75, 0x95, 0x99, 0xa0, 0x7b, 0x83, 0x99, 0x89, 0x82, 0x92, 0x5b, 0x9e,
+    0x7c, 0x91, 0x95, 0x79, 0x61, 0x86, 0x60, 0xc7, 0x72, 0x91, 0xb5, 0x88,
+    0x71, 0x8d, 0x85, 0x91, 0x83, 0x74, 0xa8, 0x67, 0x79, 0x77, 0x7f, 0x79,
+    0x68, 0x84, 0x95, 0x69, 0x98, 0x88, 0x74, 0x72, 0x9c, 0x86, 0x87, 0x95,
+    0x90, 0x95, 0x9b, 0x8b, 0xc5, 0x7d, 0x81, 0x8f, 0x88, 0x8c, 0xb0, 0x95,
+    0xa8, 0x8c, 0x84, 0xa0, 0xb0, 0x89, 0x9a, 0x90, 0xaa, 0x88, 0x96, 0x9b,
+    0x88, 0xa9, 0x89, 0x99, 0xb7, 0x82, 0x99, 0xa0, 0x85, 0x70, 0x9c, 0x9a,
+    0x94, 0x74, 0x91, 0x81, 0x76, 0x70, 0x8f, 0xc2, 0x8c, 0x91, 0x8f, 0x69,
+    0x74, 0x7e, 0x6d, 0x9a, 0x80, 0x77, 0xa5, 0x94, 0x8b, 0x6d, 0x82, 0xcf,
+    0x8e, 0x74, 0xc4, 0x86, 0x7f, 0x78, 0x72, 0xb3, 0x78, 0x7a, 0xac, 0x9c,
+    0x7d, 0x77, 0x8d, 0xca, 0x67, 0x8c, 0xd5, 0x8f, 0x7f, 0x71, 0x70, 0x82,
+    0x7e, 0x9f, 0xb0, 0x7f, 0x75, 0x90, 0x79, 0x7b, 0x8d, 0x7b, 0xa6, 0x87,
+    0x98, 0x76, 0x84, 0x96, 0x81, 0x6a, 0x96, 0x86, 0x8e, 0x77, 0xa3, 0x83,
+    0x91, 0x83, 0x8a, 0x6c, 0x74, 0x83, 0x99, 0x7d, 0x7c, 0x8a, 0x88, 0x9a,
+    0x6b, 0x86, 0x59, 0xa3, 0x8a, 0x8e, 0xbb, 0x8a, 0x75, 0x78, 0x68, 0xb5,
+    0x9b, 0x7b, 0xa7, 0x93, 0x5b, 0x6c, 0x6b, 0xa0, 0x74, 0x99, 0xc0, 0x73,
+    0x8b, 0x7e, 0x8e, 0x83, 0x64, 0x7c, 0x7d, 0x7a, 0x98, 0x7d, 0x82, 0x7c,
+    0x8f, 0x7e, 0x74, 0x86, 0xa9, 0x84, 0xba, 0x8f, 0xc7, 0x6f, 0x87, 0xae,
+    0x97, 0x91, 0xad, 0x82, 0xb2, 0x70, 0x8a, 0xa0, 0xb0, 0x7d, 0x95, 0x8d,
+    0xc2, 0x85, 0x80, 0xad, 0x9f, 0x85, 0x8b, 0x76, 0xaa, 0xab, 0x8f, 0xa0,
+    0x89, 0x9b, 0x8a, 0xb3, 0xa0, 0x72, 0xbe, 0x8c, 0x93, 0x7a, 0xa0, 0xad,
+    0x99, 0x6f, 0xa2, 0x79, 0x78, 0x8b, 0x6d, 0xae, 0x75, 0x6f, 0xa1, 0x8d,
+    0x68, 0x81, 0x74, 0xb3, 0x8f, 0x81, 0xc6, 0x96, 0x77, 0x68, 0x85, 0xaf,
+    0x86, 0x9f, 0xbb, 0x8a, 0x7e, 0x8a, 0x86, 0xab, 0x8b, 0x87, 0x94, 0x96,
+    0x99, 0x82, 0x6a, 0xaa, 0x7b, 0x81, 0xa6, 0x9b, 0xb6, 0x73, 0x78, 0x9a,
+    0x8f, 0xaa, 0x93, 0x81, 0x97, 0x7a, 0x72, 0x82, 0x79, 0x81, 0x7c, 0x88,
+    0x8e, 0x79, 0x9d, 0x81, 0x9a, 0x75, 0x9b, 0x89, 0x73, 0x6a, 0xa6, 0x84,
+    0x5c, 0x6f, 0xa0, 0x9d, 0x81, 0x84, 0x3e, 0xaf, 0x94, 0xa1, 0xb8, 0x93,
+    0x81, 0x89, 0x68, 0xd4, 0x87, 0x99, 0x99, 0x95, 0x79, 0x72, 0x81, 0xa1,
+    0x78, 0x7d, 0x8f, 0x7e, 0x87, 0x78, 0x8e, 0x97, 0x7e, 0x96, 0x86, 0x86,
+    0x97, 0x74, 0x6f, 0x7d, 0xa5, 0x81, 0x6f, 0x8e, 0x9e, 0x8b, 0xad, 0xac,
+    0xbd, 0x75, 0x84, 0xa2, 0x93, 0x76, 0xc7, 0x9e, 0xb0, 0x75, 0x89, 0xa4,
+    0x95, 0x92, 0xb5, 0xaa, 0xb9, 0x7d, 0x79, 0xa5, 0x88, 0x70, 0x84, 0x70,
+    0xa3, 0x81, 0xa1, 0xa6, 0x8f, 0x96, 0x96, 0x8d, 0xa5, 0x83, 0xb2, 0x8f,
+    0x88, 0x74, 0x96, 0xbc, 0x8b, 0x81, 0xa4, 0x85, 0x7c, 0x87, 0x64, 0xb4,
+    0x80, 0x88, 0x92, 0x90, 0x78, 0x79, 0x77, 0xa5, 0x79, 0x8b, 0xbd, 0x7d,
+    0x84, 0x8c, 0x96, 0xd4, 0x78, 0x81, 0xa4, 0x8c, 0x97, 0x89, 0x78, 0xc4,
+    0x9f, 0x94, 0xb9, 0x83, 0x76, 0x78, 0x89, 0x86, 0x81, 0x8f, 0xbd, 0xa7,
+    0x88, 0x79, 0x8e, 0x92, 0x86, 0x88, 0xad, 0x8a, 0x7b, 0x7f, 0x80, 0xad,
+    0x7a, 0xaf, 0x8a, 0x93, 0xa6, 0x84, 0x92, 0x8e, 0x84, 0x99, 0x80, 0xae,
+    0x74, 0x7c, 0x95, 0x9c, 0x7b, 0x84, 0x84, 0x84, 0xa4, 0x82, 0x57, 0xb5,
+    0x95, 0xc1, 0xb7, 0xa0, 0x85, 0x7b, 0x69, 0xc3, 0xb1, 0x8e, 0xa0, 0x8e,
+    0x81, 0x88, 0x78, 0x9e, 0x81, 0x97, 0xb2, 0x74, 0x81, 0x84, 0x91, 0x87,
+    0x6f, 0x6f, 0x75, 0x78, 0x92, 0x7a, 0x6d, 0x80, 0x9a, 0x7e, 0x81, 0xa1,
+    0xa8, 0x6d, 0xb5, 0x98, 0xb4, 0x7f, 0x9a, 0xa4, 0x9d, 0x7b, 0xba, 0xaa,
+    0xce, 0x93, 0x79, 0xa5, 0x81, 0x95, 0xa6, 0x7f, 0x8c, 0x8b, 0x96, 0xa4,
+    0xa1, 0x8d, 0x91, 0x97, 0xce, 0x8e, 0x8e, 0x9d, 0x86, 0x7f, 0x97, 0xa3,
+    0x99, 0x75, 0xa3, 0xa0, 0x69, 0x6a, 0x87, 0xa0, 0x9a, 0x80, 0xa2, 0x72,
+    0x6d, 0x85, 0x6b, 0x94, 0x8d, 0x77, 0x9f, 0x84, 0x7f, 0x92, 0x64, 0xaa,
+    0x78, 0x82, 0xa7, 0x8f, 0x84, 0x79, 0x84, 0xb9, 0x92, 0x7c, 0xb6, 0x96,
+    0x9c, 0x99, 0x8f, 0xab, 0xab, 0x8a, 0xa2, 0xab, 0x6d, 0x97, 0x7b, 0xb1,
+    0x9e, 0x6c, 0x9a, 0x99, 0xaa, 0xa3, 0x70, 0x80, 0x81, 0x6f, 0xb6, 0x95,
+    0x93, 0x93, 0x8e, 0x80, 0x86, 0xb0, 0x87, 0x91, 0x8f, 0x8c, 0xa4, 0x86,
+    0x89, 0x8f, 0x93, 0x83, 0x75, 0x7d, 0x9b, 0x86, 0x7d, 0x5a, 0x9d, 0x67,
+    0x9f, 0x78, 0x5c, 0xa5, 0x8e, 0xa2, 0xc1, 0x95, 0x89, 0x84, 0x53, 0xd1,
+    0x7d, 0x9b, 0xc0, 0x8f, 0x73, 0x7f, 0x85, 0x9e, 0x8a, 0x7b, 0xa6, 0x84,
+    0x6c, 0x74, 0x95, 0x93, 0x7a, 0x7a, 0x81, 0x7d, 0x89, 0x86, 0x76, 0x8a,
+    0xad, 0x66, 0x90, 0x90, 0x9d, 0x77, 0xb4, 0xad, 0xac, 0x8e, 0xb3, 0xa5,
+    0x9d, 0x91, 0xd7, 0x94, 0xba, 0x8b, 0x72, 0xa4, 0x93, 0x7e, 0xa7, 0x86,
+    0xae, 0x83, 0x63, 0xa6, 0xa0, 0x78, 0x81, 0x8b, 0xc4, 0x82, 0x8f, 0x98,
+    0xa1, 0x8f, 0x79, 0x9a, 0x92, 0x85, 0x9d, 0x91, 0x92, 0x84, 0x8f, 0x84,
+    0x91, 0x6d, 0x7b, 0x69, 0x75, 0x87, 0x5d, 0x99, 0x92, 0x83, 0xab, 0x8f,
+    0x53, 0x90, 0x7b, 0xa0, 0x71, 0x89, 0xc2, 0x7f, 0x6a, 0x7c, 0x86, 0xb2,
+    0x8d, 0x89, 0xaf, 0x9c, 0x81, 0x8c, 0x84, 0xbe, 0x93, 0x9c, 0xa8, 0x97,
+    0x68, 0x9b, 0x84, 0xa3, 0x8a, 0x77, 0xa5, 0x79, 0x7b, 0x87, 0x86, 0xa5,
+    0x80, 0x83, 0x9e, 0x8d, 0xb1, 0x94, 0x7a, 0x8b, 0xa6, 0xa8, 0x80, 0x98,
+    0x8c, 0x73, 0xa9, 0x7b, 0x91, 0x8f, 0x71, 0x82, 0x68, 0x84, 0xa5, 0x96,
+    0x67, 0x63, 0xa6, 0x71, 0xa7, 0x85, 0x57, 0x9f, 0x91, 0xb2, 0xa6, 0x87,
+    0x80, 0x8f, 0x6a, 0xba, 0x9d, 0xb7, 0xb9, 0x8b, 0x75, 0x7c, 0x6f, 0x9f,
+    0x74, 0x8d, 0xaf, 0x6e, 0x7c, 0x65, 0x6c, 0x8a, 0x7c, 0x81, 0x89, 0x77,
+    0x8b, 0x74, 0x65, 0x9b, 0xa5, 0x6b, 0x92, 0x71, 0xbb, 0x70, 0x99, 0xbf,
+    0xb0, 0x7b, 0x92, 0xb4, 0xa4, 0x84, 0xc4, 0x92, 0xa8, 0x94, 0x7e, 0xcd,
+    0x83, 0x87, 0xaf, 0xa0, 0xa5, 0x94, 0x72, 0xb9, 0x90, 0xa6, 0x9e, 0x9e,
+    0x9b, 0x7a, 0x68, 0xc0, 0x8f, 0x89, 0x72, 0x94, 0x9b, 0x81, 0x81, 0x91,
+    0x88, 0x90, 0xa8, 0x8d, 0x90, 0x78, 0x7c, 0x67, 0x64, 0x8e, 0x55, 0xa1,
+    0x6d, 0x86, 0xa3, 0x6f, 0x5c, 0x7d, 0x79, 0xa3, 0x64, 0x71, 0xd4, 0x87,
+    0x73, 0x85, 0x76, 0xc7, 0x72, 0x86, 0xb2, 0x8c, 0x7b, 0x8d, 0x96, 0xc3,
+    0xad, 0x87, 0xac, 0xa8, 0x84, 0x94, 0x7b, 0xbf, 0x83, 0x74, 0x8e, 0x8c,
+    0x9c, 0x99, 0x88, 0x8e, 0x86, 0x88, 0xae, 0x7f, 0x70, 0x96, 0x6f, 0x74,
+    0x8f, 0x85, 0x7c, 0x86, 0x97, 0x83, 0xa0, 0x6a, 0x8b, 0x82, 0x88, 0x90,
+    0x72, 0x84, 0x9b, 0xa1, 0x6f, 0x72, 0xa4, 0x95, 0xa6, 0x7d, 0x65, 0xbd,
+    0x90, 0xb6, 0x9e, 0x98, 0xa1, 0x94, 0x66, 0xb3, 0x9c, 0xb3, 0xa7, 0x7f,
+    0x91, 0x69, 0x6e, 0xb1, 0x68, 0x7a, 0xaa, 0x91, 0x7c, 0x71, 0x9f, 0x95,
+    0x83, 0x86, 0x76, 0x69, 0x9b, 0x7f, 0x8c, 0x94, 0x9c, 0x89, 0x86, 0x93,
+    0xc1, 0x79, 0x98, 0x9e, 0xb1, 0x90, 0x9b, 0xb7, 0xab, 0x86, 0xc6, 0xa1,
+    0xa9, 0xaa, 0x86, 0xb0, 0x8b, 0x79, 0xb9, 0x85, 0xbe, 0x92, 0x60, 0xc0,
+    0x9f, 0x9a, 0x90, 0x8d, 0xb5, 0x77, 0x95, 0xad, 0x8b, 0x93, 0x8a, 0x93,
+    0x93, 0x7e, 0x86, 0xa6, 0x7d, 0x89, 0x6b, 0x81, 0x93, 0x75, 0x7f, 0x86,
+    0x66, 0x8f, 0x56, 0x8f, 0x84, 0x75, 0x9e, 0x77, 0x78, 0x89, 0x62, 0xb3,
+    0x78, 0x76, 0xb5, 0x92, 0x7f, 0x80, 0x7a, 0xb9, 0x7d, 0x80, 0xc2, 0xb9,
+    0x7d, 0x8f, 0x8f, 0x8c, 0xa0, 0x78, 0xa2, 0xaf, 0x68, 0x98, 0x77, 0xac,
+    0x96, 0x77, 0x96, 0x99, 0x84, 0xb1, 0x72, 0x8e, 0x96, 0xa4, 0xa9, 0x8e,
+    0x84, 0x7b, 0x85, 0x8d, 0x8f, 0x83, 0x83, 0x7f, 0x85, 0x6e, 0xa4, 0x98,
+    0xab, 0x83, 0x90, 0x8e, 0x77, 0x8e, 0xab, 0x9c, 0x73, 0x79, 0x8d, 0x6e,
+    0xa0, 0x97, 0x68, 0xa7, 0x8a, 0xbd, 0x95, 0x96, 0x96, 0x8b, 0x72, 0xc7,
+    0x8d, 0x8c, 0xa5, 0x83, 0x9b, 0x8b, 0x6c, 0xac, 0x62, 0x78, 0xae, 0x78,
+    0x71, 0x7a, 0x8d, 0xae, 0x91, 0x87, 0x90, 0x82, 0x9b, 0x83, 0x90, 0x97,
+    0xb0, 0x96, 0x82, 0xa5, 0xa9, 0x76, 0xa5, 0xa0, 0xac, 0xa1, 0x93, 0x94,
+    0xb7, 0x91, 0xbb, 0x9b, 0xa4, 0xa5, 0x8c, 0xb5, 0x95, 0x7b, 0x92, 0x91,
+    0xb0, 0x97, 0x73, 0xb9, 0x86, 0xa7, 0x92, 0x98, 0x9e, 0x70, 0x77, 0xba,
+    0x96, 0x7b, 0xa6, 0x86, 0x97, 0x85, 0x8e, 0xaa, 0x93, 0x97, 0x8f, 0x8b,
+    0x8d, 0x79, 0x84, 0x7e, 0x70, 0x95, 0x52, 0x8f, 0x62, 0x75, 0x8b, 0x8b,
+    0x7b, 0x8b, 0x79, 0xaf, 0x90, 0x6d, 0xc8, 0x8d, 0x84, 0x8c, 0x72, 0xaf,
+    0x70, 0x8d, 0xa5, 0x8a, 0x76, 0x97, 0x87, 0x8e, 0xa9, 0x83, 0xb2, 0x8d,
+    0x7e, 0x9b, 0x76, 0xc2, 0xa2, 0x72, 0xc5, 0x87, 0x75, 0xb7, 0x92, 0x95,
+    0x9e, 0xa0, 0xc3, 0x82, 0x8d, 0x8f, 0x7d, 0x85, 0x90, 0x99, 0x7b, 0x82,
+    0x87, 0x87, 0xa0, 0x87, 0x9a, 0x8b, 0xa2, 0xa4, 0x67, 0x93, 0xa5, 0xbb,
+    0x73, 0x5f, 0x8c, 0x60, 0xa5, 0x7d, 0x6c, 0xb3, 0xb2, 0xb3, 0xa9, 0xa9,
+    0x8d, 0x8d, 0x67, 0xd7, 0x63, 0x99, 0xaa, 0x83, 0x88, 0x6a, 0x6f, 0x9e,
+    0x5e, 0x9e, 0x9d, 0x81, 0x84, 0x6e, 0x98, 0x90, 0x89, 0x7c, 0x95, 0x7d,
+    0x81, 0x8a, 0xa2, 0x8c, 0x92, 0x85, 0x80, 0x92, 0xac, 0x80, 0x9b, 0x9b,
+    0xc3, 0x8c, 0x95, 0xbc, 0xaa, 0x7c, 0xb5, 0x8d, 0xa1, 0xb8, 0x70, 0xb6,
+    0x8c, 0x92, 0xa8, 0x8e, 0xa3, 0x76, 0x6c, 0xbe, 0xa0, 0x8c, 0x92, 0x8e,
+    0xa1, 0x83, 0x76, 0xb2, 0x91, 0x7b, 0x8e, 0x87, 0x7f, 0x89, 0x8a, 0xa1,
+    0x91, 0xa0, 0x7a, 0x95, 0x7b, 0x86, 0x99, 0x92, 0x78, 0x8a, 0x62, 0x9e,
+    0x7b, 0x7b, 0x89, 0x79, 0x78, 0x87, 0x82, 0x94, 0x7d, 0x91, 0x96, 0x79,
+    0x7b, 0x8d, 0x80, 0xa7, 0x88, 0x95, 0xa6, 0x8f, 0x7d, 0x95, 0x79, 0xa2,
+    0x91, 0x9b, 0x9d, 0x90, 0x79, 0xa4, 0x88, 0x98, 0x9b, 0x7a, 0xa5, 0x7f,
+    0x71, 0x9c, 0x87, 0x96, 0x8c, 0x8f, 0xbc, 0x74, 0x95, 0x99, 0x7f, 0x78,
+    0x8c, 0x63, 0x7c, 0x7a, 0x92, 0x8c, 0xa8, 0x78, 0xa8, 0x89, 0x9a, 0x86,
+    0x69, 0x7e, 0xa1, 0xc3, 0x57, 0x68, 0x84, 0x89, 0xa9, 0x8d, 0x6f, 0xa9,
+    0x8a, 0xab, 0xa5, 0xad, 0x94, 0x83, 0x6b, 0xa7, 0x7e, 0x95, 0x9b, 0x7f,
+    0x8b, 0x78, 0x73, 0x90, 0x65, 0x8d, 0xb1, 0x91, 0x84, 0x65, 0x90, 0xb4,
+    0x8c, 0x89, 0x94, 0x7c, 0x99, 0x8b, 0x98, 0xb7, 0xb0, 0x91, 0x9e, 0x88,
+    0xbd, 0xa0, 0xa4, 0xb9, 0xad, 0x96, 0x97, 0xa3, 0xb6, 0x81, 0xba, 0x9b,
+    0xbc, 0xa9, 0x94, 0xb9, 0xa0, 0x85, 0x8e, 0xa1, 0xac, 0x87, 0x65, 0xa6,
+    0x98, 0x8e, 0xaa, 0xa3, 0xa3, 0x7f, 0x79, 0xb4, 0x93, 0x76, 0x90, 0x99,
+    0x8b, 0x90, 0x84, 0xa6, 0x90, 0x8f, 0x88, 0xa6, 0x89, 0x83, 0x86, 0x7a,
+    0x5d, 0x96, 0x71, 0xa5, 0x64, 0x94, 0x9a, 0x85, 0x7c, 0xa1, 0x96, 0x9d,
+    0x76, 0x8f, 0x95, 0xa0, 0x7f, 0x8c, 0x80, 0xc7, 0x6c, 0x7d, 0xb7, 0xb2,
+    0x82, 0x8e, 0x82, 0xbd, 0xb3, 0x82, 0x99, 0x9b, 0x80, 0x94, 0x8c, 0x94,
+    0x94, 0x6b, 0xc6, 0xa9, 0x81, 0x9f, 0x8c, 0x7e, 0x87, 0x88, 0xb3, 0x7d,
+    0x88, 0x8c, 0x81, 0x81, 0x7e, 0x7e, 0x86, 0x87, 0x96, 0x85, 0xb4, 0x87,
+    0xab, 0x91, 0x8f, 0xa1, 0x72, 0x83, 0xa4, 0x89, 0x6b, 0x75, 0x85, 0x7c,
+    0x94, 0x85, 0x6f, 0xad, 0x91, 0xae, 0xa4, 0xa5, 0xa7, 0x8e, 0x6c, 0xb2,
+    0x73, 0x99, 0x96, 0x92, 0x89, 0x81, 0x7d, 0x88, 0x60, 0x8d, 0x94, 0x83,
+    0x99, 0x68, 0x86, 0xa2, 0x94, 0x8e, 0x82, 0x76, 0x89, 0x8d, 0x98, 0x86,
+    0x94, 0x90, 0x83, 0x7d, 0xad, 0x94, 0xa6, 0x90, 0xcb, 0x96, 0xa2, 0xb2,
+    0xb6, 0x89, 0xc4, 0x9d, 0xc7, 0xa5, 0x75, 0xc3, 0x92, 0x8c, 0x8e, 0xad,
+    0x96, 0x94, 0x8e, 0xab, 0x94, 0x90, 0xa8, 0x84, 0xb5, 0x84, 0x66, 0xce,
+    0x74, 0x8c, 0x93, 0x8d, 0x8f, 0x95, 0x8b, 0xa1, 0x7b, 0xa1, 0x79, 0x9e,
+    0x81, 0xa4, 0xa0, 0x98, 0x5f, 0x78, 0x8e, 0x97, 0x6f, 0x81, 0x96, 0x8d,
+    0x70, 0x93, 0x72, 0x9c, 0x7b, 0x98, 0x8b, 0x8a, 0x8f, 0x8b, 0x6c, 0xa9,
+    0x81, 0x99, 0xb3, 0xa3, 0x71, 0x9c, 0x8b, 0x94, 0xa6, 0x8a, 0xb8, 0xa0,
+    0x7b, 0x98, 0x74, 0x9f, 0x92, 0x92, 0xb2, 0x89, 0x81, 0xa8, 0x87, 0x97,
+    0x96, 0x86, 0xa4, 0x7b, 0x63, 0x8e, 0x86, 0x7d, 0x76, 0x81, 0x93, 0x94,
+    0x98, 0x8b, 0xaf, 0x6d, 0xab, 0x9b, 0x85, 0x9b, 0x91, 0x86, 0x95, 0x95,
+    0x65, 0x89, 0x9e, 0x6b, 0xa4, 0x82, 0x68, 0xb5, 0x8b, 0xd1, 0x9d, 0x93,
+    0x7d, 0x67, 0x5e, 0xba, 0x9b, 0x94, 0x93, 0x8d, 0x88, 0x73, 0x7c, 0x8e,
+    0x7d, 0x83, 0x9a, 0x82, 0xa4, 0x62, 0x9a, 0x8d, 0x86, 0xa0, 0x7b, 0x72,
+    0xa9, 0x84, 0xa7, 0x94, 0xb2, 0x98, 0x8f, 0x81, 0xbe, 0x84, 0x9d, 0x94,
+    0x9c, 0x9a, 0x94, 0x8f, 0xb1, 0x82, 0xb1, 0x82, 0xb1, 0xb2, 0x78, 0xa7,
+    0x95, 0x99, 0x8b, 0x8c, 0xb1, 0x81, 0x5b, 0xbb, 0x88, 0x7a, 0x90, 0xa3,
+    0x8d, 0x78, 0x6f, 0xbf, 0x8c, 0x93, 0xa1, 0x8e, 0x9f, 0x98, 0x88, 0xb3,
+    0x7e, 0x82, 0x8a, 0x8e, 0x7d, 0x8a, 0x96, 0x6a, 0x6c, 0x7b, 0x91, 0x94,
+    0x6f, 0x89, 0x9a, 0x84, 0x73, 0x8b, 0x8c, 0x91, 0x7d, 0x8e, 0x9e, 0x80,
+    0x88, 0x81, 0x78, 0xaf, 0x86, 0xa5, 0xa2, 0x8d, 0x6a, 0x8a, 0x75, 0xa1,
+    0x83, 0x87, 0xaf, 0x7d, 0x6c, 0xa3, 0x65, 0x77, 0x89, 0x91, 0x9a, 0xa1,
+    0xa1, 0xaf, 0x78, 0x94, 0x93, 0xb2, 0xaf, 0x92, 0x74, 0x7a, 0xa7, 0x7b,
+    0x8f, 0x9c, 0x86, 0x8d, 0x8f, 0x79, 0xb0, 0xb3, 0x97, 0x82, 0x8e, 0x92,
+    0x92, 0x81, 0xa7, 0xbc, 0x6e, 0x6e, 0x89, 0xa5, 0x9a, 0x8d, 0x84, 0xb6,
+    0x83, 0xae, 0xa5, 0xa7, 0xae, 0x86, 0x6b, 0xb9, 0x89, 0xb0, 0x8f, 0x82,
+    0x8f, 0x6f, 0x83, 0x98, 0x6a, 0x98, 0x9a, 0x85, 0x9f, 0x78, 0x93, 0x8d,
+    0x83, 0x88, 0x88, 0x7e, 0x97, 0x99, 0x8a, 0x9b, 0xb0, 0x90, 0x86, 0x88,
+    0xb5, 0x90, 0xb3, 0xaa, 0xad, 0x96, 0x93, 0xa3, 0x9d, 0x81, 0xa3, 0x9a,
+    0x9f, 0x99, 0x90, 0x9c, 0x9e, 0x8e, 0x88, 0x93, 0xa8, 0x94, 0x62, 0xa6,
+    0x94, 0x92, 0xa1, 0x86, 0xb7, 0x8a, 0x6a, 0xa6, 0x81, 0x7e, 0x7b, 0x80,
+    0x89, 0x8f, 0x74, 0xa6, 0x72, 0x91, 0xa6, 0x9b, 0x73, 0x97, 0x7e, 0x6f,
+    0x70, 0x8d, 0x73, 0x98, 0x80, 0x90, 0x8f, 0x7e, 0x83, 0x77, 0x84, 0x92,
+    0x7f, 0x8c, 0x91, 0xa6, 0x99, 0x90, 0x9d, 0xb1, 0x88, 0x85, 0x89, 0x85,
+    0x7c, 0x9f, 0x7e, 0xb0, 0xaa, 0x84, 0xa0, 0x8e, 0x74, 0x93, 0x78, 0x90,
+    0x9a, 0x8b, 0x8e, 0x97, 0x8f, 0x9f, 0x7c, 0x83, 0x8a, 0x88, 0xa5, 0x8f,
+    0x8b, 0x74, 0x84, 0x9a, 0x7f, 0x91, 0x88, 0x77, 0x9c, 0x91, 0xbc, 0x93,
+    0x9c, 0x82, 0x89, 0x9b, 0x8a, 0x7d, 0xb7, 0xb8, 0x6f, 0x68, 0xb5, 0x8e,
+    0xb4, 0x86, 0x8c, 0xb3, 0x94, 0xb6, 0xa4, 0x93, 0x98, 0x8b, 0x70, 0xb3,
+    0x96, 0xaa, 0x87, 0x89, 0x99, 0x68, 0x74, 0xa4, 0x69, 0x9e, 0x8e, 0x6b,
+    0x9f, 0x6b, 0x95, 0x9c, 0x88, 0x89, 0x8a, 0x86, 0x8d, 0x75, 0x94, 0x88,
+    0xa0, 0x94, 0x77, 0x8c, 0x9c, 0x8d, 0x8e, 0xa4, 0xac, 0xa7, 0x8a, 0x9b,
+    0xa9, 0x81, 0xab, 0xac, 0xaf, 0xaf, 0x87, 0xbb, 0x9b, 0x95, 0x8e, 0x9e,
+    0x9f, 0xa1, 0x6c, 0xb4, 0x98, 0x8f, 0x81, 0x8d, 0x98, 0x8f, 0x78, 0x96,
+    0x89, 0x86, 0x6c, 0x91, 0x8d, 0x9f, 0x95, 0x9f, 0x6b, 0x7f, 0x93, 0x7c,
+    0x96, 0x8e, 0x8a, 0x58, 0x80, 0x8e, 0x7a, 0x93, 0x8b, 0x78, 0x99, 0x92,
+    0x62, 0x8e, 0x83, 0x8e, 0x87, 0x83, 0x86, 0x99, 0x93, 0x92, 0x80, 0x95,
+    0xa2, 0x72, 0xa2, 0x97, 0x78, 0x87, 0x7b, 0xa3, 0x99, 0x78, 0x98, 0x9c,
+    0x80, 0x9b, 0x5e, 0x8a, 0x9c, 0x99, 0xa6, 0x7a, 0x8e, 0x99, 0x7a, 0x8e,
+    0x8b, 0x76, 0x9b, 0x89, 0x80, 0x8e, 0x83, 0x8a, 0x80, 0x7c, 0x80, 0x74,
+    0x95, 0x8c, 0xbf, 0x7e, 0xa8, 0x7a, 0x99, 0x7d, 0x7d, 0x73, 0xb4, 0xae,
+    0x88, 0x76, 0xae, 0x78, 0xaa, 0x65, 0x94, 0xbe, 0x97, 0xaf, 0xa4, 0x91,
+    0x9c, 0x95, 0x6c, 0xbe, 0x82, 0xb1, 0x9b, 0x91, 0x85, 0x7d, 0x66, 0x9c,
+    0x99, 0xbd, 0xa3, 0x88, 0xa8, 0x73, 0x81, 0x94, 0x92, 0x8e, 0x90, 0x8d,
+    0xaf, 0x75, 0x86, 0x9b, 0x8b, 0x8b, 0x8d, 0x74, 0xbd, 0x85, 0x97, 0x8b,
+    0x9d, 0xba, 0x90, 0xa8, 0x9d, 0x72, 0xa5, 0xa8, 0xbf, 0xbb, 0x7b, 0xb6,
+    0xad, 0x94, 0x6f, 0x9a, 0xa7, 0x97, 0x78, 0x9c, 0x98, 0x8d, 0x8c, 0x93,
+    0xb8, 0xa8, 0x7f, 0x9d, 0x98, 0x7f, 0x8f, 0x8a, 0x8d, 0xa8, 0x86, 0x7b,
+    0x5d, 0x89, 0x8a, 0x83, 0x8c, 0x8b, 0x81, 0x56, 0x7c, 0x87, 0x89, 0xa6,
+    0x75, 0x7c, 0x92, 0x74, 0x96, 0x92, 0x78, 0x8d, 0x8d, 0x98, 0xae, 0x7a,
+    0x95, 0x8f, 0x8b, 0x9c, 0x95, 0x9f, 0xae, 0x93, 0x7b, 0x93, 0x8c, 0x9a,
+    0x79, 0x74, 0x94, 0x6e, 0x7e, 0x8f, 0x64, 0x9f, 0x9c, 0x88, 0x8f, 0x8e,
+    0x84, 0x8d, 0x89, 0x95, 0x96, 0x8f, 0x9d, 0x60, 0x85, 0x86, 0x7c, 0x93,
+    0x8d, 0x68, 0x83, 0x7c, 0x94, 0x87, 0xb8, 0xa2, 0x9d, 0x82, 0x8e, 0x84,
+    0x6c, 0x73, 0xa8, 0xbc, 0x84, 0x85, 0xa2, 0x79, 0x92, 0x64, 0x69, 0xa9,
+    0x82, 0xa7, 0x9d, 0x95, 0x8e, 0x6f, 0x9f, 0xa7, 0x97, 0xb1, 0x9d, 0x8e,
+    0xa1, 0x70, 0x80, 0x9e, 0x8e, 0x91, 0xa0, 0xaa, 0x81, 0x5b, 0x98, 0x8f,
+    0xa0, 0xaa, 0x83, 0x7a, 0x91, 0x7a, 0x73, 0x80, 0xa6, 0x9a, 0x80, 0x7d,
+    0x9e, 0x75, 0x7b, 0xa3, 0xad, 0x92, 0x98, 0xc0, 0xa1, 0x80, 0x88, 0xa2,
+    0xa5, 0xa4, 0x7e, 0x9b, 0xa0, 0x80, 0x6e, 0xa0, 0x9f, 0xa3, 0x8a, 0x8f,
+    0xa2, 0x93, 0x86, 0x8d, 0x8f, 0x93, 0x7e, 0x90, 0x98, 0x83, 0x7d, 0x9b,
+    0x9f, 0x9a, 0x97, 0x83, 0x6e, 0x8d, 0x94, 0x6c, 0x7b, 0x7f, 0x73, 0x65,
+    0x6a, 0x93, 0x8a, 0x94, 0x83, 0x89, 0x7d, 0x7b, 0x77, 0x8a, 0x7a, 0x9b,
+    0x8e, 0x8d, 0x94, 0x89, 0x86, 0x83, 0x7c, 0x8e, 0x8b, 0x90, 0xab, 0x99,
+    0x81, 0x8e, 0x77, 0x9c, 0x8c, 0x82, 0x97, 0x8f, 0x78, 0x91, 0x5f, 0xa1,
+    0x8b, 0x83, 0xa9, 0x8d, 0x7b, 0x97, 0x77, 0x80, 0x84, 0x7e, 0x9e, 0x75,
+    0xa3, 0x86, 0x67, 0x7c, 0x80, 0x6d, 0x77, 0x75, 0x88, 0x75, 0xad, 0x7a,
+    0x93, 0x89, 0x8c, 0x87, 0x7a, 0x79, 0xb2, 0xa1, 0x69, 0x80, 0xb5, 0x7a,
+    0xa6, 0x7b, 0x95, 0xac, 0x95, 0xa9, 0x98, 0xa4, 0xad, 0x83, 0x8d, 0xbe,
+    0xa4, 0x98, 0xad, 0x7d, 0x8b, 0x65, 0x65, 0xad, 0x6a, 0xae, 0xa3, 0xa8,
+    0x9c, 0x63, 0x90, 0x91, 0x6d, 0x9a, 0x81, 0x98, 0x86, 0x6a, 0x83, 0x84,
+    0x94, 0x9c, 0x77, 0x86, 0xc2, 0x7f, 0x9b, 0xa9, 0xad, 0xae, 0xa7, 0xa6,
+    0xd4, 0x70, 0x9d, 0xb5, 0xaa, 0xdb, 0x8f, 0xa3, 0xa5, 0x87, 0x88, 0x9e,
+    0xa9, 0x9f, 0x62, 0xa7, 0xa2, 0x8e, 0x7d, 0x8a, 0x9d, 0xa2, 0x6b, 0xa7,
+    0x96, 0x6d, 0x76, 0x8c, 0x9b, 0x8c, 0x86, 0x86, 0x93, 0x7c, 0x9d, 0x7c,
+    0x7e, 0x93, 0x5c, 0x79, 0x76, 0x8c, 0x8a, 0x87, 0x79, 0x97, 0x9a, 0x7a,
+    0x85, 0x8c, 0x7f, 0x85, 0x7a, 0xa1, 0xa7, 0x72, 0x87, 0x7f, 0x96, 0x9e,
+    0x92, 0x92, 0x9e, 0xa0, 0x72, 0x99, 0x7a, 0xb0, 0x8c, 0x8d, 0xa3, 0x9b,
+    0x91, 0xa6, 0x63, 0x94, 0x8b, 0x81, 0xbb, 0x94, 0x79, 0x95, 0x99, 0x9a,
+    0xa0, 0x7a, 0x96, 0x72, 0x82, 0x9a, 0x83, 0x7f, 0x72, 0x7f, 0x6d, 0x75,
+    0x91, 0x7f, 0xbc, 0x84, 0x9a, 0x81, 0x95, 0x69, 0x7d, 0x6d, 0xa2, 0xa8,
+    0x7e, 0x64, 0xac, 0x86, 0x85, 0x6d, 0x99, 0xaa, 0x7e, 0x79, 0x9c, 0xa0,
+    0xa4, 0x77, 0x99, 0xac, 0xa8, 0x8d, 0xb7, 0xa2, 0xa3, 0x61, 0x82, 0x98,
+    0x84, 0x8e, 0xa1, 0x8c, 0x88, 0x82, 0x6f, 0x7d, 0x88, 0x80, 0x7a, 0x8a,
+    0x8c, 0x6d, 0x87, 0x6f, 0xab, 0x8f, 0x8b, 0x76, 0xa0, 0x7d, 0x9f, 0xab,
+    0xb0, 0xb8, 0x9c, 0x8d, 0xb8, 0x81, 0x89, 0x94, 0xa8, 0xc8, 0x92, 0x9b,
+    0x8d, 0x83, 0x7b, 0xaf, 0x97, 0x94, 0x6e, 0xa5, 0x9b, 0x97, 0x89, 0x8d,
+    0xaa, 0x8a, 0x66, 0x88, 0x93, 0x84, 0xa1, 0x88, 0xa0, 0x99, 0x85, 0x89,
+    0x7d, 0x84, 0x8b, 0x6a, 0x92, 0xa1, 0x74, 0x76, 0x73, 0x87, 0x7a, 0x9a,
+    0x77, 0x86, 0x89, 0x5f, 0x7f, 0x8b, 0x7f, 0x8d, 0x7e, 0x81, 0x95, 0x8a,
+    0x7d, 0x85, 0x74, 0x9a, 0x87, 0x8c, 0x9e, 0xae, 0x80, 0x88, 0x7d, 0x8b,
+    0xaa, 0x79, 0x7c, 0x97, 0x79, 0x90, 0x7b, 0x97, 0x97, 0x9f, 0xa1, 0xa2,
+    0xab, 0x97, 0x69, 0x7a, 0x8d, 0x9f, 0x9f, 0x89, 0x90, 0x8c, 0x66, 0x98,
+    0x6e, 0x86, 0x7b, 0x6e, 0x86, 0x8a, 0xb2, 0xa6, 0x93, 0x7d, 0x8c, 0x81,
+    0x7e, 0x84, 0xa6, 0xb6, 0x83, 0x92, 0xa0, 0x88, 0x90, 0x5f, 0x7c, 0x92,
+    0x98, 0x94, 0x92, 0x98, 0xa7, 0x65, 0x90, 0xa2, 0xa2, 0x9b, 0xa6, 0x7d,
+    0x8b, 0x5a, 0x94, 0x95, 0x9b, 0xa5, 0x99, 0xa5, 0x7e, 0x61, 0x9a, 0x7a,
+    0x8b, 0x77, 0x87, 0x76, 0x9d, 0x72, 0x9a, 0x84, 0x98, 0x94, 0x92, 0x73,
+    0xae, 0x78, 0x8e, 0xaa, 0xa0, 0xc3, 0x7a, 0xa4, 0xa0, 0x75, 0xa9, 0xae,
+    0x8c, 0xd6, 0x87, 0x8f, 0x9f, 0x8c, 0x9b, 0x90, 0x99, 0x97, 0x73, 0x8f,
+    0x9b, 0x9c, 0x8c, 0x89, 0xa5, 0x84, 0x8f, 0x7b, 0x8b, 0x7f, 0x97, 0x98,
+    0x8d, 0x7b, 0x94, 0x9d, 0x9c, 0x8e, 0x92, 0x89, 0x88, 0x8d, 0x6c, 0x63,
+    0x73, 0x81, 0x72, 0x8a, 0x88, 0x8a, 0x9f, 0x79, 0x81, 0x82, 0x9a, 0xa9,
+    0x7a, 0x92, 0x7d, 0x76, 0x7b, 0x7a, 0x6a, 0xbe, 0x91, 0x7d, 0x86, 0xad,
+    0x84, 0x86, 0x6c, 0x91, 0x91, 0x9f, 0x92, 0x6b, 0x95, 0x98, 0x84, 0xa0,
+    0x8f, 0x8b, 0x9e, 0x7f, 0x9f, 0x97, 0x7e, 0x87, 0x80, 0x9e, 0x79, 0x8d,
+    0x68, 0x87, 0x88, 0x7d, 0x89, 0x81, 0x6d, 0x85, 0x80, 0x82, 0xa0, 0x97,
+    0xa3, 0x72, 0x94, 0x74, 0x8e, 0x56, 0x96, 0x98, 0x91, 0x6f, 0xa0, 0xae,
+    0x7c, 0x6e, 0x8e, 0xa9, 0x7c, 0x80, 0x87, 0xa3, 0x9e, 0x57, 0x8e, 0xb5,
+    0x87, 0xa6, 0x87, 0x79, 0x8f, 0x55, 0x8a, 0x81, 0x97, 0x6c, 0x9b, 0x99,
+    0x78, 0x5c, 0x82, 0x80, 0x91, 0x76, 0x80, 0x91, 0x8b, 0x65, 0x89, 0x7d,
+    0xa9, 0x95, 0x89, 0x97, 0x96, 0x6a, 0x89, 0xad, 0x92, 0x9f, 0xb6, 0x82,
+    0x88, 0x79, 0x9d, 0xa5, 0x9c, 0xae, 0x9a, 0x93, 0x77, 0x8e, 0x8a, 0xb5,
+    0x84, 0xb0, 0x76, 0xa2, 0x89, 0xa0, 0x96, 0x7a, 0xa5, 0x8e, 0x7e, 0x74,
+    0x8d, 0x89, 0x89, 0x9e, 0x93, 0x95, 0x90, 0x78, 0x93, 0x8f, 0xa5, 0x7c,
+    0x9d, 0x7c, 0x77, 0x85, 0x81, 0x92, 0x7c, 0x87, 0x92, 0x82, 0x98, 0xa3,
+    0x63, 0x76, 0x9b, 0x91, 0x7b, 0x8e, 0x97, 0x7e, 0x66, 0x90, 0x63, 0xb4,
+    0x71, 0x88, 0x86, 0x8e, 0x6f, 0x89, 0x7a, 0x88, 0x93, 0x7f, 0x96, 0xa8,
+    0x7d, 0x88, 0x88, 0x86, 0x7b, 0x91, 0x88, 0x6b, 0xa6, 0x8b, 0x69, 0x78,
+    0x82, 0x80, 0x83, 0x6b, 0xaf, 0x81, 0x7b, 0x64, 0x8f, 0x78, 0x6e, 0x7f,
+    0x86, 0x91, 0x92, 0xa3, 0xa0, 0x97, 0x82, 0x88, 0x92, 0x90, 0x9e, 0x89,
+    0x9d, 0x7b, 0x96, 0x82, 0xa3, 0x8c, 0x7f, 0x84, 0x7a, 0x6c, 0x60, 0x85,
+    0xa9, 0x74, 0x83, 0xa2, 0x89, 0x87, 0x9b, 0x77, 0x9b, 0x9a, 0x99, 0x84,
+    0x7c, 0x9c, 0x8d, 0x90, 0x8d, 0x7b, 0x74, 0x77, 0x93, 0x8c, 0x6c, 0x8b,
+    0x85, 0x78, 0x7f, 0x7d, 0x75, 0x7f, 0x7e, 0x85, 0x8f, 0x7d, 0x62, 0x8c,
+    0x7c, 0xad, 0x7f, 0x83, 0xa1, 0xa1, 0x97, 0x7b, 0x72, 0x82, 0x9d, 0x81,
+    0x94, 0x81, 0x8d, 0x9f, 0x6f, 0x8f, 0x9d, 0x89, 0x6a, 0x7e, 0x7f, 0x7f,
+    0x8d, 0x7e, 0x91, 0x86, 0x7d, 0x8a, 0x7e, 0x70, 0x7b, 0x9b, 0x6e, 0x5f,
+    0xa8, 0x7a, 0x73, 0x8a, 0x7a, 0x71, 0x90, 0x95, 0x8d, 0x78, 0x7b, 0x72,
+    0x5e, 0x89, 0x62, 0xa1, 0x87, 0x7f, 0x83, 0x75, 0x98, 0x7f, 0x76, 0x72,
+    0x8f, 0x9b, 0x7a, 0x8b, 0xa1, 0x7f, 0x60, 0x99, 0x96, 0x6e, 0x67, 0x76,
+    0x88, 0x98, 0x6c, 0x7b, 0x9b, 0x8d, 0x5f, 0x89, 0x7c, 0x81, 0x79, 0x86,
+    0x69, 0x9e, 0x83, 0x65, 0x8e, 0x82, 0x83, 0x89, 0x85, 0x7f, 0x90, 0x80,
+    0xa2, 0x81, 0x85, 0x83, 0x8e, 0x94, 0x94, 0x75, 0x86, 0x87, 0x9a, 0xb2,
+    0x82, 0x99, 0x85, 0x7f, 0x8c, 0x7e, 0x81, 0x9a, 0x81, 0x7d, 0x87, 0x81,
+    0xa3, 0x8c, 0x8d, 0x85, 0x8d, 0x96, 0x86, 0x7c, 0xa7, 0x87, 0x7e, 0x9d,
+    0x63, 0xa8, 0x7c, 0x97, 0xa2, 0xa4, 0x7e, 0x87, 0x93, 0x9e, 0x89, 0x8d,
+    0x6b, 0x6d, 0x9d, 0x9b, 0x78, 0x8a, 0x8e, 0x7f, 0x7b, 0xa5, 0x6e, 0x8c,
+    0x89, 0x88, 0x73, 0x7e, 0x77, 0x9d, 0xa6, 0xa7, 0x77, 0x87, 0x7e, 0x7e,
+    0x97, 0x84, 0x6b, 0x59, 0x60, 0x90, 0x85, 0x76, 0x8f, 0x61, 0x7f, 0x94,
+    0x8f, 0x84, 0x8b, 0x7f, 0x73, 0x77, 0x73, 0x71, 0x8a, 0x9b, 0x7b, 0x89,
+    0x97, 0x8f, 0x76, 0x63, 0xa3, 0xa1, 0x6b, 0x7c, 0x62, 0x95, 0x8e, 0xa3,
+    0x9f, 0x89, 0x8f, 0x7f, 0x92, 0x7c, 0xa2, 0xa4, 0xa6, 0x92, 0x89, 0x93,
+    0x74, 0x73, 0x73, 0x96, 0xad, 0x9b, 0x87, 0xac, 0x91, 0x8a, 0xa0, 0x70,
+    0x70, 0x7e, 0x8f, 0x74, 0x75, 0xaf, 0x8d, 0x82, 0x8e, 0x82, 0x96, 0x7d,
+    0x69, 0x9c, 0x64, 0xa2, 0x82, 0x89, 0x83, 0x9d, 0x83, 0x88, 0x62, 0x92,
+    0x72, 0x89, 0x6d, 0x7f, 0x92, 0x70, 0x8e, 0x80, 0x7e, 0x8d, 0x91, 0x85,
+    0x8d, 0x89, 0x83, 0x96, 0x90, 0x96, 0x9c, 0xa6, 0x8a, 0x73, 0x89, 0x79,
+    0xa9, 0x70, 0x80, 0x78, 0x96, 0x80, 0x7b, 0x85, 0xa5, 0x80, 0x93, 0x95,
+    0xc5, 0x74, 0x81, 0x88, 0xa2, 0x93, 0x86, 0x9c, 0xa3, 0x6d, 0x92, 0x8a,
+    0x92, 0x99, 0x98, 0x65, 0xad, 0x63, 0x9d, 0x95, 0x99, 0x89, 0x7f, 0x7a,
+    0x99, 0x91, 0x7f, 0x78, 0x90, 0x8f, 0x80, 0x85, 0xa1, 0x68, 0x9d, 0x6c,
+    0x83, 0x8f, 0x7c, 0x5e, 0x99, 0x7b, 0x80, 0x91, 0x66, 0x8a, 0x92, 0xb3,
+    0x7a, 0x99, 0x91, 0x7e, 0x7d, 0x96, 0x69, 0x9e, 0x7c, 0x89, 0xad, 0x8f,
+    0x9d, 0x90, 0x85, 0x8e, 0x72, 0xa9, 0x89, 0x83, 0x7c, 0x82, 0x70, 0x82,
+    0x6b, 0x79, 0x75, 0x8d, 0x77, 0x9b, 0x7c, 0x8f, 0x8a, 0x95, 0x87, 0x9f,
+    0x7c, 0x90, 0x87, 0x70, 0x83, 0x83, 0x98, 0x9f, 0x85, 0x86, 0x8d, 0x81,
+    0x87, 0x87, 0x87, 0x9d, 0x8f, 0x9d, 0x7c, 0x98, 0xa2, 0xac, 0x88, 0x93,
+    0x88, 0x7d, 0x9b, 0x76, 0x82, 0x67, 0x69, 0x7f, 0x8c, 0x8d, 0x94, 0x7d,
+    0x7b, 0xae, 0x8c, 0x85, 0x8b, 0xa7, 0x8c, 0x87, 0x96, 0x7d, 0x8b, 0x90,
+    0x90, 0x7c, 0x92, 0xa8, 0x81, 0x87, 0xa4, 0xa4, 0x82, 0x8b, 0x8d, 0x89,
+    0x8f, 0x70, 0x9d, 0x7f, 0xa0, 0x84, 0x99, 0x65, 0x99, 0x78, 0x94, 0x8b,
+    0xc5, 0x8d, 0x8d, 0x55, 0xb3, 0x8d, 0x78, 0x93, 0xb4, 0x6d, 0x84, 0x90,
+    0xd5, 0x76, 0x7a, 0x9e, 0xc8, 0x8f, 0x86, 0x8a, 0xaa, 0x8b, 0x7f, 0x90,
+    0xaa, 0x95, 0x9c, 0x81, 0xb4, 0x6b, 0x64, 0x8a, 0x99, 0x84, 0x74, 0x6e,
+    0x95, 0x75, 0x98, 0x92, 0x9a, 0x91, 0x8c, 0x7d, 0x88, 0x6e, 0x89, 0x7d,
+    0x87, 0x80, 0x8e, 0x86, 0x78, 0x9f, 0x96, 0x75, 0x76, 0x82, 0x84, 0xaf,
+    0x8a, 0xb3, 0x93, 0x97, 0x86, 0x7c, 0x7e, 0x96, 0x7c, 0x6d, 0x90, 0x8e,
+    0x85, 0x88, 0x8a, 0x9f, 0x70, 0x89, 0x9f, 0x99, 0x95, 0x87, 0x91, 0x9d,
+    0x80, 0x74, 0x88, 0x7c, 0x7f, 0xa8, 0x93, 0x77, 0x66, 0xa6, 0x80, 0xa2,
+    0x88, 0xa0, 0xaf, 0x6f, 0x76, 0x70, 0x82, 0x9a, 0x73, 0x89, 0x9a, 0x75,
+    0x75, 0x8e, 0x5f, 0x85, 0x6a, 0x76, 0x98, 0x66, 0x87, 0xa3, 0x7a, 0x73,
+    0x9d, 0xa1, 0x98, 0x8e, 0x78, 0x91, 0x83, 0x8c, 0x82, 0x9e, 0x90, 0x87,
+    0x8f, 0x9b, 0x8b, 0x8f, 0x89, 0x62, 0x74, 0x82, 0x7b, 0x7f, 0x8a, 0x9d,
+    0x89, 0x93, 0x8c, 0x7a, 0x99, 0x77, 0xac, 0x75, 0x9b, 0x7f, 0x7f, 0x56,
+    0x8c, 0x96, 0x70, 0x79, 0xc2, 0x7d, 0x90, 0x64, 0xe9, 0x79, 0x68, 0xb2,
+    0xc2, 0xa6, 0xa7, 0x7e, 0xd9, 0x98, 0x79, 0x87, 0xc0, 0x97, 0x87, 0x66,
+    0xd0, 0x9f, 0x92, 0x82, 0xa4, 0xa8, 0x8d, 0x78, 0xa6, 0xa1, 0x76, 0x7d,
+    0xa4, 0x87, 0x89, 0x51, 0xae, 0x88, 0x5b, 0x76, 0x7d, 0x70, 0x74, 0x93,
+    0x89, 0x74, 0x9e, 0x7a, 0x79, 0x64, 0x9a, 0x94, 0x65, 0x93, 0xb0, 0x8d,
+    0x88, 0x7e, 0x8e, 0xa5, 0x63, 0x94, 0x94, 0x7d, 0x91, 0x87, 0x84, 0x95,
+    0x75, 0x9e, 0x81, 0x99, 0x65, 0x76, 0x82, 0x9c, 0x6a, 0xab, 0x84, 0x85,
+    0x88, 0x72, 0x92, 0x83, 0x82, 0xaf, 0x6d, 0x9d, 0x9e, 0x73, 0x98, 0x7f,
+    0x91, 0xb4, 0x62, 0x8d, 0x74, 0x6e, 0xb4, 0x94, 0x97, 0x9e, 0x6f, 0x9a,
+    0x83, 0x7b, 0xa9, 0x7d, 0x87, 0x97, 0x60, 0xa9, 0x7a, 0x75, 0xad, 0x6c,
+    0x77, 0xa4, 0x88, 0x82, 0x6f, 0x8a, 0x83, 0x74, 0x9a, 0xa7, 0x83, 0x91,
+    0x7c, 0x7c, 0x78, 0x77, 0x83, 0x92, 0x7a, 0x83, 0x90, 0x6f, 0x79, 0x6b,
+    0x9b, 0x8d, 0x99, 0x95, 0x7b, 0x89, 0x8e, 0x6c, 0x8e, 0x6c, 0x9b, 0x91,
+    0x97, 0x80, 0x83, 0x6f, 0xaa, 0x91, 0x66, 0x76, 0xc9, 0x77, 0x82, 0x4d,
+    0xd7, 0x5f, 0x58, 0x9a, 0xb1, 0x7a, 0xb1, 0x6b, 0xe5, 0x9d, 0x76, 0x89,
+    0xb6, 0x94, 0x90, 0x5b, 0xb8, 0x92, 0x7d, 0x90, 0xbd, 0x9a, 0x85, 0x4e,
+    0xb4, 0x84, 0x61, 0x82, 0x94, 0x8e, 0x70, 0x57, 0x90, 0x89, 0x6f, 0x60,
+    0x78, 0x90, 0x78, 0x85, 0x8e, 0x7c, 0x76, 0x74, 0x71, 0x5d, 0x94, 0x93,
+    0x71, 0x8f, 0xc2, 0x80, 0x75, 0x7d, 0x77, 0xa8, 0x70, 0x8f, 0xa6, 0x83,
+    0x74, 0x6b, 0x79, 0x97, 0x76, 0xa2, 0xad, 0x93, 0x5b, 0x8c, 0x7c, 0x7e,
+    0x82, 0x9b, 0xa0, 0x76, 0x71, 0x7a, 0xa3, 0x80, 0x87, 0x90, 0x92, 0xa6,
+    0x85, 0x71, 0x99, 0x91, 0x91, 0x8c, 0x99, 0x9b, 0x92, 0x74, 0xb2, 0x79,
+    0x9c, 0x7c, 0x7b, 0xa8, 0x8c, 0x6f, 0xb5, 0x69, 0x7a, 0x8a, 0x68, 0x9f,
+    0x82, 0x7d, 0xbd, 0x5f, 0xa1, 0x92, 0x83, 0x9f, 0x6f, 0xa1, 0x88, 0x61,
+    0x7b, 0x94, 0x89, 0x83, 0x6f, 0x6e, 0x92, 0x9d, 0x65, 0x7f, 0x97, 0x83,
+    0x87, 0x75, 0x92, 0x8a, 0x82, 0x82, 0x79, 0x92, 0x78, 0x89, 0x92, 0x7a,
+    0x91, 0x64, 0x8a, 0x93, 0x9d, 0x74, 0x78, 0x64, 0xab, 0x57, 0x7a, 0x84,
+    0xcf, 0x7d, 0x95, 0x4f, 0xde, 0x63, 0x78, 0x9a, 0xb7, 0x7a, 0x8b, 0x5b,
+    0xda, 0xa3, 0x94, 0x99, 0xbd, 0x88, 0xa4, 0x53, 0xad, 0x8b, 0x81, 0x96,
+    0xca, 0x8f, 0x76, 0x5e, 0xbd, 0x9d, 0x70, 0x81, 0x9b, 0x7d, 0x8a, 0x44,
+    0xa0, 0x77, 0x52, 0x6e, 0x82, 0x62, 0x6a, 0x6b, 0x9d, 0xaa, 0x81, 0x85,
+    0x7d, 0x5f, 0x7f, 0x9c, 0x65, 0x99, 0x97, 0x81, 0x7f, 0x65, 0x65, 0xa4,
+    0x84, 0x8c, 0xa1, 0x6d, 0x7a, 0x70, 0x79, 0x90, 0x98, 0xaa, 0x76, 0x95,
+    0x7f, 0x91, 0x95, 0x96, 0x6e, 0xa5, 0x95, 0xa2, 0x7d, 0x7e, 0x93, 0x87,
+    0x7d, 0x9b, 0x85, 0x9b, 0x85, 0x79, 0x96, 0x6b, 0x9d, 0x9d, 0x61, 0x99,
+    0x9c, 0x74, 0xcc, 0x7e, 0x9a, 0x83, 0x83, 0x98, 0x6f, 0x6d, 0xc5, 0x69,
+    0xb0, 0xa5, 0x5c, 0x91, 0x6c, 0x7b, 0xcc, 0x72, 0x9a, 0x9d, 0x7e, 0xa3,
+    0x8a, 0x96, 0x8e, 0x74, 0x7b, 0x80, 0x6b, 0x85, 0x84, 0x56, 0x92, 0x83,
+    0x64, 0x90, 0x86, 0x86, 0x88, 0x79, 0x8b, 0xa0, 0x86, 0x72, 0xab, 0x95,
+    0x80, 0x81, 0x96, 0x8f, 0x75, 0x7f, 0x71, 0x92, 0x9e, 0x75, 0x62, 0x5e,
+    0xc3, 0x7a, 0x6c, 0x84, 0xba, 0x81, 0x8f, 0x49, 0xc9, 0x76, 0x54, 0x89,
+    0xc2, 0x8c, 0xa2, 0x54, 0xd8, 0xa4, 0x72, 0x90, 0xb1, 0x91, 0xa0, 0x7a,
+    0xbf, 0x9a, 0x6f, 0x82, 0xbb, 0x81, 0x6a, 0x52, 0xc2, 0x82, 0x52, 0x65,
+    0x8d, 0x8a, 0x84, 0x46, 0xa2, 0x90, 0x45, 0x52, 0x82, 0x61, 0x8c, 0x77,
+    0x92, 0x6d, 0x87, 0x5b, 0x5e, 0x72, 0x76, 0x97, 0x73, 0x8d, 0x8d, 0x70,
+    0x7a, 0x66, 0x76, 0x89, 0x72, 0xbf, 0xb0, 0x84, 0x7d, 0x80, 0x71, 0x8f,
+    0x85, 0xa9, 0xa3, 0x7d, 0x7b, 0x84, 0x83, 0xa1, 0x97, 0xa7, 0xaf, 0x84,
+    0x86, 0x7d, 0x94, 0x78, 0x80, 0x98, 0x71, 0x84, 0x94, 0x73, 0xb0, 0x74,
+    0x99, 0xa2, 0x68, 0xa7, 0x8b, 0x86, 0xe0, 0x75, 0x9e, 0x93, 0x5c, 0xb2,
+    0xa2, 0x68, 0xb8, 0x61, 0x92, 0xa3, 0x68, 0xa4, 0x89, 0x59, 0xd0, 0x77,
+    0x97, 0xa9, 0x6a, 0x9b, 0x7d, 0x69, 0x9b, 0x79, 0x8c, 0x7c, 0x68, 0x8b,
+    0x7a, 0x53, 0x99, 0x9c, 0x7e, 0x8d, 0x89, 0x96, 0x9e, 0x83, 0x89, 0x74,
+    0x7f, 0x94, 0x92, 0x8f, 0x85, 0x8a, 0x8a, 0x80, 0x99, 0x87, 0x7a, 0x7d,
+    0xac, 0x93, 0x74, 0x68, 0xba, 0x87, 0x6a, 0x98, 0xc7, 0x79, 0x91, 0x54,
+    0xeb, 0x80, 0x45, 0x80, 0xc4, 0xb4, 0x94, 0x61, 0xd2, 0xa6, 0x7b, 0x95,
+    0xa4, 0xaa, 0x93, 0x7b, 0xb1, 0x74, 0x53, 0x7c, 0xaa, 0x91, 0x64, 0x51,
+    0xa9, 0x6e, 0x5e, 0x7c, 0x79, 0x82, 0x8b, 0x2e, 0x9d, 0x66, 0x61, 0x5e,
+    0x72, 0x7f, 0x6e, 0x6d, 0x8c, 0x79, 0x7d, 0x60, 0x76, 0x79, 0x68, 0x84,
+    0x4d, 0x8e, 0xa8, 0x8f, 0x78, 0x74, 0x69, 0xa4, 0x6e, 0xa9, 0xb9, 0x59,
+    0x83, 0x7f, 0x7a, 0x93, 0x90, 0x9b, 0x8d, 0x93, 0x78, 0x80, 0x77, 0x8b,
+    0x72, 0xa3, 0x97, 0x73, 0x91, 0x6c, 0x9a, 0x97, 0xa3, 0xad, 0x89, 0x96,
+    0x9e, 0x6d, 0xb5, 0x7c, 0xa4, 0x98, 0x61, 0x8a, 0x93, 0x5f, 0xdc, 0x63,
+    0xba, 0x92, 0x84, 0x94, 0xab, 0x6f, 0xbf, 0x66, 0x98, 0x93, 0x74, 0x85,
+    0x96, 0x63, 0xb8, 0x60, 0x94, 0xbb, 0x79, 0x94, 0x7b, 0x67, 0x8a, 0x64,
+    0x99, 0xac, 0x60, 0x98, 0xb0, 0x65, 0xa2, 0x73, 0x8f, 0x94, 0x8c, 0x92,
+    0x84, 0x84, 0x9b, 0x8f, 0x84, 0x8d, 0x9f, 0x90, 0x91, 0x85, 0x93, 0x74,
+    0x97, 0x66, 0x7f, 0x78, 0xa2, 0x95, 0x73, 0x6b, 0xc5, 0x6f, 0x62, 0x79,
+    0xbd, 0x81, 0x89, 0x4a, 0xbd, 0x93, 0x57, 0x81, 0xba, 0xb0, 0x9b, 0x4c,
+    0xe8, 0xa2, 0x85, 0xa2, 0x96, 0x92, 0x93, 0x62, 0xbe, 0x7a, 0x71, 0x8b,
+    0x8d, 0x97, 0x53, 0x56, 0xb1, 0x5f, 0x67, 0x60, 0x7a, 0x8e, 0x8a, 0x3a,
+    0x86, 0x67, 0x6d, 0x53, 0x6e, 0x91, 0x7b, 0x60, 0x99, 0x6d, 0x71, 0x5d,
+    0x67, 0x65, 0x63, 0x87, 0x71, 0x8a, 0x92, 0x6d, 0x8f, 0x6f, 0x6f, 0xae,
+    0x6c, 0xa2, 0x87, 0x6f, 0x99, 0x88, 0x78, 0x94, 0x8a, 0xb2, 0x93, 0x89,
+    0x90, 0x8d, 0x8c, 0x98, 0x81, 0x86, 0x90, 0x6d, 0xa2, 0x82, 0xa2, 0xa3,
+    0x9d, 0x8f, 0x7a, 0x9f, 0x87, 0x70, 0xbd, 0x8e, 0xa5, 0x99, 0x5d, 0x70,
+    0x8c, 0x60, 0xc7, 0x78, 0x97, 0xb0, 0x6f, 0x94, 0x92, 0x5a, 0xc3, 0x6e,
+    0x8b, 0x9f, 0x79, 0xa3, 0x8c, 0x5e, 0xbf, 0x79, 0x8e, 0x98, 0x76, 0x8e,
+    0x67, 0x31, 0x9b, 0x85, 0x8e, 0x85, 0x71, 0x99, 0x72, 0x77, 0x84, 0x81,
+    0x91, 0x95, 0x80, 0x98, 0x82, 0x6f, 0x90, 0xa0, 0x91, 0x91, 0x8e, 0x75,
+    0x8a, 0x89, 0x93, 0x69, 0x95, 0x7f, 0x9a, 0xa0, 0x9e, 0x9b, 0x88, 0x4e,
+    0xc3, 0x8d, 0x65, 0x74, 0xba, 0x8d, 0x97, 0x4d, 0xd6, 0x94, 0x73, 0xa0,
+    0xb1, 0xb3, 0x8c, 0x67, 0xdd, 0x9f, 0x7f, 0xaa, 0xaf, 0x9a, 0x88, 0x67,
+    0xc2, 0x8f, 0x71, 0x7b, 0x8f, 0x9f, 0x47, 0x52, 0x93, 0x72, 0x5a, 0x52,
+    0x97, 0x9d, 0x67, 0x3c, 0xa9, 0x59, 0x59, 0x5b, 0x88, 0x92, 0x82, 0x57,
+    0x83, 0x67, 0x94, 0x77, 0x52, 0x74, 0x60, 0x9e, 0x52, 0x84, 0xa2, 0x69,
+    0x71, 0x96, 0x73, 0xb0, 0x5e, 0xb0, 0x89, 0x71, 0x94, 0x8a, 0x66, 0xa0,
+    0x75, 0xc1, 0x99, 0x8e, 0x83, 0x8a, 0x91, 0x89, 0x6b, 0xa5, 0x79, 0x82,
+    0x8b, 0x73, 0x95, 0xb0, 0x77, 0x9b, 0x82, 0x7d, 0x8f, 0x60, 0xb9, 0x78,
+    0x8b, 0x8f, 0x7b, 0x74, 0x84, 0x6d, 0xbf, 0x76, 0x8f, 0xa3, 0x91, 0xa1,
+    0x81, 0x59, 0xcb, 0x69, 0xac, 0x90, 0x98, 0x92, 0xa7, 0x5d, 0xb4, 0x8b,
+    0xaa, 0xb1, 0x98, 0x8c, 0xa2, 0x4d, 0xa1, 0x69, 0x7f, 0xa0, 0x7d, 0x8a,
+    0x9b, 0x77, 0x8e, 0x71, 0x82, 0x8a, 0x78, 0x8d, 0x98, 0x78, 0x90, 0x91,
+    0x7e, 0x7f, 0x78, 0x85, 0x97, 0x8a, 0x97, 0x6d, 0xb3, 0x94, 0x89, 0xa3,
+    0xa5, 0x9a, 0x76, 0x6b, 0xbd, 0x79, 0x71, 0x95, 0xce, 0xab, 0x93, 0x1f,
+    0xe9, 0x97, 0x4c, 0x84, 0xd5, 0x9f, 0x98, 0x6e, 0xdd, 0x8d, 0x80, 0x9c,
+    0xa8, 0x9e, 0x8d, 0x75, 0xbc, 0x8c, 0x80, 0x89, 0xa1, 0x89, 0x74, 0x58,
+    0x92, 0x86, 0x55, 0x87, 0x91, 0x8d, 0x70, 0x33, 0xb8, 0x50, 0x63, 0x6b,
+    0x79, 0x99, 0x76, 0x71, 0x75, 0x59, 0x73, 0x6b, 0x62, 0x62, 0x74, 0x85,
+    0x73, 0xa3, 0xac, 0x78, 0x77, 0x88, 0x64, 0xa0, 0x73, 0xa1, 0xa8, 0x73,
+    0x91, 0x8e, 0x5f, 0x9a, 0x68, 0xc9, 0xa1, 0x92, 0x7a, 0x7c, 0x69, 0x77,
+    0x7d, 0x9e, 0x8f, 0x76, 0x88, 0x80, 0x92, 0x93, 0x91, 0x99, 0x8c, 0x85,
+    0x9f, 0x69, 0xa8, 0x9b, 0x9f, 0x9a, 0x64, 0x7a, 0x99, 0x70, 0xc4, 0x6d,
+    0x9a, 0x99, 0x82, 0xa0, 0x8b, 0x59, 0xc8, 0x61, 0x8f, 0x95, 0x72, 0x8c,
+    0x90, 0x63, 0xa9, 0x7e, 0x88, 0x8c, 0x85, 0x78, 0x76, 0x58, 0x8e, 0x72,
+    0xa3, 0x9a, 0x7c, 0xa0, 0x7f, 0x6d, 0xa6, 0x83, 0x7e, 0x8d, 0x83, 0x88,
+    0x86, 0x68, 0x8d, 0x96, 0xaa, 0x78, 0x90, 0xa5, 0x9c, 0x9d, 0x99, 0x88,
+    0xb0, 0x82, 0x6f, 0x7e, 0xad, 0xa9, 0x7b, 0x6a, 0xba, 0x6c, 0x6d, 0x89,
+    0xc1, 0x9e, 0x8e, 0x2f, 0xf2, 0x77, 0x50, 0x73, 0xdb, 0xc4, 0x9c, 0x6c,
+    0xd0, 0x90, 0x88, 0xbe, 0x97, 0xb9, 0x9e, 0x6e, 0xbe, 0x8e, 0x83, 0x8e,
+    0x96, 0x98, 0x4c, 0x4e, 0xa7, 0x8d, 0x43, 0x92, 0x8f, 0x92, 0x6d, 0x27,
+    0x94, 0x73, 0x5f, 0x42, 0x7c, 0xa7, 0x8a, 0x5a, 0x81, 0x60, 0x85, 0x66,
+    0x73, 0x72, 0x74, 0x9d, 0x5a, 0x9e, 0xa3, 0x71, 0x75, 0x91, 0x4f, 0xa2,
+    0x67, 0xa6, 0x91, 0x64, 0x92, 0x7e, 0x95, 0x8d, 0x6e, 0xbe, 0x9b, 0x57,
+    0x9b, 0x82, 0x89, 0x70, 0x6f, 0x9e, 0x7e, 0x86, 0x97, 0x81, 0x85, 0x8e,
+    0x70, 0x96, 0x6c, 0x72, 0xab, 0x6d, 0x9c, 0x91, 0xa0, 0x8a, 0x8d, 0x88,
+    0x9e, 0x75, 0xc6, 0x76, 0x7c, 0xa7, 0x6b, 0xa8, 0x94, 0x72, 0xb6, 0x78,
+    0x8d, 0x90, 0x7b, 0x8c, 0xa6, 0x65, 0xad, 0x9b, 0xaa, 0x94, 0x89, 0x7d,
+    0x90, 0x69, 0xaa, 0x7e, 0x9e, 0xad, 0x7f, 0x94, 0x81, 0x7d, 0xa1, 0x7b,
+    0x6c, 0x65, 0x83, 0x95, 0x89, 0x75, 0x93, 0x87, 0x94, 0x87, 0xa8, 0x92,
+    0x8d, 0xa6, 0x9f, 0x78, 0xaa, 0x72, 0x95, 0x94, 0xac, 0xa6, 0x91, 0x5a,
+    0xdb, 0x82, 0x55, 0xb6, 0xc1, 0xa3, 0x84, 0x4f, 0xc9, 0x88, 0x53, 0x8f,
+    0xbb, 0xae, 0x9b, 0x8a, 0xd8, 0xa9, 0x68, 0xc2, 0xa0, 0xa9, 0x87, 0x6b,
+    0xbd, 0x99, 0x7e, 0x86, 0x88, 0xa7, 0x5e, 0x53, 0xa4, 0x84, 0x6b, 0x6e,
+    0x89, 0x95, 0x84, 0x2d, 0xb5, 0x43, 0x3e, 0x50, 0x71, 0x96, 0x9a, 0x5b,
+    0xa1, 0x60, 0x80, 0x70, 0x6a, 0x73, 0x8f, 0x95, 0x52, 0x9b, 0xae, 0x71,
+    0x76, 0x7d, 0x61, 0x99, 0x5b, 0xc3, 0xa8, 0x76, 0x98, 0x72, 0x7f, 0x8a,
+    0x66, 0xc7, 0xa3, 0x7b, 0x8e, 0x8f, 0x70, 0x74, 0x6a, 0xae, 0x85, 0x83,
+    0x96, 0x7d, 0x98, 0xa7, 0x8f, 0x94, 0x7e, 0x84, 0x96, 0x7a, 0xab, 0x7d,
+    0x83, 0xb1, 0x6f, 0x7d, 0x9f, 0x80, 0xca, 0x8f, 0x9b, 0xa9, 0x69, 0x7a,
+    0x92, 0x73, 0xaa, 0x74, 0x88, 0x98, 0x87, 0x8f, 0xa7, 0x68, 0xa0, 0x74,
+    0x97, 0x95, 0x6e, 0x6f, 0x83, 0x53, 0x9b, 0x79, 0x71, 0x87, 0x7d, 0x8b,
+    0x79, 0x87, 0xa3, 0x75, 0x68, 0x73, 0x7e, 0x89, 0x8f, 0x81, 0x98, 0x7a,
+    0x9a, 0x83, 0x9d, 0x95, 0x90, 0x98, 0x97, 0x57, 0x93, 0x7e, 0xa2, 0x9a,
+    0xa8, 0x8a, 0x85, 0x53, 0xbd, 0x7a, 0x61, 0x8b, 0xca, 0xac, 0x9b, 0x2e,
+    0xe8, 0xa5, 0x66, 0x86, 0xca, 0xa7, 0xa0, 0x85, 0xcf, 0xa4, 0x6a, 0xc2,
+    0xb0, 0xaa, 0x76, 0x76, 0xb6, 0xa2, 0x72, 0xa9, 0xa1, 0xa1, 0x67, 0x67,
+    0xac, 0x90, 0x70, 0x6d, 0x8f, 0xb5, 0x6d, 0x3b, 0x85, 0x64, 0x4a, 0x6e,
+    0x72, 0x9f, 0x98, 0x5b, 0x97, 0x3e, 0x8a, 0x6a, 0x6c, 0x7d, 0x77, 0x98,
+    0x5a, 0x92, 0xa3, 0x81, 0x6f, 0x91, 0x7b, 0xa6, 0x6e, 0x9c, 0x9b, 0x5f,
+    0x9e, 0x7e, 0x77, 0x9d, 0x88, 0xc6, 0x81, 0x5a, 0x93, 0x8b, 0x6c, 0x71,
+    0x63, 0x9e, 0x78, 0x79, 0x70, 0x90, 0x95, 0x9f, 0x71, 0xa9, 0x90, 0x73,
+    0x98, 0x8a, 0xa5, 0x8e, 0x87, 0xb0, 0x79, 0x79, 0x92, 0x7d, 0xcc, 0xa8,
+    0x7a, 0x92, 0x82, 0x91, 0x90, 0x69, 0xa4, 0x9b, 0x97, 0x8f, 0x75, 0x7c,
+    0xa3, 0x69, 0xb5, 0x87, 0x8d, 0x88, 0x7b, 0x94, 0x8b, 0x55, 0xa2, 0x6d,
+    0x89, 0x8e, 0x81, 0x8a, 0x9e, 0x87, 0x86, 0x83, 0x8b, 0x84, 0x87, 0xa7,
+    0x8e, 0x79, 0xa4, 0x9c, 0x99, 0x82, 0xa3, 0x8f, 0x91, 0x9a, 0x95, 0x5b,
+    0x9f, 0x6e, 0x85, 0x93, 0xa6, 0x9a, 0x91, 0x4c, 0xd8, 0x6b, 0x6d, 0x85,
+    0xde, 0xaa, 0x97, 0x51, 0xcf, 0x8c, 0x5f, 0x9a, 0xc2, 0x9d, 0x9a, 0x7c,
+    0xc6, 0xb1, 0x84, 0xac, 0xba, 0xa5, 0x7c, 0x76, 0xbd, 0x93, 0x7f, 0xa0,
+    0x86, 0xae, 0x47, 0x41, 0x88, 0x82, 0x62, 0x62, 0x73, 0xad, 0x6b, 0x23,
+    0xa0, 0x48, 0x5a, 0x5a, 0x8f, 0x98, 0xbd, 0x5c, 0x9c, 0x72, 0x7c, 0x68,
+    0x50, 0x78, 0x91, 0xab, 0x5c, 0xc1, 0xc6, 0x66, 0x87, 0x86, 0x60, 0x99,
+    0x65, 0xac, 0x94, 0x91, 0x7e, 0x8c, 0x7d, 0x9b, 0x70, 0xb2, 0x9a, 0x7d,
+    0x82, 0x91, 0x6b, 0x86, 0x6f, 0xbb, 0x7f, 0x66, 0x7a, 0x79, 0x94, 0x96,
+    0x71, 0xa5, 0x75, 0x73, 0x95, 0x81, 0xa4, 0x8b, 0x87, 0xaa, 0x8e, 0x92,
+    0xa9, 0x82, 0xb0, 0x92, 0x89, 0xa7, 0x83, 0x81, 0x8c, 0x6d, 0xc4, 0x7a,
+    0x89, 0xa5, 0xa1, 0xa2, 0xa4, 0x6b, 0xa4, 0x82, 0x90, 0xb2, 0x8d, 0x72,
+    0x83, 0x60, 0xa7, 0x7a, 0x80, 0x97, 0x65, 0x90, 0x87, 0x85, 0xae, 0x71,
+    0x7d, 0x71, 0x98, 0xa8, 0x90, 0x75, 0xa9, 0x96, 0xa2, 0x91, 0x7b, 0x6b,
+    0xa0, 0x9d, 0x8d, 0x5d, 0xa4, 0x79, 0x8c, 0xa4, 0xad, 0x94, 0x7e, 0x77,
+    0xb6, 0x92, 0x74, 0xaf, 0xb5, 0x9b, 0x99, 0x67, 0xe7, 0x8e, 0x6a, 0x87,
+    0xc1, 0x98, 0x9b, 0x7e, 0xd7, 0x9b, 0x5b, 0xae, 0xc9, 0x94, 0x7a, 0x6d,
+    0x9e, 0xb4, 0x86, 0x8e, 0xa3, 0xa1, 0x5e, 0x5d, 0x8e, 0x8f, 0x6b, 0x59,
+    0xa5, 0xa9, 0x69, 0x20, 0xa4, 0x64, 0x35, 0x61, 0x83, 0x9d, 0x8a, 0x4e,
+    0x8b, 0x6c, 0x5e, 0x5b, 0x68, 0x76, 0x89, 0x94, 0x5f, 0x87, 0x98, 0x7a,
+    0x5d, 0x81, 0x89, 0xa6, 0x54, 0xa3, 0xb4, 0x7b, 0x83, 0x8a, 0x90, 0x8b,
+    0x86, 0xbc, 0x86, 0x59, 0x91, 0x79, 0x71, 0x6b, 0x7c, 0x94, 0x98, 0x7f,
+    0x81, 0x76, 0x85, 0xad, 0x69, 0xa8, 0x83, 0x8c, 0x8f, 0x70, 0x9a, 0x91,
+    0x78, 0xb3, 0x8f, 0x6d, 0x90, 0x86, 0xbd, 0x97, 0x7f, 0xaf, 0x7e, 0x90,
+    0x8f, 0x63, 0xa2, 0x93, 0x6e, 0xab, 0x75, 0x72, 0x8d, 0x74, 0xa1, 0x72,
+    0x82, 0xaa, 0x70, 0x82, 0x8d, 0x67, 0x94, 0x91, 0x92, 0xa5, 0x7f, 0xa5,
+    0x6f, 0x6d, 0xaf, 0x80, 0x89, 0x7d, 0x92, 0x99, 0x92, 0x72, 0x9d, 0x7d,
+    0x92, 0x78, 0xa9, 0x89, 0xa9, 0x9b, 0xa3, 0x73, 0x98, 0x71, 0x98, 0x86,
+    0x9e, 0x97, 0x9e, 0x6a, 0xb9, 0x6a, 0x6e, 0x90, 0xde, 0x94, 0x9a, 0x52,
+    0xdd, 0xa9, 0x6a, 0x79, 0xb9, 0xa3, 0xaa, 0x95, 0xba, 0xa2, 0x75, 0xc2,
+    0xbf, 0xb5, 0x6d, 0x8d, 0xae, 0x9b, 0x8d, 0x9a, 0x92, 0xb4, 0x5e, 0x4b,
+    0x8b, 0x99, 0x4f, 0x65, 0x94, 0xb6, 0x5d, 0x3a, 0xa3, 0x77, 0x51, 0x4e,
+    0x6d, 0xa3, 0x94, 0x59, 0x80, 0x56, 0x8c, 0x67, 0x67, 0x74, 0x99, 0x85,
+    0x57, 0x7b, 0x9e, 0x7e, 0x84, 0x85, 0x94, 0x96, 0x71, 0xbf, 0x97, 0x5f,
+    0x7d, 0x80, 0x93, 0x87, 0x6b, 0xb9, 0x7d, 0x8b, 0x84, 0x84, 0x6b, 0x8c,
+    0x6c, 0xc4, 0x85, 0x82, 0x87, 0x8d, 0x64, 0x90, 0x80, 0xb6, 0x9a, 0x70,
+    0x9c, 0x68, 0xa0, 0x88, 0x81, 0x9d, 0x83, 0x75, 0x9d, 0x84, 0xbf, 0x8f,
+    0x83, 0x9b, 0x75, 0x82, 0x9c, 0x76, 0xa4, 0x9d, 0x8a, 0xa7, 0x8e, 0x96,
+    0x9c, 0x64, 0xc0, 0x95, 0x88, 0xa5, 0x6f, 0x74, 0x7e, 0x5d, 0x9f, 0x7d,
+    0x89, 0x81, 0x71, 0xa8, 0x82, 0x6e, 0x9b, 0x9a, 0x6f, 0xa5, 0x88, 0x89,
+    0xa4, 0x7e, 0xa4, 0x90, 0xa1, 0x83, 0x8b, 0x9c, 0x9a, 0x89, 0xa2, 0x89,
+    0x9d, 0x5d, 0x86, 0xa5, 0xc4, 0x96, 0x9c, 0x85, 0xd6, 0x7c, 0x69, 0x88,
+    0xc9, 0xa5, 0x9b, 0x60, 0xea, 0xab, 0x62, 0x9f, 0xd1, 0xa5, 0x86, 0x7e,
+    0xb3, 0xbd, 0x7a, 0xa1, 0xbd, 0xa0, 0x7c, 0x92, 0xa6, 0xa3, 0x7d, 0xa9,
+    0x98, 0xa6, 0x71, 0x5c, 0x9b, 0x9b, 0x58, 0x6f, 0x8f, 0xaa, 0x5e, 0x3b,
+    0xa6, 0x5f, 0x3a, 0x79, 0x94, 0xa5, 0x84, 0x6f, 0x83, 0x5d, 0x75, 0x65,
+    0x6c, 0x77, 0x86, 0xad, 0x4a, 0x92, 0x8e, 0x8a, 0x8f, 0x7b, 0x72, 0x96,
+    0x79, 0xa6, 0xa8, 0x6d, 0x7b, 0x7b, 0x98, 0xa9, 0x79, 0xb9, 0x9e, 0x8f,
+    0x90, 0x6d, 0x76, 0x82, 0x81, 0xc1, 0x95, 0x7c, 0x97, 0x8d, 0x95, 0xa2,
+    0x7c, 0xa4, 0x7b, 0x9b, 0x7f, 0x6f, 0xac, 0x83, 0x7e, 0xa1, 0x7c, 0x7c,
+    0xa1, 0x7a, 0xa1, 0x6d, 0x95, 0x86, 0x77, 0x98, 0x8e, 0x58, 0xa2, 0x76,
+    0x8e, 0xa8, 0x94, 0x90, 0xa7, 0x62, 0xb8, 0x8a, 0x9f, 0xac, 0x87, 0x91,
+    0x88, 0x50, 0xa7, 0x83, 0x88, 0x65, 0x7a, 0x92, 0x9d, 0x70, 0xa9, 0x99,
+    0x7c, 0x87, 0x8c, 0x96, 0x8e, 0x73, 0xa4, 0xa7, 0x9b, 0x70, 0x99, 0x96,
+    0x8f, 0x88, 0xb4, 0x85, 0xa8, 0x6a, 0x9e, 0x78, 0xb0, 0x82, 0x9f, 0x89,
+    0xc9, 0x8d, 0x71, 0x7f, 0xc0, 0x98, 0xa0, 0x6d, 0xd2, 0x8e, 0x64, 0x9e,
+    0xb2, 0xa9, 0x93, 0x6e, 0xcc, 0xbb, 0x89, 0xb1, 0xc1, 0x9b, 0x86, 0x94,
+    0xb5, 0xb5, 0x95, 0xa0, 0x9c, 0x9b, 0x62, 0x5f, 0x7b, 0x91, 0x69, 0x74,
+    0x9e, 0xa3, 0x81, 0x30, 0x85, 0x59, 0x49, 0x5e, 0x83, 0x85, 0x7d, 0x6a,
+    0x90, 0x51, 0x80, 0x5e, 0x64, 0x6f, 0x99, 0x93, 0x75, 0x9a, 0xa7, 0x72,
+    0x6c, 0x5d, 0xa3, 0x93, 0x87, 0xa7, 0xbd, 0x6f, 0x92, 0x6d, 0x85, 0x98,
+    0x6f, 0xc7, 0xb6, 0x7c, 0x80, 0x71, 0x8a, 0x9f, 0x71, 0xb5, 0x8c, 0x6d,
+    0xac, 0x7b, 0x72, 0xb7, 0x69, 0xa6, 0x9d, 0x66, 0xab, 0x7a, 0x8b, 0x70,
+    0x8c, 0x9e, 0x86, 0x75, 0x96, 0x7b, 0xa3, 0x93, 0x8f, 0xb7, 0x84, 0x8c,
+    0x87, 0x56, 0xae, 0x82, 0x71, 0xa3, 0x8d, 0x93, 0xaf, 0x59, 0xb3, 0x8a,
+    0x97, 0x99, 0x75, 0x73, 0x8e, 0x51, 0xae, 0x84, 0x8b, 0x7a, 0x76, 0x77,
+    0x6e, 0x75, 0xa4, 0x8a, 0x75, 0x8e, 0x8f, 0xa2, 0x96, 0x76, 0x9a, 0x80,
+    0x96, 0x7d, 0x94, 0x71, 0x8a, 0x90, 0xac, 0x82, 0xa5, 0x61, 0xa3, 0x84,
+    0xac, 0x8f, 0x74, 0x5c, 0xb6, 0x77, 0x8b, 0x9b, 0xb5, 0x8b, 0xb6, 0x52,
+    0xd7, 0xaa, 0x4b, 0x8c, 0xbf, 0xb8, 0x9f, 0x6d, 0xcb, 0xa3, 0x6e, 0x97,
+    0xaa, 0x8d, 0x7c, 0x99, 0xc0, 0xd0, 0x9e, 0xb7, 0x93, 0xaa, 0x5a, 0x6a,
+    0x7d, 0x9a, 0x63, 0x71, 0x78, 0x8c, 0x67, 0x43, 0x87, 0x52, 0x64, 0x68,
+    0x68, 0x9c, 0x65, 0x60, 0x7a, 0x35, 0x68, 0x66, 0x63, 0x69, 0x8d, 0x8f,
+    0x72, 0x9b, 0x99, 0x5b, 0x80, 0x67, 0x93, 0xa2, 0x97, 0x9d, 0x8c, 0x68,
+    0x80, 0x86, 0x96, 0x91, 0x64, 0xbf, 0x98, 0x63, 0x83, 0x85, 0x61, 0x97,
+    0x6a, 0xac, 0xb4, 0x99, 0x8d, 0x7b, 0x7b, 0xad, 0x8b, 0xb2, 0x9e, 0x7f,
+    0x9a, 0x73, 0x91, 0x84, 0x89, 0x9f, 0x8a, 0x87, 0x8b, 0x72, 0x8e, 0x79,
+    0x86, 0xa7, 0x77, 0x84, 0x90, 0x58, 0xb2, 0x90, 0x93, 0xa0, 0x7f, 0x8a,
+    0x91, 0x5a, 0xb1, 0x80, 0x99, 0xc1, 0x80, 0x7d, 0x97, 0x5c, 0x9a, 0x8c,
+    0x71, 0x96, 0x7e, 0x7f, 0xad, 0x7b, 0xb9, 0x8a, 0x84, 0x84, 0x81, 0x97,
+    0x94, 0x64, 0x9f, 0x7e, 0x9b, 0x8d, 0x7d, 0x8d, 0x9a, 0x9e, 0xac, 0x72,
+    0xb2, 0x73, 0x81, 0x84, 0xc8, 0x81, 0x88, 0x72, 0xbe, 0x85, 0x86, 0x97,
+    0xd3, 0x8a, 0xc7, 0x75, 0xce, 0x9c, 0x69, 0xa6, 0xb0, 0xa1, 0x8e, 0x64,
+    0xb1, 0xa6, 0x67, 0xaa, 0xcd, 0x95, 0x97, 0xa2, 0xb2, 0xb2, 0x85, 0x9a,
+    0x9d, 0xa3, 0x5e, 0x73, 0x6e, 0xae, 0x50, 0x83, 0x8c, 0xab, 0x92, 0x43,
+    0x6b, 0x66, 0x43, 0x5c, 0x8f, 0x8a, 0x9a, 0x6c, 0x84, 0x48, 0x80, 0x6b,
+    0x8d, 0x82, 0xaf, 0x89, 0x71, 0x9f, 0xa4, 0x9a, 0x7b, 0x68, 0x91, 0xaa,
+    0x6b, 0xa3, 0x9c, 0x62, 0x8d, 0x6d, 0x87, 0x87, 0x81, 0x9a, 0x97, 0x6c,
+    0x9c, 0x76, 0x63, 0xbc, 0x62, 0xbc, 0xb0, 0x97, 0xa7, 0x81, 0x70, 0x8f,
+    0x7d, 0xb2, 0xa6, 0x98, 0xa1, 0x7b, 0x8e, 0x83, 0x8c, 0xa2, 0x7e, 0x73,
+    0x99, 0x65, 0xc1, 0x77, 0x8e, 0xbc, 0x72, 0xa6, 0x8c, 0x55, 0xab, 0x8e,
+    0x7d, 0xa3, 0x79, 0x80, 0x9e, 0x6b, 0xa9, 0x6c, 0x80, 0xb6, 0x81, 0xa6,
+    0x92, 0x5b, 0xb7, 0x99, 0x81, 0x7e, 0x8e, 0x89, 0x97, 0x86, 0x93, 0x86,
+    0x7b, 0x9a, 0x7f, 0x9a, 0x8e, 0x69, 0xa3, 0xa4, 0x9f, 0x8b, 0x96, 0x6f,
+    0x8b, 0x97, 0xb4, 0x74, 0x96, 0x53, 0x99, 0x91, 0xa7, 0xa8, 0x69, 0x72,
+    0xc9, 0x85, 0x99, 0x93, 0xc0, 0x90, 0xaa, 0x7f, 0xc7, 0x71, 0x74, 0x8d,
+    0xb7, 0xab, 0x91, 0x69, 0xb4, 0x9b, 0x7d, 0x95, 0xc3, 0xb0, 0x9b, 0xa9,
+    0xb3, 0x9f, 0x79, 0xa5, 0x9f, 0xad, 0x6b, 0x85, 0x90, 0xad, 0x69, 0x62,
+    0x7e, 0xa6, 0x69, 0x4e, 0x80, 0x7e, 0x52, 0x57, 0x5f, 0x95, 0x72, 0x4c,
+    0x87, 0x4e, 0x5a, 0x62, 0x7d, 0x70, 0x92, 0x98, 0x76, 0x8e, 0x99, 0x7d,
+    0x73, 0x6d, 0x86, 0x8e, 0x6b, 0x80, 0xa7, 0x9d, 0x91, 0x73, 0x95, 0x70,
+    0x80, 0xc3, 0x9f, 0x8b, 0x72, 0x86, 0x6b, 0xad, 0x76, 0xbe, 0xad, 0x8e,
+    0x9c, 0x78, 0x6a, 0xbf, 0x7d, 0xa8, 0x88, 0x8a, 0x8b, 0x8c, 0x9c, 0x8c,
+    0x8a, 0x85, 0x73, 0x92, 0xa2, 0x7b, 0xa5, 0x96, 0x9b, 0xa3, 0x6c, 0x80,
+    0xa6, 0x63, 0xac, 0x98, 0xa3, 0x9a, 0x83, 0x8a, 0x8c, 0x63, 0xb9, 0x8c,
+    0x99, 0xa1, 0x7a, 0x6c, 0x9e, 0x59, 0x90, 0x84, 0x8a, 0x93, 0x8f, 0x87,
+    0x98, 0x84, 0x99, 0xa4, 0x72, 0x6d, 0x95, 0xa2, 0x95, 0x72, 0xc3, 0x88,
+    0x8f, 0x6a, 0x77, 0x7d, 0x8b, 0xae, 0xa3, 0x7c, 0xa8, 0x5d, 0x7c, 0xa8,
+    0xa1, 0x85, 0x7e, 0x8c, 0xac, 0x8d, 0x73, 0x88, 0xc1, 0x89, 0xaa, 0x89,
+    0xb2, 0x92, 0x75, 0x9a, 0x9c, 0x8e, 0xb9, 0xaa, 0xaa, 0xac, 0x78, 0x85,
+    0xbc, 0x9f, 0x6d, 0xb7, 0x89, 0xa6, 0xb3, 0x8e, 0xa5, 0xbb, 0x6b, 0x9d,
+    0x8f, 0x8b, 0x69, 0x7a, 0x82, 0x99, 0x8c, 0x49, 0x87, 0x74, 0x37, 0x63,
+    0x5d, 0x92, 0x77, 0x66, 0x63, 0x56, 0x77, 0x5d, 0x7f, 0x68, 0x97, 0x74,
+    0x84, 0x94, 0x7d, 0x7d, 0x91, 0x78, 0x87, 0x96, 0x7f, 0x97, 0x94, 0x6f,
+    0x89, 0x6c, 0x96, 0x71, 0x83, 0x8f, 0x8a, 0x89, 0x7d, 0x84, 0x8a, 0xa6,
+    0x7b, 0x95, 0x89, 0x77, 0x94, 0x80, 0x7f, 0x93, 0x5e, 0xbb, 0x9c, 0xa8,
+    0xa2, 0x7e, 0xa6, 0x86, 0x7d, 0x8b, 0x92, 0x73, 0xac, 0x78, 0xaa, 0x98,
+    0xb1, 0x94, 0x79, 0x8b, 0x8f, 0x70, 0xa7, 0xae, 0x92, 0xad, 0xb1, 0x8b,
+    0xb0, 0x78, 0xbc, 0xa9, 0xa4, 0xa3, 0x9e, 0x76, 0x89, 0x67, 0xab, 0x98,
+    0x75, 0x8c, 0x86, 0x95, 0x9e, 0x77, 0x96, 0x85, 0x8c, 0x8e, 0x8b, 0x8a,
+    0x8a, 0x4b, 0x71, 0x8a, 0x9b, 0x6d, 0x6e, 0x89, 0x81, 0x82, 0xa7, 0x98,
+    0xa5, 0x66, 0x72, 0x8b, 0x99, 0x9a, 0x8b, 0x8b, 0x9f, 0x87, 0x79, 0x84,
+    0x99, 0x6d, 0x90, 0x7d, 0x9d, 0xa7, 0x81, 0xa3, 0x9d, 0x96, 0x82, 0x86,
+    0xa2, 0x8e, 0x8d, 0x7f, 0x84, 0x8c, 0x98, 0xbc, 0x83, 0xb4, 0xb5, 0x78,
+    0x7d, 0xab, 0x8d, 0x87, 0x71, 0x8d, 0x6e, 0x8f, 0x89, 0xaa, 0x7c, 0x6f,
+    0x71, 0x69, 0x65, 0x60, 0x81, 0x91, 0x94, 0x6d, 0x76, 0x66, 0x74, 0x5e,
+    0x77, 0x7c, 0xa2, 0xa6, 0x70, 0x90, 0xa3, 0x68, 0x83, 0x69, 0x71, 0x72,
+    0x6c, 0xa9, 0x85, 0x71, 0x88, 0x60, 0x90, 0x84, 0x8a, 0xba, 0x8b, 0x8c,
+    0x72, 0x8f, 0x98, 0x84, 0x8b, 0x8a, 0xb1, 0xa2, 0x93, 0x8d, 0x86, 0x99,
+    0xa2, 0x99, 0xb0, 0xa6, 0x92, 0x78, 0x86, 0x87, 0x9c, 0x9d, 0x6f, 0x92,
+    0x9a, 0x8a, 0xbf, 0xaa, 0xa3, 0xa2, 0x71, 0x8d, 0x93, 0x70, 0xb5, 0x9c,
+    0xa8, 0x97, 0xb4, 0x93, 0xa6, 0x75, 0xbb, 0xa3, 0x92, 0x95, 0x95, 0x94,
+    0x90, 0x5b, 0xbf, 0x92, 0x8a, 0x95, 0xa0, 0xa1, 0x68, 0x7e, 0x9a, 0x7f,
+    0x88, 0xa7, 0x93, 0xa1, 0x7a, 0x93, 0x95, 0x8b, 0x96, 0x94, 0x70, 0xa0,
+    0x70, 0x8f, 0x9d, 0x96, 0x8e, 0x9c, 0x90, 0x9f, 0x7e, 0x83, 0x84, 0x9e,
+    0x7f, 0x65, 0x72, 0x84, 0x64, 0x94, 0x75, 0xa7, 0x62, 0xa3, 0x8a, 0x9b,
+    0x82, 0x99, 0x87, 0x70, 0x81, 0x6d, 0xac, 0x7b, 0x74, 0x68, 0x5d, 0x95,
+    0xa0, 0x6e, 0x84, 0xab, 0x79, 0x8e, 0x8b, 0x79, 0x7b, 0x83, 0xa0, 0x7b,
+    0x96, 0x71, 0x5d, 0xad, 0xa4, 0x82, 0x79, 0x96, 0x73, 0x84, 0x7d, 0x98,
+    0x87, 0x93, 0x86, 0xa6, 0x7f, 0x7c, 0x71, 0x9d, 0xa4, 0x9b, 0x8a, 0x7c,
+    0x87, 0x6a, 0x7f, 0x8d, 0x97, 0x92, 0xa0, 0x88, 0x77, 0x7d, 0x70, 0x9c,
+    0x9f, 0xa0, 0x71, 0xa3, 0x73, 0x95, 0x76, 0x79, 0x94, 0x95, 0x83, 0x8b,
+    0x8d, 0x82, 0x7a, 0x77, 0xa6, 0x88, 0x72, 0x7a, 0x90, 0x76, 0x7f, 0x95,
+    0x83, 0x90, 0x9e, 0x7c, 0x8e, 0x9a, 0x6b, 0xa4, 0x98, 0x9f, 0x86, 0x8c,
+    0x76, 0x70, 0x74, 0x97, 0x7e, 0xa4, 0x5f, 0xa3, 0xa7, 0x7f, 0x67, 0x8d,
+    0x82, 0x95, 0x93, 0x99, 0x82, 0x70, 0x75, 0xa8, 0xa1, 0xaf, 0x8a, 0x8a,
+    0xb0, 0x89, 0x88, 0x6b, 0x98, 0xaf, 0x75, 0x7f, 0x86, 0x90, 0x8f, 0x8c,
+    0x84, 0x8d, 0x7f, 0x8b, 0x94, 0x9f, 0x80, 0x8b, 0x93, 0xa2, 0x98, 0xa5,
+    0x83, 0x81, 0x8a, 0xaa, 0x86, 0xa3, 0xb0, 0xac, 0x64, 0x9c, 0x7c, 0x93,
+    0xac, 0x85, 0x7f, 0x88, 0x7a, 0xa5, 0x75, 0x69, 0x94, 0xa8, 0x95, 0xa9,
+    0x6f, 0x9f, 0x85, 0x8a, 0xa5, 0x97, 0x98, 0xa9, 0x76, 0x80, 0x7e, 0x95,
+    0x89, 0xaf, 0x68, 0x7b, 0xb4, 0x8a, 0x6b, 0xa4, 0x7b, 0x90, 0x79, 0xba,
+    0x9f, 0x82, 0x7d, 0x89, 0x85, 0x82, 0x94, 0xa5, 0x78, 0x8f, 0x6f, 0x71,
+    0x62, 0x66, 0x73, 0x98, 0x8c, 0x7d, 0x81, 0xa2, 0x69, 0x7c, 0x76, 0xa4,
+    0x94, 0x8f, 0x6f, 0x8a, 0x94, 0x8e, 0x8a, 0x88, 0x8c, 0xa3, 0x6f, 0xa2,
+    0x7d, 0x90, 0x8f, 0x96, 0x6c, 0x76, 0x6e, 0x8e, 0x82, 0x85, 0x7f, 0x93,
+    0x81, 0x83, 0x7b, 0x9f, 0x91, 0x89, 0x75, 0x9c, 0x9f, 0x86, 0x7a, 0x8c,
+    0x7a, 0x7b, 0x82, 0xae, 0x6a, 0x7d, 0x82, 0x82, 0xa0, 0x85, 0x99, 0x9f,
+    0x88, 0x8b, 0x8c, 0x8f, 0x90, 0x96, 0x8e, 0x98, 0xa3, 0x87, 0x7f, 0x9b,
+    0x94, 0x73, 0x96, 0x86, 0x72, 0x7c, 0x75, 0x7c, 0x90, 0x79, 0x83, 0x80,
+    0x79, 0x9e, 0x9c, 0x8e, 0x99, 0x8c, 0x7a, 0x9c, 0x8d, 0x99, 0x9d, 0x84,
+    0xa5, 0x93, 0x85, 0x96, 0x88, 0x94, 0x80, 0x90, 0x73, 0xa3, 0x7c, 0xa1,
+    0x88, 0xa4, 0x98, 0x9f, 0x9e, 0x92, 0x6c, 0xa0, 0x84, 0x87, 0x8a, 0x83,
+    0x7b, 0x91, 0x8c, 0x9e, 0x73, 0xa6, 0x93, 0xa0, 0x8d, 0x98, 0x74, 0xa1,
+    0x83, 0x9a, 0x80, 0xbc, 0x62, 0x70, 0x9e, 0xad, 0x9e, 0x8f, 0x8f, 0x9e,
+    0x7e, 0xac, 0xb0, 0xa9, 0x79, 0x6f, 0x79, 0x8f, 0x7e, 0x71, 0x8d, 0xab,
+    0x97, 0x76, 0x86, 0xa2, 0x98, 0x95, 0x8b, 0x9b, 0x75, 0x7a, 0x71, 0x85,
+    0x7f, 0x61, 0x76, 0x8e, 0x99, 0x91, 0x88, 0x73, 0x71, 0x65, 0x82, 0xa0,
+    0x9b, 0x8f, 0x79, 0x70, 0x78, 0x66, 0x85, 0x94, 0x8b, 0x91, 0x75, 0x80,
+    0x9c, 0x94, 0x7f, 0xa5, 0x82, 0x91, 0x7d, 0x76, 0x80, 0x78, 0x83, 0x82,
+    0x79, 0x98, 0x83, 0x87, 0x94, 0x71, 0x73, 0x77, 0x71, 0x94, 0x6a, 0xa8,
+    0x9e, 0x8d, 0x90, 0x78, 0x7a, 0x81, 0x9c, 0x91, 0x96, 0x80, 0x79, 0x83,
+    0x92, 0x9f, 0x8a, 0x84, 0x8e, 0x97, 0x8c, 0x81, 0x87, 0x74, 0x8b, 0x8e,
+    0xa7, 0x86, 0x8b, 0x8a, 0x8e, 0x8f, 0x9b, 0x6b, 0x82, 0x8a, 0x9f, 0x7a,
+    0x96, 0x80, 0x91, 0x94, 0xa6, 0x8e, 0x7a, 0x97, 0x8a, 0x6c, 0xad, 0xa1,
+    0x78, 0x95, 0x9d, 0x9d, 0x88, 0x94, 0x99, 0x86, 0x80, 0x9b, 0x7c, 0x9c,
+    0x87, 0x7a, 0xa0, 0xa8, 0x83, 0x74, 0x8e, 0x9b, 0x65, 0x95, 0x83, 0xc2,
+    0x69, 0x88, 0x87, 0xa7, 0x86, 0x98, 0x9f, 0xc6, 0x5c, 0x7f, 0xb9, 0x9c,
+    0x8b, 0x6e, 0x95, 0xbd, 0x72, 0x83, 0xbf, 0xb1, 0x89, 0x6d, 0x89, 0x8e,
+    0x9d, 0x87, 0x95, 0x92, 0x76, 0x8d, 0x7f, 0x7f, 0x6d, 0x9d, 0x7b, 0x95,
+    0x86, 0x69, 0x90, 0xa0, 0x62, 0x7c, 0x56, 0xa0, 0x9c, 0x8b, 0x81, 0x79,
+    0xa6, 0x73, 0x69, 0xaa, 0x7b, 0x87, 0x8b, 0x7e, 0xa1, 0x9f, 0x6d, 0xa6,
+    0x7e, 0x7e, 0x87, 0x7c, 0xa5, 0x84, 0x7b, 0xa2, 0xae, 0x92, 0x8e, 0x67,
+    0x93, 0x88, 0x8b, 0xa2, 0x8d, 0x96, 0x92, 0x8e, 0x71, 0x7a, 0x82, 0x80,
+    0x9e, 0x8b, 0x7b, 0x87, 0x96, 0xa0, 0xa4, 0x92, 0x88, 0x7e, 0x77, 0x8e,
+    0x91, 0x7e, 0x81, 0x77, 0x79, 0x93, 0x8d, 0x9d, 0x8a, 0x71, 0x8d, 0x88,
+    0x9d, 0x89, 0x85, 0x94, 0x99, 0x80, 0x89, 0x8f, 0x87, 0x81, 0x83, 0x74,
+    0x8a, 0x89, 0x68, 0x7e, 0x99, 0x82, 0x8c, 0x76, 0xc6, 0x8f, 0x90, 0x7d,
+    0x6c, 0x68, 0xbd, 0x90, 0x78, 0x9d, 0x7b, 0xa3, 0x99, 0x76, 0xaf, 0x8d,
+    0x7d, 0x84, 0x7f, 0x9f, 0x8b, 0x7a, 0xaa, 0xa8, 0x79, 0x89, 0x8f, 0x8f,
+    0x71, 0x80, 0x7f, 0xaa, 0x85, 0x70, 0xa8, 0x96, 0x6c, 0x8c, 0xaf, 0xeb,
+    0x57, 0x7e, 0xcf, 0x8d, 0x93, 0x72, 0xa6, 0xd2, 0x52, 0xab, 0xbb, 0xa8,
+    0x8d, 0x82, 0x7a, 0xbc, 0x72, 0x95, 0xa3, 0xa7, 0x8b, 0x74, 0x84, 0x85,
+    0x6a, 0x85, 0x92, 0x9f, 0x91, 0x6b, 0x9b, 0x73, 0x77, 0xa2, 0x7f, 0x81,
+    0x8e, 0x8b, 0x71, 0x8c, 0x7f, 0x60, 0x86, 0x81, 0x9c, 0x86, 0x93, 0x65,
+    0x84, 0x84, 0x89, 0xa2, 0x98, 0x67, 0x88, 0x71, 0x92, 0x80, 0x65, 0xa2,
+    0xa5, 0x99, 0x85, 0x95, 0x8f, 0x85, 0x8f, 0x82, 0x7e, 0x9a, 0x8a, 0x74,
+    0x9d, 0x75, 0x88, 0x7e, 0xa2, 0x77, 0x82, 0x9e, 0x78, 0xa1, 0x74, 0x79,
+    0x7f, 0x87, 0x91, 0x8d, 0x7a, 0x73, 0x96, 0xa2, 0xa3, 0x81, 0x7d, 0x8a,
+    0x85, 0x75, 0x84, 0x81, 0x8b, 0x7f, 0x6c, 0x86, 0x8d, 0x7b, 0x79, 0x78,
+    0x89, 0x85, 0x8c, 0x9a, 0xa6, 0x96, 0x7a, 0x78, 0xa2, 0x85, 0x9b, 0x89,
+    0xc8, 0x97, 0xa3, 0x82, 0x8b, 0x7f, 0xe7, 0x8f, 0x8f, 0x74, 0x75, 0x83,
+    0x87, 0x79, 0xb3, 0xab, 0x70, 0x9a, 0x9a, 0xa6, 0x81, 0x7e, 0xb8, 0x91,
+    0x8b, 0x8d, 0x93, 0xa1, 0x79, 0x7d, 0x81, 0xb4, 0x79, 0x94, 0xa5, 0x89,
+    0x8e, 0x7c, 0x9b, 0xe2, 0x50, 0x94, 0xdf, 0xa0, 0x53, 0x5d, 0x90, 0xde,
+    0x67, 0x90, 0xaf, 0x8a, 0x8f, 0x73, 0x7b, 0xcb, 0x64, 0x9f, 0x91, 0x86,
+    0x95, 0x84, 0x83, 0x88, 0x76, 0x8b, 0x8a, 0x8f, 0x9c, 0x9a, 0x92, 0x96,
+    0x7f, 0x8e, 0x79, 0x80, 0x91, 0x6d, 0x86, 0x59, 0x74, 0x8a, 0x53, 0x88,
+    0xae, 0x7b, 0x80, 0x70, 0x87, 0x74, 0x75, 0x91, 0xa4, 0x74, 0x8d, 0x5a,
+    0x83, 0x95, 0x65, 0xa1, 0xb3, 0x74, 0x87, 0x7d, 0xaa, 0x82, 0x79, 0x78,
+    0x9b, 0x7c, 0x78, 0x74, 0x9e, 0x74, 0x92, 0x92, 0xa3, 0x6e, 0x75, 0x92,
+    0x6a, 0x6f, 0xa3, 0x7c, 0x9e, 0x7f, 0x92, 0x6b, 0x96, 0x79, 0x9a, 0x87,
+    0x83, 0x8c, 0x72, 0x79, 0x6a, 0xa3, 0x79, 0x7d, 0x6d, 0x6c, 0x81, 0x96,
+    0x98, 0x7f, 0x94, 0x81, 0x8a, 0x8a, 0xa7, 0x8c, 0x9a, 0x84, 0xa7, 0x89,
+    0x9d, 0x85, 0xa6, 0xa8, 0xd0, 0x92, 0x97, 0x9f, 0x76, 0x86, 0xe6, 0x6f,
+    0x7c, 0x84, 0x98, 0x8d, 0x80, 0x75, 0xc5, 0x86, 0x6b, 0x8d, 0x9e, 0x9e,
+    0x7f, 0x71, 0x97, 0xa1, 0x75, 0x92, 0xa9, 0x9e, 0x91, 0x5e, 0xa2, 0xa2,
+    0x68, 0xad, 0xa5, 0xa0, 0x7e, 0x68, 0xac, 0xdc, 0x50, 0xa2, 0xc1, 0x8a,
+    0x63, 0x74, 0x7e, 0xd9, 0x3f, 0xbb, 0xba, 0x9d, 0x7f, 0x76, 0x5f, 0xb0,
+    0x74, 0x8e, 0xb1, 0x95, 0x9a, 0x81, 0x63, 0x9f, 0x98, 0x74, 0x80, 0x89,
+    0x95, 0x8e, 0x9e, 0x78, 0x87, 0x82, 0x57, 0x87, 0x8d, 0x90, 0x79, 0x80,
+    0x76, 0x7c, 0x7d, 0x8a, 0xa6, 0x82, 0x98, 0x7a, 0x96, 0x97, 0x84, 0x87,
+    0xab, 0x7f, 0x87, 0x57, 0x83, 0x6a, 0x6a, 0x84, 0x9c, 0x8d, 0x74, 0x68,
+    0xa2, 0x92, 0x90, 0x98, 0x98, 0x8b, 0x6d, 0x72, 0x90, 0x8c, 0x7c, 0x7d,
+    0x9b, 0x6e, 0x71, 0x76, 0x6b, 0x7b, 0x63, 0x81, 0xad, 0x71, 0x78, 0x8e,
+    0x74, 0x87, 0x8e, 0x8a, 0xab, 0x8e, 0x83, 0x85, 0x7d, 0xa0, 0x67, 0x7f,
+    0x9c, 0x74, 0x6b, 0x88, 0x66, 0x92, 0x7f, 0x83, 0x94, 0x92, 0xa5, 0x82,
+    0xa1, 0x7b, 0x6f, 0x70, 0xab, 0x72, 0xb5, 0x91, 0xb7, 0x89, 0x91, 0x77,
+    0x77, 0x8a, 0xdb, 0x88, 0x8a, 0x8d, 0x89, 0x6c, 0x7b, 0x83, 0xc8, 0xb5,
+    0x4b, 0x96, 0x8b, 0x92, 0x91, 0x76, 0xa9, 0xae, 0x70, 0xa8, 0x74, 0x9d,
+    0x96, 0x6d, 0xa1, 0xba, 0x86, 0xbc, 0xbc, 0xa2, 0x8d, 0x6c, 0x96, 0xd8,
+    0x71, 0xb1, 0xae, 0xb0, 0x79, 0x7b, 0x71, 0xd8, 0x32, 0xaa, 0xae, 0xa7,
+    0x7c, 0x6b, 0x77, 0xc0, 0x7c, 0x9e, 0x9f, 0x89, 0x92, 0x8a, 0x76, 0xae,
+    0x97, 0x75, 0x87, 0x8c, 0x7f, 0x86, 0x8b, 0x73, 0x6b, 0x64, 0x87, 0x6d,
+    0x99, 0x8f, 0x8d, 0x66, 0x76, 0x87, 0x6d, 0x6e, 0x98, 0x7a, 0x91, 0x92,
+    0x8c, 0x7c, 0x89, 0x9b, 0x9e, 0x83, 0x86, 0x62, 0x90, 0x6e, 0x62, 0x82,
+    0xa3, 0x7e, 0x86, 0x6a, 0x93, 0x9b, 0x73, 0x6c, 0xa8, 0x99, 0x73, 0x99,
+    0x8c, 0x89, 0x85, 0x67, 0x98, 0x78, 0x63, 0x98, 0x77, 0xa6, 0x6e, 0x81,
+    0xa4, 0x64, 0x8f, 0x8a, 0x7f, 0x9b, 0x91, 0x91, 0x94, 0x82, 0x8b, 0x8b,
+    0x76, 0x66, 0x83, 0x81, 0x94, 0x71, 0x82, 0x9e, 0x93, 0x85, 0x80, 0x8c,
+    0xae, 0x94, 0x96, 0x74, 0x91, 0x9a, 0x6f, 0x9e, 0xa9, 0x76, 0xab, 0x8e,
+    0xd6, 0x9c, 0x7d, 0x98, 0x83, 0x6e, 0xfe, 0x83, 0x71, 0x82, 0x9f, 0x93,
+    0x7b, 0x67, 0xcb, 0xb9, 0x66, 0x89, 0x99, 0x8a, 0xac, 0x8c, 0xa0, 0x9c,
+    0x70, 0xaf, 0x81, 0x88, 0x9c, 0x7e, 0xa8, 0xa5, 0x65, 0x8c, 0xa1, 0x8c,
+    0x83, 0x85, 0x9d, 0xcb, 0x4b, 0xc1, 0xb5, 0xa2, 0x75, 0x63, 0x75, 0xbd,
+    0x34, 0xae, 0xca, 0xa2, 0x89, 0x7a, 0x69, 0xb0, 0x70, 0xae, 0x94, 0x76,
+    0x85, 0x93, 0x6a, 0x90, 0x6a, 0x8a, 0xac, 0x71, 0x7e, 0x81, 0xa2, 0x71,
+    0x98, 0x86, 0x99, 0x76, 0x8f, 0x6f, 0x90, 0x93, 0x7c, 0x72, 0x81, 0x8c,
+    0x78, 0x77, 0x97, 0x84, 0x98, 0x70, 0x96, 0x9a, 0x9b, 0x93, 0x92, 0x5f,
+    0xaa, 0x88, 0x5b, 0x74, 0xaa, 0x96, 0x6a, 0x73, 0x87, 0x83, 0x72, 0x89,
+    0xab, 0x8a, 0x5f, 0x71, 0xa4, 0x94, 0x92, 0x60, 0x96, 0x7b, 0x53, 0x88,
+    0x69, 0x8b, 0x5e, 0x7b, 0xa0, 0x83, 0x70, 0x95, 0x6d, 0x9b, 0x6d, 0x98,
+    0x99, 0x86, 0x6e, 0x7a, 0x87, 0x86, 0x68, 0x8a, 0x7e, 0x87, 0x90, 0x7d,
+    0x76, 0x93, 0x80, 0x8a, 0x8f, 0x97, 0xac, 0x71, 0xa2, 0x96, 0x7f, 0x8e,
+    0xc2, 0x71, 0xab, 0xa9, 0xd1, 0x85, 0x8c, 0x74, 0x70, 0x72, 0xff, 0x77,
+    0x6d, 0x77, 0x91, 0x5d, 0x71, 0x5d, 0xb2, 0xb1, 0x38, 0x76, 0xa6, 0x80,
+    0x91, 0x86, 0xa3, 0x9c, 0x85, 0x95, 0x99, 0xab, 0x8a, 0x6e, 0x9f, 0xa6,
+    0x75, 0xa9, 0xb3, 0x97, 0x69, 0x85, 0xa4, 0xc9, 0x59, 0xb4, 0xca, 0x8d,
+    0x5c, 0x67, 0x7d, 0xcd, 0x29, 0xca, 0xdb, 0x8c, 0x86, 0x8c, 0x70, 0xaa,
+    0x5c, 0x9e, 0x98, 0x86, 0x92, 0x7e, 0x6b, 0x8e, 0x8f, 0x6a, 0x84, 0x71,
+    0x9a, 0x76, 0x87, 0x84, 0x8b, 0x7f, 0x7f, 0x6e, 0xa3, 0x83, 0x85, 0x78,
+    0x6f, 0x7c, 0x6f, 0x96, 0x95, 0x8c, 0xa3, 0x72, 0x92, 0x66, 0x7b, 0x99,
+    0x9c, 0x9c, 0x9a, 0x63, 0xaa, 0x81, 0x7f, 0x90, 0x8c, 0xa0, 0x7e, 0x67,
+    0x94, 0x96, 0x7f, 0x8a, 0x95, 0x91, 0x5c, 0x73, 0x88, 0x9b, 0x85, 0x70,
+    0x87, 0x79, 0x56, 0x92, 0x69, 0x95, 0x62, 0x78, 0x93, 0x83, 0x63, 0x98,
+    0x7a, 0xa4, 0x95, 0x7c, 0x8e, 0x69, 0x86, 0x92, 0x7d, 0x6b, 0x69, 0x85,
+    0xa8, 0x90, 0x7c, 0x7b, 0x9e, 0x87, 0x7b, 0x90, 0x98, 0x7a, 0xa4, 0x92,
+    0xad, 0x97, 0xa0, 0x6d, 0xa6, 0x74, 0xb7, 0x7f, 0xb9, 0x94, 0x6c, 0x77,
+    0x65, 0x6f, 0xfc, 0x7d, 0x68, 0x74, 0xa1, 0x6c, 0x71, 0x61, 0xc3, 0xb5,
+    0x60, 0x86, 0x8b, 0x7d, 0x89, 0x8b, 0x93, 0xa4, 0x68, 0xa0, 0x8f, 0x73,
+    0x96, 0x6e, 0x81, 0x99, 0x81, 0x9d, 0xae, 0x93, 0x6a, 0x8b, 0x9a, 0xcb,
+    0x68, 0xaf, 0xca, 0x81, 0x73, 0x6e, 0x70, 0xd7, 0x49, 0xb9, 0xc5, 0x9d,
+    0x87, 0x8d, 0x61, 0xa8, 0x5e, 0xa4, 0xb7, 0xab, 0x96, 0x84, 0x76, 0x98,
+    0x84, 0x99, 0x8f, 0x70, 0x79, 0x94, 0xa5, 0x87, 0x6e, 0x73, 0x63, 0x7e,
+    0x83, 0x8c, 0x88, 0x71, 0x7a, 0x81, 0x7d, 0x94, 0x92, 0x89, 0xab, 0x7a,
+    0x96, 0x66, 0x7b, 0x8b, 0x8f, 0x8e, 0x94, 0x5b, 0xa0, 0x7f, 0x82, 0x84,
+    0x84, 0x80, 0x7d, 0x81, 0x89, 0x7b, 0x97, 0x78, 0x83, 0x93, 0x4c, 0x95,
+    0x7f, 0x93, 0x8e, 0x70, 0x89, 0x81, 0x69, 0x87, 0x76, 0x73, 0x9a, 0x74,
+    0xa2, 0x88, 0x5e, 0xac, 0x74, 0x8e, 0x74, 0x8e, 0x94, 0x85, 0x7b, 0x7a,
+    0x72, 0x82, 0x68, 0x77, 0x96, 0x8a, 0x7b, 0x6c, 0x88, 0x8b, 0x6b, 0x86,
+    0xa4, 0x88, 0xac, 0xa1, 0x90, 0x8e, 0x85, 0x6d, 0xb1, 0x69, 0xb1, 0xa2,
+    0xbe, 0x9a, 0x7c, 0xb4, 0x63, 0x56, 0xf2, 0x90, 0x5e, 0x71, 0xa3, 0x6a,
+    0x8b, 0x67, 0xbe, 0xa8, 0x6e, 0x8b, 0x90, 0x83, 0xa0, 0x78, 0x9f, 0xa5,
+    0x65, 0xa3, 0x8b, 0x94, 0x84, 0x6c, 0xa5, 0x97, 0x7d, 0xa7, 0x9f, 0x9c,
+    0x62, 0x7d, 0xb5, 0xb1, 0x58, 0x98, 0xba, 0x8d, 0x7f, 0x57, 0x86, 0xc5,
+    0x39, 0xb3, 0xc9, 0xa9, 0x89, 0x8e, 0x55, 0xaf, 0x54, 0xb4, 0xb0, 0x8f,
+    0x8b, 0x7c, 0x6e, 0x8e, 0x96, 0x90, 0x8a, 0x83, 0x84, 0x8c, 0x96, 0x7f,
+    0x89, 0x67, 0x99, 0x60, 0x74, 0x8d, 0x9b, 0x82, 0x6f, 0x61, 0x84, 0x9a,
+    0x7c, 0x85, 0x86, 0x7c, 0x9b, 0x5f, 0x81, 0x96, 0x90, 0x9b, 0xa0, 0x58,
+    0xaf, 0x78, 0x81, 0x8f, 0x96, 0x81, 0x77, 0x7d, 0xa2, 0x85, 0x74, 0x84,
+    0x99, 0x8d, 0x5f, 0x77, 0x8a, 0x8c, 0x85, 0x78, 0x8f, 0x80, 0x5c, 0x6f,
+    0x77, 0x73, 0x80, 0x99, 0x83, 0x89, 0x6f, 0x8e, 0x85, 0x7e, 0x6c, 0x81,
+    0x99, 0x89, 0x69, 0x70, 0x8c, 0x8f, 0x6b, 0x89, 0x80, 0x7a, 0x83, 0x7a,
+    0x96, 0x99, 0x73, 0x76, 0x9c, 0x67, 0xab, 0xab, 0xbd, 0x8b, 0x85, 0x90,
+    0xb0, 0x6b, 0xbd, 0x9c, 0xb9, 0xa0, 0x7c, 0x7d, 0x66, 0x78, 0xdb, 0x97,
+    0x55, 0x67, 0x96, 0x69, 0x80, 0x49, 0xc1, 0xbb, 0x6c, 0x91, 0x8a, 0x92,
+    0x9a, 0x98, 0xa5, 0x98, 0x51, 0xa6, 0x99, 0x8e, 0x73, 0x73, 0x9d, 0x9f,
+    0x77, 0xa6, 0xa4, 0x92, 0x64, 0x75, 0xac, 0xb2, 0x5d, 0xa1, 0xab, 0xa4,
+    0x5a, 0x5b, 0xb3, 0xb7, 0x2d, 0xca, 0xc8, 0x76, 0x94, 0x8e, 0x59, 0xb0,
+    0x52, 0x9d, 0xbd, 0x89, 0x97, 0x84, 0x5d, 0x9a, 0x87, 0x9b, 0x94, 0x6c,
+    0x7b, 0xaa, 0x8a, 0x8b, 0x79, 0x5d, 0x90, 0x5c, 0x8b, 0x7b, 0xbe, 0x68,
+    0x84, 0x6f, 0x75, 0x72, 0x98, 0x82, 0x92, 0x7a, 0xa2, 0x6e, 0x7b, 0x7d,
+    0x9c, 0x99, 0x97, 0x5d, 0x9b, 0x69, 0x80, 0xa3, 0x96, 0x8d, 0x7c, 0x82,
+    0xa3, 0x76, 0x95, 0x67, 0x93, 0x8e, 0x62, 0x7b, 0x78, 0x96, 0x69, 0x67,
+    0x84, 0x8f, 0x62, 0x80, 0x88, 0x7e, 0x6c, 0x94, 0xab, 0x8b, 0x82, 0x9e,
+    0x7e, 0x8c, 0x70, 0x83, 0x9c, 0x9c, 0x80, 0x87, 0x8f, 0xa1, 0x7f, 0x81,
+    0x95, 0x83, 0x6d, 0x7a, 0xa0, 0x77, 0x6d, 0x76, 0x91, 0x7e, 0xa3, 0x62,
+    0xa0, 0x93, 0x7e, 0x97, 0xb6, 0x6c, 0xad, 0x72, 0xb2, 0x95, 0x73, 0x83,
+    0x62, 0x56, 0xe2, 0x99, 0x6e, 0x66, 0xb0, 0x6c, 0x75, 0x4e, 0xb2, 0xc7,
+    0x51, 0x98, 0x90, 0x8c, 0x82, 0x63, 0xa8, 0x99, 0x54, 0xc1, 0x87, 0x80,
+    0x79, 0x62, 0xad, 0x81, 0x76, 0x99, 0xa9, 0x9b, 0x4e, 0x8c, 0xaf, 0xb6,
+    0x5d, 0x9b, 0xb4, 0x9f, 0x6d, 0x60, 0xa5, 0xb5, 0x3e, 0xb2, 0xc4, 0x96,
+    0x86, 0x6d, 0x48, 0x99, 0x50, 0xc1, 0xa8, 0x93, 0x8a, 0x92, 0x7d, 0x8f,
+    0x74, 0x87, 0x91, 0x71, 0x8c, 0x87, 0x90, 0x80, 0x80, 0x82, 0x7b, 0x85,
+    0x81, 0x7f, 0xa7, 0x6a, 0x78, 0x4e, 0x90, 0x85, 0x9f, 0x93, 0x91, 0x91,
+    0xa5, 0x6e, 0x9d, 0xa7, 0x9e, 0x7f, 0x9a, 0x66, 0xbe, 0x6f, 0x82, 0x81,
+    0x85, 0x86, 0x89, 0x6c, 0x88, 0x92, 0x6d, 0x6a, 0x8c, 0x95, 0x68, 0x70,
+    0x91, 0x9b, 0x76, 0x59, 0x87, 0x93, 0x6f, 0x79, 0x7a, 0x99, 0x7d, 0x76,
+    0xa3, 0x9c, 0x69, 0x75, 0x8f, 0x8e, 0x7e, 0x7a, 0x80, 0x8b, 0x76, 0x82,
+    0x70, 0x71, 0x77, 0x7a, 0x88, 0xa1, 0x79, 0x75, 0x9e, 0x7e, 0x6d, 0x6f,
+    0xa5, 0x84, 0xb1, 0x77, 0xad, 0x94, 0x98, 0x90, 0xa7, 0x5c, 0xb6, 0x84,
+    0x99, 0x91, 0x71, 0x7b, 0x6d, 0x54, 0xd2, 0x84, 0x5d, 0x75, 0xb4, 0x7e,
+    0x7d, 0x53, 0xc5, 0x98, 0x70, 0xaa, 0x9e, 0x81, 0x7d, 0x68, 0xa7, 0x8d,
+    0x63, 0xab, 0x9b, 0x96, 0x7e, 0x6b, 0xa3, 0x9e, 0x6d, 0x98, 0xaf, 0x9b,
+    0x78, 0x74, 0xae, 0xc7, 0x70, 0x98, 0xd4, 0x9a, 0x6e, 0x75, 0xa2, 0xcd,
+    0x42, 0xb0, 0xc9, 0x89, 0x88, 0x77, 0x6a, 0xa4, 0x66, 0xb5, 0xbc, 0x8a,
+    0x96, 0x87, 0x5e, 0xa5, 0x87, 0x95, 0x91, 0x5d, 0x85, 0x91, 0xaa, 0x8f,
+    0x99, 0x78, 0x79, 0x74, 0x7f, 0x81, 0xa1, 0x74, 0x77, 0x64, 0x6c, 0x94,
+    0xa0, 0x8b, 0x9b, 0x8e, 0xac, 0x6a, 0x98, 0x9c, 0x7a, 0x9f, 0xab, 0x7e,
+    0xa3, 0x8b, 0x68, 0x7f, 0x84, 0x9f, 0x93, 0x77, 0x90, 0x98, 0x8f, 0x87,
+    0x81, 0x8e, 0x76, 0x95, 0x66, 0x78, 0x85, 0x79, 0x95, 0x89, 0x64, 0x8e,
+    0x8a, 0x87, 0x6f, 0x65, 0xa4, 0x98, 0x7a, 0x83, 0x85, 0x7e, 0x6b, 0xaa,
+    0x81, 0x94, 0x7c, 0x6e, 0x78, 0x85, 0x87, 0x6d, 0x7a, 0x92, 0x67, 0x7a,
+    0x8d, 0x95, 0x77, 0x7f, 0x9f, 0x71, 0xb1, 0xa1, 0xb2, 0x91, 0x7f, 0xb0,
+    0xac, 0x5c, 0xaf, 0x6a, 0xae, 0x98, 0x63, 0x7e, 0x67, 0x6f, 0xc4, 0x8a,
+    0x75, 0x61, 0xac, 0x73, 0x86, 0x54, 0xc3, 0xa8, 0x5d, 0xa9, 0xb4, 0x9b,
+    0x80, 0x6d, 0xa1, 0x8d, 0x64, 0xaa, 0x86, 0x96, 0x86, 0x6c, 0x9b, 0x8b,
+    0x73, 0x9f, 0x9a, 0x87, 0x64, 0x6c, 0xad, 0xa6, 0x64, 0x8a, 0xbe, 0x88,
+    0x67, 0x67, 0xaf, 0xb0, 0x71, 0xae, 0xde, 0x95, 0x9f, 0x7c, 0x7d, 0xa1,
+    0x79, 0xb8, 0xaa, 0x9c, 0x84, 0x91, 0x6b, 0xac, 0x74, 0xa1, 0xad, 0x74,
+    0x88, 0x93, 0x94, 0x72, 0x97, 0x7a, 0x78, 0x86, 0x76, 0x93, 0xb1, 0x6f,
+    0x91, 0x44, 0x96, 0x8e, 0x8e, 0xa5, 0x9a, 0x70, 0x99, 0x79, 0x84, 0x82,
+    0x7f, 0x78, 0xac, 0x6f, 0x9c, 0x80, 0x7d, 0x87, 0x7f, 0x9d, 0x6a, 0x71,
+    0x7c, 0x92, 0x78, 0x7a, 0x93, 0x90, 0x55, 0x83, 0x7a, 0x8a, 0x9a, 0x65,
+    0x86, 0x9b, 0x7c, 0x6b, 0xa3, 0x85, 0x86, 0x71, 0xab, 0x9a, 0x86, 0x90,
+    0x86, 0x88, 0x88, 0x88, 0x99, 0x98, 0x77, 0x86, 0x88, 0x90, 0x79, 0x7c,
+    0x6e, 0x9f, 0x76, 0x70, 0x84, 0x67, 0x7e, 0x8b, 0xa5, 0x68, 0xa7, 0x9d,
+    0xb5, 0x9b, 0x8b, 0x8a, 0xc0, 0x60, 0x9e, 0x83, 0xb0, 0xb7, 0x65, 0x7f,
+    0x7a, 0x7e, 0xc3, 0x7b, 0x74, 0x8f, 0xa4, 0x68, 0x5f, 0x47, 0xbb, 0xa4,
+    0x74, 0x95, 0xab, 0x80, 0x70, 0x5c, 0x9a, 0x8a, 0x7d, 0xa5, 0x90, 0x7d,
+    0x86, 0x68, 0xb1, 0x73, 0x6d, 0xad, 0x93, 0x8d, 0x7b, 0x64, 0xbd, 0xae,
+    0x7a, 0x98, 0xcb, 0x97, 0x83, 0x67, 0xab, 0xb0, 0x61, 0xa7, 0xcd, 0x7e,
+    0x87, 0x78, 0x76, 0x95, 0x6a, 0xba, 0xa9, 0x84, 0x8f, 0x95, 0x7c, 0x8b,
+    0x90, 0x89, 0x8b, 0x81, 0x87, 0x8b, 0x76, 0x73, 0x6f, 0x61, 0x94, 0x73,
+    0x83, 0x97, 0xb3, 0x6b, 0x9c, 0x55, 0x7f, 0x96, 0x9a, 0x92, 0x85, 0x52,
+    0xc6, 0x73, 0x88, 0x9c, 0x7c, 0x86, 0x98, 0x6d, 0x99, 0x87, 0x80, 0x7c,
+    0x7d, 0x98, 0x74, 0x7c, 0x89, 0x8a, 0x7d, 0x7b, 0x83, 0x90, 0x7d, 0x81,
+    0x7a, 0xa0, 0x86, 0x5f, 0x74, 0x8e, 0x68, 0x7b, 0x6c, 0x86, 0x90, 0x84,
+    0x7e, 0xae, 0x73, 0x6f, 0x8d, 0x81, 0x7c, 0x93, 0xa0, 0xb3, 0x6b, 0x9a,
+    0x88, 0xab, 0x8a, 0x94, 0x9c, 0x87, 0x9c, 0x75, 0x7d, 0x8f, 0x7c, 0x7f,
+    0x9b, 0x69, 0xa8, 0x99, 0x9d, 0x89, 0x8f, 0x72, 0xba, 0x61, 0xac, 0x91,
+    0xb5, 0xa7, 0x84, 0x99, 0x71, 0x7e, 0xd0, 0x7c, 0x6d, 0x66, 0xb6, 0x72,
+    0x79, 0x61, 0xb6, 0xab, 0x69, 0xa0, 0xaa, 0x7d, 0x74, 0x61, 0x95, 0xa5,
+    0x71, 0xb0, 0x93, 0x95, 0x86, 0x7d, 0x9f, 0x7e, 0x6c, 0x97, 0x85, 0x87,
+    0x72, 0x7b, 0xb4, 0xad, 0x84, 0x7b, 0xcd, 0xa9, 0x7e, 0x6d, 0xc8, 0xc7,
+    0x7e, 0xb7, 0xcf, 0x98, 0x7b, 0x7c, 0x69, 0xaf, 0x64, 0xa6, 0xc1, 0x8e,
+    0x8f, 0x9c, 0x7d, 0x93, 0x7a, 0x96, 0x8a, 0x65, 0x92, 0x95, 0x8d, 0x6f,
+    0x9f, 0x7f, 0x65, 0x69, 0x7a, 0x92, 0x9f, 0x5c, 0x90, 0x4e, 0x69, 0x89,
+    0x8f, 0x9c, 0xa8, 0x7a, 0xb6, 0x7d, 0x84, 0x97, 0x7f, 0x91, 0x8d, 0x71,
+    0xae, 0x86, 0x80, 0x78, 0x81, 0x87, 0x6e, 0x88, 0x87, 0x7f, 0x8f, 0x9d,
+    0x78, 0x91, 0x74, 0x91, 0x7f, 0x7a, 0x80, 0x63, 0x93, 0xa0, 0x7f, 0x6f,
+    0xa3, 0x88, 0x76, 0x5c, 0x6e, 0xa1, 0x6e, 0x7f, 0x84, 0x8b, 0x87, 0x6d,
+    0x87, 0x9f, 0x79, 0x7c, 0x83, 0x89, 0x7e, 0x86, 0xa0, 0x82, 0x80, 0x8e,
+    0x8b, 0x6c, 0x6e, 0x69, 0x9f, 0x79, 0xaa, 0x6e, 0xa2, 0x8f, 0x9d, 0x87,
+    0xb4, 0x5d, 0xba, 0x6c, 0xaf, 0xa0, 0x84, 0x87, 0x8c, 0x89, 0xcb, 0x6f,
+    0x8e, 0x71, 0xae, 0x5d, 0x6c, 0x61, 0xb3, 0xaf, 0x7a, 0x94, 0xb1, 0x8a,
+    0x80, 0x65, 0x8a, 0x9d, 0x61, 0xb6, 0x8b, 0x97, 0x8a, 0x73, 0xa8, 0x82,
+    0x74, 0x8a, 0x9c, 0x73, 0x61, 0x69, 0xb8, 0x9f, 0x76, 0x90, 0xc5, 0xaa,
+    0x6b, 0x5f, 0xb7, 0xce, 0x6d, 0xb7, 0xcc, 0x97, 0x7a, 0x81, 0x95, 0xbe,
+    0x78, 0xb1, 0xb4, 0x97, 0x8e, 0x99, 0x70, 0xa2, 0x72, 0x8d, 0x8e, 0x7d,
+    0x90, 0x9f, 0x7b, 0x63, 0x87, 0x89, 0x7a, 0x5f, 0x81, 0x97, 0x8d, 0x78,
+    0x94, 0x64, 0x95, 0x9d, 0x90, 0x87, 0xb3, 0x6e, 0xc2, 0x80, 0x94, 0x86,
+    0x87, 0x93, 0xb3, 0x57, 0xb8, 0x73, 0x8a, 0x81, 0x6f, 0x95, 0x89, 0x82,
+    0x94, 0x7a, 0x8e, 0x97, 0x8a, 0x91, 0x7f, 0x77, 0x98, 0x72, 0x67, 0x5f,
+    0x7b, 0x8d, 0x78, 0x74, 0x91, 0x82, 0x86, 0x5c, 0x88, 0xa3, 0x73, 0x6f,
+    0x92, 0x78, 0x9c, 0x95, 0x99, 0x9d, 0x70, 0x89, 0x8f, 0xa7, 0x74, 0x89,
+    0x77, 0x90, 0x72, 0x8d, 0x9c, 0x6f, 0x7a, 0x6c, 0x9f, 0x72, 0xad, 0x6c,
+    0xa5, 0x7a, 0x9d, 0x78, 0xa4, 0x52, 0xbd, 0x94, 0xb5, 0x97, 0x75, 0x78,
+    0x86, 0x72, 0xdf, 0x6f, 0x98, 0x81, 0xab, 0x5d, 0x62, 0x65, 0x9d, 0xbc,
+    0x68, 0x8a, 0xc1, 0x7e, 0x67, 0x7f, 0x88, 0x95, 0x7f, 0xbd, 0x9c, 0x77,
+    0x7d, 0x7e, 0x96, 0x7c, 0x7f, 0xa1, 0xa4, 0x90, 0x7c, 0x74, 0xc0, 0xac,
+    0x7d, 0xa1, 0xdb, 0x85, 0x85, 0x51, 0xbc, 0xb1, 0x6c, 0xcb, 0xd1, 0xa7,
+    0x76, 0x70, 0x7d, 0xba, 0x88, 0xb6, 0xaf, 0xa2, 0x9d, 0x9b, 0x71, 0x96,
+    0x80, 0x89, 0xa3, 0x86, 0x89, 0x8f, 0x76, 0x77, 0xa9, 0x82, 0x8f, 0x69,
+    0x7f, 0x9d, 0xac, 0x80, 0x98, 0x6c, 0x70, 0x72, 0x81, 0x8b, 0xaf, 0x80,
+    0xb1, 0x6f, 0x7c, 0x90, 0x91, 0x82, 0xa5, 0x67, 0x9c, 0x76, 0x8c, 0x6b,
+    0x9c, 0x9b, 0x87, 0x8c, 0x8e, 0x8b, 0xb0, 0x9d, 0x89, 0x8f, 0x76, 0x87,
+    0x9b, 0x90, 0x8e, 0x74, 0x73, 0x91, 0x85, 0x80, 0x81, 0x72, 0x99, 0x84,
+    0x87, 0x95, 0x84, 0x8c, 0x8a, 0x6e, 0x8c, 0x82, 0xad, 0x9d, 0x80, 0x7f,
+    0x96, 0x9c, 0x7f, 0x67, 0xb0, 0x98, 0x69, 0x84, 0x94, 0xa9, 0x7e, 0x83,
+    0x9d, 0x62, 0x92, 0x6e, 0x95, 0x88, 0xa4, 0x90, 0x97, 0x4d, 0xae, 0x89,
+    0xb6, 0xa1, 0x88, 0x9f, 0x7a, 0x70, 0xc2, 0x71, 0x7f, 0x83, 0x90, 0x83,
+    0x5e, 0x50, 0xa9, 0x9f, 0x73, 0x8c, 0xb2, 0x80, 0x79, 0x65, 0x7c, 0x90,
+    0x6d, 0x9a, 0x91, 0x8d, 0x6f, 0x65, 0x97, 0x87, 0x82, 0xa0, 0xa4, 0x8c,
+    0x68, 0x76, 0xa8, 0xa2, 0x7f, 0xa4, 0xcd, 0x91, 0x70, 0x54, 0x95, 0xc6,
+    0x6e, 0x9c, 0xe2, 0xa1, 0x86, 0x82, 0x73, 0xbc, 0x89, 0xaa, 0xb2, 0x7d,
+    0x82, 0x84, 0x8b, 0x9e, 0x84, 0x94, 0xa0, 0x7a, 0x98, 0x9d, 0x99, 0x7b,
+    0x7b, 0x89, 0x8f, 0x66, 0x89, 0x9b, 0xa7, 0x8b, 0x9b, 0x62, 0x9b, 0x78,
+    0x8b, 0x95, 0xbd, 0x7a, 0x9e, 0x61, 0x80, 0x84, 0x89, 0x8e, 0xb4, 0x7b,
+    0xb8, 0x70, 0x75, 0x8e, 0x7b, 0x9c, 0x9e, 0x9f, 0x89, 0x86, 0x9b, 0x7a,
+    0x7b, 0x95, 0x83, 0x95, 0x80, 0x94, 0x85, 0x65, 0x8c, 0x81, 0x67, 0x77,
+    0x94, 0x8a, 0x92, 0x74, 0x72, 0x90, 0x6b, 0x74, 0x7e, 0x75, 0x71, 0x84,
+    0x9e, 0xa6, 0x64, 0x80, 0x8d, 0x7a, 0x8c, 0x82, 0x98, 0x96, 0x64, 0x7d,
+    0x8b, 0x82, 0x6a, 0x7f, 0x97, 0x4e, 0x91, 0x74, 0x94, 0x99, 0x6d, 0x6a,
+    0xb3, 0x5a, 0xb8, 0x64, 0xa3, 0x95, 0x5d, 0x95, 0x90, 0x87, 0xcc, 0x72,
+    0x85, 0x85, 0x8f, 0x55, 0x6f, 0x65, 0x84, 0xb6, 0x7b, 0x77, 0xce, 0x79,
+    0x82, 0x59, 0x8a, 0xa2, 0x68, 0x9b, 0xa3, 0x81, 0x9c, 0x7a, 0x97, 0x87,
+    0x6b, 0x8c, 0x9c, 0xaa, 0x5c, 0x69, 0xb8, 0xb7, 0x7c, 0xa0, 0xb5, 0x92,
+    0x8d, 0x67, 0x96, 0xd2, 0x77, 0xa6, 0xd9, 0xad, 0xaa, 0x79, 0x90, 0xc9,
+    0x81, 0xbf, 0xd0, 0x8d, 0x9d, 0x88, 0x9c, 0x91, 0x90, 0x94, 0x89, 0x8a,
+    0x91, 0x9b, 0x89, 0x79, 0x92, 0x80, 0x8f, 0x7b, 0x7e, 0x8b, 0xb1, 0x85,
+    0xa4, 0x5a, 0xb4, 0x7a, 0xa7, 0x8c, 0xa4, 0x75, 0xb9, 0x66, 0x93, 0x86,
+    0x8a, 0x87, 0xad, 0x64, 0xa2, 0x7e, 0x99, 0x9f, 0x81, 0xa2, 0x9b, 0x88,
+    0x9e, 0xa2, 0xb9, 0x8a, 0x78, 0x84, 0x91, 0x8e, 0x8b, 0x90, 0x83, 0x80,
+    0x64, 0x93, 0x77, 0x89, 0x81, 0x86, 0x96, 0x7a, 0x81, 0xab, 0x6d, 0x73,
+    0x7d, 0x7e, 0xaa, 0x85, 0x95, 0xac, 0x8b, 0x89, 0x8b, 0x77, 0xa3, 0x8b,
+    0xa3, 0xa0, 0x87, 0x86, 0x7a, 0x74, 0x6f, 0x7c, 0x90, 0x58, 0xa2, 0x64,
+    0x94, 0x8b, 0xa0, 0x88, 0xab, 0x53, 0xce, 0x67, 0xb7, 0x7f, 0x8d, 0x69,
+    0x84, 0x74, 0xaf, 0x72, 0xab, 0x70, 0x8f, 0x6e, 0x5d, 0x61, 0x96, 0xa1,
+    0x7b, 0x6f, 0xa2, 0x75, 0x8f, 0x5d, 0x93, 0x72, 0x82, 0x97, 0x76, 0x65,
+    0x7e, 0x96, 0xb3, 0x8b, 0x8d, 0x89, 0x8f, 0x7b, 0x6f, 0x71, 0xa1, 0x9e,
+    0x91, 0x7c, 0xc9, 0x9f, 0x7c, 0x71, 0xa1, 0xba, 0x77, 0xa5, 0xd4, 0xa6,
+    0xa0, 0x82, 0x7b, 0x95, 0x9d, 0xb7, 0xaa, 0x8d, 0x71, 0x87, 0x94, 0x7e,
+    0x88, 0x7f, 0x8b, 0x6e, 0x93, 0x9f, 0x82, 0x88, 0x94, 0x8a, 0x97, 0x7f,
+    0x7d, 0x8c, 0xa0, 0x84, 0xb4, 0x7c, 0x8c, 0x7f, 0x71, 0x8c, 0x8e, 0x7f,
+    0xc6, 0x64, 0x81, 0x8d, 0x89, 0x8d, 0xc4, 0x77, 0xaf, 0x75, 0x92, 0x7f,
+    0x84, 0xa1, 0x99, 0x94, 0x9e, 0x82, 0x7a, 0x98, 0x7e, 0x8e, 0x93, 0x8c,
+    0x6b, 0x93, 0x84, 0xaa, 0x7f, 0x8f, 0x6b, 0x94, 0xa3, 0x8a, 0x78, 0x82,
+    0x60, 0x92, 0x8b, 0x8d, 0x75, 0x8c, 0x8e, 0x6e, 0x7e, 0x9d, 0x6d, 0x8e,
+    0x79, 0x8d, 0x80, 0x89, 0xaa, 0x99, 0x7e, 0xa3, 0x83, 0x95, 0x83, 0x85,
+    0x9c, 0x60, 0x99, 0x78, 0x93, 0x8b, 0x80, 0x82, 0x9d, 0x6b, 0xc2, 0x54,
+    0xb9, 0x7a, 0x83, 0x98, 0x88, 0x65, 0xcb, 0x52, 0xa7, 0x8d, 0x7f, 0x81,
+    0x6b, 0x6d, 0x9e, 0x92, 0x85, 0x82, 0x9f, 0x67, 0x6f, 0x74, 0xaa, 0x75,
+    0x99, 0x9f, 0x8a, 0x8b, 0x88, 0x82, 0xb8, 0x6b, 0x85, 0x99, 0x93, 0x90,
+    0x8d, 0x7a, 0xaa, 0x9d, 0x86, 0x7f, 0xbd, 0x91, 0x67, 0x65, 0x8c, 0xb3,
+    0x87, 0x94, 0xa3, 0x9a, 0x7e, 0x73, 0x83, 0xaa, 0x7a, 0xba, 0xaa, 0x9e,
+    0x9e, 0x86, 0x9a, 0x63, 0x9c, 0x98, 0x5e, 0xa0, 0x9c, 0x9e, 0x8b, 0x85,
+    0xa2, 0x74, 0x80, 0x8d, 0x7e, 0x89, 0xc0, 0x75, 0xa5, 0x3f, 0x97, 0xa2,
+    0x8c, 0x8c, 0x9d, 0x88, 0xa4, 0x5e, 0x75, 0x5f, 0x87, 0x82, 0xbc, 0x72,
+    0xa3, 0x77, 0x83, 0x79, 0x82, 0x95, 0x8d, 0x77, 0x73, 0x81, 0x9d, 0x9b,
+    0x6c, 0x87, 0x93, 0x96, 0x83, 0x86, 0x8b, 0x89, 0x72, 0x7d, 0x96, 0x78,
+    0x67, 0xa2, 0x8d, 0x81, 0x6a, 0x98, 0x75, 0x80, 0x8a, 0x80, 0x9e, 0x82,
+    0x76, 0x9b, 0x6c, 0x94, 0x7a, 0x96, 0x74, 0x92, 0x78, 0x91, 0x7a, 0x7c,
+    0x9a, 0x98, 0x70, 0x5d, 0x9c, 0x4b, 0x70, 0x7d, 0xa9, 0x9b, 0x70, 0x96,
+    0xad, 0x59, 0xc4, 0x63, 0xbc, 0x8f, 0x5c, 0x86, 0x8e, 0x97, 0xa0, 0x7c,
+    0xa6, 0x77, 0xaa, 0x93, 0x68, 0x66, 0x93, 0x91, 0x7b, 0x7e, 0xa2, 0x7a,
+    0x98, 0x77, 0x97, 0x59, 0x84, 0x76, 0x9c, 0x7b, 0x8b, 0x76, 0x88, 0x7a,
+    0x8c, 0x7b, 0xa4, 0xae, 0x6e, 0x7d, 0xb3, 0x99, 0x8d, 0x68, 0x9e, 0x7e,
+    0x77, 0x59, 0x80, 0xbe, 0x80, 0x83, 0xd9, 0x9f, 0x7d, 0x60, 0x8b, 0x98,
+    0x7f, 0x9e, 0xa3, 0x8d, 0x7d, 0x81, 0x9e, 0x78, 0x99, 0x94, 0x70, 0x80,
+    0x9b, 0x89, 0x8c, 0x6d, 0x9c, 0x95, 0x76, 0x7c, 0x83, 0x87, 0x97, 0x93,
+    0x89, 0x6d, 0x77, 0x7e, 0x7e, 0x87, 0x8e, 0x7e, 0x94, 0x61, 0x94, 0xa2,
+    0x94, 0x91, 0xa1, 0x64, 0xc1, 0x78, 0x79, 0xaf, 0x67, 0x7a, 0x9b, 0xa1,
+    0x95, 0x8e, 0x97, 0x84, 0x7b, 0x85, 0x80, 0xa1, 0x6f, 0x87, 0x79, 0x83,
+    0x73, 0x9d, 0x81, 0x64, 0x7a, 0x7f, 0x8f, 0x91, 0x73, 0x97, 0x74, 0x8b,
+    0x7e, 0x88, 0x7f, 0x7e, 0x6e, 0xa1, 0x85, 0x8f, 0x77, 0x93, 0x7a, 0x6f,
+    0x7b, 0x91, 0x67, 0x73, 0x8b, 0x97, 0x6d, 0x87, 0x84, 0xf8, 0xff, 0xff,
+    0x88, 0xf8, 0xff, 0xff, 0xe6, 0xf8, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+    0x80, 0x02, 0x00, 0x00, 0x73, 0x84, 0xbb, 0xa4, 0xa5, 0x44, 0x5c, 0xb1,
+    0x8e, 0x50, 0x82, 0x8b, 0x81, 0x86, 0x48, 0x80, 0xa9, 0x61, 0xa3, 0xa8,
+    0xca, 0x5a, 0x9d, 0x8a, 0x89, 0x7c, 0x65, 0x91, 0x5e, 0x70, 0x84, 0x71,
+    0xbc, 0x36, 0x8e, 0x8b, 0xa6, 0x63, 0xb7, 0x75, 0x92, 0x59, 0x60, 0x7e,
+    0x33, 0x8f, 0x90, 0x7a, 0xa9, 0x27, 0x72, 0x80, 0x62, 0x95, 0x93, 0x7b,
+    0x60, 0x46, 0x40, 0x55, 0x01, 0x9e, 0x8a, 0x6b, 0x58, 0x8a, 0xa6, 0xb7,
+    0x91, 0x39, 0x72, 0xb4, 0x6e, 0x67, 0x83, 0x91, 0x82, 0x7b, 0x64, 0x7a,
+    0x87, 0x6e, 0xb0, 0xa0, 0xd3, 0x53, 0xb7, 0x93, 0x76, 0xa6, 0x68, 0x8a,
+    0x74, 0x6a, 0x96, 0x6e, 0xb3, 0x53, 0xaa, 0x89, 0xf1, 0x76, 0xb8, 0x75,
+    0x8b, 0x66, 0x5f, 0x6e, 0x52, 0x92, 0x6f, 0x82, 0xbe, 0x45, 0x8d, 0x69,
+    0x98, 0x98, 0x80, 0x87, 0x73, 0x7d, 0x4d, 0x42, 0x1f, 0xa5, 0x6a, 0x73,
+    0x47, 0x87, 0x8a, 0xd1, 0x75, 0x30, 0x91, 0xae, 0x60, 0x82, 0x7a, 0x94,
+    0x75, 0x71, 0x6a, 0x7c, 0x74, 0x7a, 0xac, 0xa2, 0xb6, 0x51, 0xc6, 0x97,
+    0x63, 0xa0, 0x67, 0x7f, 0x80, 0x69, 0x88, 0x6b, 0xa5, 0x5e, 0xc2, 0x72,
+    0xf4, 0x6e, 0xaf, 0x76, 0x7f, 0x7c, 0x55, 0x68, 0x67, 0x97, 0x61, 0x7b,
+    0xbe, 0x5e, 0xab, 0x58, 0xca, 0xa2, 0x77, 0x7a, 0x8f, 0x6e, 0x54, 0x33,
+    0x4d, 0xa7, 0x5d, 0x66, 0x47, 0x92, 0x6f, 0xd6, 0x5c, 0x25, 0xa9, 0xbc,
+    0x5c, 0xb8, 0x64, 0x9b, 0x58, 0x6e, 0x77, 0x76, 0x6a, 0x94, 0xb2, 0xac,
+    0x9a, 0x51, 0xd0, 0x94, 0x62, 0xcc, 0x5a, 0x7f, 0x74, 0x6e, 0x7d, 0x71,
+    0x9b, 0x69, 0xd3, 0x64, 0xef, 0x76, 0xaa, 0x75, 0x89, 0x84, 0x50, 0x76,
+    0x72, 0x97, 0x5f, 0x77, 0xc5, 0x66, 0xce, 0x3a, 0xe5, 0xad, 0x5a, 0x81,
+    0x9e, 0x8e, 0x60, 0x3d, 0x6d, 0xa9, 0x46, 0x6b, 0x44, 0x89, 0x4d, 0xd8,
+    0x4c, 0x28, 0xb1, 0xb7, 0x60, 0xc7, 0x57, 0xb5, 0x50, 0x68, 0x88, 0x7c,
+    0x60, 0x98, 0xac, 0x9a, 0x7f, 0x51, 0xce, 0x8a, 0x5e, 0xd8, 0x51, 0x7d,
+    0x68, 0x6e, 0x7f, 0x6e, 0x90, 0x7b, 0xdf, 0x60, 0xda, 0x77, 0x91, 0x6f,
+    0x85, 0xa0, 0x58, 0x73, 0x70, 0x93, 0x51, 0x7d, 0xb9, 0x70, 0xf5, 0x31,
+    0xe9, 0xa3, 0x47, 0x76, 0xa7, 0x9b, 0x72, 0x3d, 0x90, 0xb2, 0x57, 0x64,
+    0x5b, 0x6f, 0x2b, 0xcf, 0x52, 0x28, 0xc1, 0xa7, 0x6a, 0x78, 0x51, 0xad,
+    0x49, 0x70, 0x90, 0x81, 0x5c, 0x7e, 0x9e, 0x99, 0x77, 0x50, 0xc0, 0x94,
+    0x63, 0xb7, 0x4d, 0x71, 0x58, 0x66, 0x76, 0x6d, 0x78, 0x6a, 0xe1, 0x40,
+    0xc7, 0x73, 0x7f, 0x65, 0x7c, 0x7f, 0x4d, 0x80, 0x64, 0x95, 0x57, 0x81,
+    0xb1, 0x5e, 0xff, 0x26, 0xd6, 0xa2, 0x3a, 0x73, 0xa7, 0x81, 0x76, 0x5d,
+    0x92, 0xb1, 0x58, 0x48, 0x4e, 0x5e, 0x1a, 0xc8, 0x58, 0x2c, 0xb6, 0xa7,
+    0x67, 0x89, 0x5e, 0xa0, 0x4f, 0x78, 0x93, 0x8b, 0x57, 0x7b, 0x95, 0x78,
+    0x6e, 0x46, 0xb2, 0x98, 0x55, 0xd3, 0x5e, 0x66, 0x56, 0x68, 0x74, 0x7e,
+    0x72, 0x74, 0xdd, 0x36, 0xa6, 0x64, 0x65, 0x6b, 0x81, 0x98, 0x56, 0x76,
+    0x65, 0x93, 0x58, 0x7d, 0x9b, 0x82, 0xef, 0x44, 0xbf, 0xa4, 0x3d, 0x57,
+    0xa0, 0xa7, 0x7a, 0x74, 0x9f, 0xa8, 0x70, 0x52, 0x55, 0x5f, 0x1a, 0x94,
+    0x64, 0x37, 0xa7, 0xa6, 0x80, 0x7d, 0x6e, 0x99, 0x5d, 0x81, 0x8a, 0x99,
+    0x5c, 0x76, 0x8f, 0x44, 0x68, 0x50, 0x94, 0x97, 0x63, 0xb6, 0x73, 0x56,
+    0x5b, 0x70, 0x66, 0x8b, 0x72, 0x78, 0xcc, 0x31, 0x8b, 0x68, 0x4a, 0x74,
+    0x7d, 0x99, 0x54, 0x91, 0x6a, 0x90, 0x5d, 0x80, 0x8c, 0x82, 0xcd, 0x4f,
+    0xb0, 0x96, 0x63, 0x56, 0x97, 0xb3, 0x7e, 0x97, 0xa4, 0x9d, 0x7a, 0x5d,
+    0x49, 0x36, 0x18, 0x64, 0x60, 0x43, 0x89, 0xa2, 0x6a, 0x49, 0x7f, 0x58,
+    0x6a, 0x83, 0x77, 0x9d, 0x70, 0x3b, 0x83, 0x21, 0x59, 0x52, 0x6d, 0x95,
+    0x48, 0xa8, 0x8a, 0x42, 0x50, 0x6d, 0x44, 0x95, 0x69, 0x50, 0xc1, 0x4b,
+    0x7c, 0x59, 0x42, 0x78, 0x77, 0x7f, 0x5b, 0x98, 0x67, 0x89, 0x55, 0x8b,
+    0x82, 0x47, 0xb7, 0x64, 0x9d, 0x83, 0x5c, 0x53, 0x89, 0x90, 0x79, 0xb2,
+    0x90, 0x98, 0x85, 0x5a, 0x4d, 0x2b, 0x19, 0x1e, 0x52, 0x50, 0x57, 0x8b,
+    0x73, 0x3a, 0x88, 0x1e, 0x65, 0x80, 0x4d, 0x9b, 0x6c, 0x3c, 0x86, 0x26,
+    0x5b, 0x56, 0x36, 0x98, 0x49, 0x87, 0x9f, 0x2a, 0x40, 0x61, 0x27, 0x9d,
+    0x63, 0x40, 0xa8, 0x46, 0x6b, 0x52, 0x52, 0x7f, 0x67, 0x6a, 0x58, 0xa1,
+    0x5d, 0x6d, 0x5f, 0x9a, 0x72, 0x3a, 0x99, 0x63, 0x8c, 0x80, 0x68, 0x58,
+    0x72, 0x6a, 0x7c, 0xbb, 0x7e, 0x78, 0x94, 0x60, 0x72, 0xfb, 0xff, 0xff,
+    0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x96, 0xfe, 0xff, 0xff,
+    0x8f, 0x00, 0x00, 0x00, 0x8f, 0xfc, 0xff, 0xff, 0xb4, 0xfe, 0xff, 0xff,
+    0xc1, 0xfd, 0xff, 0xff, 0x59, 0xff, 0xff, 0xff, 0xbc, 0xfe, 0xff, 0xff,
+    0x09, 0xff, 0xff, 0xff, 0x9e, 0xfb, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+    0x10, 0x00, 0x00, 0x00, 0xe9, 0x03, 0x00, 0x00, 0x2b, 0xfd, 0xff, 0xff,
+    0x3b, 0xfd, 0xff, 0xff, 0x91, 0x01, 0x00, 0x00, 0x60, 0xfb, 0xff, 0xff,
+    0x04, 0xfd, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0xf0, 0x03, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x18, 0x03, 0x00, 0x00, 0x78, 0x03, 0x00, 0x00,
+    0x88, 0x01, 0x00, 0x00, 0xf8, 0x01, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x64, 0x02, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00,
+    0xb2, 0xfc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03, 0x24, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+    0x0e, 0x00, 0x00, 0x00, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x5f, 0x73,
+    0x6f, 0x66, 0x74, 0x6d, 0x61, 0x78, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x84, 0xfd, 0xff, 0xff,
+    0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3b,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0x3f, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x22, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x1c, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x1c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x61, 0x64, 0x64, 0x5f,
+    0x31, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0xec, 0xfd, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,
+    0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x28, 0x17, 0xb1, 0x3d,
+    0x01, 0x00, 0x00, 0x00, 0x84, 0xdb, 0x33, 0x41, 0x01, 0x00, 0x00, 0x00,
+    0x9d, 0xf0, 0x2c, 0xc1, 0x8e, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x48, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x48, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x77, 0x65, 0x69, 0x67,
+    0x68, 0x74, 0x73, 0x5f, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x5f, 0x31, 0x2f,
+    0x46, 0x61, 0x6b, 0x65, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x57, 0x69, 0x74,
+    0x68, 0x4d, 0x69, 0x6e, 0x4d, 0x61, 0x78, 0x56, 0x61, 0x72, 0x73, 0x2f,
+    0x74, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xa0, 0x0f, 0x00, 0x00,
+    0x84, 0xfe, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0xac, 0x5f, 0xf6, 0x39, 0x01, 0x00, 0x00, 0x00, 0x1d, 0xaf, 0x62, 0x3d,
+    0x01, 0x00, 0x00, 0x00, 0x5e, 0x1b, 0x83, 0xbd, 0x22, 0xfe, 0xff, 0xff,
+    0x00, 0x00, 0x00, 0x03, 0x1c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x52, 0x65, 0x6c, 0x75, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0xf4, 0xfe, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,
+    0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x0f, 0x72, 0x3d,
+    0x01, 0x00, 0x00, 0x00, 0x38, 0x1d, 0x71, 0x41, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x96, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x20, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x28, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68,
+    0x61, 0x70, 0x65, 0x5f, 0x31, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x6c, 0xff, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,
+    0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xc6, 0xd0, 0xd0, 0x3d,
+    0x01, 0x00, 0x00, 0x00, 0xf5, 0xff, 0xcf, 0x41, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x0e, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x3c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x50, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x77, 0x65, 0x69, 0x67,
+    0x68, 0x74, 0x73, 0x5f, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x2f, 0x46, 0x61,
+    0x6b, 0x65, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x57, 0x69, 0x74, 0x68, 0x4d,
+    0x69, 0x6e, 0x4d, 0x61, 0x78, 0x56, 0x61, 0x72, 0x73, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00,
+    0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00,
+    0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xf7, 0x5e, 0x6c, 0x3a,
+    0x01, 0x00, 0x00, 0x00, 0x30, 0x42, 0xec, 0x3d, 0x01, 0x00, 0x00, 0x00,
+    0x42, 0xca, 0xe8, 0xbd, 0xaa, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x02,
+    0x20, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x1c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x43, 0x6f, 0x6e, 0x76,
+    0x32, 0x44, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x9c, 0xff, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0xec, 0xcd, 0xc0, 0x38, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x08, 0x00,
+    0x07, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x02, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+    0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x0c, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x25, 0xf5, 0xe8, 0x37, 0x03, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00,
+    0x5c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff,
+    0x00, 0x00, 0x00, 0x09, 0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
+    0x10, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00,
+    0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f,
+    0x14, 0x00, 0x1c, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
+    0x10, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x18, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00,
+    0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x02, 0x1c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+    0x30, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+    0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x0c, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xfa, 0xff, 0xff, 0xff,
+    0x00, 0x19, 0x06, 0x00, 0x06, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00,
+    0x00, 0x09, 0x06, 0x00, 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x04};
+const int g_tiny_conv_micro_features_model_data_len = 18208;
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..22c0a970b774299aea629ce034b9dd2e4c04e1ca
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h
@@ -0,0 +1,27 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This is a standard TensorFlow Lite model file that has been converted into a
+// C data array, so it can be easily compiled into a binary for devices that
+// don't have a file system. It was created using the command:
+// xxd -i tiny_conv.tflite > tiny_conv_simple_features_model_data.cc
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_TINY_CONV_MICRO_FEATURES_MODEL_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_TINY_CONV_MICRO_FEATURES_MODEL_DATA_H_
+
+extern const unsigned char g_tiny_conv_micro_features_model_data[];
+extern const int g_tiny_conv_micro_features_model_data_len;
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_TINY_CONV_MICRO_FEATURES_MODEL_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc
new file mode 100644
index 0000000000000000000000000000000000000000..51b7d8b35bcbce892fbd891415e3f09725c62faa
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h"
+
+#include <string.h>
+
+int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
+                         size_t num_samples, size_t* num_samples_read) {
+  const int size = state->size;
+
+  // Copy samples from the samples buffer over to our local input.
+  size_t max_samples_to_copy = state->size - state->input_used;
+  if (max_samples_to_copy > num_samples) {
+    max_samples_to_copy = num_samples;
+  }
+  memcpy(state->input + state->input_used, samples,
+         max_samples_to_copy * sizeof(*samples));
+  *num_samples_read = max_samples_to_copy;
+  state->input_used += max_samples_to_copy;
+
+  if (state->input_used < state->size) {
+    // We don't have enough samples to compute a window.
+    return 0;
+  }
+
+  // Apply the window to the input.
+  const int16_t* coefficients = state->coefficients;
+  const int16_t* input = state->input;
+  int16_t* output = state->output;
+  int i;
+  int16_t max_abs_output_value = 0;
+  for (i = 0; i < size; ++i) {
+    int16_t new_value = ((static_cast<int32_t>(*input++)) * *coefficients++) >>
+                        kFrontendWindowBits;
+    *output++ = new_value;
+    if (new_value < 0) {
+      new_value = -new_value;
+    }
+    if (new_value > max_abs_output_value) {
+      max_abs_output_value = new_value;
+    }
+  }
+  // Shuffle the input down by the step size, and update how much we have used.
+  memmove(state->input, state->input + state->step,
+          sizeof(*state->input) * (state->size - state->step));
+  state->input_used -= state->step;
+  state->max_abs_output_value = max_abs_output_value;
+
+  // Indicate that the output buffer is valid for the next stage.
+  return 1;
+}
+
+void WindowReset(struct WindowState* state) {
+  memset(state->input, 0, state->size * sizeof(*state->input));
+  memset(state->output, 0, state->size * sizeof(*state->output));
+  state->input_used = 0;
+  state->max_abs_output_value = 0;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h
new file mode 100644
index 0000000000000000000000000000000000000000..b32c059d81a8efe68c8a87a250fd733e1849479b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+#define kFrontendWindowBits 12
+
+struct WindowState {
+  size_t size;
+  int16_t coefficients[kMaxAudioSampleSize];
+  size_t step;
+
+  int16_t input[kMaxAudioSampleSize];
+  size_t input_used;
+  int16_t output[kMaxAudioSampleSize];
+  int16_t max_abs_output_value;
+};
+
+// Applies a window to the samples coming in, stepping forward at the given
+// rate.
+int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
+                         size_t num_samples, size_t* num_samples_read);
+
+void WindowReset(struct WindowState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..310f84fc60b32e37f7e7d9d79bc2425ce7cddf8a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_test.cc
@@ -0,0 +1,183 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kSampleRate = 1000;
+const int kWindowSamples = 25;
+const int kStepSamples = 10;
+const int16_t kFakeAudioData[] = {
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
+
+// Test window function behaviors using default config values.
+class WindowTestConfig {
+ public:
+  WindowTestConfig() {
+    config_.size_ms = 25;
+    config_.step_size_ms = 10;
+  }
+
+  struct WindowConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(WindowState_CheckCoefficients) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+
+  const int16_t expected[] = {16,   144,  391,  743,  1176, 1664, 2177,
+                              2681, 3145, 3541, 3843, 4032, 4096, 4032,
+                              3843, 3541, 3145, 2681, 2177, 1664, 1176,
+                              743,  391,  144,  16};
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.coefficients[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckResidualInput) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+
+  int i;
+  for (i = kStepSamples; i < kWindowSamples; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.input[i - kStepSamples], kFakeAudioData[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckOutputValues) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+
+  const int16_t expected[] = {
+      0, 1151,   0, -5944, 0, 13311,  0, -21448, 0, 28327, 0, -32256, 0, 32255,
+      0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckMaxAbsValue) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+
+  TF_LITE_MICRO_EXPECT_EQ(state.max_abs_output_value, 32256);
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckConsecutiveWindow) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read));
+
+  const int16_t expected[] = {
+      0, -1152, 0, 5943,   0, -13312, 0, 21447, 0, -28328, 0, 32255, 0, -32256,
+      0, 28327, 0, -21448, 0, 13311,  0, -5944, 0, 1151,   0};
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckNotEnoughSamples) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read));
+  TF_LITE_MICRO_EXPECT_EQ(
+      false, WindowProcessSamples(
+                 &state, kFakeAudioData + kWindowSamples + kStepSamples,
+                 sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) -
+                     kWindowSamples - kStepSamples,
+                 &num_samples_read));
+
+  TF_LITE_MICRO_EXPECT_EQ(
+      state.input_used,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - 2 * kStepSamples);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..618973b39b2ebd2088b4c3756ea6ca1c1f7e8181
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+// Needed because some platforms don't have M_PI defined.
+#define WINDOW_PI (3.14159265358979323846f)
+
+void WindowFillConfigWithDefaults(struct WindowConfig* config) {
+  config->size_ms = 25;
+  config->step_size_ms = 10;
+}
+
+int WindowPopulateState(tflite::ErrorReporter* error_reporter,
+                        const struct WindowConfig* config,
+                        struct WindowState* state, int sample_rate) {
+  state->size = config->size_ms * sample_rate / 1000;
+  state->step = config->step_size_ms * sample_rate / 1000;
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->coefficients,
+                                 (state->size * sizeof(*state->coefficients)));
+
+  // Populate the window values.
+  const float arg = WINDOW_PI * 2.0 / (static_cast<float>(state->size));
+  int i;
+  for (i = 0; i < state->size; ++i) {
+    float float_value = 0.5 - (0.5 * cos(arg * (i + 0.5)));
+    // Scale it to fixed point and round it.
+    state->coefficients[i] =
+        floor(float_value * (1 << kFrontendWindowBits) + 0.5);
+  }
+
+  state->input_used = 0;
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->input,
+                                 (state->size * sizeof(*state->input)));
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->output,
+                                 (state->size * sizeof(*state->output)));
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0c61c29dc9cd2a91f37ea89ace5e031235dd337
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct WindowConfig {
+  // length of window frame in milliseconds
+  size_t size_ms;
+  // length of step for next frame in milliseconds
+  size_t step_size_ms;
+};
+
+// Populates the WindowConfig with "sane" default values.
+void WindowFillConfigWithDefaults(struct WindowConfig* config);
+
+// Allocates any buffers.
+int WindowPopulateState(tflite::ErrorReporter* error_reporter,
+                        const struct WindowConfig* config,
+                        struct WindowState* state, int sample_rate);
+
+// Frees any allocated buffers.
+void WindowFreeStateContents(struct WindowState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..48535d12d5db850cf0a497645f9e77d98fbcb8a1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc
@@ -0,0 +1,24 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See the header for documentation on the meaning of this data.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h"
+
+const uint8_t g_yes_feature_data_slice[g_yes_feature_data_slice_size] = {
+    214, 215, 236, 202, 235, 203, 225, 191, 203, 188, 199, 194, 212, 127,
+    51,  0,   174, 188, 219, 196, 228, 221, 240, 207, 235, 220, 241, 219,
+    237, 207, 212, 142, 95,  0,   139, 78,  162, 177, 197, 183,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h
new file mode 100644
index 0000000000000000000000000000000000000000..e73a13153b65be78a2a57edce0d09f48a8cb444f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was extracted from the larger feature data held in
+// no_micro_features_data.cc and consists of the 26th spectrogram slice of 40
+// values. This is the expected result of running the sample data in
+// yes_30ms_sample_data.cc through through the preprocessing pipeline.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_FEATURE_DATA_SLICE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_FEATURE_DATA_SLICE_H_
+
+#include <cstdint>
+
+constexpr int g_yes_feature_data_slice_size = 40;
+extern const uint8_t g_yes_feature_data_slice[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_FEATURE_DATA_SLICE_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c2ee0995c00ee0da1337c86cf9aa18ba726bfe1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc
@@ -0,0 +1,165 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h"
+
+/* File automatically created by
+ * tensorflow/examples/speech_commands/wav_to_features.py \
+ * --sample_rate=16000 \
+ * --clip_duration_ms=1000 \
+ * --window_size_ms=30 \
+ * --window_stride_ms=20 \
+ * --feature_bin_count=40 \
+ * --quantize=1 \
+ * --preprocess="micro" \
+ * --input_wav="speech_commands_test_set_v0.02/yes/f2e59fea_nohash_1.wav" \
+ * --output_c_file="yes_micro_features_data.cc" \
+ */
+
+const int g_yes_micro_f2e59fea_nohash_1_width = 40;
+const int g_yes_micro_f2e59fea_nohash_1_height = 49;
+const unsigned char g_yes_micro_f2e59fea_nohash_1_data[] = {
+    244, 226, 245, 223, 234, 213, 228, 208, 194, 110, 95,  116, 102, 0,   137,
+    161, 183, 173, 137, 116, 133, 157, 151, 156, 128, 110, 128, 0,   68,  78,
+    78,  90,  68,  68,  78,  102, 95,  78,  95,  78,  210, 188, 209, 183, 204,
+    188, 201, 191, 166, 119, 90,  107, 110, 107, 175, 157, 179, 168, 182, 145,
+    152, 164, 171, 165, 136, 143, 122, 68,  0,   78,  90,  90,  110, 90,  102,
+    99,  90,  68,  78,  68,  223, 186, 179, 123, 182, 110, 196, 171, 159, 110,
+    102, 95,  90,  99,  160, 134, 125, 136, 153, 152, 164, 134, 164, 151, 141,
+    136, 99,  90,  90,  90,  78,  78,  102, 119, 102, 90,  110, 90,  68,  51,
+    177, 175, 211, 172, 183, 0,   95,  68,  129, 102, 68,  85,  114, 105, 110,
+    85,  102, 95,  140, 51,  85,  51,  95,  90,  143, 116, 90,  78,  78,  51,
+    107, 85,  68,  0,   68,  51,  90,  51,  68,  0,   164, 117, 193, 120, 156,
+    0,   138, 51,  90,  0,   51,  0,   51,  85,  0,   0,   51,  0,   0,   0,
+    0,   0,   114, 0,   85,  78,  90,  51,  0,   0,   51,  85,  99,  85,  107,
+    68,  90,  85,  78,  0,   51,  0,   110, 0,   68,  0,   0,   0,   51,  0,
+    51,  0,   0,   0,   68,  90,  107, 0,   68,  0,   0,   0,   68,  0,   51,
+    68,  0,   78,  68,  0,   51,  0,   78,  68,  90,  68,  78,  51,  51,  0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   90,  0,   0,   0,   0,
+    0,   51,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,  68,
+    0,   0,   78,  0,   78,  0,   78,  0,   51,  0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   51,  0,   51,  0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,  0,   51,
+    0,   51,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,
+    0,   0,   0,   0,   51,  78,  0,   0,   51,  51,  0,   0,   0,   78,  0,
+    213, 170, 192, 180, 196, 188, 173, 131, 173, 116, 137, 105, 159, 127, 0,
+    0,   0,   0,   127, 164, 165, 161, 170, 164, 185, 197, 195, 167, 134, 138,
+    159, 134, 136, 105, 51,  0,   99,  0,   51,  0,   228, 215, 229, 218, 237,
+    215, 228, 210, 237, 222, 239, 211, 208, 211, 234, 218, 220, 209, 225, 219,
+    235, 222, 245, 225, 245, 224, 243, 223, 241, 218, 237, 224, 234, 213, 221,
+    193, 197, 164, 157, 128, 227, 188, 232, 196, 220, 220, 240, 219, 234, 213,
+    234, 211, 231, 218, 233, 213, 239, 215, 228, 207, 229, 206, 224, 208, 226,
+    207, 232, 210, 225, 208, 230, 199, 227, 206, 210, 205, 218, 174, 178, 141,
+    235, 208, 220, 206, 225, 203, 233, 203, 225, 167, 205, 199, 208, 190, 221,
+    204, 223, 207, 225, 188, 225, 197, 215, 188, 199, 183, 225, 195, 224, 200,
+    216, 178, 208, 188, 215, 202, 214, 183, 176, 140, 198, 150, 211, 194, 203,
+    120, 175, 188, 204, 189, 219, 192, 223, 202, 216, 186, 203, 185, 210, 182,
+    214, 183, 204, 170, 204, 125, 184, 187, 206, 185, 198, 182, 210, 161, 202,
+    198, 218, 173, 145, 120, 188, 183, 205, 168, 200, 170, 210, 177, 187, 190,
+    209, 193, 193, 166, 210, 162, 175, 119, 174, 147, 182, 161, 181, 134, 176,
+    143, 187, 165, 186, 149, 185, 141, 192, 181, 202, 123, 170, 143, 144, 78,
+    149, 0,   208, 182, 170, 78,  170, 0,   117, 51,  156, 99,  195, 170, 200,
+    130, 152, 68,  175, 141, 173, 134, 194, 132, 189, 164, 198, 134, 173, 117,
+    171, 149, 183, 181, 185, 99,  153, 117, 125, 0,   166, 0,   173, 117, 144,
+    0,   117, 102, 188, 120, 193, 166, 197, 68,  163, 119, 169, 99,  134, 0,
+    162, 0,   164, 68,  171, 116, 126, 0,   120, 68,  68,  0,   105, 0,   159,
+    95,  150, 51,  90,  85,  0,   0,   131, 0,   105, 0,   145, 51,  170, 51,
+    120, 0,   107, 0,   145, 85,  160, 0,   85,  0,   0,   51,  149, 0,   78,
+    0,   0,   0,   0,   0,   0,   0,   90,  0,   112, 0,   78,  102, 122, 0,
+    0,   0,   0,   0,   105, 0,   0,   0,   0,   0,   0,   0,   0,   0,   112,
+    0,   164, 120, 143, 0,   0,   0,   0,   0,   51,  0,   90,  0,   78,  0,
+    0,   0,   0,   0,   110, 0,   139, 0,   112, 51,  0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   102, 0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   107,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   78,  0,   51,  0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   51,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   127, 110, 133, 0,   167, 0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   132, 0,   190,
+    194, 202, 0,   197, 187, 161, 0,   0,   0,   0,   0,   0,   0,   0,   0,
+    214, 213, 223, 203, 218, 189, 200, 122, 78,  0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   191, 210, 231, 197, 226, 217, 238, 216, 236, 207,
+    199, 0,   0,   0,   0,   0,   107, 122, 155, 160, 214, 215, 236, 202, 235,
+    203, 225, 191, 203, 188, 199, 194, 212, 127, 51,  0,   174, 188, 219, 196,
+    228, 221, 240, 207, 235, 220, 241, 219, 237, 207, 212, 142, 95,  0,   139,
+    78,  162, 177, 197, 183, 211, 199, 235, 208, 238, 215, 227, 207, 211, 201,
+    224, 213, 226, 192, 213, 170, 223, 205, 234, 221, 245, 225, 242, 220, 245,
+    221, 239, 221, 238, 213, 226, 180, 159, 112, 176, 159, 208, 202, 213, 191,
+    205, 191, 225, 197, 238, 219, 224, 201, 227, 200, 221, 201, 225, 203, 212,
+    195, 229, 210, 228, 210, 239, 216, 226, 212, 233, 205, 225, 200, 229, 207,
+    222, 151, 147, 119, 179, 185, 230, 218, 223, 192, 202, 136, 205, 177, 223,
+    204, 228, 215, 232, 209, 221, 189, 221, 205, 209, 200, 226, 209, 229, 205,
+    235, 192, 209, 198, 228, 190, 206, 185, 207, 187, 214, 175, 177, 184, 220,
+    195, 214, 207, 230, 184, 205, 159, 208, 184, 189, 169, 224, 213, 219, 199,
+    229, 203, 216, 205, 222, 204, 224, 206, 231, 208, 231, 176, 197, 184, 216,
+    193, 211, 139, 212, 195, 231, 164, 166, 195, 217, 182, 208, 190, 217, 179,
+    205, 68,  182, 119, 195, 168, 182, 136, 204, 179, 193, 158, 182, 140, 188,
+    154, 197, 169, 190, 99,  184, 0,   125, 0,   131, 0,   99,  68,  179, 85,
+    190, 184, 213, 203, 223, 202, 212, 190, 209, 138, 178, 0,   159, 51,  128,
+    51,  105, 0,   139, 51,  179, 125, 185, 114, 171, 128, 175, 132, 181, 174,
+    155, 0,   0,   0,   90,  0,   125, 0,   176, 188, 227, 217, 244, 215, 234,
+    221, 239, 192, 224, 210, 0,   0,   134, 0,   51,  0,   105, 0,   105, 0,
+    143, 90,  192, 119, 175, 147, 141, 51,  184, 110, 85,  0,   0,   0,   0,
+    0,   0,   0,   151, 139, 201, 203, 232, 203, 226, 208, 236, 206, 230, 212,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   169, 0,   119,
+    0,   78,  0,   0,   0,   0,   0,   0,   0,   0,   0,   68,  0,   0,   133,
+    200, 180, 220, 197, 228, 201, 221, 184, 213, 193, 110, 0,   0,   0,   0,
+    0,   0,   0,   0,   0,   78,  0,   164, 0,   0,   0,   0,   0,   107, 0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   150, 164, 202, 182, 224,
+    197, 211, 179, 212, 193, 134, 0,   0,   0,   0,   0,   0,   0,   0,   0,
+    85,  0,   150, 0,   85,  0,   95,  0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   102, 90,  193, 160, 203, 164, 200, 178, 205, 174,
+    116, 0,   0,   0,   0,   0,   0,   0,   0,   0,   120, 114, 123, 0,   114,
+    0,   145, 68,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    102, 68,  199, 170, 195, 180, 208, 176, 200, 164, 0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   110, 0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   142, 102, 172, 110, 186,
+    167, 185, 147, 189, 154, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   177, 0,   158, 136, 197, 155, 189, 166,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    85,  0,   155, 90,  175, 117, 175, 138, 202, 165, 0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,  0,   139,
+    0,   120, 68,  51,  123, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   119, 0,   78,  0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..d19bf8f067d7329dcda0b866d0d323b92f175e61
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h
@@ -0,0 +1,23 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_MICRO_FEATURES_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_MICRO_FEATURES_DATA_H_
+
+extern const int g_yes_micro_f2e59fea_nohash_1_width;
+extern const int g_yes_micro_f2e59fea_nohash_1_height;
+extern const unsigned char g_yes_micro_f2e59fea_nohash_1_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_MICRO_FEATURES_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
index 4e54ff670eb9badd648aee99cf154c0d3b988bff..6f0c2581771e87e69481726adaea4fab3108640b 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/micro_interpreter.h"
@@ -32,7 +32,8 @@ TF_LITE_MICRO_TEST(TestInvoke) {
 
   // Map the model into a usable data structure. This doesn't involve any
   // copying or parsing, it's a very lightweight operation.
-  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
+  const tflite::Model* model =
+      ::tflite::GetModel(g_tiny_conv_micro_features_model_data);
   if (model->version() != TFLITE_SCHEMA_VERSION) {
     error_reporter->Report(
         "Model provided is schema version %d not equal "
@@ -61,12 +62,12 @@ TF_LITE_MICRO_TEST(TestInvoke) {
   TF_LITE_MICRO_EXPECT_EQ(4, input->dims->size);
   TF_LITE_MICRO_EXPECT_EQ(1, input->dims->data[0]);
   TF_LITE_MICRO_EXPECT_EQ(49, input->dims->data[1]);
-  TF_LITE_MICRO_EXPECT_EQ(43, input->dims->data[2]);
+  TF_LITE_MICRO_EXPECT_EQ(40, input->dims->data[2]);
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, input->type);
 
   // Copy a spectrogram created from a .wav audio file of someone saying "Yes",
   // into the memory area used for the input.
-  const uint8_t* yes_features_data = g_yes_f2e59fea_nohash_1_data;
+  const uint8_t* yes_features_data = g_yes_micro_f2e59fea_nohash_1_data;
   for (int i = 0; i < input->bytes; ++i) {
     input->data.uint8[i] = yes_features_data[i];
   }
@@ -102,7 +103,7 @@ TF_LITE_MICRO_TEST(TestInvoke) {
   TF_LITE_MICRO_EXPECT_GT(yes_score, no_score);
 
   // Now test with a different input, from a recording of "No".
-  const uint8_t* no_features_data = g_no_f9643d42_nohash_4_data;
+  const uint8_t* no_features_data = g_no_micro_f9643d42_nohash_4_data;
   for (int i = 0; i < input->bytes; ++i) {
     input->data.uint8[i] = no_features_data[i];
   }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..85113a90dcf610a38f21e17f0b303befd6c1e071
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc
@@ -0,0 +1,1477 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See the header for documentation on the meaning of this data.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h"
+
+const int g_no_1000ms_sample_data_size = 16000;
+const int16_t g_no_1000ms_sample_data[16000] = {
+    5,     1,     -10,   -16,   -14,   -10,   -4,    -5,    -10,   -15,   -13,
+    -17,   -22,   -21,   -23,   -25,   -22,   -26,   -28,   -31,   -28,   -25,
+    -20,   -24,   -21,   -13,   -7,    -1,    -1,    3,     3,     4,     -4,
+    -6,    -8,    -10,   -13,   -4,    -2,    5,     8,     11,    26,    28,
+    34,    32,    34,    30,    21,    18,    15,    13,    8,     5,     14,
+    13,    7,     8,     4,     -5,    -7,    -4,    -9,    -13,   -17,   -21,
+    -16,   -14,   -12,   -12,   -14,   -11,   -9,    -2,    5,     -1,    2,
+    0,     2,     1,     -3,    -13,   -14,   -16,   -11,   -10,   -9,    -13,
+    -17,   -19,   -25,   -21,   -21,   -20,   -13,   -5,    -3,    0,     3,
+    6,     5,     1,     0,     -1,    -7,    -10,   -11,   -9,    -6,    -7,
+    -11,   -10,   -5,    -14,   -20,   -23,   -22,   -22,   -19,   -15,   -12,
+    -6,    -5,    3,     13,    16,    17,    25,    26,    28,    34,    34,
+    33,    34,    30,    21,    22,    18,    13,    20,    22,    24,    27,
+    26,    23,    21,    18,    9,     5,     -2,    -7,    -8,    -10,   -8,
+    -8,    -4,    2,     2,     -1,    -7,    -10,   -8,    -12,   -13,   -15,
+    -9,    -5,    -4,    -3,    -6,    -11,   -11,   -18,   -16,   -13,   -10,
+    -12,   -6,    0,     -2,    0,     -3,    -4,    -8,    -12,   -19,   -16,
+    -17,   -19,   -23,   -30,   -33,   -36,   -38,   -39,   -40,   -36,   -37,
+    -32,   -27,   -25,   -31,   -38,   -41,   -47,   -52,   -50,   -42,   -32,
+    -16,   -7,    -3,    0,     -1,    -1,    -5,    -16,   -23,   -29,   -34,
+    -33,   -27,   -17,   -11,   1,     4,     10,    18,    21,    24,    24,
+    25,    30,    34,    30,    29,    26,    23,    20,    15,    14,    13,
+    14,    16,    23,    28,    21,    23,    21,    13,    12,    12,    14,
+    17,    21,    26,    27,    30,    30,    26,    20,    15,    15,    9,
+    8,     9,     10,    7,     8,     7,     1,     -2,    -6,    -10,   -10,
+    -12,   -15,   -10,   -7,    -6,    -5,    0,     -3,    -3,    -12,   -25,
+    -35,   -49,   -53,   -49,   -51,   -48,   -46,   -48,   -39,   -33,   -31,
+    -37,   -42,   -47,   -49,   -46,   -47,   -47,   -46,   -42,   -39,   -33,
+    -26,   -23,   -14,   -8,    -9,    -7,    -10,   -11,   -13,   -13,   -19,
+    -20,   -16,   -11,   -9,    7,     16,    21,    29,    27,    29,    28,
+    21,    14,    13,    17,    19,    20,    18,    13,    17,    16,    18,
+    20,    17,    13,    16,    23,    26,    26,    25,    27,    31,    30,
+    31,    34,    32,    35,    32,    36,    31,    26,    23,    27,    27,
+    29,    27,    26,    32,    31,    28,    26,    23,    14,    6,     0,
+    -4,    -7,    -9,    -10,   -8,    -3,    4,     12,    11,    15,    11,
+    8,     2,     -3,    -3,    -4,    -6,    -11,   -14,   -20,   -28,   -32,
+    -38,   -46,   -42,   -44,   -40,   -34,   -26,   -29,   -25,   -23,   -24,
+    -17,   -21,   -26,   -23,   -25,   -19,   -10,   -11,   -10,   -10,   -12,
+    -9,    -3,    0,     -3,    -7,    -10,   -13,   -10,   -14,   -13,   -17,
+    -22,   -22,   -30,   -28,   -29,   -26,   -18,   -6,    -1,    -3,    -4,
+    -6,    -10,   -13,   -10,   -14,   -16,   -11,   -15,   -9,    -3,    -6,
+    -1,    2,     3,     4,     6,     6,     3,     4,     12,    14,    17,
+    21,    19,    20,    16,    17,    15,    21,    21,    22,    20,    17,
+    16,    16,    20,    17,    15,    9,     5,     11,    18,    24,    28,
+    26,    23,    23,    26,    22,    18,    21,    23,    26,    27,    25,
+    27,    29,    26,    20,    10,    7,     11,    8,     16,    25,    33,
+    37,    38,    39,    35,    30,    20,    13,    9,     6,     5,     13,
+    13,    14,    15,    12,    8,     3,     3,     3,     2,     9,     11,
+    10,    5,     5,     0,     -7,    -11,   -12,   -15,   -17,   -12,   -13,
+    -18,   -19,   -21,   -24,   -22,   -27,   -34,   -36,   -36,   -32,   -20,
+    -16,   -15,   -5,    -5,    -9,    -10,   -9,    -17,   -19,   -20,   -14,
+    -13,   -10,   -4,    -7,    -7,    -14,   -19,   -28,   -31,   -30,   -31,
+    -23,   -19,   -20,   -12,   -11,   -14,   -16,   -20,   -18,   -20,   -21,
+    -24,   -29,   -30,   -30,   -34,   -31,   -25,   -21,   -18,   -11,   -4,
+    2,     2,     3,     3,     2,     4,     -1,    -4,    -8,    -3,    -1,
+    7,     15,    18,    22,    20,    20,    16,    16,    14,    13,    21,
+    25,    26,    35,    28,    28,    28,    25,    21,    19,    18,    21,
+    24,    20,    25,    28,    19,    16,    15,    8,     3,     -1,    3,
+    5,     13,    18,    25,    31,    33,    39,    36,    36,    32,    36,
+    37,    39,    42,    36,    32,    27,    30,    24,    18,    15,    10,
+    7,     5,     6,     -1,    -4,    -10,   -17,   -15,   -19,   -15,   -7,
+    -4,    3,     0,     3,     4,     -2,    -7,    -13,   -21,   -23,   -28,
+    -27,   -26,   -25,   -15,   -10,   -4,    -6,    -5,    -9,    -5,    -3,
+    1,     2,     -1,    1,     -4,    -7,    -8,    -17,   -17,   -15,   -14,
+    -9,    -5,    -7,    -6,    -9,    -16,   -15,   -15,   -16,   -16,   -11,
+    -15,   -15,   -6,    -6,    -5,    -2,    0,     -9,    -10,   -12,   -13,
+    -10,   -4,    0,     8,     5,     4,     2,     0,     -5,    -8,    -16,
+    -15,   -12,   -3,    9,     17,    24,    26,    30,    28,    22,    17,
+    14,    9,     8,     9,     8,     11,    12,    12,    15,    14,    18,
+    20,    17,    19,    22,    21,    12,    5,     0,     3,     -3,    -4,
+    -6,    -7,    1,     8,     8,     8,     10,    2,     -3,    -8,    -15,
+    -20,   -24,   -22,   -23,   -13,   -6,    -7,    -5,    -10,   -8,    -15,
+    -19,   -22,   -20,   -17,   -18,   -13,   -10,   -1,    6,     5,     3,
+    1,     -5,    -11,   -10,   -14,   -19,   -15,   -13,   -8,    -2,    -3,
+    -4,    -3,    -4,    -1,    1,     0,     -3,    -4,    -8,    -18,   -21,
+    -25,   -24,   -16,   -9,    -2,    1,     5,     1,     3,     -2,    -7,
+    -10,   -23,   -30,   -29,   -23,   -9,    -3,    4,     11,    11,    6,
+    2,     0,     -12,   -20,   -28,   -24,   -22,   -17,   -22,   -19,   -14,
+    -21,   -17,   -17,   -12,   -8,    -3,    2,     0,     -6,    -5,    -8,
+    -12,   -17,   -27,   -34,   -31,   -30,   -27,   -19,   -14,   -14,   -14,
+    -14,   -19,   -22,   -21,   -19,   -14,   -1,    5,     9,     8,     6,
+    5,     -4,    -2,    -3,    -3,    -1,    -2,    -3,    2,     7,     8,
+    7,     6,     6,     3,     2,     1,     -2,    0,     6,     11,    18,
+    18,    19,    17,    14,    9,     4,     3,     3,     0,     -1,    3,
+    -1,    -5,    0,     -2,    0,     1,     7,     7,     8,     20,    29,
+    33,    31,    24,    14,    5,     -6,    -11,   -8,    -11,   -2,    6,
+    10,    12,    16,    26,    26,    24,    18,    12,    10,    4,     7,
+    6,     -2,    -12,   -17,   -17,   -20,   -23,   -23,   -18,   -8,    1,
+    3,     5,     6,     3,     0,     -6,    -12,   -12,   -15,   -12,   -7,
+    3,     3,     8,     7,     7,     7,     1,     -1,    -1,    4,     11,
+    17,    25,    32,    35,    42,    50,    52,    56,    50,    55,    53,
+    52,    47,    40,    38,    30,    26,    27,    28,    29,    25,    23,
+    23,    28,    30,    25,    26,    21,    19,    14,    9,     16,    22,
+    25,    33,    39,    45,    49,    48,    55,    51,    43,    35,    20,
+    14,    13,    23,    25,    24,    20,    22,    28,    22,    22,    17,
+    16,    13,    10,    10,    10,    9,     9,     14,    11,    10,    10,
+    4,     0,     0,     -2,    -3,    -5,    -7,    -3,    1,     -8,    -8,
+    -9,    -4,    4,     9,     11,    14,    11,    6,     8,     3,     -6,
+    -10,   -19,   -22,   -24,   -27,   -22,   -16,   -21,   -25,   -33,   -33,
+    -32,   -30,   -21,   -13,   -6,    -5,    2,     1,     4,     9,     7,
+    5,     1,     1,     8,     6,     7,     6,     0,     -6,    -15,   -18,
+    -23,   -22,   -23,   -25,   -22,   -21,   -19,   -17,   -13,   -10,   -10,
+    -16,   -17,   -15,   -13,   -8,    -9,    -14,   -13,   -17,   -20,   -26,
+    -28,   -31,   -29,   -26,   -23,   -13,   -10,   -6,    -1,    5,     7,
+    2,     -3,    -7,    -20,   -18,   -16,   -21,   -27,   -33,   -25,   -27,
+    -22,   -22,   -21,   -16,   -11,   -7,    -2,    2,     11,    18,    11,
+    9,     4,     1,     -1,    -6,    -4,    -5,    -9,    -12,   -16,   -25,
+    -29,   -37,   -37,   -38,   -37,   -33,   -23,   -16,   -14,   -7,    -1,
+    -4,    -3,    -4,    -5,    -11,   -14,   -8,    -8,    -8,    -8,    -9,
+    -4,    -14,   -21,   -22,   -21,   -18,   -15,   -2,    3,     -3,    0,
+    -2,    0,     -4,    -7,    -1,    -2,    3,     3,     -3,    -10,   -13,
+    -10,   -16,   -19,   -17,   -17,   -14,   -7,    5,     5,     7,     8,
+    12,    7,     0,     -5,    -13,   -17,   -18,   -14,   -7,    -4,    3,
+    11,    11,    12,    11,    8,     4,     -5,    -5,    -11,   -15,   -17,
+    -23,   -22,   -18,   -14,   -14,   -12,   -6,    -4,    -1,    3,     1,
+    -4,    -10,   -22,   -29,   -30,   -26,   -15,   -2,    6,     16,    21,
+    28,    32,    25,    24,    20,    9,     5,     0,     3,     7,     10,
+    11,    13,    17,    15,    16,    13,    11,    11,    8,     7,     1,
+    1,     -5,    -2,    -2,    -1,    4,     8,     17,    22,    24,    24,
+    26,    23,    20,    17,    16,    9,     4,     6,     5,     8,     2,
+    -1,    -5,    -4,    -10,   -14,   -14,   -17,   -19,   -18,   -16,   -14,
+    -6,    -3,    1,     3,     0,     -4,    -6,    -4,    -1,    -1,    2,
+    5,     3,     8,     7,     7,     14,    13,    20,    24,    29,    24,
+    12,    7,     -1,    -6,    -15,   -22,   -20,   -27,   -22,   -14,   -6,
+    2,     7,     9,     9,     2,     -3,    -7,    -8,    -10,   -9,    -3,
+    -6,    -11,   -12,   -8,    -5,    -4,    -5,    -3,    0,     3,     6,
+    6,     7,     5,     -7,    -10,   -14,   -13,   -14,   -17,   -11,   -7,
+    -4,    1,     1,     4,     -4,    -8,    -18,   -23,   -23,   -25,   -19,
+    -16,   -15,   -9,    3,     10,    19,    25,    30,    31,    26,    27,
+    23,    19,    16,    8,     7,     2,     0,     -1,    -1,    1,     5,
+    6,     6,     1,     3,     -1,    -7,    -11,   -17,   -19,   -19,   -7,
+    0,     3,     11,    12,    18,    20,    16,    9,     -2,    -7,    -14,
+    -19,   -22,   -30,   -33,   -34,   -36,   -26,   -14,   -11,   -9,    -3,
+    0,     -2,    1,     -2,    -3,    -5,    -12,   -15,   -19,   -14,   -9,
+    -8,    -2,    -6,    -13,   -15,   -19,   -22,   -25,   -26,   -21,   -20,
+    -11,   -1,    1,     5,     9,     13,    15,    12,    11,    3,     1,
+    -1,    0,     8,     13,    16,    16,    15,    16,    15,    12,    9,
+    7,     8,     4,     6,     4,     3,     3,     7,     0,     -4,    -8,
+    -11,   -18,   -18,   -15,   -20,   -23,   -21,   -22,   -21,   -27,   -25,
+    -15,   -7,    -2,    8,     9,     8,     8,     3,     3,     7,     8,
+    8,     8,     12,    11,    12,    4,     -1,    -7,    -11,   -15,   -18,
+    -17,   -17,   -20,   -19,   -13,   -11,   -3,    -3,    -1,    1,     -3,
+    1,     1,     8,     10,    15,    24,    26,    29,    34,    36,    26,
+    20,    12,    -2,    -6,    -9,    -7,    -6,    1,     10,    13,    19,
+    22,    22,    18,    21,    24,    28,    35,    37,    34,    33,    34,
+    34,    30,    19,    15,    10,    19,    21,    23,    24,    21,    19,
+    18,    21,    22,    22,    27,    30,    31,    32,    33,    32,    32,
+    24,    18,    10,    8,     10,    10,    6,     2,     -7,    -14,   -22,
+    -29,   -27,   -29,   -32,   -30,   -28,   -23,   -22,   -11,   -11,   -13,
+    -3,    2,     -1,    1,     1,     -3,    -7,    -5,    -7,    -11,   -17,
+    -23,   -25,   -26,   -27,   -26,   -23,   -14,   -5,    -3,    -1,    -2,
+    -2,    -1,    1,     -2,    -7,    -4,    2,     4,     10,    13,    6,
+    3,     -2,    -6,    -7,    -11,   -17,   -21,   -15,   -7,    -2,    11,
+    16,    22,    25,    25,    23,    24,    23,    21,    22,    25,    23,
+    17,    17,    12,    8,     -2,    -4,    1,     0,     4,     9,     8,
+    10,    9,     9,     15,    13,    10,    8,     1,     1,     -3,    1,
+    4,     11,    10,    9,     5,     5,     4,     1,     -1,    -4,    0,
+    8,     7,     4,     3,     3,     0,     -9,    -16,   -19,   -20,   -21,
+    -18,   -16,   -11,   -10,   -9,    -13,   -12,   -19,   -25,   -21,   -15,
+    -5,    8,     14,    21,    24,    18,    20,    17,    6,     1,     -2,
+    -2,    1,     1,     4,     1,     -3,    2,     0,     -3,    -3,    -4,
+    1,     0,     -5,    -11,   -17,   -21,   -20,   -20,   -20,   -14,   -9,
+    -3,    3,     7,     5,     3,     1,     -1,    -3,    -4,    -1,    1,
+    -5,    -1,    -1,    -7,    -11,   -14,   -12,   -14,   -17,   -18,   -23,
+    -29,   -24,   -27,   -19,   -12,   -13,   -2,    -3,    4,     4,     0,
+    -3,    -5,    -2,    -1,    -5,    -6,    -7,    -7,    -7,    -9,    -13,
+    -9,    -4,    1,     1,     1,     -4,    -11,   -8,    -15,   -19,   -19,
+    -12,   -5,    1,     7,     12,    8,     10,    10,    10,    11,    11,
+    19,    12,    9,     9,     2,     -4,    -13,   -22,   -24,   -25,   -24,
+    -26,   -19,   -14,   -10,   -1,    5,     4,     -1,    -4,    -5,    -10,
+    -14,   -11,   -8,    -10,   -8,    -9,    -7,    -8,    -6,    -1,    -5,
+    -10,   -18,   -27,   -29,   -24,   -19,   -11,   -7,    1,     10,    8,
+    8,     5,     2,     -5,    -1,    -1,    0,     2,     2,     -2,    -8,
+    -8,    -14,   -26,   -25,   -23,   -18,   -9,    2,     2,     7,     13,
+    6,     7,     5,     4,     3,     2,     1,     7,     2,     -1,    1,
+    -2,    2,     0,     -2,    -6,    -3,    5,     7,     9,     6,     5,
+    4,     2,     0,     -1,    -3,    3,     7,     6,     14,    18,    22,
+    20,    22,    19,    13,    9,     2,     -8,    -11,   -6,    -2,    -3,
+    -3,    0,     0,     0,     1,     -1,    -2,    1,     7,     11,    10,
+    11,    17,    17,    11,    11,    4,     6,     6,     13,    19,    22,
+    23,    27,    25,    24,    22,    14,    11,    13,    7,     0,     -3,
+    -9,    -11,   -7,    -7,    -6,    -4,    1,     7,     9,     15,    18,
+    18,    10,    5,     3,     -3,    -6,    -5,    -8,    -5,    4,     8,
+    8,     11,    10,    9,     4,     4,     1,     -3,    -10,   -11,   -8,
+    -16,   -20,   -22,   -19,   -12,   -7,    -10,   -10,   -13,   -14,   -11,
+    -11,   -13,   -18,   -21,   -19,   -17,   -22,   -18,   -22,   -22,   -16,
+    -9,    -3,    0,     3,     6,     3,     3,     -3,    -6,    -9,    -14,
+    -1,    14,    21,    30,    37,    33,    27,    26,    19,    15,    14,
+    11,    20,    12,    9,     10,    19,    20,    19,    22,    20,    22,
+    17,    13,    14,    10,    8,     12,    15,    13,    12,    12,    12,
+    9,     10,    11,    11,    9,     6,     4,     5,     -2,    1,     1,
+    -1,    5,     1,     8,     6,     3,     -1,    -4,    -15,   -24,   -27,
+    -26,   -23,   -19,   -9,    -3,    -4,    -9,    -9,    -10,   -16,   -22,
+    -19,   -18,   -15,   -2,    3,     5,     6,     7,     8,     11,    3,
+    1,     2,     1,     1,     0,     -4,    -13,   -18,   -19,   -19,   -20,
+    -23,   -15,   -10,   -5,    -3,    -1,    -1,    -1,    3,     -1,    0,
+    -8,    -11,   -13,   -14,   -13,   -8,    -6,    -3,    1,     1,     0,
+    0,     5,     4,     5,     5,     5,     4,     0,     -1,    -4,    -13,
+    -22,   -21,   -28,   -26,   -22,   -28,   -23,   -23,   -14,   -11,   -10,
+    -7,    -8,    -5,    -4,    1,     9,     10,    15,    19,    21,    17,
+    18,    19,    16,    13,    16,    21,    27,    29,    22,    22,    13,
+    4,     1,     0,     -5,    -6,    -2,    3,     5,     8,     6,     9,
+    10,    2,     -3,    -9,    -8,    -4,    -2,    -7,    -6,    -4,    -8,
+    -6,    -8,    -11,   -8,    -8,    -6,    2,     -2,    -2,    -1,    2,
+    4,     8,     5,     -1,    -8,    -10,   -7,    -6,    -5,    -6,    -5,
+    6,     13,    22,    28,    33,    31,    38,    35,    28,    27,    22,
+    22,    23,    26,    23,    21,    28,    28,    23,    23,    22,    21,
+    20,    14,    6,     -1,    -5,    -8,    -5,    -1,    2,     5,     5,
+    7,     8,     5,     4,     0,     3,     6,     10,    13,    13,    6,
+    4,     4,     0,     -2,    -3,    0,     3,     5,     7,     9,     7,
+    6,     10,    8,     3,     4,     -1,    -4,    -2,    0,     -2,    -2,
+    -2,    -3,    5,     8,     6,     4,     -1,    -7,    -6,    -7,    -12,
+    -18,   -11,   -2,    -1,    -1,    -1,    -2,    -7,    -7,    -3,    -3,
+    -5,    -6,    -6,    -6,    -6,    -6,    -9,    -12,   -9,    -5,    1,
+    3,     5,     5,     8,     7,     3,     -5,    -3,    -2,    2,     3,
+    5,     5,     -1,    -2,    -4,    -8,    -9,    -9,    -7,    -12,   -13,
+    -17,   -19,   -16,   -19,   -21,   -21,   -19,   -11,   -6,    -3,    7,
+    8,     6,     2,     0,     1,     1,     -2,    -5,    0,     -2,    2,
+    1,     2,     0,     -2,    -1,    -10,   -21,   -25,   -24,   -21,   -19,
+    -14,   -8,    -3,    -5,    0,     0,     -5,    -6,    -3,    -6,    -9,
+    -13,   -19,   -20,   -21,   -21,   -24,   -25,   -27,   -27,   -29,   -26,
+    -19,   -14,   -14,   -13,   -8,    -5,    -10,   -10,   -6,    1,     4,
+    14,    22,    23,    24,    20,    20,    18,    14,    11,    9,     6,
+    8,     12,    15,    18,    18,    12,    8,     9,     9,     9,     7,
+    4,     9,     5,     6,     5,     3,     3,     -1,    -1,    -6,    -10,
+    -6,    -8,    -3,    0,     -2,    -3,    -2,    -6,    -6,    -7,    -3,
+    -3,    -3,    -2,    1,     -1,    -10,   -7,    -13,   -21,   -23,   -20,
+    -19,   -18,   -18,   -19,   -15,   -16,   -7,    -6,    -9,    -13,   -12,
+    -6,    -1,    3,     6,     7,     5,     3,     -3,    -11,   -18,   -20,
+    -26,   -29,   -27,   -27,   -24,   -30,   -29,   -28,   -23,   -18,   -21,
+    -18,   -15,   -9,    1,     9,     17,    21,    23,    18,    14,    5,
+    -1,    -2,    -1,    0,     3,     6,     5,     4,     4,     0,     -1,
+    1,     -4,    -9,    -13,   -11,   -20,   -21,   -19,   -14,   -9,    -4,
+    1,     6,     10,    16,    24,    30,    35,    31,    38,    37,    35,
+    39,    36,    36,    32,    30,    33,    31,    24,    19,    12,    4,
+    -1,    -7,    -11,   -7,    -5,    -3,    2,     6,     10,    16,    19,
+    21,    21,    16,    10,    14,    12,    14,    13,    12,    12,    5,
+    6,     2,     0,     1,     3,     4,     6,     9,     6,     2,     -1,
+    -3,    -10,   -15,   -13,   -17,   -19,   -15,   -16,   -15,   -13,   -8,
+    -8,    -7,    -10,   -5,    -2,    1,     5,     5,     11,    10,    12,
+    10,    9,     9,     15,    23,    33,    35,    33,    34,    34,    35,
+    34,    24,    30,    26,    23,    21,    20,    15,    10,    3,     4,
+    0,     -7,    -8,    -9,    -9,    -8,    -4,    0,     5,     5,     2,
+    3,     -2,    0,     0,     -1,    0,     -1,    1,     2,     6,     3,
+    1,     -9,    -5,    -6,    -2,    -8,    -12,   -9,    -10,   -7,    -8,
+    -8,    -6,    -2,    -2,    -1,    0,     -2,    -1,    -8,    -18,   -19,
+    -27,   -37,   -42,   -40,   -39,   -33,   -30,   -23,   -16,   -16,   -9,
+    -13,   -11,   -10,   -10,   -8,    -3,    -1,    2,     0,     -1,    2,
+    6,     4,     8,     10,    17,    21,    28,    31,    33,    28,    20,
+    12,    8,     -3,    -5,    -4,    -3,    2,     6,     9,     8,     2,
+    7,     4,     -6,    -9,    -15,   -13,   -15,   -17,   -14,   -11,   -12,
+    -5,    -6,    -4,    -6,    -11,   -11,   -7,    -4,    -6,    -8,    -13,
+    -10,   -7,    -12,   -11,   -12,   -13,   -12,   -9,    -9,    -10,   -10,
+    -6,    -8,    -8,    -7,    -9,    -9,    -7,    2,     5,     5,     6,
+    3,     4,     6,     3,     -1,    -2,    -2,    -2,    1,     5,     3,
+    4,     2,     -2,    -7,    -9,    -13,   -11,   -8,    2,     12,    23,
+    31,    37,    41,    40,    37,    36,    31,    31,    27,    28,    24,
+    13,    16,    14,    15,    9,     4,     4,     5,     4,     7,     12,
+    16,    14,    11,    13,    6,     -2,    -4,    -1,    -3,    3,     6,
+    6,     9,     7,     9,     7,     5,     0,     1,     -1,    -2,    -4,
+    -1,    0,     0,     -4,    0,     -4,    -9,    -15,   -16,   -18,   -15,
+    -10,   -6,    -8,    -5,    -2,    -2,    0,     4,     7,     0,     -2,
+    -3,    4,     3,     2,     -1,    -3,    -8,    -19,   -19,   -19,   -16,
+    -8,    -5,    0,     1,     2,     1,     -1,    -2,    -10,   -12,   -10,
+    -4,    3,     4,     2,     7,     8,     4,     1,     -5,    -5,    -4,
+    -1,    9,     10,    12,    15,    15,    14,    11,    20,    16,    19,
+    18,    26,    29,    21,    23,    16,    16,    3,     -3,    -4,    -10,
+    -12,   -10,   -6,    -7,    -12,   -17,   -14,   -16,   -19,   -13,   -10,
+    -13,   -13,   -2,    2,     3,     7,     13,    22,    21,    21,    21,
+    24,    27,    23,    22,    20,    17,    17,    16,    13,    11,    5,
+    1,     1,     5,     5,     3,     2,     -1,    2,     -5,    -6,    -3,
+    -11,   -9,    -6,    -5,    -10,   -4,    -1,    1,     2,     -1,    -4,
+    -4,    -9,    -9,    -7,    -3,    3,     -2,    1,     1,     4,     -4,
+    -8,    -8,    -17,   -17,   -13,   -13,   -18,   -18,   -25,   -27,   -21,
+    -22,   -18,   -7,    -1,    5,     9,     11,    11,    11,    15,    11,
+    4,     1,     6,     8,     17,    12,    10,    5,     -2,    -3,    -14,
+    -17,   -25,   -26,   -22,   -20,   -13,   -12,   -12,   -13,   -10,   -4,
+    -6,    -6,    -4,    -6,    -4,    0,     -3,    -7,    -7,    -10,   -17,
+    -14,   -9,    -3,    4,     4,     6,     1,     0,     0,     -6,    -3,
+    -4,    -3,    -6,    -9,    -9,    -5,    0,     1,     2,     -2,    3,
+    -1,    -4,    -5,    -11,   -14,   -17,   -14,   -12,   -14,   -19,   -21,
+    -25,   -35,   -40,   -39,   -31,   -24,   -13,   -4,    -1,    0,     0,
+    2,     -2,    -5,    -8,    -8,    -9,    -6,    -2,    0,     -5,    -6,
+    2,     5,     4,     1,     6,     8,     9,     14,    13,    19,    15,
+    19,    13,    14,    20,    16,    16,    14,    14,    17,    13,    12,
+    11,    6,     -1,    -7,    -9,    -10,   -11,   -2,    8,     12,    12,
+    12,    8,     4,     1,     -3,    -4,    -4,    -3,    1,     9,     14,
+    16,    10,    12,    9,     6,     4,     -1,    8,     6,     3,     6,
+    1,     -11,   -10,   -10,   -13,   -9,    -6,    -2,    -2,    9,     13,
+    17,    17,    19,    17,    16,    9,     -2,    -5,    -5,    -3,    -9,
+    -8,    -8,    -12,   -17,   -16,   -18,   -15,   -9,    -7,    1,     10,
+    17,    18,    23,    25,    23,    20,    15,    17,    18,    23,    33,
+    40,    43,    45,    51,    53,    47,    36,    27,    10,    5,     1,
+    4,     5,     4,     0,     0,     6,     7,     8,     9,     3,     2,
+    1,     0,     -1,    3,     5,     5,     13,    7,     4,     4,     3,
+    11,    17,    21,    31,    31,    31,    31,    28,    26,    23,    19,
+    16,    17,    16,    10,    10,    12,    9,     7,     -1,    -7,    -12,
+    -15,   -15,   -15,   -13,   -13,   -16,   -19,   -19,   -23,   -31,   -34,
+    -38,   -39,   -31,   -30,   -21,   -21,   -18,   -11,   -16,   -20,   -25,
+    -22,   -18,   -14,   -7,    -8,    -3,    2,     10,    13,    12,    10,
+    6,     2,     0,     0,     0,     -6,    -4,    -1,    0,     0,     -1,
+    -2,    1,     3,     8,     9,     3,     6,     2,     -4,    -2,    -3,
+    -7,    -4,    -3,    2,     6,     8,     10,    12,    15,    11,    15,
+    12,    13,    14,    15,    18,    14,    8,     4,     4,     3,     -4,
+    -5,    -4,    -2,    -3,    -2,    4,     9,     13,    18,    21,    20,
+    18,    15,    11,    6,     7,     10,    8,     6,     3,     -3,    -7,
+    -14,   -21,   -29,   -33,   -32,   -26,   -17,   -12,   -11,   -9,    -3,
+    -10,   -13,   -18,   -23,   -21,   -26,   -26,   -24,   -28,   -25,   -29,
+    -30,   -30,   -27,   -17,   -7,    2,     10,    13,    16,    16,    17,
+    18,    17,    19,    19,    20,    15,    14,    16,    14,    10,    5,
+    0,     -4,    -18,   -21,   -25,   -20,   -16,   -13,   -8,    -5,    2,
+    6,     11,    12,    18,    16,    18,    15,    13,    17,    18,    22,
+    21,    25,    26,    25,    26,    28,    31,    27,    20,    10,    3,
+    -6,    -10,   -16,   -19,   -18,   -15,   -13,   -10,   -2,    0,     2,
+    4,     3,     5,     -1,    0,     1,     2,     0,     -2,    -1,    -6,
+    -5,    -7,    -12,   -10,   -9,    -4,    -1,    3,     4,     2,     4,
+    4,     3,     -3,    -6,    -11,   -14,   -15,   -23,   -25,   -29,   -30,
+    -28,   -25,   -22,   -19,   -21,   -19,   -11,   -7,    -7,    -3,    -3,
+    -6,    -8,    -13,   -10,   -10,   -5,    1,     4,     9,     7,     6,
+    6,     4,     -5,    -11,   -8,    -6,    -3,    0,     3,     7,     11,
+    7,     3,     5,     6,     10,    12,    14,    16,    8,     5,     -1,
+    -1,    4,     0,     0,     -3,    -5,    -5,    -4,    -2,    -2,    1,
+    4,     7,     5,     10,    9,     6,     9,     12,    19,    28,    32,
+    32,    33,    31,    29,    20,    17,    16,    14,    15,    6,     -2,
+    -5,    -7,    -10,   -10,   -11,   -9,    -6,    -3,    8,     10,    10,
+    10,    12,    12,    7,     7,     5,     3,     2,     2,     -2,    -5,
+    -4,    -7,    -2,    -6,    -5,    -6,    -11,   -14,   -13,   -10,   -11,
+    -15,   -16,   -11,   -11,   -11,   -10,   -16,   -15,   -15,   -16,   -10,
+    -11,   -11,   -5,    -1,    2,     1,     2,     0,     1,     4,     8,
+    5,     -4,    -2,    -4,    -12,   -18,   -24,   -20,   -25,   -14,   -3,
+    4,     11,    13,    13,    7,     4,     -4,    -9,    -13,   -17,   -10,
+    -6,    -1,    0,     2,     2,     -1,    1,     -8,    -18,   -22,   -19,
+    -19,   -22,   -20,   -22,   -20,   -17,   -12,   -9,    -4,    3,     9,
+    9,     9,     7,     6,     13,    10,    11,    8,     4,     -1,    5,
+    7,     7,     8,     4,     2,     2,     -2,    -8,    -11,   -16,   -18,
+    -12,   -12,   -9,    -2,    3,     3,     5,     5,     6,     9,     11,
+    20,    22,    26,    30,    28,    22,    15,    15,    10,    11,    9,
+    6,     9,     9,     11,    10,    12,    10,    8,     8,     7,     9,
+    4,     3,     9,     5,     1,     2,     0,     -3,    -3,    0,     3,
+    0,     -2,    1,     4,     6,     4,     0,     1,     -4,    -13,   -13,
+    -11,   -20,   -21,   -15,   -17,   -23,   -22,   -24,   -29,   -24,   -29,
+    -32,   -21,   -13,   -11,   -9,    -9,    -8,    -13,   -11,   -11,   -11,
+    -11,   -17,   -17,   -21,   -23,   -27,   -32,   -33,   -32,   -31,   -35,
+    -31,   -26,   -24,   -18,   -10,   -1,    5,     13,    17,    15,    13,
+    8,     4,     6,     9,     10,    13,    11,    12,    13,    9,     5,
+    6,     8,     12,    21,    25,    24,    23,    16,    8,     7,     0,
+    -3,    -8,    -9,    -2,    1,     11,    18,    25,    30,    31,    27,
+    21,    19,    19,    18,    18,    22,    24,    16,    14,    8,     2,
+    -4,    -9,    -7,    -10,   -6,    -8,    -8,    -13,   -14,   -11,   -13,
+    -8,    -7,    6,     9,     10,    15,    17,    11,    11,    9,     2,
+    2,     -2,    2,     -6,    -6,    -7,    -14,   -11,   -12,   -13,   -17,
+    -22,   -25,   -30,   -24,   -16,   -4,    5,     2,     7,     5,     2,
+    -1,    1,     -4,    -4,    4,     8,     8,     5,     6,     6,     2,
+    1,     -2,    -9,    -14,   -17,   -16,   -15,   -14,   -12,   -11,   -6,
+    -6,    -2,    -3,    -3,    6,     13,    18,    27,    27,    26,    24,
+    22,    19,    18,    19,    12,    8,     7,     -2,    0,     -6,    -8,
+    -6,    -4,    -6,    -14,   -16,   -16,   -15,   -12,   -2,    6,     12,
+    16,    18,    14,    16,    13,    12,    17,    16,    17,    17,    12,
+    13,    10,    14,    14,    10,    2,     -1,    -3,    -5,    -10,   -15,
+    -13,   -20,   -21,   -21,   -21,   -19,   -20,   -18,   -8,    -4,    -1,
+    -1,    4,     2,     -3,    0,     -5,    -5,    -3,    -1,    0,     6,
+    5,     6,     7,     7,     3,     2,     1,     -5,    -3,    0,     3,
+    5,     7,     4,     10,    15,    15,    11,    6,     8,     9,     14,
+    19,    18,    14,    12,    16,    15,    11,    9,     9,     5,     4,
+    0,     -7,    -12,   -18,   -22,   -29,   -32,   -36,   -37,   -38,   -39,
+    -32,   -24,   -20,   -14,   -10,   -2,    0,     1,     9,     13,    21,
+    26,    31,    35,    40,    38,    32,    33,    25,    14,    11,    7,
+    1,     -1,    -6,    -5,    -11,   -20,   -22,   -19,   -16,   -9,    2,
+    9,     14,    14,    13,    13,    12,    10,    3,     2,     1,     0,
+    6,     5,     -1,    -4,    -13,   -17,   -21,   -25,   -29,   -30,   -23,
+    -14,   -4,    4,     11,    11,    12,    13,    13,    5,     6,     6,
+    7,     5,     5,     9,     -2,    3,     0,     -2,    -3,    -5,    -1,
+    3,     9,     16,    18,    17,    17,    11,    5,     1,     -4,    -13,
+    -12,   -7,    -7,    1,     6,     4,     2,     3,     1,     1,     0,
+    -1,    -5,    -5,    -3,    -5,    -1,    8,     9,     7,     12,    7,
+    6,     4,     3,     -1,    -1,    -4,    -14,   -16,   -18,   -24,   -34,
+    -44,   -37,   -37,   -36,   -28,   -19,   -15,   -6,    -2,    -3,    2,
+    5,     6,     3,     6,     6,     9,     7,     3,     -4,    -15,   -25,
+    -34,   -37,   -41,   -41,   -38,   -33,   -27,   -22,   -14,   -15,   -18,
+    -18,   -15,   -8,    -7,    -2,    2,     0,     4,     12,    13,    10,
+    17,    20,    16,    17,    23,    24,    22,    24,    22,    28,    26,
+    24,    22,    26,    28,    27,    23,    17,    10,    4,     4,     1,
+    -1,    0,     4,     9,     15,    14,    15,    14,    14,    13,    8,
+    0,     -1,    -11,   -13,   -4,    -3,    -5,    -3,    -1,    -6,    -5,
+    -7,    -4,    -2,    2,     7,     15,    20,    14,    13,    8,     2,
+    -6,    -15,   -23,   -25,   -20,   -22,   -20,   -14,   -10,   -4,    -2,
+    1,     -10,   -15,   -12,   -8,    -8,    -7,    -5,    -10,   -12,   -20,
+    -28,   -26,   -24,   -16,   -8,    -5,    3,     8,     9,     12,    12,
+    12,    14,    13,    12,    10,    13,    23,    29,    28,    33,    36,
+    32,    28,    23,    25,    26,    30,    34,    27,    22,    16,    12,
+    3,     -6,    -13,   -13,   -15,   -14,   -9,    -11,   -13,   -13,   -16,
+    -15,   -20,   -22,   -20,   -32,   -30,   -29,   -24,   -18,   -18,   -18,
+    -13,   -15,   -15,   -16,   -17,   -10,   -11,   -12,   -15,   -17,   -17,
+    -19,   -21,   -22,   -26,   -28,   -21,   -18,   -14,   -5,    2,     6,
+    7,     5,     3,     -2,    0,     -4,    -2,    -3,    -6,    -9,    -12,
+    -11,   -11,   -19,   -23,   -20,   -21,   -16,   -19,   -23,   -22,   -24,
+    -21,   -22,   -17,   -15,   -8,    -1,    4,     14,    18,    23,    24,
+    25,    25,    18,    15,    7,     2,     14,    19,    22,    20,    23,
+    22,    20,    19,    20,    17,    16,    21,    22,    21,    18,    9,
+    3,     -6,    -14,   -19,   -30,   -36,   -40,   -32,   -22,   -21,   -16,
+    -7,    -1,    3,     2,     3,     6,     9,     16,    20,    22,    26,
+    27,    29,    32,    30,    23,    19,    20,    21,    18,    22,    24,
+    15,    14,    9,     9,     7,     6,     9,     9,     16,    22,    20,
+    18,    18,    9,     -1,    -10,   -16,   -19,   -22,   -22,   -20,   -16,
+    -11,   -5,    0,     1,     4,     2,     0,     3,     5,     10,    8,
+    12,    10,    11,    9,     8,     7,     -3,    -4,    -10,   -11,   -5,
+    2,     8,     12,    12,    13,    14,    15,    14,    12,    10,    14,
+    13,    8,     0,     -2,    -3,    -9,    -6,    -13,   -21,   -12,   -12,
+    -8,    -9,    -14,   -16,   -19,   -23,   -22,   -23,   -30,   -26,   -17,
+    -14,   -9,    -2,    3,     11,    16,    17,    17,    11,    12,    13,
+    12,    9,     8,     7,     10,    17,    14,    13,    9,     7,     6,
+    5,     10,    10,    6,     10,    9,     1,     -5,    -10,   -12,   -17,
+    -16,   -14,   -13,   -10,   -6,    -2,    0,     -1,    2,     2,     -1,
+    2,     6,     12,    18,    23,    22,    23,    24,    20,    16,    10,
+    6,     9,     16,    15,    15,    16,    14,    8,     4,     0,     -3,
+    -7,    -4,    -5,    -5,    0,     -4,    1,     1,     1,     -4,    -10,
+    -17,   -25,   -25,   -28,   -28,   -27,   -25,   -20,   -20,   -20,   -22,
+    -14,   -11,   -4,    4,     6,     11,    10,    12,    9,     6,     2,
+    -6,    -10,   -12,   -7,    -1,    -6,    0,     1,     2,     5,     1,
+    -1,    1,     -3,    -6,    -4,    -5,    -4,    -6,    -5,    -7,    -10,
+    -10,   -8,    -11,   -9,    -2,    9,     15,    14,    20,    19,    19,
+    16,    16,    11,    3,     2,     2,     5,     4,     5,     3,     -1,
+    -1,    -6,    -11,   -16,   -18,   -18,   -12,   -17,   -18,   -13,   -15,
+    -5,    -4,    -3,    -1,    2,     6,     7,     11,    14,    17,    17,
+    18,    21,    18,    19,    18,    23,    27,    36,    32,    35,    30,
+    24,    25,    18,    10,    3,     -1,    -4,    -11,   -16,   -21,   -33,
+    -37,   -35,   -36,   -35,   -30,   -26,   -26,   -21,   -10,   -7,    -3,
+    -4,    -3,    -3,    -9,    -12,   -16,   -25,   -22,   -11,   -6,    2,
+    5,     7,     4,     -2,    -8,    -16,   -23,   -30,   -28,   -23,   -20,
+    -11,   -11,   -8,    5,     2,     -3,    -1,    -11,   -15,   -10,   -13,
+    -8,    -8,    -12,   -9,    -10,   -15,   -8,    -4,    -3,    7,     6,
+    13,    20,    25,    24,    25,    27,    28,    25,    23,    22,    27,
+    28,    27,    30,    28,    26,    20,    16,    13,    7,     2,     1,
+    6,     3,     -4,    -6,    -13,   -18,   -19,   -21,   -15,   -3,    -1,
+    10,    16,    17,    20,    24,    28,    28,    26,    26,    28,    27,
+    24,    23,    20,    20,    24,    20,    17,    14,    6,     0,     2,
+    1,     0,     -3,    -7,    -12,   -18,   -29,   -28,   -30,   -32,   -23,
+    -27,   -25,   -20,   -17,   -13,   -11,   -14,   -17,   -21,   -22,   -18,
+    -11,   -12,   -6,    -8,    -9,    -5,    -6,    -10,   -18,   -19,   -16,
+    -13,   -9,    -6,    -7,    -13,   -10,   -14,   -22,   -30,   -37,   -35,
+    -37,   -35,   -34,   -36,   -30,   -23,   -17,   -16,   -16,   -11,   -6,
+    -2,    3,     7,     7,     6,     7,     7,     13,    21,    20,    22,
+    23,    22,    24,    17,    5,     -1,    -2,    -8,    -13,   -14,   -17,
+    -24,   -28,   -23,   -22,   -19,   -12,   -14,   -10,   -14,   -21,   -20,
+    -21,   -22,   -13,   -6,    -1,    6,     4,     10,    11,    8,     10,
+    10,    17,    20,    27,    34,    32,    26,    26,    24,    17,    13,
+    6,     9,     12,    15,    17,    12,    11,    9,     3,     -3,    -3,
+    -8,    -9,    -4,    -2,    -2,    2,     1,     -1,    -3,    -7,    -8,
+    -11,   -15,   -8,    -5,    1,     9,     7,     10,    13,    17,    14,
+    12,    8,     6,     3,     6,     9,     8,     5,     0,     -2,    1,
+    1,     -3,    -6,    -12,   -17,   -17,   -23,   -28,   -33,   -31,   -29,
+    -30,   -35,   -28,   -25,   -17,   -5,    0,     6,     10,    14,    27,
+    31,    26,    31,    30,    32,    41,    42,    42,    43,    34,    32,
+    21,    12,    2,     1,     -3,    -1,    8,     13,    20,    19,    18,
+    19,    13,    8,     5,     7,     6,     7,     6,     4,     3,     -2,
+    0,     2,     -4,    -1,    -3,    2,     12,    22,    33,    32,    31,
+    35,    35,    34,    32,    26,    27,    26,    21,    17,    10,    1,
+    -3,    -14,   -21,   -19,   -21,   -19,   -24,   -24,   -19,   -16,   -13,
+    -16,   -13,   -15,   -17,   -12,   -9,    -4,    7,     19,    27,    33,
+    37,    34,    35,    30,    24,    23,    25,    21,    20,    18,    15,
+    12,    13,    8,     2,     -4,    -12,   -18,   -17,   -14,   -10,   -14,
+    -8,    -14,   -14,   -12,   -14,   -19,   -23,   -31,   -32,   -28,   -30,
+    -22,   -20,   -13,   1,     0,     6,     14,    15,    20,    22,    20,
+    16,    9,     2,     1,     3,     6,     7,     9,     10,    14,    17,
+    16,    14,    4,     -7,    -16,   -31,   -40,   -41,   -40,   -38,   -34,
+    -40,   -37,   -33,   -28,   -22,   -17,   -11,   -10,   -12,   -5,    -5,
+    -8,    -4,    0,     -1,    1,     1,     6,     11,    14,    22,    25,
+    28,    31,    32,    32,    31,    31,    20,    13,    12,    5,     4,
+    4,     2,     0,     -3,    -6,    -8,    -4,    -4,    -4,    -1,    7,
+    9,     10,    13,    13,    16,    10,    7,     3,     6,     8,     8,
+    15,    20,    23,    18,    15,    12,    4,     1,     0,     -4,    -4,
+    -1,    8,     11,    13,    21,    24,    19,    12,    2,     -5,    -11,
+    -15,   -17,   -17,   -19,   -23,   -28,   -34,   -33,   -37,   -29,   -27,
+    -24,   -17,   -13,   -8,    -6,    -2,    5,     3,     4,     -2,    -5,
+    -4,    0,     2,     3,     1,     -5,    -5,    -6,    -11,   -11,   -15,
+    -15,   -19,   -17,   -17,   -21,   -23,   -21,   -22,   -24,   -28,   -27,
+    -25,   -15,   -8,    -1,    2,     2,     3,     3,     2,     -2,    0,
+    1,     -1,    2,     5,     7,     2,     0,     2,     -6,    -9,    -8,
+    -6,    -3,    -3,    3,     0,     5,     0,     0,     -5,    -12,   -13,
+    -20,   -14,   -14,   -6,    -5,    -2,    0,     6,     11,    9,     9,
+    11,    10,    13,    19,    26,    29,    36,    37,    40,    35,    27,
+    20,    13,    6,     3,     -1,    -1,    -1,    -3,    -6,    -8,    -14,
+    -16,   -25,   -28,   -23,   -21,   -24,   -22,   -22,   -22,   -24,   -28,
+    -35,   -43,   -42,   -37,   -29,   -20,   -5,    2,     10,    23,    28,
+    30,    31,    30,    39,    43,    40,    41,    43,    43,    38,    29,
+    18,    14,    12,    3,     6,     3,     3,     0,     -1,    -3,    -5,
+    -5,    -8,    -8,    -10,   -6,    -1,    1,     5,     1,     2,     6,
+    0,     -3,    -7,    -13,   -10,   -7,    -8,    -7,    -3,    -5,    -4,
+    -4,    -4,    -5,    -2,    2,     3,     6,     4,     3,     -1,    -2,
+    -5,    -16,   -22,   -31,   -39,   -38,   -42,   -47,   -42,   -42,   -35,
+    -27,   -30,   -28,   -25,   -26,   -24,   -20,   -19,   -19,   -19,   -19,
+    -14,   -16,   -13,   -9,    -10,   -1,    8,     17,    21,    28,    26,
+    28,    24,    14,    8,     2,     0,     -4,    -4,    -13,   -16,   -16,
+    -13,   -12,   -7,    -5,    0,     -4,    -1,    2,     4,     8,     8,
+    10,    10,    10,    14,    16,    17,    23,    20,    27,    27,    27,
+    21,    14,    11,    0,     -4,    -8,    -8,    -1,    -1,    1,     6,
+    8,     23,    22,    23,    23,    25,    26,    26,    22,    21,    20,
+    22,    17,    12,    8,     3,     -2,    -2,    -4,    -5,    -3,    1,
+    7,     6,     8,     9,     12,    6,     1,     -4,    -8,    -6,    -3,
+    -4,    -5,    -3,    -7,    -6,    -6,    -11,   -11,   -19,   -23,   -26,
+    -28,   -34,   -41,   -41,   -44,   -45,   -47,   -40,   -39,   -33,   -29,
+    -21,   -14,   -16,   -6,    -7,    -3,    1,     6,     8,     11,    14,
+    14,    15,    15,    18,    18,    16,    17,    12,    15,    20,    21,
+    19,    21,    23,    22,    21,    16,    12,    8,     7,     7,     10,
+    13,    13,    16,    16,    16,    16,    15,    15,    12,    14,    14,
+    15,    12,    11,    17,    19,    19,    14,    13,    15,    17,    18,
+    20,    24,    27,    24,    19,    11,    10,    1,     0,     0,     -1,
+    3,     8,     16,    18,    17,    22,    22,    21,    19,    7,     0,
+    1,     -1,    -2,    -1,    -6,    -8,    -12,   -14,   -20,   -21,   -24,
+    -19,   -9,    -4,    -3,    2,     2,     3,     0,     -10,   -19,   -23,
+    -29,   -31,   -35,   -29,   -33,   -28,   -25,   -25,   -19,   -22,   -23,
+    -24,   -21,   -17,   -15,   -17,   -13,   -15,   -12,   -15,   -14,   -14,
+    -12,   -9,    -5,    1,     9,     13,    13,    17,    17,    15,    11,
+    12,    8,     13,    20,    24,    30,    29,    33,    30,    26,    23,
+    13,    9,     4,     3,     3,     5,     3,     2,     5,     3,     2,
+    1,     3,     6,     10,    14,    19,    23,    21,    20,    21,    17,
+    11,    5,     -3,    -7,    -12,   -15,   -16,   -13,   -15,   -13,   -7,
+    -4,    -5,    -5,    -1,    5,     11,    8,     7,     -2,    -2,    -5,
+    -6,    -1,    -2,    0,     2,     8,     13,    15,    17,    15,    16,
+    10,    13,    3,     -1,    -4,    -4,    -4,    0,     8,     13,    15,
+    9,     11,    9,     12,    9,     10,    10,    5,     11,    16,    21,
+    20,    15,    13,    5,     3,     -3,    1,     1,     0,     -4,    -7,
+    -9,    -7,    -9,    -10,   -7,    -6,    -3,    -2,    -3,    -3,    -6,
+    -12,   -16,   -22,   -21,   -26,   -28,   -25,   -24,   -23,   -23,   -28,
+    -32,   -29,   -26,   -26,   -23,   -29,   -23,   -16,   -11,   -7,    -9,
+    -10,   -12,   -18,   -20,   -20,   -26,   -23,   -16,   -17,   -10,   -7,
+    0,     3,     -2,    0,     -4,    -7,    -8,    -6,    -3,    -7,    -5,
+    -5,    1,     0,     -3,    -2,    -3,    5,     7,     10,    19,    17,
+    22,    21,    20,    16,    8,     9,     10,    12,    20,    28,    31,
+    28,    28,    26,    21,    14,    8,     5,     4,     5,     8,     9,
+    9,     13,    17,    16,    14,    20,    17,    13,    16,    17,    18,
+    18,    15,    11,    5,     -2,    -8,    -15,   -17,   -17,   -24,   -24,
+    -23,   -18,   -13,   -13,   -9,    -7,    -4,    0,     3,     6,     2,
+    2,     -4,    -5,    -5,    -4,    -4,    -2,    2,     6,     10,    7,
+    4,     2,     -2,    -3,    -8,    -10,   -14,   -27,   -29,   -37,   -36,
+    -29,   -27,   -19,   -7,    -3,    0,     -2,    2,     8,     13,    18,
+    15,    10,    10,    6,     1,     -5,    -12,   -17,   -20,   -23,   -23,
+    -22,   -19,   -17,   -10,   -6,    -3,    2,     0,     4,     11,    14,
+    19,    16,    6,     7,     3,     3,     4,     1,     7,     8,     7,
+    3,     -2,    0,     0,     0,     -1,    -2,    0,     4,     3,     5,
+    9,     9,     12,    7,     5,     0,     0,     1,     0,     2,     -6,
+    -10,   -9,    -13,   -15,   -19,   -15,   -18,   -16,   -17,   -9,    -5,
+    -2,    2,     2,     3,     7,     2,     -3,    -8,    -13,   -8,    1,
+    8,     12,    15,    17,    17,    11,    7,     0,     -4,    -8,    -8,
+    -3,    -1,    -4,    -6,    -6,    -13,   -12,   -12,   -13,   -12,   -8,
+    -9,    -5,    -4,    -2,    0,     -1,    -6,    -7,    -6,    -10,   -10,
+    -8,    -6,    1,     5,     6,     15,    18,    16,    12,    12,    12,
+    10,    13,    7,     0,     -9,    -10,   -11,   -6,    -8,    -8,    -4,
+    0,     6,     10,    11,    15,    15,    15,    12,    10,    6,     6,
+    11,    12,    20,    25,    23,    25,    18,    12,    6,     -1,    -4,
+    -10,   -12,   -9,    -13,   -16,   -15,   -18,   -18,   -22,   -22,   -17,
+    -14,   -12,   -8,    -3,    1,     4,     11,    13,    7,     0,     -8,
+    -11,   -11,   -13,   -14,   -12,   -11,   -9,    -6,    -5,    -2,    1,
+    5,     6,     10,    18,    17,    15,    13,    11,    12,    13,    10,
+    9,     13,    16,    16,    13,    11,    6,     5,     0,     -5,    -4,
+    -3,    2,     6,     5,     6,     11,    14,    20,    23,    28,    27,
+    22,    24,    23,    22,    16,    17,    12,    7,     -1,    -9,    -10,
+    -9,    -9,    -13,   -11,   -9,    -2,    -2,    -7,    -8,    -6,    -7,
+    -12,   -12,   -10,   0,     5,     11,    13,    11,    10,    7,     3,
+    0,     0,     3,     10,    14,    16,    18,    19,    21,    14,    15,
+    12,    7,     6,     7,     9,     7,     11,    6,     4,     4,     -1,
+    -9,    -12,   -12,   -14,   -9,    -9,    -6,    -5,    -4,    -6,    -7,
+    -12,   -15,   -17,   -27,   -23,   -20,   -19,   -19,   -18,   -24,   -20,
+    -25,   -28,   -33,   -31,   -29,   -27,   -15,   -12,   -7,    -3,    1,
+    -3,    -3,    -5,    -8,    -6,    0,     13,    17,    24,    25,    23,
+    24,    18,    8,     -3,    -4,    -4,    -7,    -3,    1,     4,     7,
+    9,     10,    14,    14,    20,    28,    35,    38,    42,    43,    43,
+    39,    30,    27,    19,    15,    8,     10,    12,    19,    25,    26,
+    27,    23,    22,    15,    10,    6,     8,     4,     6,     6,     3,
+    7,     7,     15,    11,    7,     6,     5,     9,     6,     0,     -3,
+    -14,   -21,   -21,   -30,   -39,   -42,   -40,   -37,   -37,   -36,   -32,
+    -30,   -24,   -21,   -22,   -23,   -24,   -28,   -31,   -31,   -29,   -27,
+    -30,   -31,   -31,   -31,   -34,   -33,   -34,   -26,   -21,   -15,   -10,
+    -5,    -3,    -2,    -3,    -6,    -5,    -11,   -14,   -10,   -5,    0,
+    9,     10,    18,    21,    19,    21,    11,    7,     4,     6,     6,
+    7,     3,     -6,    -9,    -16,   -23,   -24,   -23,   -26,   -18,   -16,
+    -11,   -8,    0,     6,     5,     6,     10,    8,     8,     16,    24,
+    24,    23,    24,    24,    24,    18,    9,     4,     -3,    -11,   -16,
+    -15,   -18,   -14,   -12,   -9,    -3,    -4,    -1,    8,     11,    10,
+    19,    21,    21,    23,    20,    22,    15,    9,     7,     5,     3,
+    1,     12,    13,    10,    18,    23,    31,    37,    40,    36,    38,
+    40,    40,    38,    27,    24,    21,    14,    12,    12,    7,     7,
+    15,    18,    19,    18,    17,    18,    14,    12,    11,    7,     5,
+    7,     9,     9,     15,    14,    15,    18,    16,    7,     0,     -5,
+    -6,    -6,    -6,    -1,    7,     9,     12,    6,     4,     4,     2,
+    -1,    2,     3,     3,     5,     4,     -1,    -13,   -19,   -29,   -34,
+    -39,   -43,   -49,   -54,   -53,   -55,   -55,   -56,   -59,   -58,   -49,
+    -41,   -32,   -19,   -10,   -2,    -4,    -1,    -6,    -19,   -27,   -26,
+    -27,   -27,   -21,   -22,   -20,   -26,   -26,   -20,   -20,   -20,   -21,
+    -17,   -18,   -7,    -6,    -6,    -5,    -1,    7,     18,    10,    16,
+    25,    24,    31,    30,    32,    30,    26,    24,    22,    23,    21,
+    23,    21,    24,    19,    17,    13,    12,    15,    6,     2,     -5,
+    -9,    -13,   -10,   -5,    1,     10,    13,    17,    13,    8,     5,
+    5,     6,     5,     13,    19,    16,    14,    12,    7,     15,    18,
+    19,    16,    4,     -1,    0,     -1,    -2,    -9,    -15,   -19,   -21,
+    -13,   -13,   -10,   -7,    -7,    -7,    -6,    -11,   -22,   -18,   -19,
+    -22,   -22,   -19,   -18,   -10,   -7,    -9,    -7,    -12,   -16,   -20,
+    -27,   -35,   -37,   -37,   -33,   -24,   -14,   -4,    8,     14,    19,
+    19,    16,    12,    6,     2,     -5,    -6,    -11,   -17,   -16,   -14,
+    -13,   -12,   -17,   -21,   -22,   -24,   -18,   -14,   -12,   -1,    4,
+    9,     17,    14,    9,     13,    14,    13,    14,    14,    12,    11,
+    15,    11,    16,    21,    20,    20,    22,    31,    30,    26,    15,
+    13,    6,     8,     5,     1,     -5,    -3,    2,     9,     14,    13,
+    16,    17,    18,    13,    10,    8,     7,     9,     12,    21,    23,
+    23,    21,    19,    16,    14,    5,     -4,    -12,   -15,   -16,   -12,
+    -9,    -12,   -14,   -17,   -16,   -15,   -14,   -15,   -28,   -27,   -24,
+    -12,   -8,    -3,    3,     9,     15,    18,    25,    25,    31,    32,
+    35,    36,    33,    36,    24,    13,    2,     -11,   -19,   -18,   -18,
+    -10,   -6,    -4,    0,     -3,    -3,    -15,   -18,   -17,   -9,    -7,
+    2,     5,     7,     6,     2,     -2,    -12,   -16,   -16,   -9,    -3,
+    6,     8,     15,    17,    16,    18,    11,    5,     -4,    -8,    -17,
+    -16,   -22,   -24,   -25,   -28,   -23,   -19,   -11,   -3,    5,     11,
+    22,    26,    29,    24,    14,    12,    7,     6,     -2,    -1,    2,
+    10,    23,    33,    36,    32,    31,    16,    3,     -4,    -3,    -3,
+    1,     8,     11,    13,    12,    8,     3,     5,     3,     1,     -1,
+    4,     2,     3,     8,     5,     5,     1,     -2,    -1,    -3,    -1,
+    5,     8,     10,    17,    17,    15,    19,    27,    18,    21,    23,
+    19,    20,    15,    1,     -7,    -18,   -24,   -24,   -33,   -28,   -32,
+    -30,   -30,   -30,   -30,   -29,   -30,   -41,   -43,   -50,   -51,   -49,
+    -42,   -32,   -19,   -10,   0,     4,     -2,    5,     9,     8,     12,
+    19,    17,    10,    9,     3,     1,     -4,    -8,    -4,    0,     5,
+    7,     10,    9,     12,    0,     -6,    -7,    -13,   -16,   -10,   -10,
+    -9,    -1,    -1,    -2,    -6,    -11,   -14,   -17,   -18,   -10,   -3,
+    -3,    0,     6,     1,     6,     4,     3,     3,     9,     16,    22,
+    28,    27,    32,    18,    21,    25,    20,    21,    18,    18,    22,
+    23,    15,    8,     -3,    -9,    -10,   -13,   -8,    3,     7,     18,
+    26,    23,    26,    30,    17,    11,    9,     -1,    0,     2,     2,
+    12,    15,    6,     1,     0,     -5,    2,     1,     -3,    -1,    -6,
+    -2,    -4,    -11,   -18,   -30,   -38,   -36,   -33,   -32,   -27,   -19,
+    -18,   -14,   -13,   -16,   -11,   -12,   -12,   -4,    0,     7,     13,
+    13,    10,    11,    6,     3,     3,     3,     4,     10,    4,     -1,
+    -3,    -11,   -21,   -27,   -34,   -33,   -31,   -33,   -28,   -22,   -21,
+    -14,   -8,    -13,   -10,   -8,    -12,   -7,    -11,   -3,    3,     5,
+    7,     7,     -1,    -12,   -13,   -17,   -21,   -8,    -2,    4,     7,
+    13,    18,    18,    16,    15,    13,    11,    15,    13,    12,    17,
+    18,    15,    15,    11,    -3,    -1,    2,     11,    15,    10,    18,
+    13,    10,    12,    9,     2,     2,     4,     -1,    6,     9,     11,
+    5,     7,     13,    8,     9,     10,    11,    9,     7,     11,    5,
+    3,     1,     -9,    -19,   -31,   -40,   -42,   -33,   -27,   -24,   -22,
+    -20,   -25,   -20,   -12,   -17,   -23,   -23,   -25,   -25,   -20,   -18,
+    -17,   -19,   -15,   -22,   -20,   -19,   -13,   -8,    -12,   0,     2,
+    -6,    -1,    -5,    -15,   -10,   -12,   -19,   -8,    -6,    -3,    9,
+    5,     12,    22,    10,    9,     12,    5,     8,     28,    13,    20,
+    25,    11,    16,    19,    10,    15,    14,    6,     23,    19,    18,
+    32,    17,    12,    19,    -1,    -8,    11,    -4,    -8,    9,     -4,
+    -6,    0,     -10,   -7,    -3,    -8,    -11,   -11,   -23,   -7,    -4,
+    -4,    14,    6,     4,     9,     3,     -4,    4,     2,     9,     26,
+    19,    26,    33,    22,    22,    24,    13,    20,    18,    18,    28,
+    28,    19,    24,    16,    -1,    1,     -12,   -34,   -28,   -25,   -27,
+    -13,   6,     8,     21,    25,    22,    19,    3,     4,     0,     -5,
+    6,     8,     1,     6,     8,     -4,    -3,    -10,   -23,   -17,   -9,
+    -10,   3,     6,     -1,    3,     -10,   -22,   -28,   -49,   -49,   -36,
+    -29,   -10,   8,     -1,    4,     14,    -3,    -14,   -5,    -16,   -10,
+    8,     7,     21,    24,    17,    25,    15,    -4,    13,    -7,    -23,
+    0,     -7,    -14,   12,    1,     -18,   -10,   -27,   -43,   -31,   -34,
+    -19,   -3,    -10,   15,    20,    -7,    10,    9,     -20,   7,     28,
+    14,    42,    54,    32,    34,    24,    5,     10,    -11,   -13,   11,
+    -6,    -4,    31,    7,     0,     34,    3,     -9,    5,     -24,   -33,
+    -14,   -11,   -1,    8,     0,     10,    7,     -7,    11,    10,    -6,
+    17,    16,    0,     10,    3,     -26,   -23,   -33,   -39,   -26,   -29,
+    -18,   -6,    -9,    -1,    5,     -11,   -6,    7,     -6,    1,     13,
+    8,     1,     3,     -13,   -23,   -25,   -33,   -28,   -21,   -9,    2,
+    4,     1,     8,     4,     -13,   -5,    -12,   -14,   3,     14,    18,
+    26,    30,    21,    20,    15,    15,    10,    5,     13,    11,    20,
+    25,    29,    18,    19,    9,     -10,   -15,   -13,   -12,   1,     16,
+    20,    30,    39,    37,    21,    15,    3,     -7,    -9,    -1,    2,
+    -6,    -7,    -10,   -20,   -19,   -19,   -31,   -25,   -12,   -15,   -13,
+    -17,   -18,   -14,   -24,   -24,   -18,   -28,   -24,   -3,    1,     17,
+    46,    48,    43,    46,    34,    12,    6,     -14,   -19,   -10,   -14,
+    3,     15,    3,     7,     7,     -13,   4,     9,     -2,    3,     22,
+    19,    25,    41,    48,    46,    36,    42,    40,    24,    33,    50,
+    29,    30,    57,    35,    13,    29,    17,    -9,    5,     15,    7,
+    13,    38,    47,    40,    56,    72,    42,    29,    40,    18,    14,
+    36,    52,    50,    58,    55,    42,    22,    20,    13,    -8,    8,
+    32,    26,    41,    70,    48,    51,    65,    36,    27,    23,    4,
+    5,     1,     -3,    2,     -8,    -23,   -6,    -30,   -46,   -24,   -40,
+    -45,   -22,   -32,   -35,   -24,   -50,   -41,   -35,   -56,   -38,   -29,
+    -55,   -25,   -7,    -40,   -26,   -25,   -63,   -51,   -40,   -61,   -47,
+    -38,   -38,   -5,    2,     3,     26,    -1,    -7,    8,     -20,   -17,
+    10,    -14,   -6,    41,    24,    27,    52,    26,    13,    25,    5,
+    -6,    2,     -7,    -2,    10,    4,     29,    36,    30,    74,    93,
+    91,    131,   150,   132,   167,   177,   158,   189,   188,   178,   200,
+    199,   187,   212,   202,   188,   210,   188,   173,   187,   175,   183,
+    215,   218,   236,   264,   253,   279,   296,   275,   290,   288,   261,
+    261,   261,   230,   216,   199,   157,   160,   147,   115,   108,   84,
+    50,    32,    7,     -30,   -56,   -96,   -130,  -146,  -179,  -199,  -223,
+    -255,  -280,  -293,  -326,  -341,  -352,  -391,  -410,  -429,  -464,  -489,
+    -507,  -538,  -559,  -577,  -602,  -634,  -656,  -679,  -696,  -702,  -700,
+    -699,  -700,  -687,  -666,  -665,  -656,  -634,  -626,  -609,  -572,  -539,
+    -518,  -484,  -462,  -444,  -418,  -390,  -364,  -336,  -295,  -245,  -210,
+    -175,  -127,  -97,   -63,   -28,   10,    45,    83,    121,   167,   222,
+    272,   324,   369,   396,   439,   485,   502,   536,   571,   585,   618,
+    656,   676,   705,   729,   744,   767,   776,   786,   798,   796,   813,
+    849,   855,   865,   883,   862,   843,   834,   794,   781,   778,   767,
+    746,   744,   721,   702,   681,   638,   607,   562,   521,   490,   447,
+    398,   361,   313,   255,   204,   123,   20,    -59,   -143,  -217,  -270,
+    -328,  -400,  -462,  -529,  -607,  -666,  -737,  -797,  -854,  -906,  -936,
+    -944,  -955,  -965,  -976,  -993,  -1003, -1007, -1032, -1040, -1045, -1055,
+    -1039, -1016, -1003, -990,  -995,  -1026, -1046, -1070, -1079, -1058, -1060,
+    -1062, -1028, -1010, -1006, -991,  -1000, -1004, -987,  -981,  -958,  -921,
+    -890,  -852,  -798,  -754,  -713,  -681,  -682,  -658,  -617,  -585,  -524,
+    -452,  -404,  -332,  -258,  -224,  -183,  -144,  -132,  -94,   -64,   -31,
+    37,    99,    147,   219,   280,   329,   389,   439,   483,   563,   632,
+    702,   799,   884,   965,   1050,  1107,  1150,  1209,  1260,  1308,  1383,
+    1446,  1514,  1582,  1632,  1679,  1727,  1770,  1804,  1837,  1872,  1916,
+    1961,  1999,  2038,  2071,  2089,  2097,  2107,  2091,  2084,  2072,  2051,
+    2021,  1998,  1940,  1868,  1814,  1734,  1641,  1559,  1480,  1395,  1305,
+    1213,  1115,  1015,  901,   785,   667,   520,   381,   256,   110,   -26,
+    -141,  -284,  -417,  -528,  -670,  -805,  -935,  -1080, -1206, -1324, -1438,
+    -1527, -1622, -1725, -1798, -1879, -1956, -2006, -2063, -2128, -2166, -2201,
+    -2238, -2257, -2292, -2316, -2337, -2357, -2356, -2362, -2382, -2375, -2368,
+    -2367, -2358, -2337, -2329, -2318, -2296, -2273, -2240, -2195, -2140, -2095,
+    -2044, -1990, -1932, -1872, -1803, -1737, -1673, -1602, -1520, -1428, -1325,
+    -1219, -1112, -1006, -896,  -780,  -681,  -591,  -481,  -388,  -294,  -189,
+    -85,   30,    148,   252,   348,   466,   579,   692,   811,   918,   1041,
+    1162,  1271,  1389,  1507,  1611,  1735,  1864,  1965,  2085,  2203,  2312,
+    2436,  2536,  2614,  2697,  2760,  2812,  2886,  2956,  3010,  3066,  3088,
+    3098,  3120,  3110,  3101,  3106,  3108,  3130,  3149,  3139,  3122,  3085,
+    3016,  2951,  2874,  2770,  2671,  2559,  2435,  2315,  2198,  2059,  1915,
+    1761,  1570,  1387,  1185,  984,   787,   601,   413,   224,   40,    -158,
+    -348,  -560,  -760,  -960,  -1147, -1312, -1471, -1621, -1779, -1925, -2069,
+    -2206, -2333, -2463, -2570, -2664, -2743, -2811, -2860, -2886, -2934, -2976,
+    -3015, -3057, -3074, -3076, -3079, -3060, -3032, -2998, -2950, -2920, -2893,
+    -2863, -2837, -2806, -2761, -2715, -2662, -2607, -2554, -2486, -2402, -2325,
+    -2264, -2190, -2127, -2063, -1989, -1932, -1862, -1788, -1724, -1640, -1545,
+    -1455, -1346, -1234, -1112, -984,  -859,  -735,  -610,  -494,  -384,  -280,
+    -176,  -68,   40,    140,   244,   363,   478,   596,   739,   876,   1001,
+    1128,  1240,  1352,  1474,  1595,  1717,  1853,  1972,  2093,  2215,  2328,
+    2432,  2533,  2641,  2744,  2855,  2949,  3055,  3157,  3242,  3329,  3415,
+    3479,  3528,  3569,  3588,  3617,  3649,  3676,  3708,  3747,  3751,  3753,
+    3744,  3693,  3640,  3576,  3470,  3369,  3248,  3098,  2976,  2838,  2690,
+    2557,  2395,  2222,  2055,  1872,  1675,  1488,  1279,  1057,  851,   623,
+    393,   180,   -74,   -315,  -537,  -771,  -979,  -1161, -1373, -1558, -1729,
+    -1932, -2110, -2294, -2478, -2636, -2785, -2917, -3007, -3094, -3183, -3247,
+    -3319, -3402, -3450, -3510, -3564, -3595, -3622, -3635, -3627, -3635, -3639,
+    -3620, -3620, -3610, -3596, -3581, -3535, -3495, -3455, -3410, -3361, -3323,
+    -3265, -3202, -3141, -3078, -3001, -2919, -2830, -2739, -2640, -2540, -2430,
+    -2320, -2192, -2057, -1909, -1761, -1603, -1422, -1244, -1059, -887,  -726,
+    -570,  -425,  -256,  -92,   69,    238,   411,   557,   728,   910,   1066,
+    1229,  1403,  1561,  1727,  1895,  2050,  2208,  2352,  2492,  2638,  2765,
+    2893,  3025,  3145,  3263,  3387,  3496,  3595,  3707,  3804,  3884,  3975,
+    4046,  4105,  4167,  4204,  4220,  4237,  4243,  4247,  4260,  4255,  4251,
+    4246,  4201,  4143,  4092,  3996,  3885,  3772,  3604,  3435,  3283,  3086,
+    2923,  2742,  2535,  2341,  2130,  1887,  1649,  1411,  1137,  915,   659,
+    398,   163,   -81,   -351,  -580,  -814,  -1069, -1262, -1476, -1689, -1850,
+    -2043, -2237, -2395, -2591, -2763, -2918, -3095, -3224, -3319, -3435, -3508,
+    -3582, -3698, -3772, -3858, -3950, -4008, -4047, -4088, -4093, -4085, -4098,
+    -4064, -4052, -4057, -4033, -4028, -4018, -3991, -3971, -3933, -3865, -3802,
+    -3727, -3633, -3562, -3477, -3392, -3300, -3210, -3115, -3018, -2924, -2819,
+    -2721, -2606, -2490, -2381, -2246, -2111, -1963, -1810, -1638, -1460, -1293,
+    -1132, -980,  -828,  -666,  -496,  -322,  -125,  72,    264,   470,   676,
+    879,   1087,  1280,  1457,  1633,  1799,  1970,  2152,  2327,  2501,  2678,
+    2840,  3007,  3165,  3301,  3434,  3558,  3667,  3791,  3912,  4023,  4140,
+    4257,  4359,  4475,  4554,  4614,  4656,  4682,  4697,  4726,  4749,  4775,
+    4810,  4812,  4812,  4810,  4768,  4697,  4620,  4502,  4368,  4210,  4031,
+    3860,  3663,  3472,  3291,  3076,  2849,  2642,  2392,  2140,  1890,  1610,
+    1325,  1064,  782,   494,   231,   -50,   -329,  -593,  -861,  -1112, -1345,
+    -1588, -1812, -2022, -2257, -2467, -2682, -2924, -3126, -3317, -3495, -3630,
+    -3737, -3855, -3941, -4031, -4128, -4200, -4281, -4348, -4388, -4427, -4449,
+    -4444, -4450, -4458, -4452, -4464, -4460, -4451, -4444, -4425, -4384, -4344,
+    -4289, -4234, -4160, -4076, -4000, -3917, -3837, -3753, -3669, -3558, -3460,
+    -3354, -3230, -3111, -2966, -2824, -2665, -2495, -2333, -2151, -1951, -1752,
+    -1554, -1367, -1222, -1053, -882,  -716,  -520,  -331,  -141,  62,    270,
+    476,   707,   923,   1133,  1349,  1534,  1735,  1943,  2124,  2317,  2511,
+    2668,  2839,  3002,  3140,  3317,  3481,  3615,  3771,  3920,  4050,  4196,
+    4319,  4430,  4556,  4657,  4765,  4868,  4945,  4999,  5057,  5075,  5100,
+    5123,  5133,  5134,  5127,  5104,  5084,  5058,  4968,  4896,  4750,  4575,
+    4381,  4179,  3971,  3776,  3590,  3394,  3209,  2991,  2800,  2535,  2269,
+    1972,  1654,  1319,  998,   697,   384,   105,   -187,  -476,  -759,  -1047,
+    -1316, -1579, -1841, -2085, -2317, -2550, -2745, -2938, -3145, -3326, -3523,
+    -3706, -3859, -3998, -4124, -4218, -4288, -4346, -4386, -4437, -4495, -4550,
+    -4619, -4680, -4732, -4779, -4813, -4820, -4842, -4825, -4791, -4773, -4742,
+    -4715, -4709, -4683, -4652, -4605, -4527, -4428, -4315, -4194, -4086, -3978,
+    -3872, -3779, -3685, -3569, -3458, -3313, -3121, -2921, -2693, -2454, -2230,
+    -1998, -1783, -1588, -1414, -1240, -1069, -886,  -690,  -473,  -256,  -36,
+    170,   384,   594,   797,   1015,  1235,  1449,  1664,  1882,  2098,  2311,
+    2504,  2681,  2843,  3019,  3171,  3337,  3534,  3709,  3885,  4072,  4235,
+    4380,  4524,  4641,  4746,  4864,  4979,  5087,  5213,  5308,  5393,  5450,
+    5468,  5475,  5472,  5452,  5462,  5467,  5453,  5451,  5425,  5342,  5255,
+    5113,  4914,  4725,  4512,  4273,  4053,  3866,  3632,  3436,  3205,  2955,
+    2705,  2420,  2095,  1794,  1503,  1195,  941,   639,   342,   56,    -269,
+    -601,  -894,  -1208, -1499, -1736, -1994, -2239, -2426, -2652, -2891, -3099,
+    -3361, -3588, -3793, -4013, -4183, -4302, -4439, -4523, -4613, -4734, -4809,
+    -4891, -4999, -5056, -5090, -5131, -5092, -5061, -5044, -4987, -4954, -4955,
+    -4924, -4911, -4873, -4809, -4755, -4673, -4555, -4440, -4316, -4187, -4088,
+    -3986, -3881, -3802, -3717, -3605, -3495, -3359, -3207, -3063, -2889, -2698,
+    -2504, -2306, -2088, -1861, -1627, -1415, -1201, -1000, -799,  -593,  -410,
+    -220,  -7,    203,   412,   634,   865,   1126,  1367,  1602,  1838,  2052,
+    2257,  2474,  2659,  2863,  3076,  3255,  3429,  3617,  3773,  3939,  4102,
+    4222,  4358,  4501,  4611,  4733,  4846,  4939,  5056,  5147,  5217,  5301,
+    5357,  5388,  5428,  5417,  5400,  5430,  5422,  5406,  5442,  5446,  5431,
+    5437,  5381,  5304,  5212,  5057,  4874,  4683,  4465,  4249,  4026,  3767,
+    3545,  3304,  3021,  2741,  2450,  2113,  1807,  1490,  1151,  841,   544,
+    212,   -102,  -439,  -788,  -1091, -1413, -1730, -2033, -2336, -2627, -2854,
+    -3118, -3350, -3560, -3781, -4008, -4194, -4376, -4524, -4640, -4757, -4865,
+    -4945, -5016, -5083, -5131, -5170, -5184, -5198, -5208, -5211, -5210, -5209,
+    -5192, -5174, -5154, -5108, -5052, -5002, -4932, -4854, -4780, -4704, -4604,
+    -4514, -4421, -4309, -4208, -4111, -4004, -3880, -3751, -3622, -3496, -3367,
+    -3210, -3047, -2867, -2654, -2430, -2177, -1897, -1651, -1417, -1182, -983,
+    -793,  -593,  -406,  -211,  17,    232,   461,   716,   958,   1197,  1441,
+    1674,  1899,  2130,  2355,  2573,  2788,  3004,  3220,  3419,  3612,  3809,
+    3973,  4120,  4277,  4433,  4573,  4742,  4902,  5037,  5165,  5282,  5377,
+    5460,  5539,  5596,  5654,  5716,  5741,  5759,  5770,  5776,  5762,  5751,
+    5737,  5706,  5675,  5644,  5550,  5446,  5324,  5169,  4974,  4767,  4530,
+    4289,  4067,  3823,  3621,  3391,  3145,  2878,  2575,  2228,  1890,  1525,
+    1149,  807,   473,   145,   -152,  -454,  -769,  -1057, -1374, -1703, -2033,
+    -2372, -2701, -2977, -3258, -3495, -3694, -3897, -4089, -4270, -4483, -4668,
+    -4840, -5015, -5140, -5225, -5304, -5334, -5350, -5390, -5398, -5403, -5428,
+    -5438, -5449, -5472, -5463, -5441, -5401, -5333, -5252, -5151, -5051, -4974,
+    -4880, -4805, -4729, -4626, -4526, -4403, -4248, -4088, -3939, -3778, -3617,
+    -3464, -3308, -3173, -3027, -2852, -2669, -2461, -2233, -1979, -1713, -1455,
+    -1216, -996,  -796,  -610,  -397,  -198,  21,    272,   517,   775,   1037,
+    1295,  1544,  1790,  2007,  2211,  2423,  2634,  2848,  3081,  3319,  3551,
+    3792,  4000,  4171,  4303,  4418,  4518,  4596,  4679,  4807,  4913,  5044,
+    5172,  5288,  5405,  5518,  5609,  5664,  5713,  5735,  5735,  5737,  5701,
+    5691,  5656,  5633,  5611,  5552,  5475,  5394,  5293,  5177,  5064,  4924,
+    4737,  4599,  4420,  4237,  4048,  3828,  3623,  3413,  3183,  2915,  2622,
+    2308,  1980,  1657,  1261,  901,   549,   205,   -85,   -383,  -688,  -969,
+    -1246, -1530, -1850, -2206, -2561, -2915, -3224, -3482, -3713, -3921, -4107,
+    -4287, -4470, -4660, -4850, -5057, -5239, -5395, -5540, -5619, -5697, -5724,
+    -5697, -5675, -5633, -5590, -5579, -5530, -5486, -5442, -5426, -5391, -5348,
+    -5276, -5197, -5124, -5039, -4925, -4808, -4677, -4581, -4479, -4343, -4218,
+    -4087, -3970, -3858, -3729, -3570, -3384, -3206, -3020, -2839, -2636, -2453,
+    -2287, -2185, -2154, -1926, -1562, -1223, -758,  -473,  -64,   395,   599,
+    880,   814,   938,   1172,  1498,  1928,  2127,  2422,  2608,  2841,  2937,
+    2886,  2815,  2985,  3324,  3757,  4152,  4481,  4652,  4917,  4965,  4766,
+    4583,  4328,  4503,  4815,  5118,  5408,  5682,  5956,  6082,  6055,  5744,
+    5426,  5341,  5427,  5606,  5882,  6065,  6226,  6428,  6477,  6385,  6009,
+    5728,  5552,  5439,  5339,  5200,  5008,  4947,  4835,  4614,  4330,  3887,
+    3521,  3111,  2460,  1983,  1297,  650,   279,   -353,  -720,  -1044, -1518,
+    -1668, -2117, -2496, -2743, -3266, -3607, -3790, -4149, -4075, -4042, -4096,
+    -3981, -4138, -4226, -4214, -4503, -4455, -4577, -4642, -4346, -4351, -4270,
+    -4263, -4522, -4521, -4673, -4814, -4731, -4950, -5011, -5004, -5288, -5341,
+    -5566, -5833, -5783, -5929, -5847, -5765, -5828, -5644, -5613, -5615, -5428,
+    -5291, -5014, -4554, -4277, -3964, -3854, -3829, -3612, -3603, -3438, -3137,
+    -2831, -2164, -1438, -939,  -330,  -156,  46,    242,   73,    242,   220,
+    239,   542,   565,   739,   872,   801,   857,   676,   543,   586,   567,
+    828,   1142,  1490,  1985,  2508,  2982,  3438,  3699,  3939,  4069,  4178,
+    4420,  4622,  4917,  5338,  5801,  6285,  6658,  6963,  7213,  7233,  7328,
+    7176,  7038,  7031,  6860,  6957,  6767,  6599,  6523,  6212,  6147,  6063,
+    5860,  6020,  6015,  6033,  6184,  5722,  5607,  5016,  4337,  4063,  3229,
+    3080,  3006,  2804,  3035,  2541,  2136,  1879,  1012,  401,   -575,  -1584,
+    -1930, -2278, -2485, -2477, -2712, -2747, -2766, -3320, -3592, -4188, -4669,
+    -4672, -4939, -4789, -4426, -4203, -3674, -3563, -3656, -3759, -4067, -4257,
+    -4522, -4970, -5204, -5237, -5139, -4907, -4911, -4917, -4921, -5007, -5230,
+    -5654, -6122, -6464, -6733, -6948, -7067, -6972, -6800, -6520, -6132, -5830,
+    -5382, -5091, -4797, -4546, -4472, -4362, -4350, -4235, -3851, -3454, -3144,
+    -2735, -2341, -1845, -1262, -958,  -549,  -166,  66,    382,   366,   352,
+    341,   85,    -13,   -176,  -303,  -235,  -341,  -309,  -227,  -249,  -50,
+    143,   384,   874,   1149,  1552,  2155,  2767,  3499,  3994,  4460,  4920,
+    5288,  5569,  5704,  5881,  6094,  6461,  6653,  6803,  7115,  7311,  7521,
+    7612,  7443,  7380,  7124,  6742,  6495,  5964,  5656,  5415,  5167,  5656,
+    5813,  6027,  6401,  6351,  6787,  7019,  6581,  6512,  5965,  5308,  5140,
+    4336,  4147,  3899,  3398,  3360,  2830,  2624,  1968,  1026,  395,   -699,
+    -1424, -2327, -3006, -3192, -3435, -3337, -3686, -3513, -3350, -3502, -3261,
+    -3878, -4005, -4063, -4187, -3767, -3598, -3384, -3300, -3094, -2857, -3023,
+    -3274, -3851, -4352, -4523, -4943, -5477, -5612, -5682, -5733, -5714, -5965,
+    -6110, -5950, -6158, -6548, -6897, -7165, -7281, -7352, -7258, -7185, -6659,
+    -5946, -5470, -4738, -4046, -3707, -3210, -3108, -3270, -3227, -3222, -3218,
+    -3017, -2943, -2668, -2296, -1593, -1061, -811,  -403,  -513,  -361,  -128,
+    -595,  -633,  -991,  -1205, -1159, -1284, -1330, -1164, -999,  -729,  -538,
+    -336,  27,    350,   794,   1245,  1646,  2446,  3210,  4017,  4835,  5271,
+    5739,  6028,  6140,  6212,  6161,  6066,  5984,  6081,  5995,  6152,  6301,
+    6278,  6424,  6377,  6396,  6362,  6152,  5788,  5309,  5071,  4860,  4704,
+    4804,  4919,  5258,  5869,  6121,  6365,  6694,  6692,  6694,  6532,  6187,
+    5808,  5704,  5302,  4816,  4611,  4043,  3775,  3249,  2600,  1933,  982,
+    336,   -848,  -1538, -2242, -3103, -3374, -3756, -3975, -4017, -4061, -3972,
+    -3749, -3609, -3853, -3850, -3714, -3760, -3736, -3914, -3923, -3830, -3541,
+    -3649, -3757, -3661, -3913, -4038, -4231, -4594, -4769, -5009, -5273, -5588,
+    -5676, -5937, -5997, -6060, -6164, -6414, -6623, -6765, -6857, -6771, -6921,
+    -6914, -6535, -6187, -5626, -5206, -4742, -4189, -3618, -3120, -2823, -2606,
+    -2550, -2703, -2736, -2626, -2498, -2406, -2133, -1852, -1348, -753,  -318,
+    162,   330,   524,   375,   9,     -204,  -866,  -1249, -1532, -1669, -1455,
+    -1235, -723,  -283,  262,   535,   862,   1340,  1712,  2316,  2625,  3171,
+    4015,  4698,  5516,  6006,  6452,  6838,  6921,  7003,  6735,  6339,  6138,
+    5768,  5575,  5593,  5568,  5728,  6041,  6233,  6260,  6175,  6048,  5728,
+    5366,  4931,  4340,  4194,  4174,  4330,  4743,  5028,  5754,  6250,  6598,
+    7120,  7114,  6962,  6675,  6157,  5373,  4797,  4081,  3237,  3153,  2588,
+    2143,  1639,  1021,  681,   -149,  -816,  -1987, -3003, -3493, -4138, -4420,
+    -4607, -4841, -4725, -4254, -4033, -3845, -3842, -4063, -4035, -4099, -4582,
+    -4718, -4779, -4689, -4437, -4327, -4352, -4119, -3881, -4061, -4345, -4768,
+    -5248, -5610, -5920, -6383, -6779, -6731, -6673, -6677, -6597, -6659, -6619,
+    -6417, -6516, -6862, -7017, -7069, -6944, -6715, -6376, -6000, -5162, -4333,
+    -3577, -2884, -2355, -1807, -1366, -1380, -1590, -1869, -1962, -1945, -2006,
+    -2141, -1960, -1516, -1025, -471,  -135,  85,    348,   239,   -8,    -475,
+    -951,  -1245, -1520, -1569, -1448, -1188, -517,  134,   827,   1585,  2114,
+    2792,  3214,  3651,  4230,  4546,  4894,  5321,  5588,  6105,  6583,  6877,
+    7014,  7087,  7068,  6876,  6695,  6280,  5684,  5385,  5205,  5064,  5033,
+    5028,  5080,  5322,  5510,  5461,  5390,  5541,  5494,  5443,  5306,  5065,
+    5193,  5338,  5513,  5818,  5911,  6345,  6506,  6514,  6543,  5981,  5703,
+    5082,  4228,  3517,  2424,  1880,  1245,  562,   -130,  -864,  -1156, -1561,
+    -1970, -2597, -3357, -3707, -4189, -4521, -4975, -5477, -5478, -5585, -5445,
+    -5353, -5327, -4971, -4580, -4431, -4469, -4432, -4422, -4275, -4227, -4507,
+    -4745, -4758, -4752, -4845, -4933, -5118, -5117, -5124, -5324, -5673, -5971,
+    -6152, -6366, -6702, -6970, -7159, -7136, -6929, -6917, -6703, -6520, -6302,
+    -5794, -5484, -5123, -4694, -4254, -3722, -3334, -2917, -2410, -1721, -1010,
+    -584,  -312,  27,    321,   327,   214,   -17,   -363,  -402,  -550,  -638,
+    -469,  -315,  -86,   142,   242,   387,   448,   458,   423,   321,   194,
+    285,   417,   717,   1176,  1673,  2402,  3144,  3985,  4764,  5406,  6056,
+    6507,  6783,  6891,  6868,  6850,  6717,  6532,  6359,  6248,  6303,  6279,
+    6140,  6071,  5927,  5687,  5480,  5146,  4835,  4572,  4447,  4481,  4578,
+    4840,  4936,  5246,  5659,  5732,  5856,  5658,  5403,  5282,  5004,  4949,
+    4843,  4681,  4884,  4886,  4967,  5108,  4781,  4647,  4240,  3443,  2768,
+    1830,  983,   309,   -769,  -1382, -1987, -2553, -2750, -3346, -3555, -4052,
+    -4400, -4599, -5196, -5437, -5945, -6340, -6343, -6554, -6611, -6381, -6184,
+    -5681, -5398, -5098, -4751, -4529, -4138, -4100, -4088, -4044, -4186, -4189,
+    -4263, -4453, -4465, -4598, -4651, -4726, -4919, -4926, -5142, -5286, -5490,
+    -5831, -6002, -6341, -6492, -6562, -6710, -6553, -6506, -6219, -5766, -5521,
+    -5008, -4556, -4002, -3293, -2769, -2069, -1467, -824,  -34,   509,   1034,
+    1385,  1560,  1650,  1664,  1419,  1016,  834,   511,   353,   381,   299,
+    523,   833,   956,   1280,  1492,  1425,  1547,  1350,  1143,  1114,  931,
+    1054,  1217,  1583,  2217,  2917,  4017,  4965,  5827,  6816,  7393,  7875,
+    8197,  8175,  7924,  7578,  7040,  6566,  6242,  5746,  5530,  5334,  5222,
+    5237,  5074,  5146,  5011,  4902,  4753,  4442,  4482,  4254,  4247,  4319,
+    4187,  4516,  4690,  4935,  5193,  5229,  5350,  5332,  5486,  5386,  5143,
+    4999,  4494,  4304,  3961,  3421,  2781,  2032,  1404,  614,   -88,   -956,
+    -1714, -2155, -2684, -3038, -3237, -3368, -3423, -3569, -3809, -4213, -4533,
+    -4973, -5514, -6011, -6663, -7084, -7258, -7158, -6947, -6639, -6111, -5548,
+    -4887, -4362, -4043, -3895, -3940, -4107, -4452, -4836, -5143, -5500, -5532,
+    -5510, -5485, -5096, -4739, -4375, -4065, -4063, -4094, -4252, -4576, -4904,
+    -5431, -5837, -6190, -6402, -6310, -6292, -5992, -5516, -5025, -4342, -3899,
+    -3386, -2697, -2077, -1493, -994,  -392,  232,   931,   1608,  1988,  2360,
+    2589,  2639,  2623,  2471,  2121,  1708,  1478,  1181,  1167,  1296,  1279,
+    1648,  1859,  2107,  2368,  2359,  2390,  2122,  1904,  1629,  1418,  1502,
+    1524,  1859,  2357,  3041,  3909,  4810,  5751,  6449,  7128,  7534,  7767,
+    7908,  7699,  7460,  7032,  6647,  6301,  5876,  5556,  5190,  4948,  4762,
+    4576,  4464,  4370,  4338,  4275,  4287,  4265,  4320,  4221,  4066,  3947,
+    3514,  3379,  3003,  2635,  2534,  2078,  2040,  1950,  1958,  2152,  2085,
+    2390,  2321,  2319,  2359,  1851,  1643,  877,   168,   -527,  -1245, -1704,
+    -2519, -2739, -3251, -3382, -3236, -3527, -3294, -3523, -3732, -3916, -4434,
+    -4888, -5615, -6161, -6729, -7283, -7543, -7920, -7865, -7660, -7430, -7034,
+    -6758, -6224, -5866, -5441, -5076, -4998, -4760, -4673, -4539, -4410, -4308,
+    -4131, -3992, -3791, -3611, -3448, -3213, -3070, -3046, -3048, -3168, -3244,
+    -3354, -3607, -3834, -4170, -4439, -4648, -4864, -4892, -4928, -4821, -4524,
+    -4211, -3576, -2819, -1968, -929,  -19,   1029,  2064,  2949,  3716,  4159,
+    4450,  4536,  4503,  4301,  3968,  3655,  3242,  2979,  2856,  2744,  2750,
+    2771,  2749,  2859,  2850,  2793,  2702,  2402,  2179,  1877,  1672,  1581,
+    1543,  1769,  1967,  2485,  3089,  3783,  4662,  5406,  6246,  6950,  7542,
+    8016,  8200,  8245,  8027,  7584,  6958,  6241,  5494,  4710,  3974,  3255,
+    2653,  2274,  2038,  1986,  1964,  2141,  2321,  2513,  2772,  2756,  2743,
+    2636,  2406,  2125,  1836,  1456,  1247,  1145,  995,   1077,  1140,  1290,
+    1561,  1685,  1762,  1609,  1391,  1147,  544,   84,    -754,  -1546, -2107,
+    -2806, -3137, -3522, -3732, -3826, -3834, -3609, -3493, -3340, -3254, -3499,
+    -3621, -3981, -4455, -4859, -5513, -6080, -6626, -7061, -7372, -7556, -7573,
+    -7515, -7366, -7091, -6799, -6366, -5887, -5484, -5098, -4746, -4334, -3941,
+    -3558, -3269, -3053, -2844, -2663, -2497, -2314, -2227, -2185, -2141, -2139,
+    -2070, -2037, -2031, -2062, -2205, -2348, -2544, -2774, -2979, -3298, -3520,
+    -3647, -3622, -3395, -3054, -2513, -1829, -948,  64,    1090,  2169,  3127,
+    3987,  4712,  5229,  5560,  5754,  5741,  5619,  5401,  5005,  4666,  4287,
+    3967,  3734,  3476,  3322,  3203,  3147,  3144,  3116,  3080,  3011,  2871,
+    2735,  2544,  2363,  2245,  2075,  2032,  2118,  2263,  2688,  3066,  3605,
+    4244,  4746,  5384,  5819,  6151,  6319,  6194,  5938,  5495,  4929,  4305,
+    3581,  2924,  2279,  1713,  1372,  1086,  1006,  983,   1006,  1146,  1249,
+    1349,  1360,  1231,  1084,  794,   502,   264,   -85,   -238,  -411,  -504,
+    -394,  -322,  -51,   188,   420,   589,   624,   666,   573,   338,   -86,
+    -564,  -1056, -1560, -1925, -2434, -2806, -3017, -3341, -3320, -3375, -3480,
+    -3410, -3567, -3553, -3595, -3805, -3919, -4284, -4482, -4754, -5190, -5354,
+    -5806, -6050, -6136, -6387, -6343, -6330, -6206, -5851, -5468, -4960, -4549,
+    -4080, -3542, -3150, -2698, -2440, -2318, -2132, -2067, -2081, -2017, -2099,
+    -2151, -2060, -2067, -1916, -1823, -1718, -1523, -1386, -1221, -1189, -1141,
+    -1014, -1008, -966,  -996,  -1015, -916,  -809,  -648,  -467,  -128,  237,
+    735,   1358,  1969,  2697,  3399,  4060,  4732,  5295,  5720,  6077,  6169,
+    6139,  5928,  5614,  5292,  4766,  4247,  3705,  3262,  3030,  2827,  2702,
+    2684,  2728,  2887,  3092,  3216,  3310,  3313,  3214,  3098,  2873,  2620,
+    2343,  2031,  1799,  1589,  1491,  1537,  1645,  1913,  2210,  2548,  2922,
+    3295,  3650,  3951,  4100,  4099,  3972,  3740,  3421,  2948,  2427,  1762,
+    1136,  574,   44,    -330,  -642,  -846,  -852,  -751,  -520,  -229,  44,
+    272,   446,   502,   443,   329,   66,    -191,  -492,  -841,  -1002, -1240,
+    -1237, -1199, -1177, -936,  -867,  -660,  -456,  -508,  -464,  -706,  -997,
+    -1265, -1780, -2178, -2724, -3270, -3735, -4142, -4378, -4609, -4666, -4749,
+    -4575, -4355, -4137, -3767, -3563, -3218, -2970, -2834, -2630, -2716, -2776,
+    -2920, -3210, -3363, -3764, -4023, -4125, -4268, -4194, -4223, -4005, -3639,
+    -3258, -2891, -2644, -2297, -1987, -1751, -1587, -1570, -1485, -1415, -1342,
+    -1194, -1100, -889,  -613,  -267,  161,   482,   865,   1269,  1639,  2005,
+    2202,  2381,  2549,  2628,  2700,  2625,  2559,  2481,  2357,  2319,  2192,
+    2142,  2199,  2283,  2514,  2670,  2919,  3214,  3510,  3830,  3971,  4080,
+    4073,  3911,  3700,  3359,  2954,  2549,  2094,  1766,  1556,  1442,  1462,
+    1560,  1808,  2070,  2357,  2606,  2730,  2831,  2737,  2582,  2309,  1931,
+    1585,  1178,  834,   529,   288,   214,   218,   302,   470,   679,   944,
+    1211,  1420,  1562,  1674,  1631,  1548,  1355,  1072,  776,   375,   25,
+    -320,  -614,  -818,  -992,  -991,  -906,  -755,  -525,  -291,  -17,   225,
+    447,   528,   546,   466,   270,   96,    -205,  -536,  -861,  -1148, -1383,
+    -1586, -1688, -1814, -1783, -1772, -1745, -1630, -1611, -1505, -1488, -1462,
+    -1409, -1519, -1489, -1609, -1723, -1755, -1977, -2042, -2132, -2215, -2184,
+    -2268, -2205, -2170, -2107, -1978, -1990, -1909, -1886, -1943, -1997, -2152,
+    -2326, -2500, -2762, -2987, -3227, -3392, -3522, -3630, -3579, -3469, -3262,
+    -2916, -2555, -2103, -1581, -1090, -531,  -20,   457,   873,   1228,  1561,
+    1809,  1999,  2105,  2139,  2196,  2201,  2149,  2113,  2038,  1990,  1913,
+    1787,  1705,  1595,  1490,  1372,  1201,  1113,  998,   917,   917,   894,
+    961,   1007,  1098,  1321,  1470,  1681,  1882,  2067,  2317,  2465,  2626,
+    2750,  2777,  2783,  2694,  2569,  2431,  2142,  1843,  1597,  1306,  1069,
+    824,   622,   532,   430,   388,   357,   377,   438,   414,   481,   468,
+    431,   454,   383,   374,   305,   207,   187,   133,   157,   115,   113,
+    206,   244,   382,   475,   591,   753,   821,   916,   908,   855,   754,
+    577,   399,   123,   -159,  -399,  -647,  -784,  -923,  -1010, -965,  -918,
+    -806,  -647,  -504,  -355,  -253,  -179,  -130,  -138,  -156,  -262,  -339,
+    -401,  -552,  -600,  -671,  -697,  -662,  -673,  -616,  -597,  -522,  -495,
+    -513,  -490,  -624,  -701,  -804,  -961,  -1073, -1328, -1503, -1656, -1798,
+    -1801, -1913, -1863, -1785, -1720, -1453, -1309, -1051, -846,  -715,  -487,
+    -457,  -357,  -331,  -400,  -427,  -627,  -765,  -873,  -1021, -1105, -1255,
+    -1312, -1357, -1370, -1288, -1261, -1165, -1139, -1062, -917,  -808,  -680,
+    -597,  -452,  -277,  -104,  122,   312,   558,   771,   919,   1110,  1205,
+    1312,  1355,  1302,  1280,  1151,  1049,  946,   818,   733,   569,   451,
+    429,   388,   408,   387,   376,   426,   463,   542,   576,   632,   666,
+    673,   740,   766,   791,   845,   829,   857,   841,   822,   835,   796,
+    773,   671,   600,   560,   484,   460,   371,   311,   284,   242,   277,
+    261,   261,   277,   273,   358,   380,   410,   433,   435,   471,   432,
+    414,   386,   330,   294,   194,   149,   108,   69,    84,    69,    92,
+    83,    75,    88,    53,    12,    -96,   -194,  -269,  -369,  -438,  -523,
+    -553,  -528,  -500,  -392,  -277,  -136,  53,    240,   466,   678,   870,
+    1050,  1178,  1294,  1336,  1310,  1247,  1080,  916,   677,   387,   120,
+    -182,  -471,  -740,  -972,  -1148, -1273, -1343, -1402, -1363, -1263, -1129,
+    -922,  -724,  -518,  -288,  -79,   111,   250,   364,   405,   405,   395,
+    284,   199,   83,    -43,   -126,  -244,  -313,  -400,  -451,  -497,  -610,
+    -672,  -807,  -951,  -1087, -1325, -1517, -1736, -1929, -2086, -2260, -2318,
+    -2356, -2271, -2125, -1967, -1685, -1379, -1000, -598,  -238,  149,   481,
+    790,   1042,  1185,  1287,  1274,  1195,  1068,  868,   654,   386,   138,
+    -65,   -273,  -450,  -598,  -665,  -670,  -669,  -620,  -553,  -425,  -288,
+    -179,  -72,   15,    122,   205,   263,   324,   357,   435,   518,   603,
+    709,   779,   892,   1006,  1107,  1170,  1183,  1190,  1173,  1116,  1016,
+    890,   750,   628,   488,   331,   197,   95,    43,    25,    1,     22,
+    97,    209,   363,   495,   615,   724,   833,   937,   984,   990,   933,
+    884,   851,   747,   678,   573,   497,   469,   401,   391,   352,   339,
+    352,   337,   354,   361,   370,   402,   411,   418,   440,   468,   526,
+    576,   619,   683,   766,   857,   965,   1038,  1114,  1159,  1172,  1167,
+    1106,  1006,  840,   644,   426,   177,   -110,  -390,  -665,  -929,  -1160,
+    -1375, -1497, -1550, -1592, -1553, -1507, -1394, -1201, -1084, -863,  -685,
+    -540,  -322,  -234,  -68,   29,    59,    160,   141,   170,   140,   79,
+    77,    -11,   -53,   -179,  -274,  -327,  -480,  -564,  -736,  -884,  -995,
+    -1185, -1300, -1461, -1617, -1711, -1832, -1831, -1863, -1865, -1776, -1691,
+    -1516, -1353, -1168, -954,  -729,  -490,  -305,  -93,   81,    211,   322,
+    364,   392,   384,   332,   264,   146,   29,    -101,  -230,  -357,  -486,
+    -616,  -705,  -752,  -801,  -809,  -788,  -750,  -654,  -546,  -456,  -328,
+    -200,  -78,   45,    137,   232,   316,   388,   447,   485,   528,   578,
+    630,   697,   760,   835,   910,   988,   1068,  1124,  1154,  1157,  1166,
+    1163,  1116,  1070,  1024,  994,   986,   988,   1030,  1110,  1212,  1303,
+    1411,  1498,  1551,  1599,  1587,  1565,  1481,  1336,  1212,  1028,  847,
+    669,   466,   330,   187,   61,    -9,    -54,   -55,   -20,   11,    69,
+    133,   195,   244,   253,   225,   182,   133,   62,    -11,   -96,   -168,
+    -199,  -214,  -213,  -197,  -167,  -127,  -105,  -86,   -83,   -109,  -140,
+    -217,  -323,  -448,  -588,  -717,  -854,  -971,  -1086, -1185, -1211, -1227,
+    -1180, -1135, -1099, -992,  -918,  -788,  -704,  -651,  -562,  -542,  -470,
+    -421,  -431,  -391,  -429,  -386,  -344,  -336,  -260,  -257,  -162,  -61,
+    -6,    100,   120,   178,   215,   179,   132,   15,    -106,  -238,  -416,
+    -595,  -765,  -929,  -1066, -1170, -1252, -1278, -1290, -1258, -1173, -1114,
+    -1012, -945,  -868,  -741,  -695,  -612,  -547,  -494,  -388,  -332,  -225,
+    -110,  22,    182,   318,   496,   677,   835,   992,   1104,  1162,  1166,
+    1133,  1054,  916,   709,   430,   164,   -90,   -340,  -600,  -853,  -1033,
+    -1135, -1177, -1146, -1079, -946,  -746,  -500,  -208,  83,    377,   673,
+    950,   1183,  1356,  1503,  1627,  1707,  1735,  1708,  1678,  1668,  1645,
+    1588,  1494,  1419,  1354,  1291,  1194,  1052,  900,   718,   524,   325,
+    110,   -114,  -330,  -500,  -630,  -729,  -803,  -834,  -795,  -727,  -627,
+    -492,  -325,  -125,  54,    238,   393,   528,   642,   691,   706,   661,
+    585,   504,   380,   245,   87,    -61,   -195,  -320,  -435,  -556,  -663,
+    -742,  -814,  -883,  -952,  -1009, -1038, -1047, -1067, -1063, -1050, -1020,
+    -949,  -888,  -795,  -698,  -574,  -405,  -257,  -70,   68,    203,   381,
+    479,   580,   619,   623,   645,   565,   492,   364,   206,   106,   -71,
+    -191,  -331,  -460,  -469,  -527,  -471,  -441,  -386,  -222,  -123,  60,
+    168,   245,   404,   470,   596,   605,   581,   633,   548,   562,   468,
+    355,   334,   192,   161,   62,    -36,   -39,   -146,  -121,  -167,  -243,
+    -229,  -302,  -276,  -327,  -415,  -419,  -444,  -396,  -433,  -455,  -407,
+    -357,  -244,  -221,  -158,  -63,   36,    172,   210,   296,   326,   351,
+    424,   367,   369,   300,   224,   235,   124,   54,    -39,   -122,  -118,
+    -239,  -304,  -360,  -403,  -361,  -418,  -427,  -394,  -342,  -259,  -232,
+    -176,  -110,  -48,   27,    48,    78,    90,    86,    91,    76,    57,
+    -1,    -34,   -53,   -103,  -151,  -209,  -239,  -261,  -319,  -354,  -372,
+    -382,  -385,  -411,  -432,  -428,  -431,  -446,  -471,  -496,  -512,  -532,
+    -562,  -570,  -567,  -543,  -499,  -457,  -379,  -290,  -204,  -94,   -11,
+    78,    155,   196,   234,   222,   198,   160,   113,   64,    5,     -57,
+    -108,  -136,  -175,  -186,  -196,  -184,  -125,  -90,   -25,   58,    146,
+    271,   372,   472,   562,   636,   709,   741,   760,   752,   730,   710,
+    688,   655,   608,   595,   570,   556,   540,   517,   513,   511,   497,
+    481,   449,   417,   401,   347,   325,   295,   248,   261,   238,   250,
+    294,   295,   367,   380,   416,   454,   430,   479,   443,   431,   430,
+    386,   397,   333,   292,   238,   176,   153,   54,    24,    -37,   -84,
+    -109,  -172,  -155,  -199,  -220,  -219,  -261,  -227,  -255,  -280,  -266,
+    -293,  -277,  -273,  -243,  -214,  -221,  -179,  -153,  -130,  -109,  -154,
+    -149,  -151,  -155,  -186,  -243,  -253,  -311,  -326,  -358,  -434,  -427,
+    -491,  -533,  -554,  -598,  -596,  -655,  -668,  -679,  -714,  -671,  -694,
+    -643,  -607,  -602,  -532,  -496,  -409,  -408,  -377,  -309,  -289,  -211,
+    -223,  -196,  -145,  -147,  -104,  -157,  -123,  -125,  -177,  -152,  -229,
+    -192,  -204,  -243,  -213,  -259,  -194,  -190,  -172,  -98,   -123,  -43,
+    -12,   41,    103,   87,    148,   150,   166,   154,   113,   118,   80,
+    54,    8,     4,     25,    12,    59,    70,    162,   260,   305,   387,
+    427,   501,   549,   564,   571,   517,   488,   423,   355,   294,   206,
+    165,   113,   92,    77,    62,    115,   116,   154,   162,   171,   218,
+    210,   221,   208,   192,   215,   176,   169,   114,   89,    89,    52,
+    62,    29,    35,    73,    98,    167,   195,   261,   325,   349,   401,
+    382,   393,   368,   302,   254,   174,   104,   6,     -78,   -136,  -203,
+    -229,  -291,  -303,  -284,  -294,  -241,  -235,  -222,  -186,  -187,  -156,
+    -160,  -149,  -122,  -114,  -71,   -44,   -28,   6,     20,    47,    57,
+    54,    52,    55,    53,    23,    9,     -16,   -59,   -86,   -158,  -223,
+    -292,  -372,  -421,  -498,  -532,  -561,  -570,  -531,  -512,  -456,  -367,
+    -297,  -206,  -125,  -37,   26,    88,    147,   157,   188,   169,   152,
+    152,   131,   99,    62,    44,    46,    53,    61,    61,    79,    110,
+    159,   175,   185,   237,   220,   278,   276,   239,   264,   203,   190,
+    138,   70,    34,    -9,    18,    1,     10,    71,    115,   191,   220,
+    255,   265,   296,   319,   270,   266,   214,   189,   187,   155,   145,
+    123,   149,   166,   172,   186,   179,   195,   213,   201,   182,   161,
+    150,   116,   76,    41,    -29,   -58,   -101,  -183,  -209,  -269,  -314,
+    -342,  -385,  -379,  -380,  -348,  -304,  -273,  -197,  -144,  -88,   -28,
+    -5,    11,    20,    27,    -5,    -24,   -22,   -61,   -73,   -87,   -124,
+    -118,  -133,  -150,  -160,  -198,  -196,  -219,  -228,  -239,  -281,  -276,
+    -275,  -288,  -277,  -305,  -324,  -302,  -294,  -292,  -266,  -261,  -224,
+    -203,  -210,  -190,  -198,  -176,  -180,  -201,  -196,  -198,  -175,  -166,
+    -151,  -127,  -114,  -59,   -48,   -8,    39,    75,    126,   131,   168,
+    160,   152,   142,   82,    36,    -13,   -49,   -81,   -105,  -105,  -103,
+    -65,   -38,   -16,   19,    33,    67,    82,    95,    110,   98,    111,
+    98,    87,    67,    54,    66,    52,    49,    53,    71,    106,   139,
+    186,   224,   270,   320,   361,   413,   433,   462,   473,   478,   480,
+    459,   441,   391,   339,   298,   239,   206,   159,   149,   120,   114,
+    117,   95,    106,   81,    67,    61,    30,    11,    -29,   -42,   -76,
+    -97,   -98,   -124,  -107,  -107,  -103,  -69,   -71,   -36,   -12,   23,
+    69,    86,    129,   152,   158,   162,   152,   127,   81,    48,    -9,
+    -80,   -120,  -172,  -201,  -225,  -276,  -297,  -311,  -330,  -339,  -361,
+    -375,  -389,  -376,  -365,  -374,  -378,  -375,  -370,  -358,  -347,  -355,
+    -338,  -314,  -289,  -244,  -212,  -168,  -129,  -80,   -26,   -12,   47,
+    79,    92,    105,   105,   113,   99,    85,    29,    -18,   -53,   -110,
+    -133,  -167,  -186,  -196,  -199,  -176,  -177,  -150,  -122,  -106,  -73,
+    -61,   -30,   -34,   -29,   -40,   -68,   -63,   -85,   -84,   -71,   -65,
+    -40,   -16,   23,    56,    87,    144,   167,   196,   206,   221,   243,
+    226,   233,   210,   192,   190,   150,   140,   110,   91,    77,    43,
+    27,    -10,   -5,    -5,    -22,   -9,    -7,    27,    48,    59,    64,
+    70,    87,    104,   139,   151,   188,   239,   270,   317,   311,   336,
+    349,   341,   330,   274,   254,   223,   195,   163,   102,   81,    43,
+    20,    8,     -37,   -28,   -31,   -29,   -21,   -39,   -16,   -22,   -11,
+    -21,   -41,   -32,   -47,   -39,   -60,   -75,   -71,   -94,   -98,   -131,
+    -147,  -139,  -145,  -146,  -165,  -150,  -136,  -112,  -90,   -106,  -86,
+    -91,   -87,   -98,   -136,  -121,  -135,  -124,  -132,  -144,  -114,  -108,
+    -87,   -74,   -75,   -50,   -30,   -5,    -18,   -24,   -3,    -3,    -6,
+    -41,   -76,   -98,   -127,  -159,  -215,  -257,  -263,  -268,  -266,  -262,
+    -237,  -194,  -144,  -113,  -99,   -61,   -28,   12,    21,    46,    76,
+    92,    130,   115,   123,   132,   135,   149,   134,   133,   132,   135,
+    138,   94,    76,    51,    19,    -15,   -72,   -98,   -125,  -135,  -154,
+    -174,  -171,  -164,  -139,  -130,  -99,   -74,   -40,   9,     34,    86,
+    129,   176,   214,   226,   245,   250,   280,   271,   256,   250,   226,
+    234,   212,   187,   178,   148,   144,   104,   79,    64,    37,    36,
+    9,     -10,   -23,   -38,   -35,   -62,   -67,   -67,   -82,   -70,   -80,
+    -75,   -59,   -34,   -3,    9,     48,    76,    101,   120,   120,   123,
+    126,   131,   112,   92,    77,    61,    54,    32,    3,     -18,   -28,
+    -39,   -56,   -71,   -91,   -92,   -100,  -124,  -134,  -142,  -144,  -155,
+    -177,  -178,  -175,  -171,  -168,  -160,  -141,  -123,  -89,   -73,   -64,
+    -46,   -39,   -18,   -19,   -34,   -32,   -46,   -51,   -63,   -74,   -73,
+    -81,   -70,   -83,   -71,   -49,   -39,   -12,   -1,    30,    48,    65,
+    94,    100,   125,   136,   148,   156,   138,   140,   124,   115,   86,
+    58,    57,    32,    43,    40,    44,    63,    60,    83,    90,    99,
+    115,   113,   135,   140,   148,   164,   172,   187,   182,   190,   183,
+    171,   171,   146,   139,   121,   105,   94,    61,    46,    17,    -6,
+    -34,   -70,   -89,   -121,  -138,  -158,  -178,  -190,  -206,  -206,  -210,
+    -214,  -204,  -196,  -173,  -154,  -128,  -97,   -81,   -58,   -51,   -46,
+    -38,   -47,   -49,   -57,   -58,   -57,   -59,   -49,   -58,   -58,   -54,
+    -60,   -48,   -65,   -72,   -72,   -78,   -70,   -77,   -73,   -76,   -79,
+    -76,   -90,   -90,   -91,   -88,   -76,   -67,   -43,   -16,   6,     27,
+    39,    55,    69,    71,    74,    65,    56,    60,    47,    37,    27,
+    8,     -5,    -29,   -50,   -71,   -89,   -96,   -114,  -111,  -113,  -115,
+    -105,  -112,  -90,   -78,   -68,   -49,   -46,   -26,   -14,   5,     18,
+    10,    14,    3,     5,     -9,    -20,   -15,   -30,   -26,   -33,   -31,
+    -23,   -23,   -12,   -21,   -20,   -16,   -23,   -20,   -13,   -7,    6,
+    28,    47,    69,    96,    115,   134,   147,   154,   166,   174,   186,
+    196,   202,   204,   198,   193,   181,   164,   144,   125,   113,   102,
+    96,    90,    92,    91,    96,    99,    99,    100,   99,    99,    93,
+    94,    86,    68,    55,    44,    36,    22,    13,    15,    13,    15,
+    21,    16,    11,    3,     -15,   -31,   -50,   -75,   -105,  -125,  -145,
+    -154,  -155,  -164,  -178,  -189,  -186,  -177,  -174,  -169,  -152,  -134,
+    -114,  -93,   -65,   -42,   -23,   -4,    -1,    6,     6,     2,     -4,
+    -18,   -26,   -25,   -25,   -23,   -32,   -31,   -33,   -39,   -50,   -68,
+    -69,   -74,   -79,   -78,   -83,   -85,   -85,   -77,   -71,   -61,   -42,
+    -27,   -3,    28,    59,    95,    123,   146,   155,   160,   162,   144,
+    130,   112,   94,    82,    67,    60,    46,    35,    35,    22,    4,
+    -14,   -27,   -35,   -45,   -52,   -61,   -62,   -65,   -68,   -55,   -52,
+    -43,   -38,   -34,   -20,   -8,    8,     18,    24,    34,    36,    37,
+    42,    46,    51,    50,    58,    76,    75,    70,    67,    58,    53,
+    48,    36,    23,    18,    10,    3,     9,     14,    24,    39,    43,
+    53,    62,    63,    66,    62,    66,    64,    59,    51,    25,    19,
+    6,     -10,   -19,   -26,   -35,   -43,   -44,   -37,   -47,   -43,   -50,
+    -54,   -60,   -69,   -75,   -84,   -91,   -93,   -98,   -96,   -99,   -91,
+    -87,   -91,   -88,   -84,   -80,   -75,   -61,   -48,   -44,   -40,   -37,
+    -34,   -45,   -52,   -58,   -72,   -82,   -84,   -78,   -68,   -65,   -63,
+    -51,   -42,   -27,   -22,   -13,   -3,    8,     20,    26,    31,    31,
+    37,    33,    29,    33,    31,    32,    31,    34,    44,    55,    68,
+    74,    69,    75,    73,    72,    65,    63,    67,    70,    83,    81,
+    81,    85,    84,    80,    75,    69,    53,    44,    36,    27,    20,
+    11,    1,     -4,    -19,   -26,   -27,   -25,   -21,   -14,   -12,   -12,
+    -14,   -9,    -21,   -29,   -40,   -50,   -50,   -54,   -46,   -35,   -17,
+    -4,    -1,    7,     20,    28,    26,    22,    23,    21,    23,    18,
+    13,    12,    7,     6,     3,     2,     -1,    -1,    4,     6,     17,
+    29,    35,    34,    34,    32,    28,    33,    26,    22,    16,    16,
+    22,    20,    13,    -1,    -1,    -7,    -15,   -20,   -30,   -32,   -38,
+    -39,   -45,   -45,   -53,   -63,   -70,   -83,   -96,   -107,  -113,  -122,
+    -122,  -118,  -114,  -114,  -113,  -112,  -111,  -110,  -107,  -103,  -102,
+    -94,   -80,   -71,   -58,   -52,   -47,   -40,   -43,   -47,   -48,   -50,
+    -39,   -46,   -44,   -44,   -44,   -43,   -45,   -41,   -40,   -34,   -32,
+    -23,   -12,   -6,    -1,    -1,    6,     12,    18,    20,    22,    32,
+    48,    65,    80,    93,    109,   122,   128,   131,   135,   135,   129,
+    126,   130,   127,   124,   125,   121,   122,   115,   118,   122,   128,
+    137,   143,   143,   141,   142,   134,   131,   121,   109,   105,   97,
+    93,    99,    96,    96,    94,    83,    84,    80,    77,    66,    59,
+    46,    42,    44,    32,    28,    20,    12,    8,     4,     4,     5,
+    3,     -4,    -7,    -6,    -14,   -19,   -24,   -34,   -40,   -45,   -52,
+    -61,   -62,   -60,   -57,   -57,   -61,   -63,   -61,   -65,   -73,   -81,
+    -89,   -94,   -93,   -89,   -87,   -82,   -82,   -84,   -81,   -86,   -82,
+    -84,   -86,   -90,   -86,   -83,   -82,   -81,   -80,   -80,   -76,   -75,
+    -76,   -70,   -69,   -68,   -61,   -53,   -50,   -43,   -38,   -42,   -43,
+    -41,   -41,   -39,   -34,   -27,   -21,   -16,   -20,   -22,   -27,   -36,
+    -39,   -38,   -40,   -37,   -35,   -28,   -14,   -6,    -3,    -2,    2,
+    4,     5,     15,    18,    25,    35,    36,    41,    45,    48,    52,
+    54,    52,    50,    60,    67,    76,    85,    85,    90,    86,    83,
+    84,    77,    77,    72,    77,    81,    89,    91,    93,    99,    101,
+    102,   98,    94,    87,    77,    70,    69,    63,    62,    55,    59,
+    58,    54,    51,    53,    57,    62,    65,    60,    54,    48,    45,
+    40,    29,    17,    8,     -3,    -14,   -17,   -18,   -20,   -25,   -34,
+    -40,   -44,   -53,   -56,   -63,   -71,   -71,   -69,   -66,   -62,   -66,
+    -67,   -68,   -71,   -75,   -79,   -79,   -73,   -67,   -60,   -49,   -46,
+    -45,   -45,   -46,   -55,   -64,   -67,   -72,   -74,   -70,   -68,   -67,
+    -69,   -70,   -64,   -56,   -55,   -54,   -51,   -41,   -30,   -26,   -28,
+    -29,   -30,   -28,   -25,   -27,   -20,   -12,   -5,    -2,    2,     3,
+    -3,    0,     -7,    -8,    -14,   -15,   -9,    -7,    4,     12,    24,
+    36,    41,    52,    58,    59,    51,    45,    48,    44,    46,    43,
+    40,    42,    47,    53,    52,    52,    63,    69,    74,    75,    80,
+    78,    69,    68,    59,    60,    54,    54,    54,    58,    66,    71,
+    78,    78,    75,    78,    72,    71,    61,    55,    53,    42,    36,
+    31,    28,    29,    23,    19,    25,    27,    27,    23,    29,    29,
+    20,    11,    5,     -4,    -10,   -31,   -38,   -39,   -36,   -33,   -27,
+    -17,   -15,   -14,   -17,   -13,   -14,   -25,   -33,   -44,   -51,   -61,
+    -63,   -63,   -65,   -67,   -66,   -63,   -59,   -52,   -48,   -45,   -44,
+    -50,   -62,   -74,   -84,   -89,   -100,  -101,  -102,  -96,   -95,   -85,
+    -76,   -78,   -72,   -71,   -66,   -61,   -63,   -60,   -62,   -72,   -69,
+    -69,   -58,   -56,   -50,   -37,   -28,   -17,   -17,   -16,   -17,   -18,
+    -18,   -13,   -7,    -4,    6,     17,    23,    25,    28,    24,    21,
+    17,    21,    27,    30,    33,    35,    46,    49,    48,    54,    56,
+    57,    58,    60,    64,    62,    64,    66,    67,    64,    70,    77,
+    83,    82,    84,    88,    89,    95,    86,    75,    64,    51,    36,
+    29,    26,    21,    26,    31,    38,    40,    55,    63,    65,    65,
+    64,    60,    54,    54,    49,    41,    34,    26,    21,    9,     6,
+    6,     5,     -1,    3,     5,     3,     2,     -4,    -13,   -13,   -24,
+    -32,   -33,   -36,   -33,   -24,   -18,   -15,   -9,    -5,    -5,    -14,
+    -17,   -24,   -34,   -36,   -42,   -43,   -36,   -42,   -43,   -43,   -38,
+    -36,   -27,   -20,   -23,   -21,   -28,   -25,   -22,   -24,   -25,   -23,
+    -22,   -30,   -31,   -26,   -25,   -20,   -15,   -8,    -10,   -11,   -13,
+    -18,   -22,   -30,   -36,   -35,   -39,   -35,   -34,   -27,   -24,   -19,
+    -15,   -7,    -6,    -7,    -2,    0,     7,     12,    14,    19,    20,
+    26,    26,    24,    16,    10,    4,     1,     3,     2,     9,     11,
+    17,    19,    27,    31,    31,    32,    30,    27,    25,    28,    27,
+    25,    22,    23,    23,    20,    21,    25,    36,    38,    40,    43,
+    40,    32,    27,    20,    9,     4,     1,     12,    27,    37,    49,
+    63,    73,    72,    73,    70,    67,    53,    39,    33,    26,    23,
+    13,    9,     6,     0,     -2,    -3,    0,     -1,    0,     -1,    -4,
+    -9,    -16,   -22,   -21,   -24,   -21,   -19,   -12,   -3,    0,     12,
+    14,    13,    3,     -6,    -13,   -27,   -34,   -42,   -41,   -44,   -42,
+    -43,   -46,   -42,   -40,   -39,   -36,   -31,   -29,   -30,   -22,   -19,
+    -21,   -20,   -17,   -17,   -22,   -31,   -41,   -45,   -54,   -65,   -64,
+    -68,   -70,   -74,   -70,   -64,   -62,   -61,   -60,   -58,   -52,   -46,
+    -43,   -37,   -35,   -40,   -41,   -47,   -52,   -58,   -62,   -61,   -53,
+    -54,   -46,   -41,   -40,   -34,   -29,   -20,   -15,   -8,    2,     12,
+    28,    35,    41,    42,    42,    43,    41,    43,    39,    45,    44,
+    46,    55,    54,    55,    55,    51,    48,    42,    43,    39,    40,
+    46,    54,    65,    70,    76,    81,    86,    89,    79,    73,    70,
+    62,    56,    52,    39,    32,    28,    17,    18,    19,    18,    15,
+    19,    20,    15,    13,    13,    10,    6,     5,     12,    10,    15,
+    20,    24,    30,    31,    28,    22,    17,    2,     -15,   -24,   -39,
+    -52,   -53,   -55,   -46,   -40,   -34,   -26,   -21,   -22,   -31,   -32,
+    -38,   -36,   -35,   -32,   -33,   -34,   -30,   -28,   -27,   -35,   -40,
+    -42,   -45,   -44,   -45,   -44,   -52,   -54,   -57,   -57,   -53,   -60,
+    -63,   -63,   -65,   -51,   -45,   -40,   -40,   -39,   -39,   -43,   -44,
+    -46,   -52,   -46,   -51,   -49,   -45,   -45,   -47,   -47,   -45,   -50,
+    -47,   -40,   -35,   -32,   -24,   -17,   -19,   -14,   -13,   -9,    -7,
+    -7,    -7,    -9,    0,     3,     7,     13,    12,    14,    15,    13,
+    6,     -1,    -3,    -9,    -10,   -5,    -2,    6,     9,     11,    12,
+    15,    19,    24,    37,    47,    47,    56,    53,    51,    52,    52,
+    47,    39,    38,    40,    41,    43,    44,    42,    43,    42,    41,
+    43,    40,    41,    35,    37,    39,    40,    41,    38,    30,    21,
+    14,    5,     2,     -1,    -2,    1,     -2,    6,     2,     4,     2,
+    -1,    -11,   -16,   -23,   -25,   -20,   -18,   -25,   -27,   -32,   -27,
+    -24,   -16,   -15,   -11,   -9,    -3,    -4,    -2,    -9,    -10,   -18,
+    -28,   -33,   -38,   -37,   -41,   -41,   -33,   -24,   -22,   -25,   -25,
+    -25,   -24,   -33,   -38,   -42,   -52,   -57,   -55,   -50,   -51,   -53,
+    -52,   -48,   -49,   -49,   -53,   -55,   -58,   -51,   -34,   -19,   -12,
+    -12,   -5,    1,     1,     0,     -6,    -2,    -10,   -11,   -11,   -6,
+    0,     -6,    2,     -2,    -6,    2,     5,     16,    18,    18,    21,
+    16,    18,    18,    20,    20,    13,    18,    9,     7,     12,    7,
+    8,     10,    16,    17,    18,    23,    26,    36,    44,    51,    55,
+    60,    64,    69,    68,    71,    70,    62,    58,    52,    44,    35,
+    31,    34,    32,    33,    36,    37,    38,    41,    47,    55,    56,
+    58,    60,    60,    57,    48,    41,    29,    19,    7,     4,     8,
+    9,     10,    8,     13,    15,    13,    8,     8,     6,     4,     10,
+    8,     -4,    -6,    -9,    -20,   -28,   -39,   -38,   -27,   -24,   -22,
+    -19,   -23,   -32,   -35,   -36,   -41,   -48,   -51,   -50,   -52,   -55,
+    -60,   -67,   -72,   -76,   -84,   -82,   -80,   -81,   -75,   -64,   -50,
+    -36,   -28,   -18,   -14,   -12,   -15,   -12,   -18,   -24,   -21,   -22,
+    -19,   -21,   -19,   -22,   -20,   -18,   -16,   -17,   -19,   -15,   -7,
+    1,     0,     0,     9,     14,    20,    24,    20,    16,    17,    20,
+    20,    25,    27,    26,    32,    33,    35,    38,    42,    38,    37,
+    39,    46,    44,    43,    45,    45,    42,    37,    34,    25,    21,
+    22,    33,    44,    49,    54,    53,    58,    54,    51,    46,    40,
+    37,    37,    39,    34,    37,    39,    31,    39,    38,    36,    35,
+    32,    33,    33,    32,    28,    23,    18,    22,    28,    31,    27,
+    18,    3,     4,     0,     -4,    -7,    -15,   -18,   -24,   -32,   -34,
+    -39,   -42,   -36,   -31,   -24,   -12,   -10,   -10,   -13,   -20,   -28,
+    -34,   -44,   -49,   -50,   -53,   -56,   -54,   -52,   -53,   -47,   -43,
+    -41,   -45,   -41,   -38,   -38,   -33,   -32,   -34,   -35,   -33,   -40,
+    -45,   -53,   -62,   -61,   -67,   -72,   -70,   -67,   -68,   -59,   -51,
+    -47,   -38,   -31,   -20,   -13,   -13,   -13,   -14,   -17,   -21,   -22,
+    -29,   -31,   -27,   -23,   -13,   -6,    4,     12,    17,    25,    23,
+    23,    25,    30,    30,    32,    31,    28,    27,    18,    14,    13,
+    3,     5,     7,     19,    35,    47,    61,    70,    84,    90,    95,
+    92,    94,    89,    77,    71,    66,    59,    50,    51,    50,    51,
+    53,    56,    65,    67,    69,    75,    74,    69,    67,    56,    51,
+    44,    34,    25,    17,    10,    6,     7,     7,     4,     6,     -1,
+    -1,    -2,    -9,    -9,    -9,    -7,    -5,    1,     -2,    -5,    -11,
+    -19,   -27,   -39,   -38,   -44,   -45,   -48,   -48,   -54,   -59,   -53,
+    -51,   -49,   -52,   -50,   -50,   -47,   -42,   -32,   -28,   -28,   -26,
+    -27,   -34,   -40,   -40,   -36,   -37,   -37,   -34,   -37,   -36,   -41,
+    -36,   -40,   -46,   -48,   -52,   -47,   -44,   -40,   -40,   -38,   -43,
+    -43,   -47,   -59,   -62,   -59,   -59,   -51,   -41,   -29,   -19,   -8,
+    -2,    1,     1,     -4,    -9,    -19,   -23,   -29,   -29,   -25,   -23,
+    -15,   -7,    -2,    6,     8,     15,    27,    35,    43,    40,    36,
+    35,    32,    25,    22,    19,    17,    13,    13,    21,    25,    28,
+    36,    44,    50,    57,    56,    58,    59,    62,    66,    70,    73,
+    69,    66,    66,    66,    62,    53,    48,    44,    38,    39,    44,
+    52,    51,    55,    57,    52,    49,    44,    36,    26,    16,    13,
+    13,    14,    14,    17,    14,    10,    6,     -5,    -14,   -23,   -24,
+    -21,   -28,   -25,   -27,   -29,   -29,   -33,   -33,   -39,   -42,   -43,
+    -41,   -40,   -43,   -46,   -45,   -43,   -42,   -41,   -41,   -46,   -46,
+    -52,   -52,   -52,   -59,   -63,   -70,   -68,   -73,   -77,   -73,   -68,
+    -66,   -62,   -64,   -66,   -58,   -54,   -51,   -52,   -48,   -47,   -43,
+    -40,   -39,   -33,   -26,   -19,   -17,   -16,   -17,   -14,   -9,    -10,
+    -3,    5,     5,     9,     5,     9,     8,     4,     3,     0,     -5,
+    -10,   -3,    2,     8,     14,    16,    20,    27,    39,    40,    44,
+    48,    43,    39,    34,    29,    22,    12,    8,     5,     0,     -2,
+    -3,    5,     12,    16,    19,    22,    25,    28,    35,    28,    30,
+    31,    30,    39,    43,    47,    43,    42,    41,    41,    41,    37,
+    37,    39,    37,    38,    43,    44,    41,    43,    34,    28,    25,
+    23,    30,    34,    32,    33,    29,    21,    18,    13,    14,    11,
+    3,     2,     1,     3,     1,     -1,    0,     -3,    -1,    -3,    -8,
+    -9,    -7,    -9,    -2,    0,     -3,    0,     1,     5,     0,     -1,
+    -9,    -13,   -8,    -11,   -18,   -23,   -25,   -29,   -29,   -26,   -27,
+    -29,   -25,   -24,   -23,   -18,   -19,   -18,   -17,   -21,   -22,   -30,
+    -38,   -42,   -42,   -42,   -40,   -41,   -43,   -39,   -38,   -37,   -36,
+    -33,   -31,   -28,   -27,   -18,   -15,   -7,    -8,    -8,    -1,    1,
+    3,     -5,    0,     -4,    -5,    -4,    -8,    -10,   -14,   -21,   -24,
+    -25,   -20,   -11,   -4,    3,     6,     13,    15,    12,    17,    16,
+    17,    17,    15,    21,    28,    33,    36,    35,    35,    29,    31,
+    29,    28,    23,    21,    14,    15,    27,    36,    40,    40,    43,
+    51,    56,    62,    69,    77,    80,    88,    88,    88,    82,    76,
+    63,    52,    44,    36,    26,    23,    25,    24,    27,    26,    31,
+    21,    13,    8,     -8,    -8,    -11,   -14,   -18,   -28,   -28,   -30,
+    -32,   -29,   -26,   -26,   -27,   -24,   -20,   -14,   -8,    -6,    -8,
+    -5,    -10,   -14,   -18,   -26,   -34,   -36,   -38,   -44,   -51,   -57,
+    -66,   -64,   -68,   -72,   -75,   -75,   -70,   -68,   -65,   -64,   -62,
+    -68,   -63,   -60,   -65,   -65,   -69,   -68,   -67,   -57,   -46,   -41,
+    -38,   -34,   -31,   -39,   -40,   -45,   -45,   -48,   -47,   -40,   -39,
+    -32,   -26,   -24,   -14,   -9,    -7,    -3,    -2,    3,     4,     0,
+    -2,    -2,    -2,    1,     3,     2,     3,     8,     13,    20,    25,
+    29,    31,    26,    17,    11,    3,     -5,    2,     6,     9,     11,
+    19,    26,    40,    51,    61,    60,    58,    61,    55,    55,    57,
+    60,    54,    40,    42,    38,    34,    38,    37,    34,    32,    35,
+    36,    35,    41,    36,    32,    29,    23,    22,    23,    22,    14,
+    13,    19,    19,    20,    22,    22,    17,    13,    6,     9,     13,
+    15,    17,    19,    11,    15,    8,     4,     6,     -1,    -3,    3,
+    7,     11,    8,     10,    7,     6,     4,     -4,    -5,    -11,   -9,
+    -16,   -14,   -14,   -16,   -16,   -22,   -19,   -19,   -13,   -9,    -4,
+    1,     1,     2,     -6,    -14,   -25,   -32,   -41,   -46,   -50,   -49,
+    -42,   -39,   -34,   -24,   -14,   -18,   -15,   -17,   -21,   -23,   -21,
+    -19,   -21,   -20,   -19,   -20,   -19,   -16,   -17,   -19,   -20,   -20,
+    -20,   -20,   -22,   -22,   -23,   -22,   -22,   -14,   -5,    5,     8,
+    13,    16,    19,    23,    19,    21,    16,    16,    18,    13,    18,
+    13,    15,    18,    12,    12,    6,     11,    8,     5,     5,     9,
+    17,    14,    15,    14,    16,    14,    14,    12,    9,     7,     9,
+    11,    13,    15,    15,    19,    17,    14,    8,     7,     4,     0,
+    3,     8,     10,    7,     8,     19,    15,    19,    18,    19,    17,
+    9,     14,    10,    4,     -3,    -11,   -19,   -25,   -31,   -35,   -36,
+    -28,   -21,   -8,    5,     8,     11,    13,    7,     4,     1,     -7,
+    -15,   -17,   -17,   -21,   -28,   -33,   -37,   -40,   -39,   -41,   -45,
+    -46,   -44,   -40,   -41,   -36,   -31,   -41,   -40,   -42,   -44,   -47,
+    -50,   -49,   -55,   -52,   -52,   -52,   -45,   -50,   -52,   -56,   -58,
+    -60,   -69,   -75,   -82,   -86,   -91,   -87,   -80,   -80,   -72,   -58,
+    -52,   -45,   -33,   -21,   -13,   -12,   -10,   -6,    -1,    -2,    -7,
+    -7,    -5,    -6,    -3,    9,     15,    25,    36,    35,    39,    28,
+    16,    11,    8,     11,    17,    27,    34,    36,    47,    49,    52,
+    52,    42,    46,    49,    55,    65,    66,    67,    62,    56,    53,
+    49,    50,    55,    53,    62,    69,    72,    73,    68,    61,    54,
+    46,    43,    38,    34,    39,    43,    42,    39,    36,    31,    26,
+    24,    17,    13,    14,    14,    21,    26,    29,    28,    26,    24,
+    18,    19,    16,    11,    6,     2,     -2,    1,     3,     2,     -4,
+    -3,    -1,    -3,    -2,    -2,    -5,    -3,    0,     3,     -3,    -6,
+    -6,    -15,   -19,   -25,   -30,   -35,   -39,   -34,   -34,   -34,   -31,
+    -17,   -17,   -8,    -2,    -2,    8,     14,    25,    24,    26,    22,
+    16,    10,    2,     -3,    -5,    -12,   -15,   -11,   -14,   -16,   -17,
+    -17,   -16,   -21,   -18,   -18,   -21,   -23,   -21,   -15,   -11,   -4,
+    -2,    3,     8,     10,    17,    18,    25,    24,    24,    24,    21,
+    24,    23,    24,    22,    23,    31,    39,    49,    58,    64,    67,
+    63,    57,    53,    52,    44,    45,    43,    40,    45,    42,    49,
+    50,    49,    52,    51,    48,    46,    38,    37,    35,    36,    37,
+    37,    37,    44,    45,    47,    42,    42,    36,    35,    44,    40,
+    40,    28,    24,    23,    18,    12,    9,     8,     10,    17,    17,
+    18,    12,    5,     -2,    -12,   -16,   -20,   -27,   -29,   -29,   -26,
+    -22,   -17,   -16,   -15,   -14,   -15,   -11,   -11,   -15,   -19,   -15,
+    -20,   -22,   -24,   -37,   -52,   -62,   -63,   -68,   -64,   -59,   -51,
+    -43,   -42,   -36,   -32,   -33,   -33,   -33,   -41,   -48,   -51,   -49,
+    -48,   -47,   -42,   -45,   -42,   -41,   -40,   -39,   -33,   -29,   -25,
+    -14,   -1,    -4,    -6,    -11,   -16,   -19,   -26,   -29,   -28,   -25,
+    -17,   -10,   -1,    -1,    3,     7,     -1,    -3,    -8,    -18,   -20,
+    -20,   -16,   -13,   -11,   -8,    0,     6,     8,     11,    14,    15,
+    20,    26,    26,    26,    24,    23,    24,    30,    34,    41,    52,
+    61,    70,    80,    85,    86,    89,    84,    87,    79,    67,    60,
+    57,    59,    63,    68,    74,    78,    84,    89,    91,    87,    81,
+    74,    69,    63,    59,    59,    56,    58,    60,    60,    59,    54,
+    49,    41,    40,    34,    25,    19,    11,    1,     0,     -1,    -4,
+    -8,    -12,   -12,   -17,   -22,   -31,   -44,   -54,   -58,   -68,   -74,
+    -80,   -80,   -73,   -65,   -61,   -61,   -55,   -50,   -50,   -59,   -65,
+    -69,   -73,   -73,   -78,   -79,   -83,   -87,   -87,   -88,   -94,   -103,
+    -107,  -107,  -109,  -106,  -113,  -115,  -110,  -105,  -100,  -100,  -92,
+    -78,   -62,   -49,   -39,   -35,   -27,   -26,   -25,   -24,   -22,   -23,
+    -28,   -26,   -22,   -15,   -11,   -4,    4,     13,    21,    32,    31,
+    28,    30,    30,    28,    23,    25,    23,    21,    25,    21,    26,
+    27,    32,    40,    48,    53,    55,    54,    55,    55,    54,    48,
+    44,    47,    48,    54,    60,    71,    79,    79,    74,    72,    59,
+    48,    42,    32,    26,    22,    21,    23,    22,    31,    42,    44,
+    41,    36,    30,    30,    33,    38,    35,    30,    28,    20,    15,
+    8,     4,     6,     9,     16,    26,    27,    23,    19,    16,    10,
+    4,     -4,    -12,   -12,   -16,   -16,   -19,   -24,   -23,   -23,   -31,
+    -34,   -38,   -40,   -41,   -39,   -39,   -36,   -36,   -40,   -45,   -48,
+    -53,   -66,   -73,   -76,   -76,   -78,   -75,   -71,   -65,   -59,   -58,
+    -59,   -56,   -60,   -62,   -62,   -62,   -64,   -68,   -73,   -79,   -80,
+    -85,   -87,   -85,   -78,   -72,   -66,   -56,   -48,   -42,   -37,   -35,
+    -32,   -33,   -31,   -25,   -26,   -27,   -16,   -18,   -18,   -13,   -14,
+    -17,   -22,   -24,   -25,   -23,   -19,   -14,   -12,   -11,   -7,    -4,
+    -1,    2,     5,     8,     10,    10,    18,    28,    29,    25,    22,
+    29,    21,    20,    21,    22,    30,    32,    41,    41,    45,    46,
+    49,    52,    57,    59,    58,    52,    46,    47,    56,    58,    49,
+    49,    46,    40,    33,    23,    14,    11,    16,    29,    34,    37,
+    41,    42,    48,    54,    60,    61,    62,    62,    69,    79,    76,
+    71,    72,    71,    64,    59,    54,    49,    40,    42,    34,    23,
+    27,    18,    13,    9,     3,     -4,    -8,    -16,   -18,   -20,   -26,
+    -28,   -30,   -32,   -29,   -32,   -35,   -39,   -41,   -38,   -34,   -31,
+    -26,   -18,   -21,   -20,   -22,   -28,   -35,   -34,   -31,   -33,   -31,
+    -31,   -40,   -43,   -45,   -53,   -64,   -67,   -74,   -75,   -74,   -75,
+    -70,   -61,   -56,   -45,   -37,   -30,   -33,   -35,   -32,   -31,   -27,
+    -25,   -19,   -17,   -14,   -9,    -4,    -1,    -3,    -4,    1,     8,
+    14,    20,    24,    25,    18,    11,    7,     -3,    -9,    -3,    4,
+    15,    30,    29,    33,    33,    36,    35,    31,    33,    34,    42,
+    43,    42,    47,    49,    53,    61,    69,    73,    74,    79,    81,
+    84,    76,    69,    62,    47,    39,    31,    19,    8,     2,     -6,
+    -5,    -3,    -3,    -1,    1,     -2,    -3,    -3,    -6,    -12,   -13,
+    -15,   -11,   -5,    -4,    -8,    -14,   -9,    -3,    0,     -3,    -4,
+    0,     3,     0,     -6,    -14,   -23,   -33,   -38,   -41,   -38,   -38,
+    -34,   -30,   -29,   -29,   -26,   -31,   -33,   -41,   -49,   -50,   -56,
+    -57,   -58,   -54,   -46,   -39,   -39,   -34,   -31,   -28,   -30,   -30,
+    -31,   -29,   -27,   -16,   -18,   -17,   -15,   -13,   -15,   -12,   -7,
+    -11,   -9,    -9,    -4,    -11,   -7,    -7,    -8,    -9,    -10,   -7,
+    -9,    1,     9,     15,    12,    19,    19,    18,    17,    13,    11,
+    8,     6,     10,    17,    20,    26,    28,    33,    39,    30,    25,
+    25,    18,    16,    21,    26,    30,    33,    32,    36,    42,    49,
+    46,    39,    44,    44,    37,    35,    30,    24,    22,    23,    26,
+    23,    25,    21,    24,    24,    22,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..4cc8030cdac7c4e8364e0fcd7dcc5fff63617908
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was created from the PCM data in a WAV file held in v2 of the
+// Speech Commands test dataset, at the path:
+// speech_commands_test_set_v0.02/no/f9643d42_nohash_4.wav
+// This should contain all 16,000 samples from the one-second file.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_1000MS_SAMPLE_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_1000MS_SAMPLE_DATA_H_
+
+#include <cstdint>
+
+extern const int g_no_1000ms_sample_data_size;
+extern const int16_t g_no_1000ms_sample_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_1000MS_SAMPLE_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
index 892757e799f3832db725424163e613bea35ab9e7..6468c1a95a9cd3f844595bf2c6e88c1e2833823b 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <AudioToolbox/AudioToolbox.h>
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h"
 
 namespace {
 
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
index 9366dc71e0d76d087a3dad9b9c4c206a0749e235..8187962c3e780a76413134771dc63ba30910f3b6 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
@@ -28,8 +28,8 @@ RecognizeCommands::RecognizeCommands(tflite::ErrorReporter* error_reporter,
       suppression_ms_(suppression_ms),
       minimum_count_(minimum_count),
       previous_results_(error_reporter) {
-  previous_top_label_ = "_silence_";
-  previous_top_label_time_ = 0;
+  previous_top_label_ = "silence";
+  previous_top_label_time_ = std::numeric_limits<int32_t>::min();
 }
 
 TfLiteStatus RecognizeCommands::ProcessLatestResults(
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
index adefffe850076821dd1e0bf683fdd2180d6999ea..292cd3e88dcd63f925cb16995b5e8a16554a8547 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <cstdint>
 
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 
 // Partial implementation of std::dequeue, just providing the functionality
@@ -129,8 +129,8 @@ class RecognizeCommands {
   // help reduce spurious recognitions.
   explicit RecognizeCommands(tflite::ErrorReporter* error_reporter,
                              int32_t average_window_duration_ms = 1000,
-                             uint8_t detection_threshold = 51,
-                             int32_t suppression_ms = 500,
+                             uint8_t detection_threshold = 200,
+                             int32_t suppression_ms = 1500,
                              int32_t minimum_count = 3);
 
   // Call this with the results of running a model on sample data.
@@ -149,8 +149,6 @@ class RecognizeCommands {
 
   // Working variables
   PreviousResultsQueue previous_results_;
-  int previous_results_head_;
-  int previous_results_tail_;
   const char* previous_top_label_;
   int32_t previous_top_label_time_;
 };
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc
index f0cc73f10b3dadfdf06cb0f2935140b792635add..6582c948d16f9493a4b1e5bdf43bdc1f30e6dc31 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc
@@ -118,7 +118,9 @@ TF_LITE_MICRO_TEST(RecognizeCommandsTestFindCommands) {
     }
   }
   TF_LITE_MICRO_EXPECT(has_found_new_command);
-  TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("yes", new_command));
+  if (has_found_new_command) {
+    TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("yes", new_command));
+  }
 
   TfLiteTensor no_results = tflite::testing::CreateQuantizedTensor(
       {0, 0, 0, 255}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
@@ -141,8 +143,10 @@ TF_LITE_MICRO_TEST(RecognizeCommandsTestFindCommands) {
     }
   }
   TF_LITE_MICRO_EXPECT(has_found_new_command);
-  TF_LITE_MICRO_EXPECT_EQ(231, score);
-  TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("no", new_command));
+  if (has_found_new_command) {
+    TF_LITE_MICRO_EXPECT_EQ(231, score);
+    TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("no", new_command));
+  }
 }
 
 TF_LITE_MICRO_TEST(RecognizeCommandsTestBadInputLength) {
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/CMSIS/simple_features_generator.cc
similarity index 85%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/CMSIS/simple_features_generator.cc
index 78155cc2babfc34717a57555e60b161bb5621fc8..403976e222fe549f6f8c755bf7460d245d9370e8 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/CMSIS/simple_features_generator.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
+
 extern "C" {
 #define IFFT_FLAG_R 0
 #define BIT_REVERSE_FLAG 1
@@ -23,8 +25,6 @@ extern "C" {
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h"
 }
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
-
 void quantize(q15_t* bufA, q15_t* bufB, uint8_t* output);
 
 q15_t bufA[FFT_SIZE];
@@ -41,9 +41,9 @@ constexpr int kOutputSize =
     ((kInputSize / 2) + (kAverageWindowSize - 1)) / kAverageWindowSize;
 }  // namespace
 
-TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
-                        const int16_t* input, int input_size, int output_size,
-                        uint8_t* output) {
+TfLiteStatus GenerateSimpleFeatures(tflite::ErrorReporter* error_reporter,
+                                    const int16_t* input, int input_size,
+                                    int output_size, uint8_t* output) {
   if (input_size > kInputSize) {
     error_reporter->Report("Input size %d larger than %d", input_size,
                            kInputSize);
@@ -93,12 +93,3 @@ void quantize(q15_t* bufA, q15_t* bufB, uint8_t* output) {
     output[i] = (uint8_t)(bufA[i] >> 5);
   }
 }
-
-TfLiteStatus Preprocess_1sec(tflite::ErrorReporter* error_reporter,
-                             const int16_t* input, uint8_t* output) {
-  int i;
-  for (i = 0; i < 49; i++) {
-    Preprocess(error_reporter, input + i * 320, 480, 43, output + i * 43);
-  }
-  return kTfLiteOk;
-}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/fixed_point/simple_features_generator.cc
similarity index 96%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/fixed_point/simple_features_generator.cc
index b623d8d11b75d59600cc6a029527d3957084a328..ad11684b0a94e630580aa9a95d4b1db92f914d6f 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/fixed_point/simple_features_generator.cc
@@ -27,11 +27,11 @@ limitations under the License.
 // instead of floating point, to help show how this can work on platforms that
 // don't have good float support.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
 
 #include <cmath>
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h"
 
 namespace {
 
@@ -118,9 +118,9 @@ void CalculatePeriodicHann(int window_length, int16_t* window_function) {
 
 }  // namespace
 
-TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
-                        const int16_t* input, int input_size, int output_size,
-                        uint8_t* output) {
+TfLiteStatus GenerateSimpleFeatures(tflite::ErrorReporter* error_reporter,
+                                    const int16_t* input, int input_size,
+                                    int output_size, uint8_t* output) {
   // Ensure our input and output data arrays are valid.
   if (input_size > kMaxAudioSampleSize) {
     error_reporter->Report("Input size %d larger than %d", input_size,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.cc
similarity index 95%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.cc
index c4fc5c33bb329cba4e1abcf6d36b01f14e9e2b27..0b20f2f86fb6455d4251cb81d3e70c3c15de7c6b 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // See the header for documentation on the meaning of this data.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h"
 
 const uint8_t g_no_power_spectrum_data[g_no_power_spectrum_data_size] = {
     255, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h
similarity index 90%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h
index fa39d3c70d78ce261db81cf8ad7c416efd2c468c..9693950fb5ee1d56242b83c6265e9e2315ec8971 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h
@@ -18,12 +18,12 @@ limitations under the License.
 // This is the expected result of running the sample data in
 // no_30ms_sample_data.cc through through the preprocessing pipeline.
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_POWER_SPECTRUM_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_POWER_SPECTRUM_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_POWER_SPECTRUM_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_POWER_SPECTRUM_DATA_H_
 
 #include <cstdint>
 
 constexpr int g_no_power_spectrum_data_size = 43;
 extern const uint8_t g_no_power_spectrum_data[];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_POWER_SPECTRUM_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_POWER_SPECTRUM_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.cc
similarity index 97%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.cc
index e98c84f7ed2e678eb91580a2b6fb69514cee4740..3d3a9538fb527888e3bdf0e1aa9ca00d4d5f1544 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.h"
 
 /* File automatically created by
  * tensorflow/examples/speech_commands/wav_to_features.py \
@@ -22,15 +22,15 @@ limitations under the License.
  * --window_size_ms=30 \
  * --window_stride_ms=20 \
  * --feature_bin_count=40 \
- * --quantize \
+ * --quantize=1 \
  * --preprocess="average" \
  * --input_wav="speech_commands_test_set_v0.02/no/f9643d42_nohash_4.wav" \
- * --output_c_file="no_features_data.cc" \
+ * --output_c_file="no_simple_features_data.cc" \
  */
 
-const int g_no_f9643d42_nohash_4_width = 43;
-const int g_no_f9643d42_nohash_4_height = 49;
-const unsigned char g_no_f9643d42_nohash_4_data[] = {
+const int g_no_simple_f9643d42_nohash_4_width = 43;
+const int g_no_simple_f9643d42_nohash_4_height = 49;
+const unsigned char g_no_simple_f9643d42_nohash_4_data[] = {
     0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
     0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
     0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.h
similarity index 73%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.h
index 39a3bb914cc1986aa851ace0e39ce63ed1a93282..30332b30c5c8325edb53713d572fcf987446844a 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_FEATURES_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_FEATURES_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_SIMPLE_FEATURES_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_SIMPLE_FEATURES_DATA_H_
 
-extern const int g_yes_f2e59fea_nohash_1_width;
-extern const int g_yes_f2e59fea_nohash_1_height;
-extern const unsigned char g_yes_f2e59fea_nohash_1_data[];
+extern const int g_no_simple_f9643d42_nohash_4_width;
+extern const int g_no_simple_f9643d42_nohash_4_height;
+extern const unsigned char g_no_simple_f9643d42_nohash_4_data[];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_FEATURES_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_SIMPLE_FEATURES_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.cc
similarity index 92%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.cc
index f8858aad72f3c141d20077ffa927e30bd9492987..3aa05b7bf1d5d1762c9c6744ac8a5fe99f922332 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.cc
@@ -24,11 +24,11 @@ limitations under the License.
 // functions used here, for example replacing the DFT with an FFT, so this
 // version shouldn't be used where performance is critical.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
 
 #include <cmath>
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h"
 
 namespace {
 
@@ -72,9 +72,9 @@ void CalculatePeriodicHann(int window_length, float* window_function) {
 
 }  // namespace
 
-TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
-                        const int16_t* input, int input_size, int output_size,
-                        uint8_t* output) {
+TfLiteStatus GenerateSimpleFeatures(tflite::ErrorReporter* error_reporter,
+                                    const int16_t* input, int input_size,
+                                    int output_size, uint8_t* output) {
   // Ensure our input and output data arrays are valid.
   if (input_size > kMaxAudioSampleSize) {
     error_reporter->Report("Input size %d larger than %d", input_size,
@@ -146,12 +146,3 @@ TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
   }
   return kTfLiteOk;
 }
-
-TfLiteStatus Preprocess_1sec(tflite::ErrorReporter* error_reporter,
-                             const int16_t* input, uint8_t* output) {
-  int i;
-  for (i = 0; i < 49; i++) {
-    Preprocess(error_reporter, input + i * 320, 480, 43, output + i * 43);
-  }
-  return kTfLiteOk;
-}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h
similarity index 77%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h
index d710beeceea6a7b6fb7fca748e5795f602276e32..f4e86b18a4c3d1c0a5beb32eb6806faaf1c11c14 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_PREPROCESSOR_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_PREPROCESSOR_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_FEATURES_GENERATOR_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_FEATURES_GENERATOR_H_
 
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
@@ -24,11 +24,8 @@ limitations under the License.
 // both floating point and fixed point available, but because the calculations
 // involved can be time-consuming, it's recommended that you use or write
 // specialized versions for your platform.
-TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
-                        const int16_t* input, int input_size, int output_size,
-                        uint8_t* output);
+TfLiteStatus GenerateSimpleFeatures(tflite::ErrorReporter* error_reporter,
+                                    const int16_t* input, int input_size,
+                                    int output_size, uint8_t* output);
 
-TfLiteStatus Preprocess_1sec(tflite::ErrorReporter* error_reporter,
-                             const int16_t* input, uint8_t* output);
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_PREPROCESSOR_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_FEATURES_GENERATOR_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator_test.cc
similarity index 90%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator_test.cc
index e8b49f67e3d72faa4700c4bdec7f94a5b79cd72e..65e526327c77c727ec88cee421a466f0df34ee76 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator_test.cc
@@ -13,23 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 TF_LITE_MICRO_TESTS_BEGIN
 
-TF_LITE_MICRO_TEST(TestPreprocessor) {
+TF_LITE_MICRO_TEST(TestSimpleFeaturesGenerator) {
   tflite::MicroErrorReporter micro_error_reporter;
   tflite::ErrorReporter* error_reporter = &micro_error_reporter;
 
   uint8_t yes_calculated_data[g_yes_power_spectrum_data_size];
-  TfLiteStatus yes_status = Preprocess(
+  TfLiteStatus yes_status = GenerateSimpleFeatures(
       error_reporter, g_yes_30ms_sample_data, g_yes_30ms_sample_data_size,
       g_yes_power_spectrum_data_size, yes_calculated_data);
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, yes_status);
@@ -45,7 +45,7 @@ TF_LITE_MICRO_TEST(TestPreprocessor) {
   }
 
   uint8_t no_calculated_data[g_yes_power_spectrum_data_size];
-  TfLiteStatus no_status = Preprocess(
+  TfLiteStatus no_status = GenerateSimpleFeatures(
       error_reporter, g_no_30ms_sample_data, g_no_30ms_sample_data_size,
       g_no_power_spectrum_data_size, no_calculated_data);
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, no_status);
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4842f8dbd907dbbd73aab14c7767a8d64476b52d
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.cc
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h"
+
+const char* kCategoryLabels[kCategoryCount] = {
+    "silence",
+    "unknown",
+    "yes",
+    "no",
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h
similarity index 93%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h
index f48252d14d251673f0070e63dfa4169ca3a89025..d31d6b33622b3a15c90fab4c52d7452960a54930 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_MODEL_SETTINGS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_MODEL_SETTINGS_H_
 
 // Keeping these as constant expressions allow us to allocate fixed-sized arrays
 // on the stack for our working memory.
@@ -40,4 +40,4 @@ constexpr int kSilenceIndex = 0;
 constexpr int kUnknownIndex = 1;
 extern const char* kCategoryLabels[kCategoryCount];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_MODEL_SETTINGS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.cc
similarity index 99%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.cc
index 62e4359859a422c96ec368b6f91cba99e3c4a4eb..a14412edc941e8a7df0aef9dd66b79b1d9a1d7a6 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.cc
@@ -14,12 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 // Automatically created from a TensorFlow Lite flatbuffer using the command:
-// xxd -i tiny_conv.tflite > tiny_conv_model_data.cc
+// xxd -i tiny_conv.tflite > tiny_conv_simple_features_model_data.cc
 // See the README for a full description of the creation process.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h"
 
-const unsigned char g_tiny_conv_model_data[] = {
+const unsigned char g_tiny_conv_simple_features_model_data[] = {
     0x18, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x0e, 0x00,
     0x18, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00,
     0x0e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x4d, 0x00, 0x00,
@@ -1670,4 +1670,4 @@ const unsigned char g_tiny_conv_model_data[] = {
     0x04, 0x00, 0x00, 0x00, 0xfa, 0xff, 0xff, 0xff, 0x00, 0x19, 0x06, 0x00,
     0x06, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x09, 0x06, 0x00,
     0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04};
-const int g_tiny_conv_model_data_len = 19800;
+const int g_tiny_conv_simple_features_model_data_len = 19800;
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h
similarity index 74%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h
index a465dbfabf7cbba44473ae7e2ff94b1de2092b20..cadf7d0de754e032ae9ff77cdd8deec43bc03847 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h
@@ -16,12 +16,12 @@ limitations under the License.
 // This is a standard TensorFlow Lite model file that has been converted into a
 // C data array, so it can be easily compiled into a binary for devices that
 // don't have a file system. It was created using the command:
-// xxd -i tiny_conv.tflite > tiny_conv_model_data.cc
+// xxd -i tiny_conv.tflite > tiny_conv_simple_features_model_data.cc
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_TINY_CONV_SIMPLE_FEATURES_MODEL_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_TINY_CONV_SIMPLE_FEATURES_MODEL_DATA_H_
 
-extern const unsigned char g_tiny_conv_model_data[];
-extern const int g_tiny_conv_model_data_len;
+extern const unsigned char g_tiny_conv_simple_features_model_data[];
+extern const int g_tiny_conv_simple_features_model_data_len;
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_TINY_CONV_SIMPLE_FEATURES_MODEL_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.cc
similarity index 95%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.cc
index 9a34a2045a221e2eee8c51f23000e819b1638499..cd46408c0fb5c2c5dad12ae67c5456c8cb178b2d 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // See the header for documentation on the meaning of this data.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h"
 
 const uint8_t g_yes_power_spectrum_data[g_yes_power_spectrum_data_size] = {
     8, 89, 8, 0, 0, 0, 0, 0, 0, 0, 0, 4, 13, 1, 6, 23, 20, 6, 4, 0, 0, 0,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h
similarity index 90%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h
index 5c8c00ac1116dcbd7ad4aeda1828603e962c2001..77e52d58b54763ec8df46729ab6f8dd84086d59b 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h
@@ -18,12 +18,12 @@ limitations under the License.
 // This is the expected result of running the sample data in
 // yes_30ms_sample_data.cc through through the preprocessing pipeline.
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_POWER_SPECTRUM_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_POWER_SPECTRUM_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_POWER_SPECTRUM_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_POWER_SPECTRUM_DATA_H_
 
 #include <cstdint>
 
 constexpr int g_yes_power_spectrum_data_size = 43;
 extern const uint8_t g_yes_power_spectrum_data[];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_POWER_SPECTRUM_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_POWER_SPECTRUM_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.cc
similarity index 97%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.cc
index 2eb737fb8e1204a02f7ea4852016e85d03980bfd..2d660bb8b5c5b825eb48490699c89e5ba241369f 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.h"
 
 /* File automatically created by
  * tensorflow/examples/speech_commands/wav_to_features.py \
@@ -22,15 +22,15 @@ limitations under the License.
  * --window_size_ms=30 \
  * --window_stride_ms=20 \
  * --feature_bin_count=40 \
- * --quantize \
+ * --quantize=1 \
  * --preprocess="average" \
  * --input_wav="speech_commands_test_set_v0.02/yes/f2e59fea_nohash_1.wav" \
- * --output_c_file="yes_features_data.cc" \
+ * --output_c_file="yes_simple_features_data.cc" \
  */
 
-const int g_yes_f2e59fea_nohash_1_width = 43;
-const int g_yes_f2e59fea_nohash_1_height = 49;
-const unsigned char g_yes_f2e59fea_nohash_1_data[] = {
+const int g_yes_simple_f2e59fea_nohash_1_width = 43;
+const int g_yes_simple_f2e59fea_nohash_1_height = 49;
+const unsigned char g_yes_simple_f2e59fea_nohash_1_data[] = {
     0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
     0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
     0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.h
similarity index 73%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.h
index e2ee0c46cf13b00b310bd22b7ca1cb5a9751c6e6..87ea4a4aea89d02189bca9c37872e27b95672190 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_FEATURES_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_FEATURES_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_SIMPLE_FEATURES_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_SIMPLE_FEATURES_DATA_H_
 
-extern const int g_no_f9643d42_nohash_4_width;
-extern const int g_no_f9643d42_nohash_4_height;
-extern const unsigned char g_no_f9643d42_nohash_4_data[];
+extern const int g_yes_simple_f2e59fea_nohash_1_width;
+extern const int g_yes_simple_f2e59fea_nohash_1_height;
+extern const unsigned char g_yes_simple_f2e59fea_nohash_1_data[];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_FEATURES_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_SIMPLE_FEATURES_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e5f6ceb3f0b3935d084fa9463c72e98d4e0cad83
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc
@@ -0,0 +1,1800 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See the header for documentation on the meaning of this data.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h"
+
+const int g_yes_1000ms_sample_data_size = 16000;
+const int16_t g_yes_1000ms_sample_data[16000] = {
+    -7,     -12,    -18,    -20,    -20,    -21,    -21,    -25,    -29,
+    -31,    -31,    -30,    -30,    -29,    -30,    -30,    -29,    -28,
+    -24,    -22,    -17,    -12,    -8,     -7,     -6,     -1,     2,
+    5,      7,      8,      11,     15,     18,     19,     23,     24,
+    24,     27,     27,     26,     25,     28,     30,     32,     33,
+    31,     29,     27,     28,     30,     28,     26,     26,     24,
+    22,     17,     16,     15,     13,     10,     5,      0,      -4,
+    -4,     -7,     -9,     -12,    -14,    -14,    -13,    -11,    -10,
+    -8,     -6,     -3,     3,      7,      8,      12,     15,     18,
+    21,     19,     19,     21,     23,     24,     23,     22,     19,
+    17,     11,     5,      -3,     -12,    -22,    -28,    -35,    -45,
+    -54,    -62,    -69,    -76,    -84,    -92,    -100,   -109,   -116,
+    -117,   -120,   -120,   -120,   -122,   -124,   -126,   -123,   -121,
+    -116,   -113,   -107,   -97,    -88,    -75,    -61,    -50,    -41,
+    -27,    -12,    4,      21,     37,     58,     76,     93,     108,
+    121,    137,    156,    172,    184,    196,    205,    215,    224,
+    235,    242,    245,    242,    240,    238,    231,    223,    214,
+    205,    195,    178,    158,    135,    112,    90,     69,     46,
+    19,     -11,    -45,    -76,    -105,   -133,   -159,   -186,   -211,
+    -236,   -260,   -280,   -294,   -308,   -320,   -331,   -336,   -338,
+    -335,   -326,   -316,   -301,   -286,   -267,   -246,   -225,   -203,
+    -180,   -154,   -124,   -91,    -59,    -34,    -8,     19,     42,
+    64,     87,     103,    119,    134,    148,    162,    174,    182,
+    188,    190,    189,    187,    184,    180,    177,    171,    162,
+    154,    144,    137,    129,    118,    106,    95,     81,     69,
+    58,     48,     37,     26,     14,     3,      -7,     -22,    -31,
+    -42,    -52,    -62,    -69,    -75,    -79,    -82,    -87,    -88,
+    -92,    -94,    -91,    -87,    -85,    -81,    -74,    -70,    -64,
+    -55,    -47,    -40,    -33,    -25,    -19,    -12,    -6,     -4,
+    -1,     1,      1,      -2,     -9,     -15,    -17,    -18,    -20,
+    -22,    -22,    -26,    -31,    -33,    -35,    -31,    -26,    -17,
+    -4,     8,      19,     31,     44,     54,     64,     71,     79,
+    86,     92,     102,    109,    111,    109,    104,    96,     84,
+    70,     60,     51,     38,     27,     13,     4,      -3,     -9,
+    -13,    -18,    -26,    -33,    -32,    -27,    -20,    -10,    -4,
+    2,      6,      10,     14,     16,     21,     25,     29,     31,
+    33,     35,     37,     33,     22,     15,     13,     11,     12,
+    9,      5,      2,      1,      -3,     -9,     -17,    -27,    -32,
+    -35,    -36,    -36,    -42,    -50,    -56,    -66,    -77,    -85,
+    -96,    -100,   -106,   -113,   -118,   -121,   -119,   -117,   -119,
+    -122,   -124,   -123,   -112,   -94,    -77,    -64,    -51,    -37,
+    -22,    -3,     17,     37,     54,     68,     86,     100,    114,
+    134,    154,    167,    174,    178,    182,    189,    189,    187,
+    185,    179,    177,    174,    171,    157,    138,    123,    108,
+    94,     76,     50,     25,     6,      -8,     -20,    -37,    -59,
+    -86,    -110,   -132,   -147,   -159,   -169,   -178,   -191,   -203,
+    -213,   -217,   -215,   -208,   -199,   -194,   -195,   -190,   -178,
+    -165,   -155,   -144,   -134,   -123,   -103,   -80,    -56,    -35,
+    -18,    -4,     11,     23,     36,     50,     65,     78,     93,
+    111,    122,    129,    132,    131,    127,    125,    126,    126,
+    128,    127,    125,    122,    118,    111,    108,    104,    99,
+    93,     89,     90,     87,     82,     78,     75,     68,     65,
+    67,     69,     66,     61,     54,     39,     28,     15,     3,
+    -7,     -18,    -25,    -29,    -35,    -42,    -52,    -66,    -78,
+    -83,    -85,    -86,    -86,    -82,    -83,    -84,    -83,    -81,
+    -75,    -62,    -57,    -53,    -49,    -46,    -41,    -34,    -26,
+    -16,    -10,    -7,     -2,     2,      6,      12,     15,     19,
+    18,     15,     17,     21,     24,     30,     33,     27,     22,
+    21,     20,     23,     24,     21,     15,     13,     8,      3,
+    1,      -1,     -3,     -4,     -6,     -9,     -11,    -11,    -8,
+    -10,    -13,    -15,    -19,    -17,    -11,    -2,     1,      2,
+    6,      9,      10,     12,     13,     9,      8,      10,     13,
+    20,     18,     13,     10,     4,      1,      -2,     -6,     -11,
+    -13,    -16,    -18,    -15,    -18,    -21,    -21,    -22,    -23,
+    -25,    -23,    -22,    -20,    -19,    -16,    -12,    -10,    -9,
+    -11,    -15,    -19,    -22,    -19,    -14,    -11,    -9,     -11,
+    -17,    -20,    -18,    -19,    -15,    -11,    -8,     -2,     8,
+    19,     30,     36,     37,     36,     38,     45,     57,     69,
+    77,     81,     79,     75,     76,     74,     69,     66,     60,
+    53,     45,     36,     28,     22,     17,     10,     0,      -5,
+    -11,    -15,    -18,    -26,    -31,    -33,    -34,    -34,    -35,
+    -37,    -37,    -35,    -28,    -24,    -29,    -37,    -45,    -46,
+    -41,    -36,    -31,    -32,    -33,    -37,    -37,    -36,    -36,
+    -34,    -27,    -19,    -14,    -11,    -8,     -1,     6,      14,
+    19,     21,     25,     30,     34,     38,     38,     33,     26,
+    22,     19,     20,     18,     17,     15,     10,     2,      -3,
+    -5,     -10,    -13,    -13,    -13,    -16,    -16,    -16,    -15,
+    -13,    -14,    -13,    -16,    -19,    -20,    -18,    -17,    -18,
+    -16,    -16,    -24,    -28,    -28,    -28,    -23,    -21,    -21,
+    -20,    -24,    -27,    -23,    -18,    -14,    -7,     4,      11,
+    15,     19,     21,     25,     33,     39,     41,     45,     47,
+    50,     56,     58,     57,     59,     59,     55,     50,     47,
+    39,     34,     30,     24,     18,     11,     8,      3,      0,
+    -3,     -8,     -14,    -15,    -13,    -13,    -12,    -14,    -17,
+    -17,    -12,    -10,    -4,     -7,     -12,    -10,    -14,    -17,
+    -17,    -19,    -25,    -28,    -27,    -29,    -30,    -31,    -35,
+    -38,    -43,    -47,    -51,    -52,    -50,    -49,    -48,    -47,
+    -45,    -39,    -32,    -30,    -31,    -35,    -35,    -31,    -24,
+    -17,    -12,    -11,    -14,    -15,    -17,    -16,    -9,     -5,
+    -3,     -1,     0,      1,      0,      3,      12,     21,     26,
+    33,     35,     38,     45,     50,     53,     53,     54,     58,
+    61,     64,     69,     67,     66,     64,     58,     54,     51,
+    46,     44,     45,     41,     35,     31,     27,     25,     27,
+    25,     20,     13,     12,     16,     17,     17,     12,     7,
+    3,      2,      -2,     -4,     -8,     -14,    -19,    -25,    -29,
+    -38,    -49,    -60,    -69,    -73,    -71,    -74,    -82,    -89,
+    -98,    -103,   -104,   -103,   -99,    -98,    -98,    -98,    -99,
+    -97,    -94,    -91,    -85,    -82,    -78,    -74,    -74,    -71,
+    -68,    -61,    -54,    -52,    -47,    -41,    -36,    -32,    -21,
+    -12,    -3,     11,     26,     36,     44,     48,     55,     64,
+    77,     92,     100,    108,    117,    120,    122,    128,    130,
+    129,    130,    127,    124,    122,    121,    118,    114,    110,
+    102,    92,     85,     80,     77,     68,     55,     46,     39,
+    36,     34,     31,     27,     15,     5,      -1,     -5,     -11,
+    -20,    -29,    -37,    -43,    -46,    -47,    -54,    -61,    -65,
+    -74,    -82,    -84,    -91,    -94,    -96,    -104,   -109,   -111,
+    -111,   -112,   -113,   -111,   -112,   -110,   -104,   -99,    -96,
+    -93,    -89,    -87,    -81,    -71,    -63,    -54,    -45,    -43,
+    -37,    -30,    -24,    -17,    -12,    -8,     -2,     2,      15,
+    23,     28,     35,     41,     42,     44,     52,     58,     66,
+    74,     78,     80,     82,     85,     88,     90,     92,     92,
+    88,     87,     87,     79,     73,     69,     64,     62,     55,
+    50,     45,     41,     36,     29,     24,     20,     16,     12,
+    8,      5,      2,      1,      1,      0,      1,      -4,     -4,
+    -4,     -4,     -1,     1,      2,      1,      -3,     -6,     -1,
+    5,      6,      7,      8,      4,      2,      0,      -2,     -3,
+    0,      -3,     -4,     -3,     -4,     -5,     -8,     -15,    -20,
+    -25,    -28,    -32,    -37,    -38,    -39,    -43,    -48,    -55,
+    -62,    -69,    -75,    -75,    -78,    -81,    -83,    -89,    -89,
+    -92,    -91,    -91,    -89,    -83,    -81,    -74,    -66,    -63,
+    -54,    -45,    -39,    -31,    -23,    -15,    -4,     6,      14,
+    23,     29,     35,     41,     45,     49,     55,     61,     69,
+    75,     75,     76,     75,     74,     74,     73,     74,     72,
+    69,     69,     65,     62,     57,     52,     44,     35,     33,
+    29,     24,     14,     7,      3,      -4,     -12,    -17,    -20,
+    -22,    -27,    -32,    -34,    -39,    -42,    -43,    -42,    -43,
+    -40,    -38,    -36,    -36,    -37,    -36,    -33,    -31,    -27,
+    -24,    -23,    -22,    -17,    -11,    -7,     -7,     -7,     -3,
+    5,      13,     19,     25,     27,     25,     27,     35,     40,
+    40,     41,     45,     47,     50,     54,     52,     50,     45,
+    43,     44,     40,     34,     28,     24,     18,     11,     6,
+    -2,     -9,     -14,    -21,    -27,    -35,    -39,    -43,    -50,
+    -57,    -62,    -66,    -68,    -71,    -72,    -73,    -74,    -76,
+    -76,    -77,    -75,    -75,    -74,    -67,    -61,    -55,    -49,
+    -45,    -40,    -30,    -21,    -11,    -4,     4,      13,     23,
+    34,     44,     52,     59,     65,     70,     77,     84,     87,
+    88,     90,     91,     90,     89,     85,     80,     75,     72,
+    71,     64,     56,     48,     41,     34,     27,     21,     12,
+    1,      -11,    -19,    -28,    -33,    -39,    -46,    -50,    -53,
+    -58,    -63,    -66,    -71,    -73,    -76,    -76,    -74,    -73,
+    -71,    -67,    -65,    -62,    -60,    -55,    -51,    -45,    -39,
+    -35,    -31,    -27,    -20,    -13,    -6,     -3,     1,      8,
+    12,     18,     24,     26,     30,     35,     38,     44,     47,
+    47,     51,     53,     52,     53,     52,     50,     51,     49,
+    50,     51,     50,     48,     48,     45,     43,     42,     37,
+    34,     31,     31,     30,     26,     24,     21,     15,     12,
+    11,     7,      4,      1,      -3,     -5,     -7,     -9,     -15,
+    -21,    -26,    -28,    -31,    -35,    -39,    -46,    -48,    -49,
+    -53,    -58,    -63,    -67,    -69,    -71,    -72,    -74,    -75,
+    -77,    -77,    -73,    -72,    -69,    -65,    -60,    -55,    -50,
+    -47,    -43,    -38,    -30,    -25,    -20,    -12,    -4,     4,
+    9,      16,     20,     24,     28,     35,     43,     50,     58,
+    61,     65,     72,     74,     74,     76,     79,     78,     76,
+    78,     76,     76,     74,     70,     64,     59,     52,     46,
+    41,     33,     26,     19,     12,     5,      -2,     -8,     -15,
+    -20,    -26,    -31,    -37,    -39,    -41,    -44,    -44,    -47,
+    -51,    -52,    -52,    -48,    -45,    -46,    -48,    -45,    -42,
+    -40,    -36,    -32,    -27,    -24,    -22,    -18,    -16,    -11,
+    -10,    -5,     0,      3,      8,      11,     16,     18,     21,
+    23,     25,     26,     27,     28,     30,     31,     31,     30,
+    29,     27,     26,     23,     19,     17,     13,     10,     6,
+    0,      -2,     -5,     -10,    -12,    -15,    -19,    -23,    -26,
+    -29,    -30,    -30,    -32,    -33,    -34,    -35,    -34,    -31,
+    -29,    -29,    -28,    -28,    -23,    -19,    -17,    -12,    -12,
+    -10,    -5,     -2,     3,      7,      10,     13,     14,     19,
+    22,     26,     31,     34,     34,     35,     36,     39,     43,
+    45,     47,     47,     48,     49,     51,     48,     47,     50,
+    45,     41,     41,     38,     34,     34,     30,     23,     17,
+    11,     7,      4,      -4,     -9,     -15,    -23,    -28,    -32,
+    -35,    -39,    -45,    -46,    -49,    -53,    -52,    -53,    -55,
+    -56,    -56,    -55,    -54,    -53,    -53,    -51,    -47,    -44,
+    -42,    -40,    -37,    -33,    -28,    -25,    -23,    -18,    -15,
+    -8,     -6,     -2,     3,      8,      15,     18,     23,     26,
+    27,     32,     36,     36,     36,     39,     38,     38,     40,
+    39,     35,     31,     29,     25,     23,     19,     15,     11,
+    7,      5,      3,      1,      -1,     -6,     -8,     -7,     -10,
+    -9,     -10,    -11,    -10,    -7,     -6,     -8,     -6,     -5,
+    -4,     1,      2,      4,      7,      7,      9,      11,     11,
+    9,      9,      10,     11,     13,     17,     15,     15,     15,
+    17,     19,     17,     17,     17,     15,     15,     13,     11,
+    12,     8,      7,      5,      3,      0,      -4,     -4,     -6,
+    -9,     -12,    -14,    -15,    -15,    -16,    -20,    -19,    -20,
+    -20,    -20,    -18,    -18,    -21,    -22,    -21,    -21,    -23,
+    -20,    -20,    -23,    -24,    -23,    -25,    -25,    -25,    -25,
+    -26,    -24,    -23,    -23,    -23,    -23,    -22,    -19,    -18,
+    -15,    -14,    -10,    -8,     -4,     -1,     1,      3,      6,
+    8,      9,      14,     19,     22,     24,     26,     29,     32,
+    31,     34,     39,     42,     42,     46,     49,     50,     50,
+    52,     53,     52,     49,     49,     48,     48,     46,     45,
+    40,     34,     30,     25,     21,     17,     13,     10,     6,
+    2,      -4,     -9,     -12,    -15,    -18,    -21,    -26,    -28,
+    -31,    -32,    -33,    -35,    -35,    -38,    -37,    -36,    -34,
+    -35,    -35,    -33,    -33,    -34,    -30,    -26,    -27,    -25,
+    -23,    -22,    -18,    -15,    -16,    -12,    -9,     -9,     -6,
+    -1,     2,      3,      5,      8,      7,      9,      12,     15,
+    17,     18,     18,     19,     18,     20,     19,     18,     21,
+    20,     19,     18,     16,     15,     15,     15,     14,     12,
+    9,      9,      10,     8,      6,      4,      2,      1,      -1,
+    -3,     -1,     -3,     -2,     -4,     -5,     -5,     -8,     -8,
+    -10,    -10,    -8,     -8,     -8,     -7,     -8,     -8,     -8,
+    -9,     -11,    -12,    -11,    -9,     -7,     -8,     -8,     -8,
+    -10,    -8,     -7,     -8,     -7,     -6,     -7,     -5,     -3,
+    -3,     -3,     -3,     -2,     0,      3,      3,      5,      7,
+    10,     11,     10,     10,     12,     13,     16,     16,     16,
+    17,     15,     16,     17,     16,     14,     16,     13,     11,
+    11,     9,      9,      6,      4,      4,      3,      0,      -2,
+    -4,     -7,     -7,     -7,     -13,    -15,    -13,    -14,    -16,
+    -15,    -15,    -17,    -16,    -16,    -18,    -19,    -19,    -20,
+    -19,    -16,    -15,    -13,    -12,    -10,    -7,     -6,     -4,
+    -4,     -2,     0,      2,      6,      8,      10,     12,     14,
+    15,     14,     13,     13,     13,     15,     15,     17,     17,
+    17,     18,     17,     16,     15,     15,     14,     11,     9,
+    8,      8,      9,      8,      5,      5,      3,      -1,     -1,
+    -4,     -5,     -7,     -8,     -8,     -8,     -9,     -10,    -8,
+    -11,    -12,    -12,    -12,    -12,    -13,    -11,    -11,    -9,
+    -8,     -7,     -8,     -7,     -6,     -7,     -6,     -5,     -4,
+    -4,     -2,     -2,     -3,     -2,     -2,     -3,     0,      -1,
+    -3,     1,      1,      2,      4,      3,      5,      6,      3,
+    3,      4,      3,      3,      4,      5,      4,      6,      7,
+    7,      7,      6,      3,      3,      5,      3,      3,      6,
+    6,      7,      6,      4,      5,      2,      1,      1,      0,
+    0,      2,      1,      1,      1,      -1,     -2,     -3,     -5,
+    -4,     -5,     -4,     -4,     -6,     -4,     -4,     -4,     -5,
+    -6,     -5,     -6,     -5,     -4,     -5,     -4,     -3,     -4,
+    0,      2,      2,      2,      2,      2,      2,      3,      3,
+    5,      6,      6,      5,      6,      7,      6,      8,      6,
+    5,      5,      5,      6,      6,      6,      5,      5,      2,
+    2,      1,      2,      0,      -1,     -1,     -1,     -1,     0,
+    -1,     -4,     -6,     -8,     -8,     -9,     -8,     -7,     -6,
+    -5,     -5,     -6,     -3,     -4,     -5,     -4,     -7,     -6,
+    -4,     -2,     -1,     -1,     1,      1,      1,      1,      1,
+    2,      2,      1,      3,      4,      4,      6,      6,      6,
+    6,      4,      4,      4,      4,      3,      2,      2,      2,
+    2,      1,      1,      1,      0,      1,      1,      0,      -2,
+    -2,     -3,     -3,     -3,     -3,     -5,     -4,     -3,     -5,
+    -5,     -3,     -5,     -4,     -4,     -2,     -2,     -2,     -1,
+    -3,     -2,     -2,     -1,     -3,     -2,     -1,     -2,     -2,
+    -2,     0,      0,      0,      0,      0,      1,      0,      0,
+    1,      2,      3,      3,      3,      4,      5,      4,      3,
+    4,      5,      5,      7,      7,      6,      9,      8,      6,
+    7,      8,      6,      5,      7,      8,      8,      8,      7,
+    6,      5,      4,      4,      4,      5,      4,      2,      1,
+    2,      1,      0,      -2,     -3,     -2,     -4,     -6,     -6,
+    -7,     -7,     -8,     -9,     -9,     -9,     -9,     -9,     -9,
+    -9,     -10,    -10,    -10,    -8,     -7,     -8,     -6,     -5,
+    -4,     -3,     -5,     -2,     -2,     -2,     -1,     -1,     0,
+    1,      1,      2,      3,      2,      4,      3,      3,      5,
+    3,      3,      5,      4,      5,      6,      5,      4,      5,
+    3,      2,      2,      3,      4,      4,      4,      4,      4,
+    3,      4,      4,      4,      3,      2,      2,      2,      2,
+    2,      2,      2,      2,      1,      1,      1,      2,      1,
+    1,      2,      1,      1,      2,      1,      1,      1,      -1,
+    0,      1,      0,      -1,     1,      -1,     -1,     -1,     -2,
+    -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -2,
+    -1,     0,      -1,     -1,     1,      1,      2,      0,      -1,
+    0,      -1,     -1,     0,      0,      1,      2,      2,      2,
+    1,      1,      0,      0,      0,      0,      1,      1,      0,
+    0,      0,      0,      0,      -1,     -2,     -1,     -3,     -4,
+    -4,     -4,     -4,     -4,     -4,     -4,     -3,     -3,     -5,
+    -6,     -4,     -2,     -2,     -1,     -1,     -1,     -2,     1,
+    -1,     1,      0,      0,      1,      1,      1,      1,      2,
+    1,      2,      2,      3,      3,      3,      3,      4,      5,
+    5,      5,      5,      5,      5,      5,      5,      6,      6,
+    5,      5,      5,      6,      6,      5,      3,      6,      5,
+    4,      5,      3,      2,      2,      2,      2,      1,      1,
+    2,      0,      -1,     0,      -1,     -1,     -1,     -1,     -1,
+    -1,     -1,     -3,     -3,     -3,     -3,     -4,     -4,     -5,
+    -6,     -6,     -6,     -6,     -6,     -6,     -5,     -5,     -6,
+    -5,     -4,     -4,     -4,     -4,     -2,     -2,     -2,     -1,
+    -2,     0,      1,      0,      1,      3,      4,      4,      4,
+    4,      4,      4,      5,      4,      4,      4,      5,      7,
+    5,      4,      4,      4,      4,      3,      2,      2,      2,
+    2,      2,      0,      1,      1,      0,      1,      1,      -1,
+    0,      -1,     -2,     -1,     -3,     -4,     -4,     -3,     -5,
+    -5,     -5,     -5,     -5,     -5,     -4,     -3,     -3,     -2,
+    -3,     -2,     -2,     -5,     -3,     -3,     -3,     -2,     0,
+    1,      1,      1,      1,      1,      1,      1,      1,      3,
+    3,      4,      4,      4,      4,      5,      5,      2,      3,
+    4,      3,      5,      4,      3,      4,      3,      3,      5,
+    5,      3,      4,      2,      1,      1,      3,      4,      3,
+    1,      3,      2,      1,      2,      1,      0,      1,      0,
+    1,      0,      1,      1,      1,      1,      0,      -1,     0,
+    0,      -1,     -1,     -2,     -1,     -1,     -2,     0,      -1,
+    -2,     -1,     -1,     -2,     -2,     -1,     -3,     -3,     -3,
+    -3,     -3,     -4,     -3,     -5,     -6,     -4,     -4,     -5,
+    -4,     -3,     -5,     -6,     -4,     -5,     -6,     -4,     -3,
+    -5,     -4,     -3,     -4,     -3,     -2,     -2,     -2,     0,
+    0,      1,      1,      0,      0,      0,      1,      1,      3,
+    3,      3,      4,      3,      3,      3,      3,      3,      3,
+    3,      3,      3,      3,      3,      3,      3,      3,      3,
+    1,      1,      1,      1,      1,      1,      1,      0,      0,
+    0,      1,      -2,     -1,     1,      0,      -1,     -2,     -2,
+    0,      1,      0,      1,      1,      1,      1,      0,      0,
+    1,      0,      0,      2,      1,      0,      1,      1,      1,
+    1,      3,      3,      3,      4,      3,      3,      4,      2,
+    2,      2,      2,      2,      2,      2,      1,      2,      2,
+    2,      2,      -1,     -1,     -1,     -1,     -1,     -1,     -1,
+    -1,     -1,     -3,     -3,     -3,     -5,     -4,     -5,     -5,
+    -5,     -5,     -7,     -7,     -7,     -8,     -7,     -8,     -7,
+    -8,     -8,     -7,     -8,     -8,     -8,     -8,     -7,     -6,
+    -6,     -6,     -7,     -6,     -6,     -5,     -5,     -3,     -2,
+    -2,     -1,     0,      -1,     0,      1,      2,      2,      3,
+    3,      3,      6,      7,      7,      7,      8,      9,      8,
+    10,     10,     9,      10,     11,     9,      10,     12,     11,
+    10,     9,      9,      9,      9,      10,     9,      6,      6,
+    5,      5,      6,      3,      1,      1,      0,      1,      0,
+    0,      1,      -1,     -2,     -2,     -1,     -3,     -3,     -2,
+    -4,     -4,     -3,     -2,     -4,     -4,     -4,     -5,     -3,
+    -3,     -5,     -3,     -3,     -5,     -4,     -2,     -2,     -3,
+    -3,     -1,     0,      -1,     0,      0,      0,      -2,     -1,
+    0,      -1,     -2,     -2,     -2,     -2,     -1,     -3,     -2,
+    -3,     -4,     -3,     -3,     -3,     -3,     -3,     -3,     -3,
+    -2,     -4,     -6,     -5,     -3,     -2,     -4,     -3,     -2,
+    -4,     -4,     -4,     -3,     -4,     -5,     -4,     -5,     -3,
+    -2,     -5,     -2,     -4,     -4,     -3,     -2,     -1,     -1,
+    -1,     0,      2,      2,      1,      1,      3,      3,      3,
+    3,      4,      4,      5,      6,      5,      5,      6,      7,
+    7,      7,      8,      8,      7,      9,      9,      9,      9,
+    10,     9,      9,      9,      9,      9,      9,      8,      7,
+    9,      9,      6,      7,      5,      2,      3,      2,      1,
+    1,      0,      -2,     -2,     -2,     -3,     -3,     -2,     -2,
+    -4,     -5,     -4,     -4,     -4,     -4,     -5,     -4,     -4,
+    -5,     -4,     -5,     -4,     -5,     -6,     -4,     -4,     -5,
+    -5,     -5,     -5,     -6,     -4,     -4,     -4,     -3,     -2,
+    -3,     -3,     -2,     -2,     -1,     -2,     -3,     -1,     0,
+    -1,     0,      0,      0,      0,      1,      0,      0,      0,
+    0,      -1,     1,      1,      1,      0,      -2,     -2,     -3,
+    -3,     -4,     -4,     -6,     -7,     -5,     -4,     -5,     -5,
+    -4,     -6,     -8,     -7,     -6,     -5,     -5,     -5,     -4,
+    -4,     -5,     -4,     -3,     -3,     0,      0,      -2,     -1,
+    0,      0,      1,      1,      2,      2,      2,      2,      2,
+    4,      5,      5,      5,      6,      7,      7,      9,      10,
+    10,     10,     12,     12,     13,     14,     14,     14,     15,
+    15,     15,     15,     15,     15,     14,     15,     15,     12,
+    13,     13,     12,     10,     11,     11,     11,     10,     8,
+    6,      5,      7,      6,      6,      4,      3,      4,      5,
+    3,      2,      2,      1,      1,      2,      3,      1,      0,
+    0,      1,      0,      -2,     -1,     -2,     -3,     -3,     -3,
+    -3,     -4,     -6,     -8,     -9,     -9,     -10,    -12,    -14,
+    -15,    -18,    -21,    -21,    -21,    -21,    -22,    -24,    -26,
+    -26,    -27,    -27,    -28,    -26,    -25,    -26,    -28,    -27,
+    -24,    -23,    -23,    -24,    -21,    -17,    -17,    -15,    -12,
+    -12,    -12,    -12,    -9,     -7,     -6,     -5,     -3,     -3,
+    -2,     0,      0,      1,      3,      7,      6,      4,      6,
+    7,      8,      11,     10,     10,     13,     15,     14,     13,
+    18,     20,     18,     19,     21,     23,     24,     23,     22,
+    24,     26,     26,     26,     27,     25,     23,     25,     27,
+    28,     28,     28,     23,     19,     23,     24,     20,     20,
+    21,     15,     13,     15,     16,     14,     11,     8,      7,
+    8,      11,     11,     6,      4,      8,      7,      6,      7,
+    6,      4,      7,      13,     12,     7,      8,      8,      4,
+    1,      1,      1,      2,      -4,     -12,    -18,    -24,    -25,
+    -25,    -32,    -41,    -55,    -59,    -61,    -75,    -87,    -96,
+    -109,   -122,   -133,   -141,   -148,   -157,   -168,   -180,   -191,
+    -198,   -202,   -207,   -206,   -207,   -211,   -211,   -208,   -203,
+    -189,   -171,   -153,   -132,   -114,   -96,    -75,    -54,    -30,
+    -5,     19,     43,     61,     77,     93,     106,    123,    143,
+    161,    182,    198,    202,    201,    209,    229,    242,    240,
+    235,    239,    249,    258,    255,    242,    233,    245,    268,
+    278,    256,    223,    223,    253,    263,    235,    198,    178,
+    188,    215,    230,    200,    143,    113,    128,    158,    158,
+    128,    99,     90,     82,     70,     56,     32,     7,      14,
+    46,     36,     -23,    -71,    -76,    -54,    -36,    -39,    -74,
+    -118,   -134,   -122,   -101,   -104,   -129,   -164,   -174,   -129,
+    -86,    -109,   -184,   -219,   -191,   -147,   -141,   -183,   -249,
+    -290,   -269,   -236,   -266,   -346,   -394,   -366,   -325,   -353,
+    -431,   -472,   -406,   -313,   -316,   -398,   -449,   -401,   -287,
+    -194,   -164,   -193,   -245,   -212,   -55,    75,     67,     26,
+    67,     165,    237,    269,    293,    319,    333,    368,    414,
+    432,    463,    488,    448,    404,    391,    377,    361,    365,
+    376,    308,    197,    150,    129,    73,     53,     91,     43,
+    -107,   -165,   -54,    1,      -148,   -312,   -273,   -125,   -62,
+    -128,   -258,   -294,   -141,   70,     57,     -217,   -378,   -145,
+    198,    289,    169,    -47,    -219,   -101,   264,    458,    217,
+    -163,   -199,   13,     121,    101,    -51,    -293,   -319,   -62,
+    24,     -274,   -474,   -296,   -170,   -336,   -422,   -285,   -248,
+    -302,   -130,   98,     -11,    -257,   -146,   184,    278,    264,
+    331,    192,    -35,    235,    805,    830,    315,    82,     322,
+    503,    522,    619,    557,    242,    163,    399,    507,    489,
+    618,    602,    156,    -164,   112,    476,    406,    94,     -154,
+    -242,   -132,   56,     5,      -325,   -566,   -527,   -478,   -624,
+    -692,   -561,   -551,   -744,   -836,   -671,   -520,   -626,   -736,
+    -647,   -581,   -639,   -687,   -702,   -739,   -665,   -383,   -236,
+    -414,   -513,   -321,   -114,   -43,    32,     65,     -98,    -236,
+    34,     608,    924,    680,    218,    56,     329,    847,    1214,
+    1006,   341,    11,     340,    667,    553,    353,    355,    415,
+    416,    364,    257,    108,    6,      113,    293,    233,    46,
+    4,      25,     -10,    -12,    55,     40,     -65,    -56,    -26,
+    -101,   -61,    143,    229,    78,     -161,   -210,   103,    424,
+    377,    86,     -274,   -491,   -328,   -37,    60,     128,    188,
+    -105,   -625,   -823,   -464,   138,    389,    111,    -343,   -526,
+    -306,   13,     205,    250,    -35,    -554,   -764,   -498,   -42,
+    167,    -210,   -639,   -448,   -101,   -110,   -171,   -74,    -39,
+    47,     424,    616,    324,    98,     367,    853,    942,    416,
+    -184,   -130,   339,    472,    369,    239,    -165,   -418,   101,
+    742,    659,    325,    365,    476,    233,    -14,    270,    785,
+    719,    -29,    -533,   -220,   237,    305,    179,    -190,   -644,
+    -610,   -380,   -526,   -601,   -237,   48,     -36,    -124,   -49,
+    -6,     23,     117,    55,     -199,   -428,   -512,   -338,   -238,
+    -424,   -323,   -135,   -464,   -657,   -189,   100,    -379,   -964,
+    -893,   -346,   -64,    -322,   -650,   -480,   32,     238,    201,
+    386,    616,    611,    400,    195,    357,    842,    1051,   832,
+    712,    829,    1070,   1307,   1081,   551,    363,    544,    623,
+    239,    -374,   -609,   -230,   375,    486,    -52,    -446,   -270,
+    181,    645,    601,    -135,   -654,   -256,   567,    840,    380,
+    -54,    18,     334,    386,    21,     -214,   83,     243,    -316,
+    -937,   -1074,  -1006,  -896,   -674,   -424,   -331,   -354,   -380,
+    -481,   -392,   80,     358,    171,    -170,   -624,   -796,   -130,
+    706,    803,    381,    152,    367,    620,    685,    655,    347,
+    36,     180,    417,    412,    358,    288,    189,    150,    16,
+    -240,   -428,   -428,   -266,   -335,   -819,   -1150,  -946,   -587,
+    -437,   -580,   -961,   -1218,  -1065,  -704,   -431,   -350,   -315,
+    -214,   -162,   -81,    26,     -8,     -52,    -117,   -226,   -40,
+    285,    241,    -2,     -69,    57,     207,    81,     -144,   -69,
+    65,     84,     49,     -168,   -248,   126,    502,    472,    192,
+    120,    442,    667,    551,    512,    634,    814,    1014,   1098,
+    1156,   1112,   974,    1144,   1330,   1099,   825,    847,    877,
+    555,    2,      -243,   -102,   -196,   -471,   -377,   -235,   -439,
+    -622,   -547,   -470,   -495,   -431,   -197,   -21,    21,     -9,
+    -246,   -438,   -238,   -31,    0,      96,     137,    -25,    -211,
+    -181,   -149,   -350,   -368,   -33,    21,     -308,   -323,   32,
+    379,    605,    531,    85,     -374,   -367,   9,      277,    147,
+    -356,   -698,   -494,   -140,   -126,   -354,   -549,   -673,   -642,
+    -428,   -269,   -273,   -246,   -216,   -349,   -323,   -16,    32,
+    -387,   -742,   -662,   -434,   -223,   41,     140,    -58,    -227,
+    -80,    93,     20,     -166,   -360,   -536,   -555,   -305,   -33,
+    -23,    -86,    -75,    -9,     82,     -1,     -156,   24,     532,
+    916,    956,    835,    901,    1127,   1279,   1417,   1435,   1144,
+    822,    862,    1214,   1352,   1001,   611,    539,    532,    369,
+    189,    170,    308,    465,    430,    232,    64,     14,     51,
+    -37,    -244,   -321,   -276,   -144,   57,     77,     -215,   -467,
+    -335,   -186,   -245,   -133,   -81,    -588,   -1130,  -959,   -520,
+    -631,   -1122,  -1270,  -971,   -873,   -1118,  -1157,  -1078,  -1296,
+    -1365,  -1010,  -873,   -1138,  -1061,  -379,   89,     51,     177,
+    372,    185,    -14,    63,     197,    125,    -123,   -60,    243,
+    195,    88,     201,    115,    -63,    -12,    -79,    -492,   -751,
+    -489,   49,     163,    -293,   -424,   -52,    229,    302,    212,
+    217,    315,    70,     -207,   -210,   -173,   129,    619,    556,
+    213,    181,    170,    112,    167,    322,    451,    206,    -136,
+    58,     426,    526,    524,    394,    387,    568,    481,    297,
+    164,    8,      263,    664,    777,    943,    989,    934,    1283,
+    1495,   1153,   861,    738,    582,    614,    692,    655,    629,
+    432,    127,    -119,   -338,   -313,   -138,   -204,   -561,   -994,
+    -1168,  -948,   -700,   -658,   -788,   -1053,  -1027,  -684,   -566,
+    -528,   -355,   -335,   -323,   -28,    206,    87,     56,     387,
+    585,    296,    24,     261,    492,    248,    -132,   -469,   -674,
+    -502,   -235,   -255,   -517,   -847,   -1038,  -965,   -707,   -630,
+    -767,   -639,   -298,   -193,   -290,   -310,   -118,   74,     -77,
+    -337,   -324,   -120,   187,    323,    -72,    -552,   -454,   -14,
+    29,     -427,   -803,   -735,   -586,   -762,   -918,   -783,   -649,
+    -723,   -857,   -786,   -626,   -591,   -417,   -83,    167,    262,
+    49,     -161,   157,    842,    1298,   1356,   1206,   1041,   1194,
+    1461,   1323,   1070,   1221,   1687,   2051,   2002,   1673,   1464,
+    1550,   1851,   1907,   1531,   1327,   1399,   1342,   1287,   1264,
+    1152,   1030,   878,    716,    601,    454,    264,    264,    352,
+    151,    -193,   -296,   -161,   -93,    -215,   -423,   -617,   -668,
+    -547,   -416,   -464,   -807,   -1175,  -1174,  -1045,  -1076,  -1023,
+    -829,   -710,   -745,   -1069,  -1443,  -1417,  -1099,  -939,   -1165,
+    -1307,  -1056,  -843,   -638,   -304,   -190,   -334,   -578,   -770,
+    -705,   -675,   -947,   -957,   -565,   -437,   -617,   -843,   -1015,
+    -813,   -489,   -584,   -904,   -1054,  -797,   -229,   -26,    -208,
+    -66,    398,    710,    644,    390,    413,    726,    992,    1204,
+    1337,   1234,   1104,   1038,   1001,   1043,   982,    847,    885,
+    1024,   1098,   1138,   1108,   1038,   966,    885,    882,    878,
+    929,    1005,   944,    1008,   1284,   1415,   1289,   1007,   760,
+    812,    947,    806,    455,    111,    -72,    -290,   -611,   -626,
+    -559,   -765,   -1034,  -1375,  -1632,  -1565,  -1588,  -1728,  -1585,
+    -1477,  -1547,  -1533,  -1371,  -1103,  -995,   -1090,  -1102,  -947,
+    -686,   -403,   -295,   -250,   -107,   -86,    -171,   -150,   12,
+    234,    283,    185,    300,    461,    393,    382,    434,    378,
+    306,    202,    195,    253,    -8,     -307,   -105,   264,    342,
+    212,    34,     -57,    78,     435,    571,    180,    -165,   -51,
+    339,    705,    683,    464,    658,    958,    825,    579,    465,
+    390,    241,    61,     202,    429,    128,    -122,   241,    406,
+    39,     -167,   -60,    15,     -31,    -68,    146,    402,    344,
+    227,    208,    87,     -25,    -31,    -66,    -169,   -249,   -87,
+    75,     -181,   -438,   -249,   49,     87,     -40,    -16,    53,
+    -86,    -74,    98,     78,     110,    169,    -84,    -323,   -251,
+    -102,   -172,   -513,   -750,   -675,   -568,   -587,   -583,   -523,
+    -450,   -302,   -245,   -356,   -480,   -590,   -495,   -183,   -105,
+    -191,   -215,   -308,   -206,   39,     4,      -77,    -21,    74,
+    186,    218,    356,    611,    489,    83,     13,     246,    371,
+    348,    240,    61,     -66,    -107,   -170,   -205,   -74,    200,
+    277,    45,     -11,    180,    263,    100,    -74,    102,    246,
+    6,      -154,   -162,   -197,   -128,   -189,   -227,   -49,    -238,
+    -490,   -333,   -188,   1,      215,    150,    144,    128,    -33,
+    187,    532,    676,    911,    773,    283,    351,    673,    620,
+    349,    105,    205,    425,    325,    295,    372,    340,    511,
+    628,    394,    224,    187,    91,     -174,   -556,   -482,   -37,
+    -9,     -226,   -382,   -568,   -466,   -208,   -241,   -426,   -656,
+    -814,   -788,   -902,   -1065,  -946,   -860,   -896,   -831,   -744,
+    -672,   -685,   -743,   -723,   -783,   -813,   -570,   -341,   -239,
+    -57,    137,    348,    576,    593,    454,    429,    503,    449,
+    238,    173,    350,    423,    419,    530,    501,    272,    156,
+    207,    295,    404,    568,    676,    419,    30,     113,    463,
+    550,    473,    349,    126,    33,     144,    207,    193,    267,
+    304,    81,     -252,   -401,   -368,   -347,   -404,   -452,   -408,
+    -272,   -40,    234,    281,    48,     -72,    -18,    54,     208,
+    309,    285,    245,    164,    38,     -20,    148,    430,    563,
+    655,    679,    453,    300,    319,    219,    25,     -15,    54,
+    -117,   -444,   -431,   -135,   -147,   -468,   -667,   -722,   -593,
+    -301,   -217,   -428,   -642,   -598,   -400,   -422,   -602,   -628,
+    -554,   -509,   -501,   -541,   -488,   -250,   -129,   -284,   -441,
+    -358,   -161,   -82,    4,      134,    157,    290,    516,    582,
+    702,    859,    871,    858,    759,    615,    616,    754,    839,
+    725,    464,    259,    187,    127,    150,    280,    238,    92,
+    78,     5,      -86,    6,      67,     -14,    -92,    -143,   -211,
+    -89,    213,    300,    107,    -91,    -154,   -153,   -238,   -355,
+    -314,   -227,   -168,   -92,    -142,   -219,   -156,   -47,    53,
+    -15,    -195,   -161,   -186,   -382,   -395,   -297,   -238,   -240,
+    -390,   -502,   -336,   -97,    -29,    -116,   -290,   -289,   -67,
+    74,     112,    119,    182,    358,    382,    315,    341,    290,
+    218,    190,    101,    -51,    -168,   -132,   -41,    -39,    -15,
+    104,    186,    151,    68,     89,     154,    67,     10,     143,
+    120,    -185,   -382,   -365,   -263,   -145,   -111,   -159,   -190,
+    -53,    151,    177,    179,    384,    553,    502,    490,    572,
+    600,    573,    442,    119,    -212,   -260,   -166,   -318,   -506,
+    -413,   -279,   -285,   -354,   -390,   -278,   -142,   -85,    -18,
+    -19,    -121,   -143,   -32,    88,     118,    42,     -96,    -187,
+    -167,   -113,   -172,   -270,   -256,   -178,   -192,   -249,   -128,
+    103,    132,    -47,    -147,   -104,   -56,    -9,     45,     35,
+    109,    315,    381,    326,    336,    457,    667,    786,    675,
+    489,    460,    569,    595,    470,    303,    272,    448,    620,
+    545,    226,    -92,    -128,   91,     172,    -98,    -385,   -378,
+    -264,   -284,   -362,   -314,   -148,   -72,    -198,   -350,   -353,
+    -344,   -389,   -353,   -292,   -327,   -413,   -473,   -519,   -588,
+    -577,   -546,   -737,   -989,   -1030,  -997,   -1010,  -861,   -683,
+    -731,   -690,   -419,   -197,   -47,    112,    167,    74,     41,
+    176,    309,    438,    671,    781,    793,    868,    904,    991,
+    1099,   987,    812,    816,    869,    766,    605,    633,    728,
+    592,    424,    460,    405,    170,    75,     30,     -105,   -58,
+    63,     -58,    -242,   -359,   -415,   -255,   -44,    -127,   -266,
+    -191,   -187,   -296,   -273,   -260,   -341,   -345,   -324,   -384,
+    -467,   -421,   -233,   -125,   -227,   -341,   -256,   -168,   -217,
+    -249,   -302,   -447,   -425,   -274,   -289,   -299,   -229,   -275,
+    -272,   -103,   -57,    -117,   -106,   -162,   -256,   -184,   -31,
+    51,     69,     31,     -19,    72,     256,    318,    331,    254,
+    28,     -7,     121,    48,     -64,    58,     183,    152,    161,
+    201,    167,    190,    287,    278,    157,    56,     103,    332,
+    460,    299,    166,    238,    308,    374,    508,    509,    373,
+    275,    270,    298,    229,    185,    192,    23,     -160,   -80,
+    67,     31,     -170,   -378,   -384,   -330,   -500,   -648,   -615,
+    -686,   -716,   -510,   -510,   -771,   -752,   -475,   -434,   -556,
+    -480,   -403,   -515,   -464,   -255,   -177,   -105,   29,     95,
+    152,    210,    190,    180,    279,    408,    325,    225,    462,
+    607,    537,    759,    1022,   973,    945,    964,    846,    818,
+    952,    907,    584,    313,    302,    428,    533,    479,    260,
+    178,    262,    185,    18,     -77,    -263,   -370,   -208,   -240,
+    -589,   -739,   -572,   -444,   -405,   -357,   -475,   -738,   -771,
+    -542,   -441,   -529,   -651,   -803,   -823,   -556,   -285,   -227,
+    -233,   -202,   -168,   -110,   -78,    -220,   -302,   -56,    129,
+    -60,    -149,   54,     130,    169,    324,    231,    24,     89,
+    269,    320,    262,    231,    225,    138,    67,     153,    310,
+    399,    269,    -21,    -197,   -183,   -59,    144,    234,    -13,
+    -274,   -168,   32,     -37,    -277,   -417,   -441,   -416,   -324,
+    -312,   -467,   -540,   -373,   -166,   -161,   -297,   -365,   -341,
+    -246,   -69,    81,     99,     -3,     11,     305,    540,    449,
+    394,    586,    667,    606,    685,    665,    425,    410,    585,
+    509,    360,    424,    538,    583,    482,    250,    159,    310,
+    423,    217,    -131,   -280,   -204,   -51,    -12,    -204,   -338,
+    -232,   -143,   -201,   -306,   -374,   -336,   -229,   -257,   -453,
+    -576,   -497,   -379,   -326,   -302,   -372,   -504,   -453,   -229,
+    -133,   -226,   -328,   -326,   -261,   -151,   -6,     97,     143,
+    164,    143,    138,    267,    433,    500,    470,    297,    143,
+    279,    504,    556,    475,    333,    233,    225,    228,    198,
+    128,    24,     -17,    4,      -55,    -187,   -251,   -213,   -119,
+    -94,    -214,   -357,   -349,   -246,   -195,   -183,   -261,   -440,
+    -533,   -476,   -341,   -213,   -170,   -220,   -299,   -220,   -8,
+    51,     -11,    19,     172,    292,    189,    9,      -6,     102,
+    238,    384,    477,    448,    353,    304,    354,    473,    543,
+    400,    229,    275,    380,    425,    415,    371,    398,    460,
+    377,    202,    154,    199,    110,    -123,   -365,   -524,   -524,
+    -360,   -134,   -47,    -182,   -348,   -453,   -542,   -503,   -376,
+    -398,   -521,   -595,   -621,   -560,   -439,   -284,   -115,   -80,
+    -123,   -57,    28,     -15,    -60,    -9,     47,     119,    203,
+    288,    435,    571,    635,    706,    750,    627,    436,    345,
+    330,    398,    460,    368,    213,    127,    140,    215,    202,
+    58,     -99,    -244,   -387,   -470,   -527,   -637,   -754,   -791,
+    -768,   -742,   -739,   -735,   -704,   -649,   -552,   -479,   -491,
+    -494,   -454,   -433,   -422,   -398,   -315,   -115,   75,     175,
+    244,    307,    360,    398,    460,    532,    529,    446,    422,
+    497,    541,    504,    541,    702,    803,    744,    645,    621,
+    727,    877,    873,    734,    593,    513,    523,    516,    412,
+    336,    334,    274,    199,    163,    123,    125,    117,    107,
+    140,    72,     -73,    -114,   -68,    -15,    13,     -122,   -338,
+    -367,   -325,   -386,   -497,   -608,   -634,   -546,   -477,   -427,
+    -377,   -412,   -464,   -436,   -343,   -276,   -327,   -390,   -313,
+    -149,   -17,    2,      -93,    -146,   -104,   -76,    -87,    -131,
+    -224,   -280,   -194,   -46,    12,     -76,    -189,   -151,   18,
+    160,    200,    99,     -81,    -149,   -95,    -31,    -6,     -45,
+    -93,    -97,    -71,    0,      73,     34,     -82,    -129,   -102,
+    -84,    -96,    -107,   -69,    -5,     6,      18,     48,     35,
+    27,     32,     -4,     -71,    -30,    119,    205,    266,    352,
+    325,    237,    282,    352,    358,    342,    265,    203,    200,
+    159,    120,    159,    195,    185,    133,    37,     20,     152,
+    312,    363,    316,    255,    251,    259,    211,    160,    86,
+    -4,     -30,    -79,    -154,   -213,   -271,   -243,   -146,   -147,
+    -211,   -283,   -319,   -219,   -157,   -207,   -237,   -252,   -245,
+    -136,   0,      42,     -22,    -108,   -82,    34,     130,    179,
+    152,    98,     105,    110,    116,    180,    175,    66,     -9,
+    -9,     36,     82,     75,     12,     -39,    -14,    23,     1,
+    12,     31,     -61,    -155,   -184,   -158,   -86,    -60,    -67,
+    -63,    -84,    -100,   -81,    -115,   -171,   -157,   -150,   -179,
+    -191,   -209,   -245,   -217,   -128,   -54,    -42,    -73,    -100,
+    -88,    -10,    104,    199,    249,    227,    201,    204,    151,
+    83,     75,     87,     84,     67,     34,     18,     44,     110,
+    218,    275,    232,    190,    209,    263,    294,    256,    174,
+    108,    37,     -54,    -110,   -129,   -179,   -293,   -360,   -339,
+    -282,   -190,   -135,   -188,   -239,   -234,   -227,   -182,   -127,
+    -89,    -51,    -73,    -136,   -151,   -85,    0,      72,     129,
+    122,    65,     44,     103,    202,    272,    252,    170,    148,
+    167,    152,    130,    127,    79,     14,     70,     157,    142,
+    109,    70,     -25,    -57,    -6,     46,     98,     135,    135,
+    82,     16,     10,     68,     87,     -20,    -120,   -116,   -98,
+    -102,   -129,   -204,   -271,   -282,   -252,   -216,   -215,   -221,
+    -156,   -70,    -66,    -120,   -156,   -146,   -126,   -84,    -15,
+    -21,    -76,    -8,     131,    146,    86,     42,     12,     44,
+    110,    169,    171,    91,     68,     173,    262,    248,    160,
+    36,     -90,    -109,   -24,    -12,    -57,    -64,    -78,    -89,
+    -75,    -87,    -101,   -82,    -72,    -76,    -81,    -63,    -34,
+    -4,     61,     87,     46,     23,     -1,     -8,     40,     63,
+    46,     45,     39,     14,     -11,    -25,    -16,    36,     78,
+    85,     110,    120,    132,    189,    228,    217,    154,    89,
+    57,     14,     -14,    -6,     0,      13,     8,      -50,    -68,
+    -60,    -107,   -140,   -126,   -122,   -151,   -147,   -118,   -105,
+    -85,    -83,    -100,   -139,   -195,   -194,   -168,   -183,   -173,
+    -148,   -166,   -168,   -123,   -59,    -11,    20,     64,     98,
+    80,     58,     83,     111,    143,    176,    171,    152,    146,
+    165,    174,    143,    93,     30,     5,      21,     42,     35,
+    -37,    -94,    -61,    -12,    -5,     -27,    -58,    -85,    -81,
+    -11,    79,     65,     -14,    -17,    15,     -4,     -2,     39,
+    20,     -29,    -19,    3,      -11,    -39,    -62,    -43,    -34,
+    -60,    -77,    -119,   -163,   -128,   -5,     87,     73,     51,
+    116,    189,    217,    240,    234,    177,    192,    295,    344,
+    313,    263,    236,    240,    230,    179,    99,     19,     -25,
+    -16,    -9,     -35,    -66,    -53,    -16,    -40,    -70,    -81,
+    -102,   -86,    -87,    -156,   -225,   -228,   -145,   -52,    -22,
+    -57,    -171,   -255,   -247,   -208,   -165,   -187,   -242,   -275,
+    -261,   -168,   -75,    -13,    8,      -62,    -125,   -136,   -133,
+    -81,    -11,    -17,    -80,    -115,   -103,   -27,    71,     134,
+    137,    44,     -48,    -24,    69,     156,    194,    175,    112,
+    55,     54,     101,    148,    157,    142,    100,    44,     27,
+    63,     106,    107,    89,     67,     37,     17,     30,     63,
+    69,     61,     21,     -37,    -55,    -72,    -53,    -26,    -53,
+    -77,    -87,    -109,   -119,   -80,    -36,    -29,    -38,    -48,
+    -57,    -65,    -16,    52,     83,     83,     24,     -27,    -14,
+    9,      27,     52,     50,     45,     90,     132,    117,    75,
+    16,     -1,     60,     95,     55,     25,     26,     20,     61,
+    119,    89,     1,      -61,    -68,    -46,    -36,    -40,    -39,
+    -49,    -58,    -16,    30,     13,     -12,    18,     35,     6,
+    3,      30,     22,     25,     52,     32,     12,     9,      -5,
+    -16,    -25,    -33,    -38,    -44,    -76,    -118,   -118,   -96,
+    -54,    -3,     9,      -31,    -82,    -84,    -35,    18,     25,
+    -26,    -72,    -48,    8,      25,     8,      -20,    -66,    -105,
+    -102,   -80,    -73,    -79,    -80,    -70,    -59,    -55,    -82,
+    -113,   -85,    -51,    -59,    -57,    -38,    -13,    -7,     -18,
+    -6,     20,     51,     55,     18,     -8,     -7,     24,     78,
+    119,    137,    135,    139,    153,    144,    155,    179,    166,
+    128,    56,     8,      38,     85,     94,     72,     20,     -32,
+    -9,     25,     17,     -15,    -84,    -123,   -106,   -82,    -62,
+    -60,    -43,    -4,     -12,    -45,    -68,    -108,   -100,   -47,
+    -49,    -64,    -50,    -9,     37,     59,     68,     62,     53,
+    49,     25,     13,     32,     40,     60,     109,    82,     18,
+    10,     -1,     21,     102,    111,    40,     -10,    -9,     20,
+    31,     0,      -51,    -108,   -135,   -89,    -21,    1,      -54,
+    -125,   -129,   -113,   -144,   -205,   -227,   -167,   -118,   -114,
+    -100,   -71,    5,      34,     -51,    -119,   -120,   -72,    10,
+    56,     51,     58,     65,     98,     135,    84,     20,     -3,
+    -1,     57,     135,    137,    90,     88,     107,    102,    45,
+    -4,     9,      48,     95,     99,     65,     42,     44,     78,
+    80,     29,     11,     39,     27,     0,      7,      19,     10,
+    -45,    -99,    -86,    -77,    -74,    -57,    -74,    -84,    -92,
+    -134,   -114,   -65,    -73,    -76,    -96,    -105,   -50,    -31,
+    -17,    17,     9,      18,     62,     75,     55,     63,     76,
+    61,     61,     80,     103,    107,    110,    131,    134,    120,
+    94,     66,     70,     78,     59,     52,     57,     53,     72,
+    76,     31,     -18,    -53,    -57,    -35,    -17,    -9,     -27,
+    -34,    -7,     -17,    -26,    -13,    -60,    -86,    -53,    -42,
+    -36,    -36,    -46,    -13,    19,     -16,    -47,    -15,    11,
+    -9,     -18,    -26,    -24,    14,     8,      -53,    -54,    15,
+    43,     15,     -9,     -5,     5,      -12,    -40,    -57,    -74,
+    -94,    -105,   -91,    -20,    30,     -10,    -50,    -58,    -52,
+    -42,    -47,    -54,    -61,    -83,    -64,    -30,    -3,     31,
+    9,      -35,    -43,    -31,    6,      50,     54,     55,     67,
+    53,     43,     30,     27,     62,     37,     -26,    -52,    -54,
+    -29,    3,      -12,    -23,    11,     26,     23,     31,     57,
+    66,     46,     32,     35,     83,     124,    111,    124,    157,
+    143,    101,    80,     60,     27,     11,     21,     22,     9,
+    -4,     -26,    -41,    -35,    -50,    -103,   -138,   -116,   -90,
+    -89,    -90,    -79,    -74,    -58,    -18,    -12,    -29,    -36,
+    -17,    22,     30,     -1,     -8,     8,      10,     19,     31,
+    36,     38,     41,     28,     -7,     -14,    -6,     -20,    -30,
+    -11,    -2,     -9,     0,      25,     56,     78,     68,     40,
+    34,     47,     50,     40,     37,     26,     28,     53,     61,
+    57,     25,     -35,    -75,    -65,    -48,    -65,    -81,    -67,
+    -53,    -41,    3,      19,     -3,     -9,     -2,     -1,     -24,
+    -36,    -23,    -26,    -29,    -9,     0,      -15,    -17,    -9,
+    12,     50,     45,     14,     19,     37,     24,     9,      16,
+    13,     -16,    -19,    3,      -3,     -12,    -10,    -23,    -43,
+    -47,    -38,    -46,    -44,    -7,     3,      -19,    -13,    -26,
+    -52,    -29,    -19,    -32,    0,      11,     -26,    -24,    -20,
+    -41,    -30,    -24,    -53,    -67,    -26,    23,     20,     9,
+    6,      -8,     3,      16,     7,      3,      -5,     2,      33,
+    53,     72,     94,     86,     69,     96,     118,    95,     91,
+    78,     32,     26,     48,     48,     37,     21,     7,      -6,
+    -8,     8,      1,      -17,    -2,     18,     1,      -28,    -51,
+    -84,    -93,    -74,    -46,    -18,    -19,    -31,    -10,    10,
+    10,     7,      -5,     -30,    -39,    -28,    -9,     10,     17,
+    11,     14,     20,     -1,     2,      18,     7,      15,     40,
+    40,     32,     27,     23,     31,     43,     33,     7,      -3,
+    18,     51,     53,     31,     21,     14,     16,     14,     4,
+    11,     16,     1,      -24,    -38,    -33,    -27,    -50,    -74,
+    -70,    -60,    -54,    -44,    -22,    -22,    -43,    -33,    -16,
+    -35,    -36,    -18,    -27,    -42,    -46,    -36,    -17,    -15,
+    -22,    -21,    -20,    -2,     15,     12,     22,     27,     22,
+    41,     57,     60,     63,     54,     56,     65,     62,     68,
+    58,     34,     53,     70,     58,     60,     51,     33,     41,
+    39,     16,     -3,     -16,    -18,    -15,    -18,    -32,    -76,
+    -85,    -62,    -82,    -87,    -68,    -84,    -75,    -40,    -48,
+    -55,    -45,    -42,    -24,    -14,    -1,     27,     23,     -1,
+    -2,     12,     15,     32,     55,     52,     55,     82,     81,
+    58,     62,     59,     37,     24,     20,     17,     18,     19,
+    15,     14,     5,      -18,    -27,    -20,    -19,    -34,    -39,
+    -29,    -30,    -27,    -27,    -48,    -52,    -54,    -77,    -48,
+    -18,    -36,    -34,    -13,    -21,    -38,    -28,    -15,    -7,
+    -6,     -20,    -18,    2,      4,      -11,    -5,     7,      1,
+    1,      12,     -2,     -17,    7,      15,     2,      15,     34,
+    48,     78,     94,     82,     66,     66,     64,     47,     44,
+    57,     64,     74,     65,     34,     26,     31,     32,     33,
+    18,     5,      -1,     -18,    -22,    -31,    -54,    -37,    -32,
+    -74,    -89,    -77,    -73,    -65,    -72,    -75,    -39,    -21,
+    -31,    -31,    -24,    -19,    -8,     -4,     7,      26,     22,
+    15,     13,     11,     28,     47,     42,     35,     28,     5,
+    18,     55,     55,     45,     44,     18,     9,      18,     -2,
+    -5,     6,      -15,    -16,    -12,    -20,    -4,     4,      -15,
+    -18,    -10,    -5,     -2,     -16,    -24,    -14,    -7,     -14,
+    -33,    -33,    -20,    -17,    -17,    -18,    -30,    -37,    -35,
+    -34,    -13,    -3,     -28,    -28,    -10,    -21,    -17,    -4,
+    -12,    -16,    -20,    -27,    -16,    -8,     -4,     14,     24,
+    11,     17,     30,     27,     14,     7,      28,     30,     22,
+    45,     47,     23,     31,     23,     -5,     10,     17,     -5,
+    2,      15,     9,      20,     29,     11,     -9,     -8,     8,
+    10,     -1,     -14,    -30,    -30,    -8,     -9,     -20,    -17,
+    -17,    -12,    1,      6,      -7,     -18,    -6,     10,     -6,
+    -7,     29,     35,     21,     16,     9,      25,     44,     26,
+    21,     34,     28,     40,     41,     9,      -2,     1,      12,
+    34,     18,     -12,    -10,    -16,    -29,    -24,    -25,    -20,
+    -17,    -35,    -29,    -12,    -29,    -39,    -32,    -30,    -17,
+    -12,    -28,    -20,    -5,     -4,     7,      14,     10,     3,
+    -3,     0,      19,     27,     4,      -21,    -18,    -7,     -4,
+    0,      1,      -6,     -17,    -30,    -24,    -11,    -9,     0,
+    -1,     0,      -3,     -12,    1,      15,     -2,     3,      16,
+    -3,     -8,     7,      3,      13,     32,     23,     10,     -6,
+    -11,    8,      4,      -12,    -9,     3,      12,     -2,     -31,
+    -36,    -33,    -37,    -17,    -5,     -20,    -14,    4,      5,
+    4,      6,      17,     31,     27,     23,     16,     -1,     -4,
+    15,     24,     21,     18,     7,      -7,     -14,    18,     41,
+    25,     14,     13,     2,      5,      12,     8,      15,     10,
+    2,      13,     10,     3,      5,      -1,     0,      11,     10,
+    6,      2,      7,      10,     -4,     -3,     2,      -13,    -4,
+    14,     -4,     -17,    -11,    -4,     8,      3,      -8,     -1,
+    -7,     -20,    -4,     23,     23,     8,      5,      24,     21,
+    -5,     -2,     7,      -9,     -15,    -8,     -6,     6,      2,
+    -26,    -19,    1,      -19,    -31,    -27,    -34,    -41,    -47,
+    -39,    -12,    -12,    -29,    -32,    -41,    -36,    -26,    -36,
+    -35,    -33,    -29,    -1,     5,      -13,    -21,    -21,    -3,
+    12,     1,      -7,     -1,     2,      12,     9,      -1,     15,
+    21,     18,     25,     4,      -13,    5,      12,     16,     33,
+    33,     19,     21,     26,     30,     30,     24,     23,     19,
+    22,     34,     39,     28,     15,     14,     24,     24,     18,
+    12,     10,     4,      8,      28,     29,     2,      -7,     6,
+    8,      10,     2,      -13,    -8,     -2,     0,      12,     13,
+    -1,     3,      21,     26,     24,     17,     11,     15,     19,
+    19,     19,     11,     1,      3,      3,      0,      -5,     -11,
+    -16,    -26,    -18,    3,      -5,     -17,    2,      10,     6,
+    6,      -8,     -11,    4,      -3,     -17,    -10,    -17,    -37,
+    -31,    -17,    -26,    -37,    -42,    -53,    -49,    -34,    -40,
+    -39,    -21,    -17,    -23,    -23,    -25,    -30,    -24,    -13,
+    -10,    -10,    1,      1,      -7,     7,      19,     11,     4,
+    -3,     -8,     1,      6,      7,      25,     22,     -5,     3,
+    20,     7,      -1,     14,     17,     18,     20,     12,     25,
+    41,     23,     19,     37,     39,     21,     17,     23,     17,
+    6,      9,      15,     4,      -15,    -8,     8,      7,      1,
+    -12,    -18,    -14,    -15,    -10,    0,      -3,     3,      13,
+    -8,     -21,    -8,     -26,    -29,    -1,     -9,     -24,    -19,
+    -22,    -24,    -18,    -25,    -27,    -28,    -34,    -26,    -9,
+    -14,    -14,    -8,     -8,     -5,     4,      4,      -10,    -12,
+    -7,     -8,     -10,    -15,    -19,    -10,    -5,     -9,     -9,
+    -19,    -33,    -27,    -14,    -15,    -14,    -16,    -25,    -10,
+    5,      -7,     -11,    2,      3,      7,      17,     28,     33,
+    32,     33,     39,     49,     57,     63,     62,     64,     67,
+    59,     55,     67,     71,     58,     53,     53,     44,     38,
+    44,     51,     51,     45,     35,     34,     46,     55,     48,
+    36,     21,     3,      -5,     2,      7,      0,      -17,    -30,
+    -34,    -48,    -62,    -64,    -66,    -66,    -62,    -79,    -90,
+    -85,    -88,    -88,    -85,    -88,    -103,   -112,   -112,   -102,
+    -99,    -102,   -103,   -110,   -100,   -80,    -60,    -57,    -68,
+    -59,    -45,    -35,    -6,     9,      -3,     2,      32,     45,
+    48,     51,     40,     51,     78,     85,     83,     87,     94,
+    101,    104,    105,    100,    86,     82,     96,     102,    96,
+    85,     68,     63,     65,     55,     50,     46,     28,     32,
+    43,     33,     30,     27,     8,      18,     36,     27,     20,
+    13,     -14,    -19,    8,      12,     0,      -1,     -12,    -24,
+    -20,    -27,    -39,    -39,    -39,    -44,    -38,    -32,    -42,
+    -38,    -33,    -43,    -55,    -57,    -60,    -61,    -56,    -57,
+    -55,    -43,    -46,    -58,    -55,    -50,    -50,    -51,    -48,
+    -46,    -44,    -36,    -26,    -20,    -13,    -11,    -8,     1,
+    5,      0,      8,      21,     31,     42,     39,     43,     56,
+    48,     37,     45,     45,     47,     52,     46,     40,     26,
+    18,     28,     30,     22,     14,     0,      -3,     8,      0,
+    -7,     0,      -10,    -13,    -9,     -13,    -13,    -18,    -33,
+    -32,    -26,    -37,    -41,    -32,    -26,    -30,    -34,    -31,
+    -38,    -40,    -24,    -25,    -29,    -15,    -18,    -23,    -4,
+    2,      -7,     0,      5,      10,     22,     23,     25,     31,
+    33,     37,     38,     39,     43,     46,     41,     44,     46,
+    37,     35,     46,     63,     67,     52,     38,     30,     35,
+    41,     41,     41,     29,     15,     16,     4,      -4,     3,
+    -12,    -18,    -13,    -27,    -39,    -47,    -55,    -44,    -43,
+    -53,    -45,    -36,    -37,    -37,    -38,    -40,    -49,    -57,
+    -41,    -24,    -28,    -31,    -26,    -20,    -15,    -21,    -23,
+    -18,    -19,    -14,    -10,    -11,    1,      -6,     -26,    -14,
+    -1,     -7,     -10,    -11,    -9,     0,      -4,     -9,     3,
+    8,      0,      -2,     1,      16,     20,     7,      9,      10,
+    8,      18,     12,     11,     17,     -6,     -19,    0,      0,
+    -10,    -6,     -12,    -14,    -11,    -9,     -2,     -10,    -19,
+    -9,     -11,    -4,     18,     7,      -3,     9,      17,     23,
+    28,     25,     19,     19,     24,     33,     37,     30,     28,
+    35,     44,     43,     33,     31,     30,     26,     33,     39,
+    35,     31,     27,     19,     23,     24,     19,     13,     0,
+    0,      2,      -7,     -9,     -10,    -13,    -6,     -6,     -23,
+    -28,    -15,    -9,     -20,    -34,    -30,    -15,    -12,    -11,
+    -3,     -4,     -4,     6,      15,     9,      -11,    -20,    3,
+    26,     23,     1,      -16,    -3,     12,     2,      -22,    -36,
+    -35,    -28,    -20,    -13,    -19,    -38,    -43,    -29,    -11,
+    -5,     -15,    -37,    -40,    -9,     12,     -1,     -23,    -30,
+    -16,    12,     21,     -1,     -25,    -21,    4,      34,     55,
+    34,     -12,    -11,    47,     99,     107,    58,     0,      8,
+    78,     148,    151,    56,     -40,    -2,     142,    215,    99,
+    -67,    -64,    76,     153,    99,     -21,    -107,   -92,    -1,
+    106,    107,    -123,   -395,   -334,   60,     274,    -69,    -597,
+    -626,   -126,   238,    18,     -447,   -577,   -312,   -34,    20,
+    -89,    -242,   -332,   -222,   74,     262,    64,     -285,   -232,
+    259,    563,    294,    -138,   -130,   312,    642,    515,    189,
+    57,     187,    415,    538,    467,    277,    109,    134,    334,
+    441,    299,    59,     -7,     128,    228,    146,    -20,    -99,
+    -34,    60,     24,     -108,   -188,   -147,   -57,    -48,    -142,
+    -224,   -210,   -144,   -122,   -175,   -212,   -176,   -150,   -199,
+    -256,   -210,   -100,   -79,    -195,   -298,   -248,   -107,   -48,
+    -110,   -192,   -224,   -189,   -112,   -40,    -31,    -124,   -238,
+    -193,   -3,     87,     -53,    -221,   -165,   48,     132,    -2,
+    -150,   -109,   61,     147,    83,     -20,    -60,    -13,    85,
+    157,    130,    17,     -68,    -10,    147,    217,    116,    -20,
+    -21,    103,    200,    158,    52,     35,     105,    155,    132,
+    81,     74,     110,    114,    74,     48,     68,     100,    77,
+    27,     30,     48,     19,     -15,    7,      63,     53,     -56,
+    -123,   -41,    81,     75,     -61,    -154,   -84,    45,     68,
+    -24,    -105,   -76,    22,     53,     -13,    -63,    -21,    54,
+    59,     -1,     -34,    16,     80,     81,     48,     37,     61,
+    89,     88,     101,    134,    132,    100,    83,     125,    188,
+    173,    101,    95,     172,    214,    149,    68,     94,     181,
+    177,    103,    83,     132,    165,    122,    83,     140,    191,
+    153,    92,     106,    198,    226,    138,    85,     146,    215,
+    187,    110,    77,     115,    146,    115,    91,     96,     78,
+    27,     -3,     42,     102,    71,     -23,    -46,    30,     95,
+    63,     -18,    -25,    77,     174,    138,    13,     -25,    96,
+    218,    181,    34,     -70,    -45,    17,     2,      -67,    -174,
+    -346,   -516,   -553,   -446,   -455,   -789,   -1213,  -1308,  -1046,
+    -878,   -1179,  -1691,  -1839,  -1528,  -1219,  -1292,  -1623,  -1772,
+    -1538,  -1147,  -921,   -951,   -1038,  -929,   -549,   -95,    155,
+    127,    97,     387,    931,    1339,   1380,   1234,   1276,   1661,
+    2102,   2223,   2027,   1848,   1942,   2198,   2295,   2119,   1856,
+    1725,   1745,   1752,   1601,   1335,   1102,   993,    952,    830,
+    570,    286,    139,    133,    85,     -135,   -436,   -638,   -645,
+    -571,   -620,   -835,   -1064,  -1151,  -1069,  -951,   -964,   -1109,
+    -1209,  -1162,  -1044,  -961,   -944,   -977,   -1001,  -912,   -687,
+    -517,   -623,   -887,   -897,   -469,   10,     -35,    -590,   -934,
+    -545,   184,    427,    -53,    -619,   -563,   40,     489,    339,
+    -128,   -306,   -6,     403,    497,    232,    -55,    0,      388,
+    704,    584,    145,    -76,    260,    816,    942,    485,    2,
+    65,     575,    923,    744,    290,    76,     276,    596,    662,
+    419,    134,    92,     280,    434,    344,    88,     -66,    8,
+    151,    126,    -81,    -239,   -176,   -29,    -74,    -351,   -574,
+    -487,   -208,   -132,   -426,   -780,   -797,   -577,   -595,   -978,
+    -1169,  -667,   -36,    -548,   -2285,  -3281,  -1756,  927,    1236,
+    -1911,  -5006,  -4073,  -66,    2017,   -295,   -3701,  -3797,  -892,
+    975,    -165,   -1978,  -1636,  374,    1482,   679,    -567,   -591,
+    706,    2337,   3224,   2743,   1269,   287,    1221,   3597,   5083,
+    4106,   1858,   972,    2334,   4096,   4167,   2806,   1916,   2383,
+    3045,   2508,   1220,   820,    1784,   2669,   1981,   204,    -876,
+    -470,   510,    803,    170,    -787,   -1568,  -1893,  -1598,  -1027,
+    -992,   -1803,  -2610,  -2484,  -1905,  -2113,  -3113,  -3399,  -2267,
+    -1261,  -2007,  -3637,  -3909,  -2340,  -893,   -1158,  -2272,  -2486,
+    -1639,  -915,   -777,   -596,   -91,    196,    85,     210,    875,
+    1373,   1247,   1219,   1958,   2718,   2328,   1196,   1008,   2350,
+    3677,   3269,   1503,   366,    922,    2264,   2810,   1996,   608,
+    -168,   75,     680,    811,    395,    -56,    -318,   -607,   -966,
+    -1108,  -925,   -613,   -368,   -369,   -919,   -1926,  -2460,  -1685,
+    -300,   155,    -611,   -1524,  -2204,  -3227,  -3859,  -2037,  1622,
+    2382,   -2583,  -8448,  -7544,  -84,    4814,   915,    -6423,  -7558,
+    -1746,  2515,   -59,    -4587,  -3858,  1260,   3625,   187,    -4148,
+    -3500,  1542,   5467,   4780,   1256,   -1127,  -403,   2481,   5332,
+    6346,   5014,   2536,   1216,   2467,   5039,   6238,   5070,   3381,
+    3269,   4173,   3905,   2248,   1586,   3299,   5240,   4362,   1004,
+    -1382,  -489,   2113,   3168,   1620,   -742,   -1824,  -1435,  -897,
+    -1058,  -1500,  -1545,  -1398,  -1965,  -3266,  -4136,  -3756,  -2609,
+    -1804,  -1986,  -3087,  -4599,  -5296,  -4051,  -1731,  -781,   -2228,
+    -4092,  -3977,  -2325,  -1353,  -1568,  -1490,  -428,   178,    -672,
+    -1650,  -1058,  749,    2039,   2079,   1540,   897,    310,    572,
+    2266,   4265,   4265,   1869,   -231,   559,    3332,   4752,   3229,
+    768,    101,    1364,   2463,   1984,   819,    411,    723,    675,
+    -162,   -923,   -743,   -32,    185,    -516,   -1653,  -2359,  -2103,
+    -986,   42,     -205,   -1702,  -2870,  -2337,  -809,   -221,   -982,
+    -1544,  -946,   -598,   -2117,  -4291,  -4100,  -857,   1948,   338,
+    -4799,  -7972,  -5403,  173,    2371,   -1063,  -5533,  -5578,  -1777,
+    605,    -985,   -3249,  -2213,  1184,   2691,   560,    -2356,  -2288,
+    1233,   5244,   6441,   4004,   370,    -663,   2555,   7404,   9282,
+    6573,   2612,   1836,   4662,   7467,   7393,   5421,   4262,   4741,
+    5362,   4705,   3163,   2397,   3337,   4887,   4810,   2254,   -749,
+    -1316,  772,    2706,   2016,   -573,   -2552,  -2746,  -2012,  -1647,
+    -1978,  -2579,  -3105,  -3473,  -3911,  -4484,  -4891,  -4795,  -4163,
+    -3543,  -3538,  -4275,  -5356,  -5743,  -4637,  -2614,  -1301,  -1825,
+    -3341,  -4011,  -2937,  -751,   1007,   1245,   235,    -639,   -61,
+    1626,   2864,   2967,   2734,   3013,   3329,   2914,   2312,   2666,
+    3839,   4308,   3162,   1453,   768,    1255,   1887,   2006,   1715,
+    1031,   -297,   -1660,  -1690,  -277,   813,    -30,    -2137,  -3370,
+    -2854,  -1553,  -593,   -413,   -1146,  -2567,  -3440,  -2369,  -205,
+    379,    -1258,  -2315,  -812,   262,    -3205,  -8576,  -7894,  738,
+    7492,   1951,   -11595, -17098, -6934,  7139,   8065,   -4575,  -14199,
+    -8946,  3606,   7504,   -547,   -8242,  -5113,  4406,   8113,   2134,
+    -5040,  -4089,  4157,   10934,  10158,  4167,   -565,   -192,   4428,
+    9765,   12201,  9861,   4512,   1225,   3451,   8483,   10133,  6497,
+    2574,   3333,   6806,   6986,   2487,   -1214,  623,    5416,   6647,
+    2204,   -3289,  -4556,  -1565,  1544,   1525,   -1236,  -4293,  -5695,
+    -5174,  -3995,  -3403,  -3449,  -3750,  -4505,  -6014,  -7296,  -6523,
+    -3849,  -2096,  -3288,  -5722,  -6004,  -3581,  -1497,  -1960,  -3330,
+    -2800,  -434,   964,    -111,   -1739,  -1136,  1736,   4151,   3736,
+    1274,   -451,   469,    3386,   5833,   5898,   3646,   1085,   272,
+    1743,   4061,   5108,   3837,   1490,   246,    967,    1866,   859,
+    -1069,  -974,   1542,   2835,   47,     -4285,  -5068,  -1567,  1781,
+    1223,   -1997,  -4227,  -3747,  -1720,  41,     245,    -1228,  -2972,
+    -2673,  22,     1980,   -930,   -7721,  -11271, -5725,  4974,   8484,
+    -2007,  -16979, -19255, -4670,  11057,  9690,   -6417,  -17537, -10841,
+    4262,   9292,   206,    -9128,  -6224,  4828,   10018,  3699,   -5183,
+    -5121,  4702,   14279,  14466,  5778,   -2633,  -2185,  7036,   16118,
+    16305,  8081,   390,    499,    6580,   11150,  10036,  5704,   2902,
+    3378,   4664,   3786,   863,    -796,   1216,   4609,   4493,   -338,
+    -5670,  -6486,  -2751,  884,    571,    -3095,  -6446,  -6997,  -5770,
+    -5041,  -5016,  -4216,  -2579,  -2468,  -5088,  -8129,  -7964,  -4228,
+    -323,   497,    -1556,  -3653,  -3615,  -1718,  464,    1808,   2386,
+    2832,   3085,   2905,   2676,   3473,   5501,   7094,   6442,   3929,
+    1663,   1436,   3254,   5807,   7100,   5044,   -34,    -4091,  -2992,
+    2149,   5333,   2562,   -3067,  -5877,  -4480,  -2080,  -1793,  -3026,
+    -3838,  -3735,  -3663,  -4472,  -5756,  -5753,  -3576,  -640,   -274,
+    -3965,  -7787,  -6757,  -717,   4380,   3595,   -1553,  -5936,  -8603,
+    -10223, -8952,  -922,   9700,   9355,   -7788,  -25795, -22413, 2268,
+    20887,  12133,  -11291, -20129, -5899,  10236,  8585,   -3645,  -6300,
+    4667,   14216,  9346,   -3593,  -8558,  715,    15085,  21179,  14887,
+    3733,   -2703,  -675,   7170,   15131,  18360,  13959,  4205,   -2825,
+    -656,   7594,   11845,  7182,   319,    -439,   3255,   3213,   -3299,
+    -8972,  -6318,  2300,   7190,   2254,   -9247,  -17334, -15064, -4452,
+    5160,   5127,   -4268,  -14501, -17256, -11145, -1830,  3786,   2984,
+    -2498,  -8101,  -9587,  -5703,  622,    4570,   4035,   1442,   729,
+    2493,   3534,   2433,   2239,   5944,   11438,  12371,  6496,   -211,
+    -156,   7092,   13566,  11979,  3928,   -2545,  -2226,  2713,   6150,
+    5117,   1270,   -1851,  -2859,  -2376,  -1909,  -2364,  -3401,  -4183,
+    -3897,  -2875,  -3205,  -5503,  -7822,  -7501,  -3934,  -942,   -1572,
+    -4262,  -5939,  -4671,  -2353,  -1387,  -1159,  -1270,  -1328,  -606,
+    474,    1044,   -2647,  -11603, -17081, -10374, 5922,   14849,  2056,
+    -22033, -31238, -14612, 11094,  17910,  1778,   -15538, -15417, -2045,
+    6690,   2855,   -2559,  473,    8823,   11423,  3782,   -4649,  -2775,
+    9111,   20847,  21610,  11572,  962,    -1465,  5731,   15559,  20008,
+    16950,  9230,   2204,   114,    3088,   8130,   10523,  7643,   2045,
+    -2107,  -2945,  -2538,  -3593,  -5210,  -4403,  -857,   1328,   -2497,
+    -11667, -18881, -16866, -6286,  3400,   2835,   -7811,  -18322, -19279,
+    -10025, 1525,   6930,   3766,   -4647,  -11401, -9904,  -322,   10100,
+    12428,  5874,   -274,   926,    6762,   9360,   6778,   5904,   10509,
+    15077,  12681,  3846,   -1653,  2460,   11036,  14737,  8967,   -1021,
+    -6168,  -3899,  2328,   6041,   3404,   -2878,  -7672,  -6869,  -1918,
+    801,    -2188,  -7419,  -8083,  -2687,  1898,   -692,   -8121,  -11198,
+    -5642,  2830,   5915,   1120,   -5666,  -8314,  -5770,  118,    4614,
+    4713,   1482,   -2544,  -3331,  -3779,  -8931,  -13840, -10273, 3355,
+    13432,  2906,   -20058, -30890, -17080, 7759,   16047,  2886,   -12525,
+    -15117, -5998,  1614,   2294,   2684,   4610,   6236,   5486,   2514,
+    1346,   1962,   4564,   11022,  17438,  18182,  10179,  -796,   -3019,
+    5456,   15942,  18468,  11176,  2796,   -143,   1670,   3922,   3836,
+    3337,   3330,   1623,   -2609,  -7177,  -7654,  -4250,  -2210,  -3491,
+    -5312,  -4380,  -3103,  -6738,  -13209, -14278, -6529,  3346,   4931,
+    -2861,  -11176, -12097, -5552,  2679,   7102,   6050,   1301,   -3350,
+    -3378,  1785,   7413,   9059,   7013,   5043,   5331,   5197,   3143,
+    1862,   3790,   8037,   10159,  7236,   1450,   -3393,  -3980,  598,
+    6251,   7410,   1502,   -7144,  -10260, -5116,  2386,   4197,   -894,
+    -6255,  -6026,  -1493,  873,    -1639,  -4426,  -2720,  2252,   4206,
+    158,    -4631,  -4466,  537,    4709,   4528,   1691,   -828,   -1394,
+    -455,   756,    2662,   3101,   1730,   -3579,  -12987, -18531, -12998,
+    1944,   11963,  1503,   -19826, -29919, -18138, 2254,   7644,   -1829,
+    -9260,  -6516,  134,    -793,   -5234,  -2336,  6264,   12828,  11829,
+    6589,   3429,   2592,   4795,   11433,  19490,  21681,  13136,  379,
+    -4138,  3585,   14812,  17633,  10124,  623,    -2287,  696,    2273,
+    -926,   -5000,  -4391,  -386,   139,    -4657,  -11003, -13946, -11930,
+    -7460,  -1932,  1277,   -2311,  -10543, -16920, -14512, -4039,  4987,
+    7518,   3175,   -4213,  -7535,  -4747,  3590,   12231,  13419,  8429,
+    2377,   1080,   5563,   8497,   7304,   5331,   5656,   8235,   6997,
+    998,    -3131,  -1857,  3017,   5883,   3744,   -408,   -4503,  -6489,
+    -4796,  -374,   3254,   1651,   -2830,  -5206,  -3690,  -681,   -969,
+    -2819,  -2616,  19,     3379,   2359,   -2476,  -6413,  -6111,  -463,
+    4664,   4106,   -565,   -4801,  -4960,  -1242,  2479,   3706,   2168,
+    -1104,  -3048,  -1563,  1217,   2013,   -5714,  -17921, -21743, -10839,
+    7751,   13091,  -4648,  -26509, -29653, -9872,  10100,  9523,   -4335,
+    -12121, -5509,  4923,   6380,   1839,   -508,   3312,   10704,  14545,
+    12317,  5508,   -243,   2421,   11485,  19096,  18306,  8626,   -1357,
+    -5542,  -1695,  7815,   13549,  10229,  -23,    -8373,  -7496,  -2775,
+    -1016,  -2900,  -4868,  -4103,  -4535,  -6851,  -8099,  -8137,  -6414,
+    -4023,  -1790,  -45,    -1513,  -4791,  -6160,  -4105,  1060,   5970,
+    7099,   3934,   -996,   -2213,  1973,   6975,   7927,   4726,   2474,
+    3951,   5221,   2642,   -2359,  -3579,  1362,   6614,   6282,   116,
+    -5643,  -5733,  -1884,  2107,   3418,   2566,   684,    -2319,  -3803,
+    -2133,  1512,   2943,   475,    -1004,  753,    3095,   1652,   -3074,
+    -4562,  -932,   3815,   4486,   -22,    -4199,  -4666,  -2201,  284,
+    316,    -914,   -2297,  -2441,  -1538,  -435,   909,    626,    -1222,
+    -1534,  -429,   1711,   2386,   -1786,  -10676, -18200, -16272, -3805,
+    9505,   8238,   -9397,  -24577, -22256, -4907,  8659,   5940,   -3701,
+    -6764,  40,     6190,   4239,   208,    238,    7081,   14458,  15143,
+    10726,  3479,   -706,   1700,   9131,   17577,  17708,  7959,   -5009,
+    -11508, -5347,  5635,   10789,  6499,   -3121,  -9303,  -9814,  -6625,
+    -3333,  -3193,  -4349,  -5615,  -6188,  -5123,  -4441,  -4550,  -4074,
+    -2769,  -61,    2441,   2881,   1395,   -578,   -341,   2509,   6034,
+    8202,   6377,   2696,   1272,   2589,   4787,   4611,   2378,   2124,
+    3911,   4872,   2049,   -3374,  -5770,  -2705,  3179,   5905,   2589,
+    -2792,  -5419,  -3176,  1056,   2875,   2483,   1205,   605,    856,
+    1012,   892,    105,    -411,   707,    2924,   4184,   1755,   -2553,
+    -4857,  -3556,  401,    2466,   945,    -2315,  -5556,  -5549,  -2241,
+    534,    601,    -1774,  -3034,  -1962,  -886,   -448,   -720,   -467,
+    864,    760,    -22,    -2546,  -10211, -17121, -15877, -4803,  7993,
+    7254,   -6563,  -18374, -17755, -6143,  3291,   4322,   1822,   416,
+    2788,   5190,   4256,   2627,   2590,   6398,   12709,  15757,  12829,
+    5542,   -667,   167,    7241,   14346,  14826,  6392,   -3516,  -7434,
+    -4607,  1054,   2988,   847,    -1549,  -2641,  -3046,  -5363,  -8256,
+    -9130,  -6906,  -1460,  2260,   1568,   -2911,  -8580,  -9418,  -3675,
+    5021,   10127,  7909,   1478,   -4015,  -3331,  2450,   7291,   7632,
+    2567,   -2022,  -899,   3418,   5544,   1349,   -4117,  -3409,  1758,
+    6000,   3526,   -3975,  -7331,  -3931,  2747,   7037,   4962,   -21,
+    -2902,  -2008,  1306,   4461,   6364,   5956,   3623,   1734,   793,
+    44,     -893,   -1041,  1633,   5264,   4870,   -943,   -7404,  -8611,
+    -4974,  -1192,  185,    -1334,  -3672,  -4910,  -5132,  -4387,  -3532,
+    -3233,  -2430,  -469,   1245,   892,    -969,   -2441,  -2140,  320,
+    4999,   5954,   -4638,  -20056, -24424, -8954,  13558,  16089,  -3145,
+    -20665, -19447, -4802,  4488,   3733,   943,    683,    3109,   6219,
+    9247,   7736,   782,    -1410,  8024,   20877,  20174,  4723,   -7148,
+    -2758,  11240,  17896,  11462,  414,    -6134,  -4913,  113,    2818,
+    98,     -5900,  -8369,  -4446,  924,    1657,   -3389,  -10569, -13223,
+    -7690,  2339,   7741,   1634,   -9014,  -10982, -1172,  9642,   9098,
+    1310,   -2795,  -1040,  2790,   3808,   3559,   3064,   -527,   -3160,
+    -1391,  3120,   5224,   -144,   -6714,  -6416,  -719,   5630,   7253,
+    2735,   -2973,  -4325,  679,    7146,   8220,   4055,   -42,    814,
+    5288,   7658,   6592,   3051,   -746,   -541,   3401,   6030,   1953,
+    -6340,  -8619,  -2689,  4076,   3217,   -4875,  -9612,  -7826,  -4293,
+    -2441,  -4080,  -5740,  -5529,  -3656,  -506,   -1035,  -5787,  -9518,
+    -7034,  2323,   9287,   6495,   -1853,  -6110,  -3281,  -1708,  -8958,
+    -19544, -18870, -2771,  13029,  10762,  -7491,  -21837, -18923, -4183,
+    8733,   12580,  9779,   4597,   738,    1460,   6302,   9711,   8375,
+    8143,   12512,  15808,  11272,  389,    -5554,  161,    11080,  15851,
+    10426,  692,    -6372,  -6808,  -2525,  652,    827,    -219,   -349,
+    -622,   -3328,  -7883,  -11020, -8961,  -3240,  1884,   4155,   1995,
+    -3530,  -7816,  -6444,  -218,   6086,   9279,   7901,   3113,   -2352,
+    -5757,  -3836,  2022,   4572,   894,    -3519,  -3311,  -534,   -618,
+    -3716,  -5515,  -3290,  1495,   4374,   4455,   2961,   -645,   -3247,
+    -656,   5273,   9838,   9751,   5755,   1863,   158,    1457,   4585,
+    6390,   5379,   2894,   2284,   1867,   -2279,  -7051,  -6578,  70,
+    4745,   1660,   -4524,  -8007,  -7088,  -5690,  -5467,  -4178,  -2679,
+    -2218,  -3422,  -4167,  -4313,  -6105,  -6633,  -4202,  864,    5119,
+    4084,   -163,   -5331,  -8699,  -8710,  -7313,  -4649,  -2471,  -1419,
+    -1136,  -3199,  -6428,  -8048,  -4902,  1089,   4681,   5723,   5535,
+    5146,   4006,   2052,   2314,   5274,   8680,   9907,   8776,   6722,
+    2548,   -2403,  -3303,  1224,   7406,   9468,   5089,   -1197,  -4384,
+    -3570,  -298,   1776,   2005,   2041,   1326,   971,    -180,   -2334,
+    -1170,  1913,   4281,   4732,   2874,   1174,   -1341,  -3384,  -2503,
+    368,    4031,   3270,   -986,   -3519,  -5360,  -6004,  -5576,  -3603,
+    208,    708,    -2137,  -4940,  -5349,  -3588,  -2796,  -1399,  1017,
+    3144,   4196,   2483,   828,    338,    919,    3842,   6202,   7189,
+    7499,   6330,   4847,   3252,   2136,   3698,   5845,   5566,   3019,
+    267,    -55,    -1091,  -4220,  -5041,  -3430,  -280,   171,    -4649,
+    -8723,  -9280,  -5975,  -3192,  -3974,  -3912,  -4053,  -3748,  -3570,
+    -5871,  -5499,  -3552,  -1691,  320,    341,    748,    -313,   -3436,
+    -4687,  -3681,  21,     2550,   643,    -2123,  -3254,  -2226,  -1044,
+    -1617,  -1510,  183,    1250,   726,    -1662,  -3388,  -1759,  933,
+    3817,   5242,   3025,   248,    -1339,  -514,   2022,   3410,   3970,
+    3324,   2632,   2603,   2240,   2166,   1271,   487,    1076,   2039,
+    3296,   3836,   3610,   2913,   2718,   4213,   5555,   6023,   4769,
+    2442,   2067,   2173,   1623,   1201,   348,    52,     -124,   -1528,
+    -2834,  -3604,  -3463,  -2357,  -2564,  -3775,  -3801,  -1929,  -465,
+    -2109,  -3743,  -2657,  200,    2580,   954,    -1304,  -95,    1549,
+    2303,   1795,   1633,   3356,   3699,   2361,   792,    1148,   4045,
+    4820,   3851,   3197,   2449,   2704,   1722,   -652,   -1154,  -393,
+    113,    -1010,  -3328,  -4342,  -3939,  -3345,  -3697,  -5115,  -5610,
+    -4202,  -3639,  -5088,  -5351,  -3216,  -862,   -414,   -1839,  -3996,
+    -4831,  -2467,  147,    1055,   1288,   -247,   -2225,  -2233,  -1562,
+    -1278,  -936,   -961,   -935,   -367,   -323,   -459,   -1940,  -3974,
+    -2262,  -13,    2,      -401,   -1825,  -2308,  -1124,  448,    2154,
+    2434,   1300,   -812,   -1337,  1325,   3374,   3466,   2500,   2156,
+    3439,   3549,   2068,   1392,   1986,   3025,   3944,   3898,   3259,
+    4467,   6347,   5356,   2893,   1690,   2072,   4136,   5313,   2776,
+    -236,   -1063,  -794,   524,    802,    -1377,  -2879,  -2167,  -1439,
+    -1595,  -1539,  -1666,  -2495,  -2375,  -1253,  -515,   -187,   -1409,
+    -2847,  -511,   2411,   1761,   492,    -18,    607,    2350,   3288,
+    3505,   2741,   1099,   699,    2017,   3214,   3333,   1567,   33,
+    1260,   1925,   808,    -377,   -2558,  -3781,  -1677,  164,    -580,
+    -1727,  -2619,  -3421,  -3586,  -3957,  -4562,  -3646,  -2285,  -3437,
+    -5293,  -4792,  -4128,  -4012,  -2920,  -2249,  -2439,  -3737,  -5607,
+    -4427,  -1259,  71,     609,    555,    -1039,  -3354,  -5388,  -3760,
+    415,    2513,   2513,   819,    -1436,  -2780,  -2740,  -501,   2727,
+    3936,   1491,   -965,   -766,   -484,   -223,   361,    695,    1771,
+    1130,   -1839,  -1764,  797,    -31,    -2549,  -1790,  2108,   4043,
+    887,    -154,   2411,   2605,   2012,   1977,   3923,   6630,   4176,
+    107,    -311,   1731,   1910,   1011,   3119,   3219,   998,    -1282,
+    -2832,  -1645,  -685,   945,    2574,   2543,   -267,   -5015,  -3819,
+    -342,   1228,   2055,   -619,   -1233,  2069,   2896,   1095,   62,
+    1365,   3366,   4584,   4956,   3323,   -19,    -50,    4024,   5222,
+    3695,   3118,   1933,   1256,   1443,   128,    -119,   2043,   2477,
+    1823,   1324,   30,     -1363,  -3023,  -3074,  -188,   621,    -1775,
+    -2806,  -2961,  -2753,  -4359,  -5350,  -1220,  -116,   -4157,  -4811,
+    -2793,  -1040,  -1957,  -2862,  -1901,  -3192,  -3720,  -2357,  -1727,
+    -387,   -2131,  -5011,  -3650,  -454,   596,    -1298,  -3716,  -3122,
+    496,    136,    -2415,  -1675,  -811,   -837,   140,    -1243,  -187,
+    -1431,  -5320,  -2121,  100,    -467,   2465,   681,    -2093,  1224,
+    1632,   1428,   1776,   648,    2480,   3622,   876,    259,    1403,
+    2139,   3117,   497,    -763,   -170,   279,    1769,   342,    -871,
+    -25,    -1549,  -2290,  290,    1042,   -796,   -4291,  -3895,  159,
+    1264,   -540,   -2328,  -702,   1972,   852,    -2274,  -798,   1126,
+    -579,   -480,   3481,   3833,   1004,   901,    1536,   1809,   3103,
+    2521,   3183,   5220,   1800,   -266,   4663,   4230,   -790,   159,
+    2274,   5114,   4304,   -1998,  344,    4921,   -343,   -2048,  1180,
+    2112,   3109,   -10,    -1818,  552,    -1360,  -2889,  -1302,  -1918,
+    -37,    1406,   -1762,  -3054,  -1446,  -2073,  -4292,  -3214,  1163,
+    2333,   -712,   -2583,  -2058,  -1034,  -600,   -3796,  -2395,  2137,
+    -1122,  -1927,  702,    -2196,  -4374,  -3257,  -1558,  -256,   -728,
+    -395,   -176,   -1529,  -2772,  -1121,  -340,   -1147,  -250,   -4079,
+    -473,   4241,   -2818,  -3523,  3255,   2355,   -2550,  -1082,  1197,
+    2213,   -94,    -237,   3123,   1314,   -1075,  977,    1081,   2045,
+    2966,   -1328,  -1069,  -741,   -524,   -380,   -2766,  -986,   926,
+    -3281,  -1554,  2554,   -3620,  -6394,  -1680,  -321,   2889,   243,
+    -1567,  2276,   -1294,  -525,   2010,   -4883,  -1495,  6778,   2085,
+    -873,   2496,   418,    -1156,  -1179,  1604,   6173,   1190,   -2381,
+    5788,   2431,   -4941,  -242,   1248,   1023,   4426,   3399,   2726,
+    1388,   -922,   595,    392,    1414,   6260,   2673,   -973,   2237,
+    1776,   -2393,  -757,   4158,   2842,   -2327,  505,    1230,   -3623,
+    -917,   336,    -1400,  -1018,  1771,   2696,   -570,   -2435,  886,
+    2309,   -2865,  -1328,  2077,   -1967,  -3486,  -411,   961,    -1661,
+    -1979,  1179,   -493,   -2597,  1995,   284,    -3300,  -2213,  184,
+    312,    -1665,  -641,   -1325,  -1276,  90,     69,     476,    -778,
+    -1099,  853,    1515,   1630,   1188,   -877,   -1751,  702,    2983,
+    -201,   664,    4018,   -352,   -1864,  875,    2367,   813,    -2463,
+    -702,   886,    -2204,  -2216,  399,    -1729,  -2408,  1412,   -2757,
+    -3530,  449,    -2554,  -3910,  906,    697,    -1696,  566,    -1360,
+    -1991,  81,     -1756,  -159,   1180,   -667,   -584,   -359,   183,
+    1943,   -412,   -1747,  1659,   1961,   280,    294,    222,    2000,
+    2076,   829,    -43,    -880,   3353,   3615,   1279,   1746,   -1031,
+    1301,   3477,   -777,   2567,   1215,   -2344,  3556,   561,    -2166,
+    1119,   2377,   -391,   -1825,  -2359,  49,     1764,   391,    -291,
+    325,    1223,   1443,   -624,   -2828,  1381,   2438,   28,     -652,
+    -166,   581,    -2039,  -374,   -20,    -2459,  -1149,  1505,   2008,
+    -1798,  -3848,  -1796,  -2208,  -2224,  -878,   728,    -154,   -534,
+    1061,   538,    -1465,  73,     1147,   82,     -119,   3800,   4797,
+    -873,   784,    1458,   -148,   3180,   1319,   908,    4951,   584,
+    -57,    2394,   -967,   586,    405,    -1601,  3566,   -285,   -3949,
+    -1301,  -1953,  -1223,  -1831,  -3477,  -779,   -389,   -3169,  -1828,
+    -1496,  -1451,  -556,   -3327,  -209,   534,    -4908,  131,    -386,
+    -5232,  1373,   2129,   -1740,  -1957,  -1102,  76,     396,    -1426,
+    -179,   1357,   -3276,  -1420,  3819,   -44,    56,     2777,   -1202,
+    1908,   1410,   2031,   3495,   -2197,  -163,   1565,   239,    2803,
+    480,    -1636,  1180,   616,    1206,   1166,   -1579,  1572,   814,
+    -774,   2310,   740,    -2606,  1234,   -603,   -362,   1562,   -2134,
+    652,    -777,   -2353,  5464,   377,    -2490,  1012,   157,    680,
+    -1389,  -1898,  1135,   -1,     -1730,  1800,   -1466,  -1687,  -1469,
+    -3250,  -1081,  1381,   -81,    -204,   -26,    353,    1941,   174,
+    104,    2009,   1032,   -871,   3280,   3398,   -651,   -154,   3309,
+    1964,   448,    812,    -17,    887,    2405,   3295,   -54,    -2396,
+    1410,   1380,   -1156,  296,    -1706,  -1729,  401,    -970,   -878,
+    -723,   -2285,  1259,   1320,   -1960,  -1039,  -211,   -661,   -763,
+    -1599,  -43,    308,    -1841,  72,     -2075,  -3010,  -497,   506,
+    -377,   247,    1932,   -1788,  -2419,  257,    208,    -2176,  488,
+    2827,   -1720,  -1649,  -619,   520,    1103,   -1231,  -1327,  2162,
+    1535,   -383,   315,    -1488,  -235,   1761,   -27,    -232,   515,
+    127,    -2239,  654,    2871,   -379,   -1274,  2445,   874,    -2444,
+    514,    -206,   -1289,  1314,   1869,   1316,   1878,   -1454,  -982,
+    476,    359,    2084,   -708,   405,    -246,   -1071,  1757,   -866,
+    -2331,  783,    501,    -853,   896,    36,     -2468,  -1138,  1445,
+    -613,   -687,   1999,   -449,   -731,   1478,   384,    -45,    96,
+    1530,   1919,   186,    -94,    1347,   -329,   -348,   1631,   574,
+    1062,   735,    -1652,  675,    244,    1241,   1137,   -2469,  621,
+    45,     -612,   1308,   -2015,  -208,   2392,   -1646,  -67,    77,
+    -1558,  113,    1263,   -236,   -971,   -333,   -733,   -555,   2024,
+    -135,   -3817,  -398,   1696,   -1179,  -1473,  1175,   -166,   618,
+    1132,   -2504,  -575,   146,    -688,   1323,   150,    -2021,  15,
+    1673,   347,    -1535,  -106,   235,    -32,    1167,   -471,   -503,
+    -1260,  416,    -13,    -1082,  1036,   -790,   -1676,  487,    985,
+    77,     57,     -1175,  1146,   2023,   -1706,  -404,   3249,   -739,
+    -979,   3044,   -514,   -168,   2201,   -2863,  1009,   1833,   -2309,
+    1565,   476,    -1698,  1667,   -496,   -2193,  1686,   532,    336,
+    -1095,  -1655,  578,    -909,   -1263,  2569,   -2833,  -1808,  2860,
+    -822,   27,     1098,   -1371,  1585,   -284,   -1074,  2944,   -764,
+    -2871,  2484,   1179,   -1213,  -670,   -1226,  1112,   1837,   -299,
+    -388,   -51,    1,      992,    -723,   -361,   1723,   -1115,  -2012,
+    1261,   -9,     -127,   -510,   -1550,  1448,   957,    -1930,  171,
+    776,    -2104,  14,     764,    -599,   -745,   -438,   -371,   -659,
+    1075,   282,    -3116,  684,    3747,   22,     -2139,  816,    1413,
+    -333,   458,    906,    483,    -1084,  797,    1039,   -467,   -377,
+    1386,   -1182,  610,    1787,   -1354,  -2800,  2638,   424,    -2372,
+    1153,   -51,    -689,   290,    -2199,  818,    3755,   -2674,  -1689,
+    3497,   -507,   -1978,  1729,   1413,   215,    -76,    53,     759,
+    371,    -1529,  1005,   -770,   -685,   1754,   -908,   -653,   1047,
+    -1066,  -784,   -199,   -526,   86,     -1750,  -916,   1839,   580,
+    -1884,  319,    226,    -977,   212,    202,    -741,   -1013,  2057,
+    69,     -2961,  974,    1964,   -512,   -224,   1554,   -79,    -1142,
+    1853,   -71,    1009,   1174,   -718,   2040,   -158,   -1508,  1042,
+    0,      -1219,  1212,   448,    -208,   -47,    -779,   -867,   1924,
+    -254,   -1085,  -221,   -1283,  1543,   -584,   -951,   225,    -1089,
+    -464,   -853,   -615,   1576,   -2313,  -1214,  950,    -2548,  -314,
+    1201,   -1527,  952,    764,    -1915,  528,    169,    -1676,  1742,
+    425,    -2346,  932,    290,    109,    492,    -379,   932,    70,
+    582,    135,    769,    1665,   -1751,  576,    1013,   366,    2339,
+    71,     637,    1500,   576,    111,    494,    765,    1170,   1421,
+    -5,     -892,   2054,   -640,   160,    1426,   -651,   348,    -841,
+    -558,   1563,   277,    -408,   -1468,  482,    -1538,  -2255,  968,
+    -1307,  -454,   1306,   -3085,  -1680,  2624,   -2191,  -1719,  1891,
+    -3826,  -1441,  2736,   -3694,  -266,   1897,   -4468,  841,    2828,
+    -4060,  -318,   2305,   -1662,  528,    3056,   -2429,  -156,   2045,
+    -753,   475,    419,    -597,   1100,   1845,   504,    1067,   -402,
+    -824,   1807,   1192,   459,    200,    1728,   50,     -497,   678,
+    -355,   938,    1239,   -1223,  360,    1251,   -95,    981,    1029,
+    -1940,  260,    1627,   -2387,  3426,   519,    -3141,  1822,   -506,
+    -1471,  1101,   -2137,  1069,   885,    -2618,  1673,   -463,   -1558,
+    1439,   -386,   -1923,  1538,   -1313,  -1735,  540,    -1433,  -915,
+    494,    -839,   -1527,  -1143,  480,    -1081,  27,     1732,   -1285,
+    -1833,  1952,   -667,   -1626,  1819,   -1293,  -1323,  2139,   -376,
+    -1392,  1277,   -1172,  -240,   2907,   -1875,  -238,   2573,   -1068,
+    -471,   2065,   -686,   -1315,  2575,   233,    -1005,  1135,   706,
+    534,    278,    -182,   1091,   -21,    -222,   1413,   -371,   -54,
+    1108,   -103,   382,    -70,    787,    894,    -108,   1308,   1113,
+    -1412,  574,    1140,   -2032,  500,    569,    -1251,  951,    -50,
+    -1398,  772,    -474,   -1536,  1297,   251,    -2321,  109,    -703,
+    -425,   40,     -1354,  -773,   -225,   -1743,  -1839,  1244,   261,
+    -3082,  -424,   1162,   -937,   123,    -322,   -407,   -561,   -331,
+    1369,   -1142,  -1050,  1024,   1116,   -213,   -752,   1521,   -383,
+    -415,   1011,   947,    -713,   743,    1945,   -237,   881,    600,
+    -757,   885,    -835,   756,    2454,   -1985,  699,    1572,   -1652,
+    673,    232,    -42,    1975,   -736,   -270,   1660,   -704,   -96,
+    1264,   -428,   278,    774,    -954,   -1325,  756,    1275,   -594,
+    -353,   204,    -1130,  -782,   -432,   -979,   268,    378,    20,
+    -870,   405,    -357,   -1661,  637,    473,    293,    -314,   -895,
+    3,      -175,   -1016,  -643,   204,    -588,   -1007,  -131,   401,
+    -849,   -476,   271,    320,    -198,   533,    -25,    -1994,  1421,
+    525,    -1611,  1261,   507,    -488,   1093,   361,    -1814,  2230,
+    312,    -196,   3242,   -803,   -962,   1714,   -1479,  1426,   1612,
+    -1953,  1376,   -581,   -669,   1370,   -1251,  426,    1274,   -470,
+    1757,   807,    -589,   1275,   126,    -871,   1025,   -1331,  287,
+    1258,   -1813,  146,    -839,   -1471,  828,    -402,   -281,   1704,
+    -1341,  -231,   939,    -1035,  -472,   -197,   -764,   -380,   -816,
+    -266,   382,    -497,   -1708,  -591,   1119,   -1941,  178,    969,
+    -1656,  685,    1004,   -1114,  -127,   -1473,  -678,   1610,   -1253,
+    277,    1807,   -1642,  -461,   2033,   -1449,  392,    98,     -157,
+    1525,   -860,   2455,   413,    -2159,  2457,   475,    -374,   1532,
+    -981,   843,    973,    324,    1168,   225,    -407,   1487,   681,
+    -680,   1098,   117,    245,    1238,   -223,   1076,   -428,   -466,
+    2593,   -663,   -1225,  1303,   -933,   -561,   1190,   -1071,  -1229,
+    406,    -284,   -13,    198,    -1494,  -637,   352,    -1960,  420,
+    49,     -1472,  -761,   -234,   -2213,  -1750,  -521,   -1554,  -813,
+    662,    -633,   -1388,  -15,    -947,   -391,   -152,   -894,   631,
+    -461,   -885,   633,    -51,    -1063,  218,    1149,   -61,    -274,
+    988,    -140,   7,      1774,   1558,   -623,   755,    1352,   -511,
+    1106,   744,    17,     2640,   -91,    697,    1547,   -1757,  1832,
+    1859,   -206,   1505,   575,    -444,   556,    250,    1786,   792,
+    -125,   -266,   407,    501,    798,    -536,   -1214,  58,     6,
+    354,    -685,   613,    99,     -2022,  -116,   -236,   -182,   263,
+    -824,   -1187,  -142,   -138,   -1228,  -1008,  786,    -1421,  -1127,
+    -269,   -2278,  841,    222,    -2423,  678,    -1153,  -2082,  574,
+    -570,   -729,   180,    -777,   212,    270,    -274,   1077,   -493,
+    118,    804,    -1260,  349,    799,    545,    481,    971,    1099,
+    1146,   -273,   34,     1728,   1128,   411,    758,    308,    -808,
+    950,    1490,   209,    -265,   1154,   -11,    -460,   2644,   -122,
+    -728,   2033,   -1100,  -305,   1774,   -208,   -1567,  -57,    -140,
+    -670,   -454,   -1390,  -80,    978,    -438,   -731,   -684,   344,
+    -458,   -199,   -126,   -1663,  -883,   642,    -1517,  -1144,  -375,
+    -422,   -452,   -1815,  -791,   763,    -1502,  -205,   684,    -1641,
+    448,    1399,   -2160,  804,    1088,   -2214,  1030,   1585,   -1093,
+    -11,    1718,   -360,   -81,    1294,   398,    218,    1225,   644,
+    505,    2090,   -385,   526,    2111,   -303,   -316,   1550,   1323,
+    -459,   881,    1874,   -1256,  1429,   2485,   -1003,  -552,   14,
+    432,    952,    471,    -633,   408,    -358,   140,    554,    -1260,
+    -404,   245,    -2572,  954,    1005,   -1621,  -82,    -175,   -957,
+    112,    106,    -1117,  -819,   -62,    -785,   71,     93,     -1296,
+    -1680,  242,    -956,   -2696,  302,    -204,   -1404,  254,    -558,
+    -201,   -630,   16,     -436,   -1647,  1649,   -1096,  -1267,  2273,
+    -1270,  20,     1749,   -2509,  780,    942,    -1859,  2762,   304,
+    -300,   2617,   -947,   861,    2601,   -1153,  754,    1629,   -681,
+    686,    1443,   -235,   1900,   5,      -565,   1559,   285,    -170,
+    757,    480,    547,    752,    -427,   50,     839,    -95,    -791,
+    -1698,  -291,   -62,    -1730,  524,    1008,   -2176,  -369,   165,
+    -749,   -972,   -287,   889,    -1218,  -1712,  833,    -855,   -995,
+    -14,    -793,   -1815,  605,    -607,   -1890,  769,    -781,   230,
+    1155,   -2000,  876,    1835,   -1617,  9,      1058,   -1232,  859,
+    1486,   -1301,  1595,   501,    -951,   2935,   -921,   -634,   2826,
+    -793,   655,    2660,   -232,   235,    1879,   481,    -51,    804,
+    987,    -360,   -331,   2099,   -302,   -149,   1966,   -1233,  -12,
+    1330,   -2265,  1256,   -116,   -1394,  2937,   -995,   -1572,  2964,
+    -2257,  -2587,  1820,   -2132,  -1609,  778,    -1596,  -486,   560,
+    -1749,  274,    -706,   -1714,  1304,   -360,   -2657,  1833,   -750,
+    -1729,  433,    -1461,  -794,   -1545,  -892,   385,    -891,   -374,
+    1261,   -589,   235,    815,    -773,   -669,   636,    -471,   136,
+    871,    -392,   782,    677,    -472,   1130,   1029,   -1262,  1070,
+    2171,   575,    675,    600,    2104,   1077,   -182,   2621,   -604,
+    -30,    3302,   -1331,  599,    742,    291,    1329,   -551,   1043,
+    1729,   -1754,  1220,   1113,   -2174,  1281,   743,    -2027,  851,
+    -205,   -1576,  214,    -1629,  -605,   -394,   -1508,  -254,   -63,
+    -489,   -847,   -26,    -997,   -1065,  -120,   -376,   -1283,  -1393,
+    83,     -212,   -1610,  419,    -1120,  -590,   395,    -1210,  -21,
+    -273,   -622,   899,    -196,   -1059,  1130,   616,    -529,   -166,
+    794,    22,     -216,   862,    664,    -390,   980,    228,    789,
+    182,    402,    2149,   -1133,  799,    2637,   -799,   176,    1306,
+    905,    -93,    677,    338,    121,    483,    297,    339,    347,
+    249,    731,    40,     66,     112,    -889,   -128,   582,    -1191,
+    -67,    -1364,  -233,   488,    -1734,  -634,   1517,   -1657,  -1015,
+    594,    -1422,  1396,   -1357,  -1617,  1254,   -1596,  -941,   789,
+    -1860,  -77,    245,    -327,   569,    -723,   104,    905,    -543,
+    -918,   1387,   -42,    -440,   619,    68,     45,     1364,   -880,
+    19,     1491,   -561,   1174,   1403,   -1411,  1351,   1222,   -612,
+    864,    877,    -658,   382,    864,    -552,   1286,   309,    -105,
+    1083,   -170,   -289,   1049,   -248,   -537,   625,    -48,    337,
+    -385,   532,    -315,   -1398,  588,    -628,   -1192,  649,    -806,
+    -170,   541,    -2267,  1052,   274,    -1970,  833,    253,    -1345,
+    -290,   -120,   -959,   -94,    -189,   -1397,  -136,   -155,   -654,
+    207,    -706,   617,    415,    -1962,  1169,   670,    -1132,  319,
+    297,    -589,   100,    510,    -620,   610,    -153,   -15,    1327,
+    -99,    229,    281,    169,    1015,   -106,   1197,   577,    -698,
+    577,    931,    -964,   1605,   505,    -1713,  2369,   115,    -1585,
+    1839,   664,    -1411,  867,    620,    329,    491,    -1119,  420,
+    266,    -1708,  499,    -69,    -1037,  795,    -321,   -959,   32,
+    235,    -1748,  295,    -249,   -230,   485,    -1185,  -97,    489,
+    -2036,  711,    405,    -2800,  593,    434,    -1038,  536,    347,
+    -570,   705,    -806,   -290,   818,    -999,   53,     1585,   -756,
+    -657,   1180,   115,    -364,   217,    -226,   1033,   347,    -20,
+    611,    658,    590,    -128,   -451,   1676,   -660,   -21,    805,
+    -880,   1481,   412,    -1534,  1522,   221,    -132,   662,    -407,
+    613,    1132,   -551,   -187,   1184,   -577,   -444,   953,    -1034,
+    -472,   461,    -865,   -99,    637,    -572,   300,    450,    -591,
+    137,    404,    -972,   306,    -524,   -1167,  433,    124,    -1326,
+    -368,   -305,   -917,   452,    -626,   -695,   656,    258,    -1401,
+    270,    446,    -1045,  636,    -357,   -1072,  913,    512,    -1732,
+    489,    952,    -747,   58,     673,    -453,   1125,   -488,   46,
+    1723,   -1244,  417,    1803,   -1215,  623,    659,    -560,   676,
+    -9,     92,     701,    1100,   -623,   142,    283,    -512,   547,
+    576,    -525,   -155,   1143,   -1286,  -329,   1959,   -1302,  -459,
+    1188,   -1199,  1020,   -118,   -1303,  956,    -905,   -647,   595,
+    -356,   -1354,  -74,    750,    -791,   -335,   56,     -862,   -36,
+    276,    -279,   46,     -485,   -181,   196,    -584,   -238,   259,
+    -314,   -77,    383,    509,    -386,   -180,   859,    -542,   955,
+    372,    -362,   1458,   113,    -106,   1495,   -534,   63,     1295,
+    -505,   846,    983,    -1097,  1764,   320,    -185,   1061,   -525,
+    115,    217,    -328,   326,    312,    374,    179,    -683,   485,
+    -1286,  147,    -583,   -979,   888,    -504,   -1235,  715,    -1050,
+    -1111,  848,    -828,   -1043,  -115,   -327,   22,     -451,   -1008,
+    98,     -262,   -545,   -363,   -48,    -257,   -731,   878,    96,
+    -1186,  426,    359,    -1101,  1074,   -267,   521,    -375,   -166,
+    1398,   -994,   780,    550,    124,    -298,   581,    236,    305,
+    -111,   396,    741,    -10,    662,    155,    271,    563,    65,
+    -318,   812,    -483,   843,    75,     -714,   1152,   -26,    -190,
+    -97,    533,    -111,   -564,   724,    -24,    -820,   835,    -473,
+    -632,   154,    -104,   -932,   919,    -606,   -619,   496,    -310,
+    -271,   -360,   120,    -630,   126,    65,     -931,   548,    -207,
+    -455,   410,    -282,   -931,   944,    -354,   69,     412,    -661,
+    1068,   -969,   -443,   1894,   -1281,  -442,   2003,   -1640,  713,
+    852,    -1344,  1338,   -457,   243,    498,    -697,   -129,   993,
+    -388,   -76,    1039,   -768,   492,    -104,   -58,    951,    -854,
+    181,    1093,   -1111,  491,    544,    -1061,  118,    586,    -477,
+    -411,   392,    233,    91,     -908,   532,    218,    -1176,  670,
+    -74,    -674,   696,    -801,   194,    592,    -1790,  762,    -564,
+    -791,   595,    -145,   -727,   228,    434,    -246,   -232,   -169,
+    281,    -324,   289,    -120,   -270,   -49,    282,    250,    -56,
+    -405,   507,    27,     -1060,  1329,   -203,   -204,   1677,   -767,
+    -313,   1272,   -968,   717,    183,    -1652,  2157,   -75,    -1906,
+    2590,   -428,   -1614,  2564,   -1511,  -240,   1421,   -1911,  1420,
+    396,    -1397,  1691,   -694,   -1500,  1942,   -823,   -784,   841,
+    -635,   759,    -447,   351,    44,     -946,   227,    441,    -564,
+    155,    -719,   182,    509,    -320,   -300,   205,    -662,   726,
+    469,    -1240,  191,    664,    -269,   -152,   -18,    214,    -149,
+    -257,   347,    76,     -79,    -384,   874,    -387,   -269,   892,
+    -783,   537,    46,     27,     251,    -332,   133,    377,    -522,
+    232,    626,    -362,   -499,   1112,   -342,   -522,   362,    -187,
+    547,    -384,   -155,   517,    -551,   227,    651,    -825,   -88,
+    579,    -758,   -40,    456,    -774,   542,    -164,   -482,   968,
+    -1000,  -394,   1094,   -885,   431,    74,     -348,   403,    -959,
+    831,    -465,   -330,   762,    -717,   -645,   1342,   -499,   -416,
+    944,    -417,   -438,   737,    -368,   -42,    740,    -1234,  689,
+    29,     -106,   619,    -824,   -10,    1047,   -824,   146,    -59,
+    210,    163,    -43,    522,    -352,   213,    460,    -1049,  599,
+    308,    -843,   632,    223,    -504,   296,    530,    -931,   751,
+    -176,   -524,   379,    236,    -626,   66,     662,    -575,   191,
+    -175,   -619,   660,    -424,   -217,   704,    -498,   200,    62,
+    -543,   280,    91,     -378,   54,     168,    -554,   670,    -215,
+    -1097,  1805,   -1015,  -617,   1642,   -1560,  727,    61,     7,
+    -48,    -659,   1308,   -752,   -613,   914,    160,    -469,   164,
+    -167,   274,    326,    -667,   497,    333,    -757,   1252,   -481,
+    -1257,  2019,   -949,   -719,   1676,   -1078,  250,    323,    -1100,
+    1550,   145,    -1697,  972,    522,    -966,   374,    -365,   846,
+    -276,   -756,   629,    -278,   302,    -151,   -243,   -363,   841,
+    -7,     -1092,  476,    45,     201,    -378,   -456,   1113,   -926,
+    97,     178,    -240,   326,    -597,   472,    -10,    -190,   394,
+    -501,   -259,   307,    133,    240,    -433,   -192,   472,    -190,
+    12,     398,    -191,   -605,   1295,   -576,   -154,   474,    -661,
+    866,    -968,   172,    887,    -736,   36,     259,    -201,   265,
+    460,    -859,   622,    102,    -690,   776,    -80,    -745,   919,
+    140,    -750,   224,    134,    -236,   -196,   456,    409,    -1069,
+    600,    239,    -306,   -383,   541,    -213,   -323,   -121,   700,
+    -735,   179,    222,    -613,   653,    -711,   -81,    592,    -694,
+    117,    703,    -772,   -264,   644,    -117,   -422,   276,    64,
+    -355,   -430,   800,    -74,    -619,   1207,   -1057,  4,      960,
+    -1219,  977,    -78,    -1186,  1536,   267,    -1388,  1144,   -90,
+    -1052,  1889,   -1255,  -387,   1815,   -1763,  1037,   421,    -1003,
+    767,    -24,    -277,   -54,    759,    -285,   -1015,  1422,   -581,
+    -121,   547,    -687,   288,    440,    -626,   -623,   1261,   -248,
+    -1133,  1204,   -714,   382,    219,    -851,   240,    -161,   672,
+    -261,   -855,   1043,   -599,   111,    -362,   225,    641,    -913,
+    -122,   1075,   -1165,  432,    131,    -803,   978,    33,     -1291,
+    992,    224,    -1054,  789,    -121,   -215,   262,    -11,    89,
+    -174,   365,    -240,   114,    406,    -813,   291,    233,    158,
+    -377,   194,    216,    -477,   635,    -228,   -512,   599,    23,
+    -273,   71,     258,    10,     -155,   -198,   354,    61,     -749,
+    768,    -19,    -709,   596,    97,     -276,   164,    69,     -144,
+    -20,    529,    -897,   188,    480,    -703,   836,    -874,   259,
+    917,    -1044,  -7,     566,    -97,    -439,   256,    -466,   998,
+    -360,   -1134,  1619,   -762,   -752,   1446,   -707,   -177,   652,
+    -899,   579,    253,    -410,   146,    -262,   275,    353,    -610,
+    52,     671,    -862,   419,    -140,   273,    247,    -1062,  1005,
+    -175,   -497,   772,    -431,   -101,   450,    -598,   266,    428,
+    -842,   477,    -11,    -554,   642,    17,     -787,   544,    445,
+    -625,   -205,   796,    -222,   -733,   764,    -572,   423,    166,
+    -994,   931,    -228,   -303,   362,    -214,   104,    448,    -1091,
+    722,    570,    -1311,  773,    259,    -648,   477,    193,    -682,
+    302,    459,    -464,   -383,   1120,   -561,   -564,   1083,   -372,
+    -354,   864,    -586,   -200,   502,    -331,   27,     446,    -657,
+    281,    571,    -888,   502,    251,    -423,   116,    277,    -263,
+    118,    -170,   168,    367,    -723,   202,    438,    -793,   451,
+    -30,    -292,   202,    38,     -188,   -66,    221,    -90,    -105,
+    7,      346,    -578,   337,    247,    -371,   -14,    22,     36,
+    151,    -322,   -244,   692,    -556,   -5,     550,    -560,   200,
+    161,    -347,   191,    258,    -520,   441,    -212,   -215,   584,
+    -428,   -251,   213,    90,     -187,   109,    138,    -211,   -17,
+    191,    111,    -259,   161,    -141,   232,    -175,   0,      154,
+    -369,   539,    -171,   -438,   484,    43,     -375,   -37,    249,
+    196,    -328,   -106,   541,    -531,   103,    240,    -191,   186,
+    -363,   40,     585,    -573,   258,    170,    -593,   515,    -261,
+    -86,    407,    -339,   164,    -214,   -34,    464,    -377,   -206,
+    336,    -230,   239,    -85,    -69,    322,    -503,   322,    142,
+    -748,   867,    -160,   -753,   836,    -249,   -362,   750,    -374,
+    -222,   448,    -82,    -246,   399,    13,     -429,   441,    -47,
+    -127,   -29,    337,    -502,   318,    132,    -457,   498,    -145,
+    -91,    98,     208,    -179,   54,     62,     -260,   237,    96,
+    -161,   32,     -150,   93,     21,     -31,    74,     75,     -322,
+    164,    168,    -191,   119,    -121,   -66,    -195,   296,    -128,
+    -251,   381,    -56,    -338,   281,    -29,    -472,   664,    -301,
+    -275,   423,    -285,   -77,    258,    -82,    -139,   160,    -54,
+    -26,    27,     75,     -49,    -196,   305,    -131,   -187,   262,
+    -37,    -206,   65,     269,    -240,   -144,   261,    54,     -338,
+    355,    3,      -503,   535,    -253,   -210,   433,    -290,   -33,
+    381,    -546,   173,    252,    -364,   271,    -329,   166,    266,
+    -564,   507,    -32,    -648,   861,    -400,   -357,   819,    -519,
+    -74,    392,    -423,   426,    -306,   -93,    691,    -991,   537,
+    467,    -992,   614,    426,    -823,   491,    182,    -371,   174,
+    84,     -64,    98,     -96,    23,     182,    -69,    -211,   226,
+    18,     -134,   334,    -514,   352,    378,    -623,   363,    266,
+    -592,   493,    -46,    -369,   594,    -440,   -10,    295,    -368,
+    326,    -192,   -140,   306,    -305,   140,    198,    -396,   202,
+    154,    -341,   208,    -8,     -169,   -76,    106,    20,     -347,
+    233,    30,     -193,   117,    -9,     -165,   182,    -4,     -195,
+    96,     131,    -188,   -106,   166,    -71,    -99,    57,     4,
+    -31,    -131,   101,    63,     -199,   225,    -25,    -281,   342,
+    -247,   -170,   516,    -289,   -263,   422,    -158,   -148,   363,
+    -192,   -138,   122,    62,     -105,   7,      194,    -53,    -224,
+    83,     173,    -182,   20,     178,    -274,   182,    74,     -109,
+    -5,     319,    -303,   -72,    428,    -371,   50,     271,    -204,
+    17,     161,    -256,   169,    93,     -169,   94,     -89,    139,
+    80,     -199,   325,    -67,    -83,    202,    -154,   16,     202,
+    -325,   162,    61,     -93,    201,    -278,   236,    108,    -477,
+    594,    -145,   -370,   647,    -261,   -356,   669,    -369,   -181,
+    420,    -266,   -154,   159,    -25,    53,     -40,    -22,    68,
+    -203,   144,    -2,     -173,   88,     -3,     -62,    2,      75,
+    55,     -95,    -130,   219,    -142,   -191,   164,    -170,   44,
+    0,      -246,   249,    -27,    -413,   461,    27,     -490,   292,
+    19,     -145,   13,     99,     91,     -466,   209,    295,    -773,
+    465,    210,    -680,   410,    163,    -358,   399,    -201,   87,
+    23,     -212,   270,    -230,   86,     159,    -353,   381,    -73,
+    -456,   726,    -353,   -357,   754,    -367,   -344,   657,    -59,
+    -417,   432,    35,     -309,   153,    97,     -69,    89,     -101,
+    63,     107,    -127,   106,    112,    -26,    -236,   376,    43,
+    -479,   544,    -57,    -407,   447,    -148,   -103,   195,    -198,
+    80,     156,    -228,   35,     145,    -77,    -55,    130,    -33,
+    -190,   123,    41,     -170,   74,     114,    -241,   67,     192,
+    -195,   -76,    186,    -136,   -133,   213,    -105,   -110,   144,
+    -51,    -126,   154,    -59,    -124,   147,    -49,    -132,   82,
+    26,     -130,   63,     68,     -211,   97,     131,    -224,   59,
+    184,    -250,   59,     205,    -225,   -67,    163,    -135,   -24,
+    74,     -22,    -4,     -81,    21,     71,     -137,   71,     47,
+    -120,   71,     34,     -65,    138,    -6,     -116,   112,    -47,
+    -39,    20,     -75,    64,     -7,     2,      35,     52,     -61,
+    -29,    81,     -61,    -30,    195,    -91,    -136,   261,    -11,
+    -186,   162,    -86,    -35,    152,    -106,   -32,    126,    -4,
+    49,     33,     -9,     -11,    46,     111,    -132,   -3,     204,
+    -175,   -10,    281,    -146,   -94,    226,    -126,   -36,    58,
+    -14,    61,     -172,   48,     193,    -221,   83,     149,    -279,
+    195,    130,    -357,   226,    102,    -260,   191,    16,     -223,
+    124,    14,     -144,   90,     -31,    -81,    -66,    54,     103,
+    -181,   29,     174,    -281,   92,     81,     -226,   139,    -133,
+    -41,    167,    -147,   44,     27,     -132,   107,    -34,    -122,
+    105,    -54,    17,     52,     -131,   138,    33,     -206,   158,
+    43,     -80,    24,     10,     -27,    33,     43,     -71,    15,
+    71,     -42,    14,     18,     0,      -3,     -14,    -14,    58,
+    46,     -99,    122,    105,    -202,   125,    119,    -238,   112,
+    133,    -242,   113,    129,    -301,   52,     161,    -177,   82,
+    73,     -139,   46,     122,    -119,   22,     155,    -230,   23,
+    242,    -211,   -12,    182,    -184,   -57,    190,    -34,    -101,
+    58,     -20,    6,      103,    -61,    -78,    12,     18,     12,
+    86,     -71,    -27,    43,     -24,    8,      39,     -109,   21,
+    -4,     -44,    66,     13,     -59,    61,     -39,    35,     113,
+    -179,   19,     171,    -158,   14,     112,    -133,   26,     9,
+    -43,    -9,     6,      41,     -77,    22,     80,     -61,    -63,
+    65,     -32,    -32,    125,    -105,   -11,    114,    -120,   42,
+    42,     -92,    45,     -56,    -25,    131,    -83,    -24,    97,
+    -51,    -5,     67,     -69,    7,      41,     -27,    8,      3,
+    -10,    8,      -3,     -87,    -28,    122,    -33,    -58,    124,
+    -53,    -50,    67,     -115,   -17,    111,    -112,   -30,    101,
+    -24,    -13,    41,     3,      45,     -13,    -34,    23,     23,
+    -19,    13,     -49,    -49,    68,     -68,    -32,    91,     -58,
+    -18,    73,     -19,    -27,    17,     -33,    -35,    99,     -38,
+    -99,    78,     -31,    -62,    95,     -71,    -124,   184,    -15,
+    -146,   160,    -27,    -109,   140,    -25,    -63,    84,     -34,
+    -18,    58,     -68,    -16,    22,     -87,    86,     23,     -130,
+    61,     62,     -132,   51,     168,    -139,   35,     133,    -121,
+    50,     102,    -120,   40,     126,    -87,    -40,    119,    -14,
+    -59,    78,     11,     -68,    41,     24,     -25,    55,     -2,
+    15,     21,     -73,    56,     88,     -74,    -41,    4,      -10,
+    -4,     5,      7,      -39,    -3,     -4,     -39,    94,     52,
+    -135,   42,     90,     -86,    12,     21,     -55,    -70,    -37,
+    55,     -63,    -35,    50,     -100,   21,     84,     -151,   24,
+    87,     -94,    51,     2,      -58,    104,    -61,    -70,    60,
+    -25,    -42,    -31,    55,     35,     -129,   47,     69,     -65,
+    77,     2,      -60,    110,    -32,    -69,    84,     -54,    -26,
+    98,     -28,    -7,     49,     -49,    -19,    119,    -11,    -157,
+    20,     106,    29,     -8,     -38,    -30,    72,     30,     -3,
+    1,      -32,    -11,    -9,     52,     46,     -144,   -38,    86,
+    -31,    -9,     -42,    -75,    142,    34,     -64,    79,     -109,
+    -55,    195,    -69,    -80,    48,     -49,    62,     25,     -111,
+    -42,    52,     19,     -41,    1,      -16,    -33,    44,     30,
+    -21,    17,     -2,     -30,    111,    34,     -111,   83,     55,
+    -119,   66,     62,     -89,    63,     -39,    -143,   168,    21,
+    -158,   158,    32,     -132,   134,    -3,     -77,    88,     -45,
+    -18,    117,    -51,    -71,    10,     30,     35,     -27,    -63,
+    13,     34,     23,     -23,    19,     -4,     -92,    34,     74,
+    -69,    -15,    20,     -36,    56,     -36,    -96,    69,     -34,
+    -122,   32,     31,     -51,    -3,     -21,    4,      43,     -44,
+    6,      81,     -39,    -35,    26,     -38,    -24,    29,     -16,
+    -47,    -6,     19,     -7,     -9,     41,     32,     13,     -2,
+    -21,    3,      24,     49,     -3,     -66,    14,     95,     -7,
+    -52,    80,     68,     -72,    -14,    39,     2,      24,     -6,
+    -53,    86,     21,     -78,    67,     28,     -34,    16,     -23,
+    -1,     70,     -3,     -58,    45,     33,     -94,    -34,    62,
+    41,     -11,    -27,    27,     46,     14,     -33,    -12,    44,
+    -16,    -59,    6,      45,     -3,     -42,    2,      13,     19,
+    -1,     -71,    3,      42,     -36,    6,      17,     26,     5,
+    -46,    6,      -68,    -75,    86,     -20,    -90,    80,     4,
+    -86,    5,      2,      -33,    -15,    -2,     -8,     -18,    15,
+    -7,     -25,    27,     -28,    -88,    39,     -2,     -85,    58,
+    40,     -45,    3,      17,     0,      11,     -4,     -3,     84,
+    22,     -113,   8,      94,     10,     9,      28,     6,      -3,
+    5,      -2,     23,     23,     -1,     -40,    20,     48,     -40,
+    -21,    72,     7,      -40,    -1,     27,     16,     30,     31,
+    -16,    11,     9,      -71,    -7,     62,     21,     -61,    -19,
+    78,     -2,     -22,    67,     -42,    -12,    75,     -79,    47,
+    86,     -124,   -42,    21,     4,      23,     -32,    -7,     19,
+    1,      -13,    -46,    2,      32,     -43,    -7,     86,     -16,
+    -22,    46,     -61,    -35,    11,     -64,    -38,    17,     -12,
+    -27,    20,     41,     6,      -58,    -61,    58,     -51,    -77,
+    36,     -25,    19,     93,     -76,    1,      72,     -92,    15,
+    40,     -56,    65,     13,     -29,    82,     -9,     -21,    24,
+    -83,    -5,     4,      -63,    77,     80,     -58,    -6,     -19,
+    -43,    100,    5,      -36,    63,     33,     -26,    -48,    26,
+    -18,    -75,    34,     24,     -45,    -1,     6,      -35,    -24,
+    -23,    -22,    47,     -15,    -46,    31,     -40,    -41,    74,
+    -32,    -73,    59,     -51,    -26,    143,    -29,    -42,    93,
+    -44,    -21,    56,     -7,     55,     51,     -61,    74,     111,
+    -71,    35,     124,    -123,   -3,     62,     -79,    100,    49,
+    -122,   143,    79,     -137,   72,     30,     -82,    75,     -10,
+    -48,    35,     -23,    -25,    34,     0,      -54,    -6,     34,
+    -46,    -59,    -7,     -72,    -6,     70,     -41,    -39,    23,
+    -33,    11,     104,    -44,    -30,    54,     -69,    -20,    62,
+    -75,    1,      45,     -69,    1,      40,     -59,    -15,    18,
+    -16,    38,     -1,     -52,    8,      14,     -32,    11,     -15,
+    -58,    18,     -22,    -44,    69,     40,     -50,    -21,    1,
+    -35,    -3,     -5,     -20,    40,     36,     -41,    -36,    -43,
+    -11,    48,     -34,    -40,    51,     -10,    -9,     30,     10,
+    12,     51,     51,     -8,     -16,    32,     -6,     31,     24,
+    -38,    43,     18,     -15,    53,     -10,    -55,    9,      8,
+    -28,    21,     10,     -26,    21,     10,     -9,     5,      -29,
+    -13,    38,     -1,     -11,    49,     0,      -41,    10,     23,
+    -25,    -35,    -2,     -32,    -10,    58,     -6,     -18,    16,
+    -9,     4,      11,     17,     21,     21,     12,     -2,     49,
+    -16,    -128,   21,     75,     -32,    22,     34,     -59,    48,
+    75,     -69,    -11,    -2,     -65,    39,     57,     -54,    -79,
+    -11,    -20,    -13,    38,     4,      -9,     -22,    -22,    33,
+    -7,     -52,    10,     -10,    -19,    54,     47,     -21,    -35,
+    -6,     -4,     11,     8,      -28,    1,      8,      -4,     30,
+    1,      -22,    26,     -7,     -24,    56,     25,     -45,    13,
+    24,     -32,    13,     22,     -46,    -2,     15,     -39,    28,
+    32,     -69,    0,      27,     -69,    0,      39,     -40,    28,
+    55,     -27,    -13,    0,      -14,    37,     25,     -25,    34,
+    -3,     -69,    26,     39,     -41,    -6,     29,     -7,     5,
+    66,     41,     -27,    -17,    6,      -14,    -21,    0,      29,
+    -9,     -26,    32,     -5,     -34,    60,     15,     -60,    20,
+    13,     11,     43,     -48,    -15,    88,     -13,    -55,    26,
+    -32,    -46,    35,     14,     -37,    -11,    12,     -20,    11,
+    9,      -64,    -16,    17,     5,      38,     7,      -30,    -9,
+    -49,    -11,    52,     -15,    -38,    -27,    -12,    36,     53,
+    1,      -37,    -17,    -12,    0,      31,     1,      13,     40,
+    -15,    2,      47,     -15,    -17,    28,     -2,     -4,     25,
+    -6,     -12,    2,      -17,    -9,     5,      -15,    17,     21,
+    -28,    0,      15,     -43,    -63,    -6,     -14,    -8,     37,
+    -34,    -40,    30,     -12,    -14,    37,     -13,    -16,    26,
+    -15,    -2,     13,     -37,    -13,    32,     13,     -8,     -2,
+    -12,    -8,     9,      9,      -3,     4,      13,     34,     -2,
+    -22,    40,     19,     29,     25,     -48,    -17,    23,     17,
+    7,      3,      0,      12,     37,     -1,     -25,    30,     41,
+    -7,     7,      29,     -31,    -31,    -23,    -27,    5,      2,
+    -18,    -2,     22,     9,      -6,     5,      -7,     -24,    9,
+    0,      -28,    19,     61,     -11,    -45,    21,     -28,    -65,
+    28,     33,     -44,    -27,    -6,     -26,    -8,     4,      5,
+    9,      -10,    -46,    -20,    20,     -7,     -7,     -33,    -26,
+    50,     9,      -65,    -22,    -3,     -20,    15,     21,     20,
+    24,     -16,    -27,    -13,    14,     21,     -38,    -48,    9,
+    35,     28,     21,     3,      -31,    -8,     57,     32,     -35,
+    -22,    20,     14,     12,     28,     39,     0,      -18,    44,
+    -2,     -17,    53,     0,      -27,    33,     43,     5,      -10,
+    25,     47,     -3,     -4,     36,     15,     -12,    -3,     29,
+    41,     23,     23,     -8,     -32,    15,     37,     0,      3,
+    22,     31,     1,      -20,    27,     2,      -50,    0,      33,
+    16,     -16,    -17,    18,     -26,    -34,    31,     -27,    -84,
+    -33,    4,      -5,     -22,    -17,    -28,    -66,    -24,    8,
+    -16,    -25,    -51,    -13,    45,     -11,    -49,    -26,    -49,
+    -38,    21,     10,     -52,    -58,    -19,    -4,     9,      -31,
+    -29,    55,     2,      -45,    29,     10,     -22,    49,     33,
+    -27,    -19,    -5,     30,     47,     11,     -11,    -2,     8,
+    5,      17,     8,      3,      57,     63,     28,     24,     11,
+    2,      14,     22,     7,      7,      2,      23,     33,     -2,
+    -8,     14,     7,      20,     57,     32,     -5,     12,     23,
+    10,     17,     26,     -18,    -72,    -6,     74,     61,     13,
+    -17,    -21,    -7,     29,     45,     5,      -52,    -49,    1,
+    10,     35,     40,     -46,    -66,    7,      31,     -27,    -44,
+    -12,    -41,    -22,    32,     -12,    -32,    -3,     -17,    -22,
+    -22,    -31,    -30,    -23,    -13,    3,      0,      -21,    -19,
+    -7,     -17,    -9,     18,     -40,    -64,    1,      4,      -4,
+    8,      -17,    -28,    -1,     9,      -7,     -9,     27,     6,
+    -63,    -32,    52,     25,     -46,    -23,    -6,     -11,    35,
+    29,     -50,    -44,    17,     -6,     -12,    53,     28,     -17,
+    -9,     28,     34,     -20,    -18,    22,     43,     28,     -6,
+    8,      14,     19,     28,     14,     27,     26,     12,     76,
+    66,     -18,    -2,     18,     -12,    -1,     -2,     -1,     51,
+    30,     -18,    5,      14,     -12,    2,      13,     -25,    -9,
+    32,     7,      -5,     15,     -12,    -33,    -18,    -13,    6,
+    0,      -25,    -12,    1,      -17,    0,      13,     -24,    -27,
+    4,      35,     14,     -22,    5,      13,     -18,    -30,    -10,
+    -7,     -7,     31,     23,     -27,    -26,    9,      47,     6,
+    -50,    -11,    19,     1,      11,     12,     -19,    -43,    -18,
+    10,     -6,     -3,     12,     2,      -12,    -16,    10,     9,
+    -25,    -21,    -10,    -13,    0,      8,      -1,     -9,     10,
+    4,      -34,    14,     46,     5,      18,     24,     -15,    -7,
+    20,     -1,     -13,    7,      11,     14,     11,     -2,     8,
+    27,     10,     -1,     13,     -2,     -7,     48,     44,     -15,
+    -16,    -6,     3,      7,      -35,    -25,    8,      -31,    -16,
+    30,     36,     22,     -13,    -21,    -10,    8,      2,      -58,
+    -37,    32,     25,     -1,     -25,    -21,    3,      3,      -6,
+    -11,    -3,     2,      4,      34,     22,     -25,    -19,    0,
+    -6,     -10,    -8,     -35,    -32,    8,      -3,     -20,    -11,
+    -6,     3,      8,      -8,     3,      25,     23,     -7,     -35,
+    -15,    8,      -20,    -6,     15,     -44,    -29,    19,     -5,
+    -1,     18,     28,     6,      -21,    9,      11,     -20,    -10,
+    18,     22,     6,      -2,     12,     6,      23,     34,     -20,
+    -19,    1,      -10,    34,     41,     13,     6,      3,      22,
+    11,     -4,     4,      -12,    -8,     17,     18,     12,     -1,
+    5,      9,      -6,     -2,     4,      1,      3,      2,      -6,
+    -32,    -25,    9,      18,     27,     -4,     -54,    -29,    2,
+    -3,     -18,    -38,    -28,    -10,    9,      20,     5,      -9,
+    -15,    -3,     2,      -14,    -15,    -6,     5,      10,     6,
+    3,      -11,    -9,     -5,     -20,    -13,    8,      3,      -14,
+    6,      20,     -15,    -21,    9,      19,     21,     12,     -4,
+    -21,    -17,    16,     27,     -4,     -28,    -2,     26,     9,
+    -12,    -16,    -28,    -28,    -4,     4,      -15,    -9,     3,
+    -10,    -16,    2,      17,     -10,    -26,    3,      16,     26,
+    17,     -12,    -9,     2,      -2,     -5,     -11,    5,      28,
+    1,      -14,    13,     14,     5,      18,     6,      -17,    -5,
+    7,      2,      -3,     11,     10,     -1,     50,     36,     -28,
+    21,     39,     -9,     -6,     2,      10,     36,     20,     -2,
+    -3,     -11,    -10,    -6,     -5,     -4,     -8,     2,      17,
+    1,      -13,    11,     -13,    -36,    11,     14,     -19,    -6,
+    3,      0,      20,     -5,     -24,    12,     7,      -11,    2,
+    -15,    -28,    -1,     6,      -14,    -31,    -39,    -19,    19,
+    37,     3,      -32,    -27,    -6,     13,     31,     15,     -41,
+    -41,    25,     35,     -3,     -16,    -25,    -19,    -10,    -3,
+    19,     10,     -4,     7,      -4,     -19,    -12,    -13,    -9,
+    6,      2,      -12,    -6,     12,     6,      -1,     -5,     -19,
+    -7,     7,      40,     56,     -3,     -13,    21,     24,     7,
+    -11,    -9,     -3,     24,     28,     -10,    1,      12,     21,
+    24,     -16,    -15,    4,      -7,     -2,     19,     13,     -11,
+    -7,     -8,     15,     41,     5,      -16,    -18,    -11,    26,
+    26,     -5,     -12,    -14,    -6,     10,     8,      -8,     -16,
+    -16,    -3,     10,     1,      -3,     -3,     -2,     -15,    -18,
+    6,      -4,     -4,     21,     4,      -2,     15,     13,     0,
+    -2,     12,     7,      -15,    -9,     1,      -2,     2,      -1,
+    -9,     -15,    -17,    -14,    -10,    1,      -4,     -16,    -17,
+    -1,     18,     8,      1,      22,     11,     -19,    -10,    4,
+    -23,    -29,    0,      -2,     -14,    -6,     13,     7,      -23,
+    -13,    10,     9,      11,     10,     4,      -4,     -4,     1,
+    6,      14,     9,      2,      0,      2,      6,      4,      -9,
+    -18,    -8,     8,      18,     8,      13,     9,      -27,    -22,
+    -10,    -24,    -9,     17,     11,     2,      9,      3,      -13,
+    -10,    -1,     -7,     -1,     10,     -4,     1,      16,     12,
+    -6,     -14,    -2,     -5,     -1,     0,      -1,     6,      -9,
+    -3,     12,     4,      1,      -2,     2,      17,     24,     22,
+    9,      8,      21,     14,     -2,     -2,     4,      -1,     -7,
+    -7,     -6,     -1,     -6,     17,     30,     -7,     -10,    -3,
+    -19,    -18,    2,      21,     4,      -20,    -6,     -1,     -18,
+    -14,    -6,     -7,     -1,     6,      10,     8,      -5,     0,
+    10,     -22,    -40,    -22,    4,      34,     16,     -19,    -16,
+    -12,    -17,    -16,    -17,    -29,    -28,    -4,     10,     16,
+    22,     13,     4,      -1,     -5,     16,     15,     -11,    -6,
+    9,      3,      -14,    -22,    -19,    -12,    5,      -5,     -15,
+    3,      9,      27,     17,     -4,     8,      -2,     1,      16,
+    11,     9,      9,      8,      -14,    -16,    7,      -5,     -15,
+    -11,    -5,     19,     25,     25,     43,     21,     -9,     -9,
+    -19,    -10,    14,     -11,    -19,    8,      3,      1,      11,
+    -1,     -24,    -20,    -1,     2,      7,      24,     22,     11,
+    8,      6,      -2,     -11,    -3,     -2,     -4,     0,      -7,
+    0,      6,      -1,     -16,    -35,    -8,     8,      -11,    -6,
+    6,      18,     16,     7,      12,     5,      -2,     -3,     -10,
+    -21,    -27,    -10,    -3,     -3,     8,      0,      -9,     -10,
+    -3,     0,      -5,     6,      9,      19,     23,     8,      -5,
+    -19,    -16,    -5,     -6,     -27,    -22,    1,      6,      8,
+    2,      -9,     -13,    -15,    -18,    -13,    4,      25,     29,
+    26,     -2,     -22,    1,      8,      1,      -6,     -6,     -7,
+    -20,    0,      13,     -14,    -24,    -24,    -21,    2,      14,
+    16,     23,     15,     10,     10,     5,      0,      -26,    -32,
+    3,      19,     5,      -8,     -7,     -8,     -3,     17,     27,
+    -7,     -28,    10,     32,     10,     1,      10,     3,      -4,
+    22,     24,     -31,    -40,    0,      6,      5,      17,     17,
+    1,      10,     30,     8,      -12,    -6,     9,      6,      -12,
+    -5,     1,      -4,     6,      11,     0,      -9,     -4,     -3,
+    -4,     -3,     2,      0,      -2,     -9,     -27,    -23,    2,
+    13,     -6,     -9,     -3,     -12,    -2,     10,     6,      -7,
+    -19,    -31,    -13,    16,     11,     -3,     -13,    -15,    0,
+    7,      -3,     -7,     -1,     -4,     7,      15,     0,      -12,
+    -8,     -1,     -7,     -12,    -21,    -17,    5,      30,     25,
+    -6,     -6,     0,      -12,    -8,     2,      13,     11,     1,
+    5,      4,      4,      10,     -1,     -20,    -12,    -4,     3,
+    15,     11,     -7,     -24,    -4,     8,      -2,     -14,    -25,
+    -17,    7,      21,     14,     1,      0,      12,     17,     13,
+    6,      1,      6,      14,     11,     -10,    -21,    -12,    -4,
+    3,      -2,     -21,    -24,    -2,     12,     14,     17,     4,
+    -2,     11,     11,     11,     1,      -34,    -32,    -5,     10,
+    7,      -11,    -12,    6,      7,      -4,     -10,    -15,    -5,
+    17,     21,     0,      -15,    -15,    -1,     5,      -18,    -18,
+    -10,    -9,     24,     27,     -9,     -14,    0,      9,      25,
+    22,     1,      -7,     -2,     16,     13,     -14,    -10,    7,
+    0,      2,      15,     2,      -9,     5,      10,     -5,     -3,
+    10,     3,      0,      15,     15,     -1,     -3,     8,      6,
+    -7,     -7,     2,      0,      -4,     5,      -8,     -37,    -28,
+    -1,     8,      6,      10,     -1,     -12,    12,     28,     8,
+    -17,    -16,    -15,    -17,    1,      6,      -4,     -8,     -4,
+    -15,    -15,    6,      -9,     -15,    10,     9,      -13,    -8,
+    5,      -2,     -10,    5,      12,     -27,    -33,    9,      8,
+    -16,    -3,     16,     -3,     -7,     22,     22,     10,     5,
+    -11,    -16,    -4,     9,      12,     6,      -3,     2,      2,
+    -1,     4,      -7,     -8,     1,      8,      19,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..33aeea516fb8c7fcb080b3b971bf5d69b81b9c4c
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was created from the PCM data in a WAV file held in v2 of the
+// Speech Commands test dataset, at the path:
+// speech_commands_test_set_v0.02/yes/f2e59fea_nohash_1.wav
+// This should contain all 16,000 samples from the one-second file.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_1000MS_SAMPLE_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_1000MS_SAMPLE_DATA_H_
+
+#include <cstdint>
+
+extern const int g_yes_1000ms_sample_data_size;
+extern const int16_t g_yes_1000ms_sample_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_1000MS_SAMPLE_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/testing/micro_test.h b/tensorflow/lite/experimental/micro/testing/micro_test.h
index 2f20dd5ac77dfd3f304c7cc93be0b865a0c2f0cb..32e9a57f76ecc055c67c0ede8d1c83550c602aab 100644
--- a/tensorflow/lite/experimental/micro/testing/micro_test.h
+++ b/tensorflow/lite/experimental/micro/testing/micro_test.h
@@ -107,13 +107,13 @@ extern tflite::ErrorReporter* reporter;
     }                                                                          \
   } while (false)
 
-#define TF_LITE_MICRO_EXPECT_EQ(x, y)                                         \
-  do {                                                                        \
-    if ((x) != (y)) {                                                         \
-      micro_test::reporter->Report(#x " == " #y " failed at %s:%d", __FILE__, \
-                                   __LINE__);                                 \
-      micro_test::did_test_fail = true;                                       \
-    }                                                                         \
+#define TF_LITE_MICRO_EXPECT_EQ(x, y)                                          \
+  do {                                                                         \
+    if ((x) != (y)) {                                                          \
+      micro_test::reporter->Report(#x " == " #y " failed at %s:%d (%d vs %d)", \
+                                   __FILE__, __LINE__, (x), (y));              \
+      micro_test::did_test_fail = true;                                        \
+    }                                                                          \
   } while (false)
 
 #define TF_LITE_MICRO_EXPECT_NE(x, y)                                         \
diff --git a/tensorflow/lite/experimental/micro/testing/test_ecm3531_binary.sh b/tensorflow/lite/experimental/micro/testing/test_ecm3531_binary.sh
new file mode 100755
index 0000000000000000000000000000000000000000..1647cf82a276d7c1725c7c3334693e0e1b7e057c
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/testing/test_ecm3531_binary.sh
@@ -0,0 +1,16 @@
+#!/bin/bash -e
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
diff --git a/tensorflow/lite/experimental/micro/tools/ci_build/ci_build_micro_projects.sh b/tensorflow/lite/experimental/micro/tools/ci_build/ci_build_micro_projects.sh
new file mode 100755
index 0000000000000000000000000000000000000000..dcec7269bdc95ab57204f3b4cbc17f9d3cacadc0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/ci_build/ci_build_micro_projects.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Creates the project file distributions for the TensorFlow Lite Micro test and
+# example targets aimed at embedded platforms.
+#
+# Usage: ci_build_micro_projects.sh <TARGET OS> <TAGS>
+#
+# For example:
+# ci_build_micro_projects.sh mbed "CMSIS disco_f746ng"
+
+set -e
+set -x
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR=${SCRIPT_DIR}/../../../../../..
+cd ${ROOT_DIR}
+pwd
+
+tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile \
+  TARGET=${1} \
+  TAGS="${2}" \
+  generate_projects
diff --git a/tensorflow/lite/experimental/micro/tools/make/Makefile b/tensorflow/lite/experimental/micro/tools/make/Makefile
index 741d50eb872f09c03612766c1ac17d6734cb65ec..1179b2850675decb0b18dadbf4e55ab667defb3b 100644
--- a/tensorflow/lite/experimental/micro/tools/make/Makefile
+++ b/tensorflow/lite/experimental/micro/tools/make/Makefile
@@ -30,13 +30,22 @@ TARGET_ARCH := $(HOST_ARCH)
 # STM32F746NG board, using the CMSIS library's implementations where possible.
 ALL_TAGS := $(TAGS) $(TARGET)
 
+# This is obviously horrible.  We need to generate these 3 versions of the
+# include directories from one source.
 INCLUDES := \
 -I. \
 -I$(MAKEFILE_DIR)/downloads/ \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/flatbuffers/include
 
-# These are the include paths added to any generated project file.
+# Same list of paths, but now relative to the generated project files.
+GENERATED_PROJECT_INCLUDES := \
+-I. \
+-I./third_party/gemmlowp \
+-I./third_party/flatbuffers/include
+
+# Same list of paths, but now in the format the generate_keil_project.py
+# script expects them.
 PROJECT_INCLUDES := \
 . \
 third_party/gemmlowp \
@@ -117,12 +126,20 @@ third_party/flatbuffers/LICENSE.txt
 
 MAKE_PROJECT_FILES := \
   README_MAKE.md \
-  Makefile
+  Makefile \
+  .vscode/tasks.json
 
 MBED_PROJECT_FILES := \
   README_MBED.md \
   mbed-os.lib \
-  mbed_app.json
+  mbed_app.json \
+  .vscode/tasks.json
+
+KEIL_PROJECT_FILES := \
+  README_KEIL.md \
+  keil_project.uvprojx
+
+ALL_PROJECT_TARGETS :=
 
 KEIL_PROJECT_FILES := \
   README_KEIL.md \
@@ -200,12 +217,22 @@ $(BINDIR)%_test : $(OBJDIR)%_test.o $(MICROLITE_LIB_PATH)
 $(BINDIR)%.test_target: $(BINDIR)%_test
 	$(TEST_SCRIPT) $< '~~~ALL TESTS PASSED~~~'
 
+# snease: Add %.bin rule here since BINDIR is now defined
+# These are microcontroller-specific rules for converting the ELF output
+# of the linker into a binary image that can be loaded directly.
+OBJCOPY := $(TARGET_TOOLCHAIN_PREFIX)objcopy
+$(BINDIR)%.bin: $(BINDIR)%
+	@mkdir -p $(dir $@)
+	$(OBJCOPY) $< $@ -O binary
+
 # Generate standalone makefile projects for all of the test targets.
 $(foreach TEST_TARGET,$(MICROLITE_TEST_SRCS),\
 $(eval $(call microlite_test,$(notdir $(basename $(TEST_TARGET))),$(TEST_TARGET))))
 
 test: $(MICROLITE_TEST_TARGETS)
 
+generate_projects: $(ALL_PROJECT_TARGETS)
+
 # Gets rid of all generated files.
 clean:
 	rm -rf $(MAKEFILE_DIR)/gen
diff --git a/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh b/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
index 82c15e32f6572f36588945431918cf75299d3a64..639f002c3bff085052bd5611ed319e05cf50643d 100755
--- a/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+++ b/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
@@ -35,9 +35,11 @@ CMSIS_URL="https://github.com/ARM-software/CMSIS_5/archive/5.4.0.zip"
 STM32_BARE_LIB_URL="https://github.com/google/stm32_bare_lib/archive/c07d611fb0af58450c5a3e0ab4d52b47f99bc82d.zip"
 SIFIVE_FE310_LIB_URL="https://github.com/sifive/freedom-e-sdk/archive/baeeb8fd497a99b3c141d7494309ec2e64f19bdf.zip"
 RISCV_TOOLCHAIN_URL="https://static.dev.sifive.com/dev-tools/riscv64-unknown-elf-gcc-20181030-x86_64-linux-ubuntu14.tar.gz"
+AM_SDK_URL="http://s3.asia.ambiqmicro.com/downloads/AmbiqSuite-Rel2.0.0.zip"
 AP3_URL="https://github.com/AmbiqMicro/TFLiteMicro_Apollo3/archive/dfbcef9a57276c087d95aab7cb234f1d4c9eaaba.zip"
 CUST_CMSIS_URL="https://github.com/AmbiqMicro/TFLiteMicro_CustCMSIS/archive/8f63966c5692e6a3a83956efd2e4aed77c4c9949.zip"
 GCC_EMBEDDED_URL="https://developer.arm.com/-/media/Files/downloads/gnu-rm/7-2018q2/gcc-arm-none-eabi-7-2018-q2-update-linux.tar.bz2"
+KISSFFT_URL="http://downloads.sourceforge.net/project/kissfft/kissfft/v1_3_0/kiss_fft130.zip"
 
 download_and_extract() {
   local usage="Usage: download_and_extract URL DIR"
@@ -72,37 +74,54 @@ download_and_extract() {
   find "${dir}" -type f -name '*BUILD' -delete
 }
 
-patch_apollo3_sdk() {
-  local ap3_dir="${1}"
-  if [ ! -f ${ap3_dir}/VERSION.txt ]; then
-    echo "Could not find ${ap3_dir}, skipping Apollo3 SDK";
+patch_am_sdk() {
+  local am_dir="${1}"
+  if [ ! -f ${am_dir}/VERSION.txt ]; then
+    echo "Could not find ${am_dir}, skipping AmbiqMicro SDK patch";
     return;
   fi
-  local src_dir=${ap3_dir}/boards/apollo3_evb/examples/hello_world/gcc
-  local dest_dir=${ap3_dir}/boards/apollo3_evb/examples/hello_world/gcc_patched
+
+  local src_dir=${am_dir}/boards/apollo3_evb/examples/hello_world/gcc
+  local dest_dir=${am_dir}/boards/apollo3_evb/examples/hello_world/gcc_patched
+
   rm -rf ${dest_dir}
   mkdir ${dest_dir}
+
   cp "${src_dir}/startup_gcc.c" "${dest_dir}/startup_gcc.c"
   cp "${src_dir}/hello_world.ld" "${dest_dir}/apollo3evb.ld"
-  sed -i -e '131s/1024/1024\*20/g' "${dest_dir}/startup_gcc.c"
+
+  sed -i -e '114s/1024/1024\*20/g' "${dest_dir}/startup_gcc.c"
   sed -i -e 's/main/_main/g' "${dest_dir}/startup_gcc.c"
+
   sed -i -e '3s/hello_world.ld/apollo3evb.ld/g' "${dest_dir}/apollo3evb.ld"
   sed -i -e '3s/startup_gnu/startup_gcc/g' "${dest_dir}/apollo3evb.ld"
-  sed -i -e '6s/am_reset_isr/Reset_Handler/g' "${dest_dir}/apollo3evb.ld"
   sed -i -e '22s/\*(.text\*)/\*(.text\*)\n\n\t\/\* These are the C++ global constructors.  Stick them all here and\n\t \* then walk through the array in main() calling them all.\n\t \*\/\n\t_init_array_start = .;\n\tKEEP (\*(SORT(.init_array\*)))\n\t_init_array_end = .;\n\n\t\/\* XXX Currently not doing anything for global destructors. \*\/\n/g' "${dest_dir}/apollo3evb.ld"
   sed -i -e "70s/} > SRAM/} > SRAM\n    \/\* Add this to satisfy reference to symbol 'end' from libnosys.a(sbrk.o)\n     \* to denote the HEAP start.\n     \*\/\n   end = .;/g" "${dest_dir}/apollo3evb.ld"
+
   echo "Finished preparing Apollo3 files"
 }
 
+patch_kissfft() {
+  sed -i -E "s@#ifdef FIXED_POINT@// Patched automatically by download_dependencies.sh so default is 16 bit.\n#ifndef FIXED_POINT\n#define FIXED_POINT (16)\n#endif\n// End patch.\n\n#ifdef FIXED_POINT@g" tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/kiss_fft.h
+  sed -i -E "s@#define KISS_FFT_MALLOC malloc@#define KISS_FFT_MALLOC(X) (void*)(0) /* Patched. */@g" tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/kiss_fft.h
+  sed -i -E "s@#define KISS_FFT_FREE free@#define KISS_FFT_FREE(X) /* Patched. */@g" tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/kiss_fft.h
+  sed -i -E "s@(fprintf.*\);)@/* \1 */@g" tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/tools/kiss_fftr.c
+  sed -i -E "s@(exit.*\);)@return; /* \1 */@g" tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/tools/kiss_fftr.c
+  echo "Finished patching kissfft"
+}
+
 download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
 download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
 download_and_extract "${CMSIS_URL}" "${DOWNLOADS_DIR}/cmsis"
 download_and_extract "${STM32_BARE_LIB_URL}" "${DOWNLOADS_DIR}/stm32_bare_lib"
 download_and_extract "${SIFIVE_FE310_LIB_URL}" "${DOWNLOADS_DIR}/sifive_fe310_lib"
 download_and_extract "${RISCV_TOOLCHAIN_URL}" "${DOWNLOADS_DIR}/riscv_toolchain"
+download_and_extract "${AM_SDK_URL}" "${DOWNLOADS_DIR}/AmbiqSuite-Rel2.0.0"
+patch_am_sdk "${DOWNLOADS_DIR}/AmbiqSuite-Rel2.0.0"
 download_and_extract "${AP3_URL}" "${DOWNLOADS_DIR}/apollo3_ext"
-patch_apollo3_sdk "${DOWNLOADS_DIR}/Apollo3-SDK-2018.08.13"
 download_and_extract "${CUST_CMSIS_URL}" "${DOWNLOADS_DIR}/CMSIS_ext"
 download_and_extract "${GCC_EMBEDDED_URL}" "${DOWNLOADS_DIR}/gcc_embedded"
+download_and_extract "${KISSFFT_URL}" "${DOWNLOADS_DIR}/kissfft"
+patch_kissfft "${DOWNLOADS_DIR}/kissfft"
 
 echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc b/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
index 4d2465b5e98e6da3777fbe0f45a5f5e2c0d524eb..89f473b3109816d0326c28b5965f00f167f64476 100644
--- a/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
@@ -52,6 +52,7 @@ specialize = $(call specialize_on_tags,$(1),$(strip $(call reverse,$(ALL_TAGS)))
 # 5 - List of C/C++ header files needed to build the target.
 # 6 - Linker flags required.
 # 7 - C++ compilation flags needed.
+# 8 - C compilation flags needed.
 # Calling eval on the output will create a <Name>_makefile target that you
 # can invoke to create the standalone project.
 define generate_project
@@ -68,7 +69,8 @@ $(PRJDIR)$(3)/$(1)/%: tensorflow/lite/experimental/micro/tools/make/templates/%.
 	sed -E 's#\%\{SRCS\}\%#$(4)#g' $$< | \
 	sed -E 's#\%\{EXECUTABLE\}\%#$(3)#g' | \
 	sed -E 's#\%\{LINKER_FLAGS\}\%#$(6)#g' | \
-	sed -E 's#\%\{CXX_FLAGS\}\%#$(7)#g' > $$@
+	sed -E 's#\%\{CXX_FLAGS\}\%#$(7)#g' | \
+	sed -E 's#\%\{CC_FLAGS\}\%#$(8)#g' > $$@
 
 $(PRJDIR)$(3)/$(1)/keil_project.uvprojx: tensorflow/lite/experimental/micro/tools/make/templates/keil_project.uvprojx.tpl
 	@mkdir -p $$(dir $$@)
@@ -76,7 +78,13 @@ $(PRJDIR)$(3)/$(1)/keil_project.uvprojx: tensorflow/lite/experimental/micro/tool
         --input_template=$$< --output_file=$$@ --executable=$(3) \
         --srcs="$(4)" --hdrs="$(5)" --include_paths="$$(PROJECT_INCLUDES)"
 
+$(PRJDIR)$(3)/$(1)/.vscode/tasks.json : tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.$(1).tpl
+	@mkdir -p $$(dir $$@)
+	cp $$< $$@
+
 generate_$(3)_$(1)_project: $(addprefix $(PRJDIR)$(3)/$(1)/, $(4) $(5) $(2))
+
+ALL_PROJECT_TARGETS += generate_$(3)_$(1)_project
 endef
 
 # Specialized version of generate_project for TF Lite Micro test targets that
@@ -89,9 +97,9 @@ endef
 # Calling eval on the output will create targets that you can invoke to
 # generate the standalone project.
 define generate_microlite_projects
-$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS))
-$(call generate_project,mbed,$(MBED_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS))
-$(call generate_project,keil,$(KEIL_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS))
+$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES))
+$(call generate_project,mbed,$(MBED_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS),$(CCFLAGS))
+$(call generate_project,keil,$(KEIL_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS),$(CCFLAGS))
 endef
 
 
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
index 6ed402a623188a7c39a007a1cfd7dbc67b775103..fa9dc9c99b82d04d7d28af85325789719da9397a 100644
--- a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
@@ -5,9 +5,9 @@ ifeq ($(TARGET), apollo3evb)
   TARGET_TOOLCHAIN_PREFIX := arm-none-eabi-
   # Download the Ambiq Apollo3 SDK and set this variable to find the header
   # files:
-  APOLLO3_SDK := $(MAKEFILE_DIR)/downloads/Apollo3-SDK-2018.08.13
+  APOLLO3_SDK := $(MAKEFILE_DIR)/downloads/AmbiqSuite-Rel2.0.0
   # Need a pointer to the GNU ARM toolchain for crtbegin.o for the fp functions
-  # with the softfp interfaces.
+  # with the hard interfaces.
   GCC_ARM := $(MAKEFILE_DIR)/downloads/gcc_embedded/
 
   PLATFORM_FLAGS = \
@@ -31,7 +31,7 @@ ifeq ($(TARGET), apollo3evb)
     -mcpu=cortex-m4 \
     -mthumb \
     -mfpu=fpv4-sp-d16 \
-    -mfloat-abi=softfp \
+    -mfloat-abi=hard \
     -std=gnu++11 \
     -Wvla \
     -Wall \
@@ -49,7 +49,7 @@ ifeq ($(TARGET), apollo3evb)
   CXXFLAGS += $(PLATFORM_FLAGS)
   CCFLAGS += $(PLATFORM_FLAGS)
   LDFLAGS += \
-    -mthumb -mcpu=cortex-m4 -mfpu=fpv4-sp-d16 -mfloat-abi=softfp \
+    -mthumb -mcpu=cortex-m4 -mfpu=fpv4-sp-d16 -mfloat-abi=hard \
     -nostartfiles -static \
     -Wl,--gc-sections -Wl,--entry,Reset_Handler \
     -Wl,--start-group -lm -lc -lgcc -Wl,--end-group \
@@ -61,7 +61,7 @@ ifeq ($(TARGET), apollo3evb)
   MICROLITE_LIBS := \
     $(APOLLO3_SDK)/boards/apollo3_evb/bsp/gcc/bin/libam_bsp.a \
     $(APOLLO3_SDK)/mcu/apollo3/hal/gcc/bin/libam_hal.a \
-    $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/softfp/crtbegin.o \
+    $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/hard/crtbegin.o \
     -lm
   INCLUDES += \
     -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.md b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3e339fe635d4af2e9e884d0c3bdb56d9d210e9ad
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.md
@@ -0,0 +1,9 @@
+Compiling instructions here
+https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro
+
+CONTACT INFORMATION:
+
+Contact info@etacompute.com for more information on obtaining the Eta Compute
+SDK and evalution board.
+
+www.etacompute.com
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.txt b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.txt
deleted file mode 100644
index 3cb74a72437be8017527c0ea05a1b82eb1a4ac9e..0000000000000000000000000000000000000000
--- a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-Running The Micro Speech Example On Eta Compute's ECM3531EVB
-
-This code will enable you to compile and execute the Tensorflow Lite Micro Speech Example on Eta Computes's low power ECM3531 chip.
-
-
-GETTING STARTED:
-
-1. Download the Tensorflow code from Github and follow instructions there to download other dependencies.  
-
-2. Download the Eta Compute SDK, version 0.0.17.
-
-3. Install the Arm compiler arm-none-eabi-gcc, version = arm-none-eabi-gcc (GNU Tools for Arm Embedded Processors 7-2018-q2-update) 7.3.1 20180622 (release) [ARM/embedded-7-branch revision 261907]
-
-4. Edit the file   tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc  so that the variable ETA_SDK points to the location where the Eta Compute SDK is installed, and the variable GCC_ARM points to the Arm compiler.
-
-5. Compile the code with the command   "make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=ecm3531 test".  This will create the executable tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin/micro_speech_test.
-
-6. Connect the board to the host computer, start PuTTY (Connection type = Serial, Speed = 11520, Data bits = 8, Stop bits = 1,  Parity = None), and load the executable with ocd.  A sample script for loading the image is provided in tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program.  
-
-The following  will be printed on the Uart:
-
-Testing TestInvoke
-Ran successfully
-
-/ tests passed
-~~~ALL TESTS PASSED~~~
-
-
-
-CONTACT INFORMATION:
-
-Contact info@etacompute.com  for more information on obtaining the Eta Compute SDK and evalution board.
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds
index af34f988f2d04a0c1c87f20d6058df560db7e2c5..383b7f924408b484c8ee2ada5c4d6ec66edb059a 100644
--- a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds
@@ -55,7 +55,7 @@ SECTIONS
     {
         _datax = .;
         KEEP(*(.mainStack))
-        . += 12288;
+        . += 16384;
         _edatax = .;
         _stack_top = .;
         . += 4;
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531_flash.lds b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531_flash.lds
new file mode 100644
index 0000000000000000000000000000000000000000..9cbbea3569ba05b8fc9269ff6c5500fb386c03a3
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531_flash.lds
@@ -0,0 +1,85 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+
+/*
+ * linker script for use with ECM3531 chip.
+ * .text  and .ro map to FLASH all else to SRAM.
+ *
+ */
+
+ /*
+  * Indicate to the linker the entry point.
+  */
+ENTRY(ResetISR)
+
+/*
+ *   FLASH is at 0x01000000 of length 0x00080000  512KB
+ *   SRAM  is at 0x10000000 of length 0x00020000  128KB
+ */
+MEMORY
+{
+    FLASH (RX) : ORIGIN = 0x01000000, LENGTH = 0x00080000
+    SRAM (RWX) : ORIGIN = 0x10000000, LENGTH = 0x00020000
+}
+
+SECTIONS
+{
+    .text :
+    {
+        _text = .;
+        KEEP(*(.vectors))
+        . = ALIGN(0x4);
+        *(.text*)
+        . = ALIGN(0x4);
+        *(.rodata*)
+        . = ALIGN(0x4);
+        _etext = .;
+    } > FLASH= 0
+    .dummy :
+    {
+        . = ALIGN(0x4);
+        _eftext = .;
+    } > FLASH
+/* put the stack at the bottom of SRAM*/
+    .datax (NOLOAD) :
+    {
+        _datax = .;
+        KEEP(*(.mainStack))
+        . = ALIGN(0x4);
+        . += 16384;
+        _edatax = .;
+        _stack_top = .;
+    } > SRAM
+    .data :
+    {
+        _data = .;
+        *(.data*)
+        KEEP(*(.mainHeap))
+        _edata = .;
+    } > SRAM AT > FLASH
+
+    .bss (NOLOAD) :
+    {
+        _bss = .;
+        *(.bss*)
+        *(COMMON)
+        _ebss = .;
+    } > SRAM
+
+
+
+}
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_erase b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_erase
new file mode 100755
index 0000000000000000000000000000000000000000..5395b3d9965e98572fb12d61d7b862f4ce926a0f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_erase
@@ -0,0 +1,47 @@
+#!/usr/bin/python3
+#Usage: cd to the directory  tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 and type ./flash_erase to erase the flash.
+#
+#
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+#==============================================================================
+
+
+import os
+import telnetlib
+
+def send_ocd_cmd(line):
+    ocd_sock.write(bytes(line,encoding = 'utf-8'))
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+def get_ocd_response():
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+#get hooked up to openocd daemon
+ocd_sock = telnetlib.Telnet(host='localhost', port=4444)
+get_ocd_response() # clean it out
+
+#ocd comand
+ocd_commands = ["halt\n",
+                "flash erase_sector 0 0 127\n",
+                "mww 0x1001fff8 0\n",
+                "mdw 0x01000000 16\n",
+                "reset\n"]
+
+# OK now do what we came here for!!!
+for x in ocd_commands: 
+    print(x)
+    send_ocd_cmd(x)
+
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_program b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_program
new file mode 100755
index 0000000000000000000000000000000000000000..bc3fe5cb21aa2a89d8dda41a68185f03e43c674e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_program
@@ -0,0 +1,53 @@
+#!/usr/bin/python3
+#Usage: cd to the directory  tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 and type ./flash_program executable_name to load an executable from the directory tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin/ into flash
+#
+#
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+#==============================================================================
+
+
+import sys, getopt
+import os
+import telnetlib
+
+def send_ocd_cmd(line):
+    ocd_sock.write(bytes(line,encoding = 'utf-8'))
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+def get_ocd_response():
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+#get hooked up to openocd daemon
+ocd_sock = telnetlib.Telnet(host='localhost', port=4444)
+get_ocd_response() # clean it out
+
+# git path to project elf file
+cur_dir = os.getcwd()
+#elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + 'micro_speech'
+elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + sys.argv[1]
+print("elf_file = ",elf_file)
+
+
+# use these to download and run the elf fle
+ocd_commands = ["halt\n",
+                "flash erase_sector 0 0 127\n",
+                "flash write_image {}\n".format(elf_file),
+                "mww 0x1001fff8 0\n",
+                "reset\n"]
+
+# OK now do what we came here for!!!
+for x in ocd_commands:
+    print(x)
+    send_ocd_cmd(x)
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program
index ac1f49962a61756ccbde02300c612bd7b4f48e84..781231480aa2f1dec18cc468e1ea0129604c71e7 100755
--- a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program
@@ -1,5 +1,5 @@
 #!/usr/bin/python3
-#Usage: cd to the directory  tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 and type ./load_prgram to load the executable tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin/micro_speech_test into SRAM
+#Usage: cd to the directory  tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 and type ./load_prgram executable_name to load an executable from the directory tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin/
 #
 #
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
@@ -18,6 +18,7 @@
 #==============================================================================
 
 
+import sys, getopt
 import os
 import telnetlib
 
@@ -34,7 +35,8 @@ get_ocd_response() # clean it out
 
 # git path to project elf file
 cur_dir = os.getcwd()
-elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + 'micro_speech_test'
+#elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + 'preprocessor_test'
+elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + sys.argv[1]
 print("elf_file = ",elf_file)
 
 
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
index baae58f87e1761c978a87256fda8b7e90edb79e5..897a2b66d21668c4a28573e9d068b865c8f008a8 100644
--- a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
@@ -5,6 +5,10 @@ ifeq ($(TARGET), ecm3531)
   ETA_SDK := /home/hari/TensaiSDK-v0.0.17/soc/
   GCC_ARM := /home/hari/Downloads/gcc-arm-none-eabi-7-2018-q2-update/
 
+#Pick the appropriate lds file depending whether you are running frof SRAM of flash
+  ETA_LDS_FILE := ecm3531.lds
+#  ETA_LDS_FILE := ecm3531_flash.lds
+
   ifeq ($(wildcard $(ETA_SDK)),)
     $(error Path to ETA SDK is not set (ETA_SDK))
   endif
@@ -14,6 +18,7 @@ ifeq ($(TARGET), ecm3531)
   endif
 
   PLATFORM_FLAGS = \
+    -DARM_MATH_CM3 \
     -DFIRMWARE_BUILD \
     -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
     -DTF_LITE_STATIC_MEMORY \
@@ -58,14 +63,13 @@ ifeq ($(TARGET), ecm3531)
     -Wl,--start-group -lm -lc -lgcc -Wl,--end-group \
     -fno-exceptions \
     -nostdlib --specs=nano.specs -t -lstdc++ -lc -lnosys -lm \
-    -Wl,-T,$(MAKEFILE_DIR)/targets/ecm3531/ecm3531.lds \
-    -Wl,-Map=$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref
+    -Wl,-T,$(MAKEFILE_DIR)/targets/ecm3531/$(ETA_LDS_FILE) \
+    -Wl,-Map=$(MAKEFILE_DIR)/targets/ecm3531/ecm3531.map,--cref
   BUILD_TYPE := micro
   MICROLITE_LIBS := \
     $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/softfp/crtbegin.o \
     -lm
-  INCLUDES += \
-    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \
+  ECM3531_INCLUDES := \
     -I$(GCC_ARM)/arm-none-eabi/include/ \
     -I$(ETA_SDK)/ecm3531/boards/eta_evb/projects/m3/common/inc/ \
     -I$(ETA_SDK)/ecm3531/m3/reg/inc/ \
@@ -75,6 +79,9 @@ ifeq ($(TARGET), ecm3531)
     -I$(ETA_SDK)/../utils/inc/  \
     -I$(ETA_SDK)/ecm3531/boards/eta_evb/eta_bsp/inc
 
+  INCLUDES += $(ECM3531_INCLUDES)
+  GENERATED_PROJECT_INCLUDES += $(ECM3531_INCLUDES)
+
   # _main.c contains application and target specific initialization, like
   # setting clock speed, default uart setups, etc. and an implementation
   # of the DebugLog interfaces.
@@ -83,7 +90,13 @@ ifeq ($(TARGET), ecm3531)
     $(MAKEFILE_DIR)/targets/ecm3531/_main.c \
     $(wildcard $(ETA_SDK)/ecm3531/boards/eta_evb/projects/m3/common/src/*.c) \
     $(wildcard $(ETA_SDK)/ecm3531/m3/csp/src/*.c) \
-    $(wildcard $(ETA_SDK)/ecm3531/m3/csp/src/*.s) \
+    $(wildcard $(ETA_SDK)/ecm3531/m3/csp/src/*.s)
+
+  # The linker script isn't a header, but it needs to get copied to the gen/
+  # directory for generated projects.  This is similar to the behavior needed
+  # for headers.
+  MICROLITE_CC_HDRS += \
+    $(MAKEFILE_DIR)/targets/ecm3531/$(ETA_LDS_FILE)
 
   TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_ecm3531_binary.sh
   # These are tests that don't currently work on the blue pill.
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl
index 74d54f1ebee12d7773edfd1b073ddf17dd3791d6..ca6519c1390b5b783e4b6f26cac40a6b7ef32f46 100644
--- a/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl
@@ -4,12 +4,8 @@ SRCS := \
 OBJS := \
 $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(SRCS)))
 
-INCLUDES := \
--I. \
--I./third_party/gemmlowp \
--I./third_party/flatbuffers/include
-
 CXXFLAGS += %{CXX_FLAGS}%
+CCFLAGS += %{CC_FLAGS}%
 
 LDFLAGS += %{LINKER_FLAGS}%
 
@@ -20,7 +16,6 @@ LDFLAGS += %{LINKER_FLAGS}%
 	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
 
 %{EXECUTABLE}% : $(OBJS)
-	$(CXX) $(LDFLAGS) $(OBJS) \
-	-o $@
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(LDFLAGS)
 
 all: %{EXECUTABLE}%
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.make.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.make.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..141994d854565dc2ad2152e440c1d29526acb3dc
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.make.tpl
@@ -0,0 +1,16 @@
+{
+    // See https://go.microsoft.com/fwlink/?LinkId=733558
+    // for the documentation about the tasks.json format
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "Make Build",
+            "type": "shell",
+            "command": "make",
+            "group": {
+                "kind": "build",
+                "isDefault": true
+                }
+        }
+    ]
+}
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.mbed.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.mbed.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..616f3b23188df4af934433772c86c5c1a9452539
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.mbed.tpl
@@ -0,0 +1,39 @@
+{
+    // See https://go.microsoft.com/fwlink/?LinkId=733558
+    // for the documentation about the tasks.json format
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "Mbed Config Root",
+            "type": "shell",
+            "command": "mbed config root .",
+        },
+        {
+            "label": "Mbed Deploy",
+            "type": "shell",
+            "command": "mbed deploy",
+        },
+        {
+            "label": "Mbed Patch C++11",
+            "type": "shell",
+            "command": "python",
+            "args": [
+                "-c",
+                "import fileinput, glob;\nfor filename in glob.glob(\"mbed-os/tools/profiles/*.json\"):\n  for line in fileinput.input(filename, inplace=True):\n    print line.replace(\"\\\"-std=gnu++98\\\"\",\"\\\"-std=c++11\\\", \\\"-fpermissive\\\"\")"
+            ]
+        },
+        {
+            "label": "Mbed Init",
+            "dependsOn": ["Mbed Config Root", "Mbed Deploy", "Mbed Patch C++11"]
+        },
+        {
+            "label": "Mbed build",
+            "type": "shell",
+            "command": "mbed compile -m auto -t GCC_ARM",
+            "group": {
+                "kind": "build",
+                "isDefault": true
+                }
+        }
+    ]
+}
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/microfrontend/lib/BUILD b/tensorflow/lite/experimental/microfrontend/lib/BUILD
index a055e52f71001295cf95dfcbe790bc4118140fed..8dd42fc38290dbf5be8f9f1a850ad88cbf326ace 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/BUILD
+++ b/tensorflow/lite/experimental/microfrontend/lib/BUILD
@@ -6,6 +6,11 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
+load(
+    "//tensorflow/lite/experimental/micro/testing:micro_test.bzl",
+    "tflite_micro_cc_test",
+)
+
 cc_library(
     name = "bits",
     hdrs = ["bits.h"],
@@ -117,72 +122,65 @@ cc_library(
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "fft_test",
-    size = "small",
     srcs = ["fft_test.cc"],
     deps = [
         ":fft",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "filterbank_test",
-    size = "small",
     srcs = ["filterbank_test.cc"],
     deps = [
         ":filterbank",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "frontend_test",
-    size = "small",
     srcs = ["frontend_test.cc"],
     deps = [
         ":frontend",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "log_scale_test",
-    size = "small",
     srcs = ["log_scale_test.cc"],
     deps = [
         ":log_scale",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "noise_reduction_test",
-    size = "small",
     srcs = ["noise_reduction_test.cc"],
     deps = [
         ":noise_reduction",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "pcan_gain_control_test",
-    size = "small",
     srcs = ["pcan_gain_control_test.cc"],
     deps = [
         ":pcan_gain_control",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "window_test",
-    size = "small",
     srcs = ["window_test.cc"],
     deps = [
         ":window",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
diff --git a/tensorflow/lite/experimental/microfrontend/lib/bits.h b/tensorflow/lite/experimental/microfrontend/lib/bits.h
index bf15466a3d6484c3059a1ded1bb51e4d4287b1bf..04b3ba6f055f956720b58720c78083b1529fb065 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/bits.h
+++ b/tensorflow/lite/experimental/microfrontend/lib/bits.h
@@ -63,14 +63,14 @@ static inline int CountLeadingZeros64Slow(uint64_t n) {
 
 static inline int CountLeadingZeros64(uint64_t n) {
 #if defined(_MSC_VER) && defined(_M_X64)
-  // MSVC does not have __buitin_clzll. Use _BitScanReverse64.
+  // MSVC does not have __builtin_clzll. Use _BitScanReverse64.
   unsigned long result = 0;  // NOLINT(runtime/int)
   if (_BitScanReverse64(&result, n)) {
     return 63 - result;
   }
   return 64;
 #elif defined(_MSC_VER)
-  // MSVC does not have __buitin_clzll. Compose two calls to _BitScanReverse
+  // MSVC does not have __builtin_clzll. Compose two calls to _BitScanReverse
   unsigned long result = 0;  // NOLINT(runtime/int)
   if ((n >> 32) && _BitScanReverse(&result, n >> 32)) {
     return 31 - result;
diff --git a/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc b/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc
index 1b754c1b4c244edf1b091a581e5ae9399c2ac2e3..ec1f247ba24ad27917330708d6f9c754515a686b 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc
@@ -15,8 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/fft_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -25,9 +24,13 @@ const int16_t kFakeWindow[] = {
     0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
 const int kScaleShift = 0;
 
-TEST(FftTest, CheckOutputValues) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FftTest_CheckOutputValues) {
   struct FftState state;
-  ASSERT_TRUE(
+  TF_LITE_MICRO_EXPECT(
       FftPopulateState(&state, sizeof(kFakeWindow) / sizeof(kFakeWindow[0])));
 
   FftInit(&state);
@@ -37,14 +40,15 @@ TEST(FftTest, CheckOutputValues) {
       {0, 0},    {-10, 9},     {-20, 0},   {-9, -10},     {0, 25},  {-119, 119},
       {-887, 0}, {3000, 3000}, {0, -6401}, {-3000, 3000}, {886, 0}, {118, 119},
       {0, 25},   {9, -10},     {19, 0},    {9, 9},        {0, 0}};
-  ASSERT_EQ(state.fft_size / 2 + 1, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.fft_size / 2 + 1,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i <= state.fft_size / 2; ++i) {
-    EXPECT_EQ(state.output[i].real, expected[i].real);
-    EXPECT_EQ(state.output[i].imag, expected[i].imag);
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i].real, expected[i].real);
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i].imag, expected[i].imag);
   }
 
   FftFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc b/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc
index 41f0064d4f1674471fa731e72464b1d40fce4216..16257aa11a5ca3e82aeff60f8cc0176de3c519ab 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include <cstring>
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -33,9 +32,9 @@ const uint64_t kWork[] = {1835887, 61162970173, 258694800000};
 const int kScaleShift = 0;
 
 // Test filterbank generation using scaled-down defaults.
-class FilterbankTest : public ::testing::Test {
- protected:
-  FilterbankTest() {
+class FilterbankTestConfig {
+ public:
+  FilterbankTestConfig() {
     config_.num_channels = 2;
     config_.lower_band_limit = 8.0;
     config_.upper_band_limit = 450.0;
@@ -44,110 +43,124 @@ class FilterbankTest : public ::testing::Test {
   struct FilterbankConfig config_;
 };
 
-TEST_F(FilterbankTest, CheckStartIndex) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckStartIndex) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
-  EXPECT_EQ(state.start_index, kStartIndex);
+  TF_LITE_MICRO_EXPECT_EQ(state.start_index, kStartIndex);
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckEndIndex) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckEndIndex) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
-  EXPECT_EQ(state.end_index, kEndIndex);
+  TF_LITE_MICRO_EXPECT_EQ(state.end_index, kEndIndex);
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckChannelFrequencyStarts) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelFrequencyStarts) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {0, 4, 8};
-  ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i <= state.num_channels; ++i) {
-    EXPECT_EQ(state.channel_frequency_starts[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_frequency_starts[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckChannelWeightStarts) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelWeightStarts) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {0, 8, 16};
-  ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i <= state.num_channels; ++i) {
-    EXPECT_EQ(state.channel_weight_starts[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckChannelWidths) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelWidths) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {8, 8, 8};
-  ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i <= state.num_channels; ++i) {
-    EXPECT_EQ(state.channel_widths[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_widths[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckWeights) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckWeights) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {0, 3277, 2217, 1200, 222,  0,   0,   0,
                               0, 3376, 2468, 1591, 744,  0,   0,   0,
                               0, 4020, 3226, 2456, 1708, 983, 277, 0};
-  ASSERT_EQ(state.channel_weight_starts[state.num_channels] +
-                state.channel_widths[state.num_channels],
-            sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[state.num_channels] +
+                              state.channel_widths[state.num_channels],
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
-    EXPECT_EQ(state.weights[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.weights[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckUnweights) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckUnweights) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {0, 819, 1879, 2896, 3874, 0,    0,    0,
                               0, 720, 1628, 2505, 3352, 0,    0,    0,
                               0, 76,  870,  1640, 2388, 3113, 3819, 0};
-  ASSERT_EQ(state.channel_weight_starts[state.num_channels] +
-                state.channel_widths[state.num_channels],
-            sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[state.num_channels] +
+                              state.channel_widths[state.num_channels],
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
-    EXPECT_EQ(state.unweights[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.unweights[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckConvertFftComplexToEnergy) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckConvertFftComplexToEnergy) {
   struct FilterbankState state;
   state.start_index = kStartIndex;
   state.end_index = kEndIndex;
@@ -161,42 +174,46 @@ TEST_F(FilterbankTest, CheckConvertFftComplexToEnergy) {
 
   int i;
   for (i = state.start_index; i < state.end_index; ++i) {
-    EXPECT_EQ(energy[i], kEnergy[i]);
+    TF_LITE_MICRO_EXPECT_EQ(energy[i], kEnergy[i]);
   }
 }
 
-TEST_F(FilterbankTest, CheckAccumulateChannels) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckAccumulateChannels) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   FilterbankAccumulateChannels(&state, kEnergy);
 
-  ASSERT_EQ(state.num_channels + 1, sizeof(kWork) / sizeof(kWork[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(kWork) / sizeof(kWork[0]));
   int i;
   for (i = 0; i <= state.num_channels; ++i) {
-    EXPECT_EQ(state.work[i], kWork[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.work[i], kWork[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckSqrt) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckSqrt) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
   std::memcpy(state.work, kWork, sizeof(kWork));
 
   uint32_t* scaled_filterbank = FilterbankSqrt(&state, kScaleShift);
 
   const uint32_t expected[] = {247311, 508620};
-  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < state.num_channels; ++i) {
-    EXPECT_EQ(scaled_filterbank[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(scaled_filterbank[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc b/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc
index a6faa1fc1f51360e295253fb2b3cfdf01ada74ad..568484f14dde6b958d5c9e144ab8dfd7a68a0fb0 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc
@@ -15,8 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -29,9 +28,9 @@ const int16_t kFakeAudioData[] = {
     0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
 
 // Test end-to-end frontend behaviors.
-class FrontendTest : public ::testing::Test {
- protected:
-  FrontendTest() {
+class FrontendTestConfig {
+ public:
+  FrontendTestConfig() {
     config_.window.size_ms = 25;
     config_.window.step_size_ms = 10;
     config_.noise_reduction.smoothing_bits = 10;
@@ -53,9 +52,15 @@ class FrontendTest : public ::testing::Test {
   struct FrontendConfig config_;
 };
 
-TEST_F(FrontendTest, CheckOutputValues) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FrontendTest_CheckOutputValues) {
+  FrontendTestConfig config;
   struct FrontendState state;
-  ASSERT_TRUE(FrontendPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      FrontendPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
   struct FrontendOutput output = FrontendProcessSamples(
@@ -63,18 +68,20 @@ TEST_F(FrontendTest, CheckOutputValues) {
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read);
 
   const uint16_t expected[] = {479, 425};
-  ASSERT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < output.size; ++i) {
-    EXPECT_EQ(output.values[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(output.values[i], expected[i]);
   }
 
   FrontendFreeStateContents(&state);
 }
 
-TEST_F(FrontendTest, CheckConsecutiveWindow) {
+TF_LITE_MICRO_TEST(FrontendTest_CheckConsecutiveWindow) {
+  FrontendTestConfig config;
   struct FrontendState state;
-  ASSERT_TRUE(FrontendPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      FrontendPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
   FrontendProcessSamples(&state, kFakeAudioData,
@@ -86,18 +93,20 @@ TEST_F(FrontendTest, CheckConsecutiveWindow) {
       &num_samples_read);
 
   const int16_t expected[] = {436, 378};
-  ASSERT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < output.size; ++i) {
-    EXPECT_EQ(output.values[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(output.values[i], expected[i]);
   }
 
   FrontendFreeStateContents(&state);
 }
 
-TEST_F(FrontendTest, CheckNotEnoughSamples) {
+TF_LITE_MICRO_TEST(FrontendTest_CheckNotEnoughSamples) {
+  FrontendTestConfig config;
   struct FrontendState state;
-  ASSERT_TRUE(FrontendPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      FrontendPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
   FrontendProcessSamples(&state, kFakeAudioData,
@@ -113,10 +122,10 @@ TEST_F(FrontendTest, CheckNotEnoughSamples) {
           kStepSamples,
       &num_samples_read);
 
-  EXPECT_EQ(output.size, 0);
-  EXPECT_EQ(output.values, nullptr);
+  TF_LITE_MICRO_EXPECT_EQ(output.size, 0);
+  TF_LITE_MICRO_EXPECT_EQ(output.values, nullptr);
 
   FrontendFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc b/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc
index 1ea0842ec2ad1065782198b635bf8b4858d6bf3a..be52fd426a23a389aac84e4b2dac832924716f83 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc
@@ -15,15 +15,18 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/log_scale.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
 const int kScaleShift = 6;
 const int kCorrectionBits = -1;
 
-TEST(LogScaleTest, CheckOutputValues) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(LogScaleTest_CheckOutputValues) {
   struct LogScaleState state;
   state.enable_log = true;
   state.scale_shift = kScaleShift;
@@ -36,11 +39,11 @@ TEST(LogScaleTest, CheckOutputValues) {
   const uint16_t expected[] = {479, 425};
   int i;
   for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
-    EXPECT_EQ(output[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(output[i], expected[i]);
   }
 }
 
-TEST(LogScaleTest, CheckOutputValuesNoLog) {
+TF_LITE_MICRO_TEST(LogScaleTest_CheckOutputValuesNoLog) {
   struct LogScaleState state;
   state.enable_log = false;
   state.scale_shift = kScaleShift;
@@ -53,8 +56,8 @@ TEST(LogScaleTest, CheckOutputValuesNoLog) {
   const uint16_t expected[] = {65535, 45998};
   int i;
   for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
-    EXPECT_EQ(output[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(output[i], expected[i]);
   }
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc
index 13d58b2476762d89ee79be554be12a9b7a897ad5..ba864c427ced36748167c9412fe2966d72d3cb0e 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc
@@ -15,17 +15,16 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
 const int kNumChannels = 2;
 
 // Test noise reduction using default config values.
-class NoiseReductionTest : public ::testing::Test {
- protected:
-  NoiseReductionTest() {
+class NoiseReductionTestConfig {
+ public:
+  NoiseReductionTestConfig() {
     config_.smoothing_bits = 10;
     config_.even_smoothing = 0.025;
     config_.odd_smoothing = 0.06;
@@ -35,38 +34,48 @@ class NoiseReductionTest : public ::testing::Test {
   struct NoiseReductionConfig config_;
 };
 
-TEST_F(NoiseReductionTest, TestNoiseReductionEstimate) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(NoiseReductionTest_TestNoiseReductionEstimate) {
+  NoiseReductionTestConfig config;
   struct NoiseReductionState state;
-  ASSERT_TRUE(NoiseReductionPopulateState(&config_, &state, kNumChannels));
+  TF_LITE_MICRO_EXPECT(
+      NoiseReductionPopulateState(&config.config_, &state, kNumChannels));
 
   uint32_t signal[] = {247311, 508620};
   NoiseReductionApply(&state, signal);
 
   const uint32_t expected[] = {6321887, 31248341};
-  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < state.num_channels; ++i) {
-    EXPECT_EQ(state.estimate[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.estimate[i], expected[i]);
   }
 
   NoiseReductionFreeStateContents(&state);
 }
 
-TEST_F(NoiseReductionTest, TestNoiseReduction) {
+TF_LITE_MICRO_TEST(NoiseReductionTest_TestNoiseReduction) {
+  NoiseReductionTestConfig config;
   struct NoiseReductionState state;
-  ASSERT_TRUE(NoiseReductionPopulateState(&config_, &state, kNumChannels));
+  TF_LITE_MICRO_EXPECT(
+      NoiseReductionPopulateState(&config.config_, &state, kNumChannels));
 
   uint32_t signal[] = {247311, 508620};
   NoiseReductionApply(&state, signal);
 
   const uint32_t expected[] = {241137, 478104};
-  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < state.num_channels; ++i) {
-    EXPECT_EQ(signal[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(signal[i], expected[i]);
   }
 
   NoiseReductionFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc
index 7c92d2d29d0e41d5e378a596c5a06e8418edfa8d..93d7a8bcb94d5e0145b9ee701b413194f2946a7b 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc
@@ -15,8 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -25,9 +24,9 @@ const int kSmoothingBits = 10;
 const int kCorrectionBits = -1;
 
 // Test pcan auto gain control using default config values.
-class PcanGainControlTest : public ::testing::Test {
- protected:
-  PcanGainControlTest() {
+class PcanGainControlTestConfig {
+ public:
+  PcanGainControlTestConfig() {
     config_.enable_pcan = 1;
     config_.strength = 0.95;
     config_.offset = 80.0;
@@ -37,24 +36,30 @@ class PcanGainControlTest : public ::testing::Test {
   struct PcanGainControlConfig config_;
 };
 
-TEST_F(PcanGainControlTest, TestPcanGainControl) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(PcanGainControlTest_TestPcanGainControl) {
   uint32_t estimate[] = {6321887, 31248341};
+  PcanGainControlTestConfig config;
   struct PcanGainControlState state;
-  ASSERT_TRUE(PcanGainControlPopulateState(&config_, &state, estimate,
-                                           kNumChannels, kSmoothingBits,
-                                           kCorrectionBits));
+  TF_LITE_MICRO_EXPECT(PcanGainControlPopulateState(
+      &config.config_, &state, estimate, kNumChannels, kSmoothingBits,
+      kCorrectionBits));
 
   uint32_t signal[] = {241137, 478104};
   PcanGainControlApply(&state, signal);
 
   const uint32_t expected[] = {3578, 1533};
-  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < state.num_channels; ++i) {
-    EXPECT_EQ(signal[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(signal[i], expected[i]);
   }
 
   PcanGainControlFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/window_test.cc b/tensorflow/lite/experimental/microfrontend/lib/window_test.cc
index 60f11440f56ea39a25a5aa2beb23eb25a83048b3..cf9df523b8f5e540d47c8e6d3d42e37540f62ce0 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/window_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/window_test.cc
@@ -15,8 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/window.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/window_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -29,9 +28,9 @@ const int16_t kFakeAudioData[] = {
     0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
 
 // Test window function behaviors using default config values.
-class WindowTest : public ::testing::Test {
- protected:
-  WindowTest() {
+class WindowTestConfig {
+ public:
+  WindowTestConfig() {
     config_.size_ms = 25;
     config_.step_size_ms = 10;
   }
@@ -39,84 +38,98 @@ class WindowTest : public ::testing::Test {
   struct WindowConfig config_;
 };
 
-TEST_F(WindowTest, CheckCoefficients) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(WindowState_CheckCoefficients) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
 
   const int16_t expected[] = {16,   144,  391,  743,  1176, 1664, 2177,
                               2681, 3145, 3541, 3843, 4032, 4096, 4032,
                               3843, 3541, 3145, 2681, 2177, 1664, 1176,
                               743,  391,  144,  16};
-  ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < state.size; ++i) {
-    EXPECT_EQ(state.coefficients[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.coefficients[i], expected[i]);
   }
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckResidualInput) {
+TF_LITE_MICRO_TEST(WindowState_CheckResidualInput) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
 
   int i;
   for (i = kStepSamples; i < kWindowSamples; ++i) {
-    EXPECT_EQ(state.input[i - kStepSamples], kFakeAudioData[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.input[i - kStepSamples], kFakeAudioData[i]);
   }
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckOutputValues) {
+TF_LITE_MICRO_TEST(WindowState_CheckOutputValues) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
 
   const int16_t expected[] = {
       0, 1151,   0, -5944, 0, 13311,  0, -21448, 0, 28327, 0, -32256, 0, 32255,
       0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
-  ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < state.size; ++i) {
-    EXPECT_EQ(state.output[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i], expected[i]);
   }
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckMaxAbsValue) {
+TF_LITE_MICRO_TEST(WindowState_CheckMaxAbsValue) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
 
-  EXPECT_EQ(state.max_abs_output_value, 32256);
+  TF_LITE_MICRO_EXPECT_EQ(state.max_abs_output_value, 32256);
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckConsecutiveWindow) {
+TF_LITE_MICRO_TEST(WindowState_CheckConsecutiveWindow) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData + kWindowSamples,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
       &num_samples_read));
@@ -124,38 +137,41 @@ TEST_F(WindowTest, CheckConsecutiveWindow) {
   const int16_t expected[] = {
       0, -1152, 0, 5943,   0, -13312, 0, 21447, 0, -28328, 0, 32255, 0, -32256,
       0, 28327, 0, -21448, 0, 13311,  0, -5944, 0, 1151,   0};
-  ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < state.size; ++i) {
-    EXPECT_EQ(state.output[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i], expected[i]);
   }
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckNotEnoughSamples) {
+TF_LITE_MICRO_TEST(WindowState_CheckNotEnoughSamples) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData + kWindowSamples,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
       &num_samples_read));
-  ASSERT_FALSE(WindowProcessSamples(
-      &state, kFakeAudioData + kWindowSamples + kStepSamples,
-      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples -
-          kStepSamples,
-      &num_samples_read));
-
-  EXPECT_EQ(
+  TF_LITE_MICRO_EXPECT_EQ(
+      false, WindowProcessSamples(
+                 &state, kFakeAudioData + kWindowSamples + kStepSamples,
+                 sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) -
+                     kWindowSamples - kStepSamples,
+                 &num_samples_read));
+
+  TF_LITE_MICRO_EXPECT_EQ(
       state.input_used,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - 2 * kStepSamples);
 
   WindowFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/objc/BUILD.apple b/tensorflow/lite/experimental/objc/BUILD.apple
new file mode 100644
index 0000000000000000000000000000000000000000..3317c33d9d118ba81dbabcf7486934e8d4bb03d3
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/BUILD.apple
@@ -0,0 +1,109 @@
+# TensorFlow Lite Objective-C API.
+
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("@build_bazel_rules_apple//apple:ios.bzl", "ios_unit_test")
+
+SOURCES = glob([
+    "sources/*.h",
+    "sources/*.m",
+    "sources/*.mm",
+])
+
+API_HEADERS = glob([
+    "apis/*.h",
+])
+
+MINIMUM_OS_VERSION = "9.0"
+
+# Compiler flags for building regular non-test libraries.
+RELEASE_COPTS = [
+    # Enables language-specific warnings for Objective-C, Objective-C++, C, and C++.
+    "-Wall",
+    # Warns if functions, variables, and types marked with the deprecated attribute are being used.
+    "-Wdeprecated-declarations",
+    # Warns for errors in documentation.
+    "-Wdocumentation",
+    # Turns all warnings into errors.
+    "-Werror",
+    # Enables extra warning flags that are not enabled by -Wall.
+    "-Wextra",
+    # Warns if a global function is defined without a previous prototype declaration.
+    "-Wmissing-prototypes",
+    # From -Wextra. Disables warning when signed value is converted to unsigned value during comparison.
+    "-Wno-sign-compare",
+    # From -Wextra. Disables warning for unused parameters, which are common in delegate methods and block callbacks.
+    "-Wno-unused-parameter",
+    # Warns if a global or local variable or type declaration shadows another variable, parameter, type, class member, or instance variable.
+    "-Wshadow",
+    # Warns if a function is declared or defined without specifying the argument types. For a block with no args, use (void) instead of ().
+    "-Wstrict-prototypes",
+    # Warns if an @selector() expression is encountered with a method name that hasn't been defined yet.
+    "-Wundeclared-selector",
+    # Turn off warnings for headers not part of TensorFlow Lite Objective-C API.
+    "--system-header-prefix=tensorflow/lite/experimental/c/",
+]
+
+# Compiler flags for building test libraries.
+TEST_COPTS = RELEASE_COPTS + [
+    # From -Wall. Disables warning when passing nil to a callee that requires a non-null argument.
+    "-Wno-nonnull",
+    # Disables warning when a global or local variable or type declaration shadows another.
+    "-Wno-shadow",
+]
+
+# Default tags for filtering targets. Targets in this file are restricted to Apple platforms.
+DEFAULT_TAGS = [
+    "apple",
+    "manual",
+]
+
+objc_library(
+    name = "TensorFlowLite",
+    srcs = SOURCES,
+    hdrs = API_HEADERS,
+    copts = RELEASE_COPTS,
+    tags = DEFAULT_TAGS,
+    deps = [
+        "//tensorflow/lite/experimental/c:c_api",
+    ],
+    alwayslink = 1,
+)
+
+ios_unit_test(
+    name = "TensorFlowLiteTests",
+    size = "small",
+    minimum_os_version = MINIMUM_OS_VERSION,
+    tags = DEFAULT_TAGS + [
+        # These sanitizer tests are not supported by iOS build toolchain (b/74292221).
+        # Disabled these for iOS test targets.
+        "noasan",
+        "notsan",
+        "nomsan",
+    ],
+    deps = [":TensorFlowLiteTestsLib"],
+)
+
+objc_library(
+    name = "TensorFlowLiteTestsLib",
+    testonly = 1,
+    srcs = glob([
+        "tests/*.m",
+    ]),
+    hdrs = glob([
+        "apis/*.h",
+        "sources/*.h",
+        "tests/*.h",
+    ]),
+    copts = TEST_COPTS,
+    resources = [
+        "//tensorflow/lite:testdata/add.bin",
+        "//tensorflow/lite:testdata/add_quantized.bin",
+    ],
+    tags = DEFAULT_TAGS,
+    deps = [
+        ":TensorFlowLite",
+    ],
+)
diff --git a/tensorflow/lite/experimental/objc/README.md b/tensorflow/lite/experimental/objc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2940e0524080934147c7ee9044df0098d29c496b
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/README.md
@@ -0,0 +1,52 @@
+# TensorFlow Lite Objective-C Library
+
+[TensorFlow Lite](https://www.tensorflow.org/lite/) is TensorFlow's lightweight
+solution for Objective-C developers. It enables low-latency inference of
+on-device machine learning models with a small binary size and fast performance
+supporting hardware acceleration.
+
+## Getting Started
+
+### Bazel
+
+In your `BUILD` file, add the `TensorFlowLite` dependency:
+
+```python
+objc_library(
+  deps = [
+      "//tensorflow/lite/experimental/objc:TensorFlowLite",
+  ],
+)
+```
+
+If you would like to build the Objective-C TensorFlow Lite library using Bazel on Apple
+platforms, clone or download the [TensorFlow GitHub repo](https://github.com/tensorflow/tensorflow),
+then navigate to the root `tensorflow` directory and execute the `configure.py` script:
+
+```shell
+python configure.py
+```
+
+Follow the prompts and when asked to configure the Bazel rules for Apple
+platforms, enter `y`.
+
+Build the `TensorFlowLite` Objective-C library target:
+
+```shell
+bazel build tensorflow/lite/experimental/objc:TensorFlowLite
+```
+
+Build the `TensorFlowLiteTests` target:
+
+```shell
+bazel test tensorflow/lite/experimental/objc:TensorFlowLiteTests
+```
+
+### Tulsi
+
+Open the `TensorFlowLiteObjc.tulsiproj` using the Tulsi application on Mac or by
+running the following command in Terminal from the root source directory:
+
+```shell
+generate_xcodeproj.sh --genconfig tensorflow/lite/experimental/objc/TensorFlowLiteObjc.tulsiproj:TensorFlowLiteObjC --outputfolder ~/path/to/xcodeproj
+```
diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjc.tulsiproj/Configs/TensorFlowLiteObjc.tulsigen b/tensorflow/lite/experimental/objc/TensorFlowLiteObjc.tulsiproj/Configs/TensorFlowLiteObjc.tulsigen
new file mode 100644
index 0000000000000000000000000000000000000000..091ef4e2ea8b6ea0476bac71a6cfea25104706d1
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjc.tulsiproj/Configs/TensorFlowLiteObjc.tulsigen
@@ -0,0 +1,60 @@
+{
+  "sourceFilters" : [
+    "tensorflow/lite",
+    "tensorflow/lite/experimental/c",
+    "tensorflow/lite/experimental/objc",
+    "tensorflow/lite/experimental/objc/apis",
+    "tensorflow/lite/experimental/objc/sources",
+    "tensorflow/lite/experimental/objc/tests",
+    "tensorflow/lite/kernels",
+    "tensorflow/lite/kernels/internal",
+    "tensorflow/lite/nnapi",
+    "tensorflow/lite/schema",
+  ],
+  "buildTargets" : [
+    "//tensorflow/lite/experimental/objc:TensorFlowLite",
+    "//tensorflow/lite/experimental/objc:TensorFlowLiteTests",
+  ],
+  "projectName" : "TensorFlowLiteObjC",
+  "optionSet" : {
+    "LaunchActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildStartupOptionsRelease" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildOptionsRelease" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildOptionsDebug" : {
+      "p" : "$(inherited)"
+    },
+    "EnvironmentVariables" : {
+      "p" : "$(inherited)"
+    },
+    "BuildActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "CommandlineArguments" : {
+      "p" : "$(inherited)"
+    },
+    "TestActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildStartupOptionsDebug" : {
+      "p" : "$(inherited)"
+    },
+    "BuildActionPostActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "TestActionPostActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "LaunchActionPostActionScript" : {
+      "p" : "$(inherited)"
+    }
+  },
+  "additionalFilePaths" : [
+    "tensorflow/lite/experimental/objc/BUILD",
+  ]
+}
diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjc.tulsiproj/project.tulsiconf b/tensorflow/lite/experimental/objc/TensorFlowLiteObjc.tulsiproj/project.tulsiconf
new file mode 100644
index 0000000000000000000000000000000000000000..0b6fedff3f66d1faf3894604f756c45f8dce9547
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjc.tulsiproj/project.tulsiconf
@@ -0,0 +1,17 @@
+{
+  "configDefaults" : {
+    "optionSet" : {
+      "BazelBuildOptionsDebug" : {
+
+      },
+      "BazelBuildOptionsRelease" : {
+
+      },
+    }
+  },
+  "projectName" : "TensorFlowLiteObjC",
+  "packages" : [
+    "tensorflow/lite/experimental/objc"
+  ],
+  "workspaceRoot" : "../../../../.."
+}
diff --git a/tensorflow/lite/experimental/objc/apis/TFLInterpreter.h b/tensorflow/lite/experimental/objc/apis/TFLInterpreter.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c06a4bc82f752baabdb6db100ee96e9ce29d29f
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/apis/TFLInterpreter.h
@@ -0,0 +1,179 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+@class TFLInterpreterOptions;
+@class TFLTensor;
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * @enum TFLInterpreterErrorCode
+ * This enum specifies various error codes related to `TFLInterpreter`.
+ */
+typedef NS_ENUM(NSUInteger, TFLInterpreterErrorCode) {
+  /** Provided tensor index is invalid. */
+  TFLInterpreterErrorCodeInvalidTensorIndex,
+
+  /** Input data has invalid byte size. */
+  TFLInterpreterErrorCodeInvalidInputByteSize,
+
+  /** Provided shape is invalid. It must be a non-empty array of positive unsigned integers. */
+  TFLInterpreterErrorCodeInvalidShape,
+
+  /** Provided model cannot be loaded. */
+  TFLInterpreterErrorCodeFailedToLoadModel,
+
+  /** Failed to create `TFLInterpreter`. */
+  TFLInterpreterErrorCodeFailedToCreateInterpreter,
+
+  /** Failed to invoke `TFLInterpreter`. */
+  TFLInterpreterErrorCodeFailedToInvoke,
+
+  /** Failed to retrieve a tensor. */
+  TFLInterpreterErrorCodeFailedToGetTensor,
+
+  /** Invalid tensor. */
+  TFLInterpreterErrorCodeInvalidTensor,
+
+  /** Failed to resize an input tensor. */
+  TFLInterpreterErrorCodeFailedToResizeInputTensor,
+
+  /** Failed to copy data into an input tensor. */
+  TFLInterpreterErrorCodeFailedToCopyDataToInputTensor,
+
+  /** Copying data into an output tensor not allowed. */
+  TFLInterpreterErrorCodeCopyDataToOutputTensorNotAllowed,
+
+  /** Failed to get data from a tensor. */
+  TFLInterpreterErrorCodeFailedToGetDataFromTensor,
+
+  /** Failed to allocate memory for tensors. */
+  TFLInterpreterErrorCodeFailedToAllocateTensors,
+
+  /** Operaton not allowed without allocating memory for tensors first. */
+  TFLInterpreterErrorCodeAllocateTensorsRequired,
+
+  /** Operaton not allowed without invoking the interpreter first. */
+  TFLInterpreterErrorCodeInvokeInterpreterRequired,
+};
+
+/**
+ * A TensorFlow Lite model interpreter.
+ */
+@interface TFLInterpreter : NSObject
+
+/** The total number of input tensors. 0 if the interpreter creation failed. */
+@property(nonatomic, readonly) NSUInteger inputTensorCount;
+
+/** The total number of output tensors. 0 if the interpreter creation failed. */
+@property(nonatomic, readonly) NSUInteger outputTensorCount;
+
+/** Unavailable. */
+- (instancetype)init NS_UNAVAILABLE;
+
+/**
+ * Initializes a new TensorFlow Lite interpreter instance with the given model file path and the
+ * default interpreter options.
+ *
+ * @param modelPath An absolute path to a TensorFlow Lite model file stored locally on the device.
+ * @param error An optional error parameter populated when there is an error in initializing the
+ *     interpreter.
+ *
+ * @return A new instance of `TFLInterpreter` with the given model and the default interpreter
+ *     options. `nil` if there is an error in initializing the interpreter.
+ */
+- (nullable instancetype)initWithModelPath:(NSString *)modelPath error:(NSError **)error;
+
+/**
+ * Initializes a new TensorFlow Lite interpreter instance with the given model file path and
+ * options.
+ *
+ * @param modelPath An absolute path to a TensorFlow Lite model file stored locally on the device.
+ * @param options Options to use for configuring the TensorFlow Lite interpreter.
+ * @param error An optional error parameter populated when there is an error in initializing the
+ *     interpreter.
+ *
+ * @return A new instance of `TFLInterpreter` with the given model and options. `nil` if there is an
+ *     error in initializing the interpreter.
+ */
+- (nullable instancetype)initWithModelPath:(NSString *)modelPath
+                                   options:(TFLInterpreterOptions *)options
+                                     error:(NSError **)error NS_DESIGNATED_INITIALIZER;
+
+/**
+ * Invokes the interpreter to run inference.
+ *
+ * @param error An optional error parameter populated when there is an error in invoking the
+ *     interpreter.
+ *
+ * @return Whether the invocation is successful. Returns NO if an error occurred.
+ */
+- (BOOL)invokeWithError:(NSError **)error;
+
+/**
+ * Returns the input tensor at the given index.
+ *
+ * @param index The index of an input tensor.
+ * @param error An optional error parameter populated when there is an error in looking up the input
+ *     tensor.
+ *
+ * @return The input tensor at the given index. `nil` if there is an error. See the `TFLTensor`
+ *     class documentation for more details on the life expectancy between the returned tensor and
+ *     this interpreter.
+ */
+- (nullable TFLTensor *)inputTensorAtIndex:(NSUInteger)index error:(NSError **)error;
+
+/**
+ * Returns the output tensor at the given index.
+ *
+ * @param index The index of an output tensor.
+ * @param error An optional error parameter populated when there is an error in looking up the
+ *     output tensor.
+ *
+ * @return The output tensor at the given index. `nil` if there is an error. See the `TFLTensor`
+ *     class documentation for more details on the life expectancy between the returned tensor and
+ *     this interpreter.
+ */
+- (nullable TFLTensor *)outputTensorAtIndex:(NSUInteger)index error:(NSError **)error;
+
+/**
+ * Resizes the input tensor at the given index to the specified shape (an array of positive unsigned
+ * integers).
+ *
+ * @param index The index of an input tensor.
+ * @param shape Shape that the given input tensor should be resized to. It should be an array of
+ *     positive unsigned integer(s) containing the size of each dimension.
+ * @param error An optional error parameter populated when there is an error in resizing the input
+ *     tensor.
+ *
+ * @return Whether the input tensor was resized successfully. Returns NO if an error occurred.
+ */
+- (BOOL)resizeInputTensorAtIndex:(NSUInteger)index
+                         toShape:(NSArray<NSNumber *> *)shape
+                           error:(NSError **)error;
+
+/**
+ * Allocates memory for tensors.
+ *
+ * @param error An optional error parameter populated when there is an error in allocating memory.
+ *
+ * @return Whether memory allocation is successful. Returns NO if an error occurred.
+ */
+- (BOOL)allocateTensorsWithError:(NSError **)error;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h b/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..6461fbf0178b1e72afb81e91d58109a2d7b0226b
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h
@@ -0,0 +1,37 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Custom configuration options for a TensorFlow Lite interpreter. */
+@interface TFLInterpreterOptions : NSObject
+
+/**
+ * Maximum number of threads that the interpreter should run on. Defaults to 0 (unspecified, letting
+ * TensorFlow Lite to optimize the threading decision).
+ */
+@property(nonatomic) NSUInteger numberOfThreads;
+
+/**
+ * Initializes a new instance of `TFLInterpreterOptions`.
+ *
+ * @return A new instance of `TFLInterpreterOptions`.
+ */
+- (instancetype)init NS_DESIGNATED_INITIALIZER;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h b/tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d5cf793c5bed984debe3a36fdec4f0945cd7c64
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h
@@ -0,0 +1,36 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * Parameters for asymmetric quantization. Quantized values can be converted to float values using:
+ * `realValue = scale * (quantizedValue - zeroPoint)`.
+ */
+@interface TFLQuantizationParameters : NSObject
+
+/** Scale of asymmetric quantization. */
+@property(nonatomic, readonly) float scale;
+
+/** Zero point of asymmetric quantization. */
+@property(nonatomic, readonly) int32_t zeroPoint;
+
+/** Unavailable. */
+- (instancetype)init NS_UNAVAILABLE;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/apis/TFLTensor.h b/tensorflow/lite/experimental/objc/apis/TFLTensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc710abf4e2ea99126be2fb359412287f3c37a33
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/apis/TFLTensor.h
@@ -0,0 +1,111 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+@class TFLQuantizationParameters;
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * @enum TFLTensorDataType
+ * This enum specifies supported TensorFlow Lite tensor data types.
+ */
+typedef NS_ENUM(NSUInteger, TFLTensorDataType) {
+  /** Tensor data type not available. This indicates an error with the model. */
+  TFLTensorDataTypeNoType,
+
+  /** 32-bit single precision floating point. */
+  TFLTensorDataTypeFloat32,
+
+  /** 32-bit signed integer. */
+  TFLTensorDataTypeInt32,
+
+  /** 8-bit unsigned integer. */
+  TFLTensorDataTypeUInt8,
+
+  /** 64-bit signed integer. */
+  TFLTensorDataTypeInt64,
+
+  /** Boolean. */
+  TFLTensorDataTypeBool,
+
+  /** 16-bit signed integer. */
+  TFLTensorDataTypeInt16,
+
+  /** 8-bit signed integer. */
+  TFLTensorDataTypeInt8,
+};
+
+/**
+ * An input or output tensor in a TensorFlow Lite model.
+ *
+ * @warning Each `TFLTensor` instance is associated with a `TFLInterpreter` instance. Multiple
+ *     `TFLTensor` instances of the same TensorFlow Lite model are associated with the same
+ *     `TFLInterpreter` instance. As long as a `TFLTensor` instance is still in use, its associated
+ *     `TFLInterpreter` instance will not be deallocated.
+ */
+@interface TFLTensor : NSObject
+
+/** Name of the tensor. */
+@property(nonatomic, readonly, copy) NSString *name;
+
+/** Data type of the tensor. */
+@property(nonatomic, readonly) TFLTensorDataType dataType;
+
+/** Parameters for asymmetric quantization. `nil` if the tensor does not use quantization. */
+@property(nonatomic, readonly, nullable) TFLQuantizationParameters *quantizationParameters;
+
+/** Unavailable. */
+- (instancetype)init NS_UNAVAILABLE;
+
+/**
+ * Copies the given data into an input tensor. This is allowed only for an input tensor and only
+ * before the interpreter is invoked; otherwise an error will be returned.
+ *
+ * @param data The data to set. The byte size of the data must match what's required by the input
+ *     tensor.
+ * @param error An optional error parameter populated when there is an error in copying the data.
+ *
+ * @return Whether the data was copied into the input tensor successfully. Returns NO if an error
+ *     occurred.
+ */
+- (BOOL)copyData:(NSData *)data error:(NSError **)error;
+
+/**
+ * Retrieves a copy of data in the tensor. For an output tensor, the data is only available after
+ * the interpreter invocation has successfully completed; otherwise an error will be returned.
+ *
+ * @param error An optional error parameter populated when there is an error in retrieving the data.
+ *
+ * @return A copy of data in the tensor. `nil` if there is an error in retrieving the data or the
+ *     data is not available.
+ */
+- (nullable NSData *)dataWithError:(NSError **)error;
+
+/**
+ * Retrieves the shape of the tensor, an array of positive unsigned integers containing the size
+ * of each dimension. For example: the shape of [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] is
+ * [2, 2, 3] (i.e. an array of 2 arrays of 2 arrays of 3 numbers).
+ *
+ * @param error An optional error parameter populated when there is an error in retrieving the
+ *     shape.
+ *
+ * @return The shape of the tensor. `nil` if there is an error in retrieving the shape.
+ */
+- (nullable NSArray<NSNumber *> *)shapeWithError:(NSError **)error;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.h b/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce8d50c896e6d5716308c7dc8818258fa38dbd72
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.h
@@ -0,0 +1,40 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Helper utility for error reporting. */
+@interface TFLErrorUtil : NSObject
+
+/**
+ * Creates and saves an interpreter error with the given error code and description.
+ *
+ * @param code Error code.
+ * @param description Error description.
+ * @param error Pointer to where to save the created error. If `nil`, no error will be saved.
+ */
++ (void)saveInterpreterErrorWithCode:(TFLInterpreterErrorCode)code
+                         description:(NSString *)description
+                               error:(NSError **)error;
+
+/** Unavailable. */
+- (instancetype)init NS_UNAVAILABLE;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.m b/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.m
new file mode 100644
index 0000000000000000000000000000000000000000..aa973c780060f4fa67573ff1e224ab0aed2bc92b
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.m
@@ -0,0 +1,38 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "TFLErrorUtil.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Error domain of TensorFlow Lite interpreter related errors. */
+static NSString *const TFLInterpreterErrorDomain = @"org.tensorflow.lite.interpreter";
+
+@implementation TFLErrorUtil
+
+#pragma mark - Public
+
++ (void)saveInterpreterErrorWithCode:(TFLInterpreterErrorCode)code
+                         description:(NSString *)description
+                               error:(NSError **)error {
+  if (error) {
+    *error = [NSError errorWithDomain:TFLInterpreterErrorDomain
+                                 code:code
+                             userInfo:@{NSLocalizedDescriptionKey : description}];
+  }
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreter+Internal.h b/tensorflow/lite/experimental/objc/sources/TFLInterpreter+Internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b900c4f050451061a5d1a02b8be4dc51cade175
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreter+Internal.h
@@ -0,0 +1,63 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+@class TFLTensor;
+
+NS_ASSUME_NONNULL_BEGIN
+
+@interface TFLInterpreter (Internal)
+
+/**
+ * Copies the given data into the input tensor at the given index. This is allowed only before the
+ * interpreter is invoked.
+ *
+ * @param data The data to set. The byte size of the data must match what's required by the input
+ *     tensor at the given index.
+ * @param index An input tensor index.
+ * @param error An optional error parameter populated when there is an error in setting the data.
+ *
+ * @return Whether the data was copied into the input tensor at the given index successfully.
+ *     Returns NO if an error occurred.
+ */
+- (BOOL)copyData:(NSData *)data toInputTensorAtIndex:(NSUInteger)index error:(NSError **)error;
+
+/**
+ * Retrieves a copy of the data from the given tensor. For an output tensor, the interpreter
+ * invocation has to complete before the data can be retrieved.
+ *
+ * @param tensor A tensor.
+ * @param error An optional error parameter populated when there is an error in getting the data.
+ *
+ * @return The data of the given tensor. `nil` if there is an error or data is not available.
+ */
+- (nullable NSData *)dataFromTensor:(TFLTensor *)tensor error:(NSError **)error;
+
+/**
+ * Retrieves the shape of the given tensor, an array of positive unsigned integer(s) containing the
+ * size of each dimension. For example: shape of [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] is
+ * [2, 2, 3].
+ *
+ * @param tensor An input or output tensor.
+ * @param error An optional error parameter populated when there is an error in retrieving the
+ *     shape.
+ *
+ * @return The shape of the tensor. `nil` if there is an error in retrieving the shape.
+ */
+- (nullable NSArray<NSNumber *> *)shapeOfTensor:(TFLTensor *)tensor error:(NSError **)error;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
new file mode 100644
index 0000000000000000000000000000000000000000..a8ca982f6dd619f9a01bd67cc028ee6fb583a75d
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
@@ -0,0 +1,407 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+#import "TFLErrorUtil.h"
+#import "TFLQuantizationParameters+Internal.h"
+#import "TFLTensor+Internal.h"
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h"
+#import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
+
+#include "tensorflow/lite/experimental/c/c_api.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * Error reporter for TFLInterpreter.
+ *
+ * @param user_data User data. Not used.
+ * @param format Error message which may contain argument formatting specifiers.
+ * @param args Values of the arguments in the error message.
+ */
+static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_list args) {
+  NSLog(@"%@", [[NSString alloc] initWithFormat:@(format) arguments:args]);
+}
+
+@interface TFLInterpreter ()
+
+/** TFL_Interpreter backed by C API. */
+@property(nonatomic, nullable) TFL_Interpreter *interpreter;
+
+@end
+
+@implementation TFLInterpreter
+
+#pragma mark - NSObject
+
+- (void)dealloc {
+  TFL_DeleteInterpreter(_interpreter);
+}
+
+#pragma mark - Public
+
+- (nullable instancetype)initWithModelPath:(NSString *)modelPath error:(NSError **)error {
+  return [self initWithModelPath:modelPath
+                         options:[[TFLInterpreterOptions alloc] init]
+                           error:error];
+}
+
+- (nullable instancetype)initWithModelPath:(NSString *)modelPath
+                                   options:(TFLInterpreterOptions *)options
+                                     error:(NSError **)error {
+  self = [super init];
+
+  if (self != nil) {
+    TFL_Model *model = nullptr;
+    TFL_InterpreterOptions *cOptions = nullptr;
+
+    @try {
+      const char *modelPathCString = modelPath.UTF8String;
+      NSString *pathErrorString =
+          [NSString stringWithFormat:@"Cannot load model from path (%@).", modelPath];
+      if (modelPathCString == nullptr) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToLoadModel
+                                       description:pathErrorString
+                                             error:error];
+        return nil;
+      }
+
+      model = TFL_NewModelFromFile(modelPathCString);
+      if (model == nullptr) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToLoadModel
+                                       description:pathErrorString
+                                             error:error];
+        return nil;
+      }
+
+      cOptions = TFL_NewInterpreterOptions();
+      if (cOptions == nullptr) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCreateInterpreter
+                                       description:@"Failed to create the interpreter."
+                                             error:error];
+        return nil;
+      }
+
+      if (options.numberOfThreads > 0) {
+        TFL_InterpreterOptionsSetNumThreads(cOptions, (int32_t)options.numberOfThreads);
+      }
+      TFL_InterpreterOptionsSetErrorReporter(cOptions, TFLInterpreterErrorReporter, nullptr);
+
+      _interpreter = TFL_NewInterpreter(model, cOptions);
+      if (_interpreter == nullptr) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCreateInterpreter
+                                       description:@"Failed to create the interpreter."
+                                             error:error];
+        return nil;
+      }
+
+      _inputTensorCount = (NSUInteger)TFL_InterpreterGetInputTensorCount(_interpreter);
+      _outputTensorCount = (NSUInteger)TFL_InterpreterGetOutputTensorCount(_interpreter);
+      if (_inputTensorCount <= 0 || _outputTensorCount <= 0) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCreateInterpreter
+                                       description:@"Failed to create the interpreter."
+                                             error:error];
+        return nil;
+      }
+    } @finally {
+      TFL_DeleteInterpreterOptions(cOptions);
+      TFL_DeleteModel(model);
+    }
+  }
+
+  return self;
+}
+
+- (BOOL)invokeWithError:(NSError **)error {
+  if (TFL_InterpreterInvoke(self.interpreter) != kTfLiteOk) {
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToInvoke
+                                   description:@"Failed to invoke the interpreter."
+                                         error:error];
+    return NO;
+  }
+
+  return YES;
+}
+
+- (nullable TFLTensor *)inputTensorAtIndex:(NSUInteger)index error:(NSError **)error {
+  if (![self isValidTensorIndex:index belowLimit:self.inputTensorCount error:error]) {
+    return nil;
+  }
+
+  return [self tensorOfType:TFLTensorTypeInput atIndex:index error:error];
+}
+
+- (nullable TFLTensor *)outputTensorAtIndex:(NSUInteger)index error:(NSError **)error {
+  if (![self isValidTensorIndex:index belowLimit:self.outputTensorCount error:error]) {
+    return nil;
+  }
+
+  return [self tensorOfType:TFLTensorTypeOutput atIndex:index error:error];
+}
+
+- (BOOL)resizeInputTensorAtIndex:(NSUInteger)index
+                         toShape:(NSArray<NSNumber *> *)shape
+                           error:(NSError **)error {
+  if (![self isValidTensorIndex:index belowLimit:self.inputTensorCount error:error]) {
+    return NO;
+  }
+
+  if (shape.count == 0) {
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidShape
+                                   description:@"Invalid shape. Must not be empty."
+                                         error:error];
+    return NO;
+  }
+
+  int cDimensions[self.inputTensorCount];
+  for (int dimIndex = 0; dimIndex < shape.count; ++dimIndex) {
+    int dimension = shape[dimIndex].intValue;
+    if (dimension <= 0) {
+      NSString *errorDescription = @"Invalid shape. Dimensions must be positive integers.";
+      [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidShape
+                                     description:errorDescription
+                                           error:error];
+      return NO;
+    }
+    cDimensions[dimIndex] = dimension;
+  }
+
+  if (TFL_InterpreterResizeInputTensor(self.interpreter, (int32_t)index, cDimensions,
+                                       (int32_t)shape.count) != kTfLiteOk) {
+    NSString *errorDescription = [NSString
+        stringWithFormat:@"Failed to resize input tensor at index (%lu).", (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToResizeInputTensor
+                                   description:errorDescription
+                                         error:error];
+    return NO;
+  }
+
+  return YES;
+}
+
+- (BOOL)allocateTensorsWithError:(NSError **)error {
+  if (TFL_InterpreterAllocateTensors(self.interpreter) != kTfLiteOk) {
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToAllocateTensors
+                                   description:@"Failed to allocate memory for tensors."
+                                         error:error];
+    return NO;
+  }
+  return YES;
+}
+
+#pragma mark - TFLInterpreter (Internal)
+
+- (BOOL)copyData:(NSData *)data toInputTensorAtIndex:(NSUInteger)index error:(NSError **)error {
+  const TFL_Tensor *cTensor = [self cTensorOfType:TFLTensorTypeInput atIndex:index error:error];
+  if (cTensor == nullptr) {
+    return NO;
+  }
+
+  NSUInteger byteSize = (NSUInteger)TFL_TensorByteSize(cTensor);
+  if (data.length != byteSize) {
+    NSString *errorDescription = [NSString
+        stringWithFormat:@"Input tensor at index (%lu) expects data size (%lu), but got (%lu).",
+                         (unsigned long)index, byteSize, (unsigned long)data.length];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidInputByteSize
+                                   description:errorDescription
+                                         error:error];
+    return NO;
+  }
+
+  if (TFL_TensorCopyFromBuffer((TFL_Tensor *)cTensor, data.bytes, data.length) != kTfLiteOk) {
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Failed to copy data into input tensor at index (%lu).",
+                                   (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCopyDataToInputTensor
+                                   description:errorDescription
+                                         error:error];
+    return NO;
+  }
+
+  return YES;
+}
+
+- (nullable NSData *)dataFromTensor:(TFLTensor *)tensor error:(NSError **)error {
+  const TFL_Tensor *cTensor = [self cTensorOfType:tensor.type atIndex:tensor.index error:error];
+  if (cTensor == nullptr) {
+    return nil;
+  }
+
+  void *bytes = TFL_TensorData(cTensor);
+  NSUInteger byteSize = (NSUInteger)TFL_TensorByteSize(cTensor);
+  if (bytes == nullptr || byteSize == 0) {
+    NSString *tensorType = [TFLTensor stringForTensorType:tensor.type];
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Failed to get data from %@ tensor at index (%lu).", tensorType,
+                                   (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToGetDataFromTensor
+                                   description:errorDescription
+                                         error:error];
+    return nil;
+  }
+
+  return [NSData dataWithBytes:bytes length:byteSize];
+}
+
+- (nullable NSArray<NSNumber *> *)shapeOfTensor:(TFLTensor *)tensor error:(NSError **)error {
+  const TFL_Tensor *cTensor = [self cTensorOfType:tensor.type atIndex:tensor.index error:error];
+  if (cTensor == nullptr) {
+    return nil;
+  }
+
+  NSString *tensorType = [TFLTensor stringForTensorType:tensor.type];
+  int32_t rank = TFL_TensorNumDims(cTensor);
+  if (rank <= 0) {
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"%@ tensor at index (%lu) has invalid rank (%d).", tensorType,
+                                   (unsigned long)index, rank];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidTensor
+                                   description:errorDescription
+                                         error:error];
+    return nil;
+  }
+
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:rank];
+  for (int32_t dimIndex = 0; dimIndex < rank; dimIndex++) {
+    int32_t dimension = TFL_TensorDim(cTensor, dimIndex);
+    if (dimension <= 0) {
+      NSString *errorDescription =
+          [NSString stringWithFormat:@"%@ tensor at index (%lu) has invalid %d-th dimension (%d).",
+                                     tensorType, (unsigned long)index, dimIndex, dimension];
+      [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidTensor
+                                     description:errorDescription
+                                           error:error];
+      return nil;
+    }
+    shape[dimIndex] = @((NSUInteger)dimension);
+  }
+
+  return shape;
+}
+
+#pragma mark - Private
+
+- (const TFL_Tensor *)cTensorOfType:(TFLTensorType)type
+                            atIndex:(NSUInteger)index
+                              error:(NSError **)error {
+  const TFL_Tensor *tensor = nullptr;
+
+  switch (type) {
+    case TFLTensorTypeInput:
+      tensor = TFL_InterpreterGetInputTensor(self.interpreter, (int32_t)index);
+      break;
+    case TFLTensorTypeOutput:
+      tensor = TFL_InterpreterGetOutputTensor(self.interpreter, (int32_t)index);
+      break;
+  }
+
+  if (tensor == nullptr) {
+    NSString *tensorType = [TFLTensor stringForTensorType:type];
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Failed to get %@ tensor at index (%lu).", tensorType,
+                                   (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToGetTensor
+                                   description:errorDescription
+                                         error:error];
+  }
+
+  return tensor;
+}
+
+- (nullable TFLTensor *)tensorOfType:(TFLTensorType)type
+                             atIndex:(NSUInteger)index
+                               error:(NSError **)error {
+  const TFL_Tensor *tensor = [self cTensorOfType:type atIndex:index error:error];
+
+  if (tensor == nullptr) {
+    return nil;
+  }
+
+  NSString *tensorType = [TFLTensor stringForTensorType:type];
+  const char *cName = TFL_TensorName(tensor);
+  if (cName == nullptr) {
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Failed to get name of %@ tensor at index (%lu).", tensorType,
+                                   (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidTensor
+                                   description:errorDescription
+                                         error:error];
+    return nil;
+  }
+  NSString *name = [NSString stringWithUTF8String:cName];
+
+  TFLTensorDataType dataType = [self tensorDataTypeFromCTensorType:TFL_TensorType(tensor)];
+
+  TFL_QuantizationParams cParams = TFL_TensorQuantizationParams(tensor);
+  TFLQuantizationParameters *quantizationParams;
+
+  // TODO(b/119735362): Update this check once the TFL_QuantizationParams struct has a mode.
+  if (cParams.scale != 0.0) {
+    quantizationParams = [[TFLQuantizationParameters alloc] initWithScale:cParams.scale
+                                                                zeroPoint:cParams.zero_point];
+  }
+
+  // TODO: Set quantization parameters when C API supports it.
+  return [[TFLTensor alloc] initWithInterpreter:self
+                                           type:type
+                                          index:index
+                                           name:name
+                                       dataType:dataType
+                         quantizationParameters:quantizationParams];
+}
+
+- (TFLTensorDataType)tensorDataTypeFromCTensorType:(TFL_Type)cTensorType {
+  switch (cTensorType) {
+    case kTfLiteFloat32:
+      return TFLTensorDataTypeFloat32;
+    case kTfLiteInt32:
+      return TFLTensorDataTypeInt32;
+    case kTfLiteUInt8:
+      return TFLTensorDataTypeUInt8;
+    case kTfLiteInt8:
+      return TFLTensorDataTypeInt8;
+    case kTfLiteInt64:
+      return TFLTensorDataTypeInt64;
+    case kTfLiteBool:
+      return TFLTensorDataTypeBool;
+    case kTfLiteInt16:
+      return TFLTensorDataTypeInt16;
+    case kTfLiteNoType:
+    case kTfLiteString:
+    case kTfLiteComplex64:
+      // kTfLiteString and kTfLiteComplex64 are not supported in TensorFlow Lite Objc API.
+      return TFLTensorDataTypeNoType;
+  }
+}
+
+- (BOOL)isValidTensorIndex:(NSUInteger)index
+                belowLimit:(NSUInteger)totalTensorCount
+                     error:(NSError **)error {
+  if (index >= totalTensorCount) {
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Invalid tensor index (%lu) exceeds max (%lu).",
+                                   (unsigned long)index, (unsigned long)(totalTensorCount - 1)];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidTensorIndex
+                                   description:errorDescription
+                                         error:error];
+    return NO;
+  }
+
+  return YES;
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreterOptions.m b/tensorflow/lite/experimental/objc/sources/TFLInterpreterOptions.m
new file mode 100644
index 0000000000000000000000000000000000000000..d129befecabc5af752ccff70e84a4a66c7ee4bca
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreterOptions.m
@@ -0,0 +1,30 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+@implementation TFLInterpreterOptions
+
+#pragma mark - Public
+
+- (instancetype)init {
+  self = [super init];
+  return self;
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters+Internal.h b/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters+Internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..37d9ef0bb4761c9ff93111ba3158d4c4d68a9ec2
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters+Internal.h
@@ -0,0 +1,33 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+@interface TFLQuantizationParameters (Internal)
+
+/**
+ * Initializes a `TFLQuantizationParameters` instance with the given scale and zero point.
+ *
+ * @param scale Scale of asymmetric quantization.
+ * @param zeroPoint Zero point of asymmetric quantization.
+ *
+ * @return A new instance of `TFLQuantizationParameters` with the given scale and zero point.
+ */
+- (instancetype)initWithScale:(float)scale zeroPoint:(int32_t)zeroPoint;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters.m b/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters.m
new file mode 100644
index 0000000000000000000000000000000000000000..44cb90d3323a73c1f79a27f319ac263c84e94408
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters.m
@@ -0,0 +1,36 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h"
+
+#import "TFLQuantizationParameters+Internal.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+@implementation TFLQuantizationParameters
+
+#pragma mark - TFLTensor (Internal)
+
+- (instancetype)initWithScale:(float)scale zeroPoint:(int32_t)zeroPoint {
+  self = [super init];
+  if (self != nil) {
+    _scale = scale;
+    _zeroPoint = zeroPoint;
+  }
+  return self;
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLTensor+Internal.h b/tensorflow/lite/experimental/objc/sources/TFLTensor+Internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d5c51caabd8e44ab2b30a7b44259f6878865586
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLTensor+Internal.h
@@ -0,0 +1,74 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
+
+@class TFLInterpreter;
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * @enum TFLTensorType
+ * This enum specifies input or output tensor types.
+ */
+typedef NS_ENUM(NSUInteger, TFLTensorType) {
+  /** Input tensor type. */
+  TFLTensorTypeInput,
+
+  /** Output tensor type. */
+  TFLTensorTypeOutput,
+};
+
+@interface TFLTensor (Internal)
+
+/** Input or output tensor type. */
+@property(nonatomic, readonly) TFLTensorType type;
+
+/** Index of the tensor. */
+@property(nonatomic, readonly) NSUInteger index;
+
+/**
+ * Initializes a `TFLTensor` with the given interpreter, name, data type, and quantization
+ * parameters.
+ *
+ * @param interpreter Interpreter backing the tensor.
+ * @param type Input or output tensor type.
+ * @param index Index of the tensor.
+ * @param name Name of the tensor.
+ * @param dataType Data type of the tensor.
+ * @param quantizationParameters Quantization parameters of the tensor. `nil` if the tensor does not
+ *     use quantization.
+ *
+ * @return A new instance of `TFLTensor` with the given name, data type, shape, and quantization
+ *     parameters.
+ */
+- (instancetype)initWithInterpreter:(TFLInterpreter *)interpreter
+                               type:(TFLTensorType)type
+                              index:(NSUInteger)index
+                               name:(NSString *)name
+                           dataType:(TFLTensorDataType)dataType
+             quantizationParameters:(nullable TFLQuantizationParameters *)quantizationParameters;
+
+/**
+ * Returns the string name of the given input or output tensor type.
+ *
+ * @param type Input or output tensor type.
+ *
+ * @return The string name of the given input or output tensor type.
+ */
++ (NSString *)stringForTensorType:(TFLTensorType)type;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLTensor.m b/tensorflow/lite/experimental/objc/sources/TFLTensor.m
new file mode 100644
index 0000000000000000000000000000000000000000..2eaebfd6bec0483817bd4c1c3e540113cca75f5e
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLTensor.m
@@ -0,0 +1,103 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
+
+#import "TFLErrorUtil.h"
+#import "TFLInterpreter+Internal.h"
+#import "TFLTensor+Internal.h"
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+// String names of input or output tensor types.
+static NSString *const kTFLInputTensorTypeString = @"input";
+static NSString *const kTFLOutputTensorTypeString = @"output";
+
+@interface TFLTensor ()
+
+// Redefines readonly properties.
+@property(nonatomic) TFLTensorType type;
+@property(nonatomic) NSUInteger index;
+@property(nonatomic, copy) NSString *name;
+@property(nonatomic) TFLTensorDataType dataType;
+@property(nonatomic, nullable) TFLQuantizationParameters *quantizationParameters;
+
+/**
+ * The backing interpreter. It's a strong reference to ensure that the interpreter is never released
+ * before this tensor is released.
+ *
+ * @warning Never let the interpreter hold a strong reference to the tensor to avoid retain cycles.
+ */
+@property(nonatomic) TFLInterpreter *interpreter;
+
+@end
+
+@implementation TFLTensor
+
+#pragma mark - Public
+
+- (BOOL)copyData:(NSData *)data error:(NSError **)error {
+  if (self.type == TFLTensorTypeOutput) {
+    [TFLErrorUtil
+        saveInterpreterErrorWithCode:TFLInterpreterErrorCodeCopyDataToOutputTensorNotAllowed
+                         description:@"Cannot copy data into an output tensor."
+                               error:error];
+    return NO;
+  }
+
+  return [self.interpreter copyData:data toInputTensorAtIndex:self.index error:error];
+}
+
+- (nullable NSData *)dataWithError:(NSError **)error {
+  return [self.interpreter dataFromTensor:self error:error];
+}
+
+- (nullable NSArray<NSNumber *> *)shapeWithError:(NSError **)error {
+  return [self.interpreter shapeOfTensor:self error:error];
+}
+
+#pragma mark - TFLTensor (Internal)
+
+- (instancetype)initWithInterpreter:(TFLInterpreter *)interpreter
+                               type:(TFLTensorType)type
+                              index:(NSUInteger)index
+                               name:(NSString *)name
+                           dataType:(TFLTensorDataType)dataType
+             quantizationParameters:(nullable TFLQuantizationParameters *)quantizationParameters {
+  self = [super init];
+  if (self != nil) {
+    _interpreter = interpreter;
+    _type = type;
+    _index = index;
+    _name = [name copy];
+    _dataType = dataType;
+    _quantizationParameters = quantizationParameters;
+  }
+  return self;
+}
+
++ (NSString *)stringForTensorType:(TFLTensorType)type {
+  switch (type) {
+    case TFLTensorTypeInput:
+      return kTFLInputTensorTypeString;
+    case TFLTensorTypeOutput:
+      return kTFLOutputTensorTypeString;
+  }
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m b/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
new file mode 100644
index 0000000000000000000000000000000000000000..00b800d6af96636054f2a79f3d4c8d007dd89ea3
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
@@ -0,0 +1,49 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h"
+
+#import <XCTest/XCTest.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * Unit tests for TFLInterpreterOptions.
+ */
+@interface TFLInterpreterOptionsTests : XCTestCase
+@end
+
+@implementation TFLInterpreterOptionsTests
+
+#pragma mark - Tests
+
+- (void)testInit {
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  XCTAssertNotNil(options);
+  XCTAssertEqual(options.numberOfThreads, 0);
+}
+
+- (void)testSetNumberOfThread {
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  options.numberOfThreads = 2;
+  XCTAssertEqual(options.numberOfThreads, 2);
+  options.numberOfThreads = 0;
+  XCTAssertEqual(options.numberOfThreads, 0);
+  options.numberOfThreads = 3;
+  XCTAssertEqual(options.numberOfThreads, 3);
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/tests/TFLInterpreterTests.m b/tensorflow/lite/experimental/objc/tests/TFLInterpreterTests.m
new file mode 100644
index 0000000000000000000000000000000000000000..eefa9b9f05826a0782c0b236a2d7e145428b1ca1
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/tests/TFLInterpreterTests.m
@@ -0,0 +1,358 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+#import <XCTest/XCTest.h>
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h"
+#import "tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h"
+#import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Float model resource name. */
+static NSString *const kAddFloatModelResourceName = @"add";
+
+/** Quantized model resource name. */
+static NSString *const kAddQuantizedModelResourceName = @"add_quantized";
+
+/** Model resource type. */
+static NSString *const kAddModelResourceType = @"bin";
+
+/** Rank of the input and output tensor in the Add model. */
+static const NSUInteger kAddModelTensorRank = 1U;
+
+/** Size of the first (and only) dimension of the input and output tensor in the Add model. */
+static const NSUInteger kAddModelTensorFirstDimensionSize = 2U;
+
+/** Quantization scale of the quantized model. */
+static const float kAddQuantizedModelScale = 0.003922F;
+
+/** Quantization zero point of the quantized model. */
+static const int32_t kAddQuantizedModelZeroPoint = 0;
+
+/** Invalid input tensor index. */
+static const NSUInteger kInvalidInputTensorIndex = 1U;
+
+/** Invalid output tensor index. */
+static const NSUInteger kInvalidOutputTensorIndex = 1U;
+
+/** Accurary used in comparing floating numbers. */
+static const float kTestAccuracy = 1E-5F;
+
+/**
+ * Unit tests for TFLInterpreter.
+ */
+@interface TFLInterpreterTests : XCTestCase
+
+/** Absolute path of the Add float model resource. */
+@property(nonatomic, nullable) NSString *floatModelPath;
+
+/** Default interpreter using the Add model. */
+@property(nonatomic, nullable) TFLInterpreter *interpreter;
+
+@end
+
+@implementation TFLInterpreterTests
+
+#pragma mark - XCTestCase
+
+- (void)setUp {
+  [super setUp];
+
+  NSBundle *bundle = [NSBundle bundleForClass:[self class]];
+  self.floatModelPath = [bundle pathForResource:kAddFloatModelResourceName
+                                         ofType:kAddModelResourceType];
+  NSError *error;
+  self.interpreter = [[TFLInterpreter alloc] initWithModelPath:self.floatModelPath error:&error];
+  XCTAssertNil(error);
+  XCTAssertNotNil(self.interpreter);
+  XCTAssertTrue([self.interpreter allocateTensorsWithError:nil]);
+}
+
+- (void)tearDown {
+  self.floatModelPath = nil;
+  self.interpreter = nil;
+
+  [super tearDown];
+}
+
+#pragma mark - Tests
+
+- (void)testSuccessfulFullRunAddFloatModel {
+  // Shape for both input and output tensor.
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:kAddModelTensorFirstDimensionSize];
+
+  // Creates the interpreter options.
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  XCTAssertNotNil(options);
+  options.numberOfThreads = 2;
+
+  // Creates the interpreter.
+  NSError *error;
+  TFLInterpreter *customInterpreter = [[TFLInterpreter alloc] initWithModelPath:self.floatModelPath
+                                                                        options:options
+                                                                          error:&error];
+  XCTAssertNil(error);
+  XCTAssertNotNil(customInterpreter);
+
+  // Allocates memory for tensors.
+  XCTAssertTrue([customInterpreter allocateTensorsWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies input and output tensor counts.
+  XCTAssertEqual(customInterpreter.inputTensorCount, 1);
+  XCTAssertEqual(customInterpreter.outputTensorCount, 1);
+
+  // Resizes the intput tensor.
+  XCTAssertTrue([customInterpreter resizeInputTensorAtIndex:0 toShape:shape error:&error]);
+  XCTAssertNil(error);
+
+  // Re-allocates memory for tensors.
+  XCTAssertTrue([customInterpreter allocateTensorsWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies the input tensor.
+  TFLTensor *inputTensor = [customInterpreter inputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(inputTensor);
+  XCTAssertNil(error);
+  XCTAssertTrue([inputTensor.name isEqualToString:@"input"]);
+  XCTAssertEqual(inputTensor.dataType, TFLTensorDataTypeFloat32);
+  NSArray *inputTensorShape = [inputTensor shapeWithError:&error];
+  XCTAssertNil(error);
+  XCTAssertTrue([shape isEqualToArray:inputTensorShape]);
+
+  // Copies the input data.
+  NSMutableData *inputData = [NSMutableData dataWithCapacity:0];
+  float one = 1.f;
+  float three = 3.f;
+  [inputData appendBytes:&one length:sizeof(float)];
+  [inputData appendBytes:&three length:sizeof(float)];
+  XCTAssertTrue([inputTensor copyData:inputData error:&error]);
+  XCTAssertNil(error);
+
+  // Invokes the interpreter.
+  XCTAssertTrue([customInterpreter invokeWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies the output tensor.
+  TFLTensor *outputTensor = [customInterpreter outputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(outputTensor);
+  XCTAssertNil(error);
+  XCTAssertTrue([outputTensor.name isEqualToString:@"output"]);
+  XCTAssertEqual(outputTensor.dataType, TFLTensorDataTypeFloat32);
+  NSArray *outputTensorShape = [outputTensor shapeWithError:&error];
+  XCTAssertNil(error);
+  XCTAssertTrue([shape isEqualToArray:outputTensorShape]);
+
+  // Tries to query an invalid output tensor index.
+  TFLTensor *invalidOutputTensor = [customInterpreter outputTensorAtIndex:kInvalidOutputTensorIndex
+                                                                    error:&error];
+  XCTAssertNil(invalidOutputTensor);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidTensorIndex);
+
+  // Gets the output tensor data.
+  error = nil;
+  NSData *outputData = [outputTensor dataWithError:&error];
+  XCTAssertNotNil(outputData);
+  XCTAssertNil(error);
+  float output[kAddModelTensorFirstDimensionSize];
+  [outputData getBytes:output length:(sizeof(float) * kAddModelTensorFirstDimensionSize)];
+  XCTAssertEqualWithAccuracy(output[0], 3.f, kTestAccuracy);
+  XCTAssertEqualWithAccuracy(output[1], 9.f, kTestAccuracy);
+}
+
+- (void)testSuccessfulFullRunQuantizedModel {
+  // Shape for both input and output tensor.
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:kAddModelTensorFirstDimensionSize];
+
+  // Creates the interpreter options.
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  XCTAssertNotNil(options);
+  options.numberOfThreads = 2;
+
+  NSBundle *bundle = [NSBundle bundleForClass:[self class]];
+  NSString *quantizedModelPath = [bundle pathForResource:kAddQuantizedModelResourceName
+                                                  ofType:kAddModelResourceType];
+
+  // Creates the interpreter.
+  NSError *error;
+  TFLInterpreter *customInterpreter =
+      [[TFLInterpreter alloc] initWithModelPath:quantizedModelPath options:options error:&error];
+  XCTAssertNil(error);
+  XCTAssertNotNil(customInterpreter);
+
+  // Allocates memory for tensors.
+  XCTAssertTrue([customInterpreter allocateTensorsWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies input and output tensor counts.
+  XCTAssertEqual(customInterpreter.inputTensorCount, 1);
+  XCTAssertEqual(customInterpreter.outputTensorCount, 1);
+
+  // Resizes the intput tensor.
+  XCTAssertTrue([customInterpreter resizeInputTensorAtIndex:0 toShape:shape error:&error]);
+  XCTAssertNil(error);
+
+  // Re-allocates memory for tensors.
+  XCTAssertTrue([customInterpreter allocateTensorsWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies the input tensor.
+  TFLTensor *inputTensor = [customInterpreter inputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(inputTensor);
+  XCTAssertNil(error);
+  XCTAssertTrue([inputTensor.name isEqualToString:@"input"]);
+  XCTAssertEqual(inputTensor.dataType, TFLTensorDataTypeUInt8);
+  XCTAssertEqualWithAccuracy(inputTensor.quantizationParameters.scale, kAddQuantizedModelScale,
+                             kTestAccuracy);
+  XCTAssertEqual(inputTensor.quantizationParameters.zeroPoint, kAddQuantizedModelZeroPoint);
+  NSArray *inputTensorShape = [inputTensor shapeWithError:&error];
+  XCTAssertNil(error);
+  XCTAssertTrue([shape isEqualToArray:inputTensorShape]);
+
+  // Copies the input data.
+  NSMutableData *inputData = [NSMutableData dataWithCapacity:0];
+  uint8_t one = 1;
+  uint8_t three = 3;
+  [inputData appendBytes:&one length:sizeof(uint8_t)];
+  [inputData appendBytes:&three length:sizeof(uint8_t)];
+  XCTAssertTrue([inputTensor copyData:inputData error:&error]);
+  XCTAssertNil(error);
+
+  // Invokes the interpreter.
+  XCTAssertTrue([customInterpreter invokeWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies the output tensor.
+  TFLTensor *outputTensor = [customInterpreter outputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(outputTensor);
+  XCTAssertNil(error);
+  XCTAssertTrue([outputTensor.name isEqualToString:@"output"]);
+  XCTAssertEqual(outputTensor.dataType, TFLTensorDataTypeUInt8);
+  XCTAssertEqualWithAccuracy(outputTensor.quantizationParameters.scale, kAddQuantizedModelScale,
+                             kTestAccuracy);
+  XCTAssertEqual(outputTensor.quantizationParameters.zeroPoint, kAddQuantizedModelZeroPoint);
+  NSArray *outputTensorShape = [outputTensor shapeWithError:&error];
+  XCTAssertNil(error);
+  XCTAssertTrue([shape isEqualToArray:outputTensorShape]);
+
+  // Tries to query an invalid output tensor index.
+  TFLTensor *invalidOutputTensor = [customInterpreter outputTensorAtIndex:kInvalidOutputTensorIndex
+                                                                    error:&error];
+  XCTAssertNil(invalidOutputTensor);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidTensorIndex);
+
+  // Gets the output tensor data.
+  error = nil;
+  NSData *outputData = [outputTensor dataWithError:&error];
+  XCTAssertNotNil(outputData);
+  XCTAssertNil(error);
+  uint8_t output[kAddModelTensorFirstDimensionSize];
+  [outputData getBytes:output length:(sizeof(uint8_t) * kAddModelTensorFirstDimensionSize)];
+  XCTAssertEqual(output[0], 3);
+  XCTAssertEqual(output[1], 9);
+}
+
+- (void)testInitWithModelPath_invalidPath {
+  // Shape for both input and output tensor.
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:kAddModelTensorFirstDimensionSize];
+
+  // Creates the interpreter.
+  NSError *error;
+  TFLInterpreter *brokenInterpreter = [[TFLInterpreter alloc] initWithModelPath:@"InvalidPath"
+                                                                          error:&error];
+  XCTAssertNil(brokenInterpreter);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeFailedToLoadModel);
+}
+
+- (void)testInvoke_beforeAllocation {
+  NSError *error;
+  TFLInterpreter *interpreterWithoutAllocation =
+      [[TFLInterpreter alloc] initWithModelPath:self.floatModelPath error:&error];
+  XCTAssertNotNil(interpreterWithoutAllocation);
+  XCTAssertNil(error);
+
+  XCTAssertFalse([interpreterWithoutAllocation invokeWithError:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeFailedToInvoke);
+}
+
+- (void)testInputTensorAtIndex_invalidIndex {
+  NSError *error;
+  TFLTensor *inputTensor = [self.interpreter inputTensorAtIndex:kInvalidInputTensorIndex
+                                                          error:&error];
+  XCTAssertNil(inputTensor);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidTensorIndex);
+}
+
+- (void)testResizeInputTensorAtIndex_invalidIndex {
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:kAddModelTensorFirstDimensionSize];
+  NSError *error;
+  XCTAssertFalse([self.interpreter resizeInputTensorAtIndex:kInvalidInputTensorIndex
+                                                    toShape:shape
+                                                      error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidTensorIndex);
+}
+
+- (void)testResizeInputTensorAtIndex_emptyShape {
+  NSMutableArray *emptyShape = [NSMutableArray arrayWithCapacity:0];
+  NSError *error;
+  XCTAssertFalse([self.interpreter resizeInputTensorAtIndex:0 toShape:emptyShape error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidShape);
+}
+
+- (void)testResizeInputTensorAtIndex_zeroDimensionSize {
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:0];
+  NSError *error;
+  XCTAssertFalse([self.interpreter resizeInputTensorAtIndex:0 toShape:shape error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidShape);
+}
+
+- (void)testCopyDataToInputTensorAtIndex_invalidInputDataByteSize {
+  NSMutableData *inputData = [NSMutableData dataWithCapacity:0];
+  float one = 1.f;
+  float three = 3.f;
+  [inputData appendBytes:&one length:sizeof(float)];
+  [inputData appendBytes:&three length:(sizeof(float) - 1)];
+  NSError *error;
+  TFLTensor *inputTensor = [self.interpreter inputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(inputTensor);
+  XCTAssertNil(error);
+  XCTAssertFalse([inputTensor copyData:inputData error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidInputByteSize);
+}
+
+- (void)testCopyDataToOutputTensorAtIndex_notAllowed {
+  NSMutableData *data = [NSMutableData dataWithCapacity:0];
+  float one = 1.f;
+  float three = 3.f;
+  [data appendBytes:&one length:sizeof(float)];
+  [data appendBytes:&three length:(sizeof(float) - 1)];
+  NSError *error;
+  TFLTensor *outputTensor = [self.interpreter outputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(outputTensor);
+  XCTAssertNil(error);
+  XCTAssertFalse([outputTensor copyData:data error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeCopyDataToOutputTensorNotAllowed);
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/tests/TFLQuantizationParametersTests.m b/tensorflow/lite/experimental/objc/tests/TFLQuantizationParametersTests.m
new file mode 100644
index 0000000000000000000000000000000000000000..239e0bcb0dee8b6d2258be6f7e1ae2591611f501
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/tests/TFLQuantizationParametersTests.m
@@ -0,0 +1,48 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h"
+
+#import <XCTest/XCTest.h>
+
+#import "tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters+Internal.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Test scale of quantization parameters. */
+static const float kTestScale = 2.0;
+
+/** Test zero point of quantization parameters. */
+static const int32_t kTestZeroPoint = 128;
+
+/**
+ * Unit tests for TFLQuantizationParameters.
+ */
+@interface TFLQuantizationParametersTests : XCTestCase
+@end
+
+@implementation TFLQuantizationParametersTests
+
+#pragma mark - Tests
+
+- (void)testInitWithScaleAndZeroPoint {
+  TFLQuantizationParameters *params =
+      [[TFLQuantizationParameters alloc] initWithScale:kTestScale zeroPoint:kTestZeroPoint];
+  XCTAssertEqual(params.scale, kTestScale);
+  XCTAssertEqual(params.zeroPoint, kTestZeroPoint);
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/swift/BUILD.apple b/tensorflow/lite/experimental/swift/BUILD.apple
new file mode 100644
index 0000000000000000000000000000000000000000..35875bbbe9e4e4c5439bad97248d23534b87a818
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/BUILD.apple
@@ -0,0 +1,106 @@
+# TensorFlow Lite for Swift.
+
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("@build_bazel_rules_apple//apple:ios.bzl", "ios_application", "ios_unit_test")
+load("@build_bazel_rules_swift//swift:swift.bzl", "swift_library")
+
+MINIMUM_OS_VERSION = "9.0"
+
+SWIFT_COPTS = [
+    "-wmo",
+]
+
+# Default tags for filtering targets. Targets in this file are restricted to Apple platforms.
+DEFAULT_TAGS = [
+    "apple",
+    "manual",
+]
+
+swift_library(
+    name = "TensorFlowLite",
+    srcs = glob(["Sources/*.swift"]),
+    copts = SWIFT_COPTS,
+    module_name = "TensorFlowLite",
+    tags = DEFAULT_TAGS,
+    deps = [
+        "//tensorflow/lite/experimental/c:c_api",
+    ],
+)
+
+ios_unit_test(
+    name = "TensorFlowLiteTests",
+    size = "small",
+    minimum_os_version = MINIMUM_OS_VERSION,
+    tags = DEFAULT_TAGS + [
+        # DISABLED: Following sanitizer tests are not supported by iOS test targets.
+        "noasan",
+        "nomsan",
+        "notsan",
+    ],
+    deps = [":TensorFlowLiteTestsLib"],
+)
+
+swift_library(
+    name = "TensorFlowLiteTestsLib",
+    testonly = 1,
+    srcs = glob(["Tests/*.swift"]),
+    copts = SWIFT_COPTS,
+    tags = DEFAULT_TAGS,
+    deps = [
+        ":TensorFlowLite",
+        ":TestResources",
+    ],
+)
+
+objc_library(
+    name = "TestResources",
+    resources = [
+        "//tensorflow/lite:testdata/add.bin",
+        "//tensorflow/lite:testdata/add_quantized.bin",
+        "//tensorflow/lite:testdata/multi_add.bin",
+    ],
+    tags = DEFAULT_TAGS,
+)
+
+ios_application(
+    name = "TensorFlowLiteApp",
+    app_icons = glob(["TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/AppIcon.appiconset/**"]),
+    bundle_id = "com.tensorflow.lite.swift.TensorFlowLite",
+    families = [
+        "ipad",
+        "iphone",
+    ],
+    infoplists = ["TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Info.plist"],
+    launch_storyboard = "TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/LaunchScreen.storyboard",
+    minimum_os_version = MINIMUM_OS_VERSION,
+    sdk_frameworks = [
+        "CoreGraphics",
+    ],
+    tags = DEFAULT_TAGS,
+    deps = [":TensorFlowLiteAppLib"],
+)
+
+swift_library(
+    name = "TensorFlowLiteAppLib",
+    srcs = glob(["TestApps/TensorFlowLiteApp/TensorFlowLiteApp/*.swift"]),
+    module_name = "TensorFlowLiteAppLib",
+    tags = DEFAULT_TAGS,
+    deps = [
+        ":TensorFlowLite",
+        ":TensorFlowLiteAppResources",
+    ],
+)
+
+objc_library(
+    name = "TensorFlowLiteAppResources",
+    storyboards = glob([
+        "TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/*.storyboard",
+    ]),
+    tags = DEFAULT_TAGS,
+    deps = [":TestResources"],
+)
diff --git a/tensorflow/lite/experimental/swift/LICENSE b/tensorflow/lite/experimental/swift/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..d645695673349e3947e8e5ae42332d0ac3164cd7
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/tensorflow/lite/experimental/swift/README.md b/tensorflow/lite/experimental/swift/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cf7eeac6b402698785ff0829bf29a8fb5e471065
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/README.md
@@ -0,0 +1,78 @@
+# TensorFlow Lite for Swift
+
+[TensorFlow Lite](https://www.tensorflow.org/lite/) is TensorFlow's lightweight
+solution for Swift developers. It enables low-latency inference of on-device
+machine learning models with a small binary size and fast performance supporting
+hardware acceleration.
+
+## Getting Started
+
+### Bazel
+
+In your `BUILD` file, add the `TensorFlowLite` dependency:
+
+```python
+swift_library(
+  deps = [
+      "//tensorflow/lite/experimental/swift:TensorFlowLite",
+  ],
+)
+```
+
+In your Swift files, import the module:
+
+```swift
+import TensorFlowLite
+```
+
+If you would like to build the Swift TensorFlow Lite library using Bazel on Apple
+platforms, clone or download the [TensorFlow GitHub repo](https://github.com/tensorflow/tensorflow),
+then navigate to the root `tensorflow` directory and execute the `configure.py` script:
+
+```shell
+python configure.py
+```
+
+Follow the prompts and when asked to configure the Bazel rules for Apple
+platforms, enter `y`.
+
+Build the `TensorFlowLite` Swift library target:
+
+```shell
+bazel build tensorflow/lite/experimental/swift:TensorFlowLite
+```
+
+Build the `TensorFlowLiteTests` target:
+
+```shell
+bazel test tensorflow/lite/experimental/swift:TensorFlowLiteTests --swiftcopt=-enable-testing
+```
+
+Note that `--swiftcopt=-enable-testing` is required for optimized builds (`-c opt`).
+
+### Tulsi
+
+Open the `TensorFlowLite.tulsiproj` using the [TulsiApp](https://github.com/bazelbuild/tulsi) or by
+running the [`generate_xcodeproj.sh`](https://github.com/bazelbuild/tulsi/blob/master/src/tools/generate_xcodeproj.sh)
+script:
+
+```shell
+generate_xcodeproj.sh --genconfig tensorflow/lite/swift/TensorFlowLite.tulsiproj:TensorFlowLite --outputfolder ~/path/to/generated/TensorFlowLite.xcodeproj
+```
+
+### CocoaPods
+
+Add the following to your `Podfile`:
+
+```ruby
+use_frameworks!
+pod 'TensorFlowLiteSwift'
+```
+
+Then, run `pod install`.
+
+In your Swift files, import the module:
+
+```swift
+import TensorFlowLite
+```
diff --git a/tensorflow/lite/experimental/swift/Sources/Interpreter.swift b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
new file mode 100644
index 0000000000000000000000000000000000000000..a14b5966b1a24946137fddae0ddea16ed43ba46c
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
@@ -0,0 +1,265 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+import TensorFlowLiteCAPI
+
+/// A TensorFlow Lite interpreter that performs inference from a given model.
+public final class Interpreter {
+
+  /// The `TFL_Interpreter` C pointer type represented as an `UnsafePointer<TFL_Interpreter>`.
+  private typealias CInterpreter = OpaquePointer
+
+  /// Total number of input tensors associated with the model.
+  public var inputTensorCount: Int {
+    return Int(TFL_InterpreterGetInputTensorCount(cInterpreter))
+  }
+
+  /// Total number of output tensors associated with the model.
+  public var outputTensorCount: Int {
+    return Int(TFL_InterpreterGetOutputTensorCount(cInterpreter))
+  }
+
+  /// The underlying `TFL_Interpreter` C pointer.
+  private var cInterpreter: CInterpreter?
+
+  /// Creates a new model interpreter instance.
+  ///
+  /// - Parameters:
+  ///   - modelPath: Local file path to a TensorFlow Lite model.
+  ///   - options: Custom configurations for the interpreter. The default is `nil` indicating that
+  ///       interpreter will determine the configuration options.
+  /// - Throws: An error if the model could not be loaded or the interpreter could not be created.
+  public init(modelPath: String, options: InterpreterOptions? = nil) throws {
+    guard let model = Model(filePath: modelPath) else { throw InterpreterError.failedToLoadModel }
+
+    let cInterpreterOptions: OpaquePointer? = try options.map { options in
+      guard let cOptions = TFL_NewInterpreterOptions() else {
+        throw InterpreterError.failedToCreateInterpreter
+      }
+      if let threadCount = options.threadCount, threadCount > 0 {
+        TFL_InterpreterOptionsSetNumThreads(cOptions, Int32(threadCount))
+      }
+      if options.isErrorLoggingEnabled {
+        TFL_InterpreterOptionsSetErrorReporter(
+          cOptions,
+          { (_, format, arguments) in
+            guard let cFormat = format,
+                  let message = String(cFormat: cFormat, arguments: arguments)
+            else {
+              return
+            }
+            print(String(describing: InterpreterError.tensorFlowLiteError(message)))
+          },
+          nil
+        )
+      }
+      return cOptions
+    }
+    defer { TFL_DeleteInterpreterOptions(cInterpreterOptions) }
+
+    guard let cInterpreter = TFL_NewInterpreter(model.cModel, cInterpreterOptions) else {
+      throw InterpreterError.failedToCreateInterpreter
+    }
+    self.cInterpreter = cInterpreter
+  }
+
+  deinit {
+    TFL_DeleteInterpreter(cInterpreter)
+  }
+
+  /// Invokes the interpreter to perform inference from the loaded graph.
+  ///
+  /// - Throws: An error if the model was not ready because tensors were not allocated.
+  public func invoke() throws {
+    guard TFL_InterpreterInvoke(cInterpreter) == kTfLiteOk else {
+      // TODO(b/117510052): Determine which error to throw.
+      throw InterpreterError.allocateTensorsRequired
+    }
+  }
+
+  /// Returns the input tensor at the given index.
+  ///
+  /// - Parameters:
+  ///   - index: The index for the input tensor.
+  /// - Throws: An error if the index is invalid or the tensors have not been allocated.
+  /// - Returns: The input tensor at the given index.
+  public func input(at index: Int) throws -> Tensor {
+    let maxIndex = inputTensorCount - 1
+    guard case 0...maxIndex = index else {
+      throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
+    }
+    guard let cTensor = TFL_InterpreterGetInputTensor(cInterpreter, Int32(index)),
+          let bytes = TFL_TensorData(cTensor),
+          let nameCString = TFL_TensorName(cTensor)
+    else {
+      throw InterpreterError.allocateTensorsRequired
+    }
+    guard let dataType = TensorDataType(type: TFL_TensorType(cTensor)) else {
+      throw InterpreterError.invalidTensorDataType
+    }
+
+    let name = String(cString: nameCString)
+    let rank = TFL_TensorNumDims(cTensor)
+    let dimensions = (0..<rank).map { Int(TFL_TensorDim(cTensor, $0)) }
+    let shape = TensorShape(dimensions)
+    let byteCount = TFL_TensorByteSize(cTensor)
+    let data = Data(bytes: bytes, count: byteCount)
+    let cQuantizationParams = TFL_TensorQuantizationParams(cTensor)
+    let scale = cQuantizationParams.scale
+    let zeroPoint = Int(cQuantizationParams.zero_point)
+    var quantizationParameters: QuantizationParameters? = nil
+    if scale != 0.0 {
+      // TODO(b/117510052): Update this check once the TfLiteQuantizationParams struct has a mode.
+      quantizationParameters = QuantizationParameters(scale: scale, zeroPoint: zeroPoint)
+    }
+    let tensor = Tensor(
+      name: name,
+      dataType: dataType,
+      shape: shape,
+      data: data,
+      quantizationParameters: quantizationParameters
+    )
+    return tensor
+  }
+
+  /// Returns the output tensor at the given index.
+  ///
+  /// - Parameters:
+  ///   - index: The index for the output tensor.
+  /// - Throws: An error if the index is invalid, tensors haven't been allocated, or interpreter
+  ///     hasn't been invoked for models that dynamically compute output tensors based on the values
+  ///     of its input tensors.
+  /// - Returns: The output tensor at the given index.
+  public func output(at index: Int) throws -> Tensor {
+    let maxIndex = outputTensorCount - 1
+    guard case 0...maxIndex = index else {
+      throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
+    }
+    guard let cTensor = TFL_InterpreterGetOutputTensor(cInterpreter, Int32(index)),
+          let bytes = TFL_TensorData(cTensor),
+          let nameCString = TFL_TensorName(cTensor)
+    else {
+      // TODO(b/117510052): Determine which error to throw.
+      throw InterpreterError.invokeInterpreterRequired
+    }
+    guard let dataType = TensorDataType(type: TFL_TensorType(cTensor)) else {
+      throw InterpreterError.invalidTensorDataType
+    }
+
+    let name = String(cString: nameCString)
+    let rank = TFL_TensorNumDims(cTensor)
+    let dimensions = (0..<rank).map { Int(TFL_TensorDim(cTensor, $0)) }
+    let shape = TensorShape(dimensions)
+    let byteCount = TFL_TensorByteSize(cTensor)
+    let data = Data(bytes: bytes, count: byteCount)
+    let cQuantizationParams = TFL_TensorQuantizationParams(cTensor)
+    let scale = cQuantizationParams.scale
+    let zeroPoint = Int(cQuantizationParams.zero_point)
+    var quantizationParameters: QuantizationParameters? = nil
+    if scale != 0.0 {
+      // TODO(b/117510052): Update this check once the TfLiteQuantizationParams struct has a mode.
+      quantizationParameters = QuantizationParameters(scale: scale, zeroPoint: zeroPoint)
+    }
+    let tensor = Tensor(
+      name: name,
+      dataType: dataType,
+      shape: shape,
+      data: data,
+      quantizationParameters: quantizationParameters
+    )
+    return tensor
+  }
+
+  /// Resizes the input tensor at the given index to the specified tensor shape.
+  ///
+  /// - Note: After resizing an input tensor, the client **must** explicitly call
+  ///     `allocateTensors()` before attempting to access the resized tensor data or invoking the
+  ///     interpreter to perform inference.
+  /// - Parameters:
+  ///   - index: The index for the input tensor.
+  ///   - shape: The shape that the input tensor should be resized to.
+  /// - Throws: An error if the input tensor at the given index could not be resized.
+  public func resizeInput(at index: Int, to shape: TensorShape) throws {
+    let maxIndex = inputTensorCount - 1
+    guard case 0...maxIndex = index else {
+      throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
+    }
+    guard TFL_InterpreterResizeInputTensor(
+            cInterpreter,
+            Int32(index),
+            shape.int32Dimensions,
+            Int32(shape.rank)
+          ) == kTfLiteOk
+    else {
+      throw InterpreterError.failedToResizeInputTensor(index: index)
+    }
+  }
+
+  /// Copies the given data to the input tensor at the given index.
+  ///
+  /// - Parameters:
+  ///   - data: The data to be copied to the input tensor's data buffer.
+  ///   - index: The index for the input tensor.
+  /// - Throws: An error if the `data.count` does not match the input tensor's `data.count` or if
+  ///     the given index is invalid.
+  /// - Returns: The input tensor with the copied data.
+  @discardableResult
+  public func copy(_ data: Data, toInputAt index: Int) throws -> Tensor {
+    let maxIndex = inputTensorCount - 1
+    guard case 0...maxIndex = index else {
+      throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
+    }
+    guard let cTensor = TFL_InterpreterGetInputTensor(cInterpreter, Int32(index)) else {
+      throw InterpreterError.allocateTensorsRequired
+    }
+
+    let byteCount = TFL_TensorByteSize(cTensor)
+    guard data.count == byteCount else {
+      throw InterpreterError.invalidTensorDataCount(provided: data.count, required: byteCount)
+    }
+
+    let status = data.withUnsafeBytes { TFL_TensorCopyFromBuffer(cTensor, $0, data.count) }
+    guard status == kTfLiteOk else { throw InterpreterError.failedToCopyDataToInputTensor }
+    return try input(at: index)
+  }
+
+  /// Allocates memory for all input tensors based on their `TensorShape`s.
+  ///
+  /// - Note: This is a relatively expensive operation and should only be called after creating the
+  ///     interpreter and/or resizing any input tensors.
+  /// - Throws: An error if memory could not be allocated for the input tensors.
+  public func allocateTensors() throws {
+    guard TFL_InterpreterAllocateTensors(cInterpreter) == kTfLiteOk else {
+      throw InterpreterError.failedToAllocateTensors
+    }
+  }
+}
+
+// MARK: - Extensions
+
+extension String {
+  /// Returns a new `String` initialized by using the given format C array as a template into which
+  /// the remaining argument values are substituted according to the user’s default locale.
+  ///
+  /// - Note: Returns `nil` if a new `String` could not be constructed from the given values.
+  /// - Parameters:
+  ///   - cFormat: The format C array as a template for substituting values.
+  ///   - arguments: A C pointer to a `va_list` of arguments to substitute into `cFormat`.
+  init?(cFormat: UnsafePointer<CChar>, arguments: CVaListPointer) {
+    var buffer: UnsafeMutablePointer<CChar>?
+    guard vasprintf(&buffer, cFormat, arguments) != 0, let cString = buffer else { return nil }
+    self.init(validatingUTF8: cString)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Sources/InterpreterError.swift b/tensorflow/lite/experimental/swift/Sources/InterpreterError.swift
new file mode 100644
index 0000000000000000000000000000000000000000..5de58b997a76b6bf9493525694bc9f9e4e6b6c1c
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/InterpreterError.swift
@@ -0,0 +1,99 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// TensorFlow Lite interpreter errors.
+public enum InterpreterError: Error {
+  case invalidTensorIndex(index: Int, maxIndex: Int)
+  case invalidTensorDataCount(provided: Int, required: Int)
+  case invalidTensorDataType
+  case failedToLoadModel
+  case failedToCreateInterpreter
+  case failedToResizeInputTensor(index: Int)
+  case failedToCopyDataToInputTensor
+  case failedToAllocateTensors
+  case allocateTensorsRequired
+  case invokeInterpreterRequired
+  case tensorFlowLiteError(String)
+}
+
+// MARK: - Extensions
+
+extension InterpreterError: LocalizedError {
+  /// Localized description of the interpreter error.
+  public var errorDescription: String? {
+    switch self {
+    case .invalidTensorIndex(let index, let maxIndex):
+      return "Invalid tensor index \(index), max index is \(maxIndex)."
+    case .invalidTensorDataCount(let providedCount, let requiredCount):
+      return "Provided data count \(providedCount) must match the required count \(requiredCount)."
+    case .invalidTensorDataType:
+      return "Tensor data type is unsupported or could not be determined because of a model error."
+    case .failedToLoadModel:
+      return "Failed to load the given model."
+    case .failedToCreateInterpreter:
+      return "Failed to create the interpreter."
+    case .failedToResizeInputTensor(let index):
+      return "Failed to resize input tesnor at index \(index)."
+    case .failedToCopyDataToInputTensor:
+      return "Failed to copy data to input tensor."
+    case .failedToAllocateTensors:
+      return "Failed to allocate memory for input tensors."
+    case .allocateTensorsRequired:
+      return "Must call allocateTensors()."
+    case .invokeInterpreterRequired:
+      return "Must call invoke()."
+    case .tensorFlowLiteError(let message):
+      return "TensorFlow Lite Error: \(message)"
+    }
+  }
+}
+
+extension InterpreterError: CustomStringConvertible {
+  /// Textual representation of the TensorFlow Lite interpreter error.
+  public var description: String {
+    return errorDescription ?? "Unknown error."
+  }
+}
+
+#if swift(>=4.2)
+extension InterpreterError: Equatable {}
+#else
+extension InterpreterError: Equatable {
+  public static func == (lhs: InterpreterError, rhs: InterpreterError) -> Bool {
+    switch (lhs, rhs) {
+    case (.invalidTensorDataType, .invalidTensorDataType),
+         (.failedToLoadModel, .failedToLoadModel),
+         (.failedToCreateInterpreter, .failedToCreateInterpreter),
+         (.failedToAllocateTensors, .failedToAllocateTensors),
+         (.allocateTensorsRequired, .allocateTensorsRequired),
+         (.invokeInterpreterRequired, .invokeInterpreterRequired):
+      return true
+    case (.invalidTensorIndex(let lhsIndex, let lhsMaxIndex),
+          .invalidTensorIndex(let rhsIndex, let rhsMaxIndex)):
+      return lhsIndex == rhsIndex && lhsMaxIndex == rhsMaxIndex
+    case (.invalidTensorDataCount(let lhsProvidedCount, let lhsRequiredCount),
+          .invalidTensorDataCount(let rhsProvidedCount, let rhsRequiredCount)):
+      return lhsProvidedCount == rhsProvidedCount && lhsRequiredCount == rhsRequiredCount
+    case (.failedToResizeInputTensor(let lhsIndex), .failedToResizeInputTensor(let rhsIndex)):
+      return lhsIndex == rhsIndex
+    case (.tensorFlowLiteError(let lhsMessage), .tensorFlowLiteError(let rhsMessage)):
+      return lhsMessage == rhsMessage
+    default:
+      return false
+    }
+  }
+}
+#endif  // swift(>=4.2)
diff --git a/tensorflow/lite/experimental/swift/Sources/InterpreterOptions.swift b/tensorflow/lite/experimental/swift/Sources/InterpreterOptions.swift
new file mode 100644
index 0000000000000000000000000000000000000000..2365fd7ade0f9562250b239308f6a13b16c35784
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/InterpreterOptions.swift
@@ -0,0 +1,29 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// Custom configuration options for a TensorFlow Lite interpreter.
+public struct InterpreterOptions: Equatable {
+
+  /// Maximum number of CPU threads that the interpreter should run on. Default is `nil` which
+  /// indicates that the `Interpreter` will decide the number of threads to use.
+  public var threadCount: Int? = nil
+
+  /// Whether error logging to the console is enabled. The default is `false`.
+  public var isErrorLoggingEnabled = false
+
+  /// Creates a new instance of interpreter options.
+  public init() {}
+}
diff --git a/tensorflow/lite/experimental/swift/Sources/Model.swift b/tensorflow/lite/experimental/swift/Sources/Model.swift
new file mode 100644
index 0000000000000000000000000000000000000000..e8c49ff1ae10cc20d1c50b8e8340950cb1491722
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/Model.swift
@@ -0,0 +1,40 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+import TensorFlowLiteCAPI
+
+/// A TensorFlow Lite model used by the 'Interpreter` to perform inference.
+final class Model {
+
+  /// The `TFL_Model` C pointer type represented as an `UnsafePointer<TFL_Model>`.
+  typealias CModel = OpaquePointer
+
+  /// The underlying `TFL_Model` C pointer.
+  let cModel: CModel?
+
+  /// Creates a new model instance.
+  ///
+  /// - Precondition: Initialization can fail if the given `filePath` is invalid.
+  /// - Parameters:
+  ///   - filePath: Local file path to a TensorFlow Lite model.
+  init?(filePath: String) {
+    guard !filePath.isEmpty, let cModel = TFL_NewModelFromFile(filePath) else { return nil }
+    self.cModel = cModel
+  }
+
+  deinit {
+    TFL_DeleteModel(cModel)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Sources/QuantizationParameters.swift b/tensorflow/lite/experimental/swift/Sources/QuantizationParameters.swift
new file mode 100644
index 0000000000000000000000000000000000000000..f36787564478115e19584b933a10fb0458e06c71
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/QuantizationParameters.swift
@@ -0,0 +1,38 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// Parameters that determine the mapping of quantized values to real values. Quantized values can
+/// be mapped to float values using the following conversion:
+/// `realValue = scale * (quantizedValue - zeroPoint)`.
+public struct QuantizationParameters {
+
+  /// Difference between real values corresponding to consecutive quantized values differing by 1.
+  /// For example, the range of quantized values for `UInt8` data type is [0, 255].
+  public let scale: Float
+
+  /// Quantized value that corresponds to the real 0 value.
+  public let zeroPoint: Int
+
+  /// Creates a new quantization parameters instance.
+  ///
+  /// - Parameters:
+  ///   - scale: Scale value for asymmetric quantization.
+  ///   - zeroPoint: Zero point for asymmetric quantization.
+  init(scale: Float, zeroPoint: Int) {
+    self.scale = scale
+    self.zeroPoint = zeroPoint
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Sources/Tensor.swift b/tensorflow/lite/experimental/swift/Sources/Tensor.swift
new file mode 100644
index 0000000000000000000000000000000000000000..b738d8754914e20ac4c1cb991c92b029828f66d2
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/Tensor.swift
@@ -0,0 +1,138 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+import TensorFlowLiteCAPI
+
+/// An input or output tensor in a TensorFlow Lite graph.
+public struct Tensor {
+
+  /// Name of the tensor.
+  public let name: String
+
+  /// Data type of the tensor.
+  public let dataType: TensorDataType
+
+  /// Shape of the tensor.
+  public let shape: TensorShape
+
+  /// Data in the input or output tensor.
+  public let data: Data
+
+  /// Quantization parameters for the tensor if using a quantized model.
+  public let quantizationParameters: QuantizationParameters?
+
+  /// Creates a new input or output tensor instance.
+  ///
+  /// - Parameters:
+  ///   - name: Name of the tensor.
+  ///   - dataType: Data type of the tensor.
+  ///   - data: Data in the input tensor.
+  ///   - quantizationParameters Quantization parameters for the tensor if using a quantized model.
+  ///       The default is `nil`.
+  init(
+    name: String,
+    dataType: TensorDataType,
+    shape: TensorShape,
+    data: Data,
+    quantizationParameters: QuantizationParameters? = nil
+  ) {
+    self.name = name
+    self.dataType = dataType
+    self.shape = shape
+    self.data = data
+    self.quantizationParameters = quantizationParameters
+  }
+}
+
+/// Supported TensorFlow Lite tensor data types.
+public enum TensorDataType: Equatable {
+  /// 32-bit single precision floating point tensor data type.
+  case float32
+  /// 8-bit unsigned integer tensor data type.
+  case uInt8
+  /// 16-bit signed integer tensor data type.
+  case int16
+  /// 32-bit signed integer tensor data type.
+  case int32
+  /// 64-bit signed integer tensor data type.
+  case int64
+  /// Boolean tensor data type.
+  case bool
+
+  /// Creates a new tensor data type from the given `TFL_Type` or `nil` if the data type is
+  /// unsupported or could not be determined because there was an error.
+  ///
+  /// - Parameter type: A data type supported by a tensor.
+  init?(type: TFL_Type) {
+    switch type {
+    case kTfLiteFloat32:
+      self = .float32
+    case kTfLiteUInt8:
+      self = .uInt8
+    case kTfLiteInt16:
+      self = .int16
+    case kTfLiteInt32:
+      self = .int32
+    case kTfLiteInt64:
+      self = .int64
+    case kTfLiteBool:
+      self = .bool
+    case kTfLiteNoType:
+      fallthrough
+    default:
+      return nil
+    }
+  }
+}
+
+/// The shape of a TensorFlow Lite tensor.
+public struct TensorShape {
+
+  /// The number of dimensions of the tensor.
+  public let rank: Int
+
+  /// Array of dimensions for the tensor.
+  public let dimensions: [Int]
+
+  /// Array of `Int32` dimensions for the tensor.
+  var int32Dimensions: [Int32] { return dimensions.map(Int32.init) }
+
+  /// Creates a new tensor shape instance with the given array of dimensions.
+  ///
+  /// - Parameters:
+  ///   - dimensions: Dimensions for the tensor.
+  public init(_ dimensions: [Int]) {
+    self.rank = dimensions.count
+    self.dimensions = dimensions
+  }
+
+  /// Creates a new tensor shape instance with the given elements representing the dimensions.
+  ///
+  /// - Parameters:
+  ///   - elements: Dimensions for the tensor.
+  public init(_ elements: Int...) {
+    self.init(elements)
+  }
+}
+
+extension TensorShape: ExpressibleByArrayLiteral {
+  /// Creates a new tensor shape instance with the given array literal representing the dimensions.
+  ///
+  /// - Parameters:
+  ///   - arrayLiteral: Dimensions for the tensor.
+  public init(arrayLiteral: Int...) {
+    self.init(arrayLiteral)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/Configs/TensorFlowLite.tulsigen b/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/Configs/TensorFlowLite.tulsigen
new file mode 100644
index 0000000000000000000000000000000000000000..16bc6cbfe8f554caad2cba3cae11b364b34ed64d
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/Configs/TensorFlowLite.tulsigen
@@ -0,0 +1,57 @@
+{
+  "sourceFilters" : [
+    "tensorflow/lite/experimental/c",
+    "tensorflow/lite/experimental/swift",
+    "tensorflow/lite/experimental/swift/Sources",
+    "tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp",
+    "tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj",
+    "tensorflow/lite/experimental/swift/Tests",
+  ],
+  "buildTargets" : [
+    "//tensorflow/lite/experimental/swift:TensorFlowLite",
+    "//tensorflow/lite/experimental/swift:TensorFlowLiteApp",
+    "//tensorflow/lite/experimental/swift:TensorFlowLiteTests",
+  ],
+  "projectName" : "TensorFlowLite",
+  "optionSet" : {
+    "LaunchActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildStartupOptionsRelease" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildOptionsRelease" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildOptionsDebug" : {
+      "p" : "$(inherited)"
+    },
+    "EnvironmentVariables" : {
+      "p" : "$(inherited)"
+    },
+    "BuildActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "CommandlineArguments" : {
+      "p" : "$(inherited)"
+    },
+    "TestActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildStartupOptionsDebug" : {
+      "p" : "$(inherited)"
+    },
+    "BuildActionPostActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "TestActionPostActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "LaunchActionPostActionScript" : {
+      "p" : "$(inherited)"
+    }
+  },
+  "additionalFilePaths" : [
+    "tensorflow/lite/experimental/swift/BUILD"
+  ]
+}
diff --git a/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/project.tulsiconf b/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/project.tulsiconf
new file mode 100644
index 0000000000000000000000000000000000000000..82ac8aa38126021c176773e4093352bcbecd8603
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/project.tulsiconf
@@ -0,0 +1,14 @@
+{
+  "configDefaults" : {
+    "optionSet" : {
+      "ProjectPrioritizesSwift" : {
+        "p" : "YES"
+      }
+    }
+  },
+  "projectName" : "TensorFlowLite",
+  "packages" : [
+    "tensorflow/lite/experimental/swift"
+  ],
+  "workspaceRoot" : "../../../../.."
+}
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp.xcodeproj/project.pbxproj b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp.xcodeproj/project.pbxproj
new file mode 100644
index 0000000000000000000000000000000000000000..fbbf9a1de2c8e82ab486b99b9e9b8c6dfe80868e
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp.xcodeproj/project.pbxproj
@@ -0,0 +1,345 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 50;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		4A7304B421500B8400C90B21 /* Data+TensorFlowLite.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4A7304B321500B8300C90B21 /* Data+TensorFlowLite.swift */; };
+		4AA72B732146ED64006C3AEF /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA72B722146ED64006C3AEF /* AppDelegate.swift */; };
+		4AA72B752146ED64006C3AEF /* ViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA72B742146ED64006C3AEF /* ViewController.swift */; };
+		4AA72B782146ED64006C3AEF /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 4AA72B762146ED64006C3AEF /* Main.storyboard */; };
+		4AA72B7A2146ED66006C3AEF /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 4AA72B792146ED66006C3AEF /* Assets.xcassets */; };
+		4AA72B7D2146ED66006C3AEF /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 4AA72B7B2146ED66006C3AEF /* LaunchScreen.storyboard */; };
+		4ADDE0CE2176600E00FF07A2 /* Array+TensorFlowLite.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4ADDE0CD2176600900FF07A2 /* Array+TensorFlowLite.swift */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		4A7304B321500B8300C90B21 /* Data+TensorFlowLite.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = "Data+TensorFlowLite.swift"; sourceTree = "<group>"; };
+		4AA72B6F2146ED64006C3AEF /* TensorFlowLiteApp.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = TensorFlowLiteApp.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		4AA72B722146ED64006C3AEF /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = "<group>"; };
+		4AA72B742146ED64006C3AEF /* ViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ViewController.swift; sourceTree = "<group>"; };
+		4AA72B772146ED64006C3AEF /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = "<group>"; };
+		4AA72B792146ED66006C3AEF /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		4AA72B7C2146ED66006C3AEF /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; };
+		4AA72B7E2146ED66006C3AEF /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+		4ADDE0CD2176600900FF07A2 /* Array+TensorFlowLite.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = "Array+TensorFlowLite.swift"; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		4AA72B6C2146ED64006C3AEF /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		4AA72B662146ED64006C3AEF = {
+			isa = PBXGroup;
+			children = (
+				4AA72B712146ED64006C3AEF /* TensorFlowLiteApp */,
+				4AA72B702146ED64006C3AEF /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		4AA72B702146ED64006C3AEF /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				4AA72B6F2146ED64006C3AEF /* TensorFlowLiteApp.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		4AA72B712146ED64006C3AEF /* TensorFlowLiteApp */ = {
+			isa = PBXGroup;
+			children = (
+				4AA72B722146ED64006C3AEF /* AppDelegate.swift */,
+				4ADDE0CD2176600900FF07A2 /* Array+TensorFlowLite.swift */,
+				4A7304B321500B8300C90B21 /* Data+TensorFlowLite.swift */,
+				4AA72B742146ED64006C3AEF /* ViewController.swift */,
+				4AA72B762146ED64006C3AEF /* Main.storyboard */,
+				4AA72B792146ED66006C3AEF /* Assets.xcassets */,
+				4AA72B7B2146ED66006C3AEF /* LaunchScreen.storyboard */,
+				4AA72B7E2146ED66006C3AEF /* Info.plist */,
+			);
+			path = TensorFlowLiteApp;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		4AA72B6E2146ED64006C3AEF /* TensorFlowLiteApp */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 4AA72B812146ED66006C3AEF /* Build configuration list for PBXNativeTarget "TensorFlowLiteApp" */;
+			buildPhases = (
+				4AA72B6B2146ED64006C3AEF /* Sources */,
+				4AA72B6C2146ED64006C3AEF /* Frameworks */,
+				4AA72B6D2146ED64006C3AEF /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = TensorFlowLiteApp;
+			productName = TensorFlowLiteApp;
+			productReference = 4AA72B6F2146ED64006C3AEF /* TensorFlowLiteApp.app */;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		4AA72B672146ED64006C3AEF /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				LastSwiftUpdateCheck = 0940;
+				LastUpgradeCheck = 0940;
+				ORGANIZATIONNAME = Google;
+				TargetAttributes = {
+					4AA72B6E2146ED64006C3AEF = {
+						CreatedOnToolsVersion = 9.4.1;
+					};
+				};
+			};
+			buildConfigurationList = 4AA72B6A2146ED64006C3AEF /* Build configuration list for PBXProject "TensorFlowLiteApp" */;
+			compatibilityVersion = "Xcode 9.3";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = 4AA72B662146ED64006C3AEF;
+			productRefGroup = 4AA72B702146ED64006C3AEF /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				4AA72B6E2146ED64006C3AEF /* TensorFlowLiteApp */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXResourcesBuildPhase section */
+		4AA72B6D2146ED64006C3AEF /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				4AA72B7D2146ED66006C3AEF /* LaunchScreen.storyboard in Resources */,
+				4AA72B7A2146ED66006C3AEF /* Assets.xcassets in Resources */,
+				4AA72B782146ED64006C3AEF /* Main.storyboard in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		4AA72B6B2146ED64006C3AEF /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				4AA72B732146ED64006C3AEF /* AppDelegate.swift in Sources */,
+				4ADDE0CE2176600E00FF07A2 /* Array+TensorFlowLite.swift in Sources */,
+				4A7304B421500B8400C90B21 /* Data+TensorFlowLite.swift in Sources */,
+				4AA72B752146ED64006C3AEF /* ViewController.swift in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin PBXVariantGroup section */
+		4AA72B762146ED64006C3AEF /* Main.storyboard */ = {
+			isa = PBXVariantGroup;
+			children = (
+				4AA72B772146ED64006C3AEF /* Base */,
+			);
+			name = Main.storyboard;
+			sourceTree = "<group>";
+		};
+		4AA72B7B2146ED66006C3AEF /* LaunchScreen.storyboard */ = {
+			isa = PBXVariantGroup;
+			children = (
+				4AA72B7C2146ED66006C3AEF /* Base */,
+			);
+			name = LaunchScreen.storyboard;
+			sourceTree = "<group>";
+		};
+/* End PBXVariantGroup section */
+
+/* Begin XCBuildConfiguration section */
+		4AA72B7F2146ED66006C3AEF /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CODE_SIGN_IDENTITY = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = dwarf;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_TESTABILITY = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu11;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 11.4;
+				MTL_ENABLE_DEBUG_INFO = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				SDKROOT = iphoneos;
+				SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG;
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+			};
+			name = Debug;
+		};
+		4AA72B802146ED66006C3AEF /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CODE_SIGN_IDENTITY = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu11;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 11.4;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				SDKROOT = iphoneos;
+				SWIFT_COMPILATION_MODE = wholemodule;
+				SWIFT_OPTIMIZATION_LEVEL = "-O";
+				VALIDATE_PRODUCT = YES;
+			};
+			name = Release;
+		};
+		4AA72B822146ED66006C3AEF /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CODE_SIGN_STYLE = Automatic;
+				INFOPLIST_FILE = TensorFlowLiteApp/Info.plist;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				PRODUCT_BUNDLE_IDENTIFIER = com.tensorflow.lite.swift.TensorFlowLite;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_VERSION = 4.0;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Debug;
+		};
+		4AA72B832146ED66006C3AEF /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CODE_SIGN_STYLE = Automatic;
+				INFOPLIST_FILE = TensorFlowLiteApp/Info.plist;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				PRODUCT_BUNDLE_IDENTIFIER = com.tensorflow.lite.swift.TensorFlowLite;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_VERSION = 4.0;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		4AA72B6A2146ED64006C3AEF /* Build configuration list for PBXProject "TensorFlowLiteApp" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				4AA72B7F2146ED66006C3AEF /* Debug */,
+				4AA72B802146ED66006C3AEF /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		4AA72B812146ED66006C3AEF /* Build configuration list for PBXNativeTarget "TensorFlowLiteApp" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				4AA72B822146ED66006C3AEF /* Debug */,
+				4AA72B832146ED66006C3AEF /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = 4AA72B672146ED64006C3AEF /* Project object */;
+}
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/AppDelegate.swift b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/AppDelegate.swift
new file mode 100644
index 0000000000000000000000000000000000000000..ffa90a06adb0b9f93575c8390cd30bd589e43ac7
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/AppDelegate.swift
@@ -0,0 +1,24 @@
+import UIKit
+
+@UIApplicationMain
+
+final class AppDelegate: UIResponder, UIApplicationDelegate {
+
+  /// The main window of the app.
+  var window: UIWindow?
+
+  func application(
+    _ application: UIApplication,
+    didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]? = nil
+  ) -> Bool {
+    return true
+  }
+}
+
+// MARK: - Extensions
+
+#if !swift(>=4.2)
+extension UIApplication {
+  typealias LaunchOptionsKey = UIApplicationLaunchOptionsKey
+}
+#endif  // !swift(>=4.2)
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Array+TensorFlowLite.swift b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Array+TensorFlowLite.swift
new file mode 100644
index 0000000000000000000000000000000000000000..56df1ce6597aacf307f7a89a084527ea93c303c2
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Array+TensorFlowLite.swift
@@ -0,0 +1,22 @@
+import Foundation
+
+extension Array {
+  /// Creates a new array from the bytes of the given unsafe data.
+  ///
+  /// - Warning: The array's `Element` type must be trivial in that it can be copied bit for bit
+  ///     with no indirection or reference-counting operations; otherwise, copying the raw bytes in
+  ///     the `unsafeData`'s buffer to a new array returns an unsafe copy.
+  /// - Note: Returns `nil` if `unsafeData.count` is not a multiple of
+  ///     `MemoryLayout<Element>.stride`.
+  /// - Parameter unsafeData: The data containing the bytes to turn into an array.
+  init?(unsafeData: Data) {
+    guard unsafeData.count % MemoryLayout<Element>.stride == 0 else { return nil }
+    let elements = unsafeData.withUnsafeBytes {
+      UnsafeBufferPointer<Element>(
+        start: $0,
+        count: unsafeData.count / MemoryLayout<Element>.stride
+      )
+    }
+    self.init(elements)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/AppIcon.appiconset/Contents.json b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/AppIcon.appiconset/Contents.json
new file mode 100644
index 0000000000000000000000000000000000000000..d8db8d65fd79fd541b2b7eba75c7378af3448f9c
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/AppIcon.appiconset/Contents.json
@@ -0,0 +1,98 @@
+{
+  "images" : [
+    {
+      "idiom" : "iphone",
+      "size" : "20x20",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "20x20",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "29x29",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "29x29",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "40x40",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "40x40",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "60x60",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "60x60",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "20x20",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "20x20",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "29x29",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "29x29",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "40x40",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "40x40",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "76x76",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "76x76",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "83.5x83.5",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ios-marketing",
+      "size" : "1024x1024",
+      "scale" : "1x"
+    }
+  ],
+  "info" : {
+    "version" : 1,
+    "author" : "xcode"
+  }
+}
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/Contents.json b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/Contents.json
new file mode 100644
index 0000000000000000000000000000000000000000..da4a164c918651cdd1e11dca5cc62c333f097601
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/Contents.json
@@ -0,0 +1,6 @@
+{
+  "info" : {
+    "version" : 1,
+    "author" : "xcode"
+  }
+}
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/LaunchScreen.storyboard b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/LaunchScreen.storyboard
new file mode 100644
index 0000000000000000000000000000000000000000..a07a1321be2e65323fadeca51487671c88f462c8
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/LaunchScreen.storyboard
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14109" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" launchScreen="YES" useTraitCollections="YES" colorMatched="YES" initialViewController="01J-lp-oVM">
+    <device id="retina4_7" orientation="portrait">
+        <adaptation id="fullscreen"/>
+    </device>
+    <dependencies>
+        <deployment identifier="iOS"/>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="14088"/>
+        <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
+    </dependencies>
+    <scenes>
+        <!--View Controller-->
+        <scene sceneID="EHf-IW-A2E">
+            <objects>
+                <viewController id="01J-lp-oVM" sceneMemberID="viewController">
+                    <layoutGuides>
+                        <viewControllerLayoutGuide type="top" id="Llm-lL-Icb"/>
+                        <viewControllerLayoutGuide type="bottom" id="xb3-aO-Qok"/>
+                    </layoutGuides>
+                    <view key="view" contentMode="scaleToFill" id="Ze5-6b-2t3">
+                        <rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
+                        <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
+                        <subviews>
+                            <label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" text="TensorFlowLite" textAlignment="center" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="3Gq-PV-hia">
+                                <rect key="frame" x="16" y="315" width="343" height="38.5"/>
+                                <fontDescription key="fontDescription" type="boldSystem" pointSize="32"/>
+                                <nil key="textColor"/>
+                                <nil key="highlightedColor"/>
+                            </label>
+                        </subviews>
+                        <color key="backgroundColor" red="1" green="1" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
+                        <constraints>
+                            <constraint firstItem="3Gq-PV-hia" firstAttribute="leading" secondItem="Ze5-6b-2t3" secondAttribute="leading" constant="16" id="aXL-9T-5Pf"/>
+                            <constraint firstItem="3Gq-PV-hia" firstAttribute="centerY" secondItem="Ze5-6b-2t3" secondAttribute="centerY" id="cDf-Go-1FR"/>
+                            <constraint firstAttribute="trailing" secondItem="3Gq-PV-hia" secondAttribute="trailing" constant="16" id="fB9-BX-A3B"/>
+                        </constraints>
+                    </view>
+                </viewController>
+                <placeholder placeholderIdentifier="IBFirstResponder" id="iYj-Kq-Ea1" userLabel="First Responder" sceneMemberID="firstResponder"/>
+            </objects>
+            <point key="canvasLocation" x="52" y="374.66266866566718"/>
+        </scene>
+    </scenes>
+</document>
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/Main.storyboard b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/Main.storyboard
new file mode 100644
index 0000000000000000000000000000000000000000..10cae6e855311791044d6a25a16d855873223c43
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/Main.storyboard
@@ -0,0 +1,95 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14460.31" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
+    <device id="retina4_7" orientation="portrait">
+        <adaptation id="fullscreen"/>
+    </device>
+    <dependencies>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="14460.20"/>
+        <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
+    </dependencies>
+    <scenes>
+        <!--View Controller-->
+        <scene sceneID="tne-QT-ifu">
+            <objects>
+                <viewController storyboardIdentifier="viewController" useStoryboardIdentifierAsRestorationIdentifier="YES" id="BYZ-38-t0r" customClass="ViewController" customModule="TensorFlowLiteAppLib" sceneMemberID="viewController">
+                    <layoutGuides>
+                        <viewControllerLayoutGuide type="top" id="y3c-jy-aDJ"/>
+                        <viewControllerLayoutGuide type="bottom" id="wfy-db-euE"/>
+                    </layoutGuides>
+                    <view key="view" contentMode="scaleToFill" id="8bC-Xf-vdC">
+                        <rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
+                        <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
+                        <subviews>
+                            <textView clipsSubviews="YES" multipleTouchEnabled="YES" contentMode="scaleToFill" editable="NO" selectable="NO" translatesAutoresizingMaskIntoConstraints="NO" id="7Mj-sL-hrd">
+                                <rect key="frame" x="0.0" y="367" width="375" height="300"/>
+                                <color key="backgroundColor" red="0.0" green="0.47843137250000001" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
+                                <constraints>
+                                    <constraint firstAttribute="height" constant="300" id="YUb-MC-D5w"/>
+                                </constraints>
+                                <color key="textColor" cocoaTouchSystemColor="tableCellGroupedBackgroundColor"/>
+                                <fontDescription key="fontDescription" type="system" pointSize="14"/>
+                                <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
+                            </textView>
+                            <toolbar opaque="NO" clearsContextBeforeDrawing="NO" contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="Qwg-EP-bd6" userLabel="Bottom Toolbar">
+                                <rect key="frame" x="0.0" y="323" width="375" height="44"/>
+                                <constraints>
+                                    <constraint firstAttribute="height" constant="44" id="jhT-Q0-E9N"/>
+                                </constraints>
+                                <items>
+                                    <barButtonItem style="plain" systemItem="flexibleSpace" id="P3q-uA-YUa"/>
+                                    <barButtonItem title="Invoke Interpreter" id="A4J-Mg-nmd" userLabel="Invoke Button">
+                                        <connections>
+                                            <action selector="invokeInterpreter:" destination="BYZ-38-t0r" id="lZU-x7-PsJ"/>
+                                        </connections>
+                                    </barButtonItem>
+                                    <barButtonItem style="plain" systemItem="flexibleSpace" id="Qad-Pa-ySg"/>
+                                </items>
+                            </toolbar>
+                            <toolbar opaque="NO" clearsContextBeforeDrawing="NO" contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="Gkb-TR-PCB" userLabel="Top Toolbar">
+                                <rect key="frame" x="0.0" y="28" width="375" height="44"/>
+                                <constraints>
+                                    <constraint firstAttribute="height" constant="44" id="hSD-2q-fUE"/>
+                                </constraints>
+                                <items>
+                                    <barButtonItem style="plain" id="LKw-TX-bbH">
+                                        <segmentedControl key="customView" opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="left" contentVerticalAlignment="top" segmentControlStyle="bar" selectedSegmentIndex="0" id="rhA-nW-xzT">
+                                            <rect key="frame" x="16" y="7" width="343" height="30"/>
+                                            <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
+                                            <segments>
+                                                <segment title="Add"/>
+                                                <segment title="AddQuantized"/>
+                                                <segment title="MultiAdd"/>
+                                            </segments>
+                                            <connections>
+                                                <action selector="modelChanged:" destination="BYZ-38-t0r" eventType="valueChanged" id="YnG-Ov-B5D"/>
+                                            </connections>
+                                        </segmentedControl>
+                                    </barButtonItem>
+                                </items>
+                            </toolbar>
+                        </subviews>
+                        <color key="backgroundColor" red="1" green="1" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
+                        <constraints>
+                            <constraint firstAttribute="trailing" secondItem="Gkb-TR-PCB" secondAttribute="trailing" id="4Cr-Sf-I7n"/>
+                            <constraint firstItem="7Mj-sL-hrd" firstAttribute="bottom" secondItem="wfy-db-euE" secondAttribute="top" id="6ot-zD-sze"/>
+                            <constraint firstItem="7Mj-sL-hrd" firstAttribute="top" secondItem="Qwg-EP-bd6" secondAttribute="bottom" id="ELA-C6-NiG"/>
+                            <constraint firstAttribute="trailing" secondItem="7Mj-sL-hrd" secondAttribute="trailing" id="HDO-xr-mBl"/>
+                            <constraint firstItem="Gkb-TR-PCB" firstAttribute="leading" secondItem="8bC-Xf-vdC" secondAttribute="leading" id="Kmo-6K-gS4"/>
+                            <constraint firstItem="Qwg-EP-bd6" firstAttribute="leading" secondItem="8bC-Xf-vdC" secondAttribute="leading" id="hGu-lm-fMG"/>
+                            <constraint firstAttribute="trailing" secondItem="Qwg-EP-bd6" secondAttribute="trailing" id="iXR-LK-nTO"/>
+                            <constraint firstItem="7Mj-sL-hrd" firstAttribute="leading" secondItem="8bC-Xf-vdC" secondAttribute="leading" id="nr7-jW-ZYf"/>
+                            <constraint firstItem="Gkb-TR-PCB" firstAttribute="top" secondItem="y3c-jy-aDJ" secondAttribute="bottom" constant="8" id="uCF-VW-rR0"/>
+                        </constraints>
+                    </view>
+                    <connections>
+                        <outlet property="invokeButton" destination="A4J-Mg-nmd" id="UxZ-Ft-E45"/>
+                        <outlet property="modelControl" destination="rhA-nW-xzT" id="KKf-TT-BQ2"/>
+                        <outlet property="resultsTextView" destination="7Mj-sL-hrd" id="T4I-z4-tYA"/>
+                    </connections>
+                </viewController>
+                <placeholder placeholderIdentifier="IBFirstResponder" id="dkx-z0-nzr" sceneMemberID="firstResponder"/>
+            </objects>
+            <point key="canvasLocation" x="125.59999999999999" y="133.5832083958021"/>
+        </scene>
+    </scenes>
+</document>
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Data+TensorFlowLite.swift b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Data+TensorFlowLite.swift
new file mode 100644
index 0000000000000000000000000000000000000000..bc8a70c848390ad7ba584629563d7d75a9e32341
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Data+TensorFlowLite.swift
@@ -0,0 +1,13 @@
+import Foundation
+
+extension Data {
+  /// Creates a new buffer by copying the buffer pointer of the given array.
+  ///
+  /// - Warning: The given array's element type `T` must be trivial in that it can be copied bit
+  ///     for bit with no indirection or reference-counting operations; otherwise, reinterpreting
+  ///     data from the resulting buffer has undefined behavior.
+  /// - Parameter array: An array with elements of type `T`.
+  init<T>(copyingBufferOf array: [T]) {
+    self = array.withUnsafeBufferPointer(Data.init)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Info.plist b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Info.plist
new file mode 100644
index 0000000000000000000000000000000000000000..3ca3875f04e5789da9cfb34a44151cd06226a8f3
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Info.plist
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>en</string>
+	<key>CFBundleExecutable</key>
+	<string>$(EXECUTABLE_NAME)</string>
+	<key>CFBundleIdentifier</key>
+	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>$(PRODUCT_NAME)</string>
+	<key>CFBundlePackageType</key>
+	<string>APPL</string>
+	<key>CFBundleShortVersionString</key>
+	<string>1.0</string>
+	<key>CFBundleVersion</key>
+	<string>0.0.1</string>
+	<key>LSRequiresIPhoneOS</key>
+	<true/>
+	<key>NSCameraUsageDescription</key>
+	<string>NSCameraUsageDescription</string>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>Select a photo to detect objects in.</string>
+	<key>UILaunchStoryboardName</key>
+	<string>LaunchScreen</string>
+	<key>UIMainStoryboardFile</key>
+	<string>Main</string>
+	<key>UIRequiredDeviceCapabilities</key>
+	<array>
+		<string>armv7</string>
+	</array>
+	<key>UISupportedInterfaceOrientations</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+	</array>
+	<key>UISupportedInterfaceOrientations~ipad</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+	</array>
+</dict>
+</plist>
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/ViewController.swift b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/ViewController.swift
new file mode 100644
index 0000000000000000000000000000000000000000..73c74fd19c996653d988977d551fcef683f18697
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/ViewController.swift
@@ -0,0 +1,299 @@
+import TensorFlowLite
+import UIKit
+
+class ViewController: UIViewController {
+
+  // MARK: - Properties
+
+  /// TensorFlowLite interpreter object for performing inference from a given model.
+  private var interpreter: Interpreter?
+
+  /// Serial dispatch queue for managing `Interpreter` calls.
+  private let interpreterQueue = DispatchQueue(
+    label: Constant.dispatchQueueLabel,
+    qos: .userInitiated
+  )
+
+  /// The currently selected model.
+  private var currentModel: Model {
+    guard let currentModel = Model(rawValue: modelControl.selectedSegmentIndex) else {
+      preconditionFailure("Invalid model for selected segment index.")
+    }
+    return currentModel
+  }
+
+  /// A description of the current model.
+  private var modelDescription: String {
+    guard let interpreter = interpreter else { return "" }
+    let inputCount = interpreter.inputTensorCount
+    let outputCount = interpreter.outputTensorCount
+    let inputTensors = (0..<inputCount).map { index in
+      var tensorInfo = "  Input \(index + 1): "
+      do {
+        let tensor = try interpreter.input(at: index)
+        tensorInfo += "\(tensor)"
+      } catch let error {
+        tensorInfo += "\(error.localizedDescription)"
+      }
+      return tensorInfo
+    }.joined(separator: "\n")
+    let outputTensors = (0..<outputCount).map { index in
+      var tensorInfo = "  Output \(index + 1): "
+      do {
+        let tensor = try interpreter.output(at: index)
+        tensorInfo += "\(tensor)"
+      } catch let error {
+        tensorInfo += "\(error.localizedDescription)"
+      }
+      return tensorInfo
+    }.joined(separator: "\n")
+    return "Model Description:\n" +
+             "  Input Tensor Count = \(inputCount)\n\(inputTensors)\n\n" +
+             "  Output Tensor Count = \(outputCount)\n\(outputTensors)"
+  }
+
+  // MARK: - IBOutlets
+
+  /// A segmented control for changing models. See the `Model` enum for available models.
+  @IBOutlet private var modelControl: UISegmentedControl!
+
+  @IBOutlet private var resultsTextView: UITextView!
+  @IBOutlet private var invokeButton: UIBarButtonItem!
+
+  // MARK: - UIViewController
+
+  override func viewDidLoad() {
+    super.viewDidLoad()
+
+    invokeButton.isEnabled = false
+    loadModel()
+  }
+
+  // MARK: - IBActions
+
+  @IBAction func modelChanged(_ sender: Any) {
+    invokeButton.isEnabled = false
+    updateResultsText("Switched to the \(currentModel.description) model.")
+    loadModel()
+  }
+
+  @IBAction func invokeInterpreter(_ sender: Any) {
+    switch currentModel {
+    case .add:
+      invokeAdd()
+    case .addQuantized:
+      invokeAddQuantized()
+    case .multiAdd:
+      invokeMultiAdd()
+    }
+  }
+
+  // MARK: - Private
+
+  private func loadModel() {
+    let fileInfo = currentModel.fileInfo
+    guard let modelPath = Bundle.main.path(forResource: fileInfo.name, ofType: fileInfo.extension)
+    else {
+      updateResultsText("Failed to load the \(currentModel.description) model.")
+      return
+    }
+    setUpInterpreter(withModelPath: modelPath)
+  }
+
+  private func setUpInterpreter(withModelPath modelPath: String) {
+    interpreterQueue.async {
+      do {
+        var options = InterpreterOptions()
+        options.isErrorLoggingEnabled = true
+        self.interpreter = try Interpreter(modelPath: modelPath, options: options)
+      } catch let error {
+        self.updateResultsText(
+          "Failed to create the interpreter with error: \(error.localizedDescription)"
+        )
+        return
+      }
+      safeDispatchOnMain { self.invokeButton.isEnabled = true }
+    }
+  }
+
+  private func invokeAdd() {
+    interpreterQueue.async {
+      guard let interpreter = self.interpreter else {
+        self.updateResultsText(Constant.nilInterpreterErrorMessage)
+        return
+      }
+      do {
+        try interpreter.resizeInput(at: 0, to: [2])
+        try interpreter.allocateTensors()
+        let input: [Float32] = [1, 3]
+        let resultsText = self.modelDescription + "\n\n" +
+          "Performing 2 add operations on input \(input.description) equals: "
+        self.updateResultsText(resultsText)
+        let data = Data(copyingBufferOf: input)
+        try interpreter.copy(data, toInputAt: 0)
+        try interpreter.invoke()
+        let outputTensor = try interpreter.output(at: 0)
+        let results: () -> String = {
+          guard let results = [Float32](unsafeData: outputTensor.data) else { return "No results." }
+          return resultsText + results.description
+        }
+        self.updateResultsText(results())
+      } catch let error {
+        self.updateResultsText(
+          "Failed to invoke the interpreter with error: \(error.localizedDescription)"
+        )
+        return
+      }
+    }
+  }
+
+  private func invokeAddQuantized() {
+    interpreterQueue.async {
+      guard let interpreter = self.interpreter else {
+        self.updateResultsText(Constant.nilInterpreterErrorMessage)
+        return
+      }
+      do {
+        try interpreter.resizeInput(at: 0, to: [2])
+        try interpreter.allocateTensors()
+        let input: [UInt8] = [1, 3]
+        let resultsText = self.modelDescription + "\n\n" +
+          "Performing 2 add operations on quantized input \(input.description) equals: "
+        self.updateResultsText(resultsText)
+        let data = Data(input)
+        try interpreter.copy(data, toInputAt: 0)
+        try interpreter.invoke()
+        let outputTensor = try interpreter.output(at: 0)
+        let results: () -> String = {
+          guard let quantizationParameters = outputTensor.quantizationParameters else {
+            return "No results."
+          }
+          let quantizedResults = [UInt8](outputTensor.data)
+          let dequantizedResults = quantizedResults.map {
+            quantizationParameters.scale * Float(Int($0) - quantizationParameters.zeroPoint)
+          }
+          return resultsText + quantizedResults.description +
+                   ", dequantized results: " + dequantizedResults.description
+        }
+        self.updateResultsText(results())
+      } catch let error {
+        self.updateResultsText(
+          "Failed to invoke the interpreter with error: \(error.localizedDescription)"
+        )
+        return
+      }
+    }
+  }
+
+  private func invokeMultiAdd() {
+    interpreterQueue.async {
+      guard let interpreter = self.interpreter else {
+        self.updateResultsText(Constant.nilInterpreterErrorMessage)
+        return
+      }
+      do {
+        let shape = TensorShape(2)
+        try (0..<interpreter.inputTensorCount).forEach { index in
+          try interpreter.resizeInput(at: index, to: shape)
+        }
+        try interpreter.allocateTensors()
+        let inputs = try (0..<interpreter.inputTensorCount).map { index -> [Float32] in
+          let input = [Float32(index + 1), Float32(index + 2)]
+          let data = Data(copyingBufferOf: input)
+          try interpreter.copy(data, toInputAt: index)
+          return input
+        }
+        let resultsText = self.modelDescription + "\n\n" +
+          "Performing 3 add operations on inputs \(inputs.description) equals: "
+        self.updateResultsText(resultsText)
+        try interpreter.invoke()
+        let results = try (0..<interpreter.outputTensorCount).map { index -> [Float32] in
+          let tensor = try interpreter.output(at: index)
+          return [Float32](unsafeData: tensor.data) ?? []
+        }
+        self.updateResultsText(resultsText + results.description)
+      } catch let error {
+        self.updateResultsText(
+          "Failed to invoke the interpreter with error: \(error.localizedDescription)"
+        )
+        return
+      }
+    }
+  }
+
+  private func updateResultsText(_ text: String? = nil) {
+    safeDispatchOnMain { self.resultsTextView.text = text }
+  }
+}
+
+// MARK: - Constants
+
+private enum Constant {
+  static let dispatchQueueLabel = "TensorFlowLiteInterpreterQueue"
+  static let nilInterpreterErrorMessage =
+    "Failed to invoke the interpreter because the interpreter was nil."
+}
+
+/// Models that can be loaded by the TensorFlow Lite `Interpreter`.
+private enum Model: Int, CustomStringConvertible {
+  /// A float model that performs two add operations on one input tensor and returns the result in
+  /// one output tensor.
+  case add = 0
+  /// A quantized model that performs two add operations on one input tensor and returns the result
+  /// in one output tensor.
+  case addQuantized = 1
+  /// A float model that performs three add operations on four input tensors and returns the results
+  /// in 2 output tensors.
+  case multiAdd = 2
+
+  var fileInfo: (name: String, extension: String) {
+    switch self {
+    case .add:
+      return Add.fileInfo
+    case .addQuantized:
+      return AddQuantized.fileInfo
+    case .multiAdd:
+      return MultiAdd.fileInfo
+    }
+  }
+
+  // MARK: - CustomStringConvertible
+
+  var description: String {
+    switch self {
+    case .add:
+      return Add.name
+    case .addQuantized:
+      return AddQuantized.name
+    case .multiAdd:
+      return MultiAdd.name
+    }
+  }
+}
+
+/// Values for the `Add` model.
+private enum Add {
+  static let name = "Add"
+  static let fileInfo = (name: "add", extension: "bin")
+}
+
+/// Values for the `AddQuantized` model.
+private enum AddQuantized {
+  static let name = "AddQuantized"
+  static let fileInfo = (name: "add_quantized", extension: "bin")
+}
+
+/// Values for the `MultiAdd` model.
+private enum MultiAdd {
+  static let name = "MultiAdd"
+  static let fileInfo = (name: "multi_add", extension: "bin")
+}
+
+// MARK: - Fileprivate
+
+/// Safely dispatches the given block on the main queue. If the current thread is `main`, the block
+/// is executed synchronously; otherwise, the block is executed asynchronously on the main thread.
+fileprivate func safeDispatchOnMain(_ block: @escaping () -> Void) {
+  if Thread.isMainThread { block(); return }
+  DispatchQueue.main.async { block() }
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/InterpreterOptionsTests.swift b/tensorflow/lite/experimental/swift/Tests/InterpreterOptionsTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..54b4f59b28942fe2398aba1a19443857e9617458
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/InterpreterOptionsTests.swift
@@ -0,0 +1,54 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class InterpreterOptionsTests: XCTestCase {
+
+  func testInterpreterOptions_InitWithDefaultValues() {
+    let options = InterpreterOptions()
+    XCTAssertNil(options.threadCount)
+    XCTAssertFalse(options.isErrorLoggingEnabled)
+  }
+
+  func testInterpreterOptions_InitWithCustomValues() {
+    var options = InterpreterOptions()
+    options.threadCount = 2
+    XCTAssertEqual(options.threadCount, 2)
+    options.isErrorLoggingEnabled = true
+    XCTAssertTrue(options.isErrorLoggingEnabled)
+  }
+
+  func testInterpreterOptions_Equatable() {
+    var options1 = InterpreterOptions()
+    var options2 = InterpreterOptions()
+    XCTAssertEqual(options1, options2)
+
+    options1.threadCount = 2
+    options2.threadCount = 2
+    XCTAssertEqual(options1, options2)
+
+    options2.threadCount = 3
+    XCTAssertNotEqual(options1, options2)
+    options2.threadCount = 2
+
+    options1.isErrorLoggingEnabled = true
+    options2.isErrorLoggingEnabled = true
+    XCTAssertEqual(options1, options2)
+
+    options2.isErrorLoggingEnabled = false
+    XCTAssertNotEqual(options1, options2)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..e98da5f951e9bc6bfebaf6a1bd76b3c8c8bb9e83
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
@@ -0,0 +1,315 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class InterpreterTests: XCTestCase {
+
+  var interpreter: Interpreter!
+
+  override func setUp() {
+    super.setUp()
+
+    interpreter = try! Interpreter(modelPath: AddModel.path)
+  }
+
+  override func tearDown() {
+    interpreter = nil
+
+    super.tearDown()
+  }
+
+  func testInterpreter_InitWithModelPath() {
+    XCTAssertNoThrow(try Interpreter(modelPath: AddModel.path))
+  }
+
+  func testInterpreter_Init_ThrowsFailedToLoadModel() {
+    XCTAssertThrowsError(try Interpreter(modelPath: "/invalid/path")) { error in
+      self.assertEqualErrors(actual: error, expected: .failedToLoadModel)
+    }
+  }
+
+  func testInterpreter_InitWithModelPathAndOptions() {
+    var options = InterpreterOptions()
+    options.threadCount = 2
+    XCTAssertNoThrow(try Interpreter(modelPath: AddModel.path, options: options))
+  }
+
+  func testInterpreter_InputTensorCount() {
+    XCTAssertEqual(interpreter.inputTensorCount, AddModel.inputTensorCount)
+  }
+
+  func testInterpreter_OutputTensorCount() {
+    XCTAssertEqual(interpreter.outputTensorCount, AddModel.outputTensorCount)
+  }
+
+  func testInterpreter_Invoke() throws {
+    try interpreter.allocateTensors()
+    XCTAssertNoThrow(try interpreter.invoke())
+  }
+
+  func testInterpreter_Invoke_ThrowsAllocateTensorsRequired_ModelNotReady() {
+    XCTAssertThrowsError(try interpreter.invoke()) { error in
+      self.assertEqualErrors(actual: error, expected: .allocateTensorsRequired)
+    }
+  }
+
+  func testInterpreter_InputTensorAtIndex() throws {
+    try setUpAddModelInputTensor()
+    let inputTensor = try interpreter.input(at: AddModel.validIndex)
+    XCTAssertEqual(inputTensor, AddModel.inputTensor)
+  }
+
+  func testInterpreter_InputTensorAtIndex_QuantizedModel() throws {
+    interpreter = try Interpreter(modelPath: AddQuantizedModel.path)
+    try setUpAddQuantizedModelInputTensor()
+    let inputTensor = try interpreter.input(at: AddQuantizedModel.inputOutputIndex)
+    XCTAssertEqual(inputTensor, AddQuantizedModel.inputTensor)
+  }
+
+  func testInterpreter_InputTensorAtIndex_ThrowsInvalidIndex() throws {
+    try interpreter.allocateTensors()
+    XCTAssertThrowsError(try interpreter.input(at: AddModel.invalidIndex)) { error in
+      let maxIndex = AddModel.inputTensorCount - 1
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorIndex(index: AddModel.invalidIndex, maxIndex: maxIndex)
+      )
+    }
+  }
+
+  func testInterpreter_InputTensorAtIndex_ThrowsAllocateTensorsRequired() {
+    XCTAssertThrowsError(try interpreter.input(at: AddModel.validIndex)) { error in
+      self.assertEqualErrors(actual: error, expected: .allocateTensorsRequired)
+    }
+  }
+
+  func testInterpreter_OutputTensorAtIndex() throws {
+    try setUpAddModelInputTensor()
+    try interpreter.invoke()
+    let outputTensor = try interpreter.output(at: AddModel.validIndex)
+    XCTAssertEqual(outputTensor, AddModel.outputTensor)
+    let expectedResults = [Float32](unsafeData: outputTensor.data)
+    XCTAssertEqual(expectedResults, AddModel.results)
+  }
+
+  func testInterpreter_OutputTensorAtIndex_QuantizedModel() throws {
+    interpreter = try Interpreter(modelPath: AddQuantizedModel.path)
+    try setUpAddQuantizedModelInputTensor()
+    try interpreter.invoke()
+    let outputTensor = try interpreter.output(at: AddQuantizedModel.inputOutputIndex)
+    XCTAssertEqual(outputTensor, AddQuantizedModel.outputTensor)
+    let expectedResults = [UInt8](outputTensor.data)
+    XCTAssertEqual(expectedResults, AddQuantizedModel.results)
+  }
+
+  func testInterpreter_OutputTensorAtIndex_ThrowsInvalidIndex() throws {
+    try interpreter.allocateTensors()
+    try interpreter.invoke()
+    XCTAssertThrowsError(try interpreter.output(at: AddModel.invalidIndex)) { error in
+      let maxIndex = AddModel.outputTensorCount - 1
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorIndex(index: AddModel.invalidIndex, maxIndex: maxIndex)
+      )
+    }
+  }
+
+  func testInterpreter_OutputTensorAtIndex_ThrowsInvokeInterpreterRequired() {
+    XCTAssertThrowsError(try interpreter.output(at: AddModel.validIndex)) { error in
+      self.assertEqualErrors(actual: error, expected: .invokeInterpreterRequired)
+    }
+  }
+
+  func testInterpreter_ResizeInputTensorAtIndexToShape() {
+    XCTAssertNoThrow(try interpreter.resizeInput(at: AddModel.validIndex, to: [2, 2, 3]))
+    XCTAssertNoThrow(try interpreter.allocateTensors())
+  }
+
+  func testInterpreter_ResizeInputTensorAtIndexToShape_ThrowsInvalidIndex() {
+    XCTAssertThrowsError(try interpreter.resizeInput(
+      at: AddModel.invalidIndex,
+      to: [2, 2, 3]
+    )) { error in
+      let maxIndex = AddModel.inputTensorCount - 1
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorIndex(index: AddModel.invalidIndex, maxIndex: maxIndex)
+      )
+    }
+  }
+
+  func testInterpreter_CopyDataToInputTensorAtIndex() throws {
+    try interpreter.resizeInput(at: AddModel.validIndex, to: AddModel.shape)
+    try interpreter.allocateTensors()
+    let inputTensor = try interpreter.copy(AddModel.inputData, toInputAt: AddModel.validIndex)
+    XCTAssertEqual(inputTensor.data, AddModel.inputData)
+  }
+
+  func testInterpreter_CopyDataToInputTensorAtIndex_ThrowsInvalidIndex() {
+    XCTAssertThrowsError(try interpreter.copy(
+      AddModel.inputData,
+      toInputAt: AddModel.invalidIndex
+    )) { error in
+      let maxIndex = AddModel.inputTensorCount - 1
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorIndex(index: AddModel.invalidIndex, maxIndex: maxIndex)
+      )
+    }
+  }
+
+  func testInterpreter_CopyDataToInputTensorAtIndex_ThrowsInvalidDataCount() throws {
+    try interpreter.resizeInput(at: AddModel.validIndex, to: AddModel.shape)
+    try interpreter.allocateTensors()
+    let invalidData = Data(count: AddModel.dataCount - 1)
+    XCTAssertThrowsError(try interpreter.copy(
+      invalidData,
+      toInputAt: AddModel.validIndex
+    )) { error in
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorDataCount(provided: invalidData.count, required: AddModel.dataCount)
+      )
+    }
+  }
+
+  func testInterpreter_AllocateTensors() {
+    XCTAssertNoThrow(try interpreter.allocateTensors())
+  }
+
+  // MARK: - Private
+
+  private func setUpAddModelInputTensor() throws {
+    precondition(interpreter != nil)
+    try interpreter.resizeInput(at: AddModel.validIndex, to: AddModel.shape)
+    try interpreter.allocateTensors()
+    try interpreter.copy(AddModel.inputData, toInputAt: AddModel.validIndex)
+  }
+
+  private func setUpAddQuantizedModelInputTensor() throws {
+    precondition(interpreter != nil)
+    try interpreter.resizeInput(at: AddQuantizedModel.inputOutputIndex, to: AddQuantizedModel.shape)
+    try interpreter.allocateTensors()
+    try interpreter.copy(AddQuantizedModel.inputData, toInputAt: AddQuantizedModel.inputOutputIndex)
+  }
+
+  private func assertEqualErrors(actual: Error, expected: InterpreterError) {
+    guard let actual = actual as? InterpreterError else {
+      XCTFail("Actual error should be of type InterpreterError.")
+      return
+    }
+    XCTAssertEqual(actual, expected)
+  }
+}
+
+// MARK: - Constants
+
+/// Values for the `add.bin` model.
+private enum AddModel {
+  static let info = (name: "add", extension: "bin")
+  static let inputTensorCount = 1
+  static let outputTensorCount = 1
+  static let invalidIndex = 1
+  static let validIndex = 0
+  static let shape: TensorShape = [2]
+  static let dataCount = inputData.count
+  static let inputData = Data(copyingBufferOf: [Float32(1.0), Float32(3.0)])
+  static let outputData = Data(copyingBufferOf: [Float32(3.0), Float32(9.0)])
+  static let results = [Float32(3.0), Float32(9.0)]
+
+  static let inputTensor = Tensor(
+    name: "input",
+    dataType: .float32,
+    shape: shape,
+    data: inputData
+  )
+  static let outputTensor = Tensor(
+    name: "output",
+    dataType: .float32,
+    shape: shape,
+    data: outputData
+  )
+
+  static var path: String = {
+    let bundle = Bundle(for: InterpreterTests.self)
+    guard let path = bundle.path(forResource: info.name, ofType: info.extension) else { return "" }
+    return path
+  }()
+}
+
+/// Values for the `add_quantized.bin` model.
+private enum AddQuantizedModel {
+  static let info = (name: "add_quantized", extension: "bin")
+  static let inputOutputIndex = 0
+  static let shape: TensorShape = [2]
+  static let inputData = Data([1, 3])
+  static let outputData = Data([3, 9])
+  static let quantizationParameters = QuantizationParameters(scale: 0.003922, zeroPoint: 0)
+  static let results: [UInt8] = [3, 9]
+
+  static let inputTensor = Tensor(
+    name: "input",
+    dataType: .uInt8,
+    shape: shape,
+    data: inputData,
+    quantizationParameters: quantizationParameters
+  )
+  static let outputTensor = Tensor(
+    name: "output",
+    dataType: .uInt8,
+    shape: shape,
+    data: outputData,
+    quantizationParameters: quantizationParameters
+  )
+
+  static var path: String = {
+    let bundle = Bundle(for: InterpreterTests.self)
+    guard let path = bundle.path(forResource: info.name, ofType: info.extension) else { return "" }
+    return path
+  }()
+}
+
+// MARK: - Extensions
+
+extension Array {
+  /// Creates a new array from the bytes of the given unsafe data.
+  ///
+  /// - Note: Returns `nil` if `unsafeData.count` is not a multiple of
+  ///     `MemoryLayout<Element>.stride`.
+  /// - Parameter unsafeData: The data containing the bytes to turn into an array.
+  init?(unsafeData: Data) {
+    guard unsafeData.count % MemoryLayout<Element>.stride == 0 else { return nil }
+    let elements = unsafeData.withUnsafeBytes {
+      UnsafeBufferPointer<Element>(
+        start: $0,
+        count: unsafeData.count / MemoryLayout<Element>.stride
+      )
+    }
+    self.init(elements)
+  }
+}
+
+extension Data {
+  /// Creates a new buffer by copying the buffer pointer of the given array.
+  ///
+  /// - Warning: The given array's element type `T` must be trivial in that it can be copied bit
+  ///     for bit with no indirection or reference-counting operations; otherwise, reinterpreting
+  ///     data from the resulting buffer has undefined behavior.
+  /// - Parameter array: An array with elements of type `T`.
+  init<T>(copyingBufferOf array: [T]) {
+    self = array.withUnsafeBufferPointer(Data.init)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/ModelTests.swift b/tensorflow/lite/experimental/swift/Tests/ModelTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..025db1890607641d49304ae22da1fc33fed084ef
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/ModelTests.swift
@@ -0,0 +1,59 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class ModelTests: XCTestCase {
+
+  var modelPath: String!
+
+  override func setUp() {
+    super.setUp()
+
+    let bundle = Bundle(for: type(of: self))
+    guard let modelPath = bundle.path(
+            forResource: Constant.modelInfo.name,
+            ofType: Constant.modelInfo.extension)
+    else {
+      XCTFail("Failed to get the model file path.")
+      return
+    }
+    self.modelPath = modelPath
+  }
+
+  override func tearDown() {
+    modelPath = nil
+
+    super.tearDown()
+  }
+
+  func testModel_InitWithFilePath() {
+    XCTAssertNotNil(Model(filePath: modelPath))
+  }
+
+  func testModel_InitWithEmptyFilePath_FailsInitialization() {
+    XCTAssertNil(Model(filePath: ""))
+  }
+
+  func testModel_InitWithInvalidFilePath_FailsInitialization() {
+    XCTAssertNil(Model(filePath: "invalid/path"))
+  }
+}
+
+// MARK: - Constants
+
+private enum Constant {
+  static let modelInfo = (name: "add", extension: "bin")
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/QuantizationParametersTests.swift b/tensorflow/lite/experimental/swift/Tests/QuantizationParametersTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..65648c26982daa0cab2a40d111d72e10563373cf
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/QuantizationParametersTests.swift
@@ -0,0 +1,43 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class QuantizationParametersTests: XCTestCase {
+
+  func testQuantizationParameters_InitWithCustomValues() {
+    let parameters = QuantizationParameters(scale: 0.5, zeroPoint: 1)
+    XCTAssertEqual(parameters.scale, 0.5)
+    XCTAssertEqual(parameters.zeroPoint, 1)
+  }
+
+  func testQuantizationParameters_Equatable() {
+    let parameters1 = QuantizationParameters(scale: 0.5, zeroPoint: 1)
+    let parameters2 = QuantizationParameters(scale: 0.5, zeroPoint: 1)
+    XCTAssertEqual(parameters1, parameters2)
+
+    let parameters3 = QuantizationParameters(scale: 0.4, zeroPoint: 1)
+    XCTAssertNotEqual(parameters1, parameters3)
+    XCTAssertNotEqual(parameters2, parameters3)
+  }
+}
+
+// MARK: - Extensions
+
+extension QuantizationParameters: Equatable {
+  public static func == (lhs: QuantizationParameters, rhs: QuantizationParameters) -> Bool {
+    return lhs.scale == rhs.scale && lhs.zeroPoint == rhs.zeroPoint
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/TensorTests.swift b/tensorflow/lite/experimental/swift/Tests/TensorTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..4540043a1636f43834ec496ffef1e78444ba312b
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/TensorTests.swift
@@ -0,0 +1,83 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class TensorTests: XCTestCase {
+
+  // MARK: - Tensor
+
+  func testTensor_Init() {
+    let name = "InputTensor"
+    let dataType: TensorDataType = .uInt8
+    let shape = TensorShape(Constant.dimensions)
+    guard let data = name.data(using: .utf8) else { XCTFail("Data should not be nil."); return }
+    let quantizationParameters = QuantizationParameters(scale: 0.5, zeroPoint: 1)
+    let inputTensor = Tensor(
+      name: name,
+      dataType: dataType,
+      shape: shape,
+      data: data,
+      quantizationParameters: quantizationParameters
+    )
+    XCTAssertEqual(inputTensor.name, name)
+    XCTAssertEqual(inputTensor.dataType, dataType)
+    XCTAssertEqual(inputTensor.shape, shape)
+    XCTAssertEqual(inputTensor.data, data)
+    XCTAssertEqual(inputTensor.quantizationParameters, quantizationParameters)
+  }
+
+  // MARK: - TensorShape
+
+  func testTensorShape_InitWithArray() {
+    let shape = TensorShape(Constant.dimensions)
+    XCTAssertEqual(shape.rank, Constant.dimensions.count)
+    XCTAssertEqual(shape.dimensions, Constant.dimensions)
+  }
+
+  func testTensorShape_InitWithElements() {
+    let shape = TensorShape(2, 2, 3)
+    XCTAssertEqual(shape.rank, Constant.dimensions.count)
+    XCTAssertEqual(shape.dimensions, Constant.dimensions)
+  }
+
+  func testTensorShape_InitWithArrayLiteral() {
+    let shape: TensorShape = [2, 2, 3]
+    XCTAssertEqual(shape.rank, Constant.dimensions.count)
+    XCTAssertEqual(shape.dimensions, Constant.dimensions)
+  }
+}
+
+// MARK: - Constants
+
+private enum Constant {
+  /// Array of 2 arrays of 2 arrays of 3 numbers: [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]].
+  static let dimensions = [2, 2, 3]
+}
+
+// MARK: - Extensions
+
+extension TensorShape: Equatable {
+  public static func == (lhs: TensorShape, rhs: TensorShape) -> Bool {
+    return lhs.rank == rhs.rank && lhs.dimensions == rhs.dimensions
+  }
+}
+
+extension Tensor: Equatable {
+  public static func == (lhs: Tensor, rhs: Tensor) -> Bool {
+    return lhs.name == rhs.name && lhs.dataType == rhs.dataType && lhs.shape == rhs.shape &&
+           lhs.data == rhs.data && lhs.quantizationParameters == rhs.quantizationParameters
+  }
+}
diff --git a/tensorflow/lite/experimental/writer/enum_mapping.h b/tensorflow/lite/experimental/writer/enum_mapping.h
index cb6ec3e0d7e0f1b53cc8b84e10cb1be4b1f023c0..949a255abaf63cbc1cc0b3f718f6aaca4f38fd8d 100644
--- a/tensorflow/lite/experimental/writer/enum_mapping.h
+++ b/tensorflow/lite/experimental/writer/enum_mapping.h
@@ -112,5 +112,29 @@ inline LSHProjectionType LSHProjectionTypeToSchema(
   }
 }
 
+inline MirrorPadMode MirrorPaddingModeToSchema(TfLiteMirrorPaddingMode mode) {
+  switch (mode) {
+    case kTfLiteMirrorPaddingUnknown:
+      return MirrorPadMode_REFLECT;  // TODO(aselle): consider an error
+    case kTfLiteMirrorPaddingReflect:
+      return MirrorPadMode_REFLECT;
+    case kTfLiteMirrorPaddingSymmetric:
+      return MirrorPadMode_SYMMETRIC;
+  }
+}
+
+inline CombinerType CombinerTypeToSchema(TfLiteCombinerType type) {
+  switch (type) {
+    case kTfLiteCombinerTypeSum:
+      return CombinerType_SUM;
+    case kTfLiteCombinerTypeMean:
+      return CombinerType_MEAN;
+    case kTfLiteCombinerTypeSqrtn:
+      return CombinerType_SQRTN;
+  }
+}
+
+// int
+
 }  // namespace tflite
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_WRITER_ENUM_MAPPING_H_
diff --git a/tensorflow/lite/experimental/writer/option_writer_generator.cc b/tensorflow/lite/experimental/writer/option_writer_generator.cc
index 73813446138a9d3b7686012c84310d456f502894..b425a75325300c56e609b4fb925319b6ed14a9d9 100644
--- a/tensorflow/lite/experimental/writer/option_writer_generator.cc
+++ b/tensorflow/lite/experimental/writer/option_writer_generator.cc
@@ -22,54 +22,59 @@ limitations under the License.
 namespace tflite {
 namespace {
 // This is generated by grepping
-//  cat  third_party/tensorflow/lite/builtin_op_data.h
-//| grep "^} TfLite" | sed 's/^} TfLite\(.*\)Params;/\1Params/g' | grep -v "^}"
-static const char* param_structs[] = {"TfLiteConvParams",
-                                      "TfLitePoolParams",
-                                      "TfLiteDepthwiseConvParams",
-                                      "TfLiteSVDFParams",
-                                      "TfLiteRNNParams",
-                                      "TfLiteSequenceRNNParams",
-                                      "TfLiteFullyConnectedParams",
-                                      "TfLiteLSHProjectionParams",
-                                      "TfLiteSoftmaxParams",
-                                      "TfLiteConcatenationParams",
-                                      "TfLiteAddParams",
-                                      "TfLiteSpaceToBatchNDParams",
+//  cat  third_party/tensorflow/lite/c/builtin_op_data.h | grep "^} TfLite" |
+//  sed 's/^} \(TfLite.*\)Params;/\1Params/g' | grep -v "^}" | sed
+//  's/\(.*\)/"\1",/g' | sort
+static const char* param_structs[] = {"TfLiteAddParams",
+                                      "TfLiteArgMaxParams",
+                                      "TfLiteArgMinParams",
                                       "TfLiteBatchToSpaceNDParams",
-                                      "TfLiteMulParams",
-                                      "TfLiteSubParams",
+                                      "TfLiteBidirectionalSequenceLSTMParams",
+                                      "TfLiteBidirectionalSequenceRNNParams",
+                                      "TfLiteCastParams",
+                                      "TfLiteConcatenationParams",
+                                      "TfLiteConvParams",
+                                      "TfLiteDepthwiseConvParams",
                                       "TfLiteDivParams",
+                                      "TfLiteEmbeddingLookupSparseParams",
+                                      "TfLiteFakeQuantParams",
+                                      "TfLiteFullyConnectedParams",
+                                      "TfLiteGatherParams",
                                       "TfLiteL2NormParams",
+                                      "TfLiteLeakyReluParams",
                                       "TfLiteLocalResponseNormParams",
+                                      "TfLiteLSHProjectionParams",
                                       "TfLiteLSTMParams",
-                                      "TfLiteResizeBilinearParams",
-                                      "TfLiteResizeNearestNeighborParams",
+                                      "TfLiteMirrorPaddingParams",
+                                      "TfLiteMulParams",
+                                      "TfLiteOneHotParams",
+                                      "TfLitePackParams",
                                       "TfLitePadParams",
                                       "TfLitePadV2Params",
+                                      "TfLitePoolParams",
+                                      "TfLiteReducerParams",
                                       "TfLiteReshapeParams",
+                                      "TfLiteResizeBilinearParams",
+                                      "TfLiteResizeNearestNeighborParams",
+                                      "TfLiteRNNParams",
+                                      "TfLiteSequenceRNNParams",
+                                      "TfLiteShapeParams",
                                       "TfLiteSkipGramParams",
+                                      "TfLiteSoftmaxParams",
+                                      "TfLiteSpaceToBatchNDParams",
                                       "TfLiteSpaceToDepthParams",
-                                      "TfLiteCastParams",
-                                      "TfLiteEmbeddingLookupSparseParams",
-                                      "TfLiteGatherParams",
-                                      "TfLiteTransposeParams",
-                                      "TfLiteReducerParams",
+                                      "TfLiteSparseToDenseParams",
                                       "TfLiteSplitParams",
                                       "TfLiteSplitVParams",
                                       "TfLiteSqueezeParams",
                                       "TfLiteStridedSliceParams",
-                                      "TfLiteArgMaxParams",
-                                      "TfLiteArgMinParams",
+                                      "TfLiteSubParams",
+                                      "TfLiteSVDFParams",
                                       "TfLiteTransposeConvParams",
-                                      "TfLiteSparseToDenseParams",
-                                      "TfLiteShapeParams",
-                                      "TfLiteFakeQuantParams",
-                                      "TfLitePackParams",
-                                      "TfLiteOneHotParams",
-                                      "TfLiteLeakyReluParams",
-                                      "TfLiteMirrorPaddingParams",
+                                      "TfLiteTransposeParams",
+                                      "TfLiteUnidirectionalSequenceLSTMParams",
                                       "TfLiteUniqueParams",
+                                      "TfLiteUnpackParams",
                                       nullptr};
 }  // namespace
 
@@ -142,7 +147,6 @@ class OpOptionData {
     op_to_option_["REDUCE_MAX"] = "ReducerOptions";
     op_to_option_["REDUCE_MIN"] = "ReducerOptions";
     op_to_option_["REDUCE_ANY"] = "ReducerOptions";
-    op_to_option_["UNPACK"] = "";
     op_to_option_["SUM"] = "ReducerOptions";
     op_to_option_["REDUCE_MAX"] = "ReducerOptions";
     op_to_option_["REDUCE_PROD"] = "ReducerOptions";
@@ -151,34 +155,31 @@ class OpOptionData {
     op_to_option_["AVERAGE_POOL_2D"] = "Pool2DOptions";
     op_to_option_["MAX_POOL_2D"] = "Pool2DOptions";
     op_to_option_["L2_NORMALIZATION"] = "L2NormOptions";
-    op_to_option_["BIDIRECTIONAL_SEQUENCE_LSTM"] = "LSTMOptions";
-    op_to_option_["UNIDIRECTIONAL_SEQUENCE_LSTM"] = "LSTMOptions";
-    op_to_option_["BIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
-    op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
     op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
-    op_to_option_["MIRROR_PAD"] = "";  // TODO(karimnosseir): MirrorPadOptions.
-    op_to_option_["UNIQUE"] = "";      // TODO(karimnosseir): UniqueOptions.
-    // Manually specified mappings between ops and options (none)
-    op_to_option_["EMBEDDING_LOOKUP"] =
-        "";  // TODO(aselle): maybe something else.
+    op_to_option_["MAXIMUM"] = "MaximumMinimumOptions";
+    op_to_option_["MINIMUM"] = "MaximumMinimumOptions";
+    op_to_option_["CUSTOM"] = "";    // TODO(aselle): maybe something else.
+    op_to_option_["DELEGATE"] = "";  // TODO(aselle): maybe something else.
+
+    // Manually specified mappings between ops to "none" options -- these are
+    // ops without a corresponding Options message in schema as yet. If these
+    // options do get assigned an Options message in future, they need to be
+    // updated here as well.
+    op_to_option_["EMBEDDING_LOOKUP"] = "";
     op_to_option_["FLOOR"] = "";
     op_to_option_["CEIL"] = "";
-    op_to_option_["HASHTABLE_LOOKUP"] =
-        "";  // TODO(aselle): maybe something else.
+    op_to_option_["HASHTABLE_LOOKUP"] = "";
     op_to_option_["LOGISTIC"] = "";
     op_to_option_["RELU"] = "";
     op_to_option_["RELU_N1_TO_1"] = "";
     op_to_option_["RELU6"] = "";
     op_to_option_["TANH"] = "";
-    op_to_option_["CUSTOM"] = "";    // TODO(aselle): maybe something else.
-    op_to_option_["DELEGATE"] = "";  // TODO(aselle): maybe something else.
     op_to_option_["PRELU"] = "";
-    op_to_option_["MAXIMUM"] = "";  // TODO(aselle): MaximumMinimumOptions
-    op_to_option_["MINIMUM"] = "";  // TODO(aselle): MaximumMinimumOptions
     op_to_option_["SIN"] = "";
     op_to_option_["LOG"] = "";
     op_to_option_["SQRT"] = "";
     op_to_option_["RSQRT"] = "";
+    op_to_option_["ELU"] = "";
 
     // TODO(aselle): These are undesirable hacks. Consider changing C structs
     option_to_struct_["Pool2DOptions"] = "TfLitePoolParams";
@@ -186,6 +187,7 @@ class OpOptionData {
     option_to_struct_["DepthwiseConv2DOptions"] = "TfLiteDepthwiseConvParams";
     option_to_struct_["LocalResponseNormalizationOptions"] =
         "TfLiteLocalResponseNormParams";
+    option_to_struct_["MirrorPadOptions"] = "TfLiteMirrorPaddingParams";
     // Now for every op, try to find an option.
     bool fatal = false;
     for (auto op_name : ops_) {
@@ -225,13 +227,15 @@ class OpOptionData {
           if (!param_struct_found) {
             std::cerr << "Failed to get param struct for option " << option_name
                       << std::endl;
-            fatal = true;
           } else {
             option_to_struct_.insert(std::make_pair(option_name, params_guess));
           }
         }
       }
     }
+    if (fatal) {
+      exit(1);
+    }
   }
 
  private:
@@ -242,16 +246,28 @@ class OpOptionData {
       option_to_type_function_;
 };
 
+void GenerateImportForResizeBilinearOp(FILE* fp) {
+  fprintf(fp,
+          "  case BuiltinOperator_RESIZE_BILINEAR:  {\n"
+          "    const auto* params = reinterpret_cast<const "
+          "TfLiteResizeBilinearParams*>(builtin_op_data);\n"
+          "    auto union_type = CreateResizeBilinearOptions(*fbb, "
+          "params->align_corners).Union();\n"
+          "    return std::make_pair(BuiltinOptions_ResizeBilinearOptions, "
+          "union_type);\n"
+          "  }\n  break;\n");
+}
+
 void GenerateImportForOp(FILE* fp, const std::string& op_name,
                          const std::string& option_name,
                          const std::string& option_type,
                          const flatbuffers::TypeTable* options,
                          const std::string& struct_name) {
-  // Skip tricky ones for now
-  if (struct_name == "TfLiteResizeBilinearParams") return;
-  if (struct_name == "TfLiteSqueezeParams") return;
-  if (struct_name == "TfLiteEmbeddingLookupSparseParams") return;
-  if (struct_name == "TfLiteReshapeParams") return;
+  // Special-case ResizeBilinear which has some deprecated fields.
+  if (struct_name == "TfLiteResizeBilinearParams") {
+    GenerateImportForResizeBilinearOp(fp);
+    return;
+  }
 
   fprintf(fp, "  case BuiltinOperator_%s:  {\n", op_name.c_str());
   fprintf(fp,
@@ -261,6 +277,9 @@ void GenerateImportForOp(FILE* fp, const std::string& op_name,
 
   for (size_t i = 0; i < options->num_elems; i++) {
     std::string elem_name = options->names[i];
+    bool is_int_vector = false;
+    std::string vector_name = elem_name;
+    std::string vector_size;
     // TODO(aselle): Irregular naming in builtins
     if (elem_name == "fused_activation_function")
       elem_name = "activation";
@@ -272,8 +291,26 @@ void GenerateImportForOp(FILE* fp, const std::string& op_name,
       elem_name = "dilation_height_factor";
     else if (elem_name == "dilation_w_factor")
       elem_name = "dilation_width_factor";
-    else if (elem_name == "new_shape")
-      elem_name = "shape";
+    else if (elem_name == "idx_out_type")
+      elem_name = "index_out_type";
+
+    // Vector fields treated specially.
+    if (elem_name == "new_shape") {
+      is_int_vector = true;
+      vector_name = "shape";
+      vector_size = "num_dimensions";
+    } else if (elem_name == "squeeze_dims") {
+      is_int_vector = true;
+      vector_size = "num_squeeze_dims";
+    }
+
+    if (is_int_vector) {
+      fprintf(fp,
+              "    auto val%zu = fbb->CreateVector("
+              "std::vector<int>(params->%s, params->%s + params->%s));\n",
+              i, vector_name.c_str(), vector_name.c_str(), vector_size.c_str());
+      continue;
+    }
 
     flatbuffers::TypeCode code = options->type_codes[i];
     auto contained_type = code.sequence_ref != -1
@@ -292,6 +329,10 @@ void GenerateImportForOp(FILE* fp, const std::string& op_name,
       mapper = "LSTMKernelTypeToSchema";
     } else if (contained_type == LSHProjectionTypeTypeTable) {
       mapper = "LSHProjectionTypeToSchema";
+    } else if (contained_type == MirrorPadModeTypeTable) {
+      mapper = "MirrorPaddingModeToSchema";
+    } else if (contained_type == CombinerTypeTypeTable) {
+      mapper = "CombinerTypeToSchema";
     }
 
     fprintf(fp,
diff --git a/tensorflow/lite/experimental/writer/writer_lib.cc b/tensorflow/lite/experimental/writer/writer_lib.cc
index a0ce4b716d62c5a24342f5a3863e58eb203f7441..2bdc41bae84341949631f77a1be8631b007f2985 100644
--- a/tensorflow/lite/experimental/writer/writer_lib.cc
+++ b/tensorflow/lite/experimental/writer/writer_lib.cc
@@ -219,6 +219,11 @@ std::vector<int> InterpreterWriter::RemapTensorIndicesToWritten(
   std::vector<int> output;
   output.reserve(input.size());
   for (int x : input) {
+    // Special value representing an optional tensor which is not present.
+    if (x == -1) {
+      output.push_back(x);
+      continue;
+    }
     if (tensor_to_written_tensor_[x] != -1) {
       output.push_back(tensor_to_written_tensor_[x]);
     }
diff --git a/tensorflow/lite/g3doc/_book.yaml b/tensorflow/lite/g3doc/_book.yaml
index 9c48e1e54d153b9ff043e43f75f25cc36398bc60..7eaf64c9a4a5d2e102d17ef4dcaf1ccb63ba7057 100644
--- a/tensorflow/lite/g3doc/_book.yaml
+++ b/tensorflow/lite/g3doc/_book.yaml
@@ -30,8 +30,6 @@ upper_tabs:
         path: /lite/ops_versioning
       - title: TensorFlow Lite compatibility guide
         path: /lite/tf_ops_compatibility
-      - title: List of hosted models
-        path: /lite/models
       - title: TensorFlow Lite for iOS
         path: /lite/ios
       - title: TensorFlow Lite for Raspberry Pi
@@ -85,27 +83,36 @@ upper_tabs:
     #   contents:
     #   - title: Overview
     #     path: /lite/models/
-    #   - heading: Beginner
-    #     style: divider
-    #   - title: Image labeling
+    #   - title: Hosted models
+    #     path: /lite/models/hosted
+    #   - title: Image classification
     #     section:
     #     - title: Overview
-    #       path: /lite/models/image/label/overview
+    #       path: /lite/models/image_classification/overview
     #     - title: Android
-    #       path: /lite/models/image/label/android
+    #       path: /lite/models/image_classification/android
     #     - title: iOS
-    #       path: /lite/models/image/label/ios
-    #   - heading: Advanced
-    #     style: divider
-    #   - heading: Image
-    #   - title: Image classification
-    #     path: /lite/models/image/classification/
-    #   - heading: Audio
-    #   - title: Hot word detection
-    #     path: /lite/models/audio/hot_word/
-    #   - heading: Text
-    #   - title: Text classification
-    #     path: /lite/models/text/classification/
+    #       path: /lite/models/image_classification/ios
+    #   - title: Object detection
+    #     section:
+    #     - title: Overview
+    #       path: /lite/models/object_detection/overview
+    #   - title: Speech recognition
+    #     section:
+    #     - title: Overview
+    #       path: /lite/models/speech_recognition/overview
+    #   - title: Pose estimation
+    #     section:
+    #     - title: Overview
+    #       path: /lite/models/pose_estimation/overview
+    #   - title: Segmentation
+    #     section:
+    #     - title: Overview
+    #       path: /lite/models/segmentation/overview
+    #   - title: Smart reply
+    #     section:
+    #     - title: Overview
+    #       path: /lite/models/smart_reply/overview
 
     - name: API
       skip_translation: true
diff --git a/tensorflow/lite/g3doc/demo_ios.md b/tensorflow/lite/g3doc/demo_ios.md
index f4b481dc6192db703dea4161ed28e2fd63812ebf..33e74f1a402f2e3778ded1e815db72be5ff28c74 100644
--- a/tensorflow/lite/g3doc/demo_ios.md
+++ b/tensorflow/lite/g3doc/demo_ios.md
@@ -1,9 +1,10 @@
-
 # iOS Demo App
 
-The TensorFlow Lite demo is a camera app that continuously classifies whatever
-it sees from your device's back camera, using a quantized MobileNet model. These
-instructions walk you through building and running the demo on an iOS device.
+This tutorial provides a simple iOS mobile application to classify images using
+the iOS device camera. In this tutorial, you will download the demo application
+from the Tensorflow repository, build it on your computer, and install it on
+your iOS Device. You will also learn how to customize the application to suit
+your requirements.
 
 ## Prerequisites
 
@@ -30,47 +31,199 @@ instructions walk you through building and running the demo on an iOS device.
     If this is a new install, you will need to run the Xcode application once to
     agree to the license before continuing.
 
-## Building the iOS Demo App
-
-1.  Install CocoaPods if you don't have it:
+*   Install CocoaPods if you don't have it:
 
         sudo gem install cocoapods
 
-2.  Download the model files used by the demo app (this is done from inside the
-    cloned directory):
+### Step 1. Clone the TensorFlow source code
+
+First, we clone the GitHub repository on the computer in a folder to get the
+demo application.
+
+```
+git clone https://github.com/tensorflow/tensorflow
+```
+
+### Step 2. Download required dependencies
+
+Execute the shell script to download the model files used by the demo app (this
+is done from inside the cloned directory):
+
+```
+    tensorflow/lite/examples/ios/download_models.sh
+```
 
-        sh tensorflow/lite/examples/ios/download_models.sh
+Run the following command to install TensorFlow Lite pod:
 
-3.  Install the pod to generate the workspace file:
+```
+    cd tensorflow/lite/examples/ios/camera
+    pod install
+```
 
-        cd tensorflow/lite/examples/ios/camera
-        pod install
+If you have installed this pod before and that command doesn't work, try
 
-    If you have installed this pod before and that command doesn't work, try
+```
+    pod repo update
+```
 
-        pod repo update
+### Step 3. Build the XCode project
 
-    At the end of this step you should have a file called
-    `tflite_camera_example.xcworkspace`.
+Open the `tflite_camera_example.xcworkspace` project file generated in the last
+step:
 
-4.  Open the project in Xcode by typing this on the command line:
+```
+    open tflite_camera_example.xcworkspace
+```
 
-        open tflite_camera_example.xcworkspace
+Under `Project navigator -> tflite_camera_example -> Targets ->
+tflite_camera_example -> General` change the bundle identifier by pre-pending
+your name:
 
-    This launches Xcode if it isn't open already and opens the
-    `tflite_camera_example` project.
+![pre-pend your name to the bundle identifier](images/ios/bundle_identifier.png)
 
-5.  Under `Project navigator -> tflite_camera_example -> Targets ->
-    tflite_camera_example -> General` change the bundle identifier by
-    pre-pending your name:
+Plug in your iOS device. Note the app must be executed with a real device with
+camera. Select the iOS device from the drop-down menu.
 
-    ![pre-pend your name to the bundle identifier](images/ios/bundle_identifier.png)
+![Device selection](images/ios/device_selection.png)
 
-6.  Build and run the app in Xcode.
+Click the "Run" button to build and run the app
 
-    Note that as mentioned earlier, you must already have a device set up and
-    linked to your Apple Developer account in order to deploy the app on a
-    device.
+![Build and execute](images/ios/build_and_execute.png)
+
+Note that as mentioned earlier, you must already have a device set up and linked
+to your Apple Developer account in order to deploy the app on a device.
 
 You'll have to grant permissions for the app to use the device's camera. Point
 the camera at various objects and enjoy seeing how the model classifies things!
+
+## Understanding iOS App Code
+
+### Get camera input
+
+The main logic of this app is in the Objective C++ source file
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
+
+The `setupAVCapture` method constructs a `AVCaptureSession` and set itself as a
+delegate. The `captureOutput:didOutputSampleBuffer:fromConnection:` method is
+called for every captured frame. It calls `runModelOnFrame` to run the model for
+every frame.
+
+### Create an interpreter
+
+To create the interpreter, we need to load the model file. The following code
+will load a model and create an interpreter.
+
+```
+model = tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]);
+```
+
+Behind the scenes, the model is loaded as a memory-mapped file. It offers faster
+load times and reduce the dirty pages in memory.
+
+Construct a `BuiltinOpResolver` to use the TensorFliw Lite buildin ops. Then,
+create the interpreter object using `InterpreterBuilder` that takes the model
+file as argument as shown below.
+
+```
+tflite::ops::builtin::BuiltinOpResolver resolver;
+tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+```
+
+### Obtain the input buffer
+
+By default, the app uses quantized model since it's smaller and faster. The
+buffer is a raw pointer to an array of 8 bit unsigned integers (`uint8_t`). The
+following code obtains the input buffer from the interpreter:
+
+```
+// Get the index of first input tensor.
+int input_tensor_index = interpreter->inputs()[0];
+// Get the pointer to the input buffer.
+uint8_t* buffer = interpreter->typed_tensor<uint8_t>(input_tensor_index);
+```
+
+Throughout this document, it's assumed a quantized model is used.
+
+### Pre-process of bitmap image
+
+The MobileNet model we're using takes 224x224x3 inputs, where the dimensions are
+width, height, and colors (RGB). The images returned from `AVCaptureSession` is
+bigger, and has 4 color channels (RGBA).
+
+Many image classification models (like MobileNet) take fixe-sized inputs. It's
+required to scale or crop the image before feeding it into the model, and change
+the channels from RGBA to RGB.
+
+The code to pre-process the images is in `ProcessInputWithQuantizedModel`
+function in
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`. It's a
+simple implementation for nearest neighbor color sampling, and it only copies
+the first 3 bytes for each pixel.
+
+```
+void ProcessInputWithQuantizedModel(
+    uint8_t* input, uint8_t* output, int image_width, int image_height, int image_channels) {
+  for (int y = 0; y < wanted_input_height; ++y) {
+    uint8_t* out_row = output + (y * wanted_input_width * wanted_input_channels);
+    for (int x = 0; x < wanted_input_width; ++x) {
+      const int in_x = (y * image_width) / wanted_input_width;
+      const int in_y = (x * image_height) / wanted_input_height;
+      uint8_t* in_pixel = input + (in_y * image_width * image_channels) + (in_x * image_channels);
+      uint8_t* out_pixel = out_row + (x * wanted_input_channels);
+      for (int c = 0; c < wanted_input_channels; ++c) {
+        out_pixel[c] = in_pixel[c];
+      }
+    }
+  }
+}
+```
+
+Note the code is preprocessing and preparing the model input from the camera
+data. Therefore the first parameter `input` should be the camera buffer. The
+second parameter `output` should be the buffer of model input.
+
+### Run inference and obtain output buffer
+
+After preprocessing and filling the data into the input buffer of the
+interpreter, it's really easy to run the interpreter:
+
+```
+if (interpreter->Invoke() != kTfLiteOk) {
+  NSLog("Failed to invoke!");
+}
+```
+
+The result is stored in the output tensor buffer of the interpreter. The
+following code obtains the pointer to the buffer:
+
+```
+// Get the index of first output tensor.
+const int output_tensor_index = interpreter->outputs()[0];
+// Get the pointer to the output buffer.
+uint8_t* buffer = interpreter->typed_tensor<uint8_t>(output_tensor_index);
+```
+
+### Post-process values
+
+The output buffer contains an array of `uint8_t`, and the value range is 0-255.
+We need to convert the value to float to get the probabilities with value range
+0.0-1.0. The formula of the quantization value mapping is:
+
+    float_value = (quantized_value - zero_point) * scale
+
+The following code converts quantized values back to float values, using the
+quantizaiton parameters in tensors:
+
+```
+uint8_t* quantized_output = interpreter->typed_output_tensor<uint8_t>(0);
+int32_t zero_point = input_tensor->params.zero_point;
+float scale = input_tensor->params.scale;
+float output[output_size];
+for (int i = 0; i < output_size; ++i) {
+  output[i] = (quantized_output[i] - zero_point) * scale;
+}
+```
+
+Finally, we find the best set of classifications by storing them in a priority
+queue based on their confidence scores. See the `GetTopN` function in
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
diff --git a/tensorflow/lite/g3doc/devguide.md b/tensorflow/lite/g3doc/devguide.md
index cbad036407fabea9d49910e22b4c968470566211..7e23f1d1b6ef3b5e85736099db96417e2dd72010 100644
--- a/tensorflow/lite/g3doc/devguide.md
+++ b/tensorflow/lite/g3doc/devguide.md
@@ -92,7 +92,7 @@ formats:
 
 TensorFlow models may be saved as a .pb or .pbtxt `tf.GraphDef` file. In order
 to convert the `tf.GraphDef` file to TensorFlow Lite, the model must first be
-frozen. This process invovles several file formats including the `frozen
+frozen. This process involves several file formats including the `frozen
 GraphDef`:
 
 *   `tf.GraphDef` (.pb or .pbtxt) — A protobuf that represents the TensorFlow
@@ -166,7 +166,7 @@ Refer to the [ops compatibility guide](tf_ops_compatibility.md) for
 troubleshooting help, and if that doesn't help, please
 [file an issue](https://github.com/tensorflow/tensorflow/issues).
 
-### Graph vizualization tool
+### Graph Visualization tool
 
 The [development repo](https://github.com/tensorflow/tensorflow) contains a tool
 to visualize TensorFlow Lite models after conversion. To build the
diff --git a/tensorflow/lite/g3doc/images/ios/build_and_execute.png b/tensorflow/lite/g3doc/images/ios/build_and_execute.png
new file mode 100644
index 0000000000000000000000000000000000000000..a305350cb7b413b70d2095cfd8fea6d396c63695
Binary files /dev/null and b/tensorflow/lite/g3doc/images/ios/build_and_execute.png differ
diff --git a/tensorflow/lite/g3doc/images/ios/device_selection.png b/tensorflow/lite/g3doc/images/ios/device_selection.png
new file mode 100644
index 0000000000000000000000000000000000000000..1565fa0f2b65d48f4308ba50caacf745125c7431
Binary files /dev/null and b/tensorflow/lite/g3doc/images/ios/device_selection.png differ
diff --git a/tensorflow/lite/g3doc/models/_index.yaml b/tensorflow/lite/g3doc/models/_index.yaml
index f4d8bc40a9325b12734022e005996e13dba0a0d6..5cd679cd614ac53fb11f172cdea25416455ab3ed 100644
--- a/tensorflow/lite/g3doc/models/_index.yaml
+++ b/tensorflow/lite/g3doc/models/_index.yaml
@@ -1,125 +1,93 @@
-project_path: /lite/_project.yaml
 book_path: /lite/_book.yaml
+project_path: /lite/_project.yaml
 description: <!--no description-->
 landing_page:
-  body_class: tfo-hide-page-nav
   custom_css_path: /site-assets/css/style.css
-  show_side_navs: true
+  nav: left
   rows:
-
-  # Hero
-  - classname: >
-      devsite-landing-row-50
-      devsite-landing-row-large-headings
-      devsite-landing-row-no-image-background
-    foreground: theme
+  - classname: devsite-landing-row-100
+    heading: Optimized models for common mobile and edge use cases
     items:
-    - heading: Models marketplace
-      description: >
-        The TensorFlow Lite models marketplace, your neighborhood model shoppe.
-      image_path: /resources/images/tflite-card-16x9.png
+    - description: >
+        Take state-of-the-art optimized research models and easily deploy them to mobile and edge devices.
+      buttons:
+        - label: Skip to full models repo
+          path: /lite/models/hosted
 
-  # Features
   - background: grey
     items:
-    - heading: Optimized for mobile
+    - heading: Optimized for edge
       description: >
-        Machine learning can make your apps more engaging, personalized, and
-        helpful, and provides solutions that are optimized to run on-device.
-    - heading: Built with Google expertise
+        Edge has many constraints, such as limited memory, speed, and power. These models are specifically designed to run on-device.
+    - heading: Built by TensorFlow
       description: >
-        Models offer the technologies that have long powered Google's own
-        experiences on mobile.
-    - heading: Approachable and comprehensive
+        These models were optimized by the talented TensorFlow Lite team. Breathe easy.
+    - heading: Customizable
       description: >
-        Use out-of-the-box solutions (base APIs) or custom models, running
-        on-device or in the Cloud, depending on your specific needs.
+        The models were trained with certain labels in mind. With transfer learning, you can customize them to fit your needs.
 
-  # Beginner models
-  - classname: devsite-landing-row-100
-    heading: "Build machine learning into your apps"
+  - classname: devsite-landing-row-cards
+    heading: Solutions to common problems
     items:
-    - heading: >
-        Image labeling
+    - heading: "Image classification"
       description: >
-        Identify objects, locations, activities, animal species, products, and
-        more
-      icon:
-        path: ../images/landing-page/assistant_logo.png
-      path: /lite/image/labeling/
-    - heading: >
-        Text recognition (OCR)
+        Identify hundreds of objects, including people, activities, animals, plants, and places.
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: /lite/models/image_classification/overview
+      buttons:
+      - label: Use the model
+        path: /lite/models/image_classification/overview
+    - heading: "Object detection"
       description: >
-        Recognize and extract text from images
-      icon:
-        path: ../images/landing-page/assistant_logo.png
-      path: /lite/image/labeling/
-    - heading: >
-        Face detection
+        Detect multiple objects with bounding boxes. Yes, dogs and cats, too.
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: /lite/models/object_detection/overview
+      buttons:
+      - label: Use the model
+        path: /lite/models/object_detection/overview
+    - heading: "Speech recognition"
       description: >
-        Detect faces and facial landmarks
-      icon:
-        path: ../images/landing-page/assistant_logo.png
-      path: /lite/image/labeling/
+        “Okay, Google.” How about, “Okay, custom keyword spotting?”
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: /lite/models/speech_recognition/overview
+      buttons:
+      - label: Use the model
+        path: /lite/models/speech_recognition/overview
 
-  - items:
-    - heading: >
-        Barcode scanning
-      description: >
-        Scan and process barcodes
-      icon:
-        path: ../images/landing-page/assistant_logo.png
-      path: /lite/image/labeling/
-    - heading: >
-        Landmark detection
+  - classname: devsite-landing-row-cards
+    items:
+    - heading: "Pose estimation"
       description: >
-        Identify popular landmarks in an image
-      icon:
-        path: ../images/landing-page/assistant_logo.png
-      path: /lite/image/labeling/
-    - heading: >
-        Smart reply
+        Estimate poses for single or multiple people. Imagine the possibilities, including stick figure dance parties.
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: /lite/models/pose_estimation/overview
+      buttons:
+      - label: Use the model
+        path: /lite/models/pose_estimation/overview
+    - heading: "Segmentation"
       description: >
-        Provide suggested text snippet that fits context
-      icon:
-        path: ../images/landing-page/assistant_logo.png
-      path: /lite/image/labeling/
-
-  # Custom models
-  - classname: >
-      devsite-landing-row-no-image-background
-      devsite-landing-row-50
-      devsite-landing-row-large-headings
-    foreground: theme
-    background: grey
-    items:
-    - heading: Custom models
+        Pinpoint the shape of objects with strict localization accuracy and semantic labels. Trained with people, places, animals, and more.
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: /lite/models/segmentation/overview
+      buttons:
+      - label: Use the model
+        path: /lite/models/segmentation/overview
+    - heading: "Smart reply"
       description: >
-        <p>If models don’t cover your use cases, you can always
-        bring your own existing TensorFlow Lite models. Just upload your model,
-        and we’ll take care of hosting and serving it to your app.</p>
-
-        <p>Models acts as an API layer to your custom model, making it easy to
-        run and use. In addition to deploying your models, we are releasing an
-        experimental model compression flow that aims to reduce model size (up
-        to orders of magnitudes) while maintaining similar accuracy. Sign up at
-        <a href="https://g.co/firebase/signup">g.co/firebase/signup</a></p>
+        Generate reply suggestions to input conversational chat messages.
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: /lite/models/smart_reply/overview
+      buttons:
+      - label: Use the model
+        path: /lite/models/smart_reply/overview
 
-        <p>And if you’re new to machine learning and want more information on
-        custom models for mobile, you can <a
-        href="//www.tensorflow.org/lite/">learn more about TensorFlow
-        Lite.</a></p>
-      image_path: /resources/images/tflite-card-16x9.png
-      image_left: true
   - classname: devsite-landing-row-large-headings
     foreground: theme
     items:
-    - heading: Just the beginning
+    - heading: Benefits of pre-trained models
       description: >
-        Our ultimate goal is to reduce idea–to–implementation cycles and make AI
-        an essential and intuitive part of a developer's toolkit. We will do so
-        by continuing to add new Base APIs that leverage Google’s machine
-        learning expertise. Base APIs will ultimately cover significantly more
-        use cases in the vision, speech, and text fields. We will also continue
-        to simplify use of custom models, adding tools to deploy, compress, and
-        create them.
+        The best way to get started is to used pre-trained models because they
+        allow you to save time and money, leverage learnings from research scientists,
+        and avoid expensive datasets. Instead of having to start from scratch, you
+        could use an existing model and just retrain the last few layers of the
+        neural network to suit your needs. This process is called transfer learning.
diff --git a/tensorflow/lite/g3doc/models.md b/tensorflow/lite/g3doc/models/hosted.md
similarity index 99%
rename from tensorflow/lite/g3doc/models.md
rename to tensorflow/lite/g3doc/models/hosted.md
index 62b3f17c79aa3688011a1452da18e098008f414e..84421e1fc4b5892e0f5c27888f81abcca1c05ba0 100644
--- a/tensorflow/lite/g3doc/models.md
+++ b/tensorflow/lite/g3doc/models/hosted.md
@@ -1,5 +1,4 @@
-
-# List of Hosted Models
+# Hosted models
 
 # AutoML mobile image classification models (Float Models)
 
diff --git a/tensorflow/lite/g3doc/models/image/label/ios.md b/tensorflow/lite/g3doc/models/image/label/ios.md
deleted file mode 100644
index 904c6450ac7272e67c1982b56099b608b91e2237..0000000000000000000000000000000000000000
--- a/tensorflow/lite/g3doc/models/image/label/ios.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# iOS
-
-lorem
diff --git a/tensorflow/lite/g3doc/models/image/label/overview.md b/tensorflow/lite/g3doc/models/image/label/overview.md
deleted file mode 100644
index b3d9133bb2123012f2ddd2db768347305d224744..0000000000000000000000000000000000000000
--- a/tensorflow/lite/g3doc/models/image/label/overview.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Overview
-
-Image labeling gives you insight into the content of images. When you use the
-API, you get a list of the entities that were recognized: people, things,
-places, activities, and so on. Each label found comes with a score that
-indicates the confidence the ML model has in its relevance. With this
-information, you can perform tasks such as automatic metadata generation
-and content moderation.
diff --git a/tensorflow/lite/g3doc/models/image/label/android.md b/tensorflow/lite/g3doc/models/image_classification/android.md
similarity index 85%
rename from tensorflow/lite/g3doc/models/image/label/android.md
rename to tensorflow/lite/g3doc/models/image_classification/android.md
index c755328ac059013d2d45bbeb3c67516dafbb0ff1..61606096f77ce810c2b1a686cd05599b35200f57 100644
--- a/tensorflow/lite/g3doc/models/image/label/android.md
+++ b/tensorflow/lite/g3doc/models/image_classification/android.md
@@ -3,21 +3,20 @@
 This tutorial provides a simple Android mobile application to classify images
 using the Android device camera. In this tutorial, you will download the demo
 application from the Tensorflow repository, build it on your computer, and
-install it on your Android Device. You will also learn how to customize the
+install it on your Android device. You will also learn how to customize the
 application to suit your requirements.
 
 ### Prerequisites
 
 *   Android Studio 3.2 (installed on a Linux, Mac or Windows machine)
 
-*   Android Device
+*   Android device
 
 *   USB cable (to connect Android device to your computer)
 
 ### Step 1. Clone the TensorFlow source code
 
-First, we clone the GitHub repository on the computer in a folder to get the
-demo application.
+Clone the GitHub repository to your computer to get the demo application.
 
 ```
 
@@ -29,21 +28,21 @@ Open the TensorFlow source code in Android Studio. To do this, open Android
 Studio and select `Open an existing project` setting the folder to
 `tensorflow/lite/examples/android`
 
-![Step 1](images/classifydemo_img1.png)
+<img src="images/classifydemo_img1.png" />
 
 This folder contains the demo application for image classification, object
 detection, and speech hotword detection.
 
 ### Step 2. Build the Android Studio project
 
-In this step, Select `Build -> Make Project` and check that the project builds
+Select `Build -> Make Project` and check that the project builds
 successfully. You will need Android SDK configured in the settings. You'll need
 at least SDK version 23. The gradle file will prompt you to download any missing
 libraries.
 
-![Step 2](images/classifydemo_img4.png)
+<img src="images/classifydemo_img4.png" style="width: 40%" />
 
-![Step 2a](images/classifydemo_img2.png)
+<img src="images/classifydemo_img2.png" style="width: 60%" />
 
 #### TensorFlow Lite AAR from JCenter:
 
@@ -53,25 +52,25 @@ build.
 If you see a build error related to compatibility with Tensorflow Lite's Java
 API (example: method X is undefined for type Interpreter), there has likely been
 a backwards compatible change to the API. You will need to pull new app code
-that's compatible with the nightly build by running git pull.
+that's compatible with the nightly build by running `git pull`.
 
-### Step 3. Install and Run the app
+### Step 3. Install and run the app
 
-Connect the Android device to the computer, and be sure to approve any ADB
+Connect the Android device to the computer and be sure to approve any ADB
 permission prompts that appear on your phone. Select `Run -> Run app.` Select
-the deployment target in the connected devices to the device on which app will
+the deployment target in the connected devices to the device on which the app will
 be installed. This will install the app on the device.
 
-![Step 3](images/classifydemo_img5.png)
+<img src="images/classifydemo_img5.png" style="width: 60%" />
 
-![Step 3a](images/classifydemo_img6.png)
+<img src="images/classifydemo_img6.png" style="width: 70%" />
 
-![Step 3b](images/classifydemo_img7.png)
+<img src="images/classifydemo_img7.png" style="width: 40%" />
 
-![Step 3c](images/classifydemo_img8.png)
+<img src="images/classifydemo_img8.png" style="width: 80%" />
 
-To test the app, open the app named `TFL Classify` on the device. When you run
-the app first time, the app will request permission to access the camera.
+To test the app, open the app called `TFL Classify` on your device. When you run
+the app the first time, the app will request permission to access the camera.
 Re-installing the app may require you to uninstall the previous installations.
 
 ## Understanding Android App Code
@@ -85,7 +84,7 @@ This file depends on `AndroidManifest.xml` in the folder
 `tensorflow/tensorflow/lite/examples/android/app/src/main` to set the camera
 orientation.
 
-### Pre-process of bitmap image
+### Pre-process bitmap image
 
 The mobile application code that pre-processes the images and runs inference is
 in
@@ -102,7 +101,7 @@ DIM_PIXEL_SIZE);
 c.imgData.order(ByteOrder.nativeOrder());
 ```
 
-While running the application, we preprocess the incoming bitmap images from the
+While running the application, we pre-process the incoming bitmap images from the
 camera to a Bytebuffer. Since this model is quantized 8-bit, we will put a
 single byte for each channel. `imgData` will contain an encoded `Color` for each
 pixel in ARGB format, so we need to mask the least significant 8 bits to get
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/build_and_execute.png b/tensorflow/lite/g3doc/models/image_classification/images/build_and_execute.png
new file mode 100644
index 0000000000000000000000000000000000000000..a305350cb7b413b70d2095cfd8fea6d396c63695
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/build_and_execute.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/bundle_identifier.png b/tensorflow/lite/g3doc/models/image_classification/images/bundle_identifier.png
new file mode 100644
index 0000000000000000000000000000000000000000..398763916b353e61f236392e2b8898aad2aafe8e
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/bundle_identifier.png differ
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img1.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img1.png
similarity index 100%
rename from tensorflow/lite/g3doc/models/image/label/images/classifydemo_img1.png
rename to tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img1.png
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img2.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img2.png
similarity index 100%
rename from tensorflow/lite/g3doc/models/image/label/images/classifydemo_img2.png
rename to tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img2.png
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img4.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img4.png
similarity index 100%
rename from tensorflow/lite/g3doc/models/image/label/images/classifydemo_img4.png
rename to tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img4.png
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img5.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img5.png
similarity index 100%
rename from tensorflow/lite/g3doc/models/image/label/images/classifydemo_img5.png
rename to tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img5.png
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img6.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img6.png
similarity index 100%
rename from tensorflow/lite/g3doc/models/image/label/images/classifydemo_img6.png
rename to tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img6.png
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img7.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img7.png
similarity index 100%
rename from tensorflow/lite/g3doc/models/image/label/images/classifydemo_img7.png
rename to tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img7.png
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img8.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img8.png
similarity index 100%
rename from tensorflow/lite/g3doc/models/image/label/images/classifydemo_img8.png
rename to tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img8.png
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/device_selection.png b/tensorflow/lite/g3doc/models/image_classification/images/device_selection.png
new file mode 100644
index 0000000000000000000000000000000000000000..1565fa0f2b65d48f4308ba50caacf745125c7431
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/device_selection.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/dog.png b/tensorflow/lite/g3doc/models/image_classification/images/dog.png
new file mode 100644
index 0000000000000000000000000000000000000000..65c6eb551468be3b53dc805009688c7b5808c660
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/dog.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/ios.md b/tensorflow/lite/g3doc/models/image_classification/ios.md
new file mode 100644
index 0000000000000000000000000000000000000000..63e3abd779355b842964ae8836f24a1cd7a8832f
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/image_classification/ios.md
@@ -0,0 +1,229 @@
+# TensorFlow Lite iOS Image Classifier App Example
+
+This tutorial provides a simple iOS mobile application to classify images using
+the iOS device camera. In this tutorial, you will download the demo application
+from the Tensorflow repository, build it on your computer, and install it on
+your iOS Device. You will also learn how to customize the application to suit
+your needs.
+
+## Prerequisites
+
+*   You must have [Xcode](https://developer.apple.com/xcode/) installed and have
+    a valid Apple Developer ID, and have an iOS device set up and linked to your
+    developer account with all of the appropriate certificates. For these
+    instructions, we assume that you have already been able to build and deploy
+    an app to an iOS device with your current developer environment.
+
+*   The demo app requires a camera and must be executed on a real iOS device.
+    You can build it and run with the iPhone Simulator but it won't have any
+    camera information to classify.
+
+*   You don't need to build the entire TensorFlow library to run the demo, but
+    you will need to clone the TensorFlow repository if you haven't already:
+
+        git clone https://github.com/tensorflow/tensorflow
+        cd tensorflow
+
+*   You'll also need the Xcode command-line tools:
+
+        xcode-select --install
+
+    If this is a new install, you will need to run the Xcode application once to
+    agree to the license before continuing.
+
+*   Install CocoaPods if you don't have it:
+
+        sudo gem install cocoapods
+
+### Step 1. Clone the TensorFlow source code
+
+lone the GitHub repository onto your computer to get the
+demo application.
+
+```
+git clone https://github.com/tensorflow/tensorflow
+```
+
+### Step 2. Download required dependencies
+
+Execute the shell script to download the model files used by the demo app (this
+is done from inside the cloned directory):
+
+```
+    tensorflow/lite/examples/ios/download_models.sh
+```
+
+Run the following command to install TensorFlow Lite pod:
+
+```
+    cd tensorflow/lite/examples/ios/camera
+    pod install
+```
+
+If you have installed this pod before and that command doesn't work, try
+
+```
+    pod repo update
+```
+
+### Step 3. Build the XCode project
+
+Open the `tflite_camera_example.xcworkspace` project file generated in the last
+step:
+
+```
+    open tflite_camera_example.xcworkspace
+```
+
+Under `Project navigator -> tflite_camera_example -> Targets ->
+tflite_camera_example -> General` change the bundle identifier by pre-pending
+your name:
+
+![pre-pend your name to the bundle identifier](images/bundle_identifier.png)
+
+Plug in your iOS device. Note that the app must be executed with a real device with
+a camera. Select the iOS device from the drop-down menu.
+
+![Device selection](images/device_selection.png)
+
+Click the "Run" button to build and run the app
+
+![Build and execute](images/build_and_execute.png)
+
+Note that, as mentioned earlier, you must already have a device set up and linked
+to your Apple Developer account in order to deploy the app onto a device.
+
+You'll have to grant permissions for the app to use the device's camera. Point
+the camera at various objects and enjoy seeing how the model classifies things!
+
+## Understanding iOS App Code
+
+### Get camera input
+
+The main logic of this app is in the Objective C++ source file
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
+
+The `setupAVCapture` method constructs a `AVCaptureSession` and set itself as a
+delegate. The `captureOutput:didOutputSampleBuffer:fromConnection:` method is
+called for every captured frame. It calls `runModelOnFrame` to run the model for
+every frame.
+
+### Create an interpreter
+
+To create the interpreter, we need to load the model file. The following code
+will load a model and create an interpreter.
+
+```
+model = tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]);
+```
+
+Behind the scenes, the model is loaded as a memory-mapped file. It offers faster
+load times and reduce the dirty pages in memory.
+
+Construct a `BuiltinOpResolver` to use the TensorFliw Lite buildin ops. Then,
+create the interpreter object using `InterpreterBuilder` that takes the model
+file as argument as shown below.
+
+```
+tflite::ops::builtin::BuiltinOpResolver resolver;
+tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+```
+
+### Obtain the input buffer
+
+By default, the app uses a quantized model since it's smaller and faster. The
+buffer is a raw pointer to an array of 8 bit unsigned integers (`uint8_t`). The
+following code obtains the input buffer from the interpreter:
+
+```
+// Get the index of first input tensor.
+int input_tensor_index = interpreter->inputs()[0];
+// Get the pointer to the input buffer.
+uint8_t* buffer = interpreter->typed_tensor<uint8_t>(input_tensor_index);
+```
+
+Throughout this document, it's assumed that a quantized model is used.
+
+### Pre-process bitmap image
+
+The MobileNet model that we're using takes 224x224x3 inputs, where the dimensions are
+width, height, and colors (RGB). The images returned from `AVCaptureSession` is
+bigger and has 4 color channels (RGBA).
+
+Many image classification models (like MobileNet) take fixe-sized inputs. It's
+required to scale or crop the image before feeding it into the model and change
+the channels from RGBA to RGB.
+
+The code to pre-process the images is in `ProcessInputWithQuantizedModel`
+function in
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`. It's a
+simple implementation for nearest neighbor color sampling and it only copies
+the first 3 bytes for each pixel.
+
+```
+void ProcessInputWithQuantizedModel(
+    uint8_t* input, uint8_t* output, int image_width, int image_height, int image_channels) {
+  for (int y = 0; y < wanted_input_height; ++y) {
+    uint8_t* out_row = output + (y * wanted_input_width * wanted_input_channels);
+    for (int x = 0; x < wanted_input_width; ++x) {
+      const int in_x = (y * image_width) / wanted_input_width;
+      const int in_y = (x * image_height) / wanted_input_height;
+      uint8_t* in_pixel = input + (in_y * image_width * image_channels) + (in_x * image_channels);
+      uint8_t* out_pixel = out_row + (x * wanted_input_channels);
+      for (int c = 0; c < wanted_input_channels; ++c) {
+        out_pixel[c] = in_pixel[c];
+      }
+    }
+  }
+}
+```
+
+Note that the code pre-processes and prepares the model input from the camera
+data. Therefore, the first parameter `input` should be the camera buffer. The
+second parameter `output` should be the buffer of model input.
+
+### Run inference and obtain output buffer
+
+After pre-processing and filling the data into the input buffer of the
+interpreter, it's really easy to run the interpreter:
+
+```
+if (interpreter->Invoke() != kTfLiteOk) {
+  NSLog("Failed to invoke!");
+}
+```
+
+The result is stored in the output tensor buffer of the interpreter. The
+following code obtains the pointer to the buffer:
+
+```
+// Get the index of first output tensor.
+const int output_tensor_index = interpreter->outputs()[0];
+// Get the pointer to the output buffer.
+uint8_t* buffer = interpreter->typed_tensor<uint8_t>(output_tensor_index);
+```
+
+### Post-process values
+
+The output buffer contains an array of `uint8_t`, and the value range is from 0-255.
+We need to convert the value to float to get the probabilities with a value range from
+0.0-1.0. The formula of the quantization value mapping is:
+
+    float_value = (quantized_value - zero_point) * scale
+
+The following code converts quantized values back to float values, using the
+quantizaiton parameters in tensors:
+
+```
+uint8_t* quantized_output = interpreter->typed_output_tensor<uint8_t>(0);
+int32_t zero_point = input_tensor->params.zero_point;
+float scale = input_tensor->params.scale;
+float output[output_size];
+for (int i = 0; i < output_size; ++i) {
+  output[i] = (quantized_output[i] - zero_point) * scale;
+}
+```
+
+Finally, we find the best set of classifications by storing them in a priority
+queue based on their confidence scores. See the `GetTopN` function in
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
diff --git a/tensorflow/lite/g3doc/models/image_classification/overview.md b/tensorflow/lite/g3doc/models/image_classification/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ffaf33d35957ddac8809ab5b82c771f5cc032b2
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/image_classification/overview.md
@@ -0,0 +1,211 @@
+# Image classification
+<img src="../images/image.png" class="attempt-right">
+
+Use a pre-trained and optimized model to identify hundreds of classes of objects, including people, activities, animals, plants, and places.
+
+## Get started
+
+If you are unfamiliar with the concept of image classification, you should start by reading <a href="#what_is_image_classification">What is image classification?</a>
+
+If you understand image classification, you’re new to TensorFlow Lite, and you’re working with Android or iOS, we recommend following the corresponding tutorial that will walk you through our sample code.
+
+<a class="button button-primary" href="android">Android</a>
+<a class="button button-primary" href="ios">iOS</a>
+
+If you are using a platform other than Android or iOS, or you are already familiar with the <a href="https://www.tensorflow.org/lite/apis">TensorFlow Lite APIs</a>, you can download our starter image classification model and the accompanying labels.
+
+Once you have the starter model running on your target device, you can experiment with different models to find the optimal balance between performance, accuracy, and model size. For guidance, see Choose a different model.
+
+
+If you are using a platform other than Android or iOS, or you are already familiar with the <a href="../apis">TensorFlow Lite APIs</a>, you can download our starter image classification model and the accompanying labels.
+
+<a class="button button-primary" href="">Download starter model and labels</a>
+
+## What is image classification?
+A common use of machine learning is to identify what an image represents. For example, we might want to know what type of animal appears in the following photograph.
+
+<img src="images/dog.png" alt="dog" width="50%">
+
+The task of predicting what an image represents is called image classification. An image classification model is trained to recognize various classes of images. For example, a model might be trained to recognize photos representing three different types of animals: rabbits, hamsters, and dogs.
+
+When we subsequently provide a new image as input to the model, it will output the probabilities of the image representing each of the types of animal it was trained on. An example output might be as follows:
+
+<table style="width: 40%;">
+  <thead>
+    <tr>
+      <th>Animal type</th>
+      <th>Probability</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Rabbit</td>
+      <td>0.07</td>
+    </tr>
+    <tr>
+      <td>Hamster</td>
+      <td>0.02</td>
+    </tr>
+    <tr>
+      <td style="background-color: #fcb66d;">Dog</td>
+      <td style="background-color: #fcb66d;">0.91</td>
+    </tr>
+  </tbody>
+</table>
+
+Based on the output, we can see that the classification model has predicted that the image has a high probability of representing a dog.
+
+Note: Image classification can only tell you the probability that an image represents one or more of the classes that the model was trained on. It cannot tell you the position or identity of objects within the image. If you need to identify objects and their positions within images, you should use an <a href="object_detection">object detection</a> model.
+
+### Training, labels, and inference
+
+During training, an image classification model is fed images and their associated labels. Each label is the name of a distinct concept, or class, that the model will learn to recognize. Here are some examples of labels and training data for our hypothetical model that classifies animal photos:
+
+<table>
+  <thead>
+    <tr>
+      <th>Label</th>
+      <th>Training data</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>rabbit</td>
+      <td>[three different images of rabbits]</td>
+    </tr>
+    <tr>
+      <td>hamster</td>
+      <td>[three different images of hamsters]</td>
+    </tr>
+    <tr>
+      <td>dog</td>
+      <td>[three different images of dogs]</td>
+    </tr>
+  </tbody>
+</table>
+
+Given sufficient training data (often hundreds or thousands of images per label), an image classification model can learn to predict whether new images belong to any of the classes it has been trained on. This process of prediction is called inference.
+
+To perform inference, an image is passed as input to a model. The model will then output an array of probabilities between 0 and 1. With our example model, this process might look like the following:
+
+<table style="width: 60%">
+  <tr style="border-top: 0px;">
+    <td style="width: 40%"><img src="images/dog.png" alt="dog"></td>
+    <td style="width: 20%; font-size: 2em; vertical-align: middle;">→</td>
+    <td style="width: 40%; vertical-align: middle;">[0.07, 0.02, 0.91]</td>
+</table>
+
+Each number in the output corresponds to a label in our training data. Associating our output with the three labels the model was trained on, we can see the model has predicted a high probability that the image represents a dog.
+
+<table style="width: 40%;">
+  <thead>
+    <tr>
+      <th>Label</th>
+      <th>Probability</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>rabbit</td>
+      <td>0.07</td>
+    </tr>
+    <tr>
+      <td>hamster</td>
+      <td>0.02</td>
+    </tr>
+    <tr>
+      <td style="background-color: #fcb66d;">dog</td>
+      <td style="background-color: #fcb66d;">0.91</td>
+    </tr>
+  </tbody>
+</table>
+
+You might notice that the sum of all the probabilities (for rabbit, hamster, and dog) is equal to 1. This is a common type of output for models with multiple classes (see <a href="https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/softmax">Softmax</a> for more information).
+
+### Ambiguous results
+
+Since the probabilities will always sum to 1, if the image is not confidently recognized as belonging to any of the classes the model was trained on you may see the probability distributed throughout the labels without any one value being significantly larger.
+
+For example, the following might indicate an ambiguous result:
+
+<table style="width: 40%;">
+  <thead>
+    <tr>
+      <th>Label</th>
+      <th>Probability</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>rabbit</td>
+      <td>0.31</td>
+    </tr>
+    <tr>
+      <td>hamster</td>
+      <td>0.35</td>
+    </tr>
+    <tr>
+      <td>dog</td>
+      <td>0.34</td>
+    </tr>
+  </tbody>
+</table>
+
+### Uses and limitations
+
+The image classification models that we provide are useful for single-label classification, which means predicting which single label the image is most likely to represent. They are trained to recognize 1000 classes of image. For a full list of classes, see the labels file.
+
+If you want to train a model to recognize new classes, see <a href="#customize_model">Customize model</a>.
+
+For the following use cases, you should use a different type of model:
+
+<ul>
+  <li>Predicting the type and position of one or more objects within an image (see <a href="object_detection">object detection</a>)</li>
+  <li>Predicting the composition of an image, for example subject versus background (see <a href="segmentation">segmentation</a>)</li>
+</ul>
+
+Once you have the starter model running on your target device, you can experiment with different models to find the optimal balance between performance, accuracy, and model size. For guidance, see <a href="#choose_a_different_model">Choose a different model</a>.
+
+## Choose a different model
+
+There are a large number of image classification models available on our List of hosted models. You should aim to choose the optimal model for your application based on performance, accuracy and model size. There are trade-offs between each of them.
+
+### Performance
+
+We measure performance in terms of the amount of time it takes for a model to run inference on a given piece of hardware. The less time, the faster the model.
+
+The performance you require depends on your application. Performance can be important for applications like real-time video, where it may be important to analyze each frame in the time before the next frame is drawn (e.g. inference must be faster than 33ms to perform real-time inference on a 30fps video stream).
+
+Our quantized Mobilenet models’ performance ranges from 3.7ms to 80.3 ms.
+
+### Accuracy
+We measure accuracy in terms of how often the model correctly classifies an image. For example, a model with a stated accuracy of 60% can be expected to classify an image correctly an average of 60% of the time.
+
+Our List of hosted models provides Top-1 and Top-5 accuracy statistics. Top-1 refers to how often the correct label appears as the label with the highest probability in the model’s output. Top-5 refers to how often the correct label appears in the top 5 highest probabilities in the model’s output.
+
+Our quantized Mobilenet models’ Top-5 accuracy ranges from 64.4 to 89.9%.
+
+### Size
+The size of a model on-disk varies with its performance and accuracy. Size may be important for mobile development (where it might impact app download sizes) or when working with hardware (where available storage might be limited).
+
+Our quantized Mobilenet models’ size ranges from 0.5 to 3.4 Mb.
+
+### Architecture
+There are several different architectures of models available on List of hosted models, indicated by the model’s name. For example, you can choose between Mobilenet, Inception, and others.
+
+The architecture of a model impacts its performance, accuracy, and size. All of our hosted models are trained on the same data, meaning you can use the provided statistics to compare them and choose which is optimal for your application.
+
+Note: The image classification models we provide accept varying sizes of input. For some models, this is indicated in the filename. For example, the Mobilenet_V1_1.0_224 model accepts an input of 224x224 pixels. <br /><br />All of the models require three color channels per pixel (red, green, and blue). Quantized models require 1 byte per channel, and float models require 4 bytes per channel.<br /><br />Our Android and iOS code samples demonstrate how to process full-sized camera images into the required format for each model.
+
+## Customize model
+The pre-trained models we provide are trained to recognize 1000 classes of image. For a full list of classes, see the labels file.
+
+You can use a technique known as transfer learning to re-train a model to recognize classes not in the original set. For example, you could re-train the model to distinguish between different species of tree, despite there being no trees in the original training data. To do this, you will need a set of training images for each of the new labels you wish to train.
+
+Learn how to perform transfer learning in the TensorFlow for Poets codelab.
+
+## Read more about this
+<ul>
+  <li>Blog post:</li>
+  <li>Image classification GitHub:</li>
+</ul>
diff --git a/tensorflow/lite/g3doc/models/images/audio.png b/tensorflow/lite/g3doc/models/images/audio.png
new file mode 100644
index 0000000000000000000000000000000000000000..ce6b25c442016a21600eb8249eafa55bacbba4e9
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/audio.png differ
diff --git a/tensorflow/lite/g3doc/models/images/blank.png b/tensorflow/lite/g3doc/models/images/blank.png
new file mode 100644
index 0000000000000000000000000000000000000000..d099da5da07271410883554e07e37765ca048590
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/blank.png differ
diff --git a/tensorflow/lite/g3doc/models/images/camera.png b/tensorflow/lite/g3doc/models/images/camera.png
new file mode 100644
index 0000000000000000000000000000000000000000..95a9218d47864aba12255bd32b67bb74b0d6704e
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/camera.png differ
diff --git a/tensorflow/lite/g3doc/models/images/detection.png b/tensorflow/lite/g3doc/models/images/detection.png
new file mode 100644
index 0000000000000000000000000000000000000000..30e10f59cd53af21fe9b6a86aa5b45ca07131b1b
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/detection.png differ
diff --git a/tensorflow/lite/g3doc/models/images/image.png b/tensorflow/lite/g3doc/models/images/image.png
new file mode 100644
index 0000000000000000000000000000000000000000..e72aac9b25eec69e8c0252f441d125340b88cab5
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/image.png differ
diff --git a/tensorflow/lite/g3doc/models/images/object.png b/tensorflow/lite/g3doc/models/images/object.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa8ed428ed15e7b166bdde560669563a224e6f6c
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/object.png differ
diff --git a/tensorflow/lite/g3doc/models/images/output_stride.png b/tensorflow/lite/g3doc/models/images/output_stride.png
new file mode 100644
index 0000000000000000000000000000000000000000..5d4663f8675eef733e18b2a5cb05670cd40d8293
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/output_stride.png differ
diff --git a/tensorflow/lite/g3doc/models/images/pose.png b/tensorflow/lite/g3doc/models/images/pose.png
new file mode 100644
index 0000000000000000000000000000000000000000..f071d789963d0f48efb5ba20633391403f75ddf8
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/pose.png differ
diff --git a/tensorflow/lite/g3doc/models/images/segmentation.png b/tensorflow/lite/g3doc/models/images/segmentation.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c61330687cc9a388a443bc6b771027d15b66d98
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/segmentation.png differ
diff --git a/tensorflow/lite/g3doc/models/images/sentiment.png b/tensorflow/lite/g3doc/models/images/sentiment.png
new file mode 100644
index 0000000000000000000000000000000000000000..2ba494fcb6e62a90015d2aead4779fcacab70529
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/sentiment.png differ
diff --git a/tensorflow/lite/g3doc/models/images/smart_reply.png b/tensorflow/lite/g3doc/models/images/smart_reply.png
new file mode 100644
index 0000000000000000000000000000000000000000..802cc80feebe2a46b059b23d52ccf794701e4d99
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/smart_reply.png differ
diff --git a/tensorflow/lite/g3doc/models/images/tabular.png b/tensorflow/lite/g3doc/models/images/tabular.png
new file mode 100644
index 0000000000000000000000000000000000000000..2eac8f4c4ac74029c755a207b1f8a25592f468ac
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/tabular.png differ
diff --git a/tensorflow/lite/g3doc/models/images/text.png b/tensorflow/lite/g3doc/models/images/text.png
new file mode 100644
index 0000000000000000000000000000000000000000..227594f07e3d38fd4110249eb2c4c6541fb89baa
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/text.png differ
diff --git a/tensorflow/lite/g3doc/models/images/tflite_models.png b/tensorflow/lite/g3doc/models/images/tflite_models.png
new file mode 100644
index 0000000000000000000000000000000000000000..f60cd26a3177f95e40875ed92aa4a30c59a7623f
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/tflite_models.png differ
diff --git a/tensorflow/lite/g3doc/models/images/video.png b/tensorflow/lite/g3doc/models/images/video.png
new file mode 100644
index 0000000000000000000000000000000000000000..88b3b7d3c76840625abec821220413a03d384a45
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/video.png differ
diff --git a/tensorflow/lite/g3doc/models/object_detection/overview.md b/tensorflow/lite/g3doc/models/object_detection/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f62d017bfd7dbfdfdeee9757b12775ede227a8c
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/object_detection/overview.md
@@ -0,0 +1,220 @@
+# Object detection
+<img src="../images/detection.png" class="attempt-right">
+
+Detect multiple objects with bounding boxes. Yes, dogs and cats too.
+
+<a class="button button-primary" href="http://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip">Download starter model and labels</a>
+
+## Tutorials (coming soon)
+<a class="button button-primary" href="">iOS</a>
+<a class="button button-primary" href="">Android</a>
+
+## What is object detection?
+Given an image or a video stream, an object detection model can identify which of a known set of objects might be present and provide information about their positions within the image.
+
+<!-- TODO -->
+For example, this screenshot of our <a href="">object detection sample app</a> shows how several objects have been recognized and their positions annotated:
+
+
+<!-- TODO -->
+TODO: Insert image
+
+An object detection model is trained to detect the presence and location of multiple classes of objects. For example, a model might be trained with images that contain various pieces of computer hardware, along with a label that specifies the class of hardware they represent (e.g. a laptop, a keyboard, or a monitor), and data specifying where each object appears in the image.
+
+When we subsequently provide an image to the model, it will output a list of the objects it detects, the location of a bounding box that contains each object, and a score that indicates the confidence that detection was correct.
+
+### Model output
+
+<table style="width: 60%;">
+  <thead>
+    <tr>
+      <th>Class</th>
+      <th>Score</th>
+      <th>Location</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Laptop</td>
+      <td>0.92</td>
+      <td>[18, 21, 57, 63]</td>
+    </tr>
+    <tr>
+      <td>Keyboard</td>
+      <td>0.88</td>
+      <td>[100, 30, 180, 150]</td>
+    </tr>
+    <tr>
+      <td>Monitor</td>
+      <td>0.87</td>
+      <td>[7, 82, 89, 163] </td>
+    </tr>
+    <tr>
+      <td>Keyboard</td>
+      <td>0.23</td>
+      <td>[42, 66, 57, 83]</td>
+    </tr>
+    <tr>
+      <td>Monitor</td>
+      <td>0.11</td>
+      <td>[6, 42, 31, 58]</td>
+    </tr>
+  </tbody>
+</table>
+
+### Confidence score
+
+To interpret these results, we can look at the score and the location for each detected object. The score is a number between 0 and 1 that indicates confidence that the object was genuinely detected. The closer the number is to 1, the more confident the model is.
+
+Depending on your application, you can decide a cut-off threshold below which you will discard detection results. For our example, we might decide a sensible cut-off is a score of 0.5 (meaning a 50% probability that the detection is valid). In that case, we would ignore the last two objects in the array, because those confidence scores are below 0.5:
+
+<table style="width: 60%;">
+  <thead>
+    <tr>
+      <th>Class</th>
+      <th>Score</th>
+      <th>Location</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Laptop</td>
+      <td>0.92</td>
+      <td>[18, 21, 57, 63]</td>
+    </tr>
+    <tr>
+      <td>Keyboard</td>
+      <td>0.88</td>
+      <td>[100, 30, 180, 150]</td>
+    </tr>
+    <tr>
+      <td>Monitor</td>
+      <td>0.87</td>
+      <td>[7, 82, 89, 163] </td>
+    </tr>
+    <tr>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">Keyboard</td>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">0.23</td>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">[42, 66, 57, 83]</td>
+    </tr>
+    <tr>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">Monitor</td>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">0.11</td>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">[6, 42, 31, 58]</td>
+    </tr>
+  </tbody>
+</table>
+
+The cut-off you use should be based on whether you are more comfortable with false positives (objects that are wrongly identified, or areas of the image that are erroneously identified as objects when they are not), or false negatives (genuine objects that are missed because their confidence was low).
+
+<!-- TODO -->
+TODO: Insert screenshot showing both
+
+### Location
+
+For each detected object, the model will return an array of four numbers representing a bounding rectangle that surrounds its position. The numbers are ordered as follows:
+
+<table style="width: 50%; margin: 0 auto;">
+  <tbody>
+    <tr style="border-top: none;">
+      <td>[</td>
+      <td>top,</td>
+      <td>left,</td>
+      <td>bottom,</td>
+      <td>right</td>
+      <td>]</td>
+    </tr>
+  </tbody>
+</table>
+
+The top value represents the distance of the rectangle’s top edge from the top of the image, in pixels. The left value represents the left edge’s distance from the left of the input image. The other values represent the bottom and right edges in a similar manner.
+
+<!-- TODO -->
+Note: Object detection models accept input images of a specific size. This is likely to be different from the size of the raw image captured by your device’s camera, and you will have to write code to crop and scale your raw image to fit the model’s input size (there are examples of this in our <a href="">sample code</a>).<br /><br />The pixel values output by the model refer to the position in the cropped and scaled image, so you must scale them to fit the raw image in order to interpret them correctly.
+
+
+### Uses and limitations
+
+<!-- TODO -->
+The object detection model we provide can identify and locate up to 10 objects in an image. It is trained to recognize 80 classes of object. For a full list of classes, see the labels file in the <a href="">model zip</a>.
+
+If you want to train a model to recognize new classes, see <a href="#customize_model">Customize model</a>.
+
+For the following use cases, you should use a different type of model:
+
+<ul>
+  <li>Predicting which single label the image most likely represents (see <a href="image_classification">image classification</a>)</li>
+  <li>Predicting the composition of an image, for example subject versus background (see <a href="segmentation">segmentation</a>)</li>
+</ul>
+
+Get started
+If you are new to TensorFlow Lite and are working with Android or iOS, we recommend following the corresponding tutorial that will walk you through our sample code.
+
+<!-- TODO -->
+<a class="button button-primary" href="">iOS</a>
+<a class="button button-primary" href="">Android</a>
+
+If you are using a platform other than Android or iOS, or you are already familiar with the <a href="../apis">TensorFlow Lite APIs</a>, you can download our starter object detection model and the accompanying labels.
+
+<a href="http://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip">Download starter model and labels</a>
+
+The model will return 10 detection results...
+
+## Starter model
+We recommend starting to implement object detection using the quantized COCO SSD MobileNet v1 model, available with labels from this download link:
+
+<a href="http://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip">Download starter model and labels</a>
+
+### Input
+The model takes an image as input. The expected image is 300x300 pixels, with three channels (red, blue, and green) per pixel. This should be fed to the model as a flattened buffer of 270,000 byte values (300x300x3). Since the model is <a href="">quantized</a>, each value should be a single byte representing a value between 0 and 255.
+
+### Output
+The model outputs four arrays, mapped to the indices 0-4. Arrays 0, 1, and 2 describe 10 detected objects, with one element in each array corresponding to each object. There will always be 10 objects detected.
+
+<table>
+  <thead>
+    <tr>
+      <th>Index</th>
+      <th>Name</th>
+      <th>Description</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>0</td>
+      <td>Locations</td>
+      <td>Multidimensional array of [10][4] floating point values between 0 and 1, the inner arrays representing bounding boxes in the form [top, left, bottom, right]</td>
+    </tr>
+    <tr>
+      <td>1</td>
+      <td>Classes</td>
+      <td>Array of 10 integers (output as floating point values) each indicating the index of a class label from the labels file</td>
+    </tr>
+    <tr>
+      <td>2</td>
+      <td>Scores</td>
+      <td>Array of 10 floating point values between 0 and 1 representing probability that a class was detected</td>
+    </tr>
+    <tr>
+      <td>3</td>
+      <td>Number and detections</td>
+      <td>Array of length 1 containing a floating point value expressing the total number of detection results</td>
+    </tr>
+  </tbody>
+</table>
+
+## Customize model
+
+<!-- TODO -->
+The pre-trained models we provide are trained to detect 80 classes of object. For a full list of classes, see the labels file in the <a href="">model zip</a>.
+
+You can use a technique known as transfer learning to re-train a model to recognize classes not in the original set. For example, you could re-train the model to detect multiple types of vegetable, despite there only being one vegetable in the original training data. To do this, you will need a set of training images for each of the new labels you wish to train.
+
+Learn how to perform transfer learning in the <a href="https://medium.com/tensorflow/training-and-serving-a-realtime-mobile-object-detector-in-30-minutes-with-cloud-tpus-b78971cf1193">Training and serving a real-time mobile object detector in 30 minutes</a> blog post.
+
+<!-- TODO -->
+Read more about this
+<ul>
+  <li>Blog post:</li>
+  <li>Object detection GitHub:</li>
+</ul>
diff --git a/tensorflow/lite/g3doc/models/pose_estimation/overview.md b/tensorflow/lite/g3doc/models/pose_estimation/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..f19a5a10edb7d2ea89ebe5f1689cf95ba9a90bec
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/pose_estimation/overview.md
@@ -0,0 +1,128 @@
+# Pose estimation
+<img src="../images/pose.png" class="attempt-right" />
+
+<i>PoseNet</i> is a vision model that can be used to estimate the pose of a person in an image/video by estimating where key body joints are.
+
+<a class="button button-primary" href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/multi_person_mobilenet_v1_075_float.tflite">Download starter model</a>
+
+## Tutorials (coming soon)
+<a class="button button-primary" href="">iOS</a>
+<a class="button button-primary" href="">Android</a>
+
+## How it works
+Pose estimation refers to computer vision techniques that detect human figures in images and videos, so that one could determine, for example, where someone’s elbow shows up in an image.
+
+To be clear, this technology is not recognizing who is in an image — there is no personal identifiable information associated to pose detection. The algorithm is simply estimating where key body joints are.
+
+The key points detected are indexed by part id with a confidence score between 0.0 and 1.0; 1.0 being the highest.
+
+<table style="width: 30%;">
+  <thead>
+    <tr>
+      <th>Id</th>
+      <th>Part</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>0</td>
+      <td>nose</td>
+    </tr>
+    <tr>
+      <td>1</td>
+      <td>leftEye</td>
+    </tr>
+    <tr>
+      <td>2</td>
+      <td>rightEye</td>
+    </tr>
+    <tr>
+      <td>3</td>
+      <td>leftEar</td>
+    </tr>
+    <tr>
+      <td>4</td>
+      <td>rightEar</td>
+    </tr>
+    <tr>
+      <td>5</td>
+      <td>leftShoulder</td>
+    </tr>
+    <tr>
+      <td>6</td>
+      <td>rightShoulder</td>
+    </tr>
+    <tr>
+      <td>7</td>
+      <td>leftElbow</td>
+    </tr>
+    <tr>
+      <td>8</td>
+      <td>rightElbow</td>
+    </tr>
+    <tr>
+      <td>9</td>
+      <td>leftWrist</td>
+    </tr>
+    <tr>
+      <td>10</td>
+      <td>rightWrist</td>
+    </tr>
+    <tr>
+      <td>11</td>
+      <td>leftHip</td>
+    </tr>
+    <tr>
+      <td>12</td>
+      <td>rightHip</td>
+    </tr>
+    <tr>
+      <td>13</td>
+      <td>leftKnee</td>
+    </tr>
+    <tr>
+      <td>14</td>
+      <td>rightKnee</td>
+    </tr>
+    <tr>
+      <td>15</td>
+      <td>leftAnkle</td>
+    </tr>
+    <tr>
+      <td>16</td>
+      <td>rightAnkle</td>
+    </tr>
+  </tbody>
+</table>
+
+## Example output
+<img src="https://www.tensorflow.org/images/models/pose_estimation.gif" />
+
+## Get started
+Android and iOS end-to-end tutorials are coming soon. In the meantime, if you want to experiment this on a web browser, check out the TensorFlow.js <a href="https://github.com/tensorflow/tfjs-models/tree/master/posenet">GitHub repository</a>.
+
+
+## How it performs
+Performance varies based on your device and output stride (heatmaps and offset vectors). The PoseNet model is image size invariant, which means it can predict pose positions in the same scale as the original image regardless of whether the image is downscaled. This means PoseNet can be configured to have a higher accuracy at the expense of performance.
+
+The output stride determines how much we’re scaling down the output relative to the input image size. It affects the size of the layers and the model outputs. The higher the output stride, the smaller the resolution of layers in the network and the outputs, and correspondingly their accuracy. In this implementation, the output stride can have values of 8, 16, or 32. In other words, an output stride of 32 will result in the fastest performance but lowest accuracy, while 8 will result in the highest accuracy but slowest performance. We recommend starting with 16.
+
+<img src="../images/models/output_stride.png" >
+<span style="font-size: 0.8em">The output stride determines how much we’re scaling down the output relative to the input image size. A higher output stride is faster but results in lower accuracy.</span>
+
+## Read more about this
+<ul>
+  <li><a href="">Blog post: Real-time Human Pose Estimation in the Browser with TensorFlow.js</a></li>
+  <li><a href="">TF.js GitHub: Pose Detection in the Browser: PoseNet Model</a></li>
+</ul>
+
+## Users
+<ul>
+  <li><a href="">‘PomPom Mirror’</a></li>
+  <li><a href="">Amazing Art Installation Turns You Into A Bird | Chris Milk "The Treachery of Sanctuary"</a></li>
+  <li><a href="">Puppet Parade - Interactive Kinect Puppets</a></li>
+  <li><a href="">Messa di Voce (Performance), Excerpts</a></li>
+  <li><a href="">Augmented reality</a></li>
+  <li><a href="">Interactive animation</a></li>
+  <li><a href="">Gait analysis</a></li>
+</ul>
diff --git a/tensorflow/lite/g3doc/models/segmentation/images/segmentation.gif b/tensorflow/lite/g3doc/models/segmentation/images/segmentation.gif
new file mode 100644
index 0000000000000000000000000000000000000000..e664adf700b396b9cd06d48378b782efbbca4282
Binary files /dev/null and b/tensorflow/lite/g3doc/models/segmentation/images/segmentation.gif differ
diff --git a/tensorflow/lite/g3doc/models/segmentation/overview.md b/tensorflow/lite/g3doc/models/segmentation/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..a1f1cf1aa69279dac9a17a810e931786c7d5273d
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/segmentation/overview.md
@@ -0,0 +1,32 @@
+# Segmentation (GPU)
+<img src="../images/segmentation.png" class="attempt-right" />
+
+<i>DeepLab</i> is a state-of-art deep learning model for semantic image segmentation, where the goal is to assign semantic labels (e.g., person, dog, cat and so on) to every pixel in the input image.
+
+<a class="button button-primary" href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/deeplabv3_257_mv_gpu.tflite">Download starter model</a>
+
+## Tutorials (coming soon)
+<a class="button button-primary" href="">iOS</a>
+<a class="button button-primary" href="">Android</a>
+
+## How it works
+It all started with classification where the model predicts an entire input. With advances in data, hardware, and software, object detection can infer objects with spatial location. Semantic segmentation offers the highest level of granularity with labels at a pixel level.
+
+Current implementation includes the following features:
+<ol>
+  <li>DeepLabv1: We use atrous convolution to explicitly control the resolution at which feature responses are computed within Deep Convolutional Neural Networks.</li>
+  <li>DeepLabv2: We use atrous spatial pyramid pooling (ASPP) to robustly segment objects at multiple scales with filters at multiple sampling rates and effective fields-of-views.</li>
+  <li>DeepLabv3: We augment the ASPP module with image-level feature [5, 6] to capture longer range information. We also include batch normalization [7] parameters to facilitate the training. In particular, we applying atrous convolution to extract output features at different output strides during training and evaluation, which efficiently enables training BN at output stride = 16 and attains a high performance at output stride = 8 during evaluation.</li>
+  <li>DeepLabv3+: We extend DeepLabv3 to include a simple yet effective decoder module to refine the segmentation results especially along object boundaries. Furthermore, in this encoder-decoder structure one can arbitrarily control the resolution of extracted encoder features by atrous convolution to trade-off precision and runtime.</li>
+</ol>
+
+## Example output
+The model will create a mask over the target objects with high accuracy.
+<img src="images/segmentation.gif" />
+
+## Read more about this
+<ul>
+  <li>Blog post: <a href="https://ai.googleblog.com/2018/03/semantic-image-segmentation-with.html">Semantic Image Segmentation with DeepLab in TensorFlow</a></li>
+  <li><a href="https://medium.com/tensorflow/tensorflow-lite-now-faster-with-mobile-gpus-developer-preview-e15797e6dee7">Blog post: TensorFlow Lite Now Faster with Mobile GPUs (Developer Preview)</a></li>
+  <li><a href="https://github.com/tensorflow/models/tree/master/research/deeplab">DeepLab GitHub: DeepLab: Deep Labelling for Semantic Image Segmentation</a></li>
+</ul>
diff --git a/tensorflow/lite/g3doc/models/smart_reply/images/smart_reply.gif b/tensorflow/lite/g3doc/models/smart_reply/images/smart_reply.gif
new file mode 100644
index 0000000000000000000000000000000000000000..4a61691fd8714102409d290e7f6d6e361d9cbf13
Binary files /dev/null and b/tensorflow/lite/g3doc/models/smart_reply/images/smart_reply.gif differ
diff --git a/tensorflow/lite/g3doc/models/smart_reply/overview.md b/tensorflow/lite/g3doc/models/smart_reply/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..c35a5f2642519decc3a02e29c9da08241c6f915e
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/smart_reply/overview.md
@@ -0,0 +1,40 @@
+# Smart reply
+<img src="../images/smart_reply.png" class="attempt-right" />
+
+Smart replies are contextually relevant, one-touch responses that help the user to reply to an incoming text message (or email) efficiently and effortlessly.
+
+<a class="button button-primary" href="http://download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip">Download starter model and labels</a>
+
+## Tutorials (coming soon)
+<a class="button button-primary" href="">iOS</a>
+<a class="button button-primary" href="">Android</a>
+
+## How it works
+The model generates reply suggestions to input conversational chat messages with an efficient inference that can be easily be plugged in to your chat application to power on-device conversational intelligence.
+
+The on-device model comes with several benefits. It is:
+<ul>
+  <li>Faster: The model resides on the device and does not require internet connectivity. Thus, the inference is very fast and has an average latency of only a few milliseconds.</li>
+  <li>Resource efficient: The model has a small memory footprint on the device.</li>
+  <li>Privacy-friendly: The user data never leaves the device and this eliminates any privacy restrictions.</li>
+</ul>
+
+## Example output
+<img src="images/smart_reply.gif" />
+
+## How to use this model?
+We have provided a pre-built demo APK that you can download, install, and test on your phone. Go to the <a href="https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/smartreply/g3doc">GitHub page</a> for instructions and list of support ops and functionalities.
+
+## Read more about this
+<ul>
+  <li><a href="https://arxiv.org/pdf/1708.00630.pdf">Research paper</a></li>
+  <li><a href="https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/smartreply/">Source code</a></li>
+</ul>
+
+## Users
+<ul>
+  <li><a href="https://www.blog.google/products/gmail/save-time-with-smart-reply-in-gmail/">Gmail</a></li>
+  <li><a href="https://www.blog.google/products/gmail/computer-respond-to-this-email/">Inbox</a></li>
+  <li><a href="https://blog.google/products/allo/google-allo-smarter-messaging-app/">Allo</a></li>
+  <li><a href="https://research.googleblog.com/2017/02/on-device-machine-intelligence.html">Smart Replies on Android Wear</a></li>
+</ul>
diff --git a/tensorflow/lite/g3doc/models/speech_recognition/overview.md b/tensorflow/lite/g3doc/models/speech_recognition/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..5d6d855db36e8a5b433d038136399da0e0f64213
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/speech_recognition/overview.md
@@ -0,0 +1,14 @@
+# Speech recognition
+
+<img src="../images/audio.png" class="attempt-right">
+
+Recognize audio keywords!
+
+<a class="button button-primary" href="">Download starter model</a>
+
+## Tutorials (coming soon)
+<a class="button button-primary" href="">iOS</a>
+<a class="button button-primary" href="">Android</a>
+
+## What is speech recognition?
+Coming soon.
diff --git a/tensorflow/lite/g3doc/tf_ops_compatibility.md b/tensorflow/lite/g3doc/tf_ops_compatibility.md
index cff4afc250852e58e29c7a6f4dc57a2ef1158f16..ff8ddabdf43f8d6f01da9a3e4db1d083278687fe 100644
--- a/tensorflow/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/lite/g3doc/tf_ops_compatibility.md
@@ -165,6 +165,17 @@ Options {
 }
 ```
 
+**ADD_N**
+
+```
+Inputs {
+  0-N: any number of tensors (must have same size and shape)
+}
+Outputs {
+  0: elementwise sum of the input tensors
+}
+```
+
 **ARG_MAX**
 
 ```
@@ -291,6 +302,17 @@ Options {
 }
 ```
 
+**ELU**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to exp(features) - 1 if < 0, features otherwise.
+}
+```
+
 **EQUAL**
 
 ```
@@ -403,6 +425,18 @@ Outputs {
 }
 ```
 
+**GATHER_ND**
+
+```
+Inputs {
+  0: params tensor
+  1: indices tensor
+}
+Outputs {
+  0: a tensor with same type as the params tensor.
+}
+```
+
 **GREATER**
 
 ```
@@ -702,6 +736,17 @@ Options {
 }
 ```
 
+**RANK**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a 0-D int32 Tensor representing the rank of input
+}
+```
+
 **RELU**
 
 ```
@@ -989,6 +1034,22 @@ Outputs {
 }
 ```
 
+**WHERE**
+
+```
+Inputs {
+  0: A tensor of type bool.
+  1: A tensor which may have the same shape as condition. If condition is rank
+     1, x may have higher rank, but its first dimension must match the size of
+     condition.
+  2: A tensor with the same shape and type as x.
+}
+Outputs {
+  0: A tensor with the same type and shape as x, y if they are non-None, or
+     a tensor with shape (num_true, dim_size(condition)).
+}
+```
+
 **ZEROS_LIKE**
 
 ```
diff --git a/tensorflow/lite/g3doc/using_select_tf_ops.md b/tensorflow/lite/g3doc/using_select_tf_ops.md
index 1c5f23b7bb6bf1acdc4903fc51e8e2c82427e634..7c1ad20e1b978f5a05a7e366ce836d0ac60b860c 100644
--- a/tensorflow/lite/g3doc/using_select_tf_ops.md
+++ b/tensorflow/lite/g3doc/using_select_tf_ops.md
@@ -49,7 +49,7 @@ partially supported by TensorFlow Lite, and one would like to avoid those
 limitations.
 
 The following example shows how to use `target_ops` in the
-[`TFLiteConverter`](./convert/python_api) Python API.
+[`TFLiteConverter`](./convert/python_api.md) Python API.
 
 ```
 import tensorflow as tf
@@ -150,8 +150,8 @@ TensorFlow Lite XCode project with support for select TensorFlow ops has been
 added to
 `tensorflow/lite/examples/ios/camera/tflite_camera_example_with_select_tf_ops.xcodeproj`.
 
-To use this feature in a your own project, either clone the example project or
-set the project settings for a new or existing project to the following:
+To use this feature in your own project, either clone the example project or set
+the project settings for a new or existing project to the following:
 
 *   In Build Phases -> Link Binary With Libraries, add the static libraries
     under `tensorflow/contrib/makefile/gen/lib/` directory:
diff --git a/tensorflow/lite/graph_info.cc b/tensorflow/lite/graph_info.cc
index 1cec0d0c290679c7755cbf84858317489c0ba159..a9091924c064341316d788704daa643bc8e247b5 100644
--- a/tensorflow/lite/graph_info.cc
+++ b/tensorflow/lite/graph_info.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/graph_info.h"
 #include <algorithm>
+#include "tensorflow/lite/c/c_api_internal.h"
 
 namespace tflite {
 
@@ -94,6 +95,10 @@ class PartitionGraphIntoIndependentNodeSubsetsImpl {
     // been identified.
     for (int output_index : info_->outputs()) {
       int output_epoch = tensor_epochs_[output_index];
+      if (output_epoch == kEpochAlwaysReady) {
+        // This happens when an input of subgraph is also an output of subgraph.
+        continue;
+      }
       NodeSubset& output_subset = (*node_subsets_)[output_epoch];
       output_subset.output_tensors.push_back(output_index);
     }
@@ -138,7 +143,8 @@ class PartitionGraphIntoIndependentNodeSubsetsImpl {
     // See if all dependencies of this node are already assigned to a
     // node sub set.
     for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
-      if (tensor_epochs_[input_tensor_index] == kEpochNotReady) {
+      if (input_tensor_index != kOptionalTensor &&
+          tensor_epochs_[input_tensor_index] == kEpochNotReady) {
         return false;
       }
     }
@@ -162,6 +168,9 @@ class PartitionGraphIntoIndependentNodeSubsetsImpl {
       // Look at our inputs one more time to update that tensor's
       // epochs' outputs
       for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
+        if (input_tensor_index == kOptionalTensor) {
+          continue;
+        }
         int input_epoch = tensor_epochs_[input_tensor_index];
         int node_epoch = current_epoch;
         if (input_epoch != node_epoch) {
diff --git a/tensorflow/lite/graph_info_test.cc b/tensorflow/lite/graph_info_test.cc
index 4d8bbdc0eef49b3f79b3c74c1d07fd86467e1d65..b72728a9a9c94c4ee7312e5ff6f17e7b4d3b8a95 100644
--- a/tensorflow/lite/graph_info_test.cc
+++ b/tensorflow/lite/graph_info_test.cc
@@ -101,7 +101,7 @@ void CheckPartitionSubgraphs(
 }
 
 // Test an empty trivial graph with no partitions.
-TEST(PartitionTest, Nodes0_PartitionNodes0) {
+TEST(PartitionTest, Nodes0PartitionNodes0) {
   SimpleTestGraph graph;
   std::vector<int> nodes_to_partition = {};
   std::vector<NodeSubset> generated_subgraphs;
@@ -109,6 +109,20 @@ TEST(PartitionTest, Nodes0_PartitionNodes0) {
   CheckPartitionSubgraphs(generated_subgraphs, {});
 }
 
+// Test a trivial graph with no node and only 1 tensor.
+// The tensor is input & output of the graph at the same time.
+// Note: This is a regression test to ensure the partitioning logic
+// handles this case without crashing.
+TEST(PartitionTest, Nodes0PartitionNodes0Tensors1) {
+  SimpleTestGraph graph;
+  graph.AddTensors(1);
+  graph.SetInputsAndOutputs({0}, {0});
+  std::vector<int> nodes_to_partition = {};
+  std::vector<NodeSubset> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+  CheckPartitionSubgraphs(generated_subgraphs, {});
+}
+
 // Test a 1 node graph with no partitions.
 // Input: tensor(0) -> node(0) -> tensor(1), nodes_to_partition=[]
 // Output: [kTfNoPartition, tensor(0) -> node(0) -> tensor(1)]
diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc
index c840c9a759c63fb165a6be91e3c768a4618eba98..75a23deb1441e47ed5780209e994a921aefc4585 100644
--- a/tensorflow/lite/interpreter.cc
+++ b/tensorflow/lite/interpreter.cc
@@ -223,8 +223,6 @@ void Interpreter::SetCancellationFunction(void* data,
 }
 
 TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
-  // TODO(ycling): It seems Flex delegate doesn't work on non-primary subgraphs.
-  // Need to investigate.
   for (auto& subgraph : subgraphs_) {
     TF_LITE_ENSURE_OK(context_, subgraph->ModifyGraphWithDelegate(delegate));
   }
diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
index 725bb326ba1d6a9d9c206cd4fb01bdf687b0a79c..16cca45f388953e0616dd0b1b4c24114e5a6108b 100644
--- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
@@ -107,7 +107,7 @@ public final class Tensor {
       throw new IllegalArgumentException(
           "Null inputs are allowed only if the Tensor is bound to a buffer handle.");
     }
-    throwExceptionIfTypeIsIncompatible(src);
+    throwIfDataIsIncompatible(src);
     if (isByteBuffer(src)) {
       ByteBuffer srcBuffer = (ByteBuffer) src;
       // For direct ByteBuffer instances we support zero-copy. Note that this assumes the caller
@@ -138,7 +138,7 @@ public final class Tensor {
       throw new IllegalArgumentException(
           "Null outputs are allowed only if the Tensor is bound to a buffer handle.");
     }
-    throwExceptionIfTypeIsIncompatible(dst);
+    throwIfDataIsIncompatible(dst);
     if (dst instanceof ByteBuffer) {
       ByteBuffer dstByteBuffer = (ByteBuffer) dst;
       dstByteBuffer.put(buffer());
@@ -159,6 +159,7 @@ public final class Tensor {
     if (isByteBuffer(input)) {
       return null;
     }
+    throwIfTypeIsIncompatible(input);
     int[] inputShape = computeShapeOf(input);
     if (Arrays.equals(shapeCopy, inputShape)) {
       return null;
@@ -243,16 +244,14 @@ public final class Tensor {
     }
   }
 
-  private void throwExceptionIfTypeIsIncompatible(Object o) {
+  private void throwIfDataIsIncompatible(Object o) {
+    throwIfTypeIsIncompatible(o);
+    throwIfShapeIsIncompatible(o);
+  }
+
+  private void throwIfTypeIsIncompatible(Object o) {
+    // ByteBuffer payloads can map to any type, so exempt it from the check.
     if (isByteBuffer(o)) {
-      ByteBuffer oBuffer = (ByteBuffer) o;
-      if (oBuffer.capacity() != numBytes()) {
-        throw new IllegalArgumentException(
-            String.format(
-                "Cannot convert between a TensorFlowLite buffer with %d bytes and a "
-                    + "ByteBuffer with %d bytes.",
-                numBytes(), oBuffer.capacity()));
-      }
       return;
     }
     DataType oType = dataTypeOf(o);
@@ -263,7 +262,20 @@ public final class Tensor {
                   + "object of type %s (which is compatible with the TensorFlowLite type %s).",
               dtype, o.getClass().getName(), oType));
     }
+  }
 
+  private void throwIfShapeIsIncompatible(Object o) {
+    if (isByteBuffer(o)) {
+      ByteBuffer oBuffer = (ByteBuffer) o;
+      if (oBuffer.capacity() != numBytes()) {
+        throw new IllegalArgumentException(
+            String.format(
+                "Cannot convert between a TensorFlowLite buffer with %d bytes and a "
+                    + "ByteBuffer with %d bytes.",
+                numBytes(), oBuffer.capacity()));
+      }
+      return;
+    }
     int[] oShape = computeShapeOf(o);
     if (!Arrays.equals(oShape, shapeCopy)) {
       throw new IllegalArgumentException(
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
index c5496e3a21e7f5d27c36d92e49dd6c8e622b0070..ff3325633c1d71a950682764b6d1576e3b75ed6a 100644
--- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
@@ -21,6 +21,7 @@ import static org.junit.Assert.fail;
 import java.io.File;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
+import java.nio.FloatBuffer;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
@@ -247,6 +248,18 @@ public final class InterpreterTest {
     interpreter.close();
   }
 
+  @Test
+  public void testRunWithUnsupportedInputType() {
+    FloatBuffer floatBuffer = FloatBuffer.allocate(10);
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    try (Interpreter interpreter = new Interpreter(MODEL_FILE)) {
+      interpreter.run(floatBuffer, parsedOutputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("DataType error: cannot resolve DataType of");
+    }
+  }
+
   @Test
   public void testRunWithWrongOutputType() {
     Interpreter interpreter = new Interpreter(MODEL_FILE);
diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index 826bb9a2585e59abd3fc0f7516d0195e5ee7a1bb..c24b6ede630a11bedbd471281997de6624e3d9de 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -152,6 +152,7 @@ cc_library(
     srcs = [
         "activations.cc",
         "add.cc",
+        "add_n.cc",
         "arg_min_max.cc",
         "audio_spectrogram.cc",
         "basic_rnn.cc",
@@ -179,6 +180,7 @@ cc_library(
         "floor_mod.cc",
         "fully_connected.cc",
         "gather.cc",
+        "gather_nd.cc",
         "hashtable_lookup.cc",
         "if.cc",
         "l2norm.cc",
@@ -197,6 +199,7 @@ cc_library(
         "pooling.cc",
         "pow.cc",
         "range.cc",
+        "rank.cc",
         "reduce.cc",
         "reshape.cc",
         "resize_bilinear.cc",
@@ -224,6 +227,8 @@ cc_library(
         "unidirectional_sequence_rnn.cc",
         "unique.cc",
         "unpack.cc",
+        "where.cc",
+        "while.cc",
         "zeros_like.cc",
     ],
     hdrs = [
@@ -355,6 +360,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "add_n_test",
+    size = "small",
+    srcs = ["add_n_test.cc"],
+    deps = [
+        ":builtin_ops",
+        ":test_util",
+        "//tensorflow/lite:framework",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "arg_min_max_test",
     size = "small",
@@ -702,6 +719,19 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "gather_nd_test",
+    size = "small",
+    srcs = ["gather_nd_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "topk_v2_test",
     size = "small",
@@ -1068,6 +1098,19 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "rank_test",
+    size = "small",
+    srcs = ["rank_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "pow_test",
     size = "small",
@@ -1145,6 +1188,19 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "where_test",
+    size = "small",
+    srcs = ["where_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "zeros_like_test",
     size = "small",
@@ -1205,6 +1261,24 @@ tf_cc_test(
     deps = [
         ":builtin_ops",
         ":kernel_util",
+        ":subgraph_test_util",
+        ":test_util",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "while_test",
+    size = "small",
+    srcs = ["while_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        ":kernel_util",
+        ":subgraph_test_util",
         "//tensorflow/lite:builtin_op_data",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/kernels:test_util",
@@ -1272,3 +1346,31 @@ tf_cc_test(
         "@com_google_googletest//:gtest_main",
     ],
 )
+
+cc_library(
+    name = "subgraph_test_util",
+    testonly = 1,
+    srcs = ["subgraph_test_util.cc"],
+    hdrs = ["subgraph_test_util.h"],
+    deps = [
+        ":builtin_ops",
+        ":kernel_util",
+        ":test_util",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "subgraph_test_util_test",
+    size = "small",
+    srcs = ["subgraph_test_util_test.cc"],
+    deps = [
+        ":subgraph_test_util",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index 4463a6c5a65bf848ad68635717750d3a214dd0a0..930eabaeccfde5c9fce824a58d28d14783dde419 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -23,7 +23,10 @@ limitations under the License.
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h"
 #include "tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
@@ -60,9 +63,9 @@ namespace {
 TfLiteStatus CheckOutputQuantParams(TfLiteContext* context,
                                     const TfLiteTensor* input,
                                     const TfLiteTensor* output) {
+  TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
   if (input->type == kTfLiteUInt8) {
     TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
-    TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
   } else {
     TF_LITE_ENSURE_EQ(context, output->params.zero_point, -128);
   }
@@ -118,7 +121,7 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
-  if (input->type == kTfLiteUInt8) {
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
     static constexpr int kInputIntegerBits = 4;
 
     const double input_real_multiplier =
@@ -177,8 +180,15 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
-  if (input->type == kTfLiteUInt8) {
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
+    if (input->type == kTfLiteUInt8) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point,
+                        std::numeric_limits<uint8_t>::min());
+    }
+    if (input->type == kTfLiteInt8) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point,
+                        std::numeric_limits<int8_t>::min());
+    }
     TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
 
     static constexpr int kInputIntegerBits = 4;
@@ -261,8 +271,13 @@ TfLiteStatus LogSoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
-  if (input->type == kTfLiteUInt8) {
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 255);
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
+    if (input->type == kTfLiteUInt8) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 255);
+    }
+    if (input->type == kTfLiteInt8) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 127);
+    }
     TF_LITE_ENSURE_EQ(context, output->params.scale, 16.0 / 256);
 
     static const double kBeta = 1.0;
@@ -353,6 +368,24 @@ TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
   }
 }
 
+namespace {
+template <typename T>
+void QuantizedRelu6(const TfLiteTensor* input, TfLiteTensor* output) {
+  ActivationParams params;
+  params.activation_type = FusedActivationFunctionType::kRelu6;
+  params.quantized_activation_min =
+      std::max(static_cast<int32_t>(std::numeric_limits<T>::min()),
+               output->params.zero_point +
+                   static_cast<int32>(roundf(0.f / output->params.scale)));
+  params.quantized_activation_max =
+      std::min(static_cast<int32_t>(std::numeric_limits<T>::max()),
+               output->params.zero_point +
+                   static_cast<int32>(roundf(6.f / output->params.scale)));
+  optimized_ops::ReluX(params, GetTensorShape(input), GetTensorData<T>(input),
+                       GetTensorShape(output), GetTensorData<T>(output));
+}
+}  // namespace
+
 TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
@@ -365,23 +398,16 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
       for (; in < in_end; in++, out++) *out = std::min(std::max(0.f, *in), 6.f);
       return kTfLiteOk;
     } break;
-    case kTfLiteUInt8: {
-      ActivationParams params;
-      params.activation_type = FusedActivationFunctionType::kRelu6;
-      params.quantized_activation_min = std::max(
-          0, output->params.zero_point +
-                 static_cast<int32>(roundf(0.f / output->params.scale)));
-      params.quantized_activation_max = std::min(
-          255, output->params.zero_point +
-                   static_cast<int32>(roundf(6.f / output->params.scale)));
-      optimized_ops::ReluX(params, GetTensorShape(input),
-                           GetTensorData<uint8>(input), GetTensorShape(output),
-                           GetTensorData<uint8>(output));
+    case kTfLiteUInt8:
+      QuantizedRelu6<uint8_t>(input, output);
+      return kTfLiteOk;
+    case kTfLiteInt8: {
+      QuantizedRelu6<int8_t>(input, output);
       return kTfLiteOk;
     } break;
     default:
       context->ReportError(
-          context, "Only float32 and uint8 supported currently, got %s.",
+          context, "Only float32, uint8 and int8 supported currently, got %s.",
           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
@@ -436,6 +462,16 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       }
       return kTfLiteOk;
     } break;
+    case kTfLiteInt8: {
+      const auto input_shape = GetTensorShape(input);
+      const auto output_shape = GetTensorShape(output);
+      const int size = MatchingFlatSize(input_shape, output_shape);
+      reference_integer_ops::Tanh(
+          input->params.zero_point, data->input_range_radius,
+          data->input_multiplier, data->input_left_shift, size,
+          GetTensorData<int8_t>(input), GetTensorData<int8_t>(output));
+      return kTfLiteOk;
+    } break;
     default:
       context->ReportError(context, "Only float32 supported currently, got %s.",
                            TfLiteTypeGetName(input->type));
@@ -493,6 +529,15 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
       }
       break;
     }
+    case kTfLiteInt8: {
+      const int input_size =
+          MatchingFlatSize(GetTensorShape(input), GetTensorShape(output));
+      reference_integer_ops::Logistic(
+          input->params.zero_point, data->input_range_radius,
+          data->input_multiplier, data->input_left_shift, input_size,
+          GetTensorData<int8_t>(input), GetTensorData<int8_t>(output));
+      break;
+    }
     default:
       context->ReportError(context, "Only float32 supported currently, got %s.",
                            TfLiteTypeGetName(input->type));
@@ -815,6 +860,21 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
       }
       return kTfLiteOk;
     }
+    case kTfLiteInt8: {
+      const auto input_shape = GetTensorShape(input);
+      const auto output_shape = GetTensorShape(output);
+      const int trailing_dim = input_shape.DimensionsCount() - 1;
+      const int outer_size =
+          MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+      const int depth =
+          MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+      reference_integer_ops::LogSoftmax(
+          data->input_multiplier, data->input_left_shift,
+          data->reverse_scaling_divisor, data->reverse_scaling_right_shift,
+          data->diff_min, outer_size, depth, GetTensorData<int8_t>(input),
+          GetTensorData<int8_t>(output));
+      return kTfLiteOk;
+    }
     default:
       context->ReportError(context, "Only float32 supported currently., got %s",
                            TfLiteTypeGetName(input->type));
@@ -884,8 +944,31 @@ TfLiteStatus LeakyReluEval(TfLiteContext* context, TfLiteNode* node) {
   }
 }
 
+TfLiteStatus EluEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      optimized_ops::Elu(GetTensorShape(input), GetTensorData<float>(input),
+                         GetTensorShape(output), GetTensorData<float>(output));
+      return kTfLiteOk;
+    } break;
+    default:
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+}
+
 }  // namespace activations
 
+TfLiteRegistration* Register_ELU() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 activations::GenericPrepare,
+                                 activations::EluEval};
+  return &r;
+}
+
 TfLiteRegistration* Register_RELU() {
   static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
                                  activations::GenericPrepare,
diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index 5e3c56ed5bf7092581fbbced6d3735958c19580c..25b17a9678728f0ee82ccf22e2a5b63eee2c3537 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -32,6 +32,8 @@ class BaseActivationsOpModel : public SingleOpModel {
     input_ = AddInput(input);
     if (input.type == TensorType_UINT8) {
       output_ = AddOutput({input.type, {}, 0, 0, 1. / 256});
+    } else if (input.type == TensorType_INT8) {
+      output_ = AddOutput({input.type, {}, 0, 0, 1. / 256, -128});
     } else {
       output_ = AddOutput({input.type, {}});
     }
@@ -116,6 +118,20 @@ class QuantizedActivationsOpModel : public BaseActivationsOpModel {
   }
 };
 
+TEST(FloatActivationsOpTest, Elu) {
+  FloatActivationsOpModel m(BuiltinOperator_ELU,
+                            /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+  m.SetInput({
+      0, -6, 2, -4,     //
+      3, -2, 10, -0.1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 0.0, -0.997521, 2.0, -0.981684,    //
+                                 3.0, -0.864665, 10.0, -0.0951626,  //
+                             })));
+}
+
 TEST(FloatActivationsOpTest, Relu) {
   FloatActivationsOpModel m(BuiltinOperator_RELU,
                             /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
@@ -172,7 +188,7 @@ TEST(FloatActivationsOpTest, Tanh) {
                              })));
 }
 
-TEST(QuantizedActivationsOpTest, Relu6) {
+TEST(QuantizedActivationsOpTest, Relu6Uint8) {
   const float kMin = -1;
   const float kMax = 127.f / 128.f;
   QuantizedActivationsOpModel m(
@@ -195,7 +211,29 @@ TEST(QuantizedActivationsOpTest, Relu6) {
               ElementsAreArray({128, 128, 160, 192, 176, 128, 224, 144}));
 }
 
-TEST(QuantizedActivationsOpTest, Tanh) {
+TEST(QuantizedActivationsOpTest, Relu6Int8) {
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_RELU6,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
+      /*output=*/{TensorType_INT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(), ElementsAreArray(ArrayFloatNear(
+                                                    {
+                                                        0, 0, 2, 4,  //
+                                                        3, 0, 6, 1,  //
+                                                    },
+                                                    kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray({0, 0, 32, 64, 48, 0, 96, 16}));
+}
+
+TEST(QuantizedActivationsOpTest, TanhUint8) {
   const float kMin = -1;
   const float kMax = 127.f / 128.f;
   QuantizedActivationsOpModel m(
@@ -218,6 +256,29 @@ TEST(QuantizedActivationsOpTest, Tanh) {
               ElementsAreArray({128, 0, 251, 255, 0, 5, 255, 225}));
 }
 
+TEST(QuantizedActivationsOpTest, TanhInt8) {
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_TANH,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
+      /*output=*/{TensorType_INT8, {1, 2, 4, 1}, kMin, kMax});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   //
+      -4, -2, 8, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.0, -0.999987, 0.964027, 0.999329,     //
+                      -0.999329, -0.96402, 0.99999, 0.76159,  //
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray({0, -128, 123, 127, -128, -123, 127, 97}));
+}
+
 TEST(QuantizedActivationsOpTest, TanhInt16) {
   const float kMin = -1;
   const float kMax = 32767.f / 32768.f;
@@ -253,7 +314,7 @@ TEST(FloatActivationsOpTest, Sigmoid) {
                              })));
 }
 
-TEST(QuantizedActivationsOpTest, Sigmoid) {
+TEST(QuantizedActivationsOpTest, SigmoidUint8) {
   QuantizedActivationsOpModel m(
       BuiltinOperator_LOGISTIC,
       /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, -10, 10});
@@ -273,6 +334,26 @@ TEST(QuantizedActivationsOpTest, Sigmoid) {
               ElementsAreArray({128, 1, 227, 251, 244, 32, 255, 188}));
 }
 
+TEST(QuantizedActivationsOpTest, SigmoidInt8) {
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_LOGISTIC,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, -10, 10});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.5, 0.002473, 0.880797, 0.982014,       //
+                      0.952574, 0.119203, 0.999955, 0.731059,  //
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray({0, -127, 99, 123, 116, -99, 127, 60}));
+}
+
 TEST(QuantizedActivationsOpTest, SigmoidInt16) {
   const float kMin = -1;
   const float kMax = 32767.f / 32768.f;
@@ -705,7 +786,7 @@ TEST(FloatActivationsOpTest, LogSoftmax) {
                               })));
 }
 
-TEST(QuantizedActivationsOpTest, LogSoftmax) {
+TEST(QuantizedActivationsOpTest, LogSoftmaxUint8) {
   const float kLogSoftmaxQuantizedTolerance = 16 / 256.0;
   QuantizedActivationsOpModel m(
       BuiltinOperator_LOG_SOFTMAX,
@@ -727,6 +808,30 @@ TEST(QuantizedActivationsOpTest, LogSoftmax) {
               ElementsAreArray({189, 93, 221, 253, 142, 63, 255, 111}));
 }
 
+TEST(QuantizedActivationsOpTest, LogSoftmaxInt8) {
+  const float kLogSoftmaxQuantizedTolerance = 0.06355;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_LOG_SOFTMAX,
+      /*input=*/{TensorType_INT8, {2, 4}, -10, 10},
+      /*output=*/{TensorType_INT8, {}, 0, 0, 16. / 256, 127});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      -4.14297, -10.14297, -2.14297, -.142971,    //
+                      -7.00104, -12.00104, -.00104087, -9.00104,  //
+                  },
+                  kLogSoftmaxQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({
+                                         61, -36, 93, 125,   //
+                                         15, -65, 127, -16,  //
+                                     }));
+}
+
 // A base class of PRelu op model. It provides the constructor for
 // FloatPReluOpModel and QuantizedPReluOpModel.
 class BasePReluOpModel : public SingleOpModel {
diff --git a/tensorflow/lite/kernels/add.cc b/tensorflow/lite/kernels/add.cc
index 9867cc53b342d9fddda81db270c223de2ecda14f..4cfe435e9e2c4dab7253c4be4fffcb991cea4abd 100644
--- a/tensorflow/lite/kernels/add.cc
+++ b/tensorflow/lite/kernels/add.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
@@ -92,7 +93,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_size = TfLiteIntArrayCopy(input1->dims);
   }
 
-  if (output->type == kTfLiteUInt8) {
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
     // 8bit -> 8bit general quantized path, with general rescalings
     data->input1_offset = -input1->params.zero_point;
     data->input2_offset = -input2->params.zero_point;
@@ -117,10 +118,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     QuantizeMultiplierSmallerThanOneExp(
         real_output_multiplier, &data->output_multiplier, &data->output_shift);
 
-    CalculateActivationRangeUint8(params->activation, output,
-                                  &data->output_activation_min,
-                                  &data->output_activation_max);
-
+    if (output->type == kTfLiteUInt8) {
+      CalculateActivationRangeUint8(params->activation, output,
+                                    &data->output_activation_min,
+                                    &data->output_activation_max);
+    } else {
+      CalculateActivationRangeInt8(params->activation, output,
+                                   &data->output_activation_min,
+                                   &data->output_activation_max);
+    }
   } else if (output->type == kTfLiteInt16) {
     // 16bit -> 16bit special quantized path, supporting only a rather
     // narrow case of quantization parameters: zero_points must all be 0
@@ -219,7 +225,7 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                               const TfLiteTensor* input1,
                               const TfLiteTensor* input2,
                               TfLiteTensor* output) {
-  if (output->type == kTfLiteUInt8) {
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
     tflite::ArithmeticParams op_params;
     op_params.left_shift = data->left_shift;
     op_params.input1_offset = data->input1_offset;
@@ -235,25 +241,33 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                         data->output_activation_max, &op_params);
     bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
         GetTensorShape(input1), GetTensorShape(input2), &op_params);
-#define TF_LITE_ADD(type, opname)                                      \
-  type::opname(op_params, GetTensorShape(input1),                      \
-               GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
-               GetTensorData<uint8_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output));
-    if (kernel_type == kReference) {
+#define TF_LITE_ADD(type, opname, dtype)                             \
+  type::opname(op_params, GetTensorShape(input1),                    \
+               GetTensorData<dtype>(input1), GetTensorShape(input2), \
+               GetTensorData<dtype>(input2), GetTensorShape(output), \
+               GetTensorData<dtype>(output));
+    if (output->type == kTfLiteInt8) {
       if (need_broadcast) {
-        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow);
+        TF_LITE_ADD(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
       } else {
-        TF_LITE_ADD(reference_ops, Add);
+        TF_LITE_ADD(reference_integer_ops, Add, int8_t);
       }
     } else {
-      if (op_params.broadcast_category ==
-          BroadcastableOpCategory::kGenericBroadcast) {
-        TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow);
-      } else if (need_broadcast) {
-        TF_LITE_ADD(optimized_ops, BroadcastAddFivefold);
+      if (kernel_type == kReference) {
+        if (need_broadcast) {
+          TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, uint8_t);
+        } else {
+          TF_LITE_ADD(reference_ops, Add, uint8_t);
+        }
       } else {
-        TF_LITE_ADD(optimized_ops, Add);
+        if (op_params.broadcast_category ==
+            BroadcastableOpCategory::kGenericBroadcast) {
+          TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, uint8_t);
+        } else if (need_broadcast) {
+          TF_LITE_ADD(optimized_ops, BroadcastAddFivefold, uint8_t);
+        } else {
+          TF_LITE_ADD(optimized_ops, Add, uint8_t);
+        }
       }
     }
 #undef TF_LITE_ADD
@@ -292,7 +306,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) {
     EvalAdd<kernel_type>(context, node, params, data, input1, input2, output);
-  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+             output->type == kTfLiteInt16) {
     TF_LITE_ENSURE_OK(context,
                       EvalAddQuantized<kernel_type>(context, node, params, data,
                                                     input1, input2, output));
diff --git a/tensorflow/lite/kernels/add_n.cc b/tensorflow/lite/kernels/add_n.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3e9b2ea24afcd3eff107b110e7d5bb6226d95d3a
--- /dev/null
+++ b/tensorflow/lite/kernels/add_n.cc
@@ -0,0 +1,88 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace add_n {
+
+constexpr int kInputTensor1 = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  int num_inputs = NumInputs(node);
+  TF_LITE_ENSURE(context, num_inputs >= 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  output->type = input1->type;
+
+  // Check that all input tensors have the same shape and type.
+  for (int i = kInputTensor1 + 1; i < num_inputs; ++i) {
+    const TfLiteTensor* input = GetInput(context, node, i);
+    TF_LITE_ENSURE(context, HaveSameShapes(input1, input));
+    TF_LITE_ENSURE_EQ(context, input1->type, input->type);
+  }
+
+  // Use the first input node's dimension to be the dimension of the output
+  // node.
+  TfLiteIntArray* input1_dims = input1->dims;
+  TfLiteIntArray* output_dims = TfLiteIntArrayCopy(input1_dims);
+  return context->ResizeTensor(context, output, output_dims);
+}
+
+template <typename T>
+void EvalAddN(TfLiteContext* context, TfLiteNode* node) {
+  // TODO(haoliang): Initialize all_inputs only once during init.
+  VectorOfTensors<T> all_inputs(*context, *node->inputs);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  int num_inputs = NumInputs(node);
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  reference_ops::AddN<T>(GetTensorShape(input1), num_inputs, all_inputs.data(),
+                         GetTensorData<T>(output));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  if (output->type == kTfLiteFloat32) {
+    EvalAddN<float>(context, node);
+  } else if (output->type == kTfLiteInt32) {
+    EvalAddN<int32_t>(context, node);
+  } else {
+    context->ReportError(context,
+                         "AddN only supports FLOAT32|INT32 now, got %s.",
+                         TfLiteTypeGetName(output->type));
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace add_n
+
+TfLiteRegistration* Register_ADD_N() {
+  static TfLiteRegistration r = {/*init*/ nullptr, /*free*/ nullptr,
+                                 add_n::Prepare, add_n::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/add_n_test.cc b/tensorflow/lite/kernels/add_n_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee9477d2ff13c4e4f4e2da815d8f5660ab5b6c4e
--- /dev/null
+++ b/tensorflow/lite/kernels/add_n_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseAddNOpModel : public SingleOpModel {
+ public:
+  BaseAddNOpModel(const std::vector<TensorData>& inputs,
+                  const TensorData& output) {
+    int num_inputs = inputs.size();
+    std::vector<std::vector<int>> input_shapes;
+
+    for (int i = 0; i < num_inputs; ++i) {
+      inputs_.push_back(AddInput(inputs[i]));
+      input_shapes.push_back(GetShape(inputs_[i]));
+    }
+
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_ADD_N, BuiltinOptions_AddNOptions,
+                 CreateAddNOptions(builder_).Union());
+    BuildInterpreter(input_shapes);
+  }
+
+  int input(int i) { return inputs_[i]; }
+
+ protected:
+  std::vector<int> inputs_;
+  int output_;
+};
+
+class FloatAddNOpModel : public BaseAddNOpModel {
+ public:
+  using BaseAddNOpModel::BaseAddNOpModel;
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class IntegerAddNOpModel : public BaseAddNOpModel {
+ public:
+  using BaseAddNOpModel::BaseAddNOpModel;
+
+  std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); }
+};
+
+TEST(FloatAddNOpModel, AddMultipleTensors) {
+  FloatAddNOpModel m({{TensorType_FLOAT32, {1, 2, 2, 1}},
+                      {TensorType_FLOAT32, {1, 2, 2, 1}},
+                      {TensorType_FLOAT32, {1, 2, 2, 1}}},
+                     {TensorType_FLOAT32, {}});
+  m.PopulateTensor<float>(m.input(0), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input(1), {0.1, 0.2, 0.3, 0.5});
+  m.PopulateTensor<float>(m.input(2), {0.5, 0.1, 0.1, 0.2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.4, 0.5, 1.1, 1.5}));
+}
+
+TEST(IntegerAddNOpModel, AddMultipleTensors) {
+  IntegerAddNOpModel m({{TensorType_INT32, {1, 2, 2, 1}},
+                        {TensorType_INT32, {1, 2, 2, 1}},
+                        {TensorType_INT32, {1, 2, 2, 1}}},
+                       {TensorType_INT32, {}});
+  m.PopulateTensor<int32_t>(m.input(0), {-20, 2, 7, 8});
+  m.PopulateTensor<int32_t>(m.input(1), {1, 2, 3, 5});
+  m.PopulateTensor<int32_t>(m.input(2), {10, -5, 1, -2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-9, -1, 11, 11}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/add_test.cc b/tensorflow/lite/kernels/add_test.cc
index 16045d457238e482bd7aad1077d0344632a7550b..2904f4a11a947264cb12fc2e8c0a7822df24c678 100644
--- a/tensorflow/lite/kernels/add_test.cc
+++ b/tensorflow/lite/kernels/add_test.cc
@@ -63,9 +63,10 @@ class QuantizedAddOpModel : public BaseAddOpModel {
  public:
   using BaseAddOpModel::BaseAddOpModel;
 
+  template <typename integer_dtype>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<integer_dtype>(ExtractVector<integer_dtype>(output_),
+                                     GetScale(output_), GetZeroPoint(output_));
   }
 
   std::vector<float> GetDequantizedOutputInt16() {
@@ -74,17 +75,15 @@ class QuantizedAddOpModel : public BaseAddOpModel {
   }
 };
 
-// for quantized Add, the error shouldn't exceed 2*step
+// for quantized Add, the error shouldn't exceed step
 float GetTolerance(float min, float max) {
   float kQuantizedStep = (max - min) / 255.0;
-  float kQuantizedTolerance = 2.0 * kQuantizedStep;
-  return kQuantizedTolerance;
+  return kQuantizedStep;
 }
 
 float GetToleranceInt16(float min, float max) {
   float kQuantizedStep = (max - min) / 32767.f;
-  float kQuantizedTolerance = 2.0 * kQuantizedStep;
-  return kQuantizedTolerance;
+  return kQuantizedStep;
 }
 
 TEST(FloatAddOpModel, NoActivation) {
@@ -191,7 +190,8 @@ TEST(IntegerAddOpModel, WithBroadcast) {
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedTestsNoActivation() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::vector<float>> inputs1 = {
       {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, 0.7, 0.3}};
@@ -200,19 +200,28 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
   std::vector<std::vector<float>> results = {
       {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {}, -1.0, 1.0},
+    QuantizedAddOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {}, -1.0, 1.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), inputs2[i]);
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                              results[i], kQuantizedTolerance)))
+    EXPECT_THAT(
+        m.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
         << "With test number " << i;
   }
 }
 
+TEST(QuantizedAddOpModel, QuantizedTestsNoActivationUInt8) {
+  QuantizedTestsNoActivation<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt8) {
+  QuantizedTestsNoActivation<TensorType_INT8, int8_t>();
+}
+
 TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) {
   const float kMin = -1.f;
   const float kMax = 32767.f / 32768.f;
@@ -238,7 +247,8 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) {
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
+template <enum TensorType tensor_type, typename integer_dtype>
+void QuantizedTestsActivationRELU_N1_TO_1() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::vector<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
                                              {-0.8, 0.2, 0.7, 0.3}};
@@ -247,53 +257,74 @@ TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
   std::vector<std::vector<float>> results = {{-0.2, 0.6, 1.0, -0.1},
                                              {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {}, -1.0, 1.0},
+    QuantizedAddOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {}, -1.0, 1.0},
                           ActivationFunctionType_RELU_N1_TO_1);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), inputs2[i]);
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                              results[i], kQuantizedTolerance)))
+    EXPECT_THAT(
+        m.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
         << "With test number " << i;
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
+TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1UInt8) {
+  QuantizedTestsActivationRELU_N1_TO_1<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1Int8) {
+  QuantizedTestsActivationRELU_N1_TO_1<TensorType_INT8, int8_t>();
+}
+
+template <enum TensorType tensor_type, typename integer_dtype>
+void QuantizedVariousInputShapes() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
+    QuantizedAddOpModel m({tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, {}, -3.0, 3.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(),
+                                         {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(),
+                                         {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
+    EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
                 ElementsAreArray(ArrayFloatNear({-1.9, 0.5, 1.0, 1.3, 2.2, 2.1},
                                                 kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcast) {
+TEST(QuantizedAddOpModel, QuantizedVariousInputShapesUInt8) {
+  QuantizedVariousInputShapes<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedVariousInputShapesInt8) {
+  QuantizedVariousInputShapes<TensorType_INT8, int8_t>();
+}
+
+template <enum TensorType tensor_type, typename integer_dtype>
+void QuantizedWithScalarBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     QuantizedAddOpModel model_fixture(
-        {TensorType_UINT8, test_shapes[i], -3.f, 3.f},
-        {TensorType_UINT8, {}, -3.f, 3.f}, {TensorType_UINT8, {}, -3.f, 3.f},
-        ActivationFunctionType_NONE);
-    model_fixture.QuantizeAndPopulate<uint8_t>(
+        {tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f},
+        {tensor_type, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
         model_fixture.input1(), {-2.0f, 0.2f, 0.7f, 0.8f, 1.1f, 2.0f});
-    model_fixture.QuantizeAndPopulate<uint8_t>(model_fixture.input2(), {0.1f});
+    model_fixture.QuantizeAndPopulate<integer_dtype>(model_fixture.input2(),
+                                                     {0.1f});
     model_fixture.Invoke();
     EXPECT_THAT(
-        model_fixture.GetDequantizedOutput(),
+        model_fixture.GetDequantizedOutput<integer_dtype>(),
         ElementsAreArray(ArrayFloatNear({-1.9f, 0.3f, 0.8f, 0.9f, 1.2f, 2.1f},
                                         kQuantizedTolerance)))
         << "With shape number " << i;
@@ -301,22 +332,31 @@ TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcast) {
   // Re-run with exchanged inputs.
   for (int i = 0; i < test_shapes.size(); ++i) {
     QuantizedAddOpModel model_fixture(
-        {TensorType_UINT8, {}, -3.f, 3.f},
-        {TensorType_UINT8, test_shapes[i], -3.f, 3.f},
-        {TensorType_UINT8, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
-    model_fixture.QuantizeAndPopulate<uint8_t>(model_fixture.input1(), {0.1f});
-    model_fixture.QuantizeAndPopulate<uint8_t>(
+        {tensor_type, {}, -3.f, 3.f}, {tensor_type, test_shapes[i], -3.f, 3.f},
+        {tensor_type, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<integer_dtype>(model_fixture.input1(),
+                                                     {0.1f});
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
         model_fixture.input2(), {-2.0f, 0.2f, 0.7f, 0.8f, 1.1f, 2.0f});
     model_fixture.Invoke();
     EXPECT_THAT(
-        model_fixture.GetDequantizedOutput(),
+        model_fixture.GetDequantizedOutput<integer_dtype>(),
         ElementsAreArray(ArrayFloatNear({-1.9f, 0.3f, 0.8f, 0.9f, 1.2f, 2.1f},
                                         kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcast) {
+TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcastUInt8) {
+  QuantizedWithScalarBroadcast<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcastInt8) {
+  QuantizedWithScalarBroadcast<TensorType_INT8, int8_t>();
+}
+
+template <enum TensorType tensor_type, typename integer_dtype>
+void QuantizedWithMixedBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
   const std::vector<int> base_shape = {2, 3, 1, 2};
   std::vector<std::vector<int>> test_shapes = {
@@ -335,40 +375,48 @@ TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcast) {
       {-0.1f, 2.5f, 1.2f, 0.8f, 0.4f, -1.5f, 1.7f, 3.0f, -0.6f, 1.0f, 1.6f,
        -1.3f}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedAddOpModel model_fixture(
-        {TensorType_UINT8, base_shape, -3.f, 3.f},
-        {TensorType_UINT8, test_shapes[i], -3.f, 3.f},
-        {TensorType_UINT8, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
-    model_fixture.QuantizeAndPopulate<uint8_t>(
+    QuantizedAddOpModel model_fixture({tensor_type, base_shape, -3.f, 3.f},
+                                      {tensor_type, test_shapes[i], -3.f, 3.f},
+                                      {tensor_type, {}, -3.f, 3.f},
+                                      ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
         model_fixture.input1(), {-0.3f, 2.3f, 0.9f, 0.5f, 0.8f, -1.1f, 1.2f,
                                  2.8f, -1.6f, 0.0f, 0.7f, -2.2f});
-    model_fixture.QuantizeAndPopulate<uint8_t>(
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
         model_fixture.input2(), {0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f});
     model_fixture.Invoke();
     EXPECT_THAT(
-        model_fixture.GetDequantizedOutput(),
+        model_fixture.GetDequantizedOutput<integer_dtype>(),
         ElementsAreArray(ArrayFloatNear(test_outputs[i], kQuantizedTolerance)))
         << "With shape number " << i;
   }
   // Re-run with exchanged inputs.
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedAddOpModel model_fixture(
-        {TensorType_UINT8, test_shapes[i], -3.f, 3.f},
-        {TensorType_UINT8, base_shape, -3.f, 3.f},
-        {TensorType_UINT8, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
-    model_fixture.QuantizeAndPopulate<uint8_t>(
+    QuantizedAddOpModel model_fixture({tensor_type, test_shapes[i], -3.f, 3.f},
+                                      {tensor_type, base_shape, -3.f, 3.f},
+                                      {tensor_type, {}, -3.f, 3.f},
+                                      ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
         model_fixture.input1(), {0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f});
-    model_fixture.QuantizeAndPopulate<uint8_t>(
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
         model_fixture.input2(), {-0.3f, 2.3f, 0.9f, 0.5f, 0.8f, -1.1f, 1.2f,
                                  2.8f, -1.6f, 0.0f, 0.7f, -2.2f});
     model_fixture.Invoke();
     EXPECT_THAT(
-        model_fixture.GetDequantizedOutput(),
+        model_fixture.GetDequantizedOutput<integer_dtype>(),
         ElementsAreArray(ArrayFloatNear(test_outputs[i], kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
+TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcastUInt8) {
+  QuantizedWithMixedBroadcast<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcastInt8) {
+  QuantizedWithMixedBroadcast<TensorType_INT8, int8_t>();
+}
+
 }  // namespace
 }  // namespace tflite
 int main(int argc, char** argv) {
diff --git a/tensorflow/lite/kernels/arg_min_max.cc b/tensorflow/lite/kernels/arg_min_max.cc
index f9adf6bc4e8eea8a653f3b4a2f8843675d858417..e5223badc407059511f06cd538b6057c1e276966 100644
--- a/tensorflow/lite/kernels/arg_min_max.cc
+++ b/tensorflow/lite/kernels/arg_min_max.cc
@@ -80,13 +80,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   switch (input->type) {
     case kTfLiteFloat32:
     case kTfLiteUInt8:
+    case kTfLiteInt8:
     case kTfLiteInt32:
       break;
 
     default:
       context->ReportError(
           context,
-          "Unkonwn input type: %d, only float32 and int types are supported",
+          "Unknown input type: %d, only float32 and int types are supported",
           input->type);
       return kTfLiteError;
   }
@@ -135,6 +136,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node, bool is_arg_max) {
           case kTfLiteUInt8:
             TF_LITE_ARG_MIN_MAX(uint8_t, int32_t, int32_t);
             break;
+          case kTfLiteInt8:
+            TF_LITE_ARG_MIN_MAX(int8_t, int32_t, int32_t);
+            break;
           case kTfLiteInt32:
             TF_LITE_ARG_MIN_MAX(int32_t, int32_t, int32_t);
             break;
@@ -150,6 +154,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node, bool is_arg_max) {
           case kTfLiteUInt8:
             TF_LITE_ARG_MIN_MAX(uint8_t, int32_t, int64_t);
             break;
+          case kTfLiteInt8:
+            TF_LITE_ARG_MIN_MAX(int8_t, int32_t, int64_t);
+            break;
           case kTfLiteInt32:
             TF_LITE_ARG_MIN_MAX(int32_t, int32_t, int64_t);
             break;
diff --git a/tensorflow/lite/kernels/arg_min_max_test.cc b/tensorflow/lite/kernels/arg_min_max_test.cc
index 1b1000f29e88b6837544ac146dc8d7876cfe036b..01ea923f26d3ca32ec109a61d0484b0ecbd30c93 100644
--- a/tensorflow/lite/kernels/arg_min_max_test.cc
+++ b/tensorflow/lite/kernels/arg_min_max_test.cc
@@ -86,6 +86,28 @@ TEST(ArgMaxOpTest, GetMaxArgFloat) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
 }
 
+TEST(ArgMaxOpTest, GetMaxArgUInt8) {
+  ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_UINT8, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<uint8_t>(model.input(), {1, 9, 7, 3});
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgInt8) {
+  ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_INT8, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<int8_t>(model.input(), {-1, -9, 7, 3});
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({2}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
+}
+
 TEST(ArgMaxOpTest, GetMaxArgInt) {
   ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_INT32, TensorType_INT32,
                                TensorType_INT32);
diff --git a/tensorflow/lite/kernels/batch_to_space_nd.cc b/tensorflow/lite/kernels/batch_to_space_nd.cc
index 34fdf34f70c9660266e23260bd5a6b645a3c5ccb..ce85aeddedcebdecf4d2944bade2ed5f823b0592 100644
--- a/tensorflow/lite/kernels/batch_to_space_nd.cc
+++ b/tensorflow/lite/kernels/batch_to_space_nd.cc
@@ -148,6 +148,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_BATCH_TO_SPACE_ND(optimized_ops, uint8_t);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_BATCH_TO_SPACE_ND(reference_ops, int8_t);
+      } else {
+        TF_LITE_BATCH_TO_SPACE_ND(optimized_ops, int8_t);
+      }
+      break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
         TF_LITE_BATCH_TO_SPACE_ND(reference_ops, int32_t);
diff --git a/tensorflow/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/lite/kernels/batch_to_space_nd_test.cc
index f33089559992c1a6a6fa34161122c43b7954fbdb..bd806b55ca48424e143a77d1f95640365af5fe77 100644
--- a/tensorflow/lite/kernels/batch_to_space_nd_test.cc
+++ b/tensorflow/lite/kernels/batch_to_space_nd_test.cc
@@ -26,8 +26,9 @@ using ::testing::ElementsAreArray;
 
 class BatchToSpaceNDOpModel : public SingleOpModel {
  public:
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor<float>(input_, data);
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
   }
 
   void SetBlockShape(std::initializer_list<int> data) {
@@ -38,7 +39,10 @@ class BatchToSpaceNDOpModel : public SingleOpModel {
     PopulateTensor<int>(crops_, data);
   }
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
  protected:
@@ -58,11 +62,12 @@ class BatchToSpaceNDOpConstModel : public BatchToSpaceNDOpModel {
  public:
   BatchToSpaceNDOpConstModel(std::initializer_list<int> input_shape,
                              std::initializer_list<int> block_shape,
-                             std::initializer_list<int> crops) {
-    input_ = AddInput(TensorType_FLOAT32);
+                             std::initializer_list<int> crops,
+                             const TensorType& type = TensorType_FLOAT32) {
+    input_ = AddInput(type);
     block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
     crops_ = AddConstInput(TensorType_INT32, crops, {2, 2});
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(type);
 
     SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
                  BuiltinOptions_BatchToSpaceNDOptions,
@@ -81,11 +86,12 @@ class BatchToSpaceNDOpConstModel : public BatchToSpaceNDOpModel {
 //    m.Invoke();
 class BatchToSpaceNDOpDynamicModel : public BatchToSpaceNDOpModel {
  public:
-  BatchToSpaceNDOpDynamicModel(std::initializer_list<int> input_shape) {
-    input_ = AddInput(TensorType_FLOAT32);
+  BatchToSpaceNDOpDynamicModel(std::initializer_list<int> input_shape,
+                               const TensorType& type = TensorType_FLOAT32) {
+    input_ = AddInput(type);
     block_shape_ = AddInput(TensorType_INT32);
     crops_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(type);
 
     SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
                  BuiltinOptions_BatchToSpaceNDOptions,
@@ -96,22 +102,47 @@ class BatchToSpaceNDOpDynamicModel : public BatchToSpaceNDOpModel {
 
 TEST(BatchToSpaceNDOpTest, SimpleConstTest) {
   BatchToSpaceNDOpConstModel m({4, 2, 2, 1}, {2, 2}, {0, 0, 0, 0});
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetInput<float>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(
+                  {1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16}));
+}
+
+TEST(BatchToSpaceNDOpTest, SimpleConstTestInt8) {
+  BatchToSpaceNDOpConstModel m({4, 2, 2, 1}, {2, 2}, {0, 0, 0, 0},
+                               TensorType_INT8);
+  m.SetInput<int8_t>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
-                                               4, 8, 11, 15, 12, 16}));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(
+                  {1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16}));
 }
 
 TEST(BatchToSpaceNDOpTest, SimpleDynamicTest) {
   BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetInput<float>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetCrops({0, 0, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(
+                  {1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16}));
+}
+
+TEST(BatchToSpaceNDOpTest, SimpleDynamicTestInt8) {
+  BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1}, TensorType_INT8);
+  m.SetInput<int8_t>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.SetBlockShape({2, 2});
   m.SetCrops({0, 0, 0, 0});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
-                                               4, 8, 11, 15, 12, 16}));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(
+                  {1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16}));
 }
 
 #ifdef GTEST_HAS_DEATH_TEST
@@ -127,7 +158,7 @@ TEST(BatchToSpaceNDOpTest, InvalidCropsConstTest) {
 
 TEST(BatchToSpaceNDOpTest, InvalidCropsDynamicTest) {
   BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetInput<float>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.SetBlockShape({2, 2});
   m.SetCrops({0, 0, -1, 0});
   EXPECT_DEATH(m.Invoke(), "crops.2. >= 0 was not true.");
diff --git a/tensorflow/lite/kernels/comparisons.cc b/tensorflow/lite/kernels/comparisons.cc
index a914449ae552e37249f2cecb5c88f3b49e83f133..d4924192551ec0399863433bc021c21f81207bd1 100644
--- a/tensorflow/lite/kernels/comparisons.cc
+++ b/tensorflow/lite/kernels/comparisons.cc
@@ -59,11 +59,12 @@ TfLiteStatus ComparisonPrepare(TfLiteContext* context, TfLiteNode* node) {
 
 // TODO(ruic): optimize macros below to using template functions.
 #define TF_LITE_QUANTIZE_COMPARISON(opname)                                    \
+  template <typename input_dtype>                                              \
   void EvalQuantized##opname(TfLiteContext* context, TfLiteNode* node,         \
                              const TfLiteTensor* input1,                       \
                              const TfLiteTensor* input2, TfLiteTensor* output, \
                              bool requires_broadcast) {                        \
-    if (input1->type == kTfLiteUInt8) {                                        \
+    if (input1->type == kTfLiteUInt8 || input1->type == kTfLiteInt8) {         \
       auto input1_offset = -input1->params.zero_point;                         \
       auto input2_offset = -input2->params.zero_point;                         \
       const int left_shift = 8;                                                \
@@ -87,14 +88,16 @@ TfLiteStatus ComparisonPrepare(TfLiteContext* context, TfLiteNode* node) {
       op_params.input2_shift = input2_shift;                                   \
       if (requires_broadcast) {                                                \
         reference_ops::Broadcast4DSlow##opname##WithScaling(                   \
-            op_params, GetTensorShape(input1), GetTensorData<uint8_t>(input1), \
-            GetTensorShape(input2), GetTensorData<uint8_t>(input2),            \
-            GetTensorShape(output), GetTensorData<bool>(output));              \
+            op_params, GetTensorShape(input1),                                 \
+            GetTensorData<input_dtype>(input1), GetTensorShape(input2),        \
+            GetTensorData<input_dtype>(input2), GetTensorShape(output),        \
+            GetTensorData<bool>(output));                                      \
       } else {                                                                 \
         reference_ops::opname##WithScaling(                                    \
-            op_params, GetTensorShape(input1), GetTensorData<uint8_t>(input1), \
-            GetTensorShape(input2), GetTensorData<uint8_t>(input2),            \
-            GetTensorShape(output), GetTensorData<bool>(output));              \
+            op_params, GetTensorShape(input1),                                 \
+            GetTensorData<input_dtype>(input1), GetTensorShape(input2),        \
+            GetTensorData<input_dtype>(input2), GetTensorShape(output),        \
+            GetTensorData<bool>(output));                                      \
       }                                                                        \
     }                                                                          \
   }
@@ -136,8 +139,12 @@ TfLiteStatus EqualEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, Equal, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedEqual(context, node, input1, input2, output,
-                         requires_broadcast);
+      EvalQuantizedEqual<uint8_t>(context, node, input1, input2, output,
+                                  requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedEqual<int8_t>(context, node, input1, input2, output,
+                                 requires_broadcast);
       break;
     default:
       context->ReportError(context,
@@ -165,8 +172,12 @@ TfLiteStatus NotEqualEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, NotEqual, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedNotEqual(context, node, input1, input2, output,
-                            requires_broadcast);
+      EvalQuantizedNotEqual<uint8_t>(context, node, input1, input2, output,
+                                     requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedNotEqual<int8_t>(context, node, input1, input2, output,
+                                    requires_broadcast);
       break;
     default:
       context->ReportError(context,
@@ -193,8 +204,12 @@ TfLiteStatus GreaterEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, Greater, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedGreater(context, node, input1, input2, output,
-                           requires_broadcast);
+      EvalQuantizedGreater<uint8_t>(context, node, input1, input2, output,
+                                    requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedGreater<int8_t>(context, node, input1, input2, output,
+                                   requires_broadcast);
       break;
     default:
       context->ReportError(context,
@@ -221,8 +236,12 @@ TfLiteStatus GreaterEqualEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, GreaterEqual, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedGreaterEqual(context, node, input1, input2, output,
-                                requires_broadcast);
+      EvalQuantizedGreaterEqual<uint8_t>(context, node, input1, input2, output,
+                                         requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedGreaterEqual<int8_t>(context, node, input1, input2, output,
+                                        requires_broadcast);
       break;
     default:
       context->ReportError(context,
@@ -249,8 +268,12 @@ TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, Less, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedLess(context, node, input1, input2, output,
-                        requires_broadcast);
+      EvalQuantizedLess<uint8_t>(context, node, input1, input2, output,
+                                 requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedLess<int8_t>(context, node, input1, input2, output,
+                                requires_broadcast);
       break;
     default:
       context->ReportError(context,
@@ -277,8 +300,12 @@ TfLiteStatus LessEqualEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, LessEqual, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedLessEqual(context, node, input1, input2, output,
-                             requires_broadcast);
+      EvalQuantizedLessEqual<uint8_t>(context, node, input1, input2, output,
+                                      requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedLessEqual<int8_t>(context, node, input1, input2, output,
+                                     requires_broadcast);
       break;
     default:
       context->ReportError(context,
diff --git a/tensorflow/lite/kernels/comparisons_test.cc b/tensorflow/lite/kernels/comparisons_test.cc
index ab10c959a4d6b234cb6ae0810174e8f1c48898d1..6ec1f09a6c000e7e8fb7f27f4bbdfc174fc99ed5 100644
--- a/tensorflow/lite/kernels/comparisons_test.cc
+++ b/tensorflow/lite/kernels/comparisons_test.cc
@@ -363,7 +363,7 @@ TEST(ComparisonsTest, LessEqualBroadcastTwoD) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4));
 }
 
-TEST(QuantizedComparisonsTest, EqualQuantized) {
+TEST(QuantizedComparisonsTest, EqualUInt8Quantized) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax},
@@ -376,7 +376,20 @@ TEST(QuantizedComparisonsTest, EqualQuantized) {
   EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, true, false));
 }
 
-TEST(QuantizedComparisonsTest, NotEqualQuantized) {
+TEST(QuantizedComparisonsTest, EqualInt8Quantized) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  ComparisonOpModel model({TensorType_INT8, {1, 2, 2, 1}, kMin, kMax},
+                          {TensorType_INT8, {1, 2, 2, 1}, kMin, kMax},
+                          TensorType_INT8, BuiltinOperator_EQUAL);
+  model.QuantizeAndPopulate<int8_t>(model.input1(), {1, -9, 7, 3});
+  model.QuantizeAndPopulate<int8_t>(model.input2(), {-1, 2, 7, 5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, true, false));
+}
+
+TEST(QuantizedComparisonsTest, NotEqualUInt8Quantized) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax},
@@ -389,6 +402,19 @@ TEST(QuantizedComparisonsTest, NotEqualQuantized) {
   EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, false, true));
 }
 
+TEST(QuantizedComparisonsTest, NotEqualInt8Quantized) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  ComparisonOpModel model({TensorType_INT8, {1, 2, 2, 1}, kMin, kMax},
+                          {TensorType_INT8, {1, 2, 2, 1}, kMin, kMax},
+                          TensorType_INT8, BuiltinOperator_NOT_EQUAL);
+  model.QuantizeAndPopulate<int8_t>(model.input1(), {1, -9, 7, 3});
+  model.QuantizeAndPopulate<int8_t>(model.input2(), {1, 2, 7, 5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, false, true));
+}
+
 TEST(ComparisonsTest, GreaterQuantized) {
   const float kMin = -1.f;
   const float kMax = 128.f;
@@ -470,7 +496,7 @@ TEST(ComparisonsTest, QuantizedEqualWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedNotEqualWithBroadcast) {
+TEST(ComparisonsTest, QuantizedUInt8NotEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -488,7 +514,25 @@ TEST(ComparisonsTest, QuantizedNotEqualWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedGreaterWithBroadcast) {
+TEST(ComparisonsTest, QuantizedInt8NotEqualWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_NOT_EQUAL);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {-20, 2, 7, -8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {2});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(true, false, true, true, true, true))
+        << "With shape number " << i;
+  }
+}
+
+TEST(ComparisonsTest, QuantizedUInt8GreaterWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -506,7 +550,25 @@ TEST(ComparisonsTest, QuantizedGreaterWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedGreaterEqualWithBroadcast) {
+TEST(ComparisonsTest, QuantizedInt8GreaterWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_GREATER);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {20, -2, -71, 8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {8});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(true, false, false, false, true, true))
+        << "With shape number " << i;
+  }
+}
+
+TEST(ComparisonsTest, QuantizedUInt8GreaterEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -524,7 +586,25 @@ TEST(ComparisonsTest, QuantizedGreaterEqualWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedLessWithBroadcast) {
+TEST(ComparisonsTest, QuantizedInt8GreaterEqualWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_GREATER_EQUAL);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {20, -2, -71, 8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {8});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(true, false, false, true, true, true))
+        << "With shape number " << i;
+  }
+}
+
+TEST(ComparisonsTest, QuantizedUInt8LessWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -542,7 +622,25 @@ TEST(ComparisonsTest, QuantizedLessWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedLessEqualWithBroadcast) {
+TEST(ComparisonsTest, QuantizedInt8LessWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_LESS);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {20, -2, -71, 8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {8});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(false, true, true, false, false, false))
+        << "With shape number " << i;
+  }
+}
+
+TEST(ComparisonsTest, QuantizedUInt8LessEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -560,6 +658,24 @@ TEST(ComparisonsTest, QuantizedLessEqualWithBroadcast) {
   }
 }
 
+TEST(ComparisonsTest, QuantizedInt8LessEqualWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_LESS_EQUAL);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {20, -2, -71, 8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {8});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(false, true, true, true, false, false))
+        << "With shape number " << i;
+  }
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/elementwise.cc b/tensorflow/lite/kernels/elementwise.cc
index a79388b900eb89b56a4d18f887dbe52e84fb123f..1cc188ae5f7bfe91bee48c60b692d9dca2b7cf0e 100644
--- a/tensorflow/lite/kernels/elementwise.cc
+++ b/tensorflow/lite/kernels/elementwise.cc
@@ -83,6 +83,10 @@ TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) {
   return EvalNumeric(context, node, std::sin);
 }
 
+TfLiteStatus CosEval(TfLiteContext* context, TfLiteNode* node) {
+  return EvalNumeric(context, node, std::cos);
+}
+
 TfLiteStatus LogEval(TfLiteContext* context, TfLiteNode* node) {
   return EvalNumeric(context, node, std::log);
 }
@@ -122,6 +126,14 @@ TfLiteRegistration* Register_SIN() {
   return &r;
 }
 
+TfLiteRegistration* Register_COS() {
+  static TfLiteRegistration r = {
+      /*init=*/nullptr, /*free=*/nullptr,
+      elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
+      elementwise::CosEval};
+  return &r;
+}
+
 TfLiteRegistration* Register_LOG() {
   static TfLiteRegistration r = {
       /*init=*/nullptr, /*free=*/nullptr,
diff --git a/tensorflow/lite/kernels/elementwise_test.cc b/tensorflow/lite/kernels/elementwise_test.cc
index 7d24320081257925508b2aa53503c1cf71d0e913..89f2a506f0cc00df021d8b5113174833df7e33cb 100644
--- a/tensorflow/lite/kernels/elementwise_test.cc
+++ b/tensorflow/lite/kernels/elementwise_test.cc
@@ -65,6 +65,15 @@ TEST(ElementWise, Sin) {
   EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
 }
 
+TEST(ElementWise, Cos) {
+  ElementWiseOpFloatModel m(BuiltinOperator_COS, {1, 1, 4, 1});
+  m.PopulateTensor<float>(m.input(), {0, 3.1415926, -3.1415926, 1});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray(ArrayFloatNear({1, -1, -1, 0.54030})));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
+}
+
 TEST(ElementWise, Log) {
   ElementWiseOpFloatModel m(BuiltinOperator_LOG, {1, 1, 4, 1});
   m.PopulateTensor<float>(m.input(), {1, 3.1415926, 1, 1});
diff --git a/tensorflow/lite/kernels/fully_connected.cc b/tensorflow/lite/kernels/fully_connected.cc
index dfc9550ed600ac08407c4f07b6ad7d0be26bfe10..7ed77c428dcb79dbbc3a36f3c2e55d2ae66ae4d5 100644
--- a/tensorflow/lite/kernels/fully_connected.cc
+++ b/tensorflow/lite/kernels/fully_connected.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/gemm_support.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
@@ -212,7 +213,9 @@ TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
   TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
   TF_LITE_ENSURE(context,
                  filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8);
-  TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
+  if (bias) {
+    TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
+  }
   TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
 
   int total_input_size = 1;
@@ -286,6 +289,27 @@ TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
     macro_name(target_namespace, kRelu6);                            \
   }
 
+namespace {
+void FullyConnectedInt8(const OpData* data, const TfLiteTensor* input,
+                        const TfLiteTensor* filter, const TfLiteTensor* bias,
+                        TfLiteTensor* output,
+                        gemmlowp::GemmContext* gemm_context) {
+  FullyConnectedParams op_params;
+  op_params.input_offset = -input->params.zero_point;
+  op_params.weights_offset = -filter->params.zero_point;
+  op_params.output_offset = output->params.zero_point;
+  op_params.output_multiplier = data->output_multiplier;
+  op_params.output_shift = -data->output_shift;
+  op_params.quantized_activation_min = data->output_activation_min;
+  op_params.quantized_activation_max = data->output_activation_max;
+  reference_integer_ops::FullyConnected(
+      op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
+      GetTensorShape(filter), GetTensorData<int8_t>(filter),
+      GetTensorShape(bias), GetTensorData<int32_t>(bias),
+      GetTensorShape(output), GetTensorData<int8_t>(output), gemm_context);
+}
+}  // namespace
+
 template <KernelType kernel_type>
 TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                            TfLiteFullyConnectedParams* params, OpData* data,
@@ -319,6 +343,9 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
       case kTfLiteUInt8:
         TF_LITE_FULLY_CONNECTED(reference_ops, uint8_t);
         break;
+      case kTfLiteInt8:
+        FullyConnectedInt8(data, input, filter, bias, output, gemm_context);
+        break;
       case kTfLiteInt16:
         TF_LITE_FULLY_CONNECTED(reference_ops, int16_t);
         break;
@@ -339,6 +366,9 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
       case kTfLiteUInt8:
         TF_LITE_FULLY_CONNECTED(optimized_ops, uint8_t);
         break;
+      case kTfLiteInt8:
+        FullyConnectedInt8(data, input, filter, bias, output, gemm_context);
+        break;
       case kTfLiteInt16:
         TF_LITE_FULLY_CONNECTED(optimized_ops, int16_t);
         break;
diff --git a/tensorflow/lite/kernels/fully_connected_test.cc b/tensorflow/lite/kernels/fully_connected_test.cc
index 03f4ea71430f5d578288d913e8ba1d0222467882..ae8e2ac35db4d9aedfda664902b1423ba3c2de7a 100644
--- a/tensorflow/lite/kernels/fully_connected_test.cc
+++ b/tensorflow/lite/kernels/fully_connected_test.cc
@@ -137,6 +137,7 @@ class BaseFullyConnectedOpModel : public SingleOpModel {
   BaseFullyConnectedOpModel(
       TfLiteRegistration* registration, int units, int batches,
       const TensorData& input, const TensorData& output = {TensorType_FLOAT32},
+      bool bias_tensor_optional = false,
       ActivationFunctionType activation_func = ActivationFunctionType_RELU,
       FullyConnectedOptionsWeightsFormat weights_format =
           FullyConnectedOptionsWeightsFormat_DEFAULT)
@@ -151,7 +152,9 @@ class BaseFullyConnectedOpModel : public SingleOpModel {
     weights_ =
         AddInput({input.type, {units_, input_size_}, input.min, input.max});
 
-    if (input.type == TensorType_FLOAT32) {
+    if (bias_tensor_optional) {
+      bias_ = AddNullInput();
+    } else if (input.type == TensorType_FLOAT32) {
       bias_ = AddInput({TensorType_FLOAT32, {units_}});
     } else {
       // This is a quantized version. The scale of 'bias' depends on the scales
@@ -173,7 +176,9 @@ class BaseFullyConnectedOpModel : public SingleOpModel {
             .Union());
     resolver_ = absl::make_unique<SingleOpResolver>(
         BuiltinOperator_FULLY_CONNECTED, registration);
-    BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
+    BuildInterpreter(
+        {GetShape(input_), GetShape(weights_),
+         (bias_ == kOptionalTensor) ? std::vector<int>() : GetShape(bias_)});
   }
 
   int input_size() { return input_size_; }
@@ -216,9 +221,12 @@ class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel {
   void SetBias(const std::vector<float>& data) {
     QuantizeAndPopulate<int32_t>(bias_, data);
   }
+  template <typename T>
   void SetWeights(const std::vector<float>& data) {
-    QuantizeAndPopulate<uint8_t>(weights_, data);
+    QuantizeAndPopulate<T>(weights_, data);
   }
+
+  template <typename T>
   void ShuffleAndSetWeights(const std::vector<float>& data, int input_depth,
                             int output_depth) {
     std::vector<float> shuffled_data(data.size());
@@ -237,15 +245,17 @@ class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel {
     }
     TfLiteTensor* t = interpreter_->tensor(weights_);
     auto quantized_data =
-        Quantize<uint8_t>(shuffled_data, t->params.scale, t->params.zero_point);
-    for (uint8_t& q : quantized_data) {
+        Quantize<T>(shuffled_data, t->params.scale, t->params.zero_point);
+    for (T& q : quantized_data) {
       q ^= 0x80;
     }
     PopulateTensor(weights_, 0, quantized_data.data(),
                    quantized_data.data() + quantized_data.size());
   }
+
+  template <typename T>
   void SetInput(const std::vector<float>& data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
+    QuantizeAndPopulate<T>(input_, data);
   }
 
   template <typename T>
@@ -397,21 +407,42 @@ TEST_P(FloatFullyConnectedOpTest, SimpleTest2) {
   EXPECT_THAT(m.GetOutput(), ElementsAre(11, 9));
 }
 
-TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantized) {
+TEST(FloatFullyConnectedOpTest, SimpleTestNoBias) {
+  // The optimized kernel assumes that the bias is specified.
+  FloatFullyConnectedOpModel m(ops::builtin::Register_FULLY_CONNECTED_PIE(),
+                               /*units=*/1, /*batches=*/2,
+                               /*input=*/{TensorType_FLOAT32, {2, 2}},
+                               /*output=*/{TensorType_FLOAT32},
+                               /*bias_tensor_optional=*/true);
+  m.SetWeights({
+      2, 4,  // u = 0
+  });
+
+  m.SetInput({
+      1, 2,  // b = 0
+      2, 1,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAre(10, 8));
+}
+
+TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedUint8) {
   QuantizedFullyConnectedOpModel m(
       GetRegistration(), /*units=*/3, /*batches*/ 2,
       /*input=*/{TensorType_UINT8, {2, 10}, -63.5, 64},
       /*output=*/{TensorType_UINT8, {}, -127, 128});
 
   // input_product_scale < output_scale was not true.
-  m.SetWeights({
+  m.SetWeights<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
   });
   m.SetBias({1, 2, 3});
 
-  m.SetInput({
+  m.SetInput<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
       1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
   });
@@ -427,22 +458,48 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantized) {
               ElementsAre(151, 152, 153, 185, 186, 187));
 }
 
+TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedInt8) {
+  QuantizedFullyConnectedOpModel m(
+      ops::builtin::Register_FULLY_CONNECTED_REF(), /*units=*/3, /*batches*/ 2,
+      /*input=*/{TensorType_INT8, {2, 10}, -63.5, 64},
+      /*output=*/{TensorType_INT8, {}, -127, 128});
+
+  // input_product_scale < output_scale was not true.
+  m.SetWeights<int8_t>({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput<int8_t>({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({24, 25, 26, 58, 59, 60})));
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAre(23, 24, 25, 57, 58, 59));
+}
+
 TEST_P(QuantizedFullyConnectedOpTest,
-       SimpleTestQuantizedOutputMultiplierGreaterThan1) {
+       SimpleTestQuantizedOutputMultiplierGreaterThan1Uint8) {
   // real_multiplier = 2.
   QuantizedFullyConnectedOpModel m(
       GetRegistration(), /*units=*/3, /*batches*/ 2,
       /*input=*/{TensorType_UINT8, {2, 10}, -127, 128},
       /*output=*/{TensorType_UINT8, {}, -63.5, 64});
 
-  m.SetWeights({
+  m.SetWeights<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
   });
   m.SetBias({1, 2, 3});
 
-  m.SetInput({
+  m.SetInput<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
       1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
   });
@@ -458,6 +515,36 @@ TEST_P(QuantizedFullyConnectedOpTest,
               ElementsAre(175, 177, 179, 243, 245, 247));
 }
 
+TEST_P(QuantizedFullyConnectedOpTest,
+       SimpleTestQuantizedOutputMultiplierGreaterThan1Int8) {
+  // real_multiplier = 2.
+  QuantizedFullyConnectedOpModel m(
+      ops::builtin::Register_FULLY_CONNECTED_REF(), /*units=*/3, /*batches*/ 2,
+      /*input=*/{TensorType_INT8, {2, 10}, -127, 128},
+      /*output=*/{TensorType_INT8, {}, -63.5, 64});
+
+  m.SetWeights<int8_t>({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput<int8_t>({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({
+                  24, 25, 26,  // first batch
+                  58, 59, 60,  // second batch
+              })));
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAre(47, 49, 51, 115, 117, 119));
+}
+
 void SimpleTestQuantizedInt16OutputCase(
     TfLiteRegistration* registration, int input_depth, int output_depth,
     int batches, FullyConnectedOptionsWeightsFormat weights_format) {
@@ -477,6 +564,7 @@ void SimpleTestQuantizedInt16OutputCase(
       /*input=*/
       {TensorType_UINT8, {batches, input_depth}, kInputMin, kInputMax},
       /*output=*/{TensorType_INT16, {}, kOutputMin, kOutputMax},
+      /*bias_tensor_optional=*/false,
       /*activation_func=*/ActivationFunctionType_NONE, weights_format);
 
   std::mt19937 random_engine;
@@ -492,7 +580,7 @@ void SimpleTestQuantizedInt16OutputCase(
   // and set the (possibly shuffled) weights.
   switch (weights_format) {
     case FullyConnectedOptionsWeightsFormat_DEFAULT:
-      m.SetWeights(weights_data);
+      m.SetWeights<uint8_t>(weights_data);
       break;
     case FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8:
       // The shuffled path currently supports only a restrictive subset of
@@ -500,7 +588,7 @@ void SimpleTestQuantizedInt16OutputCase(
       CHECK_EQ(input_depth % 16, 0);
       CHECK_EQ(output_depth % 4, 0);
       CHECK(batches == 1 || batches == 4);
-      m.ShuffleAndSetWeights(weights_data, input_depth, output_depth);
+      m.ShuffleAndSetWeights<uint8_t>(weights_data, input_depth, output_depth);
       break;
     default:
       LOG(FATAL) << "Unhandled weights format";
@@ -522,7 +610,7 @@ void SimpleTestQuantizedInt16OutputCase(
   }
 
   m.SetBias(bias_data);
-  m.SetInput(input_data);
+  m.SetInput<uint8_t>(input_data);
 
   m.Invoke();
 
@@ -664,21 +752,21 @@ TEST_P(FloatFullyConnectedOpTest, SimpleTest4DInput) {
                              }));
 }
 
-TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantized) {
+TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantizedUint8) {
   QuantizedFullyConnectedOpModel m(
       GetRegistration(), /*units=*/3, /*batches=*/2,
       /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -63.5, 64},
       /*output=*/{TensorType_UINT8, {}, -127, 128});
 
   // input_product_scale < output_scale was not true.
-  m.SetWeights({
+  m.SetWeights<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
   });
   m.SetBias({1, 2, 3});
 
-  m.SetInput({
+  m.SetInput<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
       1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
   });
@@ -695,21 +783,21 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantized) {
 }
 
 TEST_P(QuantizedFullyConnectedOpTest,
-       SimpleTest4dInputQuantizedOutputMultiplierGreaterThan1) {
+       SimpleTest4dInputQuantizedOutputMultiplierGreaterThan1Uint8) {
   // real_multiplier = 2.
   QuantizedFullyConnectedOpModel m(
       GetRegistration(), /*units=*/3, /*batches=*/2,
       /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -127, 128},
       /*output=*/{TensorType_UINT8, {}, -63.5, 64});
 
-  m.SetWeights({
+  m.SetWeights<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
   });
   m.SetBias({1, 2, 3});
 
-  m.SetInput({
+  m.SetInput<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
       1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
   });
diff --git a/tensorflow/lite/kernels/gather.cc b/tensorflow/lite/kernels/gather.cc
index f205daae1343cb0abecc95e7d1b280c10f55d897..54d05adbcf161a2af88bea4a0de1eec06e70c09a 100644
--- a/tensorflow/lite/kernels/gather.cc
+++ b/tensorflow/lite/kernels/gather.cc
@@ -57,6 +57,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   switch (input->type) {
     case kTfLiteFloat32:
     case kTfLiteUInt8:
+    case kTfLiteInt8:
     case kTfLiteInt64:
     case kTfLiteInt32:
       break;
@@ -135,6 +136,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         return Gather<float, int32_t>(*params, input, positions, output);
       case kTfLiteUInt8:
         return Gather<uint8_t, int32_t>(*params, input, positions, output);
+      case kTfLiteInt8:
+        return Gather<int8_t, int32_t>(*params, input, positions, output);
       case kTfLiteInt32:
         return Gather<int32_t, int32_t>(*params, input, positions, output);
       case kTfLiteInt64:
@@ -153,6 +156,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         return Gather<float, int64_t>(*params, input, positions, output);
       case kTfLiteUInt8:
         return Gather<uint8_t, int64_t>(*params, input, positions, output);
+      case kTfLiteInt8:
+        return Gather<int8_t, int64_t>(*params, input, positions, output);
       case kTfLiteInt32:
         return Gather<int32_t, int64_t>(*params, input, positions, output);
       case kTfLiteInt64:
diff --git a/tensorflow/lite/kernels/gather_nd.cc b/tensorflow/lite/kernels/gather_nd.cc
new file mode 100644
index 0000000000000000000000000000000000000000..20e98652ee57ec7b6b86a20cbc474b4b9c29b2aa
--- /dev/null
+++ b/tensorflow/lite/kernels/gather_nd.cc
@@ -0,0 +1,154 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace gather_nd {
+constexpr int kParams = 0;
+constexpr int kIndices = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* params = GetInput(context, node, kParams);
+  const TfLiteTensor* indices = GetInput(context, node, kIndices);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (params->type) {
+    case kTfLiteFloat32:
+    case kTfLiteUInt8:
+    case kTfLiteInt8:
+    case kTfLiteInt64:
+    case kTfLiteInt32:
+      break;
+    default:
+      context->ReportError(
+          context, "Params of type '%s' are not supported by gather_nd.",
+          TfLiteTypeGetName(params->type));
+      return kTfLiteError;
+  }
+  switch (indices->type) {
+    case kTfLiteInt64:
+    case kTfLiteInt32:
+      break;
+    default:
+      context->ReportError(
+          context, "Indices of type '%s' are not supported by gather_nd.",
+          TfLiteTypeGetName(indices->type));
+      return kTfLiteError;
+  }
+
+  const int params_rank = NumDimensions(params);
+  const int indices_rank = NumDimensions(indices);
+  const int indices_nd = SizeOfDimension(indices, indices_rank - 1);
+  if (params_rank < 1) {
+    context->ReportError(context, "Params must be at least a vector.");
+    return kTfLiteError;
+  }
+  if (indices_rank < 1) {
+    context->ReportError(context, "Indices must be at least a vector.");
+    return kTfLiteError;
+  }
+  if (indices_nd > params_rank) {
+    context->ReportError(
+        context, "Index innermost dimension length must be <= params rank.");
+    return kTfLiteError;
+  }
+
+  // Assign to output the input type.
+  output->type = params->type;
+
+  // The result shape is
+  // indices.shape[:-1] + params.shape[indices.shape[-1]:]
+  const int output_rank = indices_rank + params_rank - indices_nd - 1;
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(output_rank);
+  int output_index = 0;
+  for (int i = 0; i < indices_rank - 1; ++i) {
+    output_shape->data[output_index++] = indices->dims->data[i];
+  }
+  for (int i = indices_nd; i < params_rank; ++i) {
+    output_shape->data[output_index++] = params->dims->data[i];
+  }
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+template <typename ParamsT, typename IndicesT>
+TfLiteStatus GatherNd(const TfLiteTensor* params, const TfLiteTensor* indices,
+                      TfLiteTensor* output) {
+  reference_ops::GatherNd(
+      GetTensorShape(params), GetTensorData<ParamsT>(params),
+      GetTensorShape(indices), GetTensorData<IndicesT>(indices),
+      GetTensorShape(output), GetTensorData<ParamsT>(output));
+  return kTfLiteOk;
+}
+
+template <typename IndicesT>
+TfLiteStatus EvalGatherNd(TfLiteContext* context, const TfLiteTensor* params,
+                          const TfLiteTensor* indices, TfLiteTensor* output) {
+  switch (params->type) {
+    case kTfLiteFloat32:
+      return GatherNd<float, IndicesT>(params, indices, output);
+    case kTfLiteUInt8:
+      return GatherNd<uint8_t, IndicesT>(params, indices, output);
+    case kTfLiteInt8:
+      return GatherNd<int8_t, IndicesT>(params, indices, output);
+    case kTfLiteInt32:
+      return GatherNd<int32_t, IndicesT>(params, indices, output);
+    case kTfLiteInt64:
+      return GatherNd<int64_t, IndicesT>(params, indices, output);
+    default:
+      context->ReportError(context,
+                           "Params type '%s' are not supported by gather_nd.",
+                           TfLiteTypeGetName(params->type));
+      return kTfLiteError;
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* params = GetInput(context, node, kParams);
+  const TfLiteTensor* indices = GetInput(context, node, kIndices);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (indices->type) {
+    case kTfLiteInt32:
+      return EvalGatherNd<int32_t>(context, params, indices, output);
+    case kTfLiteInt64:
+      return EvalGatherNd<int64_t>(context, params, indices, output);
+    default:
+      context->ReportError(
+          context, "Indices of type '%s' are not supported by gather_nd.",
+          TfLiteTypeGetName(indices->type));
+      return kTfLiteError;
+  }
+}
+}  // namespace gather_nd
+
+TfLiteRegistration* Register_GATHER_ND() {
+  static TfLiteRegistration r = {/*init*/ nullptr, /*free*/ nullptr,
+                                 gather_nd::Prepare, gather_nd::Eval};
+  return &r;
+}
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/gather_nd_test.cc b/tensorflow/lite/kernels/gather_nd_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a5e93efb8ff468f9e1cd6d2cd8c4343c0fe62e79
--- /dev/null
+++ b/tensorflow/lite/kernels/gather_nd_test.cc
@@ -0,0 +1,323 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class GatherNdOpModel : public SingleOpModel {
+ public:
+  GatherNdOpModel(const TensorData& params, const TensorData& indices) {
+    params_ = AddInput(params);
+    indices_ = AddInput(indices);
+    output_ = AddOutput(params.type);
+    SetBuiltinOp(BuiltinOperator_GATHER_ND, BuiltinOptions_GatherNdOptions,
+                 CreateGatherNdOptions(builder_).Union());
+    BuildInterpreter({GetShape(params_), GetShape(indices_)});
+  }
+
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(params_, data);
+  }
+
+  template <typename T>
+  void SetPositions(std::initializer_list<T> data) {
+    PopulateTensor<T>(indices_, data);
+  }
+
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int params_;
+  int indices_;
+  int output_;
+};
+
+TEST(GatherNdOpTest, ElementIndexingIntoMatrix) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({0, 0, 1, 1});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({1.1, 2.2}));
+}
+
+TEST(GatherNdOpTest, SliceIndexingIntoMatrix) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {2, 1}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({2.1, 2.2, 1.1, 1.2}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoMatrix1) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}},
+                    {TensorType_INT32, {2, 1, 1}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({2.1, 2.2, 1.1, 1.2}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoMatrix2) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}},
+                    {TensorType_INT32, {2, 1, 2}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({0, 0, 1, 1});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({1.1, 2.2}));
+}
+
+TEST(GatherNdOpTest, DuplicateIndexingIntoMatrix) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({0, 0, 0, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({1.1, 1.1}));
+}
+
+TEST(GatherNdOpTest, ElementIndexingIntoRank3Tensor) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {1, 2, 3}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 0, 1, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({-1.2, -4.1}));
+}
+
+TEST(GatherNdOpTest, SliceIndexingIntoRank3Tensor) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 1}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 2});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({1.1, -1.2, 1.3, -2.1, 2.2, 2.3, 5.1, -5.2, 5.3,
+                                6.1, -6.2, 6.3}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoRank3Tensor1) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 1, 3}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 0, 1, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({-1.2, -4.1}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoRank3Tensor2) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 1, 1}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({3.1, 3.2, -3.3, -4.1, -4.2, 4.3, 1.1, -1.2, 1.3,
+                                -2.1, 2.2, 2.3}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoRank3Tensor3) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 2, 2}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 1, 1, 0, 0, 0, 2, 1});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({-2.1, 2.2, 2.3, 3.1, 3.2, -3.3, 1.1, -1.2, 1.3,
+                                6.1, -6.2, 6.3}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoRank3Tensor4) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 2, 3}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 0, 1, 1, 0, 1, 1, 1, 2, 2, 1, 2});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({-1.2, 3.2, 4.3, 6.3}));
+}
+
+TEST(GatherNdOpTest, DuplicateIndexingIntoRank3Tensor) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 2}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 1, 0, 1});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({-2.1, 2.2, 2.3, -2.1, 2.2, 2.3}));
+}
+
+TEST(GatherNdOpTest, Float32Int32) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 2}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({-2.1, 2.2, 2.3, 3.1, 3.2, -3.3}));
+}
+
+TEST(GatherNdOpTest, Float32Int64) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT64, {2, 2}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({-2.1, 2.2, 2.3, 3.1, 3.2, -3.3}));
+}
+
+TEST(GatherNdOpTest, Int32Int32) {
+  GatherNdOpModel m({TensorType_INT32, {3, 2, 3}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<int32_t>({1, -1, 1, -2, 2, 2,   //
+                       3, 3, -3, -4, -4, 4,  //
+                       5, -5, 5, 6, -6, 6});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int32_t>(), ElementsAreArray({-2, 2, 2, 3, 3, -3}));
+}
+
+TEST(GatherNdOpTest, Int32Int64) {
+  GatherNdOpModel m({TensorType_INT32, {3, 2, 3}}, {TensorType_INT64, {2, 2}});
+  m.SetInput<int32_t>({1, -1, 1, -2, 2, 2,   //
+                       3, 3, -3, -4, -4, 4,  //
+                       5, -5, 5, 6, -6, 6});
+  m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int32_t>(), ElementsAreArray({-2, 2, 2, 3, 3, -3}));
+}
+
+TEST(GatherNdOpTest, Uint8Int32) {
+  GatherNdOpModel m({TensorType_UINT8, {3, 2, 3}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<uint8_t>({1, 1, 1, 2, 2, 2,  //
+                       3, 3, 3, 4, 4, 4,  //
+                       5, 5, 5, 6, 6, 6});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({2, 2, 2, 3, 3, 3}));
+}
+
+TEST(GatherNdOpTest, Uint8Int64) {
+  GatherNdOpModel m({TensorType_UINT8, {3, 2, 3}}, {TensorType_INT64, {2, 2}});
+  m.SetInput<uint8_t>({1, 1, 1, 2, 2, 2,  //
+                       3, 3, 3, 4, 4, 4,  //
+                       5, 5, 5, 6, 6, 6});
+  m.SetPositions<int64_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({2, 2, 2, 3, 3, 3}));
+}
+
+TEST(GatherNdOpTest, Int8Int32) {
+  GatherNdOpModel m({TensorType_INT8, {3, 2, 3}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<int8_t>({1, -1, 1, -2, 2, 2,   //
+                      3, 3, -3, -4, -4, 4,  //
+                      5, -5, 5, 6, -6, 6});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({-2, 2, 2, 3, 3, -3}));
+}
+
+TEST(GatherNdOpTest, Int8Int64) {
+  GatherNdOpModel m({TensorType_INT8, {3, 2, 3}}, {TensorType_INT64, {2, 2}});
+  m.SetInput<int8_t>({1, -1, 1, -2, 2, 2,   //
+                      3, 3, -3, -4, -4, 4,  //
+                      5, -5, 5, 6, -6, 6});
+  m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({-2, 2, 2, 3, 3, -3}));
+}
+
+TEST(GatherNdOpTest, Int64Int32) {
+  GatherNdOpModel m({TensorType_INT64, {3, 2, 3}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<int64_t>({1LL, -1LL, 1LL, -2LL, 2LL, 2LL,   //
+                       3LL, 3LL, -3LL, -4LL, -4LL, 4LL,  //
+                       5LL, -5LL, 5LL, 6LL, -6LL, 6LL});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int64_t>(),
+              ElementsAreArray({-2LL, 2LL, 2LL, 3LL, 3LL, -3LL}));
+}
+
+TEST(GatherNdOpTest, Int64Int64) {
+  GatherNdOpModel m({TensorType_INT64, {3, 2, 3}}, {TensorType_INT64, {2, 2}});
+  m.SetInput<int64_t>({1LL, -1LL, 1LL, -2LL, 2LL, 2LL,   //
+                       3LL, 3LL, -3LL, -4LL, -4LL, 4LL,  //
+                       5LL, -5LL, 5LL, 6LL, -6LL, 6LL});
+  m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int64_t>(),
+              ElementsAreArray({-2LL, 2LL, 2LL, 3LL, 3LL, -3LL}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/gather_test.cc b/tensorflow/lite/kernels/gather_test.cc
index 7b5f84348903a3cc436f1bd6cf32b3175b2f5815..b5461c204f3f11cdcc6f54cdc1b0aff40265ce54 100644
--- a/tensorflow/lite/kernels/gather_test.cc
+++ b/tensorflow/lite/kernels/gather_test.cc
@@ -205,6 +205,24 @@ TEST(TypesGatherOpTest, Uint8Int64) {
   EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({14, 15, 133, 134}));
 }
 
+TEST(TypesGatherOpTest, Int8Int32) {
+  GatherOpModel m({TensorType_INT8, {2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<int8_t>({-13, -120, 14, 15});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({14, 15, -13, -120}));
+}
+
+TEST(TypesGatherOpTest, Int8Int64) {
+  GatherOpModel m({TensorType_INT8, {2, 2}}, {TensorType_INT64, {2}});
+  m.SetInput<int8_t>({-13, -120, 14, 15});
+  m.SetPositions<int64_t>({1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({14, 15, -13, -120}));
+}
+
 TEST(TypesGatherOpTest, Int64Int32) {
   GatherOpModel m({TensorType_INT64, {2, 2}}, {TensorType_INT32, {2}});
   m.SetInput<int64_t>({-(1LL << 34), 134LL, 14LL, 15LL});
diff --git a/tensorflow/lite/kernels/if.cc b/tensorflow/lite/kernels/if.cc
index a814ac997d1f1891b77dcdb3055f0e50df4aea1c..1bd394e980073b73674ca972d28fafe04f7b8adf 100644
--- a/tensorflow/lite/kernels/if.cc
+++ b/tensorflow/lite/kernels/if.cc
@@ -153,6 +153,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // in `Prepare`, so we don't need to do it here again.
   TF_LITE_ENSURE_OK(context, active_branch_subgraph.Invoke());
 
+  for (int tensor_index : active_branch_subgraph.outputs()) {
+    active_branch_subgraph.EnsureTensorDataIsReadable(tensor_index);
+  }
+
   bool has_dynamic_output_tensors = false;
   for (int i = 0; i < node->outputs->size; ++i) {
     TfLiteTensor* output = GetOutput(context, node, i);
diff --git a/tensorflow/lite/kernels/if_test.cc b/tensorflow/lite/kernels/if_test.cc
index 80e4ef133a090c7fe7e909d432d6a8ccb975d87a..0f90db131b0bc335b54f4f8c24fa5d8dd02862f4 100644
--- a/tensorflow/lite/kernels/if_test.cc
+++ b/tensorflow/lite/kernels/if_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,207 +12,42 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+
 #include <gtest/gtest.h>
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/subgraph_test_util.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/model.h"
 
 namespace tflite {
-namespace ops {
-namespace builtin {
-// ADD and MUL are used to test simple branch.
-TfLiteRegistration* Register_ADD();
-TfLiteRegistration* Register_MUL();
-// ADD and MUL are used to test dynamic sized subgraphs.
-TfLiteRegistration* Register_PAD();
-}  // namespace builtin
-namespace custom {
-TfLiteRegistration* Register_IF();
-}  // namespace custom
-}  // namespace ops
-
-namespace {
-
-void SetupTensor(Subgraph* subgraph, int tensor_index,
-                 const std::vector<int>& shape, TfLiteType type) {
-  ASSERT_EQ(subgraph->SetTensorParametersReadWrite(
-                tensor_index, type, "", shape.size(), shape.data(), {}, false),
-            kTfLiteOk);
-}
-
-// TODO(ycling): Consider to move all the test helper functions to another
-// build target (e.g. subgraph_test_util).
-// Build a subgraph with an add op. Helper function for testing.
-void BuildAddSubgraph(Subgraph* subgraph) {
-  int first_new_tensor_index;
-  ASSERT_EQ(subgraph->AddTensors(3, &first_new_tensor_index), kTfLiteOk);
-  ASSERT_EQ(first_new_tensor_index, 0);
-  ASSERT_EQ(subgraph->SetInputs({0, 1}), kTfLiteOk);
-  ASSERT_EQ(subgraph->SetOutputs({2}), kTfLiteOk);
-
-  SetupTensor(subgraph, 0, {2}, kTfLiteInt32);
-  SetupTensor(subgraph, 1, {1, 2}, kTfLiteInt32);
-  // Intentionally set the wrong output size for testing. This should be
-  // overridden by Prepare function.
-  SetupTensor(subgraph, 2, {100}, kTfLiteInt32);
-
-  TfLiteAddParams* params =
-      reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
-  params->activation = kTfLiteActNone;
-  int node_index;
-  subgraph->AddNodeWithParameters({0, 1}, {2}, nullptr, 0, params,
-                                  ::tflite::ops::builtin::Register_ADD(),
-                                  &node_index);
-}
-
-// Build a subgraph with an mul op. Helper function for testing.
-void BuildMulSubgraph(Subgraph* subgraph) {
-  int first_new_tensor_index;
-  ASSERT_EQ(subgraph->AddTensors(3, &first_new_tensor_index), kTfLiteOk);
-  ASSERT_EQ(first_new_tensor_index, 0);
-  ASSERT_EQ(subgraph->SetInputs({0, 1}), kTfLiteOk);
-  ASSERT_EQ(subgraph->SetOutputs({2}), kTfLiteOk);
-
-  SetupTensor(subgraph, 0, {2}, kTfLiteInt32);
-  SetupTensor(subgraph, 1, {1, 2}, kTfLiteInt32);
-  // Intentionally set the wrong output size for testing. This should be
-  // overridden by Prepare function.
-  SetupTensor(subgraph, 2, {100}, kTfLiteInt32);
 
-  TfLiteMulParams* params =
-      reinterpret_cast<TfLiteMulParams*>(malloc(sizeof(TfLiteMulParams)));
-  params->activation = kTfLiteActNone;
-  int node_index;
-  subgraph->AddNodeWithParameters({0, 1}, {2}, nullptr, 0, params,
-                                  ::tflite::ops::builtin::Register_MUL(),
-                                  &node_index);
-}
-
-// Build a subgraph with a pad op. Helper function for testing.
-void BuildPadSubgraph(Subgraph* subgraph) {
-  int first_new_tensor_index;
-  ASSERT_EQ(subgraph->AddTensors(3, &first_new_tensor_index), kTfLiteOk);
-  ASSERT_EQ(first_new_tensor_index, 0);
-  ASSERT_EQ(subgraph->SetInputs({0, 1}), kTfLiteOk);
-  ASSERT_EQ(subgraph->SetOutputs({2}), kTfLiteOk);
-
-  SetupTensor(subgraph, 0, {2}, kTfLiteInt32);
-  SetupTensor(subgraph, 1, {1, 2}, kTfLiteInt32);
-  // Intentionally set the wrong output size for testing. This should be
-  // overridden by Prepare function.
-  SetupTensor(subgraph, 2, {100}, kTfLiteInt32);
-
-  TfLitePadParams* params =
-      reinterpret_cast<TfLitePadParams*>(malloc(sizeof(TfLitePadParams)));
-  int node_index;
-  subgraph->AddNodeWithParameters({0, 1}, {2}, nullptr, 0, params,
-                                  ::tflite::ops::builtin::Register_PAD(),
-                                  &node_index);
-}
-
-void BuildIfSubgraph(Subgraph* subgraph) {
-  int first_new_tensor_index;
-  ASSERT_EQ(subgraph->AddTensors(4, &first_new_tensor_index), kTfLiteOk);
-  ASSERT_EQ(first_new_tensor_index, 0);
-  ASSERT_EQ(subgraph->SetInputs({0, 1, 2}), kTfLiteOk);
-  ASSERT_EQ(subgraph->SetOutputs({3}), kTfLiteOk);
-
-  SetupTensor(subgraph, 0, {1}, kTfLiteBool);
-  SetupTensor(subgraph, 1, {2}, kTfLiteInt32);
-  SetupTensor(subgraph, 2, {1, 2}, kTfLiteInt32);
-  // Intentionally set the wrong output size for testing. This should be
-  // overridden by Prepare function.
-  SetupTensor(subgraph, 3, {100}, kTfLiteInt32);
-
-  flexbuffers::Builder fbb;
-  fbb.Map([&]() {
-    fbb.Int("then_subgraph_index", 1);
-    fbb.Int("else_subgraph_index", 2);
-  });
-  fbb.Finish();
-  const auto& buffer = fbb.GetBuffer();
-
-  int node_index;
-  subgraph->AddNodeWithParameters(
-      {0, 1, 2}, {3}, reinterpret_cast<const char*>(buffer.data()),
-      buffer.size(), nullptr, ::tflite::ops::custom::Register_IF(),
-      &node_index);
-}
+using subgraph_test_util::CheckIntTensor;
+using subgraph_test_util::ControlFlowOpTest;
+using subgraph_test_util::FillIntTensor;
 
-void FillIntTensor(TfLiteTensor* tensor, const std::vector<int32_t>& data) {
-  int count = NumElements(tensor);
-  ASSERT_EQ(count, data.size());
-  for (int i = 0; i < count; ++i) {
-    tensor->data.i32[i] = data[i];
-  }
-}
-
-void CheckIntTensor(const TfLiteTensor* tensor, const std::vector<int>& shape,
-                    const std::vector<int32_t>& data) {
-  ASSERT_EQ(tensor->dims->size, shape.size());
-  for (int i = 0; i < tensor->dims->size; ++i) {
-    ASSERT_EQ(tensor->dims->data[i], shape[i]);
-  }
-  ASSERT_EQ(tensor->type, kTfLiteInt32);
-  int count = NumElements(tensor);
-  ASSERT_EQ(count, data.size());
-  for (int i = 0; i < count; ++i) {
-    EXPECT_EQ(tensor->data.i32[i], data[i]);
-  }
-}
-
-// TestHelperfunctionTest tests the helper functions defined in this file.
-TEST(TestHelperfunctionTest, TestBuildAddSubgraph) {
-  std::unique_ptr<Interpreter> interpreter(new Interpreter);
-  BuildAddSubgraph(&interpreter->primary_subgraph());
-  ASSERT_EQ(interpreter->AllocateTensors(), kTfLiteOk);
-  FillIntTensor(interpreter->tensor(interpreter->inputs()[0]), {5, 7});
-  FillIntTensor(interpreter->tensor(interpreter->inputs()[1]), {1, 2});
-  ASSERT_EQ(interpreter->Invoke(), kTfLiteOk);
-  TfLiteTensor* output = interpreter->tensor(interpreter->outputs()[0]);
-  CheckIntTensor(output, {1, 2}, {6, 9});
-}
-
-TEST(TestHelperfunctionTest, TestBuildMulSubgraph) {
-  std::unique_ptr<Interpreter> interpreter(new Interpreter);
-  BuildMulSubgraph(&interpreter->primary_subgraph());
-  ASSERT_EQ(interpreter->AllocateTensors(), kTfLiteOk);
-  FillIntTensor(interpreter->tensor(interpreter->inputs()[0]), {5, 7});
-  FillIntTensor(interpreter->tensor(interpreter->inputs()[1]), {1, 2});
-  ASSERT_EQ(interpreter->Invoke(), kTfLiteOk);
-  TfLiteTensor* output = interpreter->tensor(interpreter->outputs()[0]);
-  CheckIntTensor(output, {1, 2}, {5, 14});
-}
-
-TEST(TestHelperfunctionTest, TestBuildPadSubgraph) {
-  std::unique_ptr<Interpreter> interpreter(new Interpreter);
-  BuildPadSubgraph(&interpreter->primary_subgraph());
-  ASSERT_EQ(interpreter->AllocateTensors(), kTfLiteOk);
-  FillIntTensor(interpreter->tensor(interpreter->inputs()[0]), {5, 7});
-  FillIntTensor(interpreter->tensor(interpreter->inputs()[1]), {1, 2});
-  ASSERT_EQ(interpreter->Invoke(), kTfLiteOk);
-  TfLiteTensor* output = interpreter->tensor(interpreter->outputs()[0]);
-  CheckIntTensor(output, {5}, {0, 5, 7, 0, 0});
-}
+namespace {
 
 // A simple test that performs `ADD` if condition is true, and `MUL` otherwise.
 // The computation is: `cond ? a + b : a * b`.
-class SimpleIfTest : public ::testing::Test {
+class SimpleIfTest : public ControlFlowOpTest {
  protected:
   void SetUp() override {
-    interpreter_.reset(new Interpreter);
     interpreter_->AddSubgraphs(2);
-    BuildAddSubgraph(interpreter_->subgraph(1));
-    BuildMulSubgraph(interpreter_->subgraph(2));
-    BuildIfSubgraph(&interpreter_->primary_subgraph());
+    builder_->BuildAddSubgraph(interpreter_->subgraph(1));
+    builder_->BuildMulSubgraph(interpreter_->subgraph(2));
+    builder_->BuildIfSubgraph(&interpreter_->primary_subgraph());
+
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {2});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[2], {1, 2});
     ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
     FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {5, 7});
     FillIntTensor(interpreter_->tensor(interpreter_->inputs()[2]), {1, 2});
   }
-  std::unique_ptr<Interpreter> interpreter_;
 };
 
 TEST_F(SimpleIfTest, TestIfTrue) {
@@ -231,19 +66,22 @@ TEST_F(SimpleIfTest, TestIfFalse) {
 
 // Test IF op using subgraphs with dynamically sized outputs.
 // The computation is: `cond ? a + b : pad(a, b)`.
-class DynamicSubgraphIfTest : public ::testing::Test {
+class DynamicSubgraphIfTest : public ControlFlowOpTest {
  protected:
   void SetUp() override {
-    interpreter_.reset(new Interpreter);
     interpreter_->AddSubgraphs(2);
-    BuildAddSubgraph(interpreter_->subgraph(1));
-    BuildPadSubgraph(interpreter_->subgraph(2));
-    BuildIfSubgraph(&interpreter_->primary_subgraph());
+    builder_->BuildAddSubgraph(interpreter_->subgraph(1));
+    builder_->BuildPadSubgraph(interpreter_->subgraph(2));
+    builder_->BuildIfSubgraph(&interpreter_->primary_subgraph());
+
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {2});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[2], {1, 2});
     ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
     FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {5, 7});
     FillIntTensor(interpreter_->tensor(interpreter_->inputs()[2]), {1, 2});
   }
-  std::unique_ptr<Interpreter> interpreter_;
 };
 
 TEST_F(DynamicSubgraphIfTest, TestIfTrue) {
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index cbe4c9d414027ae1a1a9311930e689bbde8bbf46..816b88d675ca3861e2042fdaedfe8a029d7a37aa 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -306,11 +306,17 @@ cc_library(
         "reference/depthwiseconv_float.h",
         "reference/depthwiseconv_uint8.h",
         "reference/fully_connected.h",
+        "reference/integer_ops/add.h",
         "reference/integer_ops/conv.h",
         "reference/integer_ops/depthwise_conv.h",
         "reference/integer_ops/dequantize.h",
+        "reference/integer_ops/fully_connected.h",
+        "reference/integer_ops/log_softmax.h",
+        "reference/integer_ops/logistic.h",
+        "reference/integer_ops/mul.h",
         "reference/integer_ops/pooling.h",
         "reference/integer_ops/softmax.h",
+        "reference/integer_ops/tanh.h",
         "reference/reference_ops.h",
         "reference/softmax.h",
     ],
@@ -584,14 +590,19 @@ cc_test(
 
 cc_test(
     name = "depthwiseconv_quantized_test",
-    srcs = ["depthwiseconv_quantized_test.cc"],
+    srcs = [
+        "depthwiseconv_quantized_test.cc",
+        "optimized/depthwiseconv_uint8_transitional.h",
+    ],
     shard_count = 2,
     deps = [
         ":optimized_base",
         ":reference_base",
         ":test_util",
         ":types",
+        "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
+        "@gemmlowp",
     ],
 )
 
@@ -642,7 +653,7 @@ cc_test(
     srcs = [
         "logsoftmax_quantized_test.cc",
     ],
-    shard_count = 3,
+    shard_count = 4,
     tags = [
         # TODO(b/122242739): Reenable after fixing the flakiness?
         "nomac",
diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h
index bc30ac91220906588f204d6ff21c275faa2b6c25..e00a3f405e071df7034da9c54770c17397a3670f 100644
--- a/tensorflow/lite/kernels/internal/common.h
+++ b/tensorflow/lite/kernels/internal/common.h
@@ -131,6 +131,221 @@ int CountLeadingZeros(T integer_input) {
 #endif
 }
 
+// TODO(b/77858996): Add these to gemmlowp.
+template <typename IntegerType>
+IntegerType SaturatingAddNonGemmlowp(IntegerType a, IntegerType b) {
+  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
+  return a;
+}
+
+template <>
+inline std::int32_t SaturatingAddNonGemmlowp(std::int32_t a, std::int32_t b) {
+  std::int64_t a64 = a;
+  std::int64_t b64 = b;
+  std::int64_t sum = a64 + b64;
+  return static_cast<std::int32_t>(std::min(
+      static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()),
+      std::max(
+          static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()),
+          sum)));
+}
+
+template <typename tRawType, int tIntegerBits>
+gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingAddNonGemmlowp(
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {
+  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
+      SaturatingAddNonGemmlowp(a.raw(), b.raw()));
+}
+
+template <typename IntegerType>
+IntegerType SaturatingSub(IntegerType a, IntegerType b) {
+  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
+  return a;
+}
+
+template <>
+inline std::int16_t SaturatingSub(std::int16_t a, std::int16_t b) {
+  std::int32_t a32 = a;
+  std::int32_t b32 = b;
+  std::int32_t diff = a32 - b32;
+  return static_cast<std::int16_t>(std::min(32767, std::max(-32768, diff)));
+}
+
+template <>
+inline std::int32_t SaturatingSub(std::int32_t a, std::int32_t b) {
+  std::int64_t a64 = a;
+  std::int64_t b64 = b;
+  std::int64_t diff = a64 - b64;
+  return static_cast<std::int32_t>(std::min(
+      static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()),
+      std::max(
+          static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()),
+          diff)));
+}
+
+template <typename tRawType, int tIntegerBits>
+gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub(
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {
+  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
+      SaturatingSub(a.raw(), b.raw()));
+}
+// End section to be moved to gemmlowp.
+
+template <typename IntegerType>
+IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) {
+  if (exponent == 0) {
+    return x;
+  }
+  using ScalarIntegerType =
+      typename gemmlowp::FixedPointRawTypeTraits<IntegerType>::ScalarRawType;
+  const IntegerType min =
+      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::min());
+  const IntegerType max =
+      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::max());
+  const int ScalarIntegerTypeBits = 8 * sizeof(ScalarIntegerType);
+
+  const std::int32_t threshold =
+      ((1 << (ScalarIntegerTypeBits - 1 - exponent)) - 1);
+  const IntegerType positive_mask =
+      gemmlowp::MaskIfGreaterThan(x, gemmlowp::Dup<IntegerType>(threshold));
+  const IntegerType negative_mask =
+      gemmlowp::MaskIfLessThan(x, gemmlowp::Dup<IntegerType>(-threshold));
+
+  IntegerType result = gemmlowp::ShiftLeft(x, exponent);
+  result = gemmlowp::SelectUsingMask(positive_mask, max, result);
+  result = gemmlowp::SelectUsingMask(negative_mask, min, result);
+  return result;
+}
+
+// If we want to leave IntegerBits fixed, then multiplication
+// by a power of two has to be saturating/rounding, not exact anymore.
+template <typename tRawType, int tIntegerBits>
+gemmlowp::FixedPoint<tRawType, tIntegerBits>
+SaturatingRoundingMultiplyByPOTParam(
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) {
+  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
+      SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
+}
+
+// Minimum output bits to accommodate log of maximum input range.  It actually
+// does not matter if one considers, say, [-64,64] or [-64,64).
+//
+// For example, run this through Octave:
+// [0:127; ...
+//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
+//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
+constexpr int min_log_x_output_bits(int input_bits) {
+  return input_bits > 90
+             ? 7
+             : input_bits > 44
+                   ? 6
+                   : input_bits > 21
+                         ? 5
+                         : input_bits > 10
+                               ? 4
+                               : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
+}
+
+// Although currently the name of this function says that it cannot handle
+// values less than 1, in practice it can handle as low as 1/x_max, where
+// x_max is the largest representable input.  In other words, the output range
+// is symmetric.
+template <int OutputIntegerBits, int InputIntegerBits>
+inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
+log_x_for_x_greater_than_or_equal_to_1_impl(
+    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
+  // assert(__builtin_clz(0u) >= std::numeric_limits<uint32>::digits - 1);
+  // assert(__builtin_clz(0u) <= std::numeric_limits<uint32>::digits);
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+  // The reason for accumulating the result with an extra bit of headroom is
+  // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
+  // recip_denom will otherwise introduce an error.
+  static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>;
+
+  const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1488522236, std::log(2.0));
+  const FixedPoint0 sqrt_sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1805811301, std::sqrt(std::sqrt(0.5)));
+  const FixedPoint0 sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1518500250, std::sqrt(0.5));
+  const FixedPoint0 one_quarter =
+      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(FixedPoint0, 536870912, 1.0 / 4.0);
+
+  const FixedPoint0 alpha_n = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 117049297, 11.0 / 240.0 * std::sqrt(std::sqrt(2.0)));
+  const FixedPoint0 alpha_d = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 127690142, 1.0 / 20.0 * std::sqrt(std::sqrt(2.0)));
+  const FixedPoint0 alpha_i = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1057819769,
+      2.0 / std::sqrt(std::sqrt(2.0)) - std::sqrt(std::sqrt(2.0)));
+  const FixedPoint0 alpha_f = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 638450708, 1.0 / 4.0 * std::sqrt(std::sqrt(2.0)));
+
+  const FixedPointAccum shifted_quarter =
+      gemmlowp::Rescale<kAccumIntegerBits>(one_quarter);
+
+  // Reinterpret the input value as Q0.31, because we will figure out the
+  // required shift "ourselves" instead of using, say, Rescale.
+  FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
+  // z_a_pow_2 = input_integer_bits - z_a_headroom;
+  int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw()));
+  FixedPoint0 r_a_tmp =
+      SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
+  const int32 r_a_raw =
+      SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
+  // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
+  // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
+  //                   InputIntegerBits - z_b_headroom - 0.25);
+  const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp(
+      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
+          InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)),
+      shifted_quarter);
+
+  // z_b is treated like z_a, but premultiplying by sqrt(0.5).
+  FixedPoint0 z_b = z_a * sqrt_half;
+  int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1;
+  const int32 r_b_raw =
+      SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
+  const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
+      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
+          InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)),
+      shifted_quarter);
+
+  const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw));
+  const FixedPointAccum z_pow_2_adj = FixedPointAccum::FromRaw(
+      std::max(z_a_pow_2_adj.raw(), z_b_pow_2_adj.raw()));
+
+  const FixedPoint0 p = gemmlowp::RoundingHalfSum(r, sqrt_sqrt_half);
+  FixedPoint0 q = r - sqrt_sqrt_half;
+  q = q + q;
+
+  const FixedPoint0 common_sq = q * q;
+  const FixedPoint0 num = q * r + q * common_sq * alpha_n;
+  const FixedPoint0 denom_minus_one_0 =
+      p * (alpha_i + q + alpha_d * common_sq) + alpha_f * q;
+  const FixedPoint0 recip_denom =
+      one_over_one_plus_x_for_x_in_0_1(denom_minus_one_0);
+
+  const FixedPointAccum num_scaled = gemmlowp::Rescale<kAccumIntegerBits>(num);
+  return gemmlowp::Rescale<OutputIntegerBits>(z_pow_2_adj * log_2 +
+                                              num_scaled * recip_denom);
+}
+
+template <int OutputIntegerBits, int InputIntegerBits>
+inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
+log_x_for_x_greater_than_or_equal_to_1(
+    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
+  static_assert(
+      OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits),
+      "Output integer bits must be sufficent to accommodate logs of inputs.");
+  return log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits,
+                                                     InputIntegerBits>(
+      input_val);
+}
+
 inline int32 GetReciprocal(int32 x, int x_integer_digits,
                            int* num_bits_over_unit) {
   int headroom_plus_one = CountLeadingZeros(static_cast<uint32>(x));
diff --git a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
index 3682499d494cc4e63712b6c57d80482899b2185d..5a0539064755d9cd93205d680723de3550a177b9 100644
--- a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <cstdlib>
 #include <iterator>
 #include <limits>
+#include <string>
 #include <vector>
 
 #include <gtest/gtest.h>
@@ -26,33 +27,82 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/types.h"
 
 #define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#include "absl/strings/substitute.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
 
 namespace tflite {
 namespace {
 
-enum class ForceKernelInvocation {
-  // Run all tests against kUseStandardEntry even if also testing another
-  // kernel, since we need to be sure that the main DepthwiseConv() function in
-  // optimized_ops.h dispatches to a correctly-executing kernel.
-  kNone = 0,  // The "default" option: use the normal DepthwiseConv
-              // kernel (entry) function.
-  kUseGenericKernel,
-  kUseNeon3x3,            // 3x3 kernel that uses NEON when available.
-  kUseNeon3x3DotProduct,  // 3x3 kernel that uses dot-product enabled NEON when
-                          // available.
+using optimized_ops::depthwise_conv::DotProduct3x3KernelType;
+using ::testing::Bool;
+using ::testing::Values;
+
+// Currently, this is used in place of a Boolean "is symmetric?".
+enum class ParamsSpecialization {
+  kNone = 0,
+  kSymmetric,  // Symmetric quantization: zero represented by 128.
+};
+
+static constexpr int kSymmetricZeroPoint = 128;
+
+// Extend coverage distribution in a specific aspect, either explicitly chosen
+// or randomly chosen as in a mixture distribution.
+enum class CoverageExtension {
+  kNone = 0,
+  kLargeHeights = 1,
+  kLargeWidths = 2,
+  kNumOptions
+};
+
+// The TestParam structure below is the preferred parameterization of tests. A
+// tuple version is defined in order to support value-parameterized tests.
+typedef std::tuple<DepthwiseConvImplementation, int, bool, bool, bool,
+                   DepthwiseConvOutputRounding, bool>
+    TestParamTuple;
+
+struct TestParam {
+  TestParam() = default;
+
+  explicit TestParam(TestParamTuple param_tuple)
+      : forced_invocation(::testing::get<0>(param_tuple)),
+        tests_to_run(::testing::get<1>(param_tuple)),
+        test_stride(::testing::get<2>(param_tuple)),
+        test_pad(::testing::get<3>(param_tuple)),
+        test_depth_multiplier(::testing::get<4>(param_tuple)),
+        output_rounding(::testing::get<5>(param_tuple)),
+        loose_tolerance(::testing::get<6>(param_tuple)) {}
+
+  static std::string TestNameSuffix(
+      const ::testing::TestParamInfo<TestParamTuple>& info) {
+    const TestParam param(info.param);
+    return absl::Substitute("invocation_$0_stride_$1_pad_$2_depth_mult_$3",
+                            static_cast<int>(param.forced_invocation),
+                            param.test_stride, param.test_pad,
+                            param.test_depth_multiplier);
+  }
+
+  DepthwiseConvImplementation forced_invocation =
+      DepthwiseConvImplementation::kNone;
+  int tests_to_run = 0;
+  bool test_stride = false;
+  bool test_pad = false;
+  bool test_depth_multiplier = false;
+  DepthwiseConvOutputRounding output_rounding =
+      DepthwiseConvOutputRounding::kNone;
+  bool loose_tolerance = false;
 };
 
 inline void DispatchDepthwiseConv(
-    ForceKernelInvocation forced_invocation, const DepthwiseParams& params,
+    const TestParam& test_param, const DepthwiseParams& params,
     const RuntimeShape& input_shape, const uint8* input_data,
     const RuntimeShape& filter_shape, const uint8* filter_data,
     const RuntimeShape& bias_shape, const int32* bias_data,
     const RuntimeShape& output_shape, uint8* output_data) {
-  switch (forced_invocation) {
-    case ForceKernelInvocation::kUseNeon3x3: {
+  switch (test_param.forced_invocation) {
+    case DepthwiseConvImplementation::kUseNeon3x3: {
 // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
 // Jetson TX-2. This compiler does not support the offsetof() macro.
 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
@@ -67,20 +117,20 @@ inline void DispatchDepthwiseConv(
 
       // Check that parameter combination is supported.
       const bool basic_3x3_kernel_supported =
-          optimized_ops::Fast3x3FilterKernelSupported(
+          optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported(
               input_shape, filter_shape, stride_width, stride_height,
               dilation_width_factor, dilation_height_factor, pad_width,
               pad_height, depth_multiplier, output_shape, output_shift);
       ASSERT_TRUE(basic_3x3_kernel_supported)
           << "pad_width = " << params.padding_values.width
           << " pad_height = " << params.padding_values.height
-          << " input_width = " << input_shape.Dims(1)
-          << " input_height = " << input_shape.Dims(2)
-          << " output_width = " << output_shape.Dims(1)
-          << " output_height = " << output_shape.Dims(2);
+          << " input_width = " << input_shape.Dims(2)
+          << " input_height = " << input_shape.Dims(1)
+          << " output_width = " << output_shape.Dims(2)
+          << " output_height = " << output_shape.Dims(1);
 
       // Call kernel optimized for depthwise convolutions using 3x3 filters.
-      optimized_ops::DepthwiseConv3x3Filter(
+      optimized_ops::depthwise_conv::DepthwiseConv3x3Filter(
           params, input_shape, input_data, filter_shape, filter_data,
           bias_shape, bias_data, output_shape, output_data);
       return;
@@ -88,56 +138,42 @@ inline void DispatchDepthwiseConv(
       break;
 #endif
     }
-    case ForceKernelInvocation::kUseNeon3x3DotProduct: {
-// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
-// Jetson TX-2. This compiler does not support the offsetof() macro.
-#if defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__) && \
-    !defined(GOOGLE_L4T)
-      using optimized_ops::DotProduct3x3KernelType;
-      DotProduct3x3KernelType kernel_type =
-          optimized_ops::CategorizeDotProductKernel(params);
-      switch (kernel_type) {
-        case DotProduct3x3KernelType::kPlain:
-          // TODO(b/118430534): Implement optimized kernel.
-          optimized_ops::DepthwiseConv3x3Filter(
-              params, input_shape, input_data, filter_shape, filter_data,
-              bias_shape, bias_data, output_shape, output_data);
-          return;
-        case DotProduct3x3KernelType::kWithDepthMultiplication:
-          // TODO(b/118430338): Implement optimized kernel.
-          optimized_ops::DepthwiseConvGeneral(
-              params, input_shape, input_data, filter_shape, filter_data,
-              bias_shape, bias_data, output_shape, output_data);
-          return;
-        case DotProduct3x3KernelType::kWithPad0Stride2:
-          // TODO(b/118430338): Implement optimized kernel.
-          optimized_ops::DepthwiseConv3x3Filter(
-              params, input_shape, input_data, filter_shape, filter_data,
-              bias_shape, bias_data, output_shape, output_data);
-          return;
-        case DotProduct3x3KernelType::kWithPad1Stride1:
-          // TODO(b/118430338): Implement optimized kernel.
-          optimized_ops::DepthwiseConvGeneral(
-              params, input_shape, input_data, filter_shape, filter_data,
-              bias_shape, bias_data, output_shape, output_data);
-          return;
-        case DotProduct3x3KernelType::kNone:
-        default:
-          break;
-      }
-#endif
+    case DepthwiseConvImplementation::kUseNeon3x3DotProduct:
+    case DepthwiseConvImplementation::kUseUnwound3x3DotProduct:
+    case DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct:
+      // TODO(b/118426582) Placeholder for future dispatches.
       break;
+    case DepthwiseConvImplementation::kUseCModel3x3DotProduct: {
+      DotProduct3x3KernelType kernel_type =
+          optimized_ops::depthwise_conv::CategorizeDotProductKernel(params);
+
+      ASSERT_TRUE(
+          kernel_type == DotProduct3x3KernelType::kPlain ||
+          kernel_type == DotProduct3x3KernelType::kStride2 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride2)
+          << "Kernel type = " << static_cast<int>(kernel_type);
+
+      optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3<
+          DepthwiseConvImplementation::kUseCModel3x3DotProduct>(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data);
+      return;
     }
-    case ForceKernelInvocation::kUseGenericKernel: {
-      optimized_ops::DepthwiseConvGeneral(params, input_shape, input_data,
-                                          filter_shape, filter_data, bias_shape,
-                                          bias_data, output_shape, output_data);
+    case DepthwiseConvImplementation::kUseGenericKernel: {
+      optimized_ops::depthwise_conv::DepthwiseConvGeneral(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data);
       return;
     }
-    case ForceKernelInvocation::kNone:
+    case DepthwiseConvImplementation::kNone:
     default:
       break;
   }
+  EXPECT_EQ(test_param.forced_invocation, DepthwiseConvImplementation::kNone)
+      << "TODO(b/118426582) requested kernel was not invoked / available yet";
   optimized_ops::DepthwiseConv(params, input_shape, input_data, filter_shape,
                                filter_data, bias_shape, bias_data, output_shape,
                                output_data);
@@ -145,7 +181,7 @@ inline void DispatchDepthwiseConv(
 
 // Runs the DepthwiseConv and compares against the reference implementation.
 int TestOneDepthwiseConvWithGivenOutputShift(
-    ForceKernelInvocation forced_invocation, const std::uint8_t* input_data,
+    const TestParam& test_param, const std::uint8_t* input_data,
     const RuntimeShape& input_shape, std::int32_t input_offset,
     const std::uint8_t* filter_data, const RuntimeShape& filter_shape,
     std::int32_t filter_offset, const std::int32_t* bias_data,
@@ -174,10 +210,31 @@ int TestOneDepthwiseConvWithGivenOutputShift(
   op_params.output_offset = output_offset;
   op_params.output_multiplier = output_multiplier;
   op_params.output_shift = -output_shift;
-  reference_ops::DepthwiseConv(op_params, input_shape, input_data, filter_shape,
-                               filter_data, bias_shape, bias_data, output_shape,
-                               reference_output_data.data());
-  DispatchDepthwiseConv(forced_invocation, op_params, input_shape, input_data,
+  switch (test_param.output_rounding) {
+    case DepthwiseConvOutputRounding::kUpward:
+      reference_ops::depthwise_conv::DepthwiseConvBasicKernel<
+          DepthwiseConvOutputRounding::kAwayFromZero>::Run(op_params,
+                                                           input_shape,
+                                                           input_data,
+                                                           filter_shape,
+                                                           filter_data,
+                                                           bias_shape,
+                                                           bias_data,
+                                                           output_shape,
+                                                           reference_output_data
+                                                               .data());
+      break;
+    case DepthwiseConvOutputRounding::kAwayFromZero:
+      reference_ops::DepthwiseConv(
+          op_params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, reference_output_data.data());
+      break;
+    case DepthwiseConvOutputRounding::kNone:
+    default:
+      EXPECT_NE(test_param.output_rounding, DepthwiseConvOutputRounding::kNone);
+      break;
+  }
+  DispatchDepthwiseConv(test_param, op_params, input_shape, input_data,
                         filter_shape, filter_data, bias_shape, bias_data,
                         output_shape, output_data.data());
   int saturated_min = 0;
@@ -201,15 +258,26 @@ int TestOneDepthwiseConvWithGivenOutputShift(
   const float mean_diff = static_cast<float>(sum_diff) / output_buffer_size;
   const float mean_abs_diff =
       static_cast<float>(sum_abs_diff) / output_buffer_size;
+
+  constexpr int diff_mean_tolerance = 1;
+  constexpr int diff_median_tolerance = 0;
+  // The tolerance that we apply to means is tight, but we allow for a rounding
+  // difference in one pixel, and loosen by another 1% for float comparison.
+  const float mean_tolerance =
+      std::max(1e-5f, 1.01f * 2.f / output_buffer_size *
+                          std::sqrt(1.f * depth_multiplier));
+
   // Normally we should require bit-for-bit exact results. Unfortunately a bug
   // in the Intel arm_neon_sse.h translation header that we use for x86 tests
-  // causes 1-bit inaccuracy in
-  // the vqrdmulh_n_s32 intrinsic, which causes off-by-1 errors in quantized
-  // DepthwiseConv ops. So we have to live with a few off-by-one errors for now,
-  // yet still ensure that no more than a small minority of values are wrong.
-  EXPECT_TRUE(std::abs(mean_diff) < 1e-5f && mean_abs_diff < 1e-5f &&
-              std::abs(median_diff) == 0 && std::abs(min_diff) <= 1 &&
-              std::abs(max_diff) <= 1);
+  // causes 1-bit inaccuracy in the vqrdmulh_n_s32 intrinsic, which causes
+  // off-by-1 errors in quantized DepthwiseConv ops. So we have to live with a
+  // few off-by-one errors for now, yet still ensure that no more than a small
+  // minority of values are wrong.
+  EXPECT_LT(std::abs(mean_diff), mean_tolerance);
+  EXPECT_LT(mean_abs_diff, mean_tolerance);
+  EXPECT_LE(std::abs(median_diff), diff_median_tolerance);
+  EXPECT_LE(std::abs(min_diff), diff_mean_tolerance);
+  EXPECT_LE(std::abs(max_diff), diff_mean_tolerance);
   if (saturated_min > 2 * saturated_max) {
     return -1;
   }
@@ -221,13 +289,12 @@ int TestOneDepthwiseConvWithGivenOutputShift(
 
 // The point of this function is that we can't practically know which
 // output_shift value to pass to test DepthwiseConv. It's not easy to guess (we
-// could do some
-// statistics for large size, but they would be fragile at smaller sizes), and
-// guessing wrong would mean that all the values get saturated so the test
-// becomes
-// vacuous. So we just bisect our way to reasonable output_shift values.
+// could do some statistics for large size, but they would be fragile at smaller
+// sizes), and guessing wrong would mean that all the values get saturated so
+// the test becomes vacuous. So we just bisect our way to reasonable
+// output_shift values.
 void TestOneDepthwiseConvBisectOutputShift(
-    ForceKernelInvocation forced_invocation, const std::uint8_t* input_data,
+    const TestParam& test_param, const std::uint8_t* input_data,
     const RuntimeShape& input_shape, std::int32_t input_offset,
     const std::uint8_t* filter_data, const RuntimeShape& filter_shape,
     std::int32_t filter_offset, const std::int32_t* bias_data,
@@ -242,7 +309,7 @@ void TestOneDepthwiseConvBisectOutputShift(
   int output_shift_bisect_midpoint =
       (output_activation_bisect_start + output_activation_bisect_end) / 2;
   int bisect_result = TestOneDepthwiseConvWithGivenOutputShift(
-      forced_invocation, input_data, input_shape, input_offset, filter_data,
+      test_param, input_data, input_shape, input_offset, filter_data,
       filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
       pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
       output_shift_bisect_midpoint, output_activation_min,
@@ -269,7 +336,7 @@ void TestOneDepthwiseConvBisectOutputShift(
                                              ? output_activation_bisect_end
                                              : output_shift_bisect_midpoint;
   TestOneDepthwiseConvBisectOutputShift(
-      forced_invocation, input_data, input_shape, input_offset, filter_data,
+      test_param, input_data, input_shape, input_offset, filter_data,
       filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
       pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
       new_output_activation_bisect_start, new_output_activation_bisect_end,
@@ -277,7 +344,7 @@ void TestOneDepthwiseConvBisectOutputShift(
 }
 
 void TestOneDepthwiseConv(
-    ForceKernelInvocation forced_invocation, const std::uint8_t* input_data,
+    const TestParam& test_param, const std::uint8_t* input_data,
     const RuntimeShape& input_shape, std::int32_t input_offset,
     const std::uint8_t* filter_data, const RuntimeShape& filter_shape,
     std::int32_t filter_offset, const std::int32_t* bias_data,
@@ -287,13 +354,14 @@ void TestOneDepthwiseConv(
     std::int32_t output_activation_min, std::int32_t output_activation_max,
     const RuntimeShape& output_shape) {
   TestOneDepthwiseConvBisectOutputShift(
-      forced_invocation, input_data, input_shape, input_offset, filter_data,
+      test_param, input_data, input_shape, input_offset, filter_data,
       filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
       pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
       0, 32, output_activation_min, output_activation_max, output_shape);
 }
 
-bool TryTestDepthwiseConv(ForceKernelInvocation forced_invocation, int batch,
+bool TryTestDepthwiseConv(const TestParam& test_param,
+                          ParamsSpecialization params_specialization, int batch,
                           int input_depth, int input_width, int input_height,
                           int filter_width, int filter_height,
                           int depth_multiplier, int stride,
@@ -318,9 +386,12 @@ bool TryTestDepthwiseConv(ForceKernelInvocation forced_invocation, int batch,
   }
   const std::int32_t output_multiplier =
       UniformRandomInt(1 << 29, std::numeric_limits<std::int32_t>::max());
-  const std::int32_t input_offset = UniformRandomInt(-256, 0);
-  const std::int32_t filter_offset = UniformRandomInt(-256, 0);
-  const std::int32_t output_offset = UniformRandomInt(-256, 0);
+  std::int32_t filter_offset = -kSymmetricZeroPoint;
+  if (params_specialization != ParamsSpecialization::kSymmetric) {
+    filter_offset = UniformRandomInt(-255, 0);
+  }
+  const std::int32_t input_offset = UniformRandomInt(-255, 0);
+  const std::int32_t output_offset = UniformRandomInt(0, 255);
   RuntimeShape input_shape_inference(
       {batch, input_height, input_width, input_depth});
   RuntimeShape output_shape_inference;
@@ -343,7 +414,7 @@ bool TryTestDepthwiseConv(ForceKernelInvocation forced_invocation, int batch,
   FillRandom(&filter_data);
   FillRandom(&bias_data, -10000, 10000);
   TestOneDepthwiseConv(
-      forced_invocation, input_data.data(), input_shape_inference, input_offset,
+      test_param, input_data.data(), input_shape_inference, input_offset,
       filter_data.data(), filter_shape_inference, filter_offset,
       bias_data.data(), bias_shape_inference, stride, padding_type, pad_width,
       pad_height, depth_multiplier, output_offset, output_multiplier,
@@ -355,7 +426,8 @@ bool TryTestDepthwiseConv(ForceKernelInvocation forced_invocation, int batch,
 // be legal. If they're not legal, it returns false. If they're legal,
 // it runs the DepthwiseConv test and returns true. This allows the caller
 // to loop until a test has been run.
-bool TryTestOneDepthwiseConv(ForceKernelInvocation forced_invocation) {
+bool TryTestOneDepthwiseConv(const TestParam& test_param,
+                             ParamsSpecialization params_specialization) {
   // We have to pick a lot of positive values, where we are particularly
   // interested in small values because they are most likely to be special
   // cases in optimized implementations, and secondarily because they allow
@@ -375,13 +447,14 @@ bool TryTestOneDepthwiseConv(ForceKernelInvocation forced_invocation) {
       UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid;
 
   return TryTestDepthwiseConv(
-      forced_invocation, batch, input_depth, input_width, input_height,
-      filter_width, filter_height, depth_multiplier, stride,
+      test_param, params_specialization, batch, input_depth, input_width,
+      input_height, filter_width, filter_height, depth_multiplier, stride,
       dilation_width_factor, dilation_height_factor, padding_type);
 }
 
 // Tests parameters for the 3x3 filter kernel.
-bool TryTestOneDepthwiseConv3x3Filter(ForceKernelInvocation forced_invocation) {
+bool TryTestOneDepthwiseConv3x3Filter(
+    const TestParam& test_param, ParamsSpecialization params_specialization) {
   const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
   const int input_depth = 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
   int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
@@ -397,7 +470,7 @@ bool TryTestOneDepthwiseConv3x3Filter(ForceKernelInvocation forced_invocation) {
       UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid;
 
   // Adjust for, or reject, special cases.
-  if (forced_invocation != ForceKernelInvocation::kNone) {
+  if (test_param.forced_invocation != DepthwiseConvImplementation::kNone) {
     // With stride == 2 and SAME, padding width and height are the left and top
     // padding amounts. When there is an even input dimension, padding + 1 is
     // required on the right / bottom. This is not handled by these kernels, so
@@ -416,59 +489,77 @@ bool TryTestOneDepthwiseConv3x3Filter(ForceKernelInvocation forced_invocation) {
   }
 
   return TryTestDepthwiseConv(
-      forced_invocation, batch, input_depth, input_width, input_height,
-      filter_width, filter_height, depth_multiplier, stride,
+      test_param, params_specialization, batch, input_depth, input_width,
+      input_height, filter_width, filter_height, depth_multiplier, stride,
       dilation_width_factor, dilation_height_factor, padding_type);
 }
 
 // Tests with parameters suited to dot-product-NEON 3x3 filter kernels.
-bool TryTestOneNeonDot3x3(ForceKernelInvocation forced_invocation,
-                          bool test_stride, bool test_pad,
-                          bool test_depth_multiplier) {
+bool TryTestOneNeonDot3x3(const TestParam& test_param,
+                          ParamsSpecialization params_specialization) {
+  const CoverageExtension coverage_extension = static_cast<CoverageExtension>(
+      UniformRandomInt(0, static_cast<int>(CoverageExtension::kNumOptions)));
+
   const int batch = 1;
-  const int input_depth = test_depth_multiplier
+  const int input_depth = test_param.test_depth_multiplier
                               ? 1
-                              : 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
-  const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
-  const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+                              : 8 * ExponentialRandomPositiveInt(0.9f, 3, 50);
+  const int input_width = coverage_extension == CoverageExtension::kLargeWidths
+                              ? ExponentialRandomPositiveInt(0.9f, 50, 200)
+                              : ExponentialRandomPositiveInt(0.9f, 20, 60);
+  const int input_height =
+      coverage_extension == CoverageExtension::kLargeHeights
+          ? ExponentialRandomPositiveInt(0.9f, 50, 200)
+          : ExponentialRandomPositiveInt(0.9f, 20, 60);
   const int filter_width = 3;
   const int filter_height = 3;
   const int depth_multiplier =
-      test_depth_multiplier ? 8 * ExponentialRandomPositiveInt(0.8f, 1, 6) : 1;
-  const int stride = test_stride ? 2 : 1;
+      test_param.test_depth_multiplier
+          ? 8 * ExponentialRandomPositiveInt(0.2f, 1, 9)
+          : 1;
+  const int stride = test_param.test_stride ? 2 : 1;
   // We don't support dilations in the 3x3 filter.
   const int dilation_width_factor = 1;
   const int dilation_height_factor = 1;
-  const auto padding_type = test_pad ? PaddingType::kSame : PaddingType::kValid;
+  const auto padding_type =
+      test_param.test_pad ? PaddingType::kSame : PaddingType::kValid;
 
   return TryTestDepthwiseConv(
-      forced_invocation, batch, input_depth, input_width, input_height,
-      filter_width, filter_height, depth_multiplier, stride,
+      test_param, params_specialization, batch, input_depth, input_width,
+      input_height, filter_width, filter_height, depth_multiplier, stride,
       dilation_width_factor, dilation_height_factor, padding_type);
 }
 
-void TestOneDepthwiseConv(ForceKernelInvocation forced_invocation) {
-  while (!TryTestOneDepthwiseConv(forced_invocation)) {
+void TestOneDepthwiseConv(DepthwiseConvImplementation forced_invocation,
+                          DepthwiseConvOutputRounding output_rounding) {
+  TestParam test_param;
+  test_param.forced_invocation = forced_invocation;
+  test_param.output_rounding = output_rounding;
+  while (!TryTestOneDepthwiseConv(test_param, ParamsSpecialization::kNone)) {
   }
 }
 
-void TestOneDepthwiseConv3x3Filter(ForceKernelInvocation forced_invocation) {
-  while (!TryTestOneDepthwiseConv3x3Filter(forced_invocation)) {
+void TestOneDepthwiseConv3x3Filter(
+    DepthwiseConvImplementation forced_invocation,
+    DepthwiseConvOutputRounding output_rounding) {
+  TestParam test_param;
+  test_param.forced_invocation = forced_invocation;
+  test_param.output_rounding = output_rounding;
+  while (!TryTestOneDepthwiseConv3x3Filter(test_param,
+                                           ParamsSpecialization::kNone)) {
   }
 }
 
-void TestOneNeonDot3x3(ForceKernelInvocation forced_invocation,
-                       bool test_stride, bool test_pad,
-                       bool test_depth_multiplier) {
-  while (!TryTestOneNeonDot3x3(forced_invocation, test_stride, test_pad,
-                               test_depth_multiplier)) {
+void TestOneNeonDot3x3(const TestParam& test_param) {
+  while (!TryTestOneNeonDot3x3(test_param, ParamsSpecialization::kSymmetric)) {
   }
 }
 
 TEST(TestDepthwiseConv, TestDepthwiseConv) {
   const int kTestsToRun = 10 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv(ForceKernelInvocation::kNone);
+    TestOneDepthwiseConv(DepthwiseConvImplementation::kNone,
+                         DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
 
@@ -476,69 +567,93 @@ TEST(TestDepthwiseConv, TestDepthwiseConv) {
 TEST(TestDepthwiseConv, TestGenericKernel) {
   const int kTestsToRun = 10 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv(ForceKernelInvocation::kUseGenericKernel);
+    TestOneDepthwiseConv(DepthwiseConvImplementation::kUseGenericKernel,
+                         DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
 
 TEST(TestDepthwiseConv, TestKernel3x3Filter) {
   const int kTestsToRun = 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(ForceKernelInvocation::kNone);
+    TestOneDepthwiseConv3x3Filter(DepthwiseConvImplementation::kNone,
+                                  DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
 
-// While the 3x3 coverage test is primarily targeted at specialized kernels, we
-// also run it against the generic kernel, optionally with fewer invocations.
+// While 3x3 coverage tests are primarily targeted at specialized kernels, we
+// also run it against the generic kernel.
 TEST(TestDepthwiseConv, TestGenericKernel3x3Filter) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(ForceKernelInvocation::kUseGenericKernel);
+    TestOneDepthwiseConv3x3Filter(
+        DepthwiseConvImplementation::kUseGenericKernel,
+        DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
 
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
 TEST(TestDepthwiseConv, TestNeon3x3Filter) {
   const int kTestsToRun = 3 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(ForceKernelInvocation::kUseNeon3x3);
+    TestOneDepthwiseConv3x3Filter(DepthwiseConvImplementation::kUseNeon3x3,
+                                  DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
+#endif
 
-// No stride, no depth multiplier, no pad.
-TEST(TestDepthwiseConv, TestNeonDot3x3Plain) {
-  const int kTestsToRun = 3 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
-                      /*test_stride=*/false, /*test_pad=*/false,
-                      /*test_depth_multiplier=*/false);
-  }
-}
+class DepthwiseConvTest : public ::testing::TestWithParam<TestParamTuple> {};
 
-TEST(TestDepthwiseConv, TestNeonDot3x3DepthMultiplier) {
-  const int kTestsToRun = 3 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
-                      /*test_stride=*/false, /*test_pad=*/false,
-                      /*test_depth_multiplier=*/true);
+TEST_P(DepthwiseConvTest, NeonDot3x3) {
+  const TestParam param(GetParam());
+  for (int i = 0; i < param.tests_to_run; i++) {
+    TestOneNeonDot3x3(param);
   }
 }
 
-TEST(TestDepthwiseConv, TestNeonDot3x3Stride2) {
-  const int kTestsToRun = 3 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
-                      /*test_stride=*/true, /*test_pad=*/false,
-                      /*test_depth_multiplier=*/false);
-  }
-}
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
+INSTANTIATE_TEST_SUITE_P(
+    Neon3x3Kernel, DepthwiseConvTest,
+    testing::Combine(
+        Values(DepthwiseConvImplementation::kUseNeon3x3),  // forced_invocation
+        Values(1000),                                      // tests_to_run
+        Bool(),                                            // test_stride
+        Values(false),                                     // test_pad
+        Values(false),  // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kAwayFromZero),  // output_rounding
+        Values(false)                                        // loose_tolerance
+        ),
+    TestParam::TestNameSuffix);
+#endif
 
-TEST(TestDepthwiseConv, TestNeonDot3x3Pad1) {
-  const int kTestsToRun = 3 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
-                      /*test_stride=*/false, /*test_pad=*/true,
-                      /*test_depth_multiplier=*/false);
-  }
-}
+// While 3x3 coverage tests are primarily targeted at specialized kernels, we
+// also run it against the generic kernel.
+INSTANTIATE_TEST_SUITE_P(
+    GenericKernel, DepthwiseConvTest,
+    testing::Combine(
+        Values(DepthwiseConvImplementation::
+                   kUseGenericKernel),                 // forced_invocation
+        Values(100),                                   // tests_to_run
+        Bool(),                                        // test_stride
+        Bool(),                                        // test_pad
+        Bool(),                                        // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
+        Values(false)                                  // loose_tolerance
+        ),
+    TestParam::TestNameSuffix);
+
+INSTANTIATE_TEST_SUITE_P(
+    CModel, DepthwiseConvTest,
+    testing::Combine(
+        Values(DepthwiseConvImplementation::
+                   kUseCModel3x3DotProduct),           // forced_invocation
+        Values(1000),                                  // tests_to_run
+        Bool(),                                        // test_stride
+        Bool(),                                        // test_pad
+        Bool(),                                        // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
+        Values(false)                                  // loose_tolerance
+        ),
+    TestParam::TestNameSuffix);
 
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/log_quantized_test.cc b/tensorflow/lite/kernels/internal/log_quantized_test.cc
index 8c39350ab1dd8996799e6539755f040399974106..c31c8e307751bcf1030e121eec23ac6cb217f461 100644
--- a/tensorflow/lite/kernels/internal/log_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/log_quantized_test.cc
@@ -121,8 +121,7 @@ void RunSingleTest(const std::vector<int32>& test_input,
                    const string& check_label, int tolerance) {
   const int n = test_input.size();
   std::vector<int32> float_gen_output(n, 0);
-  std::vector<int32> reference_output(n, 0);
-  std::vector<int32> optimized_output(n, 0);
+  std::vector<int32> quantized_output(n, 0);
 
   // Workaround the stupid things that intelligent humans do.
   // Consequence of __builtin_clz(0u) may equal 31 instead of 32.
@@ -132,45 +131,21 @@ void RunSingleTest(const std::vector<int32>& test_input,
   }
 
   for (int i = 0; i < n; ++i) {
-    reference_output[i] =
-        tflite::reference_ops::log_x_for_x_greater_than_or_equal_to_1_impl<
-            OutputIntegerBits, InputIntegerBits>(
-            gemmlowp::FixedPoint<int32, InputIntegerBits>::FromRaw(
-                fudged_input[i]))
-            .raw();
-    optimized_output[i] =
-        tflite::optimized_ops::log_x_for_x_greater_than_or_equal_to_1_impl<
-            OutputIntegerBits, InputIntegerBits>(
+    quantized_output[i] =
+        tflite::log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits,
+                                                            InputIntegerBits>(
             gemmlowp::FixedPoint<int32, InputIntegerBits>::FromRaw(
                 fudged_input[i]))
             .raw();
     float_gen_output[i] = LogPositiveValuesViaFloat(
         fudged_input[i], InputIntegerBits, OutputIntegerBits);
   }
-  // Note that first check is intolerant.
-  {
-    std::ostringstream label;
-    label << check_label << " / optimized vs reference / InputIntegerBits="
-          << InputIntegerBits << ", OutputIntegerBits=" << OutputIntegerBits;
-    CheckOutputData(
-        optimized_output, reference_output, test_input, label.str(),
-        InputIntegerBits, OutputIntegerBits, 0);
-  }
   {
     std::ostringstream label;
     label << check_label << " / reference vs float-gen / InputIntegerBits="
           << InputIntegerBits << ", OutputIntegerBits=" << OutputIntegerBits;
-    CheckOutputData(
-        reference_output, float_gen_output, test_input, label.str(),
-        InputIntegerBits, OutputIntegerBits, tolerance);
-  }
-  {
-    std::ostringstream label;
-    label << check_label << " optimized vs float-gen / InputIntegerBits="
-          << InputIntegerBits << ", OutputIntegerBits=" << OutputIntegerBits;
-    CheckOutputData(
-        optimized_output, float_gen_output, test_input, label.str(),
-        InputIntegerBits, OutputIntegerBits, tolerance);
+    CheckOutputData(quantized_output, float_gen_output, test_input, label.str(),
+                    InputIntegerBits, OutputIntegerBits, tolerance);
   }
 }
 
diff --git a/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc b/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
index 945300dad1653257db69c3440f6db0589e0c1a7b..d0d2654d4123e5025d000a796907f675ca29b05c 100644
--- a/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -25,6 +25,8 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/test_util.h"
 #include "tensorflow/lite/string.h"
@@ -61,7 +63,42 @@ void RunLogSoftmaxFloatReference(const uint8* input_data,
   }
 }
 
-void CheckOutputData(const uint8* test_output, const uint8* reference_output,
+// Same as above except for the following change:
+// - input and output data type
+// - Dequnatize function
+// - clamping values
+void RunLogSoftmaxFloatReference(const int8* input_data,
+                                 const RuntimeShape& shape_common,
+                                 int32 input_offset, const double input_scale,
+                                 int stride, float beta,
+                                 int8* reference_output_data) {
+  const int ref_buffer_size = shape_common.FlatSize();
+  std::vector<float> reference_dequant_data(ref_buffer_size);
+  std::vector<float> reference_output_float_data(ref_buffer_size);
+
+  // Reference data generated via Dequant of input into float, and then applying
+  // float LogSoftmax.
+  DequantizationParams dq_params;
+  dq_params.zero_point = input_offset;
+  dq_params.scale = input_scale;
+  reference_integer_ops::Dequantize(dq_params, shape_common, input_data,
+                                    shape_common,
+                                    reference_dequant_data.data());
+  SoftmaxParams sm_params;
+  optimized_ops::LogSoftmax(sm_params, shape_common,
+                            reference_dequant_data.data(), shape_common,
+                            reference_output_float_data.data());
+  // Work with quantized scaling for LogSoftmax, under which 255 represents 0,
+  // and -16 gets nudged up to 0.
+  for (int i = 0; i < ref_buffer_size; i++) {
+    reference_output_data[i] = std::max(
+        -128, static_cast<int>(
+                  127 + std::round(16.0f * reference_output_float_data[i])));
+  }
+}
+
+template <typename T>
+void CheckOutputData(const T* test_output, const T* reference_output,
                      const RuntimeShape& shape_common,
                      const string& check_label, bool be_exacting) {
   const int buffer_size = shape_common.FlatSize();
@@ -144,15 +181,58 @@ void RunOneLogSoftmaxTest(const uint8* input_data,
   reference_ops::LogSoftmax(params, shape_common, input_data, shape_common,
                             reference_quant_logsoftmax_output.data());
 
-  CheckOutputData(optimized_logsoftmax_output.data(),
-                  reference_float_logsoftmax_output.data(), shape_common,
-                  "Optimized vs float reference", false);
-  CheckOutputData(optimized_logsoftmax_output.data(),
-                  reference_quant_logsoftmax_output.data(), shape_common,
-                  "Optimized vs quant reference", true);
-  CheckOutputData(reference_quant_logsoftmax_output.data(),
-                  reference_float_logsoftmax_output.data(), shape_common,
-                  "Quant reference vs float reference", false);
+  CheckOutputData<uint8_t>(optimized_logsoftmax_output.data(),
+                           reference_float_logsoftmax_output.data(),
+                           shape_common, "Optimized vs float reference", false);
+  CheckOutputData<uint8_t>(optimized_logsoftmax_output.data(),
+                           reference_quant_logsoftmax_output.data(),
+                           shape_common, "Optimized vs quant reference", true);
+  CheckOutputData<uint8_t>(reference_quant_logsoftmax_output.data(),
+                           reference_float_logsoftmax_output.data(),
+                           shape_common, "Quant reference vs float reference",
+                           false);
+}
+
+// Runs the LogSoftmax and compares against the float reference implementation
+// and the int8 quantized reference implementation.
+void RunOneLogSoftmaxTest(const int8* input_data,
+                          const RuntimeShape& shape_common, int32 input_offset,
+                          const double input_scale, int stride, float beta) {
+  const int buffer_size = shape_common.FlatSize();
+  std::vector<int8> quantized_logsoftmax_reference_implementation(buffer_size);
+  std::vector<int8> float_logsoftmax_optimized_implementation(buffer_size);
+
+  RunLogSoftmaxFloatReference(input_data, shape_common, input_offset,
+                              input_scale, stride, beta,
+                              float_logsoftmax_optimized_implementation.data());
+
+  int32 input_beta_multiplier;
+  int input_beta_left_shift;
+  int32 reverse_scaling_divisor;
+  int reverse_scaling_right_shift;
+  static const int kScaledDiffIntegerBits = 5;
+  tflite::PreprocessLogSoftmaxScalingExp(
+      beta, input_scale, kScaledDiffIntegerBits, &input_beta_multiplier,
+      &input_beta_left_shift, &reverse_scaling_divisor,
+      &reverse_scaling_right_shift);
+  reverse_scaling_right_shift *= -1;
+  // diff_min has a negative value, and is used to limit the maximum magnitude
+  // of the diffs, which are <= 0.
+  const int diff_min = -tflite::CalculateInputRadius(kScaledDiffIntegerBits,
+                                                     input_beta_left_shift);
+
+  const int outer_size =
+      shape_common.Dims(0) * shape_common.Dims(1) * shape_common.Dims(2);
+  const int inner_size = shape_common.Dims(3);
+  reference_integer_ops::LogSoftmax(
+      input_beta_multiplier, input_beta_left_shift, reverse_scaling_divisor,
+      reverse_scaling_right_shift, diff_min, outer_size, inner_size, input_data,
+      quantized_logsoftmax_reference_implementation.data());
+
+  CheckOutputData<int8_t>(quantized_logsoftmax_reference_implementation.data(),
+                          float_logsoftmax_optimized_implementation.data(),
+                          shape_common, "Quant reference vs float reference",
+                          false);
 }
 
 // This function picks some random LogSoftmax params, which are checked for
@@ -161,6 +241,7 @@ void RunOneLogSoftmaxTest(const uint8* input_data,
 // to loop until a test has been run.
 //
 // Currently we do not reject for any reason.
+template <typename T>
 bool TryOneUniformLogSoftmax() {
   // We pick mostly positive values, on the whole emphasizing smaller values and
   // therefore faster tests.  We test a wider range of depths.  In the case of
@@ -178,7 +259,7 @@ bool TryOneUniformLogSoftmax() {
       RuntimeShape({batch, input_height, input_width, input_depth});
   const int buffer_size = shape_common.FlatSize();
 
-  std::vector<uint8> input_data(buffer_size);
+  std::vector<T> input_data(buffer_size);
   FillRandom(&input_data);
   RunOneLogSoftmaxTest(input_data.data(), shape_common, input_offset,
                        input_scale, stride, beta);
@@ -224,15 +305,23 @@ bool TryOneSkyscraperLogSoftmax(bool small_depth) {
   return true;
 }
 
-TEST(TestQuantizedLogSoftmax, UniformLogSoftmaxTests) {
+TEST(TestQuantizedLogSoftmax, UniformLogSoftmaxUint8Tests) {
+  const int kTestsToRun = 100;
+  for (int i = 0; i < kTestsToRun; i++) {
+    while (!TryOneUniformLogSoftmax<uint8_t>()) {
+    }
+  }
+}
+
+TEST(TestQuantizedLogSoftmax, UniformLogSoftmaxUint8Int8Tests) {
   const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
-    while (!TryOneUniformLogSoftmax()) {
+    while (!TryOneUniformLogSoftmax<int8_t>()) {
     }
   }
 }
 
-TEST(TestQuantizedLogSoftmax, SkyscraperLogSoftmaxTests) {
+TEST(TestQuantizedLogSoftmax, SkyscraperLogSoftmaxUint8Tests) {
   const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperLogSoftmax(false)) {
@@ -240,7 +329,7 @@ TEST(TestQuantizedLogSoftmax, SkyscraperLogSoftmaxTests) {
   }
 }
 
-TEST(TestQuantizedLogSoftmax, SmallSkyscraperLogSoftmaxTests) {
+TEST(TestQuantizedLogSoftmax, SmallSkyscraperLogSoftmaxUint8Tests) {
   const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperLogSoftmax(true)) {
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index 13629494bd2404516c550e33b756a4b283d6cf2f..84d701676b8f4122cb4d66b11969675549cee60f 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -24,6 +24,7 @@ limitations under the License.
 
 namespace tflite {
 namespace optimized_ops {
+namespace depthwise_conv {
 
 // Implementation of quantized DepthwiseConv
 
@@ -1946,6 +1947,8 @@ inline void DepthwiseConvGeneral(
   }
 }
 
+}  // namespace depthwise_conv
+
 inline void DepthwiseConv(
     const DepthwiseParams& params, const RuntimeShape& input_shape,
     const uint8* input_data, const RuntimeShape& filter_shape,
@@ -1980,23 +1983,23 @@ inline void DepthwiseConv(
 
   // Call kernel optimized for depthwise convolutions using 3x3 filters if
   // parameters are supported.
-  if (Fast3x3FilterKernelSupported(
+  if (depthwise_conv::Fast3x3FilterKernelSupported(
           input_shape, filter_shape, stride_width, stride_height,
           dilation_width_factor, dilation_height_factor, pad_width, pad_height,
           depth_multiplier, output_shape, output_shift)) {
     gemmlowp::ScopedProfilingLabel specialized_label("DepthwiseConv/8bit/3x3");
-    DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape,
-                           filter_data, bias_shape, bias_data, output_shape,
-                           output_data);
+    depthwise_conv::DepthwiseConv3x3Filter(
+        params, input_shape, input_data, filter_shape, filter_data, bias_shape,
+        bias_data, output_shape, output_data);
     return;
   }
 #endif
 
   gemmlowp::ScopedProfilingLabel specialized_label(
       "DepthwiseConv/8bit/General");
-  DepthwiseConvGeneral(params, input_shape, input_data, filter_shape,
-                       filter_data, bias_shape, bias_data, output_shape,
-                       output_data);
+  depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data,
+                                       filter_shape, filter_data, bias_shape,
+                                       bias_data, output_shape, output_data);
 }
 
 }  // namespace optimized_ops
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index 6bfe27d96df4e1512a039d99dcbe8bb4bec117fb..8a87d424e8329fe060425ee85ef175351c6ec61d 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -23,51 +23,60 @@ limitations under the License.
 
 namespace tflite {
 namespace optimized_ops {
+namespace depthwise_conv {
+
+constexpr int kDepthwiseConvScratchWorkspaceSize = 10 * 10 * 64;
+constexpr int kDepthwiseConvAdjustedBiasLimit = 256;
+// In cases such as depth multiplication, we want to be able to load data from
+// the workspace that is beyond the valid range. Macro-block sizes are adjusted
+// to allow for this.
+constexpr int kWorkspaceExtension = 16;
 
 // See CategorizeDotProductKernel for definitive taxonomy.
 enum class DotProduct3x3KernelType {
   kNone = 0,  // Parameter combination is not supported for dot product kernels.
   kPlain,
-  kWithDepthMultiplication,
-  kWithPad0Stride2,
-  kWithPad1Stride1,
+  kWithDepthMultiplicationStride1,
+  kWithDepthMultiplicationStride2,
+  kStride2,
 };
 
 inline DotProduct3x3KernelType CategorizeDotProductKernel(
     const DepthwiseParams& params) {
-  const int padding = params.padding_values.width;
+  const int padding =
+      std::max(params.padding_values.width, params.padding_values.height);
   const int stride = params.stride_width;
-  if (padding != params.padding_values.height ||
-      stride != params.stride_height) {
+  if (stride != params.stride_height || padding > 1) {
     return DotProduct3x3KernelType::kNone;
   }
 
   if (params.depth_multiplier == 1) {
-    if (padding == 0 && stride == 1) {
+    if (stride == 1) {
       return DotProduct3x3KernelType::kPlain;
-    } else if (padding == 0 && stride == 2) {
-      return DotProduct3x3KernelType::kWithPad0Stride2;
-    } else if (padding == 1 && stride == 1) {
-      return DotProduct3x3KernelType::kWithPad1Stride1;
+    } else if (stride == 2) {
+      return DotProduct3x3KernelType::kStride2;
     } else {
       return DotProduct3x3KernelType::kNone;
     }
   } else {
-    if (padding == 0 && stride == 1) {
-      return DotProduct3x3KernelType::kWithDepthMultiplication;
+    if (stride == 1) {
+      return DotProduct3x3KernelType::kWithDepthMultiplicationStride1;
+    } else if (stride == 2) {
+      return DotProduct3x3KernelType::kWithDepthMultiplicationStride2;
     } else {
       return DotProduct3x3KernelType::kNone;
     }
   }
 }
 
+#define STR(s) STR_UNEXPANDED(s)
+#define STR_UNEXPANDED(s) #s
+
 // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
 // Jetson TX-2. This compiler does not support the offsetof() macro.
 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
 #include <stddef.h>
 
-#define DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE 10 * 10 * 64
-
 // Encapsulates constant parameters used in DepthwiseConv.
 // 64-bit is used for types that will be added to 64-bit addresses in asm.
 struct DepthwiseConvParams {
@@ -91,9 +100,6 @@ struct DepthwiseConvParams {
   int32 output_height;
 };
 
-#define STR(s) STR_UNEXPANDED(s)
-#define STR_UNEXPANDED(s) #s
-
 // Represents the number of bytes offset from the start of the
 // DepthwiseConvParams struct. This is used in the asm to load parameters.
 // Keep these values in sync with the static_asserts below.
@@ -168,7 +174,46 @@ static_assert(offsetof(DepthwiseConvParams, output_width) ==
 static_assert(offsetof(DepthwiseConvParams, output_height) ==
                   OFFSET_OUTPUT_HEIGHT,
               "");
+#endif
 
+// Encapsulates constant parameters used in DepthwiseConv using dot-product ops.
+// 64-bit is used for types that will be added to 64-bit addresses in asm.
+//
+// This structure is specifically designed for use in asm.
+struct DepthwiseConvDotProdParams {
+  int64_t input_depth;
+  int64_t output_depth;
+  int32 workspace_height_stride;
+  int32 input_width_overall_micro_repeats;
+  int32 input_width_micro_repeats;
+  int32 depth_micro_repeats;
+  int32 inbound_block_height;
+  int32 residual_width;
+  int32 input_height_stride;
+  int32 stride;
+  int32 output_width_overall_micro_repeats;
+  int32 output_width_micro_repeats;
+  int32 output_residual_width;
+  int32 output_height_stride;
+  int32 bias_increment;
+  int32 padding_left;
+  int32 padding_right;
+  int32 padding_top;
+  int32 padding_bottom;
+  int32 height_macro_count;
+  int32 width_macro_count;
+  int32 outbound_block_height;
+  int32 workspace_width_micro_repeats;
+  int32 input_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int32 output_shift;
+  int32 quantized_activation_min;
+  int32 quantized_activation_max;
+  int32 four_over_stride;
+};
+
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
 template <int32 kDepth, int32 kStrideWidth, int32 kStrideHeight>
 struct DepthwiseConvWindow {};
 
@@ -2964,8 +3009,6 @@ struct DepthwiseConvPartial<EdgeType::kVertical, 1, 1> {
 #undef OFFSET_INPUT_HEIGHT
 #undef OFFSET_OUTPUT_WIDTH
 #undef OFFSET_OUTPUT_HEIGHT
-#undef STR
-#undef STR_UNEXPANDED
 
 // Copies a subset of the input designated by |input_ptr| into |output_ptr|
 // with the specified output dimensions. Supports output depths of 64 only as
@@ -3048,7 +3091,7 @@ struct DepthwiseConvMultiRow {
         get_shuffle_input_size(kStrideWidth, shuffle_params.output_width));
     TFLITE_DCHECK(64 * shuffle_params.input_width *
                       shuffle_params.input_height <=
-                  DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE);
+                  kDepthwiseConvScratchWorkspaceSize);
 
     int32 out_x = start_x;
 
@@ -3376,7 +3419,7 @@ inline void DepthwiseConv3x3Filter(
   // allocated on the stack. Eventually we will want to move it to the heap
   // and have it allocated outside of this function, like the im2col_array
   // used in gemmlowp.
-  uint8 shuffle_workspace[DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE];
+  uint8 shuffle_workspace[kDepthwiseConvScratchWorkspaceSize];
 
   for (int32 b = 0; b < batches; ++b) {
     const uint8* input_ptr = input_data + b * input_batch_size;
@@ -3455,9 +3498,12 @@ inline void DepthwiseConv3x3Filter(
     }
   }
 }
-
 #endif  // __aarch64__
 
+#undef STR
+#undef STR_UNEXPANDED
+
+}  // namespace depthwise_conv
 }  // namespace optimized_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h
new file mode 100644
index 0000000000000000000000000000000000000000..1940353f74c05f6ff059d9c62d5c157228406dd5
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h
@@ -0,0 +1,1395 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
+
+// This file provides kernel implementations that are not used in shipped
+// inference code, but rather (a) show how model C++ code is designed and then
+// transformed into asm code, and (b) aid with maintenance and later development
+// of variations. Many projects (even including, say, the classic NAG libraries)
+// develop highly optimized code, but do not maintain intermediate versions.
+// Often the result is incomprehensible final-version code.
+
+#include <algorithm>
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+namespace depthwise_conv {
+
+// Permute filter data, and adjust bias data to account for symmetric input
+// offset. Details are provided in the implementation of the
+// kUseCModel3x3DotProduct version.
+//
+// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
+template <DepthwiseConvImplementation implementation>
+struct ProcessPerDepth {
+  // Routine is contained in a static Run() method. No default template version
+  // is supplied, so that all implementations are deliberate choices of template
+  // specialization.
+  //
+  // Note that the signature of the Run() method will be designed for the asm
+  // implementation rather than conforming to style.
+};
+
+template <>
+struct ProcessPerDepth<DepthwiseConvImplementation::kUseCModel3x3DotProduct> {
+  // Filter data is provided as filter_block[3][3][depth/8][2][4]: height 3,
+  // width 3,  sub-block 0 or 1, depth 4. Filter data is written as
+  // filter_bank[3][2][4][4]; height 3, sub-block, depth 4, width 4.
+  //
+  // Note that this rearrangement is much like that performed on input data when
+  // filling the workspace, and optimized versions will be similar.
+  static inline void FillFilterBank(int depth, const uint8* filter_block,
+                                    int8 filter_bank[3][2][4][4]) {
+    constexpr int kSymmetricZeroPoint = 128;
+    // Load filter data in, 8-bytes down depth / sub-block at a time.
+    //
+    // loaded_filter has dimensions height 3, width 4, sub-block 0 or 1,
+    // depth 4.
+    uint8 loaded_filter[3][4][2][4];
+    for (int y = 0; y < 3; ++y) {
+      for (int x = 0; x < 3; ++x) {
+        memcpy(loaded_filter[y][x][0], &filter_block[3 * y * depth + x * depth],
+               8);
+      }
+      // Pad the filter with symmetric representation of 0, so that the values
+      // become 0 when the zero-poing is added below. Thus these filter taps are
+      // effectively disregarded in later filtering.
+      memset(loaded_filter[y][3][0], kSymmetricZeroPoint, 8);
+    }
+    for (int y = 0; y < 3; ++y) {
+      for (int z = 0; z < 4; ++z) {
+        for (int x = 0; x < 4; ++x) {
+          filter_bank[y][0][z][x] =
+              loaded_filter[y][x][0][z] - kSymmetricZeroPoint;
+          filter_bank[y][1][z][x] =
+              loaded_filter[y][x][1][z] - kSymmetricZeroPoint;
+        }
+      }
+    }
+  }
+
+  // Adjust the bias (weights) data according to the input offset.
+  //
+  // The output calculation is
+  // out[h][w][d] = bias[d] + sum_ij (in[h+i][w+j][d] + in_offset) *
+  //                                 (filter[i][j][d] + filter_offset)
+  // (where offsets are expressed as differences from 128).
+  //
+  // Since we cannot efficiently handle varying offsets / bias across the image,
+  // we insist on filter_offset = 0.
+  //
+  // This function calculates
+  // adjusted_bias[d] = bias[d] + sum_ij in_offset * filter[i][j][d]
+  // which accounts for input offset. If the bias is constant over the depth,
+  // the adjusted bias will vary.
+  static inline void AdjustBias(int32 input_offset,
+                                const int8 filter_bank[3][2][4][4],
+                                const int32* bias_data,
+                                int32 adjusted_bias_block[2][4]) {
+    constexpr int kSymmetricZeroPoint = 128;
+    TFLITE_DCHECK_GE(input_offset, -255);
+    TFLITE_DCHECK_LE(input_offset, 0);
+    // For instance, if input_offset == 128, no adjustment is needed.
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+
+    for (int s = 0; s < 2; ++s) {
+      for (int z = 0; z < 4; ++z) {
+        adjusted_bias_block[s][z] = bias_data[4 * s + z];
+        for (int i = 0; i < 9; ++i) {
+          adjusted_bias_block[s][z] +=
+              input_offset_difference * filter_bank[i % 3][s][z][i / 3];
+        }
+      }
+    }
+  }
+
+  static void Run(const uint8* filter_data, const int32* bias_data,
+                  int8* shuffled_filter_data, int32* adjusted_bias_data,
+                  const DepthwiseConvDotProdParams* function_params) {
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+    const int depth = function_params->output_depth;
+    const int bias_increment = function_params->bias_increment;
+    const int32 input_offset = function_params->input_offset;
+
+    int8 filter_bank[3][2][4][4];
+    int32 adjusted_bias_block[2][4];
+
+    for (int j_depth = 0; j_depth < (depth >> 3); ++j_depth) {
+      FillFilterBank(depth, filter_data + 8 * j_depth, filter_bank);
+      AdjustBias(input_offset, filter_bank,
+                 bias_data + 2 * j_depth * bias_increment, adjusted_bias_block);
+
+      memcpy(shuffled_filter_data, filter_bank[0][0][0],
+             shuffled_filter_increment);
+      shuffled_filter_data += shuffled_filter_increment;
+      memcpy(adjusted_bias_data, adjusted_bias_block[0],
+             8 * sizeof(adjusted_bias_block[0][0]));
+      adjusted_bias_data += 8;
+    }
+  }
+};
+
+// Copy a macro block of data from the input buffer into the workspace,
+// permuting data within each micro block.
+//
+// (a) Copy a macro block of data, padding as required along the width and
+//     height.
+// (b) Transpose the data within each micro block.
+//
+// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
+template <DepthwiseConvImplementation implementation,
+          DepthwiseConvDepthMultiplication depth_multiplication,
+          int32 max_padding>
+struct PackMacroBlock {
+  // Routine is contained in a static Run() method. No default template version
+  // is supplied, so that all implementations are deliberate choices of template
+  // specialization.
+  //
+  // Note that the signature of the Run() method will be designed for the asm
+  // implementation rather than conforming to style.
+};
+
+template <int32 max_padding>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      max_padding> {
+  // A straight copy of a macro block of input data into a scratch buffer.
+  //
+  // Requirement: depth_micro_repeats > 0.
+  static inline void CopyMacroBlock(
+      int32 height_block_number, int32 width_block_number,
+      const DepthwiseConvDotProdParams& function_params,
+      const uint8* input_block_data, int8* scratch_block_data) {
+    TFLITE_DCHECK_LE(max_padding, 1);
+
+    // Strides.
+    // The input depth and count of micro blocks provide the width strides.
+    const int input_height_stride = function_params.input_height_stride;
+    const int workspace_height_stride = function_params.workspace_height_stride;
+    const int input_depth = function_params.input_depth;
+    const int depth_micro_repeats = function_params.depth_micro_repeats;
+    TFLITE_DCHECK_GT(depth_micro_repeats, 0);
+
+    // Remaining iteration and dimension parameters.
+    //
+    // If width_overall_micro_repeats = input_width_micro_repeats + 1, then the
+    // final micro block is incomplete.
+    const int width_overall_micro_repeats =
+        function_params.input_width_overall_micro_repeats;
+    int input_width_micro_repeats = function_params.input_width_micro_repeats;
+    const int residual_width = function_params.residual_width;
+    const int block_height = function_params.inbound_block_height;
+
+    const int padding_left = function_params.padding_left;
+    const int padding_right = function_params.padding_right;
+    const int padding_top = function_params.padding_top;
+    const int padding_bottom = function_params.padding_bottom;
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params.width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params.height_macro_count - 1);
+
+    // Modify the trailing case to reflect the input width.
+    int input_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+    if (trailing_width_padding) {
+      input_residual_width -= 1;
+      input_width_micro_repeats = width_overall_micro_repeats - 1;
+    }
+
+    constexpr int kSymmetricZeroPoint = 128;
+    const int32 input_offset_difference =
+        function_params.input_offset + kSymmetricZeroPoint;
+
+    // We load data into a temporary buffer and then save, to match subsequent
+    // processing. This will make it easier to combine stages into one ASM
+    // routine.
+    int8 tmp_load[4][2][4];
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      memset(scratch_block_data, -input_offset_difference,
+             workspace_height_stride);
+      scratch_block_data += workspace_height_stride;
+      input_block_data += input_height_stride;
+      copy_block_height -= 1;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    // The outer 3 loops go through all the micro blocks in a macro block.
+    for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+      for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
+        // Figure out division of work (available input vs trailing padding).
+        int adjusted_residual_width =
+            j_width == input_width_micro_repeats ? input_residual_width : 4;
+
+        int start_width = 0;
+        if (leading_width_padding && j_width == 0) {
+          start_width = 1;
+          memset(tmp_load[0][0], -input_offset_difference, 8);
+        }
+        if (adjusted_residual_width < 4) {
+          for (int x = adjusted_residual_width; x < 4; ++x) {
+            memset(tmp_load[x][0], -input_offset_difference, 8);
+          }
+        }
+
+        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+          // The inner 3 loops go through the sub-block, depth and width within
+          // each micro block.
+
+          // Load, and apply symmetric offset.
+          int8* scratch_data =
+              scratch_block_data + k_height * workspace_height_stride +
+              j_width * 4 * 8 + i_depth * 4 * 8 * width_overall_micro_repeats;
+          const uint8* input_data = input_block_data +
+                                    k_height * input_height_stride +
+                                    j_width * 4 * input_depth + i_depth * 8;
+          // Full-size macro blocks are 2*4*4 = 32 bytes.
+          for (int x = start_width; x < adjusted_residual_width; ++x) {
+            for (int s = 0; s < 2; ++s) {
+              for (int d = 0; d < 4; ++d) {
+                tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
+                                    kSymmetricZeroPoint;
+              }
+            }
+          }
+
+          // Save results.
+          memcpy(&scratch_data[0], tmp_load[0][0], 8);
+          memcpy(&scratch_data[8], tmp_load[1][0], 8);
+          memcpy(&scratch_data[16], tmp_load[2][0], 8);
+          memcpy(&scratch_data[24], tmp_load[3][0], 8);
+        }
+      }
+    }
+
+    if (trailing_height_padding) {
+      memset(scratch_block_data + copy_block_height * workspace_height_stride,
+             -input_offset_difference, workspace_height_stride);
+    }
+  }
+
+  // Transpose 4x4 blocks within each sub-micro-block.
+  //
+  // Implemented somewhat like NEON register manipulation, so that we can see
+  // equivalence of the two approaches.
+  static inline void MicroTransposeBlocks(
+      const DepthwiseConvDotProdParams& function_params,
+      int8* scratch_block_data) {
+    const int workspace_height_stride = function_params.workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params.input_width_overall_micro_repeats;
+    const int depth_micro_repeats = function_params.depth_micro_repeats;
+    const int block_height = function_params.inbound_block_height;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in the
+    // NEON code we are simulating.
+    int8 tmp_load[4][2][4];         // [width][sub-block][depth]
+    int8 tmp_transposed[4][2][4];   // [depth][sub-block][width]
+    int8 tmp_interleaved[2][4][4];  // [sub-block][depth][width]
+
+    // The outer 3 loops go through all the micro blocks in a macro block.
+    for (int k_height = 0; k_height < block_height; ++k_height) {
+      for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
+        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+          int8* scratch_data =
+              scratch_block_data + k_height * workspace_height_stride +
+              j_width * 4 * 8 + i_depth * 4 * 8 * width_overall_micro_repeats;
+          // A. Load data
+          memcpy(tmp_load[0][0], &scratch_data[0], 8);
+          memcpy(tmp_load[1][0], &scratch_data[8], 8);
+          memcpy(tmp_load[2][0], &scratch_data[16], 8);
+          memcpy(tmp_load[3][0], &scratch_data[24], 8);
+
+          // B. Simulate between-register transposition.
+          for (int x = 0; x < 4; ++x) {
+            for (int y = 0; y < 4; ++y) {
+              tmp_transposed[x][0][y] = tmp_load[y][0][x];
+              tmp_transposed[x][1][y] = tmp_load[y][1][x];
+            }
+          }
+
+          // C. Simulate between-register interleaving.
+          for (int x = 0; x < 4; ++x) {
+            for (int y = 0; y < 4; ++y) {
+              tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
+              tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
+            }
+          }
+          // D. Simulate mangled storage arrangement.
+          memcpy(&scratch_data[0], tmp_interleaved[0][0], 16);
+          memcpy(&scratch_data[16], tmp_interleaved[1][0], 16);
+        }
+      }
+    }
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    CopyMacroBlock(height_block_number, width_block_number, *function_params,
+                   input_block_data, scratch_block_data);
+    MicroTransposeBlocks(*function_params, scratch_block_data);
+  }
+};
+
+template <int32 max_padding>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                      max_padding> {
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    // Currently support for padding is limited to 1 on any side.
+    TFLITE_DCHECK_LE(max_padding, 1);
+
+    // Strides.
+    // The count of micro blocks (below) provides the width strides.
+    const int input_height_stride = function_params->input_height_stride;
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+
+    // Remaining iteration and dimension parameters.
+    //
+    // If width_overall_micro_repeats = input_width_micro_repeats + 1, then the
+    // final micro block is incomplete.
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int residual_width = function_params->residual_width;
+    const int block_height = function_params->inbound_block_height;
+    TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
+
+    const int padding_left = function_params->padding_left;
+    const int padding_right = function_params->padding_right;
+    const int padding_top = function_params->padding_top;
+    const int padding_bottom = function_params->padding_bottom;
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params->width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params->height_macro_count - 1);
+
+    constexpr int kSymmetricZeroPoint = 128;
+    const int32 input_offset_difference =
+        function_params->input_offset + kSymmetricZeroPoint;
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      memset(scratch_block_data, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_block_data += workspace_height_stride;
+      input_block_data += input_height_stride;
+      copy_block_height -= 1;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    int adjusted_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+
+    if (trailing_width_padding) {
+      adjusted_residual_width -= 1;
+    }
+    int start_width = 0;
+    if (leading_width_padding) {
+      start_width = 1;
+      input_block_data += 1;
+    }
+
+    const int copy_size = (width_overall_micro_repeats - 1) * 4 +
+                          adjusted_residual_width - start_width;
+
+    TFLITE_DCHECK_LE(
+        copy_size,
+        input_height_stride - width_block_number * input_width_micro_repeats);
+    // We may drop up to stride-1 of trailing input.
+    TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
+
+    // When there is unit input depth, the micro-block iteration need only be
+    // through the height. The micro blocks are contiguous across the width.
+    for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+      const uint8* input_data =
+          input_block_data + k_height * input_height_stride;
+      int8* scratch_data =
+          scratch_block_data + k_height * workspace_height_stride;
+
+      // Handle leading padding. This is overwritten if there is no padding.
+      scratch_data[0] = -input_offset_difference;
+
+      memcpy(&scratch_data[start_width], input_data, copy_size);
+      for (int i = 0; i < copy_size; ++i) {
+        scratch_data[start_width + i] += -kSymmetricZeroPoint;
+      }
+
+      // Handle trailing padding, and fill in remainder of micro block.
+      memset(&scratch_data[start_width + copy_size], -input_offset_difference,
+             4 - adjusted_residual_width + kWorkspaceExtension);
+    }
+
+    if (trailing_height_padding) {
+      memset(scratch_block_data + copy_block_height * workspace_height_stride,
+             -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+    }
+  }
+};
+
+// Apply filter to macro block of input data and store results. Details are
+// provided in the implementation of the kUseCModel3x3DotProduct version.
+//
+// Parameters for repeats and residual sizes are in terms of outputs.
+//
+// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
+template <DepthwiseConvImplementation implementation,
+          DepthwiseConvDepthMultiplication depth_multiplication, int32 stride>
+struct KernelMacroBlock {
+  // Routine is contained in a static Run() method. No default template version
+  // is supplied, so that all implementations are deliberate choices of template
+  // specialization.
+  //
+  // Note that the signature of the Run() method will be designed for the asm
+  // implementation rather than conforming to style.
+};
+
+// Apply filter to macro block of input data and store results.
+//
+// Requirement: depth_micro_repeats > 0 || residual_depth > 0.
+template <int32 stride>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kNoMultiplication,
+                        stride> {
+  // Construct a width-shifted combination of two input sub-blocks, effectively
+  // concatenating them.
+  //
+  // The filter is applied using sub-blocks. These are in the needed form for
+  // the first (width) offset. For subsequent offsets, the filter is applied to
+  // shifted and combined data. The concatentation and shifting herein is fairly
+  // straightforward, but in the optimized code is an area of creativity in
+  // design because NEON instructions do not directly support the required
+  // between-register permutation.
+  //
+  // In NEON optimized code, input data is grouped in 4-byte blocks. In order to
+  // move along the width for each output point calculation, data is shifted, in
+  // essence between two such blocks.
+  //
+  // selected_data has format height 3, depth 4, width 4.
+  //
+  // When the micro block is trailing (the last across the macro-block width),
+  // it would be illegal to load the right (next) block, and the no_right_block
+  // indicates this scenario.
+  static inline void ConcatenateInputSubBlocks(int offset, int sub_block,
+                                               int workspace_height_stride,
+                                               int width_micro_stride,
+                                               bool no_right_block,
+                                               const int8* input_block,
+                                               int8 selected_data[3][4][4]) {
+    TFLITE_DCHECK_GE(offset, 0);
+    TFLITE_DCHECK_LT(offset, 4);
+
+    // The input banks have same format as selected_data.
+    int8 left_bank[3][4][4];
+    int8 right_bank[3][4][4];
+
+    // Work through one slice, by row, at a time.
+    for (int k_height = 0; k_height < 3; ++k_height) {
+      // Simulate demangling of mangled storage arrangement.
+      const int8* left_input_block =
+          &input_block[k_height * workspace_height_stride + sub_block * 2 * 8];
+      memcpy(left_bank[k_height][0], left_input_block, 16);
+      if (no_right_block) {
+        memset(right_bank[k_height][0], 0, 16);
+      } else {
+        const int8* right_input_block =
+            &input_block[k_height * workspace_height_stride +
+                         sub_block * 2 * 8 + width_micro_stride];
+        memcpy(right_bank[k_height][0], right_input_block, 16);
+      }
+      for (int depth_index = 0; depth_index < 4; ++depth_index) {
+        memcpy(selected_data[k_height][depth_index],
+               &left_bank[k_height][depth_index][offset], 4 - offset);
+        memcpy(&selected_data[k_height][depth_index][4 - offset],
+               right_bank[k_height][depth_index], offset);
+      }
+    }
+  }
+
+  // Straight implementation of 3x3 filter within sub-micro block.
+  static inline void Calculate3x3FilterOutput(
+      const DepthwiseConvDotProdParams& params, int sub_block,
+      const int8 selected_data[3][4][4], const int8 filter_bank[3][2][4][4],
+      const int32* bias_data, uint8 output_values[4]) {
+    const int32 output_activation_min = params.quantized_activation_min;
+    const int32 output_activation_max = params.quantized_activation_max;
+    const int32 output_multiplier = params.output_multiplier;
+    const int32 output_shift = params.output_shift;
+    const int32 output_offset = params.output_offset;
+    for (int d = 0; d < 4; ++d) {
+      int32 acc = 0;
+      for (int y = 0; y < 3; ++y) {
+        for (int x = 0; x < 4; ++x) {
+          int32 input_val = selected_data[y][d][x];
+          int32 filter_val = filter_bank[y][sub_block][d][x];
+          acc += filter_val * input_val;
+        }
+      }
+      acc += bias_data[d];
+      acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+          DepthwiseConvOutputRounding::kUpward>(acc, output_multiplier,
+                                                output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_values[d] = static_cast<uint8>(acc);
+    }
+  }
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int input_width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int depth = function_params->input_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    constexpr int bias_increment = 4;
+    TFLITE_DCHECK_EQ(function_params->bias_increment, bias_increment);
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+    const int width_micro_stride = 4 * 8;
+    const int depth_micro_stride =
+        width_micro_stride * input_width_overall_micro_repeats;
+
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8 filter_bank[3][2][4][4];  // Height 3, sub-block,  depth 4, width 4.
+    // Simulate NEON-register input data concatenation + sub-selection.
+    int8 sub_selected_input_data[3][4][4];  // Height 3, depth 4, width 4.
+    uint8 output_values[4];                 // Depth 4.
+
+    // The outer 3 loops go through all the micro blocks in a macro block, and
+    // separately treat the two sub-blocks within each micro block.
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      memcpy(filter_bank[0][0][0],
+             filter_workspace + j_depth * shuffled_filter_increment,
+             shuffled_filter_increment);
+
+      for (int s = 0; s < 2; ++s) {
+        for (int k_height = 0; k_height < block_height; ++k_height) {
+          const int8* scratch_data =
+              scratch_block_data +
+              workspace_height_stride * k_height * stride_val +
+              depth_micro_stride * j_depth;
+          uint8* output_data =
+              output_block_data + output_height_stride * k_height + 8 * j_depth;
+
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            const int output_width = i_width == output_width_micro_repeats
+                                         ? residual_width
+                                         : four_over_stride;
+            const bool no_right_block = (output_width - 1) * stride_val < 2;
+            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+            const int8* input_data =
+                scratch_data + width_micro_stride * i_width;
+            // Iterate over input width shifts within sub-micro blocks.
+            for (int x = 0; x < output_width; ++x) {
+              ConcatenateInputSubBlocks(x * stride_val, s,
+                                        workspace_height_stride,
+                                        width_micro_stride, no_right_block,
+                                        input_data, sub_selected_input_data);
+              Calculate3x3FilterOutput(
+                  *function_params, s, sub_selected_input_data, filter_bank,
+                  bias_data + (2 * j_depth + s) * bias_increment,
+                  output_values);
+              for (int d = 0; d < 4; ++d) {
+                output_data[depth * (four_over_stride * i_width + x) + 4 * s +
+                            d] = output_values[d];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+// Apply filter to macro block of input data and store results.
+//
+// Parameters for repeats and residual sizes are in terms of outputs.
+//
+// Requirement: depth_micro_repeats > 0 || residual_depth > 0.
+template <int32 stride>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                        stride> {
+  // Construct a width-shifted combination of two input sub-blocks, effectively
+  // concatenating them.
+  //
+  // The filter is applied using sub-blocks. These are in the needed form for
+  // the first (width) offset. For subsequent offsets, the filter is applied to
+  // shifted and combined data. The concatentation and shifting herein is fairly
+  // straightforward, but in the optimized code is an area of creativity in
+  // design because NEON instructions do not directly support the required
+  // between-register permutation.
+  //
+  // In NEON optimized code, input data is grouped in 4-byte blocks. In order to
+  // move along the width for each output point calculation, data is shifted, in
+  // essence between two such blocks.
+  //
+  // selected_data has format height 3, width 4.
+  //
+  // When the micro block is trailing (the last across the macro-block width),
+  // it would be illegal to load the right (next) block, and the no_right_block
+  // indicates this scenario.
+  static inline void ConcatenateInputSubBlocks(int offset,
+                                               int workspace_height_stride,
+                                               bool no_right_block,
+                                               const int8* input_block,
+                                               int8 selected_data[3][4]) {
+    TFLITE_DCHECK_GE(offset, 0);
+    TFLITE_DCHECK_LT(offset, 4);
+    if (no_right_block) {
+      for (int k_height = 0; k_height < 3; ++k_height) {
+        memcpy(selected_data[k_height],
+               &input_block[k_height * workspace_height_stride + offset],
+               4 - offset);
+      }
+    } else {
+      for (int k_height = 0; k_height < 3; ++k_height) {
+        memcpy(selected_data[k_height],
+               &input_block[k_height * workspace_height_stride + offset], 4);
+      }
+    }
+  }
+
+  // Straight implementation of 3x3 filter within sub-micro block.
+  static inline void Calculate3x3FilterOutput(
+      const DepthwiseConvDotProdParams& function_params, int sub_block,
+      const int8 selected_data[3][4], const int8 filter_bank[3][2][4][4],
+      const int32* bias_data, uint8 output_values[4]) {
+    const int32 output_activation_min =
+        function_params.quantized_activation_min;
+    const int32 output_activation_max =
+        function_params.quantized_activation_max;
+    const int32 output_multiplier = function_params.output_multiplier;
+    const int32 output_shift = function_params.output_shift;
+    const int32 output_offset = function_params.output_offset;
+    for (int d = 0; d < 4; ++d) {
+      int32 acc = 0;
+      for (int y = 0; y < 3; ++y) {
+        for (int x = 0; x < 4; ++x) {
+          int32 input_val = selected_data[y][x];
+          int32 filter_val = filter_bank[y][sub_block][d][x];
+          acc += filter_val * input_val;
+        }
+      }
+      acc += bias_data[d];
+      acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+          DepthwiseConvOutputRounding::kUpward>(acc, output_multiplier,
+                                                output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_values[d] = static_cast<uint8>(acc);
+    }
+  }
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int depth = function_params->output_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int workspace_width_micro_repeats =
+        function_params->workspace_width_micro_repeats;
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    constexpr int bias_increment = 4;
+    TFLITE_DCHECK_EQ(function_params->bias_increment, bias_increment);
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8 filter_bank[3][2][4][4];  // Height 3, sub-block,  depth 4, width 4.
+    // Simulate NEON-register input data concatenation + sub-selection.
+    int8 sub_selected_input_data[3][4];  // Height 3, depth 4, width 4.
+    uint8 output_values[4];              // Depth 4.
+
+    // The outer 3 loops go through all the micro blocks in a macro block, and
+    // separately treat the two sub-blocks within each micro block.
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      memcpy(filter_bank[0][0][0],
+             filter_workspace + j_depth * shuffled_filter_increment,
+             shuffled_filter_increment);
+
+      for (int s = 0; s < 2; ++s) {
+        for (int k_height = 0; k_height < block_height; ++k_height) {
+          const int8* scratch_data =
+              scratch_block_data +
+              workspace_height_stride * k_height * stride_val;
+          uint8* output_data =
+              output_block_data + output_height_stride * k_height + 8 * j_depth;
+
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            const int output_width = i_width == output_width_micro_repeats
+                                         ? residual_width
+                                         : four_over_stride;
+            const bool no_right_block = i_width == output_width_micro_repeats &&
+                                        output_width_overall_micro_repeats ==
+                                            workspace_width_micro_repeats;
+            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+            const int8* input_data = scratch_data + 4 * i_width;
+            // Iterate over input width shifts within 4x4 blocks.
+            for (int x = 0; x < output_width; ++x) {
+              ConcatenateInputSubBlocks(x * stride_val, workspace_height_stride,
+                                        no_right_block, input_data,
+                                        sub_selected_input_data);
+              Calculate3x3FilterOutput(
+                  *function_params, s, sub_selected_input_data, filter_bank,
+                  bias_data + (2 * j_depth + s) * bias_increment,
+                  output_values);
+              for (int d = 0; d < 4; ++d) {
+                output_data[depth * (four_over_stride * i_width + x) + 4 * s +
+                            d] = output_values[d];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+// Top-level implementation function for 3x3 depthwise convolution using
+// NEON dot-product instructions.
+//
+// MACRO & MICRO BLOCKS
+//
+// The task is divided into macro blocks. Data is copied first into a macro
+// block in a workspace. This has two purposes: (a) bringing data into
+// cache, and (b) permuting data so that it can be used much more easily in
+// a dot-product filter.
+//
+// When there is no depth multiplication:
+//
+// The permutations required for dot-products are local, within 4 data points
+// down the depth and 4 across the width. We want to pull in input data at least
+// 8-bytes at a time, down the depth, and so we divide the macro blocks into
+// 1x4x8 (height, width, depth) and further divide the micro blocks into
+// sub-blocks with shape (1x4x4).
+//
+// Each macro-block is constructed from micro-blocks that are internally
+// rearranged during loading into the macro-block workspace.
+//
+// In other words, the micro-block shape is
+//     {1, 1, 4, 8}
+// Each macro block is typically shape
+//     {1, height_block_size, 4 * workspace_width_micro_repeats, 64}
+// and workspace_width_micro_repeats is chosen so it fits into the
+// workspace.
+//
+// However, if depth < 64, we decrease the macro block depth, enabling us to
+// increase the macro-block width.
+//
+// When there is depth multiplication:
+//
+// We require input-depth = 1 and exploit that instead.  Note that output data
+// is still full-depth, *as is the filter and bias data after certain
+// adjustments*, and so the filter stage in this case still proceeds in
+// terms of sub-blocks.
+//
+// The Magic of these numbers:
+//     4 is the number of input elements used in each dot-product.
+//     8 is the number of inputs we load at a time into a register.
+//     64 is min amount of data to be loaded in a stretch (when possible).
+//
+// FILTER DATA PREPARATION
+//
+// Filter data needs to be permuted in a fashion like that of input data, and
+// this is done in a preprocessing stage. In addition, this stage extends the
+// filter in the direction of width from 3 to 4. The extra filter taps are set
+// to zero so that input data does not have to be zeroed before applying
+// dot-products.
+//
+// OVERALL COUNTS: HANDLING TRAILING ITERATION
+//
+// Often it is necessary to handle the last iteration in a loop differently,
+// generally because the final item is shorter. The logic to detect the
+// special case can be a bit expensive. We use a scheme in which there are
+// two counts, in a pattern like xxx_yyy_repeats and
+// xxx_overall_yyy_repeats. The first gives the count of "normal"
+// iterations. The loop iterates over the second count, and the induction
+// variable is checked to see if it reaches xxx_yyy_repeats. If there is no
+// special trailing iteration, xxx_yyy_repeats = xxx_overall_yyy_repeats,
+// and the special code is not executed.
+//
+// Example:
+// Suppose that we characterize a size s as
+// f(s) -> (block-4-repetitions, remainder, overall_repetitions):
+// f(11) -> (2, 3, 3)
+// f(12) -> (3, 0, 3)
+// f(13) -> (3, 1, 4)
+//
+// POINTING OUTSIDE OF INPUT ARRAY.
+//
+// When there is padding, the input data pointer passed to the fill routines
+// points outside of the input array and into a kind-of virtual padded
+// margin. It turns out that this simplifies the code and removes
+// conditional statements. It is hard to explain why without comparing two
+// versions of the code. In summary, this way the adjustment into the margin
+// can be made unconditionally, and the correction back into the input array
+// is done where there is a conditional already.
+//
+// OVERLAP
+//
+// Since this is *depthwise* conv, neither the batch nor the depth have overlap.
+// The height and depth overlap by (filter_size - 1). Thus some data is used
+// twice on the borders of macro blocks.
+//
+template <DepthwiseConvImplementation implementation>
+inline void DepthwiseConvDotProduct3x3(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data) {
+  // Check kernel restrictions.
+  constexpr int filter_size = 3;
+  constexpr int kSymmetricZeroPoint = 128;
+  constexpr int kMaxStride = 2;
+  constexpr int kMaxPadding = 1;
+  TFLITE_DCHECK_EQ(params.weights_offset, -kSymmetricZeroPoint);
+  TFLITE_DCHECK_LE(params.stride_width, kMaxStride);
+  TFLITE_DCHECK_EQ(params.stride_height, params.stride_width);
+  TFLITE_DCHECK_EQ(params.dilation_width_factor, 1);
+  TFLITE_DCHECK_EQ(params.dilation_height_factor, 1);
+  TFLITE_DCHECK_LE(params.padding_values.width, kMaxPadding);
+  TFLITE_DCHECK_LE(params.padding_values.height, kMaxPadding);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+
+  // Key kernel parameters (along with padding handled later).
+  const int stride = params.stride_width;
+  const int depth_multiplier = params.depth_multiplier;
+  const bool has_depth_multiplication = depth_multiplier > 1;
+
+  // Extract task dimensions.
+  const int input_depth = input_shape.Dims(3);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  TFLITE_DCHECK(!has_depth_multiplication || input_depth == 1);
+  TFLITE_DCHECK(has_depth_multiplication || input_depth == output_depth);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  TFLITE_DCHECK_EQ(input_depth * depth_multiplier, output_depth);
+  TFLITE_DCHECK_EQ(MatchingDim(filter_shape, 1, filter_shape, 2), filter_size);
+
+  // Return now if nothing to do.
+  if (output_width == 0 || output_height == 0) {
+    return;
+  }
+
+  // Kernel parameter structure: set basic fields.
+  //
+  // In asm it is easier to pass a structure than more than, say, 8 parameters.
+  DepthwiseConvDotProdParams function_params;
+  function_params.input_depth = input_depth;
+  function_params.output_depth = output_depth;
+  function_params.input_offset = params.input_offset;
+  function_params.output_offset = params.output_offset;
+  function_params.output_multiplier = params.output_multiplier;
+  function_params.output_shift = params.output_shift;
+  function_params.quantized_activation_min = params.quantized_activation_min;
+  function_params.quantized_activation_max = params.quantized_activation_max;
+  function_params.stride = stride;
+
+  // Handle inbound bias data.
+  //
+  // Note that this data is adjusted in a per-depth process before the main
+  // filters. The adjustment accounts for a non-symmetric input offset.
+  //
+  // Kernel subroutines need to be able to operate consistently on an bias
+  // array. Where there is no bias, we provide one filled with zeros.
+  constexpr int kMinBiasLoad = 8;
+  int32 zero_bias_data[kMinBiasLoad];
+  if (bias_data) {
+    function_params.bias_increment = 4;
+  } else {
+    memset(zero_bias_data, 0, sizeof(zero_bias_data));
+    bias_data = &zero_bias_data[0];
+    function_params.bias_increment = 0;
+  }
+  TFLITE_DCHECK_LE(2 * function_params.bias_increment, kMinBiasLoad);
+
+  // Process padding.
+  //
+  // Whether "correct" or not, this matches ComputeConvSizes. When there is
+  // stride > 1 there can be padding on the bottom or top, and therefore
+  // we need to consider padding. This is true even if one or other of the
+  // padding_values is 0.
+  const int padded_width = (output_width - 1) * stride + filter_size;
+  {
+    const int padding_left = params.padding_values.width;
+    // Right padding would be -1 if discarding input because of stride.
+    const int padding_right =
+        std::max(padded_width - input_width - padding_left, 0);
+    const int padding_top = params.padding_values.height;
+    const int padded_height = (output_height - 1) * stride + filter_size;
+    const int padding_bottom =
+        std::max(padded_height - input_height - padding_top, 0);
+
+    function_params.padding_left = padding_left;
+    function_params.padding_right = padding_right;
+    function_params.padding_top = padding_top;
+    function_params.padding_bottom = padding_bottom;
+
+    TFLITE_DCHECK_LE(padding_left, padding_right);
+    TFLITE_DCHECK_LE(padding_top, padding_bottom);
+  }
+  // When stride == 1 left or top padding may only be non-zero.
+  // This is when padding is specified but not needed on a trailing dimension.
+  // When stride == 2 right or bottom padding may only be non-zero.
+  // This is a result of the details of the padding calculations.
+  const bool padding_required =
+      params.padding_type == tflite::PaddingType::kSame ||
+      function_params.padding_right > 0 || function_params.padding_bottom > 0;
+
+  // Choose parameter-specific kernel subroutines.
+  //
+  // The main part of the kernel has two stages. First, a temporary workspace is
+  // filled with padded and permuted data. Second, the filter is applied to the
+  // workspace data to generate output.
+  //
+  // The workspace fill stage handles padding so that the filter stage does not
+  // need to account for it. The workspace fill stage does not need to
+  // understand striding, and implicitly handles striding through the parameters
+  // that it is given.
+  using pack_macro_block_func_t = decltype(
+      &PackMacroBlock<implementation,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      0>::Run);
+  using kernel_macro_block_func_t = decltype(
+      &KernelMacroBlock<implementation,
+                        DepthwiseConvDepthMultiplication::kNoMultiplication,
+                        1>::Run);
+  pack_macro_block_func_t pack_macro_block_func;
+  kernel_macro_block_func_t kernel_macro_block_func;
+  {
+    if (has_depth_multiplication) {
+      if (padding_required) {
+        pack_macro_block_func =
+            PackMacroBlock<implementation,
+                           DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                           /*max_padding=*/1>::Run;
+      } else {
+        pack_macro_block_func =
+            PackMacroBlock<implementation,
+                           DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                           /*max_padding=*/0>::Run;
+      }
+      if (stride == 1) {
+        kernel_macro_block_func =
+            KernelMacroBlock<implementation,
+                             DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                             /*stride=*/1>::Run;
+      } else {
+        kernel_macro_block_func =
+            KernelMacroBlock<implementation,
+                             DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                             /*stride=*/2>::Run;
+      }
+    } else {
+      if (padding_required) {
+        pack_macro_block_func =
+            PackMacroBlock<implementation,
+                           DepthwiseConvDepthMultiplication::kNoMultiplication,
+                           /*max_padding=*/1>::Run;
+      } else {
+        pack_macro_block_func =
+            PackMacroBlock<implementation,
+                           DepthwiseConvDepthMultiplication::kNoMultiplication,
+                           /*max_padding=*/0>::Run;
+      }
+      if (stride == 1) {
+        kernel_macro_block_func = KernelMacroBlock<
+            implementation, DepthwiseConvDepthMultiplication::kNoMultiplication,
+            /*stride=*/1>::Run;
+      } else {
+        kernel_macro_block_func = KernelMacroBlock<
+            implementation, DepthwiseConvDepthMultiplication::kNoMultiplication,
+            /*stride=*/2>::Run;
+      }
+    }
+  }
+
+  // Stride-only variables.
+  //
+  // stride == 1 ? 4 : 2:
+  const int output_height_per_macro = 6 - 2 * stride;
+  // output_height_per_macro * stride:
+  constexpr int input_height_per_macro = 4;
+  // Number of rows per micro block (= rows per macro block) is
+  //   (output_height_per_macro - 1) * stride + 1 + (filter_size - 1)
+  //   = stride == 1 ? 3 + filter_size : 2 + filter_size:
+  const int height_block_size = 4 + filter_size - stride;
+  const int input_height_overlap = filter_size - stride;
+  // stride == 1 ? 4 : 2:
+  function_params.four_over_stride = output_height_per_macro;
+
+  TFLITE_DCHECK_EQ(stride * function_params.four_over_stride, 4);
+  TFLITE_DCHECK_EQ(height_block_size,
+                   input_height_per_macro + input_height_overlap);
+
+  // Create workspaces.
+  //
+  // Filter workspace is for shuffle: only first depth/8 is used.
+  // indexed as [depth/8][sub-block][height][depth][width].
+  TFLITE_DCHECK_LE(output_depth, kDepthwiseConvAdjustedBiasLimit);
+  TFLITE_DCHECK_EQ(kDepthwiseConvAdjustedBiasLimit % 8, 0);
+  int8 macroblock_workspace[kDepthwiseConvScratchWorkspaceSize];
+  int32 adjusted_bias_data[kDepthwiseConvAdjustedBiasLimit];
+  int8 filter_workspace[kDepthwiseConvAdjustedBiasLimit >> 3][3][2][4][4];
+
+  // Output depth characterization.
+  //
+  const int depth_macro_count = output_depth / 64;
+  const int depth_overall_macro_count = (output_depth + 63) / 64;
+  // Number of micro blocks down the depth in a final incomplete macro block.
+  const int depth_trailing_micro_repeats = output_depth / 8 % 8;
+  // The output_depth may not have a remainder: it must be a multiple of 8.
+  TFLITE_DCHECK_EQ(output_depth,
+                   64 * depth_macro_count + 8 * depth_trailing_micro_repeats);
+
+  // Characterize the first macro block depth, the largest.
+  //
+  // We base treatment of the width on the trailing macro block if there are
+  // no full blocks, in order to do more work together (that is, increase
+  // workspace_width_micro_repeats when largest_macro_depth < 64).
+  const int largest_macro_depth =
+      has_depth_multiplication
+          ? 1
+          : (depth_macro_count > 0 ? 64 : 8 * depth_trailing_micro_repeats);
+
+  // Characterize width, consumption of input and generation of output.
+  //
+  // In the case of depth multiplication, we ensure that some of the workspace
+  // at the end remains unused. This enables the filter routines to load the
+  // "next" data, of at least 16 bytes, even when at the end of the workspace.
+  // It is relatively expensive to detect the end micro block. It is also very
+  // difficult to test for (to trigger) erroneous reads (past end of array) in
+  // the depth multplication case.
+  int workspace_width_micro_repeats =
+      (has_depth_multiplication
+           ? kDepthwiseConvScratchWorkspaceSize - kWorkspaceExtension
+           : kDepthwiseConvScratchWorkspaceSize) /
+      (4 * largest_macro_depth * height_block_size);
+  // When there is no depth multiplication, the workspace depth is a multiple of
+  // 8, which ensures that workspace rows are 16-byte aligned. (Actually 32,
+  // because of the micro width of 4.) This is not necessarily the case under
+  // depth multiplication, so we adjust now to impose this restriction.
+  if (has_depth_multiplication) {
+    workspace_width_micro_repeats = (workspace_width_micro_repeats / 4) * 4;
+  }
+  TFLITE_DCHECK_EQ((workspace_width_micro_repeats * largest_macro_depth) % 4,
+                   0);
+  // Discount 1 of the micro-block repeats in each macro block to account for
+  // overlap.
+  const int consumed_width_per_macro_block =
+      4 * (workspace_width_micro_repeats - 1);
+  const int output_width_per_macro_block =
+      function_params.four_over_stride * (workspace_width_micro_repeats - 1);
+  TFLITE_DCHECK_GT(workspace_width_micro_repeats, 1);
+  TFLITE_DCHECK_EQ(output_width_per_macro_block * stride,
+                   consumed_width_per_macro_block);
+
+  // Width repetitions and residuals.
+  //
+  // Use of the workspace is characterized primarily in terms of *padded input*.
+  // Striding only matters in a few places.
+  //
+  // Simplifications: We require that there always be at least one full
+  // micro-block across the width. Since the maximum padding is 1, the trailing
+  // padding cannot span two micro blocks.
+  const int residual_micro_width = padded_width % 4;
+  // We base the count of macro blocks on the amount of padded input data each
+  // one consumes.
+  int width_overall_macro_count = (padded_width - residual_micro_width +
+                                   consumed_width_per_macro_block - 1) /
+                                  consumed_width_per_macro_block;
+  // Recall that we left a micro block at the end of each macro block for use as
+  // overlap. There is a special case in which we can use one fewer macro
+  // blocks, with the last one consuming extra input. (But not if the
+  // calculation thinks that we can use zero blocks.)
+  if (padded_width <=
+      ((width_overall_macro_count - 1) * consumed_width_per_macro_block + 4)) {
+    width_overall_macro_count -= 1;
+  }
+  width_overall_macro_count = std::max(width_overall_macro_count, 1);
+  // We always have to treat the final macro block along width as trailing,
+  // because even if it is full in terms of padded input, it will be incomplete
+  // in terms of output.
+  const int width_macro_count = width_overall_macro_count - 1;
+  // Micro blocks are traversed in terms of input in fill routines.
+  const int width_trailing_micro_repeats =
+      (padded_width - consumed_width_per_macro_block * width_macro_count) / 4;
+  const int width_overall_trailing_micro_repeats =
+      (padded_width - consumed_width_per_macro_block * width_macro_count + 3) /
+      4;
+  // Micro blocks are traversed in terms of output in filtering routines.
+  const int residual_output_micro_width =
+      (output_width - 1) % function_params.four_over_stride + 1;
+  const int output_width_trailing_micro_repeats =
+      residual_micro_width > (filter_size - 1)
+          ? width_trailing_micro_repeats
+          : width_trailing_micro_repeats - 1;
+  // Check results.
+  TFLITE_DCHECK_GT(width_overall_trailing_micro_repeats, 0);
+  TFLITE_DCHECK_EQ(padded_width,
+                   residual_micro_width +
+                       consumed_width_per_macro_block * width_macro_count +
+                       4 * width_trailing_micro_repeats);
+  TFLITE_DCHECK_LE(width_overall_macro_count, width_macro_count + 1);
+  TFLITE_DCHECK_GE(width_overall_macro_count, width_macro_count);
+
+  // Height repetitions and residuals.
+  //
+  const int height_macro_count = output_height / output_height_per_macro;
+  const int residual_output_height = output_height % output_height_per_macro;
+  const int height_overall_macro_count =
+      (output_height + output_height_per_macro - 1) / output_height_per_macro;
+  TFLITE_DCHECK_EQ(
+      output_height,
+      residual_output_height + output_height_per_macro * height_macro_count);
+  TFLITE_DCHECK_LE(height_overall_macro_count, height_macro_count + 1);
+  TFLITE_DCHECK_GE(height_overall_macro_count, height_macro_count);
+
+  // Data strides.
+  //
+  const int input_height_stride = input_width * input_depth;
+  const int output_height_stride = output_width * output_depth;
+  const int input_batch_stride = input_height_stride * input_height;
+  const int output_batch_stride = output_height_stride * output_height;
+  const int input_depth_macro_stride = has_depth_multiplication ? 0 : 64;
+  const int input_width_macro_stride =
+      input_depth * consumed_width_per_macro_block;
+  const int output_width_macro_stride =
+      output_depth * output_width_per_macro_block;
+
+  // Store parameters that do not vary across macro blocks.
+  //
+  function_params.workspace_width_micro_repeats = workspace_width_micro_repeats;
+  function_params.height_macro_count = height_overall_macro_count;
+  function_params.width_macro_count = width_overall_macro_count;
+  function_params.input_height_stride = input_height_stride;
+  function_params.output_height_stride = output_height_stride;
+  function_params.residual_width = residual_micro_width;
+
+  // Preprocess filter and bias data.
+  //
+  ProcessPerDepth<implementation>::Run(filter_data, bias_data,
+                                       filter_workspace[0][0][0][0],
+                                       adjusted_bias_data, &function_params);
+  function_params.bias_increment = 4;  // Adjusted bias data always spans depth.
+
+  // Main process.
+  //
+  // Most kernels are nested batch-height-width-depth. Here we proceed over
+  // macro blocks batch-width-depth-height.
+  //
+  // Example of handling of trailing iteration: when there is trailing depth,
+  // depth_overall_macro_count = depth_macro_count + 1, so we can adjust the
+  // dimensions for trailing macro blocks by looking for
+  // j_depth == depth_macro_count.
+  for (int b = 0; b < batches; ++b) {
+    for (int k_width = 0; k_width < width_overall_macro_count; ++k_width) {
+      // Figure out the work to be done for this macro block. If it trails in
+      // any dimension, the work in that dimension is adjusted.
+      // The work to be done across widths has 3 cases:
+      // (a) A full macro block,
+      // (b) Partial terminal macro block, with input and output ending in
+      //     same micro block, and
+      // (c) Partial terminal macro block, with output corresponding to one
+      //     fewer micro blocks, because filter extends across micro-block
+      //     boundary.
+      if (k_width != width_macro_count) {
+        function_params.output_residual_width = 0;
+        function_params.input_width_micro_repeats =
+            workspace_width_micro_repeats;
+        function_params.input_width_overall_micro_repeats =
+            workspace_width_micro_repeats;
+        function_params.output_width_micro_repeats =
+            workspace_width_micro_repeats - 1;
+      } else {
+        function_params.output_residual_width = residual_output_micro_width;
+        function_params.input_width_micro_repeats =
+            width_trailing_micro_repeats;
+        function_params.input_width_overall_micro_repeats =
+            width_overall_trailing_micro_repeats;
+        function_params.output_width_micro_repeats =
+            output_width_trailing_micro_repeats;
+      }
+      function_params.output_width_overall_micro_repeats =
+          function_params.output_residual_width == 0
+              ? function_params.output_width_micro_repeats
+              : function_params.output_width_micro_repeats + 1;
+
+      for (int j_depth = 0; j_depth < depth_overall_macro_count; ++j_depth) {
+        const uint8* input_data_block =
+            input_data + b * input_batch_stride +
+            j_depth * input_depth_macro_stride +
+            k_width * input_width_macro_stride -
+            function_params.padding_left * input_depth -
+            function_params.padding_top * input_height_stride;
+        uint8* output_data_block = output_data + b * output_batch_stride +
+                                   j_depth * 64 +
+                                   k_width * output_width_macro_stride;
+
+        function_params.depth_micro_repeats =
+            j_depth == depth_macro_count ? depth_trailing_micro_repeats : 8;
+        // Under depth multiplication the workspace_height_stride does not have
+        // to depend on input_width_overall_micro_repeats, but this improves the
+        // compactness of workspace use.
+        const int workspace_height_stride =
+            has_depth_multiplication
+                ? 16 * ((function_params.input_width_overall_micro_repeats +
+                         3) >>
+                        2)
+                : 4 * function_params.input_width_overall_micro_repeats * 8 *
+                      function_params.depth_micro_repeats;
+        TFLITE_DCHECK_EQ(workspace_height_stride % 16, 0);
+        function_params.workspace_height_stride = workspace_height_stride;
+
+        // For the first macro block for output rows we fill in the first few
+        // rows.  After this we will copy them (see below in loop.)
+        function_params.inbound_block_height = input_height_overlap;
+        pack_macro_block_func(-1, k_width, input_data_block,
+                              macroblock_workspace, &function_params);
+        input_data_block += input_height_stride * input_height_overlap;
+
+        for (int i_height = 0; i_height < height_overall_macro_count;
+             ++i_height) {
+          if (i_height != height_macro_count) {
+            function_params.inbound_block_height = input_height_per_macro;
+            function_params.outbound_block_height = output_height_per_macro;
+          } else {
+            function_params.inbound_block_height =
+                residual_output_height * stride;
+            function_params.outbound_block_height = residual_output_height;
+          }
+          TFLITE_DCHECK_LT(i_height * output_height_per_macro, output_height);
+          TFLITE_DCHECK_LT(i_height * input_height_per_macro, input_height);
+          TFLITE_DCHECK_LT(k_width * output_width_per_macro_block,
+                           output_width);
+          TFLITE_DCHECK_LT(k_width * consumed_width_per_macro_block,
+                           input_width);
+
+          // Macro blocks overlap by input_height_overlap rows, so we copy
+          // those instead of filling in afresh.  The first macro block across
+          // output rows was filled in outside of the loop (above).
+          if (i_height > 0) {
+            memcpy(macroblock_workspace,
+                   macroblock_workspace +
+                       input_height_per_macro * workspace_height_stride,
+                   input_height_overlap * workspace_height_stride);
+          }
+
+          pack_macro_block_func(
+              i_height, k_width, input_data_block,
+              macroblock_workspace +
+                  input_height_overlap * workspace_height_stride,
+              &function_params);
+
+          kernel_macro_block_func(macroblock_workspace,
+                                  filter_workspace[8 * j_depth][0][0][0],
+                                  adjusted_bias_data + 64 * j_depth,
+                                  output_data_block, &function_params);
+
+          input_data_block += input_height_stride * input_height_per_macro;
+          output_data_block += output_height_stride * output_height_per_macro;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace depthwise_conv
+}  // namespace optimized_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index 5dc03e7d5344c2e577c6f7da9fa180d0672bfcb3..6e8390ce5509c836a5dfd4859fc6495301e77a8e 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -63,6 +63,7 @@ using reference_ops::ConcatenationWithScaling;
 using reference_ops::DepthConcatenation;
 using reference_ops::Dequantize;
 using reference_ops::Div;
+using reference_ops::Elu;
 using reference_ops::FakeQuant;
 using reference_ops::Fill;
 using reference_ops::Gather;
@@ -182,45 +183,6 @@ MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data,
   return MatrixMap<Scalar>(data, rows, cols);
 }
 
-// This is like the template-parameter version, except that the power-of-two is
-// passed as a function parameter. The template version is to be preferred,
-// since some target hardware optimizations depend on the range of the exponent.
-template <typename IntegerType>
-IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) {
-  if (exponent == 0) {
-    return x;
-  }
-  using ScalarIntegerType =
-      typename gemmlowp::FixedPointRawTypeTraits<IntegerType>::ScalarRawType;
-  const IntegerType min =
-      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::min());
-  const IntegerType max =
-      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::max());
-  const int ScalarIntegerTypeBits = 8 * sizeof(ScalarIntegerType);
-
-  const std::int32_t threshold =
-      ((1 << (ScalarIntegerTypeBits - 1 - exponent)) - 1);
-  const IntegerType positive_mask =
-      gemmlowp::MaskIfGreaterThan(x, gemmlowp::Dup<IntegerType>(threshold));
-  const IntegerType negative_mask =
-      gemmlowp::MaskIfLessThan(x, gemmlowp::Dup<IntegerType>(-threshold));
-
-  IntegerType result = gemmlowp::ShiftLeft(x, exponent);
-  result = gemmlowp::SelectUsingMask(positive_mask, max, result);
-  result = gemmlowp::SelectUsingMask(negative_mask, min, result);
-  return result;
-}
-
-// This is like the template-parameter version, except that the power-of-two is
-// passed as a function parameter. See raw-integer version for further comments.
-template <typename tRawType, int tIntegerBits>
-gemmlowp::FixedPoint<tRawType, tIntegerBits>
-SaturatingRoundingMultiplyByPOTParam(
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) {
-  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
-      SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
-}
-
 inline void AddBiasAndEvalActivationFunction(float output_activation_min,
                                              float output_activation_max,
                                              const RuntimeShape& bias_shape,
@@ -841,24 +803,21 @@ inline void FullyConnected(
 }
 
 #ifdef USE_NEON
-inline void FullyConnectedAsGEMV(
+inline void FullyConnectedAsGEMVWorkerImpl(
     const RuntimeShape& input_shape, const uint8* input_data,
     int32 input_offset, const RuntimeShape& filter_shape,
     const uint8* filter_data, int32 filter_offset,
     const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
     int32 output_multiplier, int output_shift, int32 output_activation_min,
     int32 output_activation_max, const RuntimeShape& output_shape,
-    uint8* output_data) {
+    uint8* output_data, int row_start, int row_end) {
   gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit");
   TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
   TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
   TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
   const int output_dim_count = output_shape.DimensionsCount();
-  const int filter_dim_count = filter_shape.DimensionsCount();
   TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_shape, output_dim_count - 1), 1);
   const int input_size = FlatSizeSkipDim(input_shape, 0);
-  const int output_size = MatchingDim(filter_shape, filter_dim_count - 2,
-                                      output_shape, output_dim_count - 1);
   static constexpr int kPeel = 4;
   const bool shift_left = (output_shift > 0);
   for (int k = 0; k < input_size; k += 64) {
@@ -867,81 +826,139 @@ inline void FullyConnectedAsGEMV(
   for (int k = 0; k < kPeel * input_size; k += 64) {
     optimized_ops_preload_l1_stream(filter_data + k);
   }
-  TFLITE_DCHECK(!(output_size % kPeel));
-  const int32* bias_ptr = bias_data;
-  uint8* output_ptr = output_data;
-  for (int out = 0; out < output_size; out += kPeel) {
-    int32x4_t acc[kPeel];
-    for (int k = 0; k < kPeel; k++) {
-      acc[k] = vdupq_n_s32(0);
-    }
+
+  TFLITE_DCHECK_GE(row_end - row_start, kPeel);
+
+  for (int out = row_start; out < row_end; out += kPeel) {
+    out = std::min(out, row_end - kPeel);
+    int32x4_t acc0 = vdupq_n_s32(0);
+    int32x4_t acc1 = acc0;
+    int32x4_t acc2 = acc0;
+    int32x4_t acc3 = acc0;
     const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
     const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
     int in = 0;
     for (; in <= input_size - 16; in += 16) {
       const uint8x16_t input_val_u8 = vld1q_u8(input_data + in);
-      uint8x16_t filter_val_u8[kPeel];
-      for (int k = 0; k < kPeel; k++) {
-        const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
-        filter_val_u8[k] = vld1q_u8(filter_ptr);
-        optimized_ops_preload_l1_stream(filter_ptr + 64);
-      }
-      int16x8_t input_val[2];
-      const uint8x8_t low = vget_low_u8(input_val_u8);
-      const uint8x8_t high = vget_high_u8(input_val_u8);
-      input_val[0] = vreinterpretq_s16_u16(vmovl_u8(low));
-      input_val[1] = vreinterpretq_s16_u16(vmovl_u8(high));
-      input_val[0] = vaddq_s16(input_val[0], input_offset_vec);
-      input_val[1] = vaddq_s16(input_val[1], input_offset_vec);
-      int16x8_t filter_val[kPeel][2];
-      for (int k = 0; k < kPeel; k++) {
-        const uint8x8_t low = vget_low_u8(filter_val_u8[k]);
-        const uint8x8_t high = vget_high_u8(filter_val_u8[k]);
-        filter_val[k][0] = vreinterpretq_s16_u16(vmovl_u8(low));
-        filter_val[k][1] = vreinterpretq_s16_u16(vmovl_u8(high));
-        filter_val[k][0] = vaddq_s16(filter_val[k][0], filter_offset_vec);
-        filter_val[k][1] = vaddq_s16(filter_val[k][1], filter_offset_vec);
-      }
-      for (int p = 0; p < 2; p++) {
-        for (int k = 0; k < kPeel; k++) {
-          acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k][p]),
-                             vget_low_s16(input_val[p]));
-        }
-        for (int k = 0; k < kPeel; k++) {
-          acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k][p]),
-                             vget_high_s16(input_val[p]));
-        }
-      }
+      const uint8* filter_ptr = filter_data + in + out * input_size;
+      uint8x16_t filter_val_u8_0 = vld1q_u8(filter_ptr);
+      optimized_ops_preload_l1_stream(filter_ptr + 64);
+      filter_ptr += input_size;
+      uint8x16_t filter_val_u8_1 = vld1q_u8(filter_ptr);
+      optimized_ops_preload_l1_stream(filter_ptr + 64);
+      filter_ptr += input_size;
+      uint8x16_t filter_val_u8_2 = vld1q_u8(filter_ptr);
+      optimized_ops_preload_l1_stream(filter_ptr + 64);
+      filter_ptr += input_size;
+      uint8x16_t filter_val_u8_3 = vld1q_u8(filter_ptr);
+      optimized_ops_preload_l1_stream(filter_ptr + 64);
+      int16x8_t input_val_0, input_val_1;
+      uint8x8_t low = vget_low_u8(input_val_u8);
+      uint8x8_t high = vget_high_u8(input_val_u8);
+      input_val_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      input_val_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      input_val_0 = vaddq_s16(input_val_0, input_offset_vec);
+      input_val_1 = vaddq_s16(input_val_1, input_offset_vec);
+      low = vget_low_u8(filter_val_u8_0);
+      high = vget_high_u8(filter_val_u8_0);
+      int16x8_t filter_val_0_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      int16x8_t filter_val_0_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      filter_val_0_0 = vaddq_s16(filter_val_0_0, filter_offset_vec);
+      filter_val_0_1 = vaddq_s16(filter_val_0_1, filter_offset_vec);
+      low = vget_low_u8(filter_val_u8_1);
+      high = vget_high_u8(filter_val_u8_1);
+      int16x8_t filter_val_1_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      int16x8_t filter_val_1_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      filter_val_1_0 = vaddq_s16(filter_val_1_0, filter_offset_vec);
+      filter_val_1_1 = vaddq_s16(filter_val_1_1, filter_offset_vec);
+      low = vget_low_u8(filter_val_u8_2);
+      high = vget_high_u8(filter_val_u8_2);
+      int16x8_t filter_val_2_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      int16x8_t filter_val_2_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      filter_val_2_0 = vaddq_s16(filter_val_2_0, filter_offset_vec);
+      filter_val_2_1 = vaddq_s16(filter_val_2_1, filter_offset_vec);
+      low = vget_low_u8(filter_val_u8_3);
+      high = vget_high_u8(filter_val_u8_3);
+      int16x8_t filter_val_3_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      int16x8_t filter_val_3_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      filter_val_3_0 = vaddq_s16(filter_val_3_0, filter_offset_vec);
+      filter_val_3_1 = vaddq_s16(filter_val_3_1, filter_offset_vec);
+      acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0_0),
+                       vget_low_s16(input_val_0));
+      acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1_0),
+                       vget_low_s16(input_val_0));
+      acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2_0),
+                       vget_low_s16(input_val_0));
+      acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3_0),
+                       vget_low_s16(input_val_0));
+      acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0_1),
+                       vget_low_s16(input_val_1));
+      acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1_1),
+                       vget_low_s16(input_val_1));
+      acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2_1),
+                       vget_low_s16(input_val_1));
+      acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3_1),
+                       vget_low_s16(input_val_1));
+      acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0_0),
+                       vget_high_s16(input_val_0));
+      acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1_0),
+                       vget_high_s16(input_val_0));
+      acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2_0),
+                       vget_high_s16(input_val_0));
+      acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3_0),
+                       vget_high_s16(input_val_0));
+      acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0_1),
+                       vget_high_s16(input_val_1));
+      acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1_1),
+                       vget_high_s16(input_val_1));
+      acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2_1),
+                       vget_high_s16(input_val_1));
+      acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3_1),
+                       vget_high_s16(input_val_1));
     }
     for (; in <= input_size - 8; in += 8) {
       const uint8x8_t input_val_u8 = vld1_u8(input_data + in);
-      uint8x8_t filter_val_u8[kPeel];
-      for (int k = 0; k < kPeel; k++) {
-        const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
-        filter_val_u8[k] = vld1_u8(filter_ptr);
-      }
-      int16x8_t input_val;
-      input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
+      const uint8* filter_ptr = filter_data + in + out * input_size;
+      uint8x8_t filter_val_u8_0 = vld1_u8(filter_ptr);
+      filter_ptr += input_size;
+      uint8x8_t filter_val_u8_1 = vld1_u8(filter_ptr);
+      filter_ptr += input_size;
+      uint8x8_t filter_val_u8_2 = vld1_u8(filter_ptr);
+      filter_ptr += input_size;
+      uint8x8_t filter_val_u8_3 = vld1_u8(filter_ptr);
+      int16x8_t input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
       input_val = vaddq_s16(input_val, input_offset_vec);
-      int16x8_t filter_val[kPeel];
-      for (int k = 0; k < kPeel; k++) {
-        filter_val[k] = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8[k]));
-        filter_val[k] = vaddq_s16(filter_val[k], filter_offset_vec);
-      }
-      for (int k = 0; k < kPeel; k++) {
-        acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k]),
-                           vget_low_s16(input_val));
-      }
-      for (int k = 0; k < kPeel; k++) {
-        acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k]),
-                           vget_high_s16(input_val));
-      }
+      int16x8_t filter_val_0 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_0));
+      filter_val_0 = vaddq_s16(filter_val_0, filter_offset_vec);
+      int16x8_t filter_val_1 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_1));
+      filter_val_1 = vaddq_s16(filter_val_1, filter_offset_vec);
+      int16x8_t filter_val_2 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_2));
+      filter_val_2 = vaddq_s16(filter_val_2, filter_offset_vec);
+      int16x8_t filter_val_3 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_3));
+      filter_val_3 = vaddq_s16(filter_val_3, filter_offset_vec);
+      acc0 =
+          vmlal_s16(acc0, vget_low_s16(filter_val_0), vget_low_s16(input_val));
+      acc1 =
+          vmlal_s16(acc1, vget_low_s16(filter_val_1), vget_low_s16(input_val));
+      acc2 =
+          vmlal_s16(acc2, vget_low_s16(filter_val_2), vget_low_s16(input_val));
+      acc3 =
+          vmlal_s16(acc3, vget_low_s16(filter_val_3), vget_low_s16(input_val));
+      acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0),
+                       vget_high_s16(input_val));
+      acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1),
+                       vget_high_s16(input_val));
+      acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2),
+                       vget_high_s16(input_val));
+      acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3),
+                       vget_high_s16(input_val));
     }
     if (in < input_size) {
-      int32 buf[4 * kPeel];
-      for (int k = 0; k < 4; k++) {
-        vst1q_s32(buf + 4 * k, acc[k]);
-      }
+      int32 buf[16];
+      vst1q_s32(buf + 0, acc0);
+      vst1q_s32(buf + 4, acc1);
+      vst1q_s32(buf + 8, acc2);
+      vst1q_s32(buf + 12, acc3);
       for (; in < input_size; in++) {
         int lane = (in + 8 - input_size) % 4;
         const int32 input_val = input_data[in] + input_offset;
@@ -951,26 +968,28 @@ inline void FullyConnectedAsGEMV(
           buf[lane + 4 * k] += filter_val * input_val;
         }
       }
-      for (int k = 0; k < 4; k++) {
-        acc[k] = vld1q_s32(buf + 4 * k);
-      }
+      acc0 = vld1q_s32(buf + 0);
+      acc1 = vld1q_s32(buf + 4);
+      acc2 = vld1q_s32(buf + 8);
+      acc3 = vld1q_s32(buf + 12);
     }
 
     // Horizontally reduce accumulators
-    int32x2_t pairwise_reduced_acc[kPeel];
-    for (int k = 0; k < kPeel; k++) {
-      pairwise_reduced_acc[k] =
-          vpadd_s32(vget_low_s32(acc[k]), vget_high_s32(acc[k]));
-    }
-    static_assert(kPeel == 4, "the code below currently assumes kPeel = 4");
+    int32x2_t pairwise_reduced_acc_0 =
+        vpadd_s32(vget_low_s32(acc0), vget_high_s32(acc0));
+    int32x2_t pairwise_reduced_acc_1 =
+        vpadd_s32(vget_low_s32(acc1), vget_high_s32(acc1));
+    int32x2_t pairwise_reduced_acc_2 =
+        vpadd_s32(vget_low_s32(acc2), vget_high_s32(acc2));
+    int32x2_t pairwise_reduced_acc_3 =
+        vpadd_s32(vget_low_s32(acc3), vget_high_s32(acc3));
     const int32x2_t reduced_lo =
-        vpadd_s32(pairwise_reduced_acc[0], pairwise_reduced_acc[1]);
+        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
     const int32x2_t reduced_hi =
-        vpadd_s32(pairwise_reduced_acc[2], pairwise_reduced_acc[3]);
+        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
     int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
     // Add bias values.
-    int32x4_t bias_vec = vld1q_s32(bias_ptr);
-    bias_ptr += 4;
+    int32x4_t bias_vec = vld1q_s32(bias_data + out);
     reduced = vaddq_s32(reduced, bias_vec);
     if (shift_left) {
       const int32 multiplier_power_of_two = 1 << output_shift;
@@ -993,11 +1012,116 @@ inline void FullyConnectedAsGEMV(
     // Apply the clamping from the activation function
     res8 = vmax_u8(res8, vdup_n_u8(output_activation_min));
     res8 = vmin_u8(res8, vdup_n_u8(output_activation_max));
-    // Store results to destination. Assumes 32bit alignment.
-    vst1_lane_u32(reinterpret_cast<uint32*>(output_ptr),
-                  vreinterpret_u32_u8(res8), 0);
-    output_ptr += kPeel;
+    // Store results to destination.
+    vst1_lane_u8(output_data + out + 0, res8, 0);
+    vst1_lane_u8(output_data + out + 1, res8, 1);
+    vst1_lane_u8(output_data + out + 2, res8, 2);
+    vst1_lane_u8(output_data + out + 3, res8, 3);
+  }
+}
+
+struct FullyConnectedAsGEMVWorkerTask : public gemmlowp::Task {
+  FullyConnectedAsGEMVWorkerTask(const RuntimeShape& input_shape,
+                                 const uint8* input_data, int32 input_offset,
+                                 const RuntimeShape& filter_shape,
+                                 const uint8* filter_data, int32 filter_offset,
+                                 const RuntimeShape& bias_shape,
+                                 const int32* bias_data, int32 output_offset,
+                                 int32 output_multiplier, int output_shift,
+                                 int32 output_activation_min,
+                                 int32 output_activation_max,
+                                 const RuntimeShape& output_shape,
+                                 uint8* output_data, int row_start, int row_end)
+      : input_shape_(input_shape),
+        input_data_(input_data),
+        input_offset_(input_offset),
+        filter_shape_(filter_shape),
+        filter_data_(filter_data),
+        filter_offset_(filter_offset),
+        bias_shape_(bias_shape),
+        bias_data_(bias_data),
+        output_offset_(output_offset),
+        output_multiplier_(output_multiplier),
+        output_shift_(output_shift),
+        output_activation_min_(output_activation_min),
+        output_activation_max_(output_activation_max),
+        output_shape_(output_shape),
+        output_data_(output_data),
+        row_start_(row_start),
+        row_end_(row_end) {}
+
+  void Run() override {
+    FullyConnectedAsGEMVWorkerImpl(
+        input_shape_, input_data_, input_offset_, filter_shape_, filter_data_,
+        filter_offset_, bias_shape_, bias_data_, output_offset_,
+        output_multiplier_, output_shift_, output_activation_min_,
+        output_activation_max_, output_shape_, output_data_, row_start_,
+        row_end_);
+  }
+
+  const RuntimeShape& input_shape_;
+  const uint8* input_data_;
+  int32 input_offset_;
+  const RuntimeShape& filter_shape_;
+  const uint8* filter_data_;
+  int32 filter_offset_;
+  const RuntimeShape& bias_shape_;
+  const int32* bias_data_;
+  int32 output_offset_;
+  int32 output_multiplier_;
+  int output_shift_;
+  int32 output_activation_min_;
+  int32 output_activation_max_;
+  const RuntimeShape& output_shape_;
+  uint8* output_data_;
+  gemmlowp::GemmContext* gemm_context_;
+  int row_start_;
+  int row_end_;
+};
+
+inline void FullyConnectedAsGEMV(
+    const RuntimeShape& input_shape, const uint8* input_data,
+    int32 input_offset, const RuntimeShape& filter_shape,
+    const uint8* filter_data, int32 filter_offset,
+    const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
+    int32 output_multiplier, int output_shift, int32 output_activation_min,
+    int32 output_activation_max, const RuntimeShape& output_shape,
+    uint8* output_data, gemmlowp::GemmContext* gemm_context) {
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_rows = output_shape.Dims(output_dim_count - 1);
+  const int input_size = FlatSizeSkipDim(input_shape, 0);
+  static constexpr int kKernelRows = 4;
+  const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
+      gemm_context->max_num_threads(), output_rows, batches, input_size);
+  if (thread_count == 1) {
+    // Single-thread case: do the computation on the current thread, don't
+    // use a threadpool
+    FullyConnectedAsGEMVWorkerImpl(
+        input_shape, input_data, input_offset, filter_shape, filter_data,
+        filter_offset, bias_shape, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_shape, output_data, 0, output_rows);
+    return;
   }
+
+  // Multi-threaded case: use the gemmlowp context's threadpool.
+  TFLITE_DCHECK_GT(thread_count, 1);
+  std::vector<gemmlowp::Task*> tasks(thread_count);
+  const int kRowsPerWorker =
+      gemmlowp::RoundUp<kKernelRows>(output_rows / thread_count);
+  int row_start = 0;
+  for (int i = 0; i < thread_count; ++i) {
+    int row_end = std::min(output_rows, row_start + kRowsPerWorker);
+    tasks[i] = new FullyConnectedAsGEMVWorkerTask(
+        input_shape, input_data, input_offset, filter_shape, filter_data,
+        filter_offset, bias_shape, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_shape, output_data, row_start, row_end);
+    row_start = row_end;
+  }
+  TFLITE_DCHECK_EQ(row_start, output_rows);
+  gemm_context->workers_pool()->Execute(tasks);
 }
 #endif  // USE_NEON
 
@@ -1054,14 +1178,16 @@ inline void FullyConnected(
   const int filter_dim_count = filter_shape.DimensionsCount();
   const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
 #ifdef USE_NEON
-  const int output_size = MatchingDim(filter_shape, filter_dim_count - 2,
-                                      output_shape, output_dim_count - 1);
-  if (batches == 1 && !(output_size % 4)) {
-    return FullyConnectedAsGEMV(
-        input_shape, input_data, input_offset, filter_shape, filter_data,
-        filter_offset, bias_shape, bias_data, output_offset, output_multiplier,
-        output_shift, output_activation_min, output_activation_max,
-        output_shape, output_data);
+  if (batches == 1) {
+    const int output_size = MatchingDim(filter_shape, filter_dim_count - 2,
+                                        output_shape, output_dim_count - 1);
+    if (output_size >= 4) {
+      return FullyConnectedAsGEMV(
+          input_shape, input_data, input_offset, filter_shape, filter_data,
+          filter_offset, bias_shape, bias_data, output_offset,
+          output_multiplier, output_shift, output_activation_min,
+          output_activation_max, output_shape, output_data, gemm_context);
+    }
   }
 #endif  // USE_NEON
   const int filter_rows = filter_shape.Dims(filter_dim_count - 2);
@@ -2084,6 +2210,21 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
   TFLITE_DCHECK_EQ(output_cols, gemm_input_cols);
   TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows);
   TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows);
+
+#ifdef USE_NEON
+  if (gemm_input_cols == 1 && output_rows >= 4) {
+    RuntimeShape fc_filter_shape{
+        filter_shape.Dims(0),
+        filter_shape.Dims(filter_shape.DimensionsCount() - 1)};
+
+    return FullyConnectedAsGEMV(
+        *gemm_input_shape, gemm_input_data, input_offset, fc_filter_shape,
+        filter_data, filter_offset, bias_shape, bias_data, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_shape, output_data, gemm_context);
+  }
+#endif
+
   gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
       filter_data, filter_rows, filter_cols);
   gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
@@ -2679,7 +2820,7 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
     // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
     // dimension.
     for (int i0 = 0; i0 < y0; ++i0) {
-      const uint8* input2_data_ptr;
+      const uint8* input2_data_ptr = nullptr;
       for (int i1 = 0; i1 < y1; ++i1) {
         input2_data_ptr = input2_data_reset;
         for (int i2 = 0; i2 < y2; ++i2) {
@@ -2708,7 +2849,7 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
     // for y4 == 1 and the loop over y3 is contained within the
     // AddScalarBroadcast function.
     for (int i0 = 0; i0 < y0; ++i0) {
-      const uint8* input2_data_ptr;
+      const uint8* input2_data_ptr = nullptr;
       for (int i1 = 0; i1 < y1; ++i1) {
         input2_data_ptr = input2_data_reset;
         for (int i2 = 0; i2 < y2; ++i2) {
@@ -3065,7 +3206,7 @@ inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
   int y4 = params.broadcast_shape[4];
   if (y4 > 1) {
     for (int i0 = 0; i0 < y0; ++i0) {
-      const uint8* input2_data_ptr;
+      const uint8* input2_data_ptr = nullptr;
       for (int i1 = 0; i1 < y1; ++i1) {
         input2_data_ptr = input2_data_reset;
         for (int i2 = 0; i2 < y2; ++i2) {
@@ -3082,7 +3223,7 @@ inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
     }
   } else {
     for (int i0 = 0; i0 < y0; ++i0) {
-      const uint8* input2_data_ptr;
+      const uint8* input2_data_ptr = nullptr;
       for (int i1 = 0; i1 < y1; ++i1) {
         input2_data_ptr = input2_data_reset;
         for (int i2 = 0; i2 < y2; ++i2) {
@@ -3710,6 +3851,14 @@ inline void AveragePool(const PoolParams& params,
                         const uint8* input_data,
                         const RuntimeShape& output_shape, uint8* output_data) {
   gemmlowp::ScopedProfilingLabel label("AveragePool/8bit");
+
+  // Here, and in other pooling ops, in order to maintain locality of reference,
+  // to minimize some recalculations, and to load into NEON vector registers, we
+  // use an inner loop down the depth. Since depths can be large and hence we
+  // would need arbitrarily large temporary storage, we divide the work up into
+  // depth tranches just within the batch loop.
+  static constexpr int kPoolingAccTrancheSize = 256;
+
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -3722,69 +3871,76 @@ inline void AveragePool(const PoolParams& params,
   const int output_width = output_shape.Dims(2);
   const int stride_height = params.stride_height;
   const int stride_width = params.stride_width;
+
+  uint16 acc[kPoolingAccTrancheSize];
   for (int batch = 0; batch < batches; ++batch) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        const int in_x_origin =
-            (out_x * stride_width) - params.padding_values.width;
-        const int in_y_origin =
-            (out_y * stride_height) - params.padding_values.height;
-        const int filter_x_start = std::max(0, -in_x_origin);
-        const int filter_x_end =
-            std::min(params.filter_width, input_width - in_x_origin);
-        const int filter_y_start = std::max(0, -in_y_origin);
-        const int filter_y_end =
-            std::min(params.filter_height, input_height - in_y_origin);
-        const int filter_count =
-            (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
-        // 2560 is required by MobileNetV2 with depth multiplier 2.
-        static constexpr int kAccBufferMaxSize = 4096;
-        TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
-        uint16 acc[kAccBufferMaxSize];
-        memset(acc, 0, depth * sizeof(acc[0]));
-        const uint8* input_ptr =
-            input_data +
-            depth * (in_x_origin +
-                     input_width * (in_y_origin + input_height * batch));
-        for (int fy = filter_y_start; fy < filter_y_end; fy++) {
-          const uint8* input_row_ptr =
-              input_ptr + depth * (fy * input_width + filter_x_start);
-          for (int fx = filter_x_start; fx < filter_x_end; fx++) {
-            int channel = 0;
+    // We proceed through the depth in tranches (see comment above). The
+    // depth_base is the depth at the beginning of the tranche. The
+    // tranche_depth is the depth dimension of the tranche.
+    for (int depth_base = 0; depth_base < depth;
+         depth_base += kPoolingAccTrancheSize) {
+      const int tranche_depth =
+          std::min(depth - depth_base, kPoolingAccTrancheSize);
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          const int filter_count =
+              (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
+          memset(acc, 0, tranche_depth * sizeof(acc[0]));
+          const uint8* input_ptr =
+              input_data + depth_base +
+              depth * (in_x_origin +
+                       input_width * (in_y_origin + input_height * batch));
+          for (int fy = filter_y_start; fy < filter_y_end; fy++) {
+            const uint8* input_row_ptr =
+                input_ptr + depth * (fy * input_width + filter_x_start);
+            for (int fx = filter_x_start; fx < filter_x_end; fx++) {
+              const uint8* input_channel_ptr = input_row_ptr;
+              int channel = 0;
 #ifdef USE_NEON
-            for (; channel <= depth - 16; channel += 16) {
-              uint16x8_t acc_reg[2];
-              for (int i = 0; i < 2; i++) {
-                acc_reg[i] = vld1q_u16(acc + channel + 8 * i);
+              for (; channel <= tranche_depth - 16; channel += 16) {
+                uint16x8_t acc_reg[2];
+                for (int i = 0; i < 2; i++) {
+                  acc_reg[i] = vld1q_u16(acc + channel + 8 * i);
+                }
+                uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
+                input_channel_ptr += 16;
+                acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg));
+                acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg));
+                for (int i = 0; i < 2; i++) {
+                  vst1q_u16(acc + channel + 8 * i, acc_reg[i]);
+                }
               }
-              uint8x16_t input_reg = vld1q_u8(input_row_ptr);
-              input_row_ptr += 16;
-              acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg));
-              acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg));
-              for (int i = 0; i < 2; i++) {
-                vst1q_u16(acc + channel + 8 * i, acc_reg[i]);
+              for (; channel <= tranche_depth - 8; channel += 8) {
+                uint16x8_t acc_reg = vld1q_u16(acc + channel);
+                uint8x8_t input_reg = vld1_u8(input_channel_ptr);
+                input_channel_ptr += 8;
+                acc_reg = vaddw_u8(acc_reg, input_reg);
+                vst1q_u16(acc + channel, acc_reg);
               }
-            }
-            for (; channel <= depth - 8; channel += 8) {
-              uint16x8_t acc_reg = vld1q_u16(acc + channel);
-              uint8x8_t input_reg = vld1_u8(input_row_ptr);
-              input_row_ptr += 8;
-              acc_reg = vaddw_u8(acc_reg, input_reg);
-              vst1q_u16(acc + channel, acc_reg);
-            }
 #endif
-            for (; channel < depth; ++channel) {
-              acc[channel] += *input_row_ptr++;
+              for (; channel < tranche_depth; ++channel) {
+                acc[channel] += *input_channel_ptr++;
+              }
+              input_row_ptr += depth;
             }
           }
-        }
-        uint8* output_ptr =
-            output_data + Offset(output_shape, batch, out_y, out_x, 0);
-        int channel = 0;
+          uint8* output_ptr = output_data + Offset(output_shape, batch, out_y,
+                                                   out_x, depth_base);
+          int channel = 0;
 #ifdef USE_NEON
 #define AVGPOOL_DIVIDING_BY(FILTER_COUNT)                               \
   if (filter_count == FILTER_COUNT) {                                   \
-    for (; channel <= depth - 8; channel += 8) {                        \
+    for (; channel <= tranche_depth - 8; channel += 8) {                \
       uint16 buf[8];                                                    \
       for (int i = 0; i < 8; i++) {                                     \
         buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT;  \
@@ -3795,25 +3951,26 @@ inline void AveragePool(const PoolParams& params,
       vst1_u8(output_ptr + channel, buf8);                              \
     }                                                                   \
   }
-        AVGPOOL_DIVIDING_BY(9)
-        AVGPOOL_DIVIDING_BY(15)
+          AVGPOOL_DIVIDING_BY(9)
+          AVGPOOL_DIVIDING_BY(15)
 #undef AVGPOOL_DIVIDING_BY
-        for (; channel <= depth - 8; channel += 8) {
-          uint16 buf[8];
-          for (int i = 0; i < 8; i++) {
-            buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
+          for (; channel <= tranche_depth - 8; channel += 8) {
+            uint16 buf[8];
+            for (int i = 0; i < 8; i++) {
+              buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
+            }
+            uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
+            buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
+            buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
+            vst1_u8(output_ptr + channel, buf8);
           }
-          uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
-          buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
-          buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
-          vst1_u8(output_ptr + channel, buf8);
-        }
 #endif
-        for (; channel < depth; ++channel) {
-          uint16 a = (acc[channel] + filter_count / 2) / filter_count;
-          a = std::max<uint16>(a, params.quantized_activation_min);
-          a = std::min<uint16>(a, params.quantized_activation_max);
-          output_ptr[channel] = static_cast<uint8>(a);
+          for (; channel < tranche_depth; ++channel) {
+            uint16 a = (acc[channel] + filter_count / 2) / filter_count;
+            a = std::max<uint16>(a, params.quantized_activation_min);
+            a = std::min<uint16>(a, params.quantized_activation_max);
+            output_ptr[channel] = static_cast<uint8>(a);
+          }
         }
       }
     }
@@ -3878,6 +4035,14 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
                     const uint8* input_data, const RuntimeShape& output_shape,
                     uint8* output_data) {
   gemmlowp::ScopedProfilingLabel label("MaxPool/8bit");
+
+  // Here, and in other pooling ops, in order to maintain locality of reference,
+  // to minimize some recalculations, and to load into NEON vector registers, we
+  // use an inner loop down the depth. Since depths can be large and hence we
+  // would need arbitrarily large temporary storage, we divide the work up into
+  // depth tranches just within the batch loop.
+  static constexpr int kPoolingAccTrancheSize = 256;
+
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -3890,77 +4055,85 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
   const int output_width = output_shape.Dims(2);
   const int stride_height = params.stride_height;
   const int stride_width = params.stride_width;
+
+  uint8 acc[kPoolingAccTrancheSize];
   for (int batch = 0; batch < batches; ++batch) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        const int in_x_origin =
-            (out_x * stride_width) - params.padding_values.width;
-        const int in_y_origin =
-            (out_y * stride_height) - params.padding_values.height;
-        const int filter_x_start = std::max(0, -in_x_origin);
-        const int filter_x_end =
-            std::min(params.filter_width, input_width - in_x_origin);
-        const int filter_y_start = std::max(0, -in_y_origin);
-        const int filter_y_end =
-            std::min(params.filter_height, input_height - in_y_origin);
-        // 2560 is required by MobileNetV2 with depth multiplier 2.
-        static constexpr int kAccBufferMaxSize = 4096;
-        TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
-        uint8 acc[kAccBufferMaxSize];
-        memset(acc, 0, depth * sizeof(acc[0]));
-        const uint8* input_ptr =
-            input_data +
-            depth * (in_x_origin +
-                     input_width * (in_y_origin + input_height * batch));
-        for (int fy = filter_y_start; fy < filter_y_end; fy++) {
-          const uint8* input_row_ptr =
-              input_ptr + depth * (fy * input_width + filter_x_start);
-          for (int fx = filter_x_start; fx < filter_x_end; fx++) {
-            int channel = 0;
+    // We proceed through the depth in tranches (see comment above). The
+    // depth_base is the depth at the beginning of the tranche. The
+    // tranche_depth is the depth dimension of the tranche.
+    for (int depth_base = 0; depth_base < depth;
+         depth_base += kPoolingAccTrancheSize) {
+      const int tranche_depth =
+          std::min(depth - depth_base, kPoolingAccTrancheSize);
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          memset(acc, 0, tranche_depth * sizeof(acc[0]));
+          const uint8* input_ptr =
+              input_data + depth_base +
+              depth * (in_x_origin +
+                       input_width * (in_y_origin + input_height * batch));
+          for (int fy = filter_y_start; fy < filter_y_end; fy++) {
+            const uint8* input_row_ptr =
+                input_ptr + depth * (fy * input_width + filter_x_start);
+            for (int fx = filter_x_start; fx < filter_x_end; fx++) {
+              const uint8* input_channel_ptr = input_row_ptr;
+              int channel = 0;
 #ifdef USE_NEON
-            for (; channel <= depth - 16; channel += 16) {
-              uint8x16_t acc_reg = vld1q_u8(acc + channel);
-              uint8x16_t input_reg = vld1q_u8(input_row_ptr);
-              input_row_ptr += 16;
-              acc_reg = vmaxq_u8(acc_reg, input_reg);
-              vst1q_u8(acc + channel, acc_reg);
-            }
+              for (; channel <= tranche_depth - 16; channel += 16) {
+                uint8x16_t acc_reg = vld1q_u8(acc + channel);
+                uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
+                input_channel_ptr += 16;
+                acc_reg = vmaxq_u8(acc_reg, input_reg);
+                vst1q_u8(acc + channel, acc_reg);
+              }
 
-            for (; channel <= depth - 8; channel += 8) {
-              uint8x8_t acc_reg = vld1_u8(acc + channel);
-              uint8x8_t input_reg = vld1_u8(input_row_ptr);
-              input_row_ptr += 8;
-              acc_reg = vmax_u8(acc_reg, input_reg);
-              vst1_u8(acc + channel, acc_reg);
-            }
+              for (; channel <= tranche_depth - 8; channel += 8) {
+                uint8x8_t acc_reg = vld1_u8(acc + channel);
+                uint8x8_t input_reg = vld1_u8(input_channel_ptr);
+                input_channel_ptr += 8;
+                acc_reg = vmax_u8(acc_reg, input_reg);
+                vst1_u8(acc + channel, acc_reg);
+              }
 #endif
-            for (; channel < depth; ++channel) {
-              acc[channel] = std::max(acc[channel], *input_row_ptr++);
+              for (; channel < tranche_depth; ++channel) {
+                acc[channel] = std::max(acc[channel], *input_channel_ptr++);
+              }
+              input_row_ptr += depth;
             }
           }
-        }
-        uint8* output_ptr =
-            output_data + Offset(output_shape, batch, out_y, out_x, 0);
-        int channel = 0;
+          uint8* output_ptr = output_data + Offset(output_shape, batch, out_y,
+                                                   out_x, depth_base);
+          int channel = 0;
 #ifdef USE_NEON
-        for (; channel <= depth - 16; channel += 16) {
-          uint8x16_t a = vld1q_u8(acc + channel);
-          a = vminq_u8(a, vdupq_n_u8(params.quantized_activation_max));
-          a = vmaxq_u8(a, vdupq_n_u8(params.quantized_activation_min));
-          vst1q_u8(output_ptr + channel, a);
-        }
-        for (; channel <= depth - 8; channel += 8) {
-          uint8x8_t a = vld1_u8(acc + channel);
-          a = vmin_u8(a, vdup_n_u8(params.quantized_activation_max));
-          a = vmax_u8(a, vdup_n_u8(params.quantized_activation_min));
-          vst1_u8(output_ptr + channel, a);
-        }
+          for (; channel <= tranche_depth - 16; channel += 16) {
+            uint8x16_t a = vld1q_u8(acc + channel);
+            a = vminq_u8(a, vdupq_n_u8(params.quantized_activation_max));
+            a = vmaxq_u8(a, vdupq_n_u8(params.quantized_activation_min));
+            vst1q_u8(output_ptr + channel, a);
+          }
+          for (; channel <= tranche_depth - 8; channel += 8) {
+            uint8x8_t a = vld1_u8(acc + channel);
+            a = vmin_u8(a, vdup_n_u8(params.quantized_activation_max));
+            a = vmax_u8(a, vdup_n_u8(params.quantized_activation_min));
+            vst1_u8(output_ptr + channel, a);
+          }
 #endif
-        for (; channel < depth; ++channel) {
-          uint8 a = acc[channel];
-          a = std::max<uint8>(a, params.quantized_activation_min);
-          a = std::min<uint8>(a, params.quantized_activation_max);
-          output_ptr[channel] = static_cast<uint8>(a);
+          for (; channel < tranche_depth; ++channel) {
+            uint8 a = acc[channel];
+            a = std::max<uint8>(a, params.quantized_activation_min);
+            a = std::min<uint8>(a, params.quantized_activation_max);
+            output_ptr[channel] = static_cast<uint8>(a);
+          }
         }
       }
     }
@@ -4346,119 +4519,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   }
 }
 
-template <int OutputIntegerBits, int InputIntegerBits>
-inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
-log_x_for_x_greater_than_or_equal_to_1_impl(
-    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
-  // assert(__builtin_clz(0u) >= std::numeric_limits<uint32>::digits - 1);
-  // assert(__builtin_clz(0u) <= std::numeric_limits<uint32>::digits);
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-  // The reason for accumulating the result with an extra bit of headroom is
-  // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
-  // recip_denom will otherwise introduce an error.
-  static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
-  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>;
-
-  const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1488522236, std::log(2.0));
-  const FixedPoint0 sqrt_sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1805811301, std::sqrt(std::sqrt(0.5)));
-  const FixedPoint0 sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1518500250, std::sqrt(0.5));
-  const FixedPoint0 one_quarter =
-      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(FixedPoint0, 536870912, 1.0 / 4.0);
-
-  const FixedPoint0 alpha_n = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 117049297, 11.0 / 240.0 * std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_d = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 127690142, 1.0 / 20.0 * std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_i = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1057819769,
-      2.0 / std::sqrt(std::sqrt(2.0)) - std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_f = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 638450708, 1.0 / 4.0 * std::sqrt(std::sqrt(2.0)));
-
-  const FixedPointAccum shifted_quarter =
-      gemmlowp::Rescale<kAccumIntegerBits>(one_quarter);
-
-  // Reinterpret the input value as Q0.31, because we will figure out the
-  // required shift "ourselves" instead of using, say, Rescale.
-  FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
-  // z_a_pow_2 = input_integer_bits - z_a_headroom;
-  int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw()));
-  FixedPoint0 r_a_tmp =
-      SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
-  const int32 r_a_raw =
-      SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
-  // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
-  // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
-  //                   InputIntegerBits - z_b_headroom - 0.25);
-  const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp(
-      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
-          InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)),
-      shifted_quarter);
-
-  // z_b is treated like z_a, but premultiplying by sqrt(0.5).
-  FixedPoint0 z_b = z_a * sqrt_half;
-  int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1;
-  const int32 r_b_raw =
-      SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
-  const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
-      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
-          InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)),
-      shifted_quarter);
-
-  const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw));
-  const FixedPointAccum z_pow_2_adj = FixedPointAccum::FromRaw(
-      std::max(z_a_pow_2_adj.raw(), z_b_pow_2_adj.raw()));
-
-  const FixedPoint0 p = gemmlowp::RoundingHalfSum(r, sqrt_sqrt_half);
-  FixedPoint0 q = r - sqrt_sqrt_half;
-  q = q + q;
-
-  const FixedPoint0 common_sq = q * q;
-  const FixedPoint0 num = q * r + q * common_sq * alpha_n;
-  const FixedPoint0 denom_minus_one_0 =
-      p * (alpha_i + q + alpha_d * common_sq) + alpha_f * q;
-  const FixedPoint0 recip_denom =
-      one_over_one_plus_x_for_x_in_0_1(denom_minus_one_0);
-
-  const FixedPointAccum num_scaled = gemmlowp::Rescale<kAccumIntegerBits>(num);
-  return gemmlowp::Rescale<OutputIntegerBits>(z_pow_2_adj * log_2 +
-                                              num_scaled * recip_denom);
-}
-
-// Minimum output bits to accommodate log of maximum input range.  It actually
-// does not matter if one considers, say, [-64,64] or [-64,64).
-//
-// For example, run this through Octave:
-// [0:127; ...
-//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
-//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
-constexpr int min_log_x_output_bits(int input_bits) {
-  return input_bits > 90
-             ? 7
-             : input_bits > 44
-                   ? 6
-                   : input_bits > 21
-                         ? 5
-                         : input_bits > 10
-                               ? 4
-                               : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
-}
-
-template <int OutputIntegerBits, int InputIntegerBits>
-inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
-log_x_for_x_greater_than_or_equal_to_1(
-    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
-  static_assert(
-      OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits),
-      "Output integer bits must be sufficent to accommodate logs of inputs.");
-  return log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits,
-                                                     InputIntegerBits>(
-      input_val);
-}
-
 // Currently just a copy of the reference code.
 inline void LogSoftmax(const SoftmaxParams& params,
                        const RuntimeShape& input_shape, const uint8* input_data,
diff --git a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
index d9a5372e6e2a4a3baefecd7f89c2284eec55016a..c38f37416dde30cf16a41d6cc6f08dc40f3dfe7d 100644
--- a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
+++ b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -27,7 +27,7 @@ namespace tflite {
 // Used in tests and template parameters to control which version of depthwise
 // convolution is called. Primarily for reference code, and specializations
 // forced in tests.
-enum class DepthwiseConvKernelInvocation {
+enum class DepthwiseConvImplementation {
   // Run all tests against kUseStandardEntry even if also testing another
   // kernel, since we need to be sure that the main DepthwiseConv() function in
   // optimized_ops.h dispatches to a correctly-executing kernel.
@@ -44,90 +44,152 @@ enum class DepthwiseConvKernelInvocation {
   kUseIntrinsics3x3DotProduct,  // 3x3 kernel using NEON intrinsics.
 };
 
+// Category of depthwise convolution output rounding.
+enum class DepthwiseConvOutputRounding {
+  kNone = 0,      // Invalid: specific method must be specified.
+  kAwayFromZero,  // Original method: exact halves rounded away from zero.
+  kUpward,        // Halves towards +infinity: adds 0.5 before truncate.
+  // This is where a future kNearestEven would be placed.
+};
+
+// Category of depthwise convolution depth multiplication.
+enum class DepthwiseConvDepthMultiplication {
+  kNoMultiplication = 0,  // Depth multiplier = 1.
+  kUnitInputDepth,        // Input depth = 1, output depth = depth multiplier.
+};
+
 namespace reference_ops {
+namespace depthwise_conv {
 
-inline void DepthwiseConv(
-    const DepthwiseParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
-  const int stride_width = params.stride_width;
-  const int stride_height = params.stride_height;
-  const int dilation_width_factor = params.dilation_width_factor;
-  const int dilation_height_factor = params.dilation_height_factor;
-  const int pad_width = params.padding_values.width;
-  const int pad_height = params.padding_values.height;
-  const int depth_multiplier = params.depth_multiplier;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
-  const int output_shift = params.output_shift;
-  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
-
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
-  const int input_height = input_shape.Dims(1);
-  const int input_width = input_shape.Dims(2);
-  const int input_depth = input_shape.Dims(3);
-  const int filter_height = filter_shape.Dims(1);
-  const int filter_width = filter_shape.Dims(2);
-  const int output_height = output_shape.Dims(1);
-  const int output_width = output_shape.Dims(2);
-  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
-  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
-
-  for (int b = 0; b < batches; ++b) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        for (int ic = 0; ic < input_depth; ++ic) {
-          for (int m = 0; m < depth_multiplier; m++) {
-            const int oc = m + ic * depth_multiplier;
-            const int in_x_origin = (out_x * stride_width) - pad_width;
-            const int in_y_origin = (out_y * stride_height) - pad_height;
-            int32 acc = 0;
-            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-                const int in_x = in_x_origin + dilation_width_factor * filter_x;
-                const int in_y =
-                    in_y_origin + dilation_height_factor * filter_y;
-                // If the location is outside the bounds of the input image,
-                // use zero as a default value.
-                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
-                    (in_y < input_height)) {
-                  int32 input_val =
-                      input_data[Offset(input_shape, b, in_y, in_x, ic)];
-                  int32 filter_val = filter_data[Offset(
-                      filter_shape, 0, filter_y, filter_x, oc)];
-                  acc +=
-                      (filter_val + filter_offset) * (input_val + input_offset);
+template <DepthwiseConvOutputRounding output_rounding>
+inline int32 DepthwiseConvRound(int32 x, int32 quantized_multiplier,
+                                int shift) {
+  TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
+  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
+}
+
+template <>
+inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
+    int32 x, int32 quantized_multiplier, int shift) {
+  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
+}
+
+template <>
+inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
+    int32 x, int32 quantized_multiplier, int shift) {
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  const int left_shift = shift > 0 ? shift : 0;
+  const int right_shift = shift > 0 ? 0 : -shift;
+  const int rounding_offset = right_shift > 0 ? 1 << (right_shift - 1) : 0;
+  return (SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
+                                            quantized_multiplier) +
+          rounding_offset) >>
+         right_shift;
+}
+
+template <DepthwiseConvOutputRounding output_rounding>
+struct DepthwiseConvBasicKernel {
+  static inline void Run(const DepthwiseParams& params,
+                         const RuntimeShape& input_shape,
+                         const uint8* input_data,
+                         const RuntimeShape& filter_shape,
+                         const uint8* filter_data,
+                         const RuntimeShape& bias_shape, const int32* bias_data,
+                         const RuntimeShape& output_shape, uint8* output_data) {
+    const int stride_width = params.stride_width;
+    const int stride_height = params.stride_height;
+    const int dilation_width_factor = params.dilation_width_factor;
+    const int dilation_height_factor = params.dilation_height_factor;
+    const int pad_width = params.padding_values.width;
+    const int pad_height = params.padding_values.height;
+    const int depth_multiplier = params.depth_multiplier;
+    const int32 output_activation_min = params.quantized_activation_min;
+    const int32 output_activation_max = params.quantized_activation_max;
+    const int32 input_offset = params.input_offset;
+    const int32 filter_offset = params.weights_offset;
+    const int32 output_offset = params.output_offset;
+    const int32 output_multiplier = params.output_multiplier;
+    const int output_shift = params.output_shift;
+    TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+    TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+    TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+    TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+    const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+    const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+    const int input_height = input_shape.Dims(1);
+    const int input_width = input_shape.Dims(2);
+    const int input_depth = input_shape.Dims(3);
+    const int filter_height = filter_shape.Dims(1);
+    const int filter_width = filter_shape.Dims(2);
+    const int output_height = output_shape.Dims(1);
+    const int output_width = output_shape.Dims(2);
+    TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+    for (int b = 0; b < batches; ++b) {
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          for (int ic = 0; ic < input_depth; ++ic) {
+            for (int m = 0; m < depth_multiplier; m++) {
+              const int oc = m + ic * depth_multiplier;
+              const int in_x_origin = (out_x * stride_width) - pad_width;
+              const int in_y_origin = (out_y * stride_height) - pad_height;
+              int32 acc = 0;
+              for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+                for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                  const int in_x =
+                      in_x_origin + dilation_width_factor * filter_x;
+                  const int in_y =
+                      in_y_origin + dilation_height_factor * filter_y;
+                  // If the location is outside the bounds of the input image,
+                  // use zero as a default value.
+                  if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                      (in_y < input_height)) {
+                    int32 input_val =
+                        input_data[Offset(input_shape, b, in_y, in_x, ic)];
+                    int32 filter_val = filter_data[Offset(
+                        filter_shape, 0, filter_y, filter_x, oc)];
+                    acc += (filter_val + filter_offset) *
+                           (input_val + input_offset);
+                  }
                 }
               }
+              if (bias_data) {
+                acc += bias_data[oc];
+              }
+              acc = DepthwiseConvRound<output_rounding>(acc, output_multiplier,
+                                                        output_shift);
+              acc += output_offset;
+              acc = std::max(acc, output_activation_min);
+              acc = std::min(acc, output_activation_max);
+              output_data[Offset(output_shape, b, out_y, out_x, oc)] =
+                  static_cast<uint8>(acc);
             }
-            if (bias_data) {
-              acc += bias_data[oc];
-            }
-            acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
-                                                output_shift);
-            acc += output_offset;
-            acc = std::max(acc, output_activation_min);
-            acc = std::min(acc, output_activation_max);
-            output_data[Offset(output_shape, b, out_y, out_x, oc)] =
-                static_cast<uint8>(acc);
           }
         }
       }
     }
   }
+};
+
+}  // namespace depthwise_conv
+
+inline void DepthwiseConv(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data) {
+  return depthwise_conv::DepthwiseConvBasicKernel<
+      DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
+                                                       input_data, filter_shape,
+                                                       filter_data, bias_shape,
+                                                       bias_data, output_shape,
+                                                       output_data);
 }
 
-}  // end namespace reference_ops
+}  // namespace reference_ops
 }  // end namespace tflite
 
 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
new file mode 100644
index 0000000000000000000000000000000000000000..a694ba2aaa993b0631958e0b338a7a62e154de75
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
@@ -0,0 +1,144 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
+
+#include <limits>
+#include "public/gemmlowp.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+// Element-wise add that can often be used for inner loop of broadcast add as
+// well as the non-broadcast add.
+inline void AddElementwise(int size, const ArithmeticParams& params,
+                           const int8_t* input1_data, const int8_t* input2_data,
+                           int8_t* output_data) {
+  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
+  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
+  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
+
+  for (int i = 0; i < size; ++i) {
+    const int32 input1_val = params.input1_offset + input1_data[i];
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32 scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, params.input1_multiplier, params.input1_shift);
+    const int32 scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
+    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32 raw_output =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            raw_sum, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
+    output_data[i] = static_cast<int8_t>(clamped_output);
+  }
+}
+
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int8_t* input1_data,
+                const RuntimeShape& input2_shape, const int8_t* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
+  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
+  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int8_t* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int8_t* input2_data,
+                               const RuntimeShape& output_shape,
+                               int8_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/int8");
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          const int32_t input1_val =
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+          const int32_t input2_val =
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32_t shifted_input1_val =
+              input1_val * (1 << params.left_shift);
+          const int32_t shifted_input2_val =
+              input2_val * (1 << params.left_shift);
+          const int32_t scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input1_val, params.input1_multiplier,
+                  params.input1_shift);
+          const int32_t scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input2_val, params.input2_multiplier,
+                  params.input2_shift);
+          const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+          const int32_t raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  raw_sum, params.output_multiplier, params.output_shift) +
+              params.output_offset;
+          const int32_t clamped_output =
+              std::min(params.quantized_activation_max,
+                       std::max(params.quantized_activation_min, raw_output));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              static_cast<int8_t>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
new file mode 100644
index 0000000000000000000000000000000000000000..36b349f4d49bb66d6f60ed92629945640e24cb9c
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
@@ -0,0 +1,69 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const int8_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    int8_t* output_data, void* gemm_context) {
+  (void)gemm_context;  // only used in optimized code.
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = output_shape.Dims(0);
+  const int output_depth = output_shape.Dims(1);
+  TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      int32 acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32 input_val = input_data[b * accum_depth + d];
+        int32 filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_data[out_c + output_depth * b] = static_cast<int8_t>(acc);
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h b/tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h
new file mode 100644
index 0000000000000000000000000000000000000000..f22bb4f13803cf4e14c8b4fd18b9c301fab07359
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h
@@ -0,0 +1,111 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOG_SOFTMAX_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOG_SOFTMAX_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void LogSoftmax(int32_t input_multiplier, int32_t input_shift,
+                       int32_t reverse_multiplier, int32_t reverse_shift,
+                       int32_t diff_min, int32_t outer_size, int32_t depth,
+                       const int8* input_data, int8* output_data) {
+  static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
+  static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+  static constexpr int32_t kMinInt32 = std::numeric_limits<int32_t>::min();
+
+  // [-16, 0] is mapped to [-128, 127] with 1/16 as scale and 127 as zero
+  // point. This nudges the output to [-255/16, 0].
+  static constexpr int32_t kOutputZeroPoint = 127;
+
+  // All IntegerBits must agree with Prepare function.
+  // Input is chosen as Q5.26 so exp(-1 * 2^5 * 2^-1) = exp(-16) is negligible.
+  static constexpr int kInputIntegerBits = 5;
+  static constexpr int kAccumulationIntegerBits = 12;
+  static constexpr int kOutputIntegerBits = 4;
+  using F5 = gemmlowp::FixedPoint<int32, kInputIntegerBits>;
+  using F12 = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+
+  for (int outer_index = 0; outer_index < outer_size; ++outer_index) {
+    int8 max_in_row = kMinInt8;
+    for (int inner_index = 0; inner_index < depth; ++inner_index) {
+      max_in_row =
+          std::max(max_in_row, input_data[outer_index * depth + inner_index]);
+    }
+
+    // Accumulator "sum_of_exps_in_q12" is safe from overflowing in 2^12 steps.
+    F12 sum_of_exps_in_q12 = F12::FromRaw(0);
+    for (int inner_index = 0; inner_index < depth; ++inner_index) {
+      int32_t input_diff =
+          static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
+          max_in_row;
+      if (input_diff >= diff_min) {
+        const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
+            input_diff, input_multiplier, input_shift);
+        sum_of_exps_in_q12 =
+            sum_of_exps_in_q12 +
+            gemmlowp::Rescale<kAccumulationIntegerBits>(
+                exp_on_negative_values(F5::FromRaw(input_diff_in_q5)));
+      }
+    }
+
+    const int32_t log_sum_of_exps_in_q5 =
+        log_x_for_x_greater_than_or_equal_to_1<kInputIntegerBits>(
+            sum_of_exps_in_q12)
+            .raw();
+
+    // Potentially reduced the valid range. shifted_log_sum_of_exps_in_q5 is
+    // smallest representable in Q5.26 plus the log_sum_of_exps.
+    const int32_t shifted_log_sum_of_exps_in_q5 =
+        log_sum_of_exps_in_q5 + kMinInt32;
+    const int32_t adjusted_diff_min = std::max(
+        diff_min - 1,
+        MultiplyByQuantizedMultiplier(shifted_log_sum_of_exps_in_q5,
+                                      reverse_multiplier, -reverse_shift));
+
+    for (int inner_index = 0; inner_index < depth; ++inner_index) {
+      int32_t input_diff =
+          static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
+          max_in_row;
+      // Note use of > below instead of >= above.
+      if (input_diff > adjusted_diff_min) {
+        const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
+            input_diff, input_multiplier, input_shift);
+
+        // Rescale and downcast.
+        int32_t output_in_q27 =
+            gemmlowp::RoundingDivideByPOT(
+                (input_diff_in_q5 - log_sum_of_exps_in_q5),
+                31 - kInputIntegerBits - kOutputIntegerBits) +
+            kOutputZeroPoint;
+
+        output_in_q27 =
+            std::max(std::min(output_in_q27, static_cast<int32_t>(kMaxInt8)),
+                     static_cast<int32_t>(kMinInt8));
+        output_data[outer_index * depth + inner_index] =
+            static_cast<int8_t>(output_in_q27);
+      } else {
+        output_data[outer_index * depth + inner_index] = kMinInt8;
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOG_SOFTMAX_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
new file mode 100644
index 0000000000000000000000000000000000000000..8277c3b3d565d845da4cc8931a4256c1005db77c
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
@@ -0,0 +1,64 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
+
+#include <limits>
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void Logistic(int32_t input_zero_point, int32_t input_range_radius,
+                     int32_t input_multiplier, int32_t input_left_shift,
+                     int32_t input_size, const int8_t* input_data,
+                     int8_t* output_data) {
+  // Integer bits must be in sync with Prepare() function.
+  static constexpr int32_t kInputIntegerBits = 4;
+  static constexpr int32_t kOutputIntegerBits = 8;
+  static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
+  static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+  static constexpr int32_t kOutputZeroPoint = -128;
+
+  for (int i = 0; i < input_size; ++i) {
+    const int32_t input =
+        static_cast<int32_t>(input_data[i]) - input_zero_point;
+    if (input <= -input_range_radius) {
+      output_data[i] = kMinInt8;
+    } else if (input >= input_range_radius) {
+      output_data[i] = kMaxInt8;
+    } else {
+      const int32_t input_in_q4 = MultiplyByQuantizedMultiplier(
+          input, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
+      const int32_t output_in_q0 =
+          gemmlowp::logistic(FixedPoint4::FromRaw(input_in_q4)).raw();
+
+      // Rescale and downcast.
+      using gemmlowp::RoundingDivideByPOT;
+      int32_t output_in_q23 =
+          RoundingDivideByPOT(output_in_q0, 31 - kOutputIntegerBits);
+      output_in_q23 = std::min(std::max(output_in_q23 + kOutputZeroPoint,
+                                        static_cast<int32_t>(kMinInt8)),
+                               static_cast<int32_t>(kMaxInt8));
+      output_data[i] = static_cast<int8_t>(output_in_q23);
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e33d089945a2907e489c51c117eec77b194ed7e
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h
@@ -0,0 +1,130 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
+
+#include "public/gemmlowp.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void MulElementwise(int size, const ArithmeticParams& params,
+                           const int8_t* input1_data, const int8_t* input2_data,
+                           int8_t* output_data) {
+  for (int i = 0; i < size; ++i) {
+    const int32 input1_val = params.input1_offset + input1_data[i];
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 unclamped_result =
+        params.output_offset +
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
+                                                       params.output_multiplier,
+                                                       params.output_shift);
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, unclamped_result));
+    output_data[i] = static_cast<int8_t>(clamped_output);
+  }
+}
+
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int8_t* input1_data,
+                const RuntimeShape& input2_shape, const int8_t* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  gemmlowp::ScopedProfilingLabel label("Mul/8bit");
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+// Mul with 16 bit inputs and int8_t outputs.
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int16* input1_data,
+                const RuntimeShape& input2_shape, const int16* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Mul/Int16Int8");
+  int32 output_offset = params.output_offset;
+  int32 output_activation_min = params.quantized_activation_min;
+  int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+    F0 unclamped_result =
+        F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+    int16 rescaled_result =
+        gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
+    int16 clamped_result =
+        std::min<int16>(output_activation_max - output_offset, rescaled_result);
+    clamped_result =
+        std::max<int16>(output_activation_min - output_offset, clamped_result);
+    output_data[i] = output_offset + clamped_result;
+  }
+}
+
+inline void BroadcastMul4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int8_t* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int8_t* input2_data,
+                               const RuntimeShape& output_shape,
+                               int8_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  // The input shapes are extended as part of NdArrayDesc initialization.
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          const int32 input1_val =
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+          const int32 input2_val =
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32 unclamped_result =
+              params.output_offset +
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  input1_val * input2_val, params.output_multiplier,
+                  params.output_shift);
+          const int32 clamped_output = std::min(
+              params.quantized_activation_max,
+              std::max(params.quantized_activation_min, unclamped_result));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              static_cast<int8_t>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h b/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h
index 22750bc91a856b360459fbf9b5ed0519e4ac6c88..2762bec8e6c3c8d69198456cbd16b04dc45ef2ab 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h
@@ -15,6 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
 
+#include <limits>
 #include "tensorflow/lite/kernels/internal/common.h"
 
 namespace tflite {
@@ -77,6 +78,63 @@ inline void AveragePool(const PoolParams& params,
   }
 }
 
+inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
+                    const int8* input_data, const RuntimeShape& output_shape,
+                    int8* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  TFLITE_DCHECK_GE(params.quantized_activation_min,
+                   std::numeric_limits<int8_t>::min());
+  TFLITE_DCHECK_LE(params.quantized_activation_max,
+                   std::numeric_limits<int8_t>::max());
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          int8_t max = std::numeric_limits<int8_t>::lowest();
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              max = std::max(
+                  max,
+                  input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
+            }
+          }
+          max = std::max<int8_t>(max, params.quantized_activation_min);
+          max = std::min<int8_t>(max, params.quantized_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+              static_cast<int8_t>(max);
+        }
+      }
+    }
+  }
+}
+
 }  // namespace reference_integer_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
new file mode 100644
index 0000000000000000000000000000000000000000..081928bc88d9c59e15b5ed857daf4a144abe2ad7
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
@@ -0,0 +1,63 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
+
+#include <limits>
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void Tanh(int32_t input_zero_point, int32_t input_range_radius,
+                 int32_t input_multiplier, int32_t input_shift,
+                 int32_t input_size, const int8_t* input_data,
+                 int8_t* output_data) {
+  // Integer bits must be in sync with Prepare() function.
+  static constexpr int32_t kInputIntegerBits = 4;
+  static constexpr int32_t kOutputScale = 7;
+  static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
+  static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+  using F4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
+
+  for (int i = 0; i < input_size; ++i) {
+    const int32_t input =
+        static_cast<int32_t>(input_data[i]) - input_zero_point;
+    if (input <= -input_range_radius) {
+      output_data[i] = kMinInt8;
+    } else if (input >= input_range_radius) {
+      output_data[i] = kMaxInt8;
+    } else {
+      const int32_t input_in_q4 =
+          MultiplyByQuantizedMultiplier(input, input_multiplier, input_shift);
+      const int32_t output_in_q0 =
+          gemmlowp::tanh(F4::FromRaw(input_in_q4)).raw();
+
+      // Rescale and downcast.
+      using gemmlowp::RoundingDivideByPOT;
+      int32_t output_in_q24 =
+          RoundingDivideByPOT(output_in_q0, 31 - kOutputScale);
+      output_in_q24 =
+          std::min(std::max(output_in_q24, static_cast<int32_t>(kMinInt8)),
+                   static_cast<int32_t>(kMaxInt8));
+      output_data[i] = static_cast<int8_t>(output_in_q24);
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h
index ac6905fcd7bb9b043c73c773a5f85998b6f6a908..d2f5d987b9c2a481d807e9e11975b1f543cec678 100644
--- a/tensorflow/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h
@@ -36,68 +36,6 @@ limitations under the License.
 
 namespace tflite {
 
-// TODO(b/77858996): Add these to gemmlowp.
-template <typename IntegerType>
-IntegerType SaturatingAddNonGemmlowp(IntegerType a, IntegerType b) {
-  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
-  return a;
-}
-
-template <>
-inline std::int32_t SaturatingAddNonGemmlowp(std::int32_t a, std::int32_t b) {
-  std::int64_t a64 = a;
-  std::int64_t b64 = b;
-  std::int64_t sum = a64 + b64;
-  return static_cast<std::int32_t>(std::min(
-      static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()),
-      std::max(
-          static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()),
-          sum)));
-}
-
-template <typename tRawType, int tIntegerBits>
-gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingAddNonGemmlowp(
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {
-  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
-      SaturatingAddNonGemmlowp(a.raw(), b.raw()));
-}
-
-template <typename IntegerType>
-IntegerType SaturatingSub(IntegerType a, IntegerType b) {
-  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
-  return a;
-}
-
-template <>
-inline std::int16_t SaturatingSub(std::int16_t a, std::int16_t b) {
-  std::int32_t a32 = a;
-  std::int32_t b32 = b;
-  std::int32_t diff = a32 - b32;
-  return static_cast<std::int16_t>(std::min(32767, std::max(-32768, diff)));
-}
-
-template <>
-inline std::int32_t SaturatingSub(std::int32_t a, std::int32_t b) {
-  std::int64_t a64 = a;
-  std::int64_t b64 = b;
-  std::int64_t diff = a64 - b64;
-  return static_cast<std::int32_t>(std::min(
-      static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()),
-      std::max(
-          static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()),
-          diff)));
-}
-
-template <typename tRawType, int tIntegerBits>
-gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub(
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {
-  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
-      SaturatingSub(a.raw(), b.raw()));
-}
-// End section to be moved to gemmlowp.
-
 namespace reference_ops {
 
 // Return true for broadcast case, false otherwise.
@@ -192,59 +130,6 @@ inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
   return true;
 }
 
-template <typename T>
-int CountLeadingZeros(T integer_input) {
-  static_assert(std::is_unsigned<T>::value,
-                "Only unsigned integer types handled.");
-  if (integer_input == 0) {
-    return std::numeric_limits<T>::digits;
-  }
-  const T one_in_leading_positive = static_cast<T>(1)
-                                    << (std::numeric_limits<T>::digits - 1);
-  int leading_zeros = 0;
-  while (integer_input < one_in_leading_positive) {
-    integer_input <<= 1;
-    ++leading_zeros;
-  }
-  return leading_zeros;
-}
-
-template <typename IntegerType>
-IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) {
-  if (exponent == 0) {
-    return x;
-  }
-  using ScalarIntegerType =
-      typename gemmlowp::FixedPointRawTypeTraits<IntegerType>::ScalarRawType;
-  const IntegerType min =
-      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::min());
-  const IntegerType max =
-      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::max());
-  const int ScalarIntegerTypeBits = 8 * sizeof(ScalarIntegerType);
-
-  const std::int32_t threshold =
-      ((1 << (ScalarIntegerTypeBits - 1 - exponent)) - 1);
-  const IntegerType positive_mask =
-      gemmlowp::MaskIfGreaterThan(x, gemmlowp::Dup<IntegerType>(threshold));
-  const IntegerType negative_mask =
-      gemmlowp::MaskIfLessThan(x, gemmlowp::Dup<IntegerType>(-threshold));
-
-  IntegerType result = gemmlowp::ShiftLeft(x, exponent);
-  result = gemmlowp::SelectUsingMask(positive_mask, max, result);
-  result = gemmlowp::SelectUsingMask(negative_mask, min, result);
-  return result;
-}
-
-// If we want to leave IntegerBits fixed, then multiplication
-// by a power of two has to be saturating/rounding, not exact anymore.
-template <typename tRawType, int tIntegerBits>
-gemmlowp::FixedPoint<tRawType, tIntegerBits>
-SaturatingRoundingMultiplyByPOTParam(
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) {
-  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
-      SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
-}
-
 inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                  const float* input_data, const RuntimeShape& filter_shape,
                  const float* filter_data, const RuntimeShape& bias_shape,
@@ -506,6 +391,15 @@ inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
   }
 }
 
+inline void Elu(const RuntimeShape& input_shape, const float* input_data,
+                const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    output_data[i] = val < 0.0 ? std::exp(val) - 1 : val;
+  }
+}
+
 inline void Relu(const RuntimeShape& input_shape, const float* input_data,
                  const RuntimeShape& output_shape, float* output_data) {
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
@@ -543,16 +437,17 @@ inline void Relu6(const RuntimeShape& input_shape, const float* input_data,
   }
 }
 
+template <typename T>
 inline void ReluX(const tflite::ActivationParams& params,
-                  const RuntimeShape& input_shape, const uint8* input_data,
-                  const RuntimeShape& output_shape, uint8* output_data) {
+                  const RuntimeShape& input_shape, const T* input_data,
+                  const RuntimeShape& output_shape, T* output_data) {
   gemmlowp::ScopedProfilingLabel label("Quantized ReluX (not fused)");
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
-  const uint8 max_value = params.quantized_activation_max;
-  const uint8 min_value = params.quantized_activation_min;
+  const T max_value = params.quantized_activation_max;
+  const T min_value = params.quantized_activation_min;
   for (int i = 0; i < flat_size; ++i) {
-    const uint8 val = input_data[i];
-    const uint8 clamped =
+    const T val = input_data[i];
+    const T clamped =
         val > max_value ? max_value : val < min_value ? min_value : val;
     output_data[i] = clamped;
   }
@@ -702,6 +597,22 @@ inline void Add(const ArithmeticParams& params,
   }
 }
 
+// T is expected to be either float or int.
+template <typename T>
+inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
+                 T* const* input_data, T* output_data) {
+  // All inputs and output should have the same shape, this is checked during
+  // Prepare stage.
+  const size_t size = input_shape.FlatSize();
+  for (int i = 0; i < size; ++i) {
+    T x = 0;
+    for (int j = 0; j < num_inputs; ++j) {
+      x += input_data[j][i];
+    }
+    output_data[i] = x;
+  }
+}
+
 // Element-wise add that can often be used for inner loop of broadcast add as
 // well as the non-broadcast add.
 inline void AddElementwise(int size, const ArithmeticParams& params,
@@ -1885,11 +1796,17 @@ void Unpack(const UnpackParams& params, const RuntimeShape& input_shape,
   const int outputs_count = params.num_split;
 
   int outer_size = 1;
-  for (int i = 0; i < params.axis; i++) {
+  int axis = params.axis;
+  if (axis < 0) {
+    axis += dimensions;
+  }
+  TFLITE_DCHECK_GE(axis, 0);
+  TFLITE_DCHECK_LT(axis, dimensions);
+  for (int i = 0; i < axis; ++i) {
     outer_size *= input_shape.Dims(i);
   }
   int copy_size = 1;
-  for (int i = params.axis + 1; i < dimensions; i++) {
+  for (int i = axis + 1; i < dimensions; ++i) {
     copy_size *= input_shape.Dims(i);
   }
   TFLITE_DCHECK_EQ(output_shape.FlatSize(), copy_size * outer_size);
@@ -2729,121 +2646,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   }
 }
 
-// Although currently the name of this function says that it cannot handle
-// values less than 1, in practice it can handle as low as 1/x_max, where
-// x_max is the largest representable input.  In other words, the output range
-// is symmetric.
-template <int OutputIntegerBits, int InputIntegerBits>
-inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
-log_x_for_x_greater_than_or_equal_to_1_impl(
-    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-  // The reason for accumulating the result with an extra bit of headroom is
-  // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
-  // recip_denom will otherwise introduce an error.
-  static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
-  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>;
-
-  const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1488522236, std::log(2.0));
-  const FixedPoint0 sqrt_sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1805811301, std::sqrt(std::sqrt(0.5)));
-  const FixedPoint0 sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1518500250, std::sqrt(0.5));
-  const FixedPoint0 one_quarter =
-      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(FixedPoint0, 536870912, 1.0 / 4.0);
-
-  const FixedPoint0 alpha_n = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 117049297, 11.0 / 240.0 * std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_d = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 127690142, 1.0 / 20.0 * std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_i = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1057819769,
-      2.0 / std::sqrt(std::sqrt(2.0)) - std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_f = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 638450708, 1.0 / 4.0 * std::sqrt(std::sqrt(2.0)));
-
-  const FixedPointAccum shifted_quarter =
-      gemmlowp::Rescale<kAccumIntegerBits>(one_quarter);
-
-  // Reinterpret the input value as Q0.31, because we will figure out the
-  // required shift "ourselves" instead of using, say, Rescale.
-  FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
-  // z_a_pow_2 = input_integer_bits - z_a_headroom;
-  int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw()));
-  FixedPoint0 r_a_tmp =
-      SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
-  const int32 r_a_raw =
-      SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
-  // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
-  // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
-  //                   InputIntegerBits - z_b_headroom - 0.25);
-  const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp(
-      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
-          InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)),
-      shifted_quarter);
-
-  // z_b is treated like z_a, but premultiplying by sqrt(0.5).
-  FixedPoint0 z_b = z_a * sqrt_half;
-  int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1;
-  const int32 r_b_raw =
-      SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
-  const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
-      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
-          InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)),
-      shifted_quarter);
-
-  const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw));
-  const FixedPointAccum z_pow_2_adj = FixedPointAccum::FromRaw(
-      std::max(z_a_pow_2_adj.raw(), z_b_pow_2_adj.raw()));
-
-  const FixedPoint0 p = gemmlowp::RoundingHalfSum(r, sqrt_sqrt_half);
-  FixedPoint0 q = r - sqrt_sqrt_half;
-  q = q + q;
-
-  const FixedPoint0 common_sq = q * q;
-  const FixedPoint0 num = q * r + q * common_sq * alpha_n;
-  const FixedPoint0 denom_minus_one_0 =
-      p * (alpha_i + q + alpha_d * common_sq) + alpha_f * q;
-  const FixedPoint0 recip_denom =
-      one_over_one_plus_x_for_x_in_0_1(denom_minus_one_0);
-
-  const FixedPointAccum num_scaled = gemmlowp::Rescale<kAccumIntegerBits>(num);
-  return gemmlowp::Rescale<OutputIntegerBits>(z_pow_2_adj * log_2 +
-                                              num_scaled * recip_denom);
-}
-
-// Minimum output bits to accommodate log of maximum input range.  It actually
-// does not matter if one considers, say, [-64,64] or [-64,64).
-//
-// For example, run this through Octave:
-// [0:127; ...
-//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
-//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
-constexpr int min_log_x_output_bits(int input_bits) {
-  return input_bits > 90
-             ? 7
-             : input_bits > 44
-                   ? 6
-                   : input_bits > 21
-                         ? 5
-                         : input_bits > 10
-                               ? 4
-                               : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
-}
-
-template <int OutputIntegerBits, int InputIntegerBits>
-inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
-log_x_for_x_greater_than_or_equal_to_1(
-    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
-  static_assert(
-      OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits),
-      "Output integer bits must be sufficent to accommodate logs of inputs.");
-  return log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits,
-                                                     InputIntegerBits>(
-      input_val);
-}
-
 inline void LogSoftmax(const SoftmaxParams& params,
                        const RuntimeShape& input_shape, const uint8* input_data,
                        const RuntimeShape& output_shape, uint8* output_data) {
@@ -3217,6 +3019,43 @@ inline void Gather(const tflite::GatherParams& op_params,
   }
 }
 
+template <typename ParamsT, typename IndicesT = int32>
+inline void GatherNd(const RuntimeShape& params_shape,
+                     const ParamsT* params_data,
+                     const RuntimeShape& indices_shape,
+                     const IndicesT* indices_data,
+                     const RuntimeShape& output_shape, ParamsT* output_data) {
+  gemmlowp::ScopedProfilingLabel label("GatherNd");
+
+  int n_slices = 1;
+  int slice_size = 1;
+  const int indices_dims = indices_shape.DimensionsCount();
+  const int indices_nd = indices_shape.Dims(indices_dims - 1);
+  const int params_dims = params_shape.DimensionsCount();
+  for (int i = 0; i < indices_dims - 1; ++i) {
+    n_slices *= indices_shape.Dims(i);
+  }
+  for (int i = indices_nd; i < params_dims; ++i) {
+    slice_size *= params_shape.Dims(i);
+  }
+
+  int remain_flat_size = params_shape.FlatSize();
+  std::vector<int> dims_to_count(indices_nd, 0);
+  for (int i = 0; i < indices_nd; ++i) {
+    dims_to_count[i] = remain_flat_size / params_shape.Dims(i);
+    remain_flat_size = dims_to_count[i];
+  }
+
+  for (int i = 0; i < n_slices; ++i) {
+    int from_pos = 0;
+    for (int j = 0; j < indices_nd; ++j) {
+      from_pos += indices_data[i * indices_nd + j] * dims_to_count[j];
+    }
+    std::memcpy(output_data + i * slice_size, params_data + from_pos,
+                sizeof(ParamsT) * slice_size);
+  }
+}
+
 template <typename T>
 inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
                            const RuntimeShape& unextended_input_shape,
@@ -4532,6 +4371,34 @@ void RankOneSelect(const RuntimeShape& input_condition_shape,
   }
 }
 
+template <typename D, typename T>
+void SelectTrueCoords(const RuntimeShape& input_condition_shape,
+                      const D* input_condition_data, T* output_data) {
+  const size_t size = input_condition_shape.FlatSize();
+  const size_t cond_rank = input_condition_shape.DimensionsCount();
+
+  std::vector<int> dims_to_count(cond_rank, 0);
+  int cur_flat_size = size;
+  for (int i = 0; i < cond_rank; ++i) {
+    dims_to_count[i] = cur_flat_size / input_condition_shape.Dims(i);
+    cur_flat_size = dims_to_count[i];
+  }
+
+  int output_index = 0;
+  for (int i = 0; i < size; ++i) {
+    if (input_condition_data[i]) {
+      // Insert the coordinate of the current item (row major) into output.
+      int flat_index = i;
+      for (int j = 0; j < cond_rank; ++j) {
+        int coord_j = flat_index / dims_to_count[j];
+        output_data[output_index * cond_rank + j] = coord_j;
+        flat_index %= dims_to_count[j];
+      }
+      output_index++;
+    }
+  }
+}
+
 // For easy implementation, the indices is always a vector of size-4 vectors.
 template <typename T, typename TI>
 inline void SparseToDense(const std::vector<std::vector<TI>>& indices,
diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc
index eafc1e46765e662dc535731f4a34f733cf69c46c..d6984e9ec22958a4bb7d299aaa20bd39bd36604d 100644
--- a/tensorflow/lite/kernels/kernel_util.cc
+++ b/tensorflow/lite/kernels/kernel_util.cc
@@ -149,6 +149,9 @@ TfLiteStatus CalculateActivationRangeQuantized(TfLiteContext* context,
   if (output->type == kTfLiteUInt8) {
     qmin = std::numeric_limits<uint8_t>::min();
     qmax = std::numeric_limits<uint8_t>::max();
+  } else if (output->type == kTfLiteInt8) {
+    qmin = std::numeric_limits<int8_t>::min();
+    qmax = std::numeric_limits<int8_t>::max();
   } else if (output->type == kTfLiteInt16) {
     qmin = std::numeric_limits<int16_t>::min();
     qmax = std::numeric_limits<int16_t>::max();
diff --git a/tensorflow/lite/kernels/maximum_minimum.cc b/tensorflow/lite/kernels/maximum_minimum.cc
index 3bcaabf675eba4f528fe73b01610d915e7780f85..0e15254f477f0bbc4ba15c4c4ac189fe1c8d3da0 100644
--- a/tensorflow/lite/kernels/maximum_minimum.cc
+++ b/tensorflow/lite/kernels/maximum_minimum.cc
@@ -108,6 +108,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       case kTfLiteUInt8:
         TFLiteOperation<uint8_t, OpType>(context, node, op_context);
         break;
+      case kTfLiteInt8:
+        TFLiteOperation<int8_t, OpType>(context, node, op_context);
+        break;
       case kTfLiteInt32:
        TFLiteOperation<int32_t, OpType>(context, node, op_context);
         break;
diff --git a/tensorflow/lite/kernels/maximum_minimum_test.cc b/tensorflow/lite/kernels/maximum_minimum_test.cc
index acb74e09d3fb47c33c6c146af4d0b1b1030491be..6567c8f3611204af3bdeecbdb11a07f6f16be908 100644
--- a/tensorflow/lite/kernels/maximum_minimum_test.cc
+++ b/tensorflow/lite/kernels/maximum_minimum_test.cc
@@ -112,6 +112,17 @@ TEST(MaxMinOpTest, Uint8Test) {
                      {0, 0, 1, 11, 2, 1});
 }
 
+TEST(MaxMinOpTest, Int8Test) {
+  std::initializer_list<int8_t> data1 = {1, 0, 2, 11, 2, 23};
+  std::initializer_list<int8_t> data2 = {0, 0, 1, 12, 123, 1};
+  TestModel<int8_t>(BuiltinOperator_MAXIMUM, {TensorType_INT8, {3, 1, 2}},
+                    {TensorType_INT8, {3, 1, 2}}, {TensorType_INT8, {3, 1, 2}},
+                    data1, data2, {1, 0, 2, 12, 123, 23});
+  TestModel<int8_t>(BuiltinOperator_MINIMUM, {TensorType_INT8, {3, 1, 2}},
+                    {TensorType_INT8, {3, 1, 2}}, {TensorType_INT8, {3, 1, 2}},
+                    data1, data2, {0, 0, 1, 11, 2, 1});
+}
+
 TEST(MaximumOpTest, FloatWithBroadcastTest) {
   std::initializer_list<float> data1 = {1.0, 0.0, -1.0, -2.0, -1.44, 11.0};
   std::initializer_list<float> data2 = {0.5, 2.0};
diff --git a/tensorflow/lite/kernels/mirror_pad.cc b/tensorflow/lite/kernels/mirror_pad.cc
index 70f7e33928d8a7d68cd650de50914d59fd857b39..c0924da2a185758b7c508c6d39185b337ce7d90f 100644
--- a/tensorflow/lite/kernels/mirror_pad.cc
+++ b/tensorflow/lite/kernels/mirror_pad.cc
@@ -71,25 +71,32 @@ struct OpData {
 };
 
 // Util method to initialize the memory of the padded tensor.
-// Returns the index of the current item processed in 'padded_tensor_buffer'
-int InitializeTensorMemory(const TfLiteIntArray* const dims, int dim_index,
-                           int dims_size,
-                           std::vector<PaddedTensor>* padded_tensor_buffer,
-                           int element_index, int num_elements) {
-  if (dim_index >= dims_size) {
-    return element_index;
-  }
-  PaddedTensor* padded_tensor = &(*padded_tensor_buffer)[element_index];
-  padded_tensor->values.clear();
-  padded_tensor->values.reserve(dims->data[dim_index]);
-  for (int i = 0; i < dims->data[dim_index]; ++i) {
-    ++element_index;
-    padded_tensor->values.emplace_back(&(*padded_tensor_buffer)[element_index]);
-    element_index = InitializeTensorMemory(dims, dim_index + 1, dims_size,
-                                           padded_tensor_buffer, element_index,
-                                           num_elements);
+void InitializeTensorMemory(const TfLiteIntArray* const dims, int dims_size,
+                            std::vector<PaddedTensor>* padded_tensor_buffer) {
+  int dimension_index = 0;
+  int element_index = 0;
+  // We hold 2 vectors with values for nodes in current level, and
+  // nodes in the next level, and swap while moving on dimensions of the tensor.
+  std::vector<PaddedTensor*> current_nodes, next_level;
+  current_nodes.push_back(&(*padded_tensor_buffer)[element_index]);
+  element_index++;
+  int next_level_size = 1;
+  while (!current_nodes.empty() && dimension_index < dims_size) {
+    next_level_size *= dims->data[dimension_index];
+    next_level.resize(next_level_size);
+    // Index of elements in next level.
+    int index = 0;
+    for (auto* padded_tensor : current_nodes) {
+      padded_tensor->values.resize(dims->data[dimension_index]);
+      for (int i = 0; i < dims->data[dimension_index]; ++i) {
+        padded_tensor->values[i] = &(*padded_tensor_buffer)[element_index];
+        next_level[index++] = padded_tensor->values[i];
+        element_index++;
+      }
+    }
+    std::swap(current_nodes, next_level);
+    dimension_index++;
   }
-  return element_index;
 }
 
 // Returns pointer to the value at the specified index in 'data'.
@@ -117,20 +124,6 @@ inline const void* GetValuePointerAtIndex(const void* data, int index,
   return nullptr;
 }
 
-// Util method that increment index in the N-d array.
-void IncrementTensorIndex(const TfLiteIntArray* dims,
-                          std::vector<int>* tensor_index_ptr) {
-  int dimension_index = dims->size - 1;
-  auto& tensor_index = *tensor_index_ptr;
-  tensor_index[dimension_index]++;
-  while (dimension_index >= 0 &&
-         tensor_index[dimension_index] == dims->data[dimension_index]) {
-    tensor_index[dimension_index] = 0;
-    dimension_index--;
-    if (dimension_index >= 0) tensor_index[dimension_index]++;
-  }
-}
-
 // Fills the 'padded_tensor' with data from 'input_tensor'.
 TfLiteStatus InitFromInputTensor(const TfLiteTensor* input_tensor,
                                  PaddedTensor* padded_tensor) {
@@ -145,13 +138,13 @@ TfLiteStatus InitFromInputTensor(const TfLiteTensor* input_tensor,
   std::vector<int> tensor_index(dims->size, 0);
   int flat_index = 0;
   const int num_elements = NumElements(input_tensor);
+  auto* tensor = padded_tensor->GetMutable(tensor_index);
   while (flat_index < num_elements) {
-    auto* tensor = padded_tensor->GetMutable(tensor_index);
     if (tensor == nullptr) {
       return kTfLiteError;
     }
     tensor->value = GetValuePointerAtIndex(data, flat_index, data_type);
-    IncrementTensorIndex(dims, &tensor_index);
+    ++tensor;
     ++flat_index;
   }
 
@@ -308,8 +301,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   PaddedTensor& padded_tensor = op_data->pad_tensor_buffer[0];
   // Initialize memory.
-  InitializeTensorMemory(input_tensor->dims, 0, input_dims,
-                         &op_data->pad_tensor_buffer, 0, op_data->num_elements);
+  InitializeTensorMemory(input_tensor->dims, input_dims,
+                         &op_data->pad_tensor_buffer);
   // Set the values from the input_tensor.
   TF_LITE_ENSURE_STATUS(InitFromInputTensor(input_tensor, &padded_tensor));
   const int offset =
diff --git a/tensorflow/lite/kernels/mul.cc b/tensorflow/lite/kernels/mul.cc
index 01039a705438af2a92a68b01c2146daf69c46250..e0ff6724ea2f3ea0fd4693571d6c509f5385a5d4 100644
--- a/tensorflow/lite/kernels/mul.cc
+++ b/tensorflow/lite/kernels/mul.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h"
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
@@ -87,8 +88,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                                   &data->output_activation_min,
                                   &data->output_activation_max);
   }
+  if (output->type == kTfLiteInt8) {
+    CalculateActivationRangeInt8(params->activation, output,
+                                 &data->output_activation_min,
+                                 &data->output_activation_max);
+  }
 
-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      output->type == kTfLiteInt16) {
     double real_multiplier =
         input1->params.scale * input2->params.scale / output->params.scale;
     QuantizeMultiplierSmallerThanOneExp(
@@ -151,8 +158,8 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                            TfLiteMulParams* params, const OpData* data,
                            const TfLiteTensor* input1,
                            const TfLiteTensor* input2, TfLiteTensor* output) {
-  if (input1->type == kTfLiteUInt8 && input2->type == kTfLiteUInt8 &&
-      output->type == kTfLiteUInt8) {
+  if (input1->type == input2->type && input1->type == output->type &&
+      (input1->type == kTfLiteUInt8 || input1->type == kTfLiteInt8)) {
     tflite::ArithmeticParams op_params;
     SetActivationParams(data->output_activation_min,
                         data->output_activation_max, &op_params);
@@ -163,23 +170,31 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     op_params.output_shift = data->output_shift;
     bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
         GetTensorShape(input1), GetTensorShape(input2), &op_params);
-#define TF_LITE_MUL(type, opname)                                      \
-  type::opname(op_params, GetTensorShape(input1),                      \
-               GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
-               GetTensorData<uint8_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output))
-
-    if (kernel_type == kReference) {
+#define TF_LITE_MUL(type, opname, dtype)                             \
+  type::opname(op_params, GetTensorShape(input1),                    \
+               GetTensorData<dtype>(input1), GetTensorShape(input2), \
+               GetTensorData<dtype>(input2), GetTensorShape(output), \
+               GetTensorData<dtype>(output))
+    if (input1->type == kTfLiteInt8) {
       if (need_broadcast) {
-        TF_LITE_MUL(reference_ops, BroadcastMul4DSlow);
+        TF_LITE_MUL(reference_integer_ops, BroadcastMul4DSlow, int8_t);
       } else {
-        TF_LITE_MUL(reference_ops, Mul);
+        TF_LITE_MUL(reference_integer_ops, Mul, int8_t);
       }
     } else {
-      if (need_broadcast) {
-        TF_LITE_MUL(optimized_ops, BroadcastMulFivefold);
+      // type == kTfLiteUInt8
+      if (kernel_type == kReference) {
+        if (need_broadcast) {
+          TF_LITE_MUL(reference_ops, BroadcastMul4DSlow, uint8_t);
+        } else {
+          TF_LITE_MUL(reference_ops, Mul, uint8_t);
+        }
       } else {
-        TF_LITE_MUL(optimized_ops, Mul);
+        if (need_broadcast) {
+          TF_LITE_MUL(optimized_ops, BroadcastMulFivefold, uint8_t);
+        } else {
+          TF_LITE_MUL(optimized_ops, Mul, uint8_t);
+        }
       }
     }
 #undef TF_LITE_MUL
@@ -198,8 +213,8 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     }
 #undef TF_LITE_MUL
   } else if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
-             output->type == kTfLiteUInt8) {
-#define TF_LITE_MUL(type, opname)                                      \
+             (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8)) {
+#define TF_LITE_MUL(type, opname, output_dtype)                        \
   tflite::ArithmeticParams op_params;                                  \
   SetActivationParams(data->output_activation_min,                     \
                       data->output_activation_max, &op_params);        \
@@ -207,11 +222,15 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   type::opname(op_params, GetTensorShape(input1),                      \
                GetTensorData<int16_t>(input1), GetTensorShape(input2), \
                GetTensorData<int16_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output))
-    if (kernel_type == kReference) {
-      TF_LITE_MUL(reference_ops, Mul);
+               GetTensorData<output_dtype>(output))
+    if (output->type == kTfLiteInt8) {
+      TF_LITE_MUL(reference_integer_ops, Mul, int8_t);
     } else {
-      TF_LITE_MUL(optimized_ops, Mul);
+      if (kernel_type == kReference) {
+        TF_LITE_MUL(reference_ops, Mul, uint8_t);
+      } else {
+        TF_LITE_MUL(optimized_ops, Mul, uint8_t);
+      }
     }
 #undef TF_LITE_MUL
   } else {
@@ -233,14 +252,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) {
     EvalMul<kernel_type>(context, node, params, data, input1, input2, output);
-  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+             output->type == kTfLiteInt16) {
     TF_LITE_ENSURE_OK(
         context, EvalQuantized<kernel_type>(context, node, params, data, input1,
                                             input2, output));
   } else {
     context->ReportError(context,
-                         "Mul only supports FLOAT32, INT32 and quantized UINT8 "
-                         "and INT16 now, got %d.",
+                         "Mul only supports FLOAT32, INT32 and quantized UINT8,"
+                         " INT8 and INT16 now, got %d.",
                          output->type);
     return kTfLiteError;
   }
diff --git a/tensorflow/lite/kernels/mul_test.cc b/tensorflow/lite/kernels/mul_test.cc
index 200cc26dadc3527813a7dabd3b9ca4811d4c8856..96f5a8a0e07e730394510f432b3313724e6c9172 100644
--- a/tensorflow/lite/kernels/mul_test.cc
+++ b/tensorflow/lite/kernels/mul_test.cc
@@ -73,9 +73,10 @@ class QuantizedMulOpModel : public BaseMulOpModel {
  public:
   using BaseMulOpModel::BaseMulOpModel;
 
+  template <typename integer_dtype>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<integer_dtype>(ExtractVector<integer_dtype>(output_),
+                                     GetScale(output_), GetZeroPoint(output_));
   }
 
   std::vector<float> GetDequantizedOutputInt16() {
@@ -191,19 +192,28 @@ TEST(IntegerMulOpTest, WithBroadcast) {
   }
 }
 
-TEST(QuantizedMulOpTest, NoActivation) {
-  QuantizedMulOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                        {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                        {TensorType_UINT8, {}, -1.0, 1.0},
+template <TensorType tensor_type, typename integer_dtype>
+void NoActivation() {
+  QuantizedMulOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                        {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                        {tensor_type, {}, -1.0, 1.0},
                         ActivationFunctionType_NONE);
-  m.QuantizeAndPopulate<uint8_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
-  m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.6, 0.4, 0.9, 0.8});
+  m.QuantizeAndPopulate<integer_dtype>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
+  m.QuantizeAndPopulate<integer_dtype>(m.input2(), {0.6, 0.4, 0.9, 0.8});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
               ElementsAreArray(ArrayFloatNear({-0.48, 0.08, 0.81, 0.56},
                                               kQuantizedTolerance)));
 }
 
+TEST(QuantizedMulOpTest, NoActivationUInt8) {
+  NoActivation<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedMulOpTest, NoActivationInt8) {
+  NoActivation<TensorType_INT8, int8_t>();
+}
+
 TEST(QuantizedMulOpTest, NoActivationInt16) {
   const float kMin = -1.f;
   const float kMax = 32767.f / 32768.f;
@@ -219,23 +229,32 @@ TEST(QuantizedMulOpTest, NoActivationInt16) {
                                               kQuantizedToleranceInt16)));
 }
 
-TEST(QuantizedMulOpTest, NoActivationInt16WithUint8Output) {
+template <TensorType tensor_type, typename integer_dtype>
+void NoActivationInt16With8BitOutput() {
   const float kMinInt16 = -1.f;
   const float kMaxInt16 = 32767.f / 32768.f;
   const float kMinUint8 = -1.f;
   const float kMaxUint8 = 127.f / 128.f;
   QuantizedMulOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMinInt16, kMaxInt16},
                         {TensorType_INT16, {1, 2, 2, 1}, kMinInt16, kMaxInt16},
-                        {TensorType_UINT8, {}, kMinUint8, kMaxUint8},
+                        {tensor_type, {}, kMinUint8, kMaxUint8},
                         ActivationFunctionType_NONE);
   m.QuantizeAndPopulate<int16_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
   m.QuantizeAndPopulate<int16_t>(m.input2(), {0.6, 0.4, 0.9, 0.8});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
               ElementsAreArray(ArrayFloatNear({-0.48, 0.08, 0.81, 0.56},
                                               kQuantizedTolerance)));
 }
 
+TEST(QuantizedMulOpTest, NoActivationInt16WithUint8Output) {
+  NoActivationInt16With8BitOutput<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedMulOpTest, NoActivationInt16Withint8Output) {
+  NoActivationInt16With8BitOutput<TensorType_INT8, int8_t>();
+}
+
 // for quantized Mul, the error shouldn't exceed 2*step
 float GetTolerance(int min, int max) {
   float kQuantizedStep = (max - min) / 255.0;
@@ -243,25 +262,35 @@ float GetTolerance(int min, int max) {
   return kQuantizedTolerance;
 }
 
-TEST(QuantizedMulOpTest, WithBroadcast) {
+template <TensorType tensor_type, typename integer_dtype>
+void WithBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedMulOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},  // always a scalar
-                          {TensorType_UINT8, {}, -3.0, 3.0},
+    QuantizedMulOpModel m({tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, {}, -3.0, 3.0},  // always a scalar
+                          {tensor_type, {}, -3.0, 3.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1});
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(),
+                                         {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), {0.1});
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
+    EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
                 ElementsAreArray(ArrayFloatNear(
                     {-0.2, 0.02, 0.07, 0.08, 0.11, 0.2}, kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
+TEST(QuantizedMulOpTest, WithBroadcastUInt8) {
+  WithBroadcast<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedMulOpTest, WithBroadcastInt8) {
+  WithBroadcast<TensorType_INT8, int8_t>();
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/pack.cc b/tensorflow/lite/kernels/pack.cc
index d15a5a08af38672cbdaef76ff8a37c42d6e6f226..e26abaaff1e5c9e460621048eb15d0549b81fb36 100644
--- a/tensorflow/lite/kernels/pack.cc
+++ b/tensorflow/lite/kernels/pack.cc
@@ -39,8 +39,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // TODO(renjieliu): Support negative axis.
   TF_LITE_ENSURE(context, data->axis >= 0);
   if (input0->type != kTfLiteInt32 && input0->type != kTfLiteFloat32 &&
-      input0->type != kTfLiteUInt8 && input0->type != kTfLiteInt16 &&
-      input0->type != kTfLiteInt64) {
+      input0->type != kTfLiteUInt8 && input0->type != kTfLiteInt8 &&
+      input0->type != kTfLiteInt16 && input0->type != kTfLiteInt64) {
     context->ReportError(context, "Type '%s' is not supported by pack.",
                          TfLiteTypeGetName(input0->type));
     return kTfLiteError;
@@ -106,6 +106,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       PackImpl<uint8_t>(context, node, output, data->values_count, data->axis);
       break;
     }
+    case kTfLiteInt8: {
+      PackImpl<int8_t>(context, node, output, data->values_count, data->axis);
+      break;
+    }
     case kTfLiteInt32: {
       PackImpl<int32_t>(context, node, output, data->values_count, data->axis);
       break;
diff --git a/tensorflow/lite/kernels/pack_test.cc b/tensorflow/lite/kernels/pack_test.cc
index 530cc2e50f0fe640cc5b120b8bbb1bade7e996fc..f44111567fc34f17912af7db352b47e57f8704f3 100644
--- a/tensorflow/lite/kernels/pack_test.cc
+++ b/tensorflow/lite/kernels/pack_test.cc
@@ -191,6 +191,37 @@ TEST(PackOpTest, Uint8MultilDimensions) {
               ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
 }
 
+// int8
+TEST(PackOpTest, Int8ThreeInputs) {
+  PackOpModel<int8_t> model({TensorType_INT8, {2}}, 0, 3);
+  model.SetInput(0, {1, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, 6});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(3, 2));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6}));
+}
+
+TEST(PackOpTest, Int8ThreeInputsDifferentAxis) {
+  PackOpModel<int8_t> model({TensorType_INT8, {2}}, 1, 3);
+  model.SetInput(0, {1, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, 6});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 3));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(PackOpTest, Int8MultilDimensions) {
+  PackOpModel<int8_t> model({TensorType_INT8, {2, 3}}, 1, 2);
+  model.SetInput(0, {1, 2, 3, 4, 5, 6});
+  model.SetInput(1, {7, 8, 9, 10, 11, 12});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 3));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/pad.cc b/tensorflow/lite/kernels/pad.cc
index 8e6ed6e741f782f070714164a7af7b4f98a1558f..b60b3dd9c871bf864492505dd9fa4aabf496364c 100644
--- a/tensorflow/lite/kernels/pad.cc
+++ b/tensorflow/lite/kernels/pad.cc
@@ -214,6 +214,31 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         }
       }
     } break;
+    case kTfLiteInt8: {
+      int8_t pad_value;
+      if (op_context.constant_values == nullptr) {
+        // Quantized Pad requires that 0 is represented in the quantized
+        // range.
+        TF_LITE_ENSURE(context, op_context.output->params.zero_point >=
+                                    std::numeric_limits<int8_t>::min());
+        TF_LITE_ENSURE(context, op_context.output->params.zero_point <=
+                                    std::numeric_limits<int8_t>::max());
+        pad_value = static_cast<int8_t>(op_context.output->params.zero_point);
+      } else {
+        // Quantized Pad requires that 'constant_values' is represented in the
+        // same quantized range as the input and output tensors.
+        TF_LITE_ENSURE_EQ(context, op_context.output->params.zero_point,
+                          op_context.constant_values->params.zero_point);
+        TF_LITE_ENSURE_EQ(context, op_context.output->params.scale,
+                          op_context.constant_values->params.scale);
+        pad_value = *GetTensorData<int8_t>(op_context.constant_values);
+      }
+      if (op_context.resizing_category == ResizingCategory::kImageStyle) {
+        TF_LITE_PAD(reference_ops, PadImageStyle, int8_t, pad_value);
+      } else {
+        TF_LITE_PAD(reference_ops, Pad, int8_t, pad_value);
+      }
+    } break;
     case kTfLiteInt32: {
       int32_t pad_value =
           op_context.constant_values == nullptr
diff --git a/tensorflow/lite/kernels/pad_test.cc b/tensorflow/lite/kernels/pad_test.cc
index 3caa4065dcbadd699ee9e61b8e97a42281d32309..97f95264f1a376b502be1db76e2f84c392d6c1cf 100644
--- a/tensorflow/lite/kernels/pad_test.cc
+++ b/tensorflow/lite/kernels/pad_test.cc
@@ -24,31 +24,34 @@ namespace {
 using ::testing::ElementsAreArray;
 using ::testing::Matcher;
 
-template <typename T>
+template <typename RegularInputOuput, typename QuantizedInputOuput>
 class PadOpModel : public SingleOpModel {
  public:
-  void SetInput(std::initializer_list<T> data) {
-    PopulateTensor<T>(input_, data);
+  void SetInput(std::initializer_list<RegularInputOuput> data) {
+    PopulateTensor<RegularInputOuput>(input_, data);
   }
 
   void SetQuantizedInput(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
+    QuantizeAndPopulate<QuantizedInputOuput>(input_, data);
   }
 
   void SetQuantizedPadValue(float data) {
-    QuantizeAndPopulate<uint8_t>(constant_values_, {data});
+    QuantizeAndPopulate<QuantizedInputOuput>(constant_values_, {data});
   }
 
   void SetPaddings(std::initializer_list<int> paddings) {
     PopulateTensor<int>(paddings_, paddings);
   }
 
-  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<RegularInputOuput> GetOutput() {
+    return ExtractVector<RegularInputOuput>(output_);
+  }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<QuantizedInputOuput>(
+        ExtractVector<QuantizedInputOuput>(output_), GetScale(output_),
+        GetZeroPoint(output_));
   }
 
  protected:
@@ -59,18 +62,18 @@ class PadOpModel : public SingleOpModel {
 };
 
 // Tests case where paddings is a const tensor. Type T is the dtype.
-template <typename T>
-class PadV2OpConstModel : public PadOpModel<T> {
+template <typename T1, typename T2>
+class PadV2OpConstModel : public PadOpModel<T1, T2> {
  public:
   PadV2OpConstModel(const TensorData& input,
                     std::initializer_list<int> paddings_shape,
-                    std::initializer_list<int> paddings, T constant_values,
+                    std::initializer_list<int> paddings, T1 constant_values,
                     const TensorData& output) {
     this->input_ = this->AddInput(input);
     this->paddings_ =
         this->AddConstInput(TensorType_INT32, paddings, paddings_shape);
     this->constant_values_ =
-        this->AddConstInput(GetTensorType<T>(), {constant_values}, {1});
+        this->AddConstInput(GetTensorType<T1>(), {constant_values}, {1});
 
     this->output_ = this->AddOutput(output);
 
@@ -103,7 +106,7 @@ class PadV2OpConstModel : public PadOpModel<T> {
 //    PadOpDynamicModel m(input_shape, paddings_shape, paddings_data);
 //    m.SetInput(input_data);
 //    m.Invoke();
-class PadOpConstModel : public PadOpModel<float> {
+class PadOpConstModel : public PadOpModel<float, uint8_t> {
  public:
   PadOpConstModel(const TensorData& input,
                   std::initializer_list<int> paddings_shape,
@@ -121,16 +124,18 @@ class PadOpConstModel : public PadOpModel<float> {
 };
 
 // Test case where paddings is a non-const tensor.
-template <typename T>
-class PadV2OpDynamicModel : public PadOpModel<T> {
+template <typename RegularInputOuput, typename QuantizedInputOuput>
+class PadV2OpDynamicModel
+    : public PadOpModel<RegularInputOuput, QuantizedInputOuput> {
  public:
   PadV2OpDynamicModel(const TensorData& input,
                       std::initializer_list<int> paddings_shape,
-                      T constant_values, const TensorData& output) {
+                      RegularInputOuput constant_values,
+                      const TensorData& output) {
     this->input_ = this->AddInput(input);
     this->paddings_ = this->AddInput(TensorType_INT32);
-    this->constant_values_ =
-        this->AddConstInput(GetTensorType<T>(), {constant_values}, {1});
+    this->constant_values_ = this->AddConstInput(
+        GetTensorType<RegularInputOuput>(), {constant_values}, {1});
     this->output_ = this->AddOutput(output);
 
     this->SetBuiltinOp(BuiltinOperator_PADV2, BuiltinOptions_PadV2Options,
@@ -159,7 +164,7 @@ class PadV2OpDynamicModel : public PadOpModel<T> {
 //    m.SetInput(input_data);
 //    m.SetPaddings(paddings_data);
 //    m.Invoke();
-class PadOpDynamicModel : public PadOpModel<float> {
+class PadOpDynamicModel : public PadOpModel<float, uint8_t> {
  public:
   PadOpDynamicModel(const TensorData& input,
                     std::initializer_list<int> paddings_shape,
@@ -377,34 +382,47 @@ TEST_F(QuantizedPadOpTest, AdvancedDynamicTest) {
 
 #ifdef GTEST_HAS_DEATH_TEST
 TEST(PadV2OpTest, TooManyDimensions) {
-  EXPECT_DEATH(PadV2OpConstModel<float>(
-                   {TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
-                   {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}, 0.0,
-                   {TensorType_FLOAT32}),
+  typedef PadV2OpConstModel<float, uint8_t> f;
+  EXPECT_DEATH(f({TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
+                 {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}, 0.0,
+                 {TensorType_FLOAT32}),
                "dims <= 4");
 }
 
 TEST(PadV2OpTest, UnequalDimensions) {
-  EXPECT_DEATH(
-      PadV2OpConstModel<float>({TensorType_FLOAT32, {1, 1, 2, 1}}, {3, 2},
-                               {1, 1, 2, 2, 3, 3}, 0.0, {TensorType_FLOAT32}),
-      "3 != 4");
+  typedef PadV2OpConstModel<float, uint8_t> f;
+  EXPECT_DEATH(f({TensorType_FLOAT32, {1, 1, 2, 1}}, {3, 2}, {1, 1, 2, 2, 3, 3},
+                 0.0, {TensorType_FLOAT32}),
+               "3 != 4");
 }
 
 TEST(PadV2OpTest, InvalidPadValue) {
-  EXPECT_DEATH(PadV2OpConstModel<float>({TensorType_FLOAT32, {1, 1, 2, 1}},
-                                        {4, 2}, {0, 0, 1, -1, 2, -1, 0, 0}, 0.0,
-                                        {TensorType_FLOAT32}),
+  typedef PadV2OpConstModel<float, uint8_t> f;
+  EXPECT_DEATH(f({TensorType_FLOAT32, {1, 1, 2, 1}}, {4, 2},
+                 {0, 0, 1, -1, 2, -1, 0, 0}, 0.0, {TensorType_FLOAT32}),
                "Pad value has to be greater than equal to 0.");
 }
 #endif
 
-TEST(PadV2OpTest, SimpleConstTest) {
+TEST(PadV2OpTest, SimpleConstTestUint8) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadV2OpConstModel<float, uint8_t> m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                                      {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0}, 0.0,
+                                      {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
+                                               0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST(PadV2OpTest, SimpleConstTestInt8) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
-                             {0, 0, 1, 1, 1, 1, 0, 0}, 0.0,
-                             {TensorType_FLOAT32});
+  PadV2OpConstModel<float, int8_t> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                                     {0, 0, 1, 1, 1, 1, 0, 0}, 0.0,
+                                     {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
@@ -412,11 +430,25 @@ TEST(PadV2OpTest, SimpleConstTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
-TEST(PadV2OpTest, SimpleConstFloat32ValuedTest) {
+TEST(PadV2OpTest, SimpleConstFloat32ValuedTestUint8) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
-                             {0, 0, 1, 1, 1, 1, 0, 0}, 5, {TensorType_FLOAT32});
+  PadV2OpConstModel<float, uint8_t> m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                                      {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0}, 5,
+                                      {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 1, 2, 5, 5, 3, 4,
+                                               5, 5, 5, 5, 5}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST(PadV2OpTest, SimpleConstFloat32ValuedTestInt8) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadV2OpConstModel<float, int8_t> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                                     {0, 0, 1, 1, 1, 1, 0, 0}, 5,
+                                     {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 1, 2, 5, 5, 3, 4,
@@ -427,8 +459,9 @@ TEST(PadV2OpTest, SimpleConstFloat32ValuedTest) {
 TEST(PadV2OpTest, Simple4DConstFloat32ValuedTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 1, 2, 1}}, {4, 2},
-                             {0, 1, 0, 0, 0, 0, 0, 1}, 5, {TensorType_FLOAT32});
+  PadV2OpConstModel<float, uint8_t> m({TensorType_FLOAT32, {1, 1, 2, 1}},
+                                      {4, 2}, {0, 1, 0, 0, 0, 0, 0, 1}, 5,
+                                      {TensorType_FLOAT32});
   m.SetInput({3, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 5, 3, 5, 5, 5, 5, 5}));
@@ -438,8 +471,9 @@ TEST(PadV2OpTest, Simple4DConstFloat32ValuedTest) {
 TEST(PadV2OpTest, SimpleConstInt32ValuedTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadV2OpConstModel<int32_t> m({TensorType_INT32, {1, 2, 2, 1}}, {4, 2},
-                               {0, 0, 1, 1, 1, 1, 0, 0}, 5, {TensorType_INT32});
+  PadV2OpConstModel<int32_t, uint8_t> m({TensorType_INT32, {1, 2, 2, 1}},
+                                        {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0}, 5,
+                                        {TensorType_INT32});
   m.SetInput({1, 2, 3, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 1, 2, 5, 5, 3, 4,
@@ -448,8 +482,8 @@ TEST(PadV2OpTest, SimpleConstInt32ValuedTest) {
 }
 
 TEST(PadV2OpTest, SimpleDynamicTest) {
-  PadV2OpDynamicModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2}, 0.0,
-                               {TensorType_FLOAT32});
+  PadV2OpDynamicModel<float, uint8_t> m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                                        {4, 2}, 0.0, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   m.Invoke();
@@ -459,8 +493,8 @@ TEST(PadV2OpTest, SimpleDynamicTest) {
 }
 
 TEST(PadV2OpTest, SimpleDynamicValuedTest) {
-  PadV2OpDynamicModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2}, 5,
-                               {TensorType_FLOAT32});
+  PadV2OpDynamicModel<float, uint8_t> m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                                        {4, 2}, 5, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   m.Invoke();
@@ -470,8 +504,9 @@ TEST(PadV2OpTest, SimpleDynamicValuedTest) {
 }
 
 TEST(PadV2OpTest, AdvancedConstTest) {
-  PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
-                             {0, 0, 0, 2, 1, 3, 0, 0}, 0, {TensorType_FLOAT32});
+  PadV2OpConstModel<float, uint8_t> m({TensorType_FLOAT32, {1, 2, 3, 1}},
+                                      {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0}, 0,
+                                      {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
@@ -481,8 +516,8 @@ TEST(PadV2OpTest, AdvancedConstTest) {
 }
 
 TEST(PadV2OpTest, AdvancedDynamicTest) {
-  PadV2OpDynamicModel<float> m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2}, 0,
-                               {TensorType_FLOAT32});
+  PadV2OpDynamicModel<float, uint8_t> m({TensorType_FLOAT32, {1, 2, 3, 1}},
+                                        {4, 2}, 0, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
   m.Invoke();
@@ -505,21 +540,20 @@ class QuantizedPadV2OpTest : public ::testing::Test {
 TEST_F(QuantizedPadV2OpTest, ZeroNotInQuantizationRange) {
   // The test_util and actual quantization code currently ensure that the range
   // must include zero, but if that ever changes, this test will catch it.
-  EXPECT_DEATH(
-      PadV2OpConstModel<float> m({TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0},
-                                 {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0}, 0,
-                                 {TensorType_UINT8, {}, 1.0, 2.0}),
-      ".*Check failed: f_min <= 0.*");
+  typedef PadV2OpConstModel<float, uint8_t> f;
+  EXPECT_DEATH(f({TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0}, {4, 2},
+                 {0, 0, 1, 1, 1, 1, 0, 0}, 0, {TensorType_UINT8, {}, 1.0, 2.0}),
+               ".*Check failed: f_min <= 0.*");
 }
 #endif
 
 TEST_F(QuantizedPadV2OpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                               {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
-                               {TensorType_UINT8, {1}, -1.0, 1.0},
-                               {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpConstModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+      {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_UINT8, {1}, -1.0, 1.0},
+      {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
   m.SetQuantizedPadValue(0);
   m.Invoke();
@@ -531,9 +565,9 @@ TEST_F(QuantizedPadV2OpTest, SimpleConstTest) {
 }
 
 TEST_F(QuantizedPadV2OpTest, SimpleDynamicTest) {
-  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
-                                 {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpDynamicModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+      {TensorType_UINT8, {1}, -1.0, 1.0}, {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
   m.SetQuantizedPadValue(0);
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
@@ -546,10 +580,10 @@ TEST_F(QuantizedPadV2OpTest, SimpleDynamicTest) {
 }
 
 TEST_F(QuantizedPadV2OpTest, AdvancedConstTest) {
-  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
-                               {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0},
-                               {TensorType_UINT8, {1}, -1.0, 1.0},
-                               {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpConstModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+      {0, 0, 0, 2, 1, 3, 0, 0}, {TensorType_UINT8, {1}, -1.0, 1.0},
+      {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
   m.SetQuantizedPadValue(0);
   m.Invoke();
@@ -562,9 +596,9 @@ TEST_F(QuantizedPadV2OpTest, AdvancedConstTest) {
 }
 
 TEST_F(QuantizedPadV2OpTest, AdvancedDynamicTest) {
-  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
-                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
-                                 {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpDynamicModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+      {TensorType_UINT8, {1}, -1.0, 1.0}, {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
   m.SetQuantizedPadValue(0);
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
@@ -580,10 +614,10 @@ TEST_F(QuantizedPadV2OpTest, AdvancedDynamicTest) {
 TEST_F(QuantizedPadV2OpTest, SimpleConstValuedTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                               {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
-                               {TensorType_UINT8, {1}, -1.0, 1.0},
-                               {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpConstModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+      {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_UINT8, {1}, -1.0, 1.0},
+      {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
   m.SetQuantizedPadValue(-0.5);
   m.Invoke();
@@ -596,9 +630,9 @@ TEST_F(QuantizedPadV2OpTest, SimpleConstValuedTest) {
 }
 
 TEST_F(QuantizedPadV2OpTest, SimpleDynamicValuedTest) {
-  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
-                                 {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpDynamicModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+      {TensorType_UINT8, {1}, -1.0, 1.0}, {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
   m.SetQuantizedPadValue(-0.5);
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
@@ -612,10 +646,10 @@ TEST_F(QuantizedPadV2OpTest, SimpleDynamicValuedTest) {
 }
 
 TEST_F(QuantizedPadV2OpTest, AdvancedConstValuedTest) {
-  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
-                               {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0},
-                               {TensorType_UINT8, {1}, -1.0, 1.0},
-                               {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpConstModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+      {0, 0, 0, 2, 1, 3, 0, 0}, {TensorType_UINT8, {1}, -1.0, 1.0},
+      {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
   m.SetQuantizedPadValue(-0.5);
   m.Invoke();
@@ -629,9 +663,9 @@ TEST_F(QuantizedPadV2OpTest, AdvancedConstValuedTest) {
 }
 
 TEST_F(QuantizedPadV2OpTest, AdvancedDynamicValuedTest) {
-  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
-                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
-                                 {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpDynamicModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+      {TensorType_UINT8, {1}, -1.0, 1.0}, {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
   m.SetQuantizedPadValue(-0.5);
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
diff --git a/tensorflow/lite/kernels/pooling.cc b/tensorflow/lite/kernels/pooling.cc
index e6155fcb8c67ed3b5e676c2530ec7966d6cec56f..bdf736dcfb278ad93f43c25b9ae1c0b4038b695f 100644
--- a/tensorflow/lite/kernels/pooling.cc
+++ b/tensorflow/lite/kernels/pooling.cc
@@ -226,9 +226,9 @@ void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
 }
 
 template <KernelType kernel_type>
-void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                      TfLitePoolParams* params, OpData* data,
-                      const TfLiteTensor* input, TfLiteTensor* output) {
+void MaxEvalQuantizedUInt8(TfLiteContext* context, TfLiteNode* node,
+                           TfLitePoolParams* params, OpData* data,
+                           const TfLiteTensor* input, TfLiteTensor* output) {
   int32_t activation_min;
   int32_t activation_max;
   CalculateActivationRangeUint8(params->activation, output, &activation_min,
@@ -254,6 +254,31 @@ void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
 #undef TF_LITE_MAX_POOL
 }
 
+template <KernelType kernel_type>
+void MaxEvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
+                          TfLitePoolParams* params, OpData* data,
+                          const TfLiteTensor* input, TfLiteTensor* output) {
+  int32_t activation_min;
+  int32_t activation_max;
+  CalculateActivationRangeInt8(params->activation, output, &activation_min,
+                               &activation_max);
+#define TF_LITE_MAX_POOL(type)                                        \
+  tflite::PoolParams op_params;                                       \
+  op_params.stride_height = params->stride_height;                    \
+  op_params.stride_width = params->stride_width;                      \
+  op_params.filter_height = params->filter_height;                    \
+  op_params.filter_width = params->filter_width;                      \
+  op_params.padding_values.height = data->padding.height;             \
+  op_params.padding_values.width = data->padding.width;               \
+  op_params.quantized_activation_min = activation_min;                \
+  op_params.quantized_activation_max = activation_max;                \
+  type::MaxPool(op_params, GetTensorShape(input),                     \
+                GetTensorData<int8_t>(input), GetTensorShape(output), \
+                GetTensorData<int8_t>(output))
+  TF_LITE_MAX_POOL(reference_integer_ops);
+#undef TF_LITE_MAX_POOL
+}
+
 template <KernelType kernel_type>
 void L2EvalFloat(TfLiteContext* context, TfLiteNode* node,
                  TfLitePoolParams* params, OpData* data,
@@ -321,7 +346,12 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
       MaxEvalFloat<kernel_type>(context, node, params, data, input, output);
       break;
     case kTfLiteUInt8:
-      MaxEvalQuantized<kernel_type>(context, node, params, data, input, output);
+      MaxEvalQuantizedUInt8<kernel_type>(context, node, params, data, input,
+                                         output);
+      break;
+    case kTfLiteInt8:
+      MaxEvalQuantizedInt8<kernel_type>(context, node, params, data, input,
+                                        output);
       break;
     default:
       context->ReportError(context, "Type %d not currently supported.",
diff --git a/tensorflow/lite/kernels/pooling_test.cc b/tensorflow/lite/kernels/pooling_test.cc
index e1b79340115ad18e50ecdb6944904bf2ab7c9e84..4627d7a5f0c2803635b9df85dd9275cc7851e8fb 100644
--- a/tensorflow/lite/kernels/pooling_test.cc
+++ b/tensorflow/lite/kernels/pooling_test.cc
@@ -97,6 +97,24 @@ class SymmetricQuantizedPoolingOpModel : public BasePoolingOpModel {
   }
 };
 
+// Replicate each entry in a vector n times along depth (innermost dimension).
+// The values are incremented by delta, creating ramps offset by each input
+// value. This is used to create simple and predicatable variation.
+std::vector<float> ReplicateDepthRamp(const std::vector<float>& image_plane,
+                                      int n, float delta) {
+  const int size = image_plane.size();
+  std::vector<float> ramped_data(n * size);
+  // The input is treated as a 1-D even if logically it is multi-dimensional.
+  for (int input_index = 0; input_index < size; ++input_index) {
+    for (int depth = 0; depth < n; ++depth) {
+      ramped_data[n * input_index + depth] =
+          image_plane[input_index] + depth * delta;
+    }
+  }
+
+  return ramped_data;
+}
+
 TEST(FloatPoolingOpTest, AveragePool) {
   FloatPoolingOpModel m(BuiltinOperator_AVERAGE_POOL_2D,
                         /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
@@ -147,6 +165,31 @@ TEST(QuantizedPoolingOpTest, AveragePoolImageSize16) {
   EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({16})));
 }
 
+TEST(QuantizedPoolingOpTest, AveragePoolLargeDepth) {
+  // Test with a larger depth that is not a multiple of the tranche size, or of
+  // any register-oriented multiples such as 8 and 16.
+  constexpr int depth = 1999;  // Prime number.
+  QuantizedPoolingOpModel m(
+      BuiltinOperator_AVERAGE_POOL_2D,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, depth}, 0, 15.9375},
+      /*filter_width=*/2, /*filter_height=*/2,
+      /*output=*/{TensorType_UINT8, {}, 0, 15.9375});
+
+  std::vector<float> input_image_plane({
+      0.f, 6.f, 2.f, 4.f,   //
+      3.f, 2.f, 10.f, 7.f,  //
+  });
+  std::vector<float> output_image_plane({2.75f, 5.75f});
+
+  m.SetInput(ReplicateDepthRamp(input_image_plane, depth, 1.f / 512.f));
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  ReplicateDepthRamp(output_image_plane, depth, 1.f / 512.f),
+                  1. / 32.f)));
+}
+
 // Test quantized AveragePool with int8 input and output. The input is the same
 // as the uint8 test QuantizedPoolingOpTest.AveragePool. The float output is
 // identical to uint8 test and quantized output is identical to uint8 test with
@@ -204,7 +247,7 @@ TEST(FloatPoolingOpTest, MaxPool) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({6, 10}));
 }
 
-TEST(QuantizedPoolingOpTest, MaxPool) {
+TEST(QuantizedUInt8PoolingOpTest, MaxPool) {
   // Choose the input ranges carefully so that the dequantized output matches
   // the results of the float model above.
   QuantizedPoolingOpModel m(
@@ -223,6 +266,50 @@ TEST(QuantizedPoolingOpTest, MaxPool) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({96, 160}));
 }
 
+TEST(QuantizedPoolingOpTest, MaxPoolLargeDepth) {
+  // Test with a larger depth that is not a multiple of the tranche size, or of
+  // any register-oriented multiples such as 8 and 16.
+  constexpr int depth = 1999;  // Prime number.
+  QuantizedPoolingOpModel m(
+      BuiltinOperator_MAX_POOL_2D,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, depth}, 0, 15.9375},
+      /*filter_width=*/2, /*filter_height=*/2,
+      /*output=*/{TensorType_UINT8, {}, 0, 15.9375});
+
+  std::vector<float> input_image_plane({
+      0.f, 6.f, 2.f, 4.f,   //
+      3.f, 2.f, 10.f, 7.f,  //
+  });
+  std::vector<float> output_image_plane({6.f, 10.f});
+
+  m.SetInput(ReplicateDepthRamp(input_image_plane, depth, 1.f / 512.f));
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  ReplicateDepthRamp(output_image_plane, depth, 1.f / 512.f),
+                  1. / 32.f)));
+}
+
+TEST(QuantizedInt8PoolingOpTest, MaxPool) {
+  // Choose the input ranges carefully so that the dequantized output matches
+  // the results of the float model above.
+  SymmetricQuantizedPoolingOpModel m(
+      BuiltinOperator_MAX_POOL_2D,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, 0, 15.9375},
+      /*filter_width=*/2, /*filter_height=*/2,
+      /*output=*/{TensorType_INT8, {}, 0, 15.9375});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, 2, -10, 7,  //
+  });
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({3, 7})));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-80, -16}));
+}
+
 TEST(FloatPoolingOpTest, L2Pool) {
   FloatPoolingOpModel m(BuiltinOperator_L2_POOL_2D,
                         /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
diff --git a/tensorflow/lite/kernels/rank.cc b/tensorflow/lite/kernels/rank.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8cef1f53a024b833034deb497909beac4b4753e6
--- /dev/null
+++ b/tensorflow/lite/kernels/rank.cc
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace rank {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  output->type = kTfLiteInt32;
+
+  // Rank produces a 0-D int32 Tensor representing the rank of input.
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(0);
+  return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(output), 0);
+
+  if (output->type == kTfLiteInt32) {
+    int32_t* output_data = GetTensorData<int32_t>(output);
+    *output_data = NumDimensions(input);
+  } else {
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace rank
+
+TfLiteRegistration* Register_RANK() {
+  static TfLiteRegistration r = {nullptr, nullptr, rank::Prepare, rank::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/rank_test.cc b/tensorflow/lite/kernels/rank_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c31fc5866931708eb8155c2dc88026b623039ed
--- /dev/null
+++ b/tensorflow/lite/kernels/rank_test.cc
@@ -0,0 +1,91 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <initializer_list>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class RankOpModel : public SingleOpModel {
+ public:
+  RankOpModel(std::initializer_list<int> input_shape, TensorType input_type) {
+    TensorType output_type = TensorType_INT32;
+    input_ = AddInput(input_type);
+    output_ = AddOutput(output_type);
+    SetBuiltinOp(BuiltinOperator_RANK, BuiltinOptions_RankOptions,
+                 CreateRankOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+
+  TfLiteStatus InvokeWithResult() { return interpreter_->Invoke(); }
+
+  int input() { return input_; }
+
+  std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(RankOpTest, InputTypeFloat) {
+  RankOpModel model({1, 3, 1, 3, 5}, TensorType_FLOAT32);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5}));
+  EXPECT_TRUE(model.GetOutputShape().empty());
+}
+
+TEST(RankOpTest, InputTypeInt) {
+  RankOpModel model({1, 3, 1, 3, 5}, TensorType_INT32);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5}));
+  EXPECT_TRUE(model.GetOutputShape().empty());
+}
+
+TEST(RankOpTest, ScalarTensor) {
+  RankOpModel model({}, TensorType_FLOAT32);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({0}));
+  EXPECT_TRUE(model.GetOutputShape().empty());
+}
+
+TEST(RankOpTest, EmptyTensor) {
+  RankOpModel model({1, 0}, TensorType_FLOAT32);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({2}));
+  EXPECT_TRUE(model.GetOutputShape().empty());
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/reduce.cc b/tensorflow/lite/kernels/reduce.cc
index 1193f84f6f6d768464c6122ebe0195a12b797fb0..3fb2715e48fabeee09b5d6670b6a45cf26d89bc8 100644
--- a/tensorflow/lite/kernels/reduce.cc
+++ b/tensorflow/lite/kernels/reduce.cc
@@ -180,6 +180,9 @@ TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node,
     case kTfLiteUInt8:
       temp_sum->type = kTfLiteInt32;
       break;
+    case kTfLiteInt8:
+      temp_sum->type = kTfLiteInt32;
+      break;
     case kTfLiteBool:
       temp_sum->type = kTfLiteBool;
       break;
@@ -465,6 +468,9 @@ TfLiteStatus EvalGeneric(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteUInt8:
       return EvalType<uint8_t>(context, node, &op_context, reduce_type);
       break;
+    case kTfLiteInt8:
+      return EvalType<int8_t>(context, node, &op_context, reduce_type);
+      break;
     case kTfLiteBool:
       return EvalType<bool>(context, node, &op_context, reduce_type);
       break;
diff --git a/tensorflow/lite/kernels/reduce_test.cc b/tensorflow/lite/kernels/reduce_test.cc
index 16c59396ddf46544edf2e5c4ae20161b5d2ef9ed..373fa56fc1983baec55ed6421950012f4564e9c6 100644
--- a/tensorflow/lite/kernels/reduce_test.cc
+++ b/tensorflow/lite/kernels/reduce_test.cc
@@ -38,9 +38,10 @@ class BaseOpModel : public SingleOpModel {
     return ExtractVector<T>(output_);
   }
 
+  template <typename T>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
+                         GetZeroPoint(output_));
   }
 
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
@@ -280,7 +281,7 @@ TEST(ConstFloatMeanOpTest, KeepDims_4DMean_UInt8) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 1, 3}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(ArrayFloatNear({0.25098, 0.25098, 0.25098},
                                               kQuantizedTolerance)));
 }
@@ -294,7 +295,7 @@ TEST(ConstFloatMeanOpTest, KeepDims_4DMean_Quantized) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({0.235294, 0.313726}, kQuantizedTolerance)));
 }
@@ -361,8 +362,9 @@ TEST(ConstUint8MeanOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                            {0.4, 0.4}, kQuantizedTolerance)));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({0.4, 0.4}, kQuantizedTolerance)));
 }
 
 TEST(ConstUint8MeanOpTest, KeepDims) {
@@ -374,7 +376,7 @@ TEST(ConstUint8MeanOpTest, KeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance)));
 }
 
@@ -390,7 +392,7 @@ TEST(DynamicUint8MeanOpTest, NotKeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({-1.75, -1.68}, kQuantizedTolerance)));
 }
 
@@ -406,7 +408,7 @@ TEST(DynamicUint8MeanOpTest, KeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({9.2815, 0.3695}, kQuantizedTolerance)));
 }
 
@@ -420,7 +422,7 @@ TEST(DynamicUint8MeanOpTest, QuantizedScalar) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), IsEmpty());
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(ArrayFloatNear({0.643}, kQuantizedTolerance)));
 }
 
@@ -433,7 +435,7 @@ TEST(ConstUint8MeanOpTest, QuantizedKeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance)));
 }
 
@@ -526,7 +528,7 @@ TEST(ConstUint8SumOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({-0.823529, -0.815686}, kQuantizedTolerance)));
 }
@@ -539,8 +541,9 @@ TEST(ConstUint8SumOpTest, NotKeepDimsRescaling) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                            {1.2, 1.2}, kQuantizedTolerance)));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({1.2, 1.2}, kQuantizedTolerance)));
 }
 
 TEST(ConstUint8SumOpTest, KeepDims) {
@@ -551,7 +554,7 @@ TEST(ConstUint8SumOpTest, KeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(ArrayFloatNear({-0.407843, -0.313726, 0.0941177},
                                               kQuantizedTolerance)));
 }
@@ -567,7 +570,7 @@ TEST(DynamicUint8SumOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({1.48235, 1.64706}, kQuantizedTolerance)));
 }
@@ -584,7 +587,7 @@ TEST(DynamicUint8SumOpTest, KeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({6.47059, 10.698}, kQuantizedTolerance)));
 }
 
@@ -741,7 +744,20 @@ TEST(ConstUint8MaxOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({0.501961, 0.603922}, kQuantizedTolerance)));
+}
+
+TEST(ConstInt8MaxOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MaxOpConstModel m({TensorType_INT8, {1, 3, 2}, -1.0, 1.0},
+                    {TensorType_INT8, {2}, -1.0, 1.0}, {1}, {1}, false);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({0.501961, 0.603922}, kQuantizedTolerance)));
 }
@@ -754,7 +770,20 @@ TEST(ConstUint8MaxOpTest, KeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({0.4, 0.4, 0.603922}, kQuantizedTolerance)));
+}
+
+TEST(ConstInt8MaxOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MaxOpConstModel m({TensorType_INT8, {3, 2}, -1.0, 1.0},
+                    {TensorType_INT8, {3}, -1.0, 1.0}, {1}, {1}, true);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({0.4, 0.4, 0.603922}, kQuantizedTolerance)));
 }
@@ -770,7 +799,23 @@ TEST(DynamicUint8MaxOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({1.2902, 0.247059}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MaxOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 2.0);
+  std::vector<float> data = {1.3, -4.8, -3.6, 0.24};
+  MaxOpDynamicModel m({TensorType_INT8, {2, 2}, -5.0, 2.0},
+                      {TensorType_INT8, {2}, -5.0, 2.0},
+                      {TensorType_INT32, {1}}, false);
+  std::vector<int> axis = {1};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({1.2902, 0.247059}, kQuantizedTolerance)));
 }
@@ -786,7 +831,23 @@ TEST(DynamicUint8MaxOpTest, KeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({11.1294, 0.862745}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MaxOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14, -0.14, 7.423, 0.879};
+  MaxOpDynamicModel m({TensorType_INT8, {2, 2}, -10.0, 12.0},
+                      {TensorType_INT8, {2}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({11.1294, 0.862745}, kQuantizedTolerance)));
 }
@@ -801,7 +862,21 @@ TEST(DynamicUint8MaxOpTest, Scalar) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), IsEmpty());
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MaxOpTest, Scalar) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14};
+  MaxOpDynamicModel m({TensorType_INT8, {}, -10.0, 12.0},
+                      {TensorType_INT8, {}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
 }
 
@@ -883,7 +958,20 @@ TEST(ConstUint8MinOpTest, NotKeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({0.294117, 0.2}, kQuantizedTolerance)));
+}
+
+TEST(ConstInt8MinOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MinOpConstModel m({TensorType_INT8, {1, 3, 2}, -1.0, 1.0},
+                    {TensorType_INT8, {2}, -1.0, 1.0}, {1}, {1}, false);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<int8_t>(),
       ElementsAreArray(ArrayFloatNear({0.294117, 0.2}, kQuantizedTolerance)));
 }
 
@@ -896,7 +984,20 @@ TEST(ConstUint8MinOpTest, KeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({0.2, 0.3, 0.5}, kQuantizedTolerance)));
+}
+
+TEST(ConstInt8MinOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MinOpConstModel m({TensorType_INT8, {3, 2}, -1.0, 1.0},
+                    {TensorType_INT8, {3}, -1.0, 1.0}, {1}, {1}, true);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<int8_t>(),
       ElementsAreArray(ArrayFloatNear({0.2, 0.3, 0.5}, kQuantizedTolerance)));
 }
 
@@ -912,7 +1013,23 @@ TEST(DynamicUint8MinOpTest, NotKeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({-4.807843, -3.6}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MinOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 2.0);
+  std::vector<float> data = {1.3, -4.8, -3.6, 0.24};
+  MinOpDynamicModel m({TensorType_INT8, {2, 2}, -5.0, 2.0},
+                      {TensorType_INT8, {2}, -5.0, 2.0},
+                      {TensorType_INT32, {1}}, false);
+  std::vector<int> axis = {1};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<int8_t>(),
       ElementsAreArray(ArrayFloatNear({-4.807843, -3.6}, kQuantizedTolerance)));
 }
 
@@ -927,7 +1044,23 @@ TEST(DynamicUint8MinOpTest, KeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({7.427451, -0.164706}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MinOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14, -0.14, 7.423, 0.879};
+  MinOpDynamicModel m({TensorType_INT8, {2, 2}, -10.0, 12.0},
+                      {TensorType_INT8, {2}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({7.427451, -0.164706}, kQuantizedTolerance)));
 }
@@ -942,7 +1075,21 @@ TEST(DynamicUint8MinOpTest, Scalar) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), IsEmpty());
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MinOpTest, Scalar) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14};
+  MinOpDynamicModel m({TensorType_INT8, {}, -10.0, 12.0},
+                      {TensorType_INT8, {}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
 }
 
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index bc5ec4a1d6f72220e4598db528117ef30e9879b2..2a4654a8a38b336aca1840487b73f4b4716de4bd 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -25,6 +25,7 @@ TfLiteRegistration* Register_AUDIO_SPECTROGRAM();
 TfLiteRegistration* Register_MFCC();
 TfLiteRegistration* Register_DETECTION_POSTPROCESS();
 TfLiteRegistration* Register_IF();
+TfLiteRegistration* Register_WHILE();
 
 }  // namespace custom
 
@@ -104,6 +105,7 @@ TfLiteRegistration* Register_REDUCE_ANY();
 TfLiteRegistration* Register_SELECT();
 TfLiteRegistration* Register_SLICE();
 TfLiteRegistration* Register_SIN();
+TfLiteRegistration* Register_COS();
 TfLiteRegistration* Register_TRANSPOSE_CONV();
 TfLiteRegistration* Register_EXPAND_DIMS();
 TfLiteRegistration* Register_SPARSE_TO_DENSE();
@@ -112,6 +114,7 @@ TfLiteRegistration* Register_NOT_EQUAL();
 TfLiteRegistration* Register_SQRT();
 TfLiteRegistration* Register_RSQRT();
 TfLiteRegistration* Register_SHAPE();
+TfLiteRegistration* Register_RANK();
 TfLiteRegistration* Register_POW();
 TfLiteRegistration* Register_FAKE_QUANT();
 TfLiteRegistration* Register_PACK();
@@ -131,6 +134,10 @@ TfLiteRegistration* Register_FILL();
 TfLiteRegistration* Register_MIRROR_PAD();
 TfLiteRegistration* Register_UNIQUE();
 TfLiteRegistration* Register_REVERSE_V2();
+TfLiteRegistration* Register_ADD_N();
+TfLiteRegistration* Register_GATHER_ND();
+TfLiteRegistration* Register_WHERE();
+TfLiteRegistration* Register_ELU();
 
 TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) {
   context->ReportError(
@@ -163,13 +170,19 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_ABS, Register_ABS());
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
   AddBuiltin(BuiltinOperator_RELU_N1_TO_1, Register_RELU_N1_TO_1());
-  AddBuiltin(BuiltinOperator_RELU6, Register_RELU6());
-  AddBuiltin(BuiltinOperator_TANH, Register_TANH());
-  AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC());
+  AddBuiltin(BuiltinOperator_RELU6, Register_RELU6(), /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_TANH, Register_TANH(), /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D(),
              /* min_version */ 1,
              /* max_version */ 2);
-  AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D());
+  AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_L2_POOL_2D, Register_L2_POOL_2D());
   AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D(),
              /* min_version */ 1,
@@ -198,16 +211,22 @@ BuiltinOpResolver::BuiltinOpResolver() {
              Register_EMBEDDING_LOOKUP_SPARSE());
   AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED(),
              /* min_version */ 1,
-             /* max_version */ 3);
+             /* max_version */ 4);
   AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION());
   AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP());
   AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX(),
              /* min_version */ 1,
              /* max_version */ 2);
   AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION());
-  AddBuiltin(BuiltinOperator_ADD, Register_ADD());
-  AddBuiltin(BuiltinOperator_SPACE_TO_BATCH_ND, Register_SPACE_TO_BATCH_ND());
-  AddBuiltin(BuiltinOperator_BATCH_TO_SPACE_ND, Register_BATCH_TO_SPACE_ND());
+  AddBuiltin(BuiltinOperator_ADD, Register_ADD(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_SPACE_TO_BATCH_ND, Register_SPACE_TO_BATCH_ND(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_BATCH_TO_SPACE_ND, Register_BATCH_TO_SPACE_ND(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_MUL, Register_MUL());
   AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION());
   AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
@@ -220,63 +239,115 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
              Register_UNIDIRECTIONAL_SEQUENCE_LSTM(), /* min_version */ 1,
              /* max_version */ 2);
-  AddBuiltin(BuiltinOperator_PAD, Register_PAD());
-  AddBuiltin(BuiltinOperator_PADV2, Register_PADV2());
+  AddBuiltin(BuiltinOperator_PAD, Register_PAD(), /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_PADV2, Register_PADV2(), /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
-  AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR());
+  AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
-             Register_RESIZE_NEAREST_NEIGHBOR());
+             Register_RESIZE_NEAREST_NEIGHBOR(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_SKIP_GRAM, Register_SKIP_GRAM());
-  AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH());
-  AddBuiltin(BuiltinOperator_GATHER, Register_GATHER());
-  AddBuiltin(BuiltinOperator_TRANSPOSE, Register_TRANSPOSE());
+  AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_GATHER, Register_GATHER(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_TRANSPOSE, Register_TRANSPOSE(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_MEAN, Register_MEAN());
   AddBuiltin(BuiltinOperator_DIV, Register_DIV());
-  AddBuiltin(BuiltinOperator_SUB, Register_SUB());
-  AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT());
+  AddBuiltin(BuiltinOperator_SUB, Register_SUB(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT(), /* min_version */ 1,
+             /* max_version */ 3);
   AddBuiltin(BuiltinOperator_SPLIT_V, Register_SPLIT_V());
   AddBuiltin(BuiltinOperator_SQUEEZE, Register_SQUEEZE());
-  AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE());
+  AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_EXP, Register_EXP());
-  AddBuiltin(BuiltinOperator_TOPK_V2, Register_TOPK_V2());
+  AddBuiltin(BuiltinOperator_TOPK_V2, Register_TOPK_V2(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_LOG, Register_LOG());
-  AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX());
+  AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_CAST, Register_CAST());
   AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE(),
              /* min_version */ 1,
              /* max_version */ 2);
   AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
-  AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
-  AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
-  AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX());
-  AddBuiltin(BuiltinOperator_ARG_MIN, Register_ARG_MIN());
-  AddBuiltin(BuiltinOperator_GREATER, Register_GREATER());
-  AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL());
-  AddBuiltin(BuiltinOperator_LESS, Register_LESS());
-  AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL());
+  AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_ARG_MIN, Register_ARG_MIN(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_GREATER, Register_GREATER(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_LESS, Register_LESS(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR());
   AddBuiltin(BuiltinOperator_CEIL, Register_CEIL());
   AddBuiltin(BuiltinOperator_NEG, Register_NEG());
-  AddBuiltin(BuiltinOperator_SELECT, Register_SELECT());
-  AddBuiltin(BuiltinOperator_SLICE, Register_SLICE());
+  AddBuiltin(BuiltinOperator_SELECT, Register_SELECT(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_SLICE, Register_SLICE(), /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_SIN, Register_SIN());
+  AddBuiltin(BuiltinOperator_COS, Register_COS());
   AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSE_CONV());
   AddBuiltin(BuiltinOperator_TILE, Register_TILE());
   AddBuiltin(BuiltinOperator_SUM, Register_SUM());
   AddBuiltin(BuiltinOperator_REDUCE_PROD, Register_REDUCE_PROD());
-  AddBuiltin(BuiltinOperator_REDUCE_MAX, Register_REDUCE_MAX());
-  AddBuiltin(BuiltinOperator_REDUCE_MIN, Register_REDUCE_MIN());
+  AddBuiltin(BuiltinOperator_REDUCE_MAX, Register_REDUCE_MAX(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_REDUCE_MIN, Register_REDUCE_MIN(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_REDUCE_ANY, Register_REDUCE_ANY());
   AddBuiltin(BuiltinOperator_EXPAND_DIMS, Register_EXPAND_DIMS());
   AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE());
-  AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL());
-  AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL());
+  AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_SQRT, Register_SQRT());
   AddBuiltin(BuiltinOperator_RSQRT, Register_RSQRT());
   AddBuiltin(BuiltinOperator_SHAPE, Register_SHAPE());
+  AddBuiltin(BuiltinOperator_RANK, Register_RANK());
   AddBuiltin(BuiltinOperator_POW, Register_POW());
   AddBuiltin(BuiltinOperator_FAKE_QUANT, Register_FAKE_QUANT(), 1, 2);
-  AddBuiltin(BuiltinOperator_PACK, Register_PACK());
+  AddBuiltin(BuiltinOperator_PACK, Register_PACK(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_ONE_HOT, Register_ONE_HOT());
   AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR());
   AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND());
@@ -293,6 +364,10 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_MIRROR_PAD, Register_MIRROR_PAD());
   AddBuiltin(BuiltinOperator_UNIQUE, Register_UNIQUE());
   AddBuiltin(BuiltinOperator_REVERSE_V2, Register_REVERSE_V2());
+  AddBuiltin(BuiltinOperator_ADD_N, Register_ADD_N());
+  AddBuiltin(BuiltinOperator_GATHER_ND, Register_GATHER_ND());
+  AddBuiltin(BuiltinOperator_WHERE, Register_WHERE());
+  AddBuiltin(BuiltinOperator_ELU, Register_ELU());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
@@ -304,6 +379,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
 
   // WARNING: Control flow ops are experimental and subject to change.
   AddCustom("Experimental_If", tflite::ops::custom::Register_IF());
+  AddCustom("Experimental_While", tflite::ops::custom::Register_WHILE());
 }
 
 }  // namespace builtin
diff --git a/tensorflow/lite/kernels/resize_bilinear.cc b/tensorflow/lite/kernels/resize_bilinear.cc
index d42cb188669587a957dd085f9ecb123f44b59437..7383d03438c65a710efbfe30f3d3c0ce261f0ca8 100644
--- a/tensorflow/lite/kernels/resize_bilinear.cc
+++ b/tensorflow/lite/kernels/resize_bilinear.cc
@@ -109,6 +109,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     if (kernel_type == kGenericOptimized || kernel_type == kNeonOptimized) {
       TF_LITE_RESIZE_BILINEAR(optimized_ops, uint8_t);
     }
+  } else if (output->type == kTfLiteInt8) {
+    TF_LITE_RESIZE_BILINEAR(reference_ops, int8_t);
 #undef TF_LITE_RESIZE_BILINEAR
   } else {
     context->ReportError(context, "Output type is %d, requires float.",
diff --git a/tensorflow/lite/kernels/resize_bilinear_test.cc b/tensorflow/lite/kernels/resize_bilinear_test.cc
index d3f4837a287accd93c23e17fa3a361efd4120101..a5ead9c874d5cf1e4e1abb7998150df44eced3c5 100644
--- a/tensorflow/lite/kernels/resize_bilinear_test.cc
+++ b/tensorflow/lite/kernels/resize_bilinear_test.cc
@@ -78,7 +78,7 @@ TEST(ResizeBilinearOpTest, HorizontalResize) {
               ElementsAreArray(ArrayFloatNear({3, 5, 6})));
 }
 
-TEST(ResizeBilinearOpTest, HorizontalResize8Bit) {
+TEST(ResizeBilinearOpTest, HorizontalResizeUInt8) {
   ResizeBilinearOpModel m({TensorType_UINT8, {1, 1, 2, 1}});
   m.SetInput<uint8>({3, 6});
   m.SetSize({1, 3});
@@ -93,6 +93,21 @@ TEST(ResizeBilinearOpTest, HorizontalResize8Bit) {
               ElementsAreArray(ArrayFloatNear({3, 5, 6})));
 }
 
+TEST(ResizeBilinearOpTest, HorizontalResizeInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {1, 1, 2, 1}});
+  m.SetInput<int8_t>({3, 6});
+  m.SetSize({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {1, 1, 2, 1}}, {1, 3});
+  const_m.SetInput<int8_t>({3, 6});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+}
+
 TEST(ResizeBilinearOpTest, VerticalResize) {
   ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
   m.SetInput<float>({3, 9});
@@ -108,7 +123,7 @@ TEST(ResizeBilinearOpTest, VerticalResize) {
               ElementsAreArray(ArrayFloatNear({3, 7, 9})));
 }
 
-TEST(ResizeBilinearOpTest, VerticalResize8Bit) {
+TEST(ResizeBilinearOpTest, VerticalResizeUInt8) {
   ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 1, 1}});
   m.SetInput<uint8>({3, 9});
   m.SetSize({3, 1});
@@ -123,6 +138,21 @@ TEST(ResizeBilinearOpTest, VerticalResize8Bit) {
               ElementsAreArray(ArrayFloatNear({3, 7, 9})));
 }
 
+TEST(ResizeBilinearOpTest, VerticalResizeInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {1, 2, 1, 1}});
+  m.SetInput<int8_t>({3, 9});
+  m.SetSize({3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {1, 2, 1, 1}}, {3, 1});
+  const_m.SetInput<int8_t>({3, 9});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+}
+
 TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
   ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
   m.SetInput<float>({
@@ -150,7 +180,7 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
                                           })));
 }
 
-TEST(ResizeBilinearOpTest, TwoDimensionalResize8Bit) {
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeUInt8) {
   ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 1}});
   m.SetInput<uint8>({
       3, 6,  //
@@ -177,6 +207,33 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResize8Bit) {
                                           })));
 }
 
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {1, 2, 2, 1}});
+  m.SetInput<int8_t>({
+      3, 6,  //
+      9, 12  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 5, 6,    //
+                                         7, 9, 10,   //
+                                         9, 11, 12,  //
+                                     })));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {1, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 6,  //
+      9, 12  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 5, 6,    //
+                                               7, 9, 10,   //
+                                               9, 11, 12,  //
+                                           })));
+}
+
 TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
   ResizeBilinearOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}});
   m.SetInput<float>({
@@ -241,7 +298,7 @@ TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
                                           })));
 }
 
-TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatchesUInt8) {
   ResizeBilinearOpModel m({TensorType_UINT8, {2, 2, 2, 1}});
   m.SetInput<uint8>({
       3, 6,   //
@@ -278,7 +335,44 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
                                           })));
 }
 
-TEST(ResizeBilinearOpTest, ThreeDimensionalResize8Bit) {
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatchesInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {2, 2, 2, 1}});
+  m.SetInput<int8_t>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      12, 16  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 5, 6,     //
+                                         7, 9, 10,    //
+                                         9, 11, 12,   //
+                                         4, 8, 10,    //
+                                         9, 12, 13,   //
+                                         12, 14, 16,  //
+                                     })));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {2, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      12, 16  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 5, 6,     //
+                                               7, 9, 10,    //
+                                               9, 11, 12,   //
+                                               4, 8, 10,    //
+                                               9, 12, 13,   //
+                                               12, 14, 16,  //
+                                           })));
+}
+
+TEST(ResizeBilinearOpTest, ThreeDimensionalResizeUInt8) {
   ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 2}});
   m.SetInput<uint8>({
       3, 4, 6, 10,     //
@@ -304,6 +398,33 @@ TEST(ResizeBilinearOpTest, ThreeDimensionalResize8Bit) {
                                               10, 12, 12, 14, 14, 16,  //
                                           })));
 }
+
+TEST(ResizeBilinearOpTest, ThreeDimensionalResizeInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {1, 2, 2, 2}});
+  m.SetInput<int8_t>({
+      3, 4, 6, 10,     //
+      10, 12, 14, 16,  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 4, 5, 8, 6, 10,       //
+                                         7, 9, 10, 12, 11, 13,    //
+                                         10, 12, 12, 14, 14, 16,  //
+                                     })));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {1, 2, 2, 2}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 4, 6, 10,     //
+      10, 12, 14, 16,  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 4, 5, 8, 6, 10,       //
+                                               7, 9, 10, 12, 11, 13,    //
+                                               10, 12, 12, 14, 14, 16,  //
+                                           })));
+}
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/resize_nearest_neighbor.cc b/tensorflow/lite/kernels/resize_nearest_neighbor.cc
index a48d8004f8b6cead177286328082310237af515a..3030a4f28e22396cbc51e55ff04562fa76a0264e 100644
--- a/tensorflow/lite/kernels/resize_nearest_neighbor.cc
+++ b/tensorflow/lite/kernels/resize_nearest_neighbor.cc
@@ -106,8 +106,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           GetTensorShape(size), GetTensorData<int32>(size),
           GetTensorShape(output), GetTensorData<uint8_t>(output));
     }
+  } else if (output->type == kTfLiteInt8) {
+    reference_ops::ResizeNearestNeighbor(
+        op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
+        GetTensorShape(size), GetTensorData<int32>(size),
+        GetTensorShape(output), GetTensorData<int8_t>(output));
   } else {
-    context->ReportError(context, "Output type is %d, requires float or uint8.",
+    context->ReportError(context,
+                         "Output type is %d, requires float, uint8 or int8.",
                          output->type);
     return kTfLiteError;
   }
diff --git a/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc b/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc
index 03e2effd84c4adb13db1bb3ada4f5cfe1c0b12c9..63b4f13d643e1da312b835f4bf3257b39d103b5a 100644
--- a/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc
+++ b/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc
@@ -79,7 +79,7 @@ TEST(ResizeNearestNeighborOpTest, HorizontalResize) {
               ElementsAreArray(ArrayFloatNear({3, 3, 6})));
 }
 
-TEST(ResizeNearestNeighborOpTest, HorizontalResize8Bit) {
+TEST(ResizeNearestNeighborOpTest, HorizontalResizeUInt8) {
   ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 1, 2, 1}});
   m.SetInput<uint8>({3, 6});
   m.SetSize({1, 3});
@@ -95,6 +95,21 @@ TEST(ResizeNearestNeighborOpTest, HorizontalResize8Bit) {
               ElementsAreArray(ArrayFloatNear({3, 3, 6})));
 }
 
+TEST(ResizeNearestNeighborOpTest, HorizontalResizeInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {1, 1, 2, 1}});
+  m.SetInput<int8_t>({-3, 6});
+  m.SetSize({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({-3, -3, 6})));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {1, 1, 2, 1}}, {1, 3});
+  const_m.SetInput<int8_t>({-3, 6});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({-3, -3, 6})));
+}
+
 TEST(ResizeNearestNeighborOpTest, VerticalResize) {
   ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
   m.SetInput<float>({3, 9});
@@ -111,7 +126,7 @@ TEST(ResizeNearestNeighborOpTest, VerticalResize) {
               ElementsAreArray(ArrayFloatNear({3, 3, 9})));
 }
 
-TEST(ResizeNearestNeighborOpTest, VerticalResize8Bit) {
+TEST(ResizeNearestNeighborOpTest, VerticalResizeUInt8) {
   ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 1, 1}});
   m.SetInput<uint8>({3, 9});
   m.SetSize({3, 1});
@@ -127,6 +142,21 @@ TEST(ResizeNearestNeighborOpTest, VerticalResize8Bit) {
               ElementsAreArray(ArrayFloatNear({3, 3, 9})));
 }
 
+TEST(ResizeNearestNeighborOpTest, VerticalResizeInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {1, 2, 1, 1}});
+  m.SetInput<int8_t>({3, -9});
+  m.SetSize({3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 3, -9})));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {1, 2, 1, 1}}, {3, 1});
+  const_m.SetInput<int8_t>({3, -9});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 3, -9})));
+}
+
 TEST(ResizeNearestNeighborOpTest, TwoDimensionalResize) {
   ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
   m.SetInput<float>({
@@ -155,7 +185,7 @@ TEST(ResizeNearestNeighborOpTest, TwoDimensionalResize) {
                                           })));
 }
 
-TEST(ResizeNearestNeighborOpTest, TwoDimensionalResize8Bit) {
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeUInt8) {
   ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 2, 1}});
   m.SetInput<uint8>({
       3, 6,  //
@@ -183,6 +213,33 @@ TEST(ResizeNearestNeighborOpTest, TwoDimensionalResize8Bit) {
                                           })));
 }
 
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {1, 2, 2, 1}});
+  m.SetInput<int8_t>({
+      3, -6,  //
+      9, 12   //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 3, -6,  //
+                                         3, 3, -6,  //
+                                         9, 9, 12,  //
+                                     })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {1, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, -6,  //
+      9, 12   //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 3, -6,  //
+                                               3, 3, -6,  //
+                                               9, 9, 12,  //
+                                           })));
+}
+
 TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatches) {
   ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}});
   m.SetInput<float>({
@@ -249,7 +306,7 @@ TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResize) {
                                           })));
 }
 
-TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatchesUInt8) {
   ResizeNearestNeighborOpModel m({TensorType_UINT8, {2, 2, 2, 1}});
   m.SetInput<uint8>({
       3, 6,   //
@@ -287,7 +344,44 @@ TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
                                           })));
 }
 
-TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResize8Bit) {
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatchesInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {2, 2, 2, 1}});
+  m.SetInput<int8_t>({
+      3, 6,    //
+      9, -12,  //
+      -4, 10,  //
+      12, 16   //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 3, 6,     //
+                                         3, 3, 6,     //
+                                         9, 9, -12,   //
+                                         -4, -4, 10,  //
+                                         -4, -4, 10,  //
+                                         12, 12, 16,  //
+                                     })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {2, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 6,    //
+      9, -12,  //
+      -4, 10,  //
+      12, 16   //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 3, 6,     //
+                                               3, 3, 6,     //
+                                               9, 9, -12,   //
+                                               -4, -4, 10,  //
+                                               -4, -4, 10,  //
+                                               12, 12, 16,  //
+                                           })));
+}
+
+TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResizeUInt8) {
   ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 2, 2}});
   m.SetInput<uint8>({
       3, 4, 6, 10,     //
@@ -315,6 +409,33 @@ TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResize8Bit) {
                                           })));
 }
 
+TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResizeInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {1, 2, 2, 2}});
+  m.SetInput<int8_t>({
+      3, 4, -6, 10,     //
+      10, 12, -14, 16,  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 4, 3, 4, -6, 10,       //
+                                         3, 4, 3, 4, -6, 10,       //
+                                         10, 12, 10, 12, -14, 16,  //
+                                     })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {1, 2, 2, 2}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 4, -6, 10,     //
+      10, 12, -14, 16,  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 4, 3, 4, -6, 10,       //
+                                               3, 4, 3, 4, -6, 10,       //
+                                               10, 12, 10, 12, -14, 16,  //
+                                           })));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/select.cc b/tensorflow/lite/kernels/select.cc
index 4687ab44171fab73ff1b4ef93592b25680f3a59f..d1c63d887db00143fb6b154306313411643cf2b8 100644
--- a/tensorflow/lite/kernels/select.cc
+++ b/tensorflow/lite/kernels/select.cc
@@ -89,6 +89,9 @@ TfLiteStatus SelectEval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteUInt8:                                                         \
       TF_LITE_SELECT(uint8_t, op);                                             \
       break;                                                                   \
+    case kTfLiteInt8:                                                          \
+      TF_LITE_SELECT(int8_t, op);                                              \
+      break;                                                                   \
     case kTfLiteInt16:                                                         \
       TF_LITE_SELECT(int16_t, op);                                             \
       break;                                                                   \
diff --git a/tensorflow/lite/kernels/select_test.cc b/tensorflow/lite/kernels/select_test.cc
index 5111300e479a92ad9cbf00628750dc61effc50d3..d7cadeb51eb3ee0645eaccc1bbcea59bd279e0d7 100644
--- a/tensorflow/lite/kernels/select_test.cc
+++ b/tensorflow/lite/kernels/select_test.cc
@@ -96,6 +96,19 @@ TEST(SelectOpTest, SelectUInt8) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
 }
 
+TEST(SelectOpTest, SelectInt8) {
+  SelectOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, {1, 1, 1, 4},
+                      TensorType_INT8);
+
+  model.PopulateTensor<bool>(model.input1(), {false, true, false, false});
+  model.PopulateTensor<int8_t>(model.input2(), {1, -2, 3, 4});
+  model.PopulateTensor<int8_t>(model.input3(), {5, 6, 7, -8});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput<int8_t>(), ElementsAreArray({5, -2, 7, -8}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
 TEST(SelectOpTest, SelectInt16) {
   SelectOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, {1, 1, 1, 4},
                       TensorType_INT16);
diff --git a/tensorflow/lite/kernels/slice.cc b/tensorflow/lite/kernels/slice.cc
index 5fca7a3ea71aa41c6e466b7814921e2e1ac6293d..650c65d7dc01efb77f28dfc29b674726954c9edf 100644
--- a/tensorflow/lite/kernels/slice.cc
+++ b/tensorflow/lite/kernels/slice.cc
@@ -204,6 +204,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteInt64:
       TF_LITE_SLICE(int64_t, kernel_type);
       break;
+    case kTfLiteInt8:
+      TF_LITE_SLICE(int8_t, kernel_type);
+      break;
     case kTfLiteUInt8:
       TF_LITE_SLICE(uint8_t, kernel_type);
       break;
diff --git a/tensorflow/lite/kernels/slice_test.cc b/tensorflow/lite/kernels/slice_test.cc
index 563329ddb164d3aa5f13c8ee0d6482d79b84ed32..102218ba23c105014ee6d501d2941f8b4755a44e 100644
--- a/tensorflow/lite/kernels/slice_test.cc
+++ b/tensorflow/lite/kernels/slice_test.cc
@@ -163,6 +163,28 @@ TEST(SliceOpTest, SizeMinus1) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 3, 3, 5, 5, 5}));
 }
 
+TEST(SliceOpTest, SliceUint8) {
+  SliceOpModel<uint8_t, int32_t> m({3, 2, 3, 1}, {4}, {4}, TensorType_INT32,
+                                   TensorType_UINT8);
+  m.SetInput({1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6});
+  m.SetBegin({1, 0, 0, 0});
+  m.SetSize({2, 1, -1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 3, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 3, 3, 5, 5, 5}));
+}
+
+TEST(SliceOpTest, SliceInt8) {
+  SliceOpModel<int8_t, int32_t> m({3, 2, 3, 1}, {4}, {4}, TensorType_INT32,
+                                  TensorType_INT8);
+  m.SetInput({1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6});
+  m.SetBegin({1, 0, 0, 0});
+  m.SetSize({2, 1, -1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 3, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 3, 3, 5, 5, 5}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/space_to_batch_nd.cc b/tensorflow/lite/kernels/space_to_batch_nd.cc
index 1c61b2ef30379e808085f3b0d16a5b1157bea314..2fb7198cd67e8b9d13873d25a2eaa04fd2ff2ae0 100644
--- a/tensorflow/lite/kernels/space_to_batch_nd.cc
+++ b/tensorflow/lite/kernels/space_to_batch_nd.cc
@@ -141,6 +141,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                                   op_context.output->params.zero_point);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int8_t,
+                                  op_context.output->params.zero_point);
+      } else {
+        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int8_t,
+                                  op_context.output->params.zero_point);
+      }
+      break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
         TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int32_t, 0);
diff --git a/tensorflow/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/lite/kernels/space_to_batch_nd_test.cc
index c5d6e9a53062d97801b518f15305e2052f861e7c..52a77984d935d04a79807707729754abd21d3be6 100644
--- a/tensorflow/lite/kernels/space_to_batch_nd_test.cc
+++ b/tensorflow/lite/kernels/space_to_batch_nd_test.cc
@@ -31,8 +31,9 @@ class SpaceToBatchNDOpModel : public SingleOpModel {
     PopulateTensor<float>(input_, data);
   }
 
+  template <typename T>
   void SetQuantizedInput(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
+    QuantizeAndPopulate<T>(input_, data);
   }
 
   void SetBlockShape(std::initializer_list<int> data) {
@@ -46,9 +47,10 @@ class SpaceToBatchNDOpModel : public SingleOpModel {
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
+  template <typename T>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
+                         GetZeroPoint(output_));
   }
 
  protected:
@@ -233,29 +235,62 @@ TEST_F(QuantizedSpaceToBatchNDOpTest, ZeroNotInQuantizationRange) {
 }
 #endif
 
-TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingConstTest) {
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingConstTestUint8) {
   SpaceToBatchNDOpConstModel m({TensorType_UINT8, {1, 5, 2, 1}, -1.0, 1.0},
                                {3, 2}, {1, 0, 2, 0},
                                {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+  m.SetQuantizedInput<uint8_t>(
+      {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(DequantizedArrayNear(
                   {0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -0.1, 0, -0.7,
                    0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 0.1},
                   -1.0, 1.0)));
 }
 
-TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingDynamicTest) {
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingConstTestInt8) {
+  SpaceToBatchNDOpConstModel m({TensorType_INT8, {1, 5, 2, 1}, -1.0, 1.0},
+                               {3, 2}, {1, 0, 2, 0},
+                               {TensorType_INT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput<int8_t>(
+      {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -0.1, 0, -0.7,
+                   0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 0.1},
+                  -1.0, 1.0)));
+}
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingDynamicTestUint8) {
   SpaceToBatchNDOpDynamicModel m({TensorType_UINT8, {1, 5, 2, 1}, -1.0, 1.0},
                                  {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+  m.SetQuantizedInput<uint8_t>(
+      {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+  m.SetBlockShape({3, 2});
+  m.SetPaddings({1, 0, 2, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -0.1, 0, -0.7,
+                   0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 0.1},
+                  -1.0, 1.0)));
+}
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingDynamicTestInt8) {
+  SpaceToBatchNDOpDynamicModel m({TensorType_INT8, {1, 5, 2, 1}, -1.0, 1.0},
+                                 {TensorType_INT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput<int8_t>(
+      {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
   m.SetBlockShape({3, 2});
   m.SetPaddings({1, 0, 2, 0});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(DequantizedArrayNear(
                   {0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -0.1, 0, -0.7,
                    0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 0.1},
@@ -266,10 +301,10 @@ TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingConstTest) {
   SpaceToBatchNDOpConstModel m({TensorType_UINT8, {1, 4, 2, 1}, -1.0, 1.0},
                                {3, 2}, {1, 1, 2, 4},
                                {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
+  m.SetQuantizedInput<uint8_t>({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(DequantizedArrayNear(
                   {
                       0, 0,    0, 0, 0, -0.5, 0, 0, 0, 0,   0, 0, 0, 0.6, 0, 0,
@@ -282,12 +317,12 @@ TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingConstTest) {
 TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingDynamicTest) {
   SpaceToBatchNDOpDynamicModel m({TensorType_UINT8, {1, 4, 2, 1}, -1.0, 1.0},
                                  {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
+  m.SetQuantizedInput<uint8_t>({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
   m.SetBlockShape({3, 2});
   m.SetPaddings({1, 1, 2, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(DequantizedArrayNear(
                   {
                       0, 0,    0, 0, 0, -0.5, 0, 0, 0, 0,   0, 0, 0, 0.6, 0, 0,
diff --git a/tensorflow/lite/kernels/space_to_depth.cc b/tensorflow/lite/kernels/space_to_depth.cc
index 79e28bf47d98b64572d9e7404f8d69788cd30e08..cf6b0bd4d3d4b61b87a14d1090a7e89d9b77a0f2 100644
--- a/tensorflow/lite/kernels/space_to_depth.cc
+++ b/tensorflow/lite/kernels/space_to_depth.cc
@@ -50,7 +50,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   auto data_type = output->type;
   TF_LITE_ENSURE(context,
                  data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8 ||
-                     data_type == kTfLiteInt32 || data_type == kTfLiteInt64);
+                     data_type == kTfLiteInt8 || data_type == kTfLiteInt32 ||
+                     data_type == kTfLiteInt64);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
   const int block_size = params->block_size;
@@ -100,6 +101,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_SPACE_TO_DEPTH(optimized_ops, uint8_t);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_SPACE_TO_DEPTH(reference_ops, int8_t);
+      } else {
+        TF_LITE_SPACE_TO_DEPTH(optimized_ops, int8_t);
+      }
+      break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
         TF_LITE_SPACE_TO_DEPTH(reference_ops, int32_t);
diff --git a/tensorflow/lite/kernels/space_to_depth_test.cc b/tensorflow/lite/kernels/space_to_depth_test.cc
index 3fa8d86348ef899b9bd42c19f5b1510b4c4e33d3..58665fc9d83007d7bed638418cba058e4ff189c5 100644
--- a/tensorflow/lite/kernels/space_to_depth_test.cc
+++ b/tensorflow/lite/kernels/space_to_depth_test.cc
@@ -74,6 +74,14 @@ TEST(SpaceToDepthOpModel, Uint8) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
+TEST(SpaceToDepthOpModel, int8) {
+  SpaceToDepthOpModel m({TensorType_INT8, {1, 2, 2, 1}}, 2);
+  m.SetInput<int8_t>({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({1, 2, 3, 4}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
 TEST(SpaceToDepthOpModel, Int32) {
   SpaceToDepthOpModel m({TensorType_INT32, {1, 2, 2, 3}}, 2);
   m.SetInput<int32_t>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
diff --git a/tensorflow/lite/kernels/split.cc b/tensorflow/lite/kernels/split.cc
index 7902ed2a46d297cca6f076bf1bb48580f3c4bf40..c0f701f55dd096279e1e9f1e54817490cb0c230b 100644
--- a/tensorflow/lite/kernels/split.cc
+++ b/tensorflow/lite/kernels/split.cc
@@ -76,9 +76,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), op_context.params->num_splits);
 
   auto input_type = op_context.input->type;
-  TF_LITE_ENSURE(context, input_type == kTfLiteFloat32 ||
-                              input_type == kTfLiteUInt8 ||
-                              input_type == kTfLiteInt16);
+  TF_LITE_ENSURE(context,
+                 input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 ||
+                     input_type == kTfLiteInt8 || input_type == kTfLiteInt16 ||
+                     input_type == kTfLiteInt32);
   for (int i = 0; i < NumOutputs(node); ++i) {
     GetOutput(context, node, i)->type = input_type;
   }
@@ -137,15 +138,23 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_SPLIT(uint8_t);
       break;
     }
+    case kTfLiteInt8: {
+      TF_LITE_SPLIT(int8_t);
+      break;
+    }
     case kTfLiteInt16: {
       TF_LITE_SPLIT(int16_t);
       break;
     }
+    case kTfLiteInt32: {
+      TF_LITE_SPLIT(int32_t);
+      break;
+    }
     default:
-      context->ReportError(
-          context,
-          "Only float32, uint8 and int16 are currently supported, got %d.",
-          op_context.input->type);
+      context->ReportError(context,
+                           "Only float32, uint8, int8, int16 and int32 are "
+                           "currently supported, got %d.",
+                           op_context.input->type);
       return kTfLiteError;
   }
 #undef TF_LITE_SPLIT
diff --git a/tensorflow/lite/kernels/split_test.cc b/tensorflow/lite/kernels/split_test.cc
index f3d9ea3bf4158dd51b5102b942125b7561024c19..fa313d4b18f803dc5060425d8162af25129dd5d9 100644
--- a/tensorflow/lite/kernels/split_test.cc
+++ b/tensorflow/lite/kernels/split_test.cc
@@ -47,13 +47,15 @@ class SplitOpModel : public SingleOpModel {
     }
   }
 
-  void SetInput(std::initializer_list<float> data) {
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
     PopulateTensor(input_, data);
   }
   void SetAxis(int axis) { PopulateTensor(axis_, {axis}); }
 
-  std::vector<float> GetOutput(int i) {
-    return ExtractVector<float>(outputs_[i]);
+  template <typename T>
+  std::vector<T> GetOutput(int i) {
+    return ExtractVector<T>(outputs_[i]);
   }
   std::vector<int> GetOutputShape(int i) { return GetTensorShape(outputs_[i]); }
 
@@ -63,33 +65,34 @@ class SplitOpModel : public SingleOpModel {
   std::vector<int> outputs_;
 };
 
-using TensorValues = std::initializer_list<float>;
-
+template <typename T>
 void Check(int axis, int num_splits, std::initializer_list<int> input_shape,
            std::initializer_list<int> output_shape,
-           const TensorValues& input_data,
-           const std::vector<TensorValues>& output_data) {
+           const std::initializer_list<T>& input_data,
+           const std::vector<std::initializer_list<T>>& output_data,
+           const TensorType& type = TensorType_FLOAT32) {
   auto debug = [&](int i) {
     std::stringstream ss;
     ss << "for output tensor " << i << " axis=" << axis
        << " and num_splits=" << num_splits;
     return ss.str();
   };
-  SplitOpModel m({TensorType_FLOAT32, input_shape}, num_splits);
+  SplitOpModel m({type, input_shape}, num_splits);
   m.SetInput(input_data);
   m.SetAxis(axis);
   m.Invoke();
   for (int i = 0; i < num_splits; ++i) {
-    EXPECT_THAT(m.GetOutput(i), ElementsAreArray(output_data[i])) << debug(i);
+    EXPECT_THAT(m.GetOutput<T>(i), ElementsAreArray(output_data[i]))
+        << debug(i);
     EXPECT_THAT(m.GetOutputShape(i), ElementsAreArray(output_shape))
         << debug(i);
   }
 
-  SplitOpModel const_m({TensorType_FLOAT32, input_shape}, num_splits, axis);
+  SplitOpModel const_m({type, input_shape}, num_splits, axis);
   const_m.SetInput(input_data);
   const_m.Invoke();
   for (int i = 0; i < num_splits; ++i) {
-    EXPECT_THAT(const_m.GetOutput(i), ElementsAreArray(output_data[i]))
+    EXPECT_THAT(const_m.GetOutput<T>(i), ElementsAreArray(output_data[i]))
         << debug(i);
     EXPECT_THAT(const_m.GetOutputShape(i), ElementsAreArray(output_shape))
         << debug(i);
@@ -97,44 +100,106 @@ void Check(int axis, int num_splits, std::initializer_list<int> input_shape,
 }
 
 TEST(SplitOpTest, FourDimensional) {
-  Check(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 3, 4, 5, 6, 7, 8},
-            {9, 10, 11, 12, 13, 14, 15, 16},
-        });
-  Check(/*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 3, 4, 9, 10, 11, 12},
-            {5, 6, 7, 8, 13, 14, 15, 16},
-        });
-  Check(/*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 5, 6, 9, 10, 13, 14},
-            {3, 4, 7, 8, 11, 12, 15, 16},
-        });
-  Check(/*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 3, 5, 7, 9, 11, 13, 15},
-            {2, 4, 6, 8, 10, 12, 14, 16},
-        });
+  Check<float>(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 2, 3, 4, 5, 6, 7, 8},
+                   {9, 10, 11, 12, 13, 14, 15, 16},
+               });
+  Check<float>(/*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 2, 3, 4, 9, 10, 11, 12},
+                   {5, 6, 7, 8, 13, 14, 15, 16},
+               });
+  Check<float>(/*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 2, 5, 6, 9, 10, 13, 14},
+                   {3, 4, 7, 8, 11, 12, 15, 16},
+               });
+  Check<float>(/*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 3, 5, 7, 9, 11, 13, 15},
+                   {2, 4, 6, 8, 10, 12, 14, 16},
+               });
+}
+
+TEST(SplitOpTest, FourDimensionalInt8) {
+  Check<int8_t>(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+                {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                {
+                    {1, 2, 3, 4, 5, 6, 7, 8},
+                    {9, 10, 11, 12, 13, 14, 15, 16},
+                },
+                TensorType_INT8);
+  Check<int8_t>(/*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
+                {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                {
+                    {1, 2, 3, 4, 9, 10, 11, 12},
+                    {5, 6, 7, 8, 13, 14, 15, 16},
+                },
+                TensorType_INT8);
+  Check<int8_t>(/*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
+                {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                {
+                    {1, 2, 5, 6, 9, 10, 13, 14},
+                    {3, 4, 7, 8, 11, 12, 15, 16},
+                },
+                TensorType_INT8);
+  Check<int8_t>(/*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
+                {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                {
+                    {1, 3, 5, 7, 9, 11, 13, 15},
+                    {2, 4, 6, 8, 10, 12, 14, 16},
+                },
+                TensorType_INT8);
+}
+
+TEST(SplitOpTest, FourDimensionalInt32) {
+  Check<int32_t>(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+                 {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                 {
+                     {1, 2, 3, 4, 5, 6, 7, 8},
+                     {9, 10, 11, 12, 13, 14, 15, 16},
+                 },
+                 TensorType_INT32);
+  Check<int32_t>(/*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
+                 {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                 {
+                     {1, 2, 3, 4, 9, 10, 11, 12},
+                     {5, 6, 7, 8, 13, 14, 15, 16},
+                 },
+                 TensorType_INT32);
+  Check<int32_t>(/*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
+                 {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                 {
+                     {1, 2, 5, 6, 9, 10, 13, 14},
+                     {3, 4, 7, 8, 11, 12, 15, 16},
+                 },
+                 TensorType_INT32);
+  Check<int32_t>(/*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
+                 {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                 {
+                     {1, 3, 5, 7, 9, 11, 13, 15},
+                     {2, 4, 6, 8, 10, 12, 14, 16},
+                 },
+                 TensorType_INT32);
 }
 
 TEST(SplitOpTest, OneDimensional) {
-  Check(/*axis=*/0, /*num_splits=*/8, {8}, {1}, {1, 2, 3, 4, 5, 6, 7, 8},
-        {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
+  Check<float>(/*axis=*/0, /*num_splits=*/8, {8}, {1}, {1, 2, 3, 4, 5, 6, 7, 8},
+               {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
 }
 
 TEST(SplitOpTest, NegativeAxis) {
-  Check(/*axis=*/-4, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 3, 4, 5, 6, 7, 8},
-            {9, 10, 11, 12, 13, 14, 15, 16},
-        });
+  Check<float>(/*axis=*/-4, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 2, 3, 4, 5, 6, 7, 8},
+                   {9, 10, 11, 12, 13, 14, 15, 16},
+               });
 }
 
 }  // namespace
diff --git a/tensorflow/lite/kernels/strided_slice.cc b/tensorflow/lite/kernels/strided_slice.cc
index c797a98e9f1bda8595e6822638949bab48cb2eab..8c25ffa3a1a669684d9fb1b552893de3a450264f 100644
--- a/tensorflow/lite/kernels/strided_slice.cc
+++ b/tensorflow/lite/kernels/strided_slice.cc
@@ -234,6 +234,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_STRIDED_SLICE(reference_ops, uint8_t);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_STRIDED_SLICE(reference_ops, int8_t);
+      }
+      break;
     default:
       context->ReportError(context,
                            "Type %d is currently not supported "
diff --git a/tensorflow/lite/kernels/strided_slice_test.cc b/tensorflow/lite/kernels/strided_slice_test.cc
index 34875bf0497a000da02f3d0212b042399046a492..cac9e1672f871268d6d37b3488d00a0c1399aaa7 100644
--- a/tensorflow/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/lite/kernels/strided_slice_test.cc
@@ -577,6 +577,18 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1Uint8) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
 }
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1int8) {
+  StridedSliceOpModel<int8_t, TensorType_INT8> m({2, 3, 2}, {3}, {3}, {3}, 0, 0,
+                                                 0, 0, 1);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({1, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/sub.cc b/tensorflow/lite/kernels/sub.cc
index 9144144e16d9cda68d5e0c90993422baa5c9f849..8bd6052307cc0e032a566e437923cac2f16be69e 100644
--- a/tensorflow/lite/kernels/sub.cc
+++ b/tensorflow/lite/kernels/sub.cc
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <limits>
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
@@ -68,21 +70,39 @@ void Free(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<OpData*>(buffer);
 }
 
-TfLiteStatus PrepareUint8SubOp(TfLiteContext* context,
-                               const TfLiteTensor* input_1,
-                               const TfLiteTensor* input_2,
-                               TfLiteTensor* output, TfLiteSubParams* params,
-                               OpData* op_params, int op_sign) {
+TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
+                              const TfLiteTensor* input_1,
+                              const TfLiteTensor* input_2, TfLiteTensor* output,
+                              TfLiteSubParams* params, OpData* op_params,
+                              int op_sign) {
+  TF_LITE_ENSURE(context,
+                 output->type == kTfLiteUInt8 || output->type == kTfLiteInt8);
   const auto& input1_quantization_params = input_1->params;
   const auto& input2_quantization_params = input_2->params;
   const auto& output_quantization_params = output->params;
+  int32_t integer_type_min = 0;
+  int32_t integer_type_max = 0;
+  if (output->type == kTfLiteUInt8) {
+    integer_type_min = std::numeric_limits<uint8_t>::min();
+    integer_type_max = std::numeric_limits<uint8_t>::max();
+  } else {
+    // output->type == kTfLiteInt8
+    integer_type_min = std::numeric_limits<int8_t>::min();
+    integer_type_max = std::numeric_limits<int8_t>::max();
+  }
 
-  TF_LITE_ENSURE(context, input1_quantization_params.zero_point >= 0);
-  TF_LITE_ENSURE(context, input1_quantization_params.zero_point <= 255);
-  TF_LITE_ENSURE(context, input2_quantization_params.zero_point >= 0);
-  TF_LITE_ENSURE(context, input2_quantization_params.zero_point <= 255);
-  TF_LITE_ENSURE(context, output_quantization_params.zero_point >= 0);
-  TF_LITE_ENSURE(context, output_quantization_params.zero_point <= 255);
+  TF_LITE_ENSURE(context,
+                 input1_quantization_params.zero_point >= integer_type_min);
+  TF_LITE_ENSURE(context,
+                 input1_quantization_params.zero_point <= integer_type_max);
+  TF_LITE_ENSURE(context,
+                 input2_quantization_params.zero_point >= integer_type_min);
+  TF_LITE_ENSURE(context,
+                 input2_quantization_params.zero_point <= integer_type_max);
+  TF_LITE_ENSURE(context,
+                 output_quantization_params.zero_point >= integer_type_min);
+  TF_LITE_ENSURE(context,
+                 output_quantization_params.zero_point <= integer_type_max);
 
   op_params->input1_offset = -input1_quantization_params.zero_point;
   op_params->input2_offset = -input2_quantization_params.zero_point;
@@ -109,10 +129,15 @@ TfLiteStatus PrepareUint8SubOp(TfLiteContext* context,
   tflite::QuantizeMultiplierSmallerThanOneExp(real_output_multiplier,
                                               &op_params->output_multiplier,
                                               &op_params->output_shift);
-
-  CalculateActivationRangeUint8(params->activation, output,
-                                &op_params->output_activation_min,
-                                &op_params->output_activation_max);
+  if (output->type == kTfLiteUInt8) {
+    CalculateActivationRangeUint8(params->activation, output,
+                                  &op_params->output_activation_min,
+                                  &op_params->output_activation_max);
+  } else {
+    CalculateActivationRangeInt8(params->activation, output,
+                                 &op_params->output_activation_min,
+                                 &op_params->output_activation_max);
+  }
   return kTfLiteOk;
 }
 
@@ -186,9 +211,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_size = TfLiteIntArrayCopy(input1->dims);
   }
 
-  if (output->type == kTfLiteUInt8) {
-    TF_LITE_ENSURE_OK(context, PrepareUint8SubOp(context, input1, input2,
-                                                 output, params, data, -1));
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
+    TF_LITE_ENSURE_OK(context, Prepare8BitSubOp(context, input1, input2, output,
+                                                params, data, -1));
   } else if (output->type == kTfLiteInt16) {
     TF_LITE_ENSURE_OK(context, PrepareInt16SubOp(context, input1, input2,
                                                  output, params, data));
@@ -271,9 +296,15 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                GetTensorData<data_type>(input1), GetTensorShape(input2), \
                GetTensorData<data_type>(input2), GetTensorShape(output), \
                GetTensorData<data_type>(output))
-  if (output->type == kTfLiteUInt8) {
     // NOTE: We are using the add kernels. This is possible as the second values
     // multiplier is negated before being passed down.
+  if (output->type == kTfLiteInt8) {
+    if (need_broadcast) {
+      TF_LITE_SUB(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
+    } else {
+      TF_LITE_SUB(reference_integer_ops, Add, int8_t);
+    }
+  } else if (output->type == kTfLiteUInt8) {
     if (kernel_type == kReference) {
       if (need_broadcast) {
         TF_LITE_SUB(reference_ops, BroadcastAdd4DSlow, uint8_t);
@@ -319,7 +350,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) {
     EvalSub<kernel_type>(context, node, params, data, input1, input2, output);
-  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+             output->type == kTfLiteInt16) {
     EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
                                output);
   } else {
diff --git a/tensorflow/lite/kernels/sub_test.cc b/tensorflow/lite/kernels/sub_test.cc
index 23927c65546bf8eec8cd24a7f761649548ce559f..3c19678b20f21894461f5ef79b1df6c45e1cac5a 100644
--- a/tensorflow/lite/kernels/sub_test.cc
+++ b/tensorflow/lite/kernels/sub_test.cc
@@ -63,9 +63,10 @@ class QuantizedSubOpModel : public BaseSubOpModel {
  public:
   using BaseSubOpModel::BaseSubOpModel;
 
+  template <typename integer_dtype>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<integer_dtype>(ExtractVector<integer_dtype>(output_),
+                                     GetScale(output_), GetZeroPoint(output_));
   }
 
   std::vector<float> GetDequantizedOutputInt16() {
@@ -74,17 +75,15 @@ class QuantizedSubOpModel : public BaseSubOpModel {
   }
 };
 
-// for quantized Sub, the error shouldn't exceed 2*step
+// for quantized Sub, the error shouldn't exceed step
 float GetTolerance(int min, int max) {
   float kQuantizedStep = (max - min) / 255.0;
-  float kQuantizedTolerance = 2.0 * kQuantizedStep;
-  return kQuantizedTolerance;
+  return kQuantizedStep;
 }
 
 float GetToleranceInt16(float min, float max) {
   float kQuantizedStep = (max - min) / std::numeric_limits<int16_t>::max();
-  float kQuantizedTolerance = 2.0 * kQuantizedStep;
-  return kQuantizedTolerance;
+  return kQuantizedStep;
 }
 
 TEST(FloatSubOpModel, NoActivation) {
@@ -194,7 +193,8 @@ TEST(IntegerSubOpModel, WithBroadcast) {
   }
 }
 
-TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) {
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedTestsNoActivation() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::vector<float>> inputs1 = {
       {0.1, 0.2, 0.3, 0.4}, {-0.2, 0.2, 0.4, 0.7}, {-0.01, 0.2, 0.7, 0.3}};
@@ -204,20 +204,30 @@ TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) {
                                              {-0.8, -0.2, -0.1, 0.9},
                                              {-0.61, -0.2, 0.88, -0.2}};
   for (int i = 0; i < inputs1.size(); ++i) {
-    QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {}, -1.0, 1.0},
+    QuantizedSubOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {}, -1.0, 1.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), inputs2[i]);
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                              results[i], kQuantizedTolerance)))
+    EXPECT_THAT(
+        m.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
         << "With test number " << i;
   }
 }
 
-TEST(QuantizedSubOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivationUInt8) {
+  QuantizedTestsNoActivation<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt8) {
+  QuantizedTestsNoActivation<TensorType_INT8, int8_t>();
+}
+
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedTestsActivationRELU_N1_TO_1() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::vector<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
                                              {-0.8, 0.2, 0.7, 0.5}};
@@ -226,57 +236,85 @@ TEST(QuantizedSubOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
   std::vector<std::vector<float>> results = {{-1.0, -0.2, 0.0, 1.0},
                                              {-1.0, -0.2, 1.0, 0.2}};
   for (int i = 0; i < inputs1.size(); ++i) {
-    QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {}, -1.0, 1.0},
+    QuantizedSubOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {}, -1.0, 1.0},
                           ActivationFunctionType_RELU_N1_TO_1);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), inputs2[i]);
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                              results[i], kQuantizedTolerance)))
+    EXPECT_THAT(
+        m.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
         << "With test number " << i;
   }
 }
+TEST(QuantizedSubOpModel, QuantizedTestsActivationRELUN1TO1UInt8) {
+  QuantizedTestsActivationRELU_N1_TO_1<TensorType_UINT8, uint8_t>();
+}
 
-TEST(QuantizedSubOpModel, QuantizedVariousInputShapes) {
+TEST(QuantizedSubOpModel, QuantizedTestsActivationRELUN1TO1Int8) {
+  QuantizedTestsActivationRELU_N1_TO_1<TensorType_INT8, int8_t>();
+}
+
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedVariousInputShapes() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
+    QuantizedSubOpModel m({tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, {}, -3.0, 3.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(),
+                                         {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(),
+                                         {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
+    EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
                 ElementsAreArray(ArrayFloatNear(
                     {-2.1, -0.1, 0.4, 0.3, 0.0, 1.9}, kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
-TEST(QuantizedSubOpModel, QuantizedWithBroadcast) {
+TEST(QuantizedSubOpModel, QuantizedVariousInputShapesUInt8) {
+  QuantizedVariousInputShapes<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedSubOpModel, QuantizedVariousInputShapesInt8) {
+  QuantizedVariousInputShapes<TensorType_INT8, int8_t>();
+}
+
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedWithBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
-                          ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.7});
+    QuantizedSubOpModel m(
+        {tensor_type, test_shapes[i], -3.0, 3.0}, {tensor_type, {}, -3.0, 3.0},
+        {tensor_type, {}, -3.0, 3.0}, ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(),
+                                         {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), {0.7});
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
+    EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
                 ElementsAreArray(ArrayFloatNear(
                     {-2.7, -0.5, 0.0, 0.1, 0.4, 1.3}, kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
+TEST(QuantizedSubOpModel, QuantizedWithBroadcastUInt8) {
+  QuantizedWithBroadcast<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedSubOpModel, QuantizedWithBroadcastInt8) {
+  QuantizedWithBroadcast<TensorType_INT8, int8_t>();
+}
+
 TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt16) {
   const float kMin = -1.f;
   const float kMax =
diff --git a/tensorflow/lite/kernels/subgraph_test_util.cc b/tensorflow/lite/kernels/subgraph_test_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e712be1b516ed0dca5097e66c2d1f20e63a78038
--- /dev/null
+++ b/tensorflow/lite/kernels/subgraph_test_util.cc
@@ -0,0 +1,409 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/subgraph_test_util.h"
+
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+
+namespace ops {
+namespace builtin {
+// ADD and MUL are used to test simple branch.
+TfLiteRegistration* Register_ADD();
+TfLiteRegistration* Register_MUL();
+// ADD and MUL are used to test dynamic sized subgraphs.
+TfLiteRegistration* Register_PAD();
+TfLiteRegistration* Register_LESS_EQUAL();
+}  // namespace builtin
+namespace custom {
+TfLiteRegistration* Register_IF();
+TfLiteRegistration* Register_WHILE();
+}  // namespace custom
+}  // namespace ops
+
+namespace subgraph_test_util {
+
+namespace {
+
+void SetupTensor(Subgraph* subgraph, int tensor_index, TfLiteType type) {
+  ASSERT_EQ(subgraph->SetTensorParametersReadWrite(tensor_index, type, "", 0,
+                                                   nullptr, {}, false),
+            kTfLiteOk);
+}
+
+}  // namespace
+
+SubgraphBuilder::~SubgraphBuilder() {
+  for (auto buffer : buffers_) {
+    free(buffer);
+  }
+}
+
+void SubgraphBuilder::BuildAddSubgraph(Subgraph* subgraph) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput = 2;
+  const int kTensorCount = 3;
+  // kInput1(0) --> +---+
+  //                |ADD| --> kOutput(2)
+  // kInput2(1) --> +---+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteInt32);
+
+  TfLiteAddParams* params =
+      reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
+  params->activation = kTfLiteActNone;
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kInput1, kInput2}, {kOutput}, nullptr, 0, params,
+      ::tflite::ops::builtin::Register_ADD(), &node_index);
+}
+
+// Build a subgraph with an mul op. Helper function for testing.
+void SubgraphBuilder::BuildMulSubgraph(Subgraph* subgraph) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput = 2;
+  const int kTensorCount = 3;
+  // kInput1(0) --> +---+
+  //                |MUL| --> kOutput(2)
+  // kInput2(1) --> +---+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteInt32);
+
+  TfLiteMulParams* params =
+      reinterpret_cast<TfLiteMulParams*>(malloc(sizeof(TfLiteMulParams)));
+  params->activation = kTfLiteActNone;
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kInput1, kInput2}, {kOutput}, nullptr, 0, params,
+      ::tflite::ops::builtin::Register_MUL(), &node_index);
+}
+
+// Build a subgraph with a pad op. Helper function for testing.
+void SubgraphBuilder::BuildPadSubgraph(Subgraph* subgraph) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput = 2;
+  const int kTensorCount = 3;
+  // kInput1(0) --> +---+
+  //                |PAD| --> kOutput(2)
+  // kInput2(1) --> +---+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteInt32);
+
+  TfLitePadParams* params =
+      reinterpret_cast<TfLitePadParams*>(malloc(sizeof(TfLitePadParams)));
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kInput1, kInput2}, {kOutput}, nullptr, 0, params,
+      ::tflite::ops::builtin::Register_PAD(), &node_index);
+}
+
+void SubgraphBuilder::BuildIfSubgraph(Subgraph* subgraph) {
+  const int kCondInput = 0;
+  const int kInput1 = 1;
+  const int kInput2 = 2;
+  const int kOutput = 3;
+  const int kTensorCount = 4;
+
+  // kCondInput(0) --> +----+
+  // kInput1(1)  ----> | IF | --> kOutput(3)
+  // kInput2(2)  ----> +----+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kCondInput, kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kCondInput, kTfLiteBool);
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteInt32);
+
+  flexbuffers::Builder fbb;
+  fbb.Map([&]() {
+    fbb.Int("then_subgraph_index", 1);
+    fbb.Int("else_subgraph_index", 2);
+  });
+  fbb.Finish();
+  const auto& buffer = fbb.GetBuffer();
+
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kCondInput, kInput1, kInput2}, {kOutput},
+      reinterpret_cast<const char*>(buffer.data()), buffer.size(), nullptr,
+      ::tflite::ops::custom::Register_IF(), &node_index);
+}
+
+void SubgraphBuilder::BuildLessEqualCondSubgraph(Subgraph* subgraph, int rhs) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput = 2;
+  const int kConstRhs = 3;
+  const int kTensorCount = 4;
+
+  // kInput1(0) ----> +------------+
+  //                  | LESS_EQUAL | --> kOutput(2)
+  // kConstRhs(3) --> +------------+
+  //
+  // kInput2(1) --> (unused)
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteBool);
+
+  CreateConstantInt32Tensor(subgraph, kConstRhs, {1}, {rhs});
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kInput1, kConstRhs}, {kOutput}, nullptr, 0, nullptr,
+      ::tflite::ops::builtin::Register_LESS_EQUAL(), &node_index);
+}
+
+void SubgraphBuilder::BuildAccumulateLoopBodySubgraph(Subgraph* subgraph) {
+  const int kInputCounter = 0;
+  const int kInputValue = 1;
+  const int kOutputCounter = 2;
+  const int kOutputValue = 3;
+  const int kConstStep = 4;
+  const int kTensorCount = 5;
+
+  // kInputCounter(0) --> +-----+
+  //                      | ADD | --> kOutputCounter(2)
+  // kConstStep(4) -----> +-----+            |
+  //                                         |
+  //                                         v
+  //                                      +-----+
+  //                                      | ADD | --> kOutputValue(3)
+  // kInputValue(1) ----------------------+-----+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInputCounter, kInputValue}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutputCounter, kOutputValue}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInputCounter, kTfLiteInt32);
+  SetupTensor(subgraph, kInputValue, kTfLiteInt32);
+  SetupTensor(subgraph, kOutputCounter, kTfLiteInt32);
+  SetupTensor(subgraph, kOutputValue, kTfLiteInt32);
+  CreateConstantInt32Tensor(subgraph, kConstStep, {1}, {1});
+
+  int node_index;
+  TfLiteAddParams* params =
+      reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
+  params->activation = kTfLiteActNone;
+  subgraph->AddNodeWithParameters({0, 4}, {2}, nullptr, 0, params,
+                                  ::tflite::ops::builtin::Register_ADD(),
+                                  &node_index);
+  params = reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
+  params->activation = kTfLiteActNone;
+  subgraph->AddNodeWithParameters({2, 1}, {3}, nullptr, 0, params,
+                                  ::tflite::ops::builtin::Register_ADD(),
+                                  &node_index);
+}
+
+void SubgraphBuilder::BuildPadLoopBodySubgraph(Subgraph* subgraph,
+                                               const std::vector<int> padding) {
+  const int kInputCounter = 0;
+  const int kInputValue = 1;
+  const int kOutputCounter = 2;
+  const int kOutputValue = 3;
+  const int kConstStep = 4;
+  const int kConstPadding = 5;
+  const int kTensorCount = 6;
+
+  // kInputCounter(0) --> +-----+
+  //                      | ADD | --> kOutputCounter(2)
+  // kConstStep(4) -----> +-----+
+  //
+  // kInputValue(1) ----> +-----+
+  //                      | PAD | --> kOutputValue(3)
+  // kConstPadding(5) --> +-----+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInputCounter, kInputValue}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutputCounter, kOutputValue}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInputCounter, kTfLiteInt32);
+  SetupTensor(subgraph, kInputValue, kTfLiteInt32);
+  SetupTensor(subgraph, kOutputCounter, kTfLiteInt32);
+  SetupTensor(subgraph, kOutputValue, kTfLiteInt32);
+
+  CreateConstantInt32Tensor(subgraph, kConstStep, {1}, {1});
+  ASSERT_EQ(padding.size() % 2, 0);
+  int padding_dims = padding.size();
+  CreateConstantInt32Tensor(subgraph, kConstPadding, {1, padding_dims},
+                            padding);
+
+  int node_index;
+  TfLiteAddParams* add_params =
+      reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
+  add_params->activation = kTfLiteActNone;
+  subgraph->AddNodeWithParameters(
+      {kInputCounter, kConstStep}, {kOutputCounter}, nullptr, 0, add_params,
+      ::tflite::ops::builtin::Register_ADD(), &node_index);
+  TfLitePadParams* pad_params =
+      reinterpret_cast<TfLitePadParams*>(malloc(sizeof(TfLiteAddParams)));
+  subgraph->AddNodeWithParameters(
+      {kInputValue, kConstPadding}, {kOutputValue}, nullptr, 0, pad_params,
+      ::tflite::ops::builtin::Register_PAD(), &node_index);
+}
+
+void SubgraphBuilder::BuildWhileSubgraph(Subgraph* subgraph) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput1 = 2;
+  const int kOutput2 = 3;
+  const int kTensorCount = 4;
+
+  // kInput1(0) --> +-------+ --> kOutput1(2)
+  //                | WHILE |
+  // kInput2(1) --> +-------+ --> kOutput2(3)
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput1, kOutput2}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput1, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput2, kTfLiteInt32);
+
+  flexbuffers::Builder fbb;
+  fbb.Map([&]() {
+    fbb.Int("cond_subgraph_index", 1);
+    fbb.Int("body_subgraph_index", 2);
+  });
+  fbb.Finish();
+  const auto& buffer = fbb.GetBuffer();
+
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {0, 1}, {2, 3}, reinterpret_cast<const char*>(buffer.data()),
+      buffer.size(), nullptr, ::tflite::ops::custom::Register_WHILE(),
+      &node_index);
+}
+
+void SubgraphBuilder::CreateConstantInt32Tensor(Subgraph* subgraph,
+                                                int tensor_index,
+                                                const std::vector<int>& shape,
+                                                const std::vector<int>& data) {
+  ASSERT_GT(shape.size(), 0);
+  int num_elements = 1;
+  for (int dim : shape) {
+    num_elements *= dim;
+  }
+  ASSERT_EQ(data.size(), num_elements);
+  size_t size_in_bytes = sizeof(int32_t) * num_elements;
+  // Maybe aligned.
+  int32_t* buffer = reinterpret_cast<int32_t*>(malloc(size_in_bytes));
+  for (int i = 0; i < num_elements; ++i) {
+    buffer[i] = data[i];
+  }
+  buffers_.push_back(buffer);
+  ASSERT_EQ(subgraph->SetTensorParametersReadOnly(
+                tensor_index, kTfLiteInt32, "", shape, {},
+                reinterpret_cast<const char*>(buffer), size_in_bytes),
+            kTfLiteOk);
+}
+
+void FillIntTensor(TfLiteTensor* tensor, const std::vector<int32_t>& data) {
+  int count = NumElements(tensor);
+  ASSERT_EQ(count, data.size());
+  for (int i = 0; i < count; ++i) {
+    tensor->data.i32[i] = data[i];
+  }
+}
+
+void CheckIntTensor(const TfLiteTensor* tensor, const std::vector<int>& shape,
+                    const std::vector<int32_t>& data) {
+  ASSERT_EQ(tensor->dims->size, shape.size());
+  for (int i = 0; i < tensor->dims->size; ++i) {
+    ASSERT_EQ(tensor->dims->data[i], shape[i]);
+  }
+  ASSERT_EQ(tensor->type, kTfLiteInt32);
+  int count = NumElements(tensor);
+  ASSERT_EQ(count, data.size());
+  for (int i = 0; i < count; ++i) {
+    EXPECT_EQ(tensor->data.i32[i], data[i]);
+  }
+}
+
+void CheckBoolTensor(const TfLiteTensor* tensor, const std::vector<int>& shape,
+                     const std::vector<bool>& data) {
+  ASSERT_EQ(tensor->dims->size, shape.size());
+  for (int i = 0; i < tensor->dims->size; ++i) {
+    ASSERT_EQ(tensor->dims->data[i], shape[i]);
+  }
+  ASSERT_EQ(tensor->type, kTfLiteBool);
+  int count = NumElements(tensor);
+  ASSERT_EQ(count, data.size());
+  for (int i = 0; i < count; ++i) {
+    EXPECT_EQ(tensor->data.b[i], data[i]);
+  }
+}
+
+}  // namespace subgraph_test_util
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/subgraph_test_util.h b/tensorflow/lite/kernels/subgraph_test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..972f1381af2804252461bf81dfbce3563be41c3b
--- /dev/null
+++ b/tensorflow/lite/kernels/subgraph_test_util.h
@@ -0,0 +1,123 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This module provides helper functions for testing the interaction between
+// control flow ops and subgraphs.
+// For convenience, we mostly only use `kTfLiteInt32` in this module.
+
+#ifndef TENSORFLOW_LITE_KERNELS_SUBGRAPH_TEST_UTIL_H_
+#define TENSORFLOW_LITE_KERNELS_SUBGRAPH_TEST_UTIL_H_
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/interpreter.h"
+
+namespace tflite {
+namespace subgraph_test_util {
+
+// TODO(ycling): This file should be renamed as
+// `control_flow_test_util` to avoid confusion. I'll do it immediately
+// in a separated change.
+class SubgraphBuilder {
+ public:
+  ~SubgraphBuilder();
+
+  // Build a subgraph with a single Add op.
+  // 2 inputs. 1 output.
+  void BuildAddSubgraph(Subgraph* subgraph);
+
+  // Build a subgraph with a single Mul op.
+  // 2 inputs. 1 output.
+  void BuildMulSubgraph(Subgraph* subgraph);
+
+  // Build a subgraph with a single Pad op.
+  // 2 inputs. 1 output.
+  void BuildPadSubgraph(Subgraph* subgraph);
+
+  // Build a subgraph with a single If op.
+  // 3 inputs:
+  //   The 1st input is condition with boolean type.
+  //   The 2nd and 3rd inputs are feed input the branch subgraphs.
+  // 1 output.
+  void BuildIfSubgraph(Subgraph* subgraph);
+
+  // Build a subgraph with a single Less op.
+  // The subgraph is used as the condition subgraph for testing `While` op.
+  // 2 inputs:
+  //   The 1st input is a counter with `kTfLiteInt32` type.
+  //   The 2nd input is ignored in this subgraph.
+  // 1 output with `kTfLiteBool` type.
+  //   Equivalent to (input < rhs).
+  void BuildLessEqualCondSubgraph(Subgraph* subgraph, int rhs);
+
+  // An accumulate loop body subgraph. Used to produce triangle number
+  // seqeuence. 2 inputs and 2 outpus
+  //   Equivalent to (counter, value) -> (counter + 1, counter + 1 + value)
+  void BuildAccumulateLoopBodySubgraph(Subgraph* subgraph);
+
+  // A pad loop body subgraph. When used in a loop it will repeatively enlarge
+  // the
+  //   tensor.
+  // 2 inputs and 2 outputs.
+  //   Equivalent to (counter, value) -> (counter + 1, tf.pad(value, padding))
+  // Note the padding is created as a constant tensor.
+  void BuildPadLoopBodySubgraph(Subgraph* subgraph,
+                                const std::vector<int> padding);
+
+  // Build a subgraph with a single While op.
+  // 2 inputs, 2 outputs.
+  void BuildWhileSubgraph(Subgraph* subgraph);
+
+ private:
+  void CreateConstantInt32Tensor(Subgraph* subgraph, int tensor_index,
+                                 const std::vector<int>& shape,
+                                 const std::vector<int>& data);
+  std::vector<void*> buffers_;
+};
+
+class ControlFlowOpTest : public ::testing::Test {
+ public:
+  ControlFlowOpTest()
+      : interpreter_(new Interpreter), builder_(new SubgraphBuilder) {}
+
+  ~ControlFlowOpTest() override {
+    interpreter_.reset();
+    builder_.reset();
+  }
+
+ protected:
+  std::unique_ptr<Interpreter> interpreter_;
+  std::unique_ptr<SubgraphBuilder> builder_;
+};
+
+// Fill a `TfLiteTensor` with a 32-bits integer vector.
+// Preconditions:
+// * The tensor must have `kTfLiteInt32` type.
+// * The tensor must be allocated.
+// * The element count of the tensor must be equal to the length or
+//   the vector.
+void FillIntTensor(TfLiteTensor* tensor, const std::vector<int32_t>& data);
+
+// Check if the shape and int32 data of a tensor is as expected.
+void CheckIntTensor(const TfLiteTensor* tensor, const std::vector<int>& shape,
+                    const std::vector<int32_t>& data);
+// Check if the shape and bool data of a tensor is as expected.
+void CheckBoolTensor(const TfLiteTensor* tensor, const std::vector<int>& shape,
+                     const std::vector<bool>& data);
+
+}  // namespace subgraph_test_util
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_SUBGRAPH_TEST_UTIL_H_
diff --git a/tensorflow/lite/kernels/subgraph_test_util_test.cc b/tensorflow/lite/kernels/subgraph_test_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04e5118b543c1723e1de1875ffd9315991a4dd69
--- /dev/null
+++ b/tensorflow/lite/kernels/subgraph_test_util_test.cc
@@ -0,0 +1,157 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/subgraph_test_util.h"
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/test_util.h"
+
+namespace tflite {
+
+namespace subgraph_test_util {
+
+namespace {
+
+class SubgraphBuilderTest : public ::testing::Test {
+ public:
+  SubgraphBuilderTest()
+      : interpreter_(new Interpreter), builder_(new SubgraphBuilder) {}
+
+  ~SubgraphBuilderTest() override {
+    interpreter_.reset();
+    builder_.reset();
+  }
+
+ protected:
+  void TestAccumelateLoopBody(int input1, int input2, int output1,
+                              int output2) {
+    interpreter_.reset(new Interpreter);
+    builder_->BuildAccumulateLoopBodySubgraph(
+        &interpreter_->primary_subgraph());
+
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1});
+    ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {input1});
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {input2});
+
+    ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+    TfLiteTensor* output_tensor1 =
+        interpreter_->tensor(interpreter_->outputs()[0]);
+    CheckIntTensor(output_tensor1, {1}, {output1});
+    TfLiteTensor* output_tensor2 =
+        interpreter_->tensor(interpreter_->outputs()[1]);
+    CheckIntTensor(output_tensor2, {1}, {output2});
+  }
+
+  std::unique_ptr<Interpreter> interpreter_;
+  std::unique_ptr<SubgraphBuilder> builder_;
+};
+
+TEST_F(SubgraphBuilderTest, TestBuildAddSubgraph) {
+  builder_->BuildAddSubgraph(&interpreter_->primary_subgraph());
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {2});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1, 2});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {5, 7});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {1, 2});
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output, {1, 2}, {6, 9});
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildMulSubgraph) {
+  builder_->BuildMulSubgraph(&interpreter_->primary_subgraph());
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {2});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1, 2});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {5, 7});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {1, 2});
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output, {1, 2}, {5, 14});
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildPadSubgraph) {
+  builder_->BuildPadSubgraph(&interpreter_->primary_subgraph());
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {2});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1, 2});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {5, 7});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {1, 2});
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output, {5}, {0, 5, 7, 0, 0});
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildLessEqualCondSubgraph) {
+  builder_->BuildLessEqualCondSubgraph(&interpreter_->primary_subgraph(), 3);
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {5});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {10, 10});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  // Test [1, 2, 3, 4, 5] <= 3 == [true, true, true, false, false]
+  // (with broadcasting).
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]),
+                {1, 2, 3, 4, 5});
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckBoolTensor(output, {5}, {true, true, true, false, false});
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildAccumulateLoopBodySubgraph) {
+  TestAccumelateLoopBody(1, 1, 2, 3);
+  TestAccumelateLoopBody(2, 3, 3, 6);
+  TestAccumelateLoopBody(3, 6, 4, 10);
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildPadLoopBodySubgraph) {
+  builder_->BuildPadLoopBodySubgraph(&interpreter_->primary_subgraph(), {1, 2});
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {5});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {1});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]),
+                {0, 5, 7, 0, 0});
+
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output1 = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output1, {1}, {2});
+  TfLiteTensor* output2 = interpreter_->tensor(interpreter_->outputs()[1]);
+  CheckIntTensor(output2, {8}, {0, 0, 5, 7, 0, 0, 0, 0});
+}
+
+}  // namespace
+}  // namespace subgraph_test_util
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/topk_v2.cc b/tensorflow/lite/kernels/topk_v2.cc
index 444b01e7b2e055ab4e26a2ea1dce28642dc430b7..64973d7b860fc3089850cc3648ee4fb9da11047e 100644
--- a/tensorflow/lite/kernels/topk_v2.cc
+++ b/tensorflow/lite/kernels/topk_v2.cc
@@ -207,6 +207,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TopK(row_size, num_rows, input->data.uint8, k, output_indexes->data.i32,
            output_values->data.uint8);
       break;
+    case kTfLiteInt8:
+      TopK(row_size, num_rows, input->data.int8, k, output_indexes->data.i32,
+           output_values->data.int8);
+      break;
     case kTfLiteInt32:
       TopK(row_size, num_rows, input->data.i32, k, output_indexes->data.i32,
            output_values->data.i32);
diff --git a/tensorflow/lite/kernels/topk_v2_test.cc b/tensorflow/lite/kernels/topk_v2_test.cc
index 108b8123666aaddcc8ba8438bac82c91ce98d50d..0097ae2f9aece116c963a4b460c2e3ff0fc127c4 100644
--- a/tensorflow/lite/kernels/topk_v2_test.cc
+++ b/tensorflow/lite/kernels/topk_v2_test.cc
@@ -46,6 +46,10 @@ class TopKV2OpModel : public SingleOpModel {
     PopulateTensor<uint8_t>(input_, data);
   }
 
+  void SetInputInt8(std::initializer_list<int8_t> data) {
+    PopulateTensor<int8_t>(input_, data);
+  }
+
   void SetInputInt32(std::initializer_list<int32_t> data) {
     PopulateTensor<int32_t>(input_, data);
   }
@@ -66,6 +70,10 @@ class TopKV2OpModel : public SingleOpModel {
     return ExtractVector<uint8_t>(output_values_);
   }
 
+  std::vector<int8_t> GetValuesInt8() {
+    return ExtractVector<int8_t>(output_values_);
+  }
+
   std::vector<int32_t> GetValuesInt32() {
     return ExtractVector<int32_t>(output_values_);
   }
@@ -128,6 +136,14 @@ TEST(TopKV2OpTest, TypeUint8) {
   EXPECT_THAT(m.GetValuesUInt8(), ElementsAreArray({3, 2, 251, 250}));
 }
 
+TEST(TopKV2OpTest, TypeInt8) {
+  TopKV2OpModel m({2, 3}, TensorType_INT8, 2);
+  m.SetInputInt8({1, 2, 3, -126, 125, -24});
+  m.Invoke();
+  EXPECT_THAT(m.GetIndexes(), ElementsAreArray({2, 1, 1, 2}));
+  EXPECT_THAT(m.GetValuesInt8(), ElementsAreArray({3, 2, 125, -24}));
+}
+
 // Check that int32_t works.
 TEST(TopKV2OpTest, TypeInt32) {
   TopKV2OpModel m({2, 3}, TensorType_INT32, 2);
diff --git a/tensorflow/lite/kernels/transpose.cc b/tensorflow/lite/kernels/transpose.cc
index 7a6d320674ad1c8302f8bf3a9d1d5153223deed3..0ef4972d1a856f84d3511657ec9d9f2f3cc36182 100644
--- a/tensorflow/lite/kernels/transpose.cc
+++ b/tensorflow/lite/kernels/transpose.cc
@@ -117,6 +117,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_TRANSPOSE(reference_ops, uint8_t);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_TRANSPOSE(reference_ops, int8_t);
+      }
+      break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
         TF_LITE_TRANSPOSE(reference_ops, int32_t);
diff --git a/tensorflow/lite/kernels/transpose_test.cc b/tensorflow/lite/kernels/transpose_test.cc
index 93df2c81db8c17de7a36d155c7d26b826c859c99..71644159209cc289329f65d1cac929585f2f4200 100644
--- a/tensorflow/lite/kernels/transpose_test.cc
+++ b/tensorflow/lite/kernels/transpose_test.cc
@@ -25,16 +25,17 @@ namespace {
 
 using ::testing::ElementsAreArray;
 
+template <typename T>
 void RunTestPermutation(const std::vector<int>& shape,
                         const std::vector<int>& perms,
-                        std::vector<float>* input_transposed) {
+                        std::vector<T>* input_transposed) {
   // Count elements and allocate output.
   int count = 1;
   for (auto factor : shape) count *= factor;
   input_transposed->resize(count);
 
   // Create the dummy data
-  std::vector<float> input(count);
+  std::vector<T> input(count);
   for (int i = 0; i < input.size(); i++) {
     input[i] = i;
   }
@@ -64,8 +65,8 @@ void RunTestPermutation(const std::vector<int>& shape,
     params.perm[i] = perms[i];
   }
 
-  reference_ops::Transpose<float>(params, input_shape, input.data(),
-                                  output_shape, input_transposed->data());
+  reference_ops::Transpose<T>(params, input_shape, input.data(), output_shape,
+                              input_transposed->data());
 }
 
 TEST(TransposeTest, TestRefOps1D) {
@@ -125,6 +126,28 @@ TEST(TransposeTest, TestRefOps4D) {
   ASSERT_EQ(out, ref);
 }
 
+TEST(TransposeTest, TestRefOps4DInt8) {
+  std::vector<int8_t> out;
+  // Basic 4d.
+  RunTestPermutation({2, 3, 4, 5}, {2, 0, 1, 3}, &out);
+  ASSERT_EQ(
+      out,
+      std::vector<int8_t>(
+          {0,  1,  2,  3,  4,  20, 21, 22, 23, 24, 40,  41,  42,  43,  44,
+           60, 61, 62, 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104,
+           5,  6,  7,  8,  9,  25, 26, 27, 28, 29, 45,  46,  47,  48,  49,
+           65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109,
+           10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50,  51,  52,  53,  54,
+           70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114,
+           15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55,  56,  57,  58,  59,
+           75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119}));
+  RunTestPermutation({2, 3, 4, 5}, {0, 1, 2, 3}, &out);
+  // Basic identity.
+  std::vector<int8_t> ref(out.size());
+  for (int k = 0; k < ref.size(); k++) ref[k] = k;
+  ASSERT_EQ(out, ref);
+}
+
 class TransposeOpModel : public SingleOpModel {
  public:
   void SetInput(std::initializer_list<float> data) {
diff --git a/tensorflow/lite/kernels/unpack.cc b/tensorflow/lite/kernels/unpack.cc
index 1caffe14f90b8ce9d13d8c781e87bf918c02b9f4..99ad4bb4e817ed435043fb17469381192db843ff 100644
--- a/tensorflow/lite/kernels/unpack.cc
+++ b/tensorflow/lite/kernels/unpack.cc
@@ -52,9 +52,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TF_LITE_ENSURE(context, NumDimensions(input) <= 4);
   TF_LITE_ENSURE(context, NumDimensions(input) > 1);
-  TF_LITE_ENSURE(context, NumDimensions(input) > data->axis);
-  // TODO(renjieliu): Support negative axis.
-  TF_LITE_ENSURE(context, data->axis >= 0);
+  int axis = data->axis;
+  if (axis < 0) {
+    axis += NumDimensions(input);
+  }
+  TF_LITE_ENSURE(context, 0 <= axis && axis < NumDimensions(input));
   if (input->type != kTfLiteInt32 && input->type != kTfLiteFloat32) {
     context->ReportError(context,
                          "Currently pack only supports int32 and float32.");
@@ -67,12 +69,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteIntArray* output_shape = TfLiteIntArrayCreate(NumDimensions(input) - 1);
   int o = 0;
   for (int index = 0; index < NumDimensions(input); ++index) {
-    if (index != data->axis) {
+    if (index != axis) {
       output_shape->data[o++] = input_shape->data[index];
     }
   }
 
-  TF_LITE_ENSURE_EQ(context, data->num, input_shape->data[data->axis]);
+  TF_LITE_ENSURE_EQ(context, data->num, input_shape->data[axis]);
   for (int i = 0; i < data->num; ++i) {
     TfLiteIntArray* copied_output_shape = TfLiteIntArrayCopy(output_shape);
     TfLiteTensor* output = GetOutput(context, node, i);
diff --git a/tensorflow/lite/kernels/unpack_test.cc b/tensorflow/lite/kernels/unpack_test.cc
index 9b60cce549804a59e343f3e26f978679a1624c00..76f7dff93e395414f0e5a69fe4cef151a7517315 100644
--- a/tensorflow/lite/kernels/unpack_test.cc
+++ b/tensorflow/lite/kernels/unpack_test.cc
@@ -28,14 +28,16 @@ template <typename T>
 class UnpackOpModel : public SingleOpModel {
  public:
   UnpackOpModel(const TensorData& input, int axis) {
-    CHECK_LE(axis, input.shape.size());
+    if (axis < 0) {
+      axis += input.shape.size();
+    }
     const int num_outputs = input.shape[axis];
     input_ = AddInput(input);
     for (int i = 0; i < num_outputs; ++i) {
       outputs_.push_back(AddOutput(input.type));
     }
     SetBuiltinOp(BuiltinOperator_UNPACK, BuiltinOptions_UnpackOptions,
-                 CreatePackOptions(builder_, num_outputs, axis).Union());
+                 CreateUnpackOptions(builder_, num_outputs, axis).Union());
     BuildInterpreter({GetShape(input_)});
   }
 
@@ -104,6 +106,44 @@ TEST(UnpackOpTest, FloatThreeOutputsAxisOne) {
   EXPECT_THAT(output_datas[1], ElementsAre(2, 4, 6));
 }
 
+TEST(UnpackOpTest, FloatThreeOutputsNegativeAxisOne) {
+  UnpackOpModel<float> model({TensorType_FLOAT32, {3, 2}}, -1);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 2);
+  EXPECT_THAT(output_shapes[0], ElementsAre(3));
+  EXPECT_THAT(output_shapes[1], ElementsAre(3));
+
+  // Check outputs values.
+  const std::vector<std::vector<float>>& output_datas = model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 2);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 3, 5));
+  EXPECT_THAT(output_datas[1], ElementsAre(2, 4, 6));
+}
+
+TEST(UnpackOpTest, FloatThreeOutputsNegativeAxisTwo) {
+  UnpackOpModel<float> model({TensorType_FLOAT32, {3, 2}}, -2);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 3);
+  EXPECT_THAT(output_shapes[0], ElementsAre(2));
+  EXPECT_THAT(output_shapes[1], ElementsAre(2));
+  EXPECT_THAT(output_shapes[2], ElementsAre(2));
+
+  // Check outputs values.
+  const std::vector<std::vector<float>>& output_datas = model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 3);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 2));
+  EXPECT_THAT(output_datas[1], ElementsAre(3, 4));
+  EXPECT_THAT(output_datas[2], ElementsAre(5, 6));
+}
+
 TEST(UnpackOpTest, FloatOneOutput) {
   UnpackOpModel<float> model({TensorType_FLOAT32, {1, 6}}, 0);
   model.SetInput({1, 2, 3, 4, 5, 6});
diff --git a/tensorflow/lite/kernels/where.cc b/tensorflow/lite/kernels/where.cc
new file mode 100644
index 0000000000000000000000000000000000000000..96ee36f08bc0144ce0e4a66d3d7350a791d26d86
--- /dev/null
+++ b/tensorflow/lite/kernels/where.cc
@@ -0,0 +1,105 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace where {
+
+constexpr int kInputConditionTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                const TfLiteTensor* cond_tensor,
+                                TfLiteTensor* output_tensor) {
+  // Output tensor should have shape:
+  // (num_true, cond_rank), where num_true denotes the number of true values
+  // in condition.
+  const RuntimeShape& cond_shape = GetTensorShape(cond_tensor);
+  const int size = cond_shape.FlatSize();
+  const int cond_rank = cond_shape.DimensionsCount();
+  const bool* cond_data = GetTensorData<bool>(cond_tensor);
+
+  int true_count = 0;
+  for (int i = 0; i < size; ++i) {
+    if (cond_data[i]) {
+      true_count++;
+    }
+  }
+  TfLiteIntArray* output_dims = TfLiteIntArrayCreate(2);
+  output_dims->data[0] = true_count;
+  output_dims->data[1] = cond_rank;
+  return context->ResizeTensor(context, output_tensor, output_dims);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* cond_tensor =
+      GetInput(context, node, kInputConditionTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (cond_tensor->type != kTfLiteBool) {
+    context->ReportError(context,
+                         "Condition tensor must be of type bool, but saw '%s'.",
+                         TfLiteTypeGetName(cond_tensor->type));
+    return kTfLiteError;
+  }
+
+  // As output will be a 2D tensor of indices, we use int32 as data type.
+  output->type = kTfLiteInt32;
+
+  // Exit early if cond is a non-const tensor. Set output tensor to dynamic so
+  // output size can be determined in Eval.
+  if (!IsConstantTensor(cond_tensor)) {
+    SetTensorToDynamic(output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, cond_tensor, output);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* cond_tensor =
+      GetInput(context, node, kInputConditionTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeOutputTensor(context, cond_tensor, output));
+  }
+
+  reference_ops::SelectTrueCoords(GetTensorShape(cond_tensor),
+                                  GetTensorData<bool>(cond_tensor),
+                                  GetTensorData<int32_t>(output));
+  return kTfLiteOk;
+}
+}  // namespace where
+
+TfLiteRegistration* Register_WHERE() {
+  static TfLiteRegistration r = {/*init*/ nullptr, /*free*/ nullptr,
+                                 where::Prepare, where::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/where_test.cc b/tensorflow/lite/kernels/where_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..89bd7c43646f80f8b0adb4ef4026f1d9bc7b43c4
--- /dev/null
+++ b/tensorflow/lite/kernels/where_test.cc
@@ -0,0 +1,161 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseWhereOpModel : public SingleOpModel {
+ public:
+  BaseWhereOpModel(const TensorData& input, const TensorData& output) {
+    input_ = AddInput(input);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_WHERE, BuiltinOptions_WhereOptions,
+                 CreateWhereOptions(builder_).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  int input() { return input_; }
+
+ protected:
+  int input_;
+  int output_;
+};
+
+class IntegerWhereOpModel : public BaseWhereOpModel {
+ public:
+  using BaseWhereOpModel::BaseWhereOpModel;
+
+  std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); }
+};
+
+TEST(WhereOpTest, SelectFromVectorNoResult) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {false, false, false});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput().size(), 0);
+}
+
+TEST(WhereOpTest, SelectFromVector) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, false, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2}));
+}
+
+TEST(WhereOpTest, SelectFromMatrixNoResult) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3, 3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {false, false, false,  //
+                                     false, false, false,  //
+                                     false, false, false});
+  m.Invoke();
+  EXPECT_EQ(m.GetOutput().size(), 0);
+}
+
+TEST(WhereOpTest, SelectFromMatrix1) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3, 1}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, false, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0,  //
+                                               2, 0}));
+}
+
+TEST(WhereOpTest, SelectFromMatrix2) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3, 3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, true, false,   //
+                                     true, false, false,  //
+                                     true, false, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0,  //
+                                               0, 1,  //
+                                               1, 0,  //
+                                               2, 0,  //
+                                               2, 2}));
+}
+
+TEST(WhereOpTest, SelectFromMatrix3) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3, 5}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, false, false, true, true,   //
+                                     false, true, true, false, false,  //
+                                     true, false, true, false, false});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0,  //
+                                               0, 3,  //
+                                               0, 4,  //
+                                               1, 1,  //
+                                               1, 2,  //
+                                               2, 0,  //
+                                               2, 2}));
+}
+
+TEST(WhereOpTest, SelectFromRank3TensorNoResult) {
+  IntegerWhereOpModel m({TensorType_BOOL, {2, 2, 2}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {false, false, false, false,  //
+                                     false, false, false, false});
+  m.Invoke();
+  EXPECT_EQ(m.GetOutput().size(), 0);
+}
+
+TEST(WhereOpTest, SelectFromRank3Tensor1) {
+  IntegerWhereOpModel m({TensorType_BOOL, {2, 1, 3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, false, true,  //
+                                     false, false, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0,  //
+                                               0, 0, 2,  //
+                                               1, 0, 2}));
+}
+
+TEST(WhereOpTest, SelectFromRank3Tensor2) {
+  IntegerWhereOpModel m({TensorType_BOOL, {2, 2, 2}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, true, false, true,  //
+                                     false, false, true, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0,  //
+                                               0, 0, 1,  //
+                                               0, 1, 1,  //
+                                               1, 1, 0,  //
+                                               1, 1, 1}));
+}
+
+TEST(WhereOpTest, SelectFromRank3Tensor3) {
+  IntegerWhereOpModel m({TensorType_BOOL, {2, 3, 2}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, true, false, true, false, false,  //
+                                     false, false, true, false, true, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0,  //
+                                               0, 0, 1,  //
+                                               0, 1, 1,  //
+                                               1, 1, 0,  //
+                                               1, 2, 0,  //
+                                               1, 2, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/while.cc b/tensorflow/lite/kernels/while.cc
new file mode 100644
index 0000000000000000000000000000000000000000..07a48e67506978cea9a91f99c20b638de7ffbab9
--- /dev/null
+++ b/tensorflow/lite/kernels/while.cc
@@ -0,0 +1,312 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/context_util.h"
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+namespace while_kernel {
+
+namespace {
+
+// Propagate tensor shapes and types from `src_tensor_indices` in `src_subgraph`
+// to `dst_tensor_indices` in `dst_subgraph`.
+template <typename SrcVector, typename DstVector>
+TfLiteStatus CopyTensorsShapeAndType(TfLiteContext* context,
+                                     Subgraph* src_subgraph,
+                                     const SrcVector& src_tensor_indices,
+                                     Subgraph* dst_subgraph,
+                                     const DstVector& dst_tensor_indices) {
+  TF_LITE_ENSURE_EQ(context, src_tensor_indices.size(),
+                    dst_tensor_indices.size());
+  for (int i = 0; i < src_tensor_indices.size(); ++i) {
+    const TfLiteTensor* src_tensor =
+        src_subgraph->tensor(src_tensor_indices[i]);
+    std::vector<int> dims(src_tensor->dims->data,
+                          src_tensor->dims->data + src_tensor->dims->size);
+    dst_subgraph->ResizeInputTensor(dst_tensor_indices[i], dims);
+    TfLiteTensor* dst_tensor = dst_subgraph->tensor(dst_tensor_indices[i]);
+    dst_tensor->type = src_tensor->type;
+  }
+  return kTfLiteOk;
+}
+
+// Copy the tensors data from tensors `src_tensor_indices` in `src_subgraph`
+// to `dst_tensor_indices` in `dst_subgraph`.
+template <typename SrcVector, typename DstVector>
+TfLiteStatus CopyTensorsData(TfLiteContext* context, Subgraph* src_subgraph,
+                             const SrcVector& src_tensor_indices,
+                             Subgraph* dst_subgraph,
+                             const DstVector& dst_tensor_indices) {
+  TF_LITE_ENSURE_EQ(context, src_tensor_indices.size(),
+                    dst_tensor_indices.size());
+  for (int i = 0; i < src_tensor_indices.size(); ++i) {
+    const TfLiteTensor* src_tensor =
+        src_subgraph->tensor(src_tensor_indices[i]);
+    TfLiteTensor* dst_tensor = dst_subgraph->tensor(dst_tensor_indices[i]);
+    TF_LITE_ENSURE_EQ(context, src_tensor->bytes, dst_tensor->bytes);
+    memcpy(dst_tensor->data.raw, src_tensor->data.raw, src_tensor->bytes);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus CheckCondOutput(TfLiteContext* context,
+                             const TfLiteTensor* cond_output) {
+  // The condition output must be a single boolean value.
+  TF_LITE_ENSURE_EQ(context, cond_output->type, kTfLiteBool);
+  if (cond_output->dims->size == 0) {
+    // It's okay if it's a 0D scalar.
+    return kTfLiteOk;
+  }
+  // Otherwise it must be 1D with shape [1].
+  TF_LITE_ENSURE_EQ(context, cond_output->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, cond_output->dims->data[0], 1);
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+struct OpData {
+  int cond_subgraph_index;
+  int body_subgraph_index;
+  bool cond_has_dynamic_output_tensors;
+  bool body_has_dynamic_output_tensors;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* op_data = new OpData;
+  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
+  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
+  op_data->cond_subgraph_index = m["cond_subgraph_index"].AsInt32();
+  op_data->body_subgraph_index = m["body_subgraph_index"].AsInt32();
+  op_data->cond_has_dynamic_output_tensors = false;
+  op_data->body_has_dynamic_output_tensors = false;
+  return op_data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+  int num_inputs = node->inputs->size;
+  // The number of outputs should be the same as number of inputs.
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, num_inputs);
+
+  // Check subgraph indices and get subgraphs.
+  Subgraph* this_subgraph = reinterpret_cast<Subgraph*>(context->impl_);
+  auto* subgraphs = this_subgraph->GetSubgraphs();
+  TF_LITE_ENSURE(context, op_data->cond_subgraph_index < subgraphs->size());
+  TF_LITE_ENSURE(context, op_data->body_subgraph_index < subgraphs->size());
+
+  Subgraph* cond_subgraph = (*subgraphs)[op_data->cond_subgraph_index].get();
+  Subgraph* body_subgraph = (*subgraphs)[op_data->body_subgraph_index].get();
+
+  // Check input & output count of the condition subgraph.
+  TF_LITE_ENSURE_EQ(context, cond_subgraph->inputs().size(), num_inputs);
+  TF_LITE_ENSURE_EQ(context, cond_subgraph->outputs().size(), 1);
+
+  // Check input & output count of the body subgraph.
+  TF_LITE_ENSURE_EQ(context, body_subgraph->inputs().size(), num_inputs);
+  TF_LITE_ENSURE_EQ(context, body_subgraph->outputs().size(), num_inputs);
+
+  // Prepare and check the condition subgraph.
+  TF_LITE_ENSURE_OK(
+      context, CopyTensorsShapeAndType(context, this_subgraph,
+                                       TfLiteIntArrayView(node->inputs),
+                                       cond_subgraph, cond_subgraph->inputs()));
+  TF_LITE_ENSURE_OK(context, cond_subgraph->AllocateTensors());
+  TfLiteTensor* cond_output =
+      cond_subgraph->tensor(cond_subgraph->outputs()[0]);
+  // TODO(ycling): Handle the case the cond subgraph has dynamic tensor outputs.
+  // This should rarely happens. In most cases the output is static with shape
+  // [1]. However theoretically intermediate tensors in the cond subgraph
+  // can be dynamic.
+  if (IsDynamicTensor(cond_output)) {
+    op_data->cond_has_dynamic_output_tensors = true;
+  } else {
+    TF_LITE_ENSURE_STATUS(CheckCondOutput(context, cond_output));
+  }
+
+  // Prepare and check the body subgraph.
+  TF_LITE_ENSURE_OK(
+      context, CopyTensorsShapeAndType(context, this_subgraph,
+                                       TfLiteIntArrayView(node->inputs),
+                                       body_subgraph, body_subgraph->inputs()));
+  TF_LITE_ENSURE_OK(context, body_subgraph->AllocateTensors());
+  if (body_subgraph->HasDynamicTensors()) {
+    op_data->body_has_dynamic_output_tensors = true;
+  } else {
+    for (int i = 0; i < num_inputs; ++i) {
+      TfLiteTensor* body_input =
+          body_subgraph->tensor(body_subgraph->inputs()[i]);
+      TfLiteTensor* body_output =
+          body_subgraph->tensor(body_subgraph->outputs()[i]);
+      TF_LITE_ENSURE_EQ(context, body_input->type, body_output->type);
+
+      // TODO(ycling): Support dynamic sized body subgraph.
+      TF_LITE_ENSURE(context, !IsDynamicTensor(body_output));
+      if (!TfLiteIntArrayEqual(body_input->dims, body_output->dims)) {
+        // If the output shape of the body subgraph is static w.r.t. a fixed
+        // input size, but it's different from input size, it's still considered
+        // dynamic. For example: If a subgraph keeps padding its input with a
+        // fixed padding, the output shape is static w.r.t the input shape and
+        // padding, but running it in a loop will keep bloating the tensor.
+        op_data->body_has_dynamic_output_tensors = true;
+        break;
+      }
+    }
+  }
+  for (int i = 0; i < num_inputs; ++i) {
+    TfLiteTensor* output = GetOutput(context, node, i);
+    if (op_data->body_has_dynamic_output_tensors) {
+      SetTensorToDynamic(output);
+    } else {
+      TfLiteTensor* body_output =
+          body_subgraph->tensor(body_subgraph->outputs()[i]);
+      TfLiteIntArray* output_size = TfLiteIntArrayCopy(body_output->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, output, output_size));
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+  Subgraph* this_subgraph = reinterpret_cast<Subgraph*>(context->impl_);
+  auto* subgraphs = this_subgraph->GetSubgraphs();
+  Subgraph* cond_subgraph = (*subgraphs)[op_data->cond_subgraph_index].get();
+  Subgraph* body_subgraph = (*subgraphs)[op_data->body_subgraph_index].get();
+
+  // The follow graph illustrates the current implementation.
+  //
+  // This Subgraph          Cond Subgraph         Body Subgraph
+  // +-----------+   (1)   +------------+   (3)   +------------+
+  // |   WHILE   |-------->|  SUBGRAPH  |-------->|  SUBGRAPH  |
+  // |   INPUT   |        /|   INPUT    |<-----   |   INPUT    |
+  // +-----------+       / +------------+      \  +------------+
+  //                    /        |              \       |
+  //               (6) /         | (2)       (5) \      | (4)
+  //                  /          v                \     v
+  // +-----------+   /     +------------+         +------------+
+  // |   WHILE   |<--      |  SUBGRAPH  |         |  SUBGRAPH  |
+  // |   OUTPUT  |         |   OUTPUT   |         |   OUTPUT   |
+  // +-----------+         +------------+         +------------+
+  //
+  // (1) Copy the inputs of WHILE op to the inputs of condition subgraph.
+  // (2) Invoke condition subgraph.
+  //     Jump to step 5 if result is false.
+  // (3) Copy the inputs of condition subgraph to the inputs of body subgraph.
+  // (4) Invoke body subgraph.
+  // (5) Copy the outputs of body subgraph to the inputs condition subgraph.
+  //     Jump back to step 2!
+  // (6) Copy the inputs of condition subgraph to the outputs of WHILE op.
+  //
+  // If the body subgraph has dynamic sized outputs, it's required to resize the
+  // tensor before copying in step 1, 3, 4 and 6.
+  //
+  // Note the flow is carefully designed to handle the dynamic sized output
+  // case. The loop invariant is: The newest value is in the inputs of condition
+  // subgraph. This is always true before step 2.
+  //
+  // This is the best we can do without sharing tensor buffer across subgraph
+  // boundry. Currently we copy the input / output between the subgraphs. This
+  // isn't optimized yet and a lot of redundent copies are made.
+  // TODO(b/120234921): Optimize and avoid copying tensors between subgraphs.
+  TF_LITE_ENSURE_OK(
+      context,
+      CopyTensorsData(context, this_subgraph, TfLiteIntArrayView(node->inputs),
+                      cond_subgraph, cond_subgraph->inputs()));
+
+  while (true) {
+    TF_LITE_ENSURE_OK(context, cond_subgraph->Invoke());
+    int cond_subgraph_output_index = cond_subgraph->outputs()[0];
+    cond_subgraph->EnsureTensorDataIsReadable(cond_subgraph_output_index);
+    TfLiteTensor* cond_output =
+        cond_subgraph->tensor(cond_subgraph_output_index);
+    if (op_data->cond_has_dynamic_output_tensors) {
+      TF_LITE_ENSURE_STATUS(CheckCondOutput(context, cond_output));
+    }
+
+    if (!cond_output->data.b[0]) {
+      break;
+    }
+    if (op_data->body_has_dynamic_output_tensors) {
+      TF_LITE_ENSURE_OK(context,
+                        CopyTensorsShapeAndType(
+                            context, cond_subgraph, cond_subgraph->inputs(),
+                            body_subgraph, body_subgraph->inputs()));
+      TF_LITE_ENSURE_OK(context, body_subgraph->AllocateTensors());
+    }
+
+    TF_LITE_ENSURE_OK(
+        context,
+        CopyTensorsData(context, cond_subgraph, cond_subgraph->inputs(),
+                        body_subgraph, body_subgraph->inputs()));
+
+    TF_LITE_ENSURE_OK(context, body_subgraph->Invoke());
+
+    for (int tensor_index : body_subgraph->outputs()) {
+      body_subgraph->EnsureTensorDataIsReadable(tensor_index);
+    }
+
+    if (op_data->body_has_dynamic_output_tensors) {
+      TF_LITE_ENSURE_OK(context,
+                        CopyTensorsShapeAndType(
+                            context, body_subgraph, body_subgraph->outputs(),
+                            cond_subgraph, cond_subgraph->inputs()));
+      TF_LITE_ENSURE_OK(context, cond_subgraph->AllocateTensors());
+    }
+
+    TF_LITE_ENSURE_OK(
+        context,
+        CopyTensorsData(context, body_subgraph, body_subgraph->outputs(),
+                        cond_subgraph, cond_subgraph->inputs()));
+  }
+
+  // Note that copying from body's output will fail if body is never invoked.
+  // TODO(b/120234921): Optimize and avoid copying tensors between subgraphs.
+  if (op_data->body_has_dynamic_output_tensors) {
+    TF_LITE_ENSURE_OK(
+        context, CopyTensorsShapeAndType(context, cond_subgraph,
+                                         cond_subgraph->inputs(), this_subgraph,
+                                         TfLiteIntArrayView(node->outputs)));
+  }
+
+  TF_LITE_ENSURE_OK(
+      context,
+      CopyTensorsData(context, cond_subgraph, cond_subgraph->inputs(),
+                      this_subgraph, TfLiteIntArrayView(node->outputs)));
+  return kTfLiteOk;
+}
+
+}  // namespace while_kernel
+
+TfLiteRegistration* Register_WHILE() {
+  static TfLiteRegistration r = {while_kernel::Init, while_kernel::Free,
+                                 while_kernel::Prepare, while_kernel::Eval};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/while_test.cc b/tensorflow/lite/kernels/while_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9946b4a3280116d7cb176f54b94b73bb956a5f71
--- /dev/null
+++ b/tensorflow/lite/kernels/while_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/subgraph_test_util.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+
+using subgraph_test_util::CheckIntTensor;
+using subgraph_test_util::ControlFlowOpTest;
+using subgraph_test_util::FillIntTensor;
+
+namespace {
+
+class WhileTest : public ControlFlowOpTest {};
+
+// The test builds a model that produces the i-th number of
+// triangular number sequence.
+//
+// TODO(ycling): Consider to improve this test case by adding a
+// concat into the body subgraph.
+TEST_F(WhileTest, TestTriangularNumberSequence) {
+  const std::vector<int> expected = {1, 3, 6, 10, 15, 21, 28};
+  for (int i = 0; i < expected.size(); ++i) {
+    interpreter_.reset(new Interpreter);
+    interpreter_->AddSubgraphs(2);
+    builder_->BuildLessEqualCondSubgraph(interpreter_->subgraph(1), i);
+    builder_->BuildAccumulateLoopBodySubgraph(interpreter_->subgraph(2));
+    builder_->BuildWhileSubgraph(&interpreter_->primary_subgraph());
+
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1});
+    ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {1});
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {1});
+
+    ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+    TfLiteTensor* output1 = interpreter_->tensor(interpreter_->outputs()[0]);
+    CheckIntTensor(output1, {1}, {i + 1});
+    TfLiteTensor* output2 = interpreter_->tensor(interpreter_->outputs()[1]);
+    CheckIntTensor(output2, {1}, {expected[i]});
+  }
+}
+
+// This requires dynamic sized subgraphs and it's not supported right now.
+// TODO(ycling): Support dynamic sized subgraphs.
+TEST_F(WhileTest, TestPadLoop) {
+  interpreter_.reset(new Interpreter);
+  interpreter_->AddSubgraphs(2);
+  builder_->BuildLessEqualCondSubgraph(interpreter_->subgraph(1), 3);
+  builder_->BuildPadLoopBodySubgraph(interpreter_->subgraph(2), {1, 2});
+  builder_->BuildWhileSubgraph(&interpreter_->primary_subgraph());
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {2});
+  // This is not supported yet. The test ensures thatit doesn't crash and raises
+  // an error properly.
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {1});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {5, 7});
+
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output1 = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output1, {1}, {4});
+  TfLiteTensor* output2 = interpreter_->tensor(interpreter_->outputs()[1]);
+  CheckIntTensor(output2, {11}, {0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0});
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/minimal_logging.cc b/tensorflow/lite/minimal_logging.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8768ef6e312ec80c9b3653983421e07c662f8e5e
--- /dev/null
+++ b/tensorflow/lite/minimal_logging.cc
@@ -0,0 +1,44 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <cstdarg>
+
+namespace tflite {
+namespace logging_internal {
+
+void MinimalLogger::Log(LogSeverity severity, const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  VLog(severity, format, args);
+  va_end(args);
+}
+
+const char* MinimalLogger::GetSeverityName(LogSeverity severity) {
+  switch (severity) {
+    case TFLITE_LOG_INFO:
+      return "INFO";
+    case TFLITE_LOG_WARNING:
+      return "WARNING";
+    case TFLITE_LOG_ERROR:
+      return "ERROR";
+    default:
+      return "<Unknown severity>";
+  }
+}
+
+}  // namespace logging_internal
+}  // namespace tflite
diff --git a/tensorflow/lite/minimal_logging.h b/tensorflow/lite/minimal_logging.h
new file mode 100644
index 0000000000000000000000000000000000000000..7682ed8edc401762613a9cae582fdf1e16e61f51
--- /dev/null
+++ b/tensorflow/lite/minimal_logging.h
@@ -0,0 +1,56 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_MINIMAL_LOGGING_H_
+#define TENSORFLOW_LITE_MINIMAL_LOGGING_H_
+
+#include <cstdarg>
+
+namespace tflite {
+
+enum LogSeverity {
+  TFLITE_LOG_INFO = 0,
+  TFLITE_LOG_WARNING = 1,
+  TFLITE_LOG_ERROR = 2,
+};
+
+namespace logging_internal {
+
+// Helper class for simple platform-specific console logging. Note that we
+// explicitly avoid the convenience of ostream-style logging to minimize binary
+// size impact.
+class MinimalLogger {
+ public:
+  // Logging hook that takes variadic args.
+  static void Log(LogSeverity severity, const char* format, ...);
+
+  // Logging hook that takes a formatted va_list.
+  static void VLog(LogSeverity severity, const char* format, va_list args);
+
+ private:
+  static const char* GetSeverityName(LogSeverity severity);
+};
+
+}  // namespace logging_internal
+}  // namespace tflite
+
+// Convenience macro for basic internal logging in production builds.
+// Note: This should never be used for debug-type logs, as it will *not* be
+// stripped in release optimized builds. In general, prefer the error reporting
+// APIs for developer-facing errors, and only use this for diagnostic output
+// that should always be logged in user builds.
+#define TFLITE_LOG_PROD(severity, format, ...) \
+  tflite::logging_internal::MinimalLogger::Log(severity, format, ##__VA_ARGS__);
+
+#endif  // TENSORFLOW_LITE_MINIMAL_LOGGING_H_
diff --git a/tensorflow/lite/minimal_logging_android.cc b/tensorflow/lite/minimal_logging_android.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f87e6fa18e1dac7e2b4e093f6d4e91a1e652ba3d
--- /dev/null
+++ b/tensorflow/lite/minimal_logging_android.cc
@@ -0,0 +1,55 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <android/log.h>
+#include <cstdio>
+
+namespace tflite {
+namespace logging_internal {
+namespace {
+
+int GetPlatformSeverity(LogSeverity severity) {
+  switch (severity) {
+    case TFLITE_LOG_INFO:
+      return ANDROID_LOG_INFO;
+    case TFLITE_LOG_WARNING:
+      return ANDROID_LOG_WARN;
+    case TFLITE_LOG_ERROR:
+      return ANDROID_LOG_ERROR;
+    default:
+      return ANDROID_LOG_DEBUG;
+  }
+}
+
+}  // namespace
+
+void MinimalLogger::VLog(LogSeverity severity, const char* format,
+                         va_list args) {
+  // First log to Android's explicit log(cat) API.
+  va_list args_for_android_log;
+  va_copy(args_for_android_log, args);
+  __android_log_vprint(GetPlatformSeverity(severity), "tflite", format, args);
+  va_end(args_for_android_log);
+
+  // Also print to stderr for standard console applications.
+  fprintf(stderr, "%s: ", GetSeverityName(severity));
+  vfprintf(stderr, format, args);
+  fputc('\n', stderr);
+}
+
+}  // namespace logging_internal
+}  // namespace tflite
diff --git a/tensorflow/lite/minimal_logging_default.cc b/tensorflow/lite/minimal_logging_default.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9fa13e47e63a01b5c15ada3b09498fdb755f6376
--- /dev/null
+++ b/tensorflow/lite/minimal_logging_default.cc
@@ -0,0 +1,31 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <cstdio>
+
+namespace tflite {
+namespace logging_internal {
+
+void MinimalLogger::VLog(LogSeverity severity, const char* format,
+                         va_list args) {
+  fprintf(stderr, "%s: ", GetSeverityName(severity));
+  vfprintf(stderr, format, args);
+  fputc('\n', stderr);
+}
+
+}  // namespace logging_internal
+}  // namespace tflite
diff --git a/tensorflow/lite/minimal_logging_ios.cc b/tensorflow/lite/minimal_logging_ios.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a774682a5b42f71d0cc77c49bbcf9a4ec6ef21b7
--- /dev/null
+++ b/tensorflow/lite/minimal_logging_ios.cc
@@ -0,0 +1,47 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <syslog.h>
+#include <cstdarg>
+
+namespace tflite {
+namespace logging_internal {
+namespace {
+
+int GetPlatformSeverity(LogSeverity severity) {
+  switch (severity) {
+    case TFLITE_LOG_INFO:
+      return LOG_INFO;
+    case TFLITE_LOG_WARNING:
+      return LOG_WARNING;
+    case TFLITE_LOG_ERROR:
+      return LOG_ERR;
+    default:
+      return LOG_DEBUG;
+  }
+}
+
+}  // namespace
+
+void MinimalLogger::VLog(LogSeverity severity, const char* format,
+                         va_list args) {
+  // TODO(b/123704468): Use os_log when available.
+  vsyslog(GetPlatformSeverity(severity), format, args);
+}
+
+}  // namespace logging_internal
+}  // namespace tflite
diff --git a/tensorflow/lite/minimal_logging_test.cc b/tensorflow/lite/minimal_logging_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e59425a2b264b72d44477c6484fc0ffea014a750
--- /dev/null
+++ b/tensorflow/lite/minimal_logging_test.cc
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+namespace tflite {
+
+TEST(MinimalLogging, Basic) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(TFLITE_LOG_INFO, "Foo");
+  EXPECT_EQ("INFO: Foo\n", testing::internal::GetCapturedStderr());
+}
+
+TEST(MinimalLogging, BasicFormatted) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(TFLITE_LOG_INFO, "Foo %s %s", "Bar", "Baz");
+  EXPECT_EQ("INFO: Foo Bar Baz\n", testing::internal::GetCapturedStderr());
+}
+
+TEST(MinimalLogging, Warn) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(TFLITE_LOG_WARNING, "One", "");
+  EXPECT_EQ("WARNING: One\n", testing::internal::GetCapturedStderr());
+}
+
+TEST(MinimalLogging, Error) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(TFLITE_LOG_ERROR, "Two");
+  EXPECT_EQ("ERROR: Two\n", testing::internal::GetCapturedStderr());
+}
+
+TEST(MinimalLogging, UnknownSeverity) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(static_cast<LogSeverity>(-1), "Three");
+  EXPECT_EQ("<Unknown severity>: Three\n",
+            testing::internal::GetCapturedStderr());
+}
+
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h
index 573500dbb358eb0484cf9ad801782786e8b6e821..ba7eaf67633c3d1d3cc8c4aac814ef2e5a1fcd8b 100644
--- a/tensorflow/lite/nnapi/NeuralNetworksTypes.h
+++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h
@@ -39,6 +39,7 @@ enum {
   ANEURALNETWORKS_TENSOR_FLOAT32 = 3,
   ANEURALNETWORKS_TENSOR_INT32 = 4,
   ANEURALNETWORKS_TENSOR_QUANT8_ASYMM = 5,
+  ANEURALNETWORKS_TENSOR_QUANT8_SYMM = 13,
 };
 
 /**
diff --git a/tensorflow/lite/nnapi/nnapi_implementation.cc b/tensorflow/lite/nnapi/nnapi_implementation.cc
index 36301f1a83aa380ff83b8fe56b007a1c891ea1ad..bbc0c866e6352de40925d896886317d94814a308 100644
--- a/tensorflow/lite/nnapi/nnapi_implementation.cc
+++ b/tensorflow/lite/nnapi/nnapi_implementation.cc
@@ -45,6 +45,19 @@ int32_t GetAndroidSdkVersion() {
       }
       result = result * 10 + digit;
     }
+    // TODO(levp): remove once SDK gets updated to 29th level
+    // Upgrade SDK version for pre-release Q to be able to test functionality
+    // available from SDK level 29.
+    if (result == 28) {
+      char versionCodename[PROP_VALUE_MAX];
+      const char* versionCodenameProp = "ro.build.version.codename";
+      length = __system_property_get(versionCodenameProp, versionCodename);
+      if (length != 0) {
+        if (versionCodename[0] == 'Q') {
+          return 29;
+        }
+      }
+    }
     return result;
   }
   return 0;
diff --git a/tensorflow/lite/nnapi_delegate.cc b/tensorflow/lite/nnapi_delegate.cc
index 7c6746887974cc501c34ecf5ccfa6369dfa1316b..d0605ef6b355c61ef85b55c601861771975e73c6 100644
--- a/tensorflow/lite/nnapi_delegate.cc
+++ b/tensorflow/lite/nnapi_delegate.cc
@@ -610,6 +610,7 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_SPLIT:
       case tflite::BuiltinOperator_STRIDED_SLICE:
       case tflite::BuiltinOperator_EXP:
+      case tflite::BuiltinOperator_COS:
       case tflite::BuiltinOperator_LOG_SOFTMAX:
       case tflite::BuiltinOperator_DEQUANTIZE:
       case tflite::BuiltinOperator_DELEGATE:
@@ -665,6 +666,10 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_CEIL:
       case tflite::BuiltinOperator_REVERSE_V2:
       case tflite::BuiltinOperator_ADD_N:
+      case tflite::BuiltinOperator_GATHER_ND:
+      case tflite::BuiltinOperator_WHERE:
+      case tflite::BuiltinOperator_RANK:
+      case tflite::BuiltinOperator_ELU:
         logError("Op code %d is currently not delegated to NNAPI", builtin);
         return kTfLiteError;
         break;
diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD
index aa0ffe90e1addcac9e1ef88cd63f797b8dae7e43..a252fcb0c77ef7f6d08ca24d790cab2c2d58c4b5 100644
--- a/tensorflow/lite/python/BUILD
+++ b/tensorflow/lite/python/BUILD
@@ -13,7 +13,6 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/lite/python/interpreter_wrapper:tensorflow_wrap_interpreter_wrapper",
-        "//tensorflow/python:util",
         "//third_party/py/numpy",
     ],
 )
@@ -40,6 +39,14 @@ py_binary(
     srcs = ["tflite_convert.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
+    deps = [":tflite_convert_main_lib"],
+)
+
+py_library(
+    name = "tflite_convert_main_lib",
+    srcs = ["tflite_convert.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [":tflite_convert_lib"],
 )
 
@@ -64,6 +71,8 @@ py_library(
         ":interpreter",
         ":lite_constants",
         ":op_hint",
+        "//tensorflow/lite/experimental/examples/lstm:tflite_lstm_ops",
+        "//tensorflow/lite/python/optimize:calibrator",
         "//tensorflow/python:graph_util",
         "//tensorflow/python:tf_optimizer",
         "//tensorflow/python/keras",
@@ -79,6 +88,22 @@ py_test(
     shard_count = 4,
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",  # TODO(b/125045671) reenable once segfault is fixed.
+        "no_windows",
+    ],
+    deps = [
+        ":lite",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+py_test(
+    name = "lite_v2_test",
+    srcs = ["lite_v2_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
         "no_windows",
     ],
     deps = [
diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py
index 9c603998717019ac8624868b16d720e300a30efd..4d38ee9be498beed9384e7ecf6be147c4086e7c0 100644
--- a/tensorflow/lite/python/convert.py
+++ b/tensorflow/lite/python/convert.py
@@ -443,7 +443,7 @@ def toco_convert_impl(input_data, input_tensors, output_tensors, *args,
   return data
 
 
-@_tf_export("lite.toco_convert")
+@_tf_export(v1=["lite.toco_convert"])
 @deprecation.deprecated(None, "Use `lite.TFLiteConverter` instead.")
 def toco_convert(input_data, input_tensors, output_tensors, *args, **kwargs):
   """Convert a model using TOCO.
diff --git a/tensorflow/lite/python/interpreter.py b/tensorflow/lite/python/interpreter.py
index a1325f0b1ff8bec11f0ad90846154401b1bb0134..9b9516f6d0bdedb30e9ddcb419639920fe6e000f 100644
--- a/tensorflow/lite/python/interpreter.py
+++ b/tensorflow/lite/python/interpreter.py
@@ -216,7 +216,8 @@ class Interpreter(object):
   def get_tensor(self, tensor_index):
     """Gets the value of the input tensor (get a copy).
 
-    If you wish to avoid the copy, use `tensor()`.
+    If you wish to avoid the copy, use `tensor()`. This function cannot be used
+    to read intermediate results.
 
     Args:
       tensor_index: Tensor index of tensor to get. This value can be gotten from
@@ -233,7 +234,8 @@ class Interpreter(object):
     This allows reading and writing to this tensors w/o copies. This more
     closely mirrors the C++ Interpreter class interface's tensor() member, hence
     the name. Be careful to not hold these output references through calls
-    to `allocate_tensors()` and `invoke()`.
+    to `allocate_tensors()` and `invoke()`. This function cannot be used to read
+    intermediate results.
 
     Usage:
 
diff --git a/tensorflow/lite/python/interpreter_test.py b/tensorflow/lite/python/interpreter_test.py
index 7ec56a21c9ffa82e1893d3846d92564539ac34ae..b21779226f62ead3fd4bde5aacdfc393a4d5bff9 100644
--- a/tensorflow/lite/python/interpreter_test.py
+++ b/tensorflow/lite/python/interpreter_test.py
@@ -91,6 +91,41 @@ class InterpreterTest(test_util.TensorFlowTestCase):
     output_data = interpreter.get_tensor(output_details[0]['index'])
     self.assertTrue((expected_output == output_data).all())
 
+  def testString(self):
+    interpreter = interpreter_wrapper.Interpreter(
+        model_path=resource_loader.get_path_to_datafile(
+            'testdata/gather_string.tflite'))
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(2, len(input_details))
+    self.assertEqual('input', input_details[0]['name'])
+    self.assertEqual(np.string_, input_details[0]['dtype'])
+    self.assertTrue(([10] == input_details[0]['shape']).all())
+    self.assertEqual((0.0, 0), input_details[0]['quantization'])
+    self.assertEqual('indices', input_details[1]['name'])
+    self.assertEqual(np.int64, input_details[1]['dtype'])
+    self.assertTrue(([3] == input_details[1]['shape']).all())
+    self.assertEqual((0.0, 0), input_details[1]['quantization'])
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('output', output_details[0]['name'])
+    self.assertEqual(np.string_, output_details[0]['dtype'])
+    self.assertTrue(([3] == output_details[0]['shape']).all())
+    self.assertEqual((0.0, 0), output_details[0]['quantization'])
+
+    test_input = np.array([1, 2, 3], dtype=np.int64)
+    interpreter.set_tensor(input_details[1]['index'], test_input)
+
+    test_input = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'])
+    expected_output = np.array([b'b', b'c', b'd'])
+    interpreter.set_tensor(input_details[0]['index'], test_input)
+    interpreter.invoke()
+
+    output_data = interpreter.get_tensor(output_details[0]['index'])
+    self.assertTrue((expected_output == output_data).all())
+
 
 class InterpreterTestErrorPropagation(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/lite/python/interpreter_wrapper/BUILD b/tensorflow/lite/python/interpreter_wrapper/BUILD
index 6de6fb48f78022652de3cd2cecf7fe51c39e79f6..6ec7ce497a51b9b7b66d680ea9a81ef47df51718 100644
--- a/tensorflow/lite/python/interpreter_wrapper/BUILD
+++ b/tensorflow/lite/python/interpreter_wrapper/BUILD
@@ -6,14 +6,26 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 
+cc_library(
+    name = "numpy",
+    srcs = ["numpy.cc"],
+    hdrs = ["numpy.h"],
+    deps = [
+        "//third_party/py/numpy:headers",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
 cc_library(
     name = "interpreter_wrapper_lib",
     srcs = ["interpreter_wrapper.cc"],
     hdrs = ["interpreter_wrapper.h"],
     deps = [
+        ":numpy",
         ":python_error_reporter",
         ":python_utils",
         "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
         "//tensorflow/lite/kernels:builtin_ops",
         "//third_party/py/numpy:headers",
         "//third_party/python_runtime:headers",
@@ -36,7 +48,9 @@ cc_library(
     srcs = ["python_utils.cc"],
     hdrs = ["python_utils.h"],
     deps = [
+        ":numpy",
         "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
         "//third_party/py/numpy:headers",
         "//third_party/python_runtime:headers",
     ],
diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index 9ccaabbfe97a3b024abc1080de75f2434b64962e..6023587d3b191d8c486dac78b889510ff1c22805 100644
--- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -21,24 +21,10 @@ limitations under the License.
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
 #include "tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h"
 #include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
-
-// Disallow Numpy 1.7 deprecated symbols.
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-
-#include <Python.h>
-
-#include "numpy/arrayobject.h"
-#include "numpy/ufuncobject.h"
-
-#if PY_MAJOR_VERSION >= 3
-#define PY_TO_CPPSTRING PyBytes_AsStringAndSize
-#define CPP_TO_PYSTRING PyBytes_FromStringAndSize
-#else
-#define PY_TO_CPPSTRING PyString_AsStringAndSize
-#define CPP_TO_PYSTRING PyString_FromStringAndSize
-#endif
+#include "tensorflow/lite/string_util.h"
 
 #define TFLITE_PY_CHECK(x)               \
   if ((x) != kTfLiteOk) {                \
@@ -64,11 +50,7 @@ namespace interpreter_wrapper {
 
 namespace {
 
-// Calls PyArray's initialization to initialize all the API pointers. Note that
-// this usage implies only this translation unit can use the pointers. See
-// tensorflow/python/core/numpy.cc for a strategy if we ever need to extend
-// this further.
-void ImportNumpy() { import_array1(); }
+using python_utils::PyDecrefDeleter;
 
 std::unique_ptr<tflite::Interpreter> CreateInterpreter(
     const tflite::FlatBufferModel* model,
@@ -77,7 +59,7 @@ std::unique_ptr<tflite::Interpreter> CreateInterpreter(
     return nullptr;
   }
 
-  ImportNumpy();
+  ::tflite::python::ImportNumpy();
 
   std::unique_ptr<tflite::Interpreter> interpreter;
   if (tflite::InterpreterBuilder(*model, resolver)(&interpreter) != kTfLiteOk) {
@@ -86,10 +68,6 @@ std::unique_ptr<tflite::Interpreter> CreateInterpreter(
   return interpreter;
 }
 
-struct PyDecrefDeleter {
-  void operator()(PyObject* p) const { Py_DECREF(p); }
-};
-
 PyObject* PyArrayFromIntVector(const int* data, npy_intp size) {
   void* pydata = malloc(size * sizeof(int));
   memcpy(pydata, data, size * sizeof(int));
@@ -267,7 +245,7 @@ PyObject* InterpreterWrapper::SetTensor(int i, PyObject* value) {
   }
 
   PyArrayObject* array = reinterpret_cast<PyArrayObject*>(array_safe.get());
-  const TfLiteTensor* tensor = interpreter_->tensor(i);
+  TfLiteTensor* tensor = interpreter_->tensor(i);
 
   if (python_utils::TfLiteTypeFromPyArray(array) != tensor->type) {
     PyErr_Format(PyExc_ValueError,
@@ -279,26 +257,41 @@ PyObject* InterpreterWrapper::SetTensor(int i, PyObject* value) {
   }
 
   if (PyArray_NDIM(array) != tensor->dims->size) {
-    PyErr_SetString(PyExc_ValueError, "Cannot set tensor: Dimension mismatch");
+    PyErr_Format(PyExc_ValueError,
+                 "Cannot set tensor: Dimension mismatch."
+                 " Got %d"
+                 " but expected %d for input %d.",
+                 PyArray_NDIM(array), tensor->dims->size, i);
     return nullptr;
   }
 
   for (int j = 0; j < PyArray_NDIM(array); j++) {
     if (tensor->dims->data[j] != PyArray_SHAPE(array)[j]) {
-      PyErr_SetString(PyExc_ValueError,
-                      "Cannot set tensor: Dimension mismatch");
+      PyErr_Format(PyExc_ValueError,
+                   "Cannot set tensor: Dimension mismatch."
+                   " Got %ld"
+                   " but expected %d for dimension %d of input %d.",
+                   PyArray_SHAPE(array)[j], tensor->dims->data[j], j, i);
       return nullptr;
     }
   }
 
-  size_t size = PyArray_NBYTES(array);
-  if (size != tensor->bytes) {
-    PyErr_Format(PyExc_ValueError,
-                 "numpy array had %zu bytes but expected %zu bytes.", size,
-                 tensor->bytes);
-    return nullptr;
+  if (tensor->type != kTfLiteString) {
+    size_t size = PyArray_NBYTES(array);
+    if (size != tensor->bytes) {
+      PyErr_Format(PyExc_ValueError,
+                   "numpy array had %zu bytes but expected %zu bytes.", size,
+                   tensor->bytes);
+      return nullptr;
+    }
+    memcpy(tensor->data.raw, PyArray_DATA(array), size);
+  } else {
+    DynamicBuffer dynamic_buffer;
+    if (!python_utils::FillStringBufferWithPyArray(value, &dynamic_buffer)) {
+      return nullptr;
+    }
+    dynamic_buffer.WriteToTensor(tensor, nullptr);
   }
-  memcpy(tensor->data.raw, PyArray_DATA(array), size);
   Py_RETURN_NONE;
 }
 
@@ -345,19 +338,51 @@ PyObject* InterpreterWrapper::GetTensor(int i) const {
 
   std::vector<npy_intp> dims(tensor->dims->data,
                              tensor->dims->data + tensor->dims->size);
-  // Make a buffer copy but we must tell Numpy It owns that data or else
-  // it will leak.
-  void* data = malloc(tensor->bytes);
-  if (!data) {
-    PyErr_SetString(PyExc_ValueError, "Malloc to copy tensor failed.");
-    return nullptr;
+  if (tensor->type != kTfLiteString) {
+    // Make a buffer copy but we must tell Numpy It owns that data or else
+    // it will leak.
+    void* data = malloc(tensor->bytes);
+    if (!data) {
+      PyErr_SetString(PyExc_ValueError, "Malloc to copy tensor failed.");
+      return nullptr;
+    }
+    memcpy(data, tensor->data.raw, tensor->bytes);
+    PyObject* np_array =
+        PyArray_SimpleNewFromData(dims.size(), dims.data(), type_num, data);
+    PyArray_ENABLEFLAGS(reinterpret_cast<PyArrayObject*>(np_array),
+                        NPY_ARRAY_OWNDATA);
+    return PyArray_Return(reinterpret_cast<PyArrayObject*>(np_array));
+  } else {
+    // Create a C-order array so the data is contiguous in memory.
+    const int32_t kCOrder = 0;
+    PyObject* py_object =
+        PyArray_EMPTY(dims.size(), dims.data(), NPY_OBJECT, kCOrder);
+
+    if (py_object == nullptr) {
+      PyErr_SetString(PyExc_MemoryError, "Failed to allocate PyArray.");
+      return nullptr;
+    }
+
+    PyArrayObject* py_array = reinterpret_cast<PyArrayObject*>(py_object);
+    PyObject** data = reinterpret_cast<PyObject**>(PyArray_DATA(py_array));
+    auto num_strings = GetStringCount(tensor->data.raw);
+    for (int j = 0; j < num_strings; ++j) {
+      auto ref = GetString(tensor->data.raw, j);
+
+      PyObject* bytes = PyBytes_FromStringAndSize(ref.str, ref.len);
+      if (bytes == nullptr) {
+        Py_DECREF(py_object);
+        PyErr_Format(PyExc_ValueError,
+                     "Could not create PyBytes from string %d of input %d.", j,
+                     i);
+        return nullptr;
+      }
+      // PyArray_EMPTY produces an array full of Py_None, which we must decref.
+      Py_DECREF(data[j]);
+      data[j] = bytes;
+    }
+    return py_object;
   }
-  memcpy(data, tensor->data.raw, tensor->bytes);
-  PyObject* np_array =
-      PyArray_SimpleNewFromData(dims.size(), dims.data(), type_num, data);
-  PyArray_ENABLEFLAGS(reinterpret_cast<PyArrayObject*>(np_array),
-                      NPY_ARRAY_OWNDATA);
-  return PyArray_Return(reinterpret_cast<PyArrayObject*>(np_array));
 }
 
 PyObject* InterpreterWrapper::tensor(PyObject* base_object, int i) {
@@ -394,7 +419,8 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
   char * buf = nullptr;
   Py_ssize_t length;
   std::unique_ptr<PythonErrorReporter> error_reporter(new PythonErrorReporter);
-  if (PY_TO_CPPSTRING(data, &buf, &length) == -1) {
+
+  if (python_utils::ConvertFromPyString(data, &buf, &length) == -1) {
     return nullptr;
   }
   std::unique_ptr<tflite::FlatBufferModel> model =
diff --git a/tensorflow/lite/python/interpreter_wrapper/numpy.cc b/tensorflow/lite/python/interpreter_wrapper/numpy.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ff5403d2a60a66886681db73c4aa69bf43369170
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/numpy.cc
@@ -0,0 +1,25 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define TFLITE_IMPORT_NUMPY  // See numpy.h for explanation.
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+
+namespace tflite {
+namespace python {
+
+void ImportNumpy() { import_array1(); }
+
+}  // namespace python
+}  // namespace tflite
diff --git a/tensorflow/lite/python/interpreter_wrapper/numpy.h b/tensorflow/lite/python/interpreter_wrapper/numpy.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3b013fcb27ad1837dfb83efbcec2ae800850058
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/numpy.h
@@ -0,0 +1,62 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_NUMPY_H_
+#define TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_NUMPY_H_
+
+#ifdef PyArray_Type
+#error "Numpy cannot be included before numpy.h."
+#endif
+
+// Disallow Numpy 1.7 deprecated symbols.
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+
+// To handle PyArray_* calles, numpy defines a static lookup table called
+// PyArray_API, or PY_ARRAY_UNIQUE_SYMBOL, if defined. This causes the
+// PyArray_* pointers to be different for different translation units, unless
+// we take care of selectivel defined NO_IMPORT_ARRAY.
+//
+// Virtually every usage will define NO_IMPORT_ARRAY, and will have access to
+// the lookup table via:
+//   extern void **PyArray_API;
+// In numpy.cc we will define TFLITE_IMPORT_NUMPY, effectively disabling that
+// and instead using:
+//   void **PyArray_API;
+// which is initialized when ImportNumpy() is called.
+//
+// If we don't define PY_ARRAY_UNIQUE_SYMBOL then PyArray_API is a static
+// variable, which causes strange crashes when the pointers are used across
+// translation unit boundaries.
+//
+// For mone info see https://sourceforge.net/p/numpy/mailman/message/5700519
+// See also tensorflow/python/lib/core/numpy.h for a similar approach.
+#define PY_ARRAY_UNIQUE_SYMBOL _tensorflow_numpy_api
+#ifndef TFLITE_IMPORT_NUMPY
+#define NO_IMPORT_ARRAY
+#endif
+
+#include <Python.h>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+
+namespace tflite {
+namespace python {
+
+void ImportNumpy();
+
+}  // namespace python
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_NUMPY_H_
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_utils.cc b/tensorflow/lite/python/interpreter_wrapper/python_utils.cc
index 2dc604356ab7d312f13e5c432fd3a8b05bbbc14c..e3d713630f6d39dd21b3c01cc4c75d4408243827 100644
--- a/tensorflow/lite/python/interpreter_wrapper/python_utils.cc
+++ b/tensorflow/lite/python/interpreter_wrapper/python_utils.cc
@@ -15,9 +15,19 @@ limitations under the License.
 
 #include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
 
+#include <memory>
+
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+
 namespace tflite {
 namespace python_utils {
 
+struct PyObjectDereferencer {
+  void operator()(PyObject* py_object) const { Py_DECREF(py_object); }
+};
+
+using UniquePyObjectRef = std::unique_ptr<PyObject, PyObjectDereferencer>;
+
 int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
   switch (tf_lite_type) {
     case kTfLiteFloat32:
@@ -33,7 +43,7 @@ int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
     case kTfLiteInt64:
       return NPY_INT64;
     case kTfLiteString:
-      return NPY_OBJECT;
+      return NPY_STRING;
     case kTfLiteBool:
       return NPY_BOOL;
     case kTfLiteComplex64:
@@ -73,5 +83,98 @@ TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array) {
   return kTfLiteNoType;
 }
 
+#if PY_VERSION_HEX >= 0x03030000
+bool FillStringBufferFromPyUnicode(PyObject* value,
+                                   DynamicBuffer* dynamic_buffer) {
+  Py_ssize_t len = -1;
+  const char* buf = PyUnicode_AsUTF8AndSize(value, &len);
+  if (buf == NULL) {
+    PyErr_SetString(PyExc_ValueError, "PyUnicode_AsUTF8AndSize() failed.");
+    return false;
+  }
+  dynamic_buffer->AddString(buf, len);
+  return true;
+}
+#else
+bool FillStringBufferFromPyUnicode(PyObject* value,
+                                   DynamicBuffer* dynamic_buffer) {
+  UniquePyObjectRef utemp(PyUnicode_AsUTF8String(value));
+  if (!utemp) {
+    PyErr_SetString(PyExc_ValueError, "PyUnicode_AsUTF8String() failed.");
+    return false;
+  }
+  char* buf = nullptr;
+  Py_ssize_t len = -1;
+  if (PyBytes_AsStringAndSize(utemp.get(), &buf, &len) == -1) {
+    PyErr_SetString(PyExc_ValueError, "PyBytes_AsStringAndSize() failed.");
+    return false;
+  }
+  dynamic_buffer->AddString(buf, len);
+  return true;
+}
+#endif
+
+bool FillStringBufferFromPyString(PyObject* value,
+                                  DynamicBuffer* dynamic_buffer) {
+  if (PyUnicode_Check(value)) {
+    return FillStringBufferFromPyUnicode(value, dynamic_buffer);
+  }
+
+  char* buf = nullptr;
+  Py_ssize_t len = -1;
+  if (PyBytes_AsStringAndSize(value, &buf, &len) == -1) {
+    PyErr_SetString(PyExc_ValueError, "PyBytes_AsStringAndSize() failed.");
+    return false;
+  }
+  dynamic_buffer->AddString(buf, len);
+  return true;
+}
+
+bool FillStringBufferWithPyArray(PyObject* value,
+                                 DynamicBuffer* dynamic_buffer) {
+  PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
+  switch (PyArray_TYPE(array)) {
+    case NPY_OBJECT:
+    case NPY_STRING:
+    case NPY_UNICODE: {
+      UniquePyObjectRef iter(PyArray_IterNew(value));
+      while (PyArray_ITER_NOTDONE(iter.get())) {
+        UniquePyObjectRef item(PyArray_GETITEM(
+            array, reinterpret_cast<char*>(PyArray_ITER_DATA(iter.get()))));
+
+        if (!FillStringBufferFromPyString(item.get(), dynamic_buffer)) {
+          return false;
+        }
+
+        PyArray_ITER_NEXT(iter.get());
+      }
+      return true;
+    }
+    default:
+      break;
+  }
+
+  PyErr_Format(PyExc_ValueError,
+               "Cannot use numpy array of type %d for string tensor.",
+               PyArray_TYPE(array));
+  return false;
+}
+
+int ConvertFromPyString(PyObject* obj, char** data, Py_ssize_t* length) {
+#if PY_MAJOR_VERSION >= 3
+  return PyBytes_AsStringAndSize(obj, data, length);
+#else
+  return PyString_AsStringAndSize(obj, data, length);
+#endif
+}
+
+PyObject* ConvertToPyString(const char* data, size_t length) {
+#if PY_MAJOR_VERSION >= 3
+  return PyBytes_FromStringAndSize(data, length);
+#else
+  return PyString_FromStringAndSize(data, length);
+#endif
+}
+
 }  // namespace python_utils
 }  // namespace tflite
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_utils.h b/tensorflow/lite/python/interpreter_wrapper/python_utils.h
index 30a44226b8fb1a26ced71f3128dc363ec864e9bd..f4677378cbc177b42c1b802b40beeba86ed605c4 100644
--- a/tensorflow/lite/python/interpreter_wrapper/python_utils.h
+++ b/tensorflow/lite/python/interpreter_wrapper/python_utils.h
@@ -17,22 +17,26 @@ limitations under the License.
 #define TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_UTILS_H_
 
 #include "tensorflow/lite/context.h"
-
-// Disallow Numpy 1.7 deprecated symbols.
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-
-#include <Python.h>
-
-#include "numpy/arrayobject.h"
-#include "numpy/ufuncobject.h"
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+#include "tensorflow/lite/string_util.h"
 
 namespace tflite {
 namespace python_utils {
 
+struct PyDecrefDeleter {
+  void operator()(PyObject* p) const { Py_DECREF(p); }
+};
+
 int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type);
 
 TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array);
 
+bool FillStringBufferWithPyArray(PyObject* value,
+                                 DynamicBuffer* dynamic_buffer);
+
+int ConvertFromPyString(PyObject* obj, char** data, Py_ssize_t* length);
+PyObject* ConvertToPyString(const char* data, size_t length);
+
 }  // namespace python_utils
 }  // namespace tflite
 #endif  // TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_UTILS_H_
diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py
index 3b0aa02b7c1c5215908c86b35525566669a0cd30..c8254518bd0a5a02769a9b503757342c2e9ac52c 100644
--- a/tensorflow/lite/python/lite.py
+++ b/tensorflow/lite/python/lite.py
@@ -12,31 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TensorFlow Lite tooling helper functionality.
+"""TensorFlow Lite tooling helper functionality."""
 
-EXPERIMENTAL: APIs here are unstable and likely to change without notice.
-
-@@TocoConverter
-@@TFLiteConverter
-@@toco_convert
-@@toco_convert_protos
-@@Interpreter
-@@OpHint
-@@convert_op_hints_to_stubs
-@@build_toco_convert_protos
-
-@@TFLITE
-@@GRAPHVIZ_DOT
-
-"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import warnings
+import enum
 from six import PY3
 
 from google.protobuf import text_format as _text_format
 from google.protobuf.message import DecodeError
+from tensorflow.lite.experimental.examples.lstm.rnn import dynamic_rnn  # pylint: disable=unused-import
+from tensorflow.lite.experimental.examples.lstm.rnn_cell import TFLiteLSTMCell  # pylint: disable=unused-import
+from tensorflow.lite.experimental.examples.lstm.rnn_cell import TfLiteRNNCell  # pylint: disable=unused-import
 from tensorflow.lite.python import lite_constants as constants
 from tensorflow.lite.python.convert import build_toco_convert_protos  # pylint: disable=unused-import
 from tensorflow.lite.python.convert import ConverterError  # pylint: disable=unused-import
@@ -52,12 +42,17 @@ from tensorflow.lite.python.convert_saved_model import set_tensor_shapes as _set
 from tensorflow.lite.python.interpreter import Interpreter  # pylint: disable=unused-import
 from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs  # pylint: disable=unused-import
 from tensorflow.lite.python.op_hint import OpHint  # pylint: disable=unused-import
+from tensorflow.lite.python.optimize import calibrator as _calibrator
 from tensorflow.core.framework import graph_pb2 as _graph_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2 as _rewriter_config_pb2
 from tensorflow.core.protobuf import config_pb2 as _config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2 as _meta_graph_pb2
 from tensorflow.python import keras as _keras
 from tensorflow.python.client import session as _session
+from tensorflow.python.eager import def_function as _def_function
+from tensorflow.python.eager import function as _function
+from tensorflow.python.framework import convert_to_constants as _convert_to_constants
+from tensorflow.python.framework import dtypes as _dtypes
 from tensorflow.python.framework import graph_util as _tf_graph_util
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.framework.errors_impl import NotFoundError as _NotFoundError
@@ -71,18 +66,20 @@ from tensorflow.python.util import deprecation as _deprecation
 from tensorflow.python.util.tf_export import tf_export as _tf_export
 
 
-def _run_graph_optimizations(graph_def, input_arrays, output_arrays):
+def _run_graph_optimizations(graph_def, input_arrays, output_arrays,
+                             graph=None):
   """Apply standard TensorFlow optimizations to the graph_def.
 
   Args:
     graph_def: Frozen GraphDef to be optimized.
     input_arrays: List of arrays that are considered inputs of the graph.
     output_arrays: List of arrays that are considered outputs of the graph.
+    graph: TensorFlow Graph. Required when Eager mode is enabled. (default None)
 
   Returns:
     A new, optimized GraphDef.
   """
-  meta_graph = _export_meta_graph(graph_def=graph_def)
+  meta_graph = _export_meta_graph(graph_def=graph_def, graph=graph)
 
   # We need to add a collection called 'train_op' so that grappler
   # knows what the outputs are.
@@ -100,6 +97,196 @@ def _run_graph_optimizations(graph_def, input_arrays, output_arrays):
   return _tf_optimizer.OptimizeGraph(config, meta_graph)
 
 
+@_tf_export("lite.Optimize")
+class Optimize(enum.Enum):
+  """Enum defining the optimizations to apply when generating tflite graphs.
+
+  Some optimizations may come at the cost of accuracy.
+  """
+
+  # Optimize for size.
+  #
+  # Optimizations that reduce the size of the model.
+  # The model size will be reduced. Optimizations can include quantizing the
+  # weights of the floating point model.
+  OPTIMIZE_FOR_SIZE = "OPTIMIZE_FOR_SIZE"
+
+  # Optimize for latency.
+  #
+  # Optimizations that reduce the latency of the model.
+  # The model latency will be reduced. Optimizations can include quantizing the
+  # weights of the floating point model.
+  OPTIMIZE_FOR_LATENCY = "OPTIMIZE_FOR_LATENCY"
+
+  def __str__(self):
+    return self.value
+
+
+@_tf_export("lite.RepresentativeDataset")
+class RepresentativeDataset(object):
+  """Representative dataset to evaluate optimizations.
+
+  A representative dataset that can be used to evaluate optimizations by the
+  converter. E.g. converter can use these examples to estimate (min, max) ranges
+  by calibrating the model on inputs. This can allow converter to quantize a
+  converted floating point model.
+  """
+
+  def __init__(self, input_gen, output_gen=None):
+    """Creates a representative dataset.
+
+    Args:
+      input_gen: an input generator that can be used to generate input samples
+        for the model. This must be a callable object that returns an object
+        that supports the `iter()` protocol (e.g. a generator function). The
+        elements generated must have same type and shape as inputs to the model.
+      output_gen: (optional) an output generator that can be used to generate
+        output samples for the model. This must be a callable object that
+        returns an object that supports the `iter()` protocol (e.g. a generator
+        function). The elements generated must have same type and shape as
+        outputs to the model. (default None)
+    """
+    self.input_gen = input_gen
+    self.output_gen = output_gen
+
+
+class TFLiteConverterV2(object):
+  """Converts a TensorFlow model into TensorFlow Lite model.
+
+  Attributes:
+    allow_custom_ops: Boolean indicating whether to allow custom operations.
+      When false any unknown operation is an error. When true, custom ops are
+      created for any op that is unknown. The developer will need to provide
+      these to the TensorFlow Lite runtime with a custom resolver. (default
+      False)
+    target_ops: Experimental flag, subject to change. Set of OpsSet options
+      indicating which converter to use. (default set([OpsSet.TFLITE_BUILTINS]))
+    optimizations: Experimental flag, subject to change, A list of optimizations
+      to apply when converting the model. The converter applies the
+      optimizations by giving priority to the optimizations specified earlier in
+      the list. E.g. `[optimize.OPTIMIZE_FOR_SIZE,
+      optimize.OPTIMIZE_FOR_LATENCY]` requires the converter to do both size and
+      latency optimizations giving priority to size optimizations over latency
+      optimizations.
+    representative_dataset: a representative dataset that can be used to
+      generate input and output samples for the model. The converter can use the
+      dataset to evaluate different optimizations.
+
+  Example usage:
+
+    ```python
+    # Converting a GraphDef from a ConcreteFunction.
+    converter = lite.TFLiteConverter.from_concrete_function(func)
+    tflite_model = converter.convert()
+    open("converted_model.tflite", "wb").write(tflite_model)
+    ```
+  """
+
+  def __init__(self, func):
+    """Constructor for TFLiteConverter.
+
+    Args:
+      func: TensorFlow ConcreteFunction.
+    """
+    self._func = func
+    self.allow_custom_ops = False
+    self.target_ops = set([OpsSet.TFLITE_BUILTINS])
+    self.representative_dataset = None
+    self.optimizations = []
+
+  @classmethod
+  def from_concrete_function(cls, func):
+    """Creates a TFLiteConverter class from a ConcreteFunction.
+
+    Args:
+      func: TensorFlow ConcreteFunction.
+
+    Returns:
+      TFLiteConverter class.
+    """
+    if not isinstance(func, _function.ConcreteFunction):
+      message = "This function takes in a ConcreteFunction."
+      if isinstance(func, _def_function.Function):
+        message += (" To get the ConcreteFunction from a Function,"
+                    " call from_concrete_function.")
+      raise ValueError(message)
+    return cls(func)
+
+  def convert(self):
+    """Converts a TensorFlow GraphDef based on instance variables.
+
+    Returns:
+      The converted data in serialized format.
+
+    Raises:
+      ValueError:
+        Input shape is not specified.
+        None value for dimension in input_tensor.
+    """
+    graph_def = _convert_to_constants.convert_variables_to_constants_v2(
+        self._func)
+    input_tensors = [
+        tensor for tensor in self._func.inputs
+        if tensor.dtype != _dtypes.resource
+    ]
+    output_tensors = self._func.outputs
+
+    # Run a Grappler pass.
+    graph_def = _run_graph_optimizations(graph_def, input_tensors,
+                                         output_tensors, self._func.graph)
+
+    # Checks dimensions in input tensor.
+    for tensor in input_tensors:
+      # Note that shape_list might be empty for scalar shapes.
+      shape_list = tensor.get_shape().as_list()
+      if None in shape_list[1:]:
+        raise ValueError(
+            "None is only supported in the 1st dimension. Tensor '{0}' has "
+            "invalid shape '{1}'.".format(_tensor_name(tensor), shape_list))
+      elif shape_list and shape_list[0] is None:
+        self._set_batch_size(batch_size=1)
+
+    if self.representative_dataset:
+      if not isinstance(self.representative_dataset, RepresentativeDataset):
+        raise TypeError("representative_dataset must be an instance of "
+                        "RepresentativeDataset")
+      if self.representative_dataset.input_gen is None:
+        raise ValueError(
+            "Provide an input generator for representative_dataset")
+
+    # TODO(shashishekhar): For now use optimizations order is ignored.
+    # Both size and latency optimizations decide whether to apply post
+    # training optimizations.
+    post_training_optimize = bool(
+        len(
+            set(self.optimizations)
+            & set([Optimize.OPTIMIZE_FOR_LATENCY, Optimize.OPTIMIZE_FOR_SIZE])))
+    # Do weights only quantization if there is no dataset for calibration.
+    weights_only_quantize_flag = (
+        post_training_optimize and (self.representative_dataset is None))
+
+    converter_kwargs = {
+        "input_format": constants.TENSORFLOW_GRAPHDEF,
+        "allow_custom_ops": self.allow_custom_ops,
+        "post_training_quantize": weights_only_quantize_flag,
+        "target_ops": self.target_ops,
+    }
+
+    # Converts model.
+    result = _toco_convert_impl(
+        input_data=graph_def,
+        input_tensors=input_tensors,
+        output_tensors=output_tensors,
+        **converter_kwargs)
+
+    if self.representative_dataset and post_training_optimize:
+      calibrate_quantize = _calibrator.Calibrator(result)
+      result = calibrate_quantize.calibrate_and_quantize(
+          self.representative_dataset.input_gen)
+
+    return result
+
+
 @_tf_export("lite.TFLiteConverter")
 class TFLiteConverter(object):
   """Convert a TensorFlow model into `output_format` using TOCO.
@@ -141,10 +328,11 @@ class TFLiteConverter(object):
       created for any op that is unknown. The developer will need to provide
       these to the TensorFlow Lite runtime with a custom resolver.
       (default False)
-    post_training_quantize: Boolean indicating whether to quantize the weights
-      of the converted float model. Model size will be reduced and there will be
-      latency improvements (at the cost of accuracy).
-      (default False)
+    post_training_quantize: deprecated, please specify
+     `[optimize.OPTIMIZE_FOR_SIZE]` for `optimizations` instead. Boolean
+     indicating whether to quantize the weights of the converted float model.
+     Model size will be reduced and there will be latency improvements
+     (at the cost of accuracy). (default False)
     dump_graphviz_dir: Full filepath of folder to dump the graphs at various
       stages of processing GraphViz .dot files. Preferred over
       --output_format=GRAPHVIZ_DOT in order to keep the requirements of the
@@ -154,6 +342,16 @@ class TFLiteConverter(object):
     target_ops: Experimental flag, subject to change. Set of OpsSet
       options indicating which converter to use.
       (default set([OpsSet.TFLITE_BUILTINS]))
+    optimizations: Experimental flag, subject to change, A list of
+      optimizations to apply when converting the model. The converter applies
+      the optimizations by giving priority to the optimizations specified
+      earlier in the list. E.g.
+      `[optimize.OPTIMIZE_FOR_SIZE, optimize.OPTIMIZE_FOR_LATENCY]` requires
+      the converter to do both size and latency optimizations giving priority
+      to size optimizations over latency optimizations.
+    representative_dataset: a representative dataset that can be used to
+      generate input and output samples for the model. The converter can use
+      the dataset to evaluate different optimizations.
 
   Example usage:
 
@@ -216,10 +414,12 @@ class TFLiteConverter(object):
     self.reorder_across_fake_quant = False
     self.change_concat_input_ranges = False
     self.allow_custom_ops = False
-    self.post_training_quantize = False
+    self._post_training_quantize = False
     self.dump_graphviz_dir = None
     self.dump_graphviz_video = False
     self.target_ops = set([OpsSet.TFLITE_BUILTINS])
+    self.representative_dataset = None
+    self.optimizations = []
 
     # Attributes are used by models that cannot be loaded into TensorFlow.
     if not self._has_valid_tensors():
@@ -419,6 +619,27 @@ class TFLiteConverter(object):
     graph_def = _freeze_graph(sess, output_tensors)
     return cls(graph_def, input_tensors, output_tensors)
 
+  def __setattr__(self, name, value):
+    if name == "post_training_quantize":
+      warnings.warn("Property %s is deprecated, "
+                    "please use optimizations=[Optimize.OPTIMIZE_FOR_SIZE]"
+                    " instead." % name)
+      if value:
+        # Use OPTIMIZE_FOR_SIZE for post training for now.
+        self.optimizations = [Optimize.OPTIMIZE_FOR_SIZE]
+      else:
+        self.optimizations = []
+      return
+    object.__setattr__(self, name, value)
+
+  def __getattribute__(self, name):
+    if name == "post_training_quantize":
+      warnings.warn("Property %s is deprecated, "
+                    "please use optimizations=[Optimize.OPTIMIZE_FOR_SIZE]"
+                    " instead." % name)
+      return Optimize.OPTIMIZE_FOR_SIZE in set(self.optimizations)
+    return object.__getattribute__(self, name)
+
   def convert(self):
     """Converts a TensorFlow GraphDef based on instance variables.
 
@@ -463,6 +684,24 @@ class TFLiteConverter(object):
                          "tensors '{0}'.".format(",".join(invalid_stats)))
     else:
       quantized_stats = None
+    if self.representative_dataset:
+      if not isinstance(self.representative_dataset, RepresentativeDataset):
+        raise TypeError(
+            "representative_dataset must be an instance of "
+            "RepresentativeDataset")
+      if self.representative_dataset.input_gen is None:
+        raise ValueError(
+            "Provide an input generator for representative_dataset")
+
+    # TODO(shashishekhar): For now use optimizations order is ignored.
+    # Both size and latency optimizations decide whether to apply post
+    # training optimizations.
+    post_training_optimize = bool(
+        len(set(self.optimizations) & set([Optimize.OPTIMIZE_FOR_LATENCY,
+                                           Optimize.OPTIMIZE_FOR_SIZE])))
+    # Do weights only quantization if there is no dataset for calibration.
+    weights_only_quantize_flag = (
+        post_training_optimize and (self.representative_dataset is None))
 
     converter_kwargs = {
         "inference_type": self.inference_type,
@@ -475,7 +714,7 @@ class TFLiteConverter(object):
         "reorder_across_fake_quant": self.reorder_across_fake_quant,
         "change_concat_input_ranges": self.change_concat_input_ranges,
         "allow_custom_ops": self.allow_custom_ops,
-        "post_training_quantize": self.post_training_quantize,
+        "post_training_quantize": weights_only_quantize_flag,
         "target_ops": self.target_ops,
         "dump_graphviz_dir": self.dump_graphviz_dir,
         "dump_graphviz_video": self.dump_graphviz_video
@@ -504,6 +743,12 @@ class TFLiteConverter(object):
           input_arrays_with_shape=self._input_arrays_with_shape,
           output_arrays=self._output_arrays,
           **converter_kwargs)
+
+    if self.representative_dataset and post_training_optimize:
+      calibrate_quantize = _calibrator.Calibrator(result)
+      result = calibrate_quantize.calibrate_and_quantize(
+          self.representative_dataset.input_gen)
+
     return result
 
   def get_input_arrays(self):
@@ -545,7 +790,7 @@ class TFLiteConverter(object):
       tensor.set_shape(shape)
 
 
-@_tf_export("lite.TocoConverter")
+@_tf_export(v1=["lite.TocoConverter"])
 class TocoConverter(object):
   """Convert a TensorFlow model into `output_format` using TOCO.
 
diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py
index ff91d5e2970aac86cc5b05b4b61741e5ab54ad50..d41b7a75fd1d7523551a37baac8038af2624cb28 100644
--- a/tensorflow/lite/python/lite_test.py
+++ b/tensorflow/lite/python/lite_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.variables import global_variables_initializer as _global_variables_initializer
 from tensorflow.python.platform import gfile
@@ -131,13 +132,13 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     input_details = interpreter.get_input_details()
     self.assertEqual(1, len(input_details))
     self.assertEqual('Placeholder', input_details[0]['name'])
-    self.assertEqual(np.object_, input_details[0]['dtype'])
+    self.assertEqual(np.string_, input_details[0]['dtype'])
     self.assertTrue(([4] == input_details[0]['shape']).all())
 
     output_details = interpreter.get_output_details()
     self.assertEqual(1, len(output_details))
     self.assertEqual('Reshape', output_details[0]['name'])
-    self.assertEqual(np.object_, output_details[0]['dtype'])
+    self.assertEqual(np.string_, output_details[0]['dtype'])
     self.assertTrue(([2, 2] == output_details[0]['shape']).all())
     # TODO(b/122659643): Test setting/getting string data via the python
     # interpreter API after support has been added.
@@ -481,6 +482,29 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
     self.assertTrue(output_details[0]['quantization'][0] > 0)  # scale
 
+  def testPostTrainingQuantizeDeprecatedAttribute(self):
+    in_tensor_1 = array_ops.placeholder(
+        shape=[33, 33], dtype=dtypes.float32, name='inputA')
+    in_tensor_2 = constant_op.constant(
+        np.random.uniform(low=-10., high=10., size=(33, 33)),
+        shape=[33, 33],
+        dtype=dtypes.float32,
+        name='inputB')
+    out_tensor = math_ops.matmul(in_tensor_1, in_tensor_2, name='output')
+    sess = session.Session()
+
+    quantized_converter = lite.TFLiteConverter.from_session(
+        sess, [in_tensor_1], [out_tensor])
+    self.assertFalse(quantized_converter.post_training_quantize)
+
+    quantized_converter.post_training_quantize = True
+    self.assertTrue(quantized_converter.post_training_quantize)
+    self.assertEqual(quantized_converter.optimizations,
+                     [lite.Optimize.OPTIMIZE_FOR_SIZE])
+
+    quantized_tflite = quantized_converter.convert()
+    self.assertTrue(quantized_tflite)
+
   def testPostTrainingQuantize(self):
     np.random.seed(0)
     # We need the tensor to have more than 1024 elements for quantize_weights
@@ -504,7 +528,53 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     # Convert quantized weights model.
     quantized_converter = lite.TFLiteConverter.from_session(
         sess, [in_tensor_1], [out_tensor])
-    quantized_converter.post_training_quantize = True
+    quantized_converter.optimizations = [lite.Optimize.OPTIMIZE_FOR_SIZE]
+    quantized_tflite = quantized_converter.convert()
+    self.assertTrue(quantized_tflite)
+
+    # Ensure that the quantized weights tflite model is smaller.
+    self.assertTrue(len(quantized_tflite) < len(float_tflite))
+
+  def testPostTrainingCalibrateAndQuantize(self):
+    np.random.seed(0)
+    # Create a mobilenet like model.
+    output_channel = 16
+    depth_multiplier = 1
+    inp = array_ops.placeholder(dtype=dtypes.float32, shape=(1, 5, 5, 3))
+    conv = nn_ops.conv2d(
+        inp,
+        filter=array_ops.zeros([3, 3, 3, output_channel]),
+        strides=[1, 1, 1, 1],
+        padding='SAME')
+    dconv = nn_ops.depthwise_conv2d_native(
+        conv,
+        filter=array_ops.zeros(
+            [16, 16, output_channel, output_channel * depth_multiplier]),
+        strides=[1, 1, 1, 1],
+        padding='SAME')
+    pool = nn_ops.pool(
+        dconv, window_shape=[2, 2], pooling_type='AVG', padding='SAME')
+    max_pool = nn_ops.pool(
+        pool, window_shape=[2, 2], pooling_type='MAX', padding='SAME')
+    output = nn_ops.softmax(max_pool)
+
+    def calibration_gen():
+      for _ in range(10):
+        yield np.random.uniform(-1, 1, size=(1, 5, 5, 3)).astype(np.float32)
+
+    sess = session.Session()
+
+    # Convert float model.
+    float_converter = lite.TFLiteConverter.from_session(sess, [inp], [output])
+    float_tflite = float_converter.convert()
+    self.assertTrue(float_tflite)
+
+    # Convert quantized weights model.
+    quantized_converter = lite.TFLiteConverter.from_session(
+        sess, [inp], [output])
+    quantized_converter.optimizations = [lite.Optimize.OPTIMIZE_FOR_SIZE]
+    quantized_converter.representative_dataset = lite.RepresentativeDataset(
+        calibration_gen)
     quantized_tflite = quantized_converter.convert()
     self.assertTrue(quantized_tflite)
 
diff --git a/tensorflow/lite/python/lite_v2_test.py b/tensorflow/lite/python/lite_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..126a7783486dde402fcf7957d8eaca6a71a93e01
--- /dev/null
+++ b/tensorflow/lite/python/lite_v2_test.py
@@ -0,0 +1,182 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for lite.py functionality related to TensorFlow 2.0."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.lite.python import lite
+from tensorflow.lite.python.interpreter import Interpreter
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model.load import load
+from tensorflow.python.saved_model.save import save
+from tensorflow.python.training.tracking import tracking
+
+
+class FromConcreteFunctionTest(test_util.TensorFlowTestCase):
+
+  def _evaluateTFLiteModel(self, tflite_model, input_data):
+    """Evaluates the model on the `input_data`."""
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    output_details = interpreter.get_output_details()
+
+    for input_tensor, tensor_data in zip(input_details, input_data):
+      interpreter.set_tensor(input_tensor['index'], tensor_data.numpy())
+    interpreter.invoke()
+    return interpreter.get_tensor(output_details[0]['index'])
+
+  @test_util.run_v2_only
+  def testTypeInvalid(self):
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+
+    with self.assertRaises(ValueError) as error:
+      _ = lite.TFLiteConverterV2.from_concrete_function(root.f)
+    self.assertIn('call from_concrete_function', str(error.exception))
+
+  @test_util.run_v2_only
+  def testFloat(self):
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+    concrete_func = root.f.get_concrete_function(input_data)
+
+    # Convert model.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testSizeNone(self):
+    # Test with a shape of None
+    input_data = constant_op.constant(1., shape=None)
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.f = def_function.function(lambda x: root.v1 * x)
+    concrete_func = root.f.get_concrete_function(input_data)
+
+    # Convert model.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testConstSavedModel(self):
+    """Test a basic model with functions to make sure functions are inlined."""
+    self.skipTest('b/124205572')
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(lambda x: 2. * x)
+    to_save = root.f.get_concrete_function(input_data)
+
+    save_dir = os.path.join(self.get_temp_dir(), 'saved_model')
+    save(root, save_dir, to_save)
+    saved_model = load(save_dir)
+    concrete_func = saved_model.signatures['serving_default']
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testVariableSavedModel(self):
+    """Test a basic model with Variables with saving/loading the SavedModel."""
+    self.skipTest('b/124205572')
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+    to_save = root.f.get_concrete_function(input_data)
+
+    save_dir = os.path.join(self.get_temp_dir(), 'saved_model')
+    save(root, save_dir, to_save)
+    saved_model = load(save_dir)
+    concrete_func = saved_model.signatures['serving_default']
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testMultiFunctionModel(self):
+    """Test a basic model with Variables."""
+
+    class BasicModel(tracking.AutoTrackable):
+
+      def __init__(self):
+        self.y = None
+        self.z = None
+
+      @def_function.function
+      def add(self, x):
+        if self.y is None:
+          self.y = variables.Variable(2.)
+        return x + self.y
+
+      @def_function.function
+      def sub(self, x):
+        if self.z is None:
+          self.z = variables.Variable(3.)
+        return x - self.z
+
+    input_data = constant_op.constant(1., shape=[1])
+    root = BasicModel()
+    concrete_func = root.add.get_concrete_function(input_data)
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.add(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/lite/python/optimize/BUILD b/tensorflow/lite/python/optimize/BUILD
index 74b573b81f3b17ffdb2d58d4ca5d845f732723db..51310af14a46ef66d8e968a11ceef55bd5799b81 100644
--- a/tensorflow/lite/python/optimize/BUILD
+++ b/tensorflow/lite/python/optimize/BUILD
@@ -13,11 +13,12 @@ cc_library(
     deps = [
         "//tensorflow/lite:framework",
         "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/python/interpreter_wrapper:numpy",
         "//tensorflow/lite/python/interpreter_wrapper:python_error_reporter",
         "//tensorflow/lite/python/interpreter_wrapper:python_utils",
-        "//tensorflow/lite/tools/optimize:calibration_reader",
-        "//tensorflow/lite/tools/optimize:calibrator_lib",
         "//tensorflow/lite/tools/optimize:quantize_model",
+        "//tensorflow/lite/tools/optimize/calibration:calibration_reader",
+        "//tensorflow/lite/tools/optimize/calibration:calibrator_lib",
         "//third_party/py/numpy:headers",
         "//third_party/python_runtime:headers",
         "@com_google_absl//absl/memory",
diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.cc b/tensorflow/lite/python/optimize/calibration_wrapper.cc
index d6fe1fbdad79d1bfd8415e43a3e5672771b6f877..5a481d4cab7f00372a86e61baa090ba512f7c5bc 100644
--- a/tensorflow/lite/python/optimize/calibration_wrapper.cc
+++ b/tensorflow/lite/python/optimize/calibration_wrapper.cc
@@ -22,28 +22,13 @@ limitations under the License.
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
 #include "tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h"
 #include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
-#include "tensorflow/lite/tools/optimize/calibration_reader.h"
-#include "tensorflow/lite/tools/optimize/calibrator.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_reader.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibrator.h"
 #include "tensorflow/lite/tools/optimize/quantize_model.h"
 
-// Disallow Numpy 1.7 deprecated symbols.
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-
-#include <Python.h>
-
-#include "numpy/arrayobject.h"
-#include "numpy/ufuncobject.h"
-
-#if PY_MAJOR_VERSION >= 3
-#define PY_TO_CPPSTRING PyBytes_AsStringAndSize
-#define CPP_TO_PYSTRING PyBytes_FromStringAndSize
-#else
-#define PY_TO_CPPSTRING PyString_AsStringAndSize
-#define CPP_TO_PYSTRING PyString_FromStringAndSize
-#endif
-
 #define TFLITE_PY_CHECK(x)               \
   if ((x) != kTfLiteOk) {                \
     return error_reporter_->exception(); \
@@ -60,9 +45,7 @@ namespace calibration_wrapper {
 
 namespace {
 
-struct PyDecrefDeleter {
-  void operator()(PyObject* p) const { Py_DECREF(p); }
-};
+using python_utils::PyDecrefDeleter;
 
 std::unique_ptr<tflite::ModelT> CreateMutableModel(const tflite::Model& model) {
   std::unique_ptr<tflite::ModelT> copied_model =
@@ -185,7 +168,8 @@ PyObject* CalibrationWrapper::QuantizeModel() {
     error_reporter_->exception();
     return nullptr;
   }
-  return CPP_TO_PYSTRING(
+
+  return python_utils::ConvertToPyString(
       reinterpret_cast<const char*>(builder.GetCurrentBufferPointer()),
       builder.GetSize());
 }
@@ -196,7 +180,8 @@ PyObject* CalibrationWrapper::QuantizeModel() {
   char* buf = nullptr;
   Py_ssize_t length;
   std::unique_ptr<PythonErrorReporter> error_reporter(new PythonErrorReporter);
-  if (PY_TO_CPPSTRING(data, &buf, &length) == -1) {
+
+  if (python_utils::ConvertFromPyString(data, &buf, &length) == -1) {
     return nullptr;
   }
   std::unique_ptr<tflite::FlatBufferModel> model =
diff --git a/tensorflow/lite/python/testdata/BUILD b/tensorflow/lite/python/testdata/BUILD
index 4689c318b35691ca26cc28b905baa5b5fd926a9e..2fa08e5326990ecda1857fec8eb9caadac1f4102 100644
--- a/tensorflow/lite/python/testdata/BUILD
+++ b/tensorflow/lite/python/testdata/BUILD
@@ -32,9 +32,20 @@ tf_to_tflite(
     ],
 )
 
+tf_to_tflite(
+    name = "gather_string",
+    src = "gather.pbtxt",
+    out = "gather_string.tflite",
+    options = [
+        "--input_arrays=input,indices",
+        "--output_arrays=output",
+    ],
+)
+
 filegroup(
     name = "interpreter_test_data",
     srcs = [
+        ":gather_string",
         ":permute_float",
         ":permute_uint8",
     ],
diff --git a/tensorflow/lite/python/testdata/gather.pbtxt b/tensorflow/lite/python/testdata/gather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b1193c475d3b4b663accf036753bfbe9d8adb7d
--- /dev/null
+++ b/tensorflow/lite/python/testdata/gather.pbtxt
@@ -0,0 +1,93 @@
+node {
+  name: "input"
+  op: "Placeholder"
+  device: "/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+}
+node {
+  name: "indices"
+  op: "Placeholder"
+  device: "/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+      }
+    }
+  }
+}
+node {
+  name: "axis"
+  op: "Const"
+  device: "/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "output"
+  op: "GatherV2"
+  input: "input"
+  input: "indices"
+  input: "axis"
+  device: "/device:CPU:0"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_STRING
+    }
+  }
+}
+versions {
+  producer: 27
+}
diff --git a/tensorflow/lite/schema/BUILD b/tensorflow/lite/schema/BUILD
index ea516764c929080bc42e48a7cfcdd171f2d6cc57..e55419186e16f62f27f9df0201e814cb8936fc27 100644
--- a/tensorflow/lite/schema/BUILD
+++ b/tensorflow/lite/schema/BUILD
@@ -9,6 +9,12 @@ load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 py_binary(
     name = "upgrade_schema",
+    srcs = ["upgrade_schema.py"],
+    deps = [":upgrade_schema_main_lib"],
+)
+
+py_library(
+    name = "upgrade_schema_main_lib",
     srcs = [
         "upgrade_schema.py",
     ],
@@ -39,7 +45,7 @@ py_test(
         "notap",
     ],
     deps = [
-        ":upgrade_schema",
+        ":upgrade_schema_main_lib",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
     ],
diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs
index 098b7c719394b39f7a1dab42817b51a265c5e063..69a0a0acebdc2fa75eed688751085dccc0713479 100644
--- a/tensorflow/lite/schema/schema.fbs
+++ b/tensorflow/lite/schema/schema.fbs
@@ -220,6 +220,11 @@ enum BuiltinOperator : byte {
   CEIL = 104,
   REVERSE_V2 = 105,
   ADD_N = 106,
+  GATHER_ND = 107,
+  COS = 108,
+  WHERE = 109,
+  RANK = 110,
+  ELU = 111,
 }
 
 // Options for the builtin operators.
@@ -306,6 +311,10 @@ union BuiltinOptions {
   UniqueOptions,
   ReverseV2Options,
   AddNOptions,
+  GatherNdOptions,
+  CosOptions,
+  WhereOptions,
+  RankOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -549,6 +558,9 @@ table TransposeOptions {
 table ExpOptions {
 }
 
+table CosOptions {
+}
+
 table ReducerOptions {
   keep_dims: bool;
 }
@@ -643,6 +655,9 @@ table ShapeOptions {
   out_type : TensorType;
 }
 
+table RankOptions {
+}
+
 table PowOptions {
 }
 
@@ -729,6 +744,12 @@ table ReverseV2Options {
 table AddNOptions {
 }
 
+table GatherNdOptions {
+}
+
+table WhereOptions {
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h
index f0a6b00f968f4867aaad971a12fcd0a8810d65ad..ad9ae192db4e5005631c57621d50664dcac132ed 100755
--- a/tensorflow/lite/schema/schema_generated.h
+++ b/tensorflow/lite/schema/schema_generated.h
@@ -139,6 +139,9 @@ struct TransposeOptionsT;
 struct ExpOptions;
 struct ExpOptionsT;
 
+struct CosOptions;
+struct CosOptionsT;
+
 struct ReducerOptions;
 struct ReducerOptionsT;
 
@@ -214,6 +217,9 @@ struct NotEqualOptionsT;
 struct ShapeOptions;
 struct ShapeOptionsT;
 
+struct RankOptions;
+struct RankOptionsT;
+
 struct PowOptions;
 struct PowOptionsT;
 
@@ -277,6 +283,12 @@ struct ReverseV2OptionsT;
 struct AddNOptions;
 struct AddNOptionsT;
 
+struct GatherNdOptions;
+struct GatherNdOptionsT;
+
+struct WhereOptions;
+struct WhereOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -533,11 +545,16 @@ enum BuiltinOperator {
   BuiltinOperator_CEIL = 104,
   BuiltinOperator_REVERSE_V2 = 105,
   BuiltinOperator_ADD_N = 106,
+  BuiltinOperator_GATHER_ND = 107,
+  BuiltinOperator_COS = 108,
+  BuiltinOperator_WHERE = 109,
+  BuiltinOperator_RANK = 110,
+  BuiltinOperator_ELU = 111,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_ADD_N
+  BuiltinOperator_MAX = BuiltinOperator_ELU
 };
 
-inline const BuiltinOperator (&EnumValuesBuiltinOperator())[106] {
+inline const BuiltinOperator (&EnumValuesBuiltinOperator())[111] {
   static const BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -644,7 +661,12 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[106] {
     BuiltinOperator_UNIQUE,
     BuiltinOperator_CEIL,
     BuiltinOperator_REVERSE_V2,
-    BuiltinOperator_ADD_N
+    BuiltinOperator_ADD_N,
+    BuiltinOperator_GATHER_ND,
+    BuiltinOperator_COS,
+    BuiltinOperator_WHERE,
+    BuiltinOperator_RANK,
+    BuiltinOperator_ELU
   };
   return values;
 }
@@ -758,6 +780,11 @@ inline const char * const *EnumNamesBuiltinOperator() {
     "CEIL",
     "REVERSE_V2",
     "ADD_N",
+    "GATHER_ND",
+    "COS",
+    "WHERE",
+    "RANK",
+    "ELU",
     nullptr
   };
   return names;
@@ -852,11 +879,15 @@ enum BuiltinOptions {
   BuiltinOptions_UniqueOptions = 80,
   BuiltinOptions_ReverseV2Options = 81,
   BuiltinOptions_AddNOptions = 82,
+  BuiltinOptions_GatherNdOptions = 83,
+  BuiltinOptions_CosOptions = 84,
+  BuiltinOptions_WhereOptions = 85,
+  BuiltinOptions_RankOptions = 86,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_AddNOptions
+  BuiltinOptions_MAX = BuiltinOptions_RankOptions
 };
 
-inline const BuiltinOptions (&EnumValuesBuiltinOptions())[83] {
+inline const BuiltinOptions (&EnumValuesBuiltinOptions())[87] {
   static const BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -940,7 +971,11 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[83] {
     BuiltinOptions_SplitVOptions,
     BuiltinOptions_UniqueOptions,
     BuiltinOptions_ReverseV2Options,
-    BuiltinOptions_AddNOptions
+    BuiltinOptions_AddNOptions,
+    BuiltinOptions_GatherNdOptions,
+    BuiltinOptions_CosOptions,
+    BuiltinOptions_WhereOptions,
+    BuiltinOptions_RankOptions
   };
   return values;
 }
@@ -1030,6 +1065,10 @@ inline const char * const *EnumNamesBuiltinOptions() {
     "UniqueOptions",
     "ReverseV2Options",
     "AddNOptions",
+    "GatherNdOptions",
+    "CosOptions",
+    "WhereOptions",
+    "RankOptions",
     nullptr
   };
   return names;
@@ -1372,6 +1411,22 @@ template<> struct BuiltinOptionsTraits<AddNOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_AddNOptions;
 };
 
+template<> struct BuiltinOptionsTraits<GatherNdOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_GatherNdOptions;
+};
+
+template<> struct BuiltinOptionsTraits<CosOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_CosOptions;
+};
+
+template<> struct BuiltinOptionsTraits<WhereOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_WhereOptions;
+};
+
+template<> struct BuiltinOptionsTraits<RankOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_RankOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -2059,6 +2114,38 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_AddNOptions ?
       reinterpret_cast<const AddNOptionsT *>(value) : nullptr;
   }
+  GatherNdOptionsT *AsGatherNdOptions() {
+    return type == BuiltinOptions_GatherNdOptions ?
+      reinterpret_cast<GatherNdOptionsT *>(value) : nullptr;
+  }
+  const GatherNdOptionsT *AsGatherNdOptions() const {
+    return type == BuiltinOptions_GatherNdOptions ?
+      reinterpret_cast<const GatherNdOptionsT *>(value) : nullptr;
+  }
+  CosOptionsT *AsCosOptions() {
+    return type == BuiltinOptions_CosOptions ?
+      reinterpret_cast<CosOptionsT *>(value) : nullptr;
+  }
+  const CosOptionsT *AsCosOptions() const {
+    return type == BuiltinOptions_CosOptions ?
+      reinterpret_cast<const CosOptionsT *>(value) : nullptr;
+  }
+  WhereOptionsT *AsWhereOptions() {
+    return type == BuiltinOptions_WhereOptions ?
+      reinterpret_cast<WhereOptionsT *>(value) : nullptr;
+  }
+  const WhereOptionsT *AsWhereOptions() const {
+    return type == BuiltinOptions_WhereOptions ?
+      reinterpret_cast<const WhereOptionsT *>(value) : nullptr;
+  }
+  RankOptionsT *AsRankOptions() {
+    return type == BuiltinOptions_RankOptions ?
+      reinterpret_cast<RankOptionsT *>(value) : nullptr;
+  }
+  const RankOptionsT *AsRankOptions() const {
+    return type == BuiltinOptions_RankOptions ?
+      reinterpret_cast<const RankOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -4991,6 +5078,46 @@ inline flatbuffers::Offset<ExpOptions> CreateExpOptions(
 
 flatbuffers::Offset<ExpOptions> CreateExpOptions(flatbuffers::FlatBufferBuilder &_fbb, const ExpOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct CosOptionsT : public flatbuffers::NativeTable {
+  typedef CosOptions TableType;
+  CosOptionsT() {
+  }
+};
+
+struct CosOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef CosOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  CosOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(CosOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<CosOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct CosOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit CosOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  CosOptionsBuilder &operator=(const CosOptionsBuilder &);
+  flatbuffers::Offset<CosOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<CosOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<CosOptions> CreateCosOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  CosOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<CosOptions> CreateCosOptions(flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct ReducerOptionsT : public flatbuffers::NativeTable {
   typedef ReducerOptions TableType;
   bool keep_dims;
@@ -6237,6 +6364,46 @@ inline flatbuffers::Offset<ShapeOptions> CreateShapeOptions(
 
 flatbuffers::Offset<ShapeOptions> CreateShapeOptions(flatbuffers::FlatBufferBuilder &_fbb, const ShapeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct RankOptionsT : public flatbuffers::NativeTable {
+  typedef RankOptions TableType;
+  RankOptionsT() {
+  }
+};
+
+struct RankOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef RankOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  RankOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(RankOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<RankOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct RankOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit RankOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  RankOptionsBuilder &operator=(const RankOptionsBuilder &);
+  flatbuffers::Offset<RankOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<RankOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<RankOptions> CreateRankOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  RankOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<RankOptions> CreateRankOptions(flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct PowOptionsT : public flatbuffers::NativeTable {
   typedef PowOptions TableType;
   PowOptionsT() {
@@ -7235,6 +7402,86 @@ inline flatbuffers::Offset<AddNOptions> CreateAddNOptions(
 
 flatbuffers::Offset<AddNOptions> CreateAddNOptions(flatbuffers::FlatBufferBuilder &_fbb, const AddNOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct GatherNdOptionsT : public flatbuffers::NativeTable {
+  typedef GatherNdOptions TableType;
+  GatherNdOptionsT() {
+  }
+};
+
+struct GatherNdOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef GatherNdOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  GatherNdOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(GatherNdOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<GatherNdOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct GatherNdOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit GatherNdOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  GatherNdOptionsBuilder &operator=(const GatherNdOptionsBuilder &);
+  flatbuffers::Offset<GatherNdOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<GatherNdOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<GatherNdOptions> CreateGatherNdOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  GatherNdOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<GatherNdOptions> CreateGatherNdOptions(flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct WhereOptionsT : public flatbuffers::NativeTable {
+  typedef WhereOptions TableType;
+  WhereOptionsT() {
+  }
+};
+
+struct WhereOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef WhereOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  WhereOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(WhereOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<WhereOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct WhereOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit WhereOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  WhereOptionsBuilder &operator=(const WhereOptionsBuilder &);
+  flatbuffers::Offset<WhereOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<WhereOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<WhereOptions> CreateWhereOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  WhereOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<WhereOptions> CreateWhereOptions(flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -7614,6 +7861,18 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const AddNOptions *builtin_options_as_AddNOptions() const {
     return builtin_options_type() == BuiltinOptions_AddNOptions ? static_cast<const AddNOptions *>(builtin_options()) : nullptr;
   }
+  const GatherNdOptions *builtin_options_as_GatherNdOptions() const {
+    return builtin_options_type() == BuiltinOptions_GatherNdOptions ? static_cast<const GatherNdOptions *>(builtin_options()) : nullptr;
+  }
+  const CosOptions *builtin_options_as_CosOptions() const {
+    return builtin_options_type() == BuiltinOptions_CosOptions ? static_cast<const CosOptions *>(builtin_options()) : nullptr;
+  }
+  const WhereOptions *builtin_options_as_WhereOptions() const {
+    return builtin_options_type() == BuiltinOptions_WhereOptions ? static_cast<const WhereOptions *>(builtin_options()) : nullptr;
+  }
+  const RankOptions *builtin_options_as_RankOptions() const {
+    return builtin_options_type() == BuiltinOptions_RankOptions ? static_cast<const RankOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -7973,6 +8232,22 @@ template<> inline const AddNOptions *Operator::builtin_options_as<AddNOptions>()
   return builtin_options_as_AddNOptions();
 }
 
+template<> inline const GatherNdOptions *Operator::builtin_options_as<GatherNdOptions>() const {
+  return builtin_options_as_GatherNdOptions();
+}
+
+template<> inline const CosOptions *Operator::builtin_options_as<CosOptions>() const {
+  return builtin_options_as_CosOptions();
+}
+
+template<> inline const WhereOptions *Operator::builtin_options_as<WhereOptions>() const {
+  return builtin_options_as_WhereOptions();
+}
+
+template<> inline const RankOptions *Operator::builtin_options_as<RankOptions>() const {
+  return builtin_options_as_RankOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -9518,6 +9793,29 @@ inline flatbuffers::Offset<ExpOptions> CreateExpOptions(flatbuffers::FlatBufferB
       _fbb);
 }
 
+inline CosOptionsT *CosOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new CosOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void CosOptions::UnPackTo(CosOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<CosOptions> CosOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateCosOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<CosOptions> CreateCosOptions(flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const CosOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateCosOptions(
+      _fbb);
+}
+
 inline ReducerOptionsT *ReducerOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new ReducerOptionsT();
   UnPackTo(_o, _resolver);
@@ -10147,6 +10445,29 @@ inline flatbuffers::Offset<ShapeOptions> CreateShapeOptions(flatbuffers::FlatBuf
       _out_type);
 }
 
+inline RankOptionsT *RankOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new RankOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void RankOptions::UnPackTo(RankOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<RankOptions> RankOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateRankOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<RankOptions> CreateRankOptions(flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const RankOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateRankOptions(
+      _fbb);
+}
+
 inline PowOptionsT *PowOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new PowOptionsT();
   UnPackTo(_o, _resolver);
@@ -10666,6 +10987,52 @@ inline flatbuffers::Offset<AddNOptions> CreateAddNOptions(flatbuffers::FlatBuffe
       _fbb);
 }
 
+inline GatherNdOptionsT *GatherNdOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new GatherNdOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void GatherNdOptions::UnPackTo(GatherNdOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<GatherNdOptions> GatherNdOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateGatherNdOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<GatherNdOptions> CreateGatherNdOptions(flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const GatherNdOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateGatherNdOptions(
+      _fbb);
+}
+
+inline WhereOptionsT *WhereOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new WhereOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void WhereOptions::UnPackTo(WhereOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<WhereOptions> WhereOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateWhereOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<WhereOptions> CreateWhereOptions(flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const WhereOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateWhereOptions(
+      _fbb);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -11252,6 +11619,22 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const AddNOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_GatherNdOptions: {
+      auto ptr = reinterpret_cast<const GatherNdOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_CosOptions: {
+      auto ptr = reinterpret_cast<const CosOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_WhereOptions: {
+      auto ptr = reinterpret_cast<const WhereOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_RankOptions: {
+      auto ptr = reinterpret_cast<const RankOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -11598,6 +11981,22 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const AddNOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_GatherNdOptions: {
+      auto ptr = reinterpret_cast<const GatherNdOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_CosOptions: {
+      auto ptr = reinterpret_cast<const CosOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_WhereOptions: {
+      auto ptr = reinterpret_cast<const WhereOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_RankOptions: {
+      auto ptr = reinterpret_cast<const RankOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -11932,6 +12331,22 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const AddNOptionsT *>(value);
       return CreateAddNOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_GatherNdOptions: {
+      auto ptr = reinterpret_cast<const GatherNdOptionsT *>(value);
+      return CreateGatherNdOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_CosOptions: {
+      auto ptr = reinterpret_cast<const CosOptionsT *>(value);
+      return CreateCosOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_WhereOptions: {
+      auto ptr = reinterpret_cast<const WhereOptionsT *>(value);
+      return CreateWhereOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_RankOptions: {
+      auto ptr = reinterpret_cast<const RankOptionsT *>(value);
+      return CreateRankOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -12266,6 +12681,22 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new AddNOptionsT(*reinterpret_cast<AddNOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_GatherNdOptions: {
+      value = new GatherNdOptionsT(*reinterpret_cast<GatherNdOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_CosOptions: {
+      value = new CosOptionsT(*reinterpret_cast<CosOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_WhereOptions: {
+      value = new WhereOptionsT(*reinterpret_cast<WhereOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_RankOptions: {
+      value = new RankOptionsT(*reinterpret_cast<RankOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -12683,6 +13114,26 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_GatherNdOptions: {
+      auto ptr = reinterpret_cast<GatherNdOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_CosOptions: {
+      auto ptr = reinterpret_cast<CosOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_WhereOptions: {
+      auto ptr = reinterpret_cast<WhereOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_RankOptions: {
+      auto ptr = reinterpret_cast<RankOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/lite/stderr_reporter.cc b/tensorflow/lite/stderr_reporter.cc
index 09eb1d254a608ba2d19c824a323f0b5173afe15f..366a1816ef2b2ef62e093bbe99690eae52fdc8c4 100644
--- a/tensorflow/lite/stderr_reporter.cc
+++ b/tensorflow/lite/stderr_reporter.cc
@@ -13,28 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/stderr_reporter.h"
-#include <cstdarg>
-#include <cstdio>
 
-#ifdef __ANDROID__
-#include <android/log.h>
-#endif
+#include "tensorflow/lite/minimal_logging.h"
 
 namespace tflite {
 
 int StderrReporter::Report(const char* format, va_list args) {
-#ifdef __ANDROID__
-  // On Android stderr is not captured for applications, only for code run from
-  // the shell. Rather than assume all users will set up a custom error
-  // reporter, let's output to logcat here
-  va_list args_for_log;
-  va_copy(args_for_log, args);
-  __android_log_vprint(ANDROID_LOG_ERROR, "tflite", format, args_for_log);
-  va_end(args_for_log);
-#endif
-  const int result = vfprintf(stderr, format, args);
-  fputc('\n', stderr);
-  return result;
+  logging_internal::MinimalLogger::VLog(TFLITE_LOG_ERROR, format, args);
+  return 0;
 }
 
 ErrorReporter* DefaultErrorReporter() {
diff --git a/tensorflow/lite/string_util.h b/tensorflow/lite/string_util.h
index f076db76f2d4ef416e5f7ec98ac2ec0aa94d95c2..cb268ee805c89951e4ef302595339ea0e771b48a 100644
--- a/tensorflow/lite/string_util.h
+++ b/tensorflow/lite/string_util.h
@@ -35,7 +35,7 @@ limitations under the License.
 //   buf.AddString("AB", 2);
 //   # Write content of DynamicBuffer to tensor in format of string tensor
 //   # described above.
-//   buf.WriteToTensor(tensor)
+//   buf.WriteToTensor(tensor, nullptr)
 
 #ifndef TENSORFLOW_LITE_STRING_UTIL_H_
 #define TENSORFLOW_LITE_STRING_UTIL_H_
@@ -83,10 +83,6 @@ class DynamicBuffer {
   // Fill content into a string tensor. Set shape to {num_strings}.
   void WriteToTensorAsVector(TfLiteTensor* tensor);
 
-  // Deprecated. Use WriteToTensorAsVector() or pass in the new shpe.
-  // TODO(b/120230709): remove when people migrate away.
-  void WriteToTensor(TfLiteTensor* tensor) { WriteToTensorAsVector(tensor); }
-
  private:
   // Data buffer to store contents of strings, not including headers.
   std::vector<char> data_;
diff --git a/tensorflow/lite/testdata/test_input.csv b/tensorflow/lite/testdata/test_input.csv
new file mode 100644
index 0000000000000000000000000000000000000000..33894d3063f35a885fb34c3c5b85bb6a4d8e711e
--- /dev/null
+++ b/tensorflow/lite/testdata/test_input.csv
@@ -0,0 +1 @@
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
\ No newline at end of file
diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD
index 3840b52a0ad6a639d67d8240ddfa9177e776c31e..68512b952a9a34c67452b676db97534b1fb3c733 100644
--- a/tensorflow/lite/testing/BUILD
+++ b/tensorflow/lite/testing/BUILD
@@ -10,6 +10,7 @@ load(
     "generated_test_models_all",
 )
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_binary",
@@ -78,6 +79,7 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         ":generate_examples_report",
+        ":string_util_wrapper",
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:graph_util",
         "//third_party/py/numpy",
@@ -159,6 +161,7 @@ cc_library(
     srcs = ["tflite_driver.cc"],
     hdrs = ["tflite_driver.h"],
     deps = [
+        ":join",
         ":split",
         ":test_runner",
         "//tensorflow/lite:builtin_op_data",
@@ -391,4 +394,29 @@ tf_cc_binary(
     ],
 )
 
+cc_library(
+    name = "string_util_lib",
+    srcs = ["string_util.cc"],
+    hdrs = ["string_util.h"],
+    deps = [
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/python/interpreter_wrapper:numpy",
+        "//tensorflow/lite/python/interpreter_wrapper:python_utils",
+        "//third_party/py/numpy:headers",
+        "//third_party/python_runtime:headers",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_py_wrap_cc(
+    name = "string_util_wrapper",
+    srcs = [
+        "string_util.i",
+    ],
+    deps = [
+        ":string_util_lib",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
 tflite_portable_test_suite()
diff --git a/tensorflow/lite/testing/generate_examples.py b/tensorflow/lite/testing/generate_examples.py
index ee68607a0fe07756f8abc03d75a05a8b08c3395c..20c23017c53d0b8929e701ef505c70ba156be821 100644
--- a/tensorflow/lite/testing/generate_examples.py
+++ b/tensorflow/lite/testing/generate_examples.py
@@ -36,6 +36,7 @@ import operator
 import os
 import random
 import re
+import string
 import sys
 import tempfile
 import traceback
@@ -52,6 +53,7 @@ import tensorflow as tf
 from google.protobuf import text_format
 # TODO(aselle): switch to TensorFlow's resource_loader
 from tensorflow.lite.testing import generate_examples_report as report_lib
+from tensorflow.lite.testing import string_util_wrapper
 from tensorflow.python.framework import graph_util as tf_graph_util
 from tensorflow.python.ops import rnn
 
@@ -163,6 +165,16 @@ def toco_options(data_types,
   return s
 
 
+def format_result(t):
+  """Convert a tensor to a format that can be used in test specs."""
+  if t.dtype.kind not in [np.dtype(np.string_).kind, np.dtype(np.object_).kind]:
+    # Output 9 digits after the point to ensure the precision is good enough.
+    values = ["{:.9f}".format(value) for value in list(t.flatten())]
+    return ",".join(values)
+  else:
+    return string_util_wrapper.SerializeAsHexString(t.flatten())
+
+
 def write_examples(fp, examples):
   """Given a list `examples`, write a text format representation.
 
@@ -179,9 +191,7 @@ def write_examples(fp, examples):
     """Write tensor in file format supported by TFLITE example."""
     fp.write("dtype,%s\n" % x.dtype)
     fp.write("shape," + ",".join(map(str, x.shape)) + "\n")
-    # Output 9 digits after the point to ensure the precision is good enough.
-    values = ["{:.9f}".format(value) for value in list(x.flatten())]
-    fp.write("values," + ",".join(values) + "\n")
+    fp.write("values," + format_result(x) + "\n")
 
   fp.write("test_cases,%d\n" % len(examples))
   for example in examples:
@@ -214,11 +224,9 @@ def write_test_cases(fp, model_name, examples):
     fp.write("invoke {\n")
 
     for t in example["inputs"]:
-      values = ["{:.9f}".format(value) for value in list(t.flatten())]
-      fp.write("  input: \"" + ",".join(values) + "\"\n")
+      fp.write("  input: \"" + format_result(t) + "\"\n")
     for t in example["outputs"]:
-      values = ["{:.9f}".format(value) for value in list(t.flatten())]
-      fp.write("  output: \"" + ",".join(values) + "\"\n")
+      fp.write("  output: \"" + format_result(t) + "\"\n")
     fp.write("}\n")
 
 
@@ -230,6 +238,7 @@ _TF_TYPE_INFO = {
     tf.int16: (np.int16, "QUANTIZED_INT16"),
     tf.int64: (np.int64, "INT64"),
     tf.bool: (np.bool, "BOOL"),
+    tf.string: (np.string_, "STRING"),
 }
 
 
@@ -245,6 +254,10 @@ def create_tensor_data(dtype, shape, min_value=-100, max_value=100):
     value = np.random.randint(min_value, max_value+1, shape)
   elif dtype == tf.bool:
     value = np.random.choice([True, False], size=shape)
+  elif dtype == np.string_:
+    # Not the best strings, but they will do for some basic testing.
+    letters = list(string.ascii_uppercase)
+    return np.random.choice(letters, size=shape).astype(dtype)
   return np.dtype(dtype).type(value) if np.isscalar(value) else value.astype(
       dtype)
 
@@ -300,8 +313,13 @@ def make_control_dep_tests(zip_path):
 
   extra_toco_options = ExtraTocoOptions()
   extra_toco_options.drop_control_dependency = True
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
-                    extra_toco_options)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      extra_toco_options,
+      expected_tf_failures=3)
 
 
 def toco_convert(graph_def_str, input_tensors, output_tensors,
@@ -369,7 +387,7 @@ def make_zip_of_tests(zip_path,
                       make_test_inputs,
                       extra_toco_options=ExtraTocoOptions(),
                       use_frozen_graph=False,
-                      expected_tf_success=None):
+                      expected_tf_failures=0):
   """Helper to make a zip file of a bunch of TensorFlow models.
 
   This does a cartestian product of the dictionary of test_parameters and
@@ -389,8 +407,9 @@ def make_zip_of_tests(zip_path,
       `output_tensors` and returns tuple `(input_values, output_values)`.
     extra_toco_options: Additional toco options.
     use_frozen_graph: Whether or not freeze graph before toco converter.
-    expected_tf_success: Number of times tensorflow is supposed to succeed in
-      executing the input graphs. `None` means "unknown".
+    expected_tf_failures: Number of times tensorflow is expected to fail in
+      executing the input graphs. In some cases it is OK for TensorFlow to
+      fail because the one or more combination of parameters is invalid.
 
   Raises:
     RuntimeError: if there are toco errors that can't be ignored.
@@ -551,10 +570,17 @@ def make_zip_of_tests(zip_path,
                    " and %d TOCO converted graphs (%.1f%%"), zip_path,
                   total_conversions, tf_success, toco_success, percent)
 
-  if expected_tf_success is not None and tf_success != expected_tf_success:
-    raise RuntimeError(
-        "Expected TF to succeed %d times, but that happened %d times" %
-        (expected_tf_success, tf_success))
+  tf_failures = parameter_count - tf_success
+
+  if tf_failures / parameter_count > 0.8:
+    raise RuntimeError(("Test for '%s' is not very useful. "
+                        "TensorFlow fails in %d percent of the cases.") %
+                       (zip_path, int(100 * tf_failures / parameter_count)))
+
+  if tf_failures != expected_tf_failures:
+    raise RuntimeError(("Expected TF to fail %d times while generating '%s', "
+                        "but that happened %d times") % (expected_tf_failures,
+                                                         zip_path, tf_failures))
 
   if not FLAGS.ignore_toco_errors and toco_errors > 0:
     raise RuntimeError(
@@ -573,11 +599,12 @@ def make_pool_tests(pool_op_in):
 
   pool_op = pool_op_in
 
-  def f(zip_path):
+  def f(zip_path, expected_tf_failures=0):
     """Actual function that generates examples.
 
     Args:
       zip_path: path to write zip to.
+      expected_tf_failures: number of expected tensorflow failures.
     """
 
     # Chose a set of parameters
@@ -606,20 +633,26 @@ def make_pool_tests(pool_op_in):
       return [input_values], sess.run(
           outputs, feed_dict=dict(zip(inputs, [input_values])))
 
-    make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+    make_zip_of_tests(
+        zip_path,
+        test_parameters,
+        build_graph,
+        build_inputs,
+        expected_tf_failures=expected_tf_failures)
+
   return f
 
 
 def make_l2_pool_tests(zip_path):
-  make_pool_tests(make_l2_pool)(zip_path)
+  make_pool_tests(make_l2_pool)(zip_path, expected_tf_failures=80)
 
 
 def make_avg_pool_tests(zip_path):
-  make_pool_tests(tf.nn.avg_pool)(zip_path)
+  make_pool_tests(tf.nn.avg_pool)(zip_path, expected_tf_failures=80)
 
 
 def make_max_pool_tests(zip_path):
-  make_pool_tests(tf.nn.max_pool)(zip_path)
+  make_pool_tests(tf.nn.max_pool)(zip_path, expected_tf_failures=80)
 
 
 def make_abs_tests(zip_path):
@@ -645,6 +678,32 @@ def make_abs_tests(zip_path):
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
+def make_elu_tests(zip_path):
+  """Make a set of tests to do (float) tf.nn.elu."""
+
+  test_parameters = [
+      {
+          "input_shape": [[], [1], [2, 3], [1, 1, 1, 1], [1, 3, 4, 3],
+                          [3, 15, 14, 3], [3, 1, 2, 4, 6], [2, 2, 3, 4, 5, 6]],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build the graph for the test case."""
+
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    out = tf.nn.elu(input_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    """Build the inputs for the test case."""
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-4, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 def make_relu_tests(zip_path):
   """Make a set of tests to do relu."""
@@ -814,6 +873,9 @@ def make_constant_tests(zip_path):
       "dtype": [tf.float32, tf.int32],
       "input_shape": [[], [1], [2], [1, 1, 1, 1], [2, 2, 2, 2]],
       "constant_is_also_output": [True, False],
+      # This is a regression test for a bug where Toco rejects models with
+      # unread inputs.
+      "has_unread_input": [True, False],
   }]
 
   def build_graph(parameters):
@@ -823,22 +885,28 @@ def make_constant_tests(zip_path):
         shape=parameters["input_shape"])
     constant = tf.constant(
         create_tensor_data(parameters["dtype"], parameters["input_shape"]))
-    out = [tf.maximum(dummy_input, constant)]
+    outputs = [tf.maximum(dummy_input, constant)]
     if parameters["constant_is_also_output"]:
-      out.append(constant)
+      outputs.append(constant)
+    inputs = [dummy_input]
+    if parameters["has_unread_input"]:
+      unread_input = tf.placeholder(
+          dtype=parameters["dtype"],
+          name="unread_input",
+          shape=parameters["input_shape"])
+      inputs.append(unread_input)
 
-    return [dummy_input], out
+    return inputs, outputs
 
   def build_inputs(parameters, sess, inputs, outputs):
     dummy_input = np.zeros(
         parameters["input_shape"], dtype=_TF_TYPE_INFO[parameters["dtype"]][0])
     return [dummy_input], sess.run(outputs, feed_dict={inputs[0]: dummy_input})
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
-                    expected_tf_success=20)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_binary_op_tests(zip_path, binary_operator):
+def make_binary_op_tests(zip_path, binary_operator, expected_tf_failures=0):
   """Make a set of tests to do binary ops with and without broadcast."""
 
   test_parameters = [
@@ -908,7 +976,12 @@ def make_binary_op_tests(zip_path, binary_operator):
             inputs[1]: input2
         })
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=expected_tf_failures)
 
 
 def make_reduce_tests(reduce_op,
@@ -1074,6 +1147,34 @@ def make_exp_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_cos_tests(zip_path):
+  """Make a set of tests to do cos."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape": [[], [3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+  }]
+
+  def build_graph(parameters):
+    """Build the cos op testing graph."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+
+    out = tf.cos(input_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["input_dtype"], parameters["input_shape"],
+                           min_value=-np.pi, max_value=np.pi)
+    ]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_log_softmax_tests(zip_path):
   """Make a set of tests to do log_softmax."""
 
@@ -1137,7 +1238,12 @@ def make_maximum_tests(zip_path):
     ]
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=8)
 
 
 def make_minimum_tests(zip_path):
@@ -1172,7 +1278,12 @@ def make_minimum_tests(zip_path):
     ]
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=8)
 
 
 def make_binary_op_tests_func(binary_operator):
@@ -1184,6 +1295,51 @@ def make_add_tests(zip_path):
   make_binary_op_tests(zip_path, tf.add)
 
 
+def make_add_n_tests(zip_path):
+  """Make a set of tests for AddN op."""
+
+  test_parameters = [
+      {
+          "dtype": [tf.float32, tf.int32],
+          "input_shape": [[2, 5, 3, 1]],
+          "num_inputs": [2, 3, 4, 5],
+      },
+      {
+          "dtype": [tf.float32, tf.int32],
+          "input_shape": [[5]],
+          "num_inputs": [2, 3, 4, 5],
+      },
+      {
+          "dtype": [tf.float32, tf.int32],
+          "input_shape": [[]],
+          "num_inputs": [2, 3, 4, 5],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Builds the graph given the current parameters."""
+    input_tensors = []
+    for i in range(parameters["num_inputs"]):
+      input_tensors.append(
+          tf.placeholder(
+              dtype=parameters["dtype"],
+              name="input_{}".format(i),
+              shape=parameters["input_shape"]))
+    out = tf.add_n(input_tensors)
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    """Builds operand inputs for op."""
+    input_data = []
+    for i in range(parameters["num_inputs"]):
+      input_data.append(
+          create_tensor_data(parameters["dtype"], parameters["input_shape"]))
+    return input_data, sess.run(
+        outputs, feed_dict={i: d for i, d in zip(inputs, input_data)})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_div_tests(zip_path):
   make_binary_op_tests(zip_path, tf.div)
 
@@ -1197,7 +1353,7 @@ def make_mul_tests(zip_path):
 
 
 def make_pow_tests(zip_path):
-  make_binary_op_tests(zip_path, tf.pow)
+  make_binary_op_tests(zip_path, tf.pow, expected_tf_failures=7)
 
 
 def make_floor_div_tests(zip_path):
@@ -1215,16 +1371,23 @@ def make_squared_difference_tests(zip_path):
 def make_gather_tests(zip_path):
   """Make a set of tests to do gather."""
 
-  test_parameters = [{
-      # TODO(mgubin): add string tests when they are supported by Toco.
-      # TODO(mgubin): add tests for Nd indices when they are supported by
-      # TfLite.
-      "params_dtype": [tf.float32, tf.int32, tf.int64],
-      "params_shape": [[10], [1, 2, 20]],
-      "indices_dtype": [tf.int32, tf.int64],
-      "indices_shape": [[3], [5]],
-      "axis": [-1, 0, 1],
-  }]
+  test_parameters = [
+      {
+          "params_dtype": [tf.float32, tf.int32, tf.int64],
+          "params_shape": [[10], [1, 2, 20]],
+          "indices_dtype": [tf.int32, tf.int64],
+          "indices_shape": [[3], [5]],
+          "axis": [-1, 0, 1],
+      },
+      {
+          # TODO(b/123895910): add Nd support for strings.
+          "params_dtype": [tf.string],
+          "params_shape": [[8]],
+          "indices_dtype": [tf.int32],
+          "indices_shape": [[3]],
+          "axis": [0],
+      }
+  ]
 
   def build_graph(parameters):
     """Build the gather op testing graph."""
@@ -1255,7 +1418,56 @@ def make_gather_tests(zip_path):
       test_parameters,
       build_graph,
       build_inputs,
-      expected_tf_success=60)
+      expected_tf_failures=12)
+
+
+def make_gather_nd_tests(zip_path):
+  """Make a set of tests to do gather_nd."""
+
+  test_parameters = [
+      {
+          "params_dtype": [tf.float32, tf.int32, tf.int64],
+          "params_shape": [[5, 1]],
+          "indices_dtype": [tf.int32, tf.int64],
+          "indices_shape": [[1, 1]],
+      },
+      {
+          "params_dtype": [tf.float32, tf.int32, tf.int64],
+          "params_shape": [[5, 5]],
+          "indices_dtype": [tf.int32, tf.int64],
+          "indices_shape": [[2, 1], [2, 2]],
+      },
+      {
+          "params_dtype": [tf.float32, tf.int32, tf.int64],
+          "params_shape": [[5, 5, 10]],
+          "indices_dtype": [tf.int32, tf.int64],
+          "indices_shape": [[3, 1], [2, 2], [2, 3], [2, 1, 3]],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build the gather_nd op testing graph."""
+    params = tf.placeholder(
+        dtype=parameters["params_dtype"],
+        name="params",
+        shape=parameters["params_shape"])
+    indices = tf.placeholder(
+        dtype=parameters["indices_dtype"],
+        name="indices",
+        shape=parameters["indices_shape"])
+    out = tf.gather_nd(params, indices)
+    return [params, indices], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    params = create_tensor_data(parameters["params_dtype"],
+                                parameters["params_shape"])
+    indices = create_tensor_data(parameters["indices_dtype"],
+                                 parameters["indices_shape"], 0,
+                                 parameters["params_shape"][0] - 1)
+    return [params, indices], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [params, indices])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 def make_gather_with_constant_tests(zip_path):
@@ -1284,8 +1496,7 @@ def make_gather_with_constant_tests(zip_path):
     return [reference_values], sess.run(
         outputs, feed_dict={inputs[0]: reference_values})
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
-                    expected_tf_success=2)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 def make_global_batch_norm_tests(zip_path):
@@ -1421,7 +1632,12 @@ def make_conv_tests(zip_path):
       values.append(create_tensor_data(np.float32, filter_shape))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=40)
 
 
 # Note: This is a regression test for a bug (b/122651451) that Toco incorrectly
@@ -1663,7 +1879,12 @@ def make_depthwiseconv_tests(zip_path):
       values.append(create_tensor_data(np.float32, filter_shape))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=4)
 
 
 def make_split_tests(zip_path):
@@ -1686,7 +1907,12 @@ def make_split_tests(zip_path):
     values = [create_tensor_data(np.float32, parameters["input_shape"])]
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=112)
 
 
 def make_splitv_tests(zip_path):
@@ -1709,7 +1935,12 @@ def make_splitv_tests(zip_path):
     values = [create_tensor_data(np.float32, parameters["input_shape"])]
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=158)
 
 
 def make_concat_tests(zip_path):
@@ -1751,7 +1982,12 @@ def make_concat_tests(zip_path):
     return all_values, sess.run(
         outputs, feed_dict=dict(zip(inputs, all_values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=60)
 
 
 def make_fully_connected_tests(zip_path):
@@ -1812,7 +2048,12 @@ def make_fully_connected_tests(zip_path):
       values.append(create_tensor_data(np.float32, parameters["shape2"]))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=10)
 
 
 def make_l2norm_tests(zip_path):
@@ -1842,7 +2083,12 @@ def make_l2norm_tests(zip_path):
     return [input_values], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_values])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=9)
 
 
 def make_local_response_norm_tests(zip_path):
@@ -2068,6 +2314,29 @@ def make_shape_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_rank_tests(zip_path):
+  """Make a set of tests to do rank."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32],
+      "input_shape": [[], [0], [1, 1, 1, 3], [2, 3, 4, 5], [5, 5], [10]],
+  }]
+
+  def build_graph(parameters):
+    """Build the rank op testing graph."""
+    input_value = tf.placeholder(dtype=parameters["input_dtype"], name="input")
+    out = tf.rank(input_value)
+    return [input_value], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_one_hot_tests(zip_path):
   """Make a set of tests to do one_hot."""
 
@@ -2327,7 +2596,12 @@ def make_space_to_batch_nd_tests(zip_path):
       values.append(np.array(parameters["paddings"]))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=56)
 
 
 def make_batch_to_space_nd_tests(zip_path):
@@ -2440,7 +2714,12 @@ def make_transpose_tests(zip_path):
       values.append(np.array(parameters["perm"]))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=9)
 
 
 def make_squeeze_tests(zip_path):
@@ -2478,10 +2757,16 @@ def make_squeeze_tests(zip_path):
     return [input_values], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_values])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=12)
 
 
-def _make_strided_slice_tests(zip_path, test_parameters):
+def _make_strided_slice_tests(zip_path, test_parameters,
+                              expected_tf_failures=0):
   """Utility function to make strided_slice_tests based on parameters."""
 
   def build_graph(parameters):
@@ -2541,7 +2826,12 @@ def _make_strided_slice_tests(zip_path, test_parameters):
 
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=expected_tf_failures)
 
 
 def make_strided_slice_tests(zip_path):
@@ -2615,7 +2905,7 @@ def make_strided_slice_tests(zip_path):
           "constant_indices": [False],
       },
   ]
-  _make_strided_slice_tests(zip_path, test_parameters)
+  _make_strided_slice_tests(zip_path, test_parameters, expected_tf_failures=2)
 
 
 def make_strided_slice_1d_exhaustive_tests(zip_path):
@@ -2638,7 +2928,10 @@ def make_strided_slice_1d_exhaustive_tests(zip_path):
   _make_strided_slice_tests(zip_path, test_parameters)
 
 
-def make_strided_slice_buggy_tests(zip_path):
+# For verifying https://github.com/tensorflow/tensorflow/issues/23599
+# TODO(chaomei): refactor the test to cover more cases, like negative stride,
+# negative array index etc.
+def make_resolve_constant_strided_slice_tests(zip_path):
   """Make a set of tests to show strided_slice yields incorrect results."""
 
   test_parameters = [{
@@ -2817,7 +3110,12 @@ def make_arg_min_max_tests(zip_path):
     return [input_value], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=4)
 
 
 def make_equal_tests(zip_path):
@@ -2852,7 +3150,12 @@ def make_equal_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_not_equal_tests(zip_path):
@@ -2886,7 +3189,12 @@ def make_not_equal_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_greater_tests(zip_path):
@@ -2920,7 +3228,12 @@ def make_greater_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_greater_equal_tests(zip_path):
@@ -2954,7 +3267,12 @@ def make_greater_equal_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_less_tests(zip_path):
@@ -2988,7 +3306,12 @@ def make_less_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_less_equal_tests(zip_path):
@@ -3022,7 +3345,12 @@ def make_less_equal_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_floor_tests(zip_path):
@@ -3274,7 +3602,12 @@ def make_slice_tests(zip_path):
 
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=18)
 
 
 def make_conv2d_transpose_tests(zip_path):
@@ -3311,12 +3644,7 @@ def make_conv2d_transpose_tests(zip_path):
     ]
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(
-      zip_path,
-      test_parameters,
-      build_graph,
-      build_inputs,
-      expected_tf_success=4)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 # Since compute output_shape is fairly complicated for
@@ -3566,7 +3894,12 @@ def make_pack_tests(zip_path):
     return all_values, sess.run(
         outputs, feed_dict=dict(zip(inputs, all_values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=72)
 
 
 def make_unpack_tests(zip_path):
@@ -3657,13 +3990,18 @@ def make_fill_tests(zip_path):
     return [input1, input2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input1, input2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=12)
 
 
 def _make_logical_tests(op):
   """Make a set of tests to do logical operations."""
 
-  def logical(zip_path):
+  def logical(zip_path, expected_tf_failures=0):
     """Generate examples."""
     test_parameters = [{
         "input_shape_pair": [([], []), ([1, 1, 1, 3], [1, 1, 1, 3]),
@@ -3688,19 +4026,24 @@ def _make_logical_tests(op):
       return [input_value1, input_value2], sess.run(
           outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-    make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+    make_zip_of_tests(
+        zip_path,
+        test_parameters,
+        build_graph,
+        build_inputs,
+        expected_tf_failures=expected_tf_failures)
 
   return logical
 
 
 def make_logical_or_tests(zip_path):
   """Make a set of tests to do logical_or."""
-  return _make_logical_tests(tf.logical_or)(zip_path)
+  return _make_logical_tests(tf.logical_or)(zip_path, expected_tf_failures=1)
 
 
 def make_logical_and_tests(zip_path):
   """Make a set of tests to do logical_and."""
-  return _make_logical_tests(tf.logical_and)(zip_path)
+  return _make_logical_tests(tf.logical_and)(zip_path, expected_tf_failures=1)
 
 
 def make_logical_xor_tests(zip_path):
@@ -3708,7 +4051,7 @@ def make_logical_xor_tests(zip_path):
 
     Test logical_not as well.
   """
-  return _make_logical_tests(tf.logical_xor)(zip_path)
+  return _make_logical_tests(tf.logical_xor)(zip_path, expected_tf_failures=1)
 
 
 def make_mirror_pad_tests(zip_path):
@@ -3791,18 +4134,23 @@ def make_mirror_pad_tests(zip_path):
     return input_values, sess.run(
         outputs, feed_dict=dict(zip(inputs, input_values)))
 
-  make_zip_of_tests(
-      zip_path,
-      test_parameters,
-      build_graph,
-      build_inputs,
-      expected_tf_success=8)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 def make_unroll_batch_matmul_tests(zip_path):
   """Make a set of tests to test unroll_batch_matmul."""
 
-  test_parameters = [{"dtype": [tf.float32], "shape": [[(2, 2, 3), (2, 3, 2)]]}]
+  test_parameters = [{
+      "dtype": [tf.float32],
+      "shape": [[(2, 2, 3), (2, 3, 2), False, False],
+                [(2, 2, 3), (2, 3, 2), True, True],
+                [(2, 2, 3), (2, 2, 3), False, True],
+                [(2, 2, 3), (2, 2, 3), True, False],
+                [(4, 2, 2, 3), (4, 2, 3, 2), False, False],
+                [(4, 2, 2, 3), (4, 2, 3, 2), True, True],
+                [(4, 2, 2, 3), (4, 2, 2, 3), False, True],
+                [(4, 2, 2, 3), (4, 2, 2, 3), True, False]]
+  }]
 
   def build_graph(parameters):
     """Build the batch_matmul op testing graph."""
@@ -3811,7 +4159,11 @@ def make_unroll_batch_matmul_tests(zip_path):
     input_tensor2 = tf.placeholder(
         dtype=parameters["dtype"], shape=parameters["shape"][1])
     # Should be unrolled and replaced with fully_connected ops in the end.
-    out = tf.matmul(input_tensor1, input_tensor2)
+    out = tf.matmul(
+        input_tensor1,
+        input_tensor2,
+        transpose_a=parameters["shape"][2],
+        transpose_b=parameters["shape"][3])
     return [input_tensor1, input_tensor2], [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
@@ -3848,8 +4200,7 @@ def make_placeholder_with_default_tests(zip_path):
     return [input_value], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
-                    expected_tf_success=3)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 def make_unique_tests(zip_path):
@@ -3894,12 +4245,7 @@ def make_unique_tests(zip_path):
     return input_values, sess.run(
         outputs, feed_dict=dict(zip(inputs, input_values)))
 
-  make_zip_of_tests(
-      zip_path,
-      test_parameters,
-      build_graph,
-      build_inputs,
-      expected_tf_success=9)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 def make_reverse_v2_tests(zip_path):
diff --git a/tensorflow/lite/testing/generated_examples_zip_test.cc b/tensorflow/lite/testing/generated_examples_zip_test.cc
index 45bd59a67d10baf61ad981f2fef29e948c2e77d2..fb98cc9b1725f8295bb060ae60ceb151569616e6 100644
--- a/tensorflow/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/lite/testing/generated_examples_zip_test.cc
@@ -102,9 +102,6 @@ std::map<string, string> kBrokenTests = {
     {R"(^\/add.*dtype=tf\.int64)", "119126484"},
     {R"(^\/floor_div.*dtype=tf\.int64)", "119126484"},
     {R"(^\/squared_difference.*dtype=tf\.int64)", "119126484"},
-
-    // Strided Slice chooses the wrong dimension.
-    {R"(^\/strided_slice_buggy)", "119786029"},
 };
 
 // Additional list of tests that are expected to fail when
diff --git a/tensorflow/lite/testing/kernel_test/BUILD b/tensorflow/lite/testing/kernel_test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c46e80cc360043158928544a54c0221a7b405ad0
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/BUILD
@@ -0,0 +1,124 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
+    "tf_cc_test",
+)
+
+cc_library(
+    name = "util",
+    hdrs = ["util.h"],
+    deps = [
+        ":input_generator",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/testing:split",
+        "//tensorflow/lite/testing:tflite_driver",
+    ] + select({
+        "//conditions:default": [
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+        ],
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib",
+        ],
+    }),
+)
+
+tf_cc_test(
+    name = "util_test",
+    size = "small",
+    srcs = ["util_test.cc"],
+    data = [
+        "//tensorflow/lite:testdata/add.bin",
+        "//tensorflow/lite:testdata/test_input.csv",
+    ],
+    tags = [
+        "no_oss",
+    ],
+    deps = [
+        ":util",
+        "//tensorflow/lite/testing:tflite_driver",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+tf_cc_binary(
+    name = "tflite_kernel_runner",
+    srcs = ["tflite_kernel_runner.cc"],
+    deps = [
+        ":util",
+    ],
+)
+
+tf_cc_binary(
+    name = "generate_diff_report",
+    srcs = ["generate_diff_report.cc"],
+    deps = [
+        ":diff_analyzer",
+        "//tensorflow/core:framework_internal",
+    ],
+)
+
+cc_library(
+    name = "input_generator",
+    srcs = ["input_generator.cc"],
+    hdrs = ["input_generator.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/testing:join",
+        "//tensorflow/lite/testing:split",
+    ],
+)
+
+tf_cc_test(
+    name = "input_generator_test",
+    size = "small",
+    srcs = ["input_generator_test.cc"],
+    data = [
+        "//tensorflow/lite:testdata/multi_add.bin",
+        "//tensorflow/lite:testdata/test_input.csv",
+    ],
+    tags = [
+        "no_oss",
+    ],
+    deps = [
+        ":input_generator",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "diff_analyzer",
+    srcs = ["diff_analyzer.cc"],
+    hdrs = ["diff_analyzer.h"],
+    deps = [
+        "//tensorflow/lite:string",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/testing:split",
+    ],
+)
+
+tf_cc_test(
+    name = "diff_analyzer_test",
+    size = "small",
+    srcs = ["diff_analyzer_test.cc"],
+    data = [
+        "//tensorflow/lite:testdata/test_input.csv",
+    ],
+    tags = [
+        "no_oss",
+    ],
+    deps = [
+        ":diff_analyzer",
+        "//tensorflow/core:lib",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/lite/testing/kernel_test/diff_analyzer.cc b/tensorflow/lite/testing/kernel_test/diff_analyzer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d6fcc80be17b4020f53dddb8215a083031fd501
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/diff_analyzer.cc
@@ -0,0 +1,115 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/diff_analyzer.h"
+
+#include <cmath>
+#include <fstream>
+#include "tensorflow/lite/testing/split.h"
+
+namespace tflite {
+namespace testing {
+
+namespace {
+float CalculateNormalizedMaxDiff(const std::vector<float>& base,
+                                 const std::vector<float>& test) {
+  float diff = 0;
+  // For numerical stability in case the tensor is all 0.
+  float base_max = 1e-6;
+
+  for (int i = 0; i < base.size(); i++) {
+    diff = std::max(diff, std::abs(base[i] - test[i]));
+    base_max = std::max(base_max, base[i]);
+  }
+
+  return diff / base_max;
+}
+
+float CalculateNormalizedL2Norm(const std::vector<float>& base,
+                                const std::vector<float>& test) {
+  float l2_error = 0;
+  // For numerical stability in case the tensor is all 0.
+  float base_max = 1e-6;
+
+  for (int i = 0; i < base.size(); i++) {
+    float diff = base[i] - test[i];
+    l2_error += diff * diff;
+    base_max = std::max(base_max, base[i]);
+  }
+
+  l2_error /= base.size();
+
+  return std::sqrt(l2_error) / base_max;
+}
+
+TfLiteStatus Populate(const string& filename,
+                      std::vector<std::vector<float>>* tensors) {
+  if (filename.empty()) {
+    fprintf(stderr, "Empty input file name.");
+    return kTfLiteError;
+  }
+
+  std::ifstream file(filename);
+  string content;
+  while (std::getline(file, content, '\n')) {
+    tensors->push_back(Split<float>(content, ","));
+  }
+
+  file.close();
+  return kTfLiteOk;
+}
+}  // namespace
+
+TfLiteStatus DiffAnalyzer::ReadFiles(const string& base, const string& test) {
+  TF_LITE_ENSURE_STATUS(Populate(base, &base_tensors_));
+  TF_LITE_ENSURE_STATUS(Populate(test, &test_tensors_));
+
+  if (base_tensors_.size() != test_tensors_.size()) {
+    fprintf(stderr, "Golden and test tensor dimensions don't match.");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus DiffAnalyzer::WriteReport(const string& filename) {
+  if (filename.empty()) {
+    fprintf(stderr, "Empty output file name.");
+    return kTfLiteError;
+  }
+
+  std::ofstream output_file;
+  output_file.open(filename, std::fstream::out | std::fstream::trunc);
+  if (!output_file) {
+    fprintf(stderr, "Failed to open output file %s.", filename.c_str());
+    return kTfLiteError;
+  }
+
+  output_file << "Normalized L2 Error"
+              << ","
+              << "Normalized Max Diff"
+              << "\n";
+  for (int i = 0; i < base_tensors_.size(); i++) {
+    float l2_error =
+        CalculateNormalizedL2Norm(base_tensors_[i], test_tensors_[i]);
+    float max_diff =
+        CalculateNormalizedMaxDiff(base_tensors_[i], test_tensors_[i]);
+    output_file << l2_error << "," << max_diff << "\n";
+  }
+
+  output_file.close();
+  return kTfLiteOk;
+}
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/kernel_test/diff_analyzer.h b/tensorflow/lite/testing/kernel_test/diff_analyzer.h
new file mode 100644
index 0000000000000000000000000000000000000000..aecbaea449bda3edd1e5176b9a91b4542afc64f3
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/diff_analyzer.h
@@ -0,0 +1,42 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_KERNEL_TEST_DIFF_ANALYZER_H_
+#define TENSORFLOW_LITE_TESTING_KERNEL_TEST_DIFF_ANALYZER_H_
+
+#include <vector>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/string.h"
+
+namespace tflite {
+namespace testing {
+
+// Reads the baseline and test files with output tensor values, and calculates
+// the diff metrics.
+class DiffAnalyzer {
+ public:
+  DiffAnalyzer() = default;
+  TfLiteStatus ReadFiles(const string& base, const string& test);
+  TfLiteStatus WriteReport(const string& filename);
+
+ private:
+  std::vector<std::vector<float>> base_tensors_;
+  std::vector<std::vector<float>> test_tensors_;
+};
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_KERNEL_TEST_DIFF_ANALYZER_H_
diff --git a/tensorflow/lite/testing/kernel_test/diff_analyzer_test.cc b/tensorflow/lite/testing/kernel_test/diff_analyzer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f92a3e6af756b932aa0a78ddd2ab5cfb48f9dc8b
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/diff_analyzer_test.cc
@@ -0,0 +1,47 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/diff_analyzer.h"
+
+#include <fstream>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/io/path.h"
+
+namespace tflite {
+namespace testing {
+
+namespace {
+
+TEST(DiffAnalyzerTest, ZeroDiff) {
+  DiffAnalyzer diff_analyzer;
+  string filename = "third_party/tensorflow/lite/testdata/test_input.csv";
+  ASSERT_EQ(diff_analyzer.ReadFiles(filename, filename), kTfLiteOk);
+
+  string output_file =
+      tensorflow::io::JoinPath(FLAGS_test_tmpdir + "diff_report.csv");
+  ASSERT_EQ(diff_analyzer.WriteReport(output_file), kTfLiteOk);
+
+  std::string content;
+  std::ifstream file(output_file);
+  std::getline(file, content);
+  std::getline(file, content);
+  ASSERT_EQ(content, "0,0");
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/kernel_test/generate_diff_report.cc b/tensorflow/lite/testing/kernel_test/generate_diff_report.cc
new file mode 100644
index 0000000000000000000000000000000000000000..afa6a9a94ec2ffd824d66a363c53b69455706d06
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/generate_diff_report.cc
@@ -0,0 +1,34 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/testing/kernel_test/diff_analyzer.h"
+
+int main(int argc, char** argv) {
+  string base, test, output;
+  std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("base", &base, "Path to the base serialized tensor."),
+      tensorflow::Flag("test", &test, "Path to the test serialized tensor."),
+      tensorflow::Flag("output", &output, "Path to the output file."),
+  };
+  tensorflow::Flags::Parse(&argc, argv, flag_list);
+
+  tflite::testing::DiffAnalyzer diff_analyzer;
+  diff_analyzer.ReadFiles(base, test);
+  diff_analyzer.WriteReport(output);
+  return 0;
+}
diff --git a/tensorflow/lite/testing/kernel_test/input_generator.cc b/tensorflow/lite/testing/kernel_test/input_generator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..897e18560d18584a4ba93957f372073de47b44d3
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/input_generator.cc
@@ -0,0 +1,208 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/input_generator.h"
+
+#include <fstream>
+#include <limits>
+#include <random>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/testing/join.h"
+#include "tensorflow/lite/testing/split.h"
+
+namespace tflite {
+namespace testing {
+
+namespace {
+
+template <typename T>
+std::vector<T> GenerateRandomTensor(TfLiteIntArray* dims,
+                                    const std::function<T(int)>& random_func) {
+  int64_t num_elements = 1;
+  for (int i = 0; i < dims->size; i++) {
+    num_elements *= dims->data[i];
+  }
+
+  std::vector<T> result(num_elements);
+  for (int i = 0; i < num_elements; i++) {
+    result[i] = random_func(i);
+  }
+  return result;
+}
+
+template <typename T>
+std::vector<T> GenerateUniform(TfLiteIntArray* dims, float min, float max) {
+  auto random_float = [](float min, float max) {
+    // TODO(yunluli): Change seed for each invocation if needed.
+    static unsigned int seed;
+    return min + (max - min) * static_cast<float>(rand_r(&seed)) / RAND_MAX;
+  };
+
+  std::function<T(int)> random_t = [&](int) {
+    return static_cast<T>(random_float(min, max));
+  };
+  std::vector<T> data = GenerateRandomTensor(dims, random_t);
+  return data;
+}
+
+template <typename T>
+std::vector<T> GenerateGaussian(TfLiteIntArray* dims, float min, float max) {
+  auto random_float = [](float min, float max) {
+    static std::default_random_engine generator;
+    // We generate a float number within [0, 1) following a mormal distribution
+    // with mean = 0.5 and stddev = 1/3, and use it to scale the final random
+    // number into the desired range.
+    static std::normal_distribution<double> distribution(0.5, 1.0 / 3);
+    auto rand_n = distribution(generator);
+    while (rand_n < 0 || rand_n >= 1) {
+      rand_n = distribution(generator);
+    }
+
+    return min + (max - min) * static_cast<float>(rand_n);
+  };
+
+  std::function<T(int)> random_t = [&](int) {
+    return static_cast<T>(random_float(min, max));
+  };
+  std::vector<T> data = GenerateRandomTensor(dims, random_t);
+  return data;
+}
+
+}  // namespace
+
+TfLiteStatus InputGenerator::LoadModel(const string& model_dir) {
+  model_ = FlatBufferModel::BuildFromFile(model_dir.c_str());
+  if (!model_) {
+    fprintf(stderr, "Cannot load model %s", model_dir.c_str());
+    return kTfLiteError;
+  }
+
+  ::tflite::ops::builtin::BuiltinOpResolver builtin_ops;
+  InterpreterBuilder(*model_, builtin_ops)(&interpreter_);
+  if (!interpreter_) {
+    fprintf(stderr, "Failed to build interpreter.");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus InputGenerator::ReadInputsFromFile(const string& filename) {
+  if (filename.empty()) {
+    fprintf(stderr, "Empty input file name.");
+    return kTfLiteError;
+  }
+
+  std::ifstream input_file(filename);
+  string input;
+  while (std::getline(input_file, input, '\n')) {
+    inputs_.push_back(input);
+  }
+  input_file.close();
+  return kTfLiteOk;
+}
+
+TfLiteStatus InputGenerator::WriteInputsToFile(const string& filename) {
+  if (filename.empty()) {
+    fprintf(stderr, "Empty input file name.");
+    return kTfLiteError;
+  }
+
+  std::ofstream output_file;
+  output_file.open(filename, std::fstream::out | std::fstream::trunc);
+  if (!output_file) {
+    fprintf(stderr, "Failed to open output file %s.", filename.c_str());
+    return kTfLiteError;
+  }
+
+  for (const auto& input : inputs_) {
+    output_file << input << "\n";
+  }
+  output_file.close();
+
+  return kTfLiteOk;
+}
+
+// TODO(yunluli): Support more tensor types when needed.
+TfLiteStatus InputGenerator::GenerateInput(const string& distribution) {
+  auto input_tensor_ids = interpreter_->inputs();
+  for (auto id : input_tensor_ids) {
+    auto* tensor = interpreter_->tensor(id);
+    if (distribution == "UNIFORM") {
+      switch (tensor->type) {
+        case kTfLiteInt8: {
+          auto data = GenerateUniform<int8_t>(
+              tensor->dims, std::numeric_limits<int8_t>::min(),
+              std::numeric_limits<int8_t>::max());
+          inputs_.push_back(Join(data.data(), data.size(), ","));
+          break;
+        }
+        case kTfLiteUInt8: {
+          auto data = GenerateUniform<uint8_t>(
+              tensor->dims, std::numeric_limits<uint8_t>::min(),
+              std::numeric_limits<uint8_t>::max());
+          inputs_.push_back(Join(data.data(), data.size(), ","));
+          break;
+        }
+        case kTfLiteFloat32: {
+          auto data = GenerateUniform<float>(tensor->dims, -1, 1);
+          inputs_.push_back(JoinDefault(data.data(), data.size(), ","));
+          break;
+        }
+        default:
+          fprintf(stderr, "Unsupported input tensor type %s.",
+                  TfLiteTypeGetName(tensor->type));
+          break;
+      }
+    } else if (distribution == "GAUSSIAN") {
+      switch (tensor->type) {
+        case kTfLiteInt8: {
+          auto data = GenerateGaussian<int8_t>(
+              tensor->dims, std::numeric_limits<int8_t>::min(),
+              std::numeric_limits<int8_t>::max());
+          inputs_.push_back(Join(data.data(), data.size(), ","));
+          break;
+        }
+        case kTfLiteUInt8: {
+          auto data = GenerateGaussian<uint8_t>(
+              tensor->dims, std::numeric_limits<uint8_t>::min(),
+              std::numeric_limits<uint8_t>::max());
+          inputs_.push_back(Join(data.data(), data.size(), ","));
+          break;
+        }
+        case kTfLiteFloat32: {
+          auto data = GenerateGaussian<float>(tensor->dims, -1, 1);
+          inputs_.push_back(JoinDefault(data.data(), data.size(), ","));
+          break;
+        }
+        default:
+          fprintf(stderr, "Unsupported input tensor type %s.",
+                  TfLiteTypeGetName(tensor->type));
+          break;
+      }
+    } else {
+      fprintf(stderr, "Unsupported distribution %s.", distribution.c_str());
+      return kTfLiteError;
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+std::vector<string> InputGenerator::GetInputs() { return inputs_; }
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/kernel_test/input_generator.h b/tensorflow/lite/testing/kernel_test/input_generator.h
new file mode 100644
index 0000000000000000000000000000000000000000..859c7068e5448c837580fe79e89918fbd34c2a66
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/input_generator.h
@@ -0,0 +1,50 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_KERNEL_TEST_INPUT_GENERATOR_H_
+#define TENSORFLOW_LITE_TESTING_KERNEL_TEST_INPUT_GENERATOR_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/string.h"
+
+namespace tflite {
+namespace testing {
+
+// Generate random input, or read input from a file for kernel diff test.
+// Needs to load the tflite graph to get information like tensor shape and
+// data type.
+class InputGenerator {
+ public:
+  InputGenerator() = default;
+  TfLiteStatus LoadModel(const string& model_dir);
+  TfLiteStatus ReadInputsFromFile(const string& filename);
+  TfLiteStatus GenerateInput(const string& distribution);
+  std::vector<string> GetInputs();
+  TfLiteStatus WriteInputsToFile(const string& filename);
+
+ private:
+  std::unique_ptr<FlatBufferModel> model_;
+  std::unique_ptr<Interpreter> interpreter_;
+  std::vector<string> inputs_;
+};
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_KERNEL_TEST_INPUT_GENERATOR_H_
diff --git a/tensorflow/lite/testing/kernel_test/input_generator_test.cc b/tensorflow/lite/testing/kernel_test/input_generator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b6bd8b94fbd0e8288f0f3beab4aa8a8f91563e90
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/input_generator_test.cc
@@ -0,0 +1,81 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/input_generator.h"
+
+#include <fstream>
+#include <map>
+
+#include <gmock/gmock.h>
+#include "testing/base/public/googletest.h"
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace testing {
+
+namespace {
+
+TEST(InputGeneratorTest, LoadModel) {
+  InputGenerator input_generator;
+  ASSERT_EQ(input_generator.LoadModel(
+                "third_party/tensorflow/lite/testdata/multi_add.bin"),
+            kTfLiteOk);
+}
+
+TEST(InputGeneratorTest, ReadWriteSimpleFile) {
+  InputGenerator input_generator;
+  ASSERT_EQ(input_generator.ReadInputsFromFile(
+                "third_party/tensorflow/lite/testdata/test_input.csv"),
+            kTfLiteOk);
+
+  std::vector<string> inputs;
+  std::string content = "1";
+  for (int i = 0; i < 1 * 8 * 8 * 3 - 1; i++) {
+    content.append(",1");
+  }
+  inputs.push_back(content);
+  ASSERT_EQ(input_generator.GetInputs(), inputs);
+
+  auto output_filename = FLAGS_test_tmpdir + "/out.csv";
+  ASSERT_EQ(input_generator.WriteInputsToFile(output_filename), kTfLiteOk);
+
+  std::ifstream in(output_filename);
+  std::string out;
+  std::getline(in, out, '\n');
+  ASSERT_EQ(out, content);
+}
+
+TEST(InputGeneratorTest, GenerateUniformInput) {
+  InputGenerator input_generator;
+  ASSERT_EQ(input_generator.LoadModel(
+                "third_party/tensorflow/lite/testdata/multi_add.bin"),
+            kTfLiteOk);
+  input_generator.GenerateInput("UNIFORM");
+  auto inputs = input_generator.GetInputs();
+  ASSERT_EQ(inputs.size(), 4);
+}
+
+TEST(InputGeneratorTest, GenerateGaussianInput) {
+  InputGenerator input_generator;
+  ASSERT_EQ(input_generator.LoadModel(
+                "third_party/tensorflow/lite/testdata/multi_add.bin"),
+            kTfLiteOk);
+  input_generator.GenerateInput("GAUSSIAN");
+  auto inputs = input_generator.GetInputs();
+  ASSERT_EQ(inputs.size(), 4);
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/kernel_test/tflite_kernel_runner.cc b/tensorflow/lite/testing/kernel_test/tflite_kernel_runner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34c1728ed1da6ec962989479dccfdc64bc8ca6cd
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/tflite_kernel_runner.cc
@@ -0,0 +1,32 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/testing/kernel_test/util.h"
+
+int main(int argc, char** argv) {
+  tflite::testing::kernel_test::TestOptions options =
+      tflite::testing::kernel_test::ParseTfliteKernelTestFlags(&argc, argv);
+  const bool run_reference_kernel = options.kernel_type == "REFERENCE";
+  const bool use_nnapi = options.kernel_type == "NNAPI";
+
+  auto runner = absl::make_unique<tflite::testing::TfLiteDriver>(
+      use_nnapi, "", run_reference_kernel);
+  if (tflite::testing::kernel_test::RunKernelTest(options, runner.get()) ==
+      kTfLiteOk) {
+    return 0;
+  }
+
+  return -1;
+}
diff --git a/tensorflow/lite/testing/kernel_test/util.h b/tensorflow/lite/testing/kernel_test/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d940e5ad12f497ec827ce0dc6be9e6311078b1a9
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/util.h
@@ -0,0 +1,122 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_KERNEL_TEST_UTIL_H_
+#define TENSORFLOW_LITE_TESTING_KERNEL_TEST_UTIL_H_
+
+#include <fstream>
+
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/testing/kernel_test/input_generator.h"
+#include "tensorflow/lite/testing/split.h"
+#include "tensorflow/lite/testing/tflite_driver.h"
+
+namespace tflite {
+namespace testing {
+namespace kernel_test {
+
+struct TestOptions {
+  // Path of tensorflow lite model.
+  string tflite_model;
+  // Path of the input file. If empty, generate at runtime.
+  string read_input_from_file;
+  // Path to dump the input file.
+  string dump_input_to_file;
+  // Path to dump the output.
+  string dump_output_to_file;
+  // Input distribution.
+  string input_distribution;
+  // Kernel type.
+  string kernel_type;
+};
+
+TestOptions ParseTfliteKernelTestFlags(int* argc, char** argv) {
+  TestOptions options;
+  std::vector<tensorflow::Flag> flags = {
+      tensorflow::Flag("tflite_model", &options.tflite_model,
+                       "Path of tensorflow lite model."),
+      tensorflow::Flag("read_input_from_file", &options.read_input_from_file,
+                       "File to read input data from. If empty, generates "
+                       "input at runtime."),
+      tensorflow::Flag("dump_input_to_file", &options.dump_input_to_file,
+                       "File to dump randomly generated input."),
+      tensorflow::Flag("dump_output_to_file", &options.dump_output_to_file,
+                       "File to dump output."),
+      tensorflow::Flag("input_distribution", &options.input_distribution,
+                       "Input distribution. Default: Gaussian."),
+      tensorflow::Flag("kernel_type", &options.kernel_type, "Kernel type."),
+  };
+
+  tensorflow::Flags::Parse(argc, argv, flags);
+
+  return options;
+}
+
+TfLiteStatus RunKernelTest(const kernel_test::TestOptions& options,
+                           TestRunner* runner) {
+  InputGenerator input_generator;
+
+  if (options.read_input_from_file.empty()) {
+    TF_LITE_ENSURE_STATUS(input_generator.LoadModel(options.tflite_model));
+    TF_LITE_ENSURE_STATUS(
+        input_generator.GenerateInput(options.input_distribution));
+  } else {
+    TF_LITE_ENSURE_STATUS(
+        input_generator.ReadInputsFromFile(options.read_input_from_file));
+  }
+
+  runner->LoadModel(options.tflite_model);
+  runner->AllocateTensors();
+  if (!runner->IsValid()) return kTfLiteError;
+  auto input_tensor_ids = runner->GetInputs();
+  auto inputs = input_generator.GetInputs();
+  if (inputs.size() != input_tensor_ids.size()) {
+    fprintf(stderr,
+            "Number of input tensors generated doesn't match what the model "
+            "asks for.");
+  }
+  for (int i = 0; i < inputs.size(); i++) {
+    runner->SetInput(input_tensor_ids[i], inputs[i]);
+  }
+
+  runner->Invoke();
+
+  if (!options.dump_input_to_file.empty()) {
+    TF_LITE_ENSURE_STATUS(
+        input_generator.WriteInputsToFile(options.dump_input_to_file));
+  }
+
+  if (!options.dump_output_to_file.empty()) {
+    std::ofstream output_file;
+    output_file.open(options.dump_output_to_file,
+                     std::fstream::out | std::fstream::trunc);
+    if (!output_file) {
+      return kTfLiteError;
+    }
+
+    for (auto id : runner->GetOutputs()) {
+      output_file << runner->ReadOutput(id) << "\n";
+    }
+    output_file.close();
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace kernel_test
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_KERNEL_TEST_UTIL_H_
diff --git a/tensorflow/lite/testing/kernel_test/util_test.cc b/tensorflow/lite/testing/kernel_test/util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..751c77e13621e338e4aba628d36950956c326593
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/util_test.cc
@@ -0,0 +1,52 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/util.h"
+
+#include <fstream>
+#include <memory>
+
+#include <gmock/gmock.h>
+#include "testing/base/public/googletest.h"
+#include <gtest/gtest.h>
+#include "tensorflow/lite/testing/tflite_driver.h"
+
+namespace tflite {
+namespace testing {
+namespace kernel_test {
+namespace {
+
+TEST(UtilTest, SimpleE2ETest) {
+  TestOptions options;
+  options.tflite_model = "third_party/tensorflow/lite/testdata/add.bin";
+  options.read_input_from_file =
+      "third_party/tensorflow/lite/testdata/test_input.csv";
+  options.dump_output_to_file = FLAGS_test_tmpdir + "/test_out.csv";
+  options.kernel_type = "REFERENCE";
+  std::unique_ptr<TestRunner> runner(new TfLiteDriver(false, "", true));
+  RunKernelTest(options, runner.get());
+  std::string expected = "3";
+  for (int i = 0; i < 1 * 8 * 8 * 3 - 1; i++) {
+    expected.append(",3");
+  }
+  std::string content;
+  std::ifstream file(options.dump_output_to_file);
+  std::getline(file, content);
+  EXPECT_EQ(content, expected);
+}
+
+}  // namespace
+}  // namespace kernel_test
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/string_util.cc b/tensorflow/lite/testing/string_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cf9d5087644cc52415a83dd80b457249b85765b5
--- /dev/null
+++ b/tensorflow/lite/testing/string_util.cc
@@ -0,0 +1,45 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "tensorflow/lite/testing/string_util.h"
+
+#include "absl/strings/escaping.h"
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
+#include "tensorflow/lite/string_util.h"
+
+namespace tflite {
+namespace testing {
+namespace python {
+
+PyObject* SerializeAsHexString(PyObject* value) {
+  DynamicBuffer dynamic_buffer;
+  if (!python_utils::FillStringBufferWithPyArray(value, &dynamic_buffer)) {
+    return nullptr;
+  }
+
+  char* char_buffer = nullptr;
+  size_t size = dynamic_buffer.WriteToBuffer(&char_buffer);
+  string s = absl::BytesToHexString({char_buffer, size});
+  free(char_buffer);
+
+  return python_utils::ConvertToPyString(s.data(), s.size());
+}
+
+}  // namespace python
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/string_util.h b/tensorflow/lite/testing/string_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..56c024d918df37641c12851a2a02187d12e03b7d
--- /dev/null
+++ b/tensorflow/lite/testing/string_util.h
@@ -0,0 +1,33 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_STRING_UTIL_H_
+#define TENSORFLOW_LITE_TESTING_STRING_UTIL_H_
+
+#include <Python.h>
+#include <string>
+
+namespace tflite {
+namespace testing {
+namespace python {
+
+// Take a python string array, convert it to TF Lite dynamic buffer format and
+// serialize it as a HexString.
+PyObject* SerializeAsHexString(PyObject* value);
+
+}  // namespace python
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_STRING_UTIL_H_
diff --git a/tensorflow/lite/testing/string_util.i b/tensorflow/lite/testing/string_util.i
new file mode 100644
index 0000000000000000000000000000000000000000..574abb79653ff858721e28d0d33225e3e24cbbfd
--- /dev/null
+++ b/tensorflow/lite/testing/string_util.i
@@ -0,0 +1,31 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%{
+
+#define SWIG_FILE_WITH_INIT
+#include "tensorflow/lite/testing/string_util.h"
+
+%}
+
+namespace tflite {
+namespace testing {
+namespace python {
+
+PyObject* SerializeAsHexString(PyObject* string_tensor);
+
+}  // namespace python
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/tflite_driver.cc b/tensorflow/lite/testing/tflite_driver.cc
index a637dc86c020d4e16fb4fc02e9f62e8dec6a3a25..55670858338bda0bfe04828c33da6c64982a6656 100644
--- a/tensorflow/lite/testing/tflite_driver.cc
+++ b/tensorflow/lite/testing/tflite_driver.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/kernels/register_ref.h"
 #include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/testing/join.h"
 #include "tensorflow/lite/testing/split.h"
 
 namespace tflite {
@@ -383,5 +384,34 @@ void TfLiteDriver::ResetLSTMStateTensors() {
   interpreter_->ResetVariableTensors();
 }
 
+string TfLiteDriver::ReadOutput(int id) {
+  auto* tensor = interpreter_->tensor(id);
+  int num_elements = 1;
+
+  for (int i = 0; i < tensor->dims->size; ++i) {
+    num_elements *= tensor->dims->data[i];
+  }
+
+  switch (tensor->type) {
+    case kTfLiteFloat32:
+      return JoinDefault(tensor->data.f, num_elements, ",");
+    case kTfLiteInt32:
+      return JoinDefault(tensor->data.i32, num_elements, ",");
+    case kTfLiteInt64:
+      return JoinDefault(tensor->data.i64, num_elements, ",");
+    case kTfLiteUInt8:
+      return Join(tensor->data.uint8, num_elements, ",");
+    case kTfLiteInt8:
+      return JoinDefault(tensor->data.int8, num_elements, ",");
+    case kTfLiteBool:
+      return JoinDefault(tensor->data.b, num_elements, ",");
+    default:
+      Invalidate(absl::StrCat("Unsupported tensor type ",
+                              TfLiteTypeGetName(tensor->type),
+                              " in TfLiteDriver::ReadOutput"));
+      return "";
+  }
+}
+
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/lite/testing/tflite_driver.h b/tensorflow/lite/testing/tflite_driver.h
index 537f20dfbfd6c6fe0fbefd854358146129d33b7a..3cce6c4222ec36f5eac2f144062b5b850c326345 100644
--- a/tensorflow/lite/testing/tflite_driver.h
+++ b/tensorflow/lite/testing/tflite_driver.h
@@ -49,7 +49,7 @@ class TfLiteDriver : public TestRunner {
   void SetExpectation(int id, const string& csv_values) override;
   void Invoke() override;
   bool CheckResults() override;
-  string ReadOutput(int id) override { return "no-op"; }
+  string ReadOutput(int id) override;
 
  private:
   void DeallocateStringTensor(TfLiteTensor* t) {
diff --git a/tensorflow/lite/testing/tflite_driver_test.cc b/tensorflow/lite/testing/tflite_driver_test.cc
index 81bf6700cb898796a72bea38ea0711556a7215a5..e80816bdf5ecd21d4f147e824188dd3a206d68dd 100644
--- a/tensorflow/lite/testing/tflite_driver_test.cc
+++ b/tensorflow/lite/testing/tflite_driver_test.cc
@@ -54,6 +54,8 @@ TEST(TfliteDriverTest, SimpleTest) {
   ASSERT_TRUE(runner->IsValid());
 
   ASSERT_TRUE(runner->CheckResults());
+  EXPECT_EQ(runner->ReadOutput(5), "0.101,0.202,0.303,0.404");
+  EXPECT_EQ(runner->ReadOutput(6), "0.011,0.022,0.033,0.044");
 }
 
 TEST(TfliteDriverTest, SingleAddOpTest) {
@@ -88,6 +90,8 @@ TEST(TfliteDriverTest, SingleAddOpTest) {
   ASSERT_TRUE(runner->IsValid());
 
   ASSERT_TRUE(runner->CheckResults());
+  EXPECT_EQ(runner->ReadOutput(5), "0.101,0.202,0.303,0.404");
+  EXPECT_EQ(runner->ReadOutput(6), "0.011,0.022,0.033,0.044");
 }
 
 }  // namespace
diff --git a/tensorflow/lite/tflite_exported_symbols.lds b/tensorflow/lite/tflite_exported_symbols.lds
new file mode 100644
index 0000000000000000000000000000000000000000..b145204aa1e2b039aa7075047b1fd9ca73157320
--- /dev/null
+++ b/tensorflow/lite/tflite_exported_symbols.lds
@@ -0,0 +1,3 @@
+*TfLite*
+*tflite*
+*TFL_*
diff --git a/tensorflow/lite/tflite_version_script.lds b/tensorflow/lite/tflite_version_script.lds
new file mode 100644
index 0000000000000000000000000000000000000000..1df70705ebf4a85d2f4c9f2301c53d48e623dff7
--- /dev/null
+++ b/tensorflow/lite/tflite_version_script.lds
@@ -0,0 +1,8 @@
+VERS_1.0 {
+  global:
+    *TfLite*;
+    *tflite*;
+    *TFL_*;
+  local:
+    *;
+};
diff --git a/tensorflow/lite/toco/BUILD b/tensorflow/lite/toco/BUILD
index 3150c48c73d4eaa2786227064e7f41abb99409a5..c477e2f4c089d8165186671ea5f6cd703297c7fe 100644
--- a/tensorflow/lite/toco/BUILD
+++ b/tensorflow/lite/toco/BUILD
@@ -378,6 +378,7 @@ cc_library(
         ":types_proto_cc",
         "//tensorflow/core:lib",
         "//tensorflow/lite/kernels/internal:types",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_googlesource_code_re2//:re2",
         "@protobuf_archive//:protobuf_headers",
diff --git a/tensorflow/lite/toco/dump_graphviz.cc b/tensorflow/lite/toco/dump_graphviz.cc
index 8896893f3579abcefa87e3411f9b186ca7a45a1b..ad69e4f7b7a4285f36750c60291d7a6a97e7e9f7 100644
--- a/tensorflow/lite/toco/dump_graphviz.cc
+++ b/tensorflow/lite/toco/dump_graphviz.cc
@@ -15,17 +15,21 @@ limitations under the License.
 #include "tensorflow/lite/toco/dump_graphviz.h"
 
 #include <cmath>
+#include <functional>
 #include <memory>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "absl/strings/str_replace.h"
+#include "absl/strings/str_split.h"
 #include "absl/strings/strip.h"
+#include "re2/re2.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/toco/model_flags.pb.h"
 #include "tensorflow/lite/toco/toco_graphviz_dump_options.h"
 #include "tensorflow/lite/toco/toco_port.h"
 #include "tensorflow/lite/toco/toco_types.h"
 #include "tensorflow/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
 
 using toco::port::AppendF;
 using toco::port::StringF;
@@ -33,72 +37,158 @@ using toco::port::StringF;
 namespace toco {
 namespace {
 
+// 'nslimit' is a graphviz (dot) paramater that limits the iterations during
+// the layout phase. Omitting it allows infinite iterations, causing some
+// complex graphs to never finish. A value of 125 produces good graphs
+// while allowing complex graphs to finish.
+constexpr char kGraphFmt[] = R"CODE(digraph Computegraph { tooltip = "/"
+    nslimit=125 margin=36 ranksep = 2 labelloc="t" label=%s
+)CODE";
+// Note: tooltip's are only supported on SVGs in Chrome.
+constexpr char kSubgraphFmt[] =
+    R"CODE(    subgraph "cluster_%s" { style=rounded bgcolor="%s" penwidth=0.0 label=%s
+)CODE";
+constexpr char kArrayNodeFmt[] =
+    R"CODE(        "%s" [label=%s tooltip="%s" shape=%s style=filled fillcolor="%s" fontcolor="%sDD"];
+)CODE";
+constexpr char kOpNodeFmt[] =
+    R"CODE(        %s [label=%s tooltip=" " shape=box margin=0 style=filled fillcolor="%s" fontcolor="%sDD"];
+)CODE";
+constexpr char kInputEdgeFmt[] =
+    R"CODE(        "%s"%s -> %s:i%d:n [penwidth=%f weight=%f];
+)CODE";
+constexpr char kOutputEdgeFmt[] =
+    R"CODE(        %s:o%d:s -> "%s"%s [penwidth=%f weight=%f];
+)CODE";
+constexpr char kRNNBackEdgeFmt[] =
+    R"CODE(        "%s":s -> "%s":n [color="#0F9D58" constraint=false];
+)CODE";
+constexpr char kUnicodeMult[] = "\u00D7";
+constexpr char kUnicodeEllipsis[] = " \u2026 ";
+
 class Color {
  public:
   Color() {}
   Color(uint8 r, uint8 g, uint8 b) : r_(r), g_(g), b_(b) {}
+  explicit Color(uint32 word)
+      : r_((word & 0x00FF0000) >> 16),
+        g_((word & 0x0000FF00) >> 8),
+        b_((word & 0x000000FF) >> 0) {}
+
   // Returns the string serialization of this color in graphviz format,
   // for use as 'fillcolor' in boxes.
-  string FillColorString() const { return StringF("%.2X%.2X%.2X", r_, g_, b_); }
+  string AsHexString() const { return StringF("#%.2X%.2X%.2X", r_, g_, b_); }
+  // The color to use for this node; will be used as 'fillcolor'
+  // for its box. See Color::AsHexString. A suitable, different
+  // color will be chosen for the 'fontcolor' for the inside text
+  // label, see Color::TextColorString.
   // Returns the serialization in graphviz format of a suitable color to use
   // 'fontcolor' in the same boxes. It should black or white, whichever offers
-  // the better contrast from FillColorString().
+  // the better contrast from AsHexString().
   string TextColorString() const {
     // https://en.wikipedia.org/wiki/Relative_luminance
     const float luminance = 0.2126f * r_ + 0.7152f * g_ + 0.0722f * b_;
     const uint8 l = luminance > 128.f ? 0 : 255;
-    return StringF("%.2X%.2X%.2X", l, l, l);
+    return StringF("#%.2X%.2X%.2X", l, l, l);
   }
 
  private:
   uint8 r_ = 0, g_ = 0, b_ = 0;
 };
 
-struct NodeProperties {
-  // The text to display inside the box for this node.
-  string label;
-  // The color to use for this node; will be used as 'fillcolor'
-  // for its box. See Color::FillColorString. A suitable, different
-  // color will be chosen for the 'fontcolor' for the inside text
-  // label, see Color::TextColorString.
-  Color color;
-  float log2_buffer_size;
-};
-
-// All colors in this file are from:
-// https://material.io/guidelines/style/color.html
+Color HashStringToColor(string s) {
+  // Return a unique color for a name.
+  //
+  // This function removes Tensorflow anti-collision suffixes (eg "_2"), hashes
+  // the string to a uint_32, then twiddles some bits to get a light and subtle
+  // color. This seems to be a good heuristic for keeping enough of the name to
+  // hash to a unique color while still revealing structure through naming
+  // similarities.
+  //
+  // The regular expression "_\d+" matches any underscore followed by numbers,
+  // which we strip out. Examples:
+  //
+  //     "Conv"      -> "Conv"
+  //     "Conv_2"    -> "Conv"
+  //     "Conv_72"   -> "Conv"
+  //     "Pad_1_bias -> "Pad_bias"
+  //     "Conv_abc"  -> "Conv_abc"
+
+  RE2::GlobalReplace(&s, R"CODE(_\d+)CODE", "");
+  uint32 color_word = std::hash<std::string>{}(s);
+  color_word |= 0x00E0E0E0;
+  return Color(color_word);
+}
 
-Color GetColorForArray(const Model& model, const string& array_name) {
+void GetArrayColorAndShape(const Model& model, const string& array_name,
+                           Color* color, string* shape) {
+  // All colors in this file are from:
+  // https://material.io/guidelines/style/color.html
   // Arrays involved in RNN back-edges have a different color
   for (const auto& rnn_state : model.flags.rnn_states()) {
     // RNN state, fed by a back-edge. Bold color.
     if (array_name == rnn_state.state_array()) {
-      return Color(0x0F, 0x9D, 0x58);
+      *color = Color(0x0F, 0x9D, 0x58);
+      *shape = "invhouse";
+      return;
     }
     // RNN back-edge source, feeding a RNN state.
     // Light tone of the same color as RNN states.
     if (array_name == rnn_state.back_edge_source_array()) {
-      return Color(0xB7, 0xE1, 0xCD);
+      *color = Color(0xB7, 0xE1, 0xCD);
+      *shape = "house";
+      return;
     }
   }
   // Constant parameter arrays have their own bold color
   if (model.GetArray(array_name).buffer) {
-    return Color(0x42, 0x85, 0xF4);
+    *color = Color(0x42, 0x85, 0xF4);
+    *shape = "cylinder";
+    return;
   }
   // Remaining arrays are activations.
   // We use gray colors for them because they are the majority
   // of arrays so we want to highlight other arrays instead of them.
   // First, we use a bolder gray for input/output arrays:
   if (IsInputArray(model, array_name)) {
-    return Color(0x9E, 0x9E, 0x9E);
+    *color = Color(0x9E, 0x9E, 0x9E);
+    *shape = "invhouse";
+    return;
   }
   if (IsOutputArray(model, array_name)) {
-    return Color(0x9E, 0x9E, 0x9E);
+    *color = Color(0x9E, 0x9E, 0x9E);
+    *shape = "house";
+    return;
   }
   // Remaining arrays are intermediate activation arrays.
   // Lighter tone of the same grey as for input/output arrays:
   // We want these to be very discrete.
-  return Color(0xF5, 0xF5, 0xF5);
+  *color = Color(0xF5, 0xF5, 0xF5);
+  *shape = "box";
+}
+
+string GetArrayCompassPt(const Model& model, const string& array_name) {
+  // The "compass point" is the point on the node where edge connections are
+  // made. For most arrays we don't care, but input's and outputs look better
+  // connected at the tip of the "house" and "invhouse" shapes used. So we
+  // append ":n" and ":s" respectively for those.
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    // RNN state is essentially an input
+    if (array_name == rnn_state.state_array()) {
+      return ":s";
+    }
+    // RNN back-edge source is essentially an output
+    if (array_name == rnn_state.back_edge_source_array()) {
+      return ":n";
+    }
+  }
+  if (IsInputArray(model, array_name)) {
+    return ":s";
+  }
+  if (IsOutputArray(model, array_name)) {
+    return ":n";
+  }
+  return "";
 }
 
 void AppendArrayVal(string* string, Array const& array, int index) {
@@ -141,239 +231,550 @@ void AppendArrayVal(string* string, Array const& array, int index) {
   }
 }
 
-NodeProperties GetPropertiesForArray(const Model& model,
-                                     const string& array_name) {
-  NodeProperties node_properties;
-  node_properties.color = GetColorForArray(model, array_name);
-  node_properties.label = absl::StrReplaceAll(array_name, {{"/", "/\\n"}});
-  node_properties.log2_buffer_size = 0.0f;
+typedef std::map<string, string> Attributes;
+
+string AttributesToHtml(Attributes attributes) {
+  string html;
+  for (const auto& attr : attributes) {
+    html += R"CODE(<TR><TD CELLPADDING="1" ALIGN="RIGHT">)CODE";
+    html += attr.first;
+    html += R"CODE(:</TD><TD CELLPADDING="1" ALIGN="LEFT">)CODE";
+    html += attr.second;
+    html += "</TD></TR>";
+  }
+  return html;
+}
+
+string GetArrayLabel(const Model& model, const string& array_id) {
+  string html;
 
-  // Append array shape to the label.
-  auto& array = model.GetArray(array_name);
-  AppendF(&node_properties.label, "\\nType: %s",
-          ArrayDataTypeName(array.data_type));
+  // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html)
+  html += "<";
 
+  // Begin Table
+  html += R"CODE(<FONT POINT-SIZE="10" FACE="Courier">)CODE";
+  html += R"CODE(<TABLE BORDER="0" CELLSPACING="2" CELLPADDING="0">)CODE";
+
+  auto& array = model.GetArray(array_id);
+  if (array.buffer) {
+    // "cylinder" shapes require some extra head room.
+    html += R"CODE(<TR><TD COLSPAN="2"> </TD></TR>)CODE";
+  }
+
+  // "Primary" name of array (last non-slash delimited group of characters).
+  html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="16" FACE="Helvetica"><I>)CODE";
+  AppendF(&html, R"CODE(%s)CODE",
+          std::vector<string>(absl::StrSplit(array_id, '/')).back());
+  html += R"CODE(</I></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Array data type and dimensions
+  html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="14" FACE="Courier"><B>)CODE";
+  // Type
+  html += ArrayDataTypeName(array.data_type);
+  // Shape
   if (array.has_shape()) {
     auto& array_shape = array.shape();
-    node_properties.label += "\\n[";
-    for (int id = 0; id < array_shape.dimensions_count(); id++) {
-      if (id == 0) {
-        AppendF(&node_properties.label, "%d", array_shape.dims(id));
-      } else {
-        // 0x00D7 is the unicode multiplication symbol
-        AppendF(&node_properties.label, "\u00D7%d", array_shape.dims(id));
+    html += "[";
+    for (int dim = 0; dim < array_shape.dimensions_count(); dim++) {
+      AppendF(&html, "%d", array_shape.dims(dim));
+      if (dim + 1 < array_shape.dimensions_count()) {
+        html += kUnicodeMult;
       }
     }
-    node_properties.label += "]";
+    html += "]";
+  }
 
-    int buffer_size = 0;
-    if (IsNonEmpty(array.shape())) {
-      buffer_size = RequiredBufferSizeForShape(array.shape());
-      node_properties.log2_buffer_size =
-          std::log2(static_cast<float>(buffer_size));
+  // Small buffer sample
+  int buffer_size = 0;
+  if (array.buffer) {
+    buffer_size = RequiredBufferSizeForShape(array.shape());
+  }
+  if ((buffer_size > 0) && (buffer_size <= 4)) {
+    html += " = ";
+    if (array.shape().dimensions_count() > 0) {
+      html += "{";
     }
-
-    if (array.buffer) {
-      const auto& array = model.GetArray(array_name);
-      if (buffer_size <= 4) {
-        AppendF(&node_properties.label, " = ");
-        if (array.shape().dimensions_count() > 0) {
-          AppendF(&node_properties.label, "{");
-        }
-        for (int i = 0; i < buffer_size; i++) {
-          AppendArrayVal(&node_properties.label, array, i);
-          if (i + 1 < buffer_size) {
-            AppendF(&node_properties.label, ", ");
-          }
-        }
-      } else {
-        AppendF(&node_properties.label, "\\n = ");
-        if (array.shape().dimensions_count() > 0) {
-          AppendF(&node_properties.label, "{");
-        }
-        AppendArrayVal(&node_properties.label, array, 0);
-        AppendF(&node_properties.label, ", ");
-        AppendArrayVal(&node_properties.label, array, 1);
-        // 0x2026 is the unicode ellipsis symbol
-        AppendF(&node_properties.label, " \u2026 ");
-        AppendArrayVal(&node_properties.label, array, buffer_size - 2);
-        AppendF(&node_properties.label, ", ");
-        AppendArrayVal(&node_properties.label, array, buffer_size - 1);
-      }
-      if (array.shape().dimensions_count() > 0) {
-        AppendF(&node_properties.label, "}");
+    for (int i = 0; i < buffer_size; i++) {
+      AppendArrayVal(&html, array, i);
+      if (i + 1 < buffer_size) {
+        html += ", ";
       }
     }
+    if (array.shape().dimensions_count() > 0) {
+      html += "}";
+    }
+  }
+  html += R"CODE(</B></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Large buffer samples get their own line
+  if (buffer_size > 4) {
+    html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER"> = {)CODE";
+    AppendArrayVal(&html, array, 0);
+    html += ", ";
+    AppendArrayVal(&html, array, 1);
+    html += kUnicodeEllipsis;
+    AppendArrayVal(&html, array, buffer_size - 2);
+    html += ", ";
+    AppendArrayVal(&html, array, buffer_size - 1);
+    html += "}</TD></TR>";
   }
 
+  // Other array properties
+  Attributes attrs;
   if (array.minmax) {
-    AppendF(&node_properties.label, "\\nMinMax: [%.7g, %.7g]",
-            array.minmax->min, array.minmax->max);
+    attrs["minmax"] =
+        StringF("[%.7g, %.7g]", array.minmax->min, array.minmax->max);
   }
-
   if (array.quantization_params) {
-    AppendF(&node_properties.label, "\\nQuantization: %7g * (x - %d)",
-            array.quantization_params->scale,
-            array.quantization_params->zero_point);
+    attrs["quant"] = StringF("%7g\u00B7(x-%d)",  // Unicode "cdot"
+                             array.quantization_params->scale,
+                             array.quantization_params->zero_point);
   }
-
   if (array.alloc) {
-    AppendF(&node_properties.label, "\\nTransient Alloc: [%d, %d)",
-            array.alloc->start, array.alloc->end);
+    attrs["alloc"] = StringF("[%d, %d)", array.alloc->start, array.alloc->end);
   }
-
-  return node_properties;
+  html += AttributesToHtml(attrs);
+
+  // output array_id in ultra-small font so it can be searched and copied.
+  html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="3" FACE="">)CODE";
+  AppendF(&html, R"CODE("%s")CODE", array_id);
+  html += R"CODE(</FONT>)CODE";
+  html += "</TD></TR>";
+
+  // End Table and HTML-like label
+  html += R"CODE(</TABLE></FONT>)CODE";
+  html += ">";
+  return html;
 }
 
-NodeProperties GetPropertiesForOperator(const Operator& op) {
-  NodeProperties node_properties;
-  if (op.type == OperatorType::kUnsupported) {
-    node_properties.label =
-        static_cast<const TensorFlowUnsupportedOperator&>(op).tensorflow_op;
-  } else {
-    node_properties.label =
-        string(absl::StripPrefix(OperatorTypeName(op.type), "TensorFlow"));
-  }
+Attributes GetOpAttributes(const Model& model, const Operator& op) {
+  Attributes attrs;
   switch (op.fused_activation_function) {
     case FusedActivationFunctionType::kRelu:
-      AppendF(&node_properties.label, "\\nReLU");
+      attrs["func"] = "ReLU";
       break;
     case FusedActivationFunctionType::kRelu6:
-      AppendF(&node_properties.label, "\\nReLU6");
+      attrs["func"] = "ReLU6";
       break;
     case FusedActivationFunctionType::kRelu1:
-      AppendF(&node_properties.label, "\\nReLU1");
+      attrs["func"] = "ReLU1";
       break;
     default:
       break;
   }
-  // Additional information for some of the operators.
+  // Output state of member vars on derived operators.
   switch (op.type) {
     case OperatorType::kConv: {
       const auto& conv_op = static_cast<const ConvOperator&>(op);
-      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
-      AppendF(&node_properties.label, "\\n%dx%d/%s", conv_op.stride_width,
-              conv_op.stride_height,
-              conv_op.padding.type == PaddingType::kSame ? "S" : "V");
+      string stride;
+      AppendF(&stride, "%d", conv_op.stride_width);
+      stride += kUnicodeMult;
+      AppendF(&stride, "%d", conv_op.stride_height);
+      attrs["stride"] = stride;
+      attrs["padding"] =
+          (conv_op.padding.type == PaddingType::kSame) ? "same" : "valid";
       break;
     }
     case OperatorType::kDepthwiseConv: {
-      const auto& conv_op = static_cast<const DepthwiseConvOperator&>(op);
-      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
-      AppendF(&node_properties.label, "\\n%dx%d/%s", conv_op.stride_width,
-              conv_op.stride_height,
-              conv_op.padding.type == PaddingType::kSame ? "S" : "V");
-      break;
-    }
-    case OperatorType::kFullyConnected: {
-      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
+      const auto& depthconv_op = static_cast<const ConvOperator&>(op);
+      string stride;
+      AppendF(&stride, "%d", depthconv_op.stride_width);
+      stride += kUnicodeMult;
+      AppendF(&stride, "%d", depthconv_op.stride_height);
+      attrs["stride"] = stride;
+      attrs["padding"] =
+          (depthconv_op.padding.type == PaddingType::kSame) ? "same" : "valid";
       break;
     }
     case OperatorType::kFakeQuant: {
       const auto& fakequant_op = static_cast<const FakeQuantOperator&>(op);
-      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
+      attrs["bits"] = StringF("%d", fakequant_op.num_bits);
       if (fakequant_op.minmax) {
-        AppendF(&node_properties.label, "\\n%dbit [%g,%g]",
-                fakequant_op.num_bits, fakequant_op.minmax->min,
-                fakequant_op.minmax->max);
+        attrs["range"] = StringF("[%g,%g]", fakequant_op.minmax->min,
+                                 fakequant_op.minmax->max);
       } else {
-        AppendF(&node_properties.label, "\\n%dbit [?,?]",
-                fakequant_op.num_bits);
+        attrs["range"] = "[?,?]";
       }
       break;
     }
     default:
-      node_properties.color = Color(0xDB, 0x44, 0x37);
       break;
   }
+  int64 math_ops_count;
+  if (EstimateArithmeticOpsCount(model, op, &math_ops_count) &&
+      (math_ops_count != 0)) {
+    attrs["math"] = FormattedNumber(math_ops_count) + "ops";
+  }
 
-  return node_properties;
+  return attrs;
 }
 
-}  // namespace
+Color GetOpColor(const Operator& op) {
+  if ((op.type == OperatorType::kDepthwiseConv) ||
+      (op.type == OperatorType::kConv) ||
+      (op.type == OperatorType::kFullyConnected) ||
+      (op.type == OperatorType::kFakeQuant)) {
+    // Give some ops a bolder red
+    return Color(0xC5, 0x39, 0x29);
+  } else {
+    return Color(0xDB, 0x44, 0x37);
+  }
+}
+
+string GetOpLabel(const Model& model, const Operator& op) {
+  // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html)
+  string html;
+  html += "<";
+
+  // Begin Table
+  html += R"CODE(<FONT POINT-SIZE="10" FACE="Courier">)CODE";
+  html +=
+      R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0" CELLPADDING="0">)CODE";
+
+  // Input Ports
+  if (!op.inputs.empty()) {
+    html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+    // Distribute evenly using a sub-table
+    html += R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0">)CODE";
+    html += R"CODE(<TR>)CODE";
+    for (int i = 0; i < op.inputs.size(); i++) {
+      html += R"CODE(<TD PORT=")CODE";
+      AppendF(&html, "i%d", i);
+      html += R"CODE(">)CODE";
+      if (op.inputs.size() > 1) {
+        // Only number inputs when op has two or more inputs
+        AppendF(&html, "%d", i);
+      }
+      html += "</TD>";
+    }
+    html += "</TR>";
+    html += R"CODE(</TABLE></TD></TR>)CODE";
+  }
 
-void DumpGraphviz(const Model& model, string* output_file_contents) {
-  AppendF(output_file_contents, "digraph Computegraph {\n");
-  // 'nslimit' is a graphviz (dot) paramater that limits the iterations during
-  // the layout phase. Omitting it allows infinite iterations, causing some
-  // complex graphs to never finish. A value of 125 produces good graphs
-  // while allowing complex graphs to finish.
-  AppendF(output_file_contents, "\t nslimit=125;\n");
-
-  constexpr char kNodeFormat[] =
-      "\t \"%s\" [label=\"%s\", shape=%s, style=filled, fillcolor=\"#%s\", "
-      "fontcolor = \"#%sDD\"];\n";
-
-  constexpr char kEdgeFormat[] =
-      "\t \"%s\" -> \"%s\" [penwidth=%f, weight=%f];\n";
-
-  constexpr char kRNNBackEdgeFormat[] =
-      "\t \"%s\" -> \"%s\" [color=\"#0F9D58\"];\n";
-
-  for (const auto& array_kv : model.GetArrayMap()) {
-    // Add node for array.
-    const string& array_name = array_kv.first;
-    const auto& array_properties = GetPropertiesForArray(model, array_name);
-    AppendF(output_file_contents, kNodeFormat, array_name,
-            array_properties.label, "octagon",
-            array_properties.color.FillColorString().c_str(),
-            array_properties.color.TextColorString().c_str());
+  // Name
+  html += R"CODE(<TR><TD COLSPAN="2" CELLPADDING="3" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="16" FACE="Helvetica"><B>)CODE";
+  if (op.type == OperatorType::kUnsupported) {
+    html += static_cast<const TensorFlowUnsupportedOperator&>(op).tensorflow_op;
+  } else {
+    html += string(absl::StripPrefix(OperatorTypeName(op.type), "TensorFlow"));
+  }
+  html += R"CODE(</B></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Attributes
+  Attributes attrs = GetOpAttributes(model, op);
+  html += AttributesToHtml(attrs);
+
+  // Output Ports
+  if (!op.outputs.empty()) {
+    html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+    // Distribute evenly using a sub-table
+    html += R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0">)CODE";
+    html += R"CODE(<TR>)CODE";
+    for (int i = 0; i < op.outputs.size(); i++) {
+      html += R"CODE(<TD PORT=")CODE";
+      AppendF(&html, "o%d", i);
+      html += R"CODE(">)CODE";
+      if (op.outputs.size() > 1) {
+        // Only number outputs when op has two or more outputs
+        AppendF(&html, "%d", i);
+      }
+      html += "</TD>";
+    }
+    html += "</TR>";
+    html += R"CODE(</TABLE></TD></TR>)CODE";
   }
+
+  // End Table and HTML-like label
+  html += R"CODE(</TABLE></FONT>)CODE";
+  html += ">";
+
+  return html;
+}
+
+float GetLog2BufferSize(const Model& model, const string& array_id) {
+  auto& array = model.GetArray(array_id);
+  if (array.has_shape()) {
+    int buffer_size = 0;
+    if (IsNonEmpty(array.shape())) {
+      buffer_size = RequiredBufferSizeForShape(array.shape());
+      return std::log2(static_cast<float>(buffer_size));
+    }
+  }
+  return 0.0f;
+}
+
+string GetOpId(int op_index) { return StringF("op%05d", op_index); }
+
+void DumpOperator(const Model& model, string* output_file, int op_index) {
+  // Dump node for operator.
+  const Operator& op = *model.operators[op_index];
+  Color color = GetOpColor(op);
+  string label = GetOpLabel(model, op);
+  string op_id = GetOpId(op_index);
+  AppendF(output_file, kOpNodeFmt, op_id, label, color.AsHexString(),
+          color.TextColorString());
+}
+
+void DumpOperatorEdges(const Model& model, string* output_file, int op_index) {
+  // Inputs
+  const Operator& op = *model.operators[op_index];
+  string op_id = GetOpId(op_index);
+  for (int i = 0; i < op.inputs.size(); i++) {
+    const auto& input = op.inputs[i];
+    if (!model.HasArray(input)) {
+      // Connected arrays should _always_ exist. Except, perhaps, during
+      // development.
+      continue;
+    }
+    float log2_buffer_size = GetLog2BufferSize(model, input);
+    // Draw lines that transport more data thicker (Otherwise, where would the
+    // data fit? right?).
+    float line_width = std::max(0.5f, log2_buffer_size / 3.0f);
+    // Keep edges that transport more data shorter than those with less.
+    float weight = std::max(1.0f, log2_buffer_size);
+    if (!IsInputArray(model, input) &&
+        GetOpWithOutput(model, input) == nullptr) {
+      // Give the main line of data flow a straighter path by penalizing edges
+      // to standalone buffers. Weights are generally very large buffers that
+      // would otherwise skew the layout.
+      weight = 1.0f;
+    }
+    string compass_pt = GetArrayCompassPt(model, input);
+    AppendF(output_file, kInputEdgeFmt, input, compass_pt, op_id, i, line_width,
+            weight);
+  }
+  // Outputs
+  for (int i = 0; i < op.outputs.size(); i++) {
+    const auto& output = op.outputs[i];
+    if (!model.HasArray(output)) {
+      continue;
+    }
+    float log2_buffer_size = GetLog2BufferSize(model, output);
+    // See comments above regarding weight and line_width calculations.
+    float line_width = std::max(0.5f, log2_buffer_size / 3.0f);
+    float weight = std::max(1.0f, log2_buffer_size);
+    if (!IsArrayConsumed(model, output)) {
+      weight = 1.0f;
+    }
+    string compass_pt = GetArrayCompassPt(model, output);
+    AppendF(output_file, kOutputEdgeFmt, op_id, i, output, compass_pt,
+            line_width, weight);
+  }
+}
+
+struct Node {
+  Node() : math_ops(0) {}
+  // Name used as a key in the model's array map
+  string array_id;
+
+  // Estimated number of math ops incurred by this node (the sum of the op
+  // with this array as 1st output, plus all children nodes).
+  int64 math_ops;
+
+  // A map of child nodes keyed by name.
+  std::map<const string, std::unique_ptr<Node>> children;
+};
+
+string GetSubgraphLabel(Node const& node, const string& subgraph) {
+  // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html)
+  string html;
+  html += "<";
+
+  // Begin Table
+  html += R"CODE(<FONT POINT-SIZE="12" FACE="Courier">)CODE";
+  html +=
+      R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0" CELLPADDING="0">)CODE";
+
+  // Name
+  html += R"CODE(<TR><TD COLSPAN="2" CELLPADDING="3" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="18" FACE="Helvetica"><I>)CODE";
+  html += subgraph;
+  html += R"CODE(</I></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Attributes
+  Attributes attrs;
+  if (node.math_ops > 0) {
+    attrs["math"] = FormattedNumber(node.math_ops) + "ops";
+  }
+  html += AttributesToHtml(attrs);
+
+  // End Table and HTML-like label
+  html += R"CODE(</TABLE></FONT>)CODE";
+  html += ">";
+
+  return html;
+}
+
+void DumpSubgraphHeader(string* output_file, Node const& node,
+                        const string& node_name) {
+  Color color = HashStringToColor(node_name);
+  string label = GetSubgraphLabel(node, node_name);
+  AppendF(output_file, kSubgraphFmt, node_name, color.AsHexString(), label);
+}
+
+void DumpArray(const Model& model, string* output_file,
+               const string& array_id) {
+  Color color;
+  string shape;
+  GetArrayColorAndShape(model, array_id, &color, &shape);
+  string label = GetArrayLabel(model, array_id);
+  AppendF(output_file, kArrayNodeFmt, array_id, label, array_id, shape,
+          color.AsHexString(), color.TextColorString());
+
+  // Ops are placed in the same subgraph as their first output.
   for (int op_index = 0; op_index < model.operators.size(); op_index++) {
     const Operator& op = *model.operators[op_index];
-    // Add node for operator.
-    auto op_properties = GetPropertiesForOperator(op);
-    string operator_id = StringF("op%05d", op_index);
-    AppendF(output_file_contents, kNodeFormat, operator_id, op_properties.label,
-            "box", op_properties.color.FillColorString().c_str(),
-            op_properties.color.TextColorString().c_str());
-    // Add edges for all inputs of the operator.
-    for (const auto& input : op.inputs) {
-      if (!model.HasArray(input)) {
-        // Arrays should _always_ exist. Except, perhaps, during development.
-        continue;
-      }
-      auto array_properties = GetPropertiesForArray(model, input);
-      // Draw lines that transport more data thicker (Otherwise, where would the
-      // data fit? right?).
-      float line_width =
-          std::max(0.5f, array_properties.log2_buffer_size / 3.0f);
-      // Keep edges that transport more data shorter than those with less.
-      float weight = std::max(1.0f, array_properties.log2_buffer_size);
-      if (!IsInputArray(model, input) &&
-          GetOpWithOutput(model, input) == nullptr) {
-        // Give the main line of data flow a straighter path by penalizing edges
-        // to standalone buffers. Weights are generally very large buffers that
-        // otherwise skew the layout without this.
-        weight = 1.0f;
-      }
-      AppendF(output_file_contents, kEdgeFormat, input, operator_id, line_width,
-              weight);
+    if (!op.outputs.empty() && (op.outputs[0] == array_id)) {
+      DumpOperator(model, output_file, op_index);
     }
-    // Add edges for all outputs of the operator.
-    for (const auto& output : op.outputs) {
-      if (!model.HasArray(output)) {
-        // Arrays should _always_ exist. Except, perhaps, during development.
-        continue;
-      }
-      auto array_properties = GetPropertiesForArray(model, output);
-      // See comments above regarding weight and line_width calculations.
-      float line_width =
-          std::max(0.5f, array_properties.log2_buffer_size / 3.0f);
-      float weight = std::max(1.0f, array_properties.log2_buffer_size);
-      if (!IsArrayConsumed(model, output)) {
-        weight = 1.0f;
+  }
+}
+
+void DumpNode(const Model& model, string* output_file, const string& node_name,
+              Node const& node) {
+  bool not_root = !node_name.empty();
+  if (not_root) {
+    DumpSubgraphHeader(output_file, node, node_name);
+  }
+
+  for (const auto& child : node.children) {
+    if (!child.second->array_id.empty()) {
+      // Dump array if this node posesses one.
+      DumpArray(model, output_file, child.second->array_id);
+    }
+    // Note that it is always possible to have children. Unlike a filesystem,
+    // the existence of array "foo/bar" does _not_ prevent other arrays, such as
+    // and "foo/bar/baz", from being nested beneath it.
+    DumpNode(model, output_file, child.first, *child.second);
+  }
+
+  if (not_root) {
+    // End subgraph
+    AppendF(output_file, "    }\n");
+  }
+}
+
+int64 GetArithmeticOpsCount(const Model& model, const string& array_id) {
+  for (const auto& op : model.operators) {
+    if (!op->outputs.empty() && op->outputs[0] == array_id) {
+      int64 count;
+      if (EstimateArithmeticOpsCount(model, *op, &count)) {
+        return count;
+      } else {
+        return 0;
       }
-      AppendF(output_file_contents, kEdgeFormat, operator_id, output,
-              line_width, weight);
     }
   }
+  return 0;
+}
 
-  for (const auto& rnn_state : model.flags.rnn_states()) {
-    AppendF(output_file_contents, kRNNBackEdgeFormat,
-            rnn_state.back_edge_source_array(), rnn_state.state_array());
+void InsertNode(const Model& model, const string& array_id, Node* node,
+                std::vector<string> prefixes, int64* math_ops) {
+  if (prefixes.empty()) {
+    // Base case: store array in this node.
+    node->array_id = array_id;
+    *math_ops = GetArithmeticOpsCount(model, array_id);
+  } else {
+    // Insert into the sub-tree for that prefix.
+    string prefix = prefixes.back();
+    prefixes.pop_back();
+    if (node->children.count(prefix) == 0) {
+      // Create a new node if this prefix is unseen.
+      node->children[prefix] = absl::make_unique<Node>();
+    }
+    InsertNode(model, array_id, node->children[prefix].get(), prefixes,
+               math_ops);
   }
+  // Sum estimated math ops into all nodes.
+  node->math_ops += *math_ops;
+}
 
-  AppendF(output_file_contents, "}\n");
+void BuildArrayTree(const Model& model, Node* tree) {
+  // Delimit array names by path "/", then place into a tree based on this path.
+  for (const auto& array_id : model.GetArrayMap()) {
+    std::vector<string> prefixes = absl::StrSplit(array_id.first, '/');
+    std::reverse(prefixes.begin(), prefixes.end());
+    int64 math_ops;  // Temporary storage for math ops used during recursion.
+    InsertNode(model, array_id.first, tree, prefixes, &math_ops);
+  }
+}
+
+string GetGraphLabel(const Model& model, const string& graph_name) {
+  // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html)
+  string html;
+  html += "<";
+
+  // Begin Table
+  html += R"CODE(<FONT POINT-SIZE="36" FACE="Courier">)CODE";
+  html +=
+      R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0" CELLPADDING="0">)CODE";
+
+  // Name
+  html += R"CODE(<TR><TD COLSPAN="2" CELLPADDING="3" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="64" FACE="Helvetica"><B><I>)CODE";
+  html += graph_name;
+  html += R"CODE(</I></B></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Attributes
+  Attributes attrs;
+  attrs["arrays"] = StringF("%d", model.GetArrayMap().size());
+  if (!model.optional_arrays.empty()) {
+    attrs["optional arrays"] = StringF("%d", model.optional_arrays.size());
+  }
+  attrs["operators"] = StringF("%d", model.operators.size());
+  int64 ops_count;
+  if (EstimateArithmeticOpsCount(model, &ops_count) && (ops_count > 0)) {
+    attrs["math"] = FormattedNumber(ops_count) + "ops";
+  }
+  if (model.transient_data_size > 0) {
+    attrs["transient data size"] =
+        StringF("%d KiB", model.transient_data_size / 1024);
+  }
+  if (model.transient_data_alignment > 0) {
+    attrs["transient data alignment"] =
+        StringF("%d bytes", model.transient_data_alignment);
+  }
+  html += AttributesToHtml(attrs);
+
+  // End Table and HTML-like label
+  html += R"CODE(</TABLE></FONT>)CODE";
+  html += ">";
+
+  return html;
+}
+}  // namespace
+
+void DumpGraphviz(const Model& model, string* output_file,
+                  const string& graph_name) {
+  // Start graphviz format
+  AppendF(output_file, kGraphFmt, GetGraphLabel(model, graph_name));
+
+  // Organize arrays into a tree for subgraphing
+  Node tree;
+  BuildArrayTree(model, &tree);
+  DumpNode(model, output_file, "", tree);
+
+  // Dump edges outside all subgraphs (otherwise the referred-to nodes are
+  // implicitly included in that subgraph).
+  for (int op_index = 0; op_index < model.operators.size(); op_index++) {
+    DumpOperatorEdges(model, output_file, op_index);
+  }
+
+  // Dump RNN Backedges
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    AppendF(output_file, kRNNBackEdgeFmt, rnn_state.back_edge_source_array(),
+            rnn_state.state_array());
+  }
+  // End graphviz format
+  AppendF(output_file, "}\n");
 }
 }  // namespace toco
diff --git a/tensorflow/lite/toco/dump_graphviz.h b/tensorflow/lite/toco/dump_graphviz.h
index 9697bd6f0dc434aaf98762698c64fb60cb97f2ee..9bb74dac3f8fb34fb2a440e499c4ed0066ffea4d 100644
--- a/tensorflow/lite/toco/dump_graphviz.h
+++ b/tensorflow/lite/toco/dump_graphviz.h
@@ -21,7 +21,8 @@ limitations under the License.
 
 namespace toco {
 
-void DumpGraphviz(const Model& model, string* output_file_contents);
+void DumpGraphviz(const Model& model, string* output_file_contents,
+                  const string& graph_name);
 
 }  // namespace toco
 
diff --git a/tensorflow/lite/toco/export_tensorflow.cc b/tensorflow/lite/toco/export_tensorflow.cc
index 50a30f51621c15766a1a2bb58628055ca07baecb..c2952c7dd1abc4d87b603db03b4d4ea4318cf870 100644
--- a/tensorflow/lite/toco/export_tensorflow.cc
+++ b/tensorflow/lite/toco/export_tensorflow.cc
@@ -1305,7 +1305,8 @@ void ConvertTensorFlowShapeOperator(const Model& model,
       GetTensorFlowDataType(model, src_op.outputs[0]));
 }
 
-void ConvertRankOperator(const Model& model, const RankOperator& src_op,
+void ConvertRankOperator(const Model& model,
+                         const TensorFlowRankOperator& src_op,
                          GraphDef* tensorflow_graph) {
   tensorflow::NodeDef* rank_op = tensorflow_graph->add_node();
   rank_op->set_op("Rank");
@@ -2274,7 +2275,8 @@ void ConvertOperator(const Model& model, const Operator& src_op,
         model, static_cast<const TensorFlowShapeOperator&>(src_op),
         tensorflow_graph);
   } else if (src_op.type == OperatorType::kRank) {
-    ConvertRankOperator(model, static_cast<const RankOperator&>(src_op),
+    ConvertRankOperator(model,
+                        static_cast<const TensorFlowRankOperator&>(src_op),
                         tensorflow_graph);
   } else if (src_op.type == OperatorType::kRange) {
     ConvertRangeOperator(model, static_cast<const RangeOperator&>(src_op),
diff --git a/tensorflow/lite/toco/graph_transformations/graph_transformations.cc b/tensorflow/lite/toco/graph_transformations/graph_transformations.cc
index a0260e24013bfda8718e0dc04052abb49b65debf..e4eb7698597f588947bc19f5ab449c9d3ff14adc 100644
--- a/tensorflow/lite/toco/graph_transformations/graph_transformations.cc
+++ b/tensorflow/lite/toco/graph_transformations/graph_transformations.cc
@@ -128,7 +128,8 @@ void DiscardUselessConnectedComponentsAndRNNBackEdges(Model* model) {
 }
 
 bool GraphTransformationsPass(int increment, Model* model,
-                              const GraphTransformationsSet& transformations) {
+                              const GraphTransformationsSet& transformations,
+                              tensorflow::Status* status) {
   CHECK(increment == 1 || increment == -1);
   bool changed = false;
   if (model->operators.empty()) {
@@ -142,7 +143,10 @@ bool GraphTransformationsPass(int increment, Model* model,
     for (const auto& transformation : transformations) {
       CHECK(!changed_now);
       CHECK(transformation->Messages().empty());
-      CHECK(transformation->Run(model, op_index, &changed_now).ok());
+      *status = transformation->Run(model, op_index, &changed_now);
+      if (!status->ok()) {
+        return false;
+      }
       const char* made_a_change_msg =
           changed_now ? "made a change" : "did NOT make a change";
       const int log_level =
@@ -186,18 +190,21 @@ bool GraphTransformationsPass(int increment, Model* model,
 
 }  // namespace
 
-void RunGraphTransformations(Model* model, const string& msg,
-                             const GraphTransformationsSet& transformations) {
+tensorflow::Status RunGraphTransformationsWithStatus(
+    Model* model, const string& msg,
+    const GraphTransformationsSet& transformations) {
   PrintModelStats(toco::port::StringF("Before %s", msg), *model);
   int pass_index = 0;
+  tensorflow::Status status;
   while (GraphTransformationsPass((pass_index % 2) ? -1 : 1, model,
-                                  transformations)) {
+                                  transformations, &status)) {
     pass_index++;
     const auto& label =
         toco::port::StringF("After %s pass %d", msg, pass_index);
     PrintModelStats(label, *model);
     CheckInvariants(*model);
   }
+  return status;
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/lite/toco/graph_transformations/graph_transformations.h
index 4008bbdb4d3a4a9051de70ef63d73d74fd44281c..d92733ba3b5490b0b77e88e3beb1bbe9d4508a3a 100644
--- a/tensorflow/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/lite/toco/graph_transformations/graph_transformations.h
@@ -102,8 +102,16 @@ class GraphTransformationsSet {
 // construct GraphTransformation objects by using 'new', pass us
 // the resulting raw pointers, and this RunGraphTransformations
 // takes care of delete'ing these pointers.
-void RunGraphTransformations(Model* model, const string& message,
-                             const GraphTransformationsSet& transformations);
+tensorflow::Status RunGraphTransformationsWithStatus(
+    Model* model, const string& msg,
+    const GraphTransformationsSet& transformations);
+
+inline void RunGraphTransformations(
+    Model* model, const string& msg,
+    const GraphTransformationsSet& transformations) {
+  auto s = RunGraphTransformationsWithStatus(model, msg, transformations);
+  CHECK(s.ok()) << s.error_message();
+}
 
 #define DECLARE_GRAPH_TRANSFORMATION(GTName)                     \
   class GTName : public GraphTransformation {                    \
@@ -129,6 +137,8 @@ DECLARE_GRAPH_TRANSFORMATION(FuseBinaryIntoPrecedingAffine)
 DECLARE_GRAPH_TRANSFORMATION(FuseBroadcastIntoFollowingBinary)
 DECLARE_GRAPH_TRANSFORMATION(GroupBidirectionalSequenceLstm)
 DECLARE_GRAPH_TRANSFORMATION(GroupBidirectionalSequenceRnn)
+DECLARE_GRAPH_TRANSFORMATION(GroupDynamicBidirectionalSequenceLstm)
+DECLARE_GRAPH_TRANSFORMATION(GroupDynamicBidirectionalSequenceRnn)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Normalization)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Pool)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyLstmCell)
diff --git a/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc b/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc
index 10e817b16559ad551f6eb366f67ff1edcd486ff2..05c85b33d2a2c947eac7ada775d520bd8d3a6641 100644
--- a/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc
+++ b/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc
@@ -56,6 +56,29 @@ bool MatchTwoUnpackOps(const Operator& op, const Model& model,
   return true;
 }
 
+bool MatchDynamicBidirectionalSequenceOutputs(Operator* op, const Model& model,
+                                              Operator** fw_output,
+                                              Operator** bw_output) {
+  if (op->inputs.size() != 2) {
+    return false;
+  }
+
+  // The concat op is already the fw_rnn_output.
+  *fw_output = op;
+  auto* reverse_output = GetOpWithOutput(model, op->inputs[1]);
+  if (*fw_output == nullptr || reverse_output == nullptr) {
+    return false;
+  }
+
+  if (reverse_output->type != OperatorType::kReverseV2) {
+    return false;
+  }
+
+  *bw_output = reverse_output;
+
+  return true;
+}
+
 bool FindUnidirectionalSequenceOp(const Model& model, const Operator& output_op,
                                   OperatorType operator_type,
                                   std::stack<Operator*>* sequence_ops,
@@ -68,7 +91,7 @@ bool FindUnidirectionalSequenceOp(const Model& model, const Operator& output_op,
 
   while (op_it->type == operator_type) {
     sequence_ops->push(op_it);
-    // Check the first input of the unidirectional squence lstm op.
+    // Check the first input of the unidirectional sequence op.
     op_it = GetOpWithOutput(model, op_it->inputs[0]);
     if (op_it == nullptr) {
       return false;
@@ -80,34 +103,51 @@ bool FindUnidirectionalSequenceOp(const Model& model, const Operator& output_op,
 }
 
 bool CheckTwoUnidirectionalSequenceOpsAreValid(
+    const Model& model,
     const std::stack<Operator*>& fw_unidirectional_sequence_ops,
     const std::stack<Operator*>& bw_unidirectional_sequence_ops,
     const Operator* first_fw_sequence_op_input,
-    const Operator* first_bw_sequence_op_input) {
+    const Operator* first_bw_sequence_op_input, bool is_dynamic_rnn) {
   if (fw_unidirectional_sequence_ops.size() !=
           bw_unidirectional_sequence_ops.size() ||
       fw_unidirectional_sequence_ops.empty()) {
     return false;
   }
 
-  // For static bidirectional sequence lstm, we should have two pack ops.
-  if (first_fw_sequence_op_input->type != OperatorType::kPack ||
-      first_bw_sequence_op_input->type != OperatorType::kPack) {
-    return false;
-  }
+  if (is_dynamic_rnn) {
+    // For dynamic bidirectional sequence ops, bw_sequence will have a reverse
+    // op.
+    if (first_bw_sequence_op_input->type != OperatorType::kReverseV2) {
+      return false;
+    }
 
-  // fw_lstm & bw_lstm should point to the same input, but reversed sequence.
-  for (int i = 0; i < first_fw_sequence_op_input->inputs.size(); ++i) {
-    if (first_fw_sequence_op_input->inputs[i] !=
-        first_bw_sequence_op_input
-            ->inputs[first_fw_sequence_op_input->inputs.size() - i - 1]) {
+    const auto* bw_real_input_op =
+        GetOpWithOutput(model, first_bw_sequence_op_input->inputs[0]);
+    if (first_fw_sequence_op_input != bw_real_input_op) {
       return false;
     }
+
+  } else {
+    // For static bidirectional sequence ops, we should have two pack ops.
+    if (first_fw_sequence_op_input->type != OperatorType::kPack ||
+        first_bw_sequence_op_input->type != OperatorType::kPack) {
+      return false;
+    }
+
+    // fw_lstm & bw_lstm should point to the same input, but reversed sequence.
+    for (int i = 0; i < first_fw_sequence_op_input->inputs.size(); ++i) {
+      if (first_fw_sequence_op_input->inputs[i] !=
+          first_bw_sequence_op_input
+              ->inputs[first_fw_sequence_op_input->inputs.size() - i - 1]) {
+        return false;
+      }
+    }
   }
+
   return true;
 }
 
-bool ConstructBidirectionalSequenceOp(
+void ConstructBidirectionalSequenceOp(
     const Operator& fw_lstm_op, const Operator& bw_lstm_op, Model* model,
     BidirectionalSequenceLstmOperator** bi_op) {
   // TODO(renjieliu): Check the shapes & configurations are equal.
@@ -170,10 +210,9 @@ bool ConstructBidirectionalSequenceOp(
   (*bi_op)->outputs.push_back(fw_output_array_name);
   (*bi_op)->outputs.push_back(bw_output_array_name);
   (*bi_op)->merge_outputs = false;
-  return true;
 }
 
-bool ConstructBidirectionalSequenceOp(
+void ConstructBidirectionalSequenceOp(
     const Operator& fw_rnn_op, const Operator& bw_rnn_op, Model* model,
     BidirectionalSequenceRnnOperator** bi_op) {
   // TODO(renjieliu): Check the shapes & configurations are equal.
@@ -219,27 +258,23 @@ bool ConstructBidirectionalSequenceOp(
   (*bi_op)->outputs.push_back(fw_output_array_name);
   (*bi_op)->outputs.push_back(bw_output_array_name);
   (*bi_op)->merge_outputs = false;
-  return true;
 }
 
 template <typename T>
-bool GroupFwBwSequenceOps(Model* model, std::stack<Operator*> fw_sequence_ops,
+void GroupFwBwSequenceOps(Model* model, std::stack<Operator*> fw_sequence_ops,
                           std::stack<Operator*> bw_sequence_ops,
                           std::vector<T*>* bidirectional_sequence_ops) {
   while (!fw_sequence_ops.empty()) {
     Operator* fw_sequence_op = fw_sequence_ops.top();
     Operator* bw_sequence_op = bw_sequence_ops.top();
     T* bidirectional_sequence_op = new T;
-    if (!ConstructBidirectionalSequenceOp(*fw_sequence_op, *bw_sequence_op,
-                                          model, &bidirectional_sequence_op)) {
-      return false;
-    }
+    ConstructBidirectionalSequenceOp(*fw_sequence_op, *bw_sequence_op, model,
+                                     &bidirectional_sequence_op);
 
     bidirectional_sequence_ops->push_back(bidirectional_sequence_op);
     fw_sequence_ops.pop();
     bw_sequence_ops.pop();
   }
-  return true;
 }
 
 template <typename T>
@@ -331,16 +366,96 @@ void RemoveUnidirectionalSequenceOps(std::stack<Operator*> uni_sequence_ops,
   }
 }
 
+template <typename T>
+::tensorflow::Status GroupDynamicSequenceOps(Model* model, std::size_t op_index,
+                                             OperatorType operator_type,
+                                             bool* modified) {
+  *modified = false;
+
+  // We assume there's a concatenation right after the bidirectional sequence
+  // ops, it may not be the case.
+  auto op_it = model->operators.begin() + op_index;
+  Operator* final_concat_op = op_it->get();
+  if (final_concat_op->type != OperatorType::kConcatenation &&
+      final_concat_op->type != OperatorType::kConcat &&
+      final_concat_op->type != OperatorType::kConcatV2) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // for bw, there will be a reverse op at the end.
+  Operator *fw_sequence_output, *bw_sequence_output;
+  if (!MatchDynamicBidirectionalSequenceOutputs(
+          final_concat_op, *model, &fw_sequence_output, &bw_sequence_output)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Find all upstream unidirectional sequence ops.
+  std::stack<Operator*> fw_unidirectional_sequence_ops,
+      bw_unidirectional_sequence_ops;
+  OperatorType unidirectional_op_type;
+  if (operator_type == OperatorType::kBidirectionalSequenceLstm) {
+    unidirectional_op_type = OperatorType::kUnidirectionalSequenceLstm;
+  } else {
+    unidirectional_op_type = OperatorType::kUnidirectionalSequenceRnn;
+  }
+  Operator *first_fw_sequence_input, *first_bw_sequence_input;
+  if (!FindUnidirectionalSequenceOp(
+          *model, *fw_sequence_output, unidirectional_op_type,
+          &fw_unidirectional_sequence_ops, &first_fw_sequence_input) ||
+      !FindUnidirectionalSequenceOp(
+          *model, *bw_sequence_output, unidirectional_op_type,
+          &bw_unidirectional_sequence_ops, &first_bw_sequence_input)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  if (!CheckTwoUnidirectionalSequenceOpsAreValid(
+          *model, fw_unidirectional_sequence_ops,
+          bw_unidirectional_sequence_ops, first_fw_sequence_input,
+          first_bw_sequence_input, /*is_dynamic_rnn=*/true)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // TODO(b/125143808): Before really group the fw & bw sequence ops and
+  // modified the model, we should check both the fw & bw sequence ops have the
+  // same data_type, inputs_shapes, output_shapes etc.
+  std::vector<T> bidirectional_sequence_ops;
+  GroupFwBwSequenceOps(model, fw_unidirectional_sequence_ops,
+                       bw_unidirectional_sequence_ops,
+                       &bidirectional_sequence_ops);
+
+  // Rewire the inputs & outputs.
+  string current_input = first_fw_sequence_input->outputs[0];
+  RewireBidirectionalSequenceSequenceOpsConnections(
+      operator_type, current_input, bidirectional_sequence_ops, &op_it, model);
+
+  // Change last bidirectional sequence rnn output to the concat output.
+  bidirectional_sequence_ops[bidirectional_sequence_ops.size() - 1]
+      ->outputs[0] = final_concat_op->outputs[0];
+
+  // Delete unused ops.
+  RemoveUnidirectionalSequenceOps(fw_unidirectional_sequence_ops, model);
+  RemoveUnidirectionalSequenceOps(bw_unidirectional_sequence_ops, model);
+
+  DeleteArrayIfUnused(final_concat_op->inputs[0], model);
+  DeleteArrayIfUnused(final_concat_op->inputs[1], model);
+  model->operators.erase(FindOperator(model, *final_concat_op));
+
+  // Only keep the fw lstm's input.
+  DeleteArrayIfUnused(first_bw_sequence_input->outputs[0], model);
+  model->operators.erase(FindOperator(model, *first_bw_sequence_input));
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
 }  // namespace
 
-// TODO(renjieliu): Support graph generated by dynamic rnn as well.
 ::tensorflow::Status GroupBidirectionalSequenceLstm::Run(Model* model,
                                                          std::size_t op_index,
                                                          bool* modified) {
   *modified = false;
   // Bidirectional sequence lstm will generate two separate unidirectional
   // sequence lstm ops, for static bidirectional sequence lstm, there will be
-  // a concatenation op at very end; for dynamic bidirectional squence lstm,
+  // a concatenation op at very end; for dynamic bidirectional sequence lstm,
   // it is not guaranteed, but currently we do not support that.
   auto op_it = model->operators.begin() + op_index;
   Operator* final_concat_op = op_it->get();
@@ -372,19 +487,20 @@ void RemoveUnidirectionalSequenceOps(std::stack<Operator*> uni_sequence_ops,
   }
 
   if (!CheckTwoUnidirectionalSequenceOpsAreValid(
-          fw_unidirectional_sequence_lstm_ops,
+          *model, fw_unidirectional_sequence_lstm_ops,
           bw_unidirectional_sequence_lstm_ops, first_fw_lstm_input,
-          first_bw_lstm_input)) {
+          first_bw_lstm_input, /*is_dynamic_rnn=*/false)) {
     return ::tensorflow::Status::OK();
   }
 
+  // TODO(b/125143808): Before really group the fw & bw sequence ops and
+  // modified the model, we should check both the fw & bw sequence ops have the
+  // same data_type, inputs_shapes, output_shapes etc.
   std::vector<BidirectionalSequenceLstmOperator*>
       bidirectional_sequence_lstm_ops;
-  if (!GroupFwBwSequenceOps(model, fw_unidirectional_sequence_lstm_ops,
-                            bw_unidirectional_sequence_lstm_ops,
-                            &bidirectional_sequence_lstm_ops)) {
-    return ::tensorflow::Status::OK();
-  }
+  GroupFwBwSequenceOps(model, fw_unidirectional_sequence_lstm_ops,
+                       bw_unidirectional_sequence_lstm_ops,
+                       &bidirectional_sequence_lstm_ops);
 
   // Rewire the inputs & outputs.
   string current_input = first_fw_lstm_input->outputs[0];
@@ -414,14 +530,13 @@ void RemoveUnidirectionalSequenceOps(std::stack<Operator*> uni_sequence_ops,
   return ::tensorflow::Status::OK();
 }
 
-// TODO(renjieliu): Support graph generated by dynamic rnn as well.
 ::tensorflow::Status GroupBidirectionalSequenceRnn::Run(Model* model,
                                                         std::size_t op_index,
                                                         bool* modified) {
   *modified = false;
   // Bidirectional sequence rnn will generate two separate unidirectional
   // sequence rnn ops, for static bidirectional sequence rnn, there will be
-  // a concatenation op at very end; for dynamic bidirectional squence rnn,
+  // a concatenation op at very end; for dynamic bidirectional sequence rnn,
   // it is not guaranteed, but currently we do not support that.
   auto op_it = model->operators.begin() + op_index;
   Operator* final_concat_op = op_it->get();
@@ -453,18 +568,19 @@ void RemoveUnidirectionalSequenceOps(std::stack<Operator*> uni_sequence_ops,
   }
 
   if (!CheckTwoUnidirectionalSequenceOpsAreValid(
-          fw_unidirectional_sequence_rnn_ops,
+          *model, fw_unidirectional_sequence_rnn_ops,
           bw_unidirectional_sequence_rnn_ops, first_fw_rnn_input,
-          first_bw_rnn_input)) {
+          first_bw_rnn_input, /*is_dynamic_rnn=*/false)) {
     return ::tensorflow::Status::OK();
   }
 
+  // TODO(b/125143808): Before really group the fw & bw sequence ops and
+  // modified the model, we should check both the fw & bw sequence ops have the
+  // same data_type, inputs_shapes, output_shapes etc.
   std::vector<BidirectionalSequenceRnnOperator*> bidirectional_sequence_rnn_ops;
-  if (!GroupFwBwSequenceOps(model, fw_unidirectional_sequence_rnn_ops,
-                            bw_unidirectional_sequence_rnn_ops,
-                            &bidirectional_sequence_rnn_ops)) {
-    return ::tensorflow::Status::OK();
-  }
+  GroupFwBwSequenceOps(model, fw_unidirectional_sequence_rnn_ops,
+                       bw_unidirectional_sequence_rnn_ops,
+                       &bidirectional_sequence_rnn_ops);
 
   // Rewire the inputs & outputs.
   string current_input = first_fw_rnn_input->outputs[0];
@@ -492,4 +608,17 @@ void RemoveUnidirectionalSequenceOps(std::stack<Operator*> uni_sequence_ops,
   *modified = true;
   return ::tensorflow::Status::OK();
 }
+
+::tensorflow::Status GroupDynamicBidirectionalSequenceRnn::Run(
+    Model* model, std::size_t op_index, bool* modified) {
+  return GroupDynamicSequenceOps<BidirectionalSequenceRnnOperator*>(
+      model, op_index, OperatorType::kBidirectionalSequenceRnn, modified);
+}
+
+::tensorflow::Status GroupDynamicBidirectionalSequenceLstm::Run(
+    Model* model, std::size_t op_index, bool* modified) {
+  return GroupDynamicSequenceOps<BidirectionalSequenceLstmOperator*>(
+      model, op_index, OperatorType::kBidirectionalSequenceLstm, modified);
+}
+
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 329ef92fe9b7046f5d5eece45b5204ce1b025c2b..1ea4b298834b66c616a4b1a14d57a55481283ebb 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1400,6 +1400,38 @@ void ProcessGatherOperator(Model* model, GatherOperator* op) {
   }
 }
 
+void ProcessGatherNdOperator(Model* model, GatherNdOperator* op) {
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  const auto& indices_array = model->GetArray(op->inputs[1]);
+  auto& output_array = model->GetArray(op->outputs[0]);
+
+  // Bail if we already know the output shape.
+  if (output_array.has_shape()) {
+    return;
+  }
+
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape() || !indices_array.has_shape()) {
+    return;
+  }
+
+  const auto& input_shape = input_array.shape();
+  const auto& indices_shape = indices_array.shape();
+  QCHECK_GE(input_shape.dimensions_count(), 1);
+  QCHECK_GE(indices_shape.dimensions_count(), 1);
+  const int indices_nd =
+      indices_shape.dims(indices_shape.dimensions_count() - 1);
+  QCHECK_LE(indices_nd, input_shape.dimensions_count());
+
+  auto output_dims = output_array.mutable_shape()->mutable_dims();
+  for (int dim = 0; dim < indices_shape.dimensions_count() - 1; ++dim) {
+    output_dims->push_back(indices_shape.dims(dim));
+  }
+  for (int dim = indices_nd; dim < input_shape.dimensions_count(); ++dim) {
+    output_dims->push_back(input_shape.dims(dim));
+  }
+}
+
 void ProcessTopkV2Operator(Model* model, TopKV2Operator* op) {
   const auto& input_values = model->GetArray(op->inputs[0]);
   const auto& input_k = model->GetArray(op->inputs[1]);
@@ -1485,7 +1517,7 @@ void ProcessPadV2Operator(Model* model, PadV2Operator* op) {
   output_array.copy_shape(output_shape);
 }
 
-void ProcessRankOperator(Model* model, RankOperator* op) {
+void ProcessRankOperator(Model* model, TensorFlowRankOperator* op) {
   CHECK_GE(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
   auto& output_array = model->GetArray(op->outputs[0]);
@@ -2033,6 +2065,7 @@ void ProcessUniqueOperator(Model* model, UniqueOperator* op) {
     case OperatorType::kBatchNormalization:
     case OperatorType::kL2Normalization:
     case OperatorType::kDequantize:
+    case OperatorType::kElu:
     case OperatorType::kRelu:
     case OperatorType::kRelu1:
     case OperatorType::kRelu6:
@@ -2057,6 +2090,7 @@ void ProcessUniqueOperator(Model* model, UniqueOperator* op) {
     case OperatorType::kCeil:
     case OperatorType::kExp:
     case OperatorType::kSin:
+    case OperatorType::kCos:
     case OperatorType::kLogicalAnd:
     case OperatorType::kLogicalNot:
     case OperatorType::kLogicalOr:
@@ -2067,6 +2101,9 @@ void ProcessUniqueOperator(Model* model, UniqueOperator* op) {
     case OperatorType::kGather:
       ProcessGatherOperator(model, static_cast<GatherOperator*>(op));
       break;
+    case OperatorType::kGatherNd:
+      ProcessGatherNdOperator(model, static_cast<GatherNdOperator*>(op));
+      break;
     case OperatorType::kTopK_V2:
       ProcessTopkV2Operator(model, static_cast<TopKV2Operator*>(op));
       break;
@@ -2183,7 +2220,7 @@ void ProcessUniqueOperator(Model* model, UniqueOperator* op) {
       ProcessRangeOperator(model, static_cast<RangeOperator*>(op));
       break;
     case OperatorType::kRank:
-      ProcessRankOperator(model, static_cast<RankOperator*>(op));
+      ProcessRankOperator(model, static_cast<TensorFlowRankOperator*>(op));
       break;
     case OperatorType::kShape:
       ProcessShapeOperator(model, static_cast<TensorFlowShapeOperator*>(op));
@@ -2305,6 +2342,11 @@ void ProcessUniqueOperator(Model* model, UniqueOperator* op) {
     case OperatorType::kUnique:
       ProcessUniqueOperator(model, static_cast<UniqueOperator*>(op));
       break;
+    case OperatorType::kWhere:
+      // The size of the output can only be known after evaluating the cond
+      // tensor. Ignore shape propagation here and defer that to the
+      // interpreter.
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/lite/toco/graph_transformations/quantize.cc b/tensorflow/lite/toco/graph_transformations/quantize.cc
index ee65f92e00cd9f9347e62db314ca3a3f5e8bb396..c7836f61ac30c2ba3784511193b0ed0217f64fcb 100644
--- a/tensorflow/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/lite/toco/graph_transformations/quantize.cc
@@ -489,20 +489,20 @@ void FixMinMaxPostQuantization(GraphTransformation* transformation,
     }
   }
   if (!SupportsQuantization(op)) {
-    LOG(FATAL) << "Unimplemented: this graph contains an operator of type "
-               << HelpfulOperatorTypeName(op)
-               << " for which the quantized form is not yet implemented. "
-                  "Sorry, and patches welcome (that's a relatively fun patch "
-                  "to write, mostly providing the actual quantized arithmetic "
-                  "code for this op).";
+    return tensorflow::errors::InvalidArgument(
+        "Unimplemented: this graph contains an operator of type ",
+        HelpfulOperatorTypeName(op),
+        " for which the quantized form is not yet implemented. Sorry, and "
+        "patches welcome (that's a relatively fun patch to write, mostly "
+        "providing the actual quantized arithmetic code for this op).");
   }
 
   for (const auto& input : op.inputs) {
     const auto& array = model->GetArray(input);
     if (array.data_type == ArrayDataType::kFloat) {
       if (!array.minmax && !array.buffer) {
-        LOG(ERROR) << "Can't quantize input array " << input
-                   << " because it lacks min/max info";
+        LOG(WARNING) << "Can't quantize input array " << input
+                     << " because it lacks min/max info";
         return ::tensorflow::Status::OK();
       }
       const auto* other_op = GetOpWithOutput(*model, input);
diff --git a/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc b/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc
index fdd411c84c2678bc483b00849d5142665e706fac..77803d580e98aea94f0a7191666212cb15f58a7a 100644
--- a/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc
+++ b/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc
@@ -218,6 +218,7 @@ std::vector<int> ComputeNewPerm(std::vector<int> input_dims,
   CHECK_EQ(input_dims.size(), new_perm.size());
 
   auto& transpose_array = model->GetOrCreateArray(transpose_op->inputs[1]);
+  transpose_array.data_type = ArrayDataType::kInt32;
   transpose_array.GetMutableBuffer<ArrayDataType::kInt32>().data = new_perm;
   *(transpose_array.mutable_shape()->mutable_dims()) = {
       static_cast<int>(new_perm.size())};
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
index 0c9effee1fd364fa83f61339251e48070f503d1e..1c8f4619bc40d976717bba16ee3857531cac76e0 100644
--- a/tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
@@ -71,34 +71,29 @@ void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
   // Each "digit" is incremented individually (by the stride). When it overflows
   // (becomes greater than the stop), that digit is reset and a carry flag is
   // used to increment the next digit.
-  int dst_offset = 0;
-  do {
+  for (int dst_offset = 0; dst_offset < output_data.size(); ++dst_offset) {
     // Copy element.
     output_data[dst_offset] = input_buffer.data[Offset(input_shape, src_coord)];
 
-    // Compute next source input coordinates.
-    bool carry = true;
-    for (int axis = 0; axis < num_input_axes; axis++) {
+    // Note we consider elements in the highest dimension are stored
+    // contiguously. So, we increment the stride starting from the highest
+    // dimension.
+    for (int axis = num_input_axes - 1; axis >= 0; --axis) {
       int stride = op.strides[axis];
-      // Increment this axis if we carried from the previous one
-      if (carry) {
-        src_coord[axis] += stride;
-      }
+      src_coord[axis] += stride;
 
-      // Check if we've overflowed.
+      // Check if we've overflowed. If not, we just break from the loop to
+      // continue w/ the element copy. Otherwise, reset the starting coordinate
+      // for this axis and move to the next lower axis.
       int stop = stop_for_axis[axis];
-      if (tflite::strided_slice::LoopCondition(src_coord[axis], stop, stride)) {
-        // Reset axis and set carry
-        src_coord[axis] = tflite::strided_slice::StartForAxis(
-            strided_slice_params, ToRuntimeShape(input_shape), axis);
-        carry = true;
-      } else {
-        carry = false;
+      if (!tflite::strided_slice::LoopCondition(src_coord[axis], stop,
+                                                stride)) {
+        break;
       }
+      src_coord[axis] = tflite::strided_slice::StartForAxis(
+          strided_slice_params, ToRuntimeShape(input_shape), axis);
     }
-    // increment destination buffer offset
-    dst_offset++;
-  } while (dst_offset < output_data.size());
+  }
 }
 
 }  // anonymous namespace
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc b/tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc
index c0becaf7d39cdbc01217bbb9b5a6b50017cc2eaa..2c860c30974766a093ef1bf2d9a93fb29bb65949 100644
--- a/tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc
@@ -61,11 +61,11 @@ namespace toco {
   minmax.max = max_array.GetBuffer<ArrayDataType::kFloat>().data[0];
   // We always want [min, max] to contain 0.
   if (minmax.min > 0 || minmax.max < 0) {
-    LOG(ERROR) << "For " << LogName(*fakequant_op) << " the MinMax range "
-               << "[" << minmax.min << ", " << minmax.max
-               << "] does not contain 0. "
-               << "Proceeding by tweaking it to contain 0, which will result "
-                  "in poor accuracy.";
+    LOG(WARNING) << "For " << LogName(*fakequant_op) << " the MinMax range "
+                 << "[" << minmax.min << ", " << minmax.max
+                 << "] does not contain 0. "
+                 << "Proceeding by tweaking it to contain 0, which will result "
+                    "in poor accuracy.";
   }
   minmax.min = std::min(minmax.min, 0.);
   minmax.max = std::max(minmax.max, 0.);
diff --git a/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc b/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
index 41a735394d714b65a4c9fc309927e34a7f610431..7492f3e116c60ca2c574bf8d2fd4b08f5914f3d0 100644
--- a/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
+++ b/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
@@ -13,17 +13,192 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <memory>
+#include <numeric>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/lite/toco/model.h"
 #include "tensorflow/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
+namespace {
+
+void UnrollBatchMatMul3D(
+    const string& input_lhs, const string& input_rhs,
+    const BatchMatMulOperator* batch_op, const std::vector<int> batch,
+    Model* model, std::vector<std::unique_ptr<Operator>>::iterator* tail_it,
+    std::vector<string>* pack_inputs) {
+  const std::string batch_name =
+      absl::StrCat(batch_op->outputs[0], "_b", absl::StrJoin(batch, "-"));
+  const auto& input_array_a = model->GetArray(input_lhs);
+  const auto& input_array_b = model->GetArray(input_rhs);
+  const int dims_count = input_array_a.shape().dimensions_count();
+
+  // tf.slice(a, ...).
+  std::vector<int> begin_indices_a = batch;
+  begin_indices_a.resize(dims_count);
+  std::vector<int> slice_size_a = input_array_a.shape().dims();
+  for (int i = 0; i < batch.size(); ++i) {
+    slice_size_a[i] = 1;
+  }
+  auto* slice_a_op = new SliceOperator;
+  slice_a_op->inputs = {
+      input_lhs,
+      CreateInt32Array(model, batch_name + "/slice_a/slice/begin",
+                       begin_indices_a),
+      CreateInt32Array(model, batch_name + "/slice_a/slice/size", slice_size_a),
+  };
+  slice_a_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_a")};
+  auto& slice_a_op_output = model->GetOrCreateArray(slice_a_op->outputs[0]);
+  slice_a_op_output.data_type = input_array_a.data_type;
+  *tail_it = model->operators.emplace(*tail_it, slice_a_op) + 1;
+
+  // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
+  auto* slice_a_reshape_op = new TensorFlowReshapeOperator;
+  slice_a_reshape_op->inputs = {
+      slice_a_op->outputs[0],
+      CreateInt32Array(model, batch_name + "/slice_a/reshape/shape",
+                       {-1, input_array_a.shape().dims(dims_count - 1)})};
+  slice_a_reshape_op->outputs = {
+      AvailableArrayName(*model, batch_name + "/slice_a/reshape")};
+  auto& slice_a_reshape_op_output =
+      model->GetOrCreateArray(slice_a_reshape_op->outputs[0]);
+  slice_a_reshape_op_output.data_type = input_array_a.data_type;
+  *tail_it = model->operators.emplace(*tail_it, slice_a_reshape_op) + 1;
+
+  // tf.slice(b, ...).
+  std::vector<int> begin_indices_b = batch;
+  begin_indices_b.resize(dims_count);
+  std::vector<int> slice_size_b = input_array_b.shape().dims();
+  for (int i = 0; i < batch.size(); ++i) {
+    slice_size_b[i] = 1;
+  }
+  auto* slice_b_op = new SliceOperator;
+  slice_b_op->inputs = {
+      input_rhs,
+      CreateInt32Array(model, batch_name + "/slice_b/slice/begin",
+                       begin_indices_b),
+      CreateInt32Array(model, batch_name + "/slice_b/slice/size", slice_size_b),
+  };
+  slice_b_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_b")};
+  auto& slice_b_op_output = model->GetOrCreateArray(slice_b_op->outputs[0]);
+  slice_b_op_output.data_type = input_array_b.data_type;
+  *tail_it = model->operators.emplace(*tail_it, slice_b_op) + 1;
+
+  // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
+  auto* slice_b_reshape_op = new TensorFlowReshapeOperator;
+  slice_b_reshape_op->inputs = {
+      slice_b_op->outputs[0],
+      CreateInt32Array(model, batch_name + "/slice_b/reshape/shape",
+                       {-1, input_array_b.shape().dims(dims_count - 1)})};
+  slice_b_reshape_op->outputs = {
+      AvailableArrayName(*model, batch_name + "/slice_b/reshape")};
+  auto& slice_b_reshape_op_output =
+      model->GetOrCreateArray(slice_b_reshape_op->outputs[0]);
+  slice_b_reshape_op_output.data_type = input_array_b.data_type;
+  *tail_it = model->operators.emplace(*tail_it, slice_b_reshape_op) + 1;
+
+  // tf.matmul(slice_a, slice_b).
+  auto* matmul_op = new TensorFlowMatMulOperator;
+  matmul_op->inputs = {slice_a_reshape_op->outputs[0],
+                       slice_b_reshape_op->outputs[0]};
+  matmul_op->outputs = {AvailableArrayName(*model, batch_name)};
+  auto& matmul_op_output = model->GetOrCreateArray(matmul_op->outputs[0]);
+  matmul_op_output.data_type = input_array_a.data_type;
+  *tail_it = model->operators.emplace(*tail_it, matmul_op) + 1;
+
+  // Add to stack.
+  pack_inputs->push_back(matmul_op->outputs[0]);
+}
+
+std::vector<string> UnrollBatchMatMulRecursion(
+    const string& input_lhs, const string& input_rhs,
+    const BatchMatMulOperator* batch_op, Model* model,
+    std::vector<std::unique_ptr<Operator>>::iterator* tail_it,
+    const std::vector<int>& batch_prefix) {
+  const auto& input_array_a = model->GetArray(input_lhs);
+  const auto& dims_vec = input_array_a.shape().dims();
+  const int current_dim_size = dims_vec[batch_prefix.size()];
+  std::vector<string> batch_pack_inputs;
+
+  if (batch_prefix.size() + 3 == dims_vec.size()) {
+    // Base case
+    for (int batch = 0; batch < current_dim_size; ++batch) {
+      std::vector<int> new_batch_prefix = batch_prefix;
+      new_batch_prefix.emplace_back(batch);
+      UnrollBatchMatMul3D(input_lhs, input_rhs, batch_op, new_batch_prefix,
+                          model, tail_it, &batch_pack_inputs);
+    }
+  } else {
+    // Recursion
+    for (int batch = 0; batch < current_dim_size; ++batch) {
+      std::vector<int> new_batch_prefix = batch_prefix;
+      new_batch_prefix.emplace_back(batch);
+      std::vector<string> pack_inputs = UnrollBatchMatMulRecursion(
+          input_lhs, input_rhs, batch_op, model, tail_it, new_batch_prefix);
+
+      // The pack that will join all the individual matmul results together.
+      auto* pack_op = new PackOperator;
+      std::string batch_name = absl::StrCat(
+          batch_op->outputs[0], "_b", absl::StrJoin(new_batch_prefix, "-"));
+      pack_op->inputs = pack_inputs;
+      pack_op->outputs = {AvailableArrayName(*model, batch_name + "/pack")};
+      auto& pack_op_output = model->GetOrCreateArray(pack_op->outputs[0]);
+      pack_op_output.data_type = input_array_a.data_type;
+      pack_op->axis = 0;
+      pack_op->values_count = pack_inputs.size();
+      *tail_it = model->operators.emplace(*tail_it, pack_op) + 1;
+
+      batch_pack_inputs.push_back(pack_op->outputs[0]);
+    }
+  }
+  return batch_pack_inputs;
+}
+
+std::vector<int32> GetTransposePerm(const Array& input_array) {
+  const int32 dims = input_array.shape().dimensions_count();
+  std::vector<int32> perm_array_val(dims);
+  for (int i = 0; i < dims; ++i) {
+    perm_array_val[i] = i;
+  }
+  perm_array_val[dims - 2] = dims - 1;
+  perm_array_val[dims - 1] = dims - 2;
+  return perm_array_val;
+}
+
+std::vector<int32> GetTransposeShape(const Shape& input_shape,
+                                     const std::vector<int32>& perm_array_val) {
+  const int32 dims = input_shape.dimensions_count();
+  std::vector<int32> output_shape(dims);
+  for (int i = 0; i < dims; ++i) {
+    output_shape[i] = input_shape.dims(perm_array_val[i]);
+  }
+  return output_shape;
+}
+
+TransposeOperator* TransposeInput(const string& input, Model* model) {
+  const auto& input_array = model->GetArray(input);
+  const auto perm_array = GetTransposePerm(input_array);
+  const string perm_array_name = CreateInt32Array(
+      model, AvailableArrayName(*model, input + "/transpose/perm"), perm_array);
+  auto* transpose_op = new TransposeOperator;
+  transpose_op->inputs = {input, perm_array_name};
+  transpose_op->outputs = {AvailableArrayName(*model, input + "/transpose")};
+  auto& transpose_array = model->GetOrCreateArray(transpose_op->outputs[0]);
+  *transpose_array.mutable_shape()->mutable_dims() =
+      GetTransposeShape(input_array.shape(), perm_array);
+  model->GetOrCreateArray(transpose_op->outputs[0]);
+  return transpose_op;
+}
+
+}  // namespace
+
 // Unrolls a BatchMatMul on the batch dimension.
 // We need to slice each batch out of the inputs, matmul them individually, then
 // stack them all back together at the end.
@@ -46,115 +221,67 @@ namespace toco {
   const auto* batch_op =
       static_cast<const BatchMatMulOperator*>(batch_op_it->get());
 
-  // We must have the shape of at least one input to know our batch size.
-  const auto& input_array_a = model->GetArray(batch_op->inputs[0]);
-  const auto& input_array_b = model->GetArray(batch_op->inputs[1]);
-  if (!input_array_a.has_shape() || !input_array_b.has_shape())
+  auto& tail_it = batch_op_it;
+
+  string input_lhs = batch_op->inputs[0];
+  string input_rhs = batch_op->inputs[1];
+  const auto& input_lhs_array = model->GetArray(input_lhs);
+  const auto& input_rhs_array = model->GetArray(input_rhs);
+  if (!input_lhs_array.has_shape() || !input_rhs_array.has_shape())
     return ::tensorflow::Status::OK();
 
-  // We only support the rank 3 case. If you are batching on rank > 3 you'll
-  // have to figure that out.
-  CHECK_EQ(input_array_a.shape().dimensions_count(),
-           input_array_b.shape().dimensions_count())
-      << "Input dimensions must have the same rank";
-  if (input_array_a.shape().dimensions_count() == 2) {
+  // Transpose LHS input if necessary.
+  if (batch_op->adj_x) {
+    TransposeOperator* transpose_op = TransposeInput(input_lhs, model);
+    tail_it = model->operators.emplace(tail_it, transpose_op) + 1;
+    input_lhs = transpose_op->outputs[0];
+  }
+  const auto& input_array_a = model->GetArray(input_lhs);
+
+  // Transpose RHS input if necessary.
+  if (batch_op->adj_y) {
+    TransposeOperator* transpose_op = TransposeInput(input_rhs, model);
+    tail_it = model->operators.emplace(tail_it, transpose_op) + 1;
+    input_rhs = transpose_op->outputs[0];
+  }
+  const auto& input_array_b = model->GetArray(input_rhs);
+
+  const int dims = input_array_a.shape().dimensions_count();
+  for (int i = 0; i < dims - 2; ++i) {
+    CHECK_EQ(input_array_a.shape().dims(i), input_array_b.shape().dims(i))
+        << "input array not consistent at index " << i;
+  }
+  CHECK_EQ(input_array_a.shape().dims(dims - 1),
+           input_array_b.shape().dims(dims - 2))
+      << "Input dimensions must be compatible for multipication. shape a = ["
+      << absl::StrJoin(input_array_a.shape().dims(), ", ") << "], shape b = ["
+      << absl::StrJoin(input_array_b.shape().dims(), ", ") << "]";
+
+  if (dims == 2) {
     // This is really just a MatMul. This likely means that someone hand-crafted
     // a graphdef with a BatchMatMul when they really wanted a MatMul.
     AddMessageF("Replacing non-batch BatchMatMul %s by a MatMul operator",
                 LogName(*batch_op));
     auto* matmul_op = new TensorFlowMatMulOperator;
-    matmul_op->inputs = batch_op->inputs;
+    matmul_op->inputs = {input_lhs, input_rhs};
     matmul_op->outputs = batch_op->outputs;
-    const auto matmul_op_it = model->operators.emplace(batch_op_it, matmul_op);
-    batch_op_it = matmul_op_it + 1;
-    CHECK_EQ(batch_op_it->get(), batch_op);
-    model->operators.erase(batch_op_it);
+    tail_it = model->operators.emplace(tail_it, matmul_op) + 1;
+    CHECK_EQ(tail_it->get(), batch_op);
+    model->operators.erase(tail_it);
     *modified = true;
     return ::tensorflow::Status::OK();
   }
-  CHECK_EQ(input_array_a.shape().dimensions_count(), 3)
-      << "Input arrays must have rank 3";
 
-  // Perform the matmul for each slice of the batch.
-  int batch_count = input_array_a.shape().dims(0);
-  AddMessageF("Unrolling BatchMatMul %s %d times", LogName(*batch_op),
-              batch_count);
-  auto tail_it = batch_op_it;
-  std::vector<string> pack_inputs;
-  for (int batch = 0; batch < batch_count; ++batch) {
-    std::string batch_name =
-        std::string(batch_op->outputs[0]) + "_b" + std::to_string(batch);
-
-    // tf.slice(a, ...).
-    auto* slice_a_op = new SliceOperator;
-    slice_a_op->inputs = {
-        batch_op->inputs[0],
-        CreateInt32Array(model, batch_name + "/slice_a/slice/begin",
-                         {batch, 0, 0}),
-        CreateInt32Array(
-            model, batch_name + "/slice_a/slice/size",
-            {1, input_array_a.shape().dims(1), input_array_a.shape().dims(2)}),
-    };
-    slice_a_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_a")};
-    auto& slice_a_op_output = model->GetOrCreateArray(slice_a_op->outputs[0]);
-    slice_a_op_output.data_type = input_array_a.data_type;
-    tail_it = model->operators.emplace(tail_it, slice_a_op) + 1;
-
-    // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
-    auto* slice_a_reshape_op = new TensorFlowReshapeOperator;
-    slice_a_reshape_op->inputs = {
-        slice_a_op->outputs[0],
-        CreateInt32Array(model, batch_name + "/slice_a/reshape/shape",
-                         {-1, input_array_a.shape().dims(2)})};
-    slice_a_reshape_op->outputs = {
-        AvailableArrayName(*model, batch_name + "/slice_a/reshape")};
-    auto& slice_a_reshape_op_output =
-        model->GetOrCreateArray(slice_a_reshape_op->outputs[0]);
-    slice_a_reshape_op_output.data_type = input_array_a.data_type;
-    tail_it = model->operators.emplace(tail_it, slice_a_reshape_op) + 1;
-
-    // tf.slice(b, ...).
-    auto* slice_b_op = new SliceOperator;
-    slice_b_op->inputs = {
-        batch_op->inputs[1],
-        CreateInt32Array(model, batch_name + "/slice_b/slice/begin",
-                         {batch, 0, 0}),
-        CreateInt32Array(
-            model, batch_name + "/slice_b/slice/size",
-            {1, input_array_b.shape().dims(1), input_array_b.shape().dims(2)}),
-    };
-    slice_b_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_b")};
-    auto& slice_b_op_output = model->GetOrCreateArray(slice_b_op->outputs[0]);
-    slice_b_op_output.data_type = input_array_b.data_type;
-    tail_it = model->operators.emplace(tail_it, slice_b_op) + 1;
-
-    // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
-    auto* slice_b_reshape_op = new TensorFlowReshapeOperator;
-    slice_b_reshape_op->inputs = {
-        slice_b_op->outputs[0],
-        CreateInt32Array(model, batch_name + "/slice_b/reshape/shape",
-                         {-1, input_array_b.shape().dims(2)})};
-    slice_b_reshape_op->outputs = {
-        AvailableArrayName(*model, batch_name + "/slice_b/reshape")};
-    auto& slice_b_reshape_op_output =
-        model->GetOrCreateArray(slice_b_reshape_op->outputs[0]);
-    slice_b_reshape_op_output.data_type = input_array_b.data_type;
-    tail_it = model->operators.emplace(tail_it, slice_b_reshape_op) + 1;
-
-    // tf.matmul(slice_a, slice_b).
-    auto* matmul_op = new TensorFlowMatMulOperator;
-    matmul_op->inputs = {slice_a_reshape_op->outputs[0],
-                         slice_b_reshape_op->outputs[0]};
-    matmul_op->outputs = {AvailableArrayName(*model, batch_name)};
-    auto& matmul_op_output = model->GetOrCreateArray(matmul_op->outputs[0]);
-    matmul_op_output.data_type = input_array_a.data_type;
-    tail_it = model->operators.emplace(tail_it, matmul_op) + 1;
+  CHECK_GE(input_array_a.shape().dimensions_count(), 3)
+      << "Input arrays must have rank >= 3";
 
-    // Add to stack.
-    pack_inputs.push_back(matmul_op->outputs[0]);
-  }
+  const auto& dims_vec = input_array_a.shape().dims();
+  AddMessageF("Unrolling BatchMatMul %s %d times", LogName(*batch_op),
+              std::accumulate(dims_vec.begin(), dims_vec.end() - 2, 1,
+                              std::multiplies<int>()));
 
-  // The pack that will join all the individual matmul results together.
+  std::vector<string> pack_inputs = UnrollBatchMatMulRecursion(
+      input_lhs, input_rhs, batch_op, model, &tail_it, {});
   auto* pack_op = new PackOperator;
   pack_op->inputs = pack_inputs;
   pack_op->outputs = {batch_op->outputs[0]};
diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc
index b1b04949bad7f96866cc5687940f4fef2a8ee1a8..db7c5e6fd56549bb6e005c57db62edb23a56218d 100644
--- a/tensorflow/lite/toco/import_tensorflow.cc
+++ b/tensorflow/lite/toco/import_tensorflow.cc
@@ -1092,11 +1092,14 @@ tensorflow::Status ConvertBatchMatMulOperator(
     Model* model) {
   TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 2));
 
-  // https://www.tensorflow.org/versions/r0.12/api_docs/python/math_ops/matrix_math_functions
-  CHECK(!HasAttr(node, "adj_a") || (GetBoolAttr(node, "adj_a") == false));
-  CHECK(!HasAttr(node, "adj_b") || (GetBoolAttr(node, "adj_b") == false));
-
   auto* batch_matmul = new BatchMatMulOperator;
+  // https://www.tensorflow.org/versions/r0.12/api_docs/python/math_ops/matrix_math_functions
+  if (HasAttr(node, "adj_x")) {
+    batch_matmul->adj_x = GetBoolAttr(node, "adj_x");
+  }
+  if (HasAttr(node, "adj_y")) {
+    batch_matmul->adj_y = GetBoolAttr(node, "adj_y");
+  }
   batch_matmul->inputs = {node.input(0), node.input(1)};
   batch_matmul->outputs = {node.name()};
 
@@ -1346,7 +1349,7 @@ tensorflow::Status ConvertUnsupportedOperator(
   }
 
   // Parse outputs. Name them after the node's name, plus an ordinal suffix.
-  // Note that some outputs are to be multipled by a named attribute.
+  // Note that some outputs are to be multiplied by a named attribute.
   const tensorflow::OpDef* op_def = nullptr;
   if (tensorflow::OpRegistry::Global()->LookUpOpDef(node.op(), &op_def).ok()) {
     GetOutputNamesFromNodeDef(node, *op_def, op);
@@ -1480,7 +1483,7 @@ tensorflow::Status ConvertPlaceholderOperator(
   if (node.attr().count("shape")) {
     const auto& shape = GetShapeAttr(node, "shape");
     auto num_dims = shape.dim_size();
-    // TODO(b/62716978): This logic needs to be revisted.  During dims
+    // TODO(b/62716978): This logic needs to be revisited.  During dims
     // refactoring it is an interim fix.
     if (num_dims > 0 && !HasWildcardDimension(shape)) {
       auto& dst_array_dims = *array.mutable_shape()->mutable_dims();
@@ -1572,6 +1575,21 @@ tensorflow::Status ConvertGatherOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertGatherNdOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "GatherNd");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 2));
+  const auto indices_data_type = GetDataTypeAttr(node, "Tindices");
+  CHECK(indices_data_type == DT_INT32 || indices_data_type == DT_INT64);
+  auto* op = new GatherNdOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+  return tensorflow::Status::OK();
+}
+
 template <typename Op>
 tensorflow::Status ConvertArgMinMaxOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
@@ -2375,7 +2393,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
   return std::unordered_map<std::string, ConverterType>({
       {"Abs", ConvertSimpleOperator<AbsOperator, kAnyNumInputs, 1>},
       {"Add", ConvertSimpleOperator<AddOperator, 2, 1>},
-      {"AddN", ConvertSimpleOperatorFlexOk<AddNOperator, kAnyNumInputs, 1>},
+      {"AddN", ConvertSimpleOperator<AddNOperator, kAnyNumInputs, 1>},
       {"All", ConvertSimpleOperator<TensorFlowAllOperator, kAnyNumInputs, 1>},
       {"Any", ConvertReduceOperator<TensorFlowAnyOperator>},
       {"ArgMax", ConvertArgMaxOperator},
@@ -2396,12 +2414,14 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"Const", ConvertConstOperator},
       {"Conv2D", ConvertConvOperator},
       {"Conv2DBackpropInput", ConvertTransposeConvOperator},
+      {"Cos", ConvertSimpleOperator<CosOperator, 1, 1>},
       {"CTCBeamSearchDecoder", ConvertCTCBeamSearchDecoderOperator},
       {"DepthToSpace", ConvertDepthToSpaceOperator},
       {"DepthwiseConv2dNative", ConvertDepthwiseConvOperator},
       {"Div", ConvertSimpleOperator<DivOperator, 2, 1>},
       {"DynamicPartition", ConvertDynamicPartitionOperator},
       {"DynamicStitch", ConvertDynamicStitchOperator},
+      {"Elu", ConvertSimpleOperator<EluOperator, 1, 1>},
       {"Equal", ConvertSimpleOperator<TensorFlowEqualOperator, 2, 1>},
       {"Exp", ConvertSimpleOperator<ExpOperator, 1, 1>},
       {"ExpandDims", ConvertSimpleOperator<ExpandDimsOperator, 2, 1>},
@@ -2414,6 +2434,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"FusedBatchNorm", ConvertFusedBatchNormOperator},
       {"Gather", ConvertGatherOperator},
       {"GatherV2", ConvertGatherOperator},
+      {"GatherNd", ConvertGatherNdOperator},
       {"Greater", ConvertSimpleOperator<TensorFlowGreaterOperator, 2, 1>},
       {"GreaterEqual",
        ConvertSimpleOperator<TensorFlowGreaterEqualOperator, 2, 1>},
@@ -2452,7 +2473,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"Prod", ConvertReduceOperator<TensorFlowProdOperator>},
       {"RandomUniform", ConvertRandomUniform},
       {"Range", ConvertRangeOperator},
-      {"Rank", ConvertSimpleOperator<RankOperator, 1, 1>},
+      {"Rank", ConvertSimpleOperator<TensorFlowRankOperator, 1, 1>},
       {"RealDiv", ConvertSimpleOperator<DivOperator, 2, 1>},
       {"Relu", ConvertSimpleOperator<ReluOperator, 1, 1>},
       {"Relu6", ConvertSimpleOperator<Relu6Operator, 1, 1>},
@@ -2494,6 +2515,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"UnidirectionalSequenceRnn", ConvertUnidirectionalSequenceRnn},
       {"MirrorPad", ConvertMirrorPadOperator},
       {"Unique", ConvertSimpleOperator<UniqueOperator, 1, 2>},
+      {"Where", ConvertSimpleOperator<WhereOperator, 1, 1>},
   });
 }
 
diff --git a/tensorflow/lite/toco/model.h b/tensorflow/lite/toco/model.h
index b99cb74a161dc8468e37ba8ceaac464d63f5abc8..d9909c11fa863d5ba7e3d7fbd755023bf56a1264 100644
--- a/tensorflow/lite/toco/model.h
+++ b/tensorflow/lite/toco/model.h
@@ -24,11 +24,11 @@ limitations under the License.
 #include <vector>
 
 #include "absl/types/optional.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/toco/model_flags.pb.h"
 #include "tensorflow/lite/toco/runtime/types.h"
 #include "tensorflow/lite/toco/toco_port.h"
 #include "tensorflow/lite/toco/toco_types.h"
-#include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
@@ -45,6 +45,7 @@ enum class OperatorType : uint8 {
   kCeil,
   kConv,
   kConcatenation,
+  kCos,
   kDepthwiseConv,
   kDepthToSpace,
   kSpaceToDepth,
@@ -163,7 +164,10 @@ enum class OperatorType : uint8 {
   kUnidirectionalSequenceRnn,
   kBidirectionalSequenceLstm,
   kReverseV2,
-  kBidirectionalSequenceRnn
+  kBidirectionalSequenceRnn,
+  kGatherNd,
+  kWhere,
+  kElu
 };
 
 // Helper to deal with TensorFlow arrays using a different ordering of
@@ -687,6 +691,17 @@ struct AbsOperator : Operator {
   AbsOperator() : Operator(OperatorType::kAbs) {}
 };
 
+// Elu
+//   f(x) -> exp(x) - 1 for x < 0, x for x >= 0.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Elu
+struct EluOperator : Operator {
+  EluOperator() : Operator(OperatorType::kElu) {}
+};
+
 // Element-wise Relu operator:
 //   x -> max(0, x)
 //
@@ -964,6 +979,8 @@ struct TensorFlowIdentityOperator : Operator {
 // TensorFlow equivalent: MatMul
 struct BatchMatMulOperator : Operator {
   BatchMatMulOperator() : Operator(OperatorType::kBatchMatMul) {}
+  bool adj_x = false;
+  bool adj_y = false;
 };
 
 // General matrix multiplication operator. We don't want to support general
@@ -1166,6 +1183,17 @@ struct ExpOperator : Operator {
   ExpOperator() : Operator(OperatorType::kExp) {}
 };
 
+// Given a tensor input, this operation calculates element-wise exponential
+// (y = cos(x)).
+//
+// Inputs:
+//   inputs[0]: required: input tensor
+//
+// TensorFlow equivalent: Cos
+struct CosOperator : Operator {
+  CosOperator() : Operator(OperatorType::kCos) {}
+};
+
 // Given a tensor input, this operation inserts a dimension of 1 at the
 // dimension index axis of input's shape. The dimension index axis starts at
 // zero; if you specify a negative number for axis it is counted backward from
@@ -1244,13 +1272,12 @@ struct RangeOperator : Operator {
 // Inputs:
 //   inputs[0]: required: the input array
 //
-// This operation outputs a 0-D integer tensor representing the rank of
-// the input.
+// This operation outputs a 0-D int32 Tensor representing the rank of input.
 //
-// TensorFlow equivalent: Rank.  We currently assume that the output is int32
-// and not int64.  The output type could be stored herein.
-struct RankOperator : Operator {
-  RankOperator() : Operator(OperatorType::kRank) {}
+// TensorFlow equivalent: Rank.
+struct TensorFlowRankOperator : Operator {
+  TensorFlowRankOperator() : Operator(OperatorType::kRank) {}
+  ArrayDataType output_data_type = ArrayDataType::kInt32;
 };
 
 // Element-wise negation (-x) operator.
@@ -1707,6 +1734,17 @@ struct GatherOperator : Operator {
   int input_rank = 0;
 };
 
+// GatherNd operator. It gathers slices from params according to indices.
+//
+// Inputs:
+//   inputs[0]: required: the params array
+//   inputs[1]: required: the indices to gather
+//
+// TensorFlow equivalent: GatherNd
+struct GatherNdOperator : Operator {
+  GatherNdOperator() : Operator(OperatorType::kGatherNd) {}
+};
+
 // ArgMax operator. It returns the index of the maximum value along axis.
 //
 // Inputs:
@@ -2011,6 +2049,18 @@ struct UnidirectionalSequenceRnnOperator : Operator {
   FusedActivationFunctionType fused_activation_function;
 };
 
+// Where Operator:
+// Return the coordinates of the true values in condition tensor in row-major
+// order.
+//
+// Inputs:
+//  inputs[0]: required: boolean condition tensor
+//
+//  TensorFlow equivalent: Where
+struct WhereOperator : Operator {
+  WhereOperator() : Operator(OperatorType::kWhere) {}
+};
+
 // Alloc's are used for transient arrays only. An Alloc specifies which interval
 // of the "transient_data" workspace buffer passed to inference functions, is to
 // be used for the transient array at hand. The 'start' and 'end' values are
diff --git a/tensorflow/lite/toco/python/BUILD b/tensorflow/lite/toco/python/BUILD
index 6da48333a292df6a9fd21da620be9bb00afe0b7d..2f5654c56e0acca57a2d644a7c50e87c185f721b 100644
--- a/tensorflow/lite/toco/python/BUILD
+++ b/tensorflow/lite/toco/python/BUILD
@@ -25,6 +25,7 @@ cc_library(
     deps = [
         "//third_party/python_runtime:headers",
         "//tensorflow/core:lib",
+        "//tensorflow/lite/python/interpreter_wrapper:python_utils",
         "//tensorflow/lite/toco:model_flags_proto_cc",
         "//tensorflow/lite/toco:toco_flags_proto_cc",
         "//tensorflow/lite/toco:toco_graphviz_dump_options",
diff --git a/tensorflow/lite/toco/python/toco_python_api.cc b/tensorflow/lite/toco/python/toco_python_api.cc
index ce8e3c9df88ba511fcca9d9a256896624194463b..6fad092f35aa386757885f9320f47e9f372e9f47 100644
--- a/tensorflow/lite/toco/python/toco_python_api.cc
+++ b/tensorflow/lite/toco/python/toco_python_api.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/platform/logging.h"
 
+#include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
 #include "tensorflow/lite/toco/model_flags.pb.h"
 #include "tensorflow/lite/toco/python/toco_python_api.h"
 #include "tensorflow/lite/toco/toco_flags.pb.h"
@@ -26,14 +27,6 @@ limitations under the License.
 
 namespace toco {
 
-#if PY_MAJOR_VERSION >= 3
-#define TOCO_PY_TO_CPPSTRING PyBytes_AsStringAndSize
-#define TOCO_FROM_CPPSTRING_TO_PY PyBytes_FromStringAndSize
-#else
-#define TOCO_PY_TO_CPPSTRING PyString_AsStringAndSize
-#define TOCO_FROM_CPPSTRING_TO_PY PyString_FromStringAndSize
-#endif
-
 // NOTE(aselle): We are using raw PyObject's here because we want to make
 // sure we input and output bytes rather than unicode strings for Python3.
 PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
@@ -44,7 +37,7 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
   auto ConvertArg = [&](PyObject* obj, bool* error) {
     char* buf;
     Py_ssize_t len;
-    if (TOCO_PY_TO_CPPSTRING(obj, &buf, &len) == -1) {
+    if (::tflite::python_utils::ConvertFromPyString(obj, &buf, &len) == -1) {
       *error = true;
       return std::string();
     } else {
@@ -96,15 +89,15 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
     PyObject* dict = PyDict_New();
     PyDict_SetItemString(
         dict, "flatbuffer",
-        TOCO_FROM_CPPSTRING_TO_PY(output_file_contents_txt.data(),
-                                  output_file_contents_txt.size()));
+        ::tflite::python_utils::ConvertToPyString(
+            output_file_contents_txt.data(), output_file_contents_txt.size()));
     PyDict_SetItemString(dict, "arithmetic_ops",
                          PyLong_FromLong(model->ArithmeticOpsCount()));
     return dict;
   }
   // Convert arguments back to byte (py3) or str (py2)
-  return TOCO_FROM_CPPSTRING_TO_PY(output_file_contents_txt.data(),
-                                   output_file_contents_txt.size());
+  return ::tflite::python_utils::ConvertToPyString(
+      output_file_contents_txt.data(), output_file_contents_txt.size());
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/tflite/export.cc b/tensorflow/lite/toco/tflite/export.cc
index 53f5ecef872774e83fbcb1abe394a2dbaa189f4c..1ce37d29018ffdb7ee185e002a1543b053e1eec0 100644
--- a/tensorflow/lite/toco/tflite/export.cc
+++ b/tensorflow/lite/toco/tflite/export.cc
@@ -63,12 +63,12 @@ bool IsControlFlowOp(const string& tensorflow_op) {
   return false;
 }
 
-// Check if a TensorFlow Op is unsupportred by the Flex runtime.
+// Check if a TensorFlow Op is unsupported by the Flex runtime.
 bool IsUnsupportedFlexOp(const string& tensorflow_op) {
   if (IsControlFlowOp(tensorflow_op)) {
     return true;
   }
-  // `HashTableV2` isn't supported for now since it requires an additinonal
+  // `HashTableV2` isn't supported for now since it requires an additional
   // initialization step.
   // TODO(b/117651199): Support `HashTableV2` with Flex runtime.
   if (tensorflow_op == "HashTableV2") {
@@ -157,7 +157,7 @@ OperatorKey::OperatorKey(
         string(::tflite::kFlexCustomCodePrefix) + flex_tensorflow_op_;
   } else {
     // If Flex is disabled or the original TensorFlow NodeDef isn't available,
-    // we produce a custom op. This gives developers a chance to implemenr
+    // we produce a custom op. This gives developers a chance to implement
     // custom ops.
     custom_code_ = name;
   }
@@ -222,7 +222,7 @@ Offset<Vector<Offset<Tensor>>> ExportTensors(
 
     std::vector<int> shape;
     if (array.has_shape()) {
-      for (int d : array.shape().dims()) {
+      for (const auto& d : array.shape().dims()) {
         shape.push_back(d);
       }
     }
diff --git a/tensorflow/lite/toco/tflite/export_test.cc b/tensorflow/lite/toco/tflite/export_test.cc
index 58cfb4987ff67d87d330688cd7bf75ca2eb98ebd..fb640f776abdef3e5a59d075d3bc15e8d0f9565f 100644
--- a/tensorflow/lite/toco/tflite/export_test.cc
+++ b/tensorflow/lite/toco/tflite/export_test.cc
@@ -51,9 +51,27 @@ class ExportTest : public ::testing::Test {
         output_array.data_type = ArrayDataType::kFloat;
         input_model_.operators.emplace_back(op);
       } else if (name == "Add") {
-        input_model_.operators.emplace_back(new AddOperator);
+        auto* op = new AddOperator;
+        op->inputs = {"input1", "input2"};
+        op->outputs = {"output"};
+        Array& input1_array = input_model_.GetOrCreateArray(op->inputs[0]);
+        Array& input2_array = input_model_.GetOrCreateArray(op->inputs[1]);
+        Array& output_array = input_model_.GetOrCreateArray(op->outputs[0]);
+        input1_array.data_type = ArrayDataType::kFloat;
+        input2_array.data_type = ArrayDataType::kFloat;
+        output_array.data_type = ArrayDataType::kFloat;
+        input_model_.operators.emplace_back(op);
       } else if (name == "Sub") {
-        input_model_.operators.emplace_back(new SubOperator);
+        auto* op = new SubOperator;
+        op->inputs = {"input1", "input2"};
+        op->outputs = {"output"};
+        Array& input1_array = input_model_.GetOrCreateArray(op->inputs[0]);
+        Array& input2_array = input_model_.GetOrCreateArray(op->inputs[1]);
+        Array& output_array = input_model_.GetOrCreateArray(op->outputs[0]);
+        input1_array.data_type = ArrayDataType::kFloat;
+        input2_array.data_type = ArrayDataType::kFloat;
+        output_array.data_type = ArrayDataType::kFloat;
+        input_model_.operators.emplace_back(op);
       } else if (name == "Assert") {
         auto* op = new TensorFlowAssertOperator;
 
@@ -114,7 +132,18 @@ class ExportTest : public ::testing::Test {
       output_array.data_type = ArrayDataType::kFloat;
       input_model_.operators.emplace_back(op);
     }
-    input_model_.operators.emplace_back(new AddOperator);
+    {
+      auto* op = new AddOperator;
+      op->inputs = {"input1", "input2"};
+      op->outputs = {"output"};
+      Array& input1_array = input_model_.GetOrCreateArray(op->inputs[0]);
+      Array& input2_array = input_model_.GetOrCreateArray(op->inputs[1]);
+      Array& output_array = input_model_.GetOrCreateArray(op->outputs[0]);
+      input1_array.data_type = ArrayDataType::kFloat;
+      input2_array.data_type = ArrayDataType::kFloat;
+      output_array.data_type = ArrayDataType::kFloat;
+      input_model_.operators.emplace_back(op);
+    }
   }
 
   std::vector<string> ExportAndSummarizeOperators(const ExportParams& params) {
@@ -445,7 +474,7 @@ TEST_F(VersionedOpExportTest, Export) {
   auto* model = ::tflite::GetModel(result.data());
   auto operator_codes = model->operator_codes();
 
-  // Verify that 2 operator codes are populdated. Both are CONV_2D but with
+  // Verify that 2 operator codes are populated. Both are CONV_2D but with
   // different versions.
   EXPECT_EQ(2, operator_codes->size());
   EXPECT_EQ(::tflite::BuiltinOperator_CONV_2D,
diff --git a/tensorflow/lite/toco/tflite/import_test.cc b/tensorflow/lite/toco/tflite/import_test.cc
index 93ab5141abe81c4ed4c1ff0ac7ca5e89577c71fb..b00c4124d83ae558b4aa6f5ecc2ba9eb06e5dac0 100644
--- a/tensorflow/lite/toco/tflite/import_test.cc
+++ b/tensorflow/lite/toco/tflite/import_test.cc
@@ -60,7 +60,7 @@ class ImportTest : public ::testing::Test {
                                builder_.CreateString("tensor_one"), q);
     auto t2 =
         ::tflite::CreateTensor(builder_, builder_.CreateVector<int>({2, 1}),
-                               ::tflite::TensorType_FLOAT32, 2,
+                               ::tflite::TensorType_FLOAT32, 0,
                                builder_.CreateString("tensor_two"), q);
     return builder_.CreateVector(
         std::vector<Offset<::tflite::Tensor>>({t1, t2}));
diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
index 61c0b605dd51fe78267e2711dba685756a12c751..6c83ef628800acc93d238dcd5edd0358f189f3f7 100644
--- a/tensorflow/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/util/ptr_util.h"
+
 // TODO(ycling): Consider refactoring to extract the LSTM definition out of
 // graph_transformation module.
 #include "tensorflow/lite/schema/schema_generated.h"
@@ -198,6 +199,31 @@ class Add : public BuiltinOperator<AddOperator, ::tflite::AddOptions,
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
 
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class AddN : public BuiltinOperator<AddNOperator, ::tflite::AddNOptions,
+                                    ::tflite::BuiltinOptions_AddNOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateAddNOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
   int GetVersion(const OperatorSignature& op_signature) const override {
     return 1;
   }
@@ -220,6 +246,12 @@ class SpaceToBatchND
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -244,6 +276,12 @@ class Sub : public BuiltinOperator<SubOperator, ::tflite::SubOptions,
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -289,6 +327,12 @@ class BatchToSpaceND
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -427,18 +471,29 @@ class FullyConnected
     }
   }
 
+  // +-----------------+--------------------+--------------------------+
+  // |                 |    Weight::Default | Weight::Shuffled4x16Int8 |
+  // +-----------------+--------------------+--------------------------+
+  // | Float           |                  1 |                        2 |
+  // | Quantized Uint8 |                  1 |                        2 |
+  // | Hybrid          |                  3 |                        3 |
+  // | Quantized Int8  |                  4 |                        4 |
+  // +-----------------+--------------------+--------------------------+
   int GetVersion(const OperatorSignature& op_signature) const override {
     const auto& fc_op =
         static_cast<const FullyConnectedOperator&>(*op_signature.op);
-    if (fc_op.weights_format == FullyConnectedWeightsFormat::kDefault) {
-      return 1;
-    }
     const string& input_name = op_signature.op->inputs[0];
     const string& weights_name = op_signature.op->inputs[1];
     const string& output_name = op_signature.op->outputs[0];
     const Array& input_array = op_signature.model->GetArray(input_name);
     const Array& weights_array = op_signature.model->GetArray(weights_name);
     const Array& output_array = op_signature.model->GetArray(output_name);
+    // Int8 fully fixed point kernel is at version 4.
+    if (input_array.data_type == ArrayDataType::kInt8 &&
+        weights_array.data_type == ArrayDataType::kInt8 &&
+        output_array.data_type == ArrayDataType::kInt8) {
+      return 4;
+    }
     // If the op is a signed int8 hybrid operation, we need to return
     // version 3.
     if (input_array.data_type == ArrayDataType::kFloat &&
@@ -446,7 +501,15 @@ class FullyConnected
         output_array.data_type == ArrayDataType::kFloat) {
       return 3;
     }
-    return 2;
+    // For float and uint8 fixed point kernels, if the weight is
+    // Shuffled4x16Int8, is is version 2.
+    if (fc_op.weights_format ==
+        FullyConnectedWeightsFormat::kShuffled4x16Int8) {
+      return 2;
+    }
+
+    // Otherwise (weight is default), the version is 1.
+    return 1;
   }
 };
 
@@ -466,6 +529,32 @@ class Gather : public BuiltinOperator<GatherOperator, ::tflite::GatherOptions,
     op->axis = {options.axis()};
   }
 
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class GatherNd
+    : public BuiltinOperator<GatherNdOperator, ::tflite::GatherNdOptions,
+                             ::tflite::BuiltinOptions_GatherNdOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateGatherNdOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
   int GetVersion(const OperatorSignature& op_signature) const override {
     return 1;
   }
@@ -618,6 +707,39 @@ class MaxPool : public BuiltinOperator<MaxPoolOperator, ::tflite::Pool2DOptions,
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Maximum : public SimpleOperator<TensorFlowMaximumOperator> {
+ public:
+  explicit Maximum() : SimpleOperator("MAXIMUM", OperatorType::kMaximum) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Minimum : public SimpleOperator<TensorFlowMinimumOperator> {
+ public:
+  explicit Minimum() : SimpleOperator("MINIMUM", OperatorType::kMinimum) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -642,6 +764,12 @@ class Mul : public BuiltinOperator<MulOperator, ::tflite::MulOptions,
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -661,6 +789,12 @@ class Pad : public BuiltinOperator<PadOperator, ::tflite::PadOptions,
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -698,6 +832,12 @@ class PadV2 : public BuiltinOperator<PadV2Operator, ::tflite::PadV2Options,
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -771,6 +911,12 @@ class SpaceToDepth
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -790,6 +936,12 @@ class Transpose
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1079,6 +1231,12 @@ class ReduceMax
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1100,6 +1258,12 @@ class ReduceMin
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1146,6 +1310,20 @@ class ReduceAny
   }
 };
 
+class Relu6 : public SimpleOperator<Relu6Operator> {
+ public:
+  explicit Relu6() : SimpleOperator("RELU6", OperatorType::kRelu6) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
 class ResizeBilinear
     : public BuiltinOperator<ResizeBilinearOperator,
                              ::tflite::ResizeBilinearOptions,
@@ -1164,6 +1342,12 @@ class ResizeBilinear
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op takes int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1187,6 +1371,12 @@ class ResizeNearestNeighbor
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1234,6 +1424,14 @@ class Split
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2, for int32 it's version 3.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    } else if (input_array.data_type == ArrayDataType::kInt32) {
+      return 3;
+    }
     return 1;
   }
 };
@@ -1284,6 +1482,12 @@ class StridedSlice
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1302,6 +1506,11 @@ class TopK_V2 : public BuiltinOperator<TopKV2Operator, ::tflite::TopKV2Options,
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1323,6 +1532,12 @@ class ArgMax : public BuiltinOperator<ArgMaxOperator, ::tflite::ArgMaxOptions,
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+
     return 1;
   }
 };
@@ -1344,6 +1559,12 @@ class ArgMin : public BuiltinOperator<ArgMinOperator, ::tflite::ArgMinOptions,
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+
     return 1;
   }
 };
@@ -1436,6 +1657,12 @@ class Pack : public BuiltinOperator<PackOperator, ::tflite::PackOptions,
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1462,6 +1689,34 @@ class Shape
   }
 };
 
+class Slice : public SimpleOperator<SliceOperator> {
+ public:
+  explicit Slice() : SimpleOperator("SLICE", OperatorType::kSlice) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Tanh : public SimpleOperator<TanhOperator> {
+ public:
+  explicit Tanh() : SimpleOperator("TANH", OperatorType::kTanh) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
 class OneHot : public BuiltinOperator<OneHotOperator, ::tflite::OneHotOptions,
                                       ::tflite::BuiltinOptions_OneHotOptions> {
  public:
@@ -1544,6 +1799,35 @@ class LeakyRelu
   }
 };
 
+class Logistic : public SimpleOperator<LogisticOperator> {
+ public:
+  explicit Logistic() : SimpleOperator("LOGISTIC", OperatorType::kLogistic) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class LogSoftmax : public SimpleOperator<LogSoftmaxOperator> {
+ public:
+  explicit LogSoftmax()
+      : SimpleOperator("LOG_SOFTMAX", OperatorType::kLogSoftmax) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
 class SquaredDifference
     : public BuiltinOperator<
           SquaredDifferenceOperator, ::tflite::SquaredDifferenceOptions,
@@ -1648,6 +1932,25 @@ class UnidirectionalSequenceRnn
   }
 };
 
+class Where : public BuiltinOperator<WhereOperator, ::tflite::WhereOptions,
+                                     ::tflite::BuiltinOptions_WhereOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateWhereOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+};
+
 std::unique_ptr<flexbuffers::Builder> WriteFlexOpOptions(
     const string& tensorflow_node_def) {
   auto fbb = absl::make_unique<flexbuffers::Builder>();
@@ -1888,6 +2191,106 @@ class Dequantize
   }
 };
 
+class Equal : public SimpleOperator<TensorFlowEqualOperator> {
+ public:
+  explicit Equal() : SimpleOperator("EQUAL", OperatorType::kEqual) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class NotEqual : public SimpleOperator<TensorFlowNotEqualOperator> {
+ public:
+  explicit NotEqual() : SimpleOperator("NOT_EQUAL", OperatorType::kNotEqual) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Greater : public SimpleOperator<TensorFlowGreaterOperator> {
+ public:
+  explicit Greater() : SimpleOperator("GREATER", OperatorType::kGreater) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class GreaterEqual : public SimpleOperator<TensorFlowGreaterEqualOperator> {
+ public:
+  explicit GreaterEqual()
+      : SimpleOperator("GREATER_EQUAL", OperatorType::kGreaterEqual) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Less : public SimpleOperator<TensorFlowLessOperator> {
+ public:
+  explicit Less() : SimpleOperator("LESS", OperatorType::kLess) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class LessEqual : public SimpleOperator<TensorFlowLessEqualOperator> {
+ public:
+  explicit LessEqual()
+      : SimpleOperator("LESS_EQUAL", OperatorType::kLessEqual) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Select : public SimpleOperator<SelectOperator> {
+ public:
+  explicit Select() : SimpleOperator("SELECT", OperatorType::kSelect) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
 namespace {
 // Build a vector containing all the known operators.
 std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
@@ -1897,6 +2300,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   // Builtin Operators.
   ops.push_back(
       MakeUnique<Add>(::tflite::BuiltinOperator_ADD, OperatorType::kAdd));
+  ops.push_back(
+      MakeUnique<AddN>(::tflite::BuiltinOperator_ADD_N, OperatorType::kAddN));
   ops.push_back(
       MakeUnique<Div>(::tflite::BuiltinOperator_DIV, OperatorType::kDiv));
   ops.push_back(
@@ -1923,6 +2328,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
                                  OperatorType::kFullyConnected));
   ops.push_back(MakeUnique<Gather>(::tflite::BuiltinOperator_GATHER,
                                    OperatorType::kGather));
+  ops.push_back(MakeUnique<GatherNd>(::tflite::BuiltinOperator_GATHER_ND,
+                                     OperatorType::kGatherNd));
   ops.push_back(
       MakeUnique<L2Normalization>(::tflite::BuiltinOperator_L2_NORMALIZATION,
                                   OperatorType::kL2Normalization));
@@ -2025,6 +2432,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   ops.push_back(MakeUnique<UnidirectionalSequenceRnn>(
       ::tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
       OperatorType::kUnidirectionalSequenceRnn));
+  ops.push_back(
+      MakeUnique<Where>(::tflite::BuiltinOperator_WHERE, OperatorType::kWhere));
 
   // Custom Operators.
   ops.push_back(
@@ -2044,44 +2453,34 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
       MakeUnique<SimpleOperator<FloorOperator>>("FLOOR", OperatorType::kFloor));
   ops.push_back(
       MakeUnique<SimpleOperator<CeilOperator>>("CEIL", OperatorType::kCeil));
+  ops.push_back(
+      MakeUnique<SimpleOperator<EluOperator>>("ELU", OperatorType::kElu));
   ops.push_back(
       MakeUnique<SimpleOperator<ReluOperator>>("RELU", OperatorType::kRelu));
   ops.push_back(MakeUnique<SimpleOperator<Relu1Operator>>(
       "RELU_N1_TO_1", OperatorType::kRelu1));
-  ops.push_back(
-      MakeUnique<SimpleOperator<Relu6Operator>>("RELU6", OperatorType::kRelu6));
+  ops.push_back(MakeUnique<Relu6>());
   ops.push_back(
       MakeUnique<SimpleOperator<PReluOperator>>("PRELU", OperatorType::kPRelu));
-  ops.push_back(MakeUnique<SimpleOperator<LogisticOperator>>(
-      "LOGISTIC", OperatorType::kLogistic));
-  ops.push_back(
-      MakeUnique<SimpleOperator<TanhOperator>>("TANH", OperatorType::kTanh));
+  ops.push_back(MakeUnique<Logistic>());
+  ops.push_back(MakeUnique<Tanh>());
   ops.push_back(
       MakeUnique<SimpleOperator<ExpOperator>>("EXP", OperatorType::kExp));
-  ops.push_back(MakeUnique<SimpleOperator<LogSoftmaxOperator>>(
-      "LOG_SOFTMAX", OperatorType::kLogSoftmax));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowMaximumOperator>>(
-      "MAXIMUM", OperatorType::kMaximum));  //  Element-wise Maximum
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowMinimumOperator>>(
-      "MINIMUM", OperatorType::kMinimum));  //  Element-wise Minimum
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowGreaterOperator>>(
-      "GREATER", OperatorType::kGreater));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowGreaterEqualOperator>>(
-      "GREATER_EQUAL", OperatorType::kGreaterEqual));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowLessOperator>>(
-      "LESS", OperatorType::kLess));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowLessEqualOperator>>(
-      "LESS_EQUAL", OperatorType::kLessEqual));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowEqualOperator>>(
-      "EQUAL", OperatorType::kEqual));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowNotEqualOperator>>(
-      "NOT_EQUAL", OperatorType::kNotEqual));
   ops.push_back(
-      MakeUnique<SimpleOperator<NegOperator>>("NEG", OperatorType::kNeg));
-  ops.push_back(MakeUnique<SimpleOperator<SelectOperator>>(
-      "SELECT", OperatorType::kSelect));
+      MakeUnique<SimpleOperator<CosOperator>>("COS", OperatorType::kCos));
+  ops.push_back(MakeUnique<LogSoftmax>());
+  ops.push_back(MakeUnique<Maximum>());  //  Element-wise Maximum
+  ops.push_back(MakeUnique<Minimum>());  //  Element-wise Minimum
+  ops.push_back(MakeUnique<Greater>());
+  ops.push_back(MakeUnique<GreaterEqual>());
+  ops.push_back(MakeUnique<Less>());
+  ops.push_back(MakeUnique<LessEqual>());
+  ops.push_back(MakeUnique<Equal>());
+  ops.push_back(MakeUnique<NotEqual>());
   ops.push_back(
-      MakeUnique<SimpleOperator<SliceOperator>>("SLICE", OperatorType::kSlice));
+      MakeUnique<SimpleOperator<NegOperator>>("NEG", OperatorType::kNeg));
+  ops.push_back(MakeUnique<Select>());
+  ops.push_back(MakeUnique<Slice>());
   ops.push_back(
       MakeUnique<SimpleOperator<PowOperator>>("POW", OperatorType::kPow));
   ops.push_back(MakeUnique<SimpleOperator<LogicalOrOperator>>(
@@ -2115,6 +2514,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
       MakeUnique<SimpleOperator<FillOperator>>("FILL", OperatorType::kFill));
   ops.push_back(MakeUnique<SimpleOperator<ReverseV2Operator>>(
       "REVERSE_V2", OperatorType::kReverseV2));
+  ops.push_back(MakeUnique<SimpleOperator<TensorFlowRankOperator>>(
+      "RANK", OperatorType::kRank));
   return ops;
 }
 }  // namespace
diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc
index 88f68f7ebf919bc53255cff46dc3b0178093a70e..ac3b84777964583c17c7482855d942125c67c02f 100644
--- a/tensorflow/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/lite/toco/tflite/operator_test.cc
@@ -113,12 +113,14 @@ class OperatorTest : public ::testing::Test {
 TEST_F(OperatorTest, SimpleOperators) {
   CheckSimpleOperator<FloorOperator>("FLOOR", OperatorType::kFloor);
   CheckSimpleOperator<CeilOperator>("CEIL", OperatorType::kCeil);
+  CheckSimpleOperator<EluOperator>("ELU", OperatorType::kElu);
   CheckSimpleOperator<ReluOperator>("RELU", OperatorType::kRelu);
   CheckSimpleOperator<Relu1Operator>("RELU_N1_TO_1", OperatorType::kRelu1);
   CheckSimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6);
   CheckSimpleOperator<LogisticOperator>("LOGISTIC", OperatorType::kLogistic);
   CheckSimpleOperator<TanhOperator>("TANH", OperatorType::kTanh);
   CheckSimpleOperator<ExpOperator>("EXP", OperatorType::kExp);
+  CheckSimpleOperator<CosOperator>("COS", OperatorType::kCos);
   CheckSimpleOperator<LogSoftmaxOperator>("LOG_SOFTMAX",
                                           OperatorType::kLogSoftmax);
   CheckSimpleOperator<TensorFlowMaximumOperator>(
@@ -153,6 +155,7 @@ TEST_F(OperatorTest, SimpleOperators) {
   CheckSimpleOperator<FillOperator>("FILL", OperatorType::kFill);
   CheckSimpleOperator<ReverseV2Operator>("REVERSE_V2",
                                          OperatorType::kReverseV2);
+  CheckSimpleOperator<TensorFlowRankOperator>("RANK", OperatorType::kRank);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
@@ -164,6 +167,13 @@ TEST_F(OperatorTest, BuiltinAdd) {
             output_toco_op->fused_activation_function);
 }
 
+TEST_F(OperatorTest, BuiltinAddN) {
+  AddNOperator op;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("ADD_N", OperatorType::kAddN), op);
+  ASSERT_NE(output_toco_op.get(), nullptr);
+}
+
 TEST_F(OperatorTest, BuiltinReducerOps) {
   CheckReducerOperator<MeanOperator>("MEAN", OperatorType::kMean);
   CheckReducerOperator<TensorFlowSumOperator>("SUM", OperatorType::kSum);
@@ -232,6 +242,20 @@ TEST_F(OperatorTest, BuiltinGather) {
   ASSERT_NE(nullptr, output_toco_op.get());
 }
 
+TEST_F(OperatorTest, BuiltinGatherNd) {
+  GatherNdOperator op;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("GATHER_ND", OperatorType::kGatherNd), op);
+  ASSERT_NE(output_toco_op.get(), nullptr);
+}
+
+TEST_F(OperatorTest, BuiltinWhere) {
+  WhereOperator op;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("WHERE", OperatorType::kWhere), op);
+  ASSERT_NE(output_toco_op.get(), nullptr);
+}
+
 TEST_F(OperatorTest, BuiltinL2Pool) {
   L2PoolOperator op;
   op.stride_width = 123;
@@ -280,6 +304,44 @@ TEST_F(OperatorTest, BuiltinMaxPool) {
   EXPECT_EQ(op.kheight, output_toco_op->kheight);
 }
 
+TEST_F(OperatorTest, VersioningMaxTest) {
+  TensorFlowMaximumOperator max_op;
+  max_op.inputs = {"input1"};
+  auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/);
+  const BaseOperator* op = operator_by_type_map.at(max_op.type).get();
+
+  Model uint8_model;
+  Array& uint8_array = uint8_model.GetOrCreateArray(max_op.inputs[0]);
+  uint8_array.data_type = ArrayDataType::kUint8;
+  OperatorSignature uint8_signature = {.model = &uint8_model, .op = &max_op};
+  EXPECT_EQ(op->GetVersion(uint8_signature), 1);
+
+  Model int8_model;
+  Array& int8_array = int8_model.GetOrCreateArray(max_op.inputs[0]);
+  int8_array.data_type = ArrayDataType::kInt8;
+  OperatorSignature int8_signature = {.model = &int8_model, .op = &max_op};
+  EXPECT_EQ(op->GetVersion(int8_signature), 2);
+}
+
+TEST_F(OperatorTest, VersioningMinTest) {
+  TensorFlowMinimumOperator min_op;
+  min_op.inputs = {"input1"};
+  auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/);
+  const BaseOperator* op = operator_by_type_map.at(min_op.type).get();
+
+  Model uint8_model;
+  Array& uint8_array = uint8_model.GetOrCreateArray(min_op.inputs[0]);
+  uint8_array.data_type = ArrayDataType::kUint8;
+  OperatorSignature uint8_signature = {.model = &uint8_model, .op = &min_op};
+  EXPECT_EQ(op->GetVersion(uint8_signature), 1);
+
+  Model int8_model;
+  Array& int8_array = int8_model.GetOrCreateArray(min_op.inputs[0]);
+  int8_array.data_type = ArrayDataType::kInt8;
+  OperatorSignature int8_signature = {.model = &int8_model, .op = &min_op};
+  EXPECT_EQ(op->GetVersion(int8_signature), 2);
+}
+
 TEST_F(OperatorTest, BuiltinReshape) {
   TensorFlowReshapeOperator op;
   op.shape = {1, 2, 4, 5, 8};
@@ -670,6 +732,145 @@ TEST_F(OperatorTest, BuiltinUnique) {
   EXPECT_EQ(output_toco_op->idx_out_type, op.idx_out_type);
 }
 
+// Test version for a simple Op with 2 versions and the input type controls the
+// version.
+template <typename Op>
+void SimpleVersioningTest() {
+  Op op;
+  op.inputs = {"input1"};
+  auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/);
+  const BaseOperator* base_op = operator_by_type_map.at(op.type).get();
+
+  Model uint8_model;
+  Array& uint8_array = uint8_model.GetOrCreateArray(op.inputs[0]);
+  uint8_array.data_type = ArrayDataType::kUint8;
+  OperatorSignature uint8_signature = {.model = &uint8_model, .op = &op};
+  EXPECT_EQ(base_op->GetVersion(uint8_signature), 1);
+
+  Model int8_model;
+  Array& int8_array = int8_model.GetOrCreateArray(op.inputs[0]);
+  int8_array.data_type = ArrayDataType::kInt8;
+  OperatorSignature int8_signature = {.model = &int8_model, .op = &op};
+  EXPECT_EQ(base_op->GetVersion(int8_signature), 2);
+}
+
+TEST_F(OperatorTest, VersioningEqualTest) {
+  SimpleVersioningTest<TensorFlowEqualOperator>();
+}
+
+TEST_F(OperatorTest, VersioningNotEqualTest) {
+  SimpleVersioningTest<TensorFlowNotEqualOperator>();
+}
+
+TEST_F(OperatorTest, VersioningLessTest) {
+  SimpleVersioningTest<TensorFlowLessOperator>();
+}
+
+TEST_F(OperatorTest, VersioningLessEqualTest) {
+  SimpleVersioningTest<TensorFlowLessEqualOperator>();
+}
+
+TEST_F(OperatorTest, VersioningGreaterTest) {
+  SimpleVersioningTest<TensorFlowGreaterOperator>();
+}
+
+TEST_F(OperatorTest, VersioningGreaterEqualTest) {
+  SimpleVersioningTest<TensorFlowGreaterEqualOperator>();
+}
+
+TEST_F(OperatorTest, VersioningSpaceToBatchNDTest) {
+  SimpleVersioningTest<SpaceToBatchNDOperator>();
+}
+
+TEST_F(OperatorTest, VersioningLogSoftmaxTest) {
+  SimpleVersioningTest<LogSoftmaxOperator>();
+}
+
+TEST_F(OperatorTest, VersioningPackTest) {
+  SimpleVersioningTest<PackOperator>();
+}
+
+TEST_F(OperatorTest, VersioningBatchToSpaceNDTest) {
+  SimpleVersioningTest<BatchToSpaceNDOperator>();
+}
+
+TEST_F(OperatorTest, VersioningTanhTest) {
+  SimpleVersioningTest<TanhOperator>();
+}
+
+TEST_F(OperatorTest, VersioningStridedSliceTest) {
+  SimpleVersioningTest<StridedSliceOperator>();
+}
+
+TEST_F(OperatorTest, VersioningSpaceToDepthTest) {
+  SimpleVersioningTest<SpaceToDepthOperator>();
+}
+
+TEST_F(OperatorTest, VersioningSliceTest) {
+  SimpleVersioningTest<SliceOperator>();
+}
+
+TEST_F(OperatorTest, VersioningLogisticTest) {
+  SimpleVersioningTest<LogisticOperator>();
+}
+
+TEST_F(OperatorTest, VersioningAddTest) { SimpleVersioningTest<AddOperator>(); }
+
+TEST_F(OperatorTest, VersioningSubTest) { SimpleVersioningTest<SubOperator>(); }
+
+TEST_F(OperatorTest, VersioningMulTest) { SimpleVersioningTest<MulOperator>(); }
+
+TEST_F(OperatorTest, VersioningPadTest) { SimpleVersioningTest<PadOperator>(); }
+
+TEST_F(OperatorTest, VersioningPadV2Test) {
+  SimpleVersioningTest<PadV2Operator>();
+}
+
+TEST_F(OperatorTest, VersioningSelectTest) {
+  SimpleVersioningTest<SelectOperator>();
+}
+
+TEST_F(OperatorTest, VersioningRelu6Test) {
+  SimpleVersioningTest<Relu6Operator>();
+}
+
+TEST_F(OperatorTest, VersioningFullyConnectedTest) {
+  FullyConnectedOperator fully_connected_op;
+  fully_connected_op.inputs = {"input", "weight"};
+  fully_connected_op.outputs = {"output"};
+  auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/);
+  const BaseOperator* op =
+      operator_by_type_map.at(fully_connected_op.type).get();
+
+  Model uint8_model;
+  Array& input_uint8_array =
+      uint8_model.GetOrCreateArray(fully_connected_op.inputs[0]);
+  input_uint8_array.data_type = ArrayDataType::kUint8;
+  Array& weight_uint8_array =
+      uint8_model.GetOrCreateArray(fully_connected_op.inputs[1]);
+  weight_uint8_array.data_type = ArrayDataType::kUint8;
+  Array& output_uint8_array =
+      uint8_model.GetOrCreateArray(fully_connected_op.outputs[0]);
+  output_uint8_array.data_type = ArrayDataType::kUint8;
+  OperatorSignature uint8_signature = {.model = &uint8_model,
+                                       .op = &fully_connected_op};
+  EXPECT_EQ(op->GetVersion(uint8_signature), 1);
+
+  Model int8_model;
+  Array& input_int8_array =
+      int8_model.GetOrCreateArray(fully_connected_op.inputs[0]);
+  input_int8_array.data_type = ArrayDataType::kInt8;
+  Array& weight_int8_array =
+      int8_model.GetOrCreateArray(fully_connected_op.inputs[1]);
+  weight_int8_array.data_type = ArrayDataType::kInt8;
+  Array& output_int8_array =
+      int8_model.GetOrCreateArray(fully_connected_op.outputs[0]);
+  output_int8_array.data_type = ArrayDataType::kInt8;
+  OperatorSignature int8_signature = {.model = &int8_model,
+                                      .op = &fully_connected_op};
+  EXPECT_EQ(op->GetVersion(int8_signature), 4);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/toco/toco.cc b/tensorflow/lite/toco/toco.cc
index 4a3d6a5848751f4c1d526153bd6f6d08a9f882af..aa7e43350caca295e027a433da1d96af76bb6686 100644
--- a/tensorflow/lite/toco/toco.cc
+++ b/tensorflow/lite/toco/toco.cc
@@ -49,5 +49,10 @@ int main(int argc, char** argv) {
   }
   toco::port::InitGoogle(argv[0], effective_argc, &effective_argv, true);
   auto status = toco::Convert(parsed_toco_flags, parsed_model_flags);
-  return status.ok() ? 0 : -1;
+  if (!status.ok()) {
+    fprintf(stderr, "%s\n", status.error_message().c_str());
+    fflush(stderr);
+    return 1;
+  }
+  return 0;
 }
diff --git a/tensorflow/lite/toco/toco_convert.cc b/tensorflow/lite/toco/toco_convert.cc
index 28e7b10ecd056815c8ca6d7a74f324a18d307451..2adfc1dd236bfe3ba8ee1de70e0dbdba08d9f283 100644
--- a/tensorflow/lite/toco/toco_convert.cc
+++ b/tensorflow/lite/toco/toco_convert.cc
@@ -77,7 +77,7 @@ tensorflow::Status Convert(const string& graph_def_contents,
                            string* output_file_contents) {
   std::unique_ptr<Model> model =
       Import(toco_flags, model_flags, graph_def_contents);
-  Transform(toco_flags, model.get());
+  TF_RETURN_IF_ERROR(TransformWithStatus(toco_flags, model.get()));
   return Export(toco_flags, *model, toco_flags.allow_custom_ops(),
                 output_file_contents);
 }
diff --git a/tensorflow/lite/toco/toco_tooling.cc b/tensorflow/lite/toco/toco_tooling.cc
index 69d7a7a61a5065ba0284edb3b0fdecf40df5d267..c66ef1db915b0b055982c06e24a9706b1943c804 100644
--- a/tensorflow/lite/toco/toco_tooling.cc
+++ b/tensorflow/lite/toco/toco_tooling.cc
@@ -236,7 +236,8 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
   return model;
 }
 
-void Transform(const TocoFlags& toco_flags, Model* model) {
+tensorflow::Status TransformWithStatus(const TocoFlags& toco_flags,
+                                       Model* model) {
   const FileFormat output_format = toco_flags.output_format();
   const IODataType inference_type = toco_flags.inference_type();
 
@@ -258,8 +259,8 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   // stop optimizations from crossing the input/output boundaries. For example
   // this will stop BatchNorm fusing if the output node is in between a conv
   // and BatchNorm layers.
-  RunGraphTransformations(model, "Removing unused ops",
-                          {new toco::RemoveUnusedOp});
+  TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+      model, "Removing unused ops", {new toco::RemoveUnusedOp}));
 
   GraphTransformationsSet transformations;
   MakeGeneralGraphTransformationsSet(&transformations);
@@ -307,20 +308,21 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
     identify_dilated_conv->set_identify_depthwise_conv(false);
   }
   transformations.Add(identify_dilated_conv);
-  RunGraphTransformations(model, "general graph transformations",
-                          transformations);
+  TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+      model, "general graph transformations", transformations));
 
   if (quantize_output) {
     if (toco_flags.propagate_fake_quant_num_bits()) {
-      RunGraphTransformations(model,
-                              "fake quant propagation graph transformations",
-                              {new PropagateFakeQuantNumBits});
+      TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+          model, "fake quant propagation graph transformations",
+          {new PropagateFakeQuantNumBits}));
     }
-    RunGraphTransformations(model, "pre-quantization graph transformations",
-                            {
-                                new HardcodeMinMax,
-                                new DropFakeQuant,
-                            });
+    TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+        model, "pre-quantization graph transformations",
+        {
+            new HardcodeMinMax,
+            new DropFakeQuant,
+        }));
   }
 
   // Try to merge bidirectional sequence lstm or rnn if present.
@@ -328,8 +330,13 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   bidirectional_transformations.Add(new RemoveUnusedOp);
   bidirectional_transformations.Add(new toco::GroupBidirectionalSequenceLstm);
   bidirectional_transformations.Add(new toco::GroupBidirectionalSequenceRnn);
-  RunGraphTransformations(model, "Group bidirectional sequence lstm/rnn",
-                          bidirectional_transformations);
+  bidirectional_transformations.Add(
+      new toco::GroupDynamicBidirectionalSequenceRnn);
+  bidirectional_transformations.Add(
+      new toco::GroupDynamicBidirectionalSequenceLstm);
+  TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+      model, "Group bidirectional sequence lstm/rnn",
+      bidirectional_transformations));
 
   // Fix any issues with IO edges. This must happen after any transform that
   // may modify the structure of the edges.
@@ -357,12 +364,12 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
           toco_flags.default_int16_ranges_max());
     }
     if (propagate_default_min_max->has_any_ranges_defined()) {
-      RunGraphTransformations(
+      TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
           model, "default min-max range propagation graph transformations",
           {
               propagate_default_min_max.release(),
               new HardcodeMinMax,
-          });
+          }));
     }
 
     CheckIsReadyForQuantization(*model);
@@ -372,17 +379,18 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
         toco_flags.allow_nudging_weights_to_use_fast_gemm_kernel());
     ensure_safe_for_int8_kernels->set_has_default_ranges_flag(
         has_default_ranges_flag);
-    RunGraphTransformations(model, "quantization graph transformations",
-                            {
-                                new RemoveTrivialQuantizedActivationFunc,
-                                new RemoveTrivialQuantizedMinMax,
-                                new Quantize,
-                                new RemoveFinalDequantizeOp,
-                                ensure_safe_for_int8_kernels,
-                            });
+    TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+        model, "quantization graph transformations",
+        {
+            new RemoveTrivialQuantizedActivationFunc,
+            new RemoveTrivialQuantizedMinMax,
+            new Quantize,
+            new RemoveFinalDequantizeOp,
+            ensure_safe_for_int8_kernels,
+        }));
     if (SupportsShuffledFCWeights(output_format)) {
-      RunGraphTransformations(model, "shuffling of FC weights",
-                              {new ShuffleFCWeights});
+      TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+          model, "shuffling of FC weights", {new ShuffleFCWeights}));
     }
   } else {
     GraphTransformationsSet dequantization_transformations{new Dequantize};
@@ -392,8 +400,9 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
       dequantization_transformations.Add(new DropFakeQuant);
     }
 
-    RunGraphTransformations(model, "dequantization graph transformations",
-                            dequantization_transformations);
+    TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+        model, "dequantization graph transformations",
+        dequantization_transformations));
   }
 
   if (output_format == TENSORFLOW_GRAPHDEF) {
@@ -425,6 +434,7 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
               << " billion (note that a multiply-add is counted as 2 ops).";
   }
   model->ops_count = ops_count;
+  return tensorflow::Status::OK();
 }
 
 tensorflow::Status Export(const TocoFlags& toco_flags, const Model& model,
@@ -448,7 +458,7 @@ tensorflow::Status Export(const TocoFlags& toco_flags, const Model& model,
       return status;
     } break;
     case GRAPHVIZ_DOT:
-      DumpGraphviz(model, output_file_contents);
+      DumpGraphviz(model, output_file_contents, "Computation Graph");
       break;
     default:
       LOG(FATAL) << "Unhandled output_format='"
diff --git a/tensorflow/lite/toco/toco_tooling.h b/tensorflow/lite/toco/toco_tooling.h
index 742e3769269859c62522707ba415cd509e8df629..369961519499027ee4e3b04e4ebee6aadfd7c21c 100644
--- a/tensorflow/lite/toco/toco_tooling.h
+++ b/tensorflow/lite/toco/toco_tooling.h
@@ -31,7 +31,12 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
 
 // Transforms a Model. The resulting Model is ready to be passed
 // to Export with the exact same toco_flags.
-void Transform(const TocoFlags& toco_flags, Model* model);
+tensorflow::Status TransformWithStatus(const TocoFlags& toco_flags,
+                                       Model* model);
+inline void Transform(const TocoFlags& toco_flags, Model* model) {
+  auto s = TransformWithStatus(toco_flags, model);
+  CHECK(s.ok()) << s.error_message();
+}
 
 // Exports the Model, which must be of the 'lowered' form returned by
 // Transform, to a file of the format given by
diff --git a/tensorflow/lite/toco/tooling_util.cc b/tensorflow/lite/toco/tooling_util.cc
index 6978dde668af932d44ed824d4724234d3b96d3d9..41773356382f63b14c7162c33a733afcff0b9f36 100644
--- a/tensorflow/lite/toco/tooling_util.cc
+++ b/tensorflow/lite/toco/tooling_util.cc
@@ -66,29 +66,29 @@ string LogName(const Operator& op) {
 string ArrayDataTypeName(ArrayDataType data_type) {
   switch (data_type) {
     case ArrayDataType::kFloat:
-      return "Float";
+      return "float";
     case ArrayDataType::kInt8:
-      return "Int8";
+      return "int8";
     case ArrayDataType::kUint8:
-      return "Uint8";
+      return "uint8";
     case ArrayDataType::kInt16:
-      return "Int16";
+      return "int16";
     case ArrayDataType::kUint16:
-      return "Uint16";
+      return "uint16";
     case ArrayDataType::kInt32:
-      return "Int32";
+      return "int32";
     case ArrayDataType::kUint32:
-      return "Uint32";
+      return "uint32";
     case ArrayDataType::kInt64:
-      return "Int64";
+      return "int64";
     case ArrayDataType::kUint64:
-      return "Uint64";
+      return "uint64";
     case ArrayDataType::kString:
-      return "String";
+      return "string";
     case ArrayDataType::kBool:
-      return "Bool";
+      return "bool";
     case ArrayDataType::kComplex64:
-      return "Complex64";
+      return "complex64";
     case ArrayDataType::kNone:
       return "None";
     default:
@@ -331,6 +331,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(FakeQuant)
     HANDLE_OPERATORTYPENAME_CASE(Mul)
     HANDLE_OPERATORTYPENAME_CASE(RandomUniform)
+    HANDLE_OPERATORTYPENAME_CASE(Elu)
     HANDLE_OPERATORTYPENAME_CASE(Relu)
     HANDLE_OPERATORTYPENAME_CASE(Relu1)
     HANDLE_OPERATORTYPENAME_CASE(Relu6)
@@ -387,6 +388,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Floor)
     HANDLE_OPERATORTYPENAME_CASE(Ceil)
     HANDLE_OPERATORTYPENAME_CASE(Gather)
+    HANDLE_OPERATORTYPENAME_CASE(GatherNd)
     HANDLE_OPERATORTYPENAME_CASE(ResizeBilinear)
     HANDLE_OPERATORTYPENAME_CASE(SpaceToBatchND)
     HANDLE_OPERATORTYPENAME_CASE(BatchToSpaceND)
@@ -422,6 +424,8 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Unique)
     HANDLE_OPERATORTYPENAME_CASE(UnidirectionalSequenceRnn)
     HANDLE_OPERATORTYPENAME_CASE(ReverseV2)
+    HANDLE_OPERATORTYPENAME_CASE(Cos)
+    HANDLE_OPERATORTYPENAME_CASE(Where)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
@@ -538,7 +542,8 @@ void DumpGraphvizVideoFrame(const Model& model) {
   static int dump_id = 0;
   static std::unordered_set<std::size_t> dump_hashes;
   string graphviz_dump;
-  DumpGraphviz(model, &graphviz_dump);
+  DumpGraphviz(model, &graphviz_dump,
+               toco::port::StringF("VIDEO frame:%05d", dump_id));
   std::size_t hash = std::hash<string>{}(graphviz_dump);
   if (!dump_hashes.count(hash)) {
     LOG(INFO) << "DUMPING GRAPHVIZ VIDEO FRAME: " << dump_id;
@@ -561,7 +566,7 @@ void LogDump(int log_level, const string& message, const Model& model) {
   if (!dump_options.dump_graphviz.empty()) {
     string graphviz_dump;
 
-    DumpGraphviz(model, &graphviz_dump);
+    DumpGraphviz(model, &graphviz_dump, message);
     const auto result = port::file::SetContents(
         port::file::JoinPath(
             dump_options.dump_graphviz,
@@ -899,11 +904,6 @@ void CheckNonExistentIOArrays(const Model& model) {
   static constexpr char general_comment[] =
       "Is it a typo? To silence this message, pass this flag:  "
       "allow_nonexistent_arrays";
-  for (const auto& input_array : model.flags.input_arrays()) {
-    QCHECK(GetOpWithInput(model, input_array.name()))
-        << "Specified input array \"" << input_array.name()
-        << "\" is not consumed by any op in this graph. " << general_comment;
-  }
   for (const string& output_array : model.flags.output_arrays()) {
     if (IsConstantParameterArray(model, output_array)) {
       continue;  // It is OK to request that a constant be an output.
@@ -1863,119 +1863,140 @@ string CreateInt32Array(Model* model, const string& param_name,
   return param_array_name;
 }
 
-bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
-  int64 total = 0;
-  for (const auto& op : model.operators) {
-    switch (op->type) {
-      case OperatorType::kFullyConnected:
-      case OperatorType::kConv:
-      case OperatorType::kDepthwiseConv: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        const auto& weights_array = model.GetArray(op->inputs[1]);
-        if (!output_array.has_shape() || !weights_array.has_shape()) {
-          return false;
-        }
-        int cols = 1;
-        for (int i = 0; i < output_array.shape().dimensions_count() - 1; i++) {
-          cols *= output_array.shape().dims(i);
-        }
-        const int64 cost_per_col =
-            2 * RequiredBufferSizeForShape(weights_array.shape());
-        total += cost_per_col * cols;
-        if (op->inputs.size() > 2) {
-          // There is a bias vector. One more op per output value.
-          total += RequiredBufferSizeForShape(output_array.shape());
-        }
-        break;
+bool EstimateArithmeticOpsCount(const Model& model, const Operator& op,
+                                int64* result) {
+  switch (op.type) {
+    case OperatorType::kFullyConnected:
+    case OperatorType::kConv:
+    case OperatorType::kDepthwiseConv: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      const auto& weights_array = model.GetArray(op.inputs[1]);
+      if (!output_array.has_shape() || !weights_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kAdd:
-      case OperatorType::kSub:
-      case OperatorType::kMul: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        total += RequiredBufferSizeForShape(output_array.shape());
-        break;
+      int64 cols = 1;
+      for (int i = 0; i < output_array.shape().dimensions_count() - 1; i++) {
+        cols *= output_array.shape().dims(i);
       }
-      case OperatorType::kAddN: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        // AddN cost is roughly the same cost as N-1 Adds.
-        const int num_adds = op->inputs.size() - 1;
-        total += num_adds * RequiredBufferSizeForShape(output_array.shape());
-        break;
+      const int64 cost_per_col =
+          2 * RequiredBufferSizeForShape(weights_array.shape());
+      *result = cost_per_col * cols;
+      if (op.inputs.size() > 2) {
+        // There is a bias vector. One more op per output value.
+        *result += RequiredBufferSizeForShape(output_array.shape());
       }
-      case OperatorType::kLogistic:
-      case OperatorType::kSoftmax:
-      case OperatorType::kLogSoftmax:
-      case OperatorType::kTanh: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        // As a very rough ballpark, the cost of evaluating a math function
-        // such as tanh or logistic is about 32 multiplications, and about as
-        // many additions/subtractions. (Just a power-of-two order-of-magnitude
-        // from looking at actual implementations that we use in runtime/ code).
-        total += 64 * RequiredBufferSizeForShape(output_array.shape());
-        break;
+      break;
+    }
+    case OperatorType::kAdd:
+    case OperatorType::kSub:
+    case OperatorType::kMul: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kMaxPool: {
-        const auto& maxpool = *static_cast<const MaxPoolOperator*>(op.get());
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        total += RequiredBufferSizeForShape(output_array.shape()) *
-                 maxpool.kheight * maxpool.kwidth;
-        break;
+      *result = RequiredBufferSizeForShape(output_array.shape());
+      break;
+    }
+    case OperatorType::kAddN: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kAveragePool: {
-        const auto& avgpool =
-            *static_cast<const AveragePoolOperator*>(op.get());
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        total += RequiredBufferSizeForShape(output_array.shape()) *
-                 avgpool.kheight * avgpool.kwidth;
-        break;
+      // AddN cost is roughly the same cost as N-1 Adds.
+      const int64 num_adds = op.inputs.size() - 1;
+      *result = num_adds * RequiredBufferSizeForShape(output_array.shape());
+      break;
+    }
+    case OperatorType::kLogistic:
+    case OperatorType::kSoftmax:
+    case OperatorType::kLogSoftmax:
+    case OperatorType::kTanh: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kL2Pool: {
-        const auto* maxpool = static_cast<const MaxPoolOperator*>(op.get());
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        // The sum of squares requires (kheight*kwidth) multiply-adds,
-        // and then there is the sqrt which we ballpark at 32 ops.
-        const int64 cost_per_val = 2 * maxpool->kheight * maxpool->kwidth + 32;
-        total +=
-            RequiredBufferSizeForShape(output_array.shape()) * cost_per_val;
-        break;
+      // As a very rough ballpark, the cost of evaluating a math function
+      // such as tanh or logistic is about 32 multiplications, and about as
+      // many additions/subtractions. (Just a power-of-two order-of-magnitude
+      // from looking at actual implementations that we use in runtime/ code).
+      *result = 64 * RequiredBufferSizeForShape(output_array.shape());
+      break;
+    }
+    case OperatorType::kMaxPool: {
+      const auto& maxpool = *static_cast<const MaxPoolOperator*>(&op);
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kL2Normalization: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        // Computing the squared L2 norm is N multiply-adds so 2N ops,
-        // then the single inverse-sqrt is negligible, then we multiply each
-        // value by the resulting multiplier, so an extra N ops. Total 3N ops.
-        total += 3 * RequiredBufferSizeForShape(output_array.shape());
-        break;
+      *result = RequiredBufferSizeForShape(output_array.shape()) *
+                maxpool.kheight * maxpool.kwidth;
+      break;
+    }
+    case OperatorType::kAveragePool: {
+      const auto& avgpool = *static_cast<const AveragePoolOperator*>(&op);
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      default:
-        break;
+      *result = RequiredBufferSizeForShape(output_array.shape()) *
+                avgpool.kheight * avgpool.kwidth;
+      break;
+    }
+    case OperatorType::kL2Pool: {
+      const auto* maxpool = static_cast<const MaxPoolOperator*>(&op);
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
+      }
+      // The sum of squares requires (kheight*kwidth) multiply-adds,
+      // and then there is the sqrt which we ballpark at 32 ops.
+      const int64 cost_per_val = 2 * maxpool->kheight * maxpool->kwidth + 32;
+      *result = RequiredBufferSizeForShape(output_array.shape()) * cost_per_val;
+      break;
+    }
+    case OperatorType::kL2Normalization: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
+      }
+      // Computing the squared L2 norm is N multiply-adds so 2N ops,
+      // then the single inverse-sqrt is negligible, then we multiply each
+      // value by the resulting multiplier, so an extra N ops. count 3N ops.
+      *result = 3 * RequiredBufferSizeForShape(output_array.shape());
+      break;
     }
+    default:
+      *result = 0;
+      break;
+  }
+  return true;
+}
+
+bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
+  int64 total = 0;
+  for (const auto& op : model.operators) {
+    int64 num_ops;
+    if (!EstimateArithmeticOpsCount(model, *op, &num_ops)) {
+      return false;
+    }
+    total += num_ops;
   }
   *result = total;
   return true;
 }
 
+string FormattedNumber(int64 x) {
+  const int64 million = 1000000;
+  const int64 billion = 1000000000;
+  if (x < 10000) {
+    return toco::port::StringF("%d ", x);
+  } else if (x < billion) {
+    return toco::port::StringF("%.3f M", static_cast<double>(x) / million);
+  } else {
+    return toco::port::StringF("%.3f G", static_cast<double>(x) / billion);
+  }
+}
+
 void GetShuffleShape(AxesOrder input_axes_order, AxesOrder output_axes_order,
                      std::vector<int>* shuffle) {
   CHECK_EQ(AxesCount(input_axes_order), AxesCount(output_axes_order));
diff --git a/tensorflow/lite/toco/tooling_util.h b/tensorflow/lite/toco/tooling_util.h
index 517da784d0e6395eb06a0bf0fb9004645d292e42..fc4aac7740c64e06a7d762f3af4db504ea1213bd 100644
--- a/tensorflow/lite/toco/tooling_util.h
+++ b/tensorflow/lite/toco/tooling_util.h
@@ -267,7 +267,10 @@ void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
 string CreateInt32Array(Model* model, const string& param_name,
                         const std::vector<int>& value);
 
+bool EstimateArithmeticOpsCount(const Model& model, const Operator& op,
+                                int64* result);
 bool EstimateArithmeticOpsCount(const Model& model, int64* result);
+string FormattedNumber(int64 x);
 
 int AxesCount(AxesOrder axes_order);
 
diff --git a/tensorflow/lite/tools/BUILD b/tensorflow/lite/tools/BUILD
index 1d141b5dd01a4a03c65d0c8a119ad62eea224d52..f67b3f98e9beafd1548a2033289ffbc9e3b86356 100644
--- a/tensorflow/lite/tools/BUILD
+++ b/tensorflow/lite/tools/BUILD
@@ -70,7 +70,9 @@ cc_library(
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite:string_util",
+        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -86,6 +88,7 @@ cc_test(
         "//tensorflow/core:framework_lite",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/schema:schema_fbs",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
index 04b6cb755892bd218d899587bd81b818a51f85d8..b730b0804e0df3d559ec99552fb443efc3e867eb 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
index 0bc7565e82c0471c439c0a0ab84e09dd39c7b9a9..83e0ff1f872ef7849a92aa24a3ff052b61dc6fa8 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -356,11 +356,23 @@ void BenchmarkTfLiteModel::Init() {
     }
   }
 
-  if (interpreter->AllocateTensors() != kTfLiteOk) {
+  // Don't allocate tensors if we have delegates.
+  if (delegates_.empty() && interpreter->AllocateTensors() != kTfLiteOk) {
     TFLITE_LOG(FATAL) << "Failed to allocate tensors!";
   }
 }
 
+void BenchmarkTfLiteModel::ApplyDelegates() {
+  for (int i = 0; i < delegates_.size(); ++i) {
+    if (interpreter->ModifyGraphWithDelegate(delegates_[i].get()) !=
+        kTfLiteOk) {
+      TFLITE_LOG(FATAL) << "Failed to apply delegate # " << i;
+    } else {
+      TFLITE_LOG(INFO) << "Applied Delegate # " << i;
+    }
+  }
+}
+
 void BenchmarkTfLiteModel::RunImpl() {
   if (interpreter->Invoke() != kTfLiteOk) {
     TFLITE_LOG(FATAL) << "Failed to invoke!";
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
index 83599e644d1f41f70fd96f3a73f9155d6e62deef..3532d2a5ddb104d413612e00001bf213e90e283e 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
@@ -70,6 +70,9 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   uint64_t ComputeInputBytes() override;
   void Init() override;
   void RunImpl() override;
+  void SetDelegates(std::vector<std::unique_ptr<TfLiteDelegate>> delegates) {
+    delegates_ = std::move(delegates);
+  }
 
   struct InputLayerInfo {
     std::string name;
@@ -81,7 +84,7 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   void PrepareInputsAndOutputs() override;
 
   // Allows installation of custom delegates during initialization
-  virtual void ApplyDelegates() {}
+  virtual void ApplyDelegates();
 
   std::unique_ptr<tflite::FlatBufferModel> model;
   std::unique_ptr<tflite::Interpreter> interpreter;
@@ -90,6 +93,7 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   std::vector<InputLayerInfo> inputs;
   ProfilingListener profiling_listener_;
   GemmlowpProfilingListener gemmlowp_profiling_listener_;
+  std::vector<std::unique_ptr<TfLiteDelegate>> delegates_;
 };
 
 }  // namespace benchmark
diff --git a/tensorflow/lite/tools/benchmark/ios/README.md b/tensorflow/lite/tools/benchmark/ios/README.md
index 8142f48ad9002cd6578b790e794e06094f349b74..ee880f005dfaec1cd27d5dc093720f5de5433bfa 100644
--- a/tensorflow/lite/tools/benchmark/ios/README.md
+++ b/tensorflow/lite/tools/benchmark/ios/README.md
@@ -27,9 +27,9 @@ Running
 tensorflow/lite/tools/make/build_ios_universal_lib.sh
 ```
 
-will also build `tensorflow/lite/gen/lib/benchmark-lib.a` .
+will also build `tensorflow/lite/tools/make/gen/lib/benchmark-lib.a` .
 
-- Now copy the downloaded model file to `benchmark_data` directory. 
+- Now copy the downloaded model file to `benchmark_data` directory.
 
 - Modify `benchmark_params.json` change the `input_layer`, `input_layer_shape`
 and other benchmark parameters.
@@ -37,8 +37,8 @@ and other benchmark parameters.
 - Change `Build Phases -> Copy Bundle Resources` and add the model file to the
 resources that need to be copied.
 
-- Ensure that `Build Phases -> Link Binary With Library` contains the 
-`Accelerate framework` and `tensorflow/lite/gen/lib/benchmark-lib.a`.
+- Ensure that `Build Phases -> Link Binary With Library` contains the
+`Accelerate framework` and `tensorflow/lite/tools/make/gen/lib/benchmark-lib.a`.
 
 - Now try running the app. The app has a single button that runs the benchmark
   on the model and displays results in a text view below.
@@ -48,7 +48,7 @@ resources that need to be copied.
 If you want detailed profiling, use the following command:
 
 ```bash
-tensorflow/lite/build_ios_universal_lib.sh -p
+tensorflow/lite/tools/make/build_ios_universal_lib.sh -p
 ```
 
 Then following the same steps above and run the benchmark app. You will see the
diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile
index e98ba9b2e0720b11b3a3a83d3c4c82d41027aeb6..4a22d7630adbdec4386034f045bd8b63e01f249a 100644
--- a/tensorflow/lite/tools/make/Makefile
+++ b/tensorflow/lite/tools/make/Makefile
@@ -109,7 +109,7 @@ $(wildcard tensorflow/lite/*test.cc) \
 $(wildcard tensorflow/lite/*/*test.cc) \
 $(wildcard tensorflow/lite/*/*/*test.cc) \
 $(wildcard tensorflow/lite/*/*/*/*test.cc) \
-$(wildcard tensorflow/lite/kernels/test_util.cc) \
+$(wildcard tensorflow/lite/kernels/*test_util.cc) \
 $(MINIMAL_SRCS)
 
 ifeq ($(BUILD_TYPE),micro)
@@ -131,6 +131,14 @@ else
 	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/nnapi_delegate.cc
 endif
 
+ifeq ($(TARGET),ios)
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/minimal_logging_android.cc
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/minimal_logging_default.cc
+else
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/minimal_logging_android.cc
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/minimal_logging_ios.cc
+endif
+
 
 # Filter out all the excluded files.
 TF_LITE_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
diff --git a/tensorflow/lite/tools/optimize/BUILD b/tensorflow/lite/tools/optimize/BUILD
index ecc48f807e4a5daa6bb7f7368e82c918002fa7d0..cf99cada4a2487bed08a05246b706c69edd9501d 100644
--- a/tensorflow/lite/tools/optimize/BUILD
+++ b/tensorflow/lite/tools/optimize/BUILD
@@ -56,134 +56,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "calibrator_lib",
-    srcs = ["calibrator.cc"],
-    hdrs = ["calibrator.h"],
-    deps = [
-        ":calibration_common",
-        ":calibration_logger",
-        ":calibration_reader",
-        ":logging_op_resolver",
-        ":node_info_delegate",
-        "//tensorflow/lite:framework",
-        "//tensorflow/lite:string_util",
-        "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/core/api",
-        "//tensorflow/lite/kernels:builtin_ops",
-        "//tensorflow/lite/schema:schema_fbs",
-        "@com_google_absl//absl/memory",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
-    name = "calibrator_test",
-    srcs = ["calibrator_test.cc"],
-    args = [
-        "--test_model_file=$(location //tensorflow/lite:testdata/multi_add.bin)",
-    ],
-    data = [
-        "//tensorflow/lite:testdata/multi_add.bin",
-    ],
-    tags = [
-        "tflite_not_portable_android",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":calibrator_lib",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-        "//tensorflow/lite:framework",
-        "//tensorflow/lite/kernels:builtin_ops",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "logging_op_resolver",
-    srcs = ["logging_op_resolver.cc"],
-    hdrs = ["logging_op_resolver.h"],
-    deps = [
-        ":calibration_common",
-        "//tensorflow/lite:framework",
-        "//tensorflow/lite/core/api",
-        "@com_google_absl//absl/memory",
-    ],
-)
-
-cc_test(
-    name = "logging_op_resolver_test",
-    srcs = ["logging_op_resolver_test.cc"],
-    deps = [
-        ":logging_op_resolver",
-        "//tensorflow/lite:framework",
-        "//tensorflow/lite/kernels:builtin_ops",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "calibration_reader",
-    srcs = ["calibration_reader.cc"],
-    hdrs = ["calibration_reader.h"],
-    deps = [
-        ":calibration_logger",
-        "//tensorflow/lite:framework",
-        "@com_google_absl//absl/memory",
-    ],
-)
-
-cc_library(
-    name = "calibration_logger",
-    hdrs = ["calibration_logger.h"],
-    deps = [
-        "//tensorflow/lite/c:c_api_internal",
-    ],
-)
-
-cc_library(
-    name = "calibration_common",
-    hdrs = ["calibration_common.h"],
-    deps = [
-        "//tensorflow/lite:framework",
-    ],
-)
-
-cc_library(
-    name = "node_info_delegate",
-    srcs = ["node_info_delegate.cc"],
-    hdrs = ["node_info_delegate.h"],
-    deps = [
-        ":calibration_common",
-        "//tensorflow/lite:framework",
-    ],
-)
-
-tf_cc_test(
-    name = "node_info_delegate_test",
-    srcs = ["node_info_delegate_test.cc"],
-    args = [
-        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
-    ],
-    data = [
-        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
-    ],
-    tags = [
-        "tflite_not_portable_android",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":node_info_delegate",
-        ":test_util",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-        "//tensorflow/lite:framework",
-        "//tensorflow/lite/kernels:builtin_ops",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
 tf_cc_test(
     name = "quantize_weights_test",
     srcs = ["quantize_weights_test.cc"],
diff --git a/tensorflow/lite/tools/optimize/calibration/BUILD b/tensorflow/lite/tools/optimize/calibration/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c1d2ad2bca8f76b1e07dfe6d6027ec69cd821c8a
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/BUILD
@@ -0,0 +1,138 @@
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "calibrator_lib",
+    srcs = ["calibrator.cc"],
+    hdrs = ["calibrator.h"],
+    deps = [
+        ":calibration_common",
+        ":calibration_logger",
+        ":calibration_reader",
+        ":logging_op_resolver",
+        ":node_info_delegate",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_absl//absl/memory",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "calibrator_test",
+    srcs = ["calibrator_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite:testdata/multi_add.bin)",
+    ],
+    data = [
+        "//tensorflow/lite:testdata/multi_add.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":calibrator_lib",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "logging_op_resolver",
+    srcs = ["logging_op_resolver.cc"],
+    hdrs = ["logging_op_resolver.h"],
+    deps = [
+        ":calibration_common",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/core/api",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+cc_test(
+    name = "logging_op_resolver_test",
+    srcs = ["logging_op_resolver_test.cc"],
+    deps = [
+        ":logging_op_resolver",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "calibration_reader",
+    srcs = ["calibration_reader.cc"],
+    hdrs = ["calibration_reader.h"],
+    deps = [
+        ":calibration_logger",
+        "//tensorflow/lite:framework",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+cc_library(
+    name = "calibration_logger",
+    hdrs = ["calibration_logger.h"],
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
+cc_library(
+    name = "calibration_common",
+    hdrs = ["calibration_common.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+    ],
+)
+
+cc_library(
+    name = "node_info_delegate",
+    srcs = ["node_info_delegate.cc"],
+    hdrs = ["node_info_delegate.h"],
+    deps = [
+        ":calibration_common",
+        "//tensorflow/lite:framework",
+    ],
+)
+
+tf_cc_test(
+    name = "node_info_delegate_test",
+    srcs = ["node_info_delegate_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
+    ],
+    data = [
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":node_info_delegate",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/tools/optimize:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/tools/optimize/calibration_common.h b/tensorflow/lite/tools/optimize/calibration/calibration_common.h
similarity index 100%
rename from tensorflow/lite/tools/optimize/calibration_common.h
rename to tensorflow/lite/tools/optimize/calibration/calibration_common.h
diff --git a/tensorflow/lite/tools/optimize/calibration_logger.h b/tensorflow/lite/tools/optimize/calibration/calibration_logger.h
similarity index 100%
rename from tensorflow/lite/tools/optimize/calibration_logger.h
rename to tensorflow/lite/tools/optimize/calibration/calibration_logger.h
diff --git a/tensorflow/lite/tools/optimize/calibration_reader.cc b/tensorflow/lite/tools/optimize/calibration/calibration_reader.cc
similarity index 96%
rename from tensorflow/lite/tools/optimize/calibration_reader.cc
rename to tensorflow/lite/tools/optimize/calibration/calibration_reader.cc
index b01a62bd6c15dee5b60edf5f3abdd40ba4c3a56b..69e9c5aed8dc3a6a27225fc55d87b900dc9d4730 100644
--- a/tensorflow/lite/tools/optimize/calibration_reader.cc
+++ b/tensorflow/lite/tools/optimize/calibration/calibration_reader.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/tools/optimize/calibration_reader.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_reader.h"
 
 #include "absl/memory/memory.h"
 
diff --git a/tensorflow/lite/tools/optimize/calibration_reader.h b/tensorflow/lite/tools/optimize/calibration/calibration_reader.h
similarity index 96%
rename from tensorflow/lite/tools/optimize/calibration_reader.h
rename to tensorflow/lite/tools/optimize/calibration/calibration_reader.h
index af0da1bb3835493e69ef7a6bccb7149ef14b1db9..0120d841900e4432fcee49e285ade46007bd3660 100644
--- a/tensorflow/lite/tools/optimize/calibration_reader.h
+++ b/tensorflow/lite/tools/optimize/calibration/calibration_reader.h
@@ -19,7 +19,7 @@ limitations under the License.
 
 #include "tensorflow/lite/context.h"
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/tools/optimize/calibration_logger.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_logger.h"
 
 namespace tflite {
 namespace optimize {
diff --git a/tensorflow/lite/tools/optimize/calibrator.cc b/tensorflow/lite/tools/optimize/calibration/calibrator.cc
similarity index 96%
rename from tensorflow/lite/tools/optimize/calibrator.cc
rename to tensorflow/lite/tools/optimize/calibration/calibrator.cc
index ccb55c3081f331189c35cc3dc302ae3f7725b2b5..eead4e590f8a42c5362b4efb952511b48e51d2de 100644
--- a/tensorflow/lite/tools/optimize/calibrator.cc
+++ b/tensorflow/lite/tools/optimize/calibration/calibrator.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/tools/optimize/calibrator.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibrator.h"
 
 #include <fstream>
 #include <memory>
@@ -30,11 +30,11 @@ limitations under the License.
 #include "tensorflow/lite/op_resolver.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_util.h"
-#include "tensorflow/lite/tools/optimize/calibration_common.h"
-#include "tensorflow/lite/tools/optimize/calibration_logger.h"
-#include "tensorflow/lite/tools/optimize/calibration_reader.h"
-#include "tensorflow/lite/tools/optimize/logging_op_resolver.h"
-#include "tensorflow/lite/tools/optimize/node_info_delegate.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_common.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_logger.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_reader.h"
+#include "tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h"
+#include "tensorflow/lite/tools/optimize/calibration/node_info_delegate.h"
 
 namespace tflite {
 namespace optimize {
diff --git a/tensorflow/lite/tools/optimize/calibrator.h b/tensorflow/lite/tools/optimize/calibration/calibrator.h
similarity index 96%
rename from tensorflow/lite/tools/optimize/calibrator.h
rename to tensorflow/lite/tools/optimize/calibration/calibrator.h
index ab3cb27eb7518b7327655023739e310e2a6b0249..fb7e03f5ce71f3601d6a1b0f8c912f570f67b1c9 100644
--- a/tensorflow/lite/tools/optimize/calibrator.h
+++ b/tensorflow/lite/tools/optimize/calibration/calibrator.h
@@ -20,7 +20,7 @@ limitations under the License.
 #include "flatbuffers/flatbuffers.h"  // TF:flatbuffers
 #include "tensorflow/lite/core/api/op_resolver.h"
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/tools/optimize/calibration_reader.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_reader.h"
 
 namespace tflite {
 namespace optimize {
diff --git a/tensorflow/lite/tools/optimize/calibrator_test.cc b/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc
similarity index 99%
rename from tensorflow/lite/tools/optimize/calibrator_test.cc
rename to tensorflow/lite/tools/optimize/calibration/calibrator_test.cc
index 1a415adc55b1b7790da190874b184a4da77716aa..60e652ec7a1fcc0d3844f0254fa6ff6072a861ce 100644
--- a/tensorflow/lite/tools/optimize/calibrator_test.cc
+++ b/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/tools/optimize/calibrator.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibrator.h"
 
 namespace {
 tensorflow::string* g_test_model_file = nullptr;
diff --git a/tensorflow/lite/tools/optimize/logging_op_resolver.cc b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.cc
similarity index 96%
rename from tensorflow/lite/tools/optimize/logging_op_resolver.cc
rename to tensorflow/lite/tools/optimize/calibration/logging_op_resolver.cc
index 7633ebb8dd9d7aee0b8a5befa5d51911f68a7e32..d2a09e898ae213c9a2aaa6e7e26adb6eda638a67 100644
--- a/tensorflow/lite/tools/optimize/logging_op_resolver.cc
+++ b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/tools/optimize/logging_op_resolver.h"
+#include "tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h"
 
 #include "absl/memory/memory.h"
 
diff --git a/tensorflow/lite/tools/optimize/logging_op_resolver.h b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h
similarity index 96%
rename from tensorflow/lite/tools/optimize/logging_op_resolver.h
rename to tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h
index 58a3a0fe3c08288ccba6881a64b1fd581103da10..af4127e42f76dcdcfff00bee4b811dd20111165d 100644
--- a/tensorflow/lite/tools/optimize/logging_op_resolver.h
+++ b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/lite/core/api/op_resolver.h"
 #include "tensorflow/lite/mutable_op_resolver.h"
 #include "tensorflow/lite/op_resolver.h"
-#include "tensorflow/lite/tools/optimize/calibration_common.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_common.h"
 
 namespace tflite {
 namespace optimize {
diff --git a/tensorflow/lite/tools/optimize/logging_op_resolver_test.cc b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver_test.cc
similarity index 98%
rename from tensorflow/lite/tools/optimize/logging_op_resolver_test.cc
rename to tensorflow/lite/tools/optimize/calibration/logging_op_resolver_test.cc
index c7fe2d37ce49e4e467ef92c963ef33692d5d998b..d8d29ad8eff0cea0967a6d0e91e84714b5fbe80f 100644
--- a/tensorflow/lite/tools/optimize/logging_op_resolver_test.cc
+++ b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/tools/optimize/logging_op_resolver.h"
+#include "tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/mutable_op_resolver.h"
diff --git a/tensorflow/lite/tools/optimize/node_info_delegate.cc b/tensorflow/lite/tools/optimize/calibration/node_info_delegate.cc
similarity index 88%
rename from tensorflow/lite/tools/optimize/node_info_delegate.cc
rename to tensorflow/lite/tools/optimize/calibration/node_info_delegate.cc
index ccaa69373fcf55adaef21a948089ea59821ca763..2b9197498b03dad6a37b7370ce2a0d2751ac9bcd 100644
--- a/tensorflow/lite/tools/optimize/node_info_delegate.cc
+++ b/tensorflow/lite/tools/optimize/calibration/node_info_delegate.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/tools/optimize/node_info_delegate.h"
+#include "tensorflow/lite/tools/optimize/calibration/node_info_delegate.h"
 
 namespace tflite {
 namespace optimize {
@@ -33,11 +33,11 @@ TfLiteStatus NodeInfoDelegatePrepare(TfLiteContext* context,
 }  // namespace
 
 TfLiteDelegate CreateNodeInfoDelegate(NodeInfoDelegateParams* params) {
-  return {.data_ = params,
-          .Prepare = NodeInfoDelegatePrepare,
-          .CopyFromBufferHandle = nullptr,
-          .CopyToBufferHandle = nullptr,
-          .FreeBufferHandle = nullptr};
+  return {/*data_ */ params,
+          /* Prepare */ NodeInfoDelegatePrepare,
+          /* CopyFromBufferHandle*/ nullptr,
+          /* CopyToBufferHandle*/ nullptr,
+          /* FreeBufferHandle*/ nullptr};
 }
 
 TfLiteStatus NodeInfoDelegateObserver::OnDelegatePrepareCalled(
diff --git a/tensorflow/lite/tools/optimize/node_info_delegate.h b/tensorflow/lite/tools/optimize/calibration/node_info_delegate.h
similarity index 96%
rename from tensorflow/lite/tools/optimize/node_info_delegate.h
rename to tensorflow/lite/tools/optimize/calibration/node_info_delegate.h
index 8ee2ce1978cf87b104518c4b64e84df166cef32d..56f6141f21dc3f807c53ac5e92833597f6cef4a9 100644
--- a/tensorflow/lite/tools/optimize/node_info_delegate.h
+++ b/tensorflow/lite/tools/optimize/calibration/node_info_delegate.h
@@ -18,7 +18,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/lite/context.h"
-#include "tensorflow/lite/tools/optimize/calibration_common.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_common.h"
 
 namespace tflite {
 namespace optimize {
diff --git a/tensorflow/lite/tools/optimize/node_info_delegate_test.cc b/tensorflow/lite/tools/optimize/calibration/node_info_delegate_test.cc
similarity index 98%
rename from tensorflow/lite/tools/optimize/node_info_delegate_test.cc
rename to tensorflow/lite/tools/optimize/calibration/node_info_delegate_test.cc
index 05332c56b549a49d72d67cbed4fa0832d38a8dcc..b110174b6325a8daadacfd472e62321ef69425f7 100644
--- a/tensorflow/lite/tools/optimize/node_info_delegate_test.cc
+++ b/tensorflow/lite/tools/optimize/calibration/node_info_delegate_test.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/tools/optimize/node_info_delegate.h"
+#include "tensorflow/lite/tools/optimize/calibration/node_info_delegate.h"
 #include "tensorflow/lite/tools/optimize/test_util.h"
 
 namespace {
diff --git a/tensorflow/lite/tools/optimize/subgraph_quantizer.cc b/tensorflow/lite/tools/optimize/subgraph_quantizer.cc
index 118e055058f37693c95cd7278f3229f35434ddb4..05115e8775044cd2ae3e8e89f12ba23adff62250 100644
--- a/tensorflow/lite/tools/optimize/subgraph_quantizer.cc
+++ b/tensorflow/lite/tools/optimize/subgraph_quantizer.cc
@@ -67,17 +67,17 @@ struct OpWithBiasTensors {
 
 const OpWithBiasTensors* GetInfoForOpWithBiasTensor(BuiltinOperator op_code) {
   if (op_code == BuiltinOperator_CONV_2D) {
-    static OpWithBiasTensors op_info = {.activation_input_index = 0,
-                                        .weights_input_index = 1,
-                                        .bias_input_index = 2,
-                                        .index_for_channel_in_weights = 0};
+    static OpWithBiasTensors op_info = {/* activation_input_index */ 0,
+                                        /* weights_input_index */ 1,
+                                        /* bias_input_index */ 2,
+                                        /* index_for_channel_in_weights */ 0};
     return &op_info;
   }
   if (op_code == BuiltinOperator_DEPTHWISE_CONV_2D) {
-    static OpWithBiasTensors op_info = {.activation_input_index = 0,
-                                        .weights_input_index = 1,
-                                        .bias_input_index = 2,
-                                        .index_for_channel_in_weights = 3};
+    static OpWithBiasTensors op_info = {/* bias_input_index */ 0,
+                                        /* bias_input_index */ 1,
+                                        /* bias_input_index */ 2,
+                                        /* index_for_channel_in_weights */ 3};
     return &op_info;
   }
 
@@ -325,6 +325,27 @@ TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeSingleInputOutputOp(
   return kTfLiteOk;
 }
 
+TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeSoftmax(
+    BuiltinOperator op_code, OperatorT* op) {
+  TF_LITE_ENSURE_EQ(this->error_reporter_, op->inputs.size(), 1);
+  TF_LITE_ENSURE_EQ(this->error_reporter_, op->outputs.size(), 1);
+
+  if (IsSubgraphInput(op->inputs[0])) {
+    TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, op->inputs[0]));
+  }
+
+  auto output_tensor = subgraph_->tensors[op->outputs[0]].get();
+  if (output_tensor->type != TensorType_FLOAT32) {
+    return kTfLiteOk;
+  }
+
+  // Softmax output is hardcoded to have 1/256 as scale and -128 as zero point.
+  output_tensor->type = TensorType_INT8;
+  output_tensor->quantization->scale = {1.0f / 256.0f};
+  output_tensor->quantization->zero_point = {-128};
+  return kTfLiteOk;
+}
+
 bool SubgraphQuantizer::IsSubgraphInput(int32_t tensor_idx) const {
   return std::find(subgraph_->inputs.begin(), subgraph_->inputs.end(),
                    tensor_idx) != subgraph_->inputs.end();
@@ -342,8 +363,9 @@ TfLiteStatus SubgraphQuantizer::QuantizeOperator(int op_idx) {
     case BuiltinOperator_MAX_POOL_2D:
       return PropagateMinMaxForAvgAndMaxPool(op_code, op);
     case BuiltinOperator_SQUEEZE:
-    case BuiltinOperator_SOFTMAX:
       return AsymmetricQuantizeSingleInputOutputOp(op_code, op);
+    case BuiltinOperator_SOFTMAX:
+      return AsymmetricQuantizeSoftmax(op_code, op);
     default:
       return kTfLiteError;
   }
diff --git a/tensorflow/lite/tools/optimize/subgraph_quantizer.h b/tensorflow/lite/tools/optimize/subgraph_quantizer.h
index 9d6ca7fad594f4831847c2b2f9de5d6bc0be5e6d..fd1c39253cd1e04c2737a3ceafa429cc15e80580 100644
--- a/tensorflow/lite/tools/optimize/subgraph_quantizer.h
+++ b/tensorflow/lite/tools/optimize/subgraph_quantizer.h
@@ -51,6 +51,12 @@ class SubgraphQuantizer {
   TfLiteStatus AsymmetricQuantizeSingleInputOutputOp(BuiltinOperator op_code,
                                                      OperatorT* op);
 
+  // Asymmetric quantizes inputs and outputs of an Softmax Op.
+  // Input is quantized with the min-max range and output is hardcoded to have
+  // 1/256 as scale and -128 as zero point.
+  TfLiteStatus AsymmetricQuantizeSoftmax(BuiltinOperator op_code,
+                                         OperatorT* op);
+
   TfLiteStatus AsymmetricQuantizeTensor(BuiltinOperator op_code,
                                         int32_t tensor_idx);
 
diff --git a/tensorflow/lite/tools/optimize/subgraph_quantizer_test.cc b/tensorflow/lite/tools/optimize/subgraph_quantizer_test.cc
index 4b23ced0fb9cc77309c5d25c01c8b53c08edaafe..7261d224c3782107c69696d36519420adf725c32 100644
--- a/tensorflow/lite/tools/optimize/subgraph_quantizer_test.cc
+++ b/tensorflow/lite/tools/optimize/subgraph_quantizer_test.cc
@@ -291,6 +291,7 @@ TEST(SubgraphQuantizerTest, VerifySoftmaxQuantization) {
   ASSERT_EQ(op->outputs.size(), 1);
   auto float_graph = readonly_model->subgraphs()->Get(0);
 
+  // Verify input.
   ASSERT_EQ(float_graph->tensors()->Get(op->inputs[0])->type(),
             TensorType_FLOAT32);
   ASSERT_EQ(float_graph->tensors()->Get(op->outputs[0])->type(),
@@ -306,12 +307,18 @@ TEST(SubgraphQuantizerTest, VerifySoftmaxQuantization) {
   VerifyAsymmetricQuantizationScale(*float_input_quant_params,
                                     *input_quant_params);
 
+  // Verify output.
   auto float_output_quant_params =
       float_graph->tensors()->Get(op->outputs[0])->quantization();
   auto output_quant_params =
       subgraph->tensors[op->outputs[0]]->quantization.get();
-  VerifyAsymmetricQuantizationScale(*float_output_quant_params,
-                                    *output_quant_params);
+  ASSERT_EQ(float_output_quant_params->min()->size(), 1);
+  ASSERT_EQ(float_output_quant_params->max()->size(), 1);
+
+  ASSERT_EQ(output_quant_params->scale.size(), 1);
+  ASSERT_EQ(output_quant_params->zero_point.size(), 1);
+  ASSERT_EQ(1.0f / 256.0f, output_quant_params->scale[0]);
+  ASSERT_EQ(-128, output_quant_params->zero_point[0]);
 }
 
 TEST(SubgraphQuantizerTest, VerifyAvgPoolQuantization) {
diff --git a/tensorflow/lite/tools/verifier.cc b/tensorflow/lite/tools/verifier.cc
index 8631bef67b28e290357913a72614a86e3129f65a..ffc56c19658986e39c1d8085761120b8373c919e 100644
--- a/tensorflow/lite/tools/verifier.cc
+++ b/tensorflow/lite/tools/verifier.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/lite/tools/verifier.h"
 #include <climits>
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/version.h"
@@ -53,7 +54,7 @@ const uint32_t kMaxNumString = UINT_MAX / sizeof(int32_t) - 2;
 
 // Verifies string tensor has legit buffer contents that follow the schema
 // defined in lite/string_util.h
-bool VerifyStringTensorBuffer(const Buffer& buffer,
+bool VerifyStringTensorBuffer(const Tensor& tensor, const Buffer& buffer,
                               ErrorReporter* error_reporter) {
   uint32_t buffer_size = buffer.data()->size();
   const char* buffer_ptr = reinterpret_cast<const char*>(buffer.data()->data());
@@ -61,7 +62,8 @@ bool VerifyStringTensorBuffer(const Buffer& buffer,
   uint32_t num_strings = *GetIntPtr(buffer_ptr);
   if (num_strings > kMaxNumString) {
     ReportError(error_reporter,
-                "String tensor has invalid num of string set: %d", num_strings);
+                "String tensor %s has invalid num of string set: %d",
+                tensor.name()->c_str(), num_strings);
     return false;
   }
   uint32_t header_offsets =
@@ -69,9 +71,9 @@ bool VerifyStringTensorBuffer(const Buffer& buffer,
 
   if (buffer_size < header_offsets) {
     ReportError(error_reporter,
-                "String tensor buffer requires at least %d bytes, but is "
+                "String tensor %s buffer requires at least %d bytes, but is "
                 "allocated with %d bytes",
-                header_offsets, buffer_size);
+                tensor.name()->c_str(), header_offsets, buffer_size);
     return false;
   }
 
@@ -80,22 +82,24 @@ bool VerifyStringTensorBuffer(const Buffer& buffer,
 
   if (*GetIntPtr(buffer_ptr + offset) != header_offsets) {
     ReportError(error_reporter,
-                "String tensor buffer initial offset must be: %d",
-                header_offsets);
+                "String tensor %s buffer initial offset must be: %d",
+                tensor.name()->c_str(), header_offsets);
     return false;
   }
   offset += sizeof(int32_t);
   for (int i = 1; i <= num_strings; i++, offset += sizeof(int32_t)) {
     int string_offset = *GetIntPtr(buffer_ptr + offset);
     if (string_offset < prev_ptr || string_offset > buffer_size) {
-      ReportError(error_reporter, "String tensor buffer is invalid: index %d",
-                  i);
+      ReportError(error_reporter,
+                  "String tensor %s buffer is invalid: index %d",
+                  tensor.name()->c_str(), i);
       return false;
     }
   }
   if (*GetIntPtr(buffer_ptr + offset - sizeof(int32_t)) != buffer_size) {
-    ReportError(error_reporter, "String tensor buffer last offset must be %d",
-                buffer_size);
+    ReportError(error_reporter,
+                "String tensor %s buffer last offset must be %d",
+                tensor.name()->c_str(), buffer_size);
     return false;
   }
   return true;
@@ -106,13 +110,14 @@ bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
                                ErrorReporter* error_reporter) {
   uint64_t bytes_required = 1;
   if (!tensor.shape()) {
-    ReportError(error_reporter, "Tensor shape is empty");
-    return false;
+    // Empty tensor. Avoid further checks.
+    return true;
   }
   for (int dim : *tensor.shape()) {
     bytes_required *= dim;
     if (bytes_required > UINT_MAX) {
-      ReportError(error_reporter, "Tensor dimension overflow");
+      ReportError(error_reporter, "Tensor %s dimension overflow",
+                  tensor.name()->c_str());
       return false;
     }
   }
@@ -120,31 +125,36 @@ bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
     case TensorType_FLOAT32:
       bytes_required *= sizeof(float);
       break;
-    case TensorType_INT32:
-      bytes_required *= sizeof(int32_t);
+    case TensorType_INT8:
+      bytes_required *= sizeof(int8_t);
       break;
     case TensorType_UINT8:
       bytes_required *= sizeof(uint8_t);
       break;
+    case TensorType_INT32:
+      bytes_required *= sizeof(int32_t);
+      break;
     case TensorType_INT64:
       bytes_required *= sizeof(int64_t);
       break;
     case TensorType_FLOAT16:
       // FALLTHROUGH_INTENDED;
     default:
-      ReportError(error_reporter, "Invalid tensor type: %d", tensor.type());
+      ReportError(error_reporter, "Tensor %s invalid type: %d",
+                  tensor.name()->c_str(), tensor.type());
       return false;
   }
   if (bytes_required > UINT_MAX) {
-    ReportError(error_reporter, "Tensor dimension overflow");
+    ReportError(error_reporter, "Tensor %s dimension overflow",
+                tensor.name()->c_str());
     return false;
   }
 
   if (bytes_required != buffer.data()->size()) {
     ReportError(
         error_reporter,
-        "Tensor requires %d bytes, but is allocated with %d bytes buffer",
-        bytes_required, buffer.data()->size());
+        "Tensor %s requires %d bytes, but is allocated with %d bytes buffer",
+        tensor.name()->c_str(), bytes_required, buffer.data()->size());
     return false;
   }
   return true;
@@ -170,6 +180,86 @@ bool VerifyOperators(const Vector<Offset<Operator>>& operators,
   return true;
 }
 
+bool IsConstantTensor(const Tensor& tensor, const Model& model) {
+  if (!tensor.buffer() || !model.buffers()) return false;
+  if (tensor.buffer() > 0 && tensor.buffer() < model.buffers()->size()) {
+    auto* buffer = model.buffers()->Get(tensor.buffer());
+    if (buffer && buffer->data()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Performs basic consistency checks on a sub-graph.
+bool VerifySubGraphConsistency(const Model& model, const SubGraph& subgraph,
+                               ErrorReporter* error_reporter) {
+  absl::flat_hash_set<int> subgraph_input_tensors, constant_tensors,
+      variable_tensors, output_tensors;
+  for (int i = 0; i < subgraph.tensors()->Length(); ++i) {
+    const auto* tensor = subgraph.tensors()->Get(i);
+    if (IsConstantTensor(*tensor, model)) {
+      constant_tensors.insert(i);
+    } else if (tensor->is_variable()) {
+      variable_tensors.insert(i);
+    }
+  }
+  for (const int tensor_idx : *subgraph.inputs()) {
+    subgraph_input_tensors.insert(tensor_idx);
+  }
+
+  for (int op_idx = 0; op_idx < subgraph.operators()->Length(); ++op_idx) {
+    const auto* op = subgraph.operators()->Get(op_idx);
+    const auto& opcode = model.operator_codes()->Get(op->opcode_index());
+    // Check for invalid inputs by ensuring all exist in produced_tensors.
+    for (const int input_idx : *op->inputs()) {
+      if (input_idx == kOptionalTensor) continue;
+      if (constant_tensors.find(input_idx) == constant_tensors.end() &&
+          variable_tensors.find(input_idx) == variable_tensors.end() &&
+          subgraph_input_tensors.find(input_idx) ==
+              subgraph_input_tensors.end() &&
+          output_tensors.find(input_idx) == output_tensors.end()) {
+        ReportError(error_reporter,
+                    "Input tensor %d to op %d (%s) is not produced", input_idx,
+                    op_idx, EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      }
+    }
+    // Check for cycles/invalid outputs by ensuring that none exist in
+    // produced_tensors.
+    for (const int output_idx : *op->outputs()) {
+      if (constant_tensors.find(output_idx) != constant_tensors.end()) {
+        ReportError(error_reporter,
+                    "Output tensor %d to op %d (%s) is a constant", output_idx,
+                    op_idx, EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      } else if (variable_tensors.find(output_idx) != variable_tensors.end()) {
+        ReportError(error_reporter,
+                    "Output tensor %d to op %d (%s) is a variable", output_idx,
+                    op_idx, EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      } else if (subgraph_input_tensors.find(output_idx) !=
+                 subgraph_input_tensors.end()) {
+        ReportError(error_reporter,
+                    "Output tensor %d to op %d (%s) is a subgraph input",
+                    output_idx, op_idx,
+                    EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      } else if (output_tensors.find(output_idx) != output_tensors.end()) {
+        ReportError(error_reporter,
+                    "Output tensor %d to op %d (%s) is an output from "
+                    "another op. There is a cycle in the graph",
+                    output_idx, op_idx,
+                    EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      }
+      // This can be an input to a subsequent op.
+      output_tensors.insert(output_idx);
+    }
+  }
+  return true;
+}
+
 bool VerifySubGraphs(const Model& model, ErrorReporter* error_reporter) {
   if (!model.subgraphs()) {
     ReportError(error_reporter, "Missing 'subgraphs' section.");
@@ -184,6 +274,10 @@ bool VerifySubGraphs(const Model& model, ErrorReporter* error_reporter) {
     if (!VerifyOperators(*subgraph->operators(), error_reporter)) {
       return false;
     }
+
+    if (!VerifySubGraphConsistency(model, *subgraph, error_reporter)) {
+      return false;
+    }
   }
   return true;
 }
@@ -207,14 +301,14 @@ bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
         continue;
       }
       if (tensor->buffer() >= model.buffers()->size()) {
-        ReportError(error_reporter, "Invalid tensor buffer index: %d",
-                    tensor->buffer());
+        ReportError(error_reporter, "Tensor %s invalid buffer index: %d",
+                    tensor->name(), tensor->buffer());
         return false;
       }
       auto* buffer = model.buffers()->Get(tensor->buffer());
       if (!buffer) {
-        ReportError(error_reporter, "Tensor buffer %d not set",
-                    tensor->buffer());
+        ReportError(error_reporter, "Tensor %s buffer %d not set",
+                    tensor->name(), tensor->buffer());
         return false;
       }
 
@@ -222,7 +316,7 @@ bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
       // buffers will be allocated by the interpreter at run-time.
       if (buffer->data()) {
         if (tensor->type() == TensorType_STRING) {
-          if (!VerifyStringTensorBuffer(*buffer, error_reporter)) {
+          if (!VerifyStringTensorBuffer(*tensor, *buffer, error_reporter)) {
             return false;
           }
         } else {
diff --git a/tensorflow/lite/tools/verifier_test.cc b/tensorflow/lite/tools/verifier_test.cc
index 083f00d445ae0274274b0b83641a6d7c6a718577..7d67d9430acf10bf1fadd0703f496e9bb3af0adc 100644
--- a/tensorflow/lite/tools/verifier_test.cc
+++ b/tensorflow/lite/tools/verifier_test.cc
@@ -72,7 +72,8 @@ class TfLiteFlatbufferModelBuilder {
   }
 
   void AddTensor(const std::vector<int>& shape, tflite::TensorType type,
-                 const std::vector<uint8_t>& buffer, const char* name) {
+                 const std::vector<uint8_t>& buffer, const char* name,
+                 const bool is_variable = false) {
     int buffer_index = 0;
     if (!buffer.empty()) {
       buffer_index = buffers_.size();
@@ -81,11 +82,12 @@ class TfLiteFlatbufferModelBuilder {
     if (shape.empty()) {
       tensors_.push_back(CreateTensorDirect(builder_, /*shape=*/nullptr, type,
                                             buffer_index, name,
-                                            /*quantization=*/0));
+                                            /*quantization=*/0, is_variable));
       return;
     }
     tensors_.push_back(CreateTensorDirect(builder_, &shape, type, buffer_index,
-                                          name, /*quantization=*/0));
+                                          name, /*quantization=*/0,
+                                          is_variable));
   }
 
   void AddOperator(const std::vector<int32_t>& inputs,
@@ -146,20 +148,18 @@ TEST(VerifyModel, TestEmptyModel) {
               ::testing::ContainsRegex("Missing 'subgraphs' section."));
 }
 
-TEST(VerifyModel, TestEmptyShape) {
+TEST(VerifyModel, TestEmptyVector) {
   TfLiteFlatbufferModelBuilder builder({}, {"test"});
-  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddOperator({0, 1}, {3}, BuiltinOperator_CUSTOM, "test");
   builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
-  builder.AddTensor({}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "inputtwo");
+  builder.AddTensor({}, TensorType_UINT8, {}, "empty_vector");
   builder.AddTensor(
       {2}, TensorType_STRING,
       {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
       "data");
   builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
-  builder.FinishModel({0, 1}, {2});
-  ASSERT_FALSE(builder.Verify());
-  EXPECT_THAT(builder.GetErrorString(),
-              ::testing::ContainsRegex("Tensor shape is empty"));
+  builder.FinishModel({0, 1}, {3});
+  ASSERT_TRUE(builder.Verify());
 }
 
 TEST(VerifyModel, TestSimpleModel) {
@@ -219,10 +219,9 @@ TEST(VerifyModel, TestIntTensorShapeIsGreaterThanBuffer) {
   builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
-  EXPECT_THAT(
-      builder.GetErrorString(),
-      ::testing::ContainsRegex(
-          "Tensor requires 6 bytes, but is allocated with 4 bytes buffer"));
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex("Tensor input requires 6 bytes, but is "
+                                       "allocated with 4 bytes buffer"));
 }
 
 TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) {
@@ -230,10 +229,9 @@ TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) {
   builder.AddTensor({2, 1}, TensorType_UINT8, {1, 2, 3, 4}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
-  EXPECT_THAT(
-      builder.GetErrorString(),
-      ::testing::ContainsRegex(
-          "Tensor requires 2 bytes, but is allocated with 4 bytes buffer"));
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex("Tensor input requires 2 bytes, but is "
+                                       "allocated with 4 bytes buffer"));
 }
 
 TEST(VerifyModel, TestIntTensorShapeOverflow) {
@@ -243,7 +241,7 @@ TEST(VerifyModel, TestIntTensorShapeOverflow) {
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
   EXPECT_THAT(builder.GetErrorString(),
-              ::testing::ContainsRegex("Tensor dimension overflow"));
+              ::testing::ContainsRegex("Tensor input dimension overflow"));
 }
 
 TEST(VerifyModel, TensorBufferIsNotValid) {
@@ -282,10 +280,11 @@ TEST(VerifyModel, StringTensorHasInvalidNumString) {
       "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
-  EXPECT_THAT(builder.GetErrorString(),
-              ::testing::ContainsRegex(
-                  "String tensor buffer requires at least -2147483640 bytes, "
-                  "but is allocated with 18 bytes"));
+  EXPECT_THAT(
+      builder.GetErrorString(),
+      ::testing::ContainsRegex(
+          "String tensor input buffer requires at least -2147483640 bytes, "
+          "but is allocated with 18 bytes"));
 }
 
 TEST(VerifyModel, StringTensorOffsetTooSmall) {
@@ -297,7 +296,7 @@ TEST(VerifyModel, StringTensorOffsetTooSmall) {
   ASSERT_FALSE(builder.Verify());
   EXPECT_THAT(builder.GetErrorString(),
               ::testing::ContainsRegex(
-                  "String tensor buffer initial offset must be: 16"));
+                  "String tensor input buffer initial offset must be: 16"));
 }
 
 TEST(VerifyModel, StringTensorOffsetOutOfRange) {
@@ -307,9 +306,9 @@ TEST(VerifyModel, StringTensorOffsetOutOfRange) {
       {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 22, 0, 0, 0, 'A', 'B'}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
-  EXPECT_THAT(
-      builder.GetErrorString(),
-      ::testing::ContainsRegex("String tensor buffer is invalid: index 2"));
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex(
+                  "String tensor input buffer is invalid: index 2"));
 }
 
 TEST(VerifyModel, StringTensorIsLargerThanRequired) {
@@ -320,18 +319,19 @@ TEST(VerifyModel, StringTensorIsLargerThanRequired) {
       "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
-  EXPECT_THAT(
-      builder.GetErrorString(),
-      ::testing::ContainsRegex("String tensor buffer last offset must be 19"));
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex(
+                  "String tensor input buffer last offset must be 19"));
 }
 
 TEST(VerifyModel, AllOpsAreSupported) {
   TfLiteFlatbufferModelBuilder builder({BuiltinOperator_ADD}, {"CustomOp"});
   builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
   builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
-  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output2");
   builder.AddOperator({0, 1}, {2}, BuiltinOperator_ADD, nullptr);
-  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "CustomOp");
+  builder.AddOperator({0, 1}, {3}, BuiltinOperator_CUSTOM, "CustomOp");
   builder.FinishModel({}, {});
   ASSERT_TRUE(builder.Verify());
   EXPECT_EQ("", builder.GetErrorString());
@@ -363,6 +363,102 @@ TEST(VerifyModel, UseUnsupportedCustomOps) {
                   "Unsupported custom op: Not supported, version: 1"));
 }
 
+TEST(VerifyModel, UnpopulatedInputToOp) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({1, 2}, {3}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  // This tensor will never be populated.
+  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "invalid_input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  builder.FinishModel({0, 2}, {3});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ("Input tensor 1 to op 0 (CUSTOM) is not produced",
+            builder.GetErrorString());
+}
+
+TEST(VerifyModel, MultipleOpsOutputToSameTensor) {
+  TfLiteFlatbufferModelBuilder builder({BuiltinOperator_ADD}, {"CustomOp"});
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output1");
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_ADD, nullptr);
+  // This can't output to "output1", since the first operator does that.
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "CustomOp");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ(
+      "Output tensor 2 to op 1 (CUSTOM) is an output from another op. "
+      "There is a cycle in the graph",
+      builder.GetErrorString());
+}
+
+TEST(VerifyModel, OutputIsAConstantTensor) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  // Output shouldn't be populated with constant value.
+  builder.AddTensor({2, 3}, TensorType_INT32, {1, 2, 3, 4, 5, 6}, "output");
+  builder.FinishModel({0, 1}, {2});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ("Output tensor 2 to op 0 (CUSTOM) is a constant",
+            builder.GetErrorString());
+}
+
+TEST(VerifyModel, OutputIsSubgraphInput) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  // Output shouldn't be a subgraph input.
+  builder.FinishModel({0, 1, 2}, {2});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ("Output tensor 2 to op 0 (CUSTOM) is a subgraph input",
+            builder.GetErrorString());
+}
+
+TEST(VerifyModel, OutputIsAVariable) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  // Output shouldn't be a variable.
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output", /*variable*/ true);
+  builder.FinishModel({0, 1}, {2});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ("Output tensor 2 to op 0 (CUSTOM) is a variable",
+            builder.GetErrorString());
+}
+
+TEST(VerifyModel, OpWithOptionalTensor) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({kOptionalTensor, 0, 1}, {2}, BuiltinOperator_CUSTOM,
+                      "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  builder.FinishModel({0, 1}, {2});
+  ASSERT_TRUE(builder.Verify());
+  EXPECT_EQ("", builder.GetErrorString());
+}
+
 // TODO(yichengfan): make up malicious files to test with.
 
 }  // namespace tflite
diff --git a/tensorflow/lite/tutorials/post_training_quant.ipynb b/tensorflow/lite/tutorials/post_training_quant.ipynb
index 394ab0760b5672978e0638c0ff01a8f00442302c..8bc02eedf68551036cf81eba568118e6f7e32639 100644
--- a/tensorflow/lite/tutorials/post_training_quant.ipynb
+++ b/tensorflow/lite/tutorials/post_training_quant.ipynb
@@ -301,7 +301,7 @@
         "id": "7BONhYtYocQY"
       },
       "source": [
-        "To quantize the model on export, set the `post_training_quantize` flag:"
+        "To quantize the model on export, set the `optimizations` flag to optimize for size:"
       ]
     },
     {
@@ -313,11 +313,11 @@
         "id": "g8PUvLWDlmmz"
       },
       "outputs": [],
-      "source": [
+     "source": [
         "# Note: If you don't have a recent tf-nightly installed, the\n",
-        "# \"post_training_quantize\" line will have no effect.\n",
+        "# \"optimizations\" line will have no effect.\n",
         "tf.logging.set_verbosity(tf.logging.INFO)\n",
-        "converter.post_training_quantize = True\n",
+        "converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
         "tflite_quant_model = converter.convert()\n",
         "tflite_model_quant_file = tflite_models_dir/\"mnist_model_quant.tflite\"\n",
         "tflite_model_quant_file.write_bytes(tflite_quant_model)"
@@ -329,8 +329,8 @@
         "colab_type": "text",
         "id": "PhMmUTl4sbkz"
       },
-      "source": [
-        "Note how the resulting file, with `post_training_quantize` set, is approximately `1/4` the size."
+    "source": [
+        "Note how the resulting file, is approximately `1/4` the size."
       ]
     },
     {
@@ -383,7 +383,7 @@
       "source": [
         "import numpy as np\n",
         "mnist_train, mnist_test = tf.keras.datasets.mnist.load_data()\n",
-        "images, labels = tf.to_float(mnist_test[0])/255.0, mnist_test[1]\n",
+        "images, labels = tf.cast(mnist_test[0], tf.float32)/255.0, mnist_test[1]\n",
         "\n",
         "# Note: If you change the batch size, then use \n",
         "# `tf.lite.Interpreter.resize_tensor_input` to also change it for\n",
@@ -489,7 +489,7 @@
         "plt.imshow(img[0])\n",
         "template = \"True:{true}, predicted:{predict}\"\n",
         "_ = plt.title(template.format(true= str(label[0].numpy()),\n",
-        "                              predict=str(predictions[0,0])))\n",
+        "                              predict=str(predictions[0])))\n",
         "plt.grid(False)"
       ]
     },
@@ -650,7 +650,7 @@
         "output_arrays = [\"output\"]\n",
         "converter = tf.lite.TFLiteConverter.from_frozen_graph(\n",
         "  str(graph_def_file), input_arrays, output_arrays, input_shapes={\"input\":[1,299,299,3]})\n",
-        "converter.post_training_quantize = True\n",
+        "converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
         "resnet_tflite_file = graph_def_file.parent/\"resnet_v2_101_quantized.tflite\"\n",
         "resnet_tflite_file.write_bytes(converter.convert())\n"
       ]
diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files
index 0312114abe7f117c914701b6eb0da1720260815e..9264939b3b886858b53b2ac5e893732c80898095 100644
--- a/tensorflow/opensource_only.files
+++ b/tensorflow/opensource_only.files
@@ -120,8 +120,8 @@ tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProdu
 tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool
 tensorflow/third_party/eigen3/unsupported/Eigen/SpecialFunctions
 tensorflow/third_party/eigen3/unsupported/Eigen/MatrixFunctions
+tensorflow/third_party/eigen3/gpu_packet_math.patch
 tensorflow/third_party/eigen3/LICENSE
-tensorflow/third_party/eigen3/gebp_neon.patch
 tensorflow/third_party/eigen3/BUILD
 tensorflow/third_party/systemlibs/build_defs.bzl.tpl
 tensorflow/third_party/systemlibs/absl_py.BUILD
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index ca6a09cdd496cab687668e05e056f9b3c8755564..5a8935a6b9f5f6c18510ff83dfaa186034318ec4 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -143,7 +143,6 @@ py_library(
         ":map_fn",
         ":math_ops",
         ":metrics",
-        ":mode_keys",
         ":nccl_ops",
         ":nn",
         ":ops",
@@ -174,11 +173,13 @@ py_library(
         "//tensorflow/lite/python:lite",
         "//tensorflow/python/compat",
         "//tensorflow/python/compat:v2_compat",
+        "//tensorflow/python/compiler",
         "//tensorflow/python/data",
         "//tensorflow/python/distribute",
         "//tensorflow/python/distribute:estimator_training",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:profiler",
+        "//tensorflow/python/eager:profiler_client",
         "//tensorflow/python/eager:remote",
         "//tensorflow/python/module",
         "//tensorflow/python/ops/distributions",
@@ -679,6 +680,7 @@ py_library(
     deps = [
         ":common_shapes",
         ":composite_tensor",
+        ":convert_to_constants",
         ":cpp_shape_inference_proto_py",
         ":errors",
         ":framework_fast_tensor_util",
@@ -854,6 +856,22 @@ py_library(
     ],
 )
 
+py_library(
+    name = "convert_to_constants",
+    srcs = [
+        "framework/convert_to_constants.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework_ops",
+        ":platform",
+        ":tensor_util",
+        ":tf_optimizer",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
 py_library(
     name = "kernels",
     srcs = [
@@ -971,6 +989,7 @@ tf_py_test(
     additional_deps = [
         ":auto_control_deps",
         ":client_testlib",
+        "//tensorflow/python/keras",
     ],
 )
 
@@ -1310,8 +1329,8 @@ py_library(
     ],
 )
 
-cuda_py_tests(
-    name = "framework_function_test",
+cuda_py_test(
+    name = "function_test",
     size = "medium",
     srcs = ["framework/function_test.py"],
     additional_deps = [
@@ -1341,6 +1360,7 @@ cuda_py_tests(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -1813,7 +1833,7 @@ tf_gen_op_wrapper_private_py(
     visibility = [
         "//learning/brain/python/ops:__pkg__",
         "//tensorflow/python/kernel_tests:__pkg__",
-        "//tensorflow/python/training/checkpointable:__pkg__",
+        "//tensorflow/python/training/tracking:__pkg__",
     ],
 )
 
@@ -1986,6 +2006,27 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "tpu_ops_gen",
+    visibility = [
+        "//smartass/brain/configure/python:__pkg__",
+        "//tensorflow/contrib/tpu:__pkg__",
+        "//tensorflow/python/tpu:__pkg__",
+    ],
+    deps = [
+        "//tensorflow/core:tpu_configuration_ops_op_lib",
+        "//tensorflow/core:tpu_cross_replica_ops_op_lib",
+        "//tensorflow/core:tpu_embedding_ops_op_lib",
+        "//tensorflow/core:tpu_functional_ops_op_lib",
+        "//tensorflow/core:tpu_heartbeat_ops_op_lib",
+        "//tensorflow/core:tpu_host_compute_ops_op_lib",
+        "//tensorflow/core:tpu_infeed_ops_op_lib",
+        "//tensorflow/core:tpu_ordinal_selector_ops_op_lib",
+        "//tensorflow/core:tpu_outfeed_ops_op_lib",
+        "//tensorflow/core:tpu_replication_ops_op_lib",
+    ],
+)
+
 py_library(
     name = "array_grad",
     srcs = ["ops/array_grad.py"],
@@ -2737,6 +2778,22 @@ py_library(
     ],
 )
 
+py_library(
+    name = "critical_section_ops",
+    srcs = ["ops/critical_section_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":control_flow_ops",
+        ":dtypes",
+        ":framework_ops",
+        ":resource_variable_ops_gen",
+        ":tensor_array_ops",
+        ":util",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_library(
     name = "list_ops",
     srcs = ["ops/list_ops.py"],
@@ -3148,6 +3205,7 @@ py_library(
         ":clip_ops",
         ":confusion_matrix",
         ":control_flow_ops",
+        ":critical_section_ops",
         ":cudnn_rnn_grad",
         ":data_flow_grad",
         ":data_flow_ops",
@@ -3332,7 +3390,7 @@ py_library(
         ":util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
@@ -3385,6 +3443,7 @@ cuda_py_test(
         ":framework_test_lib",
     ],
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3503,6 +3562,7 @@ cuda_py_test(
         ":image_ops",
         "//third_party/py/numpy",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3528,6 +3588,7 @@ cuda_py_test(
     ],
     data = ["//tensorflow/core:image_testdata"],
     shard_count = 5,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3627,6 +3688,7 @@ cuda_py_test(
         "//third_party/py/numpy",
     ],
     shard_count = 16,
+    tags = ["no_rocm"],
 )
 
 cuda_py_test(
@@ -3647,6 +3709,7 @@ cuda_py_test(
         "//third_party/py/numpy",
     ],
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3677,6 +3740,7 @@ cuda_py_test(
         "//third_party/py/numpy",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
@@ -3685,7 +3749,7 @@ py_library(
         ["training/**/*.py"],
         exclude = [
             "**/*test*",
-            "training/checkpointable/**/*.py",
+            "training/tracking/**/*.py",
             "training/saving/**/*.py",
             # The following targets have their own build rules (same name as the
             # file):
@@ -3746,8 +3810,8 @@ py_library(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/keras/optimizer_v2:learning_rate_schedule",
         "//tensorflow/python/ops/losses",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:util",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -3827,9 +3891,9 @@ py_library(
         ":variables",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
         "//tensorflow/python/training/saving:saveable_object",
         "//tensorflow/python/training/saving:saveable_object_util",
+        "//tensorflow/python/training/tracking:base",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -3910,6 +3974,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = visibility + [
         "//tensorflow:__pkg__",
+        "//third_party/py/tf_agents:__subpackages__",
     ],
     deps = [
         "//third_party/py/numpy",
@@ -4702,6 +4767,7 @@ cuda_py_test(
         ":math_ops",
         "//tensorflow/core:protos_all_py",
     ],
+    xla_enable_strict_auto_jit = False,  # Graph structure is different with autojit
 )
 
 cuda_py_test(
@@ -4746,6 +4812,17 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "convert_to_constants_test",
+    size = "small",
+    srcs = ["framework/convert_to_constants_test.py"],
+    additional_deps = [
+        ":convert_to_constants",
+        "client_testlib",
+        "framework_test_lib",
+    ],
+)
+
 tf_py_test(
     name = "bfloat16_test",
     size = "small",
@@ -5777,6 +5854,8 @@ cuda_py_test(
         "grappler",
         "no_pip",  # tf_optimizer is not available in pip.
     ],
+    # This test will not run on XLA because it primarily tests the TF Classic flow.
+    xla_enable_strict_auto_jit = False,
 )
 
 py_library(
@@ -5917,6 +5996,8 @@ cuda_py_test(
     tags = [
         "grappler",
     ],
+    # This test will not run on XLA because it primarily tests the TF Classic flow.
+    xla_enable_strict_auto_jit = False,
 )
 
 py_library(
@@ -6057,29 +6138,6 @@ py_binary(
     ],
 )
 
-py_library(
-    name = "mode_keys",
-    srcs = [
-        "training/mode_keys.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":util",
-    ],
-)
-
-tf_py_test(
-    name = "mode_keys_test",
-    size = "small",
-    srcs = [
-        "training/mode_keys_test.py",
-    ],
-    additional_deps = [
-        ":client_testlib",
-        ":mode_keys",
-    ],
-)
-
 pyx_library(
     name = "framework_fast_tensor_util",
     srcs = ["framework/fast_tensor_util.pyx"],
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 398fb375e1453866f3f1953a53012aaae2c22dd6..8538f8c5dc1d30ce10516effbb994830049630c3 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -84,6 +84,7 @@ from tensorflow.python.feature_column import feature_column_lib as feature_colum
 from tensorflow.python.layers import layers
 from tensorflow.python.module import module
 from tensorflow.python.ops import bitwise_ops as bitwise
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import image_ops as image
 from tensorflow.python.ops import manip_ops as manip
 from tensorflow.python.ops import metrics
diff --git a/tensorflow/python/autograph/converters/call_trees.py b/tensorflow/python/autograph/converters/call_trees.py
index 7026a162a28da40476930928f2327ef0b93dc352..a35ff16eca57a25269bafd99fd6742c01502cf60 100644
--- a/tensorflow/python/autograph/converters/call_trees.py
+++ b/tensorflow/python/autograph/converters/call_trees.py
@@ -22,240 +22,108 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-
 import gast
 
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
-from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import templates
-from tensorflow.python.util import tf_inspect
-
-
-class FunctionInfo(collections.namedtuple('FunctionInfo', ('dtype',))):
-  pass
-
-
-# TODO(mdan): Move this to a separate transformer.
-KNOWN_NUMPY_FUNCTIONS = {
-    ('numpy', 'random', 'binomial'): FunctionInfo(dtype='tf.int64'),
-}
-
-
-# TODO(mdan): Get rid of these interfaces. Can now depend directly on Namer.
-
-
-class FunctionNamer(object):
-  """Describes the interface for CallTreeTransformer's namer."""
 
-  def compiled_function_name(self,
-                             original_fqn,
-                             live_entity=None,
-                             owner_type=None):
-    """Generate the name corresponding to the compiled version of a function.
 
-    Args:
-      original_fqn: string or tuple(string)
-      live_entity: Callable, the actual target function, if known.
-      owner_type: Optional object. If present, it indicates that the function is
-          a member of the given type.
-    Returns:
-      string, bool
-    """
-    raise NotImplementedError()
-
-  def compiled_class_name(self, original_fqn, live_entity=None):
-    """Generate the name corresponding to the compiled version of a class.
+# TODO(mdan): Rename to FunctionCallsTransformer.
 
-    Args:
-      original_fqn: string or tuple(string)
-      live_entity: The actual target class, if known.
-    Returns:
-      string
-    """
-    raise NotImplementedError()
 
+class _Function(object):
 
-# TODO(mdan): Rename to CallsTransformer.
+  no_root = True
 
 
 class CallTreeTransformer(converter.Base):
   """Transforms the call tree by renaming transformed symbols."""
 
-  def _resolve_decorator_name(self, node):
-    """Used to resolve decorator info."""
-    if isinstance(node, gast.Call):
-      return self._resolve_decorator_name(node.func)
-    if isinstance(node, gast.Name):
-      # TODO(mdan): Add test coverage for this branch.
-      return self.ctx.info.namespace.get(node.id)
-    if isinstance(node, gast.Attribute):
-      parent = self._resolve_decorator_name(node.value)
-      if parent is not None:
-        return getattr(parent, node.attr)
-      return None
-    raise ValueError(node)
-
-  def _try_resolve_target(self, node):
-    """Works for methods of objects of known type."""
-    if anno.hasanno(node, 'live_val'):
-      return anno.getanno(node, 'live_val')
-    if isinstance(node, gast.Attribute) and anno.hasanno(node, 'type'):
-      owner_type = anno.getanno(node, 'type')
-      if hasattr(owner_type, node.attr):
-        return getattr(owner_type, node.attr)
-      else:
-        # TODO(mdan): We should probably return None here rather than an error.
-        raise ValueError('Type "%s" has no attribute "%s". Is it dynamic?' %
-                         (owner_type, node.attr))
-    return None
-
-  def _function_is_compilable(self, target_entity):
-    """Determines whether an entity can be compiled at all."""
-    # TODO(mdan): Expand.
-
-    if target_entity.__module__ is None:
-      # Functions like builtins and NumPy don't expose a module.
-      # Those in general should not be compiled.
-      return False
-
-    if inspect_utils.isbuiltin(target_entity):
-      return False
-
-    if inspect_utils.isnamedtuple(target_entity):
-      # namedtuple doesn't expose its source code, making it uncompilable.
-      return False
-
-    return True
-
-  def _should_compile(self, node, fqn):
-    """Determines whether an entity should be compiled in the context."""
-    # TODO(mdan): Needs cleanup. We should remove the use of fqn altogether.
-    module_name = fqn[0]
-    for mod in self.ctx.program.uncompiled_modules:
-      if module_name.startswith(mod[0] + '.'):
-        return False
-
-    for i in range(1, len(fqn)):
-      if fqn[:i] in self.ctx.program.uncompiled_modules:
-        return False
-
-    target_entity = self._try_resolve_target(node.func)
-
-    if target_entity is not None:
-
-      # Currently, lambdas are always converted.
-      # TODO(mdan): Allow markers of the kind f = ag.do_not_convert(lambda: ...)
-      if inspect_utils.islambda(target_entity):
-        return True
-
-      # This may be reached when "calling" a callable attribute of an object.
-      # For example:
-      #
-      #   self.fc = tf.keras.layers.Dense()
-      #   self.fc()
-      #
-      for mod in self.ctx.program.uncompiled_modules:
-        if target_entity.__module__.startswith(mod[0] + '.'):
-          return False
-
-      # Inspect the target function decorators. If any include a @convert
-      # or @do_not_convert annotation, then they must be called as they are.
-      # TODO(mdan): This may be quite heavy. Perhaps always dynamically convert?
-      # To parse and re-analyze each function for every call site could be quite
-      # wasteful. Maybe we could cache the parsed AST?
-      try:
-        target_node, _ = parser.parse_entity(target_entity)
-        target_node = target_node.body[0]
-      except TypeError:
-        # Functions whose source we cannot access are compilable (e.g. wrapped
-        # to py_func).
-        return True
-
-      # This attribute is set when the decorator was applied before the
-      # function was parsed. See api.py.
-      if hasattr(target_entity, '__ag_compiled'):
-        return False
-
-      for dec in target_node.decorator_list:
-        decorator_fn = self._resolve_decorator_name(dec)
-        if (decorator_fn is not None and
-            self.ctx.program.options.should_strip(decorator_fn)):
-          return False
+  def visit_FunctionDef(self, node):
+    self.state[_Function].enter()
+    node.args = self.visit(node.args)
+    node.body = self.visit_block(node.body)
 
-    return True
+    if self.state[_Function].level < 2:
+      # Top-level functions lose their decorator because the conversion is
+      # always just-in-time and by the time it happens the decorators are
+      # already set to be applied.
+      node.decorator_list = []
+    else:
+      # Inner functions are converted already, so we insert a decorator to
+      # prevent double conversion. Double conversion would work too, but this
+      # saves the overhead.
+      node.decorator_list.append(
+          parser.parse_expression('ag__.do_not_convert_internal'))
 
-  def _rename_compilable_function(self, node):
-    assert anno.hasanno(node.func, 'live_val')
-    assert anno.hasanno(node.func, 'fqn')
-    target_entity = anno.getanno(node.func, 'live_val')
-    target_fqn = anno.getanno(node.func, 'fqn')
+    if node.returns:
+      node.returns = self.visit(node.returns)
 
-    if anno.hasanno(node, 'is_constructor'):
-      new_name = self.ctx.namer.compiled_class_name(
-          target_fqn, live_entity=target_entity)
-      do_rename = True
-    else:
-      if anno.hasanno(node.func, 'parent_type'):
-        owner_type = anno.getanno(node.func, 'parent_type')
-      else:
-        # Fallback - not reliable.
-        owner_type = inspect_utils.getmethodclass(target_entity)
-      new_name, do_rename = self.ctx.namer.compiled_function_name(
-          target_fqn, live_entity=target_entity, owner_type=owner_type)
+    self.state[_Function].exit()
+    return node
 
-    if do_rename:
-      if target_entity is not None:
-        if tf_inspect.ismethod(target_entity):
-          # The renaming process will transform it into a regular function.
-          # TODO(mdan): Is this complete? How does it work with nested members?
-          node.args = [node.func.value] + node.args
-      node.func = templates.replace_as_expression(
-          'func_name', func_name=new_name)
+  def visit_With(self, node):
+    # Context manager calls (in node.items) are not converted.
+    node.body = self.visit_block(node.body)
     return node
 
-  def _wrap_to_py_func_single_return(self, node, dtype):
-    # TODO(mdan): Properly handle varargs, etc.
-    template = """
-      ag__.utils.wrap_py_func(func, dtype, (args,), kwargs, False)
-    """
-    return templates.replace_as_expression(
-        template,
-        func=node.func,
-        dtype=parser.parse_expression(dtype),
-        args=node.args,
-        kwargs=ast_util.keywords_to_dict(node.keywords))
+  def visit_Call(self, node):
+    # TODO(mdan): Refactor converted_call as a 'Call' operator.
+
+    # Calls to the internal 'ag__' module are never converted (though their
+    # arguments might be).
+    full_name = str(anno.getanno(node.func, anno.Basic.QN, default=''))
+    if full_name.startswith('ag__.'):
+      return self.generic_visit(node)
+    if (full_name == 'print' and
+        not self.ctx.program.options.uses(converter.Feature.BUILTIN_FUNCTIONS)):
+      return self.generic_visit(node)
 
-  def _insert_dynamic_conversion(self, node):
-    """Inlines a dynamic conversion for a dynamic function."""
-    # TODO(mdan): Pass information on the statically compiled functions.
-    # Having access to the statically compiled functions can help avoid
-    # unnecessary compilation.
-    # For example, this would lead to function `a` being compiled twice:
-    #
-    #   def a():
-    #     v = b
-    #     b()
-    #   def b():
-    #     a()
-    #
-    # This is really a problem with recursive calls, which currently can
-    # only be gated by a static condition, and should be rare.
-    # TODO(mdan): It probably makes sense to use dynamic conversion every time.
-    # Before we could convert all the time though, we'd need a reasonable
-    # caching mechanism.
-    template = """
-      ag__.converted_call(func, owner, options, args)
-    """
     if isinstance(node.func, gast.Attribute):
       func = gast.Str(node.func.attr)
       owner = node.func.value
     else:
       func = node.func
       owner = parser.parse_expression('None')
+
+    starred_arg = None
+    normal_args = []
+    for a in node.args:
+      if isinstance(a, gast.Starred):
+        assert starred_arg is None, 'Multiple *args should be impossible.'
+        starred_arg = a
+      else:
+        normal_args.append(a)
+    if starred_arg is None:
+      args = templates.replace_as_expression('(args,)', args=normal_args)
+    else:
+      args = templates.replace_as_expression(
+          '(args,) + tuple(stararg)',
+          stararg=starred_arg.value,
+          args=normal_args)
+
+    kwargs_arg = None
+    normal_keywords = []
+    for k in node.keywords:
+      if k.arg is None:
+        assert kwargs_arg is None, 'Multiple **kwargs should be impossible.'
+        kwargs_arg = k
+      else:
+        normal_keywords.append(k)
+    if kwargs_arg is None:
+      kwargs = ast_util.keywords_to_dict(normal_keywords)
+    else:
+      kwargs = templates.replace_as_expression(
+          'dict(kwargs, **keywords)',
+          kwargs=kwargs_arg.value,
+          keywords=ast_util.keywords_to_dict(normal_keywords))
+
+    template = """
+      ag__.converted_call(func, owner, options, args, kwargs)
+    """
     new_call = templates.replace_as_expression(
         template,
         func=func,
@@ -263,70 +131,10 @@ class CallTreeTransformer(converter.Base):
         options=self.ctx.program.options.to_ast(
             self.ctx,
             internal_convert_user_code=self.ctx.program.options.recursive),
-        args=node.args)
-    # TODO(mdan): Improve the template mechanism to better support this.
-    new_call.keywords = node.keywords
-    return new_call
-
-  def visit_FunctionDef(self, node):
-    node.args = self.visit(node.args)
-    node.body = self.visit_block(node.body)
-    node.decorator_list = []
-    node.returns = self.visit_block(node.returns)
-    return node
-
-  def visit_Call(self, node):
-    if anno.hasanno(node.func, 'live_val'):
-      target_entity = anno.getanno(node.func, 'live_val')
-
-      if anno.hasanno(node.func, 'fqn'):
-        target_fqn = anno.getanno(node.func, 'fqn')
-      else:
-        target_fqn = None
-
-      if self._function_is_compilable(target_entity):
-        if self._should_compile(node, target_fqn):
-          node = self._rename_compilable_function(node)
-        else:
-          node = self.generic_visit(node)
-          return node
-
-      elif target_fqn and target_fqn in KNOWN_NUMPY_FUNCTIONS:
-        # TODO(mdan): Should we replace these with equivalent TF ops instead?
-        node = self._wrap_to_py_func_single_return(
-            node, KNOWN_NUMPY_FUNCTIONS[target_fqn].dtype)
+        args=args,
+        kwargs=kwargs)
 
-      elif inspect_utils.isbuiltin(target_entity):
-        # Note: Any builtin that passed the builtins converter is assumed to be
-        # safe for graph mode.
-        return node
-
-      elif inspect_utils.isnamedtuple(target_entity):
-        # Although not compilable, we assume they are safe for graph mode.
-        node = self.generic_visit(node)
-        return node
-
-      else:
-        # TODO(mdan): Instert dynamic conversion here instead.
-        raise NotImplementedError(
-            'py_func with return values (unknown function)')
-    else:
-      # Special cases
-      # TODO(mdan): These need a systematic review - there may be more.
-
-      # 1. super() calls - these are preserved. The class conversion mechanism
-      # will ensure that they return the correct value.
-      if ast_util.matches(node, parser.parse_expression('super(_)')):
-        return node
-
-      # 2. super().method calls - these are preserved as well, when the
-      # conversion processes the entire class.
-      if (ast_util.matches(node, parser.parse_expression('super(_)._(_)')) and
-          self.ctx.info.owner_type is not None):
-        return node
-
-      node = self._insert_dynamic_conversion(node)
-    return node
+    return new_call
 
 
 def transform(node, ctx):
diff --git a/tensorflow/python/autograph/converters/call_trees_test.py b/tensorflow/python/autograph/converters/call_trees_test.py
index 454d75d755c7273d11e1f89e4138cd997eb6e49a..654682edc737f8de291f50259c28a51c131aef58 100644
--- a/tensorflow/python/autograph/converters/call_trees_test.py
+++ b/tensorflow/python/autograph/converters/call_trees_test.py
@@ -18,147 +18,97 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-
-import numpy as np
-
 from tensorflow.python.autograph.converters import call_trees
 from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
 class CallTreesTest(converter_testing.TestCase):
 
-  def test_basic(self):
-
-    def test_fn_1(_):
-      raise ValueError('This should not be called in the compiled version.')
-
-    def other_test_fn_1(a):
-      return a + 1
-
-    def test_fn_2(a):
-      return test_fn_1(a) + 1
-
-    ns = {'test_fn_1': test_fn_1}
-    node, ctx = self.prepare(test_fn_2, ns)
-    node = call_trees.transform(node, ctx)
+  def test_normal_function(self):
 
-    with self.compiled(node, ns) as result:
-      new_name, _ = ctx.namer.compiled_function_name(('test_fn_1',))
-      setattr(result, new_name, other_test_fn_1)
-      self.assertEquals(result.test_fn_2(1), 3)
-
-  def test_dynamic_function(self):
-
-    def test_fn_1():
-      raise ValueError('This should be masked by the mock in self.compiled.')
-
-    def test_fn_2(f):
+    def test_fn(f):
       return f() + 3
 
-    with self.converted(test_fn_2, call_trees, {}) as result:
-      # 10 = 7 (from the mock) + 3 (from test_fn_2)
-      self.assertEquals(10, result.test_fn_2(test_fn_1))
+    with self.converted(test_fn, call_trees, {}) as result:
+      self.assertEquals(
+          result.test_fn(None),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 3)
+      self.assertListEqual(self.dynamic_calls, [((), {})])
 
-  def test_basic_method(self):
-
-    class TestClass(object):
+  def test_function_with_kwarg(self):
 
-      def test_fn_1(self, a):
-        return a + 1
+    def test_fn(f, a, b):
+      return f(a, c=b) + 3
 
-      def test_fn_2(self, a):
-        return self.test_fn_1(a) + 1
+    with self.converted(test_fn, call_trees, {}) as result:
+      self.assertEquals(
+          result.test_fn(None, 1, 2),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 3)
+      self.assertListEqual(self.dynamic_calls, [((1,), {'c': 2})])
 
-    ns = {'TestClass': TestClass}
-    node, ctx = self.prepare(
-        TestClass.test_fn_2,
-        ns,
-        namer=converter_testing.FakeNoRenameNamer(),
-        arg_types={'self': (TestClass.__name__, TestClass)})
-    node = call_trees.transform(node, ctx)
+  def test_function_with_kwargs_starargs(self):
 
-    with self.compiled(node, ns) as result:
-      tc = TestClass()
-      self.assertEquals(3, result.test_fn_2(tc, 1))
+    def test_fn(f, a, *args, **kwargs):
+      return f(a, *args, **kwargs) + 5
 
-  def test_known_called_lambda(self):
+    with self.converted(test_fn, call_trees, {}) as result:
+      self.assertEquals(
+          result.test_fn(None, 1, *[2, 3], **{'b': 4, 'c': 5}),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 5)
+      self.assertListEqual(self.dynamic_calls, [((1, 2, 3), {'b': 4, 'c': 5})])
 
-    l = lambda x: x
+  def test_function_with_kwargs_starargs_only(self):
 
-    def test_fn(a):
-      return l(a)
+    def f(*unused_args):  # Will not be called.
+      pass
 
-    ns = {'l': l}
-    node, ctx = self.prepare(test_fn, ns)
-    node = call_trees.transform(node, ctx)
-
-    with self.compiled(node, ns) as result:
-      self.assertEquals(1, result.test_fn(1))
-
-  def test_known_called_namedtuple(self):
-
-    nt = collections.namedtuple('TestNamedTuple', ['a'])
-
-    def test_fn(a):
-      return nt(a)
-
-    ns = {'nt': nt}
-    node, ctx = self.prepare(test_fn, ns)
-    node = call_trees.transform(node, ctx)
-
-    with self.compiled(node, ns) as result:
-      self.assertEquals(nt(1), result.test_fn(1))
+    def test_fn():
+      args = [1, 2, 3]
+      return f(*args) + 11
 
-  def test_py_func_known_function(self):
+    with self.converted(test_fn, call_trees, {'f': f}) as result:
+      self.assertEquals(
+          result.test_fn(),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 11)
+      self.assertListEqual(self.dynamic_calls, [((1, 2, 3), {})])
 
-    def test_fn():
-      return np.random.binomial(2, 0.5)
+  def test_function_with_kwargs_keywords(self):
 
-    with self.converted(test_fn, call_trees, {'np': np},
-                        dtypes.int64) as result:
-      with self.cached_session() as sess:
-        self.assertTrue(isinstance(result.test_fn(), ops.Tensor))
-        self.assertIn(self.evaluate(result.test_fn()), (0, 1, 2))
+    def test_fn(f, a, b, **kwargs):
+      return f(a, b=b, **kwargs) + 5
 
-  def test_uncompiled_modules(self):
+    with self.converted(test_fn, call_trees, {}) as result:
+      self.assertEquals(
+          result.test_fn(None, 1, 2, **{'c': 3}),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 5)
+      self.assertListEqual(self.dynamic_calls, [((1,), {'b': 2, 'c': 3})])
 
-    def test_fn(a):
-      a = math_ops.multiply(a, constant_op.constant(2))
-      a = math_ops.add(a, constant_op.constant(1))
-      return a
+  def test_class_method(self):
 
-    ns = {'math_ops': math_ops, 'constant_op': constant_op}
-    node, ctx = self.prepare(
-        test_fn,
-        ns,
-        arg_types=set(((math_ops.__name__,), (constant_op.__name__,))))
-    node = call_trees.transform(node, ctx)
+    class TestClass(object):
 
-    with self.compiled(node, ns) as result:
-      with self.cached_session() as sess:
-        result_tensor = result.test_fn(constant_op.constant(1))
-        self.assertEquals(self.evaluate(result_tensor), 3)
+      def test_method(self, a):
+        return self.other_method(a) + 1
 
-  def test_call_to_decorated_function(self):
+    tc = TestClass()
+    with self.converted(TestClass.test_method, call_trees, {}) as result:
+      self.assertEquals(converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 1,
+                        result.test_method(tc, 1))
+      self.assertListEqual(self.dynamic_calls, [((1,), {})])
 
-    def decorator(f):
-      return f
+  def test_object_method(self):
 
-    @decorator
-    def called_fn(a):
-      return a
+    class TestClass(object):
 
-    def test_fn(a):
-      return called_fn(a)
+      def test_method(self, a):
+        return self.other_method(a) + 1
 
-    node, ctx = self.prepare(test_fn, {'called_fn': called_fn})
-    node = call_trees.transform(node, ctx)
+    tc = TestClass()
+    with self.converted(tc.test_method, call_trees, {}) as result:
+      self.assertEquals(converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 1,
+                        result.test_method(tc, 1))
+      self.assertListEqual(self.dynamic_calls, [((1,), {})])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/continue_statements.py b/tensorflow/python/autograph/converters/continue_statements.py
index c3b6679e383cefd527b1dd0d8580d94bae8960e0..780f837fa3966c68383ab0ba4acdfcb7b221d005 100644
--- a/tensorflow/python/autograph/converters/continue_statements.py
+++ b/tensorflow/python/autograph/converters/continue_statements.py
@@ -29,11 +29,17 @@ class _Continue(object):
   def __init__(self):
     self.used = False
     self.control_var_name = None
-    self.create_guard = False
-    self.guard_created = False
 
   def __repr__(self):
-    return 'used: %s, var: %s' % (self.used, self.control_var_name)
+    return '<_Continue(used: {}, var: {})>'.format(self.used,
+                                                   self.control_var_name)
+
+
+class _Block(object):
+
+  def __init__(self):
+    self.guard_created = False
+    self.create_guard = False
 
 
 class ContinueCanonicalizationTransformer(converter.Base):
@@ -68,15 +74,15 @@ class ContinueCanonicalizationTransformer(converter.Base):
     #    |                #         created if node)
 
     if self.state[_Continue].used:
-      if self.state[_Continue].guard_created:
+      if self.state[_Block].guard_created:
         return node, None
 
-      elif not self.state[_Continue].create_guard:
-        self.state[_Continue].create_guard = True
+      elif not self.state[_Block].create_guard:
+        self.state[_Block].create_guard = True
         return node, None
 
       else:
-        self.state[_Continue].guard_created = True
+        self.state[_Block].guard_created = True
         template = """
           if ag__.not_(var_name):
             original_node
@@ -90,6 +96,7 @@ class ContinueCanonicalizationTransformer(converter.Base):
 
   def _visit_loop_body(self, node, nodes):
     self.state[_Continue].enter()
+    self.state[_Block].enter()
     scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
     continue_var = self.ctx.namer.new_symbol('continue_', scope.referenced)
     self.state[_Continue].control_var_name = continue_var
@@ -103,14 +110,21 @@ class ContinueCanonicalizationTransformer(converter.Base):
       control_var_init = templates.replace(template, var_name=continue_var)
       nodes = control_var_init + nodes
 
+    self.state[_Block].exit()
     self.state[_Continue].exit()
     return nodes
 
+  def _visit_non_loop_body(self, nodes):
+    self.state[_Block].enter()
+    nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
+    self.state[_Block].exit()
+    return nodes
+
   def visit_While(self, node):
     node.test = self.visit(node.test)
     node.body = self._visit_loop_body(node, node.body)
     # A continue in the else clause applies to the containing scope.
-    node.orelse = self.visit_block(node.orelse)
+    node.orelse = self._visit_non_loop_body(node.orelse)
     return node
 
   def visit_For(self, node):
@@ -118,7 +132,29 @@ class ContinueCanonicalizationTransformer(converter.Base):
     node.iter = self.generic_visit(node.iter)
     node.body = self._visit_loop_body(node, node.body)
     # A continue in the else clause applies to the containing scope.
-    node.orelse = self.visit_block(node.orelse)
+    node.orelse = self._visit_non_loop_body(node.orelse)
+    return node
+
+  def visit_If(self, node):
+    node.body = self.visit_block(node.body)
+    node.orelse = self._visit_non_loop_body(node.orelse)
+    return node
+
+  def visit_With(self, node):
+    node.items = self.visit_block(node.items)
+    node.body = self._visit_non_loop_body(node.body)
+    return node
+
+  def visit_Try(self, node):
+    node.body = self._visit_non_loop_body(node.body)
+    node.orelse = self._visit_non_loop_body(node.orelse)
+    # In Python 3.8 and later continue is allowed in finally blocks
+    node.finalbody = self._visit_non_loop_body(node.finalbody)
+    node.handlers = self.visit_block(node.handlers)
+    return node
+
+  def visit_ExceptHandler(self, node):
+    node.body = self._visit_non_loop_body(node.body)
     return node
 
 
diff --git a/tensorflow/python/autograph/converters/continue_statements_test.py b/tensorflow/python/autograph/converters/continue_statements_test.py
index d6aaa504436aa13007142bc87623605be15667d2..5a1828e3189db7c2ae81991951d153074ff4904c 100644
--- a/tensorflow/python/autograph/converters/continue_statements_test.py
+++ b/tensorflow/python/autograph/converters/continue_statements_test.py
@@ -20,15 +20,15 @@ from __future__ import print_function
 
 from tensorflow.python.autograph.converters import continue_statements
 from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.eager import context as tfe_ctx
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
 
 
 class ContinueCanonicalizationTest(converter_testing.TestCase):
 
   def assertTransformedEquivalent(self, test_fn, *inputs):
-    with self.converted(test_fn, continue_statements, {},
+    with self.converted(test_fn, continue_statements, {'ops': ops},
                         constant_op.constant) as result:
       self.assertEqual(test_fn(*inputs), result.test_fn(*inputs))
 
@@ -43,11 +43,10 @@ class ContinueCanonicalizationTest(converter_testing.TestCase):
         v.append(x)
       return v
 
-    with tfe_ctx.eager_mode():
-      self.assertTransformedEquivalent(test_fn, 0)
-      self.assertTransformedEquivalent(test_fn, 1)
-      self.assertTransformedEquivalent(test_fn, 3)
-      self.assertTransformedEquivalent(test_fn, 4)
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
 
   def test_for_loop(self):
 
@@ -60,11 +59,89 @@ class ContinueCanonicalizationTest(converter_testing.TestCase):
         v.append(x)
       return v
 
-    with tfe_ctx.eager_mode():
-      self.assertTransformedEquivalent(test_fn, [])
-      self.assertTransformedEquivalent(test_fn, [1])
-      self.assertTransformedEquivalent(test_fn, [2])
-      self.assertTransformedEquivalent(test_fn, [1, 2, 3])
+    self.assertTransformedEquivalent(test_fn, [])
+    self.assertTransformedEquivalent(test_fn, [1])
+    self.assertTransformedEquivalent(test_fn, [2])
+    self.assertTransformedEquivalent(test_fn, [1, 2, 3])
+
+  def test_nested_with(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            continue
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
+
+  def test_nested_multiple_withs(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            continue
+        with ops.name_scope(''):
+          v.append(x)
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
+
+  def test_nested_multiple_withs_and_statements(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            continue
+          v.append(x)
+        v.append(x)
+        with ops.name_scope(''):
+          v.append(x)
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
+
+  def test_nested_multiple_withs_and_nested_withs(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            continue
+          with ops.name_scope(''):
+            v.append(x)
+        v.append(x)
+        with ops.name_scope(''):
+          v.append(x)
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
 
   def test_nested(self):
 
@@ -83,11 +160,10 @@ class ContinueCanonicalizationTest(converter_testing.TestCase):
         v.append(x)
       return v, u, w
 
-    with tfe_ctx.eager_mode():
-      self.assertTransformedEquivalent(test_fn, 0)
-      self.assertTransformedEquivalent(test_fn, 1)
-      self.assertTransformedEquivalent(test_fn, 3)
-      self.assertTransformedEquivalent(test_fn, 4)
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py
index 0416820594a5d9d1692c4d1e4e9d1d5155ac184a..15ccbf74084573198031999ba81d666bde68babb 100644
--- a/tensorflow/python/autograph/converters/control_flow.py
+++ b/tensorflow/python/autograph/converters/control_flow.py
@@ -23,7 +23,6 @@ import gast
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
-from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct.static_analysis import annos
 
@@ -260,9 +259,12 @@ class ControlFlowTransformer(converter.Base):
     assignments = []
     for s in undefined_symbols:
       template = '''
-        var = ag__.UNDEFINED
+        var = ag__.Undefined(symbol_name)
       '''
-      assignments += templates.replace(template, var=s)
+      assignments += templates.replace(
+          template,
+          var=s,
+          symbol_name=gast.Str(s.ssf()))
     return assignments
 
   def _get_loop_state(self, node):
@@ -278,26 +280,15 @@ class ControlFlowTransformer(converter.Base):
     # the loop state, regardless of whether they are later used or not.
     loop_state = body_scope.modified & live_in
 
-    undefined_lives = loop_state - defined_in
+    # Variable that are used or defined inside the loop, but not defined
+    # before entering the loop
+    undefined_lives = ((loop_state - defined_in)
+                       | ((body_scope.modified - live_in) & live_out))
     # Only simple variables must be defined. The composite ones will be
     # implicitly checked at runtime.
     undefined_simple_lives = {v for v in undefined_lives if v.is_simple()}
-    if undefined_simple_lives:
-      raise NameError(
-          'cannot convert loop: it includes symbols that are undefined'
-          ' when entering the loop: {}'.format(
-              self._fmt_symbols(undefined_simple_lives)))
-
-    live_defs_in_loop = (body_scope.modified - live_in) & live_out
-    if live_defs_in_loop:
-      # TODO(mdan): Include reference to explanation why.
-      raise NotImplementedError(
-          'cannot convert loop: it includes symbols that are defined'
-          ' inside the loop, but used later: {}. To fix, initialize'
-          ' these symbols before the loop'.format(
-              self._fmt_symbols(live_defs_in_loop)))
-
-    return loop_state, reserved_symbols
+
+    return loop_state, reserved_symbols, undefined_simple_lives
 
   def _state_constructs(self, loop_state, reserved_symbols):
     loop_state = list(loop_state)
@@ -321,7 +312,7 @@ class ControlFlowTransformer(converter.Base):
   def visit_While(self, node):
     self.generic_visit(node)
 
-    loop_state, reserved_symbols = self._get_loop_state(node)
+    loop_state, reserved_symbols, possibly_undef = self._get_loop_state(node)
 
     # Note: one might expect we can dispatch based on the loop condition.
     # But because that is dependent on the state, it cannot be evaluated ahead
@@ -340,7 +331,6 @@ class ControlFlowTransformer(converter.Base):
     cond_closure = set()
     for s in cond_scope.read:
       cond_closure |= s.support_set
-    cond_closure -= loop_state
 
     loop_state, state_ssf, state_ast_tuple, ssf_map = self._state_constructs(
         loop_state, reserved_symbols)
@@ -386,67 +376,108 @@ class ControlFlowTransformer(converter.Base):
           extra_deps=tuple(s.ast() for s in cond_closure),
       )
 
-    return node
+    undefined_assigns = self._create_undefined_assigns(possibly_undef)
+    return undefined_assigns + node
+
+  def _create_for_loop_early_stopping(self, loop_state, state_ssf,
+                                      state_ast_tuple, original_node,
+                                      extra_test_name, extra_test,
+                                      body_name, loop_body):
+    """Create node for for-loop with early stopping (e.g. break or return)."""
+    template = """
+      def extra_test_name(state_ssf):
+        return extra_test_expr
+      def body_name(loop_vars, state_ssf):
+        # Workaround for PEP-3113
+        iterate = loop_vars
+        body
+        return state_ssf,
+      state_ast_tuple = ag__.for_stmt(
+          iter_, extra_test_name, body_name, (state,))
+    """
+    return templates.replace(
+        template,
+        state=loop_state,
+        state_ssf=state_ssf,
+        state_ast_tuple=state_ast_tuple,
+        iter_=original_node.iter,
+        iterate=original_node.target,
+        extra_test_name=extra_test_name,
+        extra_test_expr=extra_test,
+        body_name=body_name,
+        body=loop_body)
+
+  def _create_for_loop_with_state(self, loop_state, state_ssf, state_ast_tuple,
+                                  original_node, body_name, loop_body):
+    """Create node for for-loop with loop-carried state, no early stopping."""
+    template = """
+      def body_name(loop_vars, state_ssf):
+        # Workaround for PEP-3113
+        iterate = loop_vars
+        body
+        return state_ssf,
+      state_ast_tuple = ag__.for_stmt(
+          iter_, None, body_name, (state,))
+    """
+    return templates.replace(
+        template,
+        state=loop_state,
+        state_ssf=state_ssf,
+        state_ast_tuple=state_ast_tuple,
+        iter_=original_node.iter,
+        iterate=original_node.target,
+        body_name=body_name,
+        body=loop_body)
+
+  def _create_for_loop_without_state(self, original_node, body_name, loop_body):
+    """Create node for for-loop with loop-carried state, no early stopping."""
+    template = """
+      def body_name(loop_vars):
+        # Workaround for PEP-3113
+        iterate = loop_vars
+        body
+        return ()
+      ag__.for_stmt(iter_, None, body_name, ())
+    """
+    return templates.replace(
+        template,
+        iter_=original_node.iter,
+        iterate=original_node.target,
+        body_name=body_name,
+        body=loop_body)
 
   def visit_For(self, node):
     self.generic_visit(node)
 
-    loop_state, reserved_symbols = self._get_loop_state(node)
+    loop_state, reserved_symbols, possibly_undef = self._get_loop_state(node)
     loop_state, state_ssf, state_ast_tuple, ssf_map = self._state_constructs(
         loop_state, reserved_symbols)
     node_body = ast_util.rename_symbols(node.body, ssf_map)
-    if anno.hasanno(node, 'extra_test'):
-      extra_test = anno.getanno(node, 'extra_test')
-      extra_test = ast_util.rename_symbols(extra_test, ssf_map)
-    else:
-      extra_test = parser.parse_expression('True')
+    body_name = self.ctx.namer.new_symbol('loop_body', reserved_symbols)
 
+    has_extra_test = anno.hasanno(node, 'extra_test')
     if loop_state:
-      template = """
-        def extra_test_name(state_ssf):
-          return extra_test_expr
-        def body_name(loop_vars, state_ssf):
-          # Workaround for PEP-3113
-          iterate = loop_vars
-          body
-          return state_ssf,
-        state_ast_tuple = ag__.for_stmt(
-            iter_, extra_test_name, body_name, (state,))
-      """
-      node = templates.replace(
-          template,
-          state=loop_state,
-          state_ssf=state_ssf,
-          state_ast_tuple=state_ast_tuple,
-          iter_=node.iter,
-          iterate=node.target,
-          extra_test_name=self.ctx.namer.new_symbol('extra_test',
-                                                    reserved_symbols),
-          extra_test_expr=extra_test,
-          body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
-          body=node_body)
+      if has_extra_test:
+        # Loop with early stopping (e.g. break or return)
+        extra_test = anno.getanno(node, 'extra_test')
+        extra_test = ast_util.rename_symbols(extra_test, ssf_map)
+        extra_test_name = self.ctx.namer.new_symbol('extra_test',
+                                                    reserved_symbols)
+        node = self._create_for_loop_early_stopping(
+            loop_state, state_ssf, state_ast_tuple, node, extra_test_name,
+            extra_test, body_name, node_body)
+      else:
+        # Loop with loop-carried state and no early stopping
+        node = self._create_for_loop_with_state(
+            loop_state, state_ssf, state_ast_tuple, node, body_name, node_body)
     else:
-      template = """
-        def extra_test_name():
-          return extra_test_expr
-        def body_name(loop_vars):
-          # Workaround for PEP-3113
-          iterate = loop_vars
-          body
-          return ()
-        ag__.for_stmt(iter_, extra_test_name, body_name, ())
-      """
-      node = templates.replace(
-          template,
-          iter_=node.iter,
-          iterate=node.target,
-          extra_test_name=self.ctx.namer.new_symbol('extra_test',
-                                                    reserved_symbols),
-          extra_test_expr=extra_test,
-          body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
-          body=node_body)
+      # Loop with no loop-carried state and no early stopping
+      assert not has_extra_test, ('Early stoppiong (e.g. break and/or return) '
+                                  'should create state variables.')
+      node = self._create_for_loop_without_state(node, body_name, node_body)
 
-    return node
+    undefined_assigns = self._create_undefined_assigns(possibly_undef)
+    return undefined_assigns + node
 
 
 def transform(node, ctx):
diff --git a/tensorflow/python/autograph/converters/control_flow_test.py b/tensorflow/python/autograph/converters/control_flow_test.py
index 1e73dcafa188dce7ac15db55d8f580f5266d84aa..37ea4c2ae8e3a76954360d79b8a97f5c1ce362ae 100644
--- a/tensorflow/python/autograph/converters/control_flow_test.py
+++ b/tensorflow/python/autograph/converters/control_flow_test.py
@@ -33,8 +33,7 @@ class ControlFlowTest(converter_testing.TestCase):
       inputs = (inputs,)
     with self.converted(test_fn, control_flow, {},
                         constant_op.constant) as result:
-      with self.cached_session() as sess:
-        self.assertEqual(sess.run(result.test_fn(*inputs)), expected)
+      self.assertEqual(self.evaluate(result.test_fn(*inputs)), expected)
 
   @test_util.run_deprecated_v1
   def test_while_basic(self):
@@ -78,16 +77,32 @@ class ControlFlowTest(converter_testing.TestCase):
 
     self.assertTransformedResult(test_fn, constant_op.constant(5), 0)
 
-  def test_while_variable_defined_in_body(self):
-    def bad_while_loop(n):
+  @test_util.run_deprecated_v1
+  def test_while_dispatches_by_cond_only(self):
+
+    class TensorIncompatibleNumeric(object):
+      """Works in arithmetic expression, but errors out with TF ops."""
+
+      def __init__(self, val):
+        self.val = val
+
+      def __add__(self, other):
+        return TensorIncompatibleNumeric(self.val + other)
+
+    def test_fn(n, s):
       while n > 0:
         n -= 1
-        s = n
+        s += n
       return s
 
-    node, ctx = self.prepare(bad_while_loop, {})
-    with self.assertRaises(NameError):
-      control_flow.transform(node, ctx)
+    self.assertTransformedResult(test_fn, (constant_op.constant(5), 0), 10)
+    with self.converted(test_fn, control_flow, {}) as result:
+      # n alone controls the staging. When the loop is not staged, Python
+      # knows how to add the two objects. But when staged, tf.while_loop will
+      # not know how to deal with the TensorIncompatibleNumeric object.
+      self.assertEqual(result.test_fn(5, TensorIncompatibleNumeric(0)).val, 10)
+      with self.assertRaises(TypeError):
+        result.test_fn(constant_op.constant(5), TensorIncompatibleNumeric(0))
 
   @test_util.run_deprecated_v1
   def test_if_basic(self):
@@ -123,11 +138,10 @@ class ControlFlowTest(converter_testing.TestCase):
       return obj
 
     with self.converted(test_fn, control_flow, {}) as result:
-      with self.cached_session() as sess:
-        res_obj = result.test_fn(constant_op.constant(1), TestClass(0, 0))
-        self.assertEqual(sess.run((res_obj.a, res_obj.b)), (-1, 0))
-        res_obj = result.test_fn(constant_op.constant(-1), TestClass(0, 0))
-        self.assertEqual(sess.run((res_obj.a, res_obj.b)), (0, -2))
+      res_obj = result.test_fn(constant_op.constant(1), TestClass(0, 0))
+      self.assertEqual(self.evaluate((res_obj.a, res_obj.b)), (-1, 0))
+      res_obj = result.test_fn(constant_op.constant(-1), TestClass(0, 0))
+      self.assertEqual(self.evaluate((res_obj.a, res_obj.b)), (0, -2))
 
   @test_util.run_deprecated_v1
   def test_if_single_output(self):
@@ -225,16 +239,6 @@ class ControlFlowTest(converter_testing.TestCase):
       self.assertEqual(result.test_fn(5), 10)
       self.assertEqual(eval_count[0], 1)
 
-  def test_for_variable_defined_in_body(self):
-    def bad_for_loop(n):
-      for i in range(n):
-        s = i
-      return s
-
-    node, ctx = self.prepare(bad_for_loop, {})
-    with self.assertRaises(NameError):
-      control_flow.transform(node, ctx)
-
   @test_util.run_deprecated_v1
   def test_for_tuple_unpacking(self):
     def test_fn(x_list):
@@ -244,5 +248,7 @@ class ControlFlowTest(converter_testing.TestCase):
       return z
 
     self.assertTransformedResult(test_fn, [3, 3], 7)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/converters/return_statements.py b/tensorflow/python/autograph/converters/return_statements.py
index 6cf6f266496cb8130e8067658964f6e141fe8cc5..3173e676e5dc383f399ca89cdc7814406afb28eb 100644
--- a/tensorflow/python/autograph/converters/return_statements.py
+++ b/tensorflow/python/autograph/converters/return_statements.py
@@ -115,6 +115,21 @@ class ConditionalReturnRewriter(converter.Base):
       anno.setanno(node, STMT_DEFINITELY_RETURNS, True)
     return node
 
+  def visit_Try(self, node):
+    # We could decide whether a 'try' DEFINITELY_RETURNS based on its components
+    # It is not clear whether we want to do anything with this given
+    # a 'try' is likely to throw an exception in some circumstances.
+    node.body, _ = self._visit_statement_block(node, node.body)
+    node.orelse, _ = self._visit_statement_block(node, node.orelse)
+    node.finalbody, _ = self._visit_statement_block(node, node.finalbody)
+    node.handlers = self.visit_block(node.handlers)
+    return node
+
+  def visit_ExceptHandler(self, node):
+    # To determine whether `try` DEFINITELY_RETURNS we need to revisit this.
+    node.body, _ = self._visit_statement_block(node, node.body)
+    return node
+
   def visit_If(self, node):
     node.test = self.visit(node.test)
 
@@ -317,6 +332,17 @@ class ReturnStatementsTransformer(converter.Base):
     node.body = self._visit_statement_block(node, node.body)
     return node
 
+  def visit_Try(self, node):
+    node.body = self._visit_statement_block(node, node.body)
+    node.orelse = self._visit_statement_block(node, node.orelse)
+    node.finalbody = self._visit_statement_block(node, node.finalbody)
+    node.handlers = self.visit_block(node.handlers)
+    return node
+
+  def visit_ExceptHandler(self, node):
+    node.body = self._visit_statement_block(node, node.body)
+    return node
+
   def visit_If(self, node):
     node.test = self.visit(node.test)
     node.body = self._visit_statement_block(node, node.body)
diff --git a/tensorflow/python/autograph/converters/return_statements_test.py b/tensorflow/python/autograph/converters/return_statements_test.py
index 2942555d176414ae510691cc00042d6baca33016..b2d3d1b92055216d45071fef1fe9f36553a7fb42 100644
--- a/tensorflow/python/autograph/converters/return_statements_test.py
+++ b/tensorflow/python/autograph/converters/return_statements_test.py
@@ -197,6 +197,25 @@ class SingleReturnTest(converter_testing.TestCase):
     self.assertTransformedEquivalent(test_fn, 4)
     self.assertTransformedEquivalent(test_fn, 5)
 
+  def test_nested_multiple_withs(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            return v
+        with ops.name_scope(''):
+          v.append(x)
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/converters/side_effect_guards.py b/tensorflow/python/autograph/converters/side_effect_guards.py
index d7c0951fcc68318ff82e4873deef8707e7018f73..7e556d95139366cb9747544fbaafe4a4039d82cd 100644
--- a/tensorflow/python/autograph/converters/side_effect_guards.py
+++ b/tensorflow/python/autograph/converters/side_effect_guards.py
@@ -125,6 +125,10 @@ class SideEffectGuardTransformer(converter.Base):
     node.orelse = self._visit_and_reindent(node.orelse)
     return node
 
+  # TODO(b/123995141) Remove once ExceptionHandlers are in the CFG
+  def visit_ExceptHandler(self, node):
+    return node
+
   def visit_Expr(self, node):
     self.generic_visit(node)
     if isinstance(node.value, gast.Call):
diff --git a/tensorflow/python/autograph/core/BUILD b/tensorflow/python/autograph/core/BUILD
index 3ab2e7b1bcacf7efe136b01a10de2bb7728e2d90..fae327e50db57474f2f72fddbc57f04f90ca4f1e 100644
--- a/tensorflow/python/autograph/core/BUILD
+++ b/tensorflow/python/autograph/core/BUILD
@@ -22,6 +22,7 @@ py_library(
         "errors.py",
         "function_wrapping.py",
         "naming.py",
+        "unsupported_features_checker.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
@@ -30,6 +31,7 @@ py_library(
         "//tensorflow/python/autograph/pyct",
         "//tensorflow/python/autograph/pyct/static_analysis",
         "//tensorflow/python/autograph/utils",
+        "@gast_archive//:gast",
     ],
 )
 
diff --git a/tensorflow/python/autograph/core/config.py b/tensorflow/python/autograph/core/config.py
index 574f819504e526420dd1956359dc974869d735f3..f038704a0741ef31d8701b41566d236f7caff0d8 100644
--- a/tensorflow/python/autograph/core/config.py
+++ b/tensorflow/python/autograph/core/config.py
@@ -28,21 +28,34 @@ PYTHON_LITERALS = {
     'float': float,
 }
 
+
+def _internal_name(name):
+  """This function correctly resolves internal and external names."""
+  reference_name = utils.__name__
+
+  reference_root = 'tensorflow.'
+  # If the TF module is foo.tensorflow, then all other modules
+  # are then assumed to be prefixed by 'foo'.
+
+  if reference_name.startswith(reference_root):
+    return name
+
+  reference_begin = reference_name.find('.' + reference_root)
+  assert reference_begin > 0
+
+  root_prefix = reference_name[:reference_begin]
+  return root_prefix + '.' + name
+
+
 DEFAULT_UNCOMPILED_MODULES = set((
     ('tensorflow',),
-    (utils.__name__,),
-
-    # All of tensorflow's subpackages. Unlike the root tf module, they don't
-    # have well-known names. Not referring to the module directly to avoid
-    # circular imports.
-    (
-        utils.__name__[:-len('.python.autograph.utils')],),
+    (_internal_name('tensorflow'),),
+    # TODO(mdan): Remove once the conversion process is optimized.
+    ('tensorflow_probability',),
+    (_internal_name('tensorflow_probability'),),
 ))
 
-NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',))
 
-# TODO(mdan): Also allow controlling the generated names.
-# TODO(mdan); Consolidate all internal imports into a single __ag module.
 COMPILED_IMPORT_STATEMENTS = (
     'from __future__ import print_function',
 )
diff --git a/tensorflow/python/autograph/core/converter.py b/tensorflow/python/autograph/core/converter.py
index c8db2ca427cca3bb50bc913aed0336e8108bab82..3a084836249fbb2693b44285f8f9baa141230c71 100644
--- a/tensorflow/python/autograph/core/converter.py
+++ b/tensorflow/python/autograph/core/converter.py
@@ -233,8 +233,7 @@ class ConversionOptions(object):
     """
 
     def as_qualified_name(o):
-      name = inspect_utils.getqualifiedname(
-          ctx.info.namespace, o, max_depth=1)
+      name = inspect_utils.getqualifiedname(ctx.info.namespace, o, max_depth=1)
       if not name:
         if isinstance(o, weakref.ref):
           # `o` might already be a weak reference, if this object was
@@ -252,9 +251,7 @@ class ConversionOptions(object):
 
     def list_of_features(values):
       return parser.parse_expression('({})'.format(', '.join(
-          'ag__.{}'.format(v)
-          for v in Feature.__members__.values()
-          if v in values)))
+          'ag__.{}'.format(str(v)) for v in values)))
 
     if internal_convert_user_code is None:
       internal_convert_user_code = self.internal_convert_user_code
diff --git a/tensorflow/python/autograph/core/unsupported_features_checker.py b/tensorflow/python/autograph/core/unsupported_features_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ccbb76fea1c8c9068b1bc1f64cc0f00a0ca2e35
--- /dev/null
+++ b/tensorflow/python/autograph/core/unsupported_features_checker.py
@@ -0,0 +1,54 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Checkers for detecting unsupported Python features."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.python.autograph.pyct import errors
+
+
+class UnsupportedFeaturesChecker(gast.NodeTransformer):
+  """Quick check for Python features we know we don't support.
+
+  Any features detected will cause AutoGraph to not compile a function.
+  """
+
+  # TODO(b/124103128): Implement support for `global` statements
+  def visit_Global(self, node):
+    raise errors.AutoGraphError(
+        'The global keyword is not yet supported.')
+
+  def visit_Nonlocal(self, node):
+    raise errors.AutoGraphError(
+        'The nonlocal keyword is not yet supported.')
+
+  # These checks could potentially be replaced with inspect.isgeneratorfunction
+  # to avoid a getsource/parse/ast-walk round trip.
+  def visit_Yield(self, node):
+    raise errors.AutoGraphError(
+        'Generators are not supported by AutoGraph')
+
+  def visit_YieldFrom(self, node):
+    raise errors.AutoGraphError(
+        'Generators are not supported by AutoGraph')
+
+
+def verify(node):
+  UnsupportedFeaturesChecker().visit(node)
+
diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py
index 7e0b5e79b8cfc9f31f2d4ab5781cecfddc2f6d08..356cdc0e3bc3431a43ed78c981bff30380abec2f 100644
--- a/tensorflow/python/autograph/impl/api.py
+++ b/tensorflow/python/autograph/impl/api.py
@@ -18,7 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+import copy
 import functools
+import pdb
 import sys
 
 from enum import Enum
@@ -35,9 +38,9 @@ from tensorflow.python.autograph.operators import py_builtins
 from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.pyct import errors
 from tensorflow.python.autograph.pyct import inspect_utils
+from tensorflow.python.autograph.utils import ag_logging as logging
 from tensorflow.python.autograph.utils import py_func
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
@@ -87,7 +90,7 @@ def convert(
               verbose=verbose,
               force_conversion=True,
               optional_features=optional_features,
-          ), *args, **kwargs)
+          ), args, kwargs)
 
     wrapper = tf_decorator.make_decorator(f, wrapper)
 
@@ -113,6 +116,12 @@ class RunMode(Enum):
   PY_FUNC = 2
 
 
+def do_not_convert_internal(f):
+  """Decorator that marks internal functions which do not need conversion."""
+  setattr(f, '__ag_compiled', True)
+  return f
+
+
 def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None):
   """Decorator that suppresses the conversion of a function.
 
@@ -151,17 +160,17 @@ def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None):
     else:
       raise ValueError('unknown value for run_as: %s' % run_as)
 
-    # Sometimes the decorator is just desugared, making it impossible to detect.
-    # This attribute makes detection easier.
     setattr(wrapper, '__ag_compiled', True)
     return wrapper
 
   return decorator
 
 
-def converted_call(f, owner, options, *args, **kwargs):
+def converted_call(f, owner, options, args, kwargs):
   """Compiles a function call inline. For internal use only."""
-  logging.vlog(logging.DEBUG, 'Converted call: %s; owner: %s', f, owner)
+  logging.log(1,
+              'Converted call: %s; owner: %s\n    args: %s\n    kwargs: %s\n',
+              f, owner, args, kwargs)
 
   if owner is not None:
     if not isinstance(f, str):
@@ -180,14 +189,33 @@ def converted_call(f, owner, options, *args, **kwargs):
   if inspect_utils.isbuiltin(f):
     return py_builtins.overload_of(f)(*args, **kwargs)
 
-  # Don't convert wrapt-decorated functions/methods.
-  # TODO(b/122265385): Fully support wrapt
+  # TODO(b/122265385): Remove this bypass.
   if ('wrapt' in sys.modules and
+      hasattr(sys.modules['wrapt'], 'FunctionWrapper') and
       isinstance(f, sys.modules['wrapt'].FunctionWrapper)):
+    logging.warn(
+        'Entity {} appears to be decorated by wrapt, which is not yet supported'
+        ' by AutoGraph. The function will be called without transformation.'
+        ' You may however apply AutoGraph before the decorator.'.format(f), 1)
+    logging.log(2, 'Permanently whitelisted: %s: wrapt decorated', f)
+    return f(*args, **kwargs)
+
+  # Constructors are permanently whitelisted.
+  # TODO(mdan): Toggle as experimental feature instead.
+  # TODO(b/124016764): Remove this limitation.
+  if tf_inspect.isclass(f):
+    logging.log(2, 'Permanently whitelisted: %s: constructor', f)
+    return f(*args, **kwargs)
+
+  # Other built-in modules are permanently whitelisted.
+  # TODO(mdan): Figure out how to do this consistently for all stdlib modules.
+  # Note: TF linter disallows importing inspect.
+  if any(f in m.__dict__.values()
+         for m in (collections, pdb, copy, tf_inspect._inspect)):  # pylint:disable=protected-access
+    logging.log(2, 'Permanently whitelisted: %s: part of builtin module', f)
     return f(*args, **kwargs)
 
   # TODO(mdan): This needs cleanup.
-  # In particular, we may want to avoid renaming functions altogether.
   if not options.force_conversion and conversion.is_whitelisted_for_graph(f):
 
     # TODO(mdan): This may be inconsistent in certain situations.
@@ -213,91 +241,121 @@ def converted_call(f, owner, options, *args, **kwargs):
   if not options.internal_convert_user_code:
     return f(*args, **kwargs)
 
-  # Unwrap functools.partial objects
-  # TODO(mdan): Consider sharing unwrapping logic with tf_inspect.
-  while isinstance(f, functools.partial):
-    args = f.args + args
-    new_kwargs = {}
-    if f.keywords is not None:
-      new_kwargs.update(f.keywords)
-    new_kwargs.update(kwargs)
-    kwargs = new_kwargs
-    f = f.func
-
-  if tf_inspect.isfunction(f) or tf_inspect.ismethod(f):
-    # Regular functions
-    target_entity = f
-    arg_map_target = f
-    f_self = inspect_utils.getmethodself(f)
-
-    # TODO(b/119246461): This may be more elegantly handled using __get__?
-    if f_self is not None:
-      # If this is a method call, it may or may not include self.
-      #
-      # Example when self is included:
-      #   converted_call(to_graph(foo.bar), foo)
-      #
-      # Example when self is not included:
-      #   super(...).foo(args)
-      #
-      if owner is not None and (not args or args[0] is not owner):
-        effective_args = (owner,) + args
-      else:
-        # When the owner is not specified, use the result of
-        # inspect_utils.getmethodclass.
-        # TODO(b/119246461): Make sure an owner is always specified.
-        if not args or args[0] is not f_self:
-          effective_args = (f_self,) + args
+  # TODO(mdan): Move this entire block inside to_graph.
+  try:  # Begin of transformation error guards
+
+    # Unwrap functools.partial objects
+    # TODO(mdan): Consider sharing unwrapping logic with tf_inspect.
+    while isinstance(f, functools.partial):
+      args = f.args + args
+      new_kwargs = {}
+      if f.keywords is not None:
+        new_kwargs.update(f.keywords)
+      new_kwargs.update(kwargs)
+      kwargs = new_kwargs
+      f = f.func
+
+    if tf_inspect.isfunction(f) or tf_inspect.ismethod(f):
+      # Regular functions
+      target_entity = f
+      arg_map_target = f
+      f_self = inspect_utils.getmethodself(f)
+
+      # TODO(b/119246461): This may be more elegantly handled using __get__?
+      if f_self is not None:
+        # If this is a method call, it may or may not include self.
+        #
+        # Example when self is included:
+        #   converted_call(to_graph(foo.bar), foo)
+        #
+        # Example when self is not included:
+        #   super(...).foo(args)
+        #
+        if owner is not None and (not args or args[0] is not owner):
+          effective_args = (owner,) + args
         else:
-          effective_args = (f_self,) + args[1:]
-      partial_types = (f_self,)
-    else:
+          # When the owner is not specified, use the result of
+          # inspect_utils.getmethodclass.
+          # TODO(b/119246461): Make sure an owner is always specified.
+          if not args or args[0] is not f_self:
+            effective_args = (f_self,) + args
+          else:
+            effective_args = (f_self,) + args[1:]
+        partial_types = (f_self,)
+      else:
+        effective_args = args
+        partial_types = ()
+
+    elif tf_inspect.isclass(f):
+      # Constructors
+      # Note: Until we support class constructurs, and enable whole-class
+      # conversion with an experimental flag, this branch is dead code.
+      # TODO(mdan): Consider removing unless there is a compelling use case.
+      target_entity = f
+      arg_map_target = f.__init__
       effective_args = args
       partial_types = ()
 
-  elif tf_inspect.isclass(f):
-    # Constructors
-    target_entity = f
-    arg_map_target = f.__init__
-    effective_args = args
-    partial_types = ()
-
-  elif hasattr(f, '__call__') and hasattr(f, '__class__'):
-    # Callable objects
-    target_entity = f.__call__
-    arg_map_target = f.__call__
-    effective_args = (f,) + args
-    partial_types = (f.__class__,)
-
-  else:
-    raise NotImplementedError('unknown callable type "%s"' % type(f))
-
-  arg_values = tf_inspect.getcallargs(arg_map_target, *args, **kwargs)
-  arg_types = {}
-  for name, arg in arg_values.items():
-    arg_class = arg.__class__
-    arg_types[name] = (arg_class.__name__, arg_class)
-
-  # When called from within a decorator, this is the only indication that
-  # the function is a method - it appears that the decorator is applied
-  # before the method is bound.
-  if not partial_types:
-    if 'self' in arg_values:
-      if tf_inspect.isclass(arg_values['self'].__class__):
-        partial_types = (arg_values['self'].__class__,)
-    elif 'cls' in arg_values:
-      if tf_inspect.isclass(arg_values['cls']):
-        partial_types = (arg_values['cls'],)
-
-  converted_f = to_graph(
-      target_entity,
-      recursive=options.recursive,
-      arg_values=arg_values,
-      arg_types=arg_types,
-      experimental_optional_features=options.optional_features,
-      experimental_strip_decorators=options.strip_decorators,
-      experimental_verbose=options.verbose,
-      experimental_partial_types=partial_types)
+    elif hasattr(f, '__call__') and hasattr(f, '__class__'):
+      # Callable objects
+      target_entity = f.__call__
+      arg_map_target = f.__call__
+      effective_args = (f,) + args
+      partial_types = (f.__class__,)
+
+    else:
+      raise NotImplementedError('unknown callable type "%s"' % type(f))
+
+    arg_values = tf_inspect.getcallargs(arg_map_target, *args, **kwargs)
+    arg_types = {}
+    for name, arg in arg_values.items():
+      arg_class = arg.__class__
+      arg_types[name] = (arg_class.__name__, arg_class)
+
+    # When called from within a decorator, this is the only indication that
+    # the function is a method - it appears that the decorator is applied
+    # before the method is bound.
+    if not partial_types:
+      if 'self' in arg_values:
+        if tf_inspect.isclass(arg_values['self'].__class__):
+          partial_types = (arg_values['self'].__class__,)
+      elif 'cls' in arg_values:
+        if tf_inspect.isclass(arg_values['cls']):
+          partial_types = (arg_values['cls'],)
+
+    logging.log(3, 'Partial types in conversion of %s: %s', target_entity,
+                partial_types)
+
+    converted_f = to_graph(
+        target_entity,
+        recursive=options.recursive,
+        arg_values=arg_values,
+        arg_types=arg_types,
+        experimental_optional_features=options.optional_features,
+        experimental_strip_decorators=options.strip_decorators,
+        experimental_verbose=options.verbose,
+        experimental_partial_types=partial_types)
+
+    if logging.has_verbosity(2):
+      logging.log(2, 'Defaults of %s : %s', converted_f,
+                  converted_f.__defaults__)
+      callargs = tf_inspect.getcallargs(converted_f, *effective_args, **kwargs)
+      formatted_callargs = '\n'.join(
+          '    {}: {}'.format(k, v) for k, v in callargs.items())
+      logging.log(2, 'Calling %s with\n%s\n', converted_f, formatted_callargs)
+
+  # TODO(mdan): Reduce this list.
+  except (errors.AutoGraphError, AssertionError, AttributeError, IndexError,
+          KeyError, NameError, NotImplementedError, SyntaxError, TypeError,
+          ValueError, IOError) as e:
+    logging.log(1, 'Error transforming entity %s', target_entity, exc_info=True)
+    logging.warn(
+        'Entity %s could not be transformed and will be staged without change.'
+        ' Error details can be found in the logs when running with the env'
+        ' variable AUTOGRAPH_VERBOSITY >= 1. Please report this to the'
+        ' AutoGraph team. Cause: %s', target_entity, e)
+
+    return f(*args, **kwargs)
 
   result = converted_f(*effective_args, **kwargs)
 
@@ -442,8 +500,15 @@ def to_graph(entity,
         compiled_module.__dict__[key] = val
     compiled = getattr(compiled_module, name)
 
-    if tf_inspect.isfunction(entity):
+    if hasattr(entity, '__defaults__'):
+      logging.log(3, 'Default args mapping: %s has: %s', entity,
+                  entity.__defaults__)
       compiled.__defaults__ = entity.__defaults__
+    else:
+      logging.log(3, 'Default args mapping: %s has no __defaults__', entity)
+
+    logging.log(3, 'Namespace of %s includes: %s', compiled,
+                compiled_module.__dict__.keys())
 
     if hasattr(compiled, '__globals__'):
       # Remove self to avoid circular references. This will probably only work
diff --git a/tensorflow/python/autograph/impl/api_test.py b/tensorflow/python/autograph/impl/api_test.py
index d5561ba8249f539e720fa1ecb5800b76c61a8c2f..aa9ad0c882e9699e3c46f3ea2203b1ebf42fc3a3 100644
--- a/tensorflow/python/autograph/impl/api_test.py
+++ b/tensorflow/python/autograph/impl/api_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import functools
 import gc
 
@@ -26,6 +27,8 @@ import numpy as np
 from tensorflow.python.autograph import utils
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.impl import api
+from tensorflow.python.autograph.pyct import errors
+from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.utils import py_func
 from tensorflow.python.framework import constant_op
@@ -39,6 +42,9 @@ from tensorflow.python.util import tf_inspect
 tf = utils.fake_tf()
 
 
+testing_global_numeric = 2
+
+
 class TestResource(str):
   pass
 
@@ -46,7 +52,7 @@ class TestResource(str):
 class ApiTest(test.TestCase):
 
   @test_util.run_deprecated_v1
-  def test_decorator_recurses(self):
+  def test_decorator_recursive(self):
 
     class TestClass(object):
 
@@ -69,7 +75,7 @@ class ApiTest(test.TestCase):
       self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   @test_util.run_deprecated_v1
-  def test_decorator_does_not_recurse(self):
+  def test_decorator_not_recursive(self):
 
     class TestClass(object):
 
@@ -90,7 +96,7 @@ class ApiTest(test.TestCase):
       self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   @test_util.run_deprecated_v1
-  def test_decorator_calls_unconverted_graph(self):
+  def test_convert_then_do_not_convert_graph(self):
 
     class TestClass(object):
 
@@ -105,14 +111,13 @@ class ApiTest(test.TestCase):
         return x
 
     tc = TestClass()
-    with self.cached_session() as sess:
-      x = tc.test_method(
-          constant_op.constant([2, 4]), constant_op.constant(1),
-          constant_op.constant(-2))
-      self.assertListEqual([0, 1], self.evaluate(x).tolist())
+    x = tc.test_method(
+        constant_op.constant((2, 4)), constant_op.constant(1),
+        constant_op.constant(-2))
+    self.assertAllEqual((0, 1), self.evaluate(x))
 
   @test_util.run_deprecated_v1
-  def test_decorator_calls_unconverted_py_func(self):
+  def test_convert_then_do_not_convert_py_func(self):
 
     class TestClass(object):
 
@@ -132,11 +137,10 @@ class ApiTest(test.TestCase):
         return x
 
     tc = TestClass()
-    with self.cached_session() as sess:
-      x = tc.test_method(
-          constant_op.constant([2, 4]), constant_op.constant(1),
-          constant_op.constant(-2))
-      self.assertListEqual([0, 1], self.evaluate(x).tolist())
+    x = tc.test_method(
+        constant_op.constant((2, 4)), constant_op.constant(1),
+        constant_op.constant(-2))
+    self.assertAllEqual((0, 1), self.evaluate(x))
 
   @test_util.run_deprecated_v1
   def test_decorator_calls_decorated(self):
@@ -192,18 +196,17 @@ class ApiTest(test.TestCase):
       def test_method(self, x, s, a):
         while tf.reduce_sum(x) > s:
           x //= api.converted_call(self.called_member, None,
-                                   converter.ConversionOptions(), self, a)
+                                   converter.ConversionOptions(), (self, a), {})
         return x
 
     tc = TestClass()
-    with self.cached_session() as sess:
-      x = tc.test_method(
-          constant_op.constant([2, 4]), constant_op.constant(1),
-          constant_op.constant(-2))
-      self.assertListEqual([0, 1], self.evaluate(x).tolist())
+    x = tc.test_method(
+        constant_op.constant([2, 4]), constant_op.constant(1),
+        constant_op.constant(-2))
+    self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   def test_converted_call_builtin(self):
-    x = api.converted_call(range, None, converter.ConversionOptions(), 3)
+    x = api.converted_call(range, None, converter.ConversionOptions(), (3,), {})
     self.assertEqual((0, 1, 2), tuple(x))
 
   def test_converted_call_function(self):
@@ -213,10 +216,9 @@ class ApiTest(test.TestCase):
         return -x
       return x
 
-    with self.cached_session() as sess:
-      x = api.converted_call(test_fn, None, converter.ConversionOptions(),
-                             constant_op.constant(-1))
-      self.assertEqual(1, self.evaluate(x))
+    x = api.converted_call(test_fn, None, converter.ConversionOptions(),
+                           (constant_op.constant(-1),), {})
+    self.assertEqual(1, self.evaluate(x))
 
   @test_util.run_v1_only('b/120545219')
   def test_converted_call_functools_partial(self):
@@ -227,16 +229,14 @@ class ApiTest(test.TestCase):
       return x, y, z
 
     x = api.converted_call(
-        functools.partial(test_fn, constant_op.constant(-1), z=-3),
-        None, converter.ConversionOptions(),
-        constant_op.constant(-2))
+        functools.partial(test_fn, constant_op.constant(-1), z=-3), None,
+        converter.ConversionOptions(), (constant_op.constant(-2),), {})
     self.assertEqual((1, 2, 3), self.evaluate(x))
 
     x = api.converted_call(
         functools.partial(
-            functools.partial(test_fn, constant_op.constant(-1)), z=-3),
-        None, converter.ConversionOptions(),
-        constant_op.constant(-2))
+            functools.partial(test_fn, constant_op.constant(-1)), z=-3), None,
+        converter.ConversionOptions(), (constant_op.constant(-2),), {})
     self.assertEqual((1, 2, 3), self.evaluate(x))
 
   def test_converted_call_method_explicit_owner(self):
@@ -259,11 +259,31 @@ class ApiTest(test.TestCase):
           return -self.x
         return self.x
 
-    with self.cached_session() as sess:
-      tc = TestClass(constant_op.constant(-1))
-      x = api.converted_call(tc.test_method, None,
-                             converter.ConversionOptions(), tc)
-      self.assertEqual(1, self.evaluate(x))
+    tc = TestClass(constant_op.constant(-1))
+    x = api.converted_call(tc.test_method, None, converter.ConversionOptions(),
+                           (tc,), {})
+    self.assertEqual(1, self.evaluate(x))
+
+  def test_converted_call_method_converts_recursively(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def other_method(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+      def test_method(self):
+        return self.other_method()
+
+    tc = TestClass(constant_op.constant(-1))
+    x = api.converted_call(tc.test_method, None,
+                           converter.ConversionOptions(recursive=True), (tc,),
+                           {})
+    self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_method_by_class(self):
 
@@ -277,11 +297,10 @@ class ApiTest(test.TestCase):
           return -self.x
         return self.x
 
-    with self.cached_session() as sess:
-      tc = TestClass(constant_op.constant(-1))
-      x = api.converted_call(TestClass.test_method, None,
-                             converter.ConversionOptions(), tc)
-      self.assertEqual(1, self.evaluate(x))
+    tc = TestClass(constant_op.constant(-1))
+    x = api.converted_call(TestClass.test_method, None,
+                           converter.ConversionOptions(), (tc,), {})
+    self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_callable_object(self):
 
@@ -295,11 +314,11 @@ class ApiTest(test.TestCase):
           return -self.x
         return self.x
 
-    with self.cached_session() as sess:
-      tc = TestClass(constant_op.constant(-1))
-      x = api.converted_call(tc, None, converter.ConversionOptions())
-      self.assertEqual(1, self.evaluate(x))
+    tc = TestClass(constant_op.constant(-1))
+    x = api.converted_call(tc, None, converter.ConversionOptions(), (), {})
+    self.assertEqual(1, self.evaluate(x))
 
+  @test_util.run_deprecated_v1
   def test_converted_call_constructor(self):
 
     class TestClass(object):
@@ -312,27 +331,44 @@ class ApiTest(test.TestCase):
           return -self.x
         return self.x
 
-    with self.cached_session() as sess:
-      tc = api.converted_call(TestClass, None, converter.ConversionOptions(),
-                              constant_op.constant(-1))
-      # tc is now a converted object.
-      x = tc.test_method()
-      self.assertEqual(1, self.evaluate(x))
+    tc = api.converted_call(TestClass, None, converter.ConversionOptions(),
+                            (constant_op.constant(-1),), {})
+    # tc is still a TestClass - constructors are whitelisted.
+    # TODO(b/124016764): Support this use case.
+    # The error below is specific to the `if` statement not being converted.
+    with self.assertRaisesRegex(
+        TypeError, 'Using a `tf.Tensor` as a Python `bool`'):
+      tc.test_method()
 
   def test_converted_call_already_converted(self):
 
     def f(x):
       return x == 0
 
-    with self.cached_session() as sess:
-      x = api.converted_call(f, None, converter.ConversionOptions(),
-                             constant_op.constant(0))
-      self.assertTrue(self.evaluate(x))
+    x = api.converted_call(f, None, converter.ConversionOptions(),
+                           (constant_op.constant(0),), {})
+    self.assertTrue(self.evaluate(x))
+
+    converted_f = api.to_graph(f)
+    x = api.converted_call(converted_f, None, converter.ConversionOptions(),
+                           (constant_op.constant(0),), {})
+    self.assertTrue(self.evaluate(x))
+
+  def test_converted_call_then_already_converted_dynamic(self):
+
+    @api.convert()
+    def g(x):
+      if x > 0:
+        return x
+      else:
+        return -x
+
+    def f(g, x):
+      return g(x)
 
-      converted_f = api.to_graph(f)
-      x = api.converted_call(converted_f, None, converter.ConversionOptions(),
-                             constant_op.constant(0))
-      self.assertTrue(self.evaluate(x))
+    x = api.converted_call(f, None, converter.ConversionOptions(),
+                           (g, constant_op.constant(1)), {})
+    self.assertEqual(self.evaluate(x), 1)
 
   @test_util.run_deprecated_v1
   def test_converted_call_no_user_code(self):
@@ -345,10 +381,10 @@ class ApiTest(test.TestCase):
     # f should not be converted, causing len to error out.
     with self.assertRaisesRegexp(Exception,
                                  'object of type \'Tensor\' has no len()'):
-      api.converted_call(f, None, opts, constant_op.constant([0]))
+      api.converted_call(f, None, opts, (constant_op.constant([0]),), {})
 
     # len on the other hand should work fine.
-    x = api.converted_call(len, None, opts, constant_op.constant([0]))
+    x = api.converted_call(len, None, opts, (constant_op.constant([0]),), {})
     # The constant has static shape so the result is a primitive not a Tensor.
     self.assertEqual(x, 1)
 
@@ -361,11 +397,10 @@ class ApiTest(test.TestCase):
     ])
 
     x = api.converted_call(model.call, None, opts,
-                           constant_op.constant([[0.0]]), training=True)
+                           (constant_op.constant([[0.0]]),), {'training': True})
 
-    with self.cached_session() as sess:
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
 
   def test_converted_call_whitelisted_method_extra_self(self):
 
@@ -376,11 +411,11 @@ class ApiTest(test.TestCase):
     ])
 
     x = api.converted_call(model.call, None, opts,
-                           model, constant_op.constant([[0.0]]), training=True)
+                           (model, constant_op.constant([[0.0]])),
+                           {'training': True})
 
-    with self.cached_session() as sess:
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
 
   def test_converted_call_whitelisted_method_via_owner(self):
 
@@ -391,11 +426,28 @@ class ApiTest(test.TestCase):
     ])
 
     x = api.converted_call('call', model, opts,
-                           constant_op.constant([[0.0]]), training=True)
+                           (constant_op.constant([[0.0]]),), {'training': True})
 
-    with self.cached_session() as sess:
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+
+  def test_converted_call_namedtuple(self):
+
+    opts = converter.ConversionOptions()
+
+    x = api.converted_call(collections.namedtuple, None, opts,
+                           ('TestNamedtuple', ('a', 'b')), {})
+
+    self.assertTrue(inspect_utils.isnamedtuple(x))
+
+  def test_converted_call_namedtuple_via_collections(self):
+
+    opts = converter.ConversionOptions()
+
+    x = api.converted_call('namedtuple', collections, opts, ('TestNamedtuple',
+                                                             ('a', 'b')), {})
+
+    self.assertTrue(inspect_utils.isnamedtuple(x))
 
   def test_converted_call_lambda(self):
 
@@ -403,11 +455,10 @@ class ApiTest(test.TestCase):
 
     l = lambda x: x == 0
 
-    x = api.converted_call(l, None, opts, constant_op.constant(0))
+    x = api.converted_call(l, None, opts, (constant_op.constant(0),), {})
 
-    with self.cached_session() as sess:
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllEqual(True, self.evaluate(x))
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual(True, self.evaluate(x))
 
   @test_util.run_deprecated_v1
   def test_to_graph_basic(self):
@@ -439,6 +490,31 @@ class ApiTest(test.TestCase):
       x = compiled_fn(constant_op.constant([4, 8]))
       self.assertListEqual([1, 2], self.evaluate(x).tolist())
 
+  def test_to_graph_with_globals(self):
+
+    def test_fn(x):
+      global testing_global_numeric
+      testing_global_numeric = x + testing_global_numeric
+      return testing_global_numeric
+
+    # TODO(b/122368197)
+    with self.assertRaisesRegex(
+        errors.AutoGraphError, 'global keyword is not yet supported'):
+      api.to_graph(test_fn)
+
+  def test_to_graph_with_kwargs_clashing_converted_call(self):
+
+    def called_fn(**kwargs):
+      return kwargs['f'] + kwargs['owner']
+
+    def test_fn():
+      # These arg names intentionally match converted_call's
+      return called_fn(f=1, owner=2)
+
+    compiled_fn = api.to_graph(test_fn)
+
+    self.assertEqual(compiled_fn(), 3)
+
   def test_to_code_basic(self):
 
     def test_fn(x, s):
diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py
index 691210616596e4724f4346b0e12d157fb919c20e..4d93f02695f5b35704b96ccececc4d947ed8bb41 100644
--- a/tensorflow/python/autograph/impl/conversion.py
+++ b/tensorflow/python/autograph/impl/conversion.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import functools
 import imp
+import unittest
 
 import gast
 
@@ -45,6 +46,7 @@ from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import errors as ag_errors
 from tensorflow.python.autograph.core import function_wrapping
+from tensorflow.python.autograph.core import unsupported_features_checker
 from tensorflow.python.autograph.lang import special_functions
 from tensorflow.python.autograph.pyct import ast_util
 from tensorflow.python.autograph.pyct import compiler
@@ -52,6 +54,7 @@ from tensorflow.python.autograph.pyct import errors
 from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import origin_info
 from tensorflow.python.autograph.pyct import parser
+from tensorflow.python.autograph.pyct import pretty_printer
 from tensorflow.python.autograph.pyct import qual_names
 from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct import transformer
@@ -80,29 +83,33 @@ def is_whitelisted_for_graph(o):
     m = functools
   else:
     m = tf_inspect.getmodule(o)
-  if not hasattr(m, '__name__'):
-    # Note: typically it's builtins that fall in this category. Builtins will
-    # be handled by specific code that follows this screening layer.
-    logging.log(2, '%s is NOT whitelisted: unknown module name', o)
-    return False
-
-  for prefix, in config.DEFAULT_UNCOMPILED_MODULES:
-    if m.__name__.startswith(prefix):
-      logging.log(2, '%s is whitelisted: name starts with "%s"', o, prefix)
+
+  if hasattr(m, '__name__'):
+    # Builtins typically have unnamed modules.
+    for prefix, in config.DEFAULT_UNCOMPILED_MODULES:
+      if m.__name__.startswith(prefix):
+        logging.log(2, 'Whitelisted: %s: name starts with "%s"', o, prefix)
+        return True
+
+    # Temporary -- whitelist tensorboard modules.
+    # TODO(b/122731813): Remove.
+    if m.__name__ == 'tensorboard' or '.tensorboard' in m.__name__:
+      logging.log(2, 'Whitelisted: %s: name contains "tensorboard"', o)
       return True
 
   if hasattr(o, 'autograph_info__') or hasattr(o, '__ag_compiled'):
-    logging.log(2, '%s is whitelisted: already converted', o)
+    logging.log(2, 'Whitelisted: %s: already converted', o)
     return True
 
-  if (not inspect_utils.isweakrefself(o) and not tf_inspect.isclass(o) and
-      hasattr(o, '__call__') and hasattr(o, '__class__')):
+  if hasattr(o, '__call__'):
     # Callable objects: whitelisted if their __call__ method is.
-    call_whitelisted = is_whitelisted_for_graph(o.__call__)
-    if call_whitelisted:
-      logging.log(2, '%s is whitelisted: object __call__ whitelisted', o)
-      return call_whitelisted
+    # The type check avoids infinite recursion around the __call__ method
+    # of function objects.
+    if (type(o) != type(o.__call__)) and is_whitelisted_for_graph(o.__call__):  # pylint: disable=unidiomatic-typecheck
+      logging.log(2, 'Whitelisted: %s: object __call__ whitelisted', o)
+      return True
 
+  owner_class = None
   if tf_inspect.ismethod(o):
     # Methods of whitelisted classes are also whitelisted, even if they are
     # bound via user subclasses.
@@ -121,9 +128,13 @@ def is_whitelisted_for_graph(o):
 
     owner_class = inspect_utils.getmethodclass(o)
     if owner_class is not None:
+      if issubclass(owner_class, unittest.TestCase):
+        logging.log(2, 'Whitelisted: %s: method of TestCase subclass', o)
+        return True
+
       owner_class = inspect_utils.getdefiningclass(o, owner_class)
       if is_whitelisted_for_graph(owner_class):
-        logging.log(2, '%s is whitelisted: owner is whitelisted %s', o,
+        logging.log(2, 'Whitelisted: %s: owner is whitelisted %s', o,
                     owner_class)
         return True
 
@@ -132,13 +143,14 @@ def is_whitelisted_for_graph(o):
     # because they don't expose source code. But we assume they are safe for
     # graph mode since they are just containers.
     if tf_inspect.isclass(o) and len(o.__bases__) > 1:
-      logging.warn_first_n(
-          'Entity {} looks like a namedtuple subclass. If it has any custom'
-          ' methods, they will not be converted by AutoGraph.'.format(o), 1)
-    logging.log(2, '%s is whitelisted: named tuple', o)
+      logging.warn(
+          'Entity {} looks like a namedtuple subclass. Its constructor will'
+          ' not be converted by AutoGraph, but if it has any custom methods,'
+          ' those will be.'.format(o), 1)
+    logging.log(2, 'Whitelisted: %s: named tuple', o)
     return True
 
-  logging.log(2, '%s is NOT whitelisted', o)
+  logging.log(2, 'Not whitelisted: %s: default rule', o)
   return False
 
 
@@ -207,6 +219,10 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types):
   if logging.has_verbosity(2):
     logging.log(2, 'Compiled output of %s:\n\n%s\n', o,
                 compiler.ast_to_source(node))
+  if logging.has_verbosity(4):
+    for n in node:
+      logging.log(4, 'Compiled AST of %s:\n\n%s\n\n', o,
+                  pretty_printer.fmt(n, color=False))
 
   if program_ctx.options.recursive:
     while True:
@@ -341,7 +357,7 @@ def function_to_graph(f,
   """Specialization of `entity_to_graph` for callable functions."""
 
   node, source = parser.parse_entity(f)
-  logging.log(3, 'Source code of %s:\n%s', f, source)
+  logging.log(3, 'Source code of %s:\n\n%s\n', f, source)
   node = node.body[0]
 
   # In general, the output of inspect.getsource is inexact for lambdas because
@@ -415,6 +431,7 @@ def node_to_graph(node, context):
             dependencies that this node has.
   """
   # TODO(mdan): Insert list_comprehensions somewhere.
+  unsupported_features_checker.verify(node)
 
   node = converter.standard_analysis(node, context, is_initial=True)
   # Past this point, line numbers are no longer accurate so we ignore the
diff --git a/tensorflow/python/autograph/impl/conversion_test.py b/tensorflow/python/autograph/impl/conversion_test.py
index cd893e3ff14eaa53633b60963923fe7ebd10823b..ddda4089fd89465d4915963e7d57143e89ed2aef 100644
--- a/tensorflow/python/autograph/impl/conversion_test.py
+++ b/tensorflow/python/autograph/impl/conversion_test.py
@@ -92,11 +92,9 @@ class ConversionTest(test.TestCase):
     conversion.entity_to_graph(f, program_ctx, None, None)
 
     self.assertTrue(f in program_ctx.dependency_cache)
-    self.assertTrue(g in program_ctx.dependency_cache)
+    self.assertFalse(g in program_ctx.dependency_cache)
     f_node = program_ctx.dependency_cache[f][0]
-    g_node = program_ctx.dependency_cache[g][0]
     self.assertEqual('tf__f', f_node.name)
-    self.assertEqual('tf__g', g_node.name)
 
   def test_entity_to_graph_class_hierarchy(self):
 
diff --git a/tensorflow/python/autograph/operators/__init__.py b/tensorflow/python/autograph/operators/__init__.py
index 58ed72b7008da05dda801675b521bd3efb6dc308..5b3f45de056bf0354c3864aa51fd485fbc891624 100644
--- a/tensorflow/python/autograph/operators/__init__.py
+++ b/tensorflow/python/autograph/operators/__init__.py
@@ -71,4 +71,5 @@ from tensorflow.python.autograph.operators.py_builtins import range_
 from tensorflow.python.autograph.operators.slices import get_item
 from tensorflow.python.autograph.operators.slices import GetItemOpts
 from tensorflow.python.autograph.operators.slices import set_item
-from tensorflow.python.autograph.operators.special_values import UNDEFINED
+from tensorflow.python.autograph.operators.special_values import is_undefined
+from tensorflow.python.autograph.operators.special_values import Undefined
diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py
index f046000720cae8282218cd6bfd3853dac1cdd3dc..adfde4623dd0c842a747a0d50272cfc8961fab24 100644
--- a/tensorflow/python/autograph/operators/control_flow.py
+++ b/tensorflow/python/autograph/operators/control_flow.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.autograph.operators import py_builtins
+from tensorflow.python.autograph.operators import special_values
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
@@ -62,6 +63,17 @@ def for_stmt(iter_, extra_test, body, init_state):
   if tensor_util.is_tensor(iter_):
     return _known_len_for_stmt(iter_, extra_test, body, init_state)
   elif isinstance(iter_, dataset_ops.DatasetV2):
+    # Check for undefined symbols and report an error. This prevents the error
+    # from propagating into the TF runtime. We have more information here and
+    # can provide a clearer error message.
+    undefined_symbols = _filter_undefined(init_state)
+
+    if undefined_symbols:
+      raise ValueError(
+          'TensorFlow requires that the following symbols must be initialized '
+          'to a Tensor, Variable or TensorArray before the loop: {}'
+          .format(tuple(undefined_symbols)))
+
     return _dataset_for_stmt(iter_, extra_test, body, init_state)
   else:
     return _py_for_stmt(iter_, extra_test, body, init_state)
@@ -71,7 +83,7 @@ def _py_for_stmt(iter_, extra_test, body, init_state):
   """Overload of for_stmt that executes a Python for loop."""
   state = init_state
   for target in iter_:
-    if not extra_test(*state):
+    if extra_test is not None and not extra_test(*state):
       break
     state = body(target, *state)
   return state
@@ -92,7 +104,9 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
     return state
 
   def while_cond(iterate_index, *state):
-    return gen_math_ops.logical_and(iterate_index < n, extra_test(*state))
+    if extra_test is not None:
+      return gen_math_ops.logical_and(iterate_index < n, extra_test(*state))
+    return iterate_index < n
 
   results = while_stmt(
       while_cond,
@@ -115,9 +129,11 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
 
 def _dataset_for_stmt(ds, extra_test, body, init_state):
   """Overload of for_stmt that iterates over TF Datasets."""
-  if extra_test(*init_state) is not True:
+
+  if extra_test is not None:
     raise NotImplementedError(
-        'break statements are not yet supported in for/Dataset loops')
+        'break and return statements are not yet supported in '
+        'for/Dataset loops.')
 
   def reduce_body(state, iterate):
     new_state = body(iterate, *state)
@@ -153,13 +169,32 @@ def while_stmt(test, body, init_state, extra_deps, opts=None):
   # TODO(mdan): Consider adding a generic mechanism for dynamic dispatch.
   # That could be something as simple as a collection of dispatch rules, with
   # some prioritization.
-  if any(tensor_util.is_tensor(v)
-         for v in nest.flatten(init_state + extra_deps)):
+  if any(tensor_util.is_tensor(v) for v in nest.flatten(extra_deps)):
+    # Check for undefined symbols and report an error. This prevents the error
+    # from propagating into the TF runtime. We have more information here and
+    # can provide a clearer error message.
+    undefined_symbols = _filter_undefined(init_state)
+
+    if undefined_symbols:
+      raise ValueError(
+          'TensorFlow requires that the following symbols must be initialized '
+          'to a Tensor, Variable or TensorArray before the loop: {}'
+          .format(tuple(undefined_symbols)))
     return _tf_while_stmt(test, body, init_state, opts)
   else:
     return _py_while_stmt(test, body, init_state, opts)
 
 
+def _filter_undefined(all_symbols):
+  """Returns the names of undefined symbols contained in all_symbols."""
+  undefined_symbols = [
+      s.symbol_name
+      for s in all_symbols
+      if special_values.is_undefined(s)
+  ]
+  return undefined_symbols
+
+
 def _tf_while_stmt(test, body, init_state, opts):
   """Overload of while_stmt that stages a TF while_stmt."""
   if opts is None:
@@ -203,7 +238,33 @@ def if_stmt(cond, body, orelse):
 
 def tf_if_stmt(cond, body, orelse):
   """Overload of if_stmt that stages a TF cond."""
-  return control_flow_ops.cond(cond, body, orelse)
+  protected_body = _wrap_in_protection_from_undefined(body, branch_name='if')
+  protected_orelse = _wrap_in_protection_from_undefined(orelse,
+                                                        branch_name='else')
+
+  return control_flow_ops.cond(cond, protected_body, protected_orelse)
+
+
+def _wrap_in_protection_from_undefined(func, branch_name):
+  """Wraps function to raise useful error when it returns undefined symbols."""
+  def protected_func():
+    """Calls function and raises an error if undefined symbols are returned."""
+    results = func()
+    undefined_symbols = None
+    if isinstance(results, tuple):
+      undefined_symbols = _filter_undefined(results)
+    elif special_values.is_undefined(results):
+      # Single return value
+      undefined_symbols = results.symbol_name
+
+    if undefined_symbols:
+      message = ('The following symbols must also be initialized in the %s '
+                 'branch: {}. Alternatively, you may initialize them before '
+                 'the if statement.') % branch_name
+      message = message.format(undefined_symbols)
+      raise ValueError(message)
+    return results
+  return protected_func
 
 
 def _py_if_stmt(cond, body, orelse):
diff --git a/tensorflow/python/autograph/operators/control_flow_test.py b/tensorflow/python/autograph/operators/control_flow_test.py
index 590b51820f831cf1ce1bb3ff9b98f66b6532f56d..c06c4bcb97034dc79415df274862be51dfabd525 100644
--- a/tensorflow/python/autograph/operators/control_flow_test.py
+++ b/tensorflow/python/autograph/operators/control_flow_test.py
@@ -52,7 +52,7 @@ class ForLoopTest(test.TestCase):
     to_int32 = lambda i: math_ops.cast(i, dtypes.int32)
     s = control_flow.for_stmt(
         dataset_ops.Dataset.range(5).map(to_int32),
-        extra_test=lambda s: True,
+        None,
         body=lambda i, s: (s + i,),
         init_state=(0,))
     with self.cached_session():
@@ -65,30 +65,41 @@ class WhileLoopTest(test.TestCase):
   def test_tensor(self):
     n = constant_op.constant(5)
     results = control_flow.while_stmt(
-        test=lambda i, sum: i < n,
-        body=lambda i, sum: (i + 1, sum + i,),
+        test=lambda i, s: i < n,
+        body=lambda i, s: (i + 1, s + i,),
         init_state=(0, 0),
         extra_deps=(n,))
-    with self.cached_session():
-      self.assertEqual((5, 10), self.evaluate(results))
+    self.assertEqual((5, 10), self.evaluate(results))
 
   @test_util.run_deprecated_v1
-  def test_tensor_dict_state(self):
+  def test_python_with_tensor_state(self):
     n = 5
-    init_state = {'i': constant_op.constant(0), 'sum': constant_op.constant(0)}
     results = control_flow.while_stmt(
-        test=lambda s: s['i'] < n,
-        body=lambda s: ({'i': s['i'] + 1, 'sum': s['sum'] + s['i']},),
-        init_state=(init_state,),
+        test=lambda i, s: i < n,
+        body=lambda i, s: (i + 1, s + i),
+        init_state=(0, constant_op.constant(0)),
         extra_deps=())
-    with self.cached_session():
-      self.assertEqual(({'i': 5, 'sum': 10},), self.evaluate(results))
+    result_i, result_s = results
+    self.assertEqual(5, result_i)
+    self.assertEqual(10, self.evaluate(result_s))
+
+  @test_util.run_deprecated_v1
+  def test_python_due_to_hidden_cond_type(self):
+    n = 5
+
+    # TODO(b/124002646): Improve the error message.
+    with self.assertRaises(Exception):
+      control_flow.while_stmt(
+          test=lambda i, s: i < n,
+          body=lambda i, s: (i + 1, s + i),
+          init_state=(constant_op.constant(0), constant_op.constant(0)),
+          extra_deps=())
 
   def test_python(self):
     n = 5
     results = control_flow.while_stmt(
-        test=lambda i, sum: i < n,
-        body=lambda i, sum: (i + 1, sum + i),
+        test=lambda i, s: i < n,
+        body=lambda i, s: (i + 1, s + i),
         init_state=(0, 0),
         extra_deps=(n,))
     self.assertEqual((5, 10), results)
diff --git a/tensorflow/python/autograph/operators/special_values.py b/tensorflow/python/autograph/operators/special_values.py
index 08a1013524e5f039421be04e16f4b94f002281cd..4c1b3d1f30b36c98b969e92bd2587ab62fbfc2a9 100644
--- a/tensorflow/python/autograph/operators/special_values.py
+++ b/tensorflow/python/autograph/operators/special_values.py
@@ -19,5 +19,47 @@ from __future__ import division
 from __future__ import print_function
 
 
-# Used to reify undefined Python symbols so they can be used during staging.
-UNDEFINED = object()
+class Undefined(object):
+  """Represents an undefined symbol in Python.
+
+  This is used to reify undefined symbols, which is required to use the
+  functional form of loops.
+  Example:
+
+    while n > 0:
+      n = n - 1
+      s = n
+    return s  # Runtime error if n == 0
+
+  This is valid Python code and will not result in an error as long as n
+  is positive. The use of this class is to stay as close to Python semantics
+  as possible for staged code of this nature.
+
+  Converted version of the above showing the possible usage of this class:
+
+    s = Undefined('s')
+    init_state = (s,)
+    s = while_loop(cond, body, init_state)
+    return s  # s is an instance of Undefined if the loop never runs
+
+  Attributes:
+    symbol_name: Text, identifier for the undefined symbol
+  """
+
+  def __init__(self, symbol_name):
+    self.symbol_name = symbol_name
+
+
+def is_undefined(value):
+  """Checks whether Autograph has determined that a given value is undefined.
+
+  This only works in places where Autograph reifies undefined symbols. Note that
+  if this function is passed a truly undefined symbol the call-site will raise
+  NameError.
+
+  Args:
+    value: value to test for undefinedness
+  Returns:
+    Boolean, whether the input value is undefined.
+  """
+  return isinstance(value, Undefined)
diff --git a/tensorflow/python/autograph/operators/special_values_test.py b/tensorflow/python/autograph/operators/special_values_test.py
index 45fcf45067d9b38f987b29b678bac9092d2b01d1..2e1e087a9f3f586b646c9a73877d9bb4470c6f3e 100644
--- a/tensorflow/python/autograph/operators/special_values_test.py
+++ b/tensorflow/python/autograph/operators/special_values_test.py
@@ -22,11 +22,17 @@ from tensorflow.python.autograph.operators import special_values
 from tensorflow.python.platform import test
 
 
-class PythonLangUtilsTest(test.TestCase):
+class SpecialValuesTest(test.TestCase):
 
   def test_undefined(self):
-    self.assertIs(special_values.UNDEFINED, special_values.UNDEFINED)
+    undefined_symbol = special_values.Undefined('name')
+    self.assertEqual(undefined_symbol.symbol_name, 'name')
 
+    undefined_symbol2 = special_values.Undefined('name')
+    self.assertNotEqual(undefined_symbol, undefined_symbol2)
+
+    self.assertTrue(special_values.is_undefined(undefined_symbol))
+    self.assertTrue(special_values.is_undefined(undefined_symbol2))
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/ast_util.py b/tensorflow/python/autograph/pyct/ast_util.py
index 3496c514ee199f3eeeeede5caabe16dec29f3652..b091285cab6f2f643d7f99f3063a903c1e5efdb8 100644
--- a/tensorflow/python/autograph/pyct/ast_util.py
+++ b/tensorflow/python/autograph/pyct/ast_util.py
@@ -283,13 +283,18 @@ def parallel_walk(node, other):
     n = node_stack.pop()
     o = other_stack.pop()
 
-    if (not isinstance(n, (ast.AST, gast.AST)) or
-        not isinstance(o, (ast.AST, gast.AST)) or
+    if (not isinstance(n, (ast.AST, gast.AST, str)) or
+        not isinstance(o, (ast.AST, gast.AST, str)) or
         n.__class__.__name__ != o.__class__.__name__):
-      raise ValueError('inconsistent nodes: {} and {}'.format(n, o))
+      raise ValueError('inconsistent nodes: {} ({}) and {} ({})'.format(
+          n, n.__class__.__name__, o, o.__class__.__name__))
 
     yield n, o
 
+    if isinstance(n, str):
+      assert isinstance(o, str), 'The check above should have ensured this'
+      continue
+
     for f in n._fields:
       n_child = getattr(n, f, None)
       o_child = getattr(o, f, None)
diff --git a/tensorflow/python/autograph/pyct/ast_util_test.py b/tensorflow/python/autograph/pyct/ast_util_test.py
index 00ef7f5e058df3571ac2090922bf3cb9eaa5b20c..c6c1132dd64351dde9274d8d3c408d4bc4988b03 100644
--- a/tensorflow/python/autograph/pyct/ast_util_test.py
+++ b/tensorflow/python/autograph/pyct/ast_util_test.py
@@ -159,11 +159,20 @@ class AstUtilTest(test.TestCase):
     })
 
   def test_parallel_walk(self):
-    node = parser.parse_str(
-        textwrap.dedent("""
+    src = """
       def f(a):
         return a + 1
-    """))
+    """
+    node = parser.parse_str(textwrap.dedent(src))
+    for child_a, child_b in ast_util.parallel_walk(node, node):
+      self.assertEqual(child_a, child_b)
+
+  def test_parallel_walk_string_leaves(self):
+    src = """
+      def f(a):
+        global g
+    """
+    node = parser.parse_str(textwrap.dedent(src))
     for child_a, child_b in ast_util.parallel_walk(node, node):
       self.assertEqual(child_a, child_b)
 
diff --git a/tensorflow/python/autograph/pyct/cfg.py b/tensorflow/python/autograph/pyct/cfg.py
index fdfcd4dcc15b0c6238dcdc3fedef60f2984c33a4..0cedfa84ab33cc3d0931c3998214b017a6907cfc 100644
--- a/tensorflow/python/autograph/pyct/cfg.py
+++ b/tensorflow/python/autograph/pyct/cfg.py
@@ -393,6 +393,8 @@ class GraphBuilder(object):
   def _connect_jump_to_finally_sections(self, node):
     """Connects a jump node to the finally sections protecting it."""
     cursor = set((node,))
+    if node not in self.finally_sections:
+      return cursor
     for guard_section_id in self.finally_sections[node]:
       guard_begin, guard_ends = self.finally_section_subgraphs[guard_section_id]
       self._connect_nodes(cursor, guard_begin)
@@ -620,10 +622,10 @@ class AstToCfg(gast.NodeVisitor):
     leaving_node = self.lexical_scopes.pop()
     assert node == leaving_node
 
-  def _get_enclosing_scopes(self, include, stop_at):
+  def _get_enclosing_finally_scopes(self, stop_at):
     included = []
     for node in reversed(self.lexical_scopes):
-      if isinstance(node, include):
+      if isinstance(node, gast.Try) and node.finalbody:
         included.append(node)
       if isinstance(node, stop_at):
         return node, included
@@ -635,10 +637,8 @@ class AstToCfg(gast.NodeVisitor):
 
   def _process_exit_statement(self, node, *exits_nodes_of_type):
     # Note: this is safe because we process functions separately.
-    try_node, guards = self._get_enclosing_scopes(
-        include=(gast.Try,),
-        stop_at=tuple(exits_nodes_of_type),
-    )
+    try_node, guards = self._get_enclosing_finally_scopes(
+        tuple(exits_nodes_of_type))
     if try_node is None:
       raise ValueError(
           '%s that is not enclosed by any of %s' % (node, exits_nodes_of_type))
@@ -646,10 +646,8 @@ class AstToCfg(gast.NodeVisitor):
 
   def _process_continue_statement(self, node, *loops_to_nodes_of_type):
     # Note: this is safe because we process functions separately.
-    try_node, guards = self._get_enclosing_scopes(
-        include=(gast.Try,),
-        stop_at=tuple(loops_to_nodes_of_type),
-    )
+    try_node, guards = self._get_enclosing_finally_scopes(
+        tuple(loops_to_nodes_of_type))
     if try_node is None:
       raise ValueError('%s that is not enclosed by any of %s' %
                        (node, loops_to_nodes_of_type))
@@ -698,10 +696,7 @@ class AstToCfg(gast.NodeVisitor):
     self._process_basic_statement(node)
 
   def visit_Raise(self, node):
-    try_node, guards = self._get_enclosing_scopes(
-        include=(gast.Try,),
-        stop_at=(gast.FunctionDef,),
-    )
+    try_node, guards = self._get_enclosing_finally_scopes((gast.FunctionDef,))
     if try_node is None:
       raise ValueError('%s that is not enclosed by any FunctionDef' % node)
     self.builder.add_error_node(node, guards)
@@ -797,16 +792,13 @@ class AstToCfg(gast.NodeVisitor):
     for stmt in node.orelse:
       self.visit(stmt)
 
-    if node.handlers:
-      # TODO(mdan): Should we still support bare try/except? Might be confusing.
-      raise NotImplementedError('exceptions are not yet supported')
-
     self._exit_lexical_scope(node)
 
-    self.builder.enter_finally_section(node)
-    for stmt in node.finalbody:
-      self.visit(stmt)
-    self.builder.exit_finally_section(node)
+    if node.finalbody:
+      self.builder.enter_finally_section(node)
+      for stmt in node.finalbody:
+        self.visit(stmt)
+      self.builder.exit_finally_section(node)
 
   def visit_With(self, node):
     # TODO(mdan): Mark the context manager's exit call as exit guard.
diff --git a/tensorflow/python/autograph/pyct/errors.py b/tensorflow/python/autograph/pyct/errors.py
index 3de3fadc883fdae705ed49aa322b943dd89870b2..2f2049c40bb5fb72f7dd8d3191bc3163abdcf309 100644
--- a/tensorflow/python/autograph/pyct/errors.py
+++ b/tensorflow/python/autograph/pyct/errors.py
@@ -44,5 +44,5 @@ def report_internal_error(entity, exception):
   raise AutoGraphError(
       'Unexpected error transforming %s. If you believe this is due to a bug,'
       ' please set the verbosity to 10 (on Linux, `export '
-      'AUTOGRAPH_VERBOSITY=1`) and attach the full output when filing the bug '
-      'report. Caused by %s' % (entity, exception))
+      'AUTOGRAPH_VERBOSITY=10`) and attach the full output when filing the bug '
+      'report. Caused by: %s' % (entity, exception))
diff --git a/tensorflow/python/autograph/pyct/inspect_utils.py b/tensorflow/python/autograph/pyct/inspect_utils.py
index 6d9bc43d34652f2fd67b74faf4bff77afad54119..eab01ee9cd613b25548412ea78f1fd07d3d432cb 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils.py
@@ -31,7 +31,7 @@ from tensorflow.python.util import tf_inspect
 
 # These functions test negative for isinstance(*, types.BuiltinFunctionType)
 # and inspect.isbuiltin, and are generally not visible in globals().
-# TODO(mdan): Find a more generic way to test this - just enumerate __builtin__?
+# TODO(mdan): Remove this.
 SPECIAL_BUILTINS = {
     'dict': dict,
     'enumerate': enumerate,
@@ -42,6 +42,7 @@ SPECIAL_BUILTINS = {
     'print': print,
     'range': range,
     'tuple': tuple,
+    'type': type,
     'zip': zip
 }
 
@@ -73,7 +74,7 @@ def isnamedtuple(f):
 
 def isbuiltin(f):
   """Returns True if the argument is a built-in function."""
-  if f in SPECIAL_BUILTINS.values():
+  if f in six.moves.builtins.__dict__.values():
     return True
   if isinstance(f, types.BuiltinFunctionType):
     return True
@@ -125,6 +126,10 @@ def getqualifiedname(namespace, object_, max_depth=5, visited=None):
   if visited is None:
     visited = set()
 
+  # Copy the dict to avoid "changed size error" during concurrent invocations.
+  # TODO(mdan): This is on the hot path. Can we avoid the copy?
+  namespace = dict(namespace)
+
   for name in namespace:
     # The value may be referenced by more than one symbol, case in which
     # any symbol will be fine. If the program contains symbol aliases that
diff --git a/tensorflow/python/autograph/pyct/parser.py b/tensorflow/python/autograph/pyct/parser.py
index 011d80dd62afea948abd13bf82f8272a034921af..f6b2a7863bd99bbc75886b2522f4e3a7d35ec0f1 100644
--- a/tensorflow/python/autograph/pyct/parser.py
+++ b/tensorflow/python/autograph/pyct/parser.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 
 import re
 import textwrap
+import threading
 
 import gast
 import six
@@ -30,10 +31,14 @@ import six
 from tensorflow.python.util import tf_inspect
 
 
+_parse_lock = threading.Lock()  # Prevents linecache concurrency errors.
+
+
 def parse_entity(entity):
   """Returns the AST of given entity."""
   try:
-    source = tf_inspect.getsource(entity)
+    with _parse_lock:
+      source = tf_inspect.getsource_no_unwrap(entity)
   except (IOError, OSError) as e:
     raise ValueError(
         'Unable to locate the source code of {}. Note that functions defined'
@@ -106,7 +111,7 @@ def parse_str(src):
   """Returns the AST of given piece of code."""
   # TODO(mdan): This should exclude the module things are autowrapped in.
 
-  if six.PY2 and re.search('\\Wprint\\(', src):
+  if six.PY2 and re.search('\\Wprint\\s*\\(', src):
     # This special treatment is required because gast.parse is not aware of
     # whether print_function was present in the original context.
     src = 'from __future__ import print_function\n' + src
diff --git a/tensorflow/python/autograph/pyct/parser_test.py b/tensorflow/python/autograph/pyct/parser_test.py
index cc0123408d51aed887967688671926b89588c6f8..e7fa3c7aeb53c984c40e8709b2ce8e49e8879acf 100644
--- a/tensorflow/python/autograph/pyct/parser_test.py
+++ b/tensorflow/python/autograph/pyct/parser_test.py
@@ -51,6 +51,15 @@ class ParserTest(test.TestCase):
     """))
     self.assertEqual('f', mod.body[0].name)
 
+  def test_parse_str_weird_print(self):
+    mod = parser.parse_str(
+        textwrap.dedent("""
+            def f(x):
+              print (x)
+              return x + 1
+    """))
+    self.assertEqual('f', mod.body[0].name)
+
   def test_parse_comments(self):
     def f():
 # unindented comment
diff --git a/tensorflow/python/autograph/pyct/pretty_printer.py b/tensorflow/python/autograph/pyct/pretty_printer.py
index d963416bed7c68967d76aba72ec4b46a24cda146..a92017f4142f671f337b15104b049581309bd290 100644
--- a/tensorflow/python/autograph/pyct/pretty_printer.py
+++ b/tensorflow/python/autograph/pyct/pretty_printer.py
@@ -56,6 +56,15 @@ class PrettyPrinter(gast.NodeVisitor):
     self.result += '\n'
 
   def generic_visit(self, node, name=None):
+    # In very rare instances, a list can contain something other than a Node.
+    # e.g. Global contains a list of strings.
+    if isinstance(node, str):
+      if name:
+        self._print('%s%s="%s"' % (self._indent(), name, node))
+      else:
+        self._print('%s"%s"' % (self._indent(), node))
+      return
+
     if node._fields:
       cont = ':'
     else:
diff --git a/tensorflow/python/autograph/pyct/static_analysis/activity.py b/tensorflow/python/autograph/pyct/static_analysis/activity.py
index 65e468221551716fa285a8557d5f2d7d243ebbed..dd3d1d5d1365c6a5aa5f1a7f16a485d40c3da6a1 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/activity.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/activity.py
@@ -265,10 +265,10 @@ class ActivityAnalyzer(transformer.Base):
     self._exit_scope()
     return node
 
-  def visit_nonlocal(self, node):
+  def visit_Nonlocal(self, node):
     raise NotImplementedError()
 
-  def visit_global(self, node):
+  def visit_Global(self, node):
     raise NotImplementedError()
 
   def visit_Expr(self, node):
diff --git a/tensorflow/python/autograph/pyct/static_analysis/liveness.py b/tensorflow/python/autograph/pyct/static_analysis/liveness.py
index f8b8d7fa77c167e0ebf96dd533e3c42b0c30b8e5..ad567a0a4fc97e246461274f33fa403634638ed8 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/liveness.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/liveness.py
@@ -144,10 +144,10 @@ class WholeTreeAnalyzer(transformer.Base):
     self.current_analyzer = parent_analyzer
     return node
 
-  def visit_nonlocal(self, node):
+  def visit_Nonlocal(self, node):
     raise NotImplementedError()
 
-  def visit_global(self, node):
+  def visit_Global(self, node):
     raise NotImplementedError()
 
 
@@ -219,6 +219,10 @@ class Annotator(transformer.Base):
                  frozenset(self.current_analyzer.out[cfg_node]))
     return node
 
+  def visit_ExceptHandler(self, node):
+    # TODO(b/123995141) Add Exception Handlers to the CFG
+    return node
+
 
 def resolve(node, source_info, graphs):
   """Resolves the live symbols at the exit of control flow statements.
diff --git a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py
index d1587d81780780f56ab0ec1fb0dbb9942a3d4539..ce6f3c528477713bb3ac04af00baffb9a1b7a145 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py
@@ -217,12 +217,16 @@ class TreeAnnotator(transformer.Base):
 
     return node
 
-  def visit_nonlocal(self, node):
+  def visit_Nonlocal(self, node):
     raise NotImplementedError()
 
-  def visit_global(self, node):
+  def visit_Global(self, node):
     raise NotImplementedError()
 
+  def visit_ExceptHandler(self, node):
+    # TODO(b/123995141) Add Exception Handlers to the CFG
+    return node
+
   def visit_Name(self, node):
     if self.current_analyzer is None:
       # Names may appear outside function defs - for example in class
@@ -232,7 +236,8 @@ class TreeAnnotator(transformer.Base):
     analyzer = self.current_analyzer
     cfg_node = self.current_cfg_node
 
-    assert cfg_node is not None, 'name node outside of any statement?'
+    assert cfg_node is not None, ('name node, %s, outside of any statement?'
+                                  % node.id)
 
     qn = anno.getanno(node, anno.Basic.QN)
     if isinstance(node.ctx, gast.Load):
diff --git a/tensorflow/python/autograph/pyct/templates.py b/tensorflow/python/autograph/pyct/templates.py
index 831eb6dbbf2f492ccb36ddeac932ce096ca8d908..b682a21bec16bcfae4c873dcd9c6ab8f0f3eb73b 100644
--- a/tensorflow/python/autograph/pyct/templates.py
+++ b/tensorflow/python/autograph/pyct/templates.py
@@ -92,6 +92,14 @@ class ContextAdjuster(gast.NodeTransformer):
     return self.generic_visit(node)
 
   def visit_comprehension(self, node):
+    # We may be able to override some of these, but for now it's simpler
+    # to just assert that they're set.
+    self._ctx_override = None
+    return self.generic_visit(node)
+
+  def visit_Lambda(self, node):
+    # We may be able to override some of these, but for now it's simpler
+    # to just assert that they're set.
     self._ctx_override = None
     return self.generic_visit(node)
 
diff --git a/tensorflow/python/autograph/pyct/templates_test.py b/tensorflow/python/autograph/pyct/templates_test.py
index bd6b451eda67e998c0a99f0837e018bacc13561f..4762aaf3ff68391bf4cfdee46ba88ff69cd7e8c0 100644
--- a/tensorflow/python/autograph/pyct/templates_test.py
+++ b/tensorflow/python/autograph/pyct/templates_test.py
@@ -248,6 +248,16 @@ class TemplatesTest(test.TestCase):
     self.assertIsInstance(arg_node.generators[0].target.ctx, gast.Store)
     self.assertIsInstance(arg_node.elt.ctx, gast.Load)
 
+  def test_lambda_in_function_call(self):
+    template = """
+      a = foo(arg)
+    """
+    source = parser.parse_expression('[lambda i: i]')
+    node = templates.replace(template, arg=source)
+    lambda_arg = node[0].value.args[0].elts[0]
+    self.assertIsInstance(lambda_arg.args.args[0].ctx, gast.Param)
+    self.assertIsInstance(lambda_arg.body.ctx, gast.Load)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/utils/ag_logging.py b/tensorflow/python/autograph/utils/ag_logging.py
index 847000aa25351d1ab4309f135b0ec3da5a464ea7..8229b828d305dc5acd8e61ceacb325d3f681487f 100644
--- a/tensorflow/python/autograph/utils/ag_logging.py
+++ b/tensorflow/python/autograph/utils/ag_logging.py
@@ -46,8 +46,11 @@ def set_verbosity(level, alsologtostdout=False):
   more in-depth debugging.
 
   There are two controls that control the logging verbosity:
+
    * The `set_verbosity` function
+
    * The `AUTOGRAPH_VERBOSITY` environment variable
+
   `set_verbosity` takes precedence over the environment variable.
 
   For example:
@@ -110,7 +113,7 @@ def get_verbosity():
   global verbosity_level
   if verbosity_level is not None:
     return verbosity_level
-  return os.getenv(VERBOSITY_VAR_NAME, DEFAULT_VERBOSITY)
+  return int(os.getenv(VERBOSITY_VAR_NAME, DEFAULT_VERBOSITY))
 
 
 def has_verbosity(level):
@@ -131,5 +134,11 @@ def log(level, msg, *args, **kwargs):
       print(msg % args)
 
 
+def warn(msg, *args, **kwargs):
+  logging.warn(msg, *args, **kwargs)
+  if echo_log_to_stdout:
+    print('WARNING:', msg % args)
+
+
 def warn_first_n(msg, *args, **kwargs):
   logging.log_first_n(logging.WARN, msg, *args, **kwargs)
diff --git a/tensorflow/python/build_defs.bzl b/tensorflow/python/build_defs.bzl
index b9056f86e6d0465a8521f054a459c06eb5aeb37c..244820f41a85778a01cd811d96c3e8228d8b7c8c 100644
--- a/tensorflow/python/build_defs.bzl
+++ b/tensorflow/python/build_defs.bzl
@@ -12,22 +12,26 @@ load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 # consumers of the tf_gen_op_wrapper_py rule would be simplified if we don't
 # hard code the ops/ directory.
 
-def tf_gen_op_wrapper_private_py(name, out=None, deps=[],
-                                 require_shape_functions=True,
-                                 visibility=[]):
-  if not name.endswith("_gen"):
-    fail("name must end in _gen")
-  if not visibility:
-    visibility = ["//visibility:private"]
-  bare_op_name = name[:-4] # Strip off the _gen
-  tf_gen_op_wrapper_py(name=bare_op_name,
-    out=out,
-    visibility=visibility,
-    deps=deps,
-    require_shape_functions=require_shape_functions,
-    generated_target_name=name,
-    api_def_srcs = [
-        "//tensorflow/core/api_def:base_api_def",
-        "//tensorflow/core/api_def:python_api_def",
-    ],
-  )
+def tf_gen_op_wrapper_private_py(
+        name,
+        out = None,
+        deps = [],
+        require_shape_functions = True,
+        visibility = []):
+    if not name.endswith("_gen"):
+        fail("name must end in _gen")
+    if not visibility:
+        visibility = ["//visibility:private"]
+    bare_op_name = name[:-4]  # Strip off the _gen
+    tf_gen_op_wrapper_py(
+        name = bare_op_name,
+        out = out,
+        visibility = visibility,
+        deps = deps,
+        require_shape_functions = require_shape_functions,
+        generated_target_name = name,
+        api_def_srcs = [
+            "//tensorflow/core/api_def:base_api_def",
+            "//tensorflow/core/api_def:python_api_def",
+        ],
+    )
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index bdca7dee556eb6e2fd8f4e98a31e0a5ccbe5c83b..4f3eb61d4fddbdad2758e0aef00727ede5d37b74 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -1590,7 +1590,21 @@ class Session(BaseSession):
     self._default_session_context_manager = None
     self._default_graph_context_manager = None
 
-    self.close()
+    # If we are closing due to an exception, set a time limit on our Close() to
+    # avoid blocking forever.
+    # TODO(b/120204635) remove this when deadlock is fixed.
+    if exec_type:
+      close_thread = threading.Thread(
+          name='SessionCloseThread', target=self.close)
+      close_thread.daemon = True
+      close_thread.start()
+      close_thread.join(30.0)
+      if close_thread.is_alive():
+        logging.error(
+            'Session failed to close after 30 seconds. Continuing after this '
+            'point may leave your program in an undefined state.')
+    else:
+      self.close()
 
   @staticmethod
   def reset(target, containers=None, config=None):
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index ef7527d887f062621d1fb21511e08c5f7ea389c0..3b9677bf251cad98e1ed54403f93e9de2741e1b5 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -604,6 +604,27 @@ def TF_Reset(target, containers=None, config=None):
   }
 }
 
+// $input is a Python list of wrapped TF_Operations
+%typemap(in) (const std::vector<TF_Operation*>* control_outputs)
+    (std::vector<TF_Operation*> control_outputs) {
+  if ($input != Py_None) {
+    if (!PyList_Check($input)) {
+      SWIG_exception_fail(SWIG_TypeError, "$symname: expected list");
+    }
+    size_t size = PyList_Size($input);
+    for (int i = 0; i < size; ++i) {
+      PyObject* item = PyList_GetItem($input, i);
+      TF_Operation* oper_ptr;
+      SWIG_ConvertPtr(item, reinterpret_cast<void**>(&oper_ptr),
+                      $descriptor(TF_Operation*), 0);
+      control_outputs.push_back(oper_ptr);
+    }
+    $1 = &control_outputs;
+  } else {
+    $1 = nullptr;
+  }
+}
+
 // Typemaps for TF_GraphGetTensorShapeHelper.
 
 // Convert from C++ integer vector to Python list of ints.
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index dc0c10bab74635e240502e2f8e762b61e533b319..56b4eec98e314dd6474acec51b4208d5120f2fa4 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -590,7 +590,9 @@ TF_Function* TF_GraphToFunction_wrapper(
     const TF_Graph* fn_body, const char* fn_name, bool append_hash_to_fn_name,
     const std::vector<TF_Operation*>* opers,
     const std::vector<TF_Output>& inputs, const std::vector<TF_Output>& outputs,
-    const NameVector& output_names, const TF_FunctionOptions* opts,
+    const NameVector& output_names,
+    const std::vector<TF_Operation*>* control_outputs,
+    const NameVector& control_output_names, const TF_FunctionOptions* opts,
     const char* description, TF_Status* out_status) {
   if (!output_names.empty() && output_names.size() != outputs.size()) {
     Set_TF_Status_from_Status(
@@ -613,10 +615,18 @@ TF_Function* TF_GraphToFunction_wrapper(
       output_names.empty() ? nullptr
                            : const_cast<const char**>(output_names.data());
 
-  return TF_GraphToFunction(fn_body, fn_name, append_hash_to_fn_name, nopers,
-                            opers_array, inputs.size(), inputs.data(),
-                            outputs.size(), outputs.data(), output_names_ptr,
-                            opts, description, out_status);
+  const char** control_output_names_ptr =
+      control_output_names.empty()
+          ? nullptr
+          : const_cast<const char**>(control_output_names.data());
+
+  return TF_GraphToFunctionWithControlOutputs(
+      fn_body, fn_name, append_hash_to_fn_name, nopers, opers_array,
+      inputs.size(), inputs.data(), outputs.size(), outputs.data(),
+      output_names_ptr,
+      control_outputs == nullptr ? 0 : control_outputs->size(),
+      control_outputs == nullptr ? nullptr : control_outputs->data(),
+      control_output_names_ptr, opts, description, out_status);
 }
 
 void TF_GraphSetOutputHandleShapesAndTypes_wrapper(
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index dab7e71aac5a7f4cbf9f8825ad6dd5d3f556bd43..d2c7dc34d8d54f384a69954db37f7ba18b527197 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -208,7 +208,9 @@ TF_Function* TF_GraphToFunction_wrapper(
     const TF_Graph* fn_body, const char* fn_name, bool append_hash_to_fn_name,
     const std::vector<TF_Operation*>* opers,
     const std::vector<TF_Output>& inputs, const std::vector<TF_Output>& outputs,
-    const NameVector& output_names, const TF_FunctionOptions* opts,
+    const NameVector& output_names,
+    const std::vector<TF_Operation*>* control_outputs,
+    const NameVector& control_output_names, const TF_FunctionOptions* opts,
     const char* description, TF_Status* status);
 
 // Set the shapes and types for the output's handle.
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index bef953b28f4be2bc2d31d979c6d1e387b23b831d..c1df5c4f8b0295fbb73928083ee16c1f167a3bac 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -27,7 +27,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 2, 1)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 2, 21)
 
 
 @tf_export("compat.forward_compatible")
diff --git a/tensorflow/python/compiler/BUILD b/tensorflow/python/compiler/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..07209a9eca976e049f6e44eef6a75952dd8a1823
--- /dev/null
+++ b/tensorflow/python/compiler/BUILD
@@ -0,0 +1,19 @@
+# Description:
+# Python APIs for various Tensorflow backends.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "if_not_windows")
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "compiler",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = if_not_windows([
+        "//tensorflow/python/compiler/tensorrt:init_py",
+    ]),
+)
diff --git a/tensorflow/python/compiler/__init__.py b/tensorflow/python/compiler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/python/compiler/tensorrt/BUILD b/tensorflow/python/compiler/tensorrt/BUILD
index a382579f5a1251249172625c697a0751458a9800..6c90a165cbeacd7f5ca9c00112e354c2d5b7ff20 100644
--- a/tensorflow/python/compiler/tensorrt/BUILD
+++ b/tensorflow/python/compiler/tensorrt/BUILD
@@ -32,8 +32,6 @@ py_library(
     deps = [
         ":tf_trt_integration_test_base",
         ":trt_convert_py",
-        ":trt_ops_py",
-        "//tensorflow/python:errors",
     ],
 )
 
@@ -51,6 +49,7 @@ py_library(
     srcs = ["trt_convert.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":trt_ops_py",
         ":wrap_conversion",
         "//tensorflow/python:graph_util",
         "//tensorflow/python:session",
@@ -83,7 +82,6 @@ py_library(
     srcs = ["test/tf_trt_integration_test_base.py"],
     deps = [
         ":trt_convert_py",
-        ":trt_ops_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
     ],
@@ -112,6 +110,7 @@ cuda_py_test(
         "no_windows",
         "nomac",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -149,6 +148,7 @@ cuda_py_tests(
         "no_windows",
         "nomac",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -168,9 +168,11 @@ cuda_py_test(
     ],
     tags = [
         "no_cuda_on_cpu_tap",
+        "no_oss",  # TODO(b/125290478): allow running in at least some OSS configurations.
         "no_pip",
         "no_tap",  # It is not able to download the mnist data.
         "no_windows",
         "nomac",
     ],
+    xla_enable_strict_auto_jit = True,
 )
diff --git a/tensorflow/python/compiler/tensorrt/__init__.py b/tensorflow/python/compiler/tensorrt/__init__.py
index 88fb69101d0aeb624d9726027f9603e2e3e7b91d..db3540ba45d8082079a04db9e9de5bf7aa178f93 100644
--- a/tensorflow/python/compiler/tensorrt/__init__.py
+++ b/tensorflow/python/compiler/tensorrt/__init__.py
@@ -18,25 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import errors
-
-# pylint: disable=unused-import,g-import-not-at-top,line-too-long
-try:
-  from tensorflow.compiler.tf2tensorrt.python.ops import trt_ops
-  from tensorflow.python.compiler.tensorrt.trt_convert import add_test_value
-  from tensorflow.python.compiler.tensorrt.trt_convert import calib_graph_to_infer_graph
-  from tensorflow.python.compiler.tensorrt.trt_convert import clear_test_values
-  from tensorflow.python.compiler.tensorrt.trt_convert import create_inference_graph
-  from tensorflow.python.compiler.tensorrt.trt_convert import enable_test_value
-  from tensorflow.python.compiler.tensorrt.trt_convert import get_test_value
-  from tensorflow.python.compiler.tensorrt.trt_convert import is_tensorrt_enabled
-except errors.NotFoundError as e:
-  no_trt_message = (
-      '**** Failed to initialize TensorRT. This is either because the TensorRT'
-      ' installation path is not in LD_LIBRARY_PATH, or because you do not have'
-      ' it installed. If not installed, please go to'
-      ' https://developer.nvidia.com/tensorrt to download and install'
-      ' TensorRT ****')
-  print(no_trt_message)
-  raise e
-# pylint: enable=unused-import,g-import-not-at-top,line-too-long
+# pylint: disable=unused-import,line-too-long
+from tensorflow.python.compiler.tensorrt.trt_convert import create_inference_graph
+# pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/python/compiler/tensorrt/test/base_test.py b/tensorflow/python/compiler/tensorrt/test/base_test.py
index cc3109968e24c83ccecbee32a3d3535dbf570bee..a1199c5040a51bf7db2cebaad1fe1523c3b6f421 100644
--- a/tensorflow/python/compiler/tensorrt/test/base_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/base_test.py
@@ -20,8 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.compiler.tensorrt import trt_convert
 from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.wrap_conversion import add_test_value
+from tensorflow.python.compiler.tensorrt.wrap_conversion import clear_test_values
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -154,7 +155,7 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
     """Setup method."""
     super(PartiallyConvertedTestA, self).setUp()
     # Let it fail to build the second engine.
-    trt_convert.add_test_value("TRTEngineOp_1:CreateTRTNode", "fail")
+    add_test_value("TRTEngineOp_1:CreateTRTNode", "fail")
 
   def GetParams(self):
     """Create a graph containing two segment."""
@@ -209,8 +210,8 @@ class PartiallyConvertedTestB(PartiallyConvertedTestA):
     """Setup method."""
     super(PartiallyConvertedTestB, self).setUp()
     # Let it fail to build the first engine.
-    trt_convert.clear_test_values("")
-    trt_convert.add_test_value("TRTEngineOp_0:CreateTRTNode", "fail")
+    clear_test_values("")
+    add_test_value("TRTEngineOp_0:CreateTRTNode", "fail")
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py b/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py
index 1d7792c01a20fd91655156448cc89bf01e548bcc..cdd25e3b981bb7944ece564ee4a9fa0d976fba66 100644
--- a/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py
@@ -18,13 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import
-from tensorflow.compiler.tf2tensorrt.python.ops import trt_ops
-# pylint: enable=unused-import
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import data
 from tensorflow.python import keras
 from tensorflow.python.compiler.tensorrt import trt_convert
+from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
+from tensorflow.python.compiler.tensorrt.wrap_conversion import is_tensorrt_enabled
 from tensorflow.python.estimator.estimator import Estimator
 from tensorflow.python.estimator.model_fn import EstimatorSpec
 from tensorflow.python.estimator.model_fn import ModeKeys
@@ -139,9 +138,9 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
     if use_trt:
       logging.info('Number of nodes before TF-TRT conversion: %d',
                    len(graph_def.node))
-      graph_def = trt_convert.create_inference_graph(
-          graph_def,
-          outputs=[OUTPUT_NODE_NAME],
+      converter = trt_convert.TrtGraphConverter(
+          input_graph_def=graph_def,
+          nodes_blacklist=[OUTPUT_NODE_NAME],
           max_batch_size=max_batch_size,
           precision_mode='INT8',
           # There is a 2GB GPU memory limit for each test, so we set
@@ -151,6 +150,7 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
           minimum_segment_size=2,
           use_calibration=False,
       )
+      graph_def = converter.convert()
       logging.info('Number of nodes after TF-TRT conversion: %d',
                    len(graph_def.node))
       num_engines = len(
@@ -263,7 +263,7 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
   #     num_epochs=100,
   #     model_dir=model_dir)
   def testEval(self):
-    if not trt_convert.is_tensorrt_enabled():
+    if not is_tensorrt_enabled():
       return
     model_dir = test.test_src_dir_path('python/compiler/tensorrt/test/testdata')
 
@@ -274,9 +274,9 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
         num_epochs=None,
         model_dir=model_dir)['accuracy']
     logging.info('accuracy_tf_native: %f', accuracy_tf_native)
-    self.assertAllClose(0.9662, accuracy_tf_native, rtol=1e-3, atol=1e-3)
+    self.assertAllClose(0.9662, accuracy_tf_native, rtol=3e-3, atol=3e-3)
 
-    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+    if get_linked_tensorrt_version()[0] < 5:
       return
 
     accuracy_tf_trt = self._Run(
diff --git a/tensorflow/python/compiler/tensorrt/test/quantization_test.py b/tensorflow/python/compiler/tensorrt/test/quantization_test.py
index 086e070f1b18c98cef8e9f8772fefa967577df85..3e1c9ff8ddc70469ba3516111b9d3821f1bbb6bc 100644
--- a/tensorflow/python/compiler/tensorrt/test/quantization_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/quantization_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.compiler.tensorrt import trt_convert
 from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -72,7 +72,7 @@ class QuantizationMissingAllRangesTest(trt_test.TfTrtIntegrationTestBase):
     return _GetParams(add_quantization_nodes=False)
 
   def ShouldRunTest(self, run_params):
-    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+    if get_linked_tensorrt_version()[0] < 5:
       return False
     # Only test static engine mode, with or without calibration.
     return (trt_test.IsQuantizationMode(run_params.precision_mode) and
@@ -96,7 +96,7 @@ class QuantizationWithRangesTest(trt_test.TfTrtIntegrationTestBase):
     return _GetParams(add_quantization_nodes=True)
 
   def ShouldRunTest(self, run_params):
-    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+    if get_linked_tensorrt_version()[0] < 5:
       return False
     # Test static/dynamic engine with/without calibration.
     return (trt_test.IsQuantizationMode(run_params.precision_mode) and
diff --git a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
index bba661b879e75758c1d47fef5fbb86b5414eb235..3a14a1cf6d56df34364a0678a264f61c1bb25f2c 100644
--- a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
+++ b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
@@ -25,12 +25,13 @@ import warnings
 import numpy as np
 import six
 
-# pylint: disable=unused-import
-from tensorflow.compiler.tf2tensorrt.python.ops import trt_ops
-# pylint: enable=unused-import
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.compiler.tensorrt import trt_convert
+from tensorflow.python.compiler.tensorrt.wrap_conversion import clear_test_values
+from tensorflow.python.compiler.tensorrt.wrap_conversion import enable_test_value
+from tensorflow.python.compiler.tensorrt.wrap_conversion import get_test_value
+from tensorflow.python.compiler.tensorrt.wrap_conversion import is_tensorrt_enabled
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import importer
@@ -110,7 +111,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
 
   @property
   def trt_incompatible_op(self):
-    return math_ops.sin
+    return math_ops.erf
 
   @property
   def precision_modes(self):
@@ -151,7 +152,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   def setUpClass(cls):
     """Setup method for the module."""
     super(TfTrtIntegrationTestBase, cls).setUpClass()
-    trt_convert.enable_test_value()
+    enable_test_value()
 
   def __init__(self, methodName="runTest"):  # pylint: disable=invalid-name
     super(TfTrtIntegrationTestBase, self).__init__(methodName)
@@ -161,7 +162,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     """Setup method."""
     super(TfTrtIntegrationTestBase, self).setUp()
     warnings.simplefilter("always")
-    trt_convert.clear_test_values("")
+    clear_test_values("")
 
   def GetParams(self):
     """Return a TfTrtIntegrationTestParams for test, implemented by subclass."""
@@ -246,9 +247,9 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   def _PrepareRun(self, graph_state):
     """Set up necessary testing environment before calling sess.run()."""
     # Clear test values added by TRTEngineOp.
-    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteTrtEngine")
-    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteCalibration")
-    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteNativeSegment")
+    clear_test_values("TRTEngineOp_.*:ExecuteTrtEngine")
+    clear_test_values("TRTEngineOp_.*:ExecuteCalibration")
+    clear_test_values("TRTEngineOp_.*:ExecuteNativeSegment")
 
   def _GetGPUOptions(self):
     gpu_options = config_pb2.GPUOptions()
@@ -258,8 +259,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   def _GetConfigProto(self, run_params, graph_state):
     """Get config proto based on specific settings."""
     conversion_params = self.GetConversionParams(run_params)
-    if graph_state != GraphState.ORIGINAL and run_params.use_optimizer:
-      rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
+    if graph_state == GraphState.INFERENCE and run_params.use_optimizer:
+      rewriter_cfg = trt_convert.TrtGraphConverter.get_tensorrt_rewriter_config(
           conversion_params.rewriter_config, conversion_params.max_batch_size,
           conversion_params.max_workspace_size_bytes,
           conversion_params.precision_mode,
@@ -282,7 +283,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
 
   def _ExpectTestValue(self, engine_name, method, expected_value):
     label = "%s:%s" % (engine_name, method)
-    actual_value = trt_convert.get_test_value(label)
+    actual_value = get_test_value(label)
     self.assertEqual(
         expected_value,
         actual_value,
@@ -298,6 +299,23 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   def _ExpectNativeSegment(self, engine_name, value):
     self._ExpectTestValue(engine_name, "ExecuteNativeSegment", value)
 
+  def _GetFeedNames(self):
+    params = self._GetParamsCached()
+    # Construct the feeds tensor names by appending :0 to the node names.
+    return [input_name + ":0" for input_name in params.input_names]
+
+  def _GetFetchNames(self):
+    params = self._GetParamsCached()
+    # Construct the fetches tensor names by appending :0 to the node names.
+    return [output_name + ":0" for output_name in params.output_names]
+
+  def _GetFeedDict(self, inputs_data, input_shape_index):
+    assert input_shape_index < len(inputs_data)
+    feeds = self._GetFeedNames()
+    return {
+        feeds[i]: inputs_data[input_shape_index][i] for i in range(len(feeds))
+    }
+
   def _RunGraph(self,
                 run_params,
                 gdef,
@@ -307,32 +325,23 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
                 num_runs=2):
     """Run given graphdef multiple times."""
     params = self._GetParamsCached()
-    for current_input_data in inputs_data:
-      assert len(params.input_names) == len(current_input_data)
+    for data in inputs_data:
+      assert len(params.input_names) == len(data)
 
-    vals = []
+    fetches = self._GetFetchNames()
     g = ops.Graph()
     with g.as_default():
-      io_ops = importer.import_graph_def(
-          graph_def=gdef,
-          return_elements=params.input_names + params.output_names,
-          name="")
-      inputs = [op.outputs[0] for op in io_ops[:len(params.input_names)]]
-      for current_input_data in inputs_data:
-        assert len(inputs) == len(current_input_data)
-      outputs = [op.outputs[0] for op in io_ops[len(params.input_names):]]
-      with self.test_session(
+      importer.import_graph_def(graph_def=gdef, name="")
+      with self.session(
           graph=g, config=config, use_gpu=True, force_gpu=True) as sess:
+        vals = []
         # Run for each input(s) shape
         for shape_index in range(len(inputs_data)):
           val = None
-          # Defaults to 2 runs to verify result across multiple runs is same.
           for _ in range(num_runs):
             self._PrepareRun(graph_state)
-            new_val = sess.run(outputs, {
-                inputs[i]: inputs_data[shape_index][i]
-                for i in range(len(inputs))
-            })
+            new_val = sess.run(fetches,
+                               self._GetFeedDict(inputs_data, shape_index))
             output_len = len(params.expected_output_dims[shape_index])
             self.assertEqual(output_len, len(new_val))
             for i in range(output_len):
@@ -344,25 +353,15 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
             val = new_val
             self.VerifyRun(run_params, graph_state)
           vals.append(val)
-    return vals
-
-  # Use real data that is representative of the inference dataset
-  # for calibration. For this test script it is random data.
-  def _RunCalibration(self, run_params, gdef, inputs_data, config):
-    """Run calibration on given graph."""
-    return self._RunGraph(
-        run_params, gdef, inputs_data, config, GraphState.CALIBRATE, num_runs=5)
+        return vals
 
-  def _GetTrtGraphDef(self, run_params, graph_state, gdef):
-    """Return trt converted graphdef."""
+  def _CreateConverter(self, gdef, session_config, conversion_params):
+    """Return a TrtGraphConverter."""
     params = self._GetParamsCached()
-    conversion_params = self.GetConversionParams(run_params)
-    logging.info(conversion_params)
-
-    config_for_trt = self._GetConfigProto(run_params, graph_state)
-    return trt_convert.create_inference_graph(
+    converter = trt_convert.TrtGraphConverter(
         input_graph_def=gdef,
-        outputs=params.input_names + params.output_names,
+        nodes_blacklist=params.input_names + params.output_names,
+        session_config=session_config,
         max_batch_size=conversion_params.max_batch_size,
         max_workspace_size_bytes=conversion_params.max_workspace_size_bytes,
         precision_mode=conversion_params.precision_mode,
@@ -370,8 +369,42 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         is_dynamic_op=conversion_params.is_dynamic_op,
         maximum_cached_engines=conversion_params.maximum_cached_engines,
         cached_engine_batches=conversion_params.cached_engine_batches,
-        use_calibration=conversion_params.use_calibration,
-        session_config=config_for_trt)
+        use_calibration=conversion_params.use_calibration)
+    return converter
+
+  def _GetCalibratedInferGraph(self, run_params, gdef, inputs_data):
+    """Return trt converted graphdef in INT8 mode."""
+    conversion_params = self.GetConversionParams(run_params)
+    logging.info(conversion_params)
+    assert conversion_params.precision_mode == "INT8"
+    assert conversion_params.is_dynamic_op
+    assert conversion_params.maximum_cached_engines == 1
+    assert not conversion_params.cached_engine_batches
+    assert conversion_params.use_calibration
+    assert len(inputs_data) == 1  # We only support calibrating single engine.
+
+    session_config = self._GetConfigProto(run_params, GraphState.CALIBRATE)
+    logging.info("Running calibration graph, config:\n%s", str(session_config))
+
+    converter = self._CreateConverter(gdef, session_config, conversion_params)
+    int8_gdef = converter.convert()
+    self._VerifyGraphDef(run_params, int8_gdef, GraphState.CALIBRATE)
+
+    return converter.calibrate(
+        fetch_names=self._GetFetchNames(),
+        num_runs=5,
+        feed_dict_fn=lambda: self._GetFeedDict(inputs_data, 0))
+
+  def _GetInferGraph(self, run_params, gdef):
+    """Return trt converted graphdef."""
+    conversion_params = self.GetConversionParams(run_params)
+    logging.info(conversion_params)
+
+    session_config = self._GetConfigProto(run_params, GraphState.INFERENCE)
+    logging.info("Creating TRT graph for inference, config\n%s",
+                 str(session_config))
+    converter = self._CreateConverter(gdef, session_config, conversion_params)
+    return converter.convert()
 
   def _WriteGraph(self, run_params, gdef, graph_state):
     if graph_state == GraphState.ORIGINAL:
@@ -454,13 +487,14 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
 
     expected_engines = self.ExpectedEnginesToBuild(run_params)
     num_engines = 0
+    functions = [f.signature.name for f in gdef.library.function]
     for node in gdef.node:
       if node.op == "TRTEngineOp":
         logging.info("Found TRTEngineOp: " + node.name)
-    for node in gdef.node:
-      if node.op == "TRTEngineOp":
         num_engines += 1
-        self.assertTrue(node.name in expected_engines, node.name)
+        function_name = node.name + "_native_segment"
+        self.assertIn(function_name, functions)
+        self.assertIn(node.name, expected_engines)
         self.assertTrue(len(node.attr["serialized_segment"].s), node.name)
         self.assertTrue(len(node.attr["segment_funcdef_name"].s), node.name)
         self.assertEqual(
@@ -521,9 +555,10 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
             (scale * np.random.random_sample(dims)).astype(dtype))
       inputs_data.append(current_input_data)
 
+    # Verify original graph.
     self._VerifyGraphDef(run_params, input_gdef, GraphState.ORIGINAL)
 
-    # Get reference result without running trt.
+    # Run original graph without trt to get reference result.
     config_no_trt = self._GetConfigProto(run_params, GraphState.ORIGINAL)
     logging.info("Running original graph w/o trt, config:\n%s",
                  str(config_no_trt))
@@ -533,27 +568,12 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     # Run calibration if necessary.
     if (IsQuantizationMode(run_params.precision_mode) and
         run_params.use_calibration):
-
-      calib_config = self._GetConfigProto(run_params, GraphState.CALIBRATE)
-      logging.info("Running calibration graph, config:\n%s", str(calib_config))
-      if run_params.use_optimizer:
-        result = self._RunCalibration(run_params, input_gdef, inputs_data,
-                                      calib_config)
-      else:
-        calib_gdef = self._GetTrtGraphDef(run_params, GraphState.CALIBRATE,
-                                          input_gdef)
-        self._VerifyGraphDef(run_params, calib_gdef, GraphState.CALIBRATE)
-        result = self._RunCalibration(run_params, calib_gdef, inputs_data,
-                                      calib_config)
-      infer_gdef = trt_convert.calib_graph_to_infer_graph(
-          calib_gdef, run_params.dynamic_engine)
+      infer_gdef = self._GetCalibratedInferGraph(run_params, input_gdef,
+                                                 inputs_data)
+      self._VerifyGraphDef(run_params, infer_gdef, GraphState.INFERENCE)
+    elif not run_params.use_optimizer:
+      infer_gdef = self._GetInferGraph(run_params, input_gdef)
       self._VerifyGraphDef(run_params, infer_gdef, GraphState.INFERENCE)
-
-      self.assertAllClose(
-          ref_result,
-          result,
-          atol=self.ExpectedAbsoluteTolerance(run_params),
-          rtol=self.ExpectedRelativeTolerance(run_params))
     else:
       infer_gdef = input_gdef
 
@@ -561,11 +581,6 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     infer_config = self._GetConfigProto(run_params, GraphState.INFERENCE)
     logging.info("Running final inference graph, config:\n%s",
                  str(infer_config))
-    if not run_params.use_optimizer:
-      infer_gdef = self._GetTrtGraphDef(run_params, GraphState.INFERENCE,
-                                        infer_gdef)
-      self._VerifyGraphDef(run_params, infer_gdef, GraphState.INFERENCE)
-
     result = self._RunGraph(run_params, infer_gdef, inputs_data, infer_config,
                             GraphState.INFERENCE)
     self.assertAllClose(
@@ -610,9 +625,8 @@ def _AddTests(test_class):
   for (use_optimizer, precision_mode, dynamic_engine, use_calibration) in opts:
     if IsQuantizationMode(precision_mode):
       if use_optimizer:
-        # TODO(aaroey): if use_optimizer is True we need to get the inference
-        # graphdef using custom python wrapper class, which is not currently
-        # supported yet.
+        # We ignore the use_optimizer option and always use TrtGraphConverter
+        # for INT8 mode, so no need to run it twice.
         continue
       if use_calibration and not dynamic_engine:
         # Static engine with use_calibration=False will be static, so we want to
@@ -639,5 +653,5 @@ def _AddTests(test_class):
     setattr(test_class, "testTfTrt_" + test_name, _GetTest(run_params))
 
 
-if trt_convert.is_tensorrt_enabled():
+if is_tensorrt_enabled():
   _AddTests(TfTrtIntegrationTestBase)
diff --git a/tensorflow/python/compiler/tensorrt/trt_conversion.i b/tensorflow/python/compiler/tensorrt/trt_conversion.i
index c12895c730047898f366bf651c798c1f1c5b93f7..35a6fa137d02d968f8929409709cb669f5c619cb 100644
--- a/tensorflow/python/compiler/tensorrt/trt_conversion.i
+++ b/tensorflow/python/compiler/tensorrt/trt_conversion.i
@@ -17,38 +17,10 @@ limitations under the License.
 %{
 #define SWIG_FILE_WITH_INIT
 %}
-%include "std_pair.i"
+%include "std_string.i"
 %include "tensorflow/python/platform/base.i"
 
 %{
-PyObject* pair_helper(std::pair<string, string>* in) {
-  PyObject *first(nullptr), *second(nullptr), *tuple(nullptr);
-  first = PyBytes_FromStringAndSize(in->first.data(), in->first.length());
-  if (!first) {
-    if (!PyErr_Occurred()) {
-      PyErr_SetString(PyExc_TypeError, "Pair conversion first argument failed");
-    }
-    return NULL;
-  }
-  second = PyBytes_FromStringAndSize(in->second.data(), in->second.length());
-  if (!second) {
-    if (!PyErr_Occurred()) {
-      PyErr_SetString(PyExc_TypeError,
-                      "Pair conversion second argument failed");
-    }
-    return NULL;
-  }
-  tuple = Py_BuildValue("(OO)", first, second);
-  if (!tuple) {
-    if (!PyErr_Occurred()) {
-      PyErr_SetString(PyExc_TypeError,
-                      "Tuple creation from pair<string,string> failed!");
-    }
-    return NULL;
-  }
-  return tuple;
-}
-
 struct version_struct{
   int vmajor;
   int vminor;
@@ -67,6 +39,7 @@ PyObject* version_helper(version_struct* in) {
   }
   return tuple;
 }
+
 /* Define converters for vector<int> */
 template<>
 bool _PyObjAs(PyObject *pyobj, int* dest) {
@@ -83,12 +56,6 @@ PyObject *_PyObjFrom(const int& src) {
 
 _LIST_OUTPUT_TYPEMAP(int, PyLong_FromLong);
 
-%typemap(out) std::pair<string, string> {
-  PyObject *tuple = pair_helper(&$1);
-  if (!tuple) SWIG_fail;
-  $result = tuple;
-}
-
 %typemap(out) version_struct {
   PyObject *tuple = version_helper(&$1);
   if (!tuple) SWIG_fail;
@@ -96,9 +63,6 @@ _LIST_OUTPUT_TYPEMAP(int, PyLong_FromLong);
 }
 
 %{
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/util/stat_summarizer.h"
 #include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
 #include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
 #include "tensorflow/compiler/tf2tensorrt/utils/test_utils.h"
@@ -106,7 +70,6 @@ _LIST_OUTPUT_TYPEMAP(int, PyLong_FromLong);
 
 %ignoreall
 %unignore tensorflow;
-%unignore calib_convert;
 %unignore get_linked_tensorrt_version;
 %unignore get_loaded_tensorrt_version;
 %unignore is_tensorrt_enabled;
@@ -117,52 +80,6 @@ _LIST_OUTPUT_TYPEMAP(int, PyLong_FromLong);
 
 %{
 
-std::pair<string, string> calib_convert(
-    string graph_def_string, bool is_dyn_op
-    // unfortunately we can't use TF_Status here since it
-    // is in c/c_api and brings in a lot of other libraries
-    // which in turn declare ops. These ops are included
-    // statically in our library and cause an abort when
-    // module is loaded due to double registration
-    // until Tensorflow properly exposes these headers
-    // we have to work around this by returning a string
-    // and converting it to exception on python side.
-    //,TF_Status* out_status) {
-) {
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
-  string out_status;
-
-  tensorflow::GraphDef graph_def;
-  if (!graph_def.ParseFromString(graph_def_string)) {
-    out_status = "InvalidArgument;Couldn't interpret input as a GraphDef";
-    return std::pair<string, string>{out_status, ""};
-  }
-  graph_def_string.resize(0);
-  tensorflow::GraphDef out_graph;
-  tensorflow::Status conversion_status =
-      tensorflow::tensorrt::convert::ConvertCalibGraphToInferGraph(
-          graph_def, &out_graph, is_dyn_op);
-  if (!conversion_status.ok()) {
-    auto retCode = (int)conversion_status.code();
-    char buff[2000];
-    snprintf(buff, 2000, "%d;%s", retCode,
-             conversion_status.error_message().c_str());
-    out_status = buff;
-    return std::pair<string, string>{out_status, ""};
-  }
-  string result;
-  if (!out_graph.SerializeToString(&result)) {
-    out_status = "InvalidArgument;Couldn't serialize output as a GraphDef";
-    return std::pair<string, string>{out_status, ""};
-  }
-  out_status = "OK;All good!";
-  return std::pair<string, string>{out_status, result};
-#else
-  // Returns FAILED_PRECONDITION.
-  return std::pair<string, string>{"9;TensorRT is not enabled!", ""};
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
-}
-
 version_struct get_linked_tensorrt_version() {
   // Return the version at the link time.
   version_struct s;
@@ -221,8 +138,6 @@ PyObject* get_test_value(PyObject* label) {
 
 %}
 
-std::pair<string, string> calib_convert(
-    string graph_def_string, bool is_dyn_op);
 version_struct get_linked_tensorrt_version();
 version_struct get_loaded_tensorrt_version();
 bool is_tensorrt_enabled();
diff --git a/tensorflow/python/compiler/tensorrt/trt_convert.py b/tensorflow/python/compiler/tensorrt/trt_convert.py
index ca7872b478c80b4f863724f5228ec85497b4e8e7..0caa2bfaf3b29c26111b554cd30e0f1ab5cebe78 100644
--- a/tensorflow/python/compiler/tensorrt/trt_convert.py
+++ b/tensorflow/python/compiler/tensorrt/trt_convert.py
@@ -19,29 +19,20 @@ from __future__ import division
 from __future__ import print_function
 
 import six as _six
-# pylint: disable=unused-import,line-too-long
-from tensorflow.python.compiler.tensorrt.wrap_conversion import add_test_value
-from tensorflow.python.compiler.tensorrt.wrap_conversion import calib_convert
-from tensorflow.python.compiler.tensorrt.wrap_conversion import clear_test_values
-from tensorflow.python.compiler.tensorrt.wrap_conversion import enable_test_value
-from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
-from tensorflow.python.compiler.tensorrt.wrap_conversion import get_loaded_tensorrt_version
-from tensorflow.python.compiler.tensorrt.wrap_conversion import get_test_value
-from tensorflow.python.compiler.tensorrt.wrap_conversion import is_tensorrt_enabled
-# pylint: enable=unused-import,line-too-long
-from tensorflow.core.framework import graph_pb2
+from tensorflow.compiler.tf2tensorrt.python.ops import trt_ops
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
-from tensorflow.python.framework import errors_impl as _impl
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_util
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.saved_model import builder
-from tensorflow.python.saved_model import loader_impl
+from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.training import saver
 
@@ -60,6 +51,303 @@ def _to_string(s):
   return s
 
 
+class GraphConverter(object):
+  """Base class for offline converters to optimize SavedModels/GraphDefs.
+
+  A `GraphConverter` object encapsulates the environment to convert (optimize) a
+  TensorFlow SavedModel or GraphDef.
+
+  To create a custom GraphConverter:
+
+  ```python
+  class MyGraphConverter(GraphConverter):
+    ...
+
+    def get_rewriter_config(self, rewriter_config_template=None):
+      my_rewriter_config = ...
+      return my_rewriter_config
+  ```
+
+  Then to run the conversion without quantization calibration:
+
+  ```python
+  my_converter = MyGraphConverter(input_saved_model_dir="my_dir")
+  converted_graph_def = my_converter.convert()
+  my_converter.save(output_saved_model_dir)  # Optional
+  ```
+
+  To run the conversion with quantization calibration:
+
+  ```python
+  my_converter = MyGraphConverter(input_saved_model_dir="my_dir")
+  my_converter.convert()
+
+  # Run calibration 10 times.
+  converted_graph_def = my_converter.calibrate(
+      fetch_names=['output:0'],
+      num_runs=10,
+      feed_dict_fn=lambda: {'input:0': my_next_data()})
+
+  my_converter.save(output_saved_model_dir)  # Optional
+  ```
+  """
+
+  def __init__(self,
+               input_saved_model_dir=None,
+               input_saved_model_tags=None,
+               input_graph_def=None,
+               nodes_blacklist=None,
+               session_config=None):
+    """Initialize the converter.
+
+    Args:
+      input_saved_model_dir: the directory to load the SavedModel which contains
+        the input graph to transforms. Used only when input_graph_def is None.
+      input_saved_model_tags: list of tags to load the SavedModel.
+      input_graph_def: a GraphDef object containing a model to be transformed.
+        If set to None, the graph will be read from the SavedModel loaded from
+        input_saved_model_dir.
+      nodes_blacklist: list of node names to prevent the converter from
+        touching. Only used when input_graph_def is not None.
+      session_config: the ConfigProto used to create a Session. It's also used
+        as a template to create a RewriterConfig for conversion. If not
+        specified, a default ConfigProto will be used.
+
+    Raises:
+      ValueError: if the combination of the parameters is invalid.
+    """
+    if input_graph_def and input_saved_model_dir:
+      raise ValueError(
+          "Can only specify one of input_graph_def and input_saved_model_dir")
+    if not input_graph_def and not input_saved_model_dir:
+      raise ValueError("Must specify one of input_graph_def and "
+                       "input_saved_model_dir")
+
+    self._input_graph_def = input_graph_def
+    self._nodes_blacklist = nodes_blacklist
+    self._input_saved_model_dir = input_saved_model_dir
+    self._converted = False
+    self._grappler_meta_graph_def = None
+
+    self._input_saved_model_tags = (
+        input_saved_model_tags or [tag_constants.SERVING])
+    self._session_config = session_config or config_pb2.ConfigProto()
+
+    # For calibration usage.
+    self._calibration_graph = None
+    self._calibration_sess = None
+    self._calibration_data_collected = False
+
+  def get_rewriter_config(self, rewriter_config_template=None):
+    """Returns a RewriterConfig proto for TRT transformation.
+
+    Args:
+      rewriter_config_template: a template RewriterConfig proto used to create a
+        RewriterConfig for the conversion. The implementation should not modify
+        the template. If None, it will use a default one.
+
+    Returns:
+      A RewriterConfig proto which will be used to run the conversion using
+      Grappler.
+    """
+    raise NotImplementedError("get_rewriter_config")
+
+  def _run_conversion(self):
+    """Run Grappler's OptimizeGraph() tool to convert the graph."""
+    # Create custom ConfigProto for Grappler.
+    grappler_session_config = config_pb2.ConfigProto()
+    grappler_session_config.CopyFrom(self._session_config)
+    rewriter_config = None
+    if (grappler_session_config.HasField("graph_options") and
+        grappler_session_config.graph_options.HasField("rewrite_options")):
+      rewriter_config = grappler_session_config.graph_options.rewrite_options
+    custom_rewriter_config = self.get_rewriter_config(rewriter_config)
+    grappler_session_config.graph_options.rewrite_options.CopyFrom(
+        custom_rewriter_config)
+
+    # Run Grappler.
+    self._converted_graph_def = tf_optimizer.OptimizeGraph(
+        grappler_session_config,
+        self._grappler_meta_graph_def,
+        graph_id=b"tf_graph")
+    self._converted = True
+
+  def _convert_graph_def(self):
+    """Convert the input GraphDef."""
+    graph = ops.Graph()
+    with graph.as_default():
+      importer.import_graph_def(self._input_graph_def, name="")
+    self._grappler_meta_graph_def = saver.export_meta_graph(
+        graph_def=graph.as_graph_def(add_shapes=True), graph=graph)
+    if self._nodes_blacklist:
+      output_collection = meta_graph_pb2.CollectionDef()
+      output_list = output_collection.node_list.value
+      for i in self._nodes_blacklist:
+        if isinstance(i, ops.Tensor):
+          output_list.append(_to_bytes(i.name))
+        else:
+          output_list.append(_to_bytes(i))
+      # TODO(laigd): use another key as the self._nodes_blacklist are really
+      # not train_op.
+      self._grappler_meta_graph_def.collection_def["train_op"].CopyFrom(
+          output_collection)
+
+    self._run_conversion()
+
+  def _convert_saved_model(self):
+    """Convert the input SavedModel."""
+    graph = ops.Graph()
+    with session.Session(graph=graph, config=self._session_config) as sess:
+      input_meta_graph_def = loader.load(sess, self._input_saved_model_tags,
+                                         self._input_saved_model_dir)
+
+      def _gather_names(tensor_info):
+        """Get the node names from a TensorInfo."""
+        return set([tensor_info[key].name.split(":")[0] for key in tensor_info])
+
+      # Get input and outputs from all SignatureDef.
+      output_node_names = set()
+      for key in input_meta_graph_def.signature_def:
+        signature_def = input_meta_graph_def.signature_def[key]
+        output_node_names.update(_gather_names(signature_def.inputs))
+        output_node_names.update(_gather_names(signature_def.outputs))
+
+      # Freeze the variables in the SavedModel graph and copy the frozen
+      # graph over.
+      frozen_graph_def = graph_util.convert_variables_to_constants(
+          sess, sess.graph.as_graph_def(add_shapes=True),
+          list(output_node_names))
+      self._grappler_meta_graph_def = meta_graph_pb2.MetaGraphDef()
+      self._grappler_meta_graph_def.graph_def.CopyFrom(frozen_graph_def)
+
+      # Copy the collections that are not variables.
+      for key in input_meta_graph_def.collection_def:
+        # TODO(laigd): currently we use the collection key to filter out
+        # collections that depend on variable ops, but this may miss some
+        # other user-defined collections. A better way would be to use
+        # CollectionDef::NodeList for the filtering.
+        if key not in [
+            "variables", "local_variables", "model_variables",
+            "trainable_variables", "train_op", "table_initializer"
+        ]:
+          self._grappler_meta_graph_def.collection_def[key].CopyFrom(
+              input_meta_graph_def.collection_def[key])
+
+      # Copy other information.
+      self._grappler_meta_graph_def.meta_info_def.CopyFrom(
+          input_meta_graph_def.meta_info_def)
+      for key in input_meta_graph_def.signature_def:
+        self._grappler_meta_graph_def.signature_def[key].CopyFrom(
+            input_meta_graph_def.signature_def[key])
+      # TODO(laigd): maybe add back AssetFileDef.
+
+    self._run_conversion()
+
+  def convert(self):
+    """Run the conversion.
+
+    Returns:
+      The converted GraphDef.
+    """
+    assert not self._converted
+
+    if self._input_graph_def:
+      self._convert_graph_def()
+    else:
+      self._convert_saved_model()
+    return self._converted_graph_def
+
+  def calibrate(self,
+                fetch_names,
+                num_runs,
+                feed_dict_fn=None,
+                input_map_fn=None):
+    """Run the calibration and return the calibrated GraphDef.
+
+    Args:
+      fetch_names: a list of output tensor name to fetch during calibration.
+      num_runs: number of runs of the graph during calibration.
+      feed_dict_fn: a function that returns a dictionary mapping input names (as
+        strings) in the GraphDef to be calibrated to values (e.g. Python list,
+        numpy arrays, etc). One and only one of `feed_dict_fn` and
+        `input_map_fn` should be specified.
+      input_map_fn: a function that returns a dictionary mapping input names (as
+        strings) in the GraphDef to be calibrated to Tensor objects. The values
+        of the named input tensors in the GraphDef to be calibrated will be
+        re-mapped to the respective `Tensor` values during calibration. One and
+        only one of `feed_dict_fn` and `input_map_fn` should be specified.
+
+    Raises:
+      ValueError: if the input combination is invalid.
+
+    Returns:
+      The GraphDef after the calibration.
+    """
+    assert self._converted
+    assert not self._calibration_sess
+    if (feed_dict_fn and input_map_fn) or (not feed_dict_fn and
+                                           not input_map_fn):
+      raise ValueError(
+          "Should specify one and only one of feed_dict_fn and input_map_fn.")
+
+    self._calibration_graph = ops.Graph()
+    with self._calibration_graph.as_default():
+      fetches = importer.import_graph_def(
+          self._converted_graph_def,
+          input_map=input_map_fn() if input_map_fn else None,
+          return_elements=fetch_names,
+          name="")
+    self._calibration_sess = session.Session(
+        graph=self._calibration_graph, config=self._session_config)
+
+    for _ in range(num_runs):
+      self._calibration_sess.run(
+          fetches, feed_dict=feed_dict_fn() if feed_dict_fn else None)
+
+    self.finalize_calibration()
+    return self._converted_graph_def
+
+  def finalize_calibration(self):
+    """Clean up calibration resources and finalize the calibration.
+
+    Implementations need to close self._calibration_sess before returning.
+    """
+    raise NotImplementedError("finalize_calibration")
+
+  def save(self, output_saved_model_dir):
+    """Save the converted graph as a SavedModel.
+
+    Args:
+      output_saved_model_dir: construct a SavedModel using the converted
+        GraphDef and save it to the specified directory. This option only works
+        when the input graph is loaded from a SavedModel, i.e. when
+        input_saved_model_dir is specified and input_graph_def is None in
+        __init__().
+
+    Raises:
+      ValueError: if the input to the converter is a GraphDef instead of a
+      SavedModel.
+    """
+    assert self._converted
+
+    if self._input_graph_def:
+      raise ValueError(
+          "Not able to save to a SavedModel since input is a GraphDef")
+
+    # Write the transformed graphdef as SavedModel.
+    saved_model_builder = builder.SavedModelBuilder(output_saved_model_dir)
+    with ops.Graph().as_default():
+      importer.import_graph_def(self._converted_graph_def, name="")
+      # We don't use any specific converter here.
+      with session.Session(config=self._session_config) as sess:
+        saved_model_builder.add_meta_graph_and_variables(
+            sess,
+            self._input_saved_model_tags,
+            signature_def_map=self._grappler_meta_graph_def.signature_def)
+    # Ignore other meta graphs from the input SavedModel.
+    saved_model_builder.save()
+
+
 class TrtPrecisionMode(object):
   FP32 = "FP32"
   FP16 = "FP16"
@@ -70,112 +358,326 @@ class TrtPrecisionMode(object):
     return [TrtPrecisionMode.FP32, TrtPrecisionMode.FP16, TrtPrecisionMode.INT8]
 
 
-def get_tensorrt_rewriter_config(rewriter_config=None,
-                                 max_batch_size=1,
-                                 max_workspace_size_bytes=2 << 20,
-                                 precision_mode=TrtPrecisionMode.FP32,
-                                 minimum_segment_size=3,
-                                 is_dynamic_op=False,
-                                 maximum_cached_engines=1,
-                                 cached_engine_batches=None,
-                                 use_calibration=True):
-  """Returns a RewriterConfig proto for TRT transformation.
-
-  Args:
-    rewriter_config: a template RewriterConfig proto used to create a
-      TRT-enabled RewriterConfig. If None, it will use a default one.
-    max_batch_size: max size for the input batch
-    max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
-      engine can use at execution time. This corresponds to the 'workspaceSize'
-      parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
-    precision_mode: one of TrtPrecisionMode.supported_precision_modes().
-    minimum_segment_size: the minimum number of nodes required for a subgraph to
-      be replaced by TRTEngineOp.
-    is_dynamic_op: whether to generate dynamic TRT ops which will build the TRT
-      network and engine at run time.
-    maximum_cached_engines: max number of cached TRT engines in dynamic TRT ops.
-      If the number of cached engines is already at max but none of them can
-      serve the input, the TRTEngineOp will fall back to run the TF function
-      based on which the TRTEngineOp is created.
-    cached_engine_batches: a list of batch sizes used to create cached
-      engines, only used when is_dynamic_op is True. The length of the list
-      should be <= maximum_cached_engines, and the dynamic TRT op will
-      use this list to determine the batch sizes of the cached engines, instead
-      of making the decision on the fly. This is useful when we know the most
-      common batch size(s) the application is going to generate.
-    use_calibration: this argument is ignored if precision_mode is not INT8. If
-      set to True, a calibration graph will be created to calibrate the missing
-      ranges. The calibration graph must be converted to an inference graph
-      using calib_graph_to_infer_graph() after running calibration. if set to
-      False, quantization nodes will be expected for every tensor in the graph
-      (exlcuding those which will be fused). If a range is missing, an error
-      will occur. Please note that accuracy may be negatively affected if there
-      is a mismatch between which tensors TRT quantizes and which tensors were
-      trained with fake quantization.
-
-  Returns:
-    A RewriterConfig proto which sets a TensorRTOptimizer to run Grappler.
-
-  Raises:
-    TypeError: if any of the parameters are of unexpected type.
-    ValueError: if any of the parameters are of unexpected value.
-  """
-  if rewriter_config is not None and not isinstance(
-      rewriter_config, rewriter_config_pb2.RewriterConfig):
-    raise TypeError("rewriter_config should be a RewriterConfig proto.")
-
-  rewriter_config_with_trt = rewriter_config_pb2.RewriterConfig()
-  if rewriter_config is None:
-    # Layout optimizer may add Const nodes followed by Reshape nodes, thus we
-    # need to run constant folding again.
-    rewriter_config_with_trt.optimizers.extend(
-        ["constfold", "layout", "constfold"])
-    rewriter_config_with_trt.meta_optimizer_iterations = (
-        rewriter_config_pb2.RewriterConfig.ONE)
-  else:
-    rewriter_config_with_trt.CopyFrom(rewriter_config)
-
-  if precision_mode.upper() not in TrtPrecisionMode.supported_precision_modes():
-    raise ValueError(("precision mode '{}' is not supported."
-                      "It should be one of {}").format(
-                          precision_mode,
-                          TrtPrecisionMode.supported_precision_modes))
-
-  optimizer = rewriter_config_with_trt.custom_optimizers.add()
-  optimizer.name = "TensorRTOptimizer"
-  optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size
-  optimizer.parameter_map["max_batch_size"].i = max_batch_size
-  optimizer.parameter_map["is_dynamic_op"].b = is_dynamic_op
-  optimizer.parameter_map[
-      "max_workspace_size_bytes"].i = max_workspace_size_bytes
-  optimizer.parameter_map["precision_mode"].s = _to_bytes(precision_mode)
-  optimizer.parameter_map["maximum_cached_engines"].i = maximum_cached_engines
-  if cached_engine_batches:
-    if not isinstance(cached_engine_batches, list):
-      raise TypeError("cached_engine_batches should be a list.")
-    if len(cached_engine_batches) > maximum_cached_engines:
-      raise ValueError("cached_engine_batches should not contain more than "
-                       "maximum_cached_engines items.")
-    optimizer.parameter_map["cached_engine_batches"].list.i.extend(
-        cached_engine_batches)
-  optimizer.parameter_map["use_calibration"].b = use_calibration
-  return rewriter_config_with_trt
-
-
-def create_inference_graph(input_graph_def,
-                           outputs,
-                           max_batch_size=1,
-                           max_workspace_size_bytes=2 << 20,
-                           precision_mode=TrtPrecisionMode.FP32,
-                           minimum_segment_size=3,
-                           is_dynamic_op=False,
-                           maximum_cached_engines=1,
-                           cached_engine_batches=None,
-                           use_calibration=True,
-                           input_saved_model_dir=None,
-                           input_saved_model_tags=None,
-                           output_saved_model_dir=None,
-                           session_config=None):
+# Use a large enough number as the default max_workspace_size for TRT engines,
+# so it can produce reasonable performance results with the default.
+DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES = 1 << 30
+
+
+class TrtGraphConverter(GraphConverter):
+  """A GraphConverter for TRT transformation."""
+
+  _TRT_CALIBRATION_RESOURCE_CONTAINER_NAME = "TF_TRT_Calibration"
+
+  @classmethod
+  def get_tensorrt_rewriter_config(
+      cls,
+      rewriter_config_template=None,
+      max_batch_size=1,
+      max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
+      precision_mode=TrtPrecisionMode.FP32,
+      minimum_segment_size=3,
+      is_dynamic_op=False,
+      maximum_cached_engines=1,
+      cached_engine_batches=None,
+      use_calibration=True):
+    """Returns a RewriterConfig proto for TRT transformation.
+
+    Args:
+      rewriter_config_template: a template RewriterConfig proto used to create a
+        TRT-enabled RewriterConfig. If None, it will use a default one.
+      max_batch_size: max size for the input batch
+      max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
+        engine can use at execution time. This corresponds to the
+        'workspaceSize' parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
+      precision_mode: one of TrtPrecisionMode.supported_precision_modes().
+      minimum_segment_size: the minimum number of nodes required for a subgraph
+        to be replaced by TRTEngineOp.
+      is_dynamic_op: whether to generate dynamic TRT ops which will build the
+        TRT network and engine at run time.
+      maximum_cached_engines: max number of cached TRT engines in dynamic TRT
+        ops. If the number of cached engines is already at max but none of them
+        can serve the input, the TRTEngineOp will fall back to run the TF
+        function based on which the TRTEngineOp is created.
+      cached_engine_batches: a list of batch sizes used to create cached
+        engines, only used when is_dynamic_op is True. The length of the list
+        should be <= maximum_cached_engines, and the dynamic TRT op will use
+        this list to determine the batch sizes of the cached engines, instead of
+        making the decision on the fly. This is useful when we know the most
+        common batch size(s) the application is going to generate.
+      use_calibration: this argument is ignored if precision_mode is not INT8.
+        If set to True, a calibration graph will be created to calibrate the
+        missing ranges. The calibration graph must be converted to an inference
+        graph using calib_graph_to_infer_graph() after running calibration. if
+        set to False, quantization nodes will be expected for every tensor in
+        the graph (exlcuding those which will be fused). If a range is missing,
+        an error will occur. Please note that accuracy may be negatively
+        affected if there is a mismatch between which tensors TRT quantizes and
+        which tensors were trained with fake quantization.
+
+    Returns:
+      A RewriterConfig proto which sets a TensorRTOptimizer to run Grappler.
+
+    Raises:
+      TypeError: if any of the parameters are of unexpected type.
+      ValueError: if any of the parameters are of unexpected value.
+    """
+    # Lazily load the TF-TRT C bindings, so `import tensorflow` doesn't complain
+    # even if it cannot find TensorRT library.
+    trt_ops.load_trt_ops()
+    # pylint: disable=g-import-not-at-top,unused-import,line-too-long,unused-variable
+    # Import a random symbol to trigger loading of TRT library.
+    from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
+    # pylint: enable=g-import-not-at-top,unused-import,line-too-long,unused-variable
+
+    if rewriter_config_template is not None and not isinstance(
+        rewriter_config_template, rewriter_config_pb2.RewriterConfig):
+      raise TypeError(
+          "rewriter_config_template should be a RewriterConfig proto.")
+
+    rewriter_config_with_trt = rewriter_config_pb2.RewriterConfig()
+    if rewriter_config_template is None:
+      # Layout optimizer may add Const nodes followed by Reshape nodes, thus we
+      # need to run constant folding again.
+      rewriter_config_with_trt.optimizers.extend(
+          ["constfold", "layout", "constfold"])
+      rewriter_config_with_trt.meta_optimizer_iterations = (
+          rewriter_config_pb2.RewriterConfig.ONE)
+    else:
+      rewriter_config_with_trt.CopyFrom(rewriter_config_template)
+
+    optimizer = rewriter_config_with_trt.custom_optimizers.add()
+    optimizer.name = "TensorRTOptimizer"
+    optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size
+    optimizer.parameter_map["max_batch_size"].i = max_batch_size
+    optimizer.parameter_map["is_dynamic_op"].b = is_dynamic_op
+    optimizer.parameter_map[
+        "max_workspace_size_bytes"].i = max_workspace_size_bytes
+    optimizer.parameter_map["precision_mode"].s = _to_bytes(precision_mode)
+    optimizer.parameter_map["maximum_cached_engines"].i = maximum_cached_engines
+    if cached_engine_batches:
+      optimizer.parameter_map["cached_engine_batches"].list.i.extend(
+          cached_engine_batches)
+    optimizer.parameter_map["use_calibration"].b = use_calibration
+    return rewriter_config_with_trt
+
+  def __init__(self,
+               input_saved_model_dir=None,
+               input_saved_model_tags=None,
+               input_graph_def=None,
+               nodes_blacklist=None,
+               session_config=None,
+               max_batch_size=1,
+               max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
+               precision_mode=TrtPrecisionMode.FP32,
+               minimum_segment_size=3,
+               is_dynamic_op=False,
+               maximum_cached_engines=1,
+               cached_engine_batches=None,
+               use_calibration=True):
+    """Initialize the converter.
+
+    Args:
+      input_saved_model_dir: the directory to load the SavedModel which contains
+        the input graph to transforms. Used only when input_graph_def is None.
+      input_saved_model_tags: list of tags to load the SavedModel.
+      input_graph_def: a GraphDef object containing a model to be transformed.
+        If set to None, the graph will be read from the SavedModel loaded from
+        input_saved_model_dir.
+      nodes_blacklist: list of node names to prevent the converter from
+        touching. Only used when input_graph_def is not None.
+      session_config: the ConfigProto used to create a Session. It's also used
+        as a template to create a TRT-enabled ConfigProto for conversion. If not
+        specified, a default ConfigProto will be used.
+      max_batch_size: max size for the input batch.
+      max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
+        engine can use at execution time. This corresponds to the
+        'workspaceSize' parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
+      precision_mode: one of TrtPrecisionMode.supported_precision_modes().
+      minimum_segment_size: the minimum number of nodes required for a subgraph
+        to be replaced by TRTEngineOp.
+      is_dynamic_op: whether to generate dynamic TRT ops which will build the
+        TRT network and engine at run time.
+      maximum_cached_engines: max number of cached TRT engines in dynamic TRT
+        ops. If the number of cached engines is already at max but none of them
+        can serve the input, the TRTEngineOp will fall back to run the TF
+        function based on which the TRTEngineOp is created.
+      cached_engine_batches: a list of batch sizes used to create cached
+        engines, only used when is_dynamic_op is True. The length of the list
+        should be <= maximum_cached_engines, and the dynamic TRT op will use
+        this list to determine the batch sizes of the cached engines, instead of
+        making the decision on the fly. This is useful when we know the most
+        common batch size(s) the application is going to generate.
+      use_calibration: this argument is ignored if precision_mode is not INT8.
+        If set to True, a calibration graph will be created to calibrate the
+        missing ranges. The calibration graph must be converted to an inference
+        graph using calib_graph_to_infer_graph() after running calibration. if
+        set to False, quantization nodes will be expected for every tensor in
+        the graph (exlcuding those which will be fused). If a range is missing,
+        an error will occur. Please note that accuracy may be negatively
+        affected if there is a mismatch between which tensors TRT quantizes and
+        which tensors were trained with fake quantization.
+
+    Raises:
+      ValueError: if the combination of the parameters is invalid.
+      RuntimeError: if the TensorRT library version is incompatible.
+    """
+    super(TrtGraphConverter, self).__init__(
+        input_saved_model_dir=input_saved_model_dir,
+        input_saved_model_tags=input_saved_model_tags,
+        input_graph_def=input_graph_def,
+        nodes_blacklist=nodes_blacklist,
+        session_config=session_config)
+
+    # Lazily load the TF-TRT C bindings, so `import tensorflow` doesn't complain
+    # even if it cannot find TensorRT library.
+    trt_ops.load_trt_ops()
+    # pylint: disable=g-import-not-at-top,line-too-long
+    from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
+    from tensorflow.python.compiler.tensorrt.wrap_conversion import get_loaded_tensorrt_version
+    # pylint: enable=g-import-not-at-top,line-too-long
+
+    # Check compatibility of TensorRT version.
+    compiled_version = get_linked_tensorrt_version()
+    loaded_version = get_loaded_tensorrt_version()
+    version_mismatch = False
+    if loaded_version[0] < compiled_version[0]:
+      tf_logging.error(
+          "TensorRT version mismatch. Tensorflow was compiled against " +
+          "TensorRT %s but library loaded from environment is TensorRT %s" %
+          (".".join([str(x) for x in compiled_version]),
+           ".".join([str(x) for x in loaded_version])) +
+          ". Please make sure that correct version of TensorRT " +
+          "is available in the system and added to ldconfig or LD_LIBRARY_PATH")
+      raise RuntimeError("Incompatible TensorRT library version")
+    for i in zip(loaded_version, compiled_version):
+      if i[0] != i[1]:
+        tf_logging.warn("TensorRT mismatch. Compiled against version " +
+                        "%s, but loaded %s. Things may not work" %
+                        (".".join([str(x) for x in compiled_version]),
+                         ".".join([str(x) for x in loaded_version])))
+        version_mismatch = True
+        break
+    if not version_mismatch:
+      tf_logging.info("Running against TensorRT version %s" %
+                      ".".join([str(x) for x in loaded_version]))
+
+    # Check input arguments.
+    if precision_mode not in TrtPrecisionMode.supported_precision_modes():
+      raise ValueError(("precision mode '{}' is not supported."
+                        "It should be one of {}").format(
+                            precision_mode,
+                            TrtPrecisionMode.supported_precision_modes))
+
+    if cached_engine_batches:
+      if not isinstance(cached_engine_batches, list):
+        raise TypeError("cached_engine_batches should be a list.")
+      if len(cached_engine_batches) > maximum_cached_engines:
+        raise ValueError("cached_engine_batches should not contain more than "
+                         "maximum_cached_engines items.")
+
+    self._need_calibration = (
+        precision_mode == TrtPrecisionMode.INT8 and use_calibration)
+
+    # TODO(laigd):
+    # - Get rid of is_dynamic_op option, it should always be True, and it should
+    #   accept N shapes as input.
+    # - Verify in int8 mode that maximum_cached_engines and
+    #   cached_engine_batches are set appropriately.
+    # - If it fails to build the int8 engine it should return error.
+    self._max_batch_size = max_batch_size
+    self._max_workspace_size_bytes = max_workspace_size_bytes
+    self._precision_mode = precision_mode
+    self._minimum_segment_size = minimum_segment_size
+    self._is_dynamic_op = is_dynamic_op
+    self._maximum_cached_engines = maximum_cached_engines
+    self._cached_engine_batches = cached_engine_batches
+
+  def get_rewriter_config(self, rewriter_config_template=None):
+    return TrtGraphConverter.get_tensorrt_rewriter_config(
+        rewriter_config_template,
+        max_batch_size=self._max_batch_size,
+        max_workspace_size_bytes=self._max_workspace_size_bytes,
+        precision_mode=self._precision_mode,
+        minimum_segment_size=self._minimum_segment_size,
+        is_dynamic_op=self._is_dynamic_op,
+        maximum_cached_engines=self._maximum_cached_engines,
+        cached_engine_batches=self._cached_engine_batches,
+        use_calibration=self._need_calibration)
+
+  def finalize_calibration(self):
+    assert self._need_calibration
+    assert self._converted
+    assert not self._calibration_data_collected
+
+    # Lazily load the op, since it's not available in cpu-only builds. Importing
+    # this at top will cause tests that imports TF-TRT fail when they're built
+    # and run without CUDA/GPU.
+    # pylint: disable=g-import-not-at-top,line-too-long
+    from tensorflow.compiler.tf2tensorrt.ops.gen_trt_ops import get_serialized_resource_op
+    # pylint: enable=g-import-not-at-top,line-too-long
+
+    # TODO(laigd): a better way would be to use self._calibration_sess to list
+    # all the devices, add one get_serialized_resource_op for each device, and
+    # fetch each such op for every resource until its found. This can work
+    # even when the device of the TRTEngineOp is empty or not fully specified.
+
+    # Maps device name to the corresponding get_serialized_resource_op.
+    device_to_get_resource_op_map = {}
+
+    with self._calibration_graph.as_default():
+      container_input = array_ops.placeholder(dtypes.string)
+      resource_name_input = array_ops.placeholder(dtypes.string)
+
+      for node in self._converted_graph_def.node:
+        if node.op == "TRTEngineOp":
+          # Adds the get_serialized_resource_op for the device if not done
+          # before. We only add one such op for each device.
+          # TODO(laigd): What if the device is empty?????
+          if node.device not in device_to_get_resource_op_map:
+            with self._calibration_graph.device(node.device):
+              serialized_resources_output = (
+                  get_serialized_resource_op(container_input,
+                                             resource_name_input))
+            device_to_get_resource_op_map[node.device] = (
+                serialized_resources_output)
+
+          # Get the calibration resource.
+          calibration_result = self._calibration_sess.run(
+              device_to_get_resource_op_map[node.device],
+              feed_dict={
+                  container_input:
+                      TrtGraphConverter
+                      ._TRT_CALIBRATION_RESOURCE_CONTAINER_NAME,
+                  resource_name_input:
+                      node.name
+              })
+          node.attr["calibration_data"].s = calibration_result
+
+    self._calibration_data_collected = True
+    self._calibration_sess.close()
+
+  def save(self, output_saved_model_dir):
+    """Save the converted graph as a SavedModel."""
+    if self._need_calibration:
+      assert self._calibration_data_collected
+    super(TrtGraphConverter, self).save(output_saved_model_dir)
+
+
+def create_inference_graph(
+    input_graph_def,
+    outputs,
+    max_batch_size=1,
+    max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
+    precision_mode=TrtPrecisionMode.FP32,
+    minimum_segment_size=3,
+    is_dynamic_op=False,
+    maximum_cached_engines=1,
+    cached_engine_batches=None,
+    use_calibration=True,
+    input_saved_model_dir=None,
+    input_saved_model_tags=None,
+    output_saved_model_dir=None,
+    session_config=None):
   """Python wrapper for the TRT transformation.
 
   Args:
@@ -197,12 +699,12 @@ def create_inference_graph(input_graph_def,
       If the number of cached engines is already at max but none of them can
       serve the input, the TRTEngineOp will fall back to run the TF function
       based on which the TRTEngineOp is created.
-    cached_engine_batches: a list of batch sizes used to create cached
-      engines, only used when is_dynamic_op is True. The length of the list
-      should be <= maximum_cached_engines, and the dynamic TRT op will
-      use this list to determine the batch sizes of the cached engines, instead
-      of making the decision on the fly. This is useful when we know the most
-      common batch size(s) the application is going to generate.
+    cached_engine_batches: a list of batch sizes used to create cached engines,
+      only used when is_dynamic_op is True. The length of the list should be <=
+      maximum_cached_engines, and the dynamic TRT op will use this list to
+      determine the batch sizes of the cached engines, instead of making the
+      decision on the fly. This is useful when we know the most common batch
+      size(s) the application is going to generate.
     use_calibration: this argument is ignored if precision_mode is not INT8. If
       set to True, a calibration graph will be created to calibrate the missing
       ranges. The calibration graph must be converted to an inference graph
@@ -246,183 +748,21 @@ def create_inference_graph(input_graph_def,
     ValueError: if the combination of the parameters is invalid.
     RuntimeError: if the TensorRT library version is incompatible.
   """
-  compiled_version = get_linked_tensorrt_version()
-  loaded_version = get_loaded_tensorrt_version()
-  version_mismatch = False
-  if loaded_version[0] < compiled_version[0]:
-    tf_logging.error(
-        "TensorRT version mismatch. Tensorflow was compiled against " +
-        "TensorRT %s but library loaded from environment is TensorRT %s" %
-        (".".join([str(x) for x in compiled_version]),
-         ".".join([str(x) for x in loaded_version])) +
-        ". Please make sure that correct version of TensorRT " +
-        "is available in the system and added to ldconfig or LD_LIBRARY_PATH")
-    raise RuntimeError("Incompatible TensorRT library version")
-  for i in zip(loaded_version, compiled_version):
-    if i[0] != i[1]:
-      tf_logging.warn("TensorRT mismatch. Compiled against version " +
-                      "%s, but loaded %s. Things may not work" %
-                      (".".join([str(x) for x in compiled_version]),
-                       ".".join([str(x) for x in loaded_version])))
-      version_mismatch = True
-      break
-  if not version_mismatch:
-    tf_logging.info("Running against TensorRT version %s" % ".".join(
-        [str(x) for x in loaded_version]))
-
-  if session_config is None:
-    session_config = config_pb2.ConfigProto()
-
-  if input_saved_model_tags is None:
-    input_saved_model_tags = [tag_constants.SERVING]
-  saved_model_loader = None
-  grappler_meta_graph_def = None
-
-  if input_graph_def is None:
-    # Read from SavedModel and freeze the graph if necessary.
-    if input_saved_model_dir is None:
-      raise ValueError("input_graph_def and input_saved_model_dir cannot be "
-                       "both None")
-    with ops.Graph().as_default():
-      with session.Session(config=session_config) as sess:
-        saved_model_loader = loader_impl.SavedModelLoader(input_saved_model_dir)
-        input_meta_graph_def = saved_model_loader.load(sess,
-                                                       input_saved_model_tags)
-        output_node_names = set()
-
-        def _gather_names(tensor_info):
-          """Get the node names from a TensorInfo."""
-          return set(
-              [tensor_info[key].name.split(":")[0] for key in tensor_info])
-
-        # Get input and outputs from all SignatureDef.
-        for key in input_meta_graph_def.signature_def:
-          signature_def = input_meta_graph_def.signature_def[key]
-          output_node_names.update(_gather_names(signature_def.inputs))
-          output_node_names.update(_gather_names(signature_def.outputs))
-
-        # Freeze the variables in the SavedModel graph and copy the frozen
-        # graph over.
-        frozen_graph_def = graph_util.convert_variables_to_constants(
-            sess, sess.graph.as_graph_def(add_shapes=True),
-            list(output_node_names))
-        grappler_meta_graph_def = meta_graph_pb2.MetaGraphDef()
-        grappler_meta_graph_def.graph_def.CopyFrom(frozen_graph_def)
-
-        # Copy the collections that are not variables.
-        for key in input_meta_graph_def.collection_def:
-          # TODO(laigd): currently we use the collection key to filter out
-          # collections that depend on variable ops, but this may miss some
-          # other user-defined collections. A better way would be to use
-          # CollectionDef::NodeList for the filtering.
-          if key not in [
-              "variables", "local_variables", "model_variables",
-              "trainable_variables", "train_op", "table_initializer"
-          ]:
-            grappler_meta_graph_def.collection_def[key].CopyFrom(
-                input_meta_graph_def.collection_def[key])
-
-        # Copy other information.
-        grappler_meta_graph_def.meta_info_def.CopyFrom(
-            input_meta_graph_def.meta_info_def)
-        for key in input_meta_graph_def.signature_def:
-          grappler_meta_graph_def.signature_def[key].CopyFrom(
-              input_meta_graph_def.signature_def[key])
-        # TODO(laigd): maybe add back AssetFileDef.
-  else:
-    if output_saved_model_dir is not None:
-      raise ValueError("output_saved_model_dir cannot be set when "
-                       "input_graph_def is set")
-    # Create MetaGraphDef from input graph.
-    graph = ops.Graph()
-    with graph.as_default():
-      importer.import_graph_def(input_graph_def, name="")
-    grappler_meta_graph_def = saver.export_meta_graph(
-        graph_def=graph.as_graph_def(add_shapes=True), graph=graph)
-    if outputs:
-      output_collection = meta_graph_pb2.CollectionDef()
-      output_list = output_collection.node_list.value
-      for i in outputs:
-        if isinstance(i, ops.Tensor):
-          output_list.append(_to_bytes(i.name))
-        else:
-          output_list.append(_to_bytes(i))
-      # TODO(laigd): use another key as the outputs are really not train_op.
-      grappler_meta_graph_def.collection_def["train_op"].CopyFrom(
-          output_collection)
-
-  # Create TRT-enabled ConfigProto.
-  session_config_with_trt = config_pb2.ConfigProto()
-  session_config_with_trt.CopyFrom(session_config)
-  rewriter_config = None
-  if (session_config_with_trt.HasField("graph_options") and
-      session_config_with_trt.graph_options.HasField("rewrite_options")):
-    rewriter_config = session_config_with_trt.graph_options.rewrite_options
-  rewriter_config_with_trt = get_tensorrt_rewriter_config(
-      rewriter_config, max_batch_size, max_workspace_size_bytes, precision_mode,
-      minimum_segment_size, is_dynamic_op, maximum_cached_engines,
-      cached_engine_batches, use_calibration)
-  session_config_with_trt.graph_options.rewrite_options.CopyFrom(
-      rewriter_config_with_trt)
-
-  # Run Grappler.
-  transformed_graph_def = tf_optimizer.OptimizeGraph(
-      session_config_with_trt, grappler_meta_graph_def, graph_id=b"tf_graph")
-
-  # Optionally write the transformed graphdef as SavedModel.
-  if output_saved_model_dir is not None:
-    saved_model_builder = builder.SavedModelBuilder(output_saved_model_dir)
-    with ops.Graph().as_default():
-      importer.import_graph_def(transformed_graph_def, name="")
-      # We don't use TRT here.
-      with session.Session(config=session_config) as sess:
-        saved_model_builder.add_meta_graph_and_variables(
-            sess,
-            input_saved_model_tags,
-            signature_def_map=grappler_meta_graph_def.signature_def)
-    # Ignore other meta graphs from the input SavedModel.
-    saved_model_builder.save()
-
-  return transformed_graph_def
-
-
-def calib_graph_to_infer_graph(calibration_graph_def, is_dynamic_op=False):
-  """Convert an existing calibration graph to inference graph.
-
-  Args:
-    calibration_graph_def: the calibration GraphDef object with calibration data
-    is_dynamic_op: whether to create dynamic static engines from calibration
-
-  Returns:
-    New GraphDef with TRTEngineOps placed in graph replacing calibration nodes.
-  Raises:
-    RuntimeError: if the returned status message is malformed.
-  """
-
-  is_calib_graph = False
-  for n in calibration_graph_def.node:
-    if n.op == "TRTEngineOp":
-      is_calib_graph = is_calib_graph or not n.attr["calibration_data"].s
-  if not is_calib_graph:
-    tf_logging.error(
-        "Not a calib graph. Doesn't seem to contain any calibration nodes.")
-    return None
-  graph_str = calibration_graph_def.SerializeToString()
-  out = calib_convert(graph_str, is_dynamic_op)
-  status = _to_string(out[0])
-  output_graph_def_string = out[1]
-  del graph_str  # Save some memory
-  if len(status) < 2:
-    raise _impl.UnknownError(None, None, status)
-  if status[:2] != "OK":
-    msg = status.split(";")
-    if len(msg) == 1:
-      raise RuntimeError("Status message is malformed {}".format(status))
-    # pylint: disable=protected-access
-    raise _impl._make_specific_exception(None, None, ";".join(msg[1:]),
-                                         int(msg[0]))
-    # pylint: enable=protected-access
-  output_graph_def = graph_pb2.GraphDef()
-  output_graph_def.ParseFromString(output_graph_def_string)
-  del output_graph_def_string  # Save some memory
-  return output_graph_def
+  trt_converter = TrtGraphConverter(
+      input_saved_model_dir=input_saved_model_dir,
+      input_saved_model_tags=input_saved_model_tags,
+      input_graph_def=input_graph_def,
+      nodes_blacklist=outputs,
+      session_config=session_config,
+      max_batch_size=max_batch_size,
+      max_workspace_size_bytes=max_workspace_size_bytes,
+      precision_mode=precision_mode,
+      minimum_segment_size=minimum_segment_size,
+      is_dynamic_op=is_dynamic_op,
+      maximum_cached_engines=maximum_cached_engines,
+      cached_engine_batches=cached_engine_batches,
+      use_calibration=use_calibration)
+  converted_graph_def = trt_converter.convert()
+  if output_saved_model_dir:
+    trt_converter.save(output_saved_model_dir)
+  return converted_graph_def
diff --git a/tensorflow/python/compiler/tensorrt/trt_convert_test.py b/tensorflow/python/compiler/tensorrt/trt_convert_test.py
index 7f839117cd085b0bc544aa960a1e07efbcc7b70f..97dea1bfc8a69d8819de38f3c5d954f57a52b8de 100644
--- a/tensorflow/python/compiler/tensorrt/trt_convert_test.py
+++ b/tensorflow/python/compiler/tensorrt/trt_convert_test.py
@@ -20,9 +20,10 @@ from __future__ import print_function
 
 import os
 
-# pylint: disable=unused-import
-from tensorflow.compiler.tf2tensorrt.python.ops import trt_ops
-# pylint: enable=unused-import
+from tensorflow.python.compiler.tensorrt.wrap_conversion import clear_test_values
+from tensorflow.python.compiler.tensorrt.wrap_conversion import enable_test_value
+from tensorflow.python.compiler.tensorrt.wrap_conversion import get_test_value
+from tensorflow.python.compiler.tensorrt.wrap_conversion import is_tensorrt_enabled
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
@@ -47,10 +48,16 @@ from tensorflow.python.tools import saved_model_utils
 class TrtConvertTest(test_util.TensorFlowTestCase):
   """Class to test Tensorflow-TensorRT integration python API."""
 
+  # Use a small max_workspace_size for tests so they don't consume too much GPU
+  # memory.
+  _TRT_MAX_WORKSPACE_SIZE_BYTES = 2 << 20
+
   def testGetTensorrtRewriterConfig(self):
-    """Test case for trt_convert.get_tensorrt_rewriter_config()."""
-    rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
-        rewriter_config=None,
+    """Test case for TrtGraphConverter.get_tensorrt_rewriter_config()."""
+    if not is_tensorrt_enabled():
+      return
+    rewriter_cfg = trt_convert.TrtGraphConverter.get_tensorrt_rewriter_config(
+        rewriter_config_template=None,
         max_batch_size=128,
         max_workspace_size_bytes=1234,
         precision_mode="INT8",
@@ -94,6 +101,17 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
 
   def _GetGraph(self):
     """Get the graph for testing."""
+    # The graph computes (input+1)^2, it looks like:
+    #
+    # input (Placeholder)  v1 (Variable)
+    #               |   \ /
+    #                \   +
+    #                 \ / \
+    #                  *   |
+    #                   \ /
+    #                    +
+    #                    |
+    #                 output (Identity)
     g = ops.Graph()
     with g.as_default():
       with g.device("/GPU:0"):
@@ -140,18 +158,61 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
           signature_def_map={"mypredict": signature_def})
     saved_model_builder.save()
 
-  def _TestCreateInferenceGraph(self,
-                                input_saved_model_dir=None,
-                                output_saved_model_dir=None):
-    """General method to test trt_convert.create_inference_graph()."""
-    input_graph_def = None if input_saved_model_dir else self._GetGraphDef()
-    output_graph_def = trt_convert.create_inference_graph(
-        input_graph_def, ["output"],
+  def _ConvertGraph(self,
+                    input_saved_model_dir=None,
+                    output_saved_model_dir=None,
+                    need_calibration=False,
+                    max_batch_size=1,
+                    minimum_segment_size=3,
+                    is_dynamic_op=False,
+                    maximum_cached_engines=1):
+    """Helper method to convert a GraphDef or SavedModel using TF-TRT."""
+    converter = trt_convert.TrtGraphConverter(
+        input_saved_model_dir=input_saved_model_dir,
+        input_graph_def=None if input_saved_model_dir else self._GetGraphDef(),
+        nodes_blacklist=["output"],
+        session_config=self._GetConfigProto(),
+        max_batch_size=max_batch_size,
+        max_workspace_size_bytes=TrtConvertTest._TRT_MAX_WORKSPACE_SIZE_BYTES,
+        precision_mode=(trt_convert.TrtPrecisionMode.INT8 if need_calibration
+                        else trt_convert.TrtPrecisionMode.FP32),
+        minimum_segment_size=minimum_segment_size,
+        is_dynamic_op=is_dynamic_op,
+        maximum_cached_engines=maximum_cached_engines)
+    output_graph_def = converter.convert()
+
+    if need_calibration:
+
+      class CalibrationData(object):
+
+        def __init__(self):
+          self._data = 0
+
+        def next(self):
+          self._data += 1
+          return {"input:0": [[[self._data]]]}
+
+      output_graph_def = converter.calibrate(
+          fetch_names=["output:0"],
+          num_runs=10,
+          feed_dict_fn=CalibrationData().next)
+
+    if output_saved_model_dir is not None:
+      converter.save(output_saved_model_dir=output_saved_model_dir)
+    return output_graph_def
+
+  def _TestTrtGraphConverter(self,
+                             input_saved_model_dir=None,
+                             output_saved_model_dir=None,
+                             need_calibration=False):
+    """General method to test trt_convert.TrtGraphConverter()."""
+    output_graph_def = self._ConvertGraph(
         input_saved_model_dir=input_saved_model_dir,
         output_saved_model_dir=output_saved_model_dir,
-        session_config=self._GetConfigProto())
+        need_calibration=need_calibration)
     graph_defs_to_verify = [output_graph_def]
-    if output_saved_model_dir is not None:
+
+    if output_saved_model_dir:
       saved_model_graph_def = saved_model_utils.get_meta_graph_def(
           output_saved_model_dir, tag_constants.SERVING).graph_def
       self.assertTrue(isinstance(saved_model_graph_def, graph_pb2.GraphDef))
@@ -165,42 +226,60 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
           "output": "Identity"
       }, node_name_to_op)
 
-  def testCreateInferenceGraph_BasicConversion(self):
-    """Test case for trt_convert.create_inference_graph()."""
-    if not trt_convert.is_tensorrt_enabled():
-      return
+      if need_calibration:
+        trt_engine_nodes = [
+            node for node in graph_def.node if node.op == "TRTEngineOp"
+        ]
+        self.assertNotEmpty(trt_engine_nodes)
+        for node in trt_engine_nodes:
+          self.assertTrue(len(node.attr["calibration_data"].s))
+        # Run the calibrated graph.
+        # TODO(laigd): consider having some input where the answer is different.
+        with ops.Graph().as_default():
+          importer.import_graph_def(graph_def, name="")
+          with self.session(config=self._GetConfigProto()) as sess:
+            for test_data in range(10):
+              self.assertEqual((test_data + 1.0)**2,
+                               sess.run(
+                                   "output:0",
+                                   feed_dict={"input:0": [[[test_data]]]}))
 
-    # Use GraphDef as input.
-    self._TestCreateInferenceGraph()
+  def testTrtGraphConverter_BasicConversion(self):
+    """Test case for trt_convert.TrtGraphConverter()."""
+    if not is_tensorrt_enabled():
+      return
 
-    # Use SavedModel as input.
     tmp_dir = self.get_temp_dir()
     input_saved_model_dir = os.path.join(tmp_dir, "in_dir1")
-    output_saved_model_dir = os.path.join(tmp_dir, "out_dir1")
     self._WriteInputSavedModel(input_saved_model_dir)
-    self._TestCreateInferenceGraph(input_saved_model_dir,
-                                   output_saved_model_dir)
+
+    for need_calibration in [False, True]:
+      # Use GraphDef as input.
+      self._TestTrtGraphConverter()
+
+      # Use SavedModel as input.
+      output_saved_model_dir = os.path.join(
+          tmp_dir, "out_dir1%s" % ("_int8" if need_calibration else ""))
+      self._TestTrtGraphConverter(
+          input_saved_model_dir=input_saved_model_dir,
+          output_saved_model_dir=output_saved_model_dir,
+          need_calibration=need_calibration)
 
   def _TestRun(self, sess, batch_size, expect_engine_is_run):
-    trt_convert.clear_test_values("")
+    clear_test_values("")
     result = sess.run("output:0", feed_dict={"input:0": [[[1.0]]] * batch_size})
     self.assertAllEqual([[[4.0]]] * batch_size, result)
     execute_engine_test_value = ("done" if expect_engine_is_run else "")
     execute_native_segment_test_value = ("" if expect_engine_is_run else "done")
-    self.assertEqual(
-        execute_engine_test_value,
-        trt_convert.get_test_value("TRTEngineOp_0:ExecuteTrtEngine"))
-    self.assertEqual(
-        execute_native_segment_test_value,
-        trt_convert.get_test_value("TRTEngineOp_0:ExecuteNativeSegment"))
+    self.assertEqual(execute_engine_test_value,
+                     get_test_value("TRTEngineOp_0:ExecuteTrtEngine"))
+    self.assertEqual(execute_native_segment_test_value,
+                     get_test_value("TRTEngineOp_0:ExecuteNativeSegment"))
 
-  def testCreateInferenceGraph_MinimumSegmentSize(self):
-    if not trt_convert.is_tensorrt_enabled():
+  def testTrtGraphConverter_MinimumSegmentSize(self):
+    if not is_tensorrt_enabled():
       return
-    output_graph_def = trt_convert.create_inference_graph(
-        self._GetGraphDef(), ["output"],
-        minimum_segment_size=5,
-        is_dynamic_op=False)
+    output_graph_def = self._ConvertGraph(minimum_segment_size=5)
     node_name_to_op = {node.name: node.op for node in output_graph_def.node}
     self.assertEqual({
         "v1/read": "Const",
@@ -211,28 +290,25 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
         "output": "Identity"
     }, node_name_to_op)
 
-  def testCreateInferenceGraph_DynamicOp(self):
-    if not trt_convert.is_tensorrt_enabled():
+  def testTrtGraphConverter_DynamicOp(self):
+    if not is_tensorrt_enabled():
       return
-    trt_convert.enable_test_value()
+    enable_test_value()
 
     tmp_dir = self.get_temp_dir()
     input_saved_model_dir = os.path.join(tmp_dir, "in_dir2")
     output_saved_model_dir = os.path.join(tmp_dir, "out_dir2")
     self._WriteInputSavedModel(input_saved_model_dir)
-    output_graph_def = trt_convert.create_inference_graph(
-        None,
-        None,
-        is_dynamic_op=True,
-        maximum_cached_engines=2,
+    output_graph_def = self._ConvertGraph(
         input_saved_model_dir=input_saved_model_dir,
         output_saved_model_dir=output_saved_model_dir,
-        session_config=self._GetConfigProto())
+        is_dynamic_op=True,
+        maximum_cached_engines=2)
 
     # Test the output GraphDef.
     with ops.Graph().as_default():
       importer.import_graph_def(output_graph_def, name="")
-      with self.test_session(config=self._GetConfigProto()) as sess:
+      with self.session(config=self._GetConfigProto()) as sess:
         # Run with batch size 1, a new engine is created and cached.
         self._TestRun(sess, 1, True)
         # Run with batch size 2, a new engine is created and cached.
@@ -243,7 +319,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
 
     # Test the output SavedModel
     with ops.Graph().as_default():
-      with self.test_session(config=self._GetConfigProto()) as sess:
+      with self.session(config=self._GetConfigProto()) as sess:
         loader.load(sess, [tag_constants.SERVING], output_saved_model_dir)
         # Run with batch size 1, a new engine is created and cached.
         self._TestRun(sess, 1, True)
@@ -253,29 +329,24 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
         # the max, it should evict an old engine and create a new one.
         self._TestRun(sess, 3, True)
 
-  def testCreateInferenceGraph_StaticOp(self):
-    if not trt_convert.is_tensorrt_enabled():
+  def testTrtGraphConverter_StaticOp(self):
+    if not is_tensorrt_enabled():
       return
-    trt_convert.enable_test_value()
+    enable_test_value()
 
     tmp_dir = self.get_temp_dir()
     input_saved_model_dir = os.path.join(tmp_dir, "in_dir3")
     output_saved_model_dir = os.path.join(tmp_dir, "out_dir3")
     self._WriteInputSavedModel(input_saved_model_dir)
-    output_graph_def = trt_convert.create_inference_graph(
-        None,
-        None,
-        max_batch_size=1,
-        is_dynamic_op=False,
-        maximum_cached_engines=2,  # This is noop, added just for testing.
+    output_graph_def = self._ConvertGraph(
         input_saved_model_dir=input_saved_model_dir,
         output_saved_model_dir=output_saved_model_dir,
-        session_config=self._GetConfigProto())
+        maximum_cached_engines=2)  # This is noop, added just for testing.
 
     # Test the output GraphDef.
     with ops.Graph().as_default():
       importer.import_graph_def(output_graph_def, name="")
-      with self.test_session(config=self._GetConfigProto()) as sess:
+      with self.session(config=self._GetConfigProto()) as sess:
         # Run with batch size 1, the default engine embedded in the graphdef
         # will be used.
         self._TestRun(sess, 1, True)
@@ -285,7 +356,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
 
     # Test the output SavedModel
     with ops.Graph().as_default():
-      with self.test_session(config=self._GetConfigProto()) as sess:
+      with self.session(config=self._GetConfigProto()) as sess:
         loader.load(sess, [tag_constants.SERVING], output_saved_model_dir)
         # Run with batch size 1, the default engine embedded in the graphdef
         # will be used.
diff --git a/tensorflow/python/data/benchmarks/BUILD b/tensorflow/python/data/benchmarks/BUILD
index 739e5bab192d9188781dca8d396504fa7071786d..031476100f448528503b5bc9b7c6c360caf9f8b1 100644
--- a/tensorflow/python/data/benchmarks/BUILD
+++ b/tensorflow/python/data/benchmarks/BUILD
@@ -17,15 +17,23 @@ py_test(
     ],
 )
 
+py_library(
+    name = "benchmark_base",
+    srcs = ["benchmark_base.py"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "batch_benchmark",
     srcs = ["batch_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:session",
+        ":benchmark_base",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
@@ -37,12 +45,8 @@ py_test(
     srcs = ["filter_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:session",
+        ":benchmark_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -51,9 +55,7 @@ py_test(
     srcs = ["from_tensor_slices_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:session",
+        ":benchmark_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -64,6 +66,7 @@ py_test(
     srcs = ["list_files_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":benchmark_base",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
@@ -78,11 +81,8 @@ py_test(
     srcs = ["map_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:session",
+        ":benchmark_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -91,8 +91,7 @@ py_test(
     srcs = ["range_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:session",
+        ":benchmark_base",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
diff --git a/tensorflow/python/data/benchmarks/batch_benchmark.py b/tensorflow/python/data/benchmarks/batch_benchmark.py
index 0ccf5c57d1954078bea1fca02885824a796236f5..8cad91212a7c3699be8fcc0140505a9c8824723e 100644
--- a/tensorflow/python/data/benchmarks/batch_benchmark.py
+++ b/tensorflow/python/data/benchmarks/batch_benchmark.py
@@ -17,70 +17,37 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
 import numpy as np
 
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
 
 
-# TODO(b/119837791): Add eager benchmarks.
-class BatchBenchmark(test.Benchmark):
+class BatchBenchmark(benchmark_base.DatasetBenchmarkBase):
   """Benchmarks for `tf.data.Dataset.batch()`."""
 
-  def benchmarkBatchSparse(self):
+  def benchmark_batch_sparse(self):
     non_zeros_per_row_values = [0, 1, 5, 10, 100]
     batch_size_values = [1, 32, 64, 128, 1024]
 
-    sparse_placeholder = array_ops.sparse_placeholder(dtype=dtypes.int64)
-    batch_size_placeholder = array_ops.placeholder(dtype=dtypes.int64, shape=[])
-
-    dataset = dataset_ops.Dataset.from_tensors(sparse_placeholder).repeat(
-        ).batch(batch_size_placeholder)
-    options = dataset_ops.Options()
-    options.experimental_optimization.apply_default_optimizations = False
-    dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
     for non_zeros_per_row in non_zeros_per_row_values:
 
-      sparse_value = sparse_tensor.SparseTensorValue(
+      tensor = sparse_tensor.SparseTensor(
           indices=np.arange(non_zeros_per_row, dtype=np.int64)[:, np.newaxis],
           values=np.arange(non_zeros_per_row, dtype=np.int64),
           dense_shape=[1000])
 
       for batch_size in batch_size_values:
-
-        with session.Session() as sess:
-          sess.run(iterator.initializer, feed_dict={
-              sparse_placeholder: sparse_value,
-              batch_size_placeholder: batch_size})
-          # Run five steps to warm up the session caches before taking the
-          # first measurement.
-          for _ in range(5):
-            sess.run(next_element.indices.op)
-          deltas = []
-          for _ in range(100):
-            start = time.time()
-            for _ in range(100):
-              sess.run(next_element.indices.op)
-            end = time.time()
-            deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100.0
-
-        self.report_benchmark(
-            iters=10000,
-            wall_time=median_wall_time,
-            name="sparse_num_elements_%d_batch_size_%d" %
-            (non_zeros_per_row, batch_size))
+        dataset = dataset_ops.Dataset.from_tensors(tensor).repeat().batch(
+            batch_size)
+        self.run_and_report_benchmark(
+            dataset,
+            num_elements=100000 // batch_size,
+            iters=1,
+            name="sparse_num_elements_%d_batch_size_%d" % (non_zeros_per_row,
+                                                           batch_size))
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/benchmarks/benchmark_base.py b/tensorflow/python/data/benchmarks/benchmark_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..47f992d955ba49cf0de1bdd7cf1ee760b6af8ef6
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/benchmark_base.py
@@ -0,0 +1,92 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test utilities for tf.data benchmarking functionality."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks.
+class DatasetBenchmarkBase(test.Benchmark):
+  """Base class for dataset benchmarks."""
+
+  def run_benchmark(self, dataset, num_elements, iters=1):
+    """Benchmarks the dataset.
+
+    Runs the dataset `iters` times. In each iteration, the benchmark measures
+    the time it takes to go through `num_elements` elements of the dataset.
+
+    Args:
+      dataset: Dataset to benchmark.
+      num_elements: Number of dataset elements to iterate through each benchmark
+        iteration.
+      iters: Number of times to repeat the timing.
+
+    Returns:
+      A float, representing the per-element wall time of the dataset in seconds.
+      This is the median time (with respect to `iters`) it takes for the dataset
+      to go through `num_elements` elements, divided by `num_elements.`
+    """
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
+    # NOTE: We use `dataset.skip()` to perform the iterations in C++, avoiding
+    # the overhead of multiple `session.run()` calls. Note that this relies on
+    # the underlying implementation of `skip`: if it is optimized in the future,
+    # we will have to change this code.
+    dataset = dataset.skip(num_elements - 1)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+    next_element = nest.flatten(next_element)[0]
+
+    deltas = []
+    for _ in range(iters):
+      with session.Session() as sess:
+        # Run once to warm up the session caches.
+        sess.run(iterator.initializer)
+        sess.run(next_element)
+
+        sess.run(iterator.initializer)
+        start = time.time()
+        sess.run(next_element.op)
+        end = time.time()
+      deltas.append(end - start)
+    return np.median(deltas) / float(num_elements)
+
+  def run_and_report_benchmark(self,
+                               dataset,
+                               num_elements,
+                               name,
+                               iters=5,
+                               extras=None):
+    # Measure the per-element wall time.
+    wall_time = self.run_benchmark(dataset, num_elements, iters)
+
+    if extras is None:
+      extras = {}
+    extras["elements_per_second"] = 1 / wall_time
+    extras["num_elements"] = num_elements
+    # 'mode' represents the mechanism used for iterating over dataset elements.
+    name = "%s_mode_cpp" % name
+    self.report_benchmark(
+        wall_time=wall_time, iters=iters, name=name, extras=extras)
diff --git a/tensorflow/python/data/benchmarks/filter_benchmark.py b/tensorflow/python/data/benchmarks/filter_benchmark.py
index e0ecf19e11f95f0f2726eb0959ddc23ac9141283..eb47b4089c7f57f9426fd5dcc15b2296fdb0bd25 100644
--- a/tensorflow/python/data/benchmarks/filter_benchmark.py
+++ b/tensorflow/python/data/benchmarks/filter_benchmark.py
@@ -17,51 +17,26 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
-import numpy as np
-
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
 
 
 # TODO(b/119837791): Add eager benchmarks.
-class FilterBenchmark(test.Benchmark):
+class FilterBenchmark(benchmark_base.DatasetBenchmarkBase):
   """Benchmarks for `tf.data.Dataset.filter()`."""
 
   def _benchmark(self, predicate, name):
-    with ops.Graph().as_default():
-      dataset = (
-          dataset_ops.Dataset.from_tensors(True).repeat(None).filter(predicate))
-      options = dataset_ops.Options()
-      options.experimental_optimization.apply_default_optimizations = False
-      dataset = dataset.with_options(options)
-      iterator = dataset_ops.make_one_shot_iterator(dataset)
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for _ in range(5):
-          sess.run(next_element.op)
-        deltas = []
-        for _ in range(100):
-          start = time.time()
-          for _ in range(100):
-            sess.run(next_element.op)
-          end = time.time()
-          deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100
-        self.report_benchmark(iters=100, wall_time=median_wall_time, name=name)
+    dataset = (
+        dataset_ops.Dataset.from_tensors(True).repeat(None).filter(predicate))
+    self.run_and_report_benchmark(dataset, num_elements=100000, name=name)
 
-  def benchmarkSimpleFunction(self):
+  def benchmark_simple_function(self):
     self._benchmark(array_ops.identity, "simple_function")
 
-  def benchmarkReturnComponentOptimization(self):
+  def benchmark_return_component_optimization(self):
     self._benchmark(lambda x: x, "return_component")
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
index 4e5559ddbafb2ee0501ec9c87a98b314594cdc75..3af174acc320186ae368f23145bb9700e4d3aaa1 100644
--- a/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
+++ b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
@@ -17,170 +17,70 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
 import numpy as np
 
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import test
 
 
 # TODO(b/119837791): Add eager benchmarks.
-class FromTensorSlicesBenchmark(test.Benchmark):
+class FromTensorSlicesBenchmark(benchmark_base.DatasetBenchmarkBase):
   """Benchmarks for `tf.data.Dataset.from_tensor_slices()`."""
 
-  def benchmarkSliceRepeatBatch(self):
+  def benchmark_slice_repeat_batch(self):
     input_size = 10000
     batch_size = 100
     num_epochs = 100
+    num_elements = input_size * num_epochs // batch_size
 
     input_data = np.random.randn(input_size)
 
     dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data)
-        .repeat(num_epochs + 1).batch(batch_size))
-    options = dataset_ops.Options()
-    options.experimental_optimization.apply_default_optimizations = False
-    dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        sess.run(next_element)
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          sess.run(next_element)
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
+        dataset_ops.Dataset.from_tensor_slices(input_data).repeat(
+            num_epochs).batch(batch_size))
+
+    self.run_and_report_benchmark(
+        dataset,
+        num_elements=num_elements,
         name="slice_repeat_batch_input_%d_batch_%d" % (input_size, batch_size))
 
-  def benchmarkSliceRepeatBatchCallable(self):
+  def benchmark_reshape_slice_repeat(self):
     input_size = 10000
-    batch_size = 100
+    reshape_dim = [100, 100]
     num_epochs = 100
 
+    num_elements = num_epochs * reshape_dim[0]
+
     input_data = np.random.randn(input_size)
 
     dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data)
-        .repeat(num_epochs + 1).batch(batch_size))
-    options = dataset_ops.Options()
-    options.experimental_optimization.apply_default_optimizations = False
-    dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="slice_repeat_batch_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
-
-  def benchmarkReshapeSliceRepeatCallable(self):
-    input_size = 10000
-    batch_size = 100
-    num_epochs = 100
+        dataset_ops.Dataset.from_tensor_slices(
+            input_data.reshape(*reshape_dim)).repeat(num_epochs))
 
-    input_data = np.random.randn(input_size)
+    self.run_and_report_benchmark(
+        dataset,
+        num_elements=num_elements,
+        name="reshape_slice_repeat_input_%d" % input_size,
+    )
 
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data.reshape(100, 100))
-        .repeat(num_epochs + 1))
-    options = dataset_ops.Options()
-    options.experimental_optimization.apply_default_optimizations = False
-    dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="reshape_slice_repeat_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
-
-  def benchmarkSliceBatchCacheRepeatCallable(self):
+  def benchmark_slice_batch_cache_repeat(self):
     input_size = 10000
     batch_size = 100
     num_epochs = 100
+    num_elements = input_size * num_epochs // batch_size
 
     input_data = np.random.randn(input_size)
 
     dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data).batch(batch_size)
-        .cache().repeat(num_epochs + 1))
-    options = dataset_ops.Options()
-    options.experimental_optimization.apply_default_optimizations = False
-    dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="slice_batch_cache_repeat_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
+        dataset_ops.Dataset.from_tensor_slices(input_data).batch(
+            batch_size).cache().repeat(num_epochs))
+
+    self.run_and_report_benchmark(
+        dataset,
+        num_elements=num_elements,
+        name="slice_batch_cache_repeat_input_%d_batch_%d" % (input_size,
+                                                             batch_size))
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/benchmarks/map_benchmark.py b/tensorflow/python/data/benchmarks/map_benchmark.py
index b620eaaed52c5bdea4fab776442ddd6bc2801605..75b71fffac4d493cb0d2e4d579597de302ad89c2 100644
--- a/tensorflow/python/data/benchmarks/map_benchmark.py
+++ b/tensorflow/python/data/benchmarks/map_benchmark.py
@@ -17,114 +17,51 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
-import numpy as np
-
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import test
 
 
 # TODO(b/119837791): Add eager benchmarks.
-class MapBenchmark(test.Benchmark):
-  """Bechmarks for `tf.data.Dataset.map()`."""
-
-  def benchmarkChainOfMaps(self):
-    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
-    for chain_length in chain_lengths:
-      for mode in ["general", "single-threaded", "short-circuit"]:
-        if mode == "general":
-          map_fn = lambda x: x + 1
-          use_inter_op_parallelism = True
-          benchmark_label = ""
-        if mode == "single-threaded":
-          map_fn = lambda x: x + 1
-          use_inter_op_parallelism = False
-          benchmark_label = "_single_threaded"
-        if mode == "short-circuit":
-          map_fn = lambda x: x
-          use_inter_op_parallelism = True  # should not have any significance
-          benchmark_label = "_short_circuit"
+class MapBenchmark(benchmark_base.DatasetBenchmarkBase):
+  """Benchmarks for `tf.data.Dataset.map()`."""
 
-        with ops.Graph().as_default():
-          dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
-          for _ in range(chain_length):
-            dataset = dataset_ops.MapDataset(
-                dataset,
-                map_fn,
-                use_inter_op_parallelism=use_inter_op_parallelism)
-          options = dataset_ops.Options()
-          options.experimental_optimization.apply_default_optimizations = False
-          dataset = dataset.with_options(options)
-          iterator = dataset_ops.make_one_shot_iterator(dataset)
-          next_element = iterator.get_next()
+  def benchmark_chain_of_maps(self):
 
-          with session.Session() as sess:
-            for _ in range(5):
-              sess.run(next_element.op)
-            deltas = []
-            for _ in range(100):
-              start = time.time()
-              for _ in range(100):
-                sess.run(next_element.op)
-              end = time.time()
-              deltas.append(end - start)
+    def benchmark_helper(chain_length, map_fn, use_inter_op_parallelism, label):
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
+      for _ in range(chain_length):
+        dataset = dataset_ops.MapDataset(
+            dataset, map_fn, use_inter_op_parallelism=use_inter_op_parallelism)
+      self.run_and_report_benchmark(
+          dataset,
+          num_elements=10000,
+          name="chain_length_%d%s" % (chain_length, label))
 
-            median_wall_time = np.median(deltas) / 100
-            self.report_benchmark(
-                iters=1000,
-                wall_time=median_wall_time,
-                name="chain_length_%d%s" % (chain_length, benchmark_label))
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      benchmark_helper(chain_length, lambda x: x + 1, True, "")
+      benchmark_helper(chain_length, lambda x: x + 1, False, "_single_threaded")
+      benchmark_helper(chain_length, lambda x: x, True, "_short_circuit")
 
-  def benchmarkMapFanOut(self):
+  def benchmark_map_fan_out(self):
     fan_outs = [1, 2, 5, 10, 20, 50, 100]
-    for fan_out in fan_outs:
-      for mode in ["general", "single-threaded", "short-circuit"]:
-        if mode == "general":
-          map_fn = lambda *xs: [x + 1 for x in xs]
-          use_inter_op_parallelism = True
-          benchmark_label = ""
-        if mode == "single-threaded":
-          map_fn = lambda *xs: [x + 1 for x in xs]
-          use_inter_op_parallelism = False
-          benchmark_label = "_single_threaded"
-        if mode == "short-circuit":
-          map_fn = lambda *xs: xs
-          use_inter_op_parallelism = True  # should not have any significance
-          benchmark_label = "_short_circuit"
 
-        with ops.Graph().as_default():
-          dataset = dataset_ops.Dataset.from_tensors(
-              tuple(0 for _ in range(fan_out))).repeat(None)
-          dataset = dataset_ops.MapDataset(
-              dataset,
-              map_fn,
-              use_inter_op_parallelism=use_inter_op_parallelism)
-          options = dataset_ops.Options()
-          options.experimental_optimization.apply_default_optimizations = False
-          dataset = dataset.with_options(options)
-          iterator = dataset_ops.make_one_shot_iterator(dataset)
-          next_element = iterator.get_next()
+    def benchmark_helper(fan_out, map_fn, use_inter_op_parallelism, label):
+      dataset = dataset_ops.Dataset.from_tensors(
+          tuple(0 for _ in range(fan_out))).repeat(None)
+      dataset = dataset_ops.MapDataset(
+          dataset, map_fn, use_inter_op_parallelism=use_inter_op_parallelism)
+      self.run_and_report_benchmark(
+          dataset,
+          num_elements=10000,
+          name="fan_out_%d%s" % (fan_out, label))
 
-          with session.Session() as sess:
-            for _ in range(5):
-              sess.run(next_element[0].op)
-            deltas = []
-            for _ in range(100):
-              start = time.time()
-              for _ in range(100):
-                sess.run(next_element[0].op)
-              end = time.time()
-              deltas.append(end - start)
-
-            median_wall_time = np.median(deltas) / 100
-            self.report_benchmark(
-                iters=1000,
-                wall_time=median_wall_time,
-                name="fan_out_%d%s" % (fan_out, benchmark_label))
+    for fan_out in fan_outs:
+      benchmark_helper(fan_out, lambda *xs: [x + 1 for x in xs], True, "")
+      benchmark_helper(fan_out, lambda *xs: [x + 1 for x in xs], False,
+                       "_single_threaded")
+      benchmark_helper(fan_out, lambda *xs: xs, True, "_short_circuit")
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/benchmarks/range_benchmark.py b/tensorflow/python/data/benchmarks/range_benchmark.py
index 375ff339a82207f8c5662ecf67ac47fc8c79c2a6..80569e4e313c0ccc44aa7f0413e92cd4c529c167 100644
--- a/tensorflow/python/data/benchmarks/range_benchmark.py
+++ b/tensorflow/python/data/benchmarks/range_benchmark.py
@@ -17,54 +17,26 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.platform import test
-
-_NUMPY_RANDOM_SEED = 42
 
 
-class RangeBenchmark(test.Benchmark):
+class RangeBenchmark(benchmark_base.DatasetBenchmarkBase):
   """Benchmarks for `tf.data.Dataset.range()`."""
 
-  def _benchmarkRangeHelper(self, modeling_enabled):
-    num_elements = 10000000 if modeling_enabled else 50000000
-
-    # Use `Dataset.skip()` and `Dataset.take()` to perform the iteration in
-    # C++, and focus on the minimal overheads (excluding Python invocation
-    # costs).
-    dataset = dataset_ops.Dataset.range(num_elements).skip(
-        num_elements - 1).take(1)
-    options = dataset_ops.Options()
-    options.experimental_autotune = modeling_enabled
-    options.experimental_optimization.apply_default_optimizations = False
-    dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      # Run once to warm up the session caches.
-      sess.run(iterator.initializer)
-      sess.run(next_element)
-
-      # Run once for timing.
-      sess.run(iterator.initializer)
-      start = time.time()
-      sess.run(next_element)
-      end = time.time()
-
-      time_per_element = (end - start) / num_elements
-      self.report_benchmark(
-          iters=num_elements,
-          wall_time=time_per_element,
-          name="modeling_%s" % ("on" if modeling_enabled else "off"))
-
-  def benchmarkRange(self):
+  def benchmark_range(self):
     for modeling_enabled in [False, True]:
-      self._benchmarkRangeHelper(modeling_enabled)
+      num_elements = 10000000 if modeling_enabled else 50000000
+      options = dataset_ops.Options()
+      options.experimental_autotune = modeling_enabled
+      dataset = dataset_ops.Dataset.range(num_elements)
+      dataset = dataset.with_options(options)
+
+      self.run_and_report_benchmark(
+          dataset,
+          num_elements=num_elements,
+          name="modeling_%s" % ("on" if modeling_enabled else "off"))
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/experimental/__init__.py b/tensorflow/python/data/experimental/__init__.py
index 3c1d798bd23fec5990d6d1f3080e5a8557240aed..275bdf7ef44b1012bf443e23644ee71df9d31709 100644
--- a/tensorflow/python/data/experimental/__init__.py
+++ b/tensorflow/python/data/experimental/__init__.py
@@ -58,6 +58,7 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 @@make_csv_dataset
 @@make_saveable_from_iterator
 @@map_and_batch
+@@map_and_batch_with_legacy_function
 @@parallel_interleave
 @@parse_example_dataset
 @@prefetch_to_device
@@ -82,6 +83,7 @@ from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops.batching import dense_to_sparse_batch
 from tensorflow.python.data.experimental.ops.batching import map_and_batch
+from tensorflow.python.data.experimental.ops.batching import map_and_batch_with_legacy_function
 from tensorflow.python.data.experimental.ops.batching import unbatch
 from tensorflow.python.data.experimental.ops.cardinality import cardinality
 from tensorflow.python.data.experimental.ops.cardinality import INFINITE as INFINITE_CARDINALITY
diff --git a/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
index 50e3a5c469232e2ff3ea8f0bd74866d829c31770..8f8fbe86dff47be6e058979251b066e017a5ba7b 100644
--- a/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
@@ -159,7 +159,7 @@ class MapVectorizationBenchmark(test.Benchmark):
 
   def benchmarkCast(self):
     self._benchmark_helper(
-        lambda *args: [math_ops.cast(x, dtypes.float64) for x in args], "cast")
+        lambda *args: [math_ops.cast(x, dtypes.float32) for x in args], "cast")
 
   def benchmarkReshape(self):
     self._benchmark_helper(
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index d0e5abcd5b1b3a9cfc3560ba60026180e1577e67..1733b9817b3348563ff67efba0ecbca9ff757963 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -322,6 +322,8 @@ py_test(
         "//tensorflow/python:functional_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:session",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/experimental/ops:map_defun",
         "//tensorflow/python/data/kernel_tests:test_base",
     ],
@@ -481,6 +483,7 @@ py_test(
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -603,7 +606,7 @@ py_library(
 
 py_test(
     name = "sql_dataset_test",
-    size = "small",
+    size = "medium",
     srcs = ["sql_dataset_test.py"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
diff --git a/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
index 4a8296d08482d4d800eb3bb0b94bbae940264da6..993b511d5e3635b38e6e0a73f86c873a39a6c127 100644
--- a/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
@@ -49,8 +49,7 @@ class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
        lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate(
            dataset_ops.Dataset.range(5)), cardinality.UNKNOWN),
       ("Concatenate3", lambda: dataset_ops.Dataset.range(5).repeat().
-       concatenate(dataset_ops.Dataset.range(5)),
-       cardinality.INFINITE),
+       concatenate(dataset_ops.Dataset.range(5)), cardinality.INFINITE),
       ("Concatenate4", lambda: dataset_ops.Dataset.range(5).concatenate(
           dataset_ops.Dataset.range(5).filter(lambda _: True)),
        cardinality.UNKNOWN),
@@ -70,8 +69,7 @@ class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
        lambda: dataset_ops.Dataset.range(5).repeat().concatenate(
            dataset_ops.Dataset.range(5).repeat()), cardinality.INFINITE),
       ("FlatMap", lambda: dataset_ops.Dataset.range(5).flat_map(
-          lambda _: dataset_ops.Dataset.from_tensors(0)),
-       cardinality.UNKNOWN),
+          lambda _: dataset_ops.Dataset.from_tensors(0)), cardinality.UNKNOWN),
       ("Filter", lambda: dataset_ops.Dataset.range(5).filter(lambda _: True),
        cardinality.UNKNOWN),
       ("FromTensors1", lambda: dataset_ops.Dataset.from_tensors(0), 1),
@@ -117,6 +115,13 @@ class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
        cardinality.INFINITE),
       ("Shuffle", lambda: dataset_ops.Dataset.range(5).shuffle(buffer_size=1),
        5),
+      ("Shard1", lambda: dataset_ops.Dataset.range(5).shard(2, 0), 3),
+      ("Shard2", lambda: dataset_ops.Dataset.range(5).shard(8, 7), 0),
+      ("Shard3",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).shard(2, 0),
+       cardinality.UNKNOWN),
+      ("Shard4", lambda: dataset_ops.Dataset.range(5).repeat().shard(2, 0),
+       cardinality.INFINITE),
       ("Skip1", lambda: dataset_ops.Dataset.range(5).skip(2), 3),
       ("Skip2", lambda: dataset_ops.Dataset.range(5).skip(8), 0),
       ("Skip3",
@@ -138,15 +143,13 @@ class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
        5),
       ("Zip2", lambda: dataset_ops.Dataset.zip(
           (dataset_ops.Dataset.range(5), dataset_ops.Dataset.range(3))), 3),
-      ("Zip3", lambda: dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.range(5),
-           dataset_ops.Dataset.range(3).repeat())), 5),
-      ("Zip4", lambda: dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.range(5).repeat(),
-           dataset_ops.Dataset.range(3).repeat())), cardinality.INFINITE),
-      ("Zip5", lambda: dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.range(5),
-           dataset_ops.Dataset.range(3).filter(lambda _: True))),
+      ("Zip3", lambda: dataset_ops.Dataset.zip((dataset_ops.Dataset.range(
+          5), dataset_ops.Dataset.range(3).repeat())), 5),
+      ("Zip4", lambda: dataset_ops.Dataset.zip((dataset_ops.Dataset.range(
+          5).repeat(), dataset_ops.Dataset.range(3).repeat())),
+       cardinality.INFINITE),
+      ("Zip5", lambda: dataset_ops.Dataset.zip((dataset_ops.Dataset.range(
+          5), dataset_ops.Dataset.range(3).filter(lambda _: True))),
        cardinality.UNKNOWN),
       # pylint: enable=g-long-lambda
   )
diff --git a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
index 3e2cf779a3f9d138e83986abcf5b8387d7c19412..f65740c56518c2c0baa1d1d56cac5e0314db4b97 100644
--- a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
@@ -22,10 +22,12 @@ from absl.testing import parameterized
 from tensorflow.python.data.experimental.ops import get_single_element
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import function
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -71,6 +73,52 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase):
     self.assertDatasetProduces(
         dataset, [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]])
 
+  def testSideEffect(self):
+    counter_var = variables.Variable(0)
+
+    def increment_fn(x):
+      counter_var.assign_add(1)
+      return x
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(1).map(increment_fn)
+
+    @function.defun
+    def fn():
+      _ = get_single_element.get_single_element(dataset_fn())
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 1)
+
+  def testAutomaticControlDependencies(self):
+    counter_var = variables.Variable(1)
+
+    def increment_fn(x):
+      counter_var.assign(counter_var + 1)
+      return x
+
+    def multiply_fn(x):
+      counter_var.assign(counter_var * 2)
+      return x
+
+    def dataset1_fn():
+      return dataset_ops.Dataset.range(1).map(increment_fn)
+
+    def dataset2_fn():
+      return dataset_ops.Dataset.range(1).map(multiply_fn)
+
+    @function.defun
+    def fn():
+      _ = get_single_element.get_single_element(dataset1_fn())
+      _ = get_single_element.get_single_element(dataset2_fn())
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 4)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
index 19830a23bb2ea7ace55a458351d4eda556ba3bf8..4e99189279c5333029a8c068a7f334b7c02b62a5 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
@@ -27,15 +27,18 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
-# TODO(b/117581999): add eager coverage.
+@test_util.run_v1_only("b/123903858: Add eager and V2 test coverage")
 class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunSimple(self):
@@ -253,6 +256,70 @@ class MapDefunTest(test_base.DatasetTestBase):
     expected = x + c
     self.assertAllEqual(self.evaluate(expected), self.evaluate(map_defun_op))
 
+  def testMapDefunWithVariantTensor(self):
+
+    @function.defun(
+        input_signature=[tensor_spec.TensorSpec([], dtypes.variant)])
+    def fn(x):
+      return x
+
+    st = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+
+    serialized = sparse_ops.serialize_sparse_v2(st, out_type=dtypes.variant)
+    serialized = array_ops.stack([serialized, serialized])
+    map_defun_op = map_defun.map_defun(fn, [serialized], [dtypes.variant],
+                                       [None])[0]
+    deserialized = sparse_ops.deserialize_sparse(map_defun_op, dtypes.int32)
+    expected = sparse_tensor.SparseTensorValue(
+        indices=[[0, 0, 0], [0, 1, 2], [1, 0, 0], [1, 1, 2]],
+        values=[1, 2, 1, 2],
+        dense_shape=[2, 3, 4])
+    actual = self.evaluate(deserialized)
+    self.assertSparseValuesEqual(expected, actual)
+
+  def testMapDefunWithVariantTensorAsCaptured(self):
+
+    st = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+    serialized = sparse_ops.serialize_sparse_v2(st, out_type=dtypes.variant)
+
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)])
+    def fn(x):
+      del x
+      return serialized
+
+    x = constant_op.constant([0, 0])
+    map_defun_op = map_defun.map_defun(fn, [x], [dtypes.variant], [None])[0]
+    deserialized = sparse_ops.deserialize_sparse(map_defun_op, dtypes.int32)
+    expected = sparse_tensor.SparseTensorValue(
+        indices=[[0, 0, 0], [0, 1, 2], [1, 0, 0], [1, 1, 2]],
+        values=[1, 2, 1, 2],
+        dense_shape=[2, 3, 4])
+    actual = self.evaluate(deserialized)
+    self.assertSparseValuesEqual(expected, actual)
+
+  def testMapDefunWithStrTensor(self):
+
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
+    def fn(x):
+      return x
+
+    st = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+
+    serialized = sparse_ops.serialize_sparse_v2(st, out_type=dtypes.string)
+    serialized = array_ops.stack([serialized, serialized])
+    map_defun_op = map_defun.map_defun(fn, [serialized], [dtypes.string],
+                                       [None])[0]
+    deserialized = sparse_ops.deserialize_sparse(map_defun_op, dtypes.int32)
+    expected = sparse_tensor.SparseTensorValue(
+        indices=[[0, 0, 0], [0, 1, 2], [1, 0, 0], [1, 1, 2]],
+        values=[1, 2, 1, 2],
+        dense_shape=[2, 3, 4])
+    actual = self.evaluate(deserialized)
+    self.assertSparseValuesEqual(expected, actual)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
index bba2f352e2bab394931b359854f6a7a9f0e6b031..0e14f75eadce3d4872ea25ea9fab2ee59564d2d5 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -378,6 +379,7 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
     def map_fn(x):
       # x has leading dimension 5, this will raise an error
       return array_ops.gather(x, 10)
+
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r"indices = 10 is not in \[0, 5\)"):
       base_dataset = dataset_ops.Dataset.range(5).repeat(5).batch(
@@ -478,8 +480,7 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     self.assertDatasetsEqual(optimized, unoptimized)
 
-  # TODO(b/117581999): Add eager coverage for the following tests.
-  def testSkipEagerOptimizationIgnoreStateful(self):
+  def testOptimizationIgnoreStateful(self):
 
     def map_fn(x):
       with ops.control_dependencies([check_ops.assert_equal(x, 0)]):
@@ -489,10 +490,13 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
                                                            [3, 4]]).repeat(5)
     unoptimized, optimized = self._get_test_datasets(
         base_dataset, map_fn, expect_optimized=False)
-    self.assertDatasetsRaiseSameError(
-        unoptimized, optimized, errors.InvalidArgumentError,
-        [("OneShotIterator", "OneShotIterator_1", 1),
-         ("IteratorGetNext", "IteratorGetNext_1", 1)])
+    replacements = None
+    if not context.executing_eagerly():
+      # In graph mode, the ops have unique names.
+      replacements = [("OneShotIterator", "OneShotIterator_1", 1),
+                      ("IteratorGetNext", "IteratorGetNext_1", 1)]
+    self.assertDatasetsRaiseSameError(unoptimized, optimized,
+                                      errors.InvalidArgumentError, replacements)
 
   def testOptimizationIgnoreRagged(self):
     # Make sure we ignore inputs that might not be uniformly sized
@@ -505,8 +509,7 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
         base_dataset, map_fn, expect_optimized=False)
     self.assertDatasetsEqual(unoptimized, optimized)
 
-  # TODO(b/117581999): Add eager coverage for the following tests.
-  def testSkipEagerOptimizationIgnoreRaggedMap(self):
+  def testOptimizationIgnoreRaggedMap(self):
     # Don't optimize when the output of the map fn shapes are unknown.
     def map_fn(x):
       return array_ops.tile(x, x)
@@ -514,10 +517,13 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
     base_dataset = dataset_ops.Dataset.range(20).batch(1, drop_remainder=True)
     unoptimized, optimized = self._get_test_datasets(
         base_dataset, map_fn, expect_optimized=False)
-    self.assertDatasetsRaiseSameError(
-        unoptimized, optimized, errors.InvalidArgumentError,
-        [("OneShotIterator", "OneShotIterator_1", 1),
-         ("IteratorGetNext", "IteratorGetNext_1", 1)])
+    replacements = None
+    if not context.executing_eagerly():
+      # In graph mode, the ops have unique names.
+      replacements = [("OneShotIterator", "OneShotIterator_1", 1),
+                      ("IteratorGetNext", "IteratorGetNext_1", 1)]
+    self.assertDatasetsRaiseSameError(unoptimized, optimized,
+                                      errors.InvalidArgumentError, replacements)
 
   def testOptimizationWithUnknownBatchShape(self):
     tensor = sparse_tensor.SparseTensor(
@@ -535,5 +541,25 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
     optimized = unoptimized.with_options(options)
     self.assertDatasetsEqual(unoptimized, optimized)
 
+  def testOptimizationWithSparseTensor(self):
+    base_dataset = dataset_ops.Dataset.from_tensors(0)
+
+    def map_fn(x):
+      del x
+      return sparse_tensor.SparseTensor(
+          indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+
+    # Datasets with sparse tensors have unknown output shapes.
+    unoptimized = base_dataset.apply(batching.map_and_batch(map_fn, 2))
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    unoptimized = unoptimized.with_options(options)
+
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_vectorization = True
+    optimized = unoptimized.with_options(options)
+    self.assertDatasetsEqual(unoptimized, optimized)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
index bcd027ebbbc07fa4fd9458cc979293c9c417c9cf..a85e0cf801cda08cfe997c4ebce6497ae806aecd 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
@@ -112,7 +112,7 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = self.getNext(dataset)
     self.evaluate(get_next())
 
-  # TODO(b/123300735): Add eager coverage for the following tests.
+  @test_util.run_v1_only("b/123902160")
   def testSkipEagerOptimizationLargeInputFromTensor(self):
     input_t = array_ops.placeholder(dtypes.int32, (None, None, None))
     dataset = dataset_ops.Dataset.from_tensors(input_t)
@@ -127,7 +127,7 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       sess.run(init_op, {input_t: np.ones([512, 1024, 1025], np.int32)})
       self.evaluate(get_next)
 
-  # TODO(b/117581999): Add eager coverage for the following tests.
+  @test_util.run_v1_only("b/123902160")
   def testSkipEagerOptimizationLargeInputFromTensorSlices(self):
     input_t = array_ops.placeholder(dtypes.int32, (None, None, None, None))
     dataset = dataset_ops.Dataset.from_tensor_slices(input_t)
@@ -219,7 +219,7 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     self.assertDatasetProduces(dataset, expected_output=[0])
 
   @parameterized.named_parameters(_generate_captured_refvar_test_cases())
-  # Skip eager because RefVariables are not supported in eager mode.
+  @test_util.run_v1_only("RefVariables are not supported in eager mode.")
   def testSkipEagerOptimizationWithCapturedRefVar(self, dataset_fn):
     """Tests that default optimizations are disabled with ref variables."""
     variable = variable_scope.get_variable(
@@ -236,7 +236,7 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       options.experimental_optimization.noop_elimination = True
       options.experimental_optimization.map_and_batch_fusion = True
       optimized_dataset = unoptimized_dataset.with_options(options)
-      optimized_it = optimized_dataset.make_initializable_iterator()
+      optimized_it = dataset_ops.make_initializable_iterator(optimized_dataset)
 
     self.assertGreaterEqual(len(w), 1)
     expected = ("tf.data static optimizations are not compatible with "
@@ -248,7 +248,8 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     # Check that outputs are the same in the optimized and unoptimized cases,
     # when the variable value is changing.
-    unoptimized_it = unoptimized_dataset.make_initializable_iterator()
+    unoptimized_it = dataset_ops.make_initializable_iterator(
+        unoptimized_dataset)
     with ops.control_dependencies([assign_op]):
       unoptimized_output = unoptimized_it.get_next()
       optimized_output = optimized_it.get_next()
diff --git a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py
index 0dcbd563facf16d62dd97b90522625cd967a6dd9..28f9aad5b9dc0ad218ab71790c5c3c25c61824f3 100644
--- a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py
@@ -17,44 +17,304 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
+
 from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import scan_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+@parameterized.named_parameters(("WithDropRemainder", True),
+                                ("WithoutDropRemainder", False))
 @test_util.run_all_in_graph_and_eager_modes
 class RebatchDatasetTest(test_base.DatasetTestBase):
 
-  def testBasic(self):
-    dataset = dataset_ops.Dataset.range(1024).batch(32, drop_remainder=True)
+  def testBasic(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).batch(
+        32, drop_remainder=drop_remainder)
     rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
     self.assertEqual(
-        [[32]], [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
     self.assertEqual(
-        [[8]],
+        [[8 if drop_remainder else None]],
         [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
 
     expected_output = [[k for k in range(i, i + 8)] for i in range(0, 1024, 8)]  # pylint: disable=g-complex-comprehension
     self.assertDatasetProduces(rebatched_dataset, expected_output)
 
-  def testScalarInputError(self):
+  def testScalarInputError(self, _):
     dataset = dataset_ops.Dataset.range(1024)
     with self.assertRaisesRegexp(ValueError, "at least one dimension"):
       batching._RebatchDataset(dataset, num_workers=4)
 
-  def testUnknownBatchSizeError(self):
-    dataset = dataset_ops.Dataset.range(1024).batch(32)
-    with self.assertRaisesRegexp(ValueError, "unknown batch size datasets"):
-      batching._RebatchDataset(dataset, num_workers=4)
+  def testNotDivisibleError(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).batch(
+        32, drop_remainder=drop_remainder)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "not divisible by"):
+      rebatched_dataset = batching._RebatchDataset(dataset, num_workers=5)
+      next_element = self.getNext(rebatched_dataset)
+      self.evaluate(next_element())
+
+  def testTupleOutput(self, drop_remainder):
+    dataset = (
+        dataset_ops.Dataset.range(1024).map(lambda x: (x, x)).batch(
+            32, drop_remainder=drop_remainder))
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    expected_output = [([k for k in range(i, i + 8)],  # pylint: disable=g-complex-comprehension
+                        [k for k in range(i, i + 8)])
+                       for i in range(0, 1024, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testNestedDictionaryOutput(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).map(
+        lambda x: {"a": x, "b": {"c": x}}).batch(
+            32, drop_remainder=drop_remainder)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    expected_output = [{"a": [k for k in range(i, i + 8)],  # pylint: disable=g-complex-comprehension
+                        "b": {"c": [k for k in range(i, i + 8)]}}
+                       for i in range(0, 1024, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testFinalPartialBatchOriginal(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1032).batch(
+        32, drop_remainder=drop_remainder)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+
+    expected_output = [[k for k in range(i, i + 8)] for i in range(0, 1032, 8)]  # pylint: disable=g-complex-comprehension
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testFinalPartialBatchAfterRebatch(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(34).batch(
+        32, drop_remainder=drop_remainder)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+
+    expected_output = [[k for k in range(i, i + 8)] for i in range(0, 32, 8)]  # pylint: disable=g-complex-comprehension
+    if not drop_remainder:
+      expected_output += [[32, 33]]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testMultipleBatches(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(128).batch(
+        4, drop_remainder=drop_remainder)
+    dataset = dataset.batch(8, drop_remainder=drop_remainder)
+    self.assertEqual(
+        [[8, 4]] if drop_remainder else [[None, None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    # Each element is a list of 8 elements where each element is a list of 4.
+    expected_output = [[[j, j + 1, j + 2, j + 3]  # pylint: disable=g-complex-comprehension
+                        for j in range(i, i + 32, 4)]  # generates 8 elements
+                       for i in range(0, 128, 32)]
+    self.assertDatasetProduces(dataset, expected_output)
+
+    rebatched_dataset = batching._RebatchDataset(dataset, 4)
+    self.assertEqual(
+        [[2, 4]] if drop_remainder else [[None, None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    # Each element is a list of 2 elements where each element is a list of 4.
+    expected_output = [[[j, j + 1, j + 2, j + 3]  # pylint: disable=g-complex-comprehension
+                        for j in range(i, i + 8, 4)]  # generates 2 elements
+                       for i in range(0, 128, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testMapAndBatch(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).apply(
+        batching.map_and_batch(
+            math_ops.square, 32, drop_remainder=drop_remainder))
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    expected_output = [[k**2 for k in range(i, i + 8)]  # pylint: disable=g-complex-comprehension
+                       for i in range(0, 1024, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testPaddedBatch(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(128).batch(4).padded_batch(
+        8, padded_shapes=[5], drop_remainder=drop_remainder)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8, 5]] if drop_remainder else [[None, 5]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    # Each element is a list of 8 elements in which each element is a list of 5
+    # elements, first four are numbers and the last one is a padded zero.
+    expected_output = [[[j, j + 1, j + 2, j + 3, 0]  # pylint: disable=g-complex-comprehension
+                        for j in range(i, i + 32, 4)]  # generates 8 elements
+                       for i in range(0, 128, 32)]
+    self.assertDatasetProduces(dataset, expected_output)
+    self.assertEqual(
+        [[2, 5]] if drop_remainder else [[None, 5]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    # Each element is a list of 2 elements in which each element is a list of 5
+    # elements, first four are numbers and the last one is a padded zero.
+    expected_output = [[[j, j + 1, j + 2, j + 3, 0]  # pylint: disable=g-complex-comprehension
+                        for j in range(i, i + 8, 4)]  # generates 2 elements
+                       for i in range(0, 128, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testConcatenate(self, drop_remainder):
+    dataset1 = dataset_ops.Dataset.range(64).batch(
+        8, drop_remainder=drop_remainder)
+    dataset2 = dataset_ops.Dataset.range(32).batch(
+        8, drop_remainder=drop_remainder)
+    dataset = dataset1.concatenate(dataset2)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[2 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    expected_output = ([[i, i + 1] for i in range(0, 64, 2)] +
+                       [[i, i + 1] for i in range(0, 32, 2)])
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testConcatenateDifferentShapes(self, drop_remainder):
+    dataset1 = dataset_ops.Dataset.range(64).batch(
+        16, drop_remainder=drop_remainder)
+    dataset2 = dataset_ops.Dataset.range(32).batch(
+        8, drop_remainder=drop_remainder)
+    dataset = dataset1.concatenate(dataset2)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[None]], [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    expected_output = ([[i, i + 1, i + 2, i + 3] for i in range(0, 64, 4)] +
+                       [[i, i + 1] for i in range(0, 32, 2)])
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testZip(self, drop_remainder):
+    dataset1 = dataset_ops.Dataset.range(64).batch(
+        8, drop_remainder=drop_remainder)
+    dataset2 = dataset_ops.Dataset.range(32).batch(
+        8, drop_remainder=drop_remainder)
+    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8], [8]] if drop_remainder else [[None], [None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[2], [2]] if drop_remainder else [[None], [None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    expected_output = [([i, i + 1], [i, i + 1]) for i in range(0, 32, 2)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testZipDifferentShapes(self, drop_remainder):
+    dataset1 = dataset_ops.Dataset.range(64).batch(
+        16, drop_remainder=drop_remainder)
+    dataset2 = dataset_ops.Dataset.range(32).batch(
+        8, drop_remainder=drop_remainder)
+    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[16], [8]] if drop_remainder else [[None], [None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[4], [2]] if drop_remainder else [[None], [None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    expected_output = [([2 * i, 2 * i + 1, 2 * i + 2, 2 * i + 3], [i, i + 1])
+                       for i in range(0, 32, 2)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testUnsupportedTransformError(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).batch(
+        32, drop_remainder=drop_remainder).apply(
+            scan_ops.scan([0], lambda _, a: ([0], a)))
+    with self.assertRaises(errors.InvalidArgumentError):
+      rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+      next_element = self.getNext(rebatched_dataset)
+      self.evaluate(next_element())
 
-  def testNotDivisibleError(self):
-    dataset = dataset_ops.Dataset.range(1024).batch(32, drop_remainder=True)
-    with self.assertRaisesRegexp(ValueError, "not divisible by"):
-      batching._RebatchDataset(dataset, num_workers=5)
+  def testFlatMapBatching(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(
+        2).flat_map(lambda _: dataset_ops.Dataset.range(32).batch(  # pylint: disable=g-long-lambda
+            32, drop_remainder=drop_remainder))
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    # Two elements where each element is range(32)
+    expected_output = [[k for k in range(32)] for _ in range(2)]  # pylint: disable=g-complex-comprehension
+    self.assertDatasetProduces(dataset, expected_output)
 
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    # Two elements where each element is a list of 4 elements where each element
+    # is a list of 8.
+    expected_output = [[k for k in range(i, i + 8)]  # pylint: disable=g-complex-comprehension
+                       for _ in range(2)
+                       for i in range(0, 32, 8)]  # generates 4 elements
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testInterleaveBatching(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(
+        2).interleave(lambda _: dataset_ops.Dataset.range(32).batch(  # pylint: disable=g-long-lambda
+            32, drop_remainder=drop_remainder), cycle_length=2)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    # Two elements where each element is range(32)
+    expected_output = [[k for k in range(32)] for _ in range(2)]  # pylint: disable=g-complex-comprehension
+    self.assertDatasetProduces(dataset, expected_output)
+
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    # List of 4 elements where each element is a list of 8 numbering from 0 to
+    # 31 repeated twice.
+    expected_output = [[k for k in range(i, i + 8)]  # pylint: disable=g-complex-comprehension
+                       for i in range(0, 32, 8)  # generates 4 elements
+                       for _ in range(2)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testParallelInterleaveBatching(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(
+        2).interleave(lambda _: dataset_ops.Dataset.range(32).batch(  # pylint: disable=g-long-lambda
+            32, drop_remainder=drop_remainder), cycle_length=2,
+                      num_parallel_calls=2)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    # Two elements where each element is range(32)
+    expected_output = [[k for k in range(32)] for _ in range(2)]  # pylint: disable=g-complex-comprehension
+    self.assertDatasetProduces(dataset, expected_output)
+
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    # List of 4 elements where each element is a list of 8 numbering from 0 to
+    # 31 repeated twice in collated fashion i.e [0...8], [0...8] etc.
+    expected_output = [[k for k in range(i, i + 8)]  # pylint: disable=g-complex-comprehension
+                       for i in range(0, 32, 8)  # generates 4 elements
+                       for _ in range(2)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
index dc168d867aed3e2b7900f331a3cbed66f04dbd5d..caf571ef4ee643f39b39abd825a38e03dc34aaef 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
@@ -623,6 +623,24 @@ py_test(
     ],
 )
 
+py_test(
+    name = "shard_dataset_serialization_test",
+    size = "medium",
+    srcs = ["shard_dataset_serialization_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        ":dataset_serialization_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "shuffle_and_repeat_dataset_serialization_test",
     size = "medium",
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
index 8cc66d0c29392b206015ad886780d854fb2b5d5c..84b8e5ca3647a0597f6823249743a678900751b8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops import iterator_ops
-from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -35,7 +34,8 @@ from tensorflow_estimator.python.estimator import estimator
 from tensorflow_estimator.python.estimator import model_fn
 
 
-class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
+@test_util.run_v1_only('b/123904664')
+class CheckpointInputPipelineHookTest(test.TestCase):
 
   @staticmethod
   def _model_fn(features, labels, mode, config):
@@ -69,7 +69,6 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
   def _build_iterator_saver_hook(self, est):
     return iterator_ops.CheckpointInputPipelineHook(est)
 
-  @test_util.run_deprecated_v1
   def testReturnDatasetFromInputFn(self):
 
     def _input_fn():
@@ -82,7 +81,6 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
     est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
     self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3))
 
-  @test_util.run_deprecated_v1
   def testBuildIteratorInInputFn(self):
 
     def _input_fn():
@@ -97,7 +95,6 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
     est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
     self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3))
 
-  @test_util.run_deprecated_v1
   def testDoNotRestore(self):
 
     def _input_fn():
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
index bdbd8702b7f8d315a730c5cd2b000218ea5e19be..ca45ecca4c734f6a896c093921844c425cbba7e6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
@@ -23,7 +23,6 @@ import os
 import numpy as np
 
 from tensorflow.python.data.experimental.ops import iterator_ops as contrib_iterator_ops
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import dtypes
@@ -78,7 +77,6 @@ class DatasetSerializationTestBase(test.TestCase):
     # NOTE: We disable all default optimizations in serialization tests in order
     # to test the actual dataset in question.
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
     options.experimental_optimization.apply_default_optimizations = False
 
     def ds_fn1_no_opt():
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py
index ed4a1da59679c8c85141cb38e46ad95441b71b73..aaa46bacefed1865ff85bf5478fbd0f22c65c227 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py
@@ -34,6 +34,20 @@ class OptimizeDatasetSerializationTest(
 
     self.run_core_tests(lambda: build_dataset(200, 10), None, 20)
 
+  def testWithNewFunction(self):
+    """Tests that optimized datasets with new functions work."""
+
+    def build_dataset():
+      dataset = dataset_ops.Dataset.range(100)
+      dataset = dataset.map(lambda x: x)
+      dataset = dataset.batch(5)
+      # map_vectorization adds a new vectorized function to the function
+      # library.
+      dataset = dataset.apply(optimization.optimize(["map_vectorization"]))
+      return dataset
+
+    self.run_core_tests(build_dataset, None, 20)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/shard_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/shard_dataset_serialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..99674b6910312c35f065fc3dd2cdd738fe544615
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/shard_dataset_serialization_test.py
@@ -0,0 +1,42 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the ShardDataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+class ShardDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase,
+    parameterized.TestCase):
+
+  def _build_dataset(self, num_elements, num_shards, index):
+    return dataset_ops.Dataset.range(num_elements).shard(num_shards, index)
+
+  @parameterized.parameters((10, 5, 2, 3), (10, 10, 0, 9), (100, 2, 0, 1))
+  def testCore(self, elems, num_shards, index1, index2):
+    self.run_core_tests(lambda: self._build_dataset(elems, num_shards, index1),
+                        lambda: self._build_dataset(elems, num_shards, index2),
+                        elems // num_shards)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
index 8b330559f5f927bed1c0a206c962f4350868b276..c53ac82c6e23a5cc26ca2fc8d22ce7e6970638a4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
@@ -35,7 +35,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-@test_util.run_all_in_graph_and_eager_modes
 def function_set_stats_aggregator(dataset,
                                   aggregator,
                                   prefix="",
@@ -53,20 +52,21 @@ def function_apply_options(dataset, aggregator, prefix="", counter_prefix=""):
   return dataset.with_options(options)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 @parameterized.named_parameters(
     ("SetStatsAggregator", function_set_stats_aggregator),
     ("StatsOptions", function_apply_options),
 )
 class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
 
-  def testBytesProduced(self, dataset_transformation):
+  @test_util.run_v1_only("b/123901126")
+  def testSkipEagerBytesProduced(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
             stats_ops.bytes_produced_stats("bytes_produced"))
     dataset = dataset_transformation(dataset, aggregator)
     next_element = self.getNext(dataset, requires_initialization=True)
-    summary_t = aggregator.get_summary()
 
     expected_sum = 0.0
     for i in range(100):
@@ -78,8 +78,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
       self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(next_element())
-    # TODO(shivaniagrawal): ntentional breaking case
-    summary_str = self.evaluate(summary_t)
+    summary_str = self.evaluate(aggregator.get_summary())
     self._assertSummaryHasCount(summary_str, "bytes_produced", 100.0)
     self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
 
@@ -357,13 +356,11 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         100.0)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 @parameterized.named_parameters(
-    dict(
-        testcase_name="SetStatsAggregator",
-        dataset_transformation=function_set_stats_aggregator),
-    dict(
-        testcase_name="StatsOptions",
-        dataset_transformation=function_apply_options))
+    ("SetStatsAggregator", function_set_stats_aggregator),
+    ("StatsOptions", function_apply_options)
+)
 class FeatureStatsDatasetTest(
     stats_dataset_test_base.StatsDatasetTestBase,
     reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase):
diff --git a/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
index 14a4241ec2e6930622aaf9e35ae70e18eaaa004f..783b2e6e22ae618f255673011d72201c993e0a85 100644
--- a/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.data.experimental.ops import writers
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
+from tensorflow.python.eager import function
 from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.lib.io import tf_record
@@ -94,6 +95,20 @@ class TFRecordWriterTest(test_base.DatasetTestBase):
     with self.assertRaises(TypeError):
       writers.TFRecordWriter(self._outputFilename(), "").write(input_dataset)
 
+  def testSideEffect(self):
+    def writer_fn():
+      input_dataset = readers.TFRecordDataset(self._createFile())
+      return writers.TFRecordWriter(self._outputFilename()).write(input_dataset)
+
+    @function.defun
+    def fn():
+      _ = writer_fn()
+      return "hello"
+
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())):
+      self.assertAllEqual(self._record(i), r)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
index a8f50501517c24d5aea78d7dda18240f54921197..e6e77575a6f98b5becc9fe4ceb3126e22403b471 100644
--- a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
@@ -31,7 +31,7 @@ class WrapDatasetVariantTest(test_base.DatasetTestBase):
 
   def testBasic(self):
     ds = dataset_ops.Dataset.range(100)
-    ds_variant = ds._as_variant_tensor()  # pylint: disable=protected-access
+    ds_variant = ds._variant_tensor  # pylint: disable=protected-access
 
     wrapped_variant = gen_dataset_ops.wrap_dataset_variant(ds_variant)
     unwrapped_variant = gen_dataset_ops.unwrap_dataset_variant(wrapped_variant)
@@ -42,10 +42,10 @@ class WrapDatasetVariantTest(test_base.DatasetTestBase):
     for i in range(100):
       self.assertEqual(i, self.evaluate(get_next()))
 
-  # TODO(b/117581999): add eager coverage when supported.
+  @test_util.run_v1_only("b/123901304")
   def testSkipEagerGPU(self):
     ds = dataset_ops.Dataset.range(100)
-    ds_variant = ds._as_variant_tensor()  # pylint: disable=protected-access
+    ds_variant = ds._variant_tensor  # pylint: disable=protected-access
     wrapped_variant = gen_dataset_ops.wrap_dataset_variant(ds_variant)
 
     with ops.device("/gpu:0"):
diff --git a/tensorflow/python/data/experimental/ops/batching.py b/tensorflow/python/data/experimental/ops/batching.py
index 39cb0a68f8d1414cbb58742e0e49ccb31ac88aaa..983f7640b895639195ac8f6ff91784023c226165 100644
--- a/tensorflow/python/data/experimental/ops/batching.py
+++ b/tensorflow/python/data/experimental/ops/batching.py
@@ -27,6 +27,7 @@ from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
@@ -38,6 +39,7 @@ from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -549,12 +551,15 @@ class _MapAndBatchDataset(dataset_ops.UnaryDataset):
   """A `Dataset` that maps a function over a batch of elements."""
 
   def __init__(self, input_dataset, map_func, batch_size, num_parallel_calls,
-               drop_remainder):
+               drop_remainder, use_legacy_function=False):
     """See `Dataset.map()` for details."""
     self._input_dataset = input_dataset
 
     self._map_func = dataset_ops.StructuredFunctionWrapper(
-        map_func, "tf.data.experimental.map_and_batch()", dataset=input_dataset)
+        map_func,
+        "tf.data.experimental.map_and_batch()",
+        dataset=input_dataset,
+        use_legacy_function=use_legacy_function)
     self._batch_size_t = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
     self._num_parallel_calls_t = ops.convert_to_tensor(
@@ -589,6 +594,62 @@ class _MapAndBatchDataset(dataset_ops.UnaryDataset):
     return self._structure
 
 
+@deprecation.deprecated(None, "Use `tf.data.experimental.map_and_batch()")
+@tf_export(v1=["data.experimental.map_and_batch_with_legacy_function"])
+def map_and_batch_with_legacy_function(map_func,
+                                       batch_size,
+                                       num_parallel_batches=None,
+                                       drop_remainder=False,
+                                       num_parallel_calls=None):
+  """Fused implementation of `map` and `batch`.
+
+  NOTE: This is an escape hatch for existing uses of `map_and_batch` that do not
+  work with V2 functions. New uses are strongly discouraged and existing uses
+  should migrate to `map_and_batch` as this method will not be removed in V2.
+
+  Args:
+    map_func: A function mapping a nested structure of tensors to another
+      nested structure of tensors.
+    batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+      consecutive elements of this dataset to combine in a single batch.
+    num_parallel_batches: (Optional.) A `tf.int64` scalar `tf.Tensor`,
+      representing the number of batches to create in parallel. On one hand,
+      higher values can help mitigate the effect of stragglers. On the other
+      hand, higher values can increase contention if CPU is scarce.
+    drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
+      whether the last batch should be dropped in case its size is smaller than
+      desired; the default behavior is not to drop the smaller batch.
+    num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
+      representing the number of elements to process in parallel. If not
+      specified, `batch_size * num_parallel_batches` elements will be processed
+      in parallel. If the value `tf.data.experimental.AUTOTUNE` is used, then
+      the number of parallel calls is set dynamically based on available CPU.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+
+  Raises:
+    ValueError: If both `num_parallel_batches` and `num_parallel_calls` are
+      specified.
+  """
+
+  if num_parallel_batches is None and num_parallel_calls is None:
+    num_parallel_calls = batch_size
+  elif num_parallel_batches is not None and num_parallel_calls is None:
+    num_parallel_calls = batch_size * num_parallel_batches
+  elif num_parallel_batches is not None and num_parallel_calls is not None:
+    raise ValueError("The `num_parallel_batches` and `num_parallel_calls` "
+                     "arguments are mutually exclusive.")
+
+  def _apply_fn(dataset):
+    return _MapAndBatchDataset(dataset, map_func, batch_size,
+                               num_parallel_calls, drop_remainder,
+                               use_legacy_function=True)
+
+  return _apply_fn
+
+
 @tf_export("data.experimental.map_and_batch")
 def map_and_batch(map_func,
                   batch_size,
@@ -652,18 +713,24 @@ class _RebatchDataset(dataset_ops.UnaryDataset):
 
   def __init__(self, input_dataset, num_workers):
     self._input_dataset = input_dataset
-    output_shapes = input_dataset.output_shapes
-    if len(output_shapes) < 1:
-      raise ValueError("Input shape should have at least one dimension.")
-    if not output_shapes.dims[0].value:
-      raise ValueError("Cannot rebatch unknown batch size datasets.")
-    if output_shapes.dims[0].value % num_workers != 0:
-      raise ValueError(
-          "First dim of input shape: %d is not divisible by num_workers: %d" %
-          (output_shapes[0], num_workers))
-    output_dims = [d for d in output_shapes.dims]
-    output_dims[0] = output_dims[0] // num_workers
-    output_shapes = tensor_shape.TensorShapeV1(output_dims)
+
+    def recalculate_output_shapes(output_shapes):
+      """Recalculates the output_shapes after dividing it by num_workers."""
+      if len(output_shapes) < 1:
+        raise ValueError("Input shape should have at least one dimension.")
+      if (tensor_shape.dimension_value(output_shapes[0]) and
+          tensor_shape.dimension_value(output_shapes[0]) % num_workers != 0):
+        raise errors.InvalidArgumentError(
+            None, None,
+            "First dim of input shape: %d is not divisible by num_workers: %d" %
+            (output_shapes[0], num_workers))
+      output_dims = [d for d in output_shapes.dims]
+      output_dims[0] = output_dims[0] // num_workers
+      return tensor_shape.TensorShape(output_dims)
+
+    output_shapes = nest.map_structure(recalculate_output_shapes,
+                                       input_dataset.output_shapes)
+
     self._structure = structure.convert_legacy_structure(
         self._input_dataset.output_types, output_shapes,
         self._input_dataset.output_classes)
diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py
index 177886e64beabfd404864ffe75371d742a8d1385..24a399ab4e4d117ced025ec0f801b774c668dd85 100644
--- a/tensorflow/python/data/experimental/ops/readers.py
+++ b/tensorflow/python/data/experimental/ops/readers.py
@@ -24,6 +24,7 @@ import functools
 import numpy as np
 
 from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import error_ops
 from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.experimental.ops import parsing_ops
@@ -328,6 +329,7 @@ def make_csv_dataset_v2(
     sloppy=False,
     num_rows_for_inference=100,
     compression_type=None,
+    ignore_errors=False,
 ):
   """Reads CSV files into a dataset.
 
@@ -402,6 +404,10 @@ def make_csv_dataset_v2(
       the files. Defaults to 100.
     compression_type: (Optional.) A `tf.string` scalar evaluating to one of
       `""` (no compression), `"ZLIB"`, or `"GZIP"`. Defaults to no compression.
+    ignore_errors: (Optional.) If `True`, ignores errors with CSV file parsing,
+      such as malformed data or empty lines, and moves on to the next valid
+      CSV record. Otherwise, the dataset raises an error and stops processing
+      when encountering any invalid records. Defaults to `False`.
 
   Returns:
     A dataset, where each element is a (features, labels) tuple that corresponds
@@ -457,7 +463,7 @@ def make_csv_dataset_v2(
     raise ValueError("`label_name` provided must be one of the columns.")
 
   def filename_to_dataset(filename):
-    return CsvDataset(
+    dataset = CsvDataset(
         filename,
         record_defaults=column_defaults,
         field_delim=field_delim,
@@ -465,8 +471,11 @@ def make_csv_dataset_v2(
         na_value=na_value,
         select_cols=select_columns,
         header=header,
-        compression_type=compression_type,
+        compression_type=compression_type
     )
+    if ignore_errors:
+      dataset = dataset.apply(error_ops.ignore_errors())
+    return dataset
 
   def map_fn(*columns):
     """Organizes columns into a features dictionary.
@@ -528,13 +537,14 @@ def make_csv_dataset_v1(
     sloppy=False,
     num_rows_for_inference=100,
     compression_type=None,
+    ignore_errors=False,
 ):  # pylint: disable=missing-docstring
   return dataset_ops.DatasetV1Adapter(make_csv_dataset_v2(
       file_pattern, batch_size, column_names, column_defaults, label_name,
       select_columns, field_delim, use_quote_delim, na_value, header,
       num_epochs, shuffle, shuffle_buffer_size, shuffle_seed,
       prefetch_buffer_size, num_parallel_reads, sloppy, num_rows_for_inference,
-      compression_type))
+      compression_type, ignore_errors))
 make_csv_dataset_v1.__doc__ = make_csv_dataset_v2.__doc__
 
 
diff --git a/tensorflow/python/data/experimental/ops/resampling.py b/tensorflow/python/data/experimental/ops/resampling.py
index 3a3040ae9a4b072ae5c1a2dc218863246b6310e6..6676085ae593bf98d7c7c3cc9bd7fdbdb1db90ff 100644
--- a/tensorflow/python/data/experimental/ops/resampling.py
+++ b/tensorflow/python/data/experimental/ops/resampling.py
@@ -168,8 +168,7 @@ def _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds, class_values_ds,
 def _estimate_initial_dist_ds(
     target_dist_t, class_values_ds, dist_estimation_batch_size=32,
     smoothing_constant=10):
-  num_classes = (target_dist_t.shape[0].value or
-                 array_ops.shape(target_dist_t)[0])
+  num_classes = (target_dist_t.shape[0] or array_ops.shape(target_dist_t)[0])
   initial_examples_per_class_seen = array_ops.fill(
       [num_classes], np.int64(smoothing_constant))
 
@@ -207,7 +206,7 @@ def _estimate_data_distribution(c, num_examples_per_class_seen):
       `[num_classes]`.
     dist: The updated distribution.  Type `float32`, shape `[num_classes]`.
   """
-  num_classes = num_examples_per_class_seen.get_shape()[0].value
+  num_classes = num_examples_per_class_seen.get_shape()[0]
   # Update the class-count based on what labels are seen in batch.
   num_examples_per_class_seen = math_ops.add(
       num_examples_per_class_seen, math_ops.reduce_sum(
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index b134a8deaef4463c7dd38e4aafedc7f92bf306d6..af1c6ab97cb9791110c874e17809191f065d2773 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -108,8 +108,26 @@ tf_py_test(
     size = "small",
     srcs = ["filter_test.py"],
     additional_deps = [
+        ":filter_test_base",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+tf_py_test(
+    name = "filter_with_legacy_function_test",
+    size = "small",
+    srcs = ["filter_with_legacy_function_test.py"],
+    additional_deps = [
+        ":filter_test_base",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_library(
+    name = "filter_test_base",
+    srcs = ["filter_test_base.py"],
+    deps = [
         ":test_base",
-        "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
@@ -118,6 +136,7 @@ tf_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -272,7 +291,7 @@ tf_py_test(
         ":test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:util",
         "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
@@ -325,7 +344,7 @@ cuda_py_test(
         "//tensorflow/python/data/ops:iterator_ops",
         "//tensorflow/python/data/util:structure",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
diff --git a/tensorflow/python/data/kernel_tests/dataset_test.py b/tensorflow/python/data/kernel_tests/dataset_test.py
index f319b24bee87d127cda11f84c75fa295a1cb67c3..1e764b3e25205d4fb369e8fb76f8908a76ed4c02 100644
--- a/tensorflow/python/data/kernel_tests/dataset_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_test.py
@@ -92,15 +92,16 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("TFRecord", lambda: readers.TFRecordDataset(""), 1),
   )
   def testDatasetSimpleSourceInputs(self, dataset_fn, num_inputs=0):
-    self.assertEqual(num_inputs, len(dataset_fn()._inputs()))
+    self.assertLen(dataset_fn()._inputs(), num_inputs)
 
+  @test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
   def testDatasetComplexSourceInputs(self):
     dataset_fn = dataset_ops.Dataset.from_sparse_tensor_slices(
         sparse_tensor.SparseTensor(
             indices=np.array([[0, 0], [1, 0], [2, 0]]),
             values=np.array([0, 0, 0]),
             dense_shape=np.array([3, 1])))
-    self.assertEqual(0, len(dataset_fn._inputs()))
+    self.assertEmpty(dataset_fn._inputs())
 
   @parameterized.named_parameters(
       ("Batch",
@@ -266,27 +267,24 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           round_trip_dataset, [self.evaluate(tf_value_fn())],
           requires_initialization=True)
 
-  # NOTE: This test is specific to graph mode and is skipped in eager mode.
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("graph mode specific, no eager or V2 test coverage")
   def testSkipEagerSameGraphErrorOneShot(self):
     dataset = dataset_ops.Dataset.range(10)
     with ops.Graph().as_default():
       with self.assertRaisesRegexp(ValueError, "must be from the same graph"):
         dataset = dataset.batch(2)
 
-  # NOTE: This test is specific to graph mode and is skipped in eager mode.
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("graph mode specific, no eager or V2 test coverage")
   def testSkipEagerSameGraphErrorOneShotSimple(self):
     dataset = dataset_ops.Dataset.range(10)
     with ops.Graph().as_default():
       with test.mock.patch.object(logging, "warning") as mock_log:
-        _ = dataset.make_one_shot_iterator()
+        _ = dataset_ops.make_one_shot_iterator(dataset)
         self.assertRegexpMatches(
             str(mock_log.call_args), "Please ensure that all datasets in the "
             "pipeline are created in the same graph as the iterator.")
 
-  # NOTE: This test is specific to graph mode and is skipped in eager mode.
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("graph mode specific, no eager or V2 test coverage")
   def testSkipEagerSameGraphErrorInitializable(self):
     dataset = dataset_ops.Dataset.range(10)
     with ops.Graph().as_default():
diff --git a/tensorflow/python/data/kernel_tests/filter_test.py b/tensorflow/python/data/kernel_tests/filter_test.py
index cd8f526e72843243c2803291be1337ee40b2b2b2..b81e9a892dfbb0baded27cbfb36ec94a0101d78f 100644
--- a/tensorflow/python/data/kernel_tests/filter_test.py
+++ b/tensorflow/python/data/kernel_tests/filter_test.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,111 +17,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.data.kernel_tests import filter_test_base
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import map_fn
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class FilterTest(test_base.DatasetTestBase):
-
-  def testFilterDataset(self):
-    components = (
-        np.arange(7, dtype=np.int64),
-        np.array([[1, 2, 3]], dtype=np.int64) * np.arange(
-            7, dtype=np.int64)[:, np.newaxis],
-        np.array(37.0, dtype=np.float64) * np.arange(7)
-    )
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    def do_test(count, modulus):
-      dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
-          _map_fn).repeat(count).filter(
-              lambda x, _y, _z: math_ops.equal(math_ops.mod(x, modulus), 0))
-      self.assertEqual([c.shape[1:] for c in components],
-                       [shape for shape in dataset.output_shapes])
-      get_next = self.getNext(dataset)
-      for _ in range(count):
-        for i in [x for x in range(7) if x**2 % modulus == 0]:
-          result = self.evaluate(get_next())
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i]**2, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next())
-
-    do_test(14, 2)
-    do_test(4, 18)
-
-    # Test an empty dataset.
-    do_test(0, 1)
-
-  def testFilterRange(self):
-    dataset = dataset_ops.Dataset.range(4).filter(
-        lambda x: math_ops.not_equal(math_ops.mod(x, 3), 2))
-    self.assertDatasetProduces(dataset, expected_output=[0, 1, 3])
-
-  def testFilterDict(self):
-    dataset = dataset_ops.Dataset.range(10).map(
-        lambda x: {"foo": x * 2, "bar": x ** 2}).filter(
-            lambda d: math_ops.equal(d["bar"] % 2, 0)).map(
-                lambda d: d["foo"] + d["bar"])
-    self.assertDatasetProduces(
-        dataset,
-        expected_output=[(i * 2 + i**2) for i in range(10) if not (i**2) % 2])
-
-  def testUseStepContainerInFilter(self):
-    input_data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
-
-    # Define a predicate that returns true for the first element of
-    # the sequence and not the second, and uses `tf.map_fn()`.
-    def _predicate(xs):
-      squared_xs = map_fn.map_fn(lambda x: x * x, xs)
-      summed = math_ops.reduce_sum(squared_xs)
-      return math_ops.equal(summed, 1 + 4 + 9)
-
-    dataset = dataset_ops.Dataset.from_tensor_slices(
-        [[1, 2, 3], [4, 5, 6]]).filter(_predicate)
-    self.assertDatasetProduces(dataset, expected_output=[input_data[0]])
-
-  def testSparse(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1])),
-          dense_shape=np.array([1, 1])), i
-
-    def _filter_fn(_, i):
-      return math_ops.equal(i % 2, 0)
-
-    dataset = dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
-        lambda x, i: x)
-    self.assertDatasetProduces(
-        dataset, expected_output=[_map_fn(i * 2)[0] for i in range(5)])
-
-  def testShortCircuit(self):
-    dataset = dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.range(10),
-         dataset_ops.Dataset.from_tensors(True).repeat(None)
-        )).filter(lambda x, y: y)
-    self.assertDatasetProduces(
-        dataset, expected_output=[(i, True) for i in range(10)])
+class FilterTest(filter_test_base.FilterTestBase):
 
-  def testParallelFilters(self):
-    dataset = dataset_ops.Dataset.range(10).filter(
-        lambda x: math_ops.equal(x % 2, 0))
-    next_elements = [self.getNext(dataset) for _ in range(10)]
-    self.assertEqual([0 for _ in range(10)],
-                     self.evaluate(
-                         [next_element() for next_element in next_elements]))
+  def apply_filter(self, input_dataset, predicate):
+    return input_dataset.filter(predicate)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/filter_test_base.py b/tensorflow/python/data/kernel_tests/filter_test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c53fa39906d4efee66a8f71762e780550d7cda9
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/filter_test_base.py
@@ -0,0 +1,134 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.filter()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import map_fn
+from tensorflow.python.ops import math_ops
+
+
+class FilterTestBase(test_base.DatasetTestBase):
+  """Base class for FilterDataset tests."""
+
+  def apply_filter(self, input_dataset, predicate):
+    raise NotImplementedError("FilterTestBase._apply_filter")
+
+  def testFilterDataset(self):
+    components = (
+        np.arange(7, dtype=np.int64),
+        np.array([[1, 2, 3]], dtype=np.int64) * np.arange(
+            7, dtype=np.int64)[:, np.newaxis],
+        np.array(37.0, dtype=np.float64) * np.arange(7)
+    )
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    def do_test(count, modulus):  # pylint: disable=missing-docstring
+      dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
+          _map_fn).repeat(count)
+      # pylint: disable=g-long-lambda
+      dataset = self.apply_filter(
+          dataset, lambda x, _y, _z: math_ops.equal(
+              math_ops.mod(x, modulus), 0))
+      # pylint: enable=g-long-lambda
+      self.assertEqual([c.shape[1:] for c in components],
+                       [shape for shape in dataset.output_shapes])
+      get_next = self.getNext(dataset)
+      for _ in range(count):
+        for i in [x for x in range(7) if x**2 % modulus == 0]:
+          result = self.evaluate(get_next())
+          for component, result_component in zip(components, result):
+            self.assertAllEqual(component[i]**2, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+
+    do_test(14, 2)
+    do_test(4, 18)
+
+    # Test an empty dataset.
+    do_test(0, 1)
+
+  def testFilterRange(self):
+    dataset = dataset_ops.Dataset.range(4)
+    dataset = self.apply_filter(
+        dataset, lambda x: math_ops.not_equal(math_ops.mod(x, 3), 2))
+    self.assertDatasetProduces(dataset, expected_output=[0, 1, 3])
+
+  def testFilterDict(self):
+    dataset = dataset_ops.Dataset.range(10).map(
+        lambda x: {"foo": x * 2, "bar": x ** 2})
+    dataset = self.apply_filter(
+        dataset, lambda d: math_ops.equal(d["bar"] % 2, 0))
+    dataset = dataset.map(lambda d: d["foo"] + d["bar"])
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[(i * 2 + i**2) for i in range(10) if not (i**2) % 2])
+
+  def testUseStepContainerInFilter(self):
+    input_data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
+
+    # Define a predicate that returns true for the first element of
+    # the sequence and not the second, and uses `tf.map_fn()`.
+    def _predicate(xs):
+      squared_xs = map_fn.map_fn(lambda x: x * x, xs)
+      summed = math_ops.reduce_sum(squared_xs)
+      return math_ops.equal(summed, 1 + 4 + 9)
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        [[1, 2, 3], [4, 5, 6]])
+    dataset = self.apply_filter(dataset, _predicate)
+    self.assertDatasetProduces(dataset, expected_output=[input_data[0]])
+
+  def testSparse(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1])), i
+
+    def _filter_fn(_, i):
+      return math_ops.equal(i % 2, 0)
+
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn)
+    dataset = self.apply_filter(dataset, _filter_fn)
+    dataset = dataset.map(lambda x, i: x)
+    self.assertDatasetProduces(
+        dataset, expected_output=[_map_fn(i * 2)[0] for i in range(5)])
+
+  def testShortCircuit(self):
+    dataset = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.range(10),
+         dataset_ops.Dataset.from_tensors(True).repeat(None)
+        ))
+    dataset = self.apply_filter(dataset, lambda x, y: y)
+    self.assertDatasetProduces(
+        dataset, expected_output=[(i, True) for i in range(10)])
+
+  def testParallelFilters(self):
+    dataset = dataset_ops.Dataset.range(10)
+    dataset = self.apply_filter(dataset, lambda x: math_ops.equal(x % 2, 0))
+    next_elements = [self.getNext(dataset) for _ in range(10)]
+    self.assertEqual([0 for _ in range(10)],
+                     self.evaluate(
+                         [next_element() for next_element in next_elements]))
diff --git a/tensorflow/python/data/kernel_tests/filter_with_legacy_function_test.py b/tensorflow/python/data/kernel_tests/filter_with_legacy_function_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a283fb3302318ca526c0d43f8b025749b52c2fc
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/filter_with_legacy_function_test.py
@@ -0,0 +1,33 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.filter_with_legacy_function()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.kernel_tests import filter_test_base
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_v1_only
+class FilterWithLegacyFunctionTest(filter_test_base.FilterTestBase):
+
+  def apply_filter(self, input_dataset, predicate):
+    return input_dataset.filter_with_legacy_function(predicate)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/flat_map_test.py b/tensorflow/python/data/kernel_tests/flat_map_test.py
index ff52821b10740196286c30d19b0cda3b4b44bae5..69b5fd0d77fe743c02f441f1d65ae0bc9d731dae 100644
--- a/tensorflow/python/data/kernel_tests/flat_map_test.py
+++ b/tensorflow/python/data/kernel_tests/flat_map_test.py
@@ -65,11 +65,11 @@ class FlatMapTest(test_base.DatasetTestBase):
     repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
     components = np.array(repeats, dtype=np.int64)
     iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
-                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
-                            .repeat(y))).make_initializable_iterator(
-                                shared_name="shared_flat_map_iterator"))
+        dataset_ops.make_initializable_iterator(
+            dataset_ops.Dataset.from_tensor_slices(components).flat_map(
+                lambda x: dataset_ops.Dataset.from_tensor_slices(x).flat_map(
+                    lambda y: dataset_ops.Dataset.from_tensors(y).repeat(y))),
+            shared_name="shared_flat_map_iterator"))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
diff --git a/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
index 546c2fb2ed3f7584001e9aa2dbeb93ac82ca7709..2ce9c9a061c63b6acea899aef0518e516befb388 100644
--- a/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
+++ b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
@@ -29,10 +29,9 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-# NOTE: deprecated method in V2, no eager coverage added.
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class FromSparseTensorSlicesTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testFromSparseTensorSlices(self):
     """Test a dataset based on slices of a `tf.SparseTensor`."""
     st = array_ops.sparse_placeholder(dtypes.float64)
diff --git a/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py b/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
index 91b356691b75eb337ad61643646ba717e4929ab9..dfb54b50ad6b2dd8f242fba09218d6eae871a49c 100644
--- a/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -43,7 +43,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     ) else dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next if context.executing_eagerly(
     ) else functools.partial(self.evaluate, iterator.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     self.assertAllEqual([1, 4], get_next())
     save_path = checkpoint.save(checkpoint_prefix)
     self.assertAllEqual([9, 16], get_next())
@@ -73,7 +73,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     ) else dataset_ops.make_one_shot_iterator(dataset_2)
     get_next_3 = iterator_3.get_next if context.executing_eagerly(
     ) else functools.partial(self.evaluate, iterator_3.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(
+    checkpoint = trackable_utils.Checkpoint(
         iterator_1=iterator_1, iterator_2=iterator_2, iterator_3=iterator_3)
     self.assertAllEqual([1, 4], get_next_1())
     self.assertAllEqual(0, get_next_3())
@@ -96,7 +96,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     ) else dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next if context.executing_eagerly(
     ) else functools.partial(self.evaluate, iterator.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     self.assertAllEqual(0, get_next())
     self.assertAllEqual(1, get_next())
     save_path = checkpoint.save(checkpoint_prefix)
@@ -115,7 +115,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     iterator = iter(dataset) if context.executing_eagerly(
     ) else dataset_ops.make_initializable_iterator(dataset)
     get_next = iterator.get_next
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     for i in range(5):
       checkpoint.restore(
           checkpoint_management.latest_checkpoint(
diff --git a/tensorflow/python/data/kernel_tests/map_test.py b/tensorflow/python/data/kernel_tests/map_test.py
index bc9a67e5dbdf39c57cd63f2e4f328539202c9a2a..4badcffd463ae8d12c5704730d710beadc563040 100644
--- a/tensorflow/python/data/kernel_tests/map_test.py
+++ b/tensorflow/python/data/kernel_tests/map_test.py
@@ -84,6 +84,7 @@ def _make_coordinated_sloppy_dataset(num_elements, num_parallel_calls):
   return dataset, coordination_events
 
 
+# TODO(jsimsa): Add tests for `map_with_legacy_function`.
 @test_util.run_all_in_graph_and_eager_modes
 class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
 
@@ -313,8 +314,8 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
       if context.executing_eagerly():
         captured_iterator = iter(dataset_ops.Dataset.range(10))
       else:
-        captured_iterator = dataset_ops.Dataset.range(
-            10).make_initializable_iterator()
+        captured_iterator = dataset_ops.make_initializable_iterator(
+            dataset_ops.Dataset.range(10))
       ds = _build_ds(captured_iterator)
       return captured_iterator, ds
 
@@ -351,6 +352,7 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(get_next())
 
+  @test_util.run_v1_only("b/123904513")
   def testCaptureQueue(self):
     elements = np.random.randint(100, size=[200])
     queue = data_flow_ops.FIFOQueue(200, dtypes.int64, shapes=[])
@@ -392,11 +394,10 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(get_next())
 
-  # TODO(b/121264236): add eager mode coverage when we have mutli-device setup.
+  # TODO(b/121264236): add eager mode coverage when we have multi-device setup.
   @test_util.run_v1_only("b/121264236")
   def testSkipEagerCaptureConstantsWithConflictingDevices(self):
-    config = config_pb2.ConfigProto(
-        device_count={"CPU": 3}, log_device_placement=True)
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.cached_session(config=config):
       with ops.device("/device:CPU:0"):
         a = constant_op.constant(3.0)
@@ -410,12 +411,10 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
       expected_output = [8.0] * 10
       self.assertDatasetProduces(dataset, expected_output=expected_output)
 
-  # TODO(b/121264236): add eager mode coverage when we have mutli-device setup.
-  @test_util.run_v1_only(
-      "defun will convert RefVariables to ResourceVariables.")
+  # TODO(b/121264236): add eager mode coverage when we have multi-device setup.
+  @test_util.run_v1_only("b/121264236")
   def testSkipEagerRefVariablesWithConflictingDevices(self):
-    config = config_pb2.ConfigProto(
-        device_count={"CPU": 3}, log_device_placement=True)
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.cached_session(config=config):
 
       def func(_):
@@ -425,7 +424,10 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
           b = variables.VariableV1(5.0)
         return math_ops.add(a, b)
 
-      dataset = dataset_ops.Dataset.from_tensors(0).repeat(10).map(func)
+      # Use the legacy function implementation as eager function will convert
+      # RefVariables to ResourceVariables.
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(10)
+      dataset = dataset.map_with_legacy_function(func)
       self.evaluate(variables.global_variables_initializer())
       expected_output = [8.0] * 10
       self.assertDatasetProduces(
@@ -433,11 +435,10 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
           expected_output=expected_output,
           requires_initialization=True)
 
-  # TODO(b/121264236): add eager mode coverage when we have mutli-device setup.
+  # TODO(b/121264236): add eager mode coverage when we have multi-device setup.
   @test_util.run_v1_only("b/121264236")
   def testSkipEagerResourceVariablesWithConflictingDevices(self):
-    config = config_pb2.ConfigProto(
-        device_count={"CPU": 3}, log_device_placement=True)
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.cached_session(config=config):
 
       def func(_):
@@ -453,7 +454,8 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
       dataset = dataset_ops.Dataset.from_tensors(0).repeat(10).map(func)
       expected_error = (
           errors.InvalidArgumentError,
-          "Could not colocate node with its resource and reference inputs")
+          "Cannot place the graph because a reference or resource edge "
+          "connects colocation groups with incompatible assigned devices")
       self.assertDatasetProduces(
           dataset, expected_error=expected_error, requires_initialization=True)
 
@@ -705,6 +707,13 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(get_next())
 
+  def testNestedListMapDataset(self):
+    dataset = dataset_ops.Dataset.from_tensors(
+        [0, 1, 2]).repeat(10).map(lambda a: ([a[1], a[0] + a[2]], a[1]))
+
+    expected_output = [(np.array([1, 2]), 1)] * 10
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
   def testPrefetch(self):
     # We will use this event to test that `_map_py_func()` has been
     # invoked a certain number of times (6 times, to be exact) after
@@ -812,6 +821,7 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
         dataset,
         expected_output=[self.evaluate(_check(_sparse(i))) for i in range(10)])
 
+  @test_util.run_v1_only("b/123904513")
   def testParallelMapOutOfRangeError(self):
     def raising_py_func(i):
       if i == 100:
@@ -1021,6 +1031,18 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(errors.InvalidArgumentError):
       self.evaluate(get_next())
 
+  # NOTE: collection test is specific to graph mode only, no eager coverage.
+  @test_util.run_v1_only("graph specific test")
+  def testSkipEagerCollectionCopy(self):
+    w = variable_scope.get_variable("w", [])
+    self.assertIn(w, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+
+    def func(x):
+      self.assertIn(w, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+      return x
+
+    dataset = dataset_ops.Dataset.from_tensors(constant_op.constant(1.0))
+    dataset.map(func)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
index e6dfc338eba7390de0171b64325f9f5c3953ccdb..a040a64013602b23ddeeb315a2fe3035f5f9be1d 100644
--- a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
+++ b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
 
 
 # memory_profiler might not be available in the OSS version of TensorFlow.
@@ -65,6 +66,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase,
       for _ in six.moves.range(num_iters):
         f()
       increase = memory_profiler.memory_usage(-1)[0] - initial
+      logging.info("Memory increase observed: %f MB" % increase)
       assert increase < increase_threshold_absolute_mb, (
           "Increase is too high. Initial memory usage: %f MB. Increase: %f MB. "
           "Maximum allowed increase: %f") % (initial, increase,
@@ -101,22 +103,41 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase,
         self.evaluate(elem_on_2)
 
   @test_util.run_v1_only("b/121264236")
-  def testEagerNoMemoryLeak(self):
+  def testEagerMemoryUsageWithReset(self):
     if not context.executing_eagerly():
       self.skipTest("Only eager mode test")
     if memory_profiler is None:
       self.skipTest("memory_profiler required to run this test")
 
+    dataset = dataset_ops.Dataset.range(10)
+    multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
+        dataset, ["/cpu:1", "/cpu:2"])
+
+    def f():
+      self.evaluate(multi_device_iterator.get_next())
+      multi_device_iterator._eager_reset()
+
+    self.assertNotIncreasingMemory(
+        f, num_iters=100, increase_threshold_absolute_mb=50)
+
+  @test_util.run_v1_only("b/121264236")
+  def testEagerMemoryUsageWithRecreation(self):
+    if not context.executing_eagerly():
+      self.skipTest("Only eager mode test")
+    if memory_profiler is None:
+      self.skipTest("memory_profiler required to run this test")
+
+    dataset = dataset_ops.Dataset.range(10)
+
     def f():
-      dataset = dataset_ops.Dataset.range(10)
       multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
           dataset, ["/cpu:1", "/cpu:2"])
       self.evaluate(multi_device_iterator.get_next())
       del multi_device_iterator
-      del dataset
 
+    # TODO(b/123316347): Reduce threshold once bug is fixed.
     self.assertNotIncreasingMemory(
-        f, num_iters=100, increase_threshold_absolute_mb=175)
+        f, num_iters=100, increase_threshold_absolute_mb=500)
 
   @test_util.run_v1_only("b/121264236")
   def testOneOnSameDevice(self):
diff --git a/tensorflow/python/data/kernel_tests/optional_test.py b/tensorflow/python/data/kernel_tests/optional_test.py
index 2269bb8724dba40f73ceef8797206adb513a2f60..4fde0aa002b82a3f076eff21d7bc9e5b2b4aee33 100644
--- a/tensorflow/python/data/kernel_tests/optional_test.py
+++ b/tensorflow/python/data/kernel_tests/optional_test.py
@@ -329,7 +329,7 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
     if not works_on_gpu and test.is_gpu_available():
       self.skipTest("Test case not yet supported on GPU.")
     ds = dataset_ops.Dataset.from_tensors(np_value).repeat(3)
-    iterator = ds.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(ds)
     next_elem = iterator_ops.get_next_as_optional(iterator)
     self.assertIsInstance(next_elem, optional_ops.Optional)
     self.assertTrue(
diff --git a/tensorflow/python/data/kernel_tests/reduce_test.py b/tensorflow/python/data/kernel_tests/reduce_test.py
index 93acc1565fd34beb3b0be1eaf0408272c81effed..846d9a6cef9cd362eca269fa44824436766afa2a 100644
--- a/tensorflow/python/data/kernel_tests/reduce_test.py
+++ b/tensorflow/python/data/kernel_tests/reduce_test.py
@@ -22,12 +22,14 @@ import numpy as np
 
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -123,6 +125,71 @@ class ReduceTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertEqual(((i + 1) * i) // 2, result["dense"])
       self.assertSparseValuesEqual(make_sparse_fn(i), result["sparse"])
 
+  def testDatasetSideEffect(self):
+    counter_var = variables.Variable(0)
+
+    def increment_fn(x):
+      counter_var.assign_add(1)
+      return x
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(10).map(increment_fn)
+
+    def reduce_fn(state, value):
+      return state + value
+
+    @function.defun
+    def fn():
+      _ = dataset_fn().reduce(np.int64(0), reduce_fn)
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 10)
+
+  def testSideEffect(self):
+    counter_var = variables.Variable(0)
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(10)
+
+    def reduce_fn(state, value):
+      counter_var.assign_add(1)
+      return state + value
+
+    @function.defun
+    def fn():
+      _ = dataset_fn().reduce(np.int64(0), reduce_fn)
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 10)
+
+  def testAutomaticControlDependencies(self):
+    counter_var = variables.Variable(1)
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(1)
+
+    def reduce1_fn(state, value):
+      counter_var.assign(counter_var + 1)
+      return state + value
+
+    def reduce2_fn(state, value):
+      counter_var.assign(counter_var * 2)
+      return state + value
+
+    @function.defun
+    def fn():
+      _ = dataset_fn().reduce(np.int64(0), reduce1_fn)
+      _ = dataset_fn().reduce(np.int64(0), reduce2_fn)
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 4)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/shard_test.py b/tensorflow/python/data/kernel_tests/shard_test.py
index 928550676d5b05c2e5a459af355acebe2f1f1cc4..9fc70ff60752c02ec626ee5f89606b428fc183fd 100644
--- a/tensorflow/python/data/kernel_tests/shard_test.py
+++ b/tensorflow/python/data/kernel_tests/shard_test.py
@@ -19,11 +19,12 @@ from __future__ import print_function
 
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
-@test_util.run_all_in_graph_and_eager_modes
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class ShardTest(test_base.DatasetTestBase):
 
   def testSimpleCase(self):
@@ -41,20 +42,24 @@ class ShardTest(test_base.DatasetTestBase):
     self.assertDatasetProduces(dataset, expected_output=[0, 5])
 
   def testOffsetGreaterNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(5, 7)
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = dataset_ops.Dataset.range(10).shard(5, 7)
+      self.evaluate(self.getNext(dataset)())
 
   def testNegativeOffset(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(5, -3)
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = dataset_ops.Dataset.range(10).shard(5, -3)
+      self.evaluate(self.getNext(dataset)())
 
   def testNegativeNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(-3, 1)
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = dataset_ops.Dataset.range(10).shard(-3, 1)
+      self.evaluate(self.getNext(dataset)())
 
   def testZeroNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(0, 1)
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = dataset_ops.Dataset.range(10).shard(0, 1)
+      self.evaluate(self.getNext(dataset)())
 
   def testIteratorEndsBeforeFirstElem(self):
     dataset = dataset_ops.Dataset.range(1).shard(5, 2)
@@ -72,5 +77,10 @@ class ShardTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(10).shard(4, 3)
     self.assertDatasetProduces(dataset, expected_output=[3, 7])
 
+  def testNumShardsLargerThanDataset(self):
+    dataset = dataset_ops.Dataset.range(10).shard(20, 5)
+    self.assertDatasetProduces(dataset, expected_output=[5])
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py
index 7aa7f33003cf7195f5ecde406e181b26644c8038..57df29ead57b6f69af1d292fb1480ec90051fa54 100644
--- a/tensorflow/python/data/kernel_tests/test_base.py
+++ b/tensorflow/python/data/kernel_tests/test_base.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import re
 
+from tensorflow.python import tf2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.eager import context
@@ -32,6 +33,13 @@ from tensorflow.python.platform import test
 class DatasetTestBase(test.TestCase):
   """Base class for dataset tests."""
 
+  @classmethod
+  def setUpClass(cls):
+    if tf2.enabled():
+      dataset_ops.Dataset = dataset_ops.DatasetV2
+    else:
+      dataset_ops.Dataset = dataset_ops.DatasetV1
+
   def assertSparseValuesEqual(self, a, b):
     """Asserts that two SparseTensors/SparseTensorValues are equal."""
     self.assertAllEqual(a.indices, b.indices)
@@ -178,6 +186,8 @@ class DatasetTestBase(test.TestCase):
                                    exception_class,
                                    replacements=None):
     """Checks that datasets raise the same error on the first get_next call."""
+    if replacements is None:
+      replacements = []
     next1 = self.getNext(dataset1)
     next2 = self.getNext(dataset2)
     try:
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index 112aa926ae5c1f6cedb967de7943dc8d1ec4048d..a911d8c8195816456f1c0dc5eb422462c7a57b9e 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -74,7 +74,7 @@ py_library(
         "//tensorflow/python/data/util:sparse",
         "//tensorflow/python/data/util:structure",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 766c6d5395e480b44ddf1b92e312c93899e4fd21..e185ba78a0c0c0a34e707ec32cc6ba87c4a22980 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -28,7 +28,6 @@ from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 
 
 from tensorflow.python.compat import compat
-from tensorflow.python.data.experimental.ops import filter_for_shard_ops
 from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.experimental.ops import threading_options
@@ -40,6 +39,7 @@ from tensorflow.python.data.util import sparse
 from tensorflow.python.data.util import structure as structure_lib
 from tensorflow.python.data.util import traverse
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
@@ -48,6 +48,7 @@ from tensorflow.python.framework import random_seed as core_random_seed
 from tensorflow.python.framework import smart_cond
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -86,12 +87,12 @@ class DatasetV2(object):
     Args:
       variant_tensor: A DT_VARIANT tensor that represents the dataset.
     """
-    self._dataset_variant_tensor = variant_tensor
+    self._variant_tensor_attr = variant_tensor
     self._graph_attr = ops.get_default_graph()
 
   @property
   def _variant_tensor(self):
-    return self._dataset_variant_tensor
+    return self._variant_tensor_attr
 
   @_variant_tensor.setter
   def _variant_tensor(self, _):
@@ -143,6 +144,8 @@ class DatasetV2(object):
     return any(
         [input_dataset._has_captured_ref() for input_dataset in self._inputs()])  # pylint: disable=protected-access
 
+  # TODO(jsimsa): Change this to be the transitive closure of functions used
+  # by this dataset and its inputs.
   def _functions(self):
     """Returns a list of functions associated with this dataset.
 
@@ -171,12 +174,12 @@ class DatasetV2(object):
     options = self.options()
     if options.experimental_threading is not None:
       t_options = options.experimental_threading
-      if t_options.private_threadpool_size is not None:
-        dataset = _PrivateThreadPoolDataset(dataset,
-                                            t_options.private_threadpool_size)
       if t_options.max_intra_op_parallelism is not None:
         dataset = _MaxIntraOpParallelismDataset(
             dataset, t_options.max_intra_op_parallelism)
+      if t_options.private_threadpool_size is not None:
+        dataset = _PrivateThreadPoolDataset(dataset,
+                                            t_options.private_threadpool_size)
     static_optimizations = options._static_optimizations()  # pylint: disable=protected-access
     if static_optimizations:
       if self._has_captured_ref():
@@ -747,6 +750,12 @@ class DatasetV2(object):
     elements. For perfect shuffling, a buffer size greater than or equal to the
     full size of the dataset is required.
 
+    For instance, if your dataset contains 10,000 elements but `buffer_size` is
+    set to 1,000, then `shuffle` will initially select a random element from
+    only the first 1,000 elements in the buffer. Once an element is selected,
+    its space in the buffer is replaced by the next (i.e. 1,001-st) element,
+    maintaining the 1,000 element buffer.
+
     Args:
       buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
         number of elements from this dataset from which the new
@@ -805,6 +814,59 @@ class DatasetV2(object):
     """
     return SkipDataset(self, count)
 
+  def shard(self, num_shards, index):
+    """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
+
+    This dataset operator is very useful when running distributed training, as
+    it allows each worker to read a unique subset.
+
+    When reading a single input file, you can skip elements as follows:
+
+    ```python
+    d = tf.data.TFRecordDataset(input_file)
+    d = d.shard(num_workers, worker_index)
+    d = d.repeat(num_epochs)
+    d = d.shuffle(shuffle_buffer_size)
+    d = d.map(parser_fn, num_parallel_calls=num_map_threads)
+    ```
+
+    Important caveats:
+
+    - Be sure to shard before you use any randomizing operator (such as
+      shuffle).
+    - Generally it is best if the shard operator is used early in the dataset
+      pipeline. For example, when reading from a set of TFRecord files, shard
+      before converting the dataset to input samples. This avoids reading every
+      file on every worker. The following is an example of an efficient
+      sharding strategy within a complete pipeline:
+
+    ```python
+    d = Dataset.list_files(pattern)
+    d = d.shard(num_workers, worker_index)
+    d = d.repeat(num_epochs)
+    d = d.shuffle(shuffle_buffer_size)
+    d = d.interleave(tf.data.TFRecordDataset,
+                     cycle_length=num_readers, block_length=1)
+    d = d.map(parser_fn, num_parallel_calls=num_map_threads)
+    ```
+
+    Args:
+      num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
+        shards operating in parallel.
+      index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
+
+    Returns:
+      Dataset: A `Dataset`.
+
+    Raises:
+      InvalidArgumentError: if `num_shards` or `index` are illegal values.
+        Note: error checking is done on a best-effort basis, and errors aren't
+        guaranteed to be caught upon dataset creation. (e.g. providing in a
+        placeholder tensor bypasses the early checking, and will instead result
+        in an error during a session.run call.)
+    """
+    return ShardDataset(self, num_shards, index)
+
   def batch(self, batch_size, drop_remainder=False):
     """Combines consecutive elements of this dataset into batches.
 
@@ -1100,6 +1162,18 @@ class DatasetV2(object):
   def filter(self, predicate):
     """Filters this dataset according to `predicate`.
 
+    ```python
+    d = tf.data.Dataset.from_tensor_slices([1, 2, 3])
+    
+    d = d.filter(lambda x: x < 3) # [1, 2]
+
+    # `tf.math.equal(x, y)` is required for equality comparison
+    def filter_fn(x):
+      return tf.math.equal(x, 1)
+
+    d = d.filter(filter_fn) # [1]
+    ```
+
     Args:
       predicate: A function mapping a nested structure of tensors (having shapes
         and types defined by `self.output_shapes` and `self.output_types`) to a
@@ -1550,60 +1624,9 @@ class DatasetV1(DatasetV2):
   def skip(self, count):
     return DatasetV1Adapter(super(DatasetV1, self).skip(count))
 
-  @deprecation.deprecated(
-      None, "Use `dataset.apply(tf.data.experimental.filter_for_shard(...))`.")
+  @functools.wraps(DatasetV2.shard)
   def shard(self, num_shards, index):
-    """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
-
-    This dataset operator is very useful when running distributed training, as
-    it allows each worker to read a unique subset.
-
-    When reading a single input file, you can skip elements as follows:
-
-    ```python
-    d = tf.data.TFRecordDataset(FLAGS.input_file)
-    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
-    d = d.repeat(FLAGS.num_epochs)
-    d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-    ```
-
-    Important caveats:
-
-    - Be sure to shard before you use any randomizing operator (such as
-      shuffle).
-    - Generally it is best if the shard operator is used early in the dataset
-      pipeline. For example, when reading from a set of TFRecord files, shard
-      before converting the dataset to input samples. This avoids reading every
-      file on every worker. The following is an example of an efficient
-      sharding strategy within a complete pipeline:
-
-    ```python
-    d = Dataset.list_files(FLAGS.pattern)
-    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
-    d = d.repeat(FLAGS.num_epochs)
-    d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.interleave(tf.data.TFRecordDataset,
-                     cycle_length=FLAGS.num_readers, block_length=1)
-    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-    ```
-
-    Args:
-      num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        shards operating in parallel.
-      index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
-
-    Returns:
-      Dataset: A `Dataset`.
-
-    Raises:
-      ValueError: if `num_shards` or `index` are illegal values. Note: error
-        checking is done on a best-effort basis, and errors aren't guaranteed
-        to be caught upon dataset creation. (e.g. providing in a placeholder
-        tensor bypasses the early checking, and will instead result in an error
-        during a session.run call.)
-    """
-    return self.apply(filter_for_shard_ops.filter_for_shard(num_shards, index))
+    return DatasetV1Adapter(super(DatasetV1, self).shard(num_shards, index))
 
   @functools.wraps(DatasetV2.batch)
   def batch(self, batch_size, drop_remainder=False):
@@ -1629,6 +1652,43 @@ class DatasetV1(DatasetV2):
           ParallelMapDataset(
               self, map_func, num_parallel_calls, preserve_cardinality=False))
 
+  @deprecation.deprecated(None, "Use `tf.data.Dataset.map()")
+  def map_with_legacy_function(self, map_func, num_parallel_calls=None):
+    """Maps `map_func` across the elements of this dataset.
+
+    NOTE: This is an escape hatch for existing uses of `map` that do not work
+    with V2 functions. New uses are strongly discouraged and existing uses
+    should migrate to `map` as this method will be removed in V2.
+
+    Args:
+      map_func: A function mapping a nested structure of tensors (having shapes
+        and types defined by `self.output_shapes` and `self.output_types`) to
+        another nested structure of tensors.
+      num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
+        representing the number elements to process asynchronously in parallel.
+        If not specified, elements will be processed sequentially. If the value
+        `tf.data.experimental.AUTOTUNE` is used, then the number of parallel
+        calls is set dynamically based on available CPU.
+
+    Returns:
+      Dataset: A `Dataset`.
+    """
+    if num_parallel_calls is None:
+      return DatasetV1Adapter(
+          MapDataset(
+              self,
+              map_func,
+              preserve_cardinality=False,
+              use_legacy_function=True))
+    else:
+      return DatasetV1Adapter(
+          ParallelMapDataset(
+              self,
+              map_func,
+              num_parallel_calls,
+              preserve_cardinality=False,
+              use_legacy_function=True))
+
   @functools.wraps(DatasetV2.flat_map)
   def flat_map(self, map_func):
     return DatasetV1Adapter(super(DatasetV1, self).flat_map(map_func))
@@ -1646,6 +1706,25 @@ class DatasetV1(DatasetV2):
   def filter(self, predicate):
     return DatasetV1Adapter(super(DatasetV1, self).filter(predicate))
 
+  @deprecation.deprecated(None, "Use `tf.data.Dataset.filter()")
+  def filter_with_legacy_function(self, predicate):
+    """Filters this dataset according to `predicate`.
+
+    NOTE: This is an escape hatch for existing uses of `filter` that do not work
+    with V2 functions. New uses are strongly discouraged and existing uses
+    should migrate to `filter` as this method will be removed in V2.
+
+    Args:
+      predicate: A function mapping a nested structure of tensors (having shapes
+        and types defined by `self.output_shapes` and `self.output_types`) to a
+        scalar `tf.bool` tensor.
+
+    Returns:
+      Dataset: The `Dataset` containing the elements of this dataset for which
+          `predicate` is `True`.
+    """
+    return FilterDataset(self, predicate, use_legacy_function=True)
+
   @functools.wraps(DatasetV2.apply)
   def apply(self, transformation_func):
     return DatasetV1Adapter(super(DatasetV1, self).apply(transformation_func))
@@ -2057,9 +2136,9 @@ structure_lib.Structure._register_custom_converter(DatasetV2,
 
 
 class StructuredFunctionWrapper(object):
-  """A wrapper for `Defun` that supports structured arguments and return values.
-  """
+  """A function wrapper that supports structured arguments and return values."""
 
+  # pylint: disable=protected-access
   def __init__(self,
                func,
                transformation_name,
@@ -2069,6 +2148,7 @@ class StructuredFunctionWrapper(object):
                input_types=None,
                input_structure=None,
                add_to_graph=True,
+               use_legacy_function=False,
                defun_kwargs=None):
     """Creates a new `StructuredFunctionWrapper` for the given function.
 
@@ -2090,9 +2170,12 @@ class StructuredFunctionWrapper(object):
         defines the element types and structure for `func` arguments.
       add_to_graph: (Optional.) If `True`, the function will be added to the
         default graph.
+      use_legacy_function: (Optional.) A boolean that determines whether the
+        function be created using `tensorflow.python.eager.function.defun`
+        (default behavior) or `tensorflow.python.framework.function.Defun`
+        (legacy beheavior).
       defun_kwargs: (Optional.) A dictionary mapping string argument names to
-        values. If supplied, will be passed to `function.Defun()` as keyword
-        arguments.
+        values. If supplied, will be passed to `function` as keyword arguments.
 
     Raises:
       ValueError: If an invalid combination of `dataset`, `input_classes`,
@@ -2112,7 +2195,7 @@ class StructuredFunctionWrapper(object):
           raise ValueError("Either `dataset`, `input_structure` or all of "
                            "`input_classes`, `input_shapes`, and `input_types` "
                            "must be specified.")
-        self._input_structure = dataset._element_structure  # pylint: disable=protected-access
+        self._input_structure = dataset._element_structure
     else:
       if not (dataset is None and input_classes is None and input_shapes is None
               and input_types is None):
@@ -2121,24 +2204,38 @@ class StructuredFunctionWrapper(object):
                          "must be specified.")
       self._input_structure = input_structure
 
-    self._transformation_name = transformation_name
+    if defun_kwargs is None:
+      defun_kwargs = {}
+
     readable_transformation_name = transformation_name.replace(
         ".", "_")[:-2] if len(transformation_name) > 2 else ""
-    self._func_name = "_".join([
-        readable_transformation_name,
-        function_utils.get_func_name(func),
-        str(ops.uid())
-    ])
 
-    if defun_kwargs is None:
-      defun_kwargs = {}
+    func_name = "_".join(
+        [readable_transformation_name,
+         function_utils.get_func_name(func)])
+
+    def _warn_if_collections(transformation_name, graph, initial_length):
+      """Prints a warning if the given graph uses common graph collections.
+
+      NOTE(mrry): Currently a warning is only generated for lookup tables. Any
+      variables created will be automatically hoisted out to the outermost scope
+      using `init_scope()`. Some collections (such as for control-flow contexts)
+      are benign and should not generate a warning.
 
-    @function.Defun(
-        *self._input_structure._flat_types, func_name=self._func_name,  # pylint: disable=protected-access
-        **defun_kwargs)
-    def tf_data_structured_function_wrapper(*args):
+      Args:
+        transformation_name: A human-readable name for the transformation.
+        graph: The graph to check for collections.
+        initial_length: The initial length of the lookup table collection.
+      """
+      length = len(graph.get_collection(ops.GraphKeys.TABLE_INITIALIZERS))
+      if length != initial_length:
+        warnings.warn("Creating lookup tables inside a function passed to %s "
+                      "is not supported. Create each table outside the "
+                      "function, and capture it inside the function to use it."
+                      % transformation_name)
+
+    def _wrapper_helper(*args):
       """Wrapper for passing nested structures to and from tf.data functions."""
-      # pylint: disable=protected-access
       nested_args = self._input_structure._from_compatible_tensor_list(args)
       if not _should_unpack_args(nested_args):
         nested_args = (nested_args,)
@@ -2162,18 +2259,53 @@ class StructuredFunctionWrapper(object):
       except (ValueError, TypeError):
         raise TypeError("Unsupported return value from function passed to "
                         "%s: %s." % (transformation_name, ret))
+      return ret
+
+    if use_legacy_function:
+      func_name = func_name + "_" + str(ops.uid())
+
+      @function.Defun(
+          *self._input_structure._flat_types,
+          func_name=func_name,
+          **defun_kwargs)
+      def wrapper_fn(*args):
+        ret = _wrapper_helper(*args)
+        _warn_if_collections(transformation_name, ops.get_default_graph(), 0)
+        return self._output_structure._to_tensor_list(ret)
+
+      self._function = wrapper_fn
+      if add_to_graph:
+        self._function.add_to_graph(ops.get_default_graph())
+      else:
+        # Use the private method that will execute `wrapper_fn` but delay adding
+        # it to the graph in case (e.g.) we need to rerun the function.
+        self._function._create_definition_if_needed()
+    else:
+      defun_kwargs.update({"func_name": func_name})
+
+      @eager_function.defun_with_attributes(
+          input_signature=[
+              tensor_spec.TensorSpec(input_shape, input_type)  # pylint: disable=g-complex-comprehension
+              for input_shape, input_type in zip(
+                  self._input_structure._flat_shapes,
+                  self._input_structure._flat_types)
+          ],
+          attributes=defun_kwargs)
+      def wrapper_fn(*args):  # pylint: disable=missing-docstring
+        ret = _wrapper_helper(*args)
+        ret = self._output_structure._to_tensor_list(ret)
+        return [ops.convert_to_tensor(t) for t in ret]
 
-      _warn_if_collections(transformation_name)
-      return self._output_structure._to_tensor_list(ret)
+      initial_length = len(ops.get_default_graph().get_collection(
+          ops.GraphKeys.TABLE_INITIALIZERS))
 
-    self._function = tf_data_structured_function_wrapper
-    if add_to_graph:
-      self._function.add_to_graph(ops.get_default_graph())
-    else:
-      # Use the private method that will execute
-      # `tf_data_structured_function_wrapper` but delay adding it to the graph
-      # in case (e.g.) we need to rerun the function.
-      self._function._create_definition_if_needed()  # pylint: disable=protected-access
+      self._function = wrapper_fn._get_concrete_function_internal()
+      if add_to_graph:
+        self._function.add_to_graph(ops.get_default_graph())
+
+      _warn_if_collections(transformation_name, self._function.graph,
+                           initial_length)
+  # pylint: enable=protected-access
 
   @property
   def output_structure(self):
@@ -2504,6 +2636,23 @@ class SkipDataset(UnaryUnchangedStructureDataset):
     super(SkipDataset, self).__init__(input_dataset, variant_tensor)
 
 
+class ShardDataset(UnaryUnchangedStructureDataset):
+  """A `Dataset` for sharding its input."""
+
+  def __init__(self, input_dataset, num_shards, index):
+    """See `Dataset.shard()` for details."""
+    self._input_dataset = input_dataset
+    self._num_shards = ops.convert_to_tensor(
+        num_shards, dtype=dtypes.int64, name="num_shards")
+    self._index = ops.convert_to_tensor(index, dtype=dtypes.int64, name="index")
+    variant_tensor = gen_dataset_ops.shard_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
+        num_shards=self._num_shards,
+        index=self._index,
+        **flat_structure(self))
+    super(ShardDataset, self).__init__(input_dataset, variant_tensor)
+
+
 class BatchDataset(UnaryDataset):
   """A `Dataset` that batches contiguous elements from its input."""
 
@@ -2729,24 +2878,6 @@ def _should_unpack_args(args):
   return type(args) is tuple  # pylint: disable=unidiomatic-typecheck
 
 
-def _warn_if_collections(transformation_name):
-  """Prints warning message if the current graph uses common graph collections.
-
-  NOTE(mrry): Currently a warning is only generated for lookup tables. Any
-  variables created will be automatically hoisted out to the outermost scope
-  using `init_scope()`. Some collections (such as for control-flow contexts)
-  are benign and should not generate a warning.
-
-  Args:
-    transformation_name: A human-readable name for the transformation.
-  """
-  if ops.get_default_graph().get_collection(ops.GraphKeys.TABLE_INITIALIZERS):
-    warnings.warn("Creating lookup tables inside a function passed to %s is not"
-                  " supported. Create each table outside the function, and "
-                  "capture it inside the function to use it."
-                  % transformation_name)
-
-
 class MapDataset(UnaryDataset):
   """A `Dataset` that maps a function over elements in its input."""
 
@@ -2754,13 +2885,17 @@ class MapDataset(UnaryDataset):
                input_dataset,
                map_func,
                use_inter_op_parallelism=True,
-               preserve_cardinality=False):
+               preserve_cardinality=False,
+               use_legacy_function=False):
     """See `Dataset.map()` for details."""
     self._input_dataset = input_dataset
     self._use_inter_op_parallelism = use_inter_op_parallelism
     self._preserve_cardinality = preserve_cardinality
     self._map_func = StructuredFunctionWrapper(
-        map_func, self._transformation_name(), dataset=input_dataset)
+        map_func,
+        self._transformation_name(),
+        dataset=input_dataset,
+        use_legacy_function=use_legacy_function)
     variant_tensor = gen_dataset_ops.map_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._map_func.function.captured_inputs,
@@ -2789,12 +2924,16 @@ class ParallelMapDataset(UnaryDataset):
                map_func,
                num_parallel_calls,
                use_inter_op_parallelism=True,
-               preserve_cardinality=False):
+               preserve_cardinality=False,
+               use_legacy_function=False):
     """See `Dataset.map()` for details."""
     self._input_dataset = input_dataset
     self._use_inter_op_parallelism = use_inter_op_parallelism
     self._map_func = StructuredFunctionWrapper(
-        map_func, self._transformation_name(), dataset=input_dataset)
+        map_func,
+        self._transformation_name(),
+        dataset=input_dataset,
+        use_legacy_function=use_legacy_function)
     self._num_parallel_calls = ops.convert_to_tensor(
         num_parallel_calls, dtype=dtypes.int32, name="num_parallel_calls")
     self._preserve_cardinality = preserve_cardinality
@@ -2929,11 +3068,14 @@ class ParallelInterleaveDataset(UnaryDataset):
 class FilterDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that filters its input according to a predicate function."""
 
-  def __init__(self, input_dataset, predicate):
+  def __init__(self, input_dataset, predicate, use_legacy_function=False):
     """See `Dataset.filter()` for details."""
     self._input_dataset = input_dataset
     wrapped_func = StructuredFunctionWrapper(
-        predicate, self._transformation_name(), dataset=input_dataset)
+        predicate,
+        self._transformation_name(),
+        dataset=input_dataset,
+        use_legacy_function=use_legacy_function)
     if not wrapped_func.output_structure.is_compatible_with(
         structure_lib.TensorStructure(dtypes.bool, [])):
       raise ValueError("`predicate` must return a scalar boolean tensor.")
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index d6fb73813cd06e440d69f900e6b1076606a068c0..efa90209512b09b2b9727275b6e6ef3dfe7df529 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -31,8 +31,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.saver import BaseSaverBuilder
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -68,7 +68,7 @@ def _device_stack_is_empty():
 
 
 @tf_export(v1=["data.Iterator"])
-class Iterator(checkpointable.Checkpointable):
+class Iterator(trackable.Trackable):
   """Represents the state of iterating through a `Dataset`."""
 
   def __init__(self, iterator_resource, initializer, output_types,
@@ -491,7 +491,7 @@ def _generate_shared_name(prefix):
   return "{}{}".format(prefix, uid)
 
 
-class EagerIterator(checkpointable.Checkpointable):
+class EagerIterator(trackable.Trackable):
   """An iterator producing tf.Tensor objects from a tf.data.Dataset."""
 
   def __init__(self, dataset):
@@ -641,7 +641,7 @@ class EagerIterator(checkpointable.Checkpointable):
     return {"ITERATOR": _saveable_factory}
 
 
-# TODO(b/71645805): Expose checkpointable stateful objects from dataset
+# TODO(b/71645805): Expose trackable stateful objects from dataset
 # attributes(potential).
 class _IteratorSaveable(BaseSaverBuilder.SaveableObject):
   """SaveableObject for saving/restoring iterator state."""
diff --git a/tensorflow/python/data/ops/multi_device_iterator_ops.py b/tensorflow/python/data/ops/multi_device_iterator_ops.py
index 051bc458433aa970f35c59a3366e0ec0b6b3b387..2592282104877a13e3f28290eb80801382667249 100644
--- a/tensorflow/python/data/ops/multi_device_iterator_ops.py
+++ b/tensorflow/python/data/ops/multi_device_iterator_ops.py
@@ -89,6 +89,11 @@ class _PerDeviceGenerator(dataset_ops.DatasetV2):
     self._next_func = _remote_next_func._get_concrete_function_internal()  # pylint: disable=protected-access
     self._next_captured_args = self._next_func.captured_inputs
 
+    self._incarnation_id_index = -1
+    for i, arg in enumerate(self._next_captured_args):
+      if arg == incarnation_id:
+        self._incarnation_id_index = i
+
     @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _finalize_func(unused_string_handle):
       return array_ops.constant(0, dtypes.int64)
@@ -126,6 +131,49 @@ class _PerDeviceGenerator(dataset_ops.DatasetV2):
     return self._structure
 
 
+class _ReincarnatedPerDeviceGenerator(dataset_ops.DatasetV2):
+  """Creates a _PerDeviceGenerator-like dataset with a new incarnation_id.
+
+  Re-uses the functions from the provided per_device_dataset and just switches
+  out the function argument corresponding to the incarnation_id.
+  """
+
+  def __init__(self, per_device_dataset, incarnation_id):
+    # pylint: disable=protected-access
+    self._structure = per_device_dataset._structure
+
+    self._init_func = per_device_dataset._init_func
+    self._init_captured_args = self._init_func.captured_inputs
+
+    self._next_func = per_device_dataset._next_func
+    self._next_captured_args = per_device_dataset._next_captured_args
+    # The captured arguments to the next_func are string_handle, incarnation_id.
+    # We update the incarnation id to the new one.
+    self._next_captured_args[
+        per_device_dataset._incarnation_id_index] = incarnation_id
+
+    self._finalize_func = per_device_dataset._finalize_func
+    self._finalize_captured_args = per_device_dataset._finalize_captured_args
+
+    variant_tensor = gen_dataset_ops.generator_dataset(
+        self._init_captured_args,
+        self._next_captured_args,
+        self._finalize_captured_args,
+        init_func=self._init_func,
+        next_func=self._next_func,
+        finalize_func=self._finalize_func,
+        **dataset_ops.flat_structure(self))
+    super(_ReincarnatedPerDeviceGenerator, self).__init__(variant_tensor)
+
+  def _inputs(self):
+    # TODO(b/116506223): Determine which datasets should be used as inputs here.
+    return []
+
+  @property
+  def _element_structure(self):
+    return self._structure
+
+
 class MultiDeviceIterator(object):
   """An iterator over multiple devices."""
 
@@ -156,9 +204,11 @@ class MultiDeviceIterator(object):
     self._devices = devices
     self._source_device = source_device
     self._source_device_tensor = ops.convert_to_tensor(source_device)
+    self._max_buffer_size = max_buffer_size
+    self._prefetch_buffer_size = prefetch_buffer_size
 
-    if prefetch_buffer_size > max_buffer_size:
-      max_buffer_size = prefetch_buffer_size
+    if self._prefetch_buffer_size > self._max_buffer_size:
+      self._max_buffer_size = self._prefetch_buffer_size
 
     # Create the MultiDeviceIterator.
     with ops.device(self._source_device):
@@ -171,7 +221,7 @@ class MultiDeviceIterator(object):
               devices=self._devices,
               shared_name=shared_name,
               container="",
-              **dataset_ops.flat_structure(dataset)))
+              **dataset_ops.flat_structure(self._dataset)))
       if context.executing_eagerly():
         # Delete the resource when this object is deleted
         self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
@@ -183,7 +233,15 @@ class MultiDeviceIterator(object):
       self._incarnation_id = gen_dataset_ops.multi_device_iterator_init(
           self._dataset._variant_tensor,  # pylint: disable=protected-access
           self._multi_device_iterator_resource,
-          max_buffer_size=max_buffer_size)
+          max_buffer_size=self._max_buffer_size)
+
+    self._prototype_device_datasets = []
+    for i, device in enumerate(self._devices):
+      with ops.device(device):
+        ds = _PerDeviceGenerator(
+            i, self._multi_device_iterator_resource, self._incarnation_id,
+            self._source_device_tensor, self._dataset._element_structure)  # pylint: disable=protected-access
+        self._prototype_device_datasets.append(ds)
 
     # TODO(rohanj): Explore the possibility of the MultiDeviceIterator to
     # initialize the device side of the pipeline. This would allow the
@@ -193,17 +251,7 @@ class MultiDeviceIterator(object):
     self._device_iterators = []
     for i, device in enumerate(self._devices):
       with ops.device(device):
-        ds = _PerDeviceGenerator(
-            i, self._multi_device_iterator_resource, self._incarnation_id,
-            self._source_device_tensor, dataset._element_structure)  # pylint: disable=protected-access
-        if prefetch_buffer_size > 0:
-          ds = ds.prefetch(prefetch_buffer_size)
-        # TODO(jsimsa): Enable auto-tuning and optimizations when supported for
-        # non-CPU devices.
-        options = dataset_ops.Options()
-        options.experimental_autotune = False
-        options.experimental_optimization.apply_default_optimizations = False
-        ds = ds.with_options(options)
+        ds = self._create_device_dataset(i)
         if context.executing_eagerly():
           self._device_iterators.append(dataset_ops.make_one_shot_iterator(ds))
         else:
@@ -216,6 +264,20 @@ class MultiDeviceIterator(object):
       ]
       self._initializer = control_flow_ops.group(*device_iterator_initializers)
 
+  def _create_device_dataset(self, i):
+    """Uses _prototype_device_datasets[i] to build a dataset for the device."""
+    ds = self._prototype_device_datasets[i]
+    ds = _ReincarnatedPerDeviceGenerator(ds, self._incarnation_id)
+    if self._prefetch_buffer_size > 0:
+      ds = ds.prefetch(self._prefetch_buffer_size)
+    # TODO(jsimsa): Enable auto-tuning and optimizations when supported for
+    # non-CPU devices.
+    options = dataset_ops.Options()
+    options.experimental_autotune = False
+    options.experimental_optimization.apply_default_optimizations = False
+    ds = ds.with_options(options)
+    return ds
+
   def get_next(self, device=None):
     """Returns the next element given a `device`, else returns all in a list."""
     if device is not None:
@@ -242,6 +304,23 @@ class MultiDeviceIterator(object):
       return control_flow_ops.no_op()
     return self._initializer
 
+  def _eager_reset(self):
+    """Resets the MultiDeviceIterator in eager mode."""
+    if not context.executing_eagerly():
+      raise ValueError("Eager reset is only supported in eager mode.")
+    # pylint: disable=protected-access
+    self._incarnation_id = gen_dataset_ops.multi_device_iterator_init(
+        self._dataset._variant_tensor,
+        self._multi_device_iterator_resource,
+        max_buffer_size=self._max_buffer_size)
+    for i, device in enumerate(self._devices):
+      with ops.device(device):
+        ds = self._create_device_dataset(i)
+        # Reset the device iterator resources with the new dataset.
+        ds_variant = ds._variant_tensor
+        gen_dataset_ops.make_iterator(ds_variant,
+                                      self._device_iterators[i]._resource)
+
   @property
   def output_types(self):
     return self._dataset.output_types
diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index e5abc654da77cd9409f52d3ba5c8868c0916c712..ebfd8af34233516d3f447d03735371b7e2be8f22 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -46,7 +46,7 @@ from tensorflow.python.framework import sparse_tensor as _sparse_tensor
 def _sorted(dict_):
   """Returns a sorted list of the dict keys, with error if keys not sortable."""
   try:
-    return sorted(_six.iterkeys(dict_))
+    return sorted(list(dict_))
   except TypeError:
     raise TypeError("nest only supports dicts with sortable keys.")
 
@@ -68,7 +68,7 @@ def _sequence_like(instance, args):
     # ordered and plain dicts (e.g., flattening a dict but using a
     # corresponding `OrderedDict` to pack it back).
     result = dict(zip(_sorted(instance), args))
-    return type(instance)((key, result[key]) for key in _six.iterkeys(instance))
+    return type(instance)((key, result[key]) for key in instance)
   elif (isinstance(instance, tuple) and
         hasattr(instance, "_fields") and
         isinstance(instance._fields, _collections.Sequence) and
@@ -317,8 +317,7 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
         raise ValueError(
             "The two structures don't have the same keys. Input "
             "structure has keys %s, while shallow structure has keys %s." %
-            (list(_six.iterkeys(input_tree)),
-             list(_six.iterkeys(shallow_tree))))
+            (list(input_tree), list(shallow_tree)))
       input_tree = list(sorted(_six.iteritems(input_tree)))
       shallow_tree = list(sorted(_six.iteritems(shallow_tree)))
 
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 4634f505f67dc5e12a08c8774457375a0adcdfb0..3b1b214a6f458430876f9467a51fae6b8054c46a 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -569,6 +569,7 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
@@ -657,6 +658,7 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
@@ -816,6 +818,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["notsan"],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -833,6 +836,7 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -850,6 +854,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
@@ -975,6 +980,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["no_windows"],  # TODO: needs investigation on Windows
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
@@ -1015,6 +1021,13 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    tags = [
+        "manual",
+        "no_pip",
+        "no_windows",
+        "notap",
+    ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -1042,6 +1055,7 @@ cuda_py_test(
         "notsan",
         "oss_serial",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -1067,6 +1081,7 @@ cuda_py_test(
         "optonly",  # Test flaky (b/80130873)
         "oss_serial",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 # TODO(cais): Run the test in OSS, perhaps through a sh_test.
@@ -1095,6 +1110,7 @@ cuda_py_test(
         "no_windows",
         "notsan",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
diff --git a/tensorflow/python/debug/cli/stepper_cli.py b/tensorflow/python/debug/cli/stepper_cli.py
index 94eb2754da21b2a6c66271f53a2a0917deb25515..fe1a012a5444a0140edd15b9e66a4de0449a5e47 100644
--- a/tensorflow/python/debug/cli/stepper_cli.py
+++ b/tensorflow/python/debug/cli/stepper_cli.py
@@ -251,6 +251,9 @@ class NodeStepperCLI(object):
       lines.extend(
           ["Topologically-sorted transitive input(s) and fetch(es):", ""])
 
+    output = debugger_cli_common.rich_text_lines_from_rich_line_list(lines)
+    self._add_deprecation_warning(output)
+
     for i, element_name in enumerate(self._sorted_nodes):
       if i < index_range[0] or i >= index_range[1]:
         continue
@@ -269,15 +272,36 @@ class NodeStepperCLI(object):
           override_names,
           dirty_variable_names)
 
-      lines.append(node_prefix + "] " + element_name)
-
-    output = debugger_cli_common.rich_text_lines_from_rich_line_list(lines)
+      output.append_rich_line(node_prefix + "] " + element_name)
 
     if verbose:
       output.extend(self._node_status_label_legend())
 
     return output
 
+  def _add_deprecation_warning(self, message):
+    """Add deprecation warning as RichTextLines."""
+    color = "yellow"
+    message.append_rich_line(
+        debugger_cli_common.RichLine(
+            "WARNING: the invoke_stepper feature of tfdbg has been deprecated ",
+            color))
+    message.append_rich_line(
+        debugger_cli_common.RichLine(
+            "and will be removed in the next release of TensorFlow.",
+            color))
+    message.append_rich_line(debugger_cli_common.RichLine("", color))
+    message.append_rich_line(
+        debugger_cli_common.RichLine(
+            "There now exist better alternatives of stepping debugging, "
+            "including:",
+            color))
+    message.append_rich_line(
+        debugger_cli_common.RichLine("- TensorBoard Debugger Plugin", color))
+    message.append_rich_line(
+        debugger_cli_common.RichLine("- Eager Execution", color))
+    message.append_rich_line(debugger_cli_common.RichLine("", color))
+
   def _get_status_labels(self,
                          element_name,
                          handle_node_names,
diff --git a/tensorflow/python/debug/lib/source_remote_test.py b/tensorflow/python/debug/lib/source_remote_test.py
index 29add425e946aadfe941c73e9f9cef4aef3c8a9c..dce400c9ab0b6be3cabaea7c465baa1a6d2f471d 100644
--- a/tensorflow/python/debug/lib/source_remote_test.py
+++ b/tensorflow/python/debug/lib/source_remote_test.py
@@ -48,7 +48,8 @@ class SendTracebacksTest(test_util.TensorFlowTestCase):
     test_util.TensorFlowTestCase.setUpClass()
     (cls._server_port, cls._debug_server_url, cls._server_dump_dir,
      cls._server_thread,
-     cls._server) = grpc_debug_test_server.start_server_on_separate_thread()
+     cls._server) = grpc_debug_test_server.start_server_on_separate_thread(
+         poll_server=True)
     cls._server_address = "localhost:%d" % cls._server_port
     (cls._server_port_2, cls._debug_server_url_2, cls._server_dump_dir_2,
      cls._server_thread_2,
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index bc465e1fdeaabbf6b8d339026c870701f52c33c0..feeae8d9f414c8be4ed053cc0150c98c4173ddd1 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -60,6 +60,7 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python/eager:context",
+        "//tensorflow/tools/docs:doc_controls",
         "@six_archive//:six",
     ],
 )
@@ -110,8 +111,11 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":cross_device_ops",
         ":distribute_lib",
         ":mirrored_strategy",
+        ":one_device_strategy",
+        "//tensorflow/python/distribute/experimental",
     ],
 )
 
@@ -138,7 +142,6 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/data",
         "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
-        "//tensorflow/python/ops/losses",
         "//tensorflow/tools/docs:doc_controls",
     ],
 )
@@ -458,7 +461,7 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/distribute/__init__.py b/tensorflow/python/distribute/__init__.py
index 54e539ecdd291c272919bccc23df8fc25c643970..f9d0a95ea580a8bb125e6610c232d1eabfe105a6 100644
--- a/tensorflow/python/distribute/__init__.py
+++ b/tensorflow/python/distribute/__init__.py
@@ -20,7 +20,11 @@ from __future__ import print_function
 
 # pylint: disable=unused-import
 from tensorflow.python.distribute import cluster_resolver
+from tensorflow.python.distribute import cross_device_ops
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.distribute import one_device_strategy
+from tensorflow.python.distribute.experimental import collective_all_reduce_strategy
+from tensorflow.python.distribute.experimental import parameter_server_strategy
 # pylint: enable=unused-import
diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
index 48bfc60651762f8e87b24699c16739e0a14efab6..0cd823916022f037cf8c9669cc73dd242798d986 100644
--- a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
@@ -77,17 +77,17 @@ class TFConfigClusterResolver(ClusterResolver):
   def task_type(self):
     if self._task_type is None:
       task_info = _get_value_in_tfconfig(_TASK_KEY, {})
-      return task_info['type'] if 'type' in task_info else None
+      return str(task_info['type']) if 'type' in task_info else None
     else:
-      return self._task_type
+      return str(self._task_type)
 
   @property
   def task_id(self):
     if self._task_type is None:
       task_info = _get_value_in_tfconfig(_TASK_KEY, {})
-      return task_info['index'] if 'index' in task_info else None
+      return int(task_info['index']) if 'index' in task_info else None
     else:
-      return self._task_id
+      return int(self._task_id)
 
   @task_type.setter
   def task_type(self, task_type):
diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
index 97a5eb685fa633f107aa8c8cafec9944f7812300..65e75d4dd3486c33868565bc4e8a34ec6bde3ce1 100644
--- a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
@@ -183,6 +183,40 @@ class TFConfigClusterResolverTest(test.TestCase):
     self.assertEqual(1, cluster_resolver.task_id)
     self.assertEqual('test', cluster_resolver.rpc_layer)
 
+  def testTaskTypeCastToString(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "123456": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "rpc_layer": "grpc",
+      "task": {
+        "type": 123456,
+        "index": 0
+      }
+    }
+    """
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual('123456', cluster_resolver.task_type)
+
+  def testTaskIndexCastToInteger(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "rpc_layer": "grpc",
+      "task": {
+        "type": "ps",
+        "index": "1"
+      }
+    }
+    """
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual(1, cluster_resolver.task_id)
+
   def testZeroItemsInClusterSpecMasterRead(self):
     os.environ['TF_CONFIG'] = """
     {}
diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy.py b/tensorflow/python/distribute/collective_all_reduce_strategy.py
index 74d30302706c32a64317a4d7206b44740a696439..a9d092784d9c841124e2c8af6f2a39b53ad4f498 100644
--- a/tensorflow/python/distribute/collective_all_reduce_strategy.py
+++ b/tensorflow/python/distribute/collective_all_reduce_strategy.py
@@ -29,6 +29,7 @@ from tensorflow.python.distribute import input_lib
 from tensorflow.python.distribute import mirrored_strategy
 from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.distribute import numpy_dataset
+from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
 from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
@@ -38,9 +39,11 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import collective_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(yuefengz): support in-graph replication.
+@tf_export("distribute.experimental.MultiWorkerMirroredStrategy")
 class CollectiveAllReduceStrategy(distribute_lib.DistributionStrategy):
   """Distribution strategy that uses collective ops for all-reduce.
 
@@ -354,6 +357,29 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
 
     return updated_config
 
+  def _reduce_to(self, reduce_op, value, destinations):
+    if (isinstance(value, values.Mirrored) and
+        reduce_op == reduce_util.ReduceOp.MEAN):
+      return value
+    assert not isinstance(value, values.Mirrored)
+
+    if (isinstance(value, values.DistributedValues) and
+        len(self.worker_devices) == 1):
+      value = value.values[0]
+
+    # When there are multiple workers, we need to reduce across workers using
+    # collective ops.
+    if (not isinstance(value, values.DistributedValues) and
+        self._num_workers == 1):
+      # This function handles reducing values that are not PerReplica or
+      # Mirrored values. For example, the same value could be present on all
+      # replicas in which case `value` would be a single value or value could
+      # be 0.
+      return cross_device_ops_lib.reduce_non_distributed_value(
+          reduce_op, self._device_map, value, destinations)
+    return self._get_cross_device_ops().reduce(
+        reduce_op, value, destinations=destinations)
+
   @property
   def experimental_between_graph(self):
     return True
diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py
index 9729302c6dc1e22772c1a80a25eff17720c50994..afb7a35c0a29ed6e20816d2d062af8fb68b897cf 100644
--- a/tensorflow/python/distribute/cross_device_ops.py
+++ b/tensorflow/python/distribute/cross_device_ops.py
@@ -32,6 +32,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+from tensorflow.tools.docs import doc_controls
 
 
 def check_destinations(destinations):
@@ -114,6 +116,9 @@ def _make_tensor_into_per_replica(input_tensor):
 def _normalize_value_destination_pairs(value_destination_pairs):
   """Converts each tensor into a PerReplica object in the input list."""
   result = []
+
+  value_destination_pairs = list(value_destination_pairs)
+
   if not isinstance(value_destination_pairs, (list, tuple)):
     raise ValueError("`value_destination_pairs` should be a list or tuple")
   for pair in value_destination_pairs:
@@ -215,6 +220,7 @@ def _simple_reduce(per_replica_value, reduce_to_device, accumulation_fn,
   return reduced
 
 
+@tf_export("distribute.CrossDeviceOps")
 class CrossDeviceOps(object):
   """Base class for cross-device reduction and broadcasting algorithms."""
 
@@ -244,7 +250,8 @@ class CrossDeviceOps(object):
       per_replica_value = _make_tensor_into_per_replica(per_replica_value)
 
     validate_destinations(destinations)
-    return self._reduce(reduce_op, per_replica_value, destinations)
+    return self.reduce_implementation(reduce_op, per_replica_value,
+                                      destinations)
 
   def batch_reduce(self, reduce_op, value_destination_pairs):
     """Reduce PerReplica objects in a batch.
@@ -265,6 +272,8 @@ class CrossDeviceOps(object):
       ValueError: if `value_destination_pairs` is not a list or a tuple of
         tuples of PerReplica objects and destinations
     """
+    # TODO(yuefengz): if destinations are different, split into several
+    # `_batch_reduce` invocations.
     if not _validate_value_destination_pairs(value_destination_pairs):
       # If the first element of each pair is a tensor, we try to turn it into a
       # PerReplica object.
@@ -274,7 +283,7 @@ class CrossDeviceOps(object):
     for _, d in value_destination_pairs:
       validate_destinations(d)
 
-    return self._batch_reduce(reduce_op, value_destination_pairs)
+    return self.batch_reduce_implementation(reduce_op, value_destination_pairs)
 
   def broadcast(self, tensor, destinations):
     """Broadcast the `tensor` to destinations.
@@ -287,41 +296,93 @@ class CrossDeviceOps(object):
       a Mirrored object.
     """
     validate_destinations(destinations)
-    return self._broadcast(tensor, destinations)
+    return self.broadcast_implementation(tensor, destinations)
+
+  @doc_controls.for_subclass_implementers
+  def reduce_implementation(self, reduce_op, per_replica_value, destinations):
+    """The implementation of reduce of `per_replica_value` to `destinations`.
+
+    It runs the reduction operation defined by `reduce_op` and put the
+    result on `destinations`.
+
+    Args:
+      reduce_op: Indicates how per_replica_value will be reduced. Accepted
+        values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
+      per_replica_value: a PerReplica object or a tensor with device set.
+      destinations: the reduction destinations.
+
+    Returns:
+      a Mirrored object.
 
-  def _reduce(self, reduce_op, per_replica_value, destinations):
+    Raises:
+      ValueError: if per_replica_value can't be converted to a PerReplica
+        object.
+    """
     raise NotImplementedError(
         "_reduce method must be implemented in descendants.")
 
-  def _batch_reduce(self, reduce_op, value_destination_pairs):
+  @doc_controls.for_subclass_implementers
+  def batch_reduce_implementation(self, reduce_op, value_destination_pairs):
+    """Implementation of reduce PerReplica objects in a batch.
+
+    Reduce each first element in `value_destination_pairs` to each second
+    element which indicates the destinations.
+
+    Args:
+      reduce_op: Indicates how per_replica_value will be reduced. Accepted
+        values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
+      value_destination_pairs: a list or a tuple of tuples of PerReplica objects
+        (or tensors with device set if there is one device) and destinations.
+
+    Returns:
+      a list of Mirrored objects.
+
+    Raises:
+      ValueError: if `value_destination_pairs` is not a list or a tuple of
+        tuples of PerReplica objects and destinations
+    """
     raise NotImplementedError(
         "_batch_reduce method must be implemented in descendants.")
 
-  def _broadcast(self, tensor, destinations):
+  @doc_controls.for_subclass_implementers
+  def broadcast_implementation(self, tensor, destinations):
+    """Implementation of broadcast the `tensor` to destinations.
+
+    Args:
+      tensor: the tensor to broadcast.
+      destinations: the broadcast destinations.
+
+    Returns:
+      a Mirrored object.
+    """
     return simple_broadcast(tensor, destinations, always_mirrored=True)
 
 
-class ReductionToOneDeviceCrossDeviceOps(CrossDeviceOps):
+@tf_export("distribute.ReductionToOneDevice")
+class ReductionToOneDevice(CrossDeviceOps):
   """Always do reduction to one device first and then do broadcasting.
 
     Batch reduction is done by reduction on each element one by one.
   """
 
-  def __init__(self, reduce_to_device=None, accumulation_fn=math_ops.add_n):
+  def __init__(self, reduce_to_device=None, accumulation_fn=None):
     """Constructor.
 
     Args:
       reduce_to_device: the intermediate device to reduce to. If None, reduce
         to the first device in `destinations` of the reduce() method.
-      accumulation_fn: a function that does accumulation.
+      accumulation_fn: a function that does accumulation.  If None, then
+        `tf.math.add_n` is used.
     """
     self.reduce_to_device = reduce_to_device
-    self.accumulation_fn = accumulation_fn
-    super(ReductionToOneDeviceCrossDeviceOps, self).__init__()
+    self.accumulation_fn = accumulation_fn or math_ops.add_n
+    super(ReductionToOneDevice, self).__init__()
 
-  def _reduce(self, reduce_op, per_replica_value, destinations):
-    assert check_destinations(destinations)
-    devices = get_devices_from(destinations)
+  def reduce_implementation(self, reduce_op, per_replica_value, destinations):
+    if check_destinations(destinations):
+      devices = get_devices_from(destinations)
+    else:
+      devices = get_devices_from(per_replica_value)
     reduce_to_device = self.reduce_to_device or devices[0]
     logging.log_first_n(
         logging.INFO,
@@ -330,9 +391,9 @@ class ReductionToOneDeviceCrossDeviceOps(CrossDeviceOps):
                              self.accumulation_fn, reduce_op)
     return self.broadcast(reduced, destinations)
 
-  def _batch_reduce(self, reduce_op, value_destination_pairs):
+  def batch_reduce_implementation(self, reduce_op, value_destination_pairs):
     return [
-        self._reduce(reduce_op, t, destinations=v)
+        self.reduce_implementation(reduce_op, t, destinations=v)
         for t, v in value_destination_pairs
     ]
 
@@ -399,11 +460,11 @@ def _ungroup_and_make_mirrored(grouped_reduced,
   return [value_lib.Mirrored(device_map, v, logical_device) for v in index]
 
 
-class ConcatAndSplitPacker(object):
+class _ConcatAndSplitPacker(object):
   """Concatenate and split tensors for reduction."""
 
   def __init__(self, num_packs=1):
-    """Initialize the ConcatAndSplitPacker object.
+    """Initialize the _ConcatAndSplitPacker object.
 
     Args:
       num_packs: specifies the number of split packs that will be
@@ -503,13 +564,13 @@ class ConcatAndSplitPacker(object):
     return aggregated_device_grads
 
 
-class AggregateSmallTensorPacker(object):
+class _AggregateSmallTensorPacker(object):
   """Concatenate small gradient tensors together for reduction."""
 
   def __init__(self,
                agg_small_grads_max_bytes=1048576,
                agg_small_grads_max_group=16):
-    """Initialize the AggregateSmallTensorPacker object.
+    """Initialize the _AggregateSmallTensorPacker object.
 
     Args:
       agg_small_grads_max_bytes: largest tensor eligible for aggregation,
@@ -549,11 +610,11 @@ def _pack_tensors(device_grads,
                   agg_small_grads_max_group=0):
   """Pack tensors if specified."""
   if num_packs > 0:
-    tensor_packer = ConcatAndSplitPacker(num_packs)
+    tensor_packer = _ConcatAndSplitPacker(num_packs)
     device_grad_packs = tensor_packer.pack(device_grads)
   elif agg_small_grads_max_bytes > 0 and agg_small_grads_max_group > 0:
-    tensor_packer = AggregateSmallTensorPacker(agg_small_grads_max_bytes,
-                                               agg_small_grads_max_group)
+    tensor_packer = _AggregateSmallTensorPacker(agg_small_grads_max_bytes,
+                                                agg_small_grads_max_group)
     device_grad_packs = tensor_packer.pack(device_grads)
   else:
     tensor_packer = None
@@ -569,7 +630,7 @@ def _unpack_tensors(reduced, tensor_packer=None):
 
 
 class AllReduceCrossDeviceOps(CrossDeviceOps):
-  """Reduction using all reduce."""
+  """Reduction using all-reduce."""
 
   def __init__(self,
                all_reduce_alg="nccl",
@@ -594,37 +655,22 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
       num_packs: see above.
       agg_small_grads_max_bytes: see above.
       agg_small_grads_max_group: see above.
-        tensors.
     """
     self._all_reduce_alg = all_reduce_alg
     self._num_packs = num_packs
     self._agg_small_grads_max_bytes = agg_small_grads_max_bytes
     self._agg_small_grads_max_group = agg_small_grads_max_group
+    self._simple_cross_replica_ops = ReductionToOneDevice()
     super(AllReduceCrossDeviceOps, self).__init__()
 
-  def _reduce(self, reduce_op, per_replica_value, destinations):
-    contains_indexed_slices = cross_device_utils.contains_indexed_slices(
-        per_replica_value)
-    if (_devices_match(per_replica_value, destinations)
-        and not context.executing_eagerly()
-        and not contains_indexed_slices):
+  def reduce_implementation(self, reduce_op, per_replica_value, destinations):
+    if _devices_match(per_replica_value, destinations):
       return self._batch_all_reduce(reduce_op, [per_replica_value])[0]
     else:
-      if contains_indexed_slices:
-        logging.log_first_n(
-            logging.WARN,
-            "Efficient allreduce is not supported for IndexedSlices.", 10)
+      return self._simple_cross_replica_ops.reduce(reduce_op, per_replica_value,
+                                                   destinations)
 
-      if check_destinations(destinations):
-        devices = get_devices_from(destinations)
-      else:
-        devices = get_devices_from(per_replica_value)
-      reduce_to_device = devices[0]
-      reduced = _simple_reduce(per_replica_value, reduce_to_device,
-                               math_ops.add_n, reduce_op)
-      return self.broadcast(reduced, destinations)
-
-  def _batch_reduce(self, reduce_op, value_destination_pairs):
+  def batch_reduce_implementation(self, reduce_op, value_destination_pairs):
     all_devices_match = _all_devices_match(value_destination_pairs)
     contains_indexed_slices = cross_device_utils.contains_indexed_slices(
         value_destination_pairs)
@@ -640,20 +686,37 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
                             10)
 
       return [
-          self._reduce(reduce_op, t, destinations=v)
+          self.reduce_implementation(reduce_op, t, destinations=v)
           for t, v in value_destination_pairs
       ]
 
   def _batch_all_reduce(self, reduce_op, per_replica_values):
-    """All reduce algorithm in a batch."""
+    """All-reduce algorithm in a batch."""
+    dense_values, dense_indices, sparse_values, sparse_indices = (
+        cross_device_utils.split_by_sparsity(per_replica_values))
+    if dense_values:
+      dense_results = self._do_batch_all_reduce(reduce_op, dense_values)
+    else:
+      dense_results = []
+    if sparse_values:
+      sparse_results = self._do_batch_all_reduce_sparse(reduce_op,
+                                                        sparse_values)
+    else:
+      sparse_results = []
+    return cross_device_utils.stitch_values(((dense_results, dense_indices),
+                                             (sparse_results, sparse_indices)))
+
+  def _do_batch_all_reduce(self, reduce_op, dense_values):
+    """Run batch all-reduces."""
     logging.log_first_n(
         logging.INFO, "batch_all_reduce invoked for batches size = %d with "
         "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and "
         "agg_small_grads_max_group = %d" %
-        (len(per_replica_values), self._all_reduce_alg, self._num_packs,
+        (len(dense_values), self._all_reduce_alg, self._num_packs,
          self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)
-    destinations = per_replica_values[0].devices
-    grouped = _group_value_by_device(per_replica_values)
+
+    destinations = dense_values[0].devices
+    grouped = _group_value_by_device(dense_values)
 
     device_grad_packs, tensor_packer = _pack_tensors(
         grouped, self._num_packs, self._agg_small_grads_max_bytes,
@@ -674,7 +737,18 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
               destinations, device_grad_packs))
 
     reduced = _unpack_tensors(reduced, tensor_packer)
-    return _ungroup_and_make_mirrored(reduced, per_replica_values[0], reduce_op)
+    return _ungroup_and_make_mirrored(reduced, dense_values[0], reduce_op)
+
+  def _do_batch_all_reduce_sparse(self, reduce_op, sparse_values):
+    """Run batch all-reduce for sparse values."""
+    logging.log_first_n(
+        logging.WARN,
+        "Efficient allreduce is not supported for %d IndexedSlices" %
+        len(sparse_values), 10)
+    # Use `sparse_values` as destinations to do all-reduces. It is effectively
+    # an allgather under the hood but not an efficient one.
+    return self._simple_cross_replica_ops.batch_reduce(
+        reduce_op, zip(sparse_values, sparse_values))
 
 
 # For compatibility with code using the old name of `AllReduceCrossDeviceOps`.
@@ -685,6 +759,49 @@ AllReduceSpecTuple = collections.namedtuple("AllReduceSpecTuple",
                                             "alg shards limit")
 
 
+@tf_export("distribute.NcclAllReduce")
+class NcclAllReduce(AllReduceCrossDeviceOps):
+  """Reduction using NCCL all-reduce."""
+
+  def __init__(self, num_packs=1):
+    """NCCL all-reduce implementation of CrossDeviceOps.
+
+    Before performing all-reduce, tensors will be repacked or aggregated for
+    more efficient cross-device transportation.
+
+    Args:
+      num_packs: values will be packed in this many splits.  `num_packs` should
+        be greater than 0.
+    """
+    assert num_packs > 0, (
+        "NCLL all-reduce requires num_packs > 0, but {} is specified".format(
+            num_packs))
+    super(NcclAllReduce, self).__init__(
+        all_reduce_alg="nccl", num_packs=num_packs)
+
+
+@tf_export("distribute.HierarchicalCopyAllReduce")
+class HierarchicalCopyAllReduce(AllReduceCrossDeviceOps):
+  """Reduction using hierarchical copy all-reduce.
+
+  This is a good reduction for configurations like Nvidia DGX-1.
+  """
+
+  def __init__(self, num_packs=1):
+    """Hierarchical copy all-reduce implementation of CrossDeviceOps.
+
+    Before performing all-reduce, tensors will be repacked or aggregated for
+    more efficient cross-device transportation.
+
+    Args:
+      num_packs: values will be packed in this many splits.  `num_packs` should
+        be greater than 0.
+    """
+    super(HierarchicalCopyAllReduce, self).__init__(
+        all_reduce_alg="hierarchical_copy",
+        num_packs=num_packs)
+
+
 class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
   """All-reduce algorithms for distributed TensorFlow."""
 
@@ -758,7 +875,7 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
       ]
 
   def _batch_all_reduce(self, reduce_op, per_replica_values):
-    """All reduce algorithm in a batch."""
+    """All-reduce algorithm in a batch."""
     logging.log_first_n(
         logging.INFO,
         "distributed batch_all_reduce invoked for batches size = %d with "
@@ -769,7 +886,7 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
 
     device_grads = _group_value_by_device(per_replica_values)
 
-    # The all reduce library requires fully defined shapes.
+    # The all-reduce library requires fully defined shapes.
     # TODO(yuefengz): when tensor sharding is not needed, static shapes are not
     # required as well.
     for device_grad in device_grads:
@@ -838,7 +955,7 @@ class CollectiveAllReduce(CrossDeviceOps):
     super(CollectiveAllReduce, self).__init__()
 
   # TODO(yuefengz, tucker): is indexed slices supported by collective ops?
-  def _reduce(self, reduce_op, per_replica_value, destinations):
+  def reduce_implementation(self, reduce_op, per_replica_value, destinations):
     if cross_device_utils.contains_indexed_slices(per_replica_value):
       raise ValueError(
           "`IndexSlices` is not supported for Collective All-Reduce.")
@@ -861,7 +978,7 @@ class CollectiveAllReduce(CrossDeviceOps):
 
     return value_lib.Mirrored(device_map, index, logical_device)
 
-  def _batch_reduce(self, reduce_op, value_destination_pairs):
+  def batch_reduce_implementation(self, reduce_op, value_destination_pairs):
     if cross_device_utils.contains_indexed_slices(value_destination_pairs):
       raise ValueError(
           "`IndexSlices` is not supported for Collective All-Reduce.")
@@ -877,7 +994,7 @@ class CollectiveAllReduce(CrossDeviceOps):
             "destinations are different.", 10)
 
       return [
-          self._reduce(reduce_op, t, destinations=v)
+          self.reduce_implementation(reduce_op, t, destinations=v)
           for t, v in value_destination_pairs
       ]
 
@@ -930,7 +1047,7 @@ _dgx1_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
 def _has_dgx1_like_links(gpu_links):
   if not gpu_links:
     return False
-  # TODO(yuefengz): figure out the right topology for hierarchial copy if
+  # TODO(yuefengz): figure out the right topology for hierarchical copy if
   # number of gpus are less than 8.
   if len(gpu_links) < 8:
     return False
@@ -943,10 +1060,9 @@ def _has_dgx1_like_links(gpu_links):
 
 def _choose_all_reduce_algorithm(device_links):
   if _has_dgx1_like_links(device_links):
-    return AllReduceCrossDeviceOps(
-        "hierarchical_copy", num_packs=len(device_links))
+    return HierarchicalCopyAllReduce(num_packs=len(device_links))
   else:
-    return AllReduceCrossDeviceOps("nccl", num_packs=1)
+    return NcclAllReduce(num_packs=1)
 
 
 def choose_the_best(devices, session_config=None):
@@ -973,12 +1089,12 @@ def choose_the_best(devices, session_config=None):
   if len(using_devices) != len(requested_devices):
     logging.warning("Not all devices in `tf.distribute.Strategy` are visible "
                     "to TensorFlow.")
-    return ReductionToOneDeviceCrossDeviceOps()
+    return ReductionToOneDevice()
 
   if any(d.device_type.lower() != "gpu" for d in using_devices):
     logging.warning("Not all devices in `tf.distribute.Strategy` are visible "
                     "to TensorFlow.")
-    return ReductionToOneDeviceCrossDeviceOps()
+    return ReductionToOneDevice()
 
   device_links = [[] for _ in range(len(using_devices))]
   for i, device in enumerate(using_devices):
diff --git a/tensorflow/python/distribute/cross_device_utils.py b/tensorflow/python/distribute/cross_device_utils.py
index cb5417ea15bc40944f7e378233f7b147fac43bb0..a28628dae4af94038b084b2d5261b6be51910138 100644
--- a/tensorflow/python/distribute/cross_device_utils.py
+++ b/tensorflow/python/distribute/cross_device_utils.py
@@ -681,3 +681,58 @@ def contains_indexed_slices(value):
     return contains_indexed_slices(value.values)
   else:
     return False
+
+
+def is_indexed_slices(value):
+  if isinstance(value, ops.IndexedSlices):
+    return True
+  assert isinstance(value, value_lib.DistributedValues)
+  return all([isinstance(v, ops.IndexedSlices) for v in value.values])
+
+
+def split_by_sparsity(values):
+  """Split values into dense and sparse values.
+
+  Args:
+    values: a list of tensors or `PerReplica`s.
+
+  Returns:
+    Four lists:
+      a list of dense values, a list of their indices in `values` and
+      a list of sparse values, a list of their indices in `values`.
+  """
+  dense_values = []
+  dense_indices = []
+  sparse_values = []
+  sparse_indices = []
+  for i, v in enumerate(values):
+    if is_indexed_slices(v):
+      sparse_values.append(v)
+      sparse_indices.append(i)
+    else:
+      dense_values.append(v)
+      dense_indices.append(i)
+  return dense_values, dense_indices, sparse_values, sparse_indices
+
+
+def stitch_values(values_and_indices_list):
+  """Stitch values together according to their indices.
+
+  Args:
+    values_and_indices_list: a list of tuples of values and indices indicating
+      the values and postions in the returned list.
+
+  Returns:
+    a stitched list of values.
+  """
+  length = 0
+  for values_and_indices in values_and_indices_list:
+    length += len(values_and_indices[0])
+
+  result = [None] * length
+  for values_and_indices in values_and_indices_list:
+    if values_and_indices and values_and_indices[0]:
+      for v, i in zip(*values_and_indices):
+        assert result[i] is None
+        result[i] = v
+  return result
diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py
index 2cc99b3db4619d86d18507d1beb1adb6d42c528b..3e4836448737ffa87eff4d4c9e5c02b26d17b70c 100644
--- a/tensorflow/python/distribute/distribute_lib.py
+++ b/tensorflow/python/distribute/distribute_lib.py
@@ -37,7 +37,6 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
@@ -79,14 +78,14 @@ class UpdateContext(object):
 # Public utility functions.
 
 
-@tf_export("distribute.get_loss_reduction")
+@tf_export(v1=["distribute.get_loss_reduction"])
 def get_loss_reduction():
-  """`tf.distribute.ReduceOp` corresponding to the last loss reduction."""
-  loss_reduction = ops.get_default_graph()._last_loss_reduction  # pylint: disable=protected-access
-  if (loss_reduction == losses_impl.Reduction.SUM or
-      loss_reduction == losses_impl.ReductionV2.SUM):
-    return reduce_util.ReduceOp.SUM
-  return reduce_util.ReduceOp.MEAN
+  """DEPRECATED: Now always returns `tf.distribute.ReduceOp.SUM`.
+
+  We now always make the complete adjustment when computing the loss, so
+  code should always add gradients/losses across replicas, never average.
+  """
+  return reduce_util.ReduceOp.SUM
 
 
 # ------------------------------------------------------------------------------
diff --git a/tensorflow/python/distribute/distribute_lib_test.py b/tensorflow/python/distribute/distribute_lib_test.py
index c147849e5de62659b91ccdbf38c35611aad3fd91..6876af377515ff2c626d12cb5ad2cdf2a3014ddd 100644
--- a/tensorflow/python/distribute/distribute_lib_test.py
+++ b/tensorflow/python/distribute/distribute_lib_test.py
@@ -19,7 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.distribute import distribute_lib
-from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import variable_scope
@@ -60,13 +60,12 @@ class _TestExtended(distribute_lib.DistributionStrategyExtended):
 
 
 def _assert_in_default_state(t):
-  t.assertIs(distribution_strategy_context._get_default_replica_context(),
-             distribution_strategy_context.get_replica_context())
-  t.assertIs(None, distribution_strategy_context.get_cross_replica_context())
-  t.assertFalse(distribution_strategy_context.in_cross_replica_context())
-  t.assertIs(distribution_strategy_context._get_default_strategy(),
-             distribution_strategy_context.get_strategy())
-  t.assertFalse(distribution_strategy_context.has_strategy())
+  t.assertIs(ds_context._get_default_replica_context(),
+             ds_context.get_replica_context())
+  t.assertIs(None, ds_context.get_cross_replica_context())
+  t.assertFalse(ds_context.in_cross_replica_context())
+  t.assertIs(ds_context._get_default_strategy(), ds_context.get_strategy())
+  t.assertFalse(ds_context.has_strategy())
 
 
 class TestStrategyTest(test.TestCase):
@@ -76,14 +75,12 @@ class TestStrategyTest(test.TestCase):
     dist = _TestStrategy()
 
     def run_fn():
-      replica_context = distribution_strategy_context.get_replica_context()
+      replica_context = ds_context.get_replica_context()
       self.assertTrue(replica_context is not None)
-      self.assertIs(None,
-                    distribution_strategy_context.get_cross_replica_context())
-      self.assertFalse(distribution_strategy_context.in_cross_replica_context())
-      self.assertTrue(distribution_strategy_context.has_strategy())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_strategy())
+      self.assertIs(None, ds_context.get_cross_replica_context())
+      self.assertFalse(ds_context.in_cross_replica_context())
+      self.assertTrue(ds_context.has_strategy())
+      self.assertIs(dist, ds_context.get_strategy())
       self.assertEqual("foo", replica_context.merge_call(None, test_arg="foo"))
       expected_value = _get_test_variable(
           "bar", variable_scope.VariableSynchronization.AUTO,
@@ -101,13 +98,11 @@ class TestStrategyTest(test.TestCase):
     _assert_in_default_state(self)
     dist = _TestStrategy()
     with dist.scope():
-      self.assertIs(None, distribution_strategy_context.get_replica_context())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_cross_replica_context())
-      self.assertTrue(distribution_strategy_context.in_cross_replica_context())
-      self.assertTrue(distribution_strategy_context.has_strategy())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_strategy())
+      self.assertIs(None, ds_context.get_replica_context())
+      self.assertIs(dist, ds_context.get_cross_replica_context())
+      self.assertTrue(ds_context.in_cross_replica_context())
+      self.assertTrue(ds_context.has_strategy())
+      self.assertIs(dist, ds_context.get_strategy())
       expected_value = _get_test_variable(
           "baz", variable_scope.VariableSynchronization.AUTO,
           variable_scope.VariableAggregation.NONE)
@@ -138,22 +133,16 @@ class DefaultDistributionStrategyTest(test.TestCase):
     _assert_in_default_state(self)
 
     def merge_fn(dist, s):
-      self.assertIs(
-          distribution_strategy_context._get_default_strategy(),
-          dist)
-      self.assertIs(None, distribution_strategy_context.get_replica_context())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_cross_replica_context())
-      self.assertTrue(distribution_strategy_context.in_cross_replica_context())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_strategy())
-      self.assertFalse(
-          distribution_strategy_context.has_strategy())
+      self.assertIs(ds_context._get_default_strategy(), dist)
+      self.assertIs(None, ds_context.get_replica_context())
+      self.assertIs(dist, ds_context.get_cross_replica_context())
+      self.assertTrue(ds_context.in_cross_replica_context())
+      self.assertIs(dist, ds_context.get_strategy())
+      self.assertFalse(ds_context.has_strategy())
       return "foo_" + s
 
-    replica_ctx = distribution_strategy_context.get_replica_context()
-    self.assertIs(distribution_strategy_context._get_default_replica_context(),
-                  replica_ctx)
+    replica_ctx = ds_context.get_replica_context()
+    self.assertIs(ds_context._get_default_replica_context(), replica_ctx)
     self.assertEqual("foo_bar", replica_ctx.merge_call(merge_fn, args=("bar",)))
     _assert_in_default_state(self)
 
diff --git a/tensorflow/python/distribute/estimator_training.py b/tensorflow/python/distribute/estimator_training.py
index 7d5f231c37da41f10f945adc468f40ffd0ecc743..0ec6703b8692fe313f12b6e9952e19f43e1e7adb 100644
--- a/tensorflow/python/distribute/estimator_training.py
+++ b/tensorflow/python/distribute/estimator_training.py
@@ -24,6 +24,7 @@ import six
 
 from tensorflow.python.distribute import distribute_coordinator as dc
 from tensorflow.python.distribute import distribute_coordinator_context as dc_context
+from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 
@@ -296,10 +297,11 @@ def estimator_train(estimator, train_distributed_fn, hooks):
   assert estimator._config._distribute_coordinator_mode
   run_config = estimator._config
   assert estimator._config.cluster_spec
-  cluster_spec = estimator._config.cluster_spec
+  cluster_spec = multi_worker_util.normalize_cluster_spec(
+      estimator._config.cluster_spec)
   assert estimator._config._train_distribute
 
-  if 'evaluator' in cluster_spec:
+  if 'evaluator' in cluster_spec.jobs:
     raise ValueError("'evaluator' job is not supported if you don't use "
                      '`train_and_evaluate`')
 
@@ -344,10 +346,11 @@ def estimator_evaluate(estimator, evaluate_distributed_fn, hooks):
   assert estimator._config._distribute_coordinator_mode
   run_config = estimator._config
   assert estimator._config.cluster_spec
-  cluster_spec = estimator._config.cluster_spec
+  cluster_spec = multi_worker_util.normalize_cluster_spec(
+      estimator._config.cluster_spec)
   assert estimator._config._eval_distribute
 
-  if 'evaluator' in cluster_spec:
+  if 'evaluator' in cluster_spec.jobs:
     raise ValueError("'evaluator' job is not supported if you don't use "
                      '`train_and_evaluate`')
 
diff --git a/tensorflow/python/distribute/experimental/BUILD b/tensorflow/python/distribute/experimental/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..18893290f3116d5ff7ba5dbf58e8e507fc7b854a
--- /dev/null
+++ b/tensorflow/python/distribute/experimental/BUILD
@@ -0,0 +1,19 @@
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "experimental",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/distribute:collective_all_reduce_strategy",
+        "//tensorflow/python/distribute:parameter_server_strategy",
+    ],
+)
diff --git a/tensorflow/python/distribute/experimental/__init__.py b/tensorflow/python/distribute/experimental/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f74cac8b5ffce5f3982ce1e39a5108a4d841c8cf
--- /dev/null
+++ b/tensorflow/python/distribute/experimental/__init__.py
@@ -0,0 +1,24 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental Distribution Strategy library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.python.distribute import collective_all_reduce_strategy
+from tensorflow.python.distribute import parameter_server_strategy
+# pylint: enable=unused-import
diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py
index 6b13db39c726fc5f786ffe2f025b86822996cbaa..14b153b21933ee886af3858ca59ab53aa86b34b5 100644
--- a/tensorflow/python/distribute/input_lib.py
+++ b/tensorflow/python/distribute/input_lib.py
@@ -246,7 +246,7 @@ class DatasetIterator(InputIteratorImpl):
     """
     assert isinstance(input_workers, InputWorkers)
     if split_batch_by:
-      dataset = _split_dataset_batch(dataset, split_batch_by)
+      dataset = batching._RebatchDataset(dataset, split_batch_by)  # pylint: disable=protected-access
 
     iterators = []
     for i, worker in enumerate(input_workers.worker_devices):
@@ -255,6 +255,7 @@ class DatasetIterator(InputIteratorImpl):
         cloned_dataset = dataset
         if not context.executing_eagerly():
           cloned_dataset = input_ops._clone_dataset(dataset)  # pylint: disable=protected-access
+          cloned_dataset = cloned_dataset.with_options(dataset.options())
         iterator = _SingleWorkerDatasetIterator(cloned_dataset, worker,
                                                 worker_devices)
         iterators.append(iterator)
@@ -305,7 +306,7 @@ class _SingleWorkerDatasetIterator(object):
       A list of any initializer ops that should be run.
     """
     if context.executing_eagerly():
-      self._make_iterator()
+      self._iterator._eager_reset()  # pylint: disable=protected-access
       return []
     else:
       return [self._iterator.initializer]
@@ -352,7 +353,8 @@ def _get_batched_dataset(d):
 
   if isinstance(d, (dataset_ops.BatchDataset, batching._MapAndBatchDataset)):
     return d
-  elif isinstance(d, dataset_ops.PrefetchDataset):
+  elif isinstance(d, (dataset_ops.PrefetchDataset,
+                      dataset_ops._OptionsDataset)):
     return _get_batched_dataset(d._input_dataset)
 
   raise ValueError(
@@ -405,24 +407,6 @@ def _get_dataset_attributes(dataset):
   return batch_size, drop_remainder, prefetch_buffer
 
 
-def _split_dataset_batch(dataset, split_batch_by):
-  """Divide a batch-ed dataset's batches into smaller batches."""
-  batch_size, drop_remainder, prefetch_buffer = (
-      _get_dataset_attributes(dataset))
-
-  if batch_size % split_batch_by:
-    raise ValueError(
-        "Batch size %s cannot be sharded evenly across replicas %s" % (
-            batch_size, split_batch_by))
-  new_batch_size = batch_size // split_batch_by
-
-  dataset = dataset.apply(batching.unbatch())
-  dataset = dataset.batch(new_batch_size, drop_remainder=drop_remainder)
-  if prefetch_buffer is not None:
-    dataset = dataset.prefetch(prefetch_buffer)
-  return dataset
-
-
 class MultiStepContext(object):
   """A context object that can be used to capture things when running steps.
 
diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py
index df96f83a58a5ce5a34fb3d7ecffd5839ca79bda9..96c7191652a2d93da400a5b14a01f1ea26e9079d 100644
--- a/tensorflow/python/distribute/mirrored_strategy.py
+++ b/tensorflow/python/distribute/mirrored_strategy.py
@@ -454,7 +454,7 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     assert devices, "Must specify at least one device."
     devices = tuple(device_util.resolve(d) for d in devices)
     assert len(set(devices)) == len(devices), (
-        "No duplicates allowed in `devices` argument: %s" % devices)
+        "No duplicates allowed in `devices` argument: %s" % (devices,))
     # TODO(josh11b): Require at least 2 devices?
     self._device_map = values.ReplicaDeviceMap(devices)
     self._input_workers = input_lib.InputWorkers(self._device_map)
diff --git a/tensorflow/python/distribute/one_device_strategy.py b/tensorflow/python/distribute/one_device_strategy.py
index b2d2f0361854417c484ec33b521b21c1dc6cebf6..9a255b2fe10e4cf56f8338e7f83e9258de2b12de 100644
--- a/tensorflow/python/distribute/one_device_strategy.py
+++ b/tensorflow/python/distribute/one_device_strategy.py
@@ -29,11 +29,13 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(josh11b): Replace asserts in this file with if ...: raise ...
 
 
+@tf_export("distribute.OneDeviceStrategy")
 class OneDeviceStrategy(distribute_lib.DistributionStrategy):
   """A distribution strategy for running on a single device."""
   # TODO(josh11b): Do we wrap values in types to generate errors if you are
@@ -50,7 +52,6 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
   def __init__(self, container_strategy, device):
     super(OneDeviceExtended, self).__init__(container_strategy)
     self._device = device
-    self._default_device = device
     self._input_device = device_util.canonicalize("/device:CPU:0")
     worker_device_pairs = [(self._input_device, [self._device])]
     device_map = values.SingleDeviceMap(device)
@@ -62,8 +63,12 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
     if colocate_with is None:
       with ops.device(self._device):
         return next_creator(*args, **kwargs)
-    with ops.colocate_with(colocate_with):
-      return next_creator(*args, **kwargs)
+    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
+      with ops.device(colocate_with.device):
+        return next_creator(*args, **kwargs)
+    else:
+      with ops.colocate_with(colocate_with):
+        return next_creator(*args, **kwargs)
 
   def _validate_colocate_with_variable(self, colocate_with_variable):
     values.validate_colocate(colocate_with_variable, self)
@@ -83,7 +88,7 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
 
   def _experimental_make_numpy_dataset(self, numpy_input, session):
     return numpy_dataset.one_host_numpy_dataset(
-        numpy_input, self._input_device, session)
+        numpy_input, numpy_dataset.SingleDevice(self._input_device), session)
 
   def _broadcast_to(self, tensor, destinations):
     del destinations
diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py
index 269c7417d16cc903d32a65f28b90fbaa2773209e..535327b9fe27b71688fa2cdcea1e2895cfa87016 100644
--- a/tensorflow/python/distribute/parameter_server_strategy.py
+++ b/tensorflow/python/distribute/parameter_server_strategy.py
@@ -20,11 +20,12 @@ from __future__ import print_function
 
 import copy
 
-from tensorflow.contrib.distribute.python import mirrored_strategy
+
 from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import mirrored_strategy
 from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import values
@@ -39,12 +40,14 @@ from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import device_setter
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 _LOCAL_CPU = "/device:CPU:0"
 _LOCAL_GPU_0 = "/device:GPU:0"
 
 
 # TODO(yuefengz): maybe cache variables on local CPU.
+@tf_export("distribute.experimental.ParameterServerStrategy")
 class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   """A parameter server DistributionStrategy.
 
@@ -101,8 +104,7 @@ class ParameterServerStrategyExtended(
 
     # We typically don't need to do all-reduce in this strategy.
     self._cross_device_ops = (
-        cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
-            reduce_to_device=_LOCAL_CPU))
+        cross_device_ops_lib.ReductionToOneDevice(reduce_to_device=_LOCAL_CPU))
 
   def _initialize_strategy(self, cluster_resolver):
     if cluster_resolver.cluster_spec().as_dict():
@@ -318,7 +320,8 @@ class ParameterServerStrategyExtended(
           if kwargs.get("trainable", True):
             collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
             l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
-            l.remove(v)
+            if v in l:
+              l.remove(v)
           g.add_to_collections(collections, wrapped)
         elif ops.GraphKeys.GLOBAL_STEP in collections:
           ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, wrapped)
diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py
index 585ae1bd6c627fc270a9617c37f03fd97d9040e5..40a9f9b012c8f9959748d98bb3feccc3dcf55a6d 100644
--- a/tensorflow/python/distribute/values.py
+++ b/tensorflow/python/distribute/values.py
@@ -37,7 +37,7 @@ from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 
 
@@ -630,7 +630,7 @@ class _MirroredSaveable(saver.BaseSaverBuilder.ResourceVariableSaveable):
 
 
 class MirroredVariable(DistributedVariable, Mirrored,
-                       checkpointable.Checkpointable):
+                       trackable.Trackable):
   """Holds a map from device to variables whose values are kept in sync."""
 
   def __init__(
@@ -710,7 +710,7 @@ class MirroredVariable(DistributedVariable, Mirrored,
     return self.get()._as_graph_element()
 
   def _gather_saveables_for_checkpoint(self):
-    """Overrides CheckpointableBase method.
+    """Overrides Trackable method.
 
     This allows both name-based and object-based save and restore of
     MirroredVariables.
@@ -720,7 +720,7 @@ class MirroredVariable(DistributedVariable, Mirrored,
     """
     def _saveable_factory(name=self._common_name):
       return _MirroredSaveable(self, self.primary, name)
-    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+    return {trackable.VARIABLE_VALUE_KEY: _saveable_factory}
 
 
 # Register a conversion function which reads the value of the variable,
@@ -752,7 +752,7 @@ def _enclosing_tpu_context():
 # tpu.replicate() because it assumes that you're in a device context where you
 # can operate on a single version of the variable, but a tpu.replicate()
 # operates on all variables and is replicated during a rewrite pass.
-class TPUMirroredVariable(checkpointable.Checkpointable):
+class TPUMirroredVariable(trackable.Trackable):
   """Holds a map from device to TPU variables whose values are kept in sync."""
 
   def __init__(
@@ -983,7 +983,8 @@ class TPUMirroredVariable(checkpointable.Checkpointable):
     return self._read_variable_op()
 
   def assign_sub(self, *args, **kwargs):
-    def assign_sub_fn(var, delta, **kw):
+    def assign_sub_fn(var, delta, *ar, **kw):
+      del ar
       name = kw.pop("name", None)
       read_value = kw.pop("read_value", True)
       with self._handle_graph(var.handle):
@@ -997,7 +998,8 @@ class TPUMirroredVariable(checkpointable.Checkpointable):
     return self._assign_func(f=assign_sub_fn, *args, **kwargs)
 
   def assign_add(self, *args, **kwargs):
-    def assign_add_fn(var, delta, **kw):
+    def assign_add_fn(var, delta, *ar, **kw):
+      del ar
       name = kw.pop("name", None)
       read_value = kw.pop("read_value", True)
       with self._handle_graph(var.handle):
@@ -1011,7 +1013,8 @@ class TPUMirroredVariable(checkpointable.Checkpointable):
     return self._assign_func(f=assign_add_fn, *args, **kwargs)
 
   def assign(self, *args, **kwargs):
-    def assign_fn(var, value, **kw):
+    def assign_fn(var, value, *ar, **kw):
+      del ar
       name = kw.pop("name", None)
       read_value = kw.pop("read_value", True)
       with self._handle_graph(var.handle):
@@ -1085,7 +1088,7 @@ class TPUMirroredVariable(checkpointable.Checkpointable):
     return self._read_variable_op()
 
   def _gather_saveables_for_checkpoint(self):
-    """Overrides CheckpointableBase method.
+    """Overrides Trackable method.
 
     This allows both name-based and object-based save and restore of
     MirroredVariables.
@@ -1095,7 +1098,7 @@ class TPUMirroredVariable(checkpointable.Checkpointable):
     """
     def _saveable_factory(name=self._common_name):
       return _MirroredSaveable(self, self.primary, name)
-    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+    return {trackable.VARIABLE_VALUE_KEY: _saveable_factory}
 
   def _should_act_as_resource_variable(self):
     """Pass resource_variable_ops.is_resource_variable check."""
@@ -1204,8 +1207,9 @@ def _assert_replica_context(strategy):
         "Replica-local variables may only be assigned in a replica context.")
 
 
+# TODO(josh11b): Rename this to SyncOnReadVariable.
 class ReplicaLocalVariable(DistributedVariable, PerReplica,
-                           checkpointable.Checkpointable):
+                           trackable.Trackable):
   """Holds a map from device to variables whose values are reduced on save."""
 
   def __init__(
@@ -1243,11 +1247,8 @@ class ReplicaLocalVariable(DistributedVariable, PerReplica,
   def _get_cross_replica(self):
     if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
       return self.primary
-    # TODO(josh11b): Use a strategy-specific method.
-    total = math_ops.add_n(self._values)
-    if self._aggregation == vs.VariableAggregation.MEAN:
-      return total * (1./ len(self._values))
-    return total
+    return self._distribute_strategy.reduce(
+        reduce_util.ReduceOp.from_variable_aggregation(self.aggregation), self)
 
   def _as_graph_element(self):
     # pylint: disable=protected-access
@@ -1256,7 +1257,7 @@ class ReplicaLocalVariable(DistributedVariable, PerReplica,
     return self.get()._as_graph_element()
 
   def _gather_saveables_for_checkpoint(self):
-    """Overrides CheckpointableBase method.
+    """Overrides Trackable method.
 
     This allows both name-based and object-based save and restore of
     ReplicaLocalVariables.
@@ -1266,7 +1267,7 @@ class ReplicaLocalVariable(DistributedVariable, PerReplica,
     """
     def _saveable_factory(name=self._common_name):
       return _ReplicaLocalSaveable(self, name)
-    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+    return {trackable.VARIABLE_VALUE_KEY: _saveable_factory}
 
 
 # Register a conversion function for ReplicaLocalVariable which allows as_ref to
@@ -1436,7 +1437,7 @@ def value_container(val):
 
 
 # TODO(josh11b): Descend from Variable.
-class AggregatingVariable(checkpointable.Checkpointable):
+class AggregatingVariable(trackable.Trackable):
   """A wrapper around a variable that aggregates updates across replicas."""
 
   def __init__(self, strategy, v, aggregation):
@@ -1514,7 +1515,7 @@ class AggregatingVariable(checkpointable.Checkpointable):
 
   # TODO(josh11b): Test saving & restoring.
   def _gather_saveables_for_checkpoint(self):
-    return {checkpointable.VARIABLE_VALUE_KEY: self._v}
+    return {trackable.VARIABLE_VALUE_KEY: self._v}
 
   # pylint: disable=multiple-statements
   def __add__(self, o): return self._v + o
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index af7930224b5d7eca3da8fbbaf469ffc70ed8baa5..45e2682aac2d0a1b7591536ff55c78a4aad56b12 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -58,6 +58,7 @@ py_library(
         ":function",
         ":graph_only_ops",
         ":profiler",
+        ":profiler_client",
         ":tape",
         ":test",
         ":wrap_function",
@@ -110,7 +111,17 @@ cuda_py_test(
         ":profiler",
         ":test",
         "//tensorflow/python:constant_op",
-        "//tensorflow/contrib/tpu/profiler:trace_events_proto_py",
+        "//tensorflow/core/profiler:protos_all_py",
+    ],
+)
+
+py_library(
+    name = "profiler_client",
+    srcs = ["profiler_client.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:pywrap_tensorflow",
     ],
 )
 
@@ -151,6 +162,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:training",
     ],
+    tags = ["no_rocm"],
 )
 
 cuda_py_test(
@@ -384,6 +396,7 @@ cuda_py_test(
         ":context",
         ":function",
         ":test",
+        ":profiler",
         "//third_party/py/numpy",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:pywrap_tensorflow",
@@ -498,7 +511,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:while_v2",  # TODO(b/118513001): Imported via control_flow_ops; remove.
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
@@ -527,7 +540,11 @@ tf_py_test(
 tf_xla_py_test(
     name = "def_function_xla_test",
     srcs = ["def_function_xla_test.py"],
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "no_rocm",
+        "nomac",
+    ],
     deps = [
         ":def_function",
         "//tensorflow/compiler/tests:xla_test",
@@ -549,7 +566,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:template",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 6117d8a4ea154fc09acc77f8dbd5daa5afea81e0..694b05c1eeaf969a08b37c0025b34eed6b47ee9d 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -80,6 +80,8 @@ def make_attr(attr_type, value):
     return tensor_shape.as_shape(value).as_proto()
   elif attr_type == [pywrap_tensorflow.TF_ATTR_SHAPE]:
     return [tensor_shape.as_shape(v).as_proto() for v in value]
+  elif isinstance(value, str):
+    return value.encode()
   return value
 
 
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 16b52c9baa6fda290bf92889066836a000970ebf..f44185df0b5a016d1e1f14aabfcf3704d91752b4 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.eager import backprop  # pylint: disable=unused-import
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
 from tensorflow.python.eager import function
+from tensorflow.python.eager import profiler
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -815,10 +816,26 @@ class MicroBenchmarks(test.Benchmark):
       model = make_keras_model(initializer="glorot_uniform")
       self._benchmark_keras_model_fit(model)
 
+  def benchmark_keras_model_functional_fit_graph_mode_with_profiler(self):
+    profiler.start()
+    with context.graph_mode():
+      model = make_keras_model(initializer="glorot_uniform")
+      self._benchmark_keras_model_fit(model)
+    result = profiler.stop()
+    assert result is not None
+
   def benchmark_keras_model_functional_fit_run_model_eagerly(self):
     model = make_keras_model(initializer="glorot_uniform")
     self._benchmark_keras_model_fit(model, run_eagerly=True)
 
+  def benchmark_keras_model_functional_fit_run_model_eagerly_with_profiler(
+      self):
+    profiler.start()
+    model = make_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_fit(model, run_eagerly=True)
+    result = profiler.stop()
+    assert result is not None
+
   def benchmark_keras_model_sequential_fit(self):
     model = make_sequential_keras_model(initializer="glorot_uniform")
     self._benchmark_keras_model_fit(model)
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index fd9be060ac15e638bf52fd686bd4ddf3d3339e88..4364b4925823d8f18f7abefc45ac66116c7ac464 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -44,6 +44,7 @@ default_execution_mode = EAGER_MODE if tf2.enabled() else GRAPH_MODE
 # Note that we do not protect this with a lock and instead rely on python's GIL
 # and the idempotent nature of writes to provide thread safety.
 _device_parsing_cache = {}
+_starting_device_spec = pydev.DeviceSpec.from_string("")
 
 _MAXINT32 = 2**31 - 1
 
@@ -135,26 +136,52 @@ class _EagerContext(threading.local):
 
   def __init__(self, config=None):
     super(_EagerContext, self).__init__()
-    self.device_spec = pydev.DeviceSpec.from_string("")
-    self.device_name = self.device_spec.to_string()
+    self.device_spec = _starting_device_spec
+    self.device_name = ""
     self.mode = default_execution_mode
     self.is_eager = default_execution_mode == EAGER_MODE
     self.scope_name = ""
-    self.recording_summaries = False
     self.summary_writer_resource = None
+    self.recording_summaries = None
     self.scalar_cache = {}
-    self.ones_rank_cache = _EagerTensorCache()
-    self.zeros_cache = _EagerTensorCache()
+    self._ones_rank_cache = None
+    self._zeros_cache = None
     self.execution_mode = None
 
     # Default rewriter config corresponds to turning all default grappler
     # optimizations on.
-    base_config = config_pb2.ConfigProto()
+    self._config = config
 
-    if config is not None:
-      base_config.MergeFrom(config)
+    self._function_call_options = None
 
-    self.function_call_options = FunctionCallOptions(config_proto=base_config)
+  @property
+  def function_call_options(self):
+    if self._function_call_options is None:
+      base_config = config_pb2.ConfigProto()
+      if self._config is not None:
+        base_config.MergeFrom(self._config)
+      self._config = None
+      self._function_call_options = FunctionCallOptions(
+          config_proto=base_config)
+
+    return self._function_call_options
+
+  @function_call_options.setter
+  def function_call_options(self, function_call_options):
+    self._function_call_options = function_call_options
+    self._config = None
+
+  @property
+  def ones_rank_cache(self):
+    if not self._ones_rank_cache:
+      self._ones_rank_cache = _EagerTensorCache()
+    return self._ones_rank_cache
+
+  @property
+  def zeros_cache(self):
+    if not self._zeros_cache:
+      self._zeros_cache = _EagerTensorCache()
+    return self._zeros_cache
 
 
 ContextSwitch = collections.namedtuple(
@@ -493,6 +520,16 @@ class Context(object):
     """Sets summary writer resource."""
     self._eager_context.summary_writer_resource = resource
 
+  @property
+  def recording_summaries(self):
+    """Returns summary recording condition."""
+    return self._eager_context.recording_summaries
+
+  @recording_summaries.setter
+  def recording_summaries(self, condition):
+    """Sets summary recording condition."""
+    self._eager_context.recording_summaries = condition
+
   @property
   def device_name(self):
     """Returns the device name for the current thread."""
@@ -688,14 +725,6 @@ class Context(object):
     """Get the list of post-execution callbacks added to the context."""
     return self._post_execution_callbacks
 
-  def enable_run_metadata(self):
-    """Enables tracing of op execution via RunMetadata.
-
-    To retrieve the accumulated metadata call context.export_run_metadata()
-    and to stop tracing call context.disable_run_metadata().
-    """
-    pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._handle)
-
   @tf_contextlib.contextmanager
   def device_policy(self, policy):
     handle = self._handle
@@ -708,12 +737,34 @@ class Context(object):
       pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
           handle, old)
 
+  def enable_run_metadata(self):
+    """Enables tracing of op execution via RunMetadata.
+
+    To retrieve the accumulated metadata call context.export_run_metadata()
+    and to stop tracing call context.disable_run_metadata().
+    """
+    pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._handle)
+
   def disable_run_metadata(self):
     """Disables tracing of op execution via RunMetadata."""
     if not self._context_handle:
       return
     pywrap_tensorflow.TFE_ContextDisableRunMetadata(self._context_handle)
 
+  def enable_graph_collection(self):
+    """Enables graph collection of executed functions.
+
+    To retrieve the accumulated graphs call context.export_run_metadata()
+    and to stop collecting graphs call context.disable_graph_collection().
+    """
+    pywrap_tensorflow.TFE_ContextEnableGraphCollection(self._handle)
+
+  def disable_graph_collection(self):
+    """Disables graph collections of executed functions."""
+    if not self._context_handle:
+      return
+    pywrap_tensorflow.TFE_ContextDisableGraphCollection(self._context_handle)
+
   def export_run_metadata(self):
     """Returns a RunMetadata proto with accumulated information.
 
@@ -870,6 +921,7 @@ def device(name):
   return context().device(name)
 
 
+@tf_export("config.experimental_list_devices")
 def list_devices():
   """List the names of the available devices.
 
@@ -941,6 +993,20 @@ def disable_run_metadata():
   context().disable_run_metadata()
 
 
+def enable_graph_collection():
+  """Enables tracing of op execution via RunMetadata.
+
+  To retrieve the accumulated metadata call context.export_run_metadata()
+  and to stop tracing call context.disable_run_metadata().
+  """
+  context().enable_graph_collection()
+
+
+def disable_graph_collection():
+  """Disables tracing of op execution via RunMetadata."""
+  context().disable_graph_collection()
+
+
 def export_run_metadata():
   """Returns a RunMetadata proto with accumulated information.
 
diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py
index 43cb07e79d74bcf2be4375ff00efe4b72449ac17..1d54973487ca4c6a0221e376954824d4eba2aacd 100644
--- a/tensorflow/python/eager/def_function.py
+++ b/tensorflow/python/eager/def_function.py
@@ -31,7 +31,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.tf_export import tf_export
@@ -96,7 +96,7 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
         shape and `validate_shape` is `True`.
       RuntimeError: If called outside of a function definition.
     """
-    if context.executing_eagerly():
+    if not ops.inside_function():
       # If we've been init_scope()d out of the function definition nothing to do
       # here; we can't really do the capturing or conditional logic.
       resource_variable_ops.ResourceVariable.__init__(
@@ -113,8 +113,8 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
     if constraint is not None and not callable(constraint):
       raise ValueError("The `constraint` argument must be a callable.")
 
-    if isinstance(initial_value, checkpointable.CheckpointInitialValue):
-      self._maybe_initialize_checkpointable()
+    if isinstance(initial_value, trackable.CheckpointInitialValue):
+      self._maybe_initialize_trackable()
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
@@ -156,8 +156,14 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
       if self._in_graph_mode:
         with ops.init_scope():
           outer_graph = ops.get_default_graph()
+        func_graph = ops.get_default_graph()
+        function_placeholders = (
+            func_graph.inputs + func_graph.internal_captures)
+        placeholder_ops = set(
+            [tensor.op for tensor in function_placeholders])
         lifted_initializer = lift_to_graph.lift_to_graph(
-            initial_value, outer_graph)[initial_value]
+            [initial_value], outer_graph,
+            disallowed_placeholders=placeholder_ops)[initial_value]
         with ops.init_scope():
           self._initial_value = lifted_initializer
           with ops.name_scope("IsInitialized"):
@@ -204,6 +210,29 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
     self._cached_shape_as_list = None
 
 
+RUN_FUNCTIONS_EAGERLY = False
+
+
+@tf_export("config.experimental_run_functions_eagerly")
+def run_functions_eagerly(run_eagerly):
+  """Enables / disables eager execution of `tf.function`s.
+
+  After calling `tf.config.experimental_run_functions_eagerly(True)` all
+  invocations of tf.function will run eagerly instead of running through a graph
+  function.
+
+  This can be useful for debugging or profiling.
+
+  Similarly, calling `tf.config.experimental_run_functions_eagerly(False)` will
+  revert the behavior of all functions to graph functions.
+
+  Args:
+    run_eagerly: Boolean. Whether to run functions eagerly.
+  """
+  global RUN_FUNCTIONS_EAGERLY
+  RUN_FUNCTIONS_EAGERLY = bool(run_eagerly)
+
+
 class FunctionDeleter(object):
 
   def __init__(self, func_graph):
@@ -349,7 +378,35 @@ class Function(object):
     self._stateless_fn = self._defun_with_scope(invalid_creator_scope)
     self._stateless_fn._name = self._name  # pylint: disable=protected-access
 
+  def _decorate(self, decorator):
+    """Allows the captured Python function to be decorated in place.
+
+    This method is only safe to call when the Function has not been called by a
+    user. It makes sense to use this method to push a decorator into the
+    function rather than wrapping the function in the decorator.
+
+    We use this in tf.Module to allow user annotated `tf.functions` to remain as
+    `Function` objects but still automatically enter the Module name_scope
+    when they are evaluated like all other methods.
+
+    Args:
+      decorator: A callable accepting a single argument which is the function
+        to decorate and returning a callable result.
+
+    Raises:
+      ValueError: If the function has been called a ValueError is raised.
+    """
+    if self._stateful_fn is not None or self._stateless_fn is not None:
+      raise ValueError(
+          "Functions cannot be decorated after they have been traced.")
+
+    self._python_function = decorator(self._python_function)
+    self._function_spec = function_lib.FunctionSpec.from_function_and_signature(
+        self._python_function, self._input_signature)
+
   def __call__(self, *args, **kwds):
+    if RUN_FUNCTIONS_EAGERLY:
+      return self._python_function(*args, **kwds)
     """Calls the graph function."""
     if self._created_variables:
       # In this case we have created variables on the first call, so we run the
@@ -465,7 +522,7 @@ class Function(object):
             # Ignore variables which are already initialized at trace time.
             continue
         v.assign(lift_to_graph.lift_to_graph(
-            init, ops.get_default_graph())[init])
+            [init], ops.get_default_graph())[init])
 
     with ops.init_scope():
       return initialize_variables.get_concrete_function()()
@@ -506,7 +563,7 @@ class Function(object):
     def initialize_variables():
       for v, init in initializer_map.items():
         v.assign(lift_to_graph.lift_to_graph(
-            init, ops.get_default_graph())[init])
+            [init], ops.get_default_graph())[init])
 
     return initialize_variables.get_concrete_function()
 
@@ -521,9 +578,11 @@ class Function(object):
     concrete_functions = []
     # pylint: disable=protected-access
     if self._stateful_fn:
-      concrete_functions.extend(self._stateful_fn._function_cache.values())
+      concrete_functions.extend(
+          self._stateful_fn._function_cache.all_values())
     if self._stateless_fn:
-      concrete_functions.extend(self._stateless_fn._function_cache.values())
+      concrete_functions.extend(
+          self._stateless_fn._function_cache.all_values())
     # pylint: enable=protected-access
     deduplicated_concrete_functions = list()
     seen_signatures = list()
@@ -623,7 +682,6 @@ class Function(object):
     Raises:
       ValueError: if this object has not yet been called on concrete values.
     """
-    assert context.executing_eagerly()
     if self._stateful_fn is None:
       initializer_map = {}
       self._initialize(args, kwargs, add_initializers_to=initializer_map)
@@ -727,6 +785,9 @@ def function(func=None,
       l.append(i)                           # Caution! Doesn't work.
   ```
 
+  Note that unlike other TensorFlow operations, we don't convert python
+  numerical inputs to tensors.
+
   _Referencing `tf.Variable`s_
 
   The Python function `func` may reference stateful objects (such as
@@ -827,8 +888,8 @@ def function(func=None,
   def f(x): return tf.add(x, 1.)
   ```
 
-  When an `input_signature` is specified, the callable will only accept `Tensor`
-  (or NumPy `ndarray`) objects as arguments.
+  When an `input_signature` is specified, the callable will convert the inputs
+  to the specified TensorSpecs.
 
   _Tracing and staging_
 
diff --git a/tensorflow/python/eager/def_function_test.py b/tensorflow/python/eager/def_function_test.py
index 76e47fce85f284e9e6dbebf2981e5a2c70fe18fd..fdf054a9990d9c12109088a509c6d762dea310ee 100644
--- a/tensorflow/python/eager/def_function_test.py
+++ b/tensorflow/python/eager/def_function_test.py
@@ -21,16 +21,21 @@ import functools
 import weakref
 
 from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -209,7 +214,8 @@ class DefFunctionTest(test.TestCase):
           state.append(variables.Variable(2.0 * x))
         return state[0] * x
 
-      with self.assertRaises(lift_to_graph.UnliftableError):
+      with self.assertRaisesRegexp(
+          lift_to_graph.UnliftableError, r'transitively.* mul .* x'):
         fn(constant_op.constant(3.0))
 
   def testMethod(self):
@@ -266,7 +272,40 @@ class DefFunctionTest(test.TestCase):
     self.assertAllClose(4., concrete(constant_op.constant(2.)))
     signature_args, _ = concrete.structured_input_signature
     self.assertEqual(signature_args,
-                     (tensor_spec.TensorSpec(None, dtypes.float32),))
+                     (tensor_spec.TensorSpec(
+                         None, dtypes.float32, name='x'),))
+
+  def test_error_inner_capture(self):
+
+    @def_function.function
+    def f(inputs):
+      num_steps, _ = inputs.shape[:2]
+      outputs = []
+      for t in math_ops.range(num_steps):
+        outputs.append(inputs[t])
+      return outputs
+
+    with self.assertRaisesRegexp(ValueError, 'inner'):
+      f(array_ops.zeros(shape=(8, 42, 3)))
+
+  def testRuntimeErrorNotSticky(self):
+
+    @def_function.function
+    def fail(i):
+      control_flow_ops.Assert(math_ops.equal(i, 0), ['ick'])
+
+    fail(constant_op.constant(0))  # OK
+    with self.assertRaises(errors.InvalidArgumentError):
+      fail(constant_op.constant(1))  # InvalidArgument: "ick"
+    fail(constant_op.constant(0))  # OK
+
+  def testUnderscoreName(self):
+
+    @def_function.function
+    def f(_):
+      return _ + _
+
+    self.assertAllEqual(2.0, f(constant_op.constant(1.0)))
 
   def test_serialization_signature_cache(self):
 
@@ -286,10 +325,10 @@ class DefFunctionTest(test.TestCase):
 
     self.assertEqual(
         signatures_args,
-        set(((tensor_spec.TensorSpec([1, 2], dtypes.float32),
-              tensor_spec.TensorSpec([1], dtypes.float32)),
-             (tensor_spec.TensorSpec([1, 3], dtypes.int32),
-              tensor_spec.TensorSpec([1], dtypes.int32)))))
+        set(((tensor_spec.TensorSpec([1, 2], dtypes.float32, name='x'),
+              tensor_spec.TensorSpec([1], dtypes.float32, name='y')),
+             (tensor_spec.TensorSpec([1, 3], dtypes.int32, name='x'),
+              tensor_spec.TensorSpec([1], dtypes.int32, name='y')))))
 
   @test_util.assert_no_garbage_created
   def testFunctionReferenceCycles(self):
@@ -371,6 +410,18 @@ class DefFunctionTest(test.TestCase):
 
     self.assertAllEqual(add(v, v), 2.0)
 
+  def testShapeCache(self):
+    @def_function.function
+    def func(x):
+      return 2 * x
+
+    func_a = func.get_concrete_function(
+        tensor_spec.TensorSpec([None], dtypes.int32))
+    func_b = func.get_concrete_function(
+        tensor_spec.TensorSpec([None], dtypes.int32))
+
+    self.assertIs(func_a, func_b)
+
   def testInitializationInNestedCall(self):
     v_holder = []
 
@@ -393,6 +444,60 @@ class DefFunctionTest(test.TestCase):
     v_holder[1].assign(11.)
     self.assertAllClose([14., 15.], wrapper(constant_op.constant(2.)))
 
+  def testDeviceAnnotationRespected(self):
+    if not context.num_gpus():
+      self.skipTest("Needs multiple devices")
+
+    a = []
+
+    @def_function.function()
+    def create_variable():
+      with ops.init_scope():
+        initial_value = random_ops.random_uniform(
+            (2, 2), maxval=1000000, dtype=dtypes.int64)
+
+      if not a:
+        with ops.device("CPU:0"):
+          a.append(resource_variable_ops.ResourceVariable(initial_value))
+
+      return a[0].read_value()
+
+    created_variable_read = create_variable()
+    self.assertRegexpMatches(created_variable_read.device, "CPU")
+
+  def testDecorate(self):
+    func = def_function.function(lambda: 1)
+    def decorator(f):
+      return lambda: 1 + f()
+
+    func._decorate(decorator)
+    self.assertEqual(func().numpy(), 2)
+
+  def testLiftPlaceholderInitializedVariable(self):
+    with ops.Graph().as_default():
+      var_list = []
+
+      @def_function.function
+      def use_variable():
+        if not var_list:
+          initial_value = array_ops.placeholder(shape=[], dtype=dtypes.float32)
+          v = variables.Variable(initial_value)
+          var_list.append(v)
+        return var_list[0] + 1.
+
+      var_plus_one = use_variable()
+      with self.session() as session:
+        init_op = var_list[0].initializer
+        session.run(init_op, feed_dict={init_op.inputs[1]: 2.})
+        self.assertEqual(3., session.run(var_plus_one))
+
+  def testDecorate_rejectedAfterTrace(self):
+    func = def_function.function(lambda: 1)
+    self.assertEqual(func().numpy(), 1)
+    msg = 'Functions cannot be decorated after they have been traced.'
+    with self.assertRaisesRegexp(ValueError, msg):
+      func._decorate(lambda f: f)
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 04f1999a840051856ff4399cf73b89253796e120..487fd5a9728daa8b6ce6f859ec2dfb9ac0e4ca4b 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 
 import collections
 import functools
-import re
 import threading
 import types as types_lib
 import weakref
@@ -44,6 +43,7 @@ from tensorflow.python.framework import error_interpolation
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import func_graph as func_graph_module
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import functional_ops
@@ -61,17 +61,94 @@ from tensorflow.python.util import tf_inspect
 FORWARD_FUNCTION_ATTRIBUTE_NAME = "forward_function_name"
 BACKWARD_FUNCTION_ATTRIBUTE_NAME = "backward_function_name"
 
-# TODO(scottzhu): Update this to allow arbitrary attribute names in future.
-WHITELIST_FUNCTION_ATTRIBUTE_REGEX = [
-    "experimental_.*",
-    FORWARD_FUNCTION_ATTRIBUTE_NAME,
-    BACKWARD_FUNCTION_ATTRIBUTE_NAME
-]
+class CacheKey(
+    collections.namedtuple("CacheKey", [
+        "input_signature", "parent_graph", "device_functions",
+        "colocation_stack", "uses_xla"])):
 
-CacheKey = collections.namedtuple("CacheKey", [
-    "input_signature", "parent_graph", "device_functions", "colocation_stack",
-    "uses_xla"
-])
+  def replace(self, *args, **kwargs):
+    return self._replace(*args, **kwargs)
+
+
+def _flat_shape_list(*params):
+  """Return a flat list of TensorShapes, one for each tensor[spec] in `*params`.
+
+  Args:
+    *params: Set of nested entries containing Tensors, TensorSpec, and
+      non-tensors.
+
+  Returns:
+    A list of entries containing either `None` or `TensorShape`.
+  """
+  return [tensor_shape.TensorShape(x.shape)
+          if isinstance(x, (ops.Tensor, tensor_spec.TensorSpec)) else None
+          for x in nest.flatten(params)]
+
+
+def _compatible_shapes(flat_x, flat_y):
+  """Check if lists of TensorShapes contain compatible shapes.
+
+  Args:
+    flat_x: List of TensorShape or None.
+    flat_y: List of TensorShape or None.
+
+  Returns:
+    A python bool.
+
+  Raises:
+    RuntimeError: if `len(flat_x) != len(flat_y)`.
+    RuntimeError: if `flat_x[i] is None != flat_y[i] is None` for any `i`.
+  """
+  if len(flat_x) != len(flat_y):
+    raise RuntimeError("Expected shape lists of identical lengths, but saw: "
+                       "%s and %s" % (flat_x, flat_y))
+  def is_compatible(x, y):
+    """Internal help function.
+
+    Args:
+      x: TensorShape or None.
+      y: TensorShape or None.
+
+    Returns:
+      Python bool.
+
+    Raises:
+      RuntimeError: If `x is None != y is None`.
+    """
+    # If both x and y are None, there is no shape to compare.  Otherwise check
+    # if they are compatible with each other.  Either way, both input signatures
+    # must have have Tensors in the same entries.  If not, raise an assertion
+    # error.
+    if x is None != y is None:
+      raise RuntimeError(
+          "Expected signature type matches between flattened input shapes "
+          "%s and %s; but saw that (%s is None) != (%s is None)"
+          % (flat_x, flat_y, x, y))
+    return x is None or x.is_compatible_with(y)
+  return all(is_compatible(x, y) for x, y in zip(flat_x, flat_y))
+
+
+def _common_shape(x, y):
+  """Find a `TensorShape` that is compatible with both `x` and `y`."""
+  if x is None != y is None:
+    raise RuntimeError(
+        "Cannot find a common shape when LHS shape is None but RHS shape "
+        "is not (or vice versa): %s vs. %s" % (x, y))
+  if x is None:
+    return None  # The associated input was not a Tensor, no shape generated.
+  if not isinstance(x, tensor_shape.TensorShape):
+    raise TypeError("Expected x to be a TensorShape but saw %s" % (x,))
+  if not isinstance(y, tensor_shape.TensorShape):
+    raise TypeError("Expected y to be a TensorShape but saw %s" % (y,))
+  if x.rank != y.rank or x.rank is None:
+    return tensor_shape.TensorShape(None)
+  dims = []
+  for dim_x, dim_y in zip(x.dims, y.dims):
+    if dim_x != dim_y or tensor_shape.dimension_value(dim_x) is None:
+      dims.append(None)
+    else:
+      dims.append(tensor_shape.dimension_value(dim_x))
+  return tensor_shape.TensorShape(dims)
 
 
 def is_same_structure(structure1,
@@ -108,12 +185,6 @@ def _parse_func_attrs(attributes):
   """
   attrs = {}
   for key, value in attributes.items():
-    if not any(re.match(reg, key)
-               for reg in WHITELIST_FUNCTION_ATTRIBUTE_REGEX):
-      raise ValueError("Attribute name is not whitelisted. "
-                       "Whitelisted: prefix %s, got: %s" %
-                       (WHITELIST_FUNCTION_ATTRIBUTE_REGEX, key))
-
     if isinstance(value, attr_value_pb2.AttrValue):
       attrs[key] = value
     # bool type check has to happen before int since bool is a subclass of int.
@@ -219,6 +290,8 @@ class _EagerDefinedFunction(object):
         [t._as_tf_output() for t in inputs],  # pylint: disable=protected-access
         [t._as_tf_output() for t in outputs],  # pylint: disable=protected-access
         [],
+        [o._c_op for o in graph.control_outputs],  # pylint: disable=protected-access
+        [],  # control_output_names
         None,
         compat.as_str(""))
 
@@ -423,21 +496,30 @@ class ConcreteFunction(object):
           "through the public interface. Use get_concrete_function instead.")
     if len(args) > self._num_positional_args:
       raise TypeError(
-          ("Expected at most {} positional arguments ({}), got {}. When "
-           "calling a concrete function, positional arguments may not be bound "
-           "to Tensors within nested structures.").format(
-               self._num_positional_args,
-               self._arg_keywords[:self._num_positional_args],
-               args))
+          ("Expected at most {} positional arguments (and the rest keywords, "
+           "of {}), got {}. When calling a concrete function, positional "
+           "arguments may not be bound to Tensors within nested structures."
+          ).format(self._num_positional_args, self._arg_keywords, args))
     args = list(args)
     for keyword in self._arg_keywords[len(args):]:
-      args.append(kwargs.pop(compat.as_str(keyword)))
+      try:
+        args.append(kwargs.pop(compat.as_str(keyword)))
+      except KeyError:
+        specified_keywords = (list(self._arg_keywords[:len(args)])
+                              + list(kwargs.keys()))
+        raise TypeError(
+            "Expected argument names {} but got values for {}. Missing: {}."
+            .format(
+                list(self._arg_keywords),
+                specified_keywords,
+                list(set(self._arg_keywords) - set(specified_keywords))))
     if kwargs:
       positional_arg_keywords = set(self._arg_keywords[:len(args)])
       for unused_key in kwargs:
         if unused_key in positional_arg_keywords:
           raise TypeError("Got two values for keyword '{}'.".format(unused_key))
-      raise TypeError("Keyword arguments {} unknown.".format(kwargs.keys()))
+      raise TypeError("Keyword arguments {} unknown. Expected {}.".format(
+          list(kwargs.keys()), list(self._arg_keywords)))
     return self._call_flat(args)
 
   def _filtered_call(self, args, kwargs):
@@ -847,6 +929,10 @@ class FunctionSpec(object):
       python_function_to_inspect = python_function.func
       args_to_prepend = python_function.args or tuple()
       kwargs_to_include = python_function.keywords or {}
+      if input_signature is not None:
+        # TODO(b/124441704): Add support for input_signature + partial.
+        raise NotImplementedError(
+            "Missing support for input_signature when using partial functions.")
     else:
       python_function_to_inspect = python_function
       args_to_prepend = tuple()
@@ -948,6 +1034,21 @@ class FunctionSpec(object):
         argument when an input signature is specified, or when the inputs
         do not conform to the input signature.
     """
+    if self._input_signature is not None:
+      if len(args) > len(self._input_signature):
+        raise TypeError(
+            "When input_signature is provided, only pass arguments "
+            "covered by it. Received %d argument(s)." % len(args))
+      for arg in six.iterkeys(kwargs):
+        index = self._args_to_indices.get(arg, None)
+        if index is None:
+          raise TypeError(
+              "Function got an unexpected keyword argument %s" % arg)
+        if index >= len(self._input_signature):
+          raise TypeError(
+              "When input_signature is provided, only pass arguments "
+              "covered by it. Received argument %s." % arg)
+
     args = self._args_to_prepend + args
     kwargs = dict(kwargs, **self._kwargs_to_include)
     if not kwargs:
@@ -979,42 +1080,110 @@ class FunctionSpec(object):
         # opposed to named arguments called in a keyword-like fashion.
         kwargs.pop(arg)
       inputs = args + _deterministic_dict_values(arg_indices_to_values)
-    flat_inputs = nest.flatten(inputs)
-
-    # Check for NumPy arrays in arguments and convert them to Tensors.
-    # TODO(nareshmodi): Skip ndarray conversion to tensor altogether, perhaps
-    # finding a way to store them directly in the cache key (currently not
-    # possible since ndarrays are not hashable).
-    need_packing = False
-    for index, value in enumerate(flat_inputs):
-      if type(value) == np.ndarray:
-        flat_inputs[index] = constant_op.constant(value)
-        need_packing = True
-    if need_packing:
-      inputs = nest.pack_sequence_as(
-          structure=inputs, flat_sequence=flat_inputs)
+
     if self._input_signature is None:
+      inputs = _convert_numpy_inputs(inputs)
       return inputs, kwargs
     else:
       assert not kwargs
-      signature_relevant_inputs = inputs[:len(self._input_signature)]
-      if not is_same_structure(self._input_signature,
-                               signature_relevant_inputs):
-        raise ValueError("Structure of Python function inputs does not match "
-                         "input_signature.")
-      signature_inputs_flat = nest.flatten(signature_relevant_inputs)
-      if any(
-          not pywrap_tensorflow.IsTensor(arg) for arg in signature_inputs_flat):
-        raise ValueError("When input_signature is provided, all inputs to "
-                         "the Python function must be Tensors.")
-      if any(not spec.is_compatible_with(other) for spec, other in zip(
-          self._flat_input_signature, signature_inputs_flat)):
-        raise ValueError("Python inputs incompatible with input_signature: "
-                         "inputs (%s), input_signature (%s)" %
-                         (str(inputs), str(self._input_signature)))
+      inputs = _convert_inputs_to_signature(
+          inputs,
+          self._input_signature,
+          self._flat_input_signature)
       return inputs, {}
 
 
+def _convert_numpy_inputs(inputs):
+  """Convert numpy array inputs to tensors."""
+  flat_inputs = nest.flatten(inputs)
+
+  # Check for NumPy arrays in arguments and convert them to Tensors.
+  # TODO(nareshmodi): Skip ndarray conversion to tensor altogether, perhaps
+  # finding a way to store them directly in the cache key (currently not
+  # possible since ndarrays are not hashable).
+  need_packing = False
+  for index, value in enumerate(flat_inputs):
+    if type(value) == np.ndarray:
+      flat_inputs[index] = constant_op.constant(value)
+      need_packing = True
+  if need_packing:
+    return nest.pack_sequence_as(
+        structure=inputs, flat_sequence=flat_inputs)
+  else:
+    return inputs
+
+
+def _convert_inputs_to_signature(inputs, input_signature, flat_input_signature):
+  """Convert inputs to pass into a function with an explicit signature."""
+  try:
+    # TODO(b/124370185): Use all elements as inputs to throw an error if there
+    # are ignored arguments. Calling with arguments that are not part of the
+    # signature should throw an error.
+    flatten_inputs = nest.flatten_up_to(
+        input_signature,
+        inputs[:len(input_signature)])
+  except ValueError:
+    raise ValueError("Structure of Python function inputs does not match "
+                     "input_signature. Inputs (%s), input_signature(%s)." %
+                     (str(inputs), str(input_signature)))
+
+  need_packing = False
+  for index, (value, spec) in enumerate(zip(flatten_inputs,
+                                            flat_input_signature)):
+    if not pywrap_tensorflow.IsTensor(value):
+      try:
+        flatten_inputs[index] = ops.convert_to_tensor(
+            value, dtype_hint=spec.dtype)
+        need_packing = True
+      except ValueError:
+        raise ValueError("When input_signature is provided, all inputs to "
+                         "the Python function must be convertible to tensors."
+                         "Inputs (%s), input_signature(%s)." %
+                         (str(inputs), str(input_signature)))
+
+  if any(not spec.is_compatible_with(other) for spec, other in zip(
+      flat_input_signature,
+      flatten_inputs)):
+    raise ValueError("Python inputs incompatible with input_signature: "
+                     "inputs (%s), input_signature (%s)" %
+                     (str(inputs), str(input_signature)))
+
+  if need_packing:
+    inputs = nest.pack_sequence_as(
+        structure=input_signature,
+        flat_sequence=flatten_inputs)
+
+  return inputs
+
+
+class FunctionCache(object):
+  """A lightweight container for cached functions.
+  """
+
+  def __init__(self):
+    # The set of functions that have been missed; entries are CacheKey with
+    # input_signature `None` (e.g. a "call context key")
+    self.missed = set()
+    # The primary cache, mapping a fully shaped CacheKey to a function.
+    self.primary = collections.OrderedDict()
+    # A cache key lookup, mapping a CacheKey generated without shape info to a
+    # flat list of relaxed shapes (one for each argument).  Arguments that are
+    # not Tensors contain a `None` for the corresponding relaxed shape.
+    self.arg_relaxed_shapes = collections.OrderedDict()
+    # The secondary cache, mapping a CacheKey generated without shape info to a
+    # function.
+    self.arg_relaxed = collections.OrderedDict()
+    # All OrderedDicts require manual garbage collection.
+    self._garbage_collectors = [
+        _FunctionGarbageCollector(self.primary),
+        _FunctionGarbageCollector(self.arg_relaxed),
+        _FunctionGarbageCollector(self.arg_relaxed_shapes)]
+
+  def all_values(self):
+    """A set of all `ConcreteFunction` instances held by this cache."""
+    return set(self.primary.values()) | set(self.arg_relaxed.values())
+
+
 class Function(object):
   """Wrapper class for the graph functions defined for a Python function.
 
@@ -1033,7 +1202,8 @@ class Function(object):
                input_signature=None,
                attributes=None,
                autograph=True,
-               autograph_options=None):
+               autograph_options=None,
+               capture_by_value=None):
     """Initializes a `Function`.
 
     Args:
@@ -1050,6 +1220,9 @@ class Function(object):
       autograph_options: Experimental knobs to control behavior
         `when autograph=True`. See https://www.tensorflow.org/guide/autograph
         for more information.
+      capture_by_value: Experimental. Whether to capture resource variables by
+        value or reference. If None, will inherit from a parent context or
+        default to False.
 
     Raises:
       ValueError: if `input_signature` is not None and the `python_function`'s
@@ -1064,9 +1237,9 @@ class Function(object):
     self._name = name
     self._autograph = autograph
     self._autograph_options = autograph_options
-    self._function_cache = collections.OrderedDict()
-    self._garbage_collector = _FunctionGarbageCollector(self._function_cache)
+    self._function_cache = FunctionCache()
     self._function_attributes = attributes or {}
+    self._capture_by_value = capture_by_value
 
     self._lock = threading.Lock()
     # _descriptor_cache is a of instance of a class to an instance-specific
@@ -1221,13 +1394,15 @@ class Function(object):
     # Return the cached `Function` for the instance
     return self._descriptor_cache[instance]
 
-  def _cache_key(self, args, kwargs):
+  def _cache_key(self, args, kwargs, include_tensor_ranks_only=False):
     """Computes the cache key given inputs and execution context."""
     if self._input_signature is None:
       inputs = (args, kwargs) if kwargs else args
-      input_signature = pywrap_tensorflow.TFE_Py_EncodeArg(inputs)
+      input_signature = pywrap_tensorflow.TFE_Py_EncodeArg(
+          inputs, include_tensor_ranks_only)
     else:
       del args, kwargs
+      assert not include_tensor_ranks_only
       input_signature = self._flat_input_signature
 
     ctx = context.context()
@@ -1273,6 +1448,46 @@ class Function(object):
     return CacheKey(input_signature, parent_graph, device_functions,
                     colocation_stack, uses_xla)
 
+  def _create_graph_function(self, args, kwargs, override_flat_arg_shapes=None):
+    """Create a `ConcreteFunction` from `args` and `kwargs`."""
+    if self._input_signature is None:
+      arglen = len(args)
+    else:
+      arglen = len(self._input_signature)
+    base_arg_names = self._function_spec.arg_names[:arglen]
+    num_missing_args = arglen - len(self._function_spec.arg_names)
+    missing_arg_names = [self._function_spec.vararg_name] * num_missing_args
+    # Produce a list of missing args of the form ["arg_0", "arg_1", ...],
+    # where arg is based on the self._function_spec.vararg_name.
+    missing_arg_names = [
+        "%s_%d" % (arg, i) for i, arg in enumerate(missing_arg_names)
+    ]
+    arg_names = base_arg_names + missing_arg_names
+    graph_function = ConcreteFunction(
+        func_graph_module.func_graph_from_py_func(
+            self._name,
+            self._python_function,
+            args,
+            kwargs,
+            self._input_signature,
+            autograph=self._autograph,
+            autograph_options=self._autograph_options,
+            arg_names=arg_names,
+            override_flat_arg_shapes=override_flat_arg_shapes,
+            capture_by_value=self._capture_by_value),
+        self._function_attributes)
+
+    # pylint: disable=protected-access
+    # Tell the ConcreteFunction to clean up its graph once it goes out of
+    # scope. ConcreteFunction does not do this in its constructor since it
+    # gets used in some places (like Keras) where the FuncGraph lives
+    # longer than the ConcreteFunction.
+    graph_function._garbage_collector = ConcreteFunctionGarbageCollector(
+        graph_function.graph)
+    # pylint: enable=protected-access
+
+    return graph_function
+
   def _maybe_define_function(self, args, kwargs):
     """Gets a function for these inputs, defining it if necessary.
 
@@ -1290,55 +1505,76 @@ class Function(object):
     Raises:
       ValueError: If inputs are incompatible with the input signature.
       TypeError: If the function inputs include non-hashable objects
+      RuntimeError: If there's an internal bug (inconsistency) in handling
+        shape relaxation retracing.
     """
     if self._input_signature is None or args is not None or kwargs is not None:
       args, kwargs = self._function_spec.canonicalize_function_inputs(
           *args, **kwargs)
     cache_key = self._cache_key(args, kwargs)
+
+    try:
+      hash(cache_key)
+    except TypeError as e:
+      raise TypeError(
+          "Arguments supplied to `defun`-generated functions must be"
+          " hashable.  Original error: %s" % e)
+
     with self._lock:
-      try:
-        graph_function = self._function_cache.get(cache_key, None)
-      except TypeError as e:
-        raise TypeError(
-            "Arguments supplied to `defun`-generated functions must be"
-            " hashable.  Original error: %s" % e)
-
-      if graph_function is None:
-        logging.vlog(1,
-                     "Creating new FuncGraph for Python function %r (key: %r)",
-                     self._python_function, cache_key)
-        if self._input_signature is None:
-          arglen = len(args)
-        else:
-          arglen = len(self._input_signature)
-        base_arg_names = self._function_spec.arg_names[:arglen]
-        num_missing_args = arglen - len(self._function_spec.arg_names)
-        missing_arg_names = [self._function_spec.vararg_name] * num_missing_args
-        # Produce a list of missing args of the form ["arg_0", "arg_1", ...],
-        # where arg is based on the self._function_spec.vararg_name.
-        missing_arg_names = [
-            "%s_%d" % (arg, i) for i, arg in enumerate(missing_arg_names)
-        ]
-        arg_names = base_arg_names + missing_arg_names
-        graph_function = ConcreteFunction(
-            func_graph_module.func_graph_from_py_func(
-                self._name,
-                self._python_function,
-                args,
-                kwargs,
-                self._input_signature,
-                autograph=self._autograph,
-                autograph_options=self._autograph_options,
-                arg_names=arg_names), self._function_attributes)
-        # pylint: disable=protected-access
-        # Tell the ConcreteFunction to clean up its graph once it goes out of
-        # scope. ConcreteFunction does not do this in its constructor since it
-        # gets used in some places (like Keras) where the FuncGraph lives
-        # longer than the ConcreteFunction.
-        graph_function._garbage_collector = _ConcreteFunctionGarbageCollector(
-            graph_function.graph)
-        # pylint: enable=protected-access
-        self._function_cache[cache_key] = graph_function
+      graph_function = self._function_cache.primary.get(cache_key, None)
+      if graph_function is not None:
+        return graph_function, args, kwargs
+
+      logging.vlog(1,
+                   "Creating new FuncGraph for Python function %r (key: %r)",
+                   self._python_function, cache_key)
+      logging.vlog(2,
+                   "Python function signature [args: %s] [kwargs: %s]",
+                   str(args),
+                   str(kwargs))
+
+      call_context_key = cache_key.replace(input_signature=None)
+
+      # If there's a provided input signature, or XLA is being used, or
+      # there's no cache miss for this calling context so far, go ahead and
+      # build the function and bypass shape relaxation retracing.
+      if (self._input_signature is not None
+          or cache_key.uses_xla
+          or call_context_key not in self._function_cache.missed):
+        self._function_cache.missed.add(call_context_key)
+        graph_function = self._create_graph_function(args, kwargs)
+        self._function_cache.primary[cache_key] = graph_function
+        return graph_function, args, kwargs
+
+      rank_only_cache_key = self._cache_key(
+          args, kwargs, include_tensor_ranks_only=True)
+
+      arg_shapes = _flat_shape_list(args, kwargs)
+      relaxed_arg_shapes = self._function_cache.arg_relaxed_shapes.get(
+          rank_only_cache_key, None)
+      relaxed_arg_function = self._function_cache.arg_relaxed.get(
+          rank_only_cache_key, None)
+
+      if (relaxed_arg_function is not None
+          and _compatible_shapes(relaxed_arg_shapes, arg_shapes)):
+        return relaxed_arg_function, args, kwargs
+
+      if relaxed_arg_shapes is None:
+        relaxed_arg_shapes = arg_shapes
+      else:
+        if len(arg_shapes) != len(relaxed_arg_shapes):
+          raise RuntimeError("Expected arg_shapes len to match "
+                             "relaxed_arg_shapes len: %d vs. %d"
+                             % (len(arg_shapes), len(relaxed_arg_shapes)))
+        relaxed_arg_shapes = [
+            _common_shape(x, y) for (x, y) in zip(
+                arg_shapes, relaxed_arg_shapes)]
+      self._function_cache.arg_relaxed_shapes[rank_only_cache_key] = (
+          relaxed_arg_shapes)
+      graph_function = self._create_graph_function(
+          args, kwargs, override_flat_arg_shapes=relaxed_arg_shapes)
+      self._function_cache.arg_relaxed[rank_only_cache_key] = graph_function
+
       return graph_function, args, kwargs
 
 
@@ -1858,7 +2094,7 @@ class _FunctionGarbageCollector(object):
       pass
 
 
-class _ConcreteFunctionGarbageCollector(object):
+class ConcreteFunctionGarbageCollector(object):
   """Cleans up reference cycles when a `ConcreteFunction` goes out of scope."""
 
   def __init__(self, func_graph):
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index c8f7ecbca2cbfd74e61079819653db544b70a442..7a17eb6ee2196a695014196fb3f40b6a1296cc9c 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -49,6 +49,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_functional_ops
 from tensorflow.python.ops import gen_random_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import init_ops
@@ -65,6 +66,13 @@ from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
 
+def total_function_cache(defined):
+  # pylint: disable=protected-access
+  return (set(defined._function_cache.primary)
+          | set(defined._function_cache.arg_relaxed))
+  # pylint: enable=protected-access
+
+
 class MiniModel(keras_training.Model):
   """Minimal model for mnist.
 
@@ -98,6 +106,94 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(sq.numpy().reshape(-1), [10, 14, 14, 20])
     self.assertAllEqual(sq2.numpy().reshape(-1), [52, 76, 74, 108])
 
+  def testVariable(self):
+    v1 = variables.Variable(1.0)
+    add = def_function.function(lambda x, v: x + v1 + v)
+    v2 = variables.Variable(1.0)
+    x = constant_op.constant(1.0)
+    r = add(x, v2)
+    self.assertEqual(3.0, self.evaluate(r))
+
+  def testInputShapeFunctionRelaxation(self):
+    unknown_dim = [False]
+
+    @function.defun
+    def func(a):
+      if a._shape_tuple()[0] is None:
+        unknown_dim[0] = True
+      return a + 1
+
+    func(constant_op.constant([]))
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 1)
+
+    func(constant_op.constant([1.0]))
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 2)
+
+    func(constant_op.constant([1.0, 2.0]))
+    self.assertTrue(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 2)
+
+  def testNestedInputShapeFunctionRelaxation(self):
+    unknown_dim = [False]
+
+    @function.defun
+    def func(a_, b_=None):
+      del a_  # Only used to check which cache is used.
+      self.assertEqual(b_[0]._shape_tuple(), ())
+      if b_[1]._shape_tuple()[0] is None:
+        unknown_dim[0] = True
+      return b_[0] + 1
+
+    a = 'hi'
+    b0 = constant_op.constant(1.0)
+    func(a, b_=[b0, constant_op.constant([])])
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 1)
+
+    func(a, b_=[b0, constant_op.constant([1.0])])
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 2)
+
+    func(a, b_=[b0, constant_op.constant([1.0, 1.0])])
+    self.assertTrue(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 2)
+
+    unknown_dim[0] = False
+
+    # Now do the same except with a new a which is not a tensor; this should
+    # change the cache key.
+    a = 'bye'
+    func(a, b_=[b0, constant_op.constant([])])
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 3)
+
+    # Since we already marked a cache miss for a function with the same
+    # non-input signatures, here we will immediately start relaxing shapes.
+    func(a, b_=[b0, constant_op.constant([1.0])])
+    self.assertTrue(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 3)
+
+  def testFunctionRelaxationLosesInnerDimWithKerasLayer(self):
+    layer = keras.layers.Dense(1)
+    fn = def_function.function()(layer)
+
+    with self.captureWritesToStream(sys.stderr) as printed:
+      fn(array_ops.ones((3, 2)))
+      self.assertNotIn('ValueError', printed.contents())
+    with self.captureWritesToStream(sys.stderr) as printed:
+      # Use batch size 2 to trigger a second cache miss on the shape.
+      fn(array_ops.ones((2, 2)))
+      self.assertNotIn('ValueError', printed.contents())
+
+    # Shape relaxation passes TensorShape([None, None]), which causes layer
+    # matmul to fail, due to incompatible dims.  What would have been a graph
+    # build time error (layer would complain about the inner dim being 4).
+    with self.captureWritesToStream(sys.stderr) as printed:
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, r'MatMul'):
+        fn(array_ops.ones((3, 4)))
+
   def testWastedAdd(self):
 
     @def_function.function()
@@ -381,13 +477,13 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     x = random_ops.random_uniform([2, 2]).numpy()
     defined = function.defun(f)
     defined(x)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     x = random_ops.random_uniform([2, 2]).numpy()
     defined(x)
     # A NumPy array with different values but the same shape and dtype
     # shouldn't trigger another function definition.
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     # Test that the numpy array is properly an argument to the graph function.
     self.assertEqual(1., defined(numpy.ones([])).numpy())
@@ -466,6 +562,50 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     value = tensor_init()
     self.assertAllEqual(value, 2.0)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testGetConcreteFunctionCreatesVariables(self):
+
+    v_holder = []
+
+    @def_function.function
+    def tensor_init():
+      if not v_holder:
+        v_holder.append(variables.Variable(5.))
+      return v_holder[0].read_value()
+
+    concrete = tensor_init.get_concrete_function()
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual(5., self.evaluate(concrete()))
+    self.assertAllEqual(5., self.evaluate(tensor_init()))
+
+  def testFuncGraphCaptureByValue(self):
+    v = variables.Variable(1.0)
+
+    def trivial_function():
+      return v.read_value()
+
+    graph_function = function.Function(
+        trivial_function, 'test', capture_by_value=True)
+
+    self.assertAllEqual(graph_function(), 1.0)
+    v.assign(2.0)
+    self.assertAllEqual(graph_function(), 1.0)
+
+  def testFuncGraphCaptureByValueNested(self):
+    v = variables.Variable(1.0)
+
+    def trivial_function():
+      return control_flow_ops.cond(
+          array_ops.placeholder_with_default(True, ()),
+          v.read_value, v.read_value)
+
+    graph_function = function.Function(
+        trivial_function, 'test', capture_by_value=True)
+
+    self.assertAllEqual(graph_function(), 1.0)
+    v.assign(2.0)
+    self.assertAllEqual(graph_function(), 1.0)
+
   def testDefunShapeInferenceWithCapturedResourceVariable(self):
     v = resource_variable_ops.ResourceVariable([[1, 2], [3, 4]])
 
@@ -484,6 +624,43 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     var_t = resource_variable_ops.read_variable_op(var_handle, dtype=v.dtype)
     self.assertEqual(var_t.shape, tensor_shape.TensorShape([2, 2]))
 
+  def testShapeInferenceForMoreSpecificInput(self):
+    self.skipTest('b/124219898')
+
+    def f(a):
+      return array_ops.reshape(a, [-1, 3])
+
+    signature = [tensor_spec.TensorSpec(None, dtypes.float32)]
+    compiled = def_function.function(f, input_signature=signature)
+
+    with ops.Graph().as_default():
+      inputs = array_ops.zeros([10, 10, 3])
+      self.assertAllEqual(f(inputs).shape, compiled(inputs).shape)
+
+  def testFuncListAttr(self):
+
+    @function.defun
+    def test_function(val):
+
+      def fn1():
+        return array_ops.ones([10])
+
+      fn2 = lambda: array_ops.ones([10]) * 2
+
+      def fn3(x=2):
+        return array_ops.ones([10]) * x
+      fn3 = functools.partial(fn3, x=3)
+
+      return gen_functional_ops.case(val, [], [dtypes.float32],
+                                     [function.defun(f).get_concrete_function()
+                                      for f in (fn1, fn2, fn3)])
+
+    ones = array_ops.ones([10])
+    self.assertAllEqual([ones], test_function(0))
+    self.assertAllEqual([ones * 2], test_function(1))
+    self.assertAllEqual([ones * 3], test_function(2))
+    self.assertAllEqual([ones * 3], test_function(22))  # default branch
+
   @test_util.enable_control_flow_v2
   def testVariableInLoopInFunction(self):
 
@@ -579,7 +756,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     run_metadata = context.export_run_metadata()
     context.disable_run_metadata()
     step_stats = run_metadata.step_stats
-    self.assertGreater(len(step_stats.dev_stats), 0)
+    self.assertNotEmpty(step_stats.dev_stats)
     cpu_stats = step_stats.dev_stats[0]
     self.assertEqual('/job:localhost/replica:0/task:0/device:CPU:0',
                      cpu_stats.device)
@@ -588,10 +765,10 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     # arbitrarily many (placeholders, return identities, etc, might be included
     # or not in the future, so shouldn't be tested for exactly.
     self.assertGreaterEqual(len(cpu_stats.node_stats), 2)
-    self.assertEqual(len(run_metadata.partition_graphs), 1)
+    self.assertLen(run_metadata.partition_graphs, 1)
 
   def testGraphModeCaptureVariable(self):
-    with context.graph_mode(), self.cached_session() as sess:
+    with context.graph_mode(), self.cached_session():
 
       class HasAVar(object):
 
@@ -807,8 +984,9 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       return None
 
     with self.assertRaisesRegexp(
-        errors.InvalidArgumentError, 'Could not colocate node with its '
-        'resource and reference inputs.*'):
+        errors.InvalidArgumentError,
+        'Cannot place the graph because a reference or resource edge connects '
+        'colocation groups with incompatible assigned devices'):
       if not context.executing_eagerly():
         self.evaluate(variables.global_variables_initializer())
       self.evaluate(resource_apply_adam())
@@ -915,7 +1093,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
                                   constant_op.constant(4)],
         constant_op.constant(5)
     ])
-    self.assertEqual(len(ret), 2)
+    self.assertLen(ret, 2)
     self.assertAllEqual(ret[0][0], 2)
     self.assertAllEqual(ret[0][1][0][0], 8)
     self.assertAllEqual(ret[0][1][0][1], 4)
@@ -1027,7 +1205,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     defined = function.defun(multi_device_fn)
     outputs = self.evaluate(defined())
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
     self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
     self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
@@ -1035,7 +1213,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     with ops.device('/cpu:3'):
       outputs = self.evaluate(defined())
     # All function definitions are agnostic to call site devices.
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
     self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
     self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
@@ -1043,7 +1221,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     with ops.device('/cpu:0'):
       outputs = self.evaluate(defined())
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
     self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
     self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
@@ -1057,7 +1235,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     def func():
       return constant_op.constant(0)
 
-    defined = function.defun(func)
+    defined = def_function.function(func)
     with ops.device('cpu:0'):
       cpu_graph_function = defined.get_concrete_function()
 
@@ -1128,10 +1306,38 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     defined = function.defun(func)
     defined(Foo())
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     defined(Foo())
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
+
+  def testCacheTensorDtypeCollision(self):
+
+    def func(t):
+      return t + t
+
+    defined = function.defun(func)
+    t = constant_op.constant([[1.0]], dtype=dtypes.complex64)
+    defined(t)
+    self.assertLen(total_function_cache(defined), 1)
+
+    t = constant_op.constant([[1.0]], dtype=dtypes.complex128)
+    defined(t)
+    self.assertLen(total_function_cache(defined), 2)
+
+  def testCacheTensorShapeCollision(self):
+
+    def func(t):
+      return t + t
+
+    defined = function.defun(func)
+    t = constant_op.constant([[1.0]], dtype=dtypes.complex64)
+    defined(t)
+    self.assertLen(total_function_cache(defined), 1)
+
+    t = constant_op.constant([1.0], dtype=dtypes.complex64)
+    defined(t)
+    self.assertLen(total_function_cache(defined), 2)
 
   def testCacheTensorShapeDtypeCollision(self):
 
@@ -1141,11 +1347,11 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     defined = function.defun(func)
     t = constant_op.constant([[1.0]], dtype=dtypes.complex64)
     defined(t)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     t = constant_op.constant([1.0], dtype=dtypes.complex128)
     defined(t)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
   def testCacheTensorUnknownShapesCollision(self):
 
@@ -1155,21 +1361,34 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     with context.graph_mode(), self.cached_session():
       defined = function.defun(func)
 
-      p = array_ops.placeholder(dtype=dtypes.float32, shape=None)
+      p = array_ops.placeholder(dtype=dtypes.float32, shape=[])
       defined(p)
-      self.assertEqual(len(defined._function_cache), 1)
+      self.assertLen(total_function_cache(defined), 1)
 
-      p = array_ops.placeholder(dtype=dtypes.float32, shape=[None])
+      p = array_ops.placeholder(dtype=dtypes.float32, shape=[1])
       defined(p)
-      self.assertEqual(len(defined._function_cache), 2)
+      self.assertLen(total_function_cache(defined), 2)
 
-      p = array_ops.placeholder(dtype=dtypes.float32, shape=[None, None])
+      p = array_ops.placeholder(dtype=dtypes.float32, shape=[2])
       defined(p)
-      self.assertEqual(len(defined._function_cache), 3)
-
-      t = constant_op.constant(1.0, dtype=dtypes.float32)
+      # Gradual shape relaxation is performed; and the common shape between
+      # [1] and [2] is one containing unknown dimensions.
+      self.assertLen(total_function_cache(defined), 2)
+
+      # pylint: disable=protected-access
+      self.assertLen(defined._function_cache.arg_relaxed_shapes, 1)
+      relaxed_shapes = (
+          list(defined._function_cache.arg_relaxed_shapes.values())[0])
+      self.assertEqual(len(relaxed_shapes), 1)
+      relaxed_shape = relaxed_shapes[0]
+      # pylint: enable=protected-access
+      self.assertEqual(relaxed_shape.rank, 1)
+      self.assertEqual(tensor_shape.dimension_value(relaxed_shape[0]), None)
+
+      t = constant_op.constant([1.0, 1.0, 1.0], dtype=dtypes.float32)
       defined(t)
-      self.assertEqual(len(defined._function_cache), 4)
+      # Shape (3,) matches the relaxed shape TensorShape([None])
+      self.assertLen(total_function_cache(defined), 2)
 
   def testPythonFunctionWithDefaultArgs(self):
 
@@ -1184,35 +1403,36 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     def cache_keys():
       """Sanitizes cache keys of non-input metadata."""
-      return tuple(key[0] for key in defined._function_cache)
+      return tuple(key[0] for key in total_function_cache(defined))
 
     # `True` corresponds to the fact that we're executing eagerly
-    self.assertIn(('URRR', (0, 1, 20)), cache_keys())
+    self.assertIn(('URRRu', (0, 1, 20)), cache_keys())
 
     defined(1)  # bar=1, baz=2
-    self.assertIn(('URRR', (1, 1, 2)), cache_keys())
+    self.assertIn(('URRRu', (1, 1, 2)), cache_keys())
 
     # This matches the previous call.
     defined(foo=1)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
     defined(1, 2, 3)
-    self.assertIn(('URRR', (1, 2, 3)), cache_keys())
+    self.assertLen(total_function_cache(defined), 3)
+    self.assertIn(('URRRu', (1, 2, 3)), cache_keys())
 
     # This matches the previous call.
     defined(1, bar=2, baz=3)
-    self.assertEqual(len(defined._function_cache), 3)
+    self.assertLen(total_function_cache(defined), 3)
 
     # This matches the previous call.
     defined(1, baz=3, bar=2)
-    self.assertEqual(len(defined._function_cache), 3)
+    self.assertLen(total_function_cache(defined), 3)
 
   def testFunctoolsPartialUnwrappedCorrectly(self):
 
     def full_function(a, b, c=3):
       return a, b, c
 
-    partial = functools.partial(full_function, 1, c=3)
+    partial = functools.partial(full_function, 1, c=4)
     a, b, c = partial(2)
 
     defined = function.defun(partial)
@@ -1221,7 +1441,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(func_b.numpy(), b)
     self.assertEqual(func_c.numpy(), c)
 
-  def testInputSignatureWithCompatibleInputs(self):
+  def testInputSignatureWithMatchingInputs(self):
 
     def foo(a):
       self.assertEqual(a.shape, (2,))
@@ -1231,12 +1451,12 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     defined = function.defun(foo, input_signature=signature)
     a = array_ops.ones([2])
     self.assertAllEqual(a, defined(a))
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertAllEqual(a, defined.get_concrete_function()(a))
     self.assertAllEqual(a, defined.get_concrete_function(a)(a))
     self.assertAllEqual(a, defined.get_concrete_function(
         tensor_spec.TensorSpec((2,), dtype=dtypes.float32))(a))
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     def bar(a):
       self.assertEqual(a._shape_tuple(), (2, None))
@@ -1246,31 +1466,55 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     defined = function.defun(bar, input_signature=signature)
     a = array_ops.ones([2, 1])
     out = defined(a)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertAllEqual(out, a)
 
     # Changing the second dimension shouldn't create a new function.
     b = array_ops.ones([2, 3])
     out = defined(b)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertAllEqual(out, b)
 
+  def testInputSignatureWithCompatibleInputs(self):
+
+    rank2_spec = tensor_spec.TensorSpec(shape=(None, None),
+                                        dtype=dtypes.float32)
+
+    @function.defun(input_signature=[rank2_spec])
+    def func(a):
+      self.assertEqual([None, None], a.shape.as_list())
+      return array_ops.shape(a)
+
+    self.assertAllEqual([3, 1], func([[0], [1.0], [1]]))
+    self.assertAllEqual([2, 2], func(numpy.array([[1, 1], [2, 2]])))
+
+    with self.assertRaisesRegexp(ValueError, 'incompatible'):
+      func([0.0, 1.0, 2.0])  # Wrong shape.
+
+    with self.assertRaisesRegexp(ValueError, 'incompatible'):
+      func([['wrong dtype']])
+
   def testNestedInputSignatures(self):
 
+    def expected_foo(a, b):
+      return [a, b]
+
+    @function.defun(input_signature=[
+        [tensor_spec.TensorSpec((2, None), dtypes.float32)] * 2,
+        tensor_spec.TensorSpec((1,), dtypes.float32),
+    ])
     def foo(a, b):
       self.assertEqual(a[0]._shape_tuple(), (2, None))
       self.assertEqual(a[1]._shape_tuple(), (2, None))
       self.assertEqual(b._shape_tuple(), (1,))
       return [a, b]
 
-    signature = [[tensor_spec.TensorSpec((2, None), dtypes.float32)] * 2,
-                 tensor_spec.TensorSpec((1,), dtypes.float32)]
-    defined = function.defun(foo, input_signature=signature)
     a = array_ops.ones([2, 1])
     b = array_ops.ones([1])
-    out = defined([a, a], b)
-    self.assertEqual(len(defined._function_cache), 1)
-    nest.assert_same_structure(out, [[a, a], b])
+    expected = expected_foo([a, a], b)
+    out = foo([a, a], b)
+    self.assertLen(total_function_cache(foo), 1)
+    nest.assert_same_structure(out, expected)
     self.assertAllEqual(out[0][0], a)
     self.assertAllEqual(out[0][1], a)
     self.assertAllEqual(out[1], b)
@@ -1279,33 +1523,58 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     a = array_ops.ones([2, 3])
     b = array_ops.ones([2, 5])
     c = array_ops.ones([1])
-    out = defined([a, b], c)
-    self.assertEqual(len(defined._function_cache), 1)
-    nest.assert_same_structure(out, [[a, b], c])
+    expected = expected_foo([a, b], c)
+    out = foo([a, b], c)
+    self.assertLen(total_function_cache(foo), 1)
+    nest.assert_same_structure(out, expected)
+    self.assertAllEqual(out[0][0], a)
+    self.assertAllEqual(out[0][1], b)
+    self.assertAllEqual(out[1], c)
+
+    # Passing compatible inputs should work.
+    a = a.numpy().tolist()
+    b = b.numpy().tolist()
+    c = c.numpy().tolist()
+    out = foo([a, b], c)
+    self.assertLen(total_function_cache(foo), 1)
+    nest.assert_same_structure(out, expected)
     self.assertAllEqual(out[0][0], a)
     self.assertAllEqual(out[0][1], b)
     self.assertAllEqual(out[1], c)
 
+  def testNestedInputSignaturesWithDict(self):
+    def expected_bar(a):
+      return a
+
+    @function.defun(input_signature=[{
+        'a': tensor_spec.TensorSpec((2, None), dtypes.float32),
+        'b': tensor_spec.TensorSpec((2, None), dtypes.float32),
+        'c': tensor_spec.TensorSpec((1,), dtypes.float32)}])
     def bar(a):
       self.assertEqual(a['a']._shape_tuple(), (2, None))
       self.assertEqual(a['b']._shape_tuple(), (2, None))
       self.assertEqual(a['c']._shape_tuple(), (1,))
       return a
 
-    signature = [{
-        'a': tensor_spec.TensorSpec((2, None), dtypes.float32),
-        'b': tensor_spec.TensorSpec((2, None), dtypes.float32),
-        'c': tensor_spec.TensorSpec((1,), dtypes.float32)
-    }]
     a = array_ops.ones([2, 3])
     b = array_ops.ones([1])
     inputs = {'a': a, 'b': a, 'c': b}
-    defined = def_function.function(bar, input_signature=signature)
-    out = defined(inputs)
-    nest.assert_same_structure(out, inputs)
-    self.assertAllEqual(out['a'], inputs['a'])
-    self.assertAllEqual(out['b'], inputs['b'])
-    self.assertAllEqual(out['c'], inputs['c'])
+    expected = expected_bar(inputs)
+    out = bar(inputs)
+    nest.assert_same_structure(out, expected)
+    self.assertAllEqual(out['a'], expected['a'])
+    self.assertAllEqual(out['b'], expected['b'])
+    self.assertAllEqual(out['c'], expected['c'])
+
+    # Passing compatible inputs should work.
+    a = a.numpy().tolist()
+    b = b.numpy().tolist()
+    inputs = {'a': a, 'b': a, 'c': b}
+    out = bar(inputs)
+    nest.assert_same_structure(out, expected)
+    self.assertAllEqual(out['a'], expected['a'])
+    self.assertAllEqual(out['b'], expected['b'])
+    self.assertAllEqual(out['c'], expected['c'])
 
   def testInputSignatureMustBeSequenceOfTensorSpecs(self):
 
@@ -1324,6 +1593,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
                                  'tuple or a list.*'):
       function.defun(foo, input_signature=signature)
 
+  @test_util.run_in_graph_and_eager_modes
   def testInputsIncompatibleWithSignatureRaisesError(self):
 
     def foo(a):
@@ -1340,9 +1610,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       defined(array_ops.ones([2, 1]))
 
     # Wrong number of arguments.
-    with self.assertRaisesRegexp(
-        ValueError,
-        'Arguments and signature arguments do not match.*'):
+    with self.assertRaisesRegexp(TypeError, 'Received 2 argument\(s\)'):
       defined(array_ops.ones([2]), array_ops.ones([2]))
     with self.assertRaisesRegexp(ValueError,
                                  'Structure of Python function inputs.*'):
@@ -1353,7 +1621,60 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       defined.get_concrete_function(
           tensor_spec.TensorSpec(shape=(3,), dtype=dtypes.float32))
 
-  def testInputSignatureForFunctionWithNonTensorInputsNotAllowed(self):
+  def testInputsIncompatibleWithNestedSignatureRaisesError(self):
+
+    def foo(a, b):
+      return [a, b]
+
+    signature = [[tensor_spec.TensorSpec((1,), dtypes.float32)] * 2,
+                 [tensor_spec.TensorSpec((1,), dtypes.float32)] * 2]
+    defined = function.defun(foo, input_signature=signature)
+    a = array_ops.ones([1])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'Structure of Python function inputs.*'):
+      defined([a, a, a], [a])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'Structure of Python function inputs.*'):
+      defined([a], [a, a, a])
+    defined([a, a], [a, a])
+
+  def testUnderspecifiedInputSignature(self):
+    @function.defun(input_signature=[
+        tensor_spec.TensorSpec([], dtypes.float32),
+    ])
+    def foo(a, training=True):
+      if training:
+        return a
+      else:
+        return -1.0 * a
+
+    x = constant_op.constant(1.0)
+    with self.assertRaisesRegexp(TypeError, 'only pass arguments'):
+      foo(x, training=True)
+
+    with self.assertRaisesRegexp(TypeError, 'only pass arguments'):
+      foo(x, training=False)
+
+    self.assertAllEqual(x.numpy(), foo(x).numpy())
+
+  def testInputSignatureWithPartialFunction(self):
+    self.skipTest('b/124441704')
+    def full_function(a, b, c=3.0):
+      return a, b, c
+
+    partial = functools.partial(full_function, 1, c=4)
+    a, b, c = partial(2.0)
+    signature = [tensor_spec.TensorSpec([], dtypes.float32)]
+    defined = function.defun(partial, input_signature=signature)
+    x = constant_op.constant(2.0)
+    func_a, func_b, func_c = defined(x)
+    self.assertEqual(func_a.numpy(), a)
+    self.assertEqual(func_b.numpy(), b)
+    self.assertEqual(func_c.numpy(), c)
+
+  def testInputSignatureConversionWithDefaultArg(self):
 
     def foo(a, training=True):
       if training:
@@ -1367,11 +1688,9 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     ]
     defined = def_function.function(foo, input_signature=signature)
     a = constant_op.constant(1.0)
-    with self.assertRaisesRegexp(
-        ValueError,
-        'When input_signature is provided, all inputs to '
-        'the Python function must be Tensors.'):
-      defined(a, training=True)
+    self.assertAllEqual(a.numpy(), defined(a))
+    self.assertAllEqual(a.numpy(), defined(a, training=True))
+    self.assertAllEqual(-a.numpy(), defined(a, training=False))
 
   def testInputSignatureWithKeywordPositionalArgs(self):
 
@@ -1386,22 +1705,22 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     integer = constant_op.constant(2, dtypes.int64)
 
     out1, out2 = foo(flt, integer)
-    self.assertEqual(len(foo._function_cache), 1)
+    self.assertLen(total_function_cache(foo), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
     out1, out2 = foo(flt=flt, integer=integer)
-    self.assertEqual(len(foo._function_cache), 1)
+    self.assertLen(total_function_cache(foo), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
     out1, out2 = foo(integer=integer, flt=flt)
-    self.assertEqual(len(foo._function_cache), 1)
+    self.assertLen(total_function_cache(foo), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
     out1, out2 = foo(flt, integer=integer)
-    self.assertEqual(len(foo._function_cache), 1)
+    self.assertLen(total_function_cache(foo), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
@@ -1431,27 +1750,27 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     a = constant_op.constant(2.0)
     b = constant_op.constant([1.0, 2.0])
     one = defined(a, b)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     two = defined(a=a, b=b)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     three = defined(b=b, a=a)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     four = defined(a, b=b)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     # The next call corresponds to a new input signature, hence
     # we expect another function to be defined.
     five = defined(b, a)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
     six = defined(a=b, b=a)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
     seven = defined(b=a, a=b)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
     self.assertAllEqual(one, [1.0, 2.0])
     self.assertAllEqual(two, [1.0, 2.0])
@@ -1533,35 +1852,24 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
         graph = ops.get_default_graph()
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 2)
+        self.assertLen(graph._functions, 2)
         functions = list(graph._functions.values())
         self.assertRegexpMatches(
             functions[0].definition.signature.name, '.*matmul.*')
         attrs = functions[0].definition.attr
-        self.assertEqual(len(attrs), 2)
+        self.assertLen(attrs, 2)
         self.assertEqual(attrs['experimental_1'].s, b'value1')
         self.assertEqual(attrs['experimental_2'].i, 2)
 
         self.assertRegexpMatches(
             functions[1].definition.signature.name, '.*add.*')
         attrs = functions[1].definition.attr
-        self.assertEqual(len(attrs), 2)
+        self.assertLen(attrs, 2)
         self.assertEqual(attrs['experimental_3'].b, True)
         self.assertEqual(attrs['experimental_4'].f, 1.0)
         # pylint: enable=protected-access
 
   def testFunctionWithInvalidAttribute(self):
-    @function.defun_with_attributes(attributes={'attr1': 'value1'})
-    def matmul(x, y):
-      return math_ops.matmul(x, y)
-
-    with self.assertRaisesRegexp(ValueError,
-                                 '.*Attribute name is not whitelisted.*'):
-      with context.graph_mode(), self.cached_session():
-        with ops.get_default_graph().as_default():
-          t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
-          matmul(t, t)
-
     @function.defun_with_attributes(attributes={'experimental_1': ['value1']})
     def add(x, y):
       return math_ops.add(x, y)
@@ -1591,7 +1899,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
         graph = ops.get_default_graph()
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 6)
+        self.assertLen(graph._functions, 6)
         # two sets of functions, each of them are (inference, forward, backward)
         functions = list(graph._functions.values())
         captured_function_names = [
@@ -1630,7 +1938,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
         self.assertAllEqual(double.eval().reshape(-1), [2, 4, 6, 8])
         # Make sure the pre registered function is used, and no other function
         # is added.
-        self.assertEqual(len(graph._functions), 6)
+        self.assertLen(graph._functions, 6)
         functions = list(graph._functions.values())
         for i in range(len(functions)):
           self.assertEqual(captured_function_names[i],
@@ -1667,7 +1975,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
         graph = ops.get_default_graph()
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 6)
+        self.assertLen(graph._functions, 6)
         # two sets of functions, each of them are (inference, forward, backward)
         functions = list(graph._functions.values())
         captured_function_names = [
@@ -1693,7 +2001,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
         self.assertAllEqual([[1, 2], [3, 4]], self.evaluate(composite_t))
         # Make sure the pre registered function is used, and no other function
         # is added.
-        self.assertEqual(len(graph._functions), 6)
+        self.assertLen(graph._functions, 6)
 
   def testRegisterFunctionWithInputSignature(self):
     def matmul(x, y):
@@ -1711,12 +2019,12 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
         graph = ops.get_default_graph()
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 3)
+        self.assertLen(graph._functions, 3)
 
         # Test register function with cache, note inputs are ignored.
         function.register(defun_matmul)
         graph = ops.get_default_graph()
-        self.assertEqual(len(graph._functions), 3)
+        self.assertLen(graph._functions, 3)
 
   def testRegisterFunctionWithCache(self):
     def matmul(x, y):
@@ -1733,7 +2041,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
         graph = ops.get_default_graph()
         # Only one function is registered since the input param are in same type
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 3)
+        self.assertLen(graph._functions, 3)
 
   def testCallingFunctionWithDifferentVariables(self):
 
@@ -1744,8 +2052,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     v = resource_variable_ops.ResourceVariable(0.0)
     graph_function = foo.get_concrete_function(v)
-    self.assertEqual(len(graph_function.inputs), 1)
-    self.assertEqual(len(graph_function.captured_inputs), 0)
+    self.assertLen(graph_function.inputs, 1)
+    self.assertEmpty(graph_function.captured_inputs)
 
     self.assertEqual(float(graph_function(v)), 1.0)
     self.assertEqual(float(graph_function(v)), 2.0)
@@ -1773,34 +2081,30 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       graph_function('Not a Tensor.')
 
   def testSwapImplementationWithGrapplerPlugin(self):
+    # Set the min_graph_nodes to -1 since the graph in this test is too small,
+    # and will be ignored by grappler if don't set this.
     rewrites = rewriter_config_pb2.RewriterConfig()
-    # function_optimizer has to be turn off, otherwise it will delete the
-    # registered function if it does not get called.
-    # TODO(scottzhu): Move the ExperimentalImplementationSelector to be called
-    # before function_optimizer in future.
-    rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
-    customer_optimizer = rewrites.custom_optimizers.add()
-    customer_optimizer.name = 'ExperimentalImplementationSelector'
+    rewrites.implementation_selector = rewriter_config_pb2.RewriterConfig.ON
     rewrites.min_graph_nodes = -1
     graph_options = config_pb2.GraphOptions(
         rewrite_options=rewrites, build_cost_model=1)
     config = config_pb2.ConfigProto(graph_options=graph_options)
 
     with context.graph_mode(), self.cached_session(
-        config=config, graph=ops.Graph(), use_gpu=True) as sess:
+        config=config, graph=ops.Graph(), use_gpu=True):
 
       @function.defun_with_attributes(
           attributes={
-              'experimental_api_implements': 'random_boost',
-              'experimental_api_preferred_device': 'CPU'
+              'api_implements': 'random_boost',
+              'api_preferred_device': 'CPU'
           })
       def cpu_boost(x):
         return math_ops.add(x, 2.0)
 
       @function.defun_with_attributes(
           attributes={
-              'experimental_api_implements': 'random_boost',
-              'experimental_api_preferred_device': 'GPU'
+              'api_implements': 'random_boost',
+              'api_preferred_device': 'GPU'
           })
       def gpu_boost(x):
         return math_ops.add(x, 4.0)
@@ -1834,18 +2138,43 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       with ops.Graph().as_default():
         x = constant_op.constant(11)
         maybe_add(x, True)
-        self.assertEqual(len(maybe_add._function_cache), 1)
-        self.assertEqual(len(add._function_cache), 1)
+        self.assertLen(total_function_cache(maybe_add), 1)
+        self.assertLen(total_function_cache(add), 1)
 
         maybe_add(x, False)
-        self.assertEqual(len(maybe_add._function_cache), 2)
-        self.assertEqual(len(add._function_cache), 1)
+        self.assertLen(total_function_cache(maybe_add), 2)
+        self.assertLen(total_function_cache(add), 1)
 
       with ops.Graph().as_default():
         x = constant_op.constant(11)
         maybe_add(x, True)
-        self.assertEqual(len(maybe_add._function_cache), 3)
-        self.assertEqual(len(add._function_cache), 2)
+        self.assertLen(total_function_cache(maybe_add), 3)
+        self.assertLen(total_function_cache(add), 2)
+
+  def testCacheKeyOverlappingShapes(self):
+    @function.defun
+    def defined(t):
+      return t
+
+    defined(array_ops.zeros([12, 1]))
+    self.assertLen(total_function_cache(defined), 1)
+
+    defined(array_ops.zeros([1, 21]))
+    self.assertLen(total_function_cache(defined), 2)
+
+  def testCacheKeyNestedLists(self):
+    @function.defun
+    def defined(l):
+      return l
+
+    a = constant_op.constant(1.)
+    b = constant_op.constant(2.)
+    c = constant_op.constant(3.)
+    defined([[a], b, c])
+    self.assertLen(total_function_cache(defined), 1)
+
+    defined([[a, b], c])
+    self.assertLen(total_function_cache(defined), 2)
 
   def testDecoratedMethod(self):
     m = DefunnedMiniModel()
@@ -2067,7 +2396,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     m = DefunnedMiniModel()
     m(array_ops.ones([1, 2]))
     weak_variables = weakref.WeakSet(m.variables)
-    self.assertEqual(2, len(weak_variables))
+    self.assertLen(weak_variables, 2)
     del m
     self.assertEqual([], list(weak_variables))
 
@@ -2122,13 +2451,12 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     def fn(x):
       return fn2(x)
 
-    try:
+    with self.assertRaises(errors.InvalidArgumentError) as cm:
       fn(2)
-      self.assertFail()
-    except errors.InvalidArgumentError as e:
-      self.assertIn('fn -> fn2', e.message)
-      self.assertIn('node assert_equal/Assert/Assert (defined at', e.message)
-      self.assertNotIn('fn3', e.message)
+    e = cm.exception
+    self.assertIn('fn -> fn2', e.message)
+    self.assertIn('node assert_equal/Assert/Assert (defined at', e.message)
+    self.assertNotIn('fn3', e.message)
 
   def testFunctionIsNotPinned(self):
     """Tests that functions aren't pinned to the CPU by the eager runtime."""
@@ -2408,6 +2736,7 @@ class MultiDeviceTest(test.TestCase, parameterized.TestCase):
     result = func(g1, g2, c1, g3, c2)
     self.assertEqual(result.numpy(), 5.0 * 7.0 * 17.0)
 
+
 if __name__ == '__main__':
   ops.enable_eager_execution(
       config=config_pb2.ConfigProto(device_count={'CPU': 4}))
diff --git a/tensorflow/python/eager/lift_to_graph.py b/tensorflow/python/eager/lift_to_graph.py
index ad62e6d10ac4528d1703356811793ff4986533c8..2ed2d5882e2721c6650e1d978238100d98a59322 100644
--- a/tensorflow/python/eager/lift_to_graph.py
+++ b/tensorflow/python/eager/lift_to_graph.py
@@ -21,8 +21,10 @@ from __future__ import print_function
 
 import collections
 
+from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
 
 
 def _graph_inputs(op):
@@ -40,35 +42,226 @@ class UnliftableError(Exception):
   pass
 
 
-def lift_to_graph(init_tensor, graph, sources=None):
-  """Copies the tensor and all its inputs recursively to the outer graph."""
-  # Check that the initializer does not depend on any placeholders.
-  if sources is None:
-    sources = set([])
+def _constant_inputs(op_or_tensor):
+  return all(_as_operation(i).type == u"Const"
+             and not _as_operation(i).control_inputs
+             for i in _graph_inputs(_as_operation(op_or_tensor)))
+
+
+def _path_from(from_op, tensor, sources):
+  """Find one path from `from_op` to `tensor`, ignoring `sources`.
+
+  Args:
+    from_op: A `tf.Operation`.
+    tensor: A `tf.Operation` or `tf.Tensor`.
+    sources: A list of `tf.Tensor`.
+
+  Returns:
+    A python string containing the path, or "??" if none is found.
+  """
   visited_ops = set([x.op for x in sources])
+  ops_to_visit = [_as_operation(tensor)]
+  some_op_output = {}
+  while ops_to_visit:
+    op = ops_to_visit.pop()
+    if op in visited_ops:
+      continue
+    visited_ops.add(op)
+    if op == from_op:
+      path_op = op
+      path = [path_op]
+      final_op = _as_operation(tensor)
+      while path_op != final_op:
+        path_op = some_op_output[path_op]
+        path.append(path_op)
+      return " <- ".join(["%s (%s)" % (x.name, x.type) for x in reversed(path)])
+    else:
+      for inp in _graph_inputs(op):
+        if inp not in visited_ops and inp not in sources:
+          some_op_output[inp] = op
+          ops_to_visit.append(inp)
+  return "??"
+
+
+def _map_subgraph(init_tensor, sources, disallowed_placeholders, visited_ops,
+                  op_outputs, add_sources):
+  """Walk a Graph and capture the subgraph between init_tensor and sources.
+
+  Note: This function mutates visited_ops and op_outputs.
+
+  Arguments:
+    init_tensor:  A Tensor or Operation where the subgraph terminates.
+    sources:  A set of Tensors where subgraph extraction should stop.
+    disallowed_placeholders: An optional set of ops which may not appear in the
+      lifted graph. Defaults to all placeholders.
+    visited_ops: A set of operations which were visited in a prior pass.
+    op_outputs: A defaultdict containing the outputs of an op which are to be
+      copied into the new subgraph.
+    add_sources: A boolean indicating whether placeholders which are not in
+      sources should be allowed.
+
+  Returns:
+    The set of placeholders upon which init_tensor depends and are not in
+    sources.
+
+  Raises:
+    UnliftableError: if init_tensor depends on a placeholder which is not in
+      sources and add_sources is False.
+  """
   ops_to_visit = [_as_operation(init_tensor)]
-  op_outputs = collections.defaultdict(set)
+  extra_sources = set()
   while ops_to_visit:
     op = ops_to_visit.pop()
     if op in visited_ops:
       continue
     visited_ops.add(op)
-    # TODO(apassos) distinguish arg placeholders, capture placeholders,
-    # and placeholders the user might directly use to initialize
-    # variables.
-    if op.type == "Placeholder":
+
+    should_raise = False
+    if disallowed_placeholders is not None and op in disallowed_placeholders:
+      should_raise = True
+    elif op.type == "Placeholder":
+      if disallowed_placeholders is None and not add_sources:
+        should_raise = True
+      extra_sources.update(op.outputs)
+
+    if should_raise:
       raise UnliftableError(
-          "Unable to lift tensor", init_tensor,
-          "because it depends transitively on placeholder ", op)
+          "Unable to lift tensor %s because it depends transitively on "
+          "placeholder %s via at least one path, e.g.: %s"
+          % (repr(init_tensor), repr(op), _path_from(op, init_tensor, sources)))
     for inp in _graph_inputs(op):
       op_outputs[inp].add(op)
-      if inp not in visited_ops and inp not in sources:
+      if inp not in visited_ops and inp not in (sources or extra_sources):
         ops_to_visit.append(inp)
+
+  return extra_sources
+
+
+def _copy_non_source(op, graph, op_map):
+  """Copy an op directly to a given graph.
+
+  This function assumes that all of the inputs to an op have already been
+  copied.
+
+  Args:
+    op: The op to be copied.
+    graph: The destination graph.
+    op_map: A dict mapping ops and tensors in the old graph to the new one.
+  """
+  copied_inputs = [op_map[x] for x in op.inputs]
+  copied_control_inputs = [op_map[x] for x in op.control_inputs]
+  with ops.control_dependencies(copied_control_inputs), ops.device(op.device):
+    copied_op = graph.create_op(
+        op_type=op.type,
+        inputs=copied_inputs,
+        dtypes=[x.dtype for x in op.outputs],
+        attrs=op.node_def.attr,
+        name=op.name)
+  op_map[op] = copied_op
+  for i, o in enumerate(op.outputs):
+    op_map[o] = copied_op.outputs[i]
+
+
+def _copy_source(s, graph, op_map, handle_captures, inverse_captures):
+  """Create a source in a graph based on a Tensor from a different graph.
+
+  This function creates a placeholder analog of `s` in a graph with the
+  following behavior:
+
+  1) If s is a captured Tensor or Variable and handle_captures is set to True,
+     simply capture it in the new graph as well.
+
+  2) If s is a PlaceholderWithDefault whose default is a constant, preserve
+     said default in the new graph.
+
+  3) When applicable, copy resource variable metadata from `s` to the newly
+     created placeholder.
+
+  Args:
+    s: The source of interest.
+    graph: The destination graph.
+    op_map: A dict mapping ops and tensors in the old graph to the new one.
+    handle_captures: A boolean indicating whether to re-capture s in the new
+      graph or simply create a vanilla placeholder.
+    inverse_captures: A dict mapping s back to the Tensor or Variable that it
+      captures.
+  """
+  if handle_captures and s in inverse_captures:
+    copied_placeholder = graph.capture(inverse_captures[s], name=s.op.name)
+  elif s.op.type == "PlaceholderWithDefault" and _constant_inputs(s):
+    # Copy the default value to the graph.
+    default_value = s.op.inputs[0]
+    _copy_non_source(op=default_value.op, graph=graph, op_map=op_map)
+
+    with ops.device(s.op.device):
+      copied_placeholder = array_ops.placeholder_with_default(
+          input=op_map[default_value], shape=s.shape, name=s.op.name)
+  else:
+    with ops.device(s.op.device):
+      copied_placeholder = array_ops.placeholder(
+          dtype=s.dtype, shape=s.shape, name=s.op.name)
+
+  base_handle = resource_variable_ops.get_resource_handle_data(s)
+  if base_handle.shape_and_type:
+    resource_variable_ops._set_handle_shapes_and_types(  # pylint: disable=protected-access
+        copied_placeholder,
+        base_handle,
+        graph_mode=True)
+
+  op_map[s] = copied_placeholder
+
+
+def lift_to_graph(init_tensors, graph, sources=None,
+                  disallowed_placeholders=None, add_sources=False,
+                  handle_captures=False, base_graph=None):
+  """Copies the tensor and all its inputs recursively to the outer graph.
+
+  Args:
+    init_tensors: The Tensor to lift.
+    graph: The graph to lift to.
+    sources: Optional sequence of nodes to start from. If omitted the whole
+      subgraph which feeds into `init_tensor` is lifted.
+    disallowed_placeholders: An optional set of ops which may not appear in the
+      lifted graph. Defaults to all placeholders.
+    add_sources: A boolean indicating whether placeholders which are not in
+      sources should be allowed.
+    handle_captures: A boolean indicating whether to re-capture s in the new
+      graph or simply create a vanilla placeholder.
+    base_graph: The graph from which to lift ops. This will be inferred if not
+      specified.
+
+  Returns:
+    A mapping from ops in the current default graph to ops in `graph`.
+
+  Raises:
+    UnliftableError: If a placeholder blocks lifting.
+  """
+  variable_init_tensors = {i for i in init_tensors if isinstance(
+      i, resource_variable_ops.ResourceVariable)}
+  init_tensors = set(init_tensors).difference(variable_init_tensors)
+  base_graph = base_graph or list(init_tensors)[0].graph
+
+  # Check that the initializer does not depend on any placeholders.
+  sources = set(sources or [])
+  visited_ops = set([x.op for x in sources])
+  op_outputs = collections.defaultdict(set)
+
+  # First we extract the subgraph between init_tensors and sources.
+  for init_tensor in init_tensors:
+    sources.update(_map_subgraph(
+        init_tensor=init_tensor,
+        sources=sources,
+        disallowed_placeholders=disallowed_placeholders,
+        visited_ops=visited_ops,
+        op_outputs=op_outputs,
+        add_sources=add_sources))
+
   # Topologically sort the nodes we've extracted. Now we know how many of their
   # outputs are part of this subgraph.
   ops_to_copy = []
   marked_ops = set([])
-  ops_to_visit = [_as_operation(init_tensor)]
+  ops_to_visit = [_as_operation(t) for t in init_tensors
+                  if not op_outputs[_as_operation(t)]]
   while ops_to_visit:
     op = ops_to_visit.pop()
     if op in marked_ops:
@@ -76,27 +269,34 @@ def lift_to_graph(init_tensor, graph, sources=None):
     marked_ops.add(op)
     ops_to_copy.append(op)
     for inp in _graph_inputs(op):
-      if all(x in marked_ops for x in op_outputs[inp]) and inp not in sources:
+      if (all(x in marked_ops for x in op_outputs[inp]) and
+          inp not in sources):
         ops_to_visit.append(inp)
+
+  # When lifting from one FuncGraph to another, we will need to capture the
+  # relevant tensors as well.
+  captures = collections.OrderedDict()
+  if (isinstance(base_graph, func_graph.FuncGraph) and
+      isinstance(graph, func_graph.FuncGraph)):
+    captures = base_graph.captures
+  inverse_captures = {v: k for k, v in captures.items()}
+
   # ops_to_copy now holds a reverse topologically sorted list of ops which
   # ends in the initializer. We copy those to the outermost graph and
   # build the initialization op there.
   with graph.as_default():
-    op_map = {}
+    op_map = {i: i for i in variable_init_tensors}  # Pass through variables.
     source_ops = set()
     for s in sources:
       source_ops.add(s.op)
-      op_map[s] = array_ops.placeholder(dtype=s.dtype, shape=s.shape)
+      _copy_source(s=s, graph=graph, op_map=op_map,
+                   handle_captures=handle_captures,
+                   inverse_captures=inverse_captures)
+
     for op in reversed(ops_to_copy):
       if op in source_ops:
         continue
-      copied_inputs = [op_map[x] for x in op.inputs]
-      copied_control_inputs = [op_map[x] for x in op.control_inputs]
-      with ops.control_dependencies(copied_control_inputs):
-        copied_op = graph.create_op(
-            op.type, copied_inputs, [x.dtype for x in op.outputs],
-            attrs=op.node_def.attr)
-      op_map[op] = copied_op
-      for i, o in enumerate(op.outputs):
-        op_map[o] = copied_op.outputs[i]
+
+      _copy_non_source(op=op, graph=graph, op_map=op_map)
+
     return op_map
diff --git a/tensorflow/python/eager/profiler.py b/tensorflow/python/eager/profiler.py
index a88312264357546496c07b5824aede0178e880ae..659c0cc6e69e25701b200d73020420a328a93d72 100644
--- a/tensorflow/python/eager/profiler.py
+++ b/tensorflow/python/eager/profiler.py
@@ -25,6 +25,7 @@ from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging as logging
 
 LOGDIR_PLUGIN = 'plugins/profile'
 
@@ -45,7 +46,17 @@ def start():
   if _profiler is not None:
     raise AssertionError('Another profiler is running.')
   with _profiler_lock:
-    _profiler = pywrap_tensorflow.TFE_NewProfiler(context.context()._handle)  # pylint: disable=protected-access
+    profiler_context = pywrap_tensorflow.TFE_NewProfilerContext()
+    if context.default_execution_mode == context.EAGER_MODE:
+      pywrap_tensorflow.TFE_ProfilerContextSetEagerContext(
+          profiler_context,
+          context.context()._handle)  # pylint: disable=protected-access
+    _profiler = pywrap_tensorflow.TFE_NewProfiler(profiler_context)
+    pywrap_tensorflow.TFE_DeleteProfilerContext(profiler_context)
+    if not pywrap_tensorflow.TFE_ProfilerIsOk(_profiler):
+      logging.warning('Another profiler session is running which is probably '
+                      'created by profiler server. Please avoid using profiler '
+                      'server and profiler APIs at the same time.')
 
 
 def stop():
@@ -79,7 +90,8 @@ def start_profiler_server(port):
   """Start a profiler grpc server that listens to given port.
 
   The profiler server will keep the program running even the training finishes.
-  Please shutdown the server with CTRL-C. The service defined in
+  Please shutdown the server with CTRL-C. It can be used in both eager mode and
+  graph mode. The service defined in
   tensorflow/contrib/tpu/profiler/tpu_profiler.proto. Please use
   tensorflow/contrib/tpu/profiler/capture_tpu_profile to capture tracable
   file following https://cloud.google.com/tpu/docs/cloud-tpu-tools#capture_trace
@@ -87,9 +99,13 @@ def start_profiler_server(port):
   Args:
     port: port profiler server listens to.
   """
-  pywrap_tensorflow.TFE_StartProfilerServer(
-      context.context()._handle,  # pylint: disable=protected-access
-      port)
+  profiler_context = pywrap_tensorflow.TFE_NewProfilerContext()
+  if context.default_execution_mode == context.EAGER_MODE:
+    pywrap_tensorflow.TFE_ProfilerContextSetEagerContext(
+        profiler_context,
+        context.context()._handle)  # pylint: disable=protected-access
+  pywrap_tensorflow.TFE_StartProfilerServer(profiler_context, port)
+  pywrap_tensorflow.TFE_DeleteProfilerContext(profiler_context)
 
 
 class Profiler(object):
diff --git a/tensorflow/python/eager/profiler_client.py b/tensorflow/python/eager/profiler_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f09d8b63419f4f837f74cd59fb1b3083b7d968b
--- /dev/null
+++ b/tensorflow/python/eager/profiler_client.py
@@ -0,0 +1,52 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Profiler client APIs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import errors
+
+
+def start_tracing(service_addr,
+                  logdir,
+                  duration_ms,
+                  worker_list='',
+                  include_dataset_ops=True,
+                  num_tracing_attempts=3):
+  """Sending grpc requests to profiler server to perform on-demand profiling.
+
+  Note: This method will block caller thread until receives tracing result.
+
+  Args:
+    service_addr: Address of profiler service e.g. localhost:6009.
+    logdir: Path of TensorBoard log directory e.g. /tmp/tb_log.
+    duration_ms: Duration of tracing or monitoring in ms.
+    worker_list: The list of worker TPUs that we are about to profile in the
+      current session. (TPU only)
+    include_dataset_ops: Set to false to profile longer traces.
+    num_tracing_attempts: Automatically retry N times when no trace event is
+      collected.
+
+  Raises:
+    UnavailableError: If no trace event is collected.
+  """
+  # TODO(fishx): Uses errors.raise_exception_on_not_ok_status instead.
+  if not pywrap_tensorflow.TFE_ProfilerClientStartTracing(
+      service_addr, logdir, worker_list, include_dataset_ops, duration_ms,
+      num_tracing_attempts):
+    raise errors.UnavailableError(None, None, 'No trace event is collected.')
diff --git a/tensorflow/python/eager/profiler_test.py b/tensorflow/python/eager/profiler_test.py
index 6940f56a98190904f63bcc7ae9a7e641da69c872..ba19e17e96ef9321ed8314bc4a96fab6ca2c1e02 100644
--- a/tensorflow/python/eager/profiler_test.py
+++ b/tensorflow/python/eager/profiler_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.profiler import trace_events_pb2
+from tensorflow.core.profiler import trace_events_pb2
 from tensorflow.python.eager import profiler
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index 8d6f212499f80513eeb2a20cee8b2e0d7be21e3f..1db1b23d4c94ad911a2ffbd475134615f370af22 100755
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -231,7 +231,12 @@ PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim);
 PyObject* TFE_Py_TensorShapeOnDevice(PyObject* tensor);
 
 // Encodes the object as a tuple that is meant to be used as part of the key
-// for the defun function cache.
-PyObject* TFE_Py_EncodeArg(PyObject*);
+// for the defun function cache.  If `include_tensor_ranks_only` is true,
+// then the encoding only stores tensor ranks, and the key is
+// agnostic to dimension sizes.  Otherwise, full tensor shape encodings are
+// returned.
+PyObject* TFE_Py_EncodeArg(PyObject*, bool include_tensor_ranks_only);
+
+void TFE_Py_EnableInteractivePythonLogging();
 
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TFE_H_
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index da1bb24fc739d9919329934fee04569ad53d9386..3286e1add81d96172b249bae57beeaeea3399e28 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include <cstring>
 #include <thread>
 
-#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/python/eager/pywrap_tfe.h"
 
 #include "absl/strings/str_cat.h"
@@ -25,6 +24,7 @@ limitations under the License.
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api_internal.h"
 #include "tensorflow/c/eager/tape.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/compactptrset.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
@@ -264,7 +264,8 @@ bool ParseTypeValue(const string& key, PyObject* py_value, TF_Status* status,
 }
 
 bool SetOpAttrList(
-    TFE_Op* op, const char* key, PyObject* py_list, TF_AttrType type,
+    TFE_Context* ctx, TFE_Op* op, const char* key, PyObject* py_list,
+    TF_AttrType type,
     tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
     TF_Status* status) {
   if (!PySequence_Check(py_list)) {
@@ -369,6 +370,40 @@ bool SetOpAttrList(
     TFE_OpSetAttrShapeList(op, key, dims.get(), num_dims.get(), num_values,
                            status);
     if (TF_GetCode(status) != TF_OK) return false;
+  } else if (type == TF_ATTR_FUNC) {
+    std::unique_ptr<const TFE_Op*[]> funcs(new const TFE_Op*[num_values]);
+    for (int i = 0; i < num_values; ++i) {
+      tensorflow::Safe_PyObjectPtr py_value(PySequence_ITEM(py_list, i));
+      // Allow:
+      // (1) String function name, OR
+      // (2) A Python object with a .name attribute
+      //     (A crude test for being a
+      //     tensorflow.python.framework.function._DefinedFunction)
+      //     (which is what the various "defun" or "Defun" decorators do).
+      // And in the future also allow an object that can encapsulate
+      // the function name and its attribute values.
+      tensorflow::StringPiece func_name;
+      if (!ParseStringValue(key, py_value.get(), status, &func_name)) {
+        PyObject* name_attr = PyObject_GetAttrString(py_value.get(), "name");
+        if (name_attr == nullptr ||
+            !ParseStringValue(key, name_attr, status, &func_name)) {
+          TF_SetStatus(
+              status, TF_INVALID_ARGUMENT,
+              tensorflow::strings::StrCat(
+                  "unable to set function value attribute from a ",
+                  py_value.get()->ob_type->tp_name,
+                  " object. If you think this is an error, please file an "
+                  "issue at "
+                  "https://github.com/tensorflow/tensorflow/issues/new")
+                  .c_str());
+          return false;
+        }
+      }
+      funcs[i] = TFE_NewOp(ctx, func_name.data(), status);
+      if (TF_GetCode(status) != TF_OK) return false;
+    }
+    TFE_OpSetAttrFunctionList(op, key, funcs.get(), num_values);
+    if (TF_GetCode(status) != TF_OK) return false;
   } else {
     TF_SetStatus(status, TF_UNIMPLEMENTED,
                  tensorflow::strings::StrCat("Attr ", key,
@@ -619,7 +654,8 @@ void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs, int start_index,
     const TF_AttrType type = TFE_OpGetAttrType(op, key, &is_list, out_status);
     if (TF_GetCode(out_status) != TF_OK) return;
     if (is_list != 0) {
-      if (!SetOpAttrList(op, key, py_value, type, nullptr, out_status)) return;
+      if (!SetOpAttrList(ctx, op, key, py_value, type, nullptr, out_status))
+        return;
     } else {
       if (!SetOpAttrScalar(ctx, op, key, py_value, type, nullptr, out_status))
         return;
@@ -649,7 +685,8 @@ void SetOpAttrWithDefaults(
     }
   } else {
     if (is_list != 0) {
-      SetOpAttrList(op, attr_name, attr_value, type, attr_list_sizes, status);
+      SetOpAttrList(ctx, op, attr_name, attr_value, type, attr_list_sizes,
+                    status);
     } else {
       SetOpAttrScalar(ctx, op, attr_name, attr_value, type, attr_list_sizes,
                       status);
@@ -1011,8 +1048,18 @@ class PyVSpace : public tensorflow::eager::VSpace<PyObject, PyBackwardFunction,
   void MarkAsResult(PyObject* gradient) const final { Py_INCREF(gradient); }
 
   PyObject* Zeros(const PyTapeTensor& tensor) const final {
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     PyObject* py_shape = tensor.GetShape();
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     PyObject* py_dtype = tensor.GetDType();
+    if (PyErr_Occurred()) {
+      Py_DECREF(py_shape);
+      return nullptr;
+    }
     PyObject* arg_list = Py_BuildValue("OO", py_shape, py_dtype);
     PyObject* result = PyEval_CallObject(zeros_fn_, arg_list);
     Py_DECREF(arg_list);
@@ -1022,6 +1069,9 @@ class PyVSpace : public tensorflow::eager::VSpace<PyObject, PyBackwardFunction,
   }
 
   PyObject* Ones(const PyTapeTensor& tensor) const final {
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     PyObject* py_shape = tensor.GetShape();
     PyObject* py_dtype = tensor.GetDType();
     PyObject* arg_list = Py_BuildValue("OO", py_shape, py_dtype);
@@ -2086,6 +2136,9 @@ PyObject* RecordGradient(PyObject* op_name, PyObject* inputs, PyObject* attrs,
         PyBackwardFunction* function =
             new PyBackwardFunction([op_name, attrs, num_inputs, op_inputs,
                                     op_outputs](PyObject* output_grads) {
+              if (PyErr_Occurred()) {
+                return static_cast<PyObject*>(nullptr);
+              }
               tensorflow::Safe_PyObjectPtr callback_args(
                   Py_BuildValue("OOOOOO", op_name, attrs, num_inputs, op_inputs,
                                 op_outputs, output_grads));
@@ -2411,14 +2464,14 @@ bool RaiseIfNotPySequence(PyObject* seq, const string& attr_name) {
 
 bool RunCallbacks(
     const FastPathOpExecInfo& op_exec_info, PyObject* args,
-    const std::vector<tensorflow::Safe_PyObjectPtr>& flattened_inputs,
-    const std::vector<tensorflow::Safe_PyObjectPtr>& flattened_attrs,
+    const std::vector<tensorflow::Safe_PyObjectPtr>* const flattened_inputs,
+    const std::vector<tensorflow::Safe_PyObjectPtr>* const flattened_attrs,
     PyObject* flattened_result) {
   if (!op_exec_info.run_callbacks) return true;
 
-  tensorflow::Safe_PyObjectPtr inputs(PyTuple_New(flattened_inputs.size()));
-  for (int i = 0; i < flattened_inputs.size(); i++) {
-    PyObject* input = flattened_inputs[i].get();
+  tensorflow::Safe_PyObjectPtr inputs(PyTuple_New(flattened_inputs->size()));
+  for (int i = 0; i < flattened_inputs->size(); i++) {
+    PyObject* input = (*flattened_inputs)[i].get();
     Py_INCREF(input);
     PyTuple_SET_ITEM(inputs.get(), i, input);
   }
@@ -2426,7 +2479,7 @@ bool RunCallbacks(
   int num_non_inferred_attrs = PyTuple_GET_SIZE(args) -
                                op_exec_info.op_def->input_arg_size() -
                                kFastPathExecuteInputStartIndex;
-  int num_attrs = flattened_attrs.size() + num_non_inferred_attrs;
+  int num_attrs = flattened_attrs->size() + num_non_inferred_attrs;
   tensorflow::Safe_PyObjectPtr attrs(PyTuple_New(num_attrs));
 
   for (int i = 0; i < num_non_inferred_attrs; i++) {
@@ -2438,7 +2491,7 @@ bool RunCallbacks(
   }
   for (int i = num_non_inferred_attrs; i < num_attrs; i++) {
     PyObject* attr_or_name =
-        flattened_attrs.at(i - num_non_inferred_attrs).get();
+        flattened_attrs->at(i - num_non_inferred_attrs).get();
     Py_INCREF(attr_or_name);
     PyTuple_SET_ITEM(attrs.get(), i, attr_or_name);
   }
@@ -2676,9 +2729,10 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       for (Py_ssize_t j = 0; j < len; j++) {
         PyObject* py_input = PySequence_Fast_GET_ITEM(input, j);
         tensorflow::Safe_PyObjectPtr py_eager_tensor;
-        if (!ConvertToTensor(op_exec_info, py_input, &py_eager_tensor,
-                             []() { Py_RETURN_NONE; },
-                             [](const TF_DataType& dtype) {}, status)) {
+        if (!ConvertToTensor(
+                op_exec_info, py_input, &py_eager_tensor,
+                []() { Py_RETURN_NONE; }, [](const TF_DataType& dtype) {},
+                status)) {
           return nullptr;
         }
 
@@ -2757,8 +2811,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     PyList_SET_ITEM(flat_result.get(), i, EagerTensorFromHandle(retvals[i]));
   }
 
-  if (!RunCallbacks(op_exec_info, args, *flattened_inputs, *flattened_attrs,
-                    flat_result.get())) {
+  if (!RunCallbacks(op_exec_info, args, flattened_inputs.get(),
+                    flattened_attrs.get(), flat_result.get())) {
     return nullptr;
   }
 
@@ -2823,10 +2877,13 @@ namespace {
 const char kTensor[] = "T";
 const char kIndexedSlices[] = "I";
 const char kList[] = "L";
+const char kListEnd[] = "l";
 const char kTuple[] = "U";
+const char kTupleEnd[] = "u";
 const char kDict[] = "D";
 const char kRaw[] = "R";
 const char kShape[] = "s";
+const char kShapeDelim[] = "-";
 const char kDType[] = "d";
 const char kNone[] = "n";
 
@@ -2856,7 +2913,9 @@ struct EncodeResult {
   }
 };
 
-tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
+tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg,
+                                       bool include_tensor_ranks_only,
+                                       EncodeResult* result) {
   if (EagerTensor_CheckExact(arg)) {
     TFE_TensorHandle* t = EagerTensor_Handle(arg);
     tensorflow::TensorShape tensor_shape;
@@ -2865,10 +2924,13 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
     absl::StrAppend(&result->str, kDType, t->handle->dtype);
 
     absl::StrAppend(&result->str, kShape);
-    for (tensorflow::int64 dim_size : tensor_shape.dim_sizes()) {
-      absl::StrAppend(&result->str, dim_size);
+    if (include_tensor_ranks_only) {
+      absl::StrAppend(&result->str, tensor_shape.dim_sizes().size());
+    } else {
+      for (tensorflow::int64 dim_size : tensor_shape.dim_sizes()) {
+        absl::StrAppend(&result->str, dim_size, kShapeDelim);
+      }
     }
-
     return tensorflow::Status::OK();
   }
 
@@ -2892,6 +2954,7 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
       static_cast<tensorflow::DataType>(MakeInt(dtype_enum.get()));
 
   absl::StrAppend(&result->str, kDType, dtype);
+
   static char _shape_tuple[] = "_shape_tuple";
   tensorflow::Safe_PyObjectPtr shape_tuple(
       PyObject_CallMethod(arg, _shape_tuple, nullptr));
@@ -2912,22 +2975,30 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
       shape_tuple.get(), "shape_tuple didn't return a sequence"));
 
   int len = PySequence_Fast_GET_SIZE(shape_seq.get());
-  for (int i = 0; i < len; ++i) {
-    PyObject* item = PySequence_Fast_GET_ITEM(shape_seq.get(), i);
-    if (item == Py_None) {
-      absl::StrAppend(&result->str, kNone);
-    } else {
-      absl::StrAppend(&result->str, MakeInt(item));
+
+  if (include_tensor_ranks_only) {
+    absl::StrAppend(&result->str, len);
+  } else {
+    for (int i = 0; i < len; ++i) {
+      PyObject* item = PySequence_Fast_GET_ITEM(shape_seq.get(), i);
+      if (item == Py_None) {
+        absl::StrAppend(&result->str, kNone);
+      } else {
+        absl::StrAppend(&result->str, MakeInt(item));
+      }
     }
   }
-
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result);
+tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg,
+                                          bool include_tensor_ranks_only,
+                                          EncodeResult* result);
 
 // This function doesn't set the type of sequence before
 tensorflow::Status TFE_Py_EncodeSequence(PyObject* arg, const char* type,
+                                         const char* end_type,
+                                         bool include_tensor_ranks_only,
                                          EncodeResult* result) {
   tensorflow::Safe_PyObjectPtr arg_seq(
       PySequence_Fast(arg, "unable to create seq from list/tuple"));
@@ -2939,17 +3010,22 @@ tensorflow::Status TFE_Py_EncodeSequence(PyObject* arg, const char* type,
     if (item == Py_None) {
       absl::StrAppend(&result->str, kNone);
     } else {
-      TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(item, result));
+      TF_RETURN_IF_ERROR(
+          TFE_Py_EncodeArgHelper(item, include_tensor_ranks_only, result));
     }
   }
+  absl::StrAppend(&result->str, end_type);
 
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
+tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg,
+                                          bool include_tensor_ranks_only,
+                                          EncodeResult* result) {
   if (tensorflow::swig::IsTensor(arg)) {
     absl::StrAppend(&result->str, kTensor);
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(arg, result));
+    TF_RETURN_IF_ERROR(
+        TFE_Py_EncodeTensor(arg, include_tensor_ranks_only, result));
   } else if (tensorflow::swig::IsIndexedSlices(arg)) {
     absl::StrAppend(&result->str, kIndexedSlices);
     tensorflow::Safe_PyObjectPtr values(PyObject_GetAttrString(arg, "values"));
@@ -2958,7 +3034,8 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
       return tensorflow::errors::InvalidArgument(
           "IndexedSlices does not have a values attr");
     }
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(values.get(), result));
+    TF_RETURN_IF_ERROR(
+        TFE_Py_EncodeTensor(values.get(), include_tensor_ranks_only, result));
 
     tensorflow::Safe_PyObjectPtr indices(
         PyObject_GetAttrString(arg, "indices"));
@@ -2967,7 +3044,8 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
       return tensorflow::errors::InvalidArgument(
           "IndexedSlices does not have a indices attr");
     }
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(indices.get(), result));
+    TF_RETURN_IF_ERROR(
+        TFE_Py_EncodeTensor(indices.get(), include_tensor_ranks_only, result));
 
     tensorflow::Safe_PyObjectPtr dense_shape(
         PyObject_GetAttrString(arg, "dense_shape"));
@@ -2977,12 +3055,15 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
           "IndexedSlices does not have a dense_shape attr");
     }
     if (dense_shape.get() != Py_None) {
-      TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(dense_shape.get(), result));
+      TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(
+          dense_shape.get(), include_tensor_ranks_only, result));
     }
   } else if (PyList_Check(arg)) {
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(arg, kList, result));
+    TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(
+        arg, kList, kListEnd, include_tensor_ranks_only, result));
   } else if (PyTuple_Check(arg)) {
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(arg, kTuple, result));
+    TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(
+        arg, kTuple, kTupleEnd, include_tensor_ranks_only, result));
   } else if (PyDict_Check(arg)) {
     tensorflow::Safe_PyObjectPtr keys(PyDict_Keys(arg));
     if (PyList_Sort(keys.get()) == -1) {
@@ -2994,9 +3075,11 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
 
     for (int i = 0; i < len; i++) {
       PyObject* key = PyList_GetItem(keys.get(), i);
-      TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(key, result));
+      TF_RETURN_IF_ERROR(
+          TFE_Py_EncodeArgHelper(key, include_tensor_ranks_only, result));
       PyObject* value = PyDict_GetItem(arg, key);
-      TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(value, result));
+      TF_RETURN_IF_ERROR(
+          TFE_Py_EncodeArgHelper(value, include_tensor_ranks_only, result));
     }
   } else {
     PyObject* object = PyWeakref_NewRef(arg, nullptr);
@@ -3023,13 +3106,51 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
 // on known shapes to produce slimmer graphs, and correctness, as some
 // high-level APIs require shapes to be fully-known.
 //
+// `include_tensor_ranks_only` allows caching on arguments excluding shape info,
+// so that a slow path using relaxed shape can rely on a cache key that excludes
+// shapes.
+//
 // TODO(nareshmodi): Add support for sparse tensors.
-PyObject* TFE_Py_EncodeArg(PyObject* arg) {
+PyObject* TFE_Py_EncodeArg(PyObject* arg, bool include_tensor_ranks_only) {
   EncodeResult result;
-  const auto status = TFE_Py_EncodeArgHelper(arg, &result);
+  const auto status =
+      TFE_Py_EncodeArgHelper(arg, include_tensor_ranks_only, &result);
   if (MaybeRaiseExceptionFromStatus(status, nullptr)) {
     return nullptr;
   }
 
   return result.ToPyTuple();
 }
+
+// A method prints incoming messages directly to Python's
+// stdout using Python's C API. This is necessary in Jupyter notebooks
+// and colabs where messages to the C stdout don't go to the notebook
+// cell outputs, but calls to Python's stdout do.
+void PrintToPythonStdout(const char* msg) {
+  if (Py_IsInitialized()) {
+    PyGILState_STATE py_threadstate;
+    py_threadstate = PyGILState_Ensure();
+
+    string string_msg = msg;
+    // PySys_WriteStdout truncates strings over 1000 bytes, so
+    // we write the message in chunks small enough to not be truncated.
+    int CHUNK_SIZE = 900;
+    auto len = string_msg.length();
+    for (int i = 0; i < len; i += CHUNK_SIZE) {
+      PySys_WriteStdout("%s", string_msg.substr(i, CHUNK_SIZE).c_str());
+    }
+    PySys_WriteStdout("\n");
+
+    PyGILState_Release(py_threadstate);
+  }
+}
+
+// Register PrintToPythonStdout as a log listener, to allow
+// printing in colabs and jupyter notebooks to work.
+void TFE_Py_EnableInteractivePythonLogging() {
+  static bool enabled_interactive_logging = false;
+  if (!enabled_interactive_logging) {
+    enabled_interactive_logging = true;
+    TF_RegisterLogListener(PrintToPythonStdout);
+  }
+}
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index 0ee2ff68c209aa13aaeb32be610302c11616b9d7..0d8845bd96f9dad6d9e110e5a9fd617647d18f64 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -339,6 +339,24 @@ class TFETensorTest(test_util.TensorFlowTestCase):
   def testConvertToTensorAllowsOverflow(self):
     _ = ops.convert_to_tensor(123456789, dtype=dtypes.uint8)
 
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  @test_util.run_in_graph_and_eager_modes
+  def testConvertToTensorNumpyZeroDim(self):
+    for np_type, dtype in [(np.int32, dtypes.int32),
+                           (np.half, dtypes.half),
+                           (np.float32, dtypes.float32)]:
+      x = ops.convert_to_tensor([np.array(65, dtype=np_type),
+                                 np.array(16, dtype=np_type)])
+      self.assertEqual(x.dtype, dtype)
+      self.assertAllEqual(x, [65, 16])
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  @test_util.run_in_graph_and_eager_modes
+  def testConvertToTensorNumpyScalar(self):
+    x = ops.convert_to_tensor([np.asscalar(np.array(321, dtype=np.int)),
+                               np.asscalar(np.array(16, dtype=np.int))])
+    self.assertAllEqual(x, [321, 16])
+
   def testEagerTensorError(self):
     with self.assertRaisesRegexp(
         TypeError,
@@ -347,7 +365,6 @@ class TFETensorTest(test_util.TensorFlowTestCase):
       _ = ops.convert_to_tensor(1., dtype=dtypes.int32)
 
 
-
 class TFETensorUtilTest(test_util.TensorFlowTestCase):
 
   def testListOfThree(self):
diff --git a/tensorflow/python/eager/wrap_function.py b/tensorflow/python/eager/wrap_function.py
index f8fbda861cad59daa409aa91f2bf7d0a16d9003e..8eb02007bb0b82d5161819721d436f2994b61ded 100644
--- a/tensorflow/python/eager/wrap_function.py
+++ b/tensorflow/python/eager/wrap_function.py
@@ -19,12 +19,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import weakref
+
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
 from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
@@ -59,7 +62,7 @@ class VariableHolder(object):
       return self._fn(*args, **kwargs)
 
 
-# TODO(allenl): make this checkpointable
+# TODO(allenl): make this trackable
 class WrappedFunction(function.ConcreteFunction):
   """Wraps a tf V1 piece of code in a function."""
 
@@ -67,6 +70,54 @@ class WrappedFunction(function.ConcreteFunction):
     super(WrappedFunction, self).__init__(
         fn_graph, attrs=attrs, signature=signature)
     self._variable_holder = variable_holder
+    if ops.executing_eagerly_outside_functions():
+      # TODO(allenl): Make this work in 1.x?
+      self._lift_unlifted_variables()
+
+  def _lift_unlifted_variables(self):
+    """Finds resource variables and lifts them into the outer context.
+
+    When we import a GraphDef inside a wrap_function, no Python graph building
+    code runs. This means we get VarHandleOps which create variable resources,
+    but no corresponding Python objects. Leaving them like this works but gives
+    the user no way to interact with or modify the variables outside the graph.
+
+    This method searches for variables and lifts them out as regular variable
+    objects when possible, indicating to the FuncGraph that they are captures.
+    """
+    with self.graph.as_default():
+      collection_variables = (
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+          + ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES))
+      existing_captures = set(self.graph.internal_captures)
+      lifted_variables = {}
+      for old_variable in collection_variables:
+        if (old_variable._in_graph_mode  # pylint: disable=protected-access
+            and isinstance(old_variable,
+                           resource_variable_ops.ResourceVariable)):
+          if old_variable.handle in existing_captures:
+            continue
+          new_variable = def_function.UnliftedInitializerVariable(
+              array_ops.placeholder(
+                  name="unused_{}_initializer".format(old_variable.op.name),
+                  shape=old_variable.shape,
+                  dtype=old_variable.dtype),
+              name=old_variable.op.name,
+              trainable=old_variable.trainable)
+          self.graph.captures[new_variable.handle] = old_variable.handle
+          existing_captures.add(old_variable.handle)
+          lifted_variables[old_variable] = new_variable
+          # pylint: disable=protected-access
+          self._variable_holder._variables.append(new_variable)
+          self.graph._weak_variables.append(weakref.ref(new_variable))
+          # pylint: enable=protected-access
+      # Update the graph's collections, partly for the user and partly so this
+      # function is idempotent when it runs again in prune() calls.
+      for collection_name in [ops.GraphKeys.GLOBAL_VARIABLES,
+                              ops.GraphKeys.LOCAL_VARIABLES]:
+        mutable_collection = ops.get_collection_ref(collection_name)
+        for index, current in enumerate(mutable_collection):
+          mutable_collection[index] = lifted_variables.get(current, current)
 
   def prune(self, feeds, fetches):
     flat_feeds, flat_fetches = nest.flatten(feeds), nest.flatten(fetches)
@@ -103,15 +154,16 @@ class WrappedFunction(function.ConcreteFunction):
           sink_tensor = identity_fetches[0]
         else:
           identity_fetches = []
-          sink_tensor = control_flow_ops.no_op()
+          sink_tensor = array_ops.zeros([])
     lift_map = lift_to_graph.lift_to_graph(
-        sink_tensor, pruned_graph,
-        sources=flat_feeds + internal_captures)
+        [sink_tensor], pruned_graph, sources=flat_feeds + internal_captures)
     for original_fetch, identity_fetch in zip(
         tensor_fetches, identity_fetches):
       lift_map[original_fetch] = lift_map[identity_fetch]
     pruned_graph.outputs.extend(
         lift_map[x] for x in flat_fetches if isinstance(x, ops.Tensor))
+    if not tensor_fetches:
+      pruned_graph.outputs.append(lift_map[sink_tensor])
     for external_capture, internal_capture in self.graph.captures.items():
       pruned_graph.captures[external_capture] = lift_map[internal_capture]
     pruned_graph.inputs.extend(lift_map[x] for x in flat_feeds)
diff --git a/tensorflow/python/eager/wrap_function_test.py b/tensorflow/python/eager/wrap_function_test.py
index bb2b28861f8268597f20df223a621efb02d3e22d..6225b13849f603a3881e2e740e2dfe09e77bd9d3 100644
--- a/tensorflow/python/eager/wrap_function_test.py
+++ b/tensorflow/python/eager/wrap_function_test.py
@@ -206,7 +206,7 @@ class WrapFunctionTest(test.TestCase):
         fetches=(f_wrapped.graph.get_operation_by_name('increment'),
                  f_wrapped.graph.get_tensor_by_name('other:0')))
     first_output, second_output = increments(constant_op.constant(2))
-    self.assertEqual(['Placeholder:0', 'Placeholder_1:0'],
+    self.assertEqual(['step:0', 'increment/resource:0'],
                      [t.name for t in increments.inputs])
     self.assertIs(None, first_output)
     self.assertEqual(1, second_output.numpy())
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index 789887e53790ccfd5ad1c239009a9047c07d17b4..8caf46e3fa7f5b64a7b9d4683bc406b2fc213103 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -13,6 +13,7 @@ py_library(
     deps = [
         ":feature_column",
         ":feature_column_v2",
+        ":sequence_feature_column",
         "//tensorflow/python:util",
     ],
 )
@@ -164,3 +165,59 @@ tf_py_test(
         "no_windows",
     ],
 )
+
+py_library(
+    name = "sequence_feature_column",
+    srcs = ["sequence_feature_column.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_column_v2",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:variable_scope",
+    ],
+)
+
+tf_py_test(
+    name = "sequence_feature_column_test",
+    srcs = ["sequence_feature_column_test.py"],
+    additional_deps = [
+        ":feature_column_v2",
+        ":feature_column_v2_test",
+        ":sequence_feature_column",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+    ],
+    tags = ["no_pip"],
+)
+
+py_test(
+    name = "sequence_feature_column_integration_test",
+    srcs = ["sequence_feature_column_integration_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":feature_column_v2",
+        ":sequence_feature_column",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/keras:layers",
+    ],
+)
diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py
index 68a2712425c56ae4b3e42c6bd7ae497c0358a074..15950403566b00025d93e643e6be880dac9bbb3d 100644
--- a/tensorflow/python/feature_column/feature_column_lib.py
+++ b/tensorflow/python/feature_column/feature_column_lib.py
@@ -21,4 +21,5 @@ from __future__ import print_function
 # pylint: disable=unused-import,line-too-long,wildcard-import
 from tensorflow.python.feature_column.feature_column import *
 from tensorflow.python.feature_column.feature_column_v2 import *
+from tensorflow.python.feature_column.sequence_feature_column import *
 # pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index 4b93c2f28db631a0ffa0c10d0811485765cce5f0..3b9f527061b6d71ac930cf921eac914ec94a6747 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -141,11 +141,11 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras.engine import training
-from tensorflow.python.keras.engine.base_layer import Layer
 # TODO(b/118385027): Dependency on keras can be problematic if Keras moves out
 # of the main repo.
 from tensorflow.python.keras import utils
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -162,7 +162,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
@@ -380,7 +380,7 @@ class _BaseFeaturesLayer(Layer):
     return array_ops.concat(output_tensors, -1)
 
 
-@keras_export('keras.layers.DenseFeatures', v1=[])
+@keras_export('keras.layers.DenseFeatures')
 class DenseFeatures(_BaseFeaturesLayer):
   """A layer that produces a dense `Tensor` based on given `feature_columns`.
 
@@ -3192,7 +3192,7 @@ class EmbeddingColumn(
         sparse_tensors,
         weight_collections=weight_collections,
         trainable=trainable)
-    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+    sequence_length = _sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
@@ -3228,7 +3228,7 @@ def _raise_shared_embedding_column_error():
                    '`DenseFeatures` or `LinearModel` instead.')
 
 
-class SharedEmbeddingColumnCreator(tracking.AutoCheckpointable):
+class SharedEmbeddingColumnCreator(tracking.AutoTrackable):
 
   def __init__(self,
                dimension,
@@ -3376,7 +3376,7 @@ class SharedEmbeddingColumn(
                                                    state_manager)
     sparse_tensors = self.categorical_column.get_sparse_tensors(
         transformation_cache, state_manager)
-    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+    sequence_length = _sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
@@ -4022,13 +4022,9 @@ class WeightedCategoricalColumn(
 
   def transform_feature(self, transformation_cache, state_manager):
     """Applies weights to tensor generated from `categorical_column`'."""
-    print('WeightedCategoricalColumn.transform_feature: ', self.name)
-    print('Weight feature key: ', self.weight_feature_key)
     weight_tensor = transformation_cache.get(self.weight_feature_key,
                                              state_manager)
-    print('Weight tensor before: ', weight_tensor)
     weight_tensor = self._transform_weight_tensor(weight_tensor)
-    print('Weight tensor after: ', weight_tensor)
     return (transformation_cache.get(self.categorical_column, state_manager),
             weight_tensor)
 
@@ -4042,9 +4038,7 @@ class WeightedCategoricalColumn(
 
   def get_sparse_tensors(self, transformation_cache, state_manager):
     """See `CategoricalColumn` base class."""
-    print('WeightedCategoricalColumn.get_sparse_tensors: ', self.name)
     tensors = transformation_cache.get(self, state_manager)
-    print('tensors[1]: ', tensors[1])
     return CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
 
   @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
@@ -4432,7 +4426,7 @@ class IndicatorColumn(
     dense_tensor = transformation_cache.get(self, state_manager)
     sparse_tensors = self.categorical_column.get_sparse_tensors(
         transformation_cache, state_manager)
-    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+    sequence_length = _sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
@@ -4461,7 +4455,7 @@ class IndicatorColumn(
     # representation created by _transform_feature.
     dense_tensor = inputs.get(self)
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
-    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+    sequence_length = _sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
@@ -4515,6 +4509,31 @@ def _verify_static_batch_size_equality(tensors, columns):
                 expected_batch_size, batch_size))
 
 
+def _sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
+  """Returns a [batch_size] Tensor with per-example sequence length."""
+  with ops.name_scope(None, 'sequence_length') as name_scope:
+    row_ids = sp_tensor.indices[:, 0]
+    column_ids = sp_tensor.indices[:, 1]
+    # Add one to convert column indices to element length
+    column_ids += array_ops.ones_like(column_ids)
+    # Get the number of elements we will have per example/row
+    seq_length = math_ops.segment_max(column_ids, segment_ids=row_ids)
+
+    # The raw values are grouped according to num_elements;
+    # how many entities will we have after grouping?
+    # Example: orig tensor [[1, 2], [3]], col_ids = (0, 1, 1),
+    # row_ids = (0, 0, 1), seq_length = [2, 1]. If num_elements = 2,
+    # these will get grouped, and the final seq_length is [1, 1]
+    seq_length = math_ops.cast(
+        math_ops.ceil(seq_length / num_elements), dtypes.int64)
+
+    # If the last n rows do not have ids, seq_length will have shape
+    # [batch_size - n]. Pad the remaining values with zeros.
+    n_pad = array_ops.shape(sp_tensor)[:1] - array_ops.shape(seq_length)[:1]
+    padding = array_ops.zeros(n_pad, dtype=seq_length.dtype)
+    return array_ops.concat([seq_length, padding], axis=0, name=name_scope)
+
+
 class SequenceCategoricalColumn(
     CategoricalColumn,
     fc_old._SequenceCategoricalColumn,  # pylint: disable=protected-access
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py b/tensorflow/python/feature_column/sequence_feature_column.py
similarity index 95%
rename from tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
rename to tensorflow/python/feature_column/sequence_feature_column.py
index 2f4bda194a41242167e0abfcaeac5044f6026f85..bc58c413fef8a69111faaa1edaae873aa9f76cd9 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
+++ b/tensorflow/python/feature_column/sequence_feature_column.py
@@ -25,9 +25,7 @@ from __future__ import print_function
 import collections
 
 
-from tensorflow.python.feature_column import feature_column as fc_old
-from tensorflow.python.feature_column import feature_column_lib as fc
-from tensorflow.python.feature_column import feature_column_v2 as fc_v2
+from tensorflow.python.feature_column import feature_column_v2 as fc
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -35,11 +33,14 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.util.tf_export import keras_export
+from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=protected-access
 
 
-class SequenceFeatures(fc_v2._BaseFeaturesLayer):
+@keras_export('keras.experimental.SequenceFeatures')
+class SequenceFeatures(fc._BaseFeaturesLayer):
   """A layer for sequence input.
 
     All `feature_columns` must be sequence dense columns with the same
@@ -61,10 +62,10 @@ class SequenceFeatures(fc_v2._BaseFeaturesLayer):
     watches = sequence_categorical_column_with_identity(
         'watches', num_buckets=1000)
     watches_embedding = embedding_column(watches, dimension=10)
-    columns = [rating, watches]
+    columns = [rating, watches_embedding]
 
-    features = tf.parse_example(..., features=make_parse_example_spec(columns))
     sequence_input_layer = SequenceFeatures(columns)
+    features = tf.parse_example(..., features=make_parse_example_spec(columns))
     sequence_input, sequence_length = sequence_input_layer(features)
     sequence_length_mask = tf.sequence_mask(sequence_length)
 
@@ -99,7 +100,7 @@ class SequenceFeatures(fc_v2._BaseFeaturesLayer):
         feature_columns=feature_columns,
         trainable=trainable,
         name=name,
-        expected_column_type=fc_v2.SequenceDenseColumn,
+        expected_column_type=fc.SequenceDenseColumn,
         **kwargs)
 
   def _target_shape(self, input_shape, total_elements):
@@ -139,8 +140,8 @@ class SequenceFeatures(fc_v2._BaseFeaturesLayer):
         sequence_lengths.append(sequence_length)
 
     # Check and process sequence lengths.
-    fc_v2._verify_static_batch_size_equality(sequence_lengths,
-                                             self._feature_columns)
+    fc._verify_static_batch_size_equality(sequence_lengths,
+                                          self._feature_columns)
     sequence_length = _assert_all_equal_and_return(sequence_lengths)
 
     return self._verify_and_concat_tensors(output_tensors), sequence_length
@@ -195,6 +196,7 @@ def concatenate_context_input(context_input, sequence_input):
   return array_ops.concat([sequence_input, tiled_context_input], 2)
 
 
+@tf_export('feature_column.sequence_categorical_column_with_identity')
 def sequence_categorical_column_with_identity(
     key, num_buckets, default_value=None):
   """Returns a feature column that represents sequences of integers.
@@ -243,6 +245,7 @@ def sequence_categorical_column_with_identity(
           default_value=default_value))
 
 
+@tf_export('feature_column.sequence_categorical_column_with_hash_bucket')
 def sequence_categorical_column_with_hash_bucket(
     key, hash_bucket_size, dtype=dtypes.string):
   """A sequence of categorical terms where ids are set by hashing.
@@ -288,6 +291,7 @@ def sequence_categorical_column_with_hash_bucket(
           dtype=dtype))
 
 
+@tf_export('feature_column.sequence_categorical_column_with_vocabulary_file')
 def sequence_categorical_column_with_vocabulary_file(
     key, vocabulary_file, vocabulary_size=None, num_oov_buckets=0,
     default_value=None, dtype=dtypes.string):
@@ -352,6 +356,7 @@ def sequence_categorical_column_with_vocabulary_file(
           dtype=dtype))
 
 
+@tf_export('feature_column.sequence_categorical_column_with_vocabulary_list')
 def sequence_categorical_column_with_vocabulary_list(
     key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
   """A sequence of categorical terms where ids use an in-memory list.
@@ -413,6 +418,7 @@ def sequence_categorical_column_with_vocabulary_list(
           num_oov_buckets=num_oov_buckets))
 
 
+@tf_export('feature_column.sequence_numeric_column')
 def sequence_numeric_column(
     key,
     shape=(1,),
@@ -459,7 +465,7 @@ def sequence_numeric_column(
     ValueError: if any dimension in shape is not a positive integer.
     ValueError: if `dtype` is not convertible to `tf.float32`.
   """
-  shape = fc_v2._check_shape(shape=shape, key=key)
+  shape = fc._check_shape(shape=shape, key=key)
   if not (dtype.is_integer or dtype.is_floating):
     raise ValueError('dtype must be convertible to float. '
                      'dtype: {}, key: {}'.format(dtype, key))
@@ -558,7 +564,7 @@ class SequenceNumericColumn(
       num_elements = self.variable_shape.num_elements()
     else:
       num_elements = 1
-    seq_length = fc_old._sequence_length_from_sparse_tensor(
+    seq_length = fc._sequence_length_from_sparse_tensor(
         sp_tensor, num_elements=num_elements)
 
     return fc.SequenceDenseColumn.TensorSequenceLengthPair(
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_integration_test.py b/tensorflow/python/feature_column/sequence_feature_column_integration_test.py
similarity index 98%
rename from tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_integration_test.py
rename to tensorflow/python/feature_column/sequence_feature_column_integration_test.py
index 1b165a620ae67e855400eb297ec17db80eac7937..b7c67945c6bc05f1f0ff6be356e3cf7e844ee29b 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_integration_test.py
+++ b/tensorflow/python/feature_column/sequence_feature_column_integration_test.py
@@ -23,11 +23,11 @@ import tempfile
 
 from google.protobuf import text_format
 
-from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column_v2 as sfc
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.feature_column import feature_column_v2 as fc
+from tensorflow.python.feature_column import sequence_feature_column as sfc
 from tensorflow.python.keras.layers import recurrent
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import variables
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py b/tensorflow/python/feature_column/sequence_feature_column_test.py
similarity index 77%
rename from tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
rename to tensorflow/python/feature_column/sequence_feature_column_test.py
index a1feaddcc00d5fac86dca3138dfa1c6314bb6a8b..0c8f37b107122882e1f72c0bbb10ebe2c2885f5e 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/sequence_feature_column_test.py
@@ -22,20 +22,21 @@ import os
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column_v2 as sfc
-from tensorflow.python.feature_column import feature_column_lib as fc
-from tensorflow.python.feature_column.feature_column_v2_test import _TestStateManager
+from tensorflow.python.feature_column import feature_column_v2 as fc
+from tensorflow.python.feature_column import feature_column_v2_test as fc_test
+from tensorflow.python.feature_column import sequence_feature_column as sfc
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
-from tensorflow.python.training import monitored_session
 
 
 class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
@@ -82,6 +83,7 @@ class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
            [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]]],
        'expected_sequence_length': [2, 2]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_embedding_column(
       self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
       expected_sequence_length):
@@ -128,18 +130,19 @@ class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
     input_layer, sequence_length = sequence_input_layer({
         'aaa': sparse_input_a, 'bbb': sparse_input_b,})
 
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.evaluate(variables_lib.global_variables_initializer())
+    weights = sequence_input_layer.weights
     self.assertCountEqual(
         ('sequence_features/aaa_embedding/embedding_weights:0',
          'sequence_features/bbb_embedding/embedding_weights:0'),
-        tuple([v.name for v in global_vars]))
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(embedding_values_a, global_vars[0].eval(session=sess))
-      self.assertAllEqual(embedding_values_b, global_vars[1].eval(session=sess))
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
+        tuple([v.name for v in weights]))
+    self.assertAllEqual(embedding_values_a, self.evaluate(weights[0]))
+    self.assertAllEqual(embedding_values_b, self.evaluate(weights[1]))
+    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
+
+  @test_util.run_in_graph_and_eager_modes
   def test_embedding_column_with_non_sequence_categorical(self):
     """Tests that error is raised for non-sequence embedding column."""
     vocabulary_size = 3
@@ -162,70 +165,75 @@ class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
       sequence_input_layer = sfc.SequenceFeatures([embedding_column_a])
       _, _ = sequence_input_layer({'aaa': sparse_input})
 
+  @test_util.run_in_graph_and_eager_modes
   def test_shared_embedding_column(self):
-    vocabulary_size = 3
-    sparse_input_a = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-    sparse_input_b = sparse_tensor.SparseTensorValue(
-        # example 0, ids [1]
-        # example 1, ids [2, 0]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, 2, 0),
-        dense_shape=(2, 2))
-
-    embedding_dimension = 2
-    embedding_values = (
-        (1., 2.),  # id 0
-        (3., 4.),  # id 1
-        (5., 6.)  # id 2
-    )
-
-    def _get_initializer(embedding_dimension, embedding_values):
-
-      def _initializer(shape, dtype, partition_info):
-        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
-        self.assertEqual(dtypes.float32, dtype)
-        self.assertIsNone(partition_info)
-        return embedding_values
-
-      return _initializer
-
-    expected_input_layer = [
-        # example 0, ids_a [2], ids_b [1]
-        [[5., 6., 3., 4.], [0., 0., 0., 0.]],
-        # example 1, ids_a [0, 1], ids_b [2, 0]
-        [[1., 2., 5., 6.], [3., 4., 1., 2.]],
-    ]
-    expected_sequence_length = [1, 2]
-
-    categorical_column_a = sfc.sequence_categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = sfc.sequence_categorical_column_with_identity(
-        key='bbb', num_buckets=vocabulary_size)
-    # Test that columns are reordered alphabetically.
-    shared_embedding_columns = fc.shared_embedding_columns_v2(
-        [categorical_column_b, categorical_column_a],
-        dimension=embedding_dimension,
-        initializer=_get_initializer(embedding_dimension, embedding_values))
-
-    sequence_input_layer = sfc.SequenceFeatures(shared_embedding_columns)
-    input_layer, sequence_length = sequence_input_layer({
-        'aaa': sparse_input_a, 'bbb': sparse_input_b})
+    with ops.Graph().as_default():
+      vocabulary_size = 3
+      sparse_input_a = sparse_tensor.SparseTensorValue(
+          # example 0, ids [2]
+          # example 1, ids [0, 1]
+          indices=((0, 0), (1, 0), (1, 1)),
+          values=(2, 0, 1),
+          dense_shape=(2, 2))
+      sparse_input_b = sparse_tensor.SparseTensorValue(
+          # example 0, ids [1]
+          # example 1, ids [2, 0]
+          indices=((0, 0), (1, 0), (1, 1)),
+          values=(1, 2, 0),
+          dense_shape=(2, 2))
+
+      embedding_dimension = 2
+      embedding_values = (
+          (1., 2.),  # id 0
+          (3., 4.),  # id 1
+          (5., 6.)  # id 2
+      )
 
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertCountEqual(
-        ('aaa_bbb_shared_embedding:0',),
-        tuple([v.name for v in global_vars]))
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+      def _get_initializer(embedding_dimension, embedding_values):
+
+        def _initializer(shape, dtype, partition_info):
+          self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+          self.assertEqual(dtypes.float32, dtype)
+          self.assertIsNone(partition_info)
+          return embedding_values
+
+        return _initializer
+
+      expected_input_layer = [
+          # example 0, ids_a [2], ids_b [1]
+          [[5., 6., 3., 4.], [0., 0., 0., 0.]],
+          # example 1, ids_a [0, 1], ids_b [2, 0]
+          [[1., 2., 5., 6.], [3., 4., 1., 2.]],
+      ]
+      expected_sequence_length = [1, 2]
+
+      categorical_column_a = sfc.sequence_categorical_column_with_identity(
+          key='aaa', num_buckets=vocabulary_size)
+      categorical_column_b = sfc.sequence_categorical_column_with_identity(
+          key='bbb', num_buckets=vocabulary_size)
+      # Test that columns are reordered alphabetically.
+      shared_embedding_columns = fc.shared_embedding_columns_v2(
+          [categorical_column_b, categorical_column_a],
+          dimension=embedding_dimension,
+          initializer=_get_initializer(embedding_dimension, embedding_values))
 
+      sequence_input_layer = sfc.SequenceFeatures(shared_embedding_columns)
+      input_layer, sequence_length = sequence_input_layer({
+          'aaa': sparse_input_a, 'bbb': sparse_input_b})
+
+      global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+      self.assertCountEqual(
+          ('aaa_bbb_shared_embedding:0',),
+          tuple([v.name for v in global_vars]))
+      with fc_test._initialized_session() as sess:
+        self.assertAllEqual(embedding_values,
+                            global_vars[0].eval(session=sess))
+        self.assertAllEqual(expected_input_layer,
+                            input_layer.eval(session=sess))
+        self.assertAllEqual(
+            expected_sequence_length, sequence_length.eval(session=sess))
+
+  @test_util.run_deprecated_v1
   def test_shared_embedding_column_with_non_sequence_categorical(self):
     """Tests that error is raised for non-sequence shared embedding column."""
     vocabulary_size = 3
@@ -299,6 +307,7 @@ class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
            [[2., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]]],
        'expected_sequence_length': [2, 2]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_indicator_column(
       self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
       expected_sequence_length):
@@ -320,11 +329,11 @@ class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
     input_layer, sequence_length = sequence_input_layer({
         'aaa': sparse_input_a, 'bbb': sparse_input_b})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
+  @test_util.run_in_graph_and_eager_modes
   def test_indicator_column_with_non_sequence_categorical(self):
     """Tests that error is raised for non-sequence categorical column."""
     vocabulary_size = 3
@@ -370,6 +379,7 @@ class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
            [[3.], [0.], [8.], [0.]]],
        'expected_sequence_length': [2, 2]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_numeric_column(
       self, sparse_input_args, expected_input_layer, expected_sequence_length):
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
@@ -379,10 +389,9 @@ class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
     sequence_input_layer = sfc.SequenceFeatures([numeric_column])
     input_layer, sequence_length = sequence_input_layer({'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -413,6 +422,7 @@ class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
            [[10., 11., 12., 13.], [0., 0., 0., 0.]]],
        'expected_sequence_length': [2, 1]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_numeric_column_multi_dim(
       self, sparse_input_args, expected_input_layer, expected_sequence_length):
     """Tests SequenceFeatures for multi-dimensional numeric_column."""
@@ -423,11 +433,11 @@ class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
     sequence_input_layer = sfc.SequenceFeatures([numeric_column])
     input_layer, sequence_length = sequence_input_layer({'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
+  @test_util.run_in_graph_and_eager_modes
   def test_sequence_length_not_equal(self):
     """Tests that an error is raised when sequence lengths are not equal."""
     # Input a with sequence_length = [2, 1]
@@ -445,16 +455,12 @@ class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
 
     sequence_input_layer = sfc.SequenceFeatures(
         [numeric_column_a, numeric_column_b])
-    _, sequence_length = sequence_input_layer({
-        'aaa': sparse_input_a, 'bbb': sparse_input_b})
 
-    with monitored_session.MonitoredSession() as sess:
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[Condition x == y did not hold element-wise:\] '
-          r'\[x \(sequence_features/aaa/sequence_length:0\) = \] \[2 1\] '
-          r'\[y \(sequence_features/bbb/sequence_length:0\) = \] \[1 1\]'):
-        sess.run(sequence_length)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError, r'Condition x == y did not hold.*'):
+      _, sequence_length = sequence_input_layer({
+          'aaa': sparse_input_a, 'bbb': sparse_input_b})
+      self.evaluate(sequence_length)
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -471,12 +477,13 @@ class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
            # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]]
            # example 1, [[10., 11., 12., 13.], []]
            'indices': ((0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3),
-                       (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 2),
+                       (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 3),
                        (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3)),
            'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
            'dense_shape': (2, 2, 4)},
        'expected_shape': [2, 2, 4]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_static_shape_from_tensors_numeric(
       self, sparse_input_args, expected_shape):
     """Tests that we return a known static shape when we have one."""
@@ -511,6 +518,7 @@ class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
            'dense_shape': (4, 2, 2)},
        'expected_shape': [4, 2, 3]}
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_static_shape_from_tensors_indicator(
       self, sparse_input_args, expected_shape):
     """Tests that we return a known static shape when we have one."""
@@ -524,43 +532,44 @@ class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
     shape = input_layer.get_shape()
     self.assertEqual(shape, expected_shape)
 
+  @test_util.run_in_graph_and_eager_modes
   def test_compute_output_shape(self):
     price1 = sfc.sequence_numeric_column('price1', shape=2)
     price2 = sfc.sequence_numeric_column('price2')
-    with ops.Graph().as_default():
-      features = {
-          'price1': sparse_tensor.SparseTensor(
-              indices=[[0, 0, 0], [0, 0, 1],
-                       [0, 1, 0], [0, 1, 1],
-                       [1, 0, 0], [1, 0, 1],
-                       [2, 0, 0], [2, 0, 1],
-                       [3, 0, 0], [3, 0, 1]],
-              values=[0., 1., 10., 11., 100., 101., 200., 201., 300., 301.],
-              dense_shape=(4, 3, 2)),
-          'price2': sparse_tensor.SparseTensor(
-              indices=[[0, 0],
-                       [0, 1],
-                       [1, 0],
-                       [2, 0],
-                       [3, 0]],
-              values=[10., 11., 20., 30., 40.],
-              dense_shape=(4, 3))}
-      sequence_features = sfc.SequenceFeatures([price1, price2])
-      seq_input, seq_len = sequence_features(features)
-      self.assertEqual(
-          sequence_features.compute_output_shape((None, None)),
-          (None, None, 3))
-      self.evaluate(variables_lib.global_variables_initializer())
-      self.evaluate(lookup_ops.tables_initializer())
-
-      self.assertAllClose([[[0., 1., 10.], [10., 11., 11.], [0., 0., 0.]],
-                           [[100., 101., 20.], [0., 0., 0.], [0., 0., 0.]],
-                           [[200., 201., 30.], [0., 0., 0.], [0., 0., 0.]],
-                           [[300., 301., 40.], [0., 0., 0.], [0., 0., 0.]]],
-                          self.evaluate(seq_input))
-      self.assertAllClose([2, 1, 1, 1], self.evaluate(seq_len))
-
-
+    features = {
+        'price1': sparse_tensor.SparseTensor(
+            indices=[[0, 0, 0], [0, 0, 1],
+                     [0, 1, 0], [0, 1, 1],
+                     [1, 0, 0], [1, 0, 1],
+                     [2, 0, 0], [2, 0, 1],
+                     [3, 0, 0], [3, 0, 1]],
+            values=[0., 1., 10., 11., 100., 101., 200., 201., 300., 301.],
+            dense_shape=(4, 3, 2)),
+        'price2': sparse_tensor.SparseTensor(
+            indices=[[0, 0],
+                     [0, 1],
+                     [1, 0],
+                     [2, 0],
+                     [3, 0]],
+            values=[10., 11., 20., 30., 40.],
+            dense_shape=(4, 3))}
+    sequence_features = sfc.SequenceFeatures([price1, price2])
+    seq_input, seq_len = sequence_features(features)
+    self.assertEqual(
+        sequence_features.compute_output_shape((None, None)),
+        (None, None, 3))
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllClose([[[0., 1., 10.], [10., 11., 11.], [0., 0., 0.]],
+                         [[100., 101., 20.], [0., 0., 0.], [0., 0., 0.]],
+                         [[200., 201., 30.], [0., 0., 0.], [0., 0., 0.]],
+                         [[300., 301., 40.], [0., 0., 0.], [0., 0., 0.]]],
+                        self.evaluate(seq_input))
+    self.assertAllClose([2, 1, 1, 1], self.evaluate(seq_len))
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
   """Tests the utility fn concatenate_context_input."""
 
@@ -575,9 +584,8 @@ class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
         [[0, 1, 0, 1, 2, 3, 4], [2, 3, 0, 1, 2, 3, 4], [4, 5, 0, 1, 2, 3, 4]],
         [[6, 7, 5, 6, 7, 8, 9], [8, 9, 5, 6, 7, 8, 9], [10, 11, 5, 6, 7, 8, 9]]
     ], dtype=np.float32)
-    with monitored_session.MonitoredSession() as sess:
-      output = sess.run(input_layer)
-      self.assertAllEqual(expected, output)
+    output = self.evaluate(input_layer)
+    self.assertAllEqual(expected, output)
 
   @parameterized.named_parameters(
       {'testcase_name': 'rank_lt_3',
@@ -624,6 +632,7 @@ class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
       sfc.concatenate_context_input(context_input, seq_input)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class DenseFeaturesTest(test.TestCase):
   """Tests DenseFeatures with sequence feature columns."""
 
@@ -693,10 +702,11 @@ def _get_sequence_dense_tensor(column, features):
 
 
 def _get_sequence_dense_tensor_state(column, features):
-  state_manager = _TestStateManager()
+  state_manager = fc._StateManagerImpl(Layer(), trainable=True)
   column.create_state(state_manager)
-  return column.get_sequence_dense_tensor(
+  dense_tensor, lengths = column.get_sequence_dense_tensor(
       fc.FeatureTransformationCache(features), state_manager)
+  return dense_tensor, lengths, state_manager
 
 
 def _get_sparse_tensors(column, features):
@@ -704,6 +714,7 @@ def _get_sparse_tensors(column, features):
       fc.FeatureTransformationCache(features), None)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceCategoricalColumnWithIdentityTest(
     test.TestCase, parameterized.TestCase):
 
@@ -735,11 +746,11 @@ class SequenceCategoricalColumnWithIdentityTest(
     id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      _assert_sparse_tensor_value(
-          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+    _assert_sparse_tensor_value(
+        self, expected, self.evaluate(id_weight_pair.id_tensor))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceCategoricalColumnWithHashBucketTest(
     test.TestCase, parameterized.TestCase):
 
@@ -774,11 +785,11 @@ class SequenceCategoricalColumnWithHashBucketTest(
     id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      _assert_sparse_tensor_indices_shape(
-          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+    _assert_sparse_tensor_indices_shape(
+        self, expected, self.evaluate(id_weight_pair.id_tensor))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceCategoricalColumnWithVocabularyFileTest(
     test.TestCase, parameterized.TestCase):
 
@@ -827,37 +838,40 @@ class SequenceCategoricalColumnWithVocabularyFileTest(
     id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      _assert_sparse_tensor_value(
-          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+    _assert_sparse_tensor_value(
+        self, expected, self.evaluate(id_weight_pair.id_tensor))
 
   def test_get_sparse_tensors_dynamic_zero_length(self):
     """Tests _get_sparse_tensors with a dynamic sequence length."""
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=np.zeros((0, 2)), values=[], dense_shape=(2, 0))
-    expected = sparse_tensor.SparseTensorValue(
-        indices=np.zeros((0, 3)),
-        values=np.array((), dtype=np.int64),
-        dense_shape=(2, 0, 1))
-    column = sfc.sequence_categorical_column_with_vocabulary_file(
-        key='aaa',
-        vocabulary_file=self._wire_vocabulary_file_name,
-        vocabulary_size=self._wire_vocabulary_size)
-    input_placeholder_shape = list(inputs.dense_shape)
-    # Make second dimension (sequence length) dynamic.
-    input_placeholder_shape[1] = None
-    input_placeholder = array_ops.sparse_placeholder(
-        dtypes.string, shape=input_placeholder_shape)
-    id_weight_pair = _get_sparse_tensors(column, {'aaa': input_placeholder})
-
-    self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      result = id_weight_pair.id_tensor.eval(
-          session=sess, feed_dict={input_placeholder: inputs})
-      _assert_sparse_tensor_value(
-          self, expected, result)
-
-
+    with ops.Graph().as_default():
+      inputs = sparse_tensor.SparseTensorValue(
+          indices=np.zeros((0, 2)), values=[], dense_shape=(2, 0))
+      expected = sparse_tensor.SparseTensorValue(
+          indices=np.zeros((0, 3)),
+          values=np.array((), dtype=np.int64),
+          dense_shape=(2, 0, 1))
+      column = sfc.sequence_categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
+          vocabulary_size=self._wire_vocabulary_size)
+      input_placeholder_shape = list(inputs.dense_shape)
+      # Make second dimension (sequence length) dynamic.
+      input_placeholder_shape[1] = None
+      input_placeholder = array_ops.sparse_placeholder(
+          dtypes.string, shape=input_placeholder_shape)
+      id_weight_pair = _get_sparse_tensors(column, {'aaa': input_placeholder})
+
+      self.assertIsNone(id_weight_pair.weight_tensor)
+      with fc_test._initialized_session() as sess:
+        result = id_weight_pair.id_tensor.eval(
+            session=sess, feed_dict={input_placeholder: inputs})
+        _assert_sparse_tensor_value(
+            self, expected, result)
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceCategoricalColumnWithVocabularyListTest(
     test.TestCase, parameterized.TestCase):
 
@@ -891,11 +905,13 @@ class SequenceCategoricalColumnWithVocabularyListTest(
     id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      _assert_sparse_tensor_value(
-          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+    _assert_sparse_tensor_value(
+        self, expected, self.evaluate(id_weight_pair.id_tensor))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceEmbeddingColumnTest(
     test.TestCase, parameterized.TestCase):
 
@@ -959,15 +975,15 @@ class SequenceEmbeddingColumnTest(
         categorical_column, dimension=embedding_dimension,
         initializer=_initializer)
 
-    embedding_lookup, _ = _get_sequence_dense_tensor_state(
+    embedding_lookup, _, state_manager = _get_sequence_dense_tensor_state(
         embedding_column, {'aaa': inputs})
 
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    variables = state_manager._layer.weights
+    self.evaluate(variables_lib.global_variables_initializer())
     self.assertCountEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
-      self.assertAllEqual(expected, embedding_lookup.eval(session=sess))
+        ('embedding_weights:0',), tuple([v.name for v in variables]))
+    self.assertAllEqual(embedding_values, self.evaluate(variables[0]))
+    self.assertAllEqual(expected, self.evaluate(embedding_lookup))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -996,13 +1012,12 @@ class SequenceEmbeddingColumnTest(
     embedding_column = fc.embedding_column(
         categorical_column, dimension=2)
 
-    _, sequence_length = _get_sequence_dense_tensor_state(
+    _, sequence_length, _ = _get_sequence_dense_tensor_state(
         embedding_column, {'aaa': inputs})
 
-    with monitored_session.MonitoredSession() as sess:
-      sequence_length = sess.run(sequence_length)
-      self.assertAllEqual(expected_sequence_length, sequence_length)
-      self.assertEqual(np.int64, sequence_length.dtype)
+    sequence_length = self.evaluate(sequence_length)
+    self.assertAllEqual(expected_sequence_length, sequence_length)
+    self.assertEqual(np.int64, sequence_length.dtype)
 
   def test_sequence_length_with_empty_rows(self):
     """Tests _sequence_length when some examples do not have ids."""
@@ -1024,16 +1039,16 @@ class SequenceEmbeddingColumnTest(
     embedding_column = fc.embedding_column(
         categorical_column, dimension=2)
 
-    _, sequence_length = _get_sequence_dense_tensor_state(
+    _, sequence_length, _ = _get_sequence_dense_tensor_state(
         embedding_column, {'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
 
 class SequenceSharedEmbeddingColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_get_sequence_dense_tensor(self):
     vocabulary_size = 3
     embedding_dimension = 2
@@ -1102,100 +1117,102 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
     embedding_lookup_b = _get_sequence_dense_tensor(
         shared_embedding_columns[1], {'bbb': sparse_input_b})[0]
 
+    self.evaluate(variables_lib.global_variables_initializer())
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertItemsEqual(('aaa_bbb_shared_embedding:0',),
                           tuple([v.name for v in global_vars]))
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
-      self.assertAllEqual(
-          expected_lookups_a, embedding_lookup_a.eval(session=sess))
-      self.assertAllEqual(
-          expected_lookups_b, embedding_lookup_b.eval(session=sess))
+    self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+    self.assertAllEqual(
+        expected_lookups_a, self.evaluate(embedding_lookup_a))
+    self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b))
 
   def test_sequence_length(self):
-    vocabulary_size = 3
-
-    sparse_input_a = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-    expected_sequence_length_a = [1, 2]
-    categorical_column_a = sfc.sequence_categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-
-    sparse_input_b = sparse_tensor.SparseTensorValue(
-        # example 0, ids [0, 2]
-        # example 1, ids [1]
-        indices=((0, 0), (0, 1), (1, 0)),
-        values=(0, 2, 1),
-        dense_shape=(2, 2))
-    expected_sequence_length_b = [2, 1]
-    categorical_column_b = sfc.sequence_categorical_column_with_identity(
-        key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns_v2(
-        [categorical_column_a, categorical_column_b], dimension=2)
-
-    sequence_length_a = _get_sequence_dense_tensor(
-        shared_embedding_columns[0], {'aaa': sparse_input_a})[1]
-    sequence_length_b = _get_sequence_dense_tensor(
-        shared_embedding_columns[1], {'bbb': sparse_input_b})[1]
-
-    with monitored_session.MonitoredSession() as sess:
-      sequence_length_a = sess.run(sequence_length_a)
-      self.assertAllEqual(expected_sequence_length_a, sequence_length_a)
-      self.assertEqual(np.int64, sequence_length_a.dtype)
-      sequence_length_b = sess.run(sequence_length_b)
-      self.assertAllEqual(expected_sequence_length_b, sequence_length_b)
-      self.assertEqual(np.int64, sequence_length_b.dtype)
+    with ops.Graph().as_default():
+      vocabulary_size = 3
+
+      sparse_input_a = sparse_tensor.SparseTensorValue(
+          # example 0, ids [2]
+          # example 1, ids [0, 1]
+          indices=((0, 0), (1, 0), (1, 1)),
+          values=(2, 0, 1),
+          dense_shape=(2, 2))
+      expected_sequence_length_a = [1, 2]
+      categorical_column_a = sfc.sequence_categorical_column_with_identity(
+          key='aaa', num_buckets=vocabulary_size)
+
+      sparse_input_b = sparse_tensor.SparseTensorValue(
+          # example 0, ids [0, 2]
+          # example 1, ids [1]
+          indices=((0, 0), (0, 1), (1, 0)),
+          values=(0, 2, 1),
+          dense_shape=(2, 2))
+      expected_sequence_length_b = [2, 1]
+      categorical_column_b = sfc.sequence_categorical_column_with_identity(
+          key='bbb', num_buckets=vocabulary_size)
+      shared_embedding_columns = fc.shared_embedding_columns_v2(
+          [categorical_column_a, categorical_column_b], dimension=2)
+
+      sequence_length_a = _get_sequence_dense_tensor(
+          shared_embedding_columns[0], {'aaa': sparse_input_a})[1]
+      sequence_length_b = _get_sequence_dense_tensor(
+          shared_embedding_columns[1], {'bbb': sparse_input_b})[1]
+
+      with fc_test._initialized_session() as sess:
+        sequence_length_a = sess.run(sequence_length_a)
+        self.assertAllEqual(expected_sequence_length_a, sequence_length_a)
+        self.assertEqual(np.int64, sequence_length_a.dtype)
+        sequence_length_b = sess.run(sequence_length_b)
+        self.assertAllEqual(expected_sequence_length_b, sequence_length_b)
+        self.assertEqual(np.int64, sequence_length_b.dtype)
 
   def test_sequence_length_with_empty_rows(self):
     """Tests _sequence_length when some examples do not have ids."""
-    vocabulary_size = 3
-    sparse_input_a = sparse_tensor.SparseTensorValue(
-        # example 0, ids []
-        # example 1, ids [2]
-        # example 2, ids [0, 1]
-        # example 3, ids []
-        # example 4, ids [1]
-        # example 5, ids []
-        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(6, 2))
-    expected_sequence_length_a = [0, 1, 2, 0, 1, 0]
-    categorical_column_a = sfc.sequence_categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-
-    sparse_input_b = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids []
-        # example 2, ids []
-        # example 3, ids []
-        # example 4, ids [1]
-        # example 5, ids [0, 1]
-        indices=((0, 0), (4, 0), (5, 0), (5, 1)),
-        values=(2, 1, 0, 1),
-        dense_shape=(6, 2))
-    expected_sequence_length_b = [1, 0, 0, 0, 1, 2]
-    categorical_column_b = sfc.sequence_categorical_column_with_identity(
-        key='bbb', num_buckets=vocabulary_size)
-
-    shared_embedding_columns = fc.shared_embedding_columns_v2(
-        [categorical_column_a, categorical_column_b], dimension=2)
-
-    sequence_length_a = _get_sequence_dense_tensor(
-        shared_embedding_columns[0], {'aaa': sparse_input_a})[1]
-    sequence_length_b = _get_sequence_dense_tensor(
-        shared_embedding_columns[1], {'bbb': sparse_input_b})[1]
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length_a, sequence_length_a.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length_b, sequence_length_b.eval(session=sess))
-
-
+    with ops.Graph().as_default():
+      vocabulary_size = 3
+      sparse_input_a = sparse_tensor.SparseTensorValue(
+          # example 0, ids []
+          # example 1, ids [2]
+          # example 2, ids [0, 1]
+          # example 3, ids []
+          # example 4, ids [1]
+          # example 5, ids []
+          indices=((1, 0), (2, 0), (2, 1), (4, 0)),
+          values=(2, 0, 1, 1),
+          dense_shape=(6, 2))
+      expected_sequence_length_a = [0, 1, 2, 0, 1, 0]
+      categorical_column_a = sfc.sequence_categorical_column_with_identity(
+          key='aaa', num_buckets=vocabulary_size)
+
+      sparse_input_b = sparse_tensor.SparseTensorValue(
+          # example 0, ids [2]
+          # example 1, ids []
+          # example 2, ids []
+          # example 3, ids []
+          # example 4, ids [1]
+          # example 5, ids [0, 1]
+          indices=((0, 0), (4, 0), (5, 0), (5, 1)),
+          values=(2, 1, 0, 1),
+          dense_shape=(6, 2))
+      expected_sequence_length_b = [1, 0, 0, 0, 1, 2]
+      categorical_column_b = sfc.sequence_categorical_column_with_identity(
+          key='bbb', num_buckets=vocabulary_size)
+
+      shared_embedding_columns = fc.shared_embedding_columns_v2(
+          [categorical_column_a, categorical_column_b], dimension=2)
+
+      sequence_length_a = _get_sequence_dense_tensor(
+          shared_embedding_columns[0], {'aaa': sparse_input_a})[1]
+      sequence_length_b = _get_sequence_dense_tensor(
+          shared_embedding_columns[1], {'bbb': sparse_input_b})[1]
+
+      with fc_test._initialized_session() as sess:
+        self.assertAllEqual(
+            expected_sequence_length_a, sequence_length_a.eval(session=sess))
+        self.assertAllEqual(
+            expected_sequence_length_b, sequence_length_b.eval(session=sess))
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
   @parameterized.named_parameters(
@@ -1248,8 +1265,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
     indicator_tensor, _ = _get_sequence_dense_tensor(
         indicator_column, {'aaa': inputs})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected, indicator_tensor.eval(session=sess))
+    self.assertAllEqual(expected, self.evaluate(indicator_tensor))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -1280,10 +1296,9 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
     _, sequence_length = _get_sequence_dense_tensor(
         indicator_column, {'aaa': inputs})
 
-    with monitored_session.MonitoredSession() as sess:
-      sequence_length = sess.run(sequence_length)
-      self.assertAllEqual(expected_sequence_length, sequence_length)
-      self.assertEqual(np.int64, sequence_length.dtype)
+    sequence_length = self.evaluate(sequence_length)
+    self.assertAllEqual(expected_sequence_length, sequence_length)
+    self.assertEqual(np.int64, sequence_length.dtype)
 
   def test_sequence_length_with_empty_rows(self):
     """Tests _sequence_length when some examples do not have ids."""
@@ -1307,11 +1322,11 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
     _, sequence_length = _get_sequence_dense_tensor(
         indicator_column, {'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
 
   def test_defaults(self):
@@ -1372,8 +1387,7 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
 
     dense_tensor, _ = _get_sequence_dense_tensor(
         numeric_column, {'aaa': inputs})
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected, dense_tensor.eval(session=sess))
+    self.assertAllEqual(expected, self.evaluate(dense_tensor))
 
   def test_get_sequence_dense_tensor_with_normalizer_fn(self):
 
@@ -1406,9 +1420,8 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
     dense_tensor, _ = _get_sequence_dense_tensor(
         numeric_column, {'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_dense_tensor, dense_tensor.eval(session=sess))
+    self.assertAllEqual(
+        expected_dense_tensor, self.evaluate(dense_tensor))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -1444,9 +1457,8 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
     dense_tensor, _ = _get_sequence_dense_tensor(
         numeric_column, {'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_dense_tensor, dense_tensor.eval(session=sess))
+    self.assertAllEqual(
+        expected_dense_tensor, self.evaluate(dense_tensor))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -1493,10 +1505,9 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
     _, sequence_length = _get_sequence_dense_tensor(
         numeric_column, {'aaa': inputs})
 
-    with monitored_session.MonitoredSession() as sess:
-      sequence_length = sess.run(sequence_length)
-      self.assertAllEqual(expected_sequence_length, sequence_length)
-      self.assertEqual(np.int64, sequence_length.dtype)
+    sequence_length = self.evaluate(sequence_length)
+    self.assertAllEqual(expected_sequence_length, sequence_length)
+    self.assertEqual(np.int64, sequence_length.dtype)
 
   def test_sequence_length_with_empty_rows(self):
     """Tests _sequence_length when some examples do not have ids."""
@@ -1516,9 +1527,8 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
     _, sequence_length = _get_sequence_dense_tensor(
         numeric_column, {'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py
index 6210010cfff53fdc9eb685178d64be24abb448ea..a8ba4ea50d144854c9b38bca427ae9f820994fdd 100644
--- a/tensorflow/python/framework/auto_control_deps.py
+++ b/tensorflow/python/framework/auto_control_deps.py
@@ -32,12 +32,65 @@ from tensorflow.python.util import tf_decorator
 # Op types that should not run in program order, e.g. because they need to run
 # asynchronously to avoid deadlock.
 ASYNC_STATEFUL_OPS = [
+    "CollectiveGather",
     "CollectiveReduce",
     "CollectiveBcastSend",
     "CollectiveBcastRecv",
     "NcclAllReduce",
 ]
 
+LEGACY_RANDOM_OPS = [
+    # These may be used in variable initializers -- thus their execution should
+    # not be dependent on other stateful operations.  This is because although
+    # according to program order, tf.Variables may be created in sequence,
+    # their initialization happens outside of the program order (specifically,
+    # in graph mode their initialization happens by calling a grouped
+    # initializer operation or in eager mode, where initialization is lifted
+    # out of the tf.function and executed the first time the function is
+    # executed).
+    #
+    # Unless there is a specific dependency between the initializers
+    # themselves (e.g. one initializer depends on a Variable whose value depends
+    # on another initializer), the initialization can happen in any order so
+    # long as it's before the associated Variable read operations.
+    #
+    # Note that in general the randomness of legacy random operations is only
+    # guaranteed by providing a graph-level and op-level seed (and ordering of
+    # the same op across multiple iterations of a while_loop is specifically not
+    # guaranteed; see the discussion below).
+    #
+    # There is a possible race condition inside while_loop where the same
+    # random OpKernel instantiation is reused across multiple steps
+    # of the loop.  Since legacy Random OpKernels have an internal rng state,
+    # automatic dependency tracking across loop steps would likely
+    # fix this race; and for that case this blacklist is problematic.
+    # However, since automatic dependency tracking inside while loops is not
+    # currently supported, and there are no other examples of OpKernel reuse
+    # (each OpKernel is associated with a unique op in graph mode),
+    # this blacklist has no effect on the aforementioned behavior.
+    #
+    # TODO(ebrevdo,skyewm): Modify the check against this blacklist to
+    # only occur when the op is inside a "variable initialization scope"; and
+    # add proper autodeps inside while_loops that respects this updated check.
+    "RandomUniform",
+    "RandomUniformInt",
+    "RandomStandardNormal",
+    "ParameterizedTruncatedNormal",
+    "TruncatedNormal",
+    "RandomShuffle",
+    "Multinomial",
+    "RandomGamma",
+    "RandomGammaGrad",
+    "RandomPoisson",
+    "RandomPoissonV2",
+]
+
+_ALL_BLACKLISTED_OPS = set(ASYNC_STATEFUL_OPS) | set(LEGACY_RANDOM_OPS)
+
+
+def op_is_stateful(op_def):
+  return op_def.is_stateful and op_def.name not in _ALL_BLACKLISTED_OPS
+
 
 class AutomaticControlDependencies(object):
   """Context manager to automatically add control dependencies.
@@ -45,7 +98,7 @@ class AutomaticControlDependencies(object):
   Code under this context manager will act as if a sensible set of control
   dependencies were present. More specifically:
     1. All stateful ops in the scope will execute (with the exception of ops in
-       ASYNC_STATEFUL_OPS)
+       ASYNC_STATEFUL_OPS and LEGACY_RANDOM_OPS)
     2. Stateful ops which modify the same resource will execute in program order
 
   Note: creating variables in an automatic control dependencies context is not
@@ -57,6 +110,7 @@ class AutomaticControlDependencies(object):
 
   def __init__(self):
     self._returned_tensors = set()
+    self.ops_which_must_run = set()
 
   def mark_as_return(self, tensor):
     """Acts like identity but marks the `Tensor` as a return value.
@@ -233,8 +287,7 @@ class AutomaticControlDependencies(object):
       control_inputs = set()
       # Ensure stateful ops run
       if (op.type not in self._graph._registered_ops  # pylint: disable=protected-access
-          or (self._graph._registered_ops[op.type].is_stateful   # pylint: disable=protected-access
-              and op.type not in ASYNC_STATEFUL_OPS)):
+          or op_is_stateful(self._graph._registered_ops[op.type])):  # pylint: disable=protected-access
         ops_which_must_run.add(op)
       # Ignore switches (they're handled separately)
       if op.type == "Switch" and op.inputs[0].dtype == dtypes_module.resource:
@@ -271,8 +324,8 @@ class AutomaticControlDependencies(object):
         if inp in merge_for_resource:
           merge_for_resource[inp]._add_control_input(op)  # pylint: disable=protected-access
         last_op_using_resource_tensor[inp] = op
-      if (op.op_def.is_stateful and op.type not in ASYNC_STATEFUL_OPS
-          and not found_resource and op._control_flow_context is None):  # pylint: disable=protected-access
+      if (op_is_stateful(op.op_def) and not found_resource
+          and op._control_flow_context is None):  # pylint: disable=protected-access
         if None in last_op_using_resource_tensor:
           op._add_control_input(last_op_using_resource_tensor[None])  # pylint: disable=protected-access
         last_op_using_resource_tensor[None] = op
@@ -281,10 +334,11 @@ class AutomaticControlDependencies(object):
       op._add_control_inputs(control_inputs)  # pylint: disable=protected-access
 
     # Ensure all ops which must run do run
+    self.ops_which_must_run.update(ops_which_must_run)
     for r in self._returned_tensors:
-      if ops_which_must_run:
+      if self.ops_which_must_run:
         r.op._add_control_inputs(  # pylint: disable=protected-access
-            [o for o in ops_which_must_run
+            [o for o in self.ops_which_must_run
              if o._control_flow_context is r.op._control_flow_context])  # pylint: disable=protected-access
 
 
diff --git a/tensorflow/python/framework/auto_control_deps_test.py b/tensorflow/python/framework/auto_control_deps_test.py
index 2c25ab133b914ddd5209bfcda3a8a80d7c44f505..d9df96f6d70c36ddd9b942f66929b1033e4542f6 100644
--- a/tensorflow/python/framework/auto_control_deps_test.py
+++ b/tensorflow/python/framework/auto_control_deps_test.py
@@ -25,7 +25,9 @@ from tensorflow.python.framework import auto_control_deps as acd
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras.layers import core as keras_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -296,6 +298,30 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
     self.assertEqual(self.evaluate(outer()), 2.0)
 
+  def testVariableInitializersCanBeLifted(self):
+    # The initializer is a stateful op, but using it inside a function should
+    # *not* create additional dependencies.  That's what we're testing.
+    layer = keras_core.Dense(1, kernel_initializer="glorot_uniform")
+
+    @def_function.function
+    def fn(x):
+      # Stateful operation
+      control_flow_ops.Assert(x, ["Error"])
+      # Variable initialization should be lifted.  Prior to the change that
+      # added this test, the lifting would crash because of an auto control dep
+      # added on `x`.  Note, the error did not happen if we
+      # manually created a tf.Variable outside of function and used it
+      # here.  Alternatively, creating a tf.Variable inside fn() causes
+      # a different sort of error that is out of scope for this test.
+      return layer(ops.convert_to_tensor([[1.0, 1.0]]))
+
+    true = ops.convert_to_tensor(True)
+
+    concrete = fn.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.bool))
+    self.evaluate(concrete(true))
+    self.evaluate(fn(True))
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/framework/convert_to_constants.py b/tensorflow/python/framework/convert_to_constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..f96e5991741e56e33cf99963873f12a51e3aad23
--- /dev/null
+++ b/tensorflow/python/framework/convert_to_constants.py
@@ -0,0 +1,142 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helpers to convert variables to constants in TensorFlow 2.0."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.saver import export_meta_graph
+
+
+def _run_inline_graph_optimization(func):
+  """Apply function inline optimization to the graph.
+
+  Returns the GraphDef after Grappler's function inlining optimization is
+  applied. This optimization does not work on models with control flow.
+
+  Args:
+    func: ConcreteFunction.
+
+  Returns:
+    GraphDef
+  """
+  meta_graph = export_meta_graph(
+      graph_def=func.graph.as_graph_def(), graph=func.graph)
+
+  # Add a collection 'train_op' so that Grappler knows the outputs.
+  fetch_collection = meta_graph_pb2.CollectionDef()
+  for array in func.inputs + func.outputs:
+    fetch_collection.node_list.value.append(array.name)
+  meta_graph.collection_def["train_op"].CopyFrom(fetch_collection)
+
+  # Initialize RewriterConfig with everything disabled except function inlining.
+  config = config_pb2.ConfigProto()
+  rewrite_options = config.graph_options.rewrite_options
+  rewrite_options.optimizers.append("function")
+  return tf_optimizer.OptimizeGraph(config, meta_graph)
+
+
+def convert_variables_to_constants_v2(func):
+  """Replaces all the variables in a graph with constants of the same values.
+
+  TensorFlow 2.0 function for converting all Variable ops into Const ops holding
+  the same values. This makes it possible to describe the network fully with a
+  single GraphDef file, and allows the removal of a lot of ops related to
+  loading and saving the variables. This function runs Grappler's function
+  inlining optimization in order to return a single subgraph.
+
+  The current implementation only works for graphs that do not contain any
+  control flow or embedding related ops.
+
+  Args:
+    func: ConcreteFunction.
+
+  Returns:
+    GraphDef containing a simplified version of the original.
+  """
+  # TODO(nupurgarg): Replace ResourceGather with Gather.
+  # TODO(nupurgarg): Change attr for Variables in control flow and functions.
+  graph_def = _run_inline_graph_optimization(func)
+
+  # Identify the ReadVariableOps.
+  get_name = lambda name: name.split(":")[0]
+  map_name_to_node = {get_name(node.name): node for node in graph_def.node}
+
+  variables_generator = func.graph.variables
+  resource_identities = {}
+  resource_placeholders = {}
+  for node in graph_def.node:
+    if node.op == "ReadVariableOp":
+      # Get name of Placeholder op associated with ReadVariableOp. There can be
+      # an Identity in between the ReadVariableOp and Placeholder. Store the
+      # Identity ops with the associated dtypes.
+      input_name = get_name(node.input[0])
+      while map_name_to_node[input_name].op == "Identity":
+        resource_identities[input_name] = node.attr["dtype"]
+        input_name = get_name(map_name_to_node[input_name].input[0])
+      if map_name_to_node[input_name].op != "Placeholder":
+        raise ValueError("Cannot find the Placeholder op that is an input "
+                         "to the ReadVariableOp.")
+      # Build a map of Placeholder ops that are inputs to ReadVariableOps to the
+      # variable's dtype and data.
+      # TODO(nupurgarg): Confirm relationship between variables in
+      # `func.graph.variables` and ReadVariableOps in `graph_def.nodes`.
+      resource_placeholders[input_name] = {
+          "dtype": node.attr["dtype"],
+          "data": next(variables_generator).numpy(),
+      }
+
+  # Reconstruct the graph with constants in place of variables.
+  output_graph_def = graph_pb2.GraphDef()
+  how_many_converted = 0
+
+  for input_node in graph_def.node:
+    output_node = output_graph_def.node.add()
+    # Convert Placeholder ops that are inputs to ReadVariableOps into Const ops.
+    if input_node.name in resource_placeholders:
+      dtype = resource_placeholders[input_node.name]["dtype"]
+      data = resource_placeholders[input_node.name]["data"]
+
+      output_node.op = "Const"
+      output_node.name = input_node.name
+      output_node.attr["dtype"].CopyFrom(dtype)
+      output_node.attr["value"].tensor.CopyFrom(
+          tensor_util.make_tensor_proto(
+              data, dtype=dtype.type, shape=data.shape))
+      how_many_converted += 1
+    # Change the dtype for Identity ops that are inputs to ReadVariableOps.
+    elif input_node.name in resource_identities:
+      output_node.CopyFrom(input_node)
+      output_node.attr["T"].CopyFrom(resource_identities[input_node.name])
+    # Convert ReadVariableOps into Identity ops.
+    elif input_node.op == "ReadVariableOp":
+      output_node.op = "Identity"
+      output_node.name = input_node.name
+      output_node.input.extend([input_node.input[0]])
+      output_node.attr["T"].CopyFrom(input_node.attr["dtype"])
+      if "_class" in input_node.attr:
+        output_node.attr["_class"].CopyFrom(input_node.attr["_class"])
+    else:
+      output_node.CopyFrom(input_node)
+
+  logging.info("Converted %d variables to const ops.", how_many_converted)
+  return output_graph_def
diff --git a/tensorflow/python/framework/convert_to_constants_test.py b/tensorflow/python/framework/convert_to_constants_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd28dbaf463e4547448683662cf45fef3c0b5837
--- /dev/null
+++ b/tensorflow/python/framework/convert_to_constants_test.py
@@ -0,0 +1,191 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for convert_to_constants.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.client import session
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import convert_to_constants
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model.load import load
+from tensorflow.python.saved_model.save import save
+from tensorflow.python.training.tracking import tracking
+
+
+class VariablesToConstantsTest(test.TestCase):
+
+  def _hasStatefulPartitionedCallOp(self, graph_def):
+    """Determines if a StatefulPartitionedCall op exists in the graph."""
+    for node in graph_def.node:
+      if node.op == "StatefulPartitionedCall":
+        return True
+    return False
+
+  def _getNumVariables(self, graph_def):
+    """Returns the number of ReadVariableOp in the graph."""
+    return sum(node.op == "ReadVariableOp" for node in graph_def.node)
+
+  def _getTensors(self, sess, tensor_list):
+    """Returns a list of Tensor objects from the Session."""
+    return [
+        sess.graph.get_tensor_by_name(tensor.name) for tensor in tensor_list
+    ]
+
+  def _evaluateGraphDef(self, graph_def, func, input_data):
+    """Evaluates the GraphDef using Sessions."""
+    with ops.Graph().as_default() as graph:
+      importer.import_graph_def(graph_def, name="")
+      func.add_to_graph(graph)
+      sess = session.Session(graph=graph)
+
+    input_tensors = self._getTensors(sess, func.inputs)
+    output_tensors = self._getTensors(sess, func.outputs)
+    return sess.run(
+        output_tensors, feed_dict=dict(zip(input_tensors, input_data)))
+
+  @test_util.run_v2_only
+  def testConstSavedModel(self):
+    """Test a basic model with functions to make sure functions are inlined."""
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(lambda x: 2. * x)
+    to_save = root.f.get_concrete_function(input_data)
+
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save(root, save_dir, to_save)
+    saved_model = load(save_dir)
+    concrete_func = saved_model.signatures["serving_default"]
+
+    variable_graph_def = concrete_func.graph.as_graph_def()
+    self.assertEqual(0, self._getNumVariables(variable_graph_def))
+    self.assertTrue(variable_graph_def.library.function)
+
+    constant_graph_def = convert_to_constants.convert_variables_to_constants_v2(
+        concrete_func)
+    self.assertEqual(0, self._getNumVariables(constant_graph_def))
+    self.assertFalse(constant_graph_def.library.function)
+
+    # Check value.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateGraphDef(constant_graph_def, concrete_func,
+                                          [input_data.numpy()])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testVariableModel(self):
+    """Test a basic model with Variables."""
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+    concrete_func = root.f.get_concrete_function(input_data)
+
+    variable_graph_def = concrete_func.graph.as_graph_def()
+    self.assertEqual(2, self._getNumVariables(variable_graph_def))
+
+    constant_graph_def = convert_to_constants.convert_variables_to_constants_v2(
+        concrete_func)
+    self.assertEqual(0, self._getNumVariables(constant_graph_def))
+    self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def))
+
+    # Check value.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateGraphDef(constant_graph_def, concrete_func,
+                                          [input_data.numpy()])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testVariableSavedModel(self):
+    """Test a basic model with Variables with saving/loading the SavedModel."""
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+    to_save = root.f.get_concrete_function(input_data)
+
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save(root, save_dir, to_save)
+    saved_model = load(save_dir)
+    concrete_func = saved_model.signatures["serving_default"]
+
+    variable_graph_def = concrete_func.graph.as_graph_def()
+    self.assertTrue(self._hasStatefulPartitionedCallOp(variable_graph_def))
+
+    constant_graph_def = convert_to_constants.convert_variables_to_constants_v2(
+        concrete_func)
+    self.assertEqual(0, self._getNumVariables(constant_graph_def))
+    self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def))
+
+    # Check value.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateGraphDef(constant_graph_def, concrete_func,
+                                          [input_data.numpy()])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testMultiFunctionModel(self):
+    """Test a basic model with Variables."""
+
+    class BasicModel(tracking.AutoTrackable):
+
+      def __init__(self):
+        self.y = None
+        self.z = None
+
+      @def_function.function
+      def add(self, x):
+        if self.y is None:
+          self.y = variables.Variable(2.)
+        return x + self.y
+
+      @def_function.function
+      def sub(self, x):
+        if self.z is None:
+          self.z = variables.Variable(3.)
+        return x - self.z
+
+    input_data = constant_op.constant(1., shape=[1])
+    root = BasicModel()
+    concrete_func = root.add.get_concrete_function(input_data)
+
+    variable_graph_def = concrete_func.graph.as_graph_def()
+    self.assertEqual(1, self._getNumVariables(variable_graph_def))
+
+    constant_graph_def = convert_to_constants.convert_variables_to_constants_v2(
+        concrete_func)
+    self.assertEqual(0, self._getNumVariables(constant_graph_def))
+    self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def))
+
+    # Check value.
+    expected_value = root.add(input_data)
+    actual_value = self._evaluateGraphDef(constant_graph_def, concrete_func,
+                                          [input_data.numpy()])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/framework/error_interpolation.py b/tensorflow/python/framework/error_interpolation.py
index 2d1b210d2aef3483240dfda4a686e3acc30d1057..b671dfbfaa12ed47b2ca5de0a923280af95de2ef 100644
--- a/tensorflow/python/framework/error_interpolation.py
+++ b/tensorflow/python/framework/error_interpolation.py
@@ -41,6 +41,8 @@ _ParseTag = collections.namedtuple("_ParseTag", ["type", "name"])
 _BAD_FILE_SUBSTRINGS = [
     os.path.join("tensorflow", "python"),
     os.path.join("tensorflow", "contrib"),
+    os.path.join("tensorflow_estimator", "python"),
+    os.path.join("tensorflow_estimator", "contrib"),
     "<embedded",
 ]
 
@@ -222,16 +224,25 @@ def compute_useful_stack(op):
 
   Returns:
     A list of line name and lineno pairs. Below is an example of returned list:
-    [("tool_utils.py", "124"), ("tool_utils.py", "21"), ....]
+    [("tool_utils.py", "124", "func1", "a={}"), ("tool_utils.py", "21", "func2",
+    "for i in range(10):"), ....]
   """
   defining_frame_index = _find_index_of_defining_frame_for_op(op)
   stack_trace = []
-  # the stack trace is collected from the defining (included) to the outermost.
-  for index in reversed(range(defining_frame_index + 1)):
+  # The stack trace is collected from the defining (included) to the outermost.
+  # Include `frame_num` frames at most.
+  # Two lines from the TensorFlow library are included to show the node
+  # definition.
+  frame_num = 10
+  innermost_excluded = min(defining_frame_index + 2 + 1, len(op.traceback))
+  outermost_included = max(innermost_excluded - frame_num, 0)
+  for index in reversed(range(outermost_included, innermost_excluded)):
     frame = op.traceback[index]
     filename = frame[tf_stack.TB_FILENAME]
     lineno = frame[tf_stack.TB_LINENO]
-    stack_trace.append((filename, lineno))
+    func = frame[tf_stack.TB_FUNCNAME]
+    code = frame[tf_stack.TB_CODEDICT]
+    stack_trace.append((filename, lineno, func, code))
   return stack_trace
 
 
@@ -312,21 +323,18 @@ def traceback_files_common_prefix(all_ops):
   return os.path.split(os.path.commonprefix(list(files)))[0]
 
 
-def _sources_for_node(name, graph):
-  """Gets the top-level root input nodes for 'name' node.
-
-  We recursively traverse the graph from 'name' node to its inputs and collect
-  all the nodes which don't have any inputs.
+def _sources_for_node(node, graph):
+  """Gets the input op nodes for 'node'.
 
   Args:
-    name: The name of the node.
+    node: The node.
     graph: The graph containing the node.
 
   Returns:
-    The unique top-level root input nodes.
+    The unique input nodes.
   """
-  def _helper(name, graph, seen_names, inputs):
-    """Recursive helper. 'seen_names' and 'inputs' are mutated."""
+  inputs = set()
+  for name in node.node_def.input:
     if name.startswith("^"):
       name = name[1:]
     try:
@@ -336,20 +344,9 @@ def _sources_for_node(name, graph):
       try:
         op = graph.get_operation_by_name(name)
       except KeyError:
-        return
-    name = op.name
-    if name in seen_names:
-      return
-    seen_names.add(name)
-    if not op.node_def.input:
-      inputs.add(op)
-      return
-    for n in op.node_def.input:
-      _helper(n, graph, seen_names, inputs)
-
-  names = set()
-  inputs = set()
-  _helper(name, graph, names, inputs)
+        continue
+    inputs.add(op)
+
   return list(inputs)
 
 
@@ -413,7 +410,7 @@ def interpolate(error_message, graph):
     if op is None:
       tagged_ops.append(None)
     else:
-      tagged_ops.append([op] + _sources_for_node(op.name, graph))
+      tagged_ops.append([op] + _sources_for_node(op, graph))
 
   common_prefix = traceback_files_common_prefix(tagged_ops)
   for tag, ops in zip(tags, tagged_ops):
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index 922b9e2bd308a03b0b6b28aa741c4e6f54c1b347..c473dfeedf8d232d5b5211fe5982ab4f8ea41fee 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -511,7 +511,11 @@ def exception_type_from_error_code(error_code):
 
 @tf_export("errors.error_code_from_exception_type")
 def error_code_from_exception_type(cls):
-  return _EXCEPTION_CLASS_TO_CODE[cls]
+  try:
+    return _EXCEPTION_CLASS_TO_CODE[cls]
+  except KeyError:
+    warnings.warn("Unknown class exception")
+    return UnknownError(None, None, "Unknown class exception", None)
 
 
 def _make_specific_exception(node_def, op, message, error_code):
diff --git a/tensorflow/python/framework/errors_test.py b/tensorflow/python/framework/errors_test.py
index 574b126caeef87c5e05f4f08a9432b22d2f8040d..c044202d92ad549d48fd0f4d9ace79b4e9a8ef97 100644
--- a/tensorflow/python/framework/errors_test.py
+++ b/tensorflow/python/framework/errors_test.py
@@ -70,6 +70,10 @@ class ErrorsTest(test.TestCase):
           isinstance(
               errors_impl._make_specific_exception(None, None, None,
                                                    error_code), exc_type))
+      # error_code_from_exception_type and exception_type_from_error_code should
+      # be consistent with operation result.
+      self.assertEqual(error_code,
+                       errors_impl.error_code_from_exception_type(exc_type))
       # pylint: enable=protected-access
 
   def testKnownErrorClassForEachErrorCodeInProto(self):
@@ -98,6 +102,14 @@ class ErrorsTest(test.TestCase):
     self.assertTrue("Unknown error code: 37" in str(w[0].message))
     self.assertTrue(isinstance(exc, errors_impl.OpError))
 
+    with warnings.catch_warnings(record=True) as w:
+      # pylint: disable=protected-access
+      exc = errors_impl.error_code_from_exception_type("Unknown")
+      # pylint: enable=protected-access
+    self.assertEqual(1, len(w))
+    self.assertTrue("Unknown class exception" in str(w[0].message))
+    self.assertTrue(isinstance(exc, errors_impl.OpError))
+
   def testStatusDoesNotLeak(self):
     try:
       with errors.raise_exception_on_not_ok_status() as status:
diff --git a/tensorflow/python/framework/func_graph.py b/tensorflow/python/framework/func_graph.py
index 3f81466cd40b5aaec8bdc7909681b6bdb3427b80..9097a8dd1f05de0dc271df18721557f979c44c29 100644
--- a/tensorflow/python/framework/func_graph.py
+++ b/tensorflow/python/framework/func_graph.py
@@ -19,10 +19,12 @@ from __future__ import division
 from __future__ import print_function
 
 import collections as py_collections
+import itertools
 import weakref
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
+from tensorflow.python.eager import execute
 from tensorflow.python.eager import tape
 from tensorflow.python.eager.graph_only_ops import graph_placeholder
 from tensorflow.python.framework import dtypes
@@ -135,9 +137,11 @@ class FuncGraph(ops.Graph):
     captures: Maps external tensor -> internal tensor (i.e. input placeholder).
       The entries are in the order they were captured.
     seed: The graph-level random seed.
+    capture_by_value: If True, the func graph will capture Variables by value
+      instead of reference.
   """
 
-  def __init__(self, name, collections=None):
+  def __init__(self, name, collections=None, capture_by_value=None):
     """Construct a new FuncGraph.
 
     The graph will inherit its graph key, collections, seed, and distribution
@@ -152,17 +156,29 @@ class FuncGraph(ops.Graph):
         The current whitelisted collections are the global variables, the
         local variables, and the trainable variables.
         Defaults to None.
+      capture_by_value: An optional boolean. If True, the func graph will
+        capture Variables by value instead of reference. By default inherit
+        from outer graphs, and failing that will default to False.
     """
     super(FuncGraph, self).__init__()
 
     self.name = name
     self.inputs = []
     self.outputs = []
+    self.control_outputs = []
     self.structured_input_signature = None
     self.structured_outputs = None
     self._weak_variables = []
     self.outer_graph = ops.get_default_graph()
     self.captures = py_collections.OrderedDict()
+    # Inherit capture-by-value from outer graph.
+    if capture_by_value is not None:
+      self.capture_by_value = capture_by_value
+    elif self.outer_graph is not None and isinstance(
+        self.outer_graph, FuncGraph):
+      self.capture_by_value = self.outer_graph.capture_by_value
+    else:
+      self.capture_by_value = False
 
     self._building_function = True
     # Map from resource tensor name to last op (in program order) which uses
@@ -195,6 +211,9 @@ class FuncGraph(ops.Graph):
     else:
       self._collections = collections
 
+  def __str__(self):
+    return "FuncGraph(name=%s, id=%s)" % (self.name, id(self))
+
   def as_default(self):
     outer_cm = super(FuncGraph, self).as_default()
 
@@ -278,11 +297,39 @@ class FuncGraph(ops.Graph):
   def variables(self, var_list):
     self._weak_variables = [weakref.ref(v) for v in var_list]
 
+  def _capture_by_value(
+      self,
+      op_type,
+      inputs,
+      dtypes,  # pylint: disable=redefined-outer-name
+      input_types=None,
+      name=None,
+      attrs=None,
+      op_def=None,
+      compute_shapes=True,
+      compute_device=True):
+    # When capturing by value, do the read outside
+    reverse_captures = dict((v, k) for k, v in self.captures.items())
+    uncaptured_inputs = [reverse_captures.get(t, t) for t in inputs]
+    with ops.init_scope():
+      if context.executing_eagerly():
+        attr_list = ("dtype", int(attrs["dtype"].type))
+        value, = execute.execute(
+            compat.as_bytes(op_type), 1, uncaptured_inputs, attr_list,
+            context.context())
+      else:
+        op = ops.get_default_graph().create_op(
+            op_type, uncaptured_inputs, dtypes, input_types, name, attrs,
+            op_def, compute_shapes, compute_device)
+        value = op.outputs[0]
+    captured_value = self.capture(value)
+    return captured_value.op
+
   def create_op(
       self,
       op_type,
       inputs,
-      dtypes,
+      dtypes,  # pylint: disable=redefined-outer-name
       input_types=None,
       name=None,
       attrs=None,
@@ -321,6 +368,12 @@ class FuncGraph(ops.Graph):
     Returns:
       An `Operation` object.
     """
+    if self.capture_by_value and op_type in ["ReadVariableOp",
+                                             "ResourceGather"]:
+      return self._capture_by_value(
+          op_type, inputs, dtypes, input_types, name, attrs, op_def,
+          compute_shapes, compute_device)
+
     # This capturing logic interacts poorly with control flow contexts which
     # want to replace inputs of ops far too late in the process. This can lead
     # the context to get confused and try to create an Enter for an Enter. We
@@ -366,6 +419,19 @@ class FuncGraph(ops.Graph):
     if tensor.graph is not self:
       if name is None:
         name = tensor.op.name
+      inner_graph = tensor.graph
+      while inner_graph is not None and isinstance(inner_graph, FuncGraph):
+        if inner_graph is self:
+          raise ValueError(
+              "Trying to capture a tensor from an inner function. This can be "
+              "caused by accessing a tensor defined inside a loop or "
+              "conditional body, or a subfunction, from a calling function, "
+              "without going through the proper return value mechanism. "
+              "Consider using TensorFlow mechanisms such as TensorArrays "
+              "to return tensors from inner functions or loop / conditional "
+              "bodies. Tensor: %s; tensor graph: %s; this graph: %s"
+              % (tensor, tensor.graph, self))
+        inner_graph = inner_graph.outer_graph
       return self._capture_helper(tensor, name)
     return tensor
 
@@ -402,7 +468,9 @@ def func_graph_from_py_func(name,
                             add_control_dependencies=True,
                             arg_names=None,
                             op_return_value=None,
-                            collections=None):
+                            collections=None,
+                            capture_by_value=None,
+                            override_flat_arg_shapes=None):
   """Returns a `FuncGraph` generated from `python_func`.
 
   Args:
@@ -438,6 +506,15 @@ def func_graph_from_py_func(name,
       The current whitelisted collections are the global variables, the
       local variables, and the trainable variables.
       Defaults to None.
+    capture_by_value: An optional boolean. If True, the func graph will capture
+      Variables by value instead of reference. By default inherit from outer
+      graphs, and failing that will default to False.
+    override_flat_arg_shapes: An optional list of instances that are either
+      `None` or `TensorShape`.  The length must match that of
+      `nest.flatten((args, kwargs))`.  The entries containing value `None`
+      must match entries in flattened arguments containing non-tensors, while
+      entries containing a `TensorShape` must match entries in the flattened
+      arguments containing tensors.
 
   Returns:
     A FuncGraph.
@@ -445,28 +522,45 @@ def func_graph_from_py_func(name,
   Raises:
     TypeError: If any of `python_func`'s return values is neither `None` nor a
       `Tensor`.
+    ValueError: If both `signature` and `override_flat_arg_shapes` are
+      passed in.
   """
   if op_return_value is not None:
     assert isinstance(op_return_value, ops.Tensor), op_return_value
   if func_graph is None:
-    func_graph = FuncGraph(name, collections=collections)
+    func_graph = FuncGraph(name, collections=collections,
+                           capture_by_value=capture_by_value)
   assert isinstance(func_graph, FuncGraph)
   if add_control_dependencies:
-    control_manager = AutomaticControlDependencies
+    control_manager = AutomaticControlDependencies()
   else:
-    control_manager = ops.NullContextmanager
-  with func_graph.as_default(), control_manager() as a:
+    control_manager = ops.NullContextmanager()
+  with func_graph.as_default(), control_manager as a:
     current_scope = variable_scope.get_variable_scope()
     default_use_recource = current_scope.use_resource
     current_scope.set_use_resource(True)
 
+    if signature is not None and override_flat_arg_shapes is not None:
+      raise ValueError(
+          "Passed both signature and override_flat_arg_shapes: %s and %s."
+          % (signature, override_flat_arg_shapes))
+
     if signature is not None:
       args = signature
       kwargs = {}
 
     # Creates and names placeholders for all arguments.
-    func_args = _get_defun_inputs_from_args(args, arg_names)
-    func_kwargs = _get_defun_inputs_from_kwargs(kwargs)
+    if override_flat_arg_shapes is not None:
+      flat_args = nest.flatten(args)
+      arg_shapes = override_flat_arg_shapes[:len(flat_args)]
+      kwarg_shapes = override_flat_arg_shapes[len(flat_args):]
+    else:
+      arg_shapes = None
+      kwarg_shapes = None
+    func_args = _get_defun_inputs_from_args(
+        args, arg_names, flat_shapes=arg_shapes)
+    func_kwargs = _get_defun_inputs_from_kwargs(
+        kwargs, flat_shapes=kwarg_shapes)
 
     # Convert all Tensors into TensorSpecs before saving the structured inputs.
     # If storing pure concrete functions that are not called through polymorphic
@@ -476,12 +570,19 @@ def func_graph_from_py_func(name,
         convert_structure_to_signature(func_args, arg_names),
         convert_structure_to_signature(func_kwargs))
 
+    flat_func_args = nest.flatten(func_args)
+    flat_func_kwargs = nest.flatten(func_kwargs)
+    # Temporarily set inputs to allow graph building code to inspect
+    # them. Reassigned below.
+    func_graph.inputs = [arg for arg in flat_func_args + flat_func_kwargs
+                         if isinstance(arg, ops.Tensor)]
+
     # Note: `nest.flatten` sorts by keys, as does `_deterministic_dict_values`.
     # Variables to help check whether mutation happens in calling the function
     # Copy the recursive list, tuple and map structure, but not base objects
-    func_args_before = nest.pack_sequence_as(func_args, nest.flatten(func_args))
+    func_args_before = nest.pack_sequence_as(func_args, flat_func_args)
     func_kwargs_before = nest.pack_sequence_as(
-        func_kwargs, nest.flatten(func_kwargs))
+        func_kwargs, flat_func_kwargs)
 
     def convert(x):
       """Converts a function output to a Tensor."""
@@ -527,7 +628,7 @@ def func_graph_from_py_func(name,
                   strip_decorators=(def_function.function,),
                   optional_features=autograph_options,
                   force_conversion=True,
-              ), *args, **kwargs)
+              ), args, kwargs)
 
         # Wrapping around a decorator allows checks like tf_inspect.getargspec
         # to be accurate.
@@ -575,7 +676,10 @@ def func_graph_from_py_func(name,
 
     func_graph.variables = variables
 
-  # Register any other functions defined in the graph.
+  if add_control_dependencies:
+    func_graph.control_outputs.extend(control_manager.ops_which_must_run)
+
+# Register any other functions defined in the graph.
   with ops.init_scope():
     if context.executing_eagerly():
       for f in func_graph._functions.values():  # pylint: disable=protected-access
@@ -681,37 +785,73 @@ def _create_substitute_placeholder(value, name=None, dtype=None):
   return placeholder
 
 
-def _get_defun_inputs_from_args(args, names):
+def _get_defun_inputs_from_args(args, names, flat_shapes=None):
   """Maps Python function positional args to graph-construction inputs."""
-  return _get_defun_inputs(args, names, structure=args)
+  return _get_defun_inputs(
+      args, names, structure=args, flat_shapes=flat_shapes)
 
 
-def _get_defun_inputs(flat_args, names, structure):
+def _get_defun_inputs(args, names, structure, flat_shapes=None):
   """Maps python function args to graph-construction inputs.
 
   Args:
-    flat_args: A flat list of user-specified arguments.
+    args: A flat list of user-specified arguments.
     names: A list of strings with user-specified argument names, same length as
-      `flat_args`. May be `None`, in which case a generic name is used.
+      `args`. May be `None`, in which case a generic name is used.
     structure: The original argument list or dictionary.
+    flat_shapes: A flat list of values that are either `None` or
+      instances of `TensorShape`.  If provided, then length must match
+      that of `nest.flatten(args)`; and locations where `args` are
+      instances of `Tensor` must have a corresponding `TensorShape` in
+      `flat_shapes`.  May be `None`, in which case exact shapes are read
+      directly from the args.
 
   Returns:
     Placeholders with the same structure as `structure`.
+
+  Raises:
+    RuntimeError: if `flat_shapes` is provided, but
+     `len(flat_shapes) != len(nest.flatten(args))`.
+    RuntimeError: if a shape from `flat_shapes` is not None
+     for an argument that is not a `Tensor`, `TensorSpec`,
+     or `ResourceVariable`.
   """
   func_graph = ops.get_default_graph()
   function_inputs = []
   if names is None:
-    names = [None] * len(flat_args)
-  for arg_value, name in zip(flat_args, names):
+    names = [None] * len(args)
+  if flat_shapes is None:
+    shapes_iter = itertools.repeat(None)
+  else:
+    len_flat_args = len(nest.flatten(args))
+    if len_flat_args != len(flat_shapes):
+      raise RuntimeError(
+          "Length of fully flat shapes (%d) must match that of "
+          "flatten(args) (%d).  args: %s, flat_shapes: %s"
+          % (len(flat_shapes),
+             len_flat_args,
+             args,
+             flat_shapes))
+    shapes_iter = iter(flat_shapes)
+  for arg_value, name in zip(args, names):
     for arg in nest.flatten(arg_value):
+      # We have a shape entry for each arg, regadless of whether it's a real
+      # Tensor or not.  For non-tensor entries it should be None.
+      shape = next(shapes_iter)
       if isinstance(arg, (ops.Tensor, tensor_spec.TensorSpec)):
         if isinstance(arg, tensor_spec.TensorSpec) and arg.name:
           requested_name = arg.name
         else:
           requested_name = name
-        placeholder = graph_placeholder(
-            arg.dtype, arg.shape,
-            name=requested_name)
+        placeholder_shape = shape if shape is not None else arg.shape
+        try:
+          placeholder = graph_placeholder(
+              arg.dtype, placeholder_shape,
+              name=requested_name)
+        except ValueError:
+          # Sometimes parameter names are not valid op names, so fall back to
+          # unnamed placeholders.
+          placeholder = graph_placeholder(arg.dtype, placeholder_shape)
         if name is not None:
           # Record the requested/user-specified name in case it's different than
           # the uniquified name, for validation when exporting signatures.
@@ -730,18 +870,24 @@ def _get_defun_inputs(flat_args, names, structure):
             attr_value_pb2.AttrValue(s=compat.as_bytes(name)))
         function_inputs.append(arg)
       else:
+        if shape is not None:
+          raise RuntimeError(
+              "Expected provided shape override to be None for arg that isn't "
+              "a Tensor, but saw arg: '%s', shape: '%s'.  args: %s"
+              % (arg, shape, args))
         function_inputs.append(arg)
   return nest.pack_sequence_as(structure, function_inputs)
 
 
-def _get_defun_inputs_from_kwargs(kwargs):
+def _get_defun_inputs_from_kwargs(kwargs, flat_shapes):
   """Maps Python function keyword args to graph-construction inputs."""
   if kwargs:
-    names, flat_args = zip(*sorted(kwargs.items()))
+    names, args = zip(*sorted(kwargs.items()))
   else:
     names = []
-    flat_args = []
-  return _get_defun_inputs(flat_args, names, structure=kwargs)
+    args = []
+  return _get_defun_inputs(
+      args, names, structure=kwargs, flat_shapes=flat_shapes)
 
 
 def dismantle_func_graph(func_graph):
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 291986a2c4b05a8a15de1e840f09ccc5f30f9b6d..7002f163d131545a632cc90e06f31a8788901fae 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -129,13 +129,15 @@ class Defun(object):
   def __call__(self, func):
     # Various sanity checks on the callable func.
     if not callable(func):
-      raise ValueError("func %s must be callable" % func)
+      raise ValueError("function %s must be callable" % func)
 
     # Func should not use kwargs and defaults.
     argspec = tf_inspect.getargspec(func)
     if argspec.keywords or argspec.defaults:
-      raise ValueError("Functions with argument defaults or keywords "
-                       "arguments are not supported.")
+      raise ValueError(
+          "function with argument defaults or keywords arguments are not"
+          " supported. {} has defaults {} and keywords {}.".format(
+              func, argspec.defaults, argspec.keywords))
 
     # Computes how many arguments 'func' has.
     min_args = len(argspec.args)
@@ -412,6 +414,8 @@ class _DefinedFunction(object):
           [t._as_tf_output() for t in temp_graph.inputs],
           [t._as_tf_output() for t in temp_graph.outputs],
           output_names,
+          [], # control_outputs
+          [], # control_output_names
           None,  # opts
           description)
       self._c_func = c_api_util.ScopedTFFunction(c_func)
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 3d5a5fe79758d43e54a7acaa689bd7d7fe902c56..cd623223e32c29c48b4b338bf508a9cabd02c643 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -284,7 +284,7 @@ class FunctionTest(test.TestCase):
         out, = sess.run(dlogits, {logits: x, labels: y})
       self.assertAllClose(out, np.exp(prob - y))
 
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/124286351")  # No error is raised
   def testCustomGradientError(self):
     dtype = dtypes.float32
 
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index a46fccc513c0205067680a6bc3ffb76b7b488c10..f0cd1647d0e2aa7fd993165384f616ce02228909 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -197,7 +197,7 @@ def extract_sub_graph(graph_def, dest_nodes):
 
 @deprecation.deprecated(
     date=None,
-    instructions="Use tf.compat.v1.graph_util.remove_training_nodes")
+    instructions="Use tf.compat.v1.graph_util.tensor_shape_from_node_def_name")
 @tf_export(v1=["graph_util.tensor_shape_from_node_def_name"])
 def tensor_shape_from_node_def_name(graph, input_name):
   """Convenience function to get a shape from a NodeDef's input string."""
@@ -353,19 +353,27 @@ def remove_training_nodes(input_graph, protected_nodes=None):
     nodes_after_removal.append(new_node)
 
   types_to_splice = {"Identity": True}
+  control_input_names = set()
+  node_names_with_control_input = set()
+  for node in nodes_after_removal:
+    for node_input in node.input:
+      if "^" in node_input:
+        control_input_names.add(node_input.replace("^", ""))
+        node_names_with_control_input.add(node.name)
+
   names_to_splice = {}
   for node in nodes_after_removal:
     if node.op in types_to_splice and node.name not in protected_nodes:
       # We don't want to remove nodes that have control edge inputs, because
       # they might be involved in subtle dependency issues that removing them
       # will jeopardize.
-      has_control_edge = False
-      for input_name in node.input:
-        if re.match(r"^\^", input_name):
-          has_control_edge = True
-      if not has_control_edge:
+      if node.name not in node_names_with_control_input:
         names_to_splice[node.name] = node.input[0]
 
+  # We also don't want to remove nodes which are used as control edge inputs.
+  names_to_splice = {name: value for name, value in names_to_splice.items()
+                     if name not in control_input_names}
+
   nodes_after_splicing = []
   for node in nodes_after_removal:
     if node.name in names_to_splice:
diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py
index dd26b8a78e9d2e13b34770775fcb1219745396e0..78777dc87724ab202e267a3aab4666c81465de59 100644
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@@ -308,8 +308,9 @@ class DeviceFunctionsTest(test.TestCase):
       new_node.input.extend([input_name])
     return new_node
 
-  def create_constant_node_def(self, name, value, dtype, shape=None):
-    node = self.create_node_def("Const", name, [])
+  def create_constant_node_def(self, name, value, dtype,
+                               shape=None, inputs=None):
+    node = self.create_node_def("Const", name, inputs or [])
     self.set_attr_dtype(node, "dtype", dtype)
     self.set_attr_tensor(node, "value", value, dtype, shape)
     return node
@@ -393,6 +394,18 @@ class DeviceFunctionsTest(test.TestCase):
     self.assertProtoEquals(expected_graph_def,
                            graph_util.remove_training_nodes(graph_def))
 
+  def testRemoveIdentityUsedAsControlInputInConst(self):
+    """Check that Identity nodes used as control inputs are not removed."""
+    graph_def = graph_pb2.GraphDef()
+    graph_def.node.extend([
+        self.create_constant_node_def("C", 1, dtypes.float32, inputs=["^I"]),
+        self.create_node_def("Identity", "I", ["Base"]),
+        self.create_node_def("BaseOp", "Base", [])
+    ])
+
+    self.assertProtoEquals(graph_def,
+                           graph_util.remove_training_nodes(graph_def))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index 0febab506e53ca0545cc910f037496324d8eb3d5..fc566ce0b24fa52c712fe5f64357b066e5e41a08 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -538,7 +538,7 @@ def create_graph_debug_info_def(operations):
       all_file_names.add(trace[0])
 
   # Sets the `files` field in the GraphDebugInfo proto
-  graph_debug_info_def.files.extend(sorted(all_file_names))
+  graph_debug_info_def.files.extend(all_file_names)
 
   # Builds a mapping between file names and index of the `files` field, so we
   # only store the indexes for the nodes in the GraphDebugInfo.
@@ -550,9 +550,10 @@ def create_graph_debug_info_def(operations):
   # save the storage space.
   for node_name, trace in node_to_trace.items():
     trace_def = graph_debug_info_def.traces[node_name]
-    for file_line in trace:
-      file_index = file_to_index[file_line[0]]
-      trace_def.file_line_cols.add(file_index=file_index, line=file_line[1])
+    for file_name, line, func, code in trace:
+      file_index = file_to_index[file_name]
+      trace_def.file_line_cols.add(
+          file_index=file_index, line=line, func=func, code=code)
 
   return graph_debug_info_def
 
@@ -972,8 +973,11 @@ def export_scoped_meta_graph(filename=None,
 
   Raises:
     ValueError: When the `GraphDef` is larger than 2GB.
+    ValueError: When executing in Eager mode and either `graph_def` or `graph`
+      is undefined.
   """
-  if context.executing_eagerly():
+  if context.executing_eagerly() and not (graph_def is not None and
+                                          graph is not None):
     raise ValueError("Exporting/importing meta graphs is not supported when "
                      "Eager Execution is enabled.")
   graph = graph or ops.get_default_graph()
@@ -1061,11 +1065,12 @@ def export_scoped_meta_graph(filename=None,
       name, _ = os.path.splitext(filename)
       debug_filename = "{name}{ext}".format(name=name, ext=".debug")
 
-      # Gets the operation from the graph by the name.
-      ops_to_export = {}
+      # Gets the operation from the graph by the name. Exludes variable nodes,
+      # so only the nodes in the frozen models are included.
+      ops_to_export = []
       for node in scoped_meta_graph_def.graph_def.node:
         scoped_op_name = ops.prepend_name_scope(node.name, export_scope)
-        ops_to_export.add(graph.get_operation_by_name(scoped_op_name))
+        ops_to_export.append(graph.get_operation_by_name(scoped_op_name))
 
       graph_debug_info = create_graph_debug_info_def(ops_to_export)
 
diff --git a/tensorflow/python/framework/op_def_library.py b/tensorflow/python/framework/op_def_library.py
index 2318b32ef10d67c48950061d2c489f6c7dfb20a0..372763a862b5c416458bdc6ea9ae0f247687c8fc 100644
--- a/tensorflow/python/framework/op_def_library.py
+++ b/tensorflow/python/framework/op_def_library.py
@@ -212,6 +212,22 @@ def _MakeTensor(v, arg_name):
       (repr(v), arg_name))
 
 
+def _MakeFunc(v, arg_name):
+  """Ensure v is a func."""
+  if isinstance(v, attr_value_pb2.NameAttrList):
+    return v
+  fn_attr = attr_value_pb2.NameAttrList()
+  if isinstance(v, compat.bytes_or_text_types):
+    fn_attr.name = v
+  elif hasattr(v, "add_to_graph"):
+    v.add_to_graph(ops.get_default_graph())
+    fn_attr.name = v.name
+  else:
+    raise TypeError("Don't know how to convert {} to a func for "
+                    "argument {}".format(v, arg_name))
+  return fn_attr
+
+
 class _OpInfo(object):
   """All per-Op state we would like to precompute/validate."""
 
@@ -515,9 +531,9 @@ class OpDefLibrary(object):
             else:
               raise TypeError(
                   "Expected %s passed to parameter '%s' of op '%s', got %s of "
-                  "type '%s' instead." %
+                  "type '%s' instead. Error: %s" %
                   (dtypes.as_dtype(dtype).name, input_arg.name, op_type_name,
-                   repr(values), type(values).__name__))
+                   repr(values), type(values).__name__, err))
           except ValueError:
             # What type does convert_to_tensor think it has?
             try:
@@ -733,13 +749,9 @@ class OpDefLibrary(object):
           attr_value.list.tensor.extend(
               [_MakeTensor(x, key) for x in value])
         elif attr_def.type == "func":
-          if isinstance(value, attr_value_pb2.NameAttrList):
-            attr_value.func.CopyFrom(value)
-          elif isinstance(value, compat.bytes_or_text_types):
-            attr_value.func.name = value
-          else:
-            value.add_to_graph(ops.get_default_graph())
-            attr_value.func.name = value.name
+          attr_value.func.CopyFrom(_MakeFunc(value, key))
+        elif attr_def.type == "list(func)":
+          attr_value.list.func.extend([_MakeFunc(x, key) for x in value])
         else:
           raise TypeError("Unrecognized Attr type " + attr_def.type)
 
diff --git a/tensorflow/python/framework/op_def_library_test.py b/tensorflow/python/framework/op_def_library_test.py
index 66cfe213b3cc943de4cd423e8e2ffffbe0b49f8b..71d708dd89ecfbda9d64240b707563cd3fb2a9e9 100644
--- a/tensorflow/python/framework/op_def_library_test.py
+++ b/tensorflow/python/framework/op_def_library_test.py
@@ -24,6 +24,7 @@ from google.protobuf import text_format
 from tensorflow.core.framework import op_def_pb2
 from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_ops
@@ -140,40 +141,43 @@ class OpDefLibraryTest(test_util.TensorFlowTestCase):
     with ops.Graph().as_default():
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a="Bad string")
-      self.assertEqual(str(cm.exception),
-                       "Expected int32 passed to parameter 'a' of op 'Simple', "
-                       "got 'Bad string' of type 'str' instead.")
+      self.assertTrue(
+          "Expected int32 passed to parameter 'a' of op 'Simple', "
+          "got 'Bad string' of type 'str' instead." in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a=self.Tensor(dtypes.string))
-      self.assertEqual(str(cm.exception),
-                       "Input 'a' of 'Simple' Op has type string "
-                       "that does not match expected type of int32.")
+      self.assertTrue(
+          "Input 'a' of 'Simple' Op has type string "
+          "that does not match expected type of int32." in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a=6, extra="bogus")
-      self.assertEqual(str(cm.exception),
-                       "apply_op() got unexpected keyword arguments: extra")
+      self.assertTrue(
+          "apply_op() got unexpected keyword arguments: extra"
+          in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a=6, extra1="bogus", extra2="also_bogus")
-      self.assertEqual(str(cm.exception),
-                       "apply_op() got unexpected keyword arguments: extra1, "
-                       "extra2")
+      self.assertTrue(
+          "apply_op() got unexpected keyword arguments: extra1, "
+          "extra2" in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple")
-      self.assertEqual(str(cm.exception), "No argument for input a")
+      self.assertTrue(
+          "No argument for input a" in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", wrong=7)
-      self.assertEqual(str(cm.exception), "No argument for input a")
+      self.assertTrue(
+          "No argument for input a" in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a={"label": 1})
-      self.assertEqual(str(cm.exception),
-                       "Expected int32 passed to parameter 'a' of op 'Simple', "
-                       "got {'label': 1} of type 'dict' instead.")
+      self.assertTrue(
+          "Expected int32 passed to parameter 'a' of op 'Simple', "
+          "got {'label': 1} of type 'dict' instead." in str(cm.exception))
 
   def testReservedInput(self):
     with ops.Graph().as_default():
@@ -268,19 +272,13 @@ class OpDefLibraryTest(test_util.TensorFlowTestCase):
         attr { key: 'T' value { type: DT_STRING } }
         """, out.op.node_def)
 
-      with self.assertRaises(TypeError) as cm:
+      with self.assertRaises(TypeError):
         self._lib.apply_op("Binary", a="left", b=12)
-      self.assertEqual(str(cm.exception),
-                       "Expected string passed to parameter 'b' of op 'Binary',"
-                       " got 12 of type 'int' instead.")
 
-      with self.assertRaises(TypeError) as cm:
+      with self.assertRaises(TypeError):
         self._lib.apply_op("Binary",
                            a=self.Tensor(dtypes.string),
                            b=self.Tensor(dtypes.int32))
-      self.assertEqual(str(cm.exception),
-                       "Input 'b' of 'Binary' Op has type int32 "
-                       "that does not match type string of argument 'a'.")
 
   def testRestrict(self):
     with ops.Graph().as_default():
@@ -466,6 +464,46 @@ class OpDefLibraryTest(test_util.TensorFlowTestCase):
       self.assertEqual(str(cm.exception),
                        "Expected float for argument 'a' not 'bad'.")
 
+  def testAttrFunc(self):
+    with ops.Graph().as_default():
+      @function.Defun(dtypes.float32, func_name="MyFn")
+      def fn(x):
+        return 2 + x
+      op = self._lib.apply_op("FuncAttr", f=fn, name="t")
+      self.assertProtoEquals("""
+        name: 't' op: 'FuncAttr' attr { key: 'f'
+                                        value { func { name: 'MyFn' } } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("FuncAttr", f=3)
+      self.assertEqual(str(cm.exception),
+                       "Don't know how to convert 3 to a func for argument f")
+
+  def testAttrFuncList(self):
+    with ops.Graph().as_default():
+      @function.Defun(dtypes.float32, func_name="MyFn")
+      def fn1(x):
+        return 2 + x
+      @function.Defun(dtypes.int32, dtypes.float32, func_name="MyFn2")
+      def fn2(x, y):
+        return 2 + x, y * 3
+      @function.Defun(dtypes.int32, func_name="MyFn3")
+      def fn3(y):
+        return 2 + y
+      op = self._lib.apply_op("FuncListAttr", f=[fn1, fn2, fn3], name="t")
+      self.assertProtoEquals("""
+        name: 't' op: 'FuncListAttr'
+        attr { key: 'f' value { list { func { name: 'MyFn' }
+                                       func { name: 'MyFn2' }
+                                       func { name: 'MyFn3' } } } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("FuncListAttr", f=[fn1, 3, fn2])
+      self.assertEqual(str(cm.exception),
+                       "Don't know how to convert 3 to a func for argument f")
+
   def testAttrBool(self):
     with ops.Graph().as_default():
       op = self._lib.apply_op("AttrBool", a=True, name="t")
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index e74b43f93e61feb0030195e72ced0d31b94211c8..41d6bdeec1ba35068e58eff948667e649e9893a6 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -95,9 +95,12 @@ class _UserDeviceSpec(object):
         lineno = -1
       self.display_name = "%s<%s, %d>" % (func_name, fname, lineno)
 
+    self.raw_string = None
+
     self.function = self._device_name_or_function
     if not (self._device_name_or_function is None or
             callable(self._device_name_or_function)):
+      self.raw_string = self._device_name_or_function
       self.function = pydev.merge_device(self._device_name_or_function)
 
 
@@ -3048,9 +3051,6 @@ class Graph(object):
     # being called inside function definitions behave as if they were seeing the
     # actual outside graph).
     self._graph_key = "grap-key-%d/" % (uid(),)
-    # A string with the last reduction method passed to
-    # losses.compute_weighted_loss(), or None.
-    self._last_loss_reduction = None
     self._container = ""
     self._registered_ops = op_def_registry.get_registered_ops()
     # Set to True if this graph is being built in an
@@ -5048,6 +5048,48 @@ class Graph(object):
     self._thread_local._distribution_strategy_stack = (  # pylint: disable=protected-access
         _distribution_strategy_stack)
 
+  @property
+  def _auto_cast_variable_read_dtype(self):
+    """The dtype that instances of `AutoCastVariable` will be casted to.
+
+    This is None if `AutoCastVariables` should not be casted.
+
+    See `AutoCastVariable` for more information.
+
+    Returns:
+      The dtype that instances of `AutoCastVariable` will be casted to.
+    """
+    if not hasattr(self._thread_local, "_auto_cast_variable_read_dtype"):
+      self._thread_local._auto_cast_variable_read_dtype = None  # pylint: disable=protected-access
+    return self._thread_local._auto_cast_variable_read_dtype  # pylint: disable=protected-access
+
+  @_auto_cast_variable_read_dtype.setter
+  def _auto_cast_variable_read_dtype(self, _auto_cast_variable_read_dtype):
+    self._thread_local._auto_cast_variable_read_dtype = (  # pylint: disable=protected-access
+        _auto_cast_variable_read_dtype)
+
+  @tf_contextlib.contextmanager
+  def _enable_auto_casting_variables(self, dtype):
+    """Context manager to automatically cast AutoCastVariables.
+
+    If an AutoCastVariable `var` is used under this context manager, it will be
+    casted to `dtype` before being used.
+
+    See `AutoCastVariable` for more information.
+
+    Args:
+      dtype: The dtype that AutoCastVariables should be casted to.
+
+    Yields:
+      Nothing.
+    """
+    prev_read_dtype = self._auto_cast_variable_read_dtype
+    try:
+      self._auto_cast_variable_read_dtype = dtype
+      yield
+    finally:
+      self._auto_cast_variable_read_dtype = prev_read_dtype
+
   def _mutation_lock(self):
     """Returns a lock to guard code that creates & mutates ops.
 
@@ -5547,6 +5589,8 @@ def init_scope():
     try:
       with outer_context(), name_scope(scope), control_dependencies(
           None), tape.stop_recording():
+        context_manager = NullContextmanager
+        context_manager_input = None
         if not context.executing_eagerly():
           # The device stack is preserved when lifting into a graph. Eager
           # execution doesn't implement device stacks and in particular it
@@ -5555,7 +5599,21 @@ def init_scope():
           outer_graph = get_default_graph()
           outer_device_stack = outer_graph._device_function_stack  # pylint: disable=protected-access
           outer_graph._device_function_stack = innermost_nonempty_device_stack  # pylint: disable=protected-access
-        yield
+        elif innermost_nonempty_device_stack is not None:
+          for device_spec in innermost_nonempty_device_stack.peek_objs():
+            if device_spec.function is None:
+              break
+            if device_spec.raw_string:
+              context_manager = context.device
+              context_manager_input = device_spec.raw_string
+              break
+            # It is currently not possible to have a device function in V2,
+            # but in V1 we are unable to apply device functions in eager mode.
+            # This means that we will silently skip some of the entries on the
+            # device stack in V1 + eager mode.
+
+        with context_manager(context_manager_input):
+          yield
     finally:
       # If an exception is raised here it may be hiding a related exception in
       # try-block (just above).
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 8872c9f67e501274eb7d8f26b8c7d45f3822f240..7d9799a1a7e28c3317ddca1ce3ffada51517b508 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -1587,6 +1587,8 @@ class CollectionTest(test_util.TensorFlowTestCase):
     self.assertSequenceEqual(g.collections, ["key"])
     g.add_to_collection("other", "foo")
     self.assertSequenceEqual(sorted(g.collections), ["key", "other"])
+    self.assertSequenceEqual(
+        sorted(g.get_all_collection_keys()), ["key", "other"])
 
   def test_add_to_collection(self):
     g = ops.Graph()
@@ -2414,17 +2416,22 @@ class GraphTest(test_util.TensorFlowTestCase):
 
   def testDefaultGraph(self):
     orig = ops.get_default_graph()
+    self.assertFalse(ops.has_default_graph())
     self._AssertDefault(orig)
     g0 = ops.Graph()
+    self.assertFalse(ops.has_default_graph())
     self._AssertDefault(orig)
     context_manager_0 = g0.as_default()
+    self.assertFalse(ops.has_default_graph())
     self._AssertDefault(orig)
     with context_manager_0 as g0:
       self._AssertDefault(g0)
       with ops.Graph().as_default() as g1:
+        self.assertTrue(ops.has_default_graph())
         self._AssertDefault(g1)
       self._AssertDefault(g0)
     self._AssertDefault(orig)
+    self.assertFalse(ops.has_default_graph())
 
   def testPreventFeeding(self):
     g = ops.Graph()
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index b0b9ce70558eecc68dc5ba3a844310fd2a6f39d4..c27f0140144287cb715e92715343d1b1e69b009a 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -547,7 +547,7 @@ bool GenEagerPythonOp::GetEagerFunctionSetup(const string& indentation,
       strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = [_execute.make_tensor(_t, \"", attr_api_name,
                          "\") for _t in ", attr_api_name, "]\n");
-    } else if (attr_type != "func") {
+    } else if (attr_type != "func" && attr_type != "list(func)") {
       *function_setup =
           strings::StrCat("# No definition for ", function_name_,
                           " since we don't support attrs with type\n"
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index a7537bb5f1adfe70018f50cb9a627bfffe176226..0dc3dde4f6e95dbe4156a29d03f465e95cb4a5f6 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -74,9 +74,8 @@ def enable_v2_tensorshape():
   # in `tensor_shape[i]`, but they would not be.
   ```
   """
-  global _TENSORSHAPE_V2_OVERRIDE, TensorShape  # pylint: disable=invalid-name
+  global _TENSORSHAPE_V2_OVERRIDE  # pylint: disable=invalid-name
   _TENSORSHAPE_V2_OVERRIDE = True
-  TensorShape = TensorShapeV2
 
 
 @tf_export(v1=["disable_v2_tensorshape"])
@@ -85,9 +84,8 @@ def disable_v2_tensorshape():
 
   See docstring for `enable_v2_tensorshape` for details about the new behavior.
   """
-  global _TENSORSHAPE_V2_OVERRIDE, TensorShape  # pylint: disable=invalid-name
+  global _TENSORSHAPE_V2_OVERRIDE  # pylint: disable=invalid-name
   _TENSORSHAPE_V2_OVERRIDE = False
-  TensorShape = TensorShapeV1
 
 
 @tf_export("compat.dimension_value",
@@ -470,6 +468,54 @@ class Dimension(object):
     """
     return self // other
 
+  def __rdiv__(self, other):
+    """Use `__floordiv__` via `x // y` instead.
+
+    This function exists only to have a better error message. Instead of:
+    `TypeError: unsupported operand type(s) for /: 'int' and 'Dimension'`,
+    this function will explicitly call for usage of `//` instead.
+
+    Args:
+      other: Another `Dimension`.
+
+    Raises:
+      TypeError.
+    """
+    raise TypeError("unsupported operand type(s) for /: '{}' and 'Dimension', "
+                    "please use // instead".format(type(other).__name__))
+
+  def __truediv__(self, other):
+    """Use `__floordiv__` via `x // y` instead.
+
+    This function exists only to have a better error message. Instead of:
+    `TypeError: unsupported operand type(s) for /: 'Dimension' and 'int'`,
+    this function will explicitly call for usage of `//` instead.
+
+    Args:
+      other: Another `Dimension`.
+
+    Raises:
+      TypeError.
+    """
+    raise TypeError("unsupported operand type(s) for /: 'Dimension' and '{}', "
+                    "please use // instead".format(type(other).__name__))
+
+  def __rtruediv__(self, other):
+    """Use `__floordiv__` via `x // y` instead.
+
+    This function exists only to have a better error message. Instead of:
+    `TypeError: unsupported operand type(s) for /: 'int' and 'Dimension'`,
+    this function will explicitly call for usage of `//` instead.
+
+    Args:
+      other: Another `Dimension`.
+
+    Raises:
+      TypeError.
+    """
+    raise TypeError("unsupported operand type(s) for /: '{}' and 'Dimension', "
+                    "please use // instead".format(type(other).__name__))
+
   def __mod__(self, other):
     """Returns `self` modulo `other`.
 
@@ -635,8 +681,8 @@ def as_dimension(value):
     return Dimension(value)
 
 
-@tf_export(v1=["TensorShape"])
-class TensorShapeV1(object):
+@tf_export("TensorShape")
+class TensorShape(object):
   """Represents the shape of a `Tensor`.
 
   A `TensorShape` represents a possibly-partial shape specification for a
@@ -695,7 +741,7 @@ class TensorShapeV1(object):
   @property
   def _v2_behavior(self):
     if _TENSORSHAPE_V2_OVERRIDE is None:
-      return False
+      return tf2.enabled()
     return _TENSORSHAPE_V2_OVERRIDE
 
   def __repr__(self):
@@ -1151,22 +1197,6 @@ def unknown_shape(rank=None, **kwargs):
     return TensorShape([Dimension(None)] * rank)
 
 
-@tf_export("TensorShape", v1=[])
-class TensorShapeV2(TensorShapeV1):
-
-  @property
-  def _v2_behavior(self):
-    if _TENSORSHAPE_V2_OVERRIDE is None:
-      return True
-    return _TENSORSHAPE_V2_OVERRIDE
-
-
-if tf2.enabled():
-  TensorShape = TensorShapeV2
-else:
-  TensorShape = TensorShapeV1
-
-
 def scalar():
   """Returns a shape representing a scalar."""
   return TensorShape([])
diff --git a/tensorflow/python/framework/tensor_shape_div_test.py b/tensorflow/python/framework/tensor_shape_div_test.py
index 8e63d7f54705bb5c8384315f068598a86c047599..5160c75e5272d9326a35a0813809387605cca1ea 100644
--- a/tensorflow/python/framework/tensor_shape_div_test.py
+++ b/tensorflow/python/framework/tensor_shape_div_test.py
@@ -35,6 +35,16 @@ class DimensionDivTest(test_util.TensorFlowTestCase):
         for y in values:
           self.assertEqual((x / y).value, (x // y).value)
 
+  def testRDivFail(self):
+    # Note: This test is related to GitHub issue 25790.
+    """Without from __future__ import division, __rdiv__ is used."""
+    if six.PY2:  # Old division exists only in Python 2
+      two = tensor_shape.Dimension(2)
+      message = (r"unsupported operand type\(s\) for /: "
+                 r"'int' and 'Dimension', please use // instead")
+      with self.assertRaisesRegexp(TypeError, message):
+        _ = 6 / two
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/tensor_shape_test.py b/tensorflow/python/framework/tensor_shape_test.py
index 7d85e0a99e662512b29e4134091658190a3bc500..b4a37c05a83a578343114dd2e2f604af37e72fce 100644
--- a/tensorflow/python/framework/tensor_shape_test.py
+++ b/tensorflow/python/framework/tensor_shape_test.py
@@ -205,6 +205,23 @@ class DimensionTest(test_util.TensorFlowTestCase):
     reconstructed = ctor(*args)
     self.assertEquals(reconstructed, dim)
 
+  def testDiv(self):
+    # Note: This test is related to GitHub issue 25790.
+    six = tensor_shape.Dimension(6)
+    two = tensor_shape.Dimension(2)
+    message = (r"unsupported operand type\(s\) for /: "
+               r"'Dimension' and 'Dimension', please use // instead")
+    with self.assertRaisesRegexp(TypeError, message):
+      _ = six / two
+    message = (r"unsupported operand type\(s\) for /: "
+               r"'Dimension' and 'int', please use // instead")
+    with self.assertRaisesRegexp(TypeError, message):
+      _ = six / 2
+    message = (r"unsupported operand type\(s\) for /: "
+               r"'int' and 'Dimension', please use // instead")
+    with self.assertRaisesRegexp(TypeError, message):
+      _ = 6 / two
+
 
 class ShapeTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/framework/tensor_spec.py b/tensorflow/python/framework/tensor_spec.py
index c44636edc4ec5101c588766714c98a7da15793e4..2e847c7a35b946c4153304014d15f5ee96760142 100644
--- a/tensorflow/python/framework/tensor_spec.py
+++ b/tensorflow/python/framework/tensor_spec.py
@@ -108,7 +108,9 @@ class TensorSpec(object):
     return hash((self._shape_tuple, self.dtype))
 
   def __eq__(self, other):
-    return self.shape == other.shape and self.dtype == other.dtype
+    return (self._shape_tuple == other._shape_tuple  # pylint: disable=protected-access
+            and self.dtype == other.dtype
+            and self._name == other._name)  # pylint: disable=protected-access
 
   def __ne__(self, other):
     return not self == other
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 7de6653d49afb1d4535014284ad199ee746965c8..af943f09ab84b0032d06e071f5d2fa5652027c33 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -22,6 +22,8 @@ import six
 
 from tensorflow.core.framework import tensor_pb2
 from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.util import compat
@@ -807,6 +809,8 @@ def constant_value(tensor, partial=False):  # pylint: disable=invalid-name
   """
   if isinstance(tensor, ops.EagerTensor):
     return tensor.numpy()
+  if not pywrap_tensorflow.IsTensor(tensor):
+    return tensor
   ret = _ConstantValue(tensor, partial)
   if ret is not None:
     # The caller may now depend on the constant value of `tensor`, so we
@@ -932,13 +936,15 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
   return ret
 
 
+@tf_export("is_tensor")
 def is_tensor(x):  # pylint: disable=invalid-name
   """Check whether `x` is of tensor type.
 
-  Check whether an object is a tensor. This check is equivalent to calling
-  `isinstance(x, (tf.Tensor, tf.SparseTensor, tf.Variable))` and also checks
-  if all the component variables of a MirroredVariable or a ReplicaLocalVariable
-  are tensors.
+  Check whether an object is a tensor or a composite tensor. This check is
+  equivalent to calling
+  `isinstance(x, (tf.Tensor, tf.SparseTensor, tf.RaggedTensor, tf.Variable))`
+  and also checks if all the component variables of a MirroredVariable or a
+  ReplicaLocalVariable are tensors.
 
   Args:
     x: A python object to check.
@@ -947,4 +953,5 @@ def is_tensor(x):  # pylint: disable=invalid-name
     `True` if `x` is a tensor, `False` if not.
   """
   return (isinstance(x, ops._TensorLike) or ops.is_dense_tensor_like(x) or  # pylint: disable=protected-access
+          isinstance(x, composite_tensor.CompositeTensor) or
           (hasattr(x, "is_tensor_like") and x.is_tensor_like))
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index cdacdfaaada96e21d4f4d6a9fb2a9247e332969f..e73df390357107b155dd6bdfb7c3bedc713303a9 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -773,6 +773,16 @@ class TensorUtilTest(test.TestCase):
       self.assertAllClose(np.array([10, 20, 30], dtype=np.int64), a)
 
 
+class IsTensorTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConstantTensor(self):
+    np_val = np.random.rand(3).astype(np.int32)
+    tf_val = constant_op.constant(np_val)
+    self.assertFalse(tensor_util.is_tensor(np_val))
+    self.assertTrue(tensor_util.is_tensor(tf_val))
+
+
 class ConstantValueTest(test.TestCase):
 
   def testConstant(self):
diff --git a/tensorflow/python/framework/test_ops.cc b/tensorflow/python/framework/test_ops.cc
index 1d0145f61c84969cf1b52eb070ec3f933d25741a..5d1386c26d73816772936bac9fe57c575a399066 100644
--- a/tensorflow/python/framework/test_ops.cc
+++ b/tensorflow/python/framework/test_ops.cc
@@ -406,6 +406,10 @@ REGISTER_OP("FuncAttr")
     .Attr("f: func")
     .SetShapeFn(shape_inference::UnknownShape);
 
+REGISTER_OP("FuncListAttr")
+    .Attr("f: list(func)")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 REGISTER_OP("Simple")
     .Input("a: int32")
     .Output("out: float")
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index d27572f05666f53c5443ad0e1a6ef197564ef98d..1d267830ff9bfca395f63e49a53bf5b97a71887b 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -1460,7 +1460,7 @@ def disable_all_xla(description):
       value = getattr(cls, name)
       if callable(value) and name.startswith(
           "test") and not name == "test_session":
-        setattr(cls, name, base_decorator(value))
+        setattr(cls, name, base_decorator(description)(value))
     return cls
 
   return disable_all_impl
diff --git a/tensorflow/python/grappler/item.i b/tensorflow/python/grappler/item.i
index 593d38206d127978f1982a0f2cc22e17daee1a3d..0d4f7de9f55b2bd13cd1ab7988b7f6c43d3e018c 100644
--- a/tensorflow/python/grappler/item.i
+++ b/tensorflow/python/grappler/item.i
@@ -272,7 +272,6 @@ static PyObject* TF_GetColocationGroups(GItem item) {
     if (!s.ok()) {
       continue;
     }
-    int i = 0;
     for (const auto& arg : op_def->input_arg()) {
       if (!arg.is_ref()) {
         continue;
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 7aea97f115da5bbfa3466bbc249d37024290d694..931a5d6785e6351934cba2825f5d8c9c6a161e3e 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -63,6 +63,7 @@ py_library(
         ":pil_for_keras",
         ":saving",
         "//tensorflow/python:training",
+        "//tensorflow/python/keras/mixed_precision/experimental:autocast_variable",
         "//tensorflow/python/keras/optimizer_v2",
         "//tensorflow/python/saved_model",
         "@keras_applications_archive//:keras_applications",
@@ -156,10 +157,12 @@ py_library(
         ":activations",
         ":backend",
         ":callbacks",
+        ":callbacks_v1",
         ":constraints",
         ":engine_utils",
         ":initializers",
         ":losses",
+        ":mode_keys",
         ":optimizers",
         ":regularizers",
         ":saving",
@@ -168,7 +171,7 @@ py_library(
         "//tensorflow/python/distribute:distribute_lib",
         "//tensorflow/python/distribute:input_lib",
         "//tensorflow/python/distribute:reduce_util",
-        "//tensorflow/python/training/checkpointable:data_structures",
+        "//tensorflow/python/training/tracking:data_structures",
         "//tensorflow/tools/docs:doc_controls",
         "@six_archive//:six",
     ],
@@ -187,9 +190,9 @@ py_library(
     deps = [
         ":backend",
         ":engine_utils",
+        ":mode_keys",
         ":optimizers",
         "//tensorflow/python:lib",
-        "//tensorflow/python:mode_keys",
         "//tensorflow/python:saver",
         "//tensorflow/python/saved_model",
         "//tensorflow/python/saved_model/model_utils",
@@ -214,6 +217,19 @@ py_library(
         "callbacks.py",
     ],
     srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+        ":mode_keys",
+    ],
+)
+
+py_library(
+    name = "callbacks_v1",
+    srcs = [
+        "callbacks_v1.py",
+    ],
+    srcs_version = "PY2AND3",
     deps = [
         ":backend",
         ":engine_utils",
@@ -292,6 +308,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":backend",
+        "//tensorflow/python/distribute:distribute_lib",
     ],
 )
 
@@ -355,6 +372,17 @@ py_library(
     ],
 )
 
+py_library(
+    name = "mode_keys",
+    srcs = [
+        "utils/mode_keys.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/saved_model/model_utils:mode_keys",
+    ],
+)
+
 tf_py_test(
     name = "integration_test",
     size = "medium",
@@ -410,7 +438,7 @@ tf_py_test(
 
 tf_py_test(
     name = "regularizers_test",
-    size = "small",
+    size = "medium",
     srcs = ["regularizers_test.py"],
     additional_deps = [
         ":keras",
@@ -483,16 +511,29 @@ tf_py_test(
     shard_count = 4,
 )
 
+tf_py_test(
+    name = "metrics_correctness_test",
+    size = "medium",
+    srcs = ["metrics_correctness_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 4,
+)
+
 tf_py_test(
     name = "applications_test",
-    size = "enormous",
+    size = "medium",
     srcs = ["applications/applications_test.py"],
     additional_deps = [
         ":keras",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client_testlib",
     ],
-    shard_count = 2,
+    shard_count = 11,
 )
 
 tf_py_test(
@@ -542,6 +583,11 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     shard_count = 8,
+    tags = [
+        "manual",  # b/124471597
+        "notap",  # b/124471597
+    ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -567,7 +613,10 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     shard_count = 4,
-    tags = ["no_windows_gpu"],
+    tags = [
+        "no_rocm",
+        "no_windows_gpu",
+    ],
 )
 
 tf_py_test(
@@ -738,6 +787,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     shard_count = 8,
+    tags = ["no_rocm"],
 )
 
 cuda_py_test(
@@ -751,6 +801,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     shard_count = 8,
+    tags = ["no_rocm"],
 )
 
 tf_py_test(
@@ -861,6 +912,27 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "composite_tensor_support_test",
+    size = "medium",
+    srcs = ["utils/composite_tensor_support_test.py"],
+    additional_deps = [
+        ":engine",
+        ":layers",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python/ops/ragged:ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+    ],
+)
+
 tf_py_test(
     name = "io_utils_test",
     size = "small",
@@ -986,6 +1058,20 @@ tf_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
+    tags = ["notsan"],
+)
+
+tf_py_test(
+    name = "callbacks_v1_test",
+    size = "medium",
+    srcs = ["callbacks_v1_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
     tags = ["notsan"],
 )
 
@@ -1127,6 +1213,7 @@ tf_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
+    tags = ["no_rocm"],
 )
 
 tf_py_test(
diff --git a/tensorflow/python/keras/__init__.py b/tensorflow/python/keras/__init__.py
index f024b9b59a21df6e6771e89ef00428bcaaf49524..b7ec63837d92d11258a88b870e5af5be04c32e5e 100644
--- a/tensorflow/python/keras/__init__.py
+++ b/tensorflow/python/keras/__init__.py
@@ -25,6 +25,7 @@ from tensorflow.python.keras import activations
 from tensorflow.python.keras import applications
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import callbacks
+from tensorflow.python.keras import callbacks_v1
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import datasets
 from tensorflow.python.keras import estimator
diff --git a/tensorflow/python/keras/applications/applications_test.py b/tensorflow/python/keras/applications/applications_test.py
index b15ca5990aef9bed088cccd0dea1be049386eaf2..ad6b58992a9195975fbee8d4b81d8f810ab6e3df 100644
--- a/tensorflow/python/keras/applications/applications_test.py
+++ b/tensorflow/python/keras/applications/applications_test.py
@@ -32,13 +32,11 @@ MODEL_LIST = [
     (applications.InceptionV3, 2048),
     (applications.InceptionResNetV2, 1536),
     (applications.MobileNet, 1024),
-    # TODO(fchollet): enable MobileNetV2 tests when a new TensorFlow test image
-    # is released with keras_applications upgraded to 1.0.5 or above.
+    (applications.MobileNetV2, 1280),
     (applications.DenseNet121, 1024),
     (applications.DenseNet169, 1664),
     (applications.DenseNet201, 1920),
     (applications.NASNetMobile, 1056),
-    (applications.NASNetLarge, 4032),
 ]
 
 
@@ -47,7 +45,8 @@ class ApplicationsTest(test.TestCase, parameterized.TestCase):
   @parameterized.parameters(*MODEL_LIST)
   def test_feature_extration_model(self, model_fn, output_dim):
     model = model_fn(include_top=False, weights=None)
-    self.assertEqual(model.output_shape, (None, None, None, output_dim))
+    self.assertLen(model.output_shape, 4)
+    self.assertEqual(model.output_shape[-1], output_dim)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 9c2b46ee2452a7c6f296b9a712dc8cbb16c29653..0a8c5bb19f493d257ad453b8b265aaba2aac3a7a 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -37,6 +37,7 @@ from tensorflow.python.distribute import distribute_coordinator_context as dc_co
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function as eager_function
+from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import func_graph
@@ -79,6 +80,9 @@ py_sum = sum
 # while executing eagerly (such as the functional API for model-building).
 _GRAPH = None
 
+# A graph which is used for constructing functions in eager mode.
+_CURRENT_SCRATCH_GRAPH = None
+
 # This is a thread local object that will hold the default internal TF session
 # used by Keras. It can be set manually via `set_session(sess)`.
 _SESSION = threading.local()
@@ -389,16 +393,26 @@ def eager_learning_phase_scope(value):
     _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = previous_value
 
 
-def _get_session():
+def _current_graph(op_input_list):
+  """Return the graph members of `op_input_list`, or the current graph."""
+  return ops._get_graph_from_inputs(op_input_list)
+
+
+def _get_session(op_input_list=()):
   """Returns the session object for the current thread."""
   global _SESSION
   default_session = ops.get_default_session()
   if default_session is not None:
     session = default_session
   else:
-    if getattr(_SESSION, 'session', None) is None:
-      # We are creating the Session inside a Distribution
-      # Strategy scope.
+    if ops.inside_function():
+      raise RuntimeError('Cannot get session inside Tensorflow graph function.')
+    # If we don't have a session, or that session does not match the current
+    # graph, create and cache a new session.
+    if (getattr(_SESSION, 'session', None) is None or
+        _SESSION.session.graph is not _current_graph(op_input_list)):
+      # If we are creating the Session inside a tf.distribute.Strategy scope,
+      # we ask the strategy for the right session options to use.
       if distribution_strategy_context.has_strategy():
         configure_and_create_distributed_session(
             distribution_strategy_context.get_strategy())
@@ -410,12 +424,13 @@ def _get_session():
 
 
 @keras_export(v1=['keras.backend.get_session'])
-def get_session():
+def get_session(op_input_list=()):
   """Returns the TF session to be used by the backend.
 
   If a default TensorFlow session is available, we will return it.
 
-  Else, we will return the global Keras session.
+  Else, we will return the global Keras session assuming it matches
+  the current graph.
 
   If no global Keras session exists at this point:
   we will create a new global session.
@@ -423,10 +438,15 @@ def get_session():
   Note that you can manually set the global session
   via `K.set_session(sess)`.
 
+  Arguments:
+      op_input_list: An option sequence of tensors or ops, which will be used
+        to determine the current graph. Otherwise the default graph will be
+        used.
+
   Returns:
       A TensorFlow session.
   """
-  session = _get_session()
+  session = _get_session(op_input_list)
   if not _MANUAL_VAR_INIT:
     with session.graph.as_default():
       _initialize_variables(session)
@@ -443,6 +463,40 @@ def get_graph():
     return ops.get_default_graph()
 
 
+@tf_contextlib.contextmanager
+def _scratch_graph(graph=None):
+  """Retrieve a shared and temporary func graph.
+
+  The eager execution path lifts a subgraph from the keras global graph into
+  a scratch graph in order to create a function. DistributionStrategies, in
+  turn, constructs multiple functions as well as a final combined function. In
+  order for that logic to work correctly, all of the functions need to be
+  created on the same scratch FuncGraph.
+
+  Args:
+    graph: A graph to be used as the current scratch graph. If not set then
+      a scratch graph will either be retrieved or created:
+
+  Yields:
+    The current scratch graph.
+  """
+  global _CURRENT_SCRATCH_GRAPH
+  if (_CURRENT_SCRATCH_GRAPH is not None and graph is not None and
+      _CURRENT_SCRATCH_GRAPH is not graph):
+    raise ValueError('Multiple scratch graphs specified.')
+
+  if _CURRENT_SCRATCH_GRAPH:
+    yield _CURRENT_SCRATCH_GRAPH
+    return
+
+  graph = graph or func_graph.FuncGraph('keras_scratch_graph')
+  try:
+    _CURRENT_SCRATCH_GRAPH = graph
+    yield graph
+  finally:
+    _CURRENT_SCRATCH_GRAPH = None
+
+
 @keras_export('keras.backend.set_session')
 def set_session(session):
   """Sets the global TensorFlow session.
@@ -956,10 +1010,10 @@ def dtype(x):
       # Keras variable
       >>> kvar = K.variable(np.array([[1, 2], [3, 4]]))
       >>> K.dtype(kvar)
-      'float32_ref'
+      'float32'
       >>> kvar = K.variable(np.array([[1, 2], [3, 4]]), dtype='float32')
       >>> K.dtype(kvar)
-      'float32_ref'
+      'float32'
   ```
   """
   return x.dtype.base_dtype.name
@@ -1590,7 +1644,7 @@ def min(x, axis=None, keepdims=False):
           the reduced dimension is retained with length 1.
 
   Returns:
-      A tensor with miminum values of `x`.
+      A tensor with minimum values of `x`.
   """
   return math_ops.reduce_min(x, axis, keepdims)
 
@@ -2753,7 +2807,7 @@ def get_value(x):
       return x.numpy()
   elif ops.inside_function():
     raise RuntimeError('Cannot get value inside Tensorflow graph function.')
-  return x.eval(session=get_session())
+  return x.eval(session=get_session((x,)))
 
 
 @keras_export('keras.backend.batch_get_value')
@@ -2774,7 +2828,7 @@ def batch_get_value(tensors):
   elif ops.inside_function():  # pylint: disable=protected-access
     raise RuntimeError('Cannot get value inside Tensorflow graph function.')
   if tensors:
-    return get_session().run(tensors)
+    return get_session(tensors).run(tensors)
   else:
     return []
 
@@ -2893,7 +2947,7 @@ class GraphExecutionFunction(object):
                       'should be a list or tuple.')
     self.inputs = nest.flatten(inputs)
     self._outputs_structure = outputs
-    self.outputs = nest.flatten(outputs)
+    self.outputs = cast_variables_to_tensor(nest.flatten(outputs))
     with ops.control_dependencies(self.outputs):
       updates_ops = []
       for update in updates:
@@ -2992,7 +3046,7 @@ class GraphExecutionFunction(object):
   def __call__(self, inputs):
     inputs = nest.flatten(inputs)
 
-    session = get_session()
+    session = get_session(inputs)
     feed_arrays = []
     array_vals = []
     feed_symbols = []
@@ -3049,48 +3103,79 @@ class EagerExecutionFunction(object):
   """
 
   def __init__(self, inputs, outputs, updates=None, name=None):
+    self.name = name
+    self._outputs_structure = outputs
+    inputs = nest.flatten(inputs)
+    outputs = nest.flatten(outputs)
+
     updates = updates or []
     if not isinstance(updates, (list, tuple)):
       raise TypeError('`updates` in a Keras backend function '
                       'should be a list or tuple.')
-    self.inputs = nest.flatten(inputs)
-    self._outputs_structure = outputs
-    self.outputs = nest.flatten(outputs)
-    self.name = name
 
-    graph = get_graph()
+    if updates and not outputs:
+      # Edge case; never happens in practice
+      raise ValueError('Cannot create a Keras backend function with updates'
+                       ' but no outputs during eager execution.')
+
+    graphs = {i.graph for i in nest.flatten([inputs, outputs, updates])
+              if hasattr(i, 'graph')}
+    if len(graphs) > 1:
+      raise ValueError('Cannot create an execution function which is comprised '
+                       'of elements from multiple graphs.')
+
+    source_graph = graphs.pop()
+    global_graph = get_graph()
+
+    updates_ops = []
+    legacy_update_ops = []
+    for update in updates:
+      # For legacy reasons it is allowed to pass an update as a tuple
+      # `(variable, new_value)` (this maps to an assign op). Otherwise it
+      # is assumed to already be an op -- we cannot control its execution
+      # order.
+      if isinstance(update, tuple):
+        legacy_update_ops.append(update)
+      else:
+        if hasattr(update, 'op'):
+          update = update.op
+        updates_ops.append(update)
+
+    with _scratch_graph() as exec_graph:
+      global_graph = get_graph()
+      if source_graph not in (exec_graph, global_graph):
+        raise ValueError('Unknown graph. Aborting.')
+
+      if source_graph is global_graph and exec_graph is not global_graph:
+        init_tensors = (
+            outputs + updates_ops + [p for [p, _] in legacy_update_ops] +
+            [p_new for [_, p_new] in legacy_update_ops
+             if isinstance(p_new, ops.Tensor)])
+        lifted_map = lift_to_graph.lift_to_graph(
+            init_tensors=init_tensors, graph=exec_graph, sources=inputs,
+            add_sources=True, handle_captures=True, base_graph=source_graph)
+
+        inputs = [lifted_map[i] for i in inputs]
+        outputs = [lifted_map[i] for i in outputs]
+        updates_ops = [lifted_map[i] for i in updates_ops]
+        legacy_update_ops = [(lifted_map[p], lifted_map.get(p_new, p_new))
+                             for p, p_new in legacy_update_ops]
+
     # Consolidate updates
-    with graph.as_default():
-      with ops.control_dependencies(self.outputs):
-        # In general, updates should be run after the outputs have been
-        # computed. However, we can only ensure this when we create
-        # the updates here (i.e. when updates are passed as tuples).
-        # We cannot modify the control dependencies of preexisting update ops.
-        updates_ops = []
-        for update in updates:
-          # For legacy reasons it is allowed to pass an update as a tuple
-          # `(variable, new_value)` (this maps to an assign op).
-          if isinstance(update, tuple):
-            p, new_p = update
-            updates_ops.append(state_ops.assign(p, new_p))
-          else:
-            # Assumed already an op -- we cannot control its execution order.
-            updates_ops.append(update)
-
-      # We set the update ops to run at the end by conditioning it on output[0]
-      if updates and not self.outputs:
-        # Edge case; never happens in practice
-        raise ValueError('Cannot create a Keras backend function with updates'
-                         ' but no outputs during eager execution.')
+    with exec_graph.as_default():
+      outputs = cast_variables_to_tensor(outputs)
+      with ops.control_dependencies(outputs):
+        for p, p_new in legacy_update_ops:
+          updates_ops.append(state_ops.assign(p, p_new))
+
+      self.inputs, self.outputs = inputs, outputs
       with ops.control_dependencies(updates_ops):
         self.outputs[0] = array_ops.identity(self.outputs[0])
 
-    # Prepare graph function
-    # TODO(fchollet): can we restrict `captures` to variables actually used in
-    # the relevant subgraph?
-    graph.inputs = self.inputs + list(graph.captures.values())
-    graph.outputs = self.outputs
-    graph_fn = eager_function.ConcreteFunction(graph)
+      exec_graph.inputs = self.inputs + list(exec_graph.captures.values())
+      exec_graph.outputs = self.outputs
+      graph_fn = eager_function.ConcreteFunction(exec_graph)
+
     graph_fn._num_positional_args = len(self.inputs)
     graph_fn._arg_keywords = []
     self._graph_fn = graph_fn
@@ -3098,7 +3183,7 @@ class EagerExecutionFunction(object):
     # Handle placeholders with default
     # (treated as required placeholder by graph functions)
     self._placeholder_default_values = {}
-    with graph.as_default():
+    with exec_graph.as_default():
       for x in self.inputs:
         if x.op.type == 'PlaceholderWithDefault':
           self._placeholder_default_values[x] = tensor_util.constant_value(
@@ -4144,8 +4229,8 @@ def conv1d(x,
   x = nn.convolution(
       input=x,
       filter=kernel,
-      dilation_rate=(dilation_rate,),
-      strides=(strides,),
+      dilation_rate=dilation_rate,
+      strides=strides,
       padding=padding,
       data_format=tf_data_format)
   if data_format == 'channels_first' and tf_data_format == 'NWC':
@@ -4366,6 +4451,7 @@ def separable_conv2d(x,
   Raises:
       ValueError: if `data_format` is neither `channels_last` or
       `channels_first`.
+      ValueError: if `strides` is not a tuple of 2 integers.
   """
   if data_format is None:
     data_format = image_data_format()
@@ -4573,6 +4659,8 @@ def pool2d(x,
   Raises:
       ValueError: if `data_format` is neither `"channels_last"` or
       `"channels_first"`.
+      ValueError: if `pool_size` is not a tuple of 2 integers.
+      ValueError: if `strides` is not a tuple of 2 integers.
       ValueError: if `pool_mode` is neither `"max"` or `"avg"`.
   """
   if data_format is None:
@@ -5222,11 +5310,16 @@ def in_multi_worker_mode():
 def configure_and_create_distributed_session(distribution_strategy):
   """Configure session config and create a session with it."""
 
-  # TODO(priyag): Throw error if a session already exists.
   def _create_session(distribution_strategy):
     """Create the Distributed Strategy session."""
     session_config = get_default_session_config()
 
+    # If a session already exists, merge in its config; in the case there is a
+    # conflict, take values of the existing config.
+    global _SESSION
+    if getattr(_SESSION, 'session', None) and _SESSION.session._config:
+      session_config.MergeFrom(_SESSION.session._config)
+
     if is_tpu_strategy(distribution_strategy):
       # TODO(priyag, yuefengz): Remove this workaround when Distribute
       # Coordinator is integrated with keras and we can create a session from
@@ -5262,3 +5355,13 @@ def configure_and_create_distributed_session(distribution_strategy):
 def is_tpu_strategy(strategy):
   """We're executing TPU Strategy."""
   return strategy is not None and strategy.__class__.__name__ == 'TPUStrategy'
+
+
+def cast_variables_to_tensor(tensors):
+
+  def _cast_variables_to_tensor(tensor):
+    if isinstance(tensor, variables_module.Variable):
+      return array_ops.identity(tensor)
+    return tensor
+
+  return nest.map_structure(_cast_variables_to_tensor, tensors)
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index 263a1dc461027cf58064d363efa393c4235b8d01..cd7821639b92613b716679c94ed3fd195663ba42 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -1741,6 +1741,16 @@ class BackendGraphTests(test.TestCase):
         x, mean, var, beta, gamma, axis=1, epsilon=1e-3)
     self.assertEqual(normed.shape.as_list(), [10, 3, 5, 5])
 
+  def test_get_session_different_graphs(self):
+    with ops.Graph().as_default():
+      x = keras.backend.constant(1)
+      session = keras.backend.get_session()
+      self.assertIs(session, keras.backend.get_session((x,)))
+      self.assertIs(session, keras.backend.get_session())
+    with ops.Graph().as_default():
+      self.assertIs(session, keras.backend.get_session((x,)))
+      self.assertIsNot(session, keras.backend.get_session())
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index 41fedbb31fecdabc7e2f95fd529a8826305808ef..79bafb9559ccccd4bccd179c3f63fae182e573fd 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -32,18 +32,14 @@ import six
 
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
-from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.utils.data_utils import Sequence
 from tensorflow.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import summary_ops_v2
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.summary import summary as tf_summary
-from tensorflow.python.training import saver
-from tensorflow.python.training.mode_keys import ModeKeys
 from tensorflow.python.util.tf_export import keras_export
 
 try:
@@ -52,7 +48,6 @@ except ImportError:
   requests = None
 
 
-# pylint: disable=protected-access
 def configure_callbacks(callbacks,
                         model,
                         do_validation=False,
@@ -91,20 +86,57 @@ def configure_callbacks(callbacks,
   # Add additional callbacks during training.
   if mode == ModeKeys.TRAIN:
     model.history = History()
-    stateful_metric_names = None
-    if hasattr(model, 'metrics_names'):
-      stateful_metric_names = model.metrics_names[1:]  # Exclude `loss`
-    callbacks = [BaseLogger(stateful_metrics=stateful_metric_names)
-                ] + (callbacks or []) + [model.history]
+    callbacks = [BaseLogger()] + (callbacks or []) + [model.history]
     if verbose:
-      callbacks.append(
-          ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names))
+      callbacks.append(ProgbarLogger(count_mode))
   callback_list = CallbackList(callbacks)
 
   # Set callback model
-  callback_model = model._get_callback_model()
+  callback_model = model._get_callback_model()  # pylint: disable=protected-access
   callback_list.set_model(callback_model)
 
+  set_callback_parameters(
+      callback_list,
+      model,
+      do_validation=do_validation,
+      batch_size=batch_size,
+      epochs=epochs,
+      steps_per_epoch=steps_per_epoch,
+      samples=samples,
+      verbose=verbose,
+      mode=mode)
+
+  callback_list.model.stop_training = False
+  return callback_list
+
+
+def set_callback_parameters(callback_list,
+                            model,
+                            do_validation=False,
+                            batch_size=None,
+                            epochs=None,
+                            steps_per_epoch=None,
+                            samples=None,
+                            verbose=1,
+                            mode=ModeKeys.TRAIN):
+  """Sets callback parameters.
+
+  Arguments:
+      callback_list: CallbackList instance.
+      model: Model being trained.
+      do_validation: Whether or not validation loop will be run.
+      batch_size: Number of samples per batch.
+      epochs: Number of epoch to train.
+      steps_per_epoch: Number of batches to run per training epoch.
+      samples: Number of training samples.
+      verbose: int, 0 or 1. Keras logging verbosity to pass to ProgbarLogger.
+      mode: String. One of ModeKeys.TRAIN, ModeKeys.TEST, or ModeKeys.PREDICT.
+        Which loop mode to configure callbacks for.
+  """
+  for cbk in callback_list:
+    if isinstance(cbk, (BaseLogger, ProgbarLogger)):
+      cbk.stateful_metrics = model.metrics_names[1:]  # Exclude `loss`
+
   # Set callback parameters
   callback_metrics = []
   # When we have deferred build scenario with iterator input, we will compile
@@ -123,9 +155,6 @@ def configure_callbacks(callbacks,
       'metrics': callback_metrics,
   }
   callback_list.set_params(callback_params)
-  callback_list.model.stop_training = False
-  return callback_list
-# pylint: enable=protected-access
 
 
 def _is_generator_like(data):
@@ -395,7 +424,7 @@ class Callback(object):
   take as argument will contain keys for quantities relevant to
   the current batch or epoch.
 
-  Currently, the `.fit()` method of the `Sequential` model class
+  Currently, the `.fit()` method of the `Model` class
   will include the following quantities in the `logs` that
   it passes to its callbacks:
 
@@ -412,6 +441,10 @@ class Callback(object):
   def __init__(self):
     self.validation_data = None
     self.model = None
+    # Whether this Callback should only run on the chief worker in a
+    # Multi-Worker setting.
+    # TODO(omalleyt): Make this attr public once solution is stable.
+    self._chief_worker_only = None
 
   def set_params(self, params):
     self.params = params
@@ -815,6 +848,17 @@ class ModelCheckpoint(Callback):
         self.monitor_op = np.less
         self.best = np.Inf
 
+    # Only the chief worker writes model checkpoints.
+    self._chief_worker_only = True
+
+  def set_model(self, model):
+    self.model = model
+    # Use name matching rather than `isinstance` to avoid circular dependencies.
+    if (not self.save_weights_only and
+        not model._is_graph_network and  # pylint: disable=protected-access
+        model.__class__.__name__ != 'Sequential'):
+      self.save_weights_only = True
+
   def on_epoch_end(self, epoch, logs=None):
     logs = logs or {}
     self.epochs_since_last_save += 1
@@ -1054,10 +1098,10 @@ class LearningRateScheduler(Callback):
     logs['lr'] = K.get_value(self.model.optimizer.lr)
 
 
-@keras_export('keras.callbacks.TensorBoard')
+@keras_export('keras.callbacks.TensorBoard', v1=[])
 class TensorBoard(Callback):
   # pylint: disable=line-too-long
-  """Tensorboard basic visualizations.
+  """TensorBoard basic visualizations.
 
   This callback writes a log for TensorBoard, which allows
   you to visualize dynamic graphs of your training and test
@@ -1077,51 +1121,25 @@ class TensorBoard(Callback):
   [here](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
 
   Arguments:
-      log_dir: the path of the directory where to save the log
-          files to be parsed by TensorBoard.
-      histogram_freq: frequency (in epochs) at which to compute activation
-          and weight histograms for the layers of the model. If set to 0,
-          histograms won't be computed. Validation data (or split) must be
-          specified for histogram visualizations.
-      write_graph: whether to visualize the graph in TensorBoard.
-          The log file can become quite large when
-          write_graph is set to True.
-      write_grads: whether to visualize gradient histograms in TensorBoard.
-          `histogram_freq` must be greater than 0.
-      batch_size: size of batch of inputs to feed to the network
-          for histograms computation.
-      write_images: whether to write model weights to visualize as
-          image in TensorBoard.
-      embeddings_freq: frequency (in epochs) at which selected embedding
-          layers will be saved. If set to 0, embeddings won't be computed.
-          Data to be visualized in TensorBoard's Embedding tab must be passed
-          as `embeddings_data`.
-      embeddings_layer_names: a list of names of layers to keep eye on. If
-          None or empty list all the embedding layer will be watched.
-      embeddings_metadata: a dictionary which maps layer name to a file name
-          in which metadata for this embedding layer is saved. See the
-          [details](https://www.tensorflow.org/how_tos/embedding_viz/#metadata_optional)
-          about metadata files format. In case if the same metadata file is
-          used for all embedding layers, string can be passed.
-      embeddings_data: data to be embedded at layers specified in
-          `embeddings_layer_names`. Numpy array (if the model has a single
-          input) or list of Numpy arrays (if the model has multiple inputs).
-          Learn [more about embeddings](https://www.tensorflow.org/programmers_guide/embedding)
+      log_dir: the path of the directory where to save the log files to be
+        parsed by TensorBoard.
+      histogram_freq: frequency (in epochs) at which to compute activation and
+        weight histograms for the layers of the model. If set to 0, histograms
+        won't be computed. Validation data (or split) must be specified for
+        histogram visualizations.
+      write_graph: whether to visualize the graph in TensorBoard. The log file
+        can become quite large when write_graph is set to True.
+      write_images: whether to write model weights to visualize as image in
+        TensorBoard.
       update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`,
-          writes the losses and metrics to TensorBoard after each batch.
-          The same applies for `'epoch'`. If using an integer, let's say `1000`,
-          the callback will write the metrics and losses to TensorBoard every
-          1000 samples. Note that writing too frequently to TensorBoard
-          can slow down your training.
+        writes the losses and metrics to TensorBoard after each batch. The same
+        applies for `'epoch'`. If using an integer, let's say `1000`, the
+        callback will write the metrics and losses to TensorBoard every 1000
+        samples. Note that writing too frequently to TensorBoard can slow down
+        your training.
 
   Raises:
       ValueError: If histogram_freq is set and no validation data is provided.
-
-  @compatibility(eager)
-  Using the `TensorBoard` callback will work when eager execution is enabled,
-  with the restriction that outputting histogram summaries of weights and
-  gradients is not supported. Consequently, `histogram_freq` will be ignored.
-  @end_compatibility
   """
 
   # pylint: enable=line-too-long
@@ -1129,316 +1147,194 @@ class TensorBoard(Callback):
   def __init__(self,
                log_dir='./logs',
                histogram_freq=0,
-               batch_size=32,
                write_graph=True,
-               write_grads=False,
                write_images=False,
-               embeddings_freq=0,
-               embeddings_layer_names=None,
-               embeddings_metadata=None,
-               embeddings_data=None,
-               update_freq='epoch'):
+               update_freq='epoch',
+               **kwargs):
     super(TensorBoard, self).__init__()
+    self._validate_kwargs(kwargs)
+
     self.log_dir = log_dir
     self.histogram_freq = histogram_freq
-    if self.histogram_freq and context.executing_eagerly():
-      logging.warning(
-          UserWarning('Weight and gradient histograms not supported for eager'
-                      'execution, setting `histogram_freq` to `0`.'))
-      self.histogram_freq = 0
-    self.merged = None
     self.write_graph = write_graph
-    self.write_grads = write_grads
     self.write_images = write_images
-    self.batch_size = batch_size
-    self._current_batch = 0
-    self._total_batches_seen = 0
-    self._total_val_batches_seen = 0
-    self.embeddings_freq = embeddings_freq
-    self.embeddings_layer_names = embeddings_layer_names
-    self.embeddings_metadata = embeddings_metadata
-    self.embeddings_data = embeddings_data
     if update_freq == 'batch':
       self.update_freq = 1
     else:
       self.update_freq = update_freq
+
     self._samples_seen = 0
     self._samples_seen_at_last_write = 0
+    self._current_batch = 0
+    self._total_batches_seen = 0
+    self._total_val_batches_seen = 0
 
-  def _init_writer(self, model):
-    """Sets file writer."""
-    if context.executing_eagerly():
-      self.writer = summary_ops_v2.create_file_writer(self.log_dir)
-      if not model.run_eagerly and self.write_graph:
-        with self.writer.as_default():
-          summary_ops_v2.graph(K.get_graph())
-    elif self.write_graph:
-      self.writer = tf_summary.FileWriter(self.log_dir, K.get_graph())
-    else:
-      self.writer = tf_summary.FileWriter(self.log_dir)
-
-  def _make_histogram_ops(self, model):
-    """Defines histogram ops when histogram_freq > 0."""
-    # only make histogram summary op if it hasn't already been made
-    if self.histogram_freq and self.merged is None:
-      for layer in self.model.layers:
-        for weight in layer.weights:
-          mapped_weight_name = weight.name.replace(':', '_')
-          tf_summary.histogram(mapped_weight_name, weight)
-          if self.write_images:
-            w_img = array_ops.squeeze(weight)
-            shape = K.int_shape(w_img)
-            if len(shape) == 2:  # dense layer kernel case
-              if shape[0] > shape[1]:
-                w_img = array_ops.transpose(w_img)
-                shape = K.int_shape(w_img)
-              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
-            elif len(shape) == 3:  # convnet case
-              if K.image_data_format() == 'channels_last':
-                # switch to channels_first to display
-                # every kernel as a separate image
-                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
-                shape = K.int_shape(w_img)
-              w_img = array_ops.reshape(w_img,
-                                        [shape[0], shape[1], shape[2], 1])
-            elif len(shape) == 1:  # bias case
-              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
-            else:
-              # not possible to handle 3D convnets etc.
-              continue
-
-            shape = K.int_shape(w_img)
-            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
-            tf_summary.image(mapped_weight_name, w_img)
-
-        if self.write_grads:
-          for weight in layer.trainable_weights:
-            mapped_weight_name = weight.name.replace(':', '_')
-            grads = model.optimizer.get_gradients(model.total_loss, weight)
-
-            def is_indexed_slices(grad):
-              return type(grad).__name__ == 'IndexedSlices'
-
-            grads = [
-                grad.values if is_indexed_slices(grad) else grad
-                for grad in grads
-            ]
-            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
-
-        if hasattr(layer, 'output'):
-          if isinstance(layer.output, list):
-            for i, output in enumerate(layer.output):
-              tf_summary.histogram('{}_out_{}'.format(layer.name, i), output)
-          else:
-            tf_summary.histogram('{}_out'.format(layer.name), layer.output)
+    self._writers = []  # file writers to be closed
+    self._train_writer = None  # set in `_initialize_writers`
+    self._validation_writer = None  # set in `_initialize_writers`
+
+    # TensorBoard should only write summaries on the chief when in a
+    # Multi-Worker setting.
+    self._chief_worker_only = True
+
+  def _validate_kwargs(self, kwargs):
+    """Handle arguments were supported in V1."""
+    if kwargs.get('write_grads', False):
+      logging.warning('`write_grads` will be ignored in TensorFlow 2.0 '
+                      'for the `TensorBoard` Callback.')
+    if kwargs.get('embeddings_freq', False):
+      logging.warning('Embeddings will be ignored in TensorFlow 2.0 '
+                      'for the `TensorBoard` Callback.')
+
+    unrecognized_kwargs = set(kwargs.keys()) - {
+        'write_grads', 'embeddings_freq', 'embeddings_layer_names',
+        'embeddings_metadata', 'embeddings_data'
+    }
+
+    # Only allow kwargs that were supported in V1.
+    if unrecognized_kwargs:
+      raise ValueError('Unrecognized arguments in `TensorBoard` '
+                       'Callback: ' + str(unrecognized_kwargs))
 
   def set_model(self, model):
-    """Sets Keras model and creates summary ops."""
-
+    """Sets Keras model and writes graph if specified."""
     self.model = model
-    self._init_writer(model)
-    # histogram summaries only enabled in graph mode
-    if not context.executing_eagerly():
-      self._make_histogram_ops(model)
-      self.merged = tf_summary.merge_all()
-
-    # If both embedding_freq and embeddings_data are available, we will
-    # visualize embeddings.
-    if self.embeddings_freq and self.embeddings_data is not None:
-      # Avoid circular dependency.
-      from tensorflow.python.keras.engine import training_utils  # pylint: disable=g-import-not-at-top
-      self.embeddings_data = training_utils.standardize_input_data(
-          self.embeddings_data, model.input_names)
-
-      # If embedding_layer_names are not provided, get all of the embedding
-      # layers from the model.
-      embeddings_layer_names = self.embeddings_layer_names
-      if not embeddings_layer_names:
-        embeddings_layer_names = [
-            layer.name
-            for layer in self.model.layers
-            if type(layer).__name__ == 'Embedding'
-        ]
-
-      self.assign_embeddings = []
-      embeddings_vars = {}
-
-      self.batch_id = batch_id = array_ops.placeholder(dtypes.int32)
-      self.step = step = array_ops.placeholder(dtypes.int32)
+    with context.eager_mode():
+      self._initialize_writers()
+      if self.write_graph:
+        if model.run_eagerly:
+          logging.warning('TensorBoard Callback will ignore `write_graph=True`'
+                          'when `Model.run_eagerly=True`.`')
+        else:
+          with self._train_writer.as_default():
+            with summary_ops_v2.always_record_summaries():
+              summary_ops_v2.graph(K.get_graph())
 
-      for layer in self.model.layers:
-        if layer.name in embeddings_layer_names:
-          embedding_input = self.model.get_layer(layer.name).output
-          embedding_size = np.prod(embedding_input.shape[1:])
-          embedding_input = array_ops.reshape(embedding_input,
-                                              (step, int(embedding_size)))
-          shape = (self.embeddings_data[0].shape[0], int(embedding_size))
-          embedding = variables.Variable(
-              array_ops.zeros(shape), name=layer.name + '_embedding')
-          embeddings_vars[layer.name] = embedding
-          batch = state_ops.assign(embedding[batch_id:batch_id + step],
-                                   embedding_input)
-          self.assign_embeddings.append(batch)
-
-      self.saver = saver.Saver(list(embeddings_vars.values()))
-
-      # Create embeddings_metadata dictionary
-      if isinstance(self.embeddings_metadata, str):
-        embeddings_metadata = {
-            layer_name: self.embeddings_metadata
-            for layer_name in embeddings_vars.keys()
-        }
-      else:
-        # If embedding_metadata is already a dictionary
-        embeddings_metadata = self.embeddings_metadata
-
-      try:
-        from tensorboard.plugins import projector
-      except ImportError:
-        raise ImportError('Failed to import TensorBoard. Please make sure that '
-                          'TensorBoard integration is complete."')
-
-      # TODO(psv): Add integration tests to test embedding visualization
-      # with TensorBoard callback. We are unable to write a unit test for this
-      # because TensorBoard dependency assumes TensorFlow package is installed.
-      config = projector.ProjectorConfig()
-      for layer_name, tensor in embeddings_vars.items():
-        embedding = config.embeddings.add()
-        embedding.tensor_name = tensor.name
-
-        if (embeddings_metadata is not None and
-            layer_name in embeddings_metadata):
-          embedding.metadata_path = embeddings_metadata[layer_name]
-
-      projector.visualize_embeddings(self.writer, config)
-
-  def _fetch_callback(self, summary):
-    self.writer.add_summary(summary, self._total_val_batches_seen)
-    self._total_val_batches_seen += 1
-
-  def _write_custom_summaries(self, step, logs=None):
-    """Writes metrics out as custom scalar summaries.
+  def _close_writers(self):
+    """Close all remaining open file writers owned by this callback.
 
-    Arguments:
-        step: the global step to use for Tensorboard.
-        logs: dict. Keys are scalar summary names, values are
-            NumPy scalars.
+    If there are no such file writers, this is a no-op.
+    """
+    with context.eager_mode():
+      for writer in self._writers:
+        writer.close()
+      del self._writers[:]
 
+  def _initialize_writers(self):
+    """Create all file writers needed and validation writers.
+
+    This updates `self._train_writer` and `self._validation_writer`, and
+    populates the `self._writers` list to be cleaned up by
+    `_close_writers`.
     """
-    logs = logs or {}
-    if context.executing_eagerly():
-      # use v2 summary ops
-      with self.writer.as_default(), summary_ops_v2.always_record_summaries():
-        for name, value in logs.items():
-          if isinstance(value, np.ndarray):
-            value = value.item()
-          summary_ops_v2.scalar(name, value, step=step)
-    else:
-      # use FileWriter from v1 summary
-      for name, value in logs.items():
-        if isinstance(value, np.ndarray):
-          value = value.item()
-        summary = tf_summary.Summary()
-        summary_value = summary.value.add()
-        summary_value.simple_value = value
-        summary_value.tag = name
-        self.writer.add_summary(summary, step)
-    self.writer.flush()
+    self._close_writers()
+
+    def create_writer(subdir):
+      path = os.path.join(self.log_dir, subdir)
+      return summary_ops_v2.create_file_writer(path)
+
+    self._train_writer = create_writer('train')
+    self._writers.append(self._train_writer)
+    self._validation_writer = create_writer('validation')
+    self._writers.append(self._validation_writer)
 
   def on_batch_end(self, batch, logs=None):
     """Writes scalar summaries for metrics on every training batch."""
-    # Don't output batch_size and batch number as Tensorboard summaries
+    # Don't output batch_size and batch number as TensorBoard summaries
     logs = logs or {}
     self._samples_seen += logs.get('size', 1)
     samples_seen_since = self._samples_seen - self._samples_seen_at_last_write
     if self.update_freq != 'epoch' and samples_seen_since >= self.update_freq:
-      batch_logs = {('batch_' + k): v
-                    for k, v in logs.items()
-                    if k not in ['batch', 'size', 'num_steps']}
-      self._write_custom_summaries(self._total_batches_seen, batch_logs)
+      self._log_metrics(logs, prefix='batch_', step=self._total_batches_seen)
       self._samples_seen_at_last_write = self._samples_seen
     self._total_batches_seen += 1
 
-  def on_epoch_begin(self, epoch, logs=None):
-    """Add histogram op to Model eval_function callbacks, reset batch count."""
-
-    # check if histogram summary should be run for this epoch
-    if self.histogram_freq and epoch % self.histogram_freq == 0:
-      self._epoch = epoch
-      # pylint: disable=protected-access
-      # add the histogram summary op if it should run this epoch
-      self.model._make_eval_function()
-      if self.merged not in self.model._eval_function.fetches:
-        self.model._eval_function.fetches.append(self.merged)
-        self.model._eval_function.fetch_callbacks[
-            self.merged] = self._fetch_callback
-      # pylint: enable=protected-access
-
   def on_epoch_end(self, epoch, logs=None):
-    """Checks if summary ops should run next epoch, logs scalar summaries."""
-
-    # don't output batch_size and
-    # batch number as Tensorboard summaries
-    logs = {('epoch_' + k): v
-            for k, v in logs.items()
-            if k not in ['batch', 'size', 'num_steps']}
-    if self.update_freq == 'epoch':
-      step = epoch
-    else:
-      step = self._samples_seen
-    self._write_custom_summaries(step, logs)
-
-    # pop the histogram summary op after each epoch
-    if self.histogram_freq:
-      # pylint: disable=protected-access
-      if self.merged in self.model._eval_function.fetches:
-        self.model._eval_function.fetches.remove(self.merged)
-      if self.merged in self.model._eval_function.fetch_callbacks:
-        self.model._eval_function.fetch_callbacks.pop(self.merged)
-      # pylint: enable=protected-access
-
-    if self.embeddings_data is None and self.embeddings_freq:
-      raise ValueError('To visualize embeddings, embeddings_data must '
-                       'be provided.')
-
-    if self.embeddings_freq and self.embeddings_data is not None:
-      if epoch % self.embeddings_freq == 0:
-        # We need a second forward-pass here because we're passing
-        # the `embeddings_data` explicitly. This design allows to pass
-        # arbitrary data as `embeddings_data` and results from the fact
-        # that we need to know the size of the `tf.Variable`s which
-        # hold the embeddings in `set_model`. At this point, however,
-        # the `validation_data` is not yet set.
-
-        embeddings_data = self.embeddings_data
-        n_samples = embeddings_data[0].shape[0]
-        i = 0
-        while i < n_samples:
-          step = min(self.batch_size, n_samples - i)
-          batch = slice(i, i + step)
-
-          if isinstance(self.model.input, list):
-            feed_dict = {
-                model_input: embeddings_data[idx][batch]
-                for idx, model_input in enumerate(self.model.input)
-            }
-          else:
-            feed_dict = {self.model.input: embeddings_data[0][batch]}
+    """Runs metrics and histogram summaries at epoch end."""
+    step = epoch if self.update_freq == 'epoch' else self._samples_seen
+    self._log_metrics(logs, prefix='epoch_', step=step)
 
-          feed_dict.update({self.batch_id: i, self.step: step})
-
-          if not isinstance(K.learning_phase(), int):
-            feed_dict[K.learning_phase()] = False
+    if self.histogram_freq and epoch % self.histogram_freq == 0:
+      self._log_weights(epoch)
 
-          self.sess.run(self.assign_embeddings, feed_dict=feed_dict)
-          self.saver.save(self.sess,
-                          os.path.join(self.log_dir, 'keras_embedding.ckpt'),
-                          epoch)
+  def on_train_end(self, logs=None):
+    self._close_writers()
 
-          i += self.batch_size
+  def _log_metrics(self, logs, prefix, step):
+    """Writes metrics out as custom scalar summaries.
 
-  def on_train_end(self, logs=None):
-    self.writer.close()
+    Arguments:
+        logs: Dict. Keys are scalar summary names, values are NumPy scalars.
+        prefix: String. The prefix to apply to the scalar summary names.
+        step: Int. The global step to use for TensorBoard.
+    """
+    if logs is None:
+      logs = {}
+
+    # Group metrics by their associated file writer. Values are lists of
+    # metrics, as (name, scalar_value) pairs.
+    logs_by_writer = {
+        self._train_writer: [],
+        self._validation_writer: [],
+    }
+    validation_prefix = 'val_'
+    for (name, value) in logs.items():
+      if name in ('batch', 'size', 'num_steps'):
+        # Scrub non-metric items.
+        continue
+      if name.startswith(validation_prefix):
+        name = name[len(validation_prefix):]
+        writer = self._validation_writer
+      else:
+        writer = self._train_writer
+      name = prefix + name  # assign batch or epoch prefix
+      logs_by_writer[writer].append((name, value))
+
+    with context.eager_mode():
+      with summary_ops_v2.always_record_summaries():
+        for writer in logs_by_writer:
+          with writer.as_default():
+            for (name, value) in logs_by_writer[writer]:
+              summary_ops_v2.scalar(name, value, step=step)
+
+  def _log_weights(self, epoch):
+    """Logs the weights of the Model to TensorBoard."""
+    with context.eager_mode(), \
+          self._train_writer.as_default(), \
+          summary_ops_v2.always_record_summaries():
+      for layer in self.model.layers:
+        for weight in layer.weights:
+          weight_name = weight.name.replace(':', '_')
+          with ops.init_scope():
+            weight = K.get_value(weight)
+          summary_ops_v2.histogram(weight_name, weight, step=epoch)
+          if self.write_images:
+            self._log_weight_as_image(weight, weight_name, epoch)
+      self._train_writer.flush()
+
+  def _log_weight_as_image(self, weight, weight_name, epoch):
+    """Logs a weight as a TensorBoard image."""
+    w_img = array_ops.squeeze(weight)
+    shape = K.int_shape(w_img)
+    if len(shape) == 1:  # Bias case
+      w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
+    elif len(shape) == 2:  # Dense layer kernel case
+      if shape[0] > shape[1]:
+        w_img = array_ops.transpose(w_img)
+        shape = K.int_shape(w_img)
+      w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
+    elif len(shape) == 3:  # ConvNet case
+      if K.image_data_format() == 'channels_last':
+        # Switch to channels_first to display every kernel as a separate
+        # image.
+        w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
+        shape = K.int_shape(w_img)
+      w_img = array_ops.reshape(w_img, [shape[0], shape[1], shape[2], 1])
+
+    shape = K.int_shape(w_img)
+    # Not possible to handle 3D convnets etc.
+    if len(shape) == 4 and shape[-1] in [1, 3, 4]:
+      summary_ops_v2.image(weight_name, w_img, step=epoch)
 
 
 @keras_export('keras.callbacks.ReduceLROnPlateau')
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index 6aba08b95fb1c13a7ded23a117af3ab0d2d64874..2a14debfdd539a797ba9a894fc96890e3e69f392 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -23,22 +23,25 @@ import csv
 import os
 import re
 import shutil
-import tempfile
+import sys
 import threading
 import unittest
 
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.core.framework import summary_pb2
 from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import summary_ops_v2
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import adam
+from tensorflow.python.util import tf_contextlib
 
 try:
   import h5py  # pylint:disable=g-import-not-at-top
@@ -218,173 +221,219 @@ class CallbackCountsTest(keras_parameterized.TestCase):
         })
 
 
-class KerasCallbacksTest(test.TestCase):
+class KerasCallbacksTest(keras_parameterized.TestCase):
 
-  def test_ModelCheckpoint(self):
-    if h5py is None:
-      return  # Skip test if models cannot be saved.
-
-    with self.cached_session():
-      np.random.seed(1337)
+  def _get_model(self, input_shape=None):
+    layers = [
+        keras.layers.Dense(3, activation='relu'),
+        keras.layers.Dense(2, activation='softmax')
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape=input_shape)
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        metrics=[keras.metrics.CategoricalAccuracy(name='my_acc')],
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
 
-      temp_dir = self.get_temp_dir()
-      self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_progbar_logging(self):
+    model = self._get_model(input_shape=(3,))
 
-      filepath = os.path.join(temp_dir, 'checkpoint.h5')
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=TRAIN_SAMPLES,
-          test_samples=TEST_SAMPLES,
-          input_shape=(INPUT_DIM,),
-          num_classes=NUM_CLASSES)
-      y_test = keras.utils.to_categorical(y_test)
-      y_train = keras.utils.to_categorical(y_train)
-      # case 1
-      monitor = 'val_loss'
-      save_best_only = False
-      mode = 'auto'
+    x = array_ops.ones((50, 3))
+    y = array_ops.zeros((50, 2))
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(10)
+    expected_log = r'(.*- loss:.*- my_acc:.*)+'
 
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(
-              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='rmsprop',
-          metrics=['accuracy'])
+    with self.captureWritesToStream(sys.stdout) as printed:
+      model.fit(dataset, epochs=2, steps_per_epoch=10)
+      self.assertRegexpMatches(printed.contents(), expected_log)
 
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      assert os.path.exists(filepath)
-      os.remove(filepath)
+  @keras_parameterized.run_with_all_model_types(exclude_models='functional')
+  @keras_parameterized.run_all_keras_modes
+  def test_progbar_logging_deferred_model_build(self):
+    model = self._get_model()
+    self.assertFalse(model.built)
 
-      # case 2
-      mode = 'min'
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      assert os.path.exists(filepath)
-      os.remove(filepath)
+    x = array_ops.ones((50, 3))
+    y = array_ops.zeros((50, 2))
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(10)
+    expected_log = r'(.*- loss:.*- my_acc:.*)+'
 
-      # case 3
-      mode = 'max'
-      monitor = 'val_acc'
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      assert os.path.exists(filepath)
-      os.remove(filepath)
+    with self.captureWritesToStream(sys.stdout) as printed:
+      model.fit(dataset, epochs=2, steps_per_epoch=10)
+      self.assertRegexpMatches(printed.contents(), expected_log)
 
-      # case 4
-      save_best_only = True
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      assert os.path.exists(filepath)
-      os.remove(filepath)
+  @keras_parameterized.run_with_all_model_types
+  def test_ModelCheckpoint(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
 
-      # Case: metric not available.
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor='unknown',
-              save_best_only=True)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      # File won't be written.
-      assert not os.path.exists(filepath)
+    layers = [
+        keras.layers.Dense(NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'),
+        keras.layers.Dense(NUM_CLASSES, activation='softmax')
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(10,))
+    model.compile(
+        loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
 
-      # case 5
-      save_best_only = False
-      period = 2
-      mode = 'auto'
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
 
-      filepath = os.path.join(temp_dir, 'checkpoint.{epoch:02d}.h5')
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode,
-              period=period)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=4,
-          verbose=1)
-      assert os.path.exists(filepath.format(epoch=2))
-      assert os.path.exists(filepath.format(epoch=4))
-      os.remove(filepath.format(epoch=2))
-      os.remove(filepath.format(epoch=4))
-      assert not os.path.exists(filepath.format(epoch=1))
-      assert not os.path.exists(filepath.format(epoch=3))
-
-      # Invalid use: this will raise a warning but not an Exception.
-      keras.callbacks.ModelCheckpoint(
-          filepath,
-          monitor=monitor,
-          save_best_only=save_best_only,
-          mode='unknown')
+    filepath = os.path.join(temp_dir, 'checkpoint')
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
+    # case 1
+    monitor = 'val_loss'
+    save_best_only = False
+    mode = 'auto'
+
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Dense(
+            NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+    model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+    model.compile(
+        loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
+
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    assert os.path.exists(filepath)
+    os.remove(filepath)
+
+    # case 2
+    mode = 'min'
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    assert os.path.exists(filepath)
+    os.remove(filepath)
+
+    # case 3
+    mode = 'max'
+    monitor = 'val_acc'
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    assert os.path.exists(filepath)
+    os.remove(filepath)
+
+    # case 4
+    save_best_only = True
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    assert os.path.exists(filepath)
+    os.remove(filepath)
+
+    # Case: metric not available.
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor='unknown',
+            save_best_only=True)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    # File won't be written.
+    assert not os.path.exists(filepath)
+
+    # case 5
+    save_best_only = False
+    period = 2
+    mode = 'auto'
+
+    filepath = os.path.join(temp_dir, 'checkpoint.{epoch:02d}.h5')
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode,
+            period=period)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=4,
+        verbose=1)
+    assert os.path.exists(filepath.format(epoch=2))
+    assert os.path.exists(filepath.format(epoch=4))
+    os.remove(filepath.format(epoch=2))
+    os.remove(filepath.format(epoch=4))
+    assert not os.path.exists(filepath.format(epoch=1))
+    assert not os.path.exists(filepath.format(epoch=3))
+
+    # Invalid use: this will raise a warning but not an Exception.
+    keras.callbacks.ModelCheckpoint(
+        filepath,
+        monitor=monitor,
+        save_best_only=save_best_only,
+        mode='unknown')
 
   def test_EarlyStopping(self):
     with self.cached_session():
@@ -399,9 +448,7 @@ class KerasCallbacksTest(test.TestCase):
       model = testing_utils.get_small_sequential_mlp(
           num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
       model.compile(
-          loss='categorical_crossentropy',
-          optimizer='rmsprop',
-          metrics=['accuracy'])
+          loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
 
       cases = [
           ('max', 'val_acc'),
@@ -459,7 +506,7 @@ class KerasCallbacksTest(test.TestCase):
       model = testing_utils.get_small_sequential_mlp(
           num_hidden=1, num_classes=1, input_dim=1)
       model.compile(
-          optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
+          optimizer='sgd', loss='binary_crossentropy', metrics=['acc'])
 
       stopper = keras.callbacks.EarlyStopping(monitor='acc',
                                               baseline=baseline)
@@ -836,310 +883,6 @@ class KerasCallbacksTest(test.TestCase):
       self.assertEqual(len(loss), 1)
       self.assertEqual(loss[0], np.inf)
 
-  @test_util.run_deprecated_v1
-  def test_TensorBoard(self):
-    np.random.seed(1337)
-
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
-    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-        train_samples=TRAIN_SAMPLES,
-        test_samples=TEST_SAMPLES,
-        input_shape=(INPUT_DIM,),
-        num_classes=NUM_CLASSES)
-    y_test = keras.utils.to_categorical(y_test)
-    y_train = keras.utils.to_categorical(y_train)
-
-    def data_generator(train):
-      if train:
-        max_batch_index = len(x_train) // BATCH_SIZE
-      else:
-        max_batch_index = len(x_test) // BATCH_SIZE
-      i = 0
-      while 1:
-        if train:
-          yield (x_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE],
-                 y_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE])
-        else:
-          yield (x_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE],
-                 y_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE])
-        i += 1
-        i %= max_batch_index
-
-    # case: Sequential
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(
-              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      # non_trainable_weights: moving_variance, moving_mean
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='sgd',
-          metrics=['accuracy'])
-      tsb = keras.callbacks.TensorBoard(
-          log_dir=temp_dir, histogram_freq=1, write_images=True,
-          write_grads=True, batch_size=5)
-      cbks = [tsb]
-
-      # fit with validation data
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=3,
-          verbose=0)
-
-      # fit with validation data and accuracy
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=2,
-          verbose=0)
-
-      # fit generator with validation data
-      model.fit_generator(
-          data_generator(True),
-          len(x_train),
-          epochs=2,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          verbose=0)
-
-      # fit generator without validation data
-      # histogram_freq must be zero
-      tsb.histogram_freq = 0
-      model.fit_generator(
-          data_generator(True),
-          len(x_train),
-          epochs=2,
-          callbacks=cbks,
-          verbose=0)
-
-      # fit generator with validation data and accuracy
-      tsb.histogram_freq = 1
-      model.fit_generator(
-          data_generator(True),
-          len(x_train),
-          epochs=2,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          verbose=0)
-
-      # fit generator without validation data and accuracy
-      tsb.histogram_freq = 0
-      model.fit_generator(
-          data_generator(True), len(x_train), epochs=2, callbacks=cbks)
-      assert os.path.exists(temp_dir)
-
-  @test_util.run_deprecated_v1
-  def test_TensorBoard_multi_input_output(self):
-    np.random.seed(1337)
-    tmpdir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
-
-    with self.cached_session():
-      filepath = os.path.join(tmpdir, 'logs')
-
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=TRAIN_SAMPLES,
-          test_samples=TEST_SAMPLES,
-          input_shape=(INPUT_DIM,),
-          num_classes=NUM_CLASSES)
-      y_test = keras.utils.to_categorical(y_test)
-      y_train = keras.utils.to_categorical(y_train)
-
-      def data_generator(train):
-        if train:
-          max_batch_index = len(x_train) // BATCH_SIZE
-        else:
-          max_batch_index = len(x_test) // BATCH_SIZE
-        i = 0
-        while 1:
-          if train:
-            # simulate multi-input/output models
-            yield ([x_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2,
-                   [y_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2)
-          else:
-            yield ([x_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2,
-                   [y_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2)
-          i += 1
-          i %= max_batch_index
-
-      inp1 = keras.Input((INPUT_DIM,))
-      inp2 = keras.Input((INPUT_DIM,))
-      inp = keras.layers.add([inp1, inp2])
-      hidden = keras.layers.Dense(2, activation='relu')(inp)
-      hidden = keras.layers.Dropout(0.1)(hidden)
-      output1 = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
-      output2 = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
-      model = keras.models.Model([inp1, inp2], [output1, output2])
-      model.compile(loss='categorical_crossentropy',
-                    optimizer='sgd',
-                    metrics=['accuracy'])
-
-      # we must generate new callbacks for each test, as they aren't stateless
-      def callbacks_factory(histogram_freq):
-        return [keras.callbacks.TensorBoard(log_dir=filepath,
-                                            histogram_freq=histogram_freq,
-                                            write_images=True, write_grads=True,
-                                            batch_size=5)]
-
-      # fit without validation data
-      model.fit([x_train] * 2, [y_train] * 2, batch_size=BATCH_SIZE,
-                callbacks=callbacks_factory(histogram_freq=0), epochs=3)
-
-      # fit with validation data and accuracy
-      model.fit([x_train] * 2, [y_train] * 2, batch_size=BATCH_SIZE,
-                validation_data=([x_test] * 2, [y_test] * 2),
-                callbacks=callbacks_factory(histogram_freq=1), epochs=2)
-
-      # fit generator without validation data
-      model.fit_generator(data_generator(True), len(x_train), epochs=2,
-                          callbacks=callbacks_factory(histogram_freq=0))
-
-      # fit generator with validation data and accuracy
-      model.fit_generator(data_generator(True), len(x_train), epochs=2,
-                          validation_data=([x_test] * 2, [y_test] * 2),
-                          callbacks=callbacks_factory(histogram_freq=1))
-      assert os.path.isdir(filepath)
-
-  @test_util.run_deprecated_v1
-  def test_Tensorboard_histogram_summaries_in_test_function(self):
-
-    class FileWriterStub(object):
-
-      def __init__(self, logdir, graph=None):
-        self.logdir = logdir
-        self.graph = graph
-        self.steps_seen = []
-
-      def add_summary(self, summary, global_step):
-        summary_obj = summary_pb2.Summary()
-
-        # ensure a valid Summary proto is being sent
-        if isinstance(summary, bytes):
-          summary_obj.ParseFromString(summary)
-        else:
-          assert isinstance(summary, summary_pb2.Summary)
-          summary_obj = summary
-
-        # keep track of steps seen for the merged_summary op,
-        # which contains the histogram summaries
-        if len(summary_obj.value) > 1:
-          self.steps_seen.append(global_step)
-
-      def flush(self):
-        pass
-
-      def close(self):
-        pass
-
-    def _init_writer(obj, _):
-      obj.writer = FileWriterStub(obj.log_dir)
-
-    np.random.seed(1337)
-    tmpdir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
-    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-        train_samples=TRAIN_SAMPLES,
-        test_samples=TEST_SAMPLES,
-        input_shape=(INPUT_DIM,),
-        num_classes=NUM_CLASSES)
-    y_test = keras.utils.to_categorical(y_test)
-    y_train = keras.utils.to_categorical(y_train)
-
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(
-              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      # non_trainable_weights: moving_variance, moving_mean
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='sgd',
-          metrics=['accuracy'])
-      keras.callbacks.TensorBoard._init_writer = _init_writer
-      tsb = keras.callbacks.TensorBoard(
-          log_dir=tmpdir,
-          histogram_freq=1,
-          write_images=True,
-          write_grads=True,
-          batch_size=5)
-      cbks = [tsb]
-
-      # fit with validation data
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=3,
-          verbose=0)
-
-      self.assertAllEqual(tsb.writer.steps_seen, [0, 1, 2, 3, 4, 5])
-
-  @test_util.run_deprecated_v1
-  def test_Tensorboard_histogram_summaries_with_generator(self):
-    np.random.seed(1337)
-    tmpdir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
-
-    def generator():
-      x = np.random.randn(10, 100).astype(np.float32)
-      y = np.random.randn(10, 10).astype(np.float32)
-      while True:
-        yield x, y
-
-    with self.cached_session():
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=10, num_classes=10, input_dim=100)
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='sgd',
-          metrics=['accuracy'])
-      tsb = keras.callbacks.TensorBoard(
-          log_dir=tmpdir,
-          histogram_freq=1,
-          write_images=True,
-          write_grads=True,
-          batch_size=5)
-      cbks = [tsb]
-
-      # fit with validation generator
-      model.fit_generator(
-          generator(),
-          steps_per_epoch=2,
-          epochs=2,
-          validation_data=generator(),
-          validation_steps=2,
-          callbacks=cbks,
-          verbose=0)
-
-      with self.assertRaises(ValueError):
-        # fit with validation generator but no
-        # validation_steps
-        model.fit_generator(
-            generator(),
-            steps_per_epoch=2,
-            epochs=2,
-            validation_data=generator(),
-            callbacks=cbks,
-            verbose=0)
-
-      self.assertTrue(os.path.exists(tmpdir))
-
   @unittest.skipIf(
       os.name == 'nt',
       'use_multiprocessing=True does not work on windows properly.')
@@ -1187,209 +930,6 @@ class KerasCallbacksTest(test.TestCase):
       t.join()
       assert not t.is_alive()
 
-  def test_TensorBoard_with_ReduceLROnPlateau(self):
-    with self.cached_session():
-      temp_dir = self.get_temp_dir()
-      self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=TRAIN_SAMPLES,
-          test_samples=TEST_SAMPLES,
-          input_shape=(INPUT_DIM,),
-          num_classes=NUM_CLASSES)
-      y_test = keras.utils.to_categorical(y_test)
-      y_train = keras.utils.to_categorical(y_train)
-
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
-      model.compile(
-          loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
-
-      cbks = [
-          keras.callbacks.ReduceLROnPlateau(
-              monitor='val_loss', factor=0.5, patience=4, verbose=1),
-          keras.callbacks.TensorBoard(log_dir=temp_dir)
-      ]
-
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=2,
-          verbose=0)
-
-      assert os.path.exists(temp_dir)
-
-  @test_util.run_deprecated_v1
-  def test_Tensorboard_batch_logging(self):
-
-    class FileWriterStub(object):
-
-      def __init__(self, logdir, graph=None):
-        self.logdir = logdir
-        self.graph = graph
-        self.batches_logged = []
-        self.summary_values = []
-        self.summary_tags = []
-
-      def add_summary(self, summary, step):
-        self.summary_values.append(summary.value[0].simple_value)
-        self.summary_tags.append(summary.value[0].tag)
-        self.batches_logged.append(step)
-
-      def flush(self):
-        pass
-
-      def close(self):
-        pass
-
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='batch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
-
-    for batch in range(5):
-      tb_cbk.on_batch_end(batch, {'acc': batch})
-    self.assertEqual(tb_cbk.writer.batches_logged, [0, 1, 2, 3, 4])
-    self.assertEqual(tb_cbk.writer.summary_values, [0., 1., 2., 3., 4.])
-    self.assertEqual(tb_cbk.writer.summary_tags, ['batch_acc'] * 5)
-
-  @test_util.run_deprecated_v1
-  def test_Tensorboard_epoch_and_batch_logging(self):
-
-    class FileWriterStub(object):
-
-      def __init__(self, logdir, graph=None):
-        self.logdir = logdir
-        self.graph = graph
-
-      def add_summary(self, summary, step):
-        if 'batch_' in summary.value[0].tag:
-          self.batch_summary = (step, summary)
-        elif 'epoch_' in summary.value[0].tag:
-          self.epoch_summary = (step, summary)
-
-      def flush(self):
-        pass
-
-      def close(self):
-        pass
-
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='batch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
-
-    tb_cbk.on_batch_end(0, {'acc': 5.0})
-    batch_step, batch_summary = tb_cbk.writer.batch_summary
-    self.assertEqual(batch_step, 0)
-    self.assertEqual(batch_summary.value[0].simple_value, 5.0)
-
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='epoch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
-    tb_cbk.on_epoch_end(0, {'acc': 10.0})
-    epoch_step, epoch_summary = tb_cbk.writer.epoch_summary
-    self.assertEqual(epoch_step, 0)
-    self.assertEqual(epoch_summary.value[0].simple_value, 10.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_Tensorboard_eager(self):
-    temp_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
-    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-        train_samples=TRAIN_SAMPLES,
-        test_samples=TEST_SAMPLES,
-        input_shape=(INPUT_DIM,),
-        num_classes=NUM_CLASSES)
-    y_test = keras.utils.to_categorical(y_test)
-    y_train = keras.utils.to_categorical(y_train)
-
-    model = testing_utils.get_small_sequential_mlp(
-        num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
-    model.compile(
-        loss='binary_crossentropy',
-        optimizer=adam.AdamOptimizer(0.01),
-        metrics=['accuracy'])
-
-    cbks = [keras.callbacks.TensorBoard(log_dir=temp_dir)]
-
-    model.fit(
-        x_train,
-        y_train,
-        batch_size=BATCH_SIZE,
-        validation_data=(x_test, y_test),
-        callbacks=cbks,
-        epochs=2,
-        verbose=0)
-
-    self.assertTrue(os.path.exists(temp_dir))
-
-  @test_util.run_deprecated_v1
-  def test_TensorBoard_update_freq(self):
-
-    class FileWriterStub(object):
-
-      def __init__(self, logdir, graph=None):
-        self.logdir = logdir
-        self.graph = graph
-        self.batch_summaries = []
-        self.epoch_summaries = []
-
-      def add_summary(self, summary, step):
-        if 'batch_' in summary.value[0].tag:
-          self.batch_summaries.append((step, summary))
-        elif 'epoch_' in summary.value[0].tag:
-          self.epoch_summaries.append((step, summary))
-
-      def flush(self):
-        pass
-
-      def close(self):
-        pass
-
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
-    # Epoch mode
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='epoch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
-
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
-    self.assertEqual(tb_cbk.writer.batch_summaries, [])
-    tb_cbk.on_epoch_end(0, {'acc': 10.0, 'size': 1})
-    self.assertEqual(len(tb_cbk.writer.epoch_summaries), 1)
-
-    # Batch mode
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='batch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
-
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
-    self.assertFalse(tb_cbk.writer.epoch_summaries)
-
-    # Integer mode
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq=20)
-    tb_cbk.writer = FileWriterStub(temp_dir)
-
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
-    self.assertFalse(tb_cbk.writer.batch_summaries)
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
-    tb_cbk.on_batch_end(0, {'acc': 10.0, 'size': 10})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
-    self.assertFalse(tb_cbk.writer.epoch_summaries)
-
   def test_RemoteMonitorWithJsonPayload(self):
     if requests is None:
       self.skipTest('`requests` required to run this test')
@@ -1422,5 +962,217 @@ class KerasCallbacksTest(test.TestCase):
             epochs=1)
 
 
+# A summary that was emitted during a test. Fields:
+#   logdir: str. The logdir of the FileWriter to which the summary was
+#     written.
+#   tag: str. The name of the summary.
+_ObservedSummary = collections.namedtuple('_ObservedSummary', ('logdir', 'tag'))
+
+
+class _MockSummaryFile(object):
+  """Record summary tag names and the files to which they're written.
+
+  Fields `scalars`, `images`, and `histograms` are sets containing
+  `_ObservedSummary` values.
+  """
+
+  def __init__(self):
+    self.scalars = set()
+    self.images = set()
+    self.histograms = set()
+
+
+@tf_contextlib.contextmanager
+def _mock_summary_api():
+  summary_file = _MockSummaryFile()
+
+  # Keep track of the logdir associated with each created resource.
+  # (There doesn't seem to be an easy way to get this information after
+  # the fact.)
+  resource_logdirs = {}
+  real_create_file_writer = summary_ops_v2.create_file_writer
+
+  def mock_create_file_writer(logdir, *args, **kwargs):
+    writer = real_create_file_writer(logdir, *args, **kwargs)
+    resource = writer._resource
+    assert resource is not None
+    assert resource not in resource_logdirs, (resource, resource_logdirs)
+    resource_logdirs[resource] = logdir
+    return writer
+
+  def make_mock_summary(summary_set):
+
+    def mock_summary(tag, *args, **kwargs):
+      del args  # unused
+      del kwargs  # unused
+      resource = context.context().summary_writer_resource
+      logdir = resource_logdirs[resource]
+      summary_set.add(_ObservedSummary(logdir=logdir, tag=tag))
+
+    return mock_summary
+
+  with test.mock.patch.object(summary_ops_v2,
+                              'create_file_writer',
+                              mock_create_file_writer), \
+        test.mock.patch.object(summary_ops_v2,
+                               'scalar',
+                               make_mock_summary(summary_file.scalars)), \
+        test.mock.patch.object(summary_ops_v2,
+                               'histogram',
+                               make_mock_summary(summary_file.histograms)), \
+        test.mock.patch.object(summary_ops_v2,
+                               'image',
+                               make_mock_summary(summary_file.images)):
+    yield summary_file
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+class TestTensorBoardV2(keras_parameterized.TestCase):
+
+  def setUp(self):
+    super(TestTensorBoardV2, self).setUp()
+    self.logdir = os.path.join(self.get_temp_dir(), 'tb')
+    self.train_dir = os.path.join(self.logdir, 'train')
+    self.validation_dir = os.path.join(self.logdir, 'validation')
+
+  def _get_model(self):
+    layers = [
+        keras.layers.Conv2D(8, (3, 3)),
+        keras.layers.Flatten(),
+        keras.layers.Dense(1)
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(10, 10, 1))
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    return model
+
+  def test_TensorBoard_basic(self):
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard(self.logdir)
+
+    with _mock_summary_api() as summary_file:
+      model.fit(
+          x,
+          y,
+          batch_size=2,
+          epochs=2,
+          validation_data=(x, y),
+          callbacks=[tb_cbk])
+
+    self.assertEqual(
+        summary_file.scalars, {
+            _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'),
+            _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'),
+        })
+
+  def test_TensorBoard_batch_metrics(self):
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard(self.logdir, update_freq=1)
+
+    with _mock_summary_api() as summary_file:
+      model.fit(
+          x,
+          y,
+          batch_size=2,
+          epochs=2,
+          validation_data=(x, y),
+          callbacks=[tb_cbk])
+
+    self.assertEqual(
+        summary_file.scalars,
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='batch_loss'),
+            _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'),
+            _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'),
+        },
+    )
+
+  def test_TensorBoard_weight_histograms(self):
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    temp_dir = self.get_temp_dir() + '/tb'
+    tb_cbk = keras.callbacks.TensorBoard(temp_dir, histogram_freq=1)
+
+    with _mock_summary_api() as summary_file:
+      model.fit(
+          x,
+          y,
+          batch_size=2,
+          epochs=2,
+          validation_data=(x, y),
+          callbacks=[tb_cbk])
+
+    self.assertEqual(
+        summary_file.scalars,
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'),
+            _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'),
+        },
+    )
+    self.assertEqual(
+        self._strip_layer_names(summary_file.histograms),
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='bias_0'),
+            _ObservedSummary(logdir=self.train_dir, tag='kernel_0'),
+        },
+    )
+
+  def test_TensorBoard_weight_images(self):
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    temp_dir = self.get_temp_dir() + '/tb'
+    tb_cbk = keras.callbacks.TensorBoard(
+        temp_dir, histogram_freq=1, write_images=True)
+
+    with _mock_summary_api() as summary_file:
+      model.fit(
+          x,
+          y,
+          batch_size=2,
+          epochs=2,
+          validation_data=(x, y),
+          callbacks=[tb_cbk])
+
+    self.assertEqual(
+        summary_file.scalars,
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'),
+            _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'),
+        },
+    )
+    self.assertEqual(
+        self._strip_layer_names(summary_file.histograms),
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='bias_0'),
+            _ObservedSummary(logdir=self.train_dir, tag='kernel_0'),
+        },
+    )
+    self.assertEqual(
+        self._strip_layer_names(summary_file.images),
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='bias_0'),
+            _ObservedSummary(logdir=self.train_dir, tag='kernel_0'),
+        },
+    )
+
+  def _strip_layer_names(self, summaries):
+    """Deduplicate summary names modulo layer suffix.
+
+    Args:
+      summaries: A `set` of `_ObservedSummary` values.
+
+    Returns:
+      A new `set` of `_ObservedSummary` values with layer suffixes
+      removed.
+    """
+    return {s._replace(tag=s.tag[s.tag.rfind('/') + 1:]) for s in summaries}
+
+  def test_TensorBoard_invalid_argument(self):
+    with self.assertRaisesRegexp(ValueError, 'Unrecognized arguments'):
+      keras.callbacks.TensorBoard(wwrite_images=True)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/callbacks_v1.py b/tensorflow/python/keras/callbacks_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..980eacd356d996c4d5ecbde1e7d3a9cd2edc7e6c
--- /dev/null
+++ b/tensorflow/python/keras/callbacks_v1.py
@@ -0,0 +1,424 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=g-import-not-at-top
+"""Callbacks: utilities called at certain points during model training.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import callbacks
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.summary import summary as tf_summary
+from tensorflow.python.training import saver
+from tensorflow.python.util.tf_export import keras_export
+
+
+@keras_export(v1=['keras.callbacks.TensorBoard'])
+class TensorBoard(callbacks.Callback):
+  # pylint: disable=line-too-long
+  """TensorBoard basic visualizations.
+
+  This callback writes a log for TensorBoard, which allows
+  you to visualize dynamic graphs of your training and test
+  metrics, as well as activation histograms for the different
+  layers in your model.
+
+  TensorBoard is a visualization tool provided with TensorFlow.
+
+  If you have installed TensorFlow with pip, you should be able
+  to launch TensorBoard from the command line:
+
+  ```sh
+  tensorboard --logdir=/full_path_to_your_logs
+  ```
+
+  You can find more information about TensorBoard
+  [here](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
+
+  Arguments:
+      log_dir: the path of the directory where to save the log files to be
+        parsed by TensorBoard.
+      histogram_freq: frequency (in epochs) at which to compute activation and
+        weight histograms for the layers of the model. If set to 0, histograms
+        won't be computed. Validation data (or split) must be specified for
+        histogram visualizations.
+      write_graph: whether to visualize the graph in TensorBoard. The log file
+        can become quite large when write_graph is set to True.
+      write_grads: whether to visualize gradient histograms in TensorBoard.
+        `histogram_freq` must be greater than 0.
+      batch_size: size of batch of inputs to feed to the network for histograms
+        computation.
+      write_images: whether to write model weights to visualize as image in
+        TensorBoard.
+      embeddings_freq: frequency (in epochs) at which selected embedding layers
+        will be saved. If set to 0, embeddings won't be computed. Data to be
+        visualized in TensorBoard's Embedding tab must be passed as
+        `embeddings_data`.
+      embeddings_layer_names: a list of names of layers to keep eye on. If None
+        or empty list all the embedding layer will be watched.
+      embeddings_metadata: a dictionary which maps layer name to a file name in
+        which metadata for this embedding layer is saved. See the
+          [details](https://www.tensorflow.org/how_tos/embedding_viz/#metadata_optional)
+            about metadata files format. In case if the same metadata file is
+            used for all embedding layers, string can be passed.
+      embeddings_data: data to be embedded at layers specified in
+        `embeddings_layer_names`. Numpy array (if the model has a single input)
+        or list of Numpy arrays (if the model has multiple inputs). Learn [more
+        about
+            embeddings](https://www.tensorflow.org/programmers_guide/embedding)
+      update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`,
+        writes the losses and metrics to TensorBoard after each batch. The same
+        applies for `'epoch'`. If using an integer, let's say `1000`, the
+        callback will write the metrics and losses to TensorBoard every 1000
+        samples. Note that writing too frequently to TensorBoard can slow down
+        your training.
+
+  Raises:
+      ValueError: If histogram_freq is set and no validation data is provided.
+
+  @compatibility(eager)
+  Using the `TensorBoard` callback will work when eager execution is enabled,
+  with the restriction that outputting histogram summaries of weights and
+  gradients is not supported. Consequently, `histogram_freq` will be ignored.
+  @end_compatibility
+  """
+
+  # pylint: enable=line-too-long
+
+  def __init__(self,
+               log_dir='./logs',
+               histogram_freq=0,
+               batch_size=32,
+               write_graph=True,
+               write_grads=False,
+               write_images=False,
+               embeddings_freq=0,
+               embeddings_layer_names=None,
+               embeddings_metadata=None,
+               embeddings_data=None,
+               update_freq='epoch'):
+    super(TensorBoard, self).__init__()
+    self.log_dir = log_dir
+    self.histogram_freq = histogram_freq
+    if self.histogram_freq and context.executing_eagerly():
+      logging.warning(
+          UserWarning('Weight and gradient histograms not supported for eager'
+                      'execution, setting `histogram_freq` to `0`.'))
+      self.histogram_freq = 0
+    self.merged = None
+    self.write_graph = write_graph
+    self.write_grads = write_grads
+    self.write_images = write_images
+    self.batch_size = batch_size
+    self._current_batch = 0
+    self._total_batches_seen = 0
+    self._total_val_batches_seen = 0
+    self.embeddings_freq = embeddings_freq
+    self.embeddings_layer_names = embeddings_layer_names
+    self.embeddings_metadata = embeddings_metadata
+    self.embeddings_data = embeddings_data
+    if update_freq == 'batch':
+      self.update_freq = 1
+    else:
+      self.update_freq = update_freq
+    self._samples_seen = 0
+    self._samples_seen_at_last_write = 0
+
+  def _init_writer(self, model):
+    """Sets file writer."""
+    if context.executing_eagerly():
+      self.writer = summary_ops_v2.create_file_writer(self.log_dir)
+      if not model.run_eagerly and self.write_graph:
+        with self.writer.as_default():
+          summary_ops_v2.graph(K.get_graph())
+    elif self.write_graph:
+      self.writer = tf_summary.FileWriter(self.log_dir, K.get_graph())
+    else:
+      self.writer = tf_summary.FileWriter(self.log_dir)
+
+  def _make_histogram_ops(self, model):
+    """Defines histogram ops when histogram_freq > 0."""
+    # only make histogram summary op if it hasn't already been made
+    if self.histogram_freq and self.merged is None:
+      for layer in self.model.layers:
+        for weight in layer.weights:
+          mapped_weight_name = weight.name.replace(':', '_')
+          tf_summary.histogram(mapped_weight_name, weight)
+          if self.write_images:
+            w_img = array_ops.squeeze(weight)
+            shape = K.int_shape(w_img)
+            if len(shape) == 2:  # dense layer kernel case
+              if shape[0] > shape[1]:
+                w_img = array_ops.transpose(w_img)
+                shape = K.int_shape(w_img)
+              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
+            elif len(shape) == 3:  # convnet case
+              if K.image_data_format() == 'channels_last':
+                # switch to channels_first to display
+                # every kernel as a separate image
+                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
+                shape = K.int_shape(w_img)
+              w_img = array_ops.reshape(w_img,
+                                        [shape[0], shape[1], shape[2], 1])
+            elif len(shape) == 1:  # bias case
+              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
+            else:
+              # not possible to handle 3D convnets etc.
+              continue
+
+            shape = K.int_shape(w_img)
+            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
+            tf_summary.image(mapped_weight_name, w_img)
+
+        if self.write_grads:
+          for weight in layer.trainable_weights:
+            mapped_weight_name = weight.name.replace(':', '_')
+            grads = model.optimizer.get_gradients(model.total_loss, weight)
+
+            def is_indexed_slices(grad):
+              return type(grad).__name__ == 'IndexedSlices'
+
+            grads = [
+                grad.values if is_indexed_slices(grad) else grad
+                for grad in grads
+            ]
+            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
+
+        if hasattr(layer, 'output'):
+          if isinstance(layer.output, list):
+            for i, output in enumerate(layer.output):
+              tf_summary.histogram('{}_out_{}'.format(layer.name, i), output)
+          else:
+            tf_summary.histogram('{}_out'.format(layer.name), layer.output)
+
+  def set_model(self, model):
+    """Sets Keras model and creates summary ops."""
+
+    self.model = model
+    self._init_writer(model)
+    # histogram summaries only enabled in graph mode
+    if not context.executing_eagerly():
+      self._make_histogram_ops(model)
+      self.merged = tf_summary.merge_all()
+
+    # If both embedding_freq and embeddings_data are available, we will
+    # visualize embeddings.
+    if self.embeddings_freq and self.embeddings_data is not None:
+      # Avoid circular dependency.
+      from tensorflow.python.keras.engine import training_utils  # pylint: disable=g-import-not-at-top
+      self.embeddings_data = training_utils.standardize_input_data(
+          self.embeddings_data, model.input_names)
+
+      # If embedding_layer_names are not provided, get all of the embedding
+      # layers from the model.
+      embeddings_layer_names = self.embeddings_layer_names
+      if not embeddings_layer_names:
+        embeddings_layer_names = [
+            layer.name
+            for layer in self.model.layers
+            if type(layer).__name__ == 'Embedding'
+        ]
+
+      self.assign_embeddings = []
+      embeddings_vars = {}
+
+      self.batch_id = batch_id = array_ops.placeholder(dtypes.int32)
+      self.step = step = array_ops.placeholder(dtypes.int32)
+
+      for layer in self.model.layers:
+        if layer.name in embeddings_layer_names:
+          embedding_input = self.model.get_layer(layer.name).output
+          embedding_size = np.prod(embedding_input.shape[1:])
+          embedding_input = array_ops.reshape(embedding_input,
+                                              (step, int(embedding_size)))
+          shape = (self.embeddings_data[0].shape[0], int(embedding_size))
+          embedding = variables.Variable(
+              array_ops.zeros(shape), name=layer.name + '_embedding')
+          embeddings_vars[layer.name] = embedding
+          batch = state_ops.assign(embedding[batch_id:batch_id + step],
+                                   embedding_input)
+          self.assign_embeddings.append(batch)
+
+      self.saver = saver.Saver(list(embeddings_vars.values()))
+
+      # Create embeddings_metadata dictionary
+      if isinstance(self.embeddings_metadata, str):
+        embeddings_metadata = {
+            layer_name: self.embeddings_metadata
+            for layer_name in embeddings_vars.keys()
+        }
+      else:
+        # If embedding_metadata is already a dictionary
+        embeddings_metadata = self.embeddings_metadata
+
+      try:
+        from tensorboard.plugins import projector
+      except ImportError:
+        raise ImportError('Failed to import TensorBoard. Please make sure that '
+                          'TensorBoard integration is complete."')
+
+      # TODO(psv): Add integration tests to test embedding visualization
+      # with TensorBoard callback. We are unable to write a unit test for this
+      # because TensorBoard dependency assumes TensorFlow package is installed.
+      config = projector.ProjectorConfig()
+      for layer_name, tensor in embeddings_vars.items():
+        embedding = config.embeddings.add()
+        embedding.tensor_name = tensor.name
+
+        if (embeddings_metadata is not None and
+            layer_name in embeddings_metadata):
+          embedding.metadata_path = embeddings_metadata[layer_name]
+
+      projector.visualize_embeddings(self.writer, config)
+
+  def _fetch_callback(self, summary):
+    self.writer.add_summary(summary, self._total_val_batches_seen)
+    self._total_val_batches_seen += 1
+
+  def _write_custom_summaries(self, step, logs=None):
+    """Writes metrics out as custom scalar summaries.
+
+    Arguments:
+        step: the global step to use for TensorBoard.
+        logs: dict. Keys are scalar summary names, values are
+            NumPy scalars.
+
+    """
+    logs = logs or {}
+    if context.executing_eagerly():
+      # use v2 summary ops
+      with self.writer.as_default(), summary_ops_v2.always_record_summaries():
+        for name, value in logs.items():
+          if isinstance(value, np.ndarray):
+            value = value.item()
+          summary_ops_v2.scalar(name, value, step=step)
+    else:
+      # use FileWriter from v1 summary
+      for name, value in logs.items():
+        if isinstance(value, np.ndarray):
+          value = value.item()
+        summary = tf_summary.Summary()
+        summary_value = summary.value.add()
+        summary_value.simple_value = value
+        summary_value.tag = name
+        self.writer.add_summary(summary, step)
+    self.writer.flush()
+
+  def on_batch_end(self, batch, logs=None):
+    """Writes scalar summaries for metrics on every training batch."""
+    # Don't output batch_size and batch number as TensorBoard summaries
+    logs = logs or {}
+    self._samples_seen += logs.get('size', 1)
+    samples_seen_since = self._samples_seen - self._samples_seen_at_last_write
+    if self.update_freq != 'epoch' and samples_seen_since >= self.update_freq:
+      batch_logs = {('batch_' + k): v
+                    for k, v in logs.items()
+                    if k not in ['batch', 'size', 'num_steps']}
+      self._write_custom_summaries(self._total_batches_seen, batch_logs)
+      self._samples_seen_at_last_write = self._samples_seen
+    self._total_batches_seen += 1
+
+  def on_epoch_begin(self, epoch, logs=None):
+    """Add histogram op to Model eval_function callbacks, reset batch count."""
+
+    # check if histogram summary should be run for this epoch
+    if self.histogram_freq and epoch % self.histogram_freq == 0:
+      self._epoch = epoch
+      # pylint: disable=protected-access
+      # add the histogram summary op if it should run this epoch
+      self.model._make_eval_function()
+      if self.merged not in self.model._eval_function.fetches:
+        self.model._eval_function.fetches.append(self.merged)
+        self.model._eval_function.fetch_callbacks[
+            self.merged] = self._fetch_callback
+      # pylint: enable=protected-access
+
+  def on_epoch_end(self, epoch, logs=None):
+    """Checks if summary ops should run next epoch, logs scalar summaries."""
+
+    # don't output batch_size and
+    # batch number as TensorBoard summaries
+    logs = {('epoch_' + k): v
+            for k, v in logs.items()
+            if k not in ['batch', 'size', 'num_steps']}
+    if self.update_freq == 'epoch':
+      step = epoch
+    else:
+      step = self._samples_seen
+    self._write_custom_summaries(step, logs)
+
+    # pop the histogram summary op after each epoch
+    if self.histogram_freq:
+      # pylint: disable=protected-access
+      if self.merged in self.model._eval_function.fetches:
+        self.model._eval_function.fetches.remove(self.merged)
+      if self.merged in self.model._eval_function.fetch_callbacks:
+        self.model._eval_function.fetch_callbacks.pop(self.merged)
+      # pylint: enable=protected-access
+
+    if self.embeddings_data is None and self.embeddings_freq:
+      raise ValueError('To visualize embeddings, embeddings_data must '
+                       'be provided.')
+
+    if self.embeddings_freq and self.embeddings_data is not None:
+      if epoch % self.embeddings_freq == 0:
+        # We need a second forward-pass here because we're passing
+        # the `embeddings_data` explicitly. This design allows to pass
+        # arbitrary data as `embeddings_data` and results from the fact
+        # that we need to know the size of the `tf.Variable`s which
+        # hold the embeddings in `set_model`. At this point, however,
+        # the `validation_data` is not yet set.
+
+        embeddings_data = self.embeddings_data
+        n_samples = embeddings_data[0].shape[0]
+        i = 0
+        while i < n_samples:
+          step = min(self.batch_size, n_samples - i)
+          batch = slice(i, i + step)
+
+          if isinstance(self.model.input, list):
+            feed_dict = {
+                model_input: embeddings_data[idx][batch]
+                for idx, model_input in enumerate(self.model.input)
+            }
+          else:
+            feed_dict = {self.model.input: embeddings_data[0][batch]}
+
+          feed_dict.update({self.batch_id: i, self.step: step})
+
+          if not isinstance(K.learning_phase(), int):
+            feed_dict[K.learning_phase()] = False
+
+          self.sess.run(self.assign_embeddings, feed_dict=feed_dict)
+          self.saver.save(self.sess,
+                          os.path.join(self.log_dir, 'keras_embedding.ckpt'),
+                          epoch)
+
+          i += self.batch_size
+
+  def on_train_end(self, logs=None):
+    self.writer.close()
diff --git a/tensorflow/python/keras/callbacks_v1_test.py b/tensorflow/python/keras/callbacks_v1_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdb6af938b8a5b24048a92a12f674e1b15bc6763
--- /dev/null
+++ b/tensorflow/python/keras/callbacks_v1_test.py
@@ -0,0 +1,562 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras callbacks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+import numpy as np
+
+from tensorflow.core.framework import summary_pb2
+from tensorflow.python import keras
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import callbacks_v1
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+from tensorflow.python.training import adam
+
+
+TRAIN_SAMPLES = 10
+TEST_SAMPLES = 10
+NUM_CLASSES = 2
+INPUT_DIM = 3
+NUM_HIDDEN = 5
+BATCH_SIZE = 5
+
+
+class TestTensorBoardV1(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def test_TensorBoard(self):
+    np.random.seed(1337)
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
+
+    def data_generator(train):
+      if train:
+        max_batch_index = len(x_train) // BATCH_SIZE
+      else:
+        max_batch_index = len(x_test) // BATCH_SIZE
+      i = 0
+      while 1:
+        if train:
+          yield (x_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE],
+                 y_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE])
+        else:
+          yield (x_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE],
+                 y_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE])
+        i += 1
+        i %= max_batch_index
+
+    # case: Sequential
+    with self.cached_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      # non_trainable_weights: moving_variance, moving_mean
+      model.add(keras.layers.BatchNormalization())
+      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='sgd',
+          metrics=['accuracy'])
+      tsb = callbacks_v1.TensorBoard(
+          log_dir=temp_dir,
+          histogram_freq=1,
+          write_images=True,
+          write_grads=True,
+          batch_size=5)
+      cbks = [tsb]
+
+      # fit with validation data
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=3,
+          verbose=0)
+
+      # fit with validation data and accuracy
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=2,
+          verbose=0)
+
+      # fit generator with validation data
+      model.fit_generator(
+          data_generator(True),
+          len(x_train),
+          epochs=2,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          verbose=0)
+
+      # fit generator without validation data
+      # histogram_freq must be zero
+      tsb.histogram_freq = 0
+      model.fit_generator(
+          data_generator(True),
+          len(x_train),
+          epochs=2,
+          callbacks=cbks,
+          verbose=0)
+
+      # fit generator with validation data and accuracy
+      tsb.histogram_freq = 1
+      model.fit_generator(
+          data_generator(True),
+          len(x_train),
+          epochs=2,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          verbose=0)
+
+      # fit generator without validation data and accuracy
+      tsb.histogram_freq = 0
+      model.fit_generator(
+          data_generator(True), len(x_train), epochs=2, callbacks=cbks)
+      assert os.path.exists(temp_dir)
+
+  @test_util.run_deprecated_v1
+  def test_TensorBoard_multi_input_output(self):
+    np.random.seed(1337)
+    tmpdir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
+
+    with self.cached_session():
+      filepath = os.path.join(tmpdir, 'logs')
+
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
+      y_test = keras.utils.to_categorical(y_test)
+      y_train = keras.utils.to_categorical(y_train)
+
+      def data_generator(train):
+        if train:
+          max_batch_index = len(x_train) // BATCH_SIZE
+        else:
+          max_batch_index = len(x_test) // BATCH_SIZE
+        i = 0
+        while 1:
+          if train:
+            # simulate multi-input/output models
+            yield ([x_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2,
+                   [y_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2)
+          else:
+            yield ([x_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2,
+                   [y_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2)
+          i += 1
+          i %= max_batch_index
+
+      inp1 = keras.Input((INPUT_DIM,))
+      inp2 = keras.Input((INPUT_DIM,))
+      inp = keras.layers.add([inp1, inp2])
+      hidden = keras.layers.Dense(2, activation='relu')(inp)
+      hidden = keras.layers.Dropout(0.1)(hidden)
+      output1 = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
+      output2 = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
+      model = keras.models.Model([inp1, inp2], [output1, output2])
+      model.compile(loss='categorical_crossentropy',
+                    optimizer='sgd',
+                    metrics=['accuracy'])
+
+      # we must generate new callbacks for each test, as they aren't stateless
+      def callbacks_factory(histogram_freq):
+        return [
+            callbacks_v1.TensorBoard(
+                log_dir=filepath,
+                histogram_freq=histogram_freq,
+                write_images=True,
+                write_grads=True,
+                batch_size=5)
+        ]
+
+      # fit without validation data
+      model.fit([x_train] * 2, [y_train] * 2, batch_size=BATCH_SIZE,
+                callbacks=callbacks_factory(histogram_freq=0), epochs=3)
+
+      # fit with validation data and accuracy
+      model.fit([x_train] * 2, [y_train] * 2, batch_size=BATCH_SIZE,
+                validation_data=([x_test] * 2, [y_test] * 2),
+                callbacks=callbacks_factory(histogram_freq=1), epochs=2)
+
+      # fit generator without validation data
+      model.fit_generator(data_generator(True), len(x_train), epochs=2,
+                          callbacks=callbacks_factory(histogram_freq=0))
+
+      # fit generator with validation data and accuracy
+      model.fit_generator(data_generator(True), len(x_train), epochs=2,
+                          validation_data=([x_test] * 2, [y_test] * 2),
+                          callbacks=callbacks_factory(histogram_freq=1))
+      assert os.path.isdir(filepath)
+
+  @test_util.run_deprecated_v1
+  def test_Tensorboard_histogram_summaries_in_test_function(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+        self.steps_seen = []
+
+      def add_summary(self, summary, global_step):
+        summary_obj = summary_pb2.Summary()
+
+        # ensure a valid Summary proto is being sent
+        if isinstance(summary, bytes):
+          summary_obj.ParseFromString(summary)
+        else:
+          assert isinstance(summary, summary_pb2.Summary)
+          summary_obj = summary
+
+        # keep track of steps seen for the merged_summary op,
+        # which contains the histogram summaries
+        if len(summary_obj.value) > 1:
+          self.steps_seen.append(global_step)
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    def _init_writer(obj, _):
+      obj.writer = FileWriterStub(obj.log_dir)
+
+    np.random.seed(1337)
+    tmpdir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
+
+    with self.cached_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      # non_trainable_weights: moving_variance, moving_mean
+      model.add(keras.layers.BatchNormalization())
+      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='sgd',
+          metrics=['accuracy'])
+      callbacks_v1.TensorBoard._init_writer = _init_writer
+      tsb = callbacks_v1.TensorBoard(
+          log_dir=tmpdir,
+          histogram_freq=1,
+          write_images=True,
+          write_grads=True,
+          batch_size=5)
+      cbks = [tsb]
+
+      # fit with validation data
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=3,
+          verbose=0)
+
+      self.assertAllEqual(tsb.writer.steps_seen, [0, 1, 2, 3, 4, 5])
+
+  @test_util.run_deprecated_v1
+  def test_Tensorboard_histogram_summaries_with_generator(self):
+    np.random.seed(1337)
+    tmpdir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
+
+    def generator():
+      x = np.random.randn(10, 100).astype(np.float32)
+      y = np.random.randn(10, 10).astype(np.float32)
+      while True:
+        yield x, y
+
+    with self.cached_session():
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=10, num_classes=10, input_dim=100)
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='sgd',
+          metrics=['accuracy'])
+      tsb = callbacks_v1.TensorBoard(
+          log_dir=tmpdir,
+          histogram_freq=1,
+          write_images=True,
+          write_grads=True,
+          batch_size=5)
+      cbks = [tsb]
+
+      # fit with validation generator
+      model.fit_generator(
+          generator(),
+          steps_per_epoch=2,
+          epochs=2,
+          validation_data=generator(),
+          validation_steps=2,
+          callbacks=cbks,
+          verbose=0)
+
+      with self.assertRaises(ValueError):
+        # fit with validation generator but no
+        # validation_steps
+        model.fit_generator(
+            generator(),
+            steps_per_epoch=2,
+            epochs=2,
+            validation_data=generator(),
+            callbacks=cbks,
+            verbose=0)
+
+      self.assertTrue(os.path.exists(tmpdir))
+
+  def test_TensorBoard_with_ReduceLROnPlateau(self):
+    with self.cached_session():
+      temp_dir = self.get_temp_dir()
+      self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
+      y_test = keras.utils.to_categorical(y_test)
+      y_train = keras.utils.to_categorical(y_train)
+
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
+      model.compile(
+          loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
+
+      cbks = [
+          keras.callbacks.ReduceLROnPlateau(
+              monitor='val_loss', factor=0.5, patience=4, verbose=1),
+          callbacks_v1.TensorBoard(log_dir=temp_dir)
+      ]
+
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=2,
+          verbose=0)
+
+      assert os.path.exists(temp_dir)
+
+  @test_util.run_deprecated_v1
+  def test_Tensorboard_batch_logging(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+        self.batches_logged = []
+        self.summary_values = []
+        self.summary_tags = []
+
+      def add_summary(self, summary, step):
+        self.summary_values.append(summary.value[0].simple_value)
+        self.summary_tags.append(summary.value[0].tag)
+        self.batches_logged.append(step)
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='batch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    for batch in range(5):
+      tb_cbk.on_batch_end(batch, {'acc': batch})
+    self.assertEqual(tb_cbk.writer.batches_logged, [0, 1, 2, 3, 4])
+    self.assertEqual(tb_cbk.writer.summary_values, [0., 1., 2., 3., 4.])
+    self.assertEqual(tb_cbk.writer.summary_tags, ['batch_acc'] * 5)
+
+  @test_util.run_deprecated_v1
+  def test_Tensorboard_epoch_and_batch_logging(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+
+      def add_summary(self, summary, step):
+        if 'batch_' in summary.value[0].tag:
+          self.batch_summary = (step, summary)
+        elif 'epoch_' in summary.value[0].tag:
+          self.epoch_summary = (step, summary)
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='batch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0})
+    batch_step, batch_summary = tb_cbk.writer.batch_summary
+    self.assertEqual(batch_step, 0)
+    self.assertEqual(batch_summary.value[0].simple_value, 5.0)
+
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='epoch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+    tb_cbk.on_epoch_end(0, {'acc': 10.0})
+    epoch_step, epoch_summary = tb_cbk.writer.epoch_summary
+    self.assertEqual(epoch_step, 0)
+    self.assertEqual(epoch_summary.value[0].simple_value, 10.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_Tensorboard_eager(self):
+    temp_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
+
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
+    model.compile(
+        loss='binary_crossentropy',
+        optimizer=adam.AdamOptimizer(0.01),
+        metrics=['accuracy'])
+
+    cbks = [callbacks_v1.TensorBoard(log_dir=temp_dir)]
+
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=2,
+        verbose=0)
+
+    self.assertTrue(os.path.exists(temp_dir))
+
+  @test_util.run_deprecated_v1
+  def test_TensorBoard_update_freq(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+        self.batch_summaries = []
+        self.epoch_summaries = []
+
+      def add_summary(self, summary, step):
+        if 'batch_' in summary.value[0].tag:
+          self.batch_summaries.append((step, summary))
+        elif 'epoch_' in summary.value[0].tag:
+          self.epoch_summaries.append((step, summary))
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    # Epoch mode
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='epoch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
+    self.assertEqual(tb_cbk.writer.batch_summaries, [])
+    tb_cbk.on_epoch_end(0, {'acc': 10.0, 'size': 1})
+    self.assertEqual(len(tb_cbk.writer.epoch_summaries), 1)
+
+    # Batch mode
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='batch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
+    self.assertFalse(tb_cbk.writer.epoch_summaries)
+
+    # Integer mode
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq=20)
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertFalse(tb_cbk.writer.batch_summaries)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
+    tb_cbk.on_batch_end(0, {'acc': 10.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
+    self.assertFalse(tb_cbk.writer.epoch_summaries)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index e91dce5e5d17e092254c1bce64dc258caba91c48..9730ecbef568eb6cfc4ecaf81be0b96c5425d300 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -46,8 +46,9 @@ from tensorflow.python.keras.utils.tf_utils import is_tensor_or_tensor_list  # p
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables as tf_variables
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.checkpointable import layer_utils as checkpointable_layer_utils
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
@@ -57,7 +58,7 @@ from tensorflow.tools.docs import doc_controls
 
 
 @keras_export('keras.layers.Layer')
-class Layer(checkpointable.Checkpointable):
+class Layer(trackable.Trackable):
   """Base layer class.
 
   This is the class from which all layers inherit.
@@ -110,7 +111,7 @@ class Layer(checkpointable.Checkpointable):
       constraints on inputs that can be accepted by the layer.
   """
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def __init__(self, trainable=True, name=None, dtype=None, dynamic=False,
                **kwargs):
     # These properties should be set by the user via keyword arguments.
@@ -244,8 +245,8 @@ class Layer(checkpointable.Checkpointable):
 
   @doc_controls.for_subclass_implementers
   def add_weight(self,
-                 name,
-                 shape,
+                 name=None,
+                 shape=None,
                  dtype=None,
                  initializer=None,
                  regularizer=None,
@@ -259,8 +260,8 @@ class Layer(checkpointable.Checkpointable):
     """Adds a new variable to the layer.
 
     Arguments:
-      name: variable name.
-      shape: variable shape.
+      name: Variable name.
+      shape: Variable shape. Defaults to scalar if unspecified.
       dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
       initializer: initializer instance (callable).
       regularizer: regularizer instance (callable).
@@ -272,7 +273,7 @@ class Layer(checkpointable.Checkpointable):
         marked as non-trainable. `trainable` defaults to `True` unless
         `synchronization` is set to `ON_READ`.
       constraint: constraint instance (callable).
-      partitioner: Partitioner to be passed to the `Checkpointable` API.
+      partitioner: Partitioner to be passed to the `Trackable` API.
       use_resource: Whether to use `ResourceVariable`.
       synchronization: Indicates when a distributed a variable will be
         aggregated. Accepted values are constants defined in the class
@@ -297,6 +298,7 @@ class Layer(checkpointable.Checkpointable):
       ValueError: When giving unsupported dtype and no initializer or when
         trainable has been set to True with synchronization set as `ON_READ`.
     """
+    shape = shape or ()
     # Validate optional keyword arguments.
     for kwarg in kwargs:
       if kwarg not in ['getter', 'collections']:
@@ -344,9 +346,9 @@ class Layer(checkpointable.Checkpointable):
         name=name,
         shape=shape,
         # TODO(allenl): a `make_variable` equivalent should be added as a
-        # `Checkpointable` method.
+        # `Trackable` method.
         getter=getter or base_layer_utils.make_variable,
-        # Manage errors in Layer rather than Checkpointable.
+        # Manage errors in Layer rather than Trackable.
         overwrite=True,
         initializer=initializer,
         dtype=dtype,
@@ -363,8 +365,10 @@ class Layer(checkpointable.Checkpointable):
       # TODO(fchollet): in the future, this should be handled at the
       # level of variable creation, and weight regularization losses
       # should be variable attributes.
-      self._handle_weight_regularization(name, variable, regularizer)
-
+      name_in_scope = variable.name[:variable.name.find(':')]
+      self._handle_weight_regularization(name_in_scope,
+                                         variable,
+                                         regularizer)
     if trainable:
       self._trainable_weights.append(variable)
     else:
@@ -532,8 +536,6 @@ class Layer(checkpointable.Checkpointable):
       # framework.
       if base_layer_utils.needs_keras_history(inputs):
         base_layer_utils.create_keras_history(inputs)
-      # Do not track these Tensors in any sublayers invoked during `call`.
-      base_layer_utils.mark_checked(inputs)
 
     # Handle Keras mask propagation from previous layer to current layer.
     previous_mask = None
@@ -548,22 +550,28 @@ class Layer(checkpointable.Checkpointable):
         # pass to __call__, hence we set previous_mask as the default value.
         kwargs['mask'] = previous_mask
 
-    with ops.name_scope(self._name_scope()):
-      if not self.built:
-        # Build layer if applicable (if the `build` method has been overridden).
-        self._maybe_build(inputs)
-        # We must set self.built since user defined build functions are not
-        # constrained to set self.built.
-        self.built = True
-
+    with base_layer_utils.call_context():
       # Check input assumptions set after layer building, e.g. input shape.
       if build_graph:
         # Symbolic execution on symbolic tensors. We will attempt to build
         # the corresponding TF subgraph inside `backend.get_graph()`
-        input_spec.assert_input_compatibility(
-            self.input_spec, inputs, self.name)
+        input_spec.assert_input_compatibility(self.input_spec, inputs,
+                                              self.name)
         graph = backend.get_graph()
-        with graph.as_default():
+        with graph.as_default(), ops.name_scope(self._name_scope()):
+          # Build layer if applicable (if the `build` method has been
+          # overridden).
+          self._maybe_build(inputs)
+          # Explicitly pass the learning phase placeholder to `call` if
+          # the `training` argument was left unspecified by the user.
+          # This behavior is restricted to the managed Keras FuncGraph.
+          learning_phase_passed_by_framework = False
+          if (self._expects_training_arg and
+              not base_layer_utils.training_arg_passed_to_call(
+                  tf_inspect.getfullargspec(self.call), args, kwargs) and
+              getattr(graph, 'name', None) == 'keras_graph'):
+            learning_phase_passed_by_framework = True
+            kwargs['training'] = backend.learning_phase()
           if not self.dynamic:
             try:
               outputs = self.call(inputs, *args, **kwargs)
@@ -593,6 +601,8 @@ class Layer(checkpointable.Checkpointable):
                              'Tensor or a list of Tensors, not None '
                              '(layer: ' + self.name + ').')
           if base_layer_utils.have_all_keras_metadata(inputs):
+            if learning_phase_passed_by_framework:
+              kwargs.pop('training')
             inputs, outputs = self._set_connectivity_metadata_(
                 inputs, outputs, args, kwargs)
           self._handle_activity_regularization(inputs, outputs)
@@ -606,9 +616,11 @@ class Layer(checkpointable.Checkpointable):
             self._set_inputs(inputs, outputs)
       else:
         # Eager execution on data tensors.
-        outputs = self.call(inputs, *args, **kwargs)
-        self._handle_activity_regularization(inputs, outputs)
-        self._set_mask_metadata(inputs, outputs, previous_mask)
+        with ops.name_scope(self._name_scope()):
+          self._maybe_build(inputs)
+          outputs = self.call(inputs, *args, **kwargs)
+          self._handle_activity_regularization(inputs, outputs)
+          self._set_mask_metadata(inputs, outputs, previous_mask)
 
     if not context.executing_eagerly():
       # Optionally load weight values specified at layer instantiation.
@@ -685,7 +697,12 @@ class Layer(checkpointable.Checkpointable):
       A list of tensors.
     """
     collected_losses = []
-    if context.executing_eagerly():
+
+    # If any eager losses are present, we assume the model to be part of an
+    # eager training loop (either a custom one or the one used when
+    # `run_eagerly=True`), and so we always return just the eager losses in that
+    # case.
+    if self._eager_losses:
       collected_losses.extend(self._eager_losses)
     else:
       collected_losses.extend(self._losses)
@@ -716,6 +733,7 @@ class Layer(checkpointable.Checkpointable):
     Arguments:
       losses: Loss tensor, or list/tuple of tensors. Rather than tensors, losses
         may also be zero-argument callables which create a loss tensor.
+        Other types of input are ignored.
       inputs: Ignored when executing eagerly. If anything other than None is
         passed, it signals the losses are conditional on some of the layer's
         inputs, and thus they should only be run where these inputs are
@@ -741,10 +759,13 @@ class Layer(checkpointable.Checkpointable):
         self._callable_losses.append(
             functools.partial(_tag_unconditional, loss))
       else:
-        if context.executing_eagerly():
-          self._eager_losses.append(_tag_unconditional(loss))
-        else:
+        if not tensor_util.is_tensor(loss):
+          # Ignoring constant values as this does not affect the gradients.
+          return
+        if tf_utils.is_symbolic_tensor(loss):
           self._losses.append(_tag_unconditional(loss))
+        else:
+          self._eager_losses.append(_tag_unconditional(loss))
 
   @doc_controls.for_subclass_implementers
   def add_metric(self, value, aggregation=None, name=None):
@@ -757,7 +778,7 @@ class Layer(checkpointable.Checkpointable):
         already. eg, `model.add_metric(BinaryAccuracy(name='acc')(y_true,
         y_pred))`. If aggregation='mean', the given metric tensor will be
         sample-wise reduced using `mean` function. eg, `model.add_metric(
-        tf.reduce_mean(outputs), name='output_mean', aggregation='mean')`.
+        tf.reduce_sum(outputs), name='output_mean', aggregation='mean')`.
       name: String metric name.
 
     Raises:
@@ -768,8 +789,25 @@ class Layer(checkpointable.Checkpointable):
           'We currently support only `mean` sample-wise metric aggregation. '
           'You provided aggregation=`%s`' % aggregation)
 
-    if tf_utils.is_symbolic_tensor(value):
-      self._symbolic_add_metric(value, aggregation, name)
+    is_symbolic = tf_utils.is_symbolic_tensor(value)
+    if name is None and (not is_symbolic or not hasattr(value, '_metric_obj')):
+      # Eg. `self.add_metric(math_ops.reduce_sum(x), aggregation='mean')`
+      # In eager mode, we use metric name to lookup a metric. Without a name,
+      # a new Mean metric wrapper will be created on every model/layer call.
+      # So, we raise an error when no name is provided.
+      # We will do the same for symbolic mode for consistency although a name
+      # will be generated if no name is provided.
+
+      # We will not raise this error in the foll use case for the sake of
+      # consistency as name in provided in the metric constructor.
+      # model.add_metric(metrics.Mean(name='my_metric')(outputs))
+      raise ValueError('Please provide a name for your metric like '
+                       '`self.add_metric(tf.reduce_sum(inputs), '
+                       'name=\'mean_activation\', aggregation=\'mean\')`')
+
+    if is_symbolic:
+      with backend.get_graph().as_default():
+        self._symbolic_add_metric(value, aggregation, name)
     else:
       self._eager_add_metric(value, aggregation, name)
 
@@ -1291,9 +1329,10 @@ class Layer(checkpointable.Checkpointable):
       match(value)  # Update the metric state.
       return
     else:
-      if aggregation is None:
-        raise ValueError('We do not support adding an aggregated metric tensor '
-                         'in `call` in eager execution.')
+      # Aggregation will always be set in this use case. If not we will raise
+      # error on model/layer call in graph function mode when model/layer is
+      # created.
+      assert aggregation is not None
       metric_obj, _ = base_layer_utils.create_mean_metric(value, name)
       self._metrics.append(metric_obj)
 
@@ -1312,10 +1351,20 @@ class Layer(checkpointable.Checkpointable):
         else:
           raise ValueError(
               'We currently do not support reusing a metric instance.')
-      else:
+      elif hasattr(value, '_metric_obj'):
         # We track the instance using the metadata on the result tensor.
         result_tensor = value
         metric_obj = result_tensor._metric_obj
+      else:
+        raise ValueError(
+            'We do not support adding an aggregated metric result tensor that '
+            'is not the output of a `tf.keras.metrics.Metric` metric instance. '
+            'Without having access to the metric instance we cannot reset the '
+            'state of a metric after every epoch during training. You can '
+            'create a `tf.keras.metrics.Metric` instance and pass the result '
+            'here or pass an un-aggregated result with `aggregation` parameter '
+            'set as `mean`. For example: `self.add_metric(tf.reduce_sum(inputs)'
+            ', name=\'mean_activation\', aggregation=\'mean\')`')
     else:
       # If a non-aggregated tensor is given as input (ie. `aggregation` is
       # explicitly set to `mean`), we wrap the tensor in `Mean` metric.
@@ -1355,32 +1404,35 @@ class Layer(checkpointable.Checkpointable):
           self.add_loss(mean_activity_loss, inputs=inputs)
 
   def _set_mask_metadata(self, inputs, outputs, previous_mask):
-    if getattr(self, '_compute_output_and_mask_jointly', False):
-      # Mask is already computed for Keras Graph Networks.
-      return
-
     flat_outputs = nest.flatten(outputs)
-    if all(getattr(x, '_keras_mask', None) is not None for x in flat_outputs):
-      # Mask is already computed by sublayers.
-      return
-
-    if hasattr(self, 'compute_mask'):
-      output_masks = self.compute_mask(inputs, previous_mask)
-      # `compute_mask` can return a single `None` even when a Layer
-      # has multiple outputs.
-      if output_masks is None:
-        flat_masks = [None for _ in flat_outputs]
+    mask_already_computed = (
+        getattr(self, '_compute_output_and_mask_jointly', False) or
+        all(getattr(x, '_keras_mask', None) is not None for x in flat_outputs))
+
+    if not mask_already_computed:
+      if hasattr(self, 'compute_mask'):
+        output_masks = self.compute_mask(inputs, previous_mask)
+        # `compute_mask` can return a single `None` even when a Layer
+        # has multiple outputs.
+        if output_masks is None:
+          flat_masks = [None for _ in flat_outputs]
+        else:
+          flat_masks = nest.flatten(output_masks)
       else:
-        flat_masks = nest.flatten(output_masks)
-    else:
-      flat_masks = [None for _ in flat_outputs]
+        flat_masks = [None for _ in flat_outputs]
 
-    for output, mask in zip(flat_outputs, flat_masks):
-      try:
-        output._keras_mask = mask
-      except AttributeError:
-        # C Type such as np.ndarray.
-        pass
+      for output, mask in zip(flat_outputs, flat_masks):
+        try:
+          output._keras_mask = mask
+        except AttributeError:
+          # C Type such as np.ndarray.
+          pass
+
+    if tf_utils.are_all_symbolic_tensors(flat_outputs):
+      for output in flat_outputs:
+        if getattr(output, '_keras_mask', None) is not None:
+          # Do not track masks for `TensorFlowOpLayer` construction.
+          output._keras_mask._keras_history_checked = True
 
   def _set_connectivity_metadata_(self, inputs, outputs, args, kwargs):
     call_convention = getattr(
@@ -1575,6 +1627,9 @@ class Layer(checkpointable.Checkpointable):
 
   def _maybe_build(self, inputs):
     # Check input assumptions set before layer building, e.g. input rank.
+    if self.built:
+      return
+
     input_spec.assert_input_compatibility(
         self.input_spec, inputs, self.name)
     input_list = nest.flatten(inputs)
@@ -1589,6 +1644,9 @@ class Layer(checkpointable.Checkpointable):
     # Only call `build` if the user has manually overridden the build method.
     if not hasattr(self.build, '_is_default'):
       self.build(input_shapes)
+    # We must set self.built since user defined build functions are not
+    # constrained to set self.built.
+    self.built = True
 
   def _symbolic_call(self, inputs):
     input_shapes = nest.map_structure(lambda x: x.shape, inputs)
@@ -1607,12 +1665,16 @@ class Layer(checkpointable.Checkpointable):
       super(Layer, self).__setattr__(name, value)
       return
 
+    # Keep track of trackable objects, for the needs of `Network.save_weights`.
+    value = data_structures.sticky_attribute_assignment(
+        trackable=self, value=value, name=name)
+
     # Append value to self._layers if relevant
     if (isinstance(value, Layer) or
-        checkpointable_layer_utils.has_weights(value)):
+        trackable_layer_utils.has_weights(value)):
       # Initialize `_layers` here in case `__init__` has not yet been called.
       if not hasattr(self, '_layers'):
-        self._layers = []
+        super(Layer, self).__setattr__('_layers', [])
       # We need to check object identity to avoid de-duplicating empty
       # container types which compare equal.
       if not any((layer is value for layer in self._layers)):
@@ -1623,18 +1685,23 @@ class Layer(checkpointable.Checkpointable):
           value._use_resource_variables = True
 
     # Append value to list of trainable / non-trainable weights if relevant
-    if isinstance(value, tf_variables.Variable):
-      # Users may add extra weights/variables
-      # simply by assigning them to attributes (invalid for graph networks)
-      if not hasattr(self, '_trainable_weights'):
-        self._trainable_weights = []
-      if not hasattr(self, '_non_trainable_weights'):
-        self._non_trainable_weights = []
-      if value not in self._trainable_weights + self._non_trainable_weights:
-        if value.trainable:
-          self._trainable_weights.append(value)
-        else:
-          self._non_trainable_weights.append(value)
+    # TODO(b/125122625): This won't pick up on any variables added to a
+    # list/dict after creation.
+    for val in nest.flatten(value):
+      if isinstance(val, tf_variables.Variable):
+        # Users may add extra weights/variables
+        # simply by assigning them to attributes (invalid for graph networks)
+        if not hasattr(self, '_trainable_weights'):
+          super(Layer, self).__setattr__('_trainable_weights', [])
+        if not hasattr(self, '_non_trainable_weights'):
+          super(Layer, self).__setattr__('_non_trainable_weights', [])
+        if val not in self._trainable_weights + self._non_trainable_weights:
+          if val.trainable:
+            self._trainable_weights.append(val)
+          else:
+            self._non_trainable_weights.append(val)
+          backend.track_variable(val)
+
     super(Layer, self).__setattr__(name, value)
 
   def _gather_children_attribute(self, attribute):
@@ -1646,7 +1713,7 @@ class Layer(checkpointable.Checkpointable):
     return []
 
   # This is a hack so that the is_layer (within
-  # training/checkpointable/layer_utils.py) check doesn't get the weights attr.
+  # training/trackable/layer_utils.py) check doesn't get the weights attr.
   # TODO(b/110718070): Remove when fixed.
   def _is_layer(self):
     return True
@@ -1812,6 +1879,9 @@ class TensorFlowOpLayer(Layer):
         name=name, trainable=trainable, dtype=dtype)
     self.node_def = node_def_pb2.NodeDef.FromString(node_def)
     self.constants = constants or {}
+    # Layer uses original op unless it is called on new inputs.
+    # This means `built` is not set in `__call__`.
+    self.built = True
 
   def call(self, inputs):
     if context.executing_eagerly():
@@ -1827,6 +1897,10 @@ class TensorFlowOpLayer(Layer):
         inputs.insert(index, constant)
 
       self.node_def.name = graph.unique_name(self.node_def.name)
+      # Check for case where first input should be a list of Tensors.
+      if 'N' in self.node_def.attr:
+        num_tensors = self.node_def.attr['N'].i
+        inputs = [inputs[:num_tensors]] + inputs[num_tensors:]
       c_op = ops._create_c_op(graph, self.node_def, inputs, control_inputs=[])
       op = graph._create_op_from_tf_operation(c_op)
 
diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py
index faeb4c5f4024c7ef1d89ef20da596eb8f4234e1c..b4748d130113ed218dfe5c8375cbd39af96f6e23 100644
--- a/tensorflow/python/keras/engine/base_layer_test.py
+++ b/tensorflow/python/keras/engine/base_layer_test.py
@@ -25,6 +25,7 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
@@ -32,6 +33,7 @@ from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.optimizer_v2 import rmsprop
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -225,6 +227,28 @@ class BaseLayerTest(keras_parameterized.TestCase):
     model(np.zeros((2, 4), dtype='float32'))
     self.assertTrue(model.built)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_default_add_weight(self):
+
+    class TestLayer(keras.layers.Layer):
+
+      def __init__(self):
+        super(TestLayer, self).__init__()
+        self.default_weight = self.add_weight()
+        self.weight_without_name = self.add_weight(shape=(3, 4))
+        self.regularized_weight_without_name = self.add_weight(
+            shape=(3, 4), regularizer='l2')
+
+    layer = TestLayer()
+    self.assertEqual(layer.default_weight.shape.as_list(), [])
+    self.assertEqual(layer.weight_without_name.shape.as_list(), [3, 4])
+    self.assertEqual(layer.default_weight.dtype.name, 'float32')
+    self.assertEqual(layer.weight_without_name.dtype.name, 'float32')
+    self.assertEqual(len(layer.losses), 1)
+    if not context.executing_eagerly():
+      # Cannot access tensor.name in eager execution.
+      self.assertTrue('Variable_2/Regularizer' in layer.losses[0].name)
+
   def test_learning_phase_freezing_for_layers(self):
     # This test is only meant to run in graph functions mode (ambient eager).
     # In forced eager, `model.predict` ignores the global learning phase
@@ -257,6 +281,91 @@ class BaseLayerTest(keras_parameterized.TestCase):
     keras.backend.set_learning_phase(0)
     self.assertEqual(get_learning_phase_value(), 0)
 
+  # Cannot be enabled with `run_eagerly=True`, see b/123904578
+  @test_util.run_all_in_graph_and_eager_modes
+  def test_layer_can_return_variable(self):
+
+    class ComputeSum(keras.layers.Layer):
+
+      def __init__(self):
+        super(ComputeSum, self).__init__()
+        self.total = variables.Variable(
+            initial_value=array_ops.zeros((1, 1)), trainable=False)
+        if not context.executing_eagerly():
+          keras.backend.get_session().run(self.total.initializer)
+
+      def call(self, inputs):
+        self.total.assign_add(inputs)
+        return self.total
+
+    inputs = keras.Input(shape=(1,))
+    model = keras.Model(inputs, ComputeSum()(inputs))
+    model.predict(np.ones((1, 1)))
+
+  def _get_layer_with_training_arg(self):
+
+    class TrainingLayer(keras.layers.Layer):
+      """A layer with a `training` argument in a defuned `call`."""
+
+      @def_function.function
+      def call(self, inputs, training=None):
+        if training is None:
+          training = keras.backend.learning_phase()
+        return tf_utils.smart_cond(training,
+                                   lambda: array_ops.ones_like(inputs),
+                                   lambda: array_ops.zeros_like(inputs))
+
+    return TrainingLayer()
+
+  @keras_parameterized.run_with_all_model_types
+  # b/124459427: can't test with `run_eagerly=True` for now.
+  @test_util.run_in_graph_and_eager_modes
+  def test_training_arg_in_defun(self):
+    layer = self._get_layer_with_training_arg()
+    model = testing_utils.get_model_from_layers([layer], input_shape=(1,))
+    model.compile(rmsprop.RMSprop(0.),
+                  loss='mae')
+    history = model.fit(np.zeros((1, 1)), np.zeros((1, 1)))
+    self.assertEqual(history.history['loss'][0], 1.)
+    loss = model.evaluate(np.zeros((1, 1)), np.zeros((1, 1)))
+    self.assertEqual(loss, 0.)
+
+    # Test that the argument injection performed in `call` is not active
+    # when the argument is passed explicitly.
+    layer = self._get_layer_with_training_arg()
+    inputs = keras.Input(shape=(1,))
+    # Pass `training` by name
+    outputs = layer(inputs, training=False)
+    model = keras.Model(inputs, outputs)
+    model.compile(rmsprop.RMSprop(0.),
+                  loss='mae')
+    history = model.fit(np.zeros((1, 1)), np.zeros((1, 1)))
+    self.assertEqual(history.history['loss'][0], 0.)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_raw_variable_assignment(self):
+
+    class RawVariableLayer(keras.layers.Layer):
+
+      def __init__(self, **kwargs):
+        super(RawVariableLayer, self).__init__(**kwargs)
+        # Test variables in nested structure.
+        self.var_list = [variables.Variable(1.), {'a': variables.Variable(2.)}]
+
+      def call(self, inputs):
+        return inputs * self.var_list[0] * self.var_list[1]['a']
+
+    model = testing_utils.get_model_from_layers([RawVariableLayer()],
+                                                input_shape=(10,))
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    x, y = np.ones((10, 10)), np.ones((10, 10))
+    # Checks that variables get initialized.
+    model.fit(x, y, batch_size=2, epochs=2)
+
+
+class SymbolicSupportTest(test.TestCase):
+
   def test_using_symbolic_tensors_with_tf_ops(self):
     # Single-input.
     x = keras.Input((3,))
@@ -390,6 +499,9 @@ class NestedTrackingTest(test.TestCase):
     self.assertEqual(len(layer.weights), 8)
     self.assertEqual(len(layer.trainable_weights), 0)
     self.assertEqual(len(layer.non_trainable_weights), 8)
+    self.assertEqual(
+        set([layer.dense1, layer.dense2, layer.v1, layer.v2]),
+        set([obj for unused_name, obj in layer._checkpoint_dependencies]))
 
   def test_nested_layer_updates_losses_tracking(self):
     # Test that updates and losses from nested sublayers are
@@ -434,6 +546,34 @@ class NestedTrackingTest(test.TestCase):
       self.assertEqual(len(layer.updates), 3)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class NameScopingTest(keras_parameterized.TestCase):
+
+  def test_name_scope_layer(self):
+    x = keras.backend.placeholder(shape=(10, 10))
+    layer = keras.layers.Dense(10, name='MyName')
+    layer(x)
+    self.assertEqual(layer.bias.name, 'MyName/bias:0')
+    self.assertEqual(layer.kernel.name, 'MyName/kernel:0')
+
+  def test_name_scope_sublayer(self):
+    x = keras.backend.placeholder(shape=(10, 10))
+    layer = keras.layers.Dense(
+        10, activation=keras.layers.ReLU(name='MyAct'), name='MyName2')
+    y = layer(x)
+    self.assertEqual(layer.bias.name, 'MyName2/bias:0')
+    self.assertEqual(layer.kernel.name, 'MyName2/kernel:0')
+    self.assertEqual(y.name, 'MyName2/MyAct/Relu:0')
+
+  def test_name_scope_tf_tensor(self):
+    x = ops.convert_to_tensor(np.ones((10, 10)))
+    layer = keras.layers.Dense(
+        10, activation=keras.layers.ReLU(name='MyAct'), name='MyName3')
+    layer(x)
+    self.assertEqual(layer.bias.name, 'MyName3/bias:0')
+    self.assertEqual(layer.kernel.name, 'MyName3/kernel:0')
+
+
 if __name__ == '__main__':
   ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py
index 211e8482e407ae775abbca45552d9bd76572e401..19143dbbdf0f463936c6cb6ecec7a1d769feac18 100644
--- a/tensorflow/python/keras/engine/base_layer_utils.py
+++ b/tensorflow/python/keras/engine/base_layer_utils.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections as collections_lib
+import threading
 import enum
 
 from tensorflow.python.framework import dtypes
@@ -28,6 +29,9 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import init_ops_v2
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_contextlib
+
+_call_context = threading.local()
 
 
 class CallConvention(enum.Enum):
@@ -72,7 +76,7 @@ def make_variable(name,
   that has fewer constraints (`variable_scope.variable()`).
 
   In the longer term, it seems like a similar "default variable creator" method
-  should exist in `CheckpointableBase` instead. When this happens, we can get
+  should exist in `Trackable` instead. When this happens, we can get
   rid of this temporary solution.
 
   TODO(fchollet): remove this method when no longer needed.
@@ -277,7 +281,7 @@ def _create_keras_history_helper(tensors, processed_ops=None):
           # a constant (Variables currently have `Placeholder` op type
           # when originating from an eager context
           # so can't be supported.
-          constants[i] = backend.function([], [op_input])([])
+          constants[i] = backend.function([], op_input)([])
       processed_ops = _create_keras_history_helper(layer_inputs, processed_ops)
       name = op.name
       node_def = op.node_def.SerializeToString()
@@ -292,6 +296,11 @@ def _create_keras_history_helper(tensors, processed_ops=None):
 def needs_keras_history(tensors):
   """Check if any Tensors need to be wrapped in TensorFlowOpLayers.
 
+  This will never return True inside a sublayer, because sublayers
+  do not need to create Keras History. Otherwise, this returns True
+  if one or more of `tensors` originates from a `keras.Input` and
+  does not have `_keras_history` set.
+
   Arguments:
     tensors: An arbitrary nested structure of Tensors.
 
@@ -299,7 +308,7 @@ def needs_keras_history(tensors):
     Bool, whether at least one Tensor needs to be wrapped.
   """
   input_tensors = nest.flatten(tensors)
-  if all(
+  if getattr(_call_context, 'in_call', False) or all(
       getattr(tensor, '_keras_history', None) is not None
       for tensor in input_tensors):
     # KerasHistory already set.
@@ -362,3 +371,22 @@ def mark_checked(tensors):
     tensor._keras_history_checked = True  # pylint: disable=protected-access
 
   nest.map_structure(_mark_checked, tensors)
+
+
+@tf_contextlib.contextmanager
+def call_context():
+  """Scope that marks when we are currently inside a Layer/Model's `call`."""
+  was_in_call = getattr(_call_context, 'in_call', False)
+  _call_context.in_call = True
+  try:
+    yield
+  finally:
+    _call_context.in_call = was_in_call
+
+
+def training_arg_passed_to_call(argspec, args, kwargs):
+  """Returns whether a user passed the `training` argument in `__call__`."""
+  # `argspec.args` starts with ['self', 'inputs']
+  full_args = dict(zip(argspec.args[2:], args))
+  full_args.update(kwargs)
+  return 'training' in full_args
diff --git a/tensorflow/python/keras/engine/distributed_training_utils.py b/tensorflow/python/keras/engine/distributed_training_utils.py
index 91df7705d14244478371d81baf41a53d3828632a..0c9fdbb54c00d0aa597d5d8979bec27ddb6eb0aa 100644
--- a/tensorflow/python/keras/engine/distributed_training_utils.py
+++ b/tensorflow/python/keras/engine/distributed_training_utils.py
@@ -23,7 +23,7 @@ import numpy as np
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.distribute import distribute_coordinator_context as dc_context
-from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,12 +32,13 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.mode_keys import ModeKeys
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_contextlib
 
@@ -67,7 +68,7 @@ def set_weights(distribution_strategy, dist_model, weights):
     weights = weights[num_param:]
 
   if not ops.executing_eagerly_outside_functions():
-    K.get_session().run(assign_ops)
+    K.get_session(assign_ops).run(assign_ops)
 
 
 def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs,
@@ -104,7 +105,7 @@ def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs,
                                         grouped_inputs)
   if with_loss_tensor:
     # reduce loss tensor before adding it to the list of fetches
-    loss = distribution_strategy.reduce(distribute_lib.get_loss_reduction(),
+    loss = distribution_strategy.reduce(reduce_util.ReduceOp.SUM,
                                         grouped_outputs[0])
     all_outputs = flatten_perdevice_values(distribution_strategy,
                                            grouped_outputs[1:])
@@ -196,14 +197,14 @@ def validate_callbacks(input_callbacks, optimizer):
       # features of the callback that involve accessing model attributes and
       # running ops.
       if isinstance(callback, callbacks.TensorBoard):
-        if callback.__getattribute__('histogram_freq'):
+        if getattr(callback, 'histogram_freq', False):
           logging.warning(
               UserWarning(
                   '`histogram_freq` in the TensorBoard callback is not '
                   'supported when using DistributionStrategy. Setting '
                   '`histogram_freq` to `0`.'))
           callback.histogram_freq = 0
-        if callback.__getattribute__('write_grads'):
+        if getattr(callback, 'write_grads', False):
           logging.warning(
               UserWarning(
                   '`write_grads` in the TensorBoard callback is not supported '
@@ -522,7 +523,7 @@ def initialize_iterator(iterator, distribution_strategy):
   with distribution_strategy.scope():
     init_op = control_flow_ops.group(iterator.initialize())
     if not context.executing_eagerly():
-      K.get_session().run(init_op)
+      K.get_session((init_op,)).run(init_op)
 
 
 def _get_input_from_iterator(iterator, model):
@@ -563,6 +564,11 @@ def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
   inputs, targets, sample_weights = _get_input_from_iterator(inputs, model)
   inputs = flatten_perdevice_values(strategy, inputs)
   targets = flatten_perdevice_values(strategy, targets)
+  # Expand 1-dimensional inputs.
+  # TODO(b/124535720): Remove once this standarize data logic is shared with
+  # main flow.
+  inputs, targets = nest.map_structure(training_utils.standardize_single_array,
+                                       (inputs, targets))
   if mode == ModeKeys.PREDICT:
     sample_weights = []
     targets = []
@@ -723,21 +729,46 @@ def clone_model_on_replicas(model, strategy, mode, inputs=None, targets=None):
 
 
 def _make_execution_function(model, mode):
-  """Makes function to run one step of distributed model execution."""
+  """Makes or reuses function to run one step of distributed model execution."""
+  strategy = model._distribution_strategy
+
+  distributed_model = get_distributed_model(model, mode)
+  # If distributed model for a particular `mode` is already built, use the
+  # `_distribution_function` on that distributed model.
+  if distributed_model:
+    return distributed_model._distributed_function
+
+  # If distributed_model is not built, create one for `mode`.
+  if model._compile_distribution:
+    clone_model_on_replicas(model, strategy, mode)
+  else:
+    _build_distributed_network(model, strategy, mode)
+
+  # We've just created the distributed model. So `distributed_model` should be
+  # not None.
+  distributed_model = get_distributed_model(model, mode)
+  assert distributed_model
+
+  # Also create an execution fuction on that distributed model.
   if context.executing_eagerly():
-    return _make_eager_execution_function(model, mode)
+    distributed_function = _make_eager_execution_function(model, mode)
+  else:
+    distributed_function = _make_graph_execution_function(model, mode)
+
+  # We cache the distributed execution function on the model since creating
+  # distributed models and exection functions are expensive.
+  distributed_model._distributed_function = distributed_function
+  return distributed_function
 
-  strategy = model._distribution_strategy
-  if not get_distributed_model(model, mode):
-    if model._compile_distribution:
-      clone_model_on_replicas(model, strategy, mode)
-    else:
-      _build_distributed_network(model, strategy, mode)
+
+def _make_graph_execution_function(model, mode):
+  """Makes function to run one step of distributed model in graph mode."""
 
   def _per_device_function(model):
     f = model._make_execution_function(mode)
     return (f.inputs, f.outputs, f.updates_op, f.session_kwargs)
 
+  strategy = model._distribution_strategy
   with strategy.scope():
     # Create train ops on each of the devices when we call
     # `_per_device_fit_function`.
@@ -773,35 +804,38 @@ def _make_execution_function(model, mode):
 
 def _make_eager_execution_function(model, mode):
   """Makes function to run one step of distributed model eager execution."""
-  strategy = model._distribution_strategy
-  if not get_distributed_model(model, mode):
-    if model._compile_distribution:
-      clone_model_on_replicas(model, strategy, mode)
-    else:
-      _build_distributed_network(model, strategy, mode)
-
   def _per_device_function(model):
     f = model._make_execution_function(mode)
     return (f.inputs, f.outputs)
 
   # NOTE(priyag): Try creating a new FuncGraph within DS scope instead of using
   # the global one.
-  with K.get_graph().as_default(), strategy.scope():
-    # Create train ops on each of the devices when we call
-    # `_per_device_fit_function`.
-    (grouped_inputs, grouped_outputs) = strategy.extended.call_for_each_replica(
-        _per_device_function, args=(get_distributed_model(model, mode),))
-
-    # Unwrap all the per device values returned from `call_for_each_replica`.
-    # Unwrapping per device values gives you a list of values that can be
-    # used to construct a new train function that is composed of inptus/outputs
-    # on all the devices over which the model is distributed.
-    (all_inputs, all_outputs, _, _) = unwrap_values(
-        strategy,
-        grouped_inputs,
-        grouped_outputs,
-        with_loss_tensor=(mode != ModeKeys.PREDICT))
-
+  strategy = model._distribution_strategy
+  global_graph = K.get_graph()
+
+  with global_graph.as_default(), strategy.scope():
+    # First we gather the relevant portions of the model across all replicas.
+    # `K._scratch_graph(global_graph)` signals to Keras that it should not
+    # lift to a separate graph when creating the per-replica functions.
+    with K._scratch_graph(global_graph):
+      # Create train ops on each of the devices when we call
+      # `_per_device_fit_function`.
+      grouped = strategy.extended.call_for_each_replica(
+          _per_device_function, args=(get_distributed_model(model, mode),))
+      grouped_inputs, grouped_outputs = grouped
+
+      # Unwrap all the per device values returned from `call_for_each_replica`.
+      # Unwrapping per device values gives you a list of values that can be
+      # used to construct a new train function that is composed of
+      # inputs/outputs on all the devices over which the model is distributed.
+      (all_inputs, all_outputs, _, _) = unwrap_values(
+          strategy,
+          grouped_inputs,
+          grouped_outputs,
+          with_loss_tensor=(mode != ModeKeys.PREDICT))
+
+    # Finally, a joint Keras function is created; this one will be created in
+    # a separate FuncGraph.
     return K.function(
         all_inputs,
         all_outputs,
@@ -869,3 +903,38 @@ def _generate_cache_key(mode):
 def distributed_scope(strategy, learning_phase):
   with strategy.scope(), K.learning_phase_scope(learning_phase):
     yield
+
+
+def filter_distributed_callbacks(callbacks_list):
+  """Filter Callbacks based on the worker context when running multi-worker.
+
+  Arguments:
+    callbacks_list: A list of `Callback` instances.
+
+  Returns:
+    The list of `Callback` instances that should be run on this worker.
+  """
+
+  if not K.in_multi_worker_mode():
+    raise ValueError(
+        'filter_distributed_callbacks() should only be called when Keras '
+        'is in multi worker mode.')
+
+  worker_context = dc_context.get_current_worker_context()
+  callbacks_list = callbacks_list or []
+  if not [
+      c for c in callbacks_list if isinstance(c, callbacks.ModelCheckpoint)
+  ]:
+    # TODO(rchao): Consider providing a ModelCheckpoint here if the user
+    # fails to.
+    logging.warning('ModelCheckpoint callback is not provided. '
+                    'Workers will need to restart training if any fails.')
+  # TODO(rchao): Add similar warning for restoring callback (to be designed).
+
+  if callbacks_list is None or worker_context.is_chief:
+    return callbacks_list
+
+  # Some Callbacks should only run on the chief worker.
+  return [
+      callback for callback in callbacks_list if not callback._chief_worker_only
+  ]  # pylint: disable=protected-access
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 0835e7c0833379496fe387368a9c89376551f4df..a6fdfad833d17ef2f49c707aab4b35215919647b 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 import copy
 import json
 import os
-import weakref
 
 import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
@@ -45,10 +44,10 @@ from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import layer_utils as checkpointable_layer_utils
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
@@ -153,7 +152,7 @@ class Network(base_layer.Layer):
   # empty lists shouldn't cause issues; adding or removing them will not break
   # checkpoints, but may cause "all Python objects matched" assertions to fail
   # (in which case less strict assertions may be substituted if necessary).
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def _base_init(self, name=None):
     # The following are implemented as property functions:
     # self.trainable_weights
@@ -171,7 +170,7 @@ class Network(base_layer.Layer):
     self._is_compiled = False
     self._expects_training_arg = False
 
-    # This is True for Sequential networks and graph networks.
+    # This is True for Sequential networks and Functional networks.
     self._compute_output_and_mask_jointly = False
 
     self.supports_masking = False
@@ -207,10 +206,10 @@ class Network(base_layer.Layer):
     self._outbound_nodes = []
     self._inbound_nodes = []
 
-    self._checkpointable_saver = checkpointable_utils.CheckpointableSaver(
-        weakref.ref(self))
+    self._trackable_saver = (
+        trackable_utils.saver_with_op_caching(self))
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def _init_graph_network(self, inputs, outputs, name=None):
     self._call_convention = (base_layer_utils
                              .CallConvention.EXPLICIT_INPUTS_ARGUMENT)
@@ -239,6 +238,9 @@ class Network(base_layer.Layer):
     self._compute_output_and_mask_jointly = True
     self._is_graph_network = True
     self._dynamic = False
+    # `_expects_training_arg` is True since the `training` argument is always
+    # present in the signature of the `call` method of a graph network.
+    self._expects_training_arg = True
 
     self._input_layers = []
     self._output_layers = []
@@ -307,7 +309,7 @@ class Network(base_layer.Layer):
     for layer in self._output_layers:
       self.output_names.append(layer.name)
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def _init_subclassed_network(self, name=None, dynamic=False):
     self._base_init(name=name)
     self._is_graph_network = False
@@ -368,20 +370,20 @@ class Network(base_layer.Layer):
       return base_layer_utils.CallConvention.POSITIONAL_ARGUMENTS_ARE_INPUTS
 
   def _track_layers(self, layers):
-    """Add Checkpointable dependencies on a list of Layers."""
+    """Add Trackable dependencies on a list of Layers."""
     weight_layer_index = 0
     for layer_index, layer in enumerate(layers):
       if layer.weights:
         # Keep a separate index for layers which have weights. This allows users
         # to insert Layers without weights anywhere in the network without
         # breaking checkpoints.
-        self._track_checkpointable(
+        self._track_trackable(
             layer, name='layer_with_weights-%d' % weight_layer_index,
             overwrite=True)
         weight_layer_index += 1
       # Even if it doesn't have weights, we should still track everything in
-      # case it has/will have Checkpointable dependencies.
-      self._track_checkpointable(
+      # case it has/will have Trackable dependencies.
+      self._track_trackable(
           layer, name='layer-%d' % layer_index, overwrite=True)
 
   def __setattr__(self, name, value):
@@ -391,18 +393,15 @@ class Network(base_layer.Layer):
 
     if all(
         isinstance(v, (base_layer.Layer,
-                       data_structures.CheckpointableDataStructure)) or
-        checkpointable_layer_utils.has_weights(v) for v in nest.flatten(value)):
+                       data_structures.TrackableDataStructure)) or
+        trackable_layer_utils.has_weights(v) for v in nest.flatten(value)):
       try:
         self._is_graph_network
       except AttributeError:
         raise RuntimeError('It looks like you are subclassing `Model` and you '
                            'forgot to call `super(YourClass, self).__init__()`.'
                            ' Always start with this line.')
-    # Keep track of checkpointable objects,
-    # for the needs of `self.save/save_weights`.
-    value = data_structures.sticky_attribute_assignment(
-        checkpointable=self, value=value, name=name)
+
     super(Network, self).__setattr__(name, value)
 
     # Keep track of metric instance created in subclassed model/layer.
@@ -479,7 +478,7 @@ class Network(base_layer.Layer):
 
   @property
   def layers(self):
-    return checkpointable_layer_utils.filter_empty_layer_containers(
+    return trackable_layer_utils.filter_empty_layer_containers(
         self._layers)
 
   def get_layer(self, name=None, index=None):
@@ -529,7 +528,12 @@ class Network(base_layer.Layer):
   @property
   def _unfiltered_losses(self):
     losses = []
-    if context.executing_eagerly():
+
+    # If any eager losses are present, we assume the model to be part of an
+    # eager training loop (either a custom one or the one used when
+    # `run_eagerly=True`), and so we always return just the eager losses in that
+    # case.
+    if self._eager_losses:
       losses.extend(self._eager_losses)
     else:
       losses.extend(self._losses)
@@ -540,7 +544,7 @@ class Network(base_layer.Layer):
         losses += layer.losses
     return losses
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def _clear_losses(self):
     """Used every step in eager to reset losses."""
     self._eager_losses = []
@@ -680,14 +684,14 @@ class Network(base_layer.Layer):
 
   @property
   def trainable_weights(self):
-    return checkpointable_layer_utils.gather_trainable_weights(
+    return trackable_layer_utils.gather_trainable_weights(
         trainable=self.trainable,
         sub_layers=self._layers,
         extra_variables=self._trainable_weights)
 
   @property
   def non_trainable_weights(self):
-    return checkpointable_layer_utils.gather_non_trainable_weights(
+    return trackable_layer_utils.gather_non_trainable_weights(
         trainable=self.trainable,
         sub_layers=self._layers,
         extra_variables=self._non_trainable_weights + self._trainable_weights)
@@ -724,7 +728,7 @@ class Network(base_layer.Layer):
         A list of `InputSpec` instances (one per input to the model)
             or a single instance if the model has only one input.
     """
-    # If not a graph network, can't assume anything.
+    # If subclassed model, can't assume anything.
     if not self._is_graph_network:
       return None
 
@@ -1299,7 +1303,10 @@ class Network(base_layer.Layer):
     """
     if not self._is_graph_network:
       raise NotImplementedError(
-          'Currently `save` requires model to be a graph network. Consider '
+          'The `save` method requires the model to be a Functional model or a '
+          'Sequential model. It does not work for subclassed models, '
+          'because such models are defined via the body of a Python method, '
+          'which isn\'t safely serializable. Consider '
           'using `save_weights`, in order to save the weights of the model.')
 
     from tensorflow.python.keras.models import save_model  # pylint: disable=g-import-not-at-top
@@ -1392,7 +1399,7 @@ class Network(base_layer.Layer):
         session = backend.get_session()
       optimizer = getattr(self, 'optimizer', None)
       if (optimizer
-          and not isinstance(optimizer, checkpointable.Checkpointable)):
+          and not isinstance(optimizer, trackable.Trackable)):
         logging.warning(
             ('This model was compiled with a Keras optimizer (%s) but is being '
              'saved in TensorFlow format with `save_weights`. The model\'s '
@@ -1400,7 +1407,7 @@ class Network(base_layer.Layer):
              'the TensorFlow format the optimizer\'s state will not be '
              'saved.\n\nConsider using a TensorFlow optimizer from `tf.train`.')
             % (optimizer,))
-      self._checkpointable_saver.save(filepath, session=session)
+      self._trackable_saver.save(filepath, session=session)
       # Record this checkpoint so it's visible from tf.train.latest_checkpoint.
       checkpoint_management.update_checkpoint_state_internal(
           save_dir=os.path.dirname(filepath),
@@ -1459,7 +1466,7 @@ class Network(base_layer.Layer):
         # The checkpoint is not readable in TensorFlow format. Try HDF5.
         save_format = 'h5'
     if save_format == 'tf':
-      status = self._checkpointable_saver.restore(filepath)
+      status = self._trackable_saver.restore(filepath)
       if by_name:
         raise NotImplementedError(
             'Weights may only be loaded based on topology into Models when '
@@ -1469,7 +1476,7 @@ class Network(base_layer.Layer):
         session = backend.get_session()
         # Restore existing variables (if any) immediately, and set up a
         # streaming restore for any variables created in the future.
-        checkpointable_utils.streaming_restore(status=status, session=session)
+        trackable_utils.streaming_restore(status=status, session=session)
       status.assert_nontrivial_match()
       return status
     if h5py is None:
@@ -1830,3 +1837,4 @@ def _map_graph_network(inputs, outputs):
                        str(all_names.count(name)) + ' times in the model. '
                        'All layer names should be unique.')
   return network_nodes, nodes_by_depth, layers, layers_by_depth
+
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 671bfe9c280cb949eb0779094860635e5894955e..6c8f5c2f3984f4a445e43dc12f5817b25a3d63a4 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -28,7 +28,7 @@ from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import layer_utils
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import keras_export
@@ -93,7 +93,7 @@ class Sequential(training.Model):
   ```
   """
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def __init__(self, layers=None, name=None):
     super(Sequential, self).__init__(name=name)
     self.supports_masking = True
@@ -112,7 +112,7 @@ class Sequential(training.Model):
     # Historically, `sequential.layers` only returns layers that were added
     # via `add`, and omits the auto-generated `InputLayer` that comes at the
     # bottom of the stack.
-    # `CheckpointableBase` manages the `_layers` attributes and does filtering
+    # `Trackable` manages the `_layers` attributes and does filtering
     # over it.
     layers = super(Sequential, self).layers
     if layers and isinstance(layers[0], input_layer.InputLayer):
@@ -123,7 +123,7 @@ class Sequential(training.Model):
   def dynamic(self):
     return any(layer.dynamic for layer in self.layers)
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def add(self, layer):
     """Adds a layer instance on top of the layer stack.
 
@@ -138,6 +138,14 @@ class Sequential(training.Model):
             multiple output tensors, or is already connected
             somewhere else (forbidden in `Sequential` models).
     """
+    # If we are passed a Keras tensor created by keras.Input(), we can extract
+    # the input layer from its keras history and use that without any loss of
+    # generality.
+    if hasattr(layer, '_keras_history'):
+      origin_layer = layer._keras_history[0]
+      if isinstance(origin_layer, input_layer.InputLayer):
+        layer = origin_layer
+
     if not isinstance(layer, base_layer.Layer):
       raise TypeError('The added layer must be '
                       'an instance of class Layer. '
@@ -193,7 +201,7 @@ class Sequential(training.Model):
 
     self._layer_call_argspecs[layer] = tf_inspect.getfullargspec(layer.call)
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def pop(self):
     """Removes the last layer in the model.
 
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index a65c200ce66070bd4d713bfc1ebda8d32f7435c3..afd7d230f9a8e69ed45e374de90216580de5a367 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -47,6 +47,18 @@ class TestSequential(keras_parameterized.TestCase):
     self.assertEqual(len(model.weights), 2 * 2)
     self.assertEqual(model.get_layer(name='dp').name, 'dp')
 
+  @keras_parameterized.run_all_keras_modes
+  def test_input_defined_first_layer(self):
+    model = keras.models.Sequential()
+    model.add(keras.Input(shape=(2,), name='input_layer'))
+    model.add(keras.layers.Dense(1))
+    model.add(keras.layers.Dropout(0.3, name='dp'))
+    model.add(keras.layers.Dense(2, kernel_regularizer='l2',
+                                 kernel_constraint='max_norm'))
+    self.assertLen(model.layers, 3)
+    self.assertLen(model.weights, 2 * 2)
+    self.assertEqual(model.get_layer(name='dp').name, 'dp')
+
   @keras_parameterized.run_all_keras_modes
   def test_sequential_pop(self):
     num_hidden = 5
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 6b4dadf2f444b49477314925c3306ad78dcd311d..ee5baf4553ea9657c68e99c03a86703b255b425a 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -41,16 +41,14 @@ from tensorflow.python.keras.engine import training_eager
 from tensorflow.python.keras.engine import training_generator
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.engine.network import Network
-from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.keras.saving import saving_utils
 from tensorflow.python.keras.utils import data_utils
+from tensorflow.python.keras.utils import losses_utils
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
-from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import optimizer as tf_optimizer_module
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.mode_keys import ModeKeys
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
@@ -143,7 +141,7 @@ class Model(Network):
         return super(Model, self).get_weights()
     return super(Model, self).get_weights()
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def compile(self,
               optimizer,
               loss=None,
@@ -159,17 +157,20 @@ class Model(Network):
     Arguments:
         optimizer: String (name of optimizer) or optimizer instance.
             See `tf.keras.optimizers`.
-        loss: String (name of objective function) or objective function.
-            See `tf.losses`. If the model has multiple outputs, you can use a
-            different loss on each output by passing a dictionary or a list of
-            losses. The loss value that will be minimized by the model
-            will then be the sum of all individual losses.
-        metrics: List of metrics to be evaluated by the model
-            during training and testing.
-            Typically you will use `metrics=['accuracy']`.
+        loss: String (name of objective function), objective function or
+            `tf.losses.Loss` instance. See `tf.losses`. If the model has
+            multiple outputs, you can use a different loss on each output by
+            passing a dictionary or a list of losses. The loss value that will
+            be minimized by the model will then be the sum of all individual
+            losses.
+        metrics: List of metrics to be evaluated by the model during training
+            and testing. Typically you will use `metrics=['accuracy']`.
             To specify different metrics for different outputs of a
-            multi-output model, you could also pass a dictionary,
-            such as `metrics={'output_a': 'accuracy'}`.
+            multi-output model, you could also pass a dictionary, such as
+            `metrics={'output_a': 'accuracy', 'output_b': ['accuracy', 'mse']}`.
+            You can also pass a list (len = len(outputs)) of lists of metrics
+            such as `metrics=[['accuracy'], ['accuracy', 'mse']]` or
+            `metrics=['accuracy', ['accuracy', 'mse']]`.
         loss_weights: Optional list or dictionary specifying scalar
             coefficients (Python floats) to weight the loss contributions
             of different model outputs.
@@ -230,12 +231,6 @@ class Model(Network):
     # Validate that arguments passed by the user to `compile` are supported by
     # DistributionStrategy.
     if self._distribution_strategy:
-      if not isinstance(optimizer,
-                        (tf_optimizer_module.Optimizer, optimizers.TFOptimizer,
-                         optimizer_v2.OptimizerV2)):
-        raise NotImplementedError(
-            'optimizer must be an instance of '
-            'tf.train.Optimizer, not a %s' % type(optimizer))
       if sample_weight_mode:
         raise NotImplementedError('sample_weight_mode is not supported with '
                                   'DistributionStrategy.')
@@ -247,19 +242,12 @@ class Model(Network):
                          'DistributionStrategy.')
 
     loss = loss or {}
-    if self.run_eagerly and not isinstance(
-        optimizer, (tf_optimizer_module.Optimizer, optimizers.TFOptimizer,
-                    optimizer_v2.OptimizerV2)):
-      raise ValueError(
-          'When running a model in eager execution, the optimizer must be an '
-          'instance of tf.train.Optimizer. Received: '
-          '%s' % optimizer)
 
     self.optimizer = optimizer
     # We've disabled automatic dependency tracking for this method, but do want
-    # to add a checkpoint dependency on the optimizer if it's checkpointable.
-    if isinstance(self.optimizer, checkpointable.Checkpointable):
-      self._track_checkpointable(
+    # to add a checkpoint dependency on the optimizer if it's trackable.
+    if isinstance(self.optimizer, trackable.Trackable):
+      self._track_trackable(
           self.optimizer, name='optimizer', overwrite=True)
     self.loss = loss
     self._compile_metrics = metrics or []
@@ -288,77 +276,30 @@ class Model(Network):
       return
     self._is_compiled = True
 
-    # Prepare loss functions.
-    if isinstance(loss, dict):
-      for name in loss:
-        if name not in self.output_names:
-          raise ValueError(
-              'Unknown entry in loss '
-              'dictionary: "' + name + '". '
-              'Only expected the following keys: ' + str(self.output_names))
-      loss_functions = []
-      for name in self.output_names:
-        if name not in loss:
-          logging.warning(
-              'Output "' + name +
-              '" missing from loss dictionary. We assume '
-              'this was done on purpose. The fit and evaluate APIs will not be '
-              'expecting any data to be passed to "' + name + '".')
-        loss_functions.append(training_utils.get_loss_function(loss.get(name)))
-    elif isinstance(loss, list):
-      if len(loss) != len(self.outputs):
-        raise ValueError('When passing a list as loss, '
-                         'it should have one entry per model outputs. '
-                         'The model has ' + str(len(self.outputs)) +
-                         ' outputs, but you passed loss=' + str(loss))
-      loss_functions = [training_utils.get_loss_function(l) for l in loss]
-    else:
-      loss_function = training_utils.get_loss_function(loss)
-      loss_functions = [loss_function for _ in range(len(self.outputs))]
-    self.loss_functions = loss_functions
+    # Prepare list of loss functions, same size of model outputs.
+    self.loss_functions = training_utils.prepare_loss_functions(
+        loss, self.output_names)
 
-    skip_target_indices = []
-    skip_target_weighing_indices = []
     self._feed_outputs = []
     self._feed_output_names = []
     self._feed_output_shapes = []
     self._feed_loss_fns = []
-    for i in range(len(loss_functions)):
-      if loss_functions[i] is None:
+    # if loss function is None, then this output will be skipped during total
+    # loss calculation and feed targets preparation.
+    skip_target_indices = []
+    skip_target_weighing_indices = []
+    for i, loss_function in enumerate(self.loss_functions):
+      if loss_function is None:
         skip_target_indices.append(i)
         skip_target_weighing_indices.append(i)
 
     # Prepare output masks.
     if not self.run_eagerly:
       masks = [getattr(x, '_keras_mask', None) for x in self.outputs]
-      if not isinstance(masks, list):
-        masks = [masks]
-
-    # Prepare loss weights.
-    if loss_weights is None:
-      loss_weights_list = [1. for _ in range(len(self.outputs))]
-    elif isinstance(loss_weights, dict):
-      for name in loss_weights:
-        if name not in self.output_names:
-          raise ValueError(
-              'Unknown entry in loss_weights '
-              'dictionary: "' + name + '". '
-              'Only expected the following keys: ' + str(self.output_names))
-      loss_weights_list = []
-      for name in self.output_names:
-        loss_weights_list.append(loss_weights.get(name, 1.))
-    elif isinstance(loss_weights, list):
-      if len(loss_weights) != len(self.outputs):
-        raise ValueError(
-            'When passing a list as loss_weights, '
-            'it should have one entry per model output. '
-            'The model has ' + str(len(self.outputs)) +
-            ' outputs, but you passed loss_weights=' + str(loss_weights))
-      loss_weights_list = loss_weights
-    else:
-      raise TypeError('Could not interpret loss_weights argument: ' +
-                      str(loss_weights) + ' - expected a list of dicts.')
-    self.loss_weights_list = loss_weights_list
+
+    # Prepare list loss weights, same size of model outputs.
+    self.loss_weights_list = training_utils.prepare_loss_weights(
+        self.output_names, loss_weights)
 
     # Initialization for Eager mode execution.
     if self.run_eagerly:
@@ -372,15 +313,9 @@ class Model(Network):
         raise ValueError('target_tensors are not currently supported in Eager '
                          'mode.')
       self.total_loss = None
-      for i in range(len(self.outputs)):
-        if len(self.outputs) > 1:
-          self._compile_metrics_names.append(self.output_names[i] + '_loss')
 
       # Set metric attributes on model.
-      self._set_metric_attributes(
-          self.outputs,
-          skip_target_indices=skip_target_indices,
-      )
+      self._set_metric_attributes(skip_target_indices=skip_target_indices)
 
       self.targets = []
       for i in range(len(self.outputs)):
@@ -453,68 +388,9 @@ class Model(Network):
       # Save all metric attributes per output of the model.
       self._cache_output_metric_attributes(metrics, weighted_metrics)
 
-      # Compute total loss.
-      total_loss = None
-      with K.name_scope('loss'):
-        for i in range(len(self.outputs)):
-          if i in skip_target_indices:
-            continue
-          y_true = self.targets[i]
-          y_pred = self.outputs[i]
-          loss_fn = loss_functions[i]
-          sample_weight = self.sample_weights[i]
-          mask = masks[i]
-          loss_weight = loss_weights_list[i]
-          with K.name_scope(self.output_names[i] + '_loss'):
-            if mask is not None:
-              mask = math_ops.cast(mask, y_pred.dtype)
-              # Update weights with mask.
-              if sample_weight is None:
-                sample_weight = mask
-              else:
-                # Update dimensions of weights to match with mask if possible.
-                mask, _, sample_weight = squeeze_or_expand_dimensions(
-                    mask, None, sample_weight)
-                sample_weight *= mask
-
-            output_loss = loss_fn(y_true, y_pred, sample_weight=sample_weight)
-
-          if len(self.outputs) > 1:
-            # Keep track of the un-aggregated loss result tensor.
-            self._compile_metrics_tensors[self.output_names[i] +
-                                          '_loss'] = output_loss
-
-            # Keep track of stateful result tensor and function for the loss.
-            mean_wrapped_loss = metrics_module.MeanMetricWrapper(
-                loss_fn, name=loss_fn.name)
-            result_tensor = self._call_metric_fn(mean_wrapped_loss, y_true,
-                                                 y_pred, sample_weight, mask)
-            self._compile_stateful_metrics_tensors[self.output_names[i] +
-                                                   '_loss'] = result_tensor
-            self._compile_stateful_metric_functions.append(mean_wrapped_loss)
-
-            self._compile_metrics_names.append(self.output_names[i] + '_loss')
-          if total_loss is None:
-            total_loss = loss_weight * output_loss
-          else:
-            total_loss += loss_weight * output_loss
-        if total_loss is None:
-          if not self.losses:
-            raise ValueError('The model cannot be compiled '
-                             'because it has no loss to optimize.')
-          else:
-            total_loss = 0.
-
-        # Add regularization penalties
-        # and other layer-specific losses.
-        for loss_tensor in self.losses:
-          total_loss += loss_tensor
-
       # Set metric attributes on model.
-      self._set_metric_attributes(
-          self.outputs,
-          skip_target_indices=skip_target_indices,
-      )
+      self._set_metric_attributes(skip_target_indices=skip_target_indices)
+
       # Invoke metric functions for all the outputs.
       self._handle_metrics(
           self.outputs,
@@ -523,8 +399,12 @@ class Model(Network):
           skip_target_indices=skip_target_indices,
           sample_weights=self.sample_weights)
 
-      # Prepare gradient updates and state updates.
-      self.total_loss = total_loss
+      # Compute total loss.
+      # Used to keep track of the total loss value (stateless).
+      # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
+      #                   loss_weight_2 * output_2_loss_fn(...) +
+      #                   layer losses.
+      self.total_loss = self._prepare_total_loss(skip_target_indices, masks)
 
       # Functions for train, test and predict will
       # be compiled lazily when required.
@@ -533,6 +413,7 @@ class Model(Network):
 
       self._fit_function = None
       self._eval_function = None
+      self._predict_function = None
       self.train_function = None
       self.test_function = None
       self.predict_function = None
@@ -793,6 +674,8 @@ class Model(Network):
         # servers via the Distribute Coordinator.
         def _worker_fn(_):
           """Run training inside the distributed coordinator."""
+          filtered_callbacks = distributed_training_utils \
+              .filter_distributed_callbacks(callbacks)
           return training_distributed.fit_distributed(
               self,
               x=x,
@@ -800,7 +683,7 @@ class Model(Network):
               batch_size=batch_size,
               epochs=epochs,
               verbose=verbose,
-              callbacks=callbacks,
+              callbacks=filtered_callbacks,
               validation_split=validation_split,
               validation_data=validation_data,
               shuffle=shuffle,
@@ -1052,7 +935,9 @@ class Model(Network):
         # Multi-Worker mode runs the Keras evaluation loop on multiple
         # servers via the Distribute Coordinator.
         def _worker_fn(_):
-          """Run training inside the distributed coordinator."""
+          """Run evaluation inside the distributed coordinator."""
+          filtered_callbacks = distributed_training_utils \
+              .filter_distributed_callbacks(callbacks)
           return training_distributed.evaluate_distributed(
               self,
               x=x,
@@ -1061,7 +946,7 @@ class Model(Network):
               verbose=verbose,
               sample_weight=sample_weight,
               steps=steps,
-              callbacks=callbacks)
+              callbacks=filtered_callbacks)
 
         # Independent worker only for now.
         return dc.run_distribute_coordinator(
@@ -1255,8 +1140,16 @@ class Model(Network):
     if hasattr(self, 'metrics'):
       for m in self.metrics:
         m.reset_states()
-      if self._distribution_strategy:
-        distributed_training_utils._reset_metrics(self)  # pylint: disable=protected-access
+
+    # Reset the state of loss metric wrappers.
+    if hasattr(
+        self, '_output_loss_metrics') and self._output_loss_metrics is not None:
+      for m in self._output_loss_metrics:
+        m.reset_states()
+
+    # Reset metrics on all the distributed (cloned) models.
+    if self._distribution_strategy:
+      distributed_training_utils._reset_metrics(self)  # pylint: disable=protected-access
 
   def train_on_batch(self,
                      x,
@@ -1315,7 +1208,12 @@ class Model(Network):
 
     if self.run_eagerly:
       outputs = training_eager.train_on_batch(
-          self, x, y, sample_weights=sample_weights)
+          self,
+          x,
+          y,
+          sample_weights=sample_weights,
+          reset_metrics=reset_metrics,
+          output_loss_metrics=self._output_loss_metrics)
     else:
       if not isinstance(K.symbolic_learning_phase(), int):
         ins = x + y + sample_weights + [True]
@@ -1384,7 +1282,12 @@ class Model(Network):
 
     if self.run_eagerly:
       outputs = training_eager.test_on_batch(
-          self, x, y, sample_weights=sample_weights)
+          self,
+          x,
+          y,
+          sample_weights=sample_weights,
+          reset_metrics=reset_metrics,
+          output_loss_metrics=self._output_loss_metrics)
     else:
       inputs = x + y + sample_weights
       if reset_metrics:
@@ -1700,6 +1603,105 @@ class Model(Network):
         verbose=verbose,
         callbacks=callbacks)
 
+  def _prepare_total_loss(self, skip_target_indices=None, masks=None):
+    """Computes total loss from loss functions.
+
+    Arguments:
+        skip_target_indices: A list of indices of model outputs where loss
+          function is None.
+        masks: List of mask values corresponding to each model output.
+
+    Returns:
+        A list of loss weights of python floats.
+
+    Raises:
+        TypeError: If model run_eagerly is True.
+    """
+    if self.run_eagerly:
+      raise TypeError('total loss can not be computed when compiled with '
+                      'run_eagerly = True.')
+    skip_target_indices = skip_target_indices or []
+    total_loss = None
+    with K.name_scope('loss'):
+      zipped_inputs = zip(self.targets, self.outputs, self.loss_functions,
+                          self.sample_weights, masks, self.loss_weights_list)
+      for i, (y_true, y_pred, loss_fn, sample_weight, mask,
+              loss_weight) in enumerate(zipped_inputs):
+        if i in skip_target_indices:
+          continue
+        loss_name = self.output_names[i] + '_loss'
+        with K.name_scope(loss_name):
+          if mask is not None:
+            mask = math_ops.cast(mask, y_pred.dtype)
+            # Update weights with mask.
+            if sample_weight is None:
+              sample_weight = mask
+            else:
+              # Update dimensions of weights to match with mask if possible.
+              mask, _, sample_weight = (
+                  losses_utils.squeeze_or_expand_dimensions(
+                      mask, None, sample_weight))
+              sample_weight *= mask
+
+          # Reset reduction on the loss so that we can get the per sample loss
+          # value. We use this to get both the stateless and stateful loss
+          # values without having to compute the underlying loss function
+          # twice.
+          weighted_losses = None
+          if hasattr(loss_fn, 'reduction'):
+            current_loss_reduction = loss_fn.reduction
+            loss_fn.reduction = losses_utils.ReductionV2.NONE
+            weighted_losses = loss_fn(
+                y_true, y_pred, sample_weight=sample_weight)
+            loss_fn.reduction = current_loss_reduction
+
+            # Compute the stateless loss value.
+            output_loss = losses_utils.reduce_weighted_loss(
+                weighted_losses, reduction=current_loss_reduction)
+          else:
+            # Compute the stateless loss value for a custom loss class.
+            # Here we assume that the class takes care of loss reduction
+            # because if this class returns a vector value we cannot
+            # differentiate between use case where a custom optimizer
+            # expects a vector loss value vs unreduced per-sample loss value.
+            output_loss = loss_fn(y_true, y_pred, sample_weight=sample_weight)
+
+        if len(self.outputs) > 1:
+          # Keep track of the un-aggregated loss result tensor.
+          self._compile_metrics_tensors[loss_name] = output_loss
+
+          # Keep track of stateful result tensor and function for the loss.
+          # Compute the stateful loss value.
+          if weighted_losses is not None:
+            # TODO(b/120571621): Directly call metric when the bug is fixed.
+            aggregated_output_loss = self._call_fn_for_each_replica(
+                self._output_loss_metrics[i], weighted_losses)
+          else:
+            # Custom loss class.
+            aggregated_output_loss = self._call_metric_fn(
+                self._output_loss_metrics[i], y_true, y_pred, sample_weight)
+          self._compile_stateful_metrics_tensors[
+              loss_name] = aggregated_output_loss
+          self._compile_stateful_metric_functions.append(
+              self._output_loss_metrics[i])
+
+        if total_loss is None:
+          total_loss = loss_weight * output_loss
+        else:
+          total_loss += loss_weight * output_loss
+      if total_loss is None:
+        if not self.losses:
+          raise ValueError('The model cannot be compiled '
+                           'because it has no loss to optimize.')
+        else:
+          total_loss = 0.
+
+      # Add regularization penalties and other layer-specific losses.
+      if self.losses:
+        total_loss += losses_utils.scale_loss_for_distribution(
+            math_ops.add_n(self.losses))
+    return total_loss
+
   def _get_callback_model(self):
     """Returns the Callback Model for this Model."""
 
@@ -1779,9 +1781,10 @@ class Model(Network):
       batch_size = 32
     return batch_size
 
-  @property
-  def _default_save_signature(self):
-    return saving_utils.trace_model_call(self)
+  def _list_functions_for_serialization(self):
+    return {
+        '_default_save_signature': saving_utils.trace_model_call(self)
+    }
 
   def _set_sample_weight_attributes(self, sample_weight_mode,
                                     skip_target_weighing_indices):
@@ -1811,10 +1814,13 @@ class Model(Network):
         output_shapes.append(output.shape.as_list())
     self._per_output_metrics = training_utils.collect_per_output_metric_info(
         metrics, self.output_names, output_shapes, self.loss_functions)
-    self._per_output_weighted_metrics = \
+    self._per_output_weighted_metrics = (
         training_utils.collect_per_output_metric_info(
-            weighted_metrics, self.output_names, output_shapes,
-            self.loss_functions, self.sample_weights)
+            weighted_metrics,
+            self.output_names,
+            output_shapes,
+            self.loss_functions,
+            is_weighted=True))
 
   def _add_unique_metric_name(self, metric_name, output_index):
     """Makes the metric name unique and adds it to the model's metric name list.
@@ -1861,19 +1867,19 @@ class Model(Network):
 
   def _init_metric_attributes(self):
     """Initialized model metric attributes."""
-    # List of all metric names in the model.
+    # List of all metric names in the model. This includes loss metrics.
     self._compile_metrics_names = ['loss']
     # List of stateful metric functions. Used for resetting metric state during
-    # training/eval.
-    # This includes loss functions when there are multiple outputs.
+    # training/eval. This includes loss metric functions.
     self._compile_stateful_metric_functions = []
     # Dict of all aggregated metric result tensors. This includes aggregated
-    # loss result tensors when there are multiple outputs.
+    # loss result tensors.
     self._compile_stateful_metrics_tensors = {}
     # Dict of all metric result tensors (aggregated or not - based on the
-    # values given in compile.). This includes aggregated loss result tensors
-    # when there are multiple outputs.
+    # values given in compile.). This includes aggregated loss result tensors.
     self._compile_metrics_tensors = {}
+    # List of metric wrappers on output losses.
+    self._output_loss_metrics = None
 
   def _set_per_output_metric_attributes(self, metrics_dict, output_index):
     """Sets the metric attributes on the model for the given output.
@@ -1889,18 +1895,30 @@ class Model(Network):
     updated_metrics_dict = collections.OrderedDict()
     for metric_name, (metric_fn, stateful_metric_fn) in metrics_dict.items():
       metric_name = self._add_unique_metric_name(metric_name, output_index)
+
+      # Update the name on the metric class to be the unique generated name.
+      stateful_metric_fn._name = metric_name  # pylint: disable=protected-access
       updated_metrics_dict[metric_name] = (metric_fn, stateful_metric_fn)
       # Keep track of metric name, function and stateful function.
       self._compile_metrics_names.append(metric_name)
       self._compile_stateful_metric_functions.append(stateful_metric_fn)
     return updated_metrics_dict
 
-  def _set_metric_attributes(self, outputs, skip_target_indices=None):
+  def _set_metric_attributes(self, skip_target_indices=None):
     """Sets the metric attributes on the model for all the model outputs."""
+    # Add loss metric names to the model metric names list.
+    if len(self.outputs) > 1:
+      output_names = [
+          self.output_names[i] + '_loss'
+          for i in range(len(self.outputs))
+          if i not in skip_target_indices
+      ]
+      self._compile_metrics_names.extend(output_names)
+
     skip_target_indices = skip_target_indices or []
     updated_per_output_metrics = []
     updated_per_output_weighted_metrics = []
-    for i in range(len(outputs)):
+    for i in range(len(self.outputs)):
       if i in skip_target_indices:
         updated_per_output_metrics.append(self._per_output_metrics[i])
         updated_per_output_weighted_metrics.append(
@@ -1913,11 +1931,29 @@ class Model(Network):
           self._set_per_output_metric_attributes(
               self._per_output_weighted_metrics[i], i))
 
+    # Create a metric wrapper for each output loss.
+    if len(self.outputs) > 1:
+      self._output_loss_metrics = [
+          metrics_module.SumOverBatchSize() if hasattr(loss_fn, 'reduction')
+          else metrics_module.SumOverBatchSizeMetricWrapper(loss_fn)
+          for loss_fn in self.loss_functions
+      ]
+
     self._per_output_metrics = updated_per_output_metrics
     self._per_output_weighted_metrics = updated_per_output_weighted_metrics
 
-  def _call_metric_fn(self, fn, y_true, y_pred, weights, mask):
+  def _call_metric_fn(self, metric_fn, y_true, y_pred, weights, mask=None):
+    # TODO(b/120571621): Remove this function when the bug is fixed.
     """Helper function to call metric function with distribution strategy."""
+    return self._call_fn_for_each_replica(
+        training_utils.call_metric_function,
+        metric_fn,
+        y_true,
+        y_pred,
+        weights=weights,
+        mask=mask)
+
+  def _call_fn_for_each_replica(self, fn, *args, **kwargs):
     # TODO(b/120571621): We want to avoid metric reductions here since
     # since TPUStrategy does not implement replica local variables.
     # Remove this hack once we support TPUReplicaLocalVariables.
@@ -1927,10 +1963,8 @@ class Model(Network):
         distribution_strategy_context.in_cross_replica_context()):
       with self._distribution_strategy.scope():
         return self._distribution_strategy.extended.call_for_each_replica(
-            training_utils.call_metric_function,
-            (fn, y_true, y_pred, weights, mask))
-    return training_utils.call_metric_function(
-        fn, y_true, y_pred, weights=weights, mask=mask)
+            fn, args, kwargs)
+    return fn(*args, **kwargs)
 
   def _handle_per_output_metrics(self,
                                  metrics_dict,
@@ -2065,7 +2099,7 @@ class Model(Network):
           ' trainable weights, did you set `model.trainable`'
           ' without calling `model.compile` after ?', 1)
 
-  def _make_train_function_helper(self, fn_name, outputs, metric_updates=None):
+  def _make_train_function_helper(self, fn_name, outputs):
     if not self._is_compiled:
       raise RuntimeError('You must compile your model before using it.')
     self._check_trainable_weights_consistency()
@@ -2086,9 +2120,6 @@ class Model(Network):
       updates += self.get_updates_for(None)
       # Conditional updates relevant to this model
       updates += self.get_updates_for(self.inputs)
-      # Add stateful metrics updates.
-      if metric_updates is not None:
-        updates += metric_updates
 
       with K.name_scope('training'):
         # Gets loss and metrics. Updates weights at each call.
@@ -2114,7 +2145,7 @@ class Model(Network):
     self._make_train_function_helper(
         '_fit_function', [self.total_loss] + metrics_tensors)
 
-  def _make_test_function_helper(self, fn_name, outputs, metric_updates=None):
+  def _make_test_function_helper(self, fn_name, outputs):
     if not self._is_compiled:
       raise RuntimeError('You must compile your model before using it.')
     if getattr(self, fn_name) is None:
@@ -2124,9 +2155,6 @@ class Model(Network):
 
       with K.name_scope('evaluation'):
         updates = self.state_updates
-        # Add stateful metrics updates.
-        if metric_updates is not None:
-          updates += metric_updates
         # Return loss and metrics, no gradient updates.
         # Does update the network states.
         fn = K.function(
@@ -2184,9 +2212,6 @@ class Model(Network):
                                           sample_weight=None,
                                           class_weight=None,
                                           batch_size=None,
-                                          check_steps=False,
-                                          steps_name='steps',
-                                          steps=None,
                                           validation_split=0,
                                           shuffle=False,
                                           repeat=True,
@@ -2206,11 +2231,6 @@ class Model(Network):
         to, as conveyed by `y`.
       batch_size: Integer batch size. If provided, it is used to run additional
         validation checks on stateful models.
-      check_steps: boolean, True if we want to check for validity of `steps` and
-        False, otherwise.
-      steps_name: The public API's parameter name for `steps`.
-      steps: Integer or `None`. Total number of steps (batches of samples) to
-        execute.
       validation_split: Float between 0 and 1.
         Fraction of the training data to be used as validation data.
       shuffle: Boolean whether to shuffle the training data before each epoch.
@@ -2259,18 +2279,15 @@ class Model(Network):
       if shuffle:
         training_utils.verify_dataset_shuffled(x)
 
-      if check_steps and steps is None:
-        raise ValueError('When using Datasets as input, '
-                         'you should specify the `{steps_name}` argument.'
-                         .format(steps_name=steps_name))
-
-    if ops.executing_eagerly_outside_functions():
-      session = None
-    else:
-      session = K.get_session()
-
     strategy = self._distribution_strategy
     with strategy.scope():
+      # We should be sure to call get_session() inside the strategy.scope()
+      # so the strategy can affect the session options.
+      if ops.executing_eagerly_outside_functions():
+        session = None
+      else:
+        session = K.get_session()
+
       first_x_value = nest.flatten(x)[0]
       if isinstance(first_x_value, np.ndarray):
         x = distributed_training_utils.list_to_tuple(x)
@@ -2285,18 +2302,14 @@ class Model(Network):
         else:
           in_tuple = x
 
-        if shuffle:
-          # 1024 is a good buffer size since it is much larger than the average
-          # batch size provided by the user and provides sufficient randomness.
-          # One thing to keep in mind is the memory usage based on the size of
-          # each sample.
-          shuffle_buffer = 1024
-        else:
-          shuffle_buffer = None
         ds = strategy.extended.experimental_make_numpy_dataset(in_tuple,
                                                                session=session)
-        if shuffle_buffer:
-          ds = ds.shuffle(shuffle_buffer)
+        if shuffle:
+          # We want a buffer size that is larger than the batch size provided by
+          # the user and provides sufficient randomness. Note that larger
+          # numbers introduce more memory usage based on the size of each
+          # sample.
+          ds = ds.shuffle(max(1024, batch_size * 8))
         if repeat:
           ds = ds.repeat()
 
@@ -2653,7 +2666,7 @@ class Model(Network):
           'However we received `validation_data=%s`' % validation_data)
     return val_x, val_y, val_sample_weight
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def _set_inputs(self, inputs, outputs=None, training=None):
     """Set model's input and output specs based on the input data received.
 
diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py
index 68b8f615210de6ace43e0ebf1eb9cb66fa1a8056..e609f5d1986afe7ad1868c6fb97ddb5777e93865 100644
--- a/tensorflow/python/keras/engine/training_arrays.py
+++ b/tensorflow/python/keras/engine/training_arrays.py
@@ -33,8 +33,8 @@ from tensorflow.python.keras.engine import distributed_training_utils
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils.generic_utils import make_batches
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.mode_keys import ModeKeys
 
 try:
   from scipy.sparse import issparse  # pylint: disable=g-import-not-at-top
@@ -252,8 +252,28 @@ def model_iteration(model,
           actual_inputs = ins() if callable(ins) else ins
           batch_outs = f(actual_inputs)
         except errors.OutOfRangeError:
-          if not is_dataset:
+          if is_dataset:
+            # The dataset passed by the user ran out of batches.
+            # Now we know the cardinality of the dataset.
+            # If steps_per_epoch was specified, then running out of data is
+            # unexpected, so we stop training and inform the user.
+            if steps_per_epoch:
+              callbacks.model.stop_training = True
+              logging.warning(
+                  'Your dataset ran out of data; interrupting training. '
+                  'Make sure that your dataset can generate at least '
+                  '`%s * epochs` batches (in this case, %d batches). '
+                  'You may need to use the repeat() function when '
+                  'building your dataset.'
+                  % (steps_name, steps_per_epoch * epochs))
+            elif step > 0:
+              steps_per_epoch = step
+              aggregator.num_samples_or_steps = steps_per_epoch
+              progbar.params['steps'] = steps_per_epoch
+              progbar.progbar.target = steps_per_epoch
+          else:
             # We ran out of batches while the user passed an iterator (legacy).
+            callbacks.model.stop_training = True
             logging.warning(
                 'Your dataset iterator ran out of data; '
                 'interrupting training. Make sure that your iterator '
@@ -261,15 +281,6 @@ def model_iteration(model,
                 'batches (in this case, %d batches). You may need to'
                 'use the repeat() function when building your '
                 'dataset.' % (steps_name, steps_per_epoch * epochs))
-            callbacks.model.stop_training = True
-          else:
-            # The dataset passed by the user ran out of batches.
-            # Now we know the cardinality of the dataset.
-            if step > 0:
-              steps_per_epoch = step
-              aggregator.num_samples_or_steps = steps_per_epoch
-              progbar.params['steps'] = steps_per_epoch
-              progbar.progbar.target = steps_per_epoch
           break
 
         if not isinstance(batch_outs, list):
diff --git a/tensorflow/python/keras/engine/training_dataset_test.py b/tensorflow/python/keras/engine/training_dataset_test.py
index 751c265005391536679ce0993a255ce124b164a5..bc37b082a84a473c3801645d53654f65e7616a60 100644
--- a/tensorflow/python/keras/engine/training_dataset_test.py
+++ b/tensorflow/python/keras/engine/training_dataset_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.data.experimental.ops import cardinality
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import callbacks
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
@@ -34,6 +35,15 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 
 
+class BatchCounterCallback(callbacks.Callback):
+
+  def __init__(self):
+    self.batch_count = 0
+
+  def on_batch_end(self, *args, **kwargs):
+    self.batch_count += 1
+
+
 class TestTrainingWithDatasetIterators(keras_parameterized.TestCase):
 
   @keras_parameterized.run_with_all_model_types
@@ -385,8 +395,7 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
   @keras_parameterized.run_all_keras_modes
   def test_finite_dataset_known_cardinality_no_steps_arg(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = 'rmsprop'
-    model.compile(optimizer, 'mse',
+    model.compile('rmsprop', 'mse',
                   run_eagerly=testing_utils.should_run_eagerly())
 
     inputs = np.zeros((100, 3), dtype=np.float32)
@@ -394,8 +403,11 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
     dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
     dataset = dataset.batch(10)
 
-    history = model.fit(dataset, epochs=2, verbose=1)
-    self.assertEqual(len(history.history['loss']), 2)
+    batch_counter = BatchCounterCallback()
+    history = model.fit(dataset, epochs=2, verbose=1, callbacks=[batch_counter])
+
+    self.assertLen(history.history['loss'], 2)
+    self.assertEqual(batch_counter.batch_count, 20)
     model.evaluate(dataset)
     out = model.predict(dataset)
     self.assertEqual(out.shape[0], 100)
@@ -404,8 +416,7 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
   @keras_parameterized.run_all_keras_modes
   def test_finite_dataset_unknown_cardinality_no_steps_arg(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = 'rmsprop'
-    model.compile(optimizer, 'mse',
+    model.compile('rmsprop', 'mse',
                   run_eagerly=testing_utils.should_run_eagerly())
 
     inputs = np.zeros((100, 3), dtype=np.float32)
@@ -415,8 +426,49 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
     self.assertEqual(keras.backend.get_value(cardinality.cardinality(dataset)),
                      cardinality.UNKNOWN)
 
-    history = model.fit(dataset, epochs=2, verbose=1)
-    self.assertEqual(len(history.history['loss']), 2)
+    batch_counter = BatchCounterCallback()
+    history = model.fit(dataset, epochs=2, verbose=1, callbacks=[batch_counter])
+
+    self.assertLen(history.history['loss'], 2)
+    self.assertEqual(batch_counter.batch_count, 20)
+    model.evaluate(dataset)
+    out = model.predict(dataset)
+    self.assertEqual(out.shape[0], 100)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_finite_dataset_unknown_cardinality_out_of_data(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    model.compile('rmsprop', 'mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((100, 3), dtype=np.float32)
+    targets = np.random.randint(0, 4, size=100, dtype=np.int32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.filter(lambda x, y: True).batch(10)
+    self.assertEqual(
+        keras.backend.get_value(cardinality.cardinality(dataset)),
+        cardinality.UNKNOWN)
+
+    batch_counter = BatchCounterCallback()
+    with test.mock.patch.object(logging, 'warning') as mock_log:
+      # steps_per_epoch (200) is greater than the dataset size (100). As this is
+      # unexpected, training will stop and not make it to the second epoch.
+      history = model.fit(
+          dataset,
+          epochs=2,
+          verbose=1,
+          callbacks=[batch_counter],
+          steps_per_epoch=200)
+      self.assertIn(
+          'Your dataset ran out of data; interrupting training. '
+          'Make sure that your dataset can generate at least '
+          '`steps_per_epoch * epochs` batches (in this case, 400 batches). '
+          'You may need to use the repeat() function when '
+          'building your dataset.', str(mock_log.call_args))
+
+    self.assertLen(history.history['loss'], 1)
+    self.assertEqual(batch_counter.batch_count, 10)
     model.evaluate(dataset)
     out = model.predict(dataset)
     self.assertEqual(out.shape[0], 100)
diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py
index 6412adc5a6453dcacbfff00b3096e24617cebb7c..3df58af0a25566ec78c5c8c1d2aaa1a06129fb58 100644
--- a/tensorflow/python/keras/engine/training_distributed.py
+++ b/tensorflow/python/keras/engine/training_distributed.py
@@ -22,8 +22,6 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import input_lib
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.framework import constant_op
@@ -36,9 +34,9 @@ from tensorflow.python.keras.engine import partial_batch_padding_handler as padd
 from tensorflow.python.keras.engine import training_arrays
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.mode_keys import ModeKeys
 from tensorflow.python.util import nest
 
 
@@ -74,18 +72,11 @@ def fit_distributed(model,
             batch_size, mode=ModeKeys.TRAIN))
   batch_size = model._validate_or_infer_batch_size(
       batch_size, steps_per_epoch, x)
-  steps_name = 'steps_per_epoch'
-  if isinstance(x, dataset_ops.DatasetV2):
-    steps_per_epoch = training_utils.infer_steps_for_dataset(
-        x, steps_per_epoch, steps_name=steps_name)
   dataset = model._distribution_standardize_user_data(
       x, y,
       sample_weight=sample_weight,
       class_weight=class_weight,
       batch_size=batch_size,
-      check_steps=True,
-      steps_name=steps_name,
-      steps=steps_per_epoch,
       validation_split=validation_split,
       shuffle=shuffle)
 
@@ -100,18 +91,11 @@ def fit_distributed(model,
       validation_steps, _ = distributed_training_utils.get_input_params(
           model._distribution_strategy, first_valx_value, validation_steps,
           batch_size)
-    steps_name = 'validation_steps'
-    if isinstance(val_x, dataset_ops.DatasetV2):
-      validation_steps = training_utils.infer_steps_for_dataset(
-          val_x, validation_steps, steps_name=steps_name)
     val_dataset = model._distribution_standardize_user_data(
         val_x, val_y,
         sample_weight=val_sample_weights,
         class_weight=None,
         batch_size=batch_size,
-        check_steps=True,
-        steps_name=steps_name,
-        steps=validation_steps,
         validation_split=validation_split,
         shuffle=shuffle)
   elif validation_split:
@@ -129,7 +113,7 @@ def fit_distributed(model,
         initial_epoch=initial_epoch,
         steps_per_epoch=steps_per_epoch,
         validation_steps=validation_steps,
-        validation_freq=1)
+        validation_freq=validation_freq)
   else:
     return training_arrays.fit_loop(
         model,
@@ -143,7 +127,8 @@ def fit_distributed(model,
         initial_epoch=initial_epoch,
         steps_per_epoch=steps_per_epoch,
         validation_steps=validation_steps,
-        validation_freq=validation_freq)
+        validation_freq=validation_freq,
+        steps_name='steps_per_epoch')
 
 
 def evaluate_distributed(model,
@@ -161,18 +146,10 @@ def evaluate_distributed(model,
     steps, batch_size = distributed_training_utils.get_input_params(
         model._distribution_strategy, first_x_value, steps, batch_size)
   batch_size = model._validate_or_infer_batch_size(batch_size, steps, x)
-  steps_name = 'steps'
-
-  if isinstance(x, dataset_ops.DatasetV2):
-    steps = training_utils.infer_steps_for_dataset(x, steps,
-                                                   steps_name=steps_name)
   dataset = model._distribution_standardize_user_data(
       x, y,
       sample_weight=sample_weight,
-      batch_size=batch_size,
-      check_steps=True,
-      steps_name=steps_name,
-      steps=steps)
+      batch_size=batch_size)
 
   if distributed_training_utils.is_tpu_strategy(model._distribution_strategy):
     return experimental_tpu_test_loop(
@@ -202,16 +179,9 @@ def predict_distributed(model,
         model._distribution_strategy, first_x_value, steps,
         batch_size, mode=ModeKeys.PREDICT)
   batch_size = model._validate_or_infer_batch_size(batch_size, steps, x)
-  steps_name = 'steps'
-  if isinstance(x, dataset_ops.DatasetV2):
-    steps = training_utils.infer_steps_for_dataset(x, steps,
-                                                   steps_name=steps_name)
   dataset = model._distribution_standardize_user_data(
       x,
       batch_size=batch_size,
-      check_steps=True,
-      steps_name=steps_name,
-      steps=steps,
       repeat=False,
       allow_partial_batch=True)
   if distributed_training_utils.is_tpu_strategy(model._distribution_strategy):
@@ -272,6 +242,13 @@ def experimental_tpu_fit_loop(model,
   # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
   current_strategy = model._distribution_strategy
   iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
+  steps_per_epoch = training_utils.infer_steps_for_dataset(
+      dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
+  if (current_strategy.extended.steps_per_run != 1 and
+      steps_per_epoch is None):
+    raise ValueError('`steps_per_epoch` should be specified when calling '
+                     '`fit` on the model with TPUStrategy when '
+                     '`steps_per_run` != 1 .')
 
   scope = distributed_training_utils.distributed_scope(
       strategy=current_strategy, learning_phase=1)
@@ -312,7 +289,7 @@ def experimental_tpu_fit_loop(model,
 
     for label, output in zip(out_labels, combined_fn.outputs):
       if label == 'loss':
-        reduce_op = distribute_lib.get_loss_reduction()
+        reduce_op = ds_reduce_util.ReduceOp.SUM
       else:
         # We reduce all other metrics using mean for now. This is temporary
         # workaround until new metrics are in place.
@@ -331,18 +308,20 @@ def experimental_tpu_fit_loop(model,
     tensor = model._all_stateful_metrics_tensors[name]
     initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)
 
-  if steps_per_epoch is None:
-    raise ValueError('`steps_per_epoch` should be specified when calling '
-                     '`fit` on the model.')
+  use_steps = steps_per_epoch is not None
+  if use_steps:
+    iteration_value = min(steps_per_epoch,
+                          current_strategy.extended.steps_per_run)
+  else:
+    iteration_value = current_strategy.extended.steps_per_run
+
   steps_per_run = K.variable(
-      value=min(steps_per_epoch, current_strategy.extended.steps_per_run),
+      value=iteration_value,
       dtype='int32',
       name='steps_per_run')
-
   ctx = current_strategy.extended.experimental_run_steps_on_iterator(
       step_fn, iterator, iterations=steps_per_run,
       initial_loop_values=initial_loop_values)
-
   train_op = ctx.run_op
   output_tensors = ctx.last_step_outputs
 
@@ -362,11 +341,16 @@ def experimental_tpu_fit_loop(model,
       mode=mode)
 
   # Calculate the steps each time on the device.
-  steps_to_run = [current_strategy.extended.steps_per_run] * (
-      steps_per_epoch // current_strategy.extended.steps_per_run)
-  if steps_per_epoch % current_strategy.extended.steps_per_run:
-    steps_to_run.append(
-        steps_per_epoch % current_strategy.extended.steps_per_run)
+  if use_steps:
+    steps_to_run = ([current_strategy.extended.steps_per_run] *
+                    (steps_per_epoch //
+                     current_strategy.extended.steps_per_run))
+    if steps_per_epoch % current_strategy.extended.steps_per_run:
+      steps_to_run.append(
+          steps_per_epoch % current_strategy.extended.steps_per_run)
+    target_steps = len(steps_to_run)
+  else:
+    target_steps = np.inf
 
   callbacks._call_begin_hook(mode)
   for epoch in range(initial_epoch, epochs):
@@ -375,25 +359,36 @@ def experimental_tpu_fit_loop(model,
     epoch_logs = {}
     step_index = 0
     prev_step_count = None
-    for step_count in steps_to_run:
+    current_step = 0
+    while current_step < target_steps:
+      step_count = steps_to_run[current_step] if use_steps else 1
       batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
       callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
       if prev_step_count is None or step_count != prev_step_count:
         steps_per_run.load(step_count, K.get_session())
         prev_step_count = step_count
       try:
-        _, outputs = K.get_session().run([train_op, output_tensors])
+        _, outputs = K.batch_get_value([train_op, output_tensors])
       except errors.OutOfRangeError:
-        logging.warning('Your dataset iterator ran out of data; '
-                        'interrupting training. Make sure that your dataset '
-                        'can generate at least `steps_per_epoch * epochs` '
-                        'batches (in this case, %d batches).' %
-                        steps_per_epoch * epochs)
+        if use_steps:
+          logging.warning('Your dataset iterator ran out of data; '
+                          'interrupting training. Make sure that your dataset '
+                          'can generate at least `steps_per_epoch * epochs` '
+                          'batches (in this case, %d batches).' %
+                          steps_per_epoch * epochs)
+        else:
+          target_steps = current_step
+          logging.info('Dataset iterator ran out of data. Inferring the '
+                       'value of `steps_per_epoch` as %s  .' % target_steps)
+          distributed_training_utils.initialize_iterator(iterator,
+                                                         current_strategy)
         break
 
       batch_logs.update(outputs)
       callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
       step_index = step_index + step_count
+      current_step += 1
+
       if callbacks.model.stop_training:
         break
 
@@ -456,7 +451,11 @@ def experimental_tpu_test_loop(model,
   """
   mode = ModeKeys.TEST
   current_strategy = model._distribution_strategy
-  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
+  iterator = distributed_training_utils.get_iterator(dataset,
+                                                     current_strategy)
+  steps = training_utils.infer_steps_for_dataset(dataset, steps,
+                                                 steps_name='steps')
+
   scope = distributed_training_utils.distributed_scope(
       strategy=current_strategy, learning_phase=0)
   scope.__enter__()
@@ -496,7 +495,7 @@ def experimental_tpu_test_loop(model,
 
     for label, output in zip(model.metrics_names, combined_fn.outputs):
       if label == 'loss':
-        reduce_op = distribute_lib.get_loss_reduction()
+        reduce_op = ds_reduce_util.ReduceOp.SUM
       else:
         # We reduce all other metrics using mean for now. This is temporary
         # workaround until new metrics are in place.
@@ -540,12 +539,29 @@ def experimental_tpu_test_loop(model,
       mode=ModeKeys.TEST)
   callbacks._call_begin_hook(mode)
 
-  assert steps is not None
   outs = [0.] * len(model.metrics_names)
-  for step in range(steps):
-    batch_logs = {'batch': step, 'size': 1}
-    callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
-    _, batch_outs = K.get_session().run([test_op, output_tensors])
+  if steps is not None:
+    target_steps = steps
+  else:
+    target_steps = np.inf
+
+  current_step = 0
+  while current_step < target_steps:
+    batch_logs = {'batch': current_step, 'size': 1}
+    callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
+    try:
+      _, batch_outs = K.batch_get_value([test_op, output_tensors])
+    except errors.OutOfRangeError:
+      if steps is not None:
+        warning_msg = 'Make sure that your dataset can generate at least '
+        '`steps` batches (in this case, {} batches).'.format(steps)
+      else:
+        warning_msg = 'Number of steps ran: {} steps'.format(current_step)
+
+      logging.warning('Your dataset iterator ran out of data; '
+                      'interrupting evaluation. ' + warning_msg)
+      target_steps = current_step
+      break
     for i, label in enumerate(model.metrics_names):
       if i == 0:
         # Loss is stateless metrics.
@@ -555,15 +571,16 @@ def experimental_tpu_test_loop(model,
         outs[i] = batch_outs[label]
 
     batch_logs = cbks.make_logs(model, batch_logs, outs, mode)
-    callbacks._call_batch_hook(mode, 'end', step, batch_logs)
+    callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
     if verbose >= 1:
-      progbar.update(step + 1)
+      progbar.update(current_step + 1)
+    current_step += 1
 
   callbacks._call_end_hook(mode)
 
   scope.__exit__(None, None, None)
   if len(outs) >= 0:
-    outs[0] /= (steps)
+    outs[0] /= (target_steps)
 
   if len(outs) == 1:
     return outs[0]
@@ -592,6 +609,8 @@ def experimental_tpu_predict_loop(model,
       (if the model has multiple outputs).
   """
   mode = ModeKeys.PREDICT
+  steps = training_utils.infer_steps_for_dataset(dataset, steps,
+                                                 steps_name='steps')
   dataset_fully_shaped = (distributed_training_utils.
                           is_dataset_shape_fully_defined(dataset))
   padding_handler = None
@@ -698,22 +717,40 @@ def experimental_tpu_predict_loop(model,
       mode=mode)
   callbacks._call_begin_hook(mode)
 
-  assert steps is not None
   # Since we do not know how many samples we will see, we cannot pre-allocate
   # the returned Numpy arrays. Instead, we store one array per batch seen
   # and concatenate them upon returning.
   unconcatenated_outs = [[] for _ in model.outputs]
-  for step in range(steps):
-    batch_logs = {'batch': step, 'size': 1}
-    callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
-    _, batch_outs = K.get_session().run([predict_op, output_tensors])
+  if steps is not None:
+    target_steps = steps
+  else:
+    target_steps = np.inf
+
+  current_step = 0
+  while current_step < target_steps:
+    batch_logs = {'batch': current_step, 'size': 1}
+    callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
+    try:
+      _, batch_outs = K.batch_get_value([predict_op, output_tensors])
+    except errors.OutOfRangeError:
+      if steps is not None:
+        warning_msg = 'Make sure that your dataset can generate at least '
+        '`steps` batches (in this case, {} batches).'.format(steps)
+      else:
+        warning_msg = 'Number of steps ran: {} steps'.format(current_step)
+
+      logging.warning('Your dataset iterator ran out of data; '
+                      'interrupting evaluation. ' + warning_msg)
+      break
+
     # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
     for i, label in enumerate(model.output_names):
       unconcatenated_outs[i].extend(batch_outs[label])
     batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
-    callbacks._call_batch_hook(mode, 'end', step, batch_logs)
+    callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
     if verbose >= 1:
-      progbar.update(step + 1)
+      progbar.update(current_step + 1)
+    current_step += 1
 
   callbacks._call_end_hook(mode)
 
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index 7f80f94d488a5b88ba3eb1c643fd5360938a730f..6fdb19c523dd1cd2dd13c1ae7549e406fcf95856 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -28,7 +28,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend
 from tensorflow.python.keras.engine import training_utils
-from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
+from tensorflow.python.keras.utils import losses_utils
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
@@ -96,6 +96,10 @@ def _model_loss(model,
      regularization losses and applies masking and sample weighting
      to the loss value.
   """
+  # Used to keep track of the total loss value (stateless).
+  # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
+  #                   loss_weight_2 * output_2_loss_fn(...) +
+  #                   layer losses.
   total_loss = 0
   kwargs = {}
   if model._expects_training_arg:
@@ -116,14 +120,14 @@ def _model_loss(model,
   masks = [t._keras_mask for t in outs]
   targets = nest.flatten(targets)
 
-  loss_metrics = []
-  aggregated_loss_metrics = []
+  # Used to keep track of individual output losses (stateless).
+  output_losses = []
+  # Used to keep track of individual output losses (stateful).
+  aggregated_output_losses = []
+
   with backend.name_scope('loss'):
     for i, loss_fn in enumerate(model.loss_functions):
-      if sample_weights:
-        weights = sample_weights[i]
-      else:
-        weights = None
+      weights = sample_weights[i] if sample_weights else None
       mask = masks[i]
       with backend.name_scope(model.output_names[i] + '_loss'):
         if mask is not None:
@@ -133,26 +137,47 @@ def _model_loss(model,
             weights = mask
           else:
             # Update dimensions of weights to match with mask if possible.
-            mask, _, weights = squeeze_or_expand_dimensions(mask, None, weights)
+            mask, _, weights = (
+                losses_utils.squeeze_or_expand_dimensions(mask, None, weights))
             weights *= mask
-        output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
+
+        # Reset reduction on the loss so that we can get the per sample loss
+        # value. We use this to get both the stateless and stateful loss
+        # values without having to compute the underlying loss function
+        # twice.
+        weighted_losses = None
+        if hasattr(loss_fn, 'reduction'):
+          current_loss_reduction = loss_fn.reduction
+          loss_fn.reduction = losses_utils.ReductionV2.NONE
+          weighted_losses = loss_fn(targets[i], outs[i], sample_weight=weights)
+          loss_fn.reduction = current_loss_reduction
+
+          # Compute the stateless loss value.
+          output_loss = losses_utils.reduce_weighted_loss(weighted_losses)
+        else:
+          # Compute the stateless loss value for a custom loss class.
+          # Here we assume that the class takes care of loss reduction
+          # because if this class returns a vector value we cannot
+          # differentiate between use case where a custom optimizer
+          # expects a vector loss value vs unreduced per-sample loss value.
+          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
 
       # If the number of outputs is 1 then we don't append the loss metric
       # associated with each model output. When there are multiple outputs
       # associated with a model, each output's loss is calculated and returned
       # as part of the loss_metrics.
       if len(model.outputs) > 1:
-        loss_metrics.append(backend.mean(output_loss))
-
+        output_losses.append(backend.mean(output_loss))
         if output_loss_metrics is not None:
-          # Keep track of the stateful loss result.
-          aggregated_loss_metrics.append(
-              training_utils.call_metric_function(
-                  output_loss_metrics[i],
-                  targets[i],
-                  outs[i],
-                  weights=weights,
-                  mask=mask))
+          # Compute the stateful loss value.
+          if weighted_losses is not None:
+            aggregated_output_loss = output_loss_metrics[i](weighted_losses)
+          else:
+            # Custom loss class.
+            aggregated_output_loss = training_utils.call_metric_function(
+                output_loss_metrics[i], targets[i], outs[i], weights=weights)
+          # Keep track of the stateful output loss result.
+          aggregated_output_losses.append(aggregated_output_loss)
 
       loss_weight = model.loss_weights_list[i]
       if total_loss is None:
@@ -164,10 +189,11 @@ def _model_loss(model,
     # Add regularization losses
     custom_losses = model.losses
     if custom_losses:
-      total_loss += math_ops.add_n(custom_losses)
+      total_loss += losses_utils.scale_loss_for_distribution(
+          math_ops.add_n(custom_losses))
     model._clear_losses()
 
-  return outs, total_loss, loss_metrics, aggregated_loss_metrics, masks
+  return outs, total_loss, output_losses, aggregated_output_losses, masks
 
 
 def _process_single_batch(model,
@@ -200,15 +226,15 @@ def _process_single_batch(model,
   """
   with backend.eager_learning_phase_scope(1 if training else 0):
     with GradientTape() as tape:
-      outs, loss, loss_metrics, aggregated_loss_metrics, masks\
-        = _model_loss(
-            model,
-            inputs,
-            targets,
-            output_loss_metrics=output_loss_metrics,
-            sample_weights=sample_weights,
-            training=training)
-      if loss is None:
+      outs, total_loss, output_losses, aggregated_output_losses, masks = (
+          _model_loss(
+              model,
+              inputs,
+              targets,
+              output_loss_metrics=output_loss_metrics,
+              sample_weights=sample_weights,
+              training=training))
+      if total_loss is None:
         raise ValueError('The model cannot be run '
                          'because it has no loss to optimize.')
     if training:
@@ -217,13 +243,18 @@ def _process_single_batch(model,
                         ' you are not setting model.trainable to False before '
                         'compiling the model.')
       else:
-        grads = tape.gradient(loss, model._collected_trainable_weights)
+        grads = tape.gradient(total_loss, model._collected_trainable_weights)
         model.optimizer.apply_gradients(zip(grads,
                                             model._collected_trainable_weights))
-    return outs, loss, loss_metrics, aggregated_loss_metrics, masks
+    return outs, total_loss, output_losses, aggregated_output_losses, masks
 
 
-def train_on_batch(model, inputs, targets, sample_weights=None):
+def train_on_batch(model,
+                   inputs,
+                   targets,
+                   sample_weights=None,
+                   reset_metrics=True,
+                   output_loss_metrics=None):
   """Calculates the loss and gradient updates for one input batch.
 
   Arguments:
@@ -231,6 +262,11 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
       inputs: Input batch data.
       targets: Target batch data.
       sample_weights: Sample weight batch data.
+      reset_metrics: If `True`, the metrics returned will be only for this
+        batch. If `False`, the metrics will be statefully accumulated across
+        batches.
+      output_loss_metrics: List of metrics that are used to aggregated output
+        loss values.
 
   Returns:
       total loss and the loss associated with each output.
@@ -240,20 +276,24 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
       inputs = training_utils.cast_if_floating_dtype(inputs)
       targets = training_utils.cast_if_floating_dtype(targets)
     else:
-      inputs = training_utils.cast_if_floating_dtype([
-          ops.convert_to_tensor(val) for val in inputs
-      ])
-      targets = training_utils.cast_if_floating_dtype([
-          ops.convert_to_tensor(val) for val in targets
-      ])
+      inputs = training_utils.cast_if_floating_dtype(
+          [ops.convert_to_tensor(val) for val in inputs])
+      targets = training_utils.cast_if_floating_dtype(
+          [ops.convert_to_tensor(val) for val in targets])
   if sample_weights:
     sample_weights = [
         training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val))
         if val is not None else None for val in sample_weights
     ]
 
-  outs, loss, loss_metrics, _, masks = _process_single_batch(
-      model, inputs, targets, sample_weights=sample_weights, training=True)
+  outs, total_loss, output_losses, aggregated_output_losses, masks = (
+      _process_single_batch(
+          model,
+          inputs,
+          targets,
+          sample_weights=sample_weights,
+          training=True,
+          output_loss_metrics=output_loss_metrics))
   if not isinstance(outs, list):
     outs = [outs]
   metrics_results = _eager_metrics_fn(
@@ -262,16 +302,23 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
       targets,
       sample_weights=sample_weights,
       masks=masks,
-      return_stateful_result=True)
-  loss = nest.flatten(loss)
-
-  return [
-      tensor_util.constant_value(v)
-      for v in loss + loss_metrics + metrics_results
-  ]
-
-
-def test_on_batch(model, inputs, targets, sample_weights=None):
+      return_stateful_result=not reset_metrics)
+  total_loss = nest.flatten(total_loss)
+  if reset_metrics:
+    final_output_losses = output_losses
+  else:
+    final_output_losses = aggregated_output_losses
+  results = total_loss + final_output_losses + metrics_results
+
+  return [tensor_util.constant_value(v) for v in results]
+
+
+def test_on_batch(model,
+                  inputs,
+                  targets,
+                  sample_weights=None,
+                  reset_metrics=True,
+                  output_loss_metrics=None):
   """Calculates the loss for one input batch.
 
   Arguments:
@@ -279,6 +326,11 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       inputs: Input batch data.
       targets: Target batch data.
       sample_weights: Sample weight batch data.
+      reset_metrics: If `True`, the metrics returned will be only for this
+        batch. If `False`, the metrics will be statefully accumulated across
+        batches.
+      output_loss_metrics: List of metrics that are used to aggregated output
+        loss values.
 
   Returns:
       total loss, loss and metrics associated with each output.
@@ -288,19 +340,23 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       inputs = training_utils.cast_if_floating_dtype(inputs)
       targets = training_utils.cast_if_floating_dtype(targets)
     else:
-      inputs = training_utils.cast_if_floating_dtype([
-          ops.convert_to_tensor(val) for val in inputs
-      ])
-      targets = training_utils.cast_if_floating_dtype([
-          ops.convert_to_tensor(val) for val in targets
-      ])
+      inputs = training_utils.cast_if_floating_dtype(
+          [ops.convert_to_tensor(val) for val in inputs])
+      targets = training_utils.cast_if_floating_dtype(
+          [ops.convert_to_tensor(val) for val in targets])
   if sample_weights:
     sample_weights = [
         training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val))
         if val is not None else None for val in sample_weights
     ]
-  outs, loss, loss_metrics, _, masks = _model_loss(
-      model, inputs, targets, sample_weights=sample_weights, training=False)
+  outs, total_loss, output_losses, aggregated_output_losses, masks = (
+      _model_loss(
+          model,
+          inputs,
+          targets,
+          sample_weights=sample_weights,
+          training=False,
+          output_loss_metrics=output_loss_metrics))
   if not isinstance(outs, list):
     outs = [outs]
   metrics_results = _eager_metrics_fn(
@@ -309,10 +365,12 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       targets,
       sample_weights=sample_weights,
       masks=masks,
-      return_stateful_result=True)
-  loss = nest.flatten(loss)
-
-  return [
-      tensor_util.constant_value(v)
-      for v in loss + loss_metrics + metrics_results
-  ]
+      return_stateful_result=not reset_metrics)
+  total_loss = nest.flatten(total_loss)
+  if reset_metrics:
+    final_output_losses = output_losses
+  else:
+    final_output_losses = aggregated_output_losses
+  results = total_loss + final_output_losses + metrics_results
+
+  return [tensor_util.constant_value(v) for v in results]
diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py
index da460ee20592a0cb4296a79235f184bd005c1768..a9fdb0721fe497abf86d957063fba5d4de986db9 100644
--- a/tensorflow/python/keras/engine/training_generator.py
+++ b/tensorflow/python/keras/engine/training_generator.py
@@ -33,8 +33,8 @@ from tensorflow.python.keras import callbacks as cbks
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import data_utils
 from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.mode_keys import ModeKeys
 from tensorflow.python.util import nest
 
 
@@ -149,12 +149,14 @@ def model_iteration(model,
       model, mode, class_weight=class_weight)
 
   # Create the queue for the generator.
-  output_generator, enqueuer = _make_enqueued_generator(
-      generator,
-      workers=workers,
-      use_multiprocessing=use_multiprocessing,
-      max_queue_size=max_queue_size,
-      shuffle=shuffle)
+  enqueuer = None
+  if not is_dataset:
+    generator, enqueuer = _make_enqueued_generator(
+        generator,
+        workers=workers,
+        use_multiprocessing=use_multiprocessing,
+        max_queue_size=max_queue_size,
+        shuffle=shuffle)
 
   num_samples_or_steps, use_steps = _get_num_samples_or_steps(
       data, steps_per_epoch)
@@ -208,10 +210,30 @@ def model_iteration(model,
 
     step = 0
     while step < target_steps:
-      batch_data = _get_next_batch(output_generator, mode)
+      batch_data = _get_next_batch(generator, mode)
       if batch_data is None:
-        if not is_dataset:
+        if is_dataset:
+          # The dataset passed by the user ran out of batches.
+          # Now we know the cardinality of the dataset.
+          # If steps_per_epoch was specified, then running out of data is
+          # unexpected, so we stop training and inform the user.
+          if steps_per_epoch:
+            callbacks.model.stop_training = True
+            logging.warning(
+                'Your dataset ran out of data; interrupting training. '
+                'Make sure that your dataset can generate at least '
+                '`%s * epochs` batches (in this case, %d batches). '
+                'You may need to use the repeat() function when '
+                'building your dataset.'
+                % (steps_name, steps_per_epoch * epochs))
+          elif step > 0:
+            steps_per_epoch = step
+            aggregator.num_samples_or_steps = steps_per_epoch
+            progbar.params['steps'] = steps_per_epoch
+            progbar.progbar.target = steps_per_epoch
+        else:
           # We ran out of batches while the user passed an iterator (legacy).
+          callbacks.model.stop_training = True
           logging.warning(
               'Your dataset iterator ran out of data; '
               'interrupting training. Make sure that your iterator '
@@ -219,16 +241,6 @@ def model_iteration(model,
               'batches (in this case, %d batches). You may need to'
               'use the repeat() function when building your '
               'dataset.' % (steps_name, steps_per_epoch * epochs))
-          callbacks.model.stop_training = True
-        else:
-          # The dataset passed by the user ran out of batches.
-          # Now we know the cardinality of the dataset.
-          # assert steps_per_epoch is None
-          if step > 0:
-            steps_per_epoch = step
-            aggregator.num_samples_or_steps = steps_per_epoch
-            progbar.params['steps'] = steps_per_epoch
-            progbar.progbar.target = steps_per_epoch
         break
 
       # `batch_size` used for validation data if validation
@@ -240,13 +252,32 @@ def model_iteration(model,
       callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
       progbar.on_batch_begin(step, batch_logs)
 
+      is_deferred = not model._is_compiled
       batch_outs = batch_function(*batch_data)
       if not isinstance(batch_outs, list):
         batch_outs = [batch_outs]
 
-      # Aggregate results.
       if step == 0:
         aggregator.create(batch_outs)
+
+        if is_deferred:
+          # Set callbacks params. We do this here when model is compiled only
+          # in the first iteration of this loop (deferred build scenario).
+          cbks.set_callback_parameters(
+              callbacks,
+              model,
+              do_validation=do_validation,
+              batch_size=batch_size,
+              epochs=epochs,
+              steps_per_epoch=steps_per_epoch,
+              samples=num_samples_or_steps,
+              verbose=verbose,
+              mode=mode)
+
+          progbar.params = callbacks.params
+          progbar.params['verbose'] = verbose
+
+      # Aggregate results.
       aggregator.aggregate(batch_outs)
 
       # Callbacks batch end.
@@ -317,10 +348,10 @@ predict_generator = functools.partial(
     model_iteration, mode=ModeKeys.PREDICT, shuffle=False)
 
 
-def _get_next_batch(output_generator, mode):
+def _get_next_batch(generator, mode):
   """Retrieves the next batch of input data."""
   try:
-    generator_output = next(output_generator)
+    generator_output = next(generator)
   except (StopIteration, errors.OutOfRangeError):
     return None
   if not isinstance(generator_output, tuple):
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 6be4da70f6e60c4da4a0e96999e51cae1aabc005..c83e669e9a568b3ef32cd2742ed4e9bb3399b959 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -27,6 +27,7 @@ import numpy as np
 import six
 
 from tensorflow.python import keras
+from tensorflow.python import tf2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
@@ -34,6 +35,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.callbacks import Callback
@@ -54,6 +56,129 @@ except ImportError:
   scipy_sparse = None
 
 
+class CompileTest(keras_parameterized.TestCase):
+
+  def _get_multi_output_model(self):
+    input_a = keras.layers.Input(shape=(3,), name='input_a')
+    output_a = keras.layers.Dense(1, name='dense_1')(input_a)
+    output_b = keras.layers.Dense(1, name='dense_2')(input_a)
+    return keras.models.Model(input_a, [output_a, output_b])
+
+  def _do_test_compile_with_model_and_single_loss(self, model, loss):
+    model.compile(optimizer='adam', loss=loss)
+    self.assertEqual(model.loss, loss)
+
+    loss = losses.get(loss)
+    if not isinstance(loss, list):
+      loss_list = [loss] * len(model.outputs)
+
+    self.assertEqual(len(model.loss_functions), len(loss_list))
+    for i in range(len(loss_list)):
+      self.assertIsInstance(model.loss_functions[i], losses.LossFunctionWrapper)
+      if not isinstance(loss_list[i], losses.LossFunctionWrapper):
+        self.assertEqual(model.loss_functions[i].fn, loss_list[i])
+    self.assertAllEqual(model.loss_weights_list, [1.] * len(loss_list))
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters(('loss_string', 'mse'),
+                                  ('loss_function', losses.mean_squared_error),
+                                  ('loss_instance', losses.MeanSquaredError()))
+  def test_compile_with_single_output(self, loss):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    self._do_test_compile_with_model_and_single_loss(model, loss)
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters(('loss_string', 'mse'),
+                                  ('loss_function', losses.mean_squared_error),
+                                  ('loss_instance', losses.MeanSquaredError()))
+  def test_compile_with_multi_output(self, loss):
+    model = self._get_multi_output_model()
+    self._do_test_compile_with_model_and_single_loss(model, loss)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_multi_output_and_multi_loss(self):
+    model = self._get_multi_output_model()
+    # Test loss is a list.
+    loss = ['mse', 'mae']
+    model.compile(optimizer='adam', loss=loss)
+    self.assertEqual(model.loss_functions[0].fn, losses.mean_squared_error)
+    self.assertEqual(model.loss_functions[1].fn, losses.mean_absolute_error)
+    self.assertAllEqual(model.loss_weights_list, [1., 1.])
+
+    # Test loss is a dict.
+    loss = {'dense_1': 'mae', 'dense_2': 'mse'}
+    model.compile(optimizer='adam', loss=loss)
+    self.assertEqual(model.loss_functions[0].fn, losses.mean_absolute_error)
+    self.assertEqual(model.loss_functions[1].fn, losses.mean_squared_error)
+    self.assertAllEqual(model.loss_weights_list, [1., 1.])
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_multi_output_and_loss_weights_list(self):
+    model = self._get_multi_output_model()
+    loss_weights = [1., 2.]
+    model.compile(optimizer='adam', loss='mse', loss_weights=loss_weights)
+    self.assertAllEqual(model.loss_weights_list, [1., 2.])
+
+  def test_compile_with_multi_output_and_loss_weights_dict(self):
+    with context.graph_mode():
+      model = self._get_multi_output_model()
+      loss_weights = {'dense_1': 1., 'dense_2': 2.}
+      model.compile(optimizer='adam', loss='mse', loss_weights=loss_weights)
+      self.assertAllEqual(model.loss_weights_list, [1., 2.])
+
+      input_np = np.random.random((10, 3))
+      output_a_np = np.random.random((10, 1))
+      output_b_np = np.random.random((10, 1))
+
+      with self.cached_session() as sess:
+        sess.run(variables_lib.global_variables_initializer())
+        total_loss, y_preds = sess.run(
+            [model.total_loss, model.outputs],
+            feed_dict={
+                'input_a:0': input_np,
+                'dense_1_target:0': output_a_np,
+                'dense_2_target:0': output_b_np
+            })
+        self.assertAllClose(
+            total_loss,
+            np.mean(
+                np.add((output_a_np - y_preds[0])**2,
+                       2 * (output_b_np - y_preds[1])**2)))
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_incorrect_loss_size(self):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    with self.assertRaisesRegexp(ValueError, 'The model has 1 outputs'):
+      model.compile(optimizer='adam', loss=['mse', 'mae'])
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_incorrect_loss_key(self):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    with self.assertRaisesRegexp(
+        ValueError, 'Unknown entry in loss dictionary: unknown_output'):
+      model.compile(optimizer='adam', loss={'unknown_output': 'mse'})
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_incorrect_loss_weights_size(self):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    with self.assertRaisesRegexp(ValueError,
+                                 'it should have one entry per model output'):
+      model.compile(optimizer='adam', loss='mse', loss_weights=[1., 2.])
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_incorrect_loss_weights_key(self):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    with self.assertRaisesRegexp(
+        ValueError, 'Unknown entry in loss_weights dictionary: unknown_output'):
+      model.compile(
+          optimizer='adam', loss='mse', loss_weights={'unknown_output': 1.})
+
+
 class TrainingTest(keras_parameterized.TestCase):
 
   @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
@@ -566,14 +691,17 @@ class TrainingTest(keras_parameterized.TestCase):
           validation_data=(x_train, y_train))
       self.assertEqual(test_callback.batch_end_call_count, 10)
       self.assertEqual(test_callback.epoch_end_call_count, 2)
+
+      weighted_metric = ('mae'
+                         if tf2.enabled() else 'weighted_mean_absolute_error')
       self.assertSetEqual(
           set(test_callback.batch_end_logs.keys()),
-          set(['batch', 'size', 'acc', 'loss', 'weighted_mean_absolute_error']))
+          set(['batch', 'size', 'acc', 'loss', weighted_metric]))
       self.assertSetEqual(
           set(test_callback.epoch_end_logs.keys()),
           set([
-              'acc', 'loss', 'weighted_mean_absolute_error', 'val_acc',
-              'val_loss', 'val_weighted_mean_absolute_error'
+              'acc', 'loss', weighted_metric, 'val_acc', 'val_loss',
+              'val_' + weighted_metric
           ]))
 
   @keras_parameterized.run_all_keras_modes
@@ -832,6 +960,44 @@ class TrainingTest(keras_parameterized.TestCase):
         callbacks=[val_counter])
     self.assertEqual(val_counter.val_runs, expected_runs)
 
+  @keras_parameterized.run_all_keras_modes
+  def test_add_loss_correctness(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/124303407')
+
+    class Bias(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.bias = self.add_variable('bias', (1,), initializer='zeros')
+
+      def call(self, inputs):
+        return inputs + self.bias
+
+    inputs = keras.Input(shape=(1,))
+    outputs = Bias()(inputs)
+    model = keras.Model(inputs, outputs)
+    targets = keras.Input(shape=(1,))
+
+    model.add_loss(
+        math_ops.reduce_mean(
+            keras.losses.mean_absolute_error(targets, outputs)))
+
+    # If we want to use the loss class instance as shown below, we will need to
+    # add graph scope as the reduction logic involves some eager mode checks.
+    with keras.backend.get_graph().as_default():
+      model.add_loss(keras.losses.MeanAbsoluteError()(targets, outputs))
+
+    model.compile(
+        keras.optimizer_v2.gradient_descent.SGD(0.033333),
+        loss=keras.losses.MeanAbsoluteError(),
+        target_tensors=[targets],
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    x = np.array([[0.], [1.], [2.]])
+    y = np.array([[0.5], [2.], [3.5]])
+    history = model.fit(x, y, batch_size=3, epochs=5)
+    self.assertAllClose(history.history['loss'], [3., 2.7, 2.4, 2.1, 1.8], 1e-3)
+
 
 class TestExceptionsAndWarnings(keras_parameterized.TestCase):
 
@@ -885,9 +1051,9 @@ class TestExceptionsAndWarnings(keras_parameterized.TestCase):
                 'dense_1': metrics_module.CategoricalAccuracy(),
             },
             run_eagerly=testing_utils.should_run_eagerly())
-        msg = ('Output "dense_1" missing from loss dictionary. We assume this '
+        msg = ('Output dense_1 missing from loss dictionary. We assume this '
                'was done on purpose. The fit and evaluate APIs will not be '
-               'expecting any data to be passed to "dense_1".')
+               'expecting any data to be passed to dense_1.')
         self.assertRegexpMatches(str(mock_log.call_args), msg)
 
 
@@ -2095,9 +2261,11 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     metrics = ['mse', metrics_module.BinaryAccuracy()]
     model.compile(optimizer, loss='mae', metrics=metrics,
                   run_eagerly=testing_utils.should_run_eagerly())
+
+    mse_metric = 'mse' if tf2.enabled() else 'mean_squared_error'
     reference_metric_names = [
-        'loss', 'dense_loss', 'dropout_loss', 'dense_mean_squared_error',
-        'dense_binary_accuracy', 'dropout_mean_squared_error',
+        'loss', 'dense_loss', 'dropout_loss', 'dense_' + mse_metric,
+        'dense_binary_accuracy', 'dropout_' + mse_metric,
         'dropout_binary_accuracy'
     ]
     self.assertEqual(reference_metric_names, model.metrics_names)
@@ -2114,69 +2282,6 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
               batch_size=5)
     self.assertEqual(reference_metric_names, model.metrics_names)
 
-  @keras_parameterized.run_all_keras_modes
-  def test_metrics_correctness(self):
-    model = keras.Sequential()
-    model.add(
-        keras.layers.Dense(
-            3, activation='relu', input_dim=4, kernel_initializer='ones'))
-    model.add(
-        keras.layers.Dense(
-            1, activation='sigmoid', kernel_initializer='ones'))
-    model.compile(
-        loss='mae',
-        metrics=['accuracy', metrics_module.BinaryAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=0.001),
-        run_eagerly=testing_utils.should_run_eagerly())
-
-    # verify correctness of stateful and stateless metrics.
-    x = np.ones((100, 4))
-    y = np.ones((100, 1))
-    outs = model.evaluate(x, y)
-    self.assertEqual(outs[1], 1.)
-    self.assertEqual(outs[2], 1.)
-
-    y = np.zeros((100, 1))
-    outs = model.evaluate(x, y)
-    self.assertEqual(outs[1], 0.)
-    self.assertEqual(outs[2], 0.)
-
-  @keras_parameterized.run_all_keras_modes
-  def test_metrics_correctness_with_weighted_metrics(self):
-    np.random.seed(1337)
-    x = np.array([[[1.], [1.]], [[0.], [0.]]])
-    model = keras.models.Sequential()
-    model.add(
-        keras.layers.TimeDistributed(
-            keras.layers.Dense(1, kernel_initializer='ones'),
-            input_shape=(2, 1)))
-    model.compile(
-        RMSPropOptimizer(learning_rate=0.001),
-        loss='mse',
-        sample_weight_mode='temporal',
-        weighted_metrics=['accuracy', 'mse'],
-        run_eagerly=testing_utils.should_run_eagerly())
-    y = np.array([[[1.], [1.]], [[1.], [1.]]])
-
-    outs = model.evaluate(x, y)
-    self.assertEqual(outs, [0.5, 0.5, 0.5])
-
-    w = np.array([[0., 0.], [0., 0.]])
-    outs = model.evaluate(x, y, sample_weight=w)
-    self.assertEqual(outs, [0., 0., 0.])
-
-    w = np.array([[3., 4.], [1., 2.]])
-    outs = model.evaluate(x, y, sample_weight=w)
-    self.assertArrayNear(outs, [0.75, 0.7, 0.3], .001)
-
-    # Verify that metric value is same with arbitrary weights and batch size.
-    x = np.random.random((50, 2, 1))
-    y = np.random.random((50, 2, 1))
-    w = np.random.random((50, 2))
-    mse1 = model.evaluate(x, y, sample_weight=w, batch_size=5)[2]
-    mse2 = model.evaluate(x, y, sample_weight=w, batch_size=10)[2]
-    self.assertNear(mse1, mse2, err=1e-7)
-
   @keras_parameterized.run_all_keras_modes
   def test_metric_state_reset_between_fit_and_evaluate(self):
     model = keras.Sequential()
@@ -2199,6 +2304,67 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     model.evaluate(x_test, y_test, batch_size=5)
     self.assertEqual(self.evaluate(acc_obj.count), 10)
 
+  @keras_parameterized.run_with_all_model_types(exclude_models=['sequential'])
+  @keras_parameterized.run_all_keras_modes
+  def test_metrics_valid_compile_input_formats(self):
+    inp_1 = keras.layers.Input(shape=(1,), name='input_1')
+    inp_2 = keras.layers.Input(shape=(1,), name='input_2')
+    x = keras.layers.Dense(3, kernel_initializer='ones', trainable=False)
+    out_1 = keras.layers.Dense(
+        1, kernel_initializer='ones', name='output_1', trainable=False)
+    out_2 = keras.layers.Dense(
+        1, kernel_initializer='ones', name='output_2', trainable=False)
+
+    branch_a = [inp_1, x, out_1]
+    branch_b = [inp_2, x, out_2]
+    model = testing_utils.get_multi_io_model(branch_a, branch_b)
+
+    # list of metrics.
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics=[keras.metrics.MeanSquaredError()],
+        weighted_metrics=[keras.metrics.MeanSquaredError()],
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    # list of list of metrics.
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics=[
+            keras.metrics.MeanSquaredError(),
+            [keras.metrics.MeanSquaredError(),
+             keras.metrics.Accuracy()]
+        ],
+        weighted_metrics=[
+            keras.metrics.MeanSquaredError(),
+            [keras.metrics.MeanSquaredError(),
+             keras.metrics.Accuracy()]
+        ],
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    # dict of metrics.
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics={
+            'output_1':
+                keras.metrics.MeanSquaredError(),
+            'output_2': [
+                keras.metrics.MeanSquaredError(),
+                keras.metrics.Accuracy()
+            ],
+        },
+        weighted_metrics={
+            'output_1':
+                keras.metrics.MeanSquaredError(),
+            'output_2': [
+                keras.metrics.MeanSquaredError(),
+                keras.metrics.Accuracy()
+            ],
+        },
+        run_eagerly=testing_utils.should_run_eagerly())
+
   @keras_parameterized.run_all_keras_modes
   def test_invalid_metrics(self):
     num_classes = 5
@@ -2216,6 +2382,17 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
           metrics=metrics_module.CategoricalAccuracy(),
           run_eagerly=testing_utils.should_run_eagerly())
 
+    inp = keras.layers.Input(shape=(1,))
+    x = keras.layers.Dense(3, activation='relu')(inp)
+    out_1 = keras.layers.Dense(1, activation='sigmoid', name='output_1')(x)
+    out_2 = keras.layers.Dense(1, activation='sigmoid', name='output_2')(x)
+    model = keras.models.Model(inp, [out_1, out_2])
+    with self.assertRaisesRegex(
+        ValueError, 'When passing a list of lists as `metrics`, '
+        'it should have one entry per model output. '
+        'The model has 2 outputs, but you passed metrics='):
+      model.compile('rmsprop', loss='mse', metrics=[['mse']])
+
   @keras_parameterized.run_all_keras_modes
   def test_metrics_masking(self):
     if testing_utils.should_run_eagerly():
@@ -2244,40 +2421,44 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
       scores = model.train_on_batch(x, y, sample_weight=w)
       self.assertArrayNear(scores, [0.3328, 0.8], 0.001)
 
-  @tf_test_util.run_deprecated_v1
-  def test_add_metric_with_tensor_on_model_in_graph_mode(self):
-    with self.cached_session():
-      x = keras.layers.Input(shape=(1,))
-      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
-      model = keras.models.Model(x, y)
-      model.add_metric(
-          math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+  @keras_parameterized.run_all_keras_modes
+  def test_add_metric_with_tensor_on_model(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/124303407')
+
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+    model = keras.models.Model(x, y)
+    model.add_metric(
+        math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
 
-      # test with a metric which does not have the standard signature:
-      # (y_true, y_pred, sample_Weight)
+    # test with a metric which does not have the standard signature:
+    # (y_true, y_pred, sample_Weight)
+    with keras.backend.get_graph().as_default():
       model.add_metric(metrics_module.Mean(name='metric_2')(y))
-      model.compile('sgd', loss='mse')
+    model.compile(
+        'sgd', loss='mse', run_eagerly=testing_utils.should_run_eagerly())
 
-      inputs = np.ones(shape=(10, 1))
-      targets = np.ones(shape=(10, 1))
-      history = model.fit(
-          inputs,
-          targets,
-          epochs=2,
-          batch_size=5,
-          validation_data=(inputs, targets))
-      self.assertEqual(history.history['metric_1'][-1], 5)
-      self.assertEqual(history.history['metric_2'][-1], 1)
-      self.assertEqual(history.history['val_metric_1'][-1], 5)
-      self.assertEqual(history.history['val_metric_2'][-1], 1)
+    inputs = np.ones(shape=(10, 1))
+    targets = np.ones(shape=(10, 1))
+    history = model.fit(
+        inputs,
+        targets,
+        epochs=2,
+        batch_size=5,
+        validation_data=(inputs, targets))
+    self.assertEqual(history.history['metric_1'][-1], 5)
+    self.assertEqual(history.history['metric_2'][-1], 1)
+    self.assertEqual(history.history['val_metric_1'][-1], 5)
+    self.assertEqual(history.history['val_metric_2'][-1], 1)
 
-      eval_results = model.evaluate(inputs, targets, batch_size=5)
-      self.assertEqual(eval_results[-1], 1)
-      self.assertEqual(eval_results[-2], 5)
+    eval_results = model.evaluate(inputs, targets, batch_size=5)
+    self.assertEqual(eval_results[-1], 1)
+    self.assertEqual(eval_results[-2], 5)
 
-      model.predict(inputs, batch_size=5)
-      model.train_on_batch(inputs, targets)
-      model.test_on_batch(inputs, targets)
+    model.predict(inputs, batch_size=5)
+    model.train_on_batch(inputs, targets)
+    model.test_on_batch(inputs, targets)
 
   @keras_parameterized.run_all_keras_modes
   def test_add_metric_in_model_call(self):
@@ -2317,6 +2498,7 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     model.train_on_batch(x, y)
     model.test_on_batch(x, y)
 
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_add_metric_in_layer_call(self):
 
@@ -2332,9 +2514,11 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
             math_ops.reduce_sum(inputs), name='metric_1', aggregation='mean')
         return inputs + 1
 
-    model = keras.Sequential()
-    model.add(TestLayer(input_shape=(1,)))
-    model.add(keras.layers.Dense(2, kernel_initializer='ones'))
+    layers = [
+        TestLayer(input_shape=(1,)),
+        keras.layers.Dense(2, kernel_initializer='ones')
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(1,))
     model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
                   run_eagerly=testing_utils.should_run_eagerly())
 
@@ -2344,60 +2528,53 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     self.assertEqual(history.history['metric_1'][-1], 5)
     self.assertAlmostEqual(history.history['val_metric_1'][-1], 5, 0)
 
-  @tf_test_util.run_deprecated_v1
+  @keras_parameterized.run_all_keras_modes
   def test_model_metrics_list(self):
-    with self.cached_session():
-      x = keras.layers.Input(shape=(1,))
-      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
-      model = keras.models.Model(x, y)
-      model.add_metric(
-          math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+    model = keras.models.Model(x, y)
+    model.add_metric(
+        math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+    with keras.backend.get_graph().as_default():
       model.add_metric(metrics_module.Mean(name='metric_2')(y))
-      model.compile('sgd', loss='mse', metrics=['acc'])
-
-      # Verify that the metrics added using `compile` and `add_metric` API are
-      # included
-      self.assertEqual(model._compile_metrics, ['acc'])
-      names = []
-      for m in model.metrics:
-        if isinstance(m, metrics_module.Metric):
-          names.append(m.name)
-        else:
-          names.append(m.__name__)
-      self.assertEqual(names, ['binary_accuracy', 'metric_1', 'metric_2'])
-
-  def test_model_eager_metrics_list(self):
-    with context.eager_mode():
+    model.compile(
+        'sgd',
+        loss='mse',
+        metrics=[metrics_module.Accuracy('acc')],
+        run_eagerly=testing_utils.should_run_eagerly())
 
-      class TestModel(keras.Model):
+    # Verify that the metrics added using `compile` and `add_metric` API are
+    # included
+    self.assertEqual([m.name for m in model._compile_metrics], ['acc'])
+    self.assertEqual([m.name for m in model.metrics],
+                     ['acc', 'metric_1', 'metric_2'])
 
-        def __init__(self):
-          super(TestModel, self).__init__(name='test_model')
-          self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+  @keras_parameterized.run_all_keras_modes
+  def test_model_metrics_list_in_call(self):
 
-        def call(self, x):
-          self.add_metric(
-              math_ops.reduce_sum(x), name='metric_1', aggregation='mean')
-          return self.dense1(x)
+    class TestModel(keras.Model):
 
-      model = TestModel()
-      model.compile(
-          loss='mse',
-          optimizer=RMSPropOptimizer(0.01),
-          metrics=['acc'],
-          run_eagerly=True)
-      x = np.ones(shape=(10, 1))
-      y = np.ones(shape=(10, 2))
-      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+
+      def call(self, x):
+        self.add_metric(
+            math_ops.reduce_sum(x), name='metric_1', aggregation='mean')
+        return self.dense1(x)
 
-      self.assertEqual(model._compile_metrics, ['acc'])
-      names = []
-      for m in model.metrics:
-        if isinstance(m, metrics_module.Metric):
-          names.append(m.name)
-        else:
-          names.append(m.__name__)
-      self.assertEqual(names, ['categorical_accuracy', 'metric_1'])
+    model = TestModel()
+    model.compile(
+        loss='mse',
+        optimizer=RMSPropOptimizer(0.01),
+        metrics=[metrics_module.Accuracy('acc')],
+        run_eagerly=testing_utils.should_run_eagerly())
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+
+    self.assertEqual([m.name for m in model._compile_metrics], ['acc'])
+    self.assertEqual([m.name for m in model.metrics], ['acc', 'metric_1'])
 
   @keras_parameterized.run_all_keras_modes
   def test_multiple_add_metric_calls(self):
@@ -2435,28 +2612,34 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     model.train_on_batch(x, y)
     model.test_on_batch(x, y)
 
-  def test_invalid_metric_tensor_in_call(self):
-    with context.eager_mode():
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_invalid_metric_tensor(self):
+
+    class TestLayer(keras.layers.Layer):
 
-      class TestLayer(keras.layers.Layer):
+      def build(self, input_shape):
+        self.built = True
 
-        def call(self, inputs):
-          self.add_metric(metrics_module.Mean(name='metric_1')(inputs))
-          return inputs + 1
+      def call(self, inputs):
+        self.add_metric(math_ops.reduce_mean(inputs), name='metric_1')
+        return inputs + 1
 
-      model = keras.Sequential()
-      model.add(TestLayer(input_shape=(1,)))
-      model.add(keras.layers.Dense(2, kernel_initializer='ones'))
-      model.compile(
-          loss='mse', optimizer=RMSPropOptimizer(0.01), run_eagerly=True)
+    layers = [TestLayer(input_shape=(1,))]
+    layers.append(keras.layers.Dense(2, kernel_initializer='ones'))
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
 
-      x = np.ones(shape=(10, 1))
-      y = np.ones(shape=(10, 2))
-      with self.assertRaisesRegexp(
-          ValueError,
-          'We do not support adding an aggregated metric tensor in `call` in '
-          'eager execution.'):
-        model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    with self.assertRaisesRegexp(
+        ValueError,
+        'We do not support adding an aggregated metric result tensor that is '
+        'not the output of a `tf.keras.metrics.Metric` metric instance.'):
+      model = testing_utils.get_model_from_layers(layers, input_shape=(1,))
+      model.compile(
+          loss='mse',
+          optimizer=RMSPropOptimizer(0.01),
+          run_eagerly=testing_utils.should_run_eagerly())
+      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
 
   @keras_parameterized.run_all_keras_modes
   def test_duplicate_metric_name_in_add_metric(self):
@@ -2486,10 +2669,7 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
       model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
 
   @keras_parameterized.run_all_keras_modes
-  def test_multiple_no_name_input_to_add_metric(self):
-    # TODO(kaftan) Test seems to not work, file ticket
-    if testing_utils.should_run_eagerly() and context.executing_eagerly():
-      self.skipTest('Skipping running model eagerly.')
+  def test_add_metric_without_name(self):
 
     class TestModel(keras.Model):
 
@@ -2498,7 +2678,6 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
         self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
 
       def call(self, x):
-        self.add_metric(math_ops.reduce_sum(x), aggregation='mean')
         self.add_metric(math_ops.reduce_sum(x), aggregation='mean')
         return self.dense1(x)
 
@@ -2507,8 +2686,58 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
                   run_eagerly=testing_utils.should_run_eagerly())
     x = np.ones(shape=(10, 1))
     y = np.ones(shape=(10, 2))
-    model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
-    self.assertEqual([m.name for m in model.metrics], ['mean', 'mean_1'])
+
+    with self.assertRaisesRegex(ValueError,
+                                'Please provide a name for your metric like'):
+      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+
+  @keras_parameterized.run_all_keras_modes
+  def test_add_metric_correctness(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/124303407')
+
+    inputs = keras.Input(shape=(1,))
+    targets = keras.Input(shape=(1,))
+
+    class Bias(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.bias = self.add_variable('bias', (1,), initializer='zeros')
+        self.mae = metrics_module.MeanAbsoluteError(name='mae_1')
+
+      def call(self, inputs):
+        outputs = inputs + self.bias
+        self.add_metric(self.mae(targets, outputs), name='mae_1')
+        return outputs
+
+    outputs = Bias()(inputs)
+    model = keras.Model(inputs, outputs)
+
+    model.add_metric(
+        metrics_module.mean_absolute_error(targets, outputs),
+        name='mae_2',
+        aggregation='mean')
+
+    # If we want to use the metric class instance as shown below, we will need
+    # to add graph scope as the reduction logic involves some eager mode checks.
+    with keras.backend.get_graph().as_default():
+      model.add_metric(
+          metrics_module.MeanAbsoluteError(name='mae_3')(targets, outputs))
+
+    model.compile(
+        loss='mae',
+        optimizer=keras.optimizer_v2.gradient_descent.SGD(0.1),
+        metrics=[metrics_module.MeanAbsoluteError(name='mae_4')],
+        target_tensors=[targets],
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    x = np.array([[0.], [1.], [2.]])
+    y = np.array([[0.5], [2.], [3.5]])
+    history = model.fit(x, y, batch_size=3, epochs=5)
+
+    expected_val = [1., 0.9, 0.8, 0.7, 0.6]
+    for key in ['loss', 'mae_1', 'mae_2', 'mae_3', 'mae_4']:
+      self.assertAllClose(history.history[key], expected_val, 1e-3)
 
   @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
   def test_a1_total_loss_available_with_dict_dataset(self):
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index 895a592bd922133478e23f6e32cf2805826562d2..8c9b49cfcd3ff5bb940ba2879a5acdd47f2573f9 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Training-related utilities.
-"""
+"""Training-related utilities."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -21,17 +20,18 @@ from __future__ import print_function
 import abc
 import collections
 from collections import OrderedDict
-import copy
 
 import numpy as np
 import six
 
+from tensorflow.python import tf2
 from tensorflow.python.data.experimental.ops import cardinality
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -186,10 +186,7 @@ def slice_arrays(arrays, indices, contiguous=True):
   return slices
 
 
-def check_num_samples(ins,
-                      batch_size=None,
-                      steps=None,
-                      steps_name='steps'):
+def check_num_samples(ins, batch_size=None, steps=None, steps_name='steps'):
   """Determine the number of samples provided for training and evaluation.
 
   The number of samples is not defined when running with `steps`,
@@ -198,9 +195,8 @@ def check_num_samples(ins,
   Arguments:
       ins: List of tensors to be fed to the Keras function.
       batch_size: Integer batch size or `None` if not defined.
-      steps: Total number of steps (batches of samples)
-          before declaring `_predict_loop` finished.
-          Ignored with the default value of `None`.
+      steps: Total number of steps (batches of samples) before declaring
+        `_predict_loop` finished. Ignored with the default value of `None`.
       steps_name: The public API's parameter name for `steps`.
 
   Raises:
@@ -214,13 +210,10 @@ def check_num_samples(ins,
       processed based on the size of the first dimension of the
       first input numpy array. When steps is not `None` and
       `batch_size` is `None`, returns `None`.
-
-  Raises:
-      ValueError: In case of invalid arguments.
   """
   if steps is not None and batch_size is not None:
-    raise ValueError(
-        'If ' + steps_name + ' is set, the `batch_size` must be None.')
+    raise ValueError('If ' + steps_name +
+                     ' is set, the `batch_size` must be None.')
   if check_steps_argument(ins, steps, steps_name):
     return None
   if hasattr(ins[0], 'shape'):
@@ -233,9 +226,8 @@ def standardize_single_array(x, expected_shape=None):
   if x is None:
     return None
 
-  if (x.shape is not None
-      and len(x.shape) == 1
-      and (expected_shape is None or len(expected_shape) != 1)):
+  if (x.shape is not None and len(x.shape) == 1 and
+      (expected_shape is None or len(expected_shape) != 1)):
     if tensor_util.is_tensor(x):
       x = array_ops.expand_dims(x, axis=1)
     else:
@@ -259,9 +251,8 @@ def standardize_input_data(data,
       data: User-provided input data (polymorphic).
       names: List of expected array names.
       shapes: Optional list of expected array shapes.
-      check_batch_axis: Boolean; whether to check that
-          the batch axis of the arrays matches the expected
-          value found in `shapes`.
+      check_batch_axis: Boolean; whether to check that the batch axis of the
+        arrays matches the expected value found in `shapes`.
       exception_prefix: String prefix used for exception formatting.
 
   Returns:
@@ -273,8 +264,9 @@ def standardize_input_data(data,
   if not names:
     if (data is not None and hasattr(data, '__len__') and len(data) and
         not isinstance(data, dict)):
-      raise ValueError('Error when checking model ' + exception_prefix + ': '
-                       'expected no data, but got:', data)
+      raise ValueError(
+          'Error when checking model ' + exception_prefix + ': '
+          'expected no data, but got:', data)
     return []
   if data is None:
     return [None for _ in range(len(names))]
@@ -302,8 +294,9 @@ def standardize_input_data(data,
     data = data.values if data.__class__.__name__ == 'DataFrame' else data
     data = [data]
   if shapes is not None:
-    data = [standardize_single_array(x, shape)
-            for (x, shape) in zip(data, shapes)]
+    data = [
+        standardize_single_array(x, shape) for (x, shape) in zip(data, shapes)
+    ]
   else:
     data = [standardize_single_array(x) for x in data]
 
@@ -316,11 +309,11 @@ def standardize_input_data(data,
                        'but instead got the following list of ' +
                        str(len(data)) + ' arrays: ' + str(data)[:200] + '...')
     elif len(names) > 1:
-      raise ValueError(
-          'Error when checking model ' + exception_prefix +
-          ': you are passing a list as input to your model, '
-          'but the model expects a list of ' + str(len(names)) +
-          ' Numpy arrays instead. The list you passed was: ' + str(data)[:200])
+      raise ValueError('Error when checking model ' + exception_prefix +
+                       ': you are passing a list as input to your model, '
+                       'but the model expects a list of ' + str(len(names)) +
+                       ' Numpy arrays instead. The list you passed was: ' +
+                       str(data)[:200])
     elif len(data) == 1 and not hasattr(data[0], 'shape'):
       raise TypeError('Error when checking model ' + exception_prefix +
                       ': data should be a Numpy array, or list/dict of '
@@ -350,10 +343,10 @@ def standardize_input_data(data,
           shape = shape[1:]
         for dim, ref_dim in zip(data_shape, shape):
           if ref_dim != dim and ref_dim is not None and dim is not None:
-            raise ValueError(
-                'Error when checking ' + exception_prefix + ': expected ' +
-                names[i] + ' to have shape ' + str(shape) +
-                ' but got array with shape ' + str(data_shape))
+            raise ValueError('Error when checking ' + exception_prefix +
+                             ': expected ' + names[i] + ' to have shape ' +
+                             str(shape) + ' but got array with shape ' +
+                             str(data_shape))
   return data
 
 
@@ -395,10 +388,10 @@ def standardize_sample_or_class_weights(x_weight, output_names, weight_type):
       x_weights.append(x_weight.get(name))
     return x_weights
   else:
-    raise TypeError(
-        'The model has multiple outputs, so `' + weight_type + '` '
-        'should be either a list or a dict. '
-        'Provided `' + weight_type + '` type not understood: ' + str(x_weight))
+    raise TypeError('The model has multiple outputs, so `' + weight_type + '` '
+                    'should be either a list or a dict. '
+                    'Provided `' + weight_type + '` type not understood: ' +
+                    str(x_weight))
 
 
 def standardize_class_weights(class_weight, output_names):
@@ -429,8 +422,11 @@ def check_array_lengths(inputs, targets, weights=None):
     if x is None:
       return {}
     else:
-      return set([y.shape[0] for y in x
-                  if y is not None and not tensor_util.is_tensor(y)])
+      return set([
+          y.shape[0]
+          for y in x
+          if y is not None and not tensor_util.is_tensor(y)
+      ])
 
   set_x = set_of_lengths(inputs)
   set_y = set_of_lengths(targets)
@@ -485,8 +481,9 @@ def check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
       continue
     if losses.is_categorical_crossentropy(loss):
       if y.shape[-1] == 1:
-        raise ValueError('You are passing a target array of shape ' + str(
-            y.shape) + ' while using as loss `categorical_crossentropy`. '
+        raise ValueError('You are passing a target array of shape ' +
+                         str(y.shape) +
+                         ' while using as loss `categorical_crossentropy`. '
                          '`categorical_crossentropy` expects '
                          'targets to be binary matrices (1s and 0s) '
                          'of shape (samples, classes). '
@@ -521,15 +518,15 @@ def collect_per_output_metric_info(metrics,
                                    output_names,
                                    output_shapes,
                                    loss_fns,
-                                   sample_weights=None):
+                                   is_weighted=False):
   """Maps metric names and functions to model outputs.
 
   Arguments:
-      metrics: a list or dict of metric functions.
+      metrics: a list or a list of lists or a dict of metric functions.
       output_names: a list of the names (strings) of model outputs.
       output_shapes: a list of the shapes (strings) of model outputs.
       loss_fns: a list of the loss functions corresponding to the model outputs.
-      sample_weights: a list of weights to be applied on the model outputs.
+      is_weighted: Boolean indicating whether the given metrics are weighted.
 
   Returns:
       A list (one entry per model output) of dicts.
@@ -551,15 +548,30 @@ def collect_per_output_metric_info(metrics,
   """
   if not metrics:
     return [{} for _ in output_names]
+
   if isinstance(metrics, list):
-    # we then apply all metrics to all outputs.
-    nested_metrics = [copy.copy(metrics) for _ in output_names]
+    any_sub_list = any(isinstance(m, list) for m in metrics)
+    if any_sub_list:
+      if len(metrics) != len(output_names):
+        raise ValueError('When passing a list of lists as `metrics`, '
+                         'it should have one entry per model output. '
+                         'The model has ' + str(len(output_names)) +
+                         ' outputs, but you passed metrics=' + str(metrics))
+      # User has provided a list of len = len(outputs).
+      nested_metrics = [generic_utils.to_list(m) for m in metrics]
+    else:
+      # If it is a single list we then apply all metrics to all outputs.
+      if len(output_names) > 1:
+        nested_metrics = []
+        for _ in output_names:
+          nested_metrics.append(
+              [metrics_module.clone_metric(m) for m in metrics])
+      else:
+        nested_metrics = [metrics]
   elif isinstance(metrics, dict):
     nested_metrics = []
     for name in output_names:
-      output_metrics = metrics.get(name, [])
-      if not isinstance(output_metrics, list):
-        output_metrics = [output_metrics]
+      output_metrics = generic_utils.to_list(metrics.get(name, []))
       nested_metrics.append(output_metrics)
   else:
     raise TypeError('Type of `metrics` argument not understood. '
@@ -569,9 +581,7 @@ def collect_per_output_metric_info(metrics,
   for i, metrics in enumerate(nested_metrics):
     metrics_dict = OrderedDict()
     for metric in metrics:
-      weighted = False if (sample_weights is None) else (
-          sample_weights[i] is not None)
-      metric_name = get_metric_name(metric, weighted)
+      metric_name = get_metric_name(metric, is_weighted)
       metric_fn = get_metric_function(
           metric, output_shape=output_shapes[i], loss_fn=loss_fns[i])
 
@@ -584,7 +594,7 @@ def collect_per_output_metric_info(metrics,
       stateful_fn = metric_fn
       if not is_stateful:
         stateful_fn = metrics_module.MeanMetricWrapper(
-            metric_fn, name=metric_fn.__name__)
+            metric_fn, name=metric_name)
 
       metrics_dict[metric_name] = (metric_fn, stateful_fn)
     per_output_metrics.append(metrics_dict)
@@ -624,8 +634,7 @@ def weighted_masked_objective(fn):
   `fn(y_true, y_pred, weights, mask)`.
 
   Arguments:
-      fn: The objective function to wrap,
-          with signature `fn(y_true, y_pred)`.
+      fn: The objective function to wrap, with signature `fn(y_true, y_pred)`.
 
   Returns:
       A function with signature `fn(y_true, y_pred, weights, mask)`.
@@ -695,10 +704,10 @@ def standardize_weights(y,
       y: Numpy array of model targets to be weighted.
       sample_weight: User-provided `sample_weight` argument.
       class_weight: User-provided `class_weight` argument.
-      sample_weight_mode: One of `None` or `"temporal"`.
-          `"temporal"` indicated that we expect 2D weight data
-          that will be applied to the last 2 dimensions of
-          the targets (i.e. we are weighting timesteps, not samples).
+      sample_weight_mode: One of `None` or `"temporal"`. `"temporal"` indicated
+        that we expect 2D weight data that will be applied to the last 2
+        dimensions of the targets (i.e. we are weighting timesteps, not
+        samples).
 
   Returns:
       A numpy array of target weights, one entry per sample to weight.
@@ -739,17 +748,17 @@ def standardize_weights(y,
 
   if sample_weight is not None:
     if len(sample_weight.shape) > len(y.shape):
-      raise ValueError(
-          'Found a sample_weight with shape' + str(sample_weight.shape) + '.'
-          'Expected sample_weight with rank '
-          'less than or equal to ' + str(len(y.shape)))
+      raise ValueError('Found a sample_weight with shape' +
+                       str(sample_weight.shape) + '.'
+                       'Expected sample_weight with rank '
+                       'less than or equal to ' + str(len(y.shape)))
 
     if (not tensor_util.is_tensor(sample_weight) and
         y.shape[:sample_weight.ndim] != sample_weight.shape):
-      raise ValueError(
-          'Found a sample_weight array with shape ' + str(sample_weight.shape) +
-          ' for an input with shape ' + str(y.shape) + '. '
-          'sample_weight cannot be broadcast.')
+      raise ValueError('Found a sample_weight array with shape ' +
+                       str(sample_weight.shape) + ' for an input with shape ' +
+                       str(y.shape) + '. '
+                       'sample_weight cannot be broadcast.')
 
   # Class weights applied per-sample.
   class_sample_weight = None
@@ -773,10 +782,10 @@ def standardize_weights(y,
       # subtract the sets to pick all missing classes
       existing_classes = set(y_classes)
       existing_class_weight = set(class_weight.keys())
-      raise ValueError('`class_weight` must contain all classes in the data.'
-                       ' The classes %s exist in the data but not in '
-                       '`class_weight`.' %
-                       (existing_classes - existing_class_weight))
+      raise ValueError(
+          '`class_weight` must contain all classes in the data.'
+          ' The classes %s exist in the data but not in '
+          '`class_weight`.' % (existing_classes - existing_class_weight))
 
   if class_sample_weight is not None and sample_weight is not None:
     # Multiply weights if both are provided.
@@ -812,21 +821,29 @@ def get_metric_name(metric, weighted=False):
   Returns:
       The metric name.
   """
-  metric_name_prefix = 'weighted_' if weighted else ''
-  if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
-    if metric in ('accuracy', 'acc'):
-      suffix = 'acc'
-    elif metric in ('crossentropy', 'ce'):
-      suffix = 'ce'
+  if tf2.enabled():
+    # We keep the string that the user has set in compile as the metric name.
+    if isinstance(metric, six.string_types):
+      return metric
+
+    metric = metrics_module.get(metric)
+    return metric.name if hasattr(metric, 'name') else metric.__name__
   else:
-    metric_fn = metrics_module.get(metric)
-    # Get metric name as string
-    if hasattr(metric_fn, 'name'):
-      suffix = metric_fn.name
+    metric_name_prefix = 'weighted_' if weighted else ''
+    if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
+      if metric in ('accuracy', 'acc'):
+        suffix = 'acc'
+      elif metric in ('crossentropy', 'ce'):
+        suffix = 'ce'
     else:
-      suffix = metric_fn.__name__
-  metric_name = metric_name_prefix + suffix
-  return metric_name
+      metric_fn = metrics_module.get(metric)
+      # Get metric name as string
+      if hasattr(metric_fn, 'name'):
+        suffix = metric_fn.name
+      else:
+        suffix = metric_fn.__name__
+    metric_name = metric_name_prefix + suffix
+    return metric_name
 
 
 def get_metric_function(metric, output_shape=None, loss_fn=None):
@@ -834,8 +851,8 @@ def get_metric_function(metric, output_shape=None, loss_fn=None):
 
   Arguments:
       metric: Metric function name or reference.
-      output_shape: The shape of the output that this metric
-          will be calculated for.
+      output_shape: The shape of the output that this metric will be calculated
+        for.
       loss_fn: The loss function used.
 
   Returns:
@@ -912,13 +929,13 @@ def validate_dataset_input(x, y, sample_weight, validation_split=None):
   Arguments:
     x: Input data. A `tf.data` dataset or iterator.
     y: Target data. It could be either Numpy array(s) or TensorFlow tensor(s).
-        Expected to be `None` when `x` is a dataset iterator.
-    sample_weight: An optional sample-weight array passed by the user to
-        weight the importance of each sample in `x`. Expected to be `None` when
-        `x` is a dataset iterator
-    validation_split: Float between 0 and 1. Fraction of the training data to
-        be used as validation data. Expected to be `None` when `x` is a dataset
-        iterator.
+      Expected to be `None` when `x` is a dataset iterator.
+    sample_weight: An optional sample-weight array passed by the user to weight
+      the importance of each sample in `x`. Expected to be `None` when `x` is a
+      dataset iterator
+    validation_split: Float between 0 and 1. Fraction of the training data to be
+      used as validation data. Expected to be `None` when `x` is a dataset
+      iterator.
 
   Raises:
     ValueError: if argument `y` or `sample_weight` or `validation_split` are
@@ -944,8 +961,7 @@ def validate_dataset_input(x, y, sample_weight, validation_split=None):
         'Received: x=%s, validation_split=%f' % (x, validation_split))
 
 
-def check_generator_arguments(y=None,
-                              sample_weight=None,
+def check_generator_arguments(y=None, sample_weight=None,
                               validation_split=None):
   """Validates arguments passed when using a generator."""
   if y is not None:
@@ -985,8 +1001,8 @@ def check_steps_argument(input_data, steps, steps_name):
         but not provided.
   """
   # TODO(fchollet): allow datasets with steps=None if cardinality is known.
-  is_x_iterator = isinstance(input_data, (iterator_ops.Iterator,
-                                          iterator_ops.EagerIterator))
+  is_x_iterator = isinstance(
+      input_data, (iterator_ops.Iterator, iterator_ops.EagerIterator))
   if (input_data is None or is_x_iterator or has_symbolic_tensors(input_data) or
       (isinstance(input_data, list) and not input_data)):
     if steps is None:
@@ -1104,6 +1120,95 @@ def prepare_sample_weights(output_names, sample_weight_mode,
   return sample_weights, sample_weight_modes
 
 
+def prepare_loss_functions(loss, output_names):
+  """Converts loss to a list of loss functions.
+
+  Arguments:
+      loss: String (name of objective function), objective function or
+        `tf.losses.Loss` instance. See `tf.losses`. If the model has multiple
+        outputs, you can use a different loss on each output by passing a
+        dictionary or a list of losses. The loss value that will be minimized by
+        the model will then be the sum of all individual losses.
+      output_names: List of model output names.
+
+  Returns:
+      A list of loss objective functions.
+
+  Raises:
+      ValueError: If loss is a dict with keys not in model output names,
+          or if loss is a list with len not equal to model outputs.
+  """
+  if isinstance(loss, collections.Mapping):
+    for name in loss:
+      if name not in output_names:
+        raise ValueError('Unknown entry in loss dictionary: {}. Only expected '
+                         'following keys: {}'.format(name, output_names))
+    loss_functions = []
+    for name in output_names:
+      if name not in loss:
+        logging.warning(
+            'Output {0} missing from loss dictionary. We assume '
+            'this was done on purpose. The fit and evaluate APIs will not be '
+            'expecting any data to be passed to {0}.'.format(name))
+      loss_functions.append(get_loss_function(loss.get(name, None)))
+  elif isinstance(loss, six.string_types):
+    loss_functions = [get_loss_function(loss) for _ in output_names]
+  elif isinstance(loss, collections.Sequence):
+    if len(loss) != len(output_names):
+      raise ValueError('When passing a list as loss, it should have one entry '
+                       'per model outputs. The model has {} outputs, but you '
+                       'passed loss={}'.format(len(output_names), loss))
+    loss_functions = nest.map_structure(get_loss_function, loss)
+  else:
+    loss_functions = [get_loss_function(loss) for _ in range(len(output_names))]
+
+  return loss_functions
+
+
+def prepare_loss_weights(output_names, loss_weights=None):
+  """Converts loss weights to a list of loss weights.
+
+  Arguments:
+      output_names: List of model output names.
+      loss_weights: Optional list or dictionary specifying scalar coefficients
+        (Python floats) to weight the loss contributions of different model
+        outputs. The loss value that will be minimized by the model will then be
+        the *weighted sum* of all individual losses, weighted by the
+          `loss_weights` coefficients. If a list, it is expected to have a 1:1
+            mapping to the model's outputs. If a dict, it is expected to map
+            output names (strings) to scalar coefficients.
+
+  Returns:
+      A list of loss weights of python floats.
+
+  Raises:
+      ValueError: If loss weight is a dict with key not in model output names,
+          or if loss is a list with len not equal to model outputs.
+  """
+  if loss_weights is None:
+    weights_list = [1.] * len(output_names)
+  elif isinstance(loss_weights, dict):
+    for name in loss_weights:
+      if name not in output_names:
+        raise ValueError('Unknown entry in loss_weights dictionary: {}. '
+                         'Only expected the following keys: {}'.format(
+                             name, output_names))
+    weights_list = [loss_weights.get(name, 1.) for name in output_names]
+  elif isinstance(loss_weights, list):
+    if len(loss_weights) != len(output_names):
+      raise ValueError('When passing a list as loss_weights, '
+                       'it should have one entry per model output. '
+                       'The model has ' + str(len(output_names)) +
+                       ' outputs, but you passed loss_weights=' +
+                       str(loss_weights))
+    weights_list = loss_weights
+  else:
+    raise TypeError('Could not interpret loss_weights argument: ' +
+                    str(loss_weights) + ' - expected a list of dicts.')
+
+  return weights_list
+
+
 # TODO(rohanj): This is a hack to get around not depending on feature_column and
 # create a cyclical dependency. Figure out a cleaner solution
 def is_feature_layer(layer):
@@ -1113,8 +1218,7 @@ def is_feature_layer(layer):
 
 def is_eager_dataset_or_iterator(data):
   return context.executing_eagerly() and isinstance(
-      data, (dataset_ops.DatasetV1,
-             dataset_ops.DatasetV2,
+      data, (dataset_ops.DatasetV1, dataset_ops.DatasetV2,
              iterator_ops.EagerIterator))
 
 
@@ -1251,10 +1355,8 @@ def verify_dataset_shuffled(x):
 
 
 def is_dataset_or_iterator(data):
-  return isinstance(data, (dataset_ops.DatasetV1,
-                           dataset_ops.DatasetV2,
-                           iterator_ops.EagerIterator,
-                           iterator_ops.Iterator))
+  return isinstance(data, (dataset_ops.DatasetV1, dataset_ops.DatasetV2,
+                           iterator_ops.EagerIterator, iterator_ops.Iterator))
 
 
 def get_iterator(dataset):
@@ -1267,7 +1369,7 @@ def get_iterator(dataset):
 def initialize_iterator(iterator):
   init_op = iterator.initializer
   if not context.executing_eagerly():
-    K.get_session().run(init_op)
+    K.get_session((init_op,)).run(init_op)
 
 
 def extract_tensors_from_dataset(dataset):
@@ -1414,7 +1516,10 @@ class ModelInputs(object):
         # we have. The user should call `model._set_inputs(placeholders)`
         # to specify custom placeholders if the need arises.
         shape = (None,) + tuple(v.shape[1:])
-        v = K.placeholder(shape=shape, name=k)
+        dtype = dtypes.as_dtype(v.dtype)
+        if dtype.is_floating:
+          dtype = K.floatx()
+        v = K.placeholder(shape=shape, name=k, dtype=dtype)
       elif isinstance(v, tensor_shape.TensorShape):
         shape = (None,) + tuple(v.as_list()[1:])
         v = K.placeholder(shape=shape, name=k)
@@ -1450,7 +1555,7 @@ def get_input_shape_and_dtype(layer):
       does not have a defined input shape.
 
   Raises:
-    ValueError: in case an empty Sequential or Graph Network is passed.
+    ValueError: in case an empty Sequential or Functional model is passed.
   """
 
   def _is_graph_model(layer):
diff --git a/tensorflow/python/keras/engine/training_utils_test.py b/tensorflow/python/keras/engine/training_utils_test.py
index e472dc3f2cdd87f114e9b398560dd61f9cf33337..30e3d1f3e4029ce837647352f19fd70486d7bf1a 100644
--- a/tensorflow/python/keras/engine/training_utils_test.py
+++ b/tensorflow/python/keras/engine/training_utils_test.py
@@ -25,7 +25,9 @@ import numpy as np
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras import backend
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import training_utils
@@ -45,10 +47,11 @@ class ModelInputsTest(test.TestCase):
     vals = model_inputs.get_symbolic_inputs(return_single_as_list=True)
     self.assertEqual(1, len(vals))
     self.assertTrue(tensor_util.is_tensor(vals[0]))
+    self.assertEqual(backend.floatx(), vals[0].dtype)
 
   def test_single_thing_eager(self):
     with context.eager_mode():
-      a = np.ones(10)
+      a = np.ones(10, dtype=np.int32)
       model_inputs = training_utils.ModelInputs(a)
       self.assertEqual(['input_1'], model_inputs.get_input_names())
       val = model_inputs.get_symbolic_inputs()
@@ -56,6 +59,7 @@ class ModelInputsTest(test.TestCase):
       vals = model_inputs.get_symbolic_inputs(return_single_as_list=True)
       self.assertEqual(1, len(vals))
       self.assertTrue(tf_utils.is_symbolic_tensor(vals[0]))
+      self.assertEqual(dtypes.int32, vals[0].dtype)
 
   def test_list(self):
     a = [np.ones(10), np.ones(20)]
diff --git a/tensorflow/python/keras/integration_test.py b/tensorflow/python/keras/integration_test.py
index bb889f48b10aa77dd78108b3879c595c958e2b92..7250db2f99bcc68ca562564ce798c9f9f7020c35 100644
--- a/tensorflow/python/keras/integration_test.py
+++ b/tensorflow/python/keras/integration_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 import numpy as np
 
 from tensorflow.python import keras
@@ -47,10 +49,11 @@ class VectorClassificationIntegrationTest(keras_parameterized.TestCase):
          keras.layers.Dropout(0.1),
          keras.layers.Dense(y_train.shape[-1], activation='softmax')],
         input_shape=x_train.shape[1:])
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
-                  metrics=['accuracy'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
     history = model.fit(x_train, y_train, epochs=10, batch_size=10,
                         validation_data=(x_train, y_train),
                         verbose=2)
@@ -82,10 +85,11 @@ class VectorClassificationIntegrationTest(keras_parameterized.TestCase):
     y = base_model(x)
     y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
     model = keras.models.Model(x, y)
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
-                  metrics=['accuracy'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
     if not testing_utils.should_run_eagerly():
       self.assertEqual(len(model.losses), 2)
       self.assertEqual(len(model.updates), 2)
@@ -119,10 +123,11 @@ class TimeseriesClassificationIntegrationTest(keras_parameterized.TestCase):
     ]
     model = testing_utils.get_model_from_layers(
         layers, input_shape=x_train.shape[1:])
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
-                  metrics=['accuracy'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
     history = model.fit(x_train, y_train, epochs=15, batch_size=10,
                         validation_data=(x_train, y_train),
                         verbose=2)
@@ -147,10 +152,11 @@ class TimeseriesClassificationIntegrationTest(keras_parameterized.TestCase):
     model.add(keras.layers.RNN(rnn_cell.GRUCell(y_train.shape[-1],
                                                 activation='softmax',
                                                 dtype=dtypes.float32)))
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
-                  metrics=['accuracy'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
     history = model.fit(x_train, y_train, epochs=15, batch_size=10,
                         validation_data=(x_train, y_train),
                         verbose=2)
@@ -184,10 +190,11 @@ class ImageClassificationIntegrationTest(keras_parameterized.TestCase):
     ]
     model = testing_utils.get_model_from_layers(
         layers, input_shape=x_train.shape[1:])
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
-                  metrics=['accuracy'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
     history = model.fit(x_train, y_train, epochs=10, batch_size=10,
                         validation_data=(x_train, y_train),
                         verbose=2)
@@ -235,8 +242,8 @@ class ActivationV2IntegrationTest(keras_parameterized.TestCase):
               validation_data=(x_train, y_train),
               verbose=2)
 
-    output_path = keras.saving.saved_model.export(
-        model, '/tmp/tf_keras_saved_model')
+    output_path = os.path.join(self.get_temp_dir(), 'tf_keras_saved_model')
+    keras.saving.saved_model.export_saved_model(model, output_path)
     loaded_model = keras.saving.saved_model.load_from_saved_model(output_path)
     self.assertEqual(model.summary(), loaded_model.summary())
 
diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py
index 30b919cc0a9038cf0eeb10a240105fbabd591efa..8d80eb85472416dddc16a2ad9db052faee72f2e0 100644
--- a/tensorflow/python/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/layers/convolutional.py
@@ -689,7 +689,7 @@ class Conv2DTranspose(Conv2D):
       - [A guide to convolution arithmetic for deep
         learning](https://arxiv.org/abs/1603.07285v1)
       - [Deconvolutional
-        Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf)
+        Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf)
   """
 
   def __init__(self,
@@ -972,7 +972,7 @@ class Conv3DTranspose(Conv3D):
       - [A guide to convolution arithmetic for deep
         learning](https://arxiv.org/abs/1603.07285v1)
       - [Deconvolutional
-        Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf)
+        Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf)
   """
 
   def __init__(self,
diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py
index 9140ce426e6881b2abbc821e835c1e792c884343..24b61feec27e1af5791fdfb1228bf01aca91a126 100644
--- a/tensorflow/python/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/layers/convolutional_test.py
@@ -23,6 +23,7 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
@@ -420,6 +421,7 @@ class ZeroPaddingTest(keras_parameterized.TestCase):
       keras.layers.ZeroPadding3D(padding=None)
 
 
+@test_util.disable_all_xla('b/124289666')  # align_corners=False unimplemented
 @keras_parameterized.run_all_keras_modes
 class UpSamplingTest(keras_parameterized.TestCase):
 
diff --git a/tensorflow/python/keras/layers/kernelized_test.py b/tensorflow/python/keras/layers/kernelized_test.py
index 300b1b815595474426b7faf400dd9c9b61cd5acd..0bc6e8b144a6ab79ff6b0d0fe936683a5478b9e3 100644
--- a/tensorflow/python/keras/layers/kernelized_test.py
+++ b/tensorflow/python/keras/layers/kernelized_test.py
@@ -288,7 +288,7 @@ class RandomFourierFeaturesTest(test.TestCase, parameterized.TestCase):
       ('other', init_ops.random_uniform_initializer(), 5.0))
   @test_util.run_in_graph_and_eager_modes()
   def test_different_params_similar_approximation(self, initializer, scale):
-    # Layers initialized using different randomness (seed).
+    random_seed.set_random_seed(12345)
     rff_layer1 = kernel_layers.RandomFourierFeatures(
         output_dim=3000,
         kernel_initializer=initializer,
@@ -314,7 +314,7 @@ class RandomFourierFeaturesTest(test.TestCase, parameterized.TestCase):
     # rff_layer(x)^T * rff_layer(y) ~= K(x,y) up to a normalization factor.
     approx_kernel1 = kernelized_utils.inner_product(output_x1, output_y1)
     approx_kernel2 = kernelized_utils.inner_product(output_x2, output_y2)
-    self._assert_all_close(approx_kernel1, approx_kernel2, atol=0.05)
+    self._assert_all_close(approx_kernel1, approx_kernel2, atol=0.08)
 
   @parameterized.named_parameters(
       ('gaussian', 'gaussian', 5.0, _exact_gaussian(stddev=5.0)),
diff --git a/tensorflow/python/keras/layers/merge_test.py b/tensorflow/python/keras/layers/merge_test.py
index f962a75b32421860296476607a5dacdaaf5468cd..7432ad4af886f2cbe20574d2c264d81681b210a6 100644
--- a/tensorflow/python/keras/layers/merge_test.py
+++ b/tensorflow/python/keras/layers/merge_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.keras import backend as K
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
@@ -35,7 +36,8 @@ class MergeLayersTest(keras_parameterized.TestCase):
     i2 = keras.layers.Input(shape=(4, 5))
     i3 = keras.layers.Input(shape=(4, 5))
 
-    o = keras.layers.add([i1, i2, i3])
+    add_layer = keras.layers.Add()
+    o = add_layer([i1, i2, i3])
     self.assertListEqual(o.shape.as_list(), [None, 4, 5])
     model = keras.models.Model([i1, i2, i3], o)
     model.run_eagerly = testing_utils.should_run_eagerly()
@@ -47,6 +49,56 @@ class MergeLayersTest(keras_parameterized.TestCase):
     self.assertEqual(out.shape, (2, 4, 5))
     self.assertAllClose(out, x1 + x2 + x3, atol=1e-4)
 
+    self.assertEqual(
+        add_layer.compute_mask([i1, i2, i3], [None, None, None]), None)
+    self.assertTrue(
+        np.all(
+            K.eval(
+                add_layer.compute_mask(
+                    [i1, i2], [K.variable(x1), K.variable(x2)]))))
+
+    with self.assertRaisesRegexp(ValueError, "`mask` should be a list."):
+      add_layer.compute_mask([i1, i2, i3], x1)
+    with self.assertRaisesRegexp(ValueError, "`inputs` should be a list."):
+      add_layer.compute_mask(i1, [None, None, None])
+    with self.assertRaisesRegexp(ValueError, " should have the same length."):
+      add_layer.compute_mask([i1, i2, i3], [None, None])
+
+  def test_merge_subtract(self):
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 5))
+    i3 = keras.layers.Input(shape=(4, 5))
+
+    subtract_layer = keras.layers.Subtract()
+    o = subtract_layer([i1, i2])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
+    model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
+
+    x1 = np.random.random((2, 4, 5))
+    x2 = np.random.random((2, 4, 5))
+    out = model.predict([x1, x2])
+    self.assertEqual(out.shape, (2, 4, 5))
+    self.assertAllClose(out, x1 - x2, atol=1e-4)
+
+    self.assertEqual(subtract_layer.compute_mask([i1, i2], [None, None]), None)
+    self.assertTrue(
+        np.all(
+            K.eval(
+                subtract_layer.compute_mask(
+                    [i1, i2], [K.variable(x1), K.variable(x2)]))))
+
+    with self.assertRaisesRegexp(ValueError, "`mask` should be a list."):
+      subtract_layer.compute_mask([i1, i2], x1)
+    with self.assertRaisesRegexp(ValueError, "`inputs` should be a list."):
+      subtract_layer.compute_mask(i1, [None, None])
+    with self.assertRaisesRegexp(ValueError,
+                                 "layer should be called on exactly 2 inputs"):
+      subtract_layer([i1, i2, i3])
+    with self.assertRaisesRegexp(ValueError,
+                                 "layer should be called on exactly 2 inputs"):
+      subtract_layer([i1])
+
   def test_merge_multiply(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
@@ -108,7 +160,8 @@ class MergeLayersTest(keras_parameterized.TestCase):
   def test_merge_concatenate(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
-    o = keras.layers.concatenate([i1, i2], axis=1)
+    concat_layer = keras.layers.Concatenate(axis=1)
+    o = concat_layer([i1, i2])
     self.assertListEqual(o.shape.as_list(), [None, 8, 5])
     model = keras.models.Model([i1, i2], o)
     model.run_eagerly = testing_utils.should_run_eagerly()
@@ -119,6 +172,23 @@ class MergeLayersTest(keras_parameterized.TestCase):
     self.assertEqual(out.shape, (2, 8, 5))
     self.assertAllClose(out, np.concatenate([x1, x2], axis=1), atol=1e-4)
 
+    self.assertEqual(concat_layer.compute_mask([i1, i2], [None, None]), None)
+    self.assertTrue(
+        np.all(
+            K.eval(
+                concat_layer.compute_mask(
+                    [i1, i2], [K.variable(x1), K.variable(x2)]))))
+
+    with self.assertRaisesRegexp(ValueError, "`mask` should be a list."):
+      concat_layer.compute_mask([i1, i2], x1)
+    with self.assertRaisesRegexp(ValueError, "`inputs` should be a list."):
+      concat_layer.compute_mask(i1, [None, None])
+    with self.assertRaisesRegexp(ValueError, "should have the same length"):
+      concat_layer.compute_mask([i1, i2], [None])
+    with self.assertRaisesRegexp(ValueError,
+                                 "layer should be called on a list of inputs"):
+      concat_layer(i1)
+
   def test_merge_dot(self):
     i1 = keras.layers.Input(shape=(4,))
     i2 = keras.layers.Input(shape=(4,))
diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index 9404543ed9c2a4a48eaf40eb4190ac21be0e0d9d..ab8469a61167759bc512a4c0fa8bb4847639ec23 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -42,15 +42,15 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
 
 # The following string constants are used by Defun approach for unified backend
 # of LSTM and GRU.
-_DEFUN_API_NAME_ATTRIBUTE = 'experimental_api_implements'
-_DEFUN_DEVICE_ATTRIBUTE = 'experimental_api_preferred_device'
+_DEFUN_API_NAME_ATTRIBUTE = 'api_implements'
+_DEFUN_DEVICE_ATTRIBUTE = 'api_preferred_device'
 _CPU_DEVICE_NAME = 'CPU'
 _GPU_DEVICE_NAME = 'GPU'
 
@@ -78,7 +78,6 @@ class StackedRNNCells(Layer):
   ```
   """
 
-  @checkpointable.no_automatic_dependency_tracking
   def __init__(self, cells, **kwargs):
     for cell in cells:
       if not hasattr(cell, 'call'):
@@ -443,7 +442,8 @@ class RNN(Layer):
   ```
   """
 
-  @checkpointable.no_automatic_dependency_tracking
+  _setattr_tracking = False
+
   def __init__(self,
                cell,
                return_sequences=False,
@@ -468,8 +468,8 @@ class RNN(Layer):
     self.zero_output_for_mask = kwargs.pop('zero_output_for_mask', False)
     super(RNN, self).__init__(**kwargs)
     self.cell = cell
-    if isinstance(cell, checkpointable.Checkpointable):
-      self._track_checkpointable(self.cell, name='cell')
+    if isinstance(cell, trackable.Trackable):
+      self._track_trackable(self.cell, name='cell')
     self.return_sequences = return_sequences
     self.return_state = return_state
     self.go_backwards = go_backwards
@@ -781,9 +781,9 @@ class RNN(Layer):
     else:
       input_shape = K.int_shape(inputs)
     timesteps = input_shape[0] if self.time_major else input_shape[1]
-    if self.unroll and timesteps in [None, 1]:
+    if self.unroll and timesteps is None:
       raise ValueError('Cannot unroll a RNN if the '
-                       'time dimension is undefined or equal to 1. \n'
+                       'time dimension is undefined. \n'
                        '- If using a Sequential model, '
                        'specify the time dimension by passing '
                        'an `input_shape` or `batch_input_shape` '
@@ -2020,6 +2020,8 @@ class UnifiedGRU(GRU):
           True = "after" (default and CuDNN compatible).
   """
 
+  _setattr_tracking = False  # TODO(allenl): Figure out why this is needed.
+
   def __init__(self,
                units,
                activation='tanh',
@@ -2114,8 +2116,7 @@ class UnifiedGRU(GRU):
           time_major=self.time_major,
           zero_output_for_mask=self.zero_output_for_mask)
       # This is a dummy tensor for testing purpose.
-      runtime = constant_op.constant(
-          'unknown', dtype=dtypes.string, name='runtime')
+      runtime = _runtime('unknown')
     else:
       last_output, outputs, runtime, states = self._defun_gru_call(
           inputs, initial_state, training)
@@ -2152,16 +2153,11 @@ class UnifiedGRU(GRU):
             count=3)
 
       inputs *= self._dropout_mask[0]
-    experimental_api_name = 'gru_' + str(uuid.uuid4())
-    defun_standard_gru = _generate_defun_backend(
-        experimental_api_name, _CPU_DEVICE_NAME, standard_gru)
-    defun_cudnn_gru = _generate_defun_backend(
-        experimental_api_name, _GPU_DEVICE_NAME, cudnn_gru)
     if ops.executing_eagerly_outside_functions():
       # Under eager context, the device placement is already known. Prefer the
       # GPU implementation when GPU is available.
       if context.num_gpus() > 0:
-        last_output, outputs, new_h, runtime = defun_cudnn_gru(
+        last_output, outputs, new_h, runtime = cudnn_gru(
             inputs=inputs,
             init_h=initial_state[0],
             kernel=self.cell.kernel,
@@ -2169,7 +2165,7 @@ class UnifiedGRU(GRU):
             bias=self.cell.bias,
             time_major=self.time_major)
       else:
-        last_output, outputs, new_h, runtime = defun_standard_gru(
+        last_output, outputs, new_h, runtime = standard_gru(
             inputs=inputs,
             init_h=initial_state[0],
             kernel=self.cell.kernel,
@@ -2179,6 +2175,11 @@ class UnifiedGRU(GRU):
             recurrent_activation=self.recurrent_activation,
             time_major=self.time_major)
     else:
+      api_name = 'gru_' + str(uuid.uuid4())
+      defun_standard_gru = _generate_defun_backend(
+          api_name, _CPU_DEVICE_NAME, standard_gru)
+      defun_cudnn_gru = _generate_defun_backend(
+          api_name, _GPU_DEVICE_NAME, cudnn_gru)
       # Call the normal GRU impl and register the CuDNN impl function. The
       # grappler will kick in during session execution to optimize the graph.
       last_output, outputs, new_h, runtime = defun_standard_gru(
@@ -2266,8 +2267,7 @@ def standard_gru(inputs, init_h, kernel, recurrent_kernel, bias, activation,
       unroll=False,
       time_major=time_major,
       input_length=timesteps)
-  return last_output, outputs, new_states[0], constant_op.constant(
-      'cpu', dtype=dtypes.string, name='runtime')
+  return last_output, outputs, new_states[0], _runtime('cpu')
 
 
 def cudnn_gru(inputs, init_h, kernel, recurrent_kernel, bias, time_major):
@@ -2309,8 +2309,7 @@ def cudnn_gru(inputs, init_h, kernel, recurrent_kernel, bias, time_major):
   if not time_major:
     outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
   h = h[0]
-  return last_output, outputs, h, constant_op.constant(
-      'cudnn', dtype=dtypes.string, name='runtime')
+  return last_output, outputs, h, _runtime('cudnn')
 
 
 @keras_export('keras.layers.LSTMCell')
@@ -3090,8 +3089,7 @@ class UnifiedLSTM(LSTM):
           input_length=timesteps,
           time_major=self.time_major,
           zero_output_for_mask=self.zero_output_for_mask)
-      runtime = constant_op.constant(
-          'unknown', dtype=dtypes.string, name='runtime')
+      runtime = _runtime('unknown')
     else:
       # Use the new defun approach for backend implementation swap.
       # Note that different implementations need to have same function
@@ -3112,29 +3110,29 @@ class UnifiedLSTM(LSTM):
 
         inputs *= self._dropout_mask[0]
 
-      # Each time a defun function is called, we will give a unique identifiable
-      # API name, so that the grappler won't get confused when it sees multiple
-      # LSTM layer added into same graph, and it will be able to pair up the
-      # different implementations across them.
-      experimental_api_name = 'lstm_' + str(uuid.uuid4())
-      defun_standard_lstm = _generate_defun_backend(
-          experimental_api_name, _CPU_DEVICE_NAME, standard_lstm)
-      defun_cudnn_lstm = _generate_defun_backend(
-          experimental_api_name, _GPU_DEVICE_NAME, cudnn_lstm)
-
       if ops.executing_eagerly_outside_functions():
         # Under eager context, the device placement is already known. Prefer the
         # GPU implementation here.
         if context.num_gpus() > 0:
-          last_output, outputs, new_h, new_c, runtime = defun_cudnn_lstm(
+          last_output, outputs, new_h, new_c, runtime = cudnn_lstm(
               inputs, initial_state[0], initial_state[1], self.cell.kernel,
               self.cell.recurrent_kernel, self.cell.bias, self.time_major)
         else:
-          last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
+          last_output, outputs, new_h, new_c, runtime = standard_lstm(
               inputs, initial_state[0], initial_state[1], self.cell.kernel,
               self.cell.recurrent_kernel, self.cell.bias, self.activation,
               self.recurrent_activation, self.time_major)
       else:
+        # Each time a `tf.function` is called, we will give it a unique
+        # identifiable API name, so that Grappler won't get confused when it
+        # sees multiple LSTM layers added into same graph, and it will be able
+        # to pair up the different implementations across them.
+        api_name = 'lstm_' + str(uuid.uuid4())
+        defun_standard_lstm = _generate_defun_backend(
+            api_name, _CPU_DEVICE_NAME, standard_lstm)
+        defun_cudnn_lstm = _generate_defun_backend(
+            api_name, _GPU_DEVICE_NAME, cudnn_lstm)
+
         # Call the normal LSTM impl and register the CuDNN impl function. The
         # grappler will kick in during session execution to optimize the graph.
         last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
@@ -3266,8 +3264,7 @@ def standard_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias,
       unroll=False,
       time_major=time_major,
       input_length=timesteps)
-  return last_output, outputs, new_states[0], new_states[
-      1], constant_op.constant('cpu', dtype=dtypes.string, name='runtime')
+  return last_output, outputs, new_states[0], new_states[1], _runtime('cpu')
 
 
 def cudnn_lstm(inputs, input_h, input_c, kernel, recurrent_kernel, bias,
@@ -3298,8 +3295,7 @@ def cudnn_lstm(inputs, input_h, input_c, kernel, recurrent_kernel, bias,
   h = h[0]
   c = c[0]
 
-  return last_output, outputs, h, c, constant_op.constant(
-      'cudnn', dtype=dtypes.string, name='runtime')
+  return last_output, outputs, h, c, _runtime('cudnn')
 
 
 def _generate_dropout_mask(ones, rate, training=None, count=1):
@@ -3418,3 +3414,9 @@ def _generate_defun_backend(unique_api_name, preferred_device, func):
   }
   return function.defun_with_attributes(func=func,
                                         attributes=function_attributes)
+
+
+def _runtime(runtime_name):
+  with ops.device('/cpu:0'):
+    return constant_op.constant(
+        runtime_name, dtype=dtypes.string, name='runtime')
diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
index ddea2f4eae49e0a1948ca2de151eaa5f74f6a378..4b7aeb9f22f7c215159a629e3483186f2a96a5e1 100644
--- a/tensorflow/python/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -40,7 +40,8 @@ from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import util as checkpointable_util
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import util as trackable_util
 from tensorflow.python.util import nest
 
 # Used for nested input/output/state RNN test.
@@ -715,7 +716,7 @@ class RNNTest(keras_parameterized.TestCase):
         [tuple(o.as_list()) for o in output_shape],
         expected_output_shape)
 
-  def test_checkpointable_dependencies(self):
+  def test_trackable_dependencies(self):
     rnn = keras.layers.SimpleRNN
     x = np.random.random((2, 2, 2))
     y = np.random.random((2, 2))
@@ -728,8 +729,9 @@ class RNNTest(keras_parameterized.TestCase):
     model.fit(x, y, epochs=1, batch_size=1)
 
     # check whether the model variables are present in the
-    # checkpointable list of objects
-    checkpointed_objects = set(checkpointable_util.list_objects(model))
+    # trackable list of objects
+    checkpointed_objects = object_identity.ObjectIdentitySet(
+        trackable_util.list_objects(model))
     for v in model.variables:
       self.assertIn(v, checkpointed_objects)
 
@@ -1162,6 +1164,30 @@ class RNNTest(keras_parameterized.TestCase):
       result_1[5, 3:] = 0
       self.assertAllClose(result_1, result_2)
 
+  def test_unroll_single_step(self):
+    """Even if the time dimension is only one, we should be able to unroll."""
+    cell = keras.layers.SimpleRNNCell(5)
+    x = keras.Input((1, 5))
+    layer = keras.layers.RNN(cell, return_sequences=True, unroll=True)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    np_x = np.ones((6, 1, 5))
+    result = model.predict(np_x)
+    self.assertEqual((6, 1, 5), result.shape)
+
+  def test_unroll_zero_step(self):
+    """If the time dimension is None, we should fail to unroll."""
+    cell = keras.layers.SimpleRNNCell(5)
+    x = keras.Input((None, 5))
+    layer = keras.layers.RNN(cell, return_sequences=True, unroll=True)
+    with self.assertRaisesRegexp(ValueError, 'Cannot unroll a RNN.*'):
+      layer(x)
+
 
 class Minimal2DRNNCell(keras.layers.Layer):
   """The minimal 2D RNN cell is a simple combination of 2 1-D RNN cell.
@@ -1269,3 +1295,4 @@ class NestedCell(keras.layers.Layer):
 
 if __name__ == '__main__':
   test.main()
+
diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py
index 932cc58e036bbb25b7a14c42b3d01b7484ea17ab..95adf78aff957bb58b9379972e39aef077f22d4f 100644
--- a/tensorflow/python/keras/layers/serialization.py
+++ b/tensorflow/python/keras/layers/serialization.py
@@ -39,10 +39,20 @@ from tensorflow.python.keras.layers.wrappers import *
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.util.tf_export import keras_export
 
+# TODO(b/124791387): replace mapping with layer attribute.
+_V2_CONVERSION_TABLE = {
+    # BatchNormalization Layer
+    'BatchNormalizationV1': 'BatchNormalization',
+    'BatchNormalizationV2': 'BatchNormalization',
+}
+
 
 @keras_export('keras.layers.serialize')
 def serialize(layer):
-  return {'class_name': layer.__class__.__name__, 'config': layer.get_config()}
+  layer_class_name = layer.__class__.__name__
+  if layer_class_name in _V2_CONVERSION_TABLE:
+    layer_class_name = _V2_CONVERSION_TABLE[layer_class_name]
+  return {'class_name': layer_class_name, 'config': layer.get_config()}
 
 
 @keras_export('keras.layers.deserialize')
diff --git a/tensorflow/python/keras/layers/serialization_test.py b/tensorflow/python/keras/layers/serialization_test.py
index 548c3ec1ac760a33d6eb998e7d601c843bd87779..d322e6afb5413e3b4164ee158f7768be472b4507 100644
--- a/tensorflow/python/keras/layers/serialization_test.py
+++ b/tensorflow/python/keras/layers/serialization_test.py
@@ -38,6 +38,17 @@ class LayerSerializationTest(test.TestCase):
                      keras.initializers.Ones)
     self.assertEqual(new_layer.units, 3)
 
+  def test_serialize_deserialize_batchnorm(self):
+    layer = keras.layers.BatchNormalization(
+        momentum=0.9, beta_initializer='zeros', gamma_regularizer='l2')
+    config = keras.layers.serialize(layer)
+    self.assertEqual(config['class_name'], 'BatchNormalization')
+    new_layer = keras.layers.deserialize(config)
+    self.assertEqual(new_layer.momentum, 0.9)
+    self.assertEqual(new_layer.beta_initializer.__class__,
+                     keras.initializers.Zeros)
+    self.assertEqual(new_layer.gamma_regularizer.__class__,
+                     keras.regularizers.L1L2)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/tensorflow_op_layer_test.py b/tensorflow/python/keras/layers/tensorflow_op_layer_test.py
index f01c9d49643c7f567578e1aa1e36c96fb8bd3066..993f5a9afd39c4a7131e1f6d42b9a4c6da808ce7 100644
--- a/tensorflow/python/keras/layers/tensorflow_op_layer_test.py
+++ b/tensorflow/python/keras/layers/tensorflow_op_layer_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -40,6 +41,14 @@ def _single_op_at_end():
   return inputs, outputs
 
 
+def _single_identity_op_at_end():
+  inputs = keras.Input(shape=(10,))
+  x = keras.layers.Dense(10)(inputs)
+  outputs = array_ops.identity(x)
+  assert 'Identity' in outputs.name
+  return inputs, outputs
+
+
 def _multiple_ops_at_end():
   inputs = keras.Input(shape=(10,))
   x = keras.layers.Dense(10)(inputs)
@@ -88,17 +97,62 @@ def _multiple_uses():
   return inputs, outputs
 
 
+def _op_with_tensor_list():
+  inputs = keras.Input(shape=(10,))
+  x = array_ops.concat([inputs, inputs], axis=1)
+  outputs = keras.layers.Dense(10)(x)
+  return inputs, outputs
+
+
+def _add_n():
+  inputs = keras.Input(shape=(10,))
+  outputs = math_ops.add_n([inputs, inputs, inputs])
+  return inputs, outputs
+
+
+def _reuse_op():
+  inputs = keras.Input(shape=(10,))
+  # This op needs to be checked multiple times.
+  x = gen_nn_ops.relu(inputs)
+  y = keras.layers.Dense(10)(x)
+  x2 = x * 2
+  y2 = keras.layers.Dense(10)(x2)
+  outputs = y + y2
+  return inputs, outputs
+
+
+class LayerWithLayer(keras.layers.Layer):
+
+  def build(self, input_shape):
+    self.bias = self.add_weight(name='bias', dtype='float32')
+    self.layer = keras.layers.Dense(10)
+
+  def call(self, inputs):
+    inputs = inputs * self.bias
+    # Would throw an error if Keras History was created here.
+    return self.layer(inputs)
+
+
+def _inner_layer():
+  inputs = keras.Input(shape=(10,))
+  outputs = LayerWithLayer()(inputs)
+  return inputs, outputs
+
+
 @keras_parameterized.run_all_keras_modes
 class AutoLambdaTest(keras_parameterized.TestCase):
 
   @parameterized.named_parameters(
       ('single_op_at_end', _single_op_at_end),
+      ('single_identity_op_at_end', _single_identity_op_at_end),
       ('multiple_ops_at_end', _multiple_ops_at_end),
       ('single_op_in_middle', _single_op_in_middle),
       ('multiple_ops_in_middle', _multiple_ops_in_middle),
       ('single_standalone_branch', _single_standalone_branch),
       ('single_op_with_attrs', _single_op_with_attrs),
-      ('multiple_uses', _multiple_uses))
+      ('multiple_uses', _multiple_uses),
+      ('op_with_tensor_list', _op_with_tensor_list), ('add_n', _add_n),
+      ('_reuse_op', _reuse_op), ('_inner_layer', _inner_layer))
   def test_autolambda(self, model_fn):
     inputs, outputs = model_fn()
     model = keras.Model(inputs, outputs)
@@ -110,6 +164,14 @@ class AutoLambdaTest(keras_parameterized.TestCase):
     np_outputs = nest.map_structure(lambda x: np.ones((10, 10), 'float32'),
                                     outputs)
     model.fit(np_inputs, np_outputs, batch_size=2)
+    model(np_inputs)  # Test calling the model directly on inputs.
+
+    new_model = keras.Model.from_config(
+        model.get_config(), custom_objects={'LayerWithLayer': LayerWithLayer})
+    new_model.compile(
+        adam.Adam(0.001), 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    new_model.fit(np_inputs, np_outputs, batch_size=2)
+    new_model(np_inputs)  # Test calling the new model directly on inputs.
 
   def test_numerical_correctness_simple(self):
     x = ops.convert_to_tensor([[-1., 0., -2., 1.]])
@@ -127,7 +189,7 @@ class AutoLambdaTest(keras_parameterized.TestCase):
     y = self.evaluate(model(x))
     self.assertAllClose(y, [1.5, 3.])
 
-  def test_serialization(self):
+  def test_numerical_correctness_serialization(self):
     x = ops.convert_to_tensor([-1., 0., -2., 1.])
     inputs = keras.Input(shape=(4,))
     outputs = gen_nn_ops.relu(inputs)
@@ -158,12 +220,25 @@ class AutoLambdaTest(keras_parameterized.TestCase):
     size_50 = _construct_graph_of_size(50)
     size_500 = _construct_graph_of_size(500)
 
-    # Check reasonable graph construction time.
-    self.assertLess(size_50, 5)
     # Check construction time grows approx. linearly with size.
     e = 2  # Fudge factor to prevent flakiness.
     self.assertLess(size_500, (10 * e) * size_50)
 
+  def test_no_mask_tracking(self):
+    x = keras.backend.placeholder((10, 10))
+    y = keras.layers.Masking(0.)(x)
+    self.assertTrue(y._keras_mask._keras_history_checked)
+
+  def test_built(self):
+    inputs = keras.Input(shape=(10,))
+    outputs = gen_nn_ops.relu(inputs)
+    model = keras.Model(inputs, outputs)
+    model.compile('sgd', 'mse')
+    for layer in model.layers:
+      self.assertTrue(layer.built)
+    # Test something that requires Layers to be built.
+    model.summary()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/unified_gru_test.py b/tensorflow/python/keras/layers/unified_gru_test.py
index 8259643d7cc2493855cc5cce45ceb9b1665e9572..db8610423803354d67824cf6fef7df2db94e62bd 100644
--- a/tensorflow/python/keras/layers/unified_gru_test.py
+++ b/tensorflow/python/keras/layers/unified_gru_test.py
@@ -47,9 +47,7 @@ from tensorflow.python.training import gradient_descent
 
 # Global config for grappler setting that is used for graph mode test.
 _rewrites = rewriter_config_pb2.RewriterConfig()
-_rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
-_customer_optimizer = _rewrites.custom_optimizers.add()
-_customer_optimizer.name = 'ExperimentalImplementationSelector'
+_rewrites.implementation_selector = rewriter_config_pb2.RewriterConfig.ON
 _rewrites.min_graph_nodes = -1
 _graph_options = config_pb2.GraphOptions(rewrite_options=_rewrites)
 _config = config_pb2.ConfigProto(graph_options=_graph_options)
@@ -464,6 +462,29 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
 
     np.testing.assert_allclose(out7, out6, atol=1e-5)
 
+  def test_stateful_GRU_training(self):
+    # See b/123587692 for more context.
+    vocab_size = 20
+    embedding_dim = 10
+    batch_size = 8
+    timestep = 12
+    units = 5
+    x = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+    y = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+
+    model = keras.Sequential([
+        keras.layers.Embedding(vocab_size, embedding_dim,
+                               batch_input_shape=[batch_size, timestep]),
+        keras.layers.UnifiedGRU(units,
+                                return_sequences=True,
+                                stateful=True),
+        keras.layers.Dense(vocab_size)
+    ])
+    model.compile(optimizer='adam',
+                  loss='sparse_categorical_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(x, y, epochs=1, shuffle=False)
+
 
 class GRULayerGradientTapeTest(test.TestCase):
 
diff --git a/tensorflow/python/keras/layers/unified_lstm_test.py b/tensorflow/python/keras/layers/unified_lstm_test.py
index 375894b166215ed7068767eed095fec2f60963ca..938c87c6b1aa2e493e2b053630381031c628d210 100644
--- a/tensorflow/python/keras/layers/unified_lstm_test.py
+++ b/tensorflow/python/keras/layers/unified_lstm_test.py
@@ -47,9 +47,7 @@ from tensorflow.python.training import gradient_descent
 
 # Global config for grappler setting that is used for graph mode test.
 _rewrites = rewriter_config_pb2.RewriterConfig()
-_rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
-_customer_optimizer = _rewrites.custom_optimizers.add()
-_customer_optimizer.name = 'ExperimentalImplementationSelector'
+_rewrites.implementation_selector = rewriter_config_pb2.RewriterConfig.ON
 _rewrites.min_graph_nodes = -1
 _graph_options = config_pb2.GraphOptions(rewrite_options=_rewrites)
 _config = config_pb2.ConfigProto(graph_options=_graph_options)
@@ -633,6 +631,29 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
 
     self.assertAllClose(out7, out6, atol=1e-5)
 
+  def test_stateful_LSTM_training(self):
+    # See b/123587692 for more context.
+    vocab_size = 20
+    embedding_dim = 10
+    batch_size = 8
+    timestep = 12
+    units = 5
+    x = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+    y = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+
+    model = keras.Sequential([
+        keras.layers.Embedding(vocab_size, embedding_dim,
+                               batch_input_shape=[batch_size, timestep]),
+        keras.layers.UnifiedLSTM(units,
+                                 return_sequences=True,
+                                 stateful=True),
+        keras.layers.Dense(vocab_size)
+    ])
+    model.compile(optimizer='adam',
+                  loss='sparse_categorical_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(x, y, epochs=1, shuffle=False)
+
 
 class LSTMLayerGraphOnlyTest(test.TestCase):
 
diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py
index a10db509d33f6ebe62fc7d884145725de51e3677..182d5e3c5df973475add2cf0e97f925ac47c518a 100644
--- a/tensorflow/python/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/layers/wrappers.py
@@ -29,7 +29,7 @@ from tensorflow.python.keras.layers.recurrent import _standardize_args
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
@@ -46,7 +46,7 @@ class Wrapper(Layer):
       layer: The layer to be wrapped.
   """
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def __init__(self, layer, **kwargs):
     assert isinstance(layer, Layer)
     self.layer = layer
@@ -170,7 +170,7 @@ class TimeDistributed(Wrapper):
           '`Layer` instance. You passed: {input}'.format(input=layer))
     super(TimeDistributed, self).__init__(layer, **kwargs)
     self.supports_masking = True
-    self._track_checkpointable(layer, name='layer')
+    self._track_trackable(layer, name='layer')
 
   def _get_shape_tuple(self, init_tuple, tensor, start_idx, int_shape=None):
     """Finds non-specific dimensions in the static shapes.
@@ -386,7 +386,7 @@ class Bidirectional(Wrapper):
   ```
   """
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def __init__(self, layer, merge_mode='concat', weights=None, **kwargs):
     if not isinstance(layer, Layer):
       raise ValueError(
@@ -419,8 +419,8 @@ class Bidirectional(Wrapper):
     self._num_constants = None
     super(Bidirectional, self).__init__(layer, **kwargs)
     self.input_spec = layer.input_spec
-    self._track_checkpointable(self.forward_layer, name='forward_layer')
-    self._track_checkpointable(self.backward_layer, name='backward_layer')
+    self._track_trackable(self.forward_layer, name='forward_layer')
+    self._track_trackable(self.backward_layer, name='backward_layer')
 
   @property
   def trainable(self):
diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py
index 8a0b265c0d75a844b977271945dc5b971c5515bc..8fa0e7bdacc7169847d2c15ebe564d60089983a8 100644
--- a/tensorflow/python/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/layers/wrappers_test.py
@@ -23,10 +23,12 @@ import copy
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import util as checkpointable_util
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import util as trackable_util
 
 
 class _RNNCellWithConstants(keras.layers.Layer):
@@ -87,8 +89,8 @@ class TimeDistributedTest(test.TestCase):
     model.get_config()
 
     # check whether the model variables are present in the
-    # checkpointable list of objects
-    checkpointed_objects = set(checkpointable_util.list_objects(model))
+    # trackable list of objects
+    checkpointed_objects = set(trackable_util.list_objects(model))
     for v in model.variables:
       self.assertIn(v, checkpointed_objects)
 
@@ -302,8 +304,9 @@ class BidirectionalTest(test.TestCase):
         model.fit(x, y, epochs=1, batch_size=1)
 
         # check whether the model variables are present in the
-        # checkpointable list of objects
-        checkpointed_objects = set(checkpointable_util.list_objects(model))
+        # trackable list of objects
+        checkpointed_objects = object_identity.ObjectIdentitySet(
+            trackable_util.list_objects(model))
         for v in model.variables:
           self.assertIn(v, checkpointed_objects)
 
@@ -396,7 +399,6 @@ class BidirectionalTest(test.TestCase):
       model.compile(loss='mse', optimizer='sgd')
       model.fit(x, y, epochs=1, batch_size=1)
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_Bidirectional_merged_value(self):
     rnn = keras.layers.LSTM
     samples = 2
@@ -424,10 +426,10 @@ class BidirectionalTest(test.TestCase):
             rnn(units, return_sequences=True), merge_mode=merge_mode)
         f_merged = keras.backend.function([inputs], _to_list(layer(inputs)))
         f_forward = keras.backend.function([inputs],
-                                           [layer.forward_layer.call(inputs)])
+                                           [layer.forward_layer(inputs)])
         f_backward = keras.backend.function(
             [inputs],
-            [keras.backend.reverse(layer.backward_layer.call(inputs), 1)])
+            [keras.backend.reverse(layer.backward_layer(inputs), 1)])
 
         y_merged = f_merged(x)
         y_expected = _to_list(merge_func(f_forward(x)[0], f_backward(x)[0]))
@@ -441,9 +443,9 @@ class BidirectionalTest(test.TestCase):
             rnn(units, return_state=True), merge_mode=merge_mode)
         f_merged = keras.backend.function([inputs], layer(inputs))
         f_forward = keras.backend.function([inputs],
-                                           layer.forward_layer.call(inputs))
+                                           layer.forward_layer(inputs))
         f_backward = keras.backend.function([inputs],
-                                            layer.backward_layer.call(inputs))
+                                            layer.backward_layer(inputs))
         n_states = len(layer.layer.states)
 
         y_merged = f_merged(x)
@@ -527,8 +529,10 @@ class BidirectionalTest(test.TestCase):
       layer.trainable = True
       assert len(layer.trainable_weights) == 6
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_Bidirectional_updates(self):
+    if context.executing_eagerly():
+      self.skipTest('layer.updates is only available in graph mode.')
+
     with self.cached_session():
       x = keras.layers.Input(shape=(3, 2))
       x_reachable_update = x * x
@@ -556,10 +560,15 @@ class BidirectionalTest(test.TestCase):
       assert len(layer.losses) == 4
       assert len(layer.get_losses_for(None)) == 4
       assert not layer.get_losses_for(x)
+
+      # Create a random tensor that is not conditional on the inputs.
+      with keras.backend.get_graph().as_default():
+        const_tensor = constant_op.constant(1)
+
       layer.forward_layer.add_loss(x_reachable_loss, inputs=x)
-      layer.forward_layer.add_loss(1, inputs=None)
+      layer.forward_layer.add_loss(const_tensor, inputs=None)
       layer.backward_layer.add_loss(x_reachable_loss, inputs=x)
-      layer.backward_layer.add_loss(1, inputs=None)
+      layer.backward_layer.add_loss(const_tensor, inputs=None)
       assert len(layer.losses) == 8
       assert len(layer.get_losses_for(None)) == 6
       assert len(layer.get_losses_for(x)) == 2
@@ -696,3 +705,4 @@ def _to_list(ls):
 
 if __name__ == '__main__':
   test.main()
+
diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py
index 366469d9dd0d5e1055626b4dca05e1a8c9435a23..2d8358cab43674bba0dd5352bc1ccfa070e6721c 100644
--- a/tensorflow/python/keras/losses.py
+++ b/tensorflow/python/keras/losses.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# pylint: disable=unused-import
 """Built-in loss functions.
 """
 from __future__ import absolute_import
@@ -23,16 +22,14 @@ import abc
 
 import six
 
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.utils import losses_utils
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
-from tensorflow.python.keras.utils.losses_utils import compute_weighted_loss
 from tensorflow.python.keras.utils.tf_utils import is_tensor_or_variable
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops.losses import losses_impl
@@ -57,13 +54,13 @@ class Loss(object):
   ```
 
   Args:
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
   """
 
   def __init__(self,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name=None):
     self.reduction = reduction
     self.name = name
@@ -97,7 +94,7 @@ class Loss(object):
     with ops.name_scope(scope_name, format(self.__class__.__name__),
                         (y_pred, y_true, sample_weight)):
       losses = self.call(y_true, y_pred)
-      return compute_weighted_loss(
+      return losses_utils.compute_weighted_loss(
           losses, sample_weight, reduction=self.reduction)
 
   @classmethod
@@ -133,7 +130,7 @@ class LossFunctionWrapper(Loss):
   Args:
     fn: The loss function to wrap, with signature `fn(y_true, y_pred,
       **kwargs)`.
-    reduction: (Optional) Type of `tf.losses.Reduction` to apply to loss.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
       Default value is `SUM_OVER_BATCH_SIZE`.
     name: (Optional) name for the loss.
     **kwargs: The keyword arguments that are passed on to `fn`.
@@ -141,7 +138,7 @@ class LossFunctionWrapper(Loss):
 
   def __init__(self,
                fn,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name=None,
                **kwargs):
     super(LossFunctionWrapper, self).__init__(reduction=reduction, name=name)
@@ -186,13 +183,13 @@ class MeanSquaredError(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.MeanSquaredError())
   ```
   """
 
   def __init__(self,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name='mean_squared_error'):
     super(MeanSquaredError, self).__init__(
         mean_squared_error, name=name, reduction=reduction)
@@ -216,13 +213,13 @@ class MeanAbsoluteError(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.MeanAbsoluteError())
   ```
   """
 
   def __init__(self,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name='mean_absolute_error'):
     super(MeanAbsoluteError, self).__init__(
         mean_absolute_error, name=name, reduction=reduction)
@@ -246,13 +243,13 @@ class MeanAbsolutePercentageError(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.MeanAbsolutePercentageError())
   ```
   """
 
   def __init__(self,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name='mean_absolute_percentage_error'):
     super(MeanAbsolutePercentageError, self).__init__(
         mean_absolute_percentage_error, name=name, reduction=reduction)
@@ -276,13 +273,13 @@ class MeanSquaredLogarithmicError(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.MeanSquaredLogarithmicError())
   ```
   """
 
   def __init__(self,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name='mean_squared_logarithmic_error'):
     super(MeanSquaredLogarithmicError, self).__init__(
         mean_squared_logarithmic_error, name=name, reduction=reduction)
@@ -310,7 +307,7 @@ class BinaryCrossentropy(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.BinaryCrossentropy())
   ```
 
@@ -318,15 +315,15 @@ class BinaryCrossentropy(LossFunctionWrapper):
     from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
       we assume that `y_pred` encodes a probability distribution.
     label_smoothing: Float in [0, 1]. If > `0` then smooth the labels.
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
   """
 
   def __init__(self,
                from_logits=False,
                label_smoothing=0,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name='binary_crossentropy'):
     super(BinaryCrossentropy, self).__init__(
         binary_crossentropy,
@@ -363,7 +360,7 @@ class CategoricalCrossentropy(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.CategoricalCrossentropy())
   ```
 
@@ -374,15 +371,15 @@ class CategoricalCrossentropy(LossFunctionWrapper):
       meaning the confidence on label values are relaxed. e.g.
       `label_smoothing=0.2` means that we will use a value of `0.1` for label
       `0` and `0.9` for label `1`"
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
   """
 
   def __init__(self,
                from_logits=False,
                label_smoothing=0,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name='categorical_crossentropy'):
     super(CategoricalCrossentropy, self).__init__(
         categorical_crossentropy,
@@ -420,21 +417,21 @@ class SparseCategoricalCrossentropy(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.SparseCategoricalCrossentropy())
   ````
 
   Args:
     from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
       we assume that `y_pred` encodes a probability distribution.
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
   """
 
   def __init__(self,
                from_logits=False,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name=None):
     super(SparseCategoricalCrossentropy, self).__init__(
         sparse_categorical_crossentropy,
@@ -453,7 +450,7 @@ class Hinge(LossFunctionWrapper):
   Usage:
 
   ```python
-  h = tf.losses.Hinge()
+  h = tf.keras.losses.Hinge()
   loss = h([-1., 1., 1.], [0.6, -0.7, -0.5])
 
   # loss = max(0, 1 - y_true * y_pred) = [1.6 + 1.7 + 1.5] / 3
@@ -464,13 +461,13 @@ class Hinge(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.Hinge())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.Hinge())
   ```
   """
 
   def __init__(self,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name=None):
     super(Hinge, self).__init__(hinge, name=name, reduction=reduction)
 
@@ -485,7 +482,7 @@ class SquaredHinge(LossFunctionWrapper):
   Usage:
 
   ```python
-  sh = tf.losses.SquaredHinge()
+  sh = tf.keras.losses.SquaredHinge()
   loss = sh([-1., 1., 1.], [0.6, -0.7, -0.5])
 
   # loss = (max(0, 1 - y_true * y_pred))^2 = [1.6^2 + 1.7^2 + 1.5^2] / 3
@@ -496,13 +493,13 @@ class SquaredHinge(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.SquaredHinge())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.SquaredHinge())
   ```
   """
 
   def __init__(self,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name='squared_hinge'):
     super(SquaredHinge, self).__init__(
         squared_hinge, name=name, reduction=reduction)
@@ -515,7 +512,7 @@ class CategoricalHinge(LossFunctionWrapper):
   Usage:
 
   ```python
-  ch = tf.losses.CategoricalHinge()
+  ch = tf.keras.losses.CategoricalHinge()
   loss = ch([0., 1., 1.], [1., 0., 1.])
   print('Loss: ', loss.numpy())  # Loss: 1.0
   ```
@@ -523,13 +520,13 @@ class CategoricalHinge(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.CategoricalHinge())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.CategoricalHinge())
   ```
   """
 
   def __init__(self,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name='categorical_hinge'):
     super(CategoricalHinge, self).__init__(
         categorical_hinge, name=name, reduction=reduction)
@@ -544,7 +541,7 @@ class LogLoss(LossFunctionWrapper):
   Usage:
 
   ```python
-  l = tf.losses.LogLoss()
+  l = tf.keras.losses.LogLoss()
   loss = l([0., 1., 1.], [1., 0., 1.])
   print('Loss: ', loss.numpy())  # Loss: 10.745
   ```
@@ -552,13 +549,13 @@ class LogLoss(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.LogLoss())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.LogLoss())
   ```
   """
 
   def __init__(self,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name='logloss'):
     super(LogLoss, self).__init__(logloss, name=name, reduction=reduction)
 
@@ -572,7 +569,7 @@ class Poisson(LossFunctionWrapper):
   Usage:
 
   ```python
-  p = tf.losses.Poisson()
+  p = tf.keras.losses.Poisson()
   loss = p([1, 9, 2], [4, 8, 12])
   print('Loss: ', loss.numpy())  # Loss: -4.63
   ```
@@ -580,13 +577,13 @@ class Poisson(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.Poisson())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.Poisson())
   ```
   """
 
   def __init__(self,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name='poisson'):
     super(Poisson, self).__init__(poisson, name=name, reduction=reduction)
 
@@ -600,7 +597,7 @@ class LogCosh(LossFunctionWrapper):
   Usage:
 
   ```python
-  l = tf.losses.LogCosh()
+  l = tf.keras.losses.LogCosh()
   loss = l([0., 1., 1.], [1., 0., 1.])
   print('Loss: ', loss.numpy())  # Loss: 0.289
   ```
@@ -608,13 +605,13 @@ class LogCosh(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.LogCosh())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.LogCosh())
   ```
   """
 
   def __init__(self,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name='logcosh'):
     super(LogCosh, self).__init__(logcosh, name=name, reduction=reduction)
 
@@ -628,7 +625,7 @@ class KLDivergence(LossFunctionWrapper):
   Usage:
 
   ```python
-  k = tf.losses.KLDivergence()
+  k = tf.keras.losses.KLDivergence()
   loss = k([.4, .9, .2], [.5, .8, .12])
   print('Loss: ', loss.numpy())  # Loss: -0.043
   ```
@@ -636,13 +633,13 @@ class KLDivergence(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.KLDivergence())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.KLDivergence())
   ```
   """
 
   def __init__(self,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name='kullback_leibler_divergence'):
     super(KLDivergence, self).__init__(
         kullback_leibler_divergence, name=name, reduction=reduction)
@@ -663,7 +660,7 @@ class Huber(LossFunctionWrapper):
   Usage:
 
   ```python
-  l = tf.losses.Huber()
+  l = tf.keras.losses.Huber()
   loss = l([0., 1., 1.], [1., 0., 1.])
   print('Loss: ', loss.numpy())  # Loss: 0.333
   ```
@@ -671,21 +668,21 @@ class Huber(LossFunctionWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.Huber())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.Huber())
   ```
 
   Args:
     delta: A float, the point where the Huber loss function changes from a
       quadratic to linear.
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
   """
 
   def __init__(self,
                delta=1.0,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name='huber_loss'):
     super(Huber, self).__init__(
         huber_loss, name=name, reduction=reduction, delta=delta)
@@ -946,63 +943,64 @@ def poisson(y_true, y_pred):
   return K.mean(y_pred - y_true * math_ops.log(y_pred + K.epsilon()), axis=-1)
 
 
-@keras_export('keras.metrics.cosine_proximity',
-              'keras.metrics.cosine',
-              'keras.losses.cosine_proximity',
-              'keras.losses.cosine')
+# Retaining the legacy namespaces: 'cosine_proximity' and 'cosine'.
+# TODO(psv): Change name of this function to `cosine_similarity` after fixing
+# estimator test.
+@keras_export(
+    'keras.losses.cosine_similarity',
+    v1=[
+        'keras.metrics.cosine_proximity',
+        'keras.metrics.cosine',
+        'keras.losses.cosine_proximity',
+        'keras.losses.cosine',
+        'keras.losses.cosine_similarity',
+    ])
 def cosine_proximity(y_true, y_pred, axis=-1):
+  """Computes the cosine similarity between labels and predictions."""
   y_true = nn.l2_normalize(y_true, axis=axis)
   y_pred = nn.l2_normalize(y_pred, axis=axis)
-  return -math_ops.reduce_sum(y_true * y_pred, axis=axis)
+  return math_ops.reduce_sum(y_true * y_pred, axis=axis)
 
 
-@keras_export('keras.losses.CosineProximity')
-class CosineProximity(Loss):
-  """Computes the cosine proximity between `y_true` and `y_pred`.
+@keras_export('keras.losses.CosineSimilarity')
+class CosineSimilarity(LossFunctionWrapper):
+  """Computes the cosine similarity between `y_true` and `y_pred`.
 
   Usage:
 
   ```python
-  cosine_loss = tf.losses.CosineProximity()
-  loss = cosine_loss([0., 1., 1.], [1., 0., 1.])
-  print('Loss: ', loss.numpy())  # Loss: -0.5
+  cosine_loss = tf.keras.losses.CosineSimilarity(axis=1)
+  loss = cosine_loss([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]])
+  # l2_norm(y_true) = [[0., 1.], [1./1.414], 1./1.414]]]
+  # l2_norm(y_pred) = [[1., 0.], [1./1.414], 1./1.414]]]
+  # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]]
+  # loss = mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1))
+         = ((0. + 0.) +  (0.5 + 0.5)) / 2
+
+  print('Loss: ', loss.numpy())  # Loss: 0.5
   ```
 
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.CosineProximity())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.CosineSimilarity(axis=1))
   ```
 
   Args:
     axis: (Optional) Defaults to -1. The dimension along which the cosine
-      proximity is computed.
-    reduction: (Optional) Type of `tf.losses.Reduction` to apply to loss.
+      similarity is computed.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
       Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
   """
 
   def __init__(self,
                axis=-1,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
-               name=None):
-    super(CosineProximity, self).__init__(reduction=reduction, name=name)
-    self.axis = axis
-
-  def call(self, y_true, y_pred):
-    """Calculates the cosine proximity loss.
-
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
-
-    Returns:
-      Cosine distance loss.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return cosine_proximity(y_true, y_pred, axis=self.axis)
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='cosine_similarity'):
+    super(CosineSimilarity, self).__init__(
+        cosine_similarity, reduction=reduction, name=name, axis=axis)
 
 
 # Aliases.
@@ -1012,7 +1010,7 @@ mae = MAE = mean_absolute_error
 mape = MAPE = mean_absolute_percentage_error
 msle = MSLE = mean_squared_logarithmic_error
 kld = KLD = kullback_leibler_divergence
-cosine = cosine_proximity
+cosine_similarity = cosine_proximity
 
 
 def is_categorical_crossentropy(loss):
diff --git a/tensorflow/python/keras/losses_test.py b/tensorflow/python/keras/losses_test.py
index e72209059b8f3d97d69fa5995adfdc24ef43adca..6fe68335e856accd6f60f693d6b97e6328fc8e59 100644
--- a/tensorflow/python/keras/losses_test.py
+++ b/tensorflow/python/keras/losses_test.py
@@ -27,7 +27,7 @@ from tensorflow.python import keras
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops.losses import losses_impl
+from tensorflow.python.keras.utils import losses_utils
 from tensorflow.python.platform import test
 
 try:
@@ -45,7 +45,7 @@ ALL_LOSSES = [keras.losses.mean_squared_error,
               keras.losses.binary_crossentropy,
               keras.losses.kullback_leibler_divergence,
               keras.losses.poisson,
-              keras.losses.cosine_proximity,
+              keras.losses.cosine_similarity,
               keras.losses.logcosh,
               keras.losses.categorical_hinge]
 
@@ -186,7 +186,7 @@ class KerasLossesTest(test.TestCase):
 
     self.assertEqual(mse_obj.name, 'mean_squared_error')
     self.assertEqual(mse_obj.reduction,
-                     losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE)
+                     losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE)
 
     y_true = constant_op.constant([[1., 9.], [2., 5.]])
     y_pred = constant_op.constant([[4., 8.], [12., 3.]])
@@ -205,9 +205,9 @@ class MeanSquaredErrorTest(test.TestCase):
 
   def test_config(self):
     mse_obj = keras.losses.MeanSquaredError(
-        reduction=losses_impl.ReductionV2.SUM, name='mse_1')
+        reduction=losses_utils.ReductionV2.SUM, name='mse_1')
     self.assertEqual(mse_obj.name, 'mse_1')
-    self.assertEqual(mse_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(mse_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct_unweighted(self):
     mse_obj = keras.losses.MeanSquaredError()
@@ -273,7 +273,7 @@ class MeanSquaredErrorTest(test.TestCase):
 
   def test_no_reduction(self):
     mse_obj = keras.losses.MeanSquaredError(
-        reduction=losses_impl.ReductionV2.NONE)
+        reduction=losses_utils.ReductionV2.NONE)
     y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
     y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
                                   shape=(2, 3),
@@ -284,7 +284,7 @@ class MeanSquaredErrorTest(test.TestCase):
 
   def test_sum_reduction(self):
     mse_obj = keras.losses.MeanSquaredError(
-        reduction=losses_impl.ReductionV2.SUM)
+        reduction=losses_utils.ReductionV2.SUM)
     y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
     y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
                                   shape=(2, 3),
@@ -298,9 +298,9 @@ class MeanAbsoluteErrorTest(test.TestCase):
 
   def test_config(self):
     mae_obj = keras.losses.MeanAbsoluteError(
-        reduction=losses_impl.ReductionV2.SUM, name='mae_1')
+        reduction=losses_utils.ReductionV2.SUM, name='mae_1')
     self.assertEqual(mae_obj.name, 'mae_1')
-    self.assertEqual(mae_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(mae_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct_unweighted(self):
     mae_obj = keras.losses.MeanAbsoluteError()
@@ -366,7 +366,7 @@ class MeanAbsoluteErrorTest(test.TestCase):
 
   def test_no_reduction(self):
     mae_obj = keras.losses.MeanAbsoluteError(
-        reduction=losses_impl.ReductionV2.NONE)
+        reduction=losses_utils.ReductionV2.NONE)
     y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
     y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
                                   shape=(2, 3),
@@ -377,7 +377,7 @@ class MeanAbsoluteErrorTest(test.TestCase):
 
   def test_sum_reduction(self):
     mae_obj = keras.losses.MeanAbsoluteError(
-        reduction=losses_impl.ReductionV2.SUM)
+        reduction=losses_utils.ReductionV2.SUM)
     y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
     y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
                                   shape=(2, 3),
@@ -391,9 +391,9 @@ class MeanAbsolutePercentageErrorTest(test.TestCase):
 
   def test_config(self):
     mape_obj = keras.losses.MeanAbsolutePercentageError(
-        reduction=losses_impl.ReductionV2.SUM, name='mape_1')
+        reduction=losses_utils.ReductionV2.SUM, name='mape_1')
     self.assertEqual(mape_obj.name, 'mape_1')
-    self.assertEqual(mape_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(mape_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     mape_obj = keras.losses.MeanAbsolutePercentageError()
@@ -448,9 +448,9 @@ class MeanSquaredLogarithmicErrorTest(test.TestCase):
 
   def test_config(self):
     msle_obj = keras.losses.MeanSquaredLogarithmicError(
-        reduction=losses_impl.ReductionV2.SUM, name='mape_1')
+        reduction=losses_utils.ReductionV2.SUM, name='mape_1')
     self.assertEqual(msle_obj.name, 'mape_1')
-    self.assertEqual(msle_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(msle_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     msle_obj = keras.losses.MeanSquaredLogarithmicError()
@@ -501,7 +501,7 @@ class MeanSquaredLogarithmicErrorTest(test.TestCase):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class CosineProximityTest(test.TestCase):
+class CosineSimilarityTest(test.TestCase):
 
   def l2_norm(self, x, axis):
     epsilon = 1e-12
@@ -515,28 +515,27 @@ class CosineProximityTest(test.TestCase):
 
     y_true = self.l2_norm(self.np_y_true, axis)
     y_pred = self.l2_norm(self.np_y_pred, axis)
-    self.expected_loss = -np.sum(np.multiply(y_true, y_pred), axis=(axis,))
+    self.expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(axis,))
 
     self.y_true = constant_op.constant(self.np_y_true)
     self.y_pred = constant_op.constant(self.np_y_pred)
 
   def test_config(self):
-    cosine_obj = keras.losses.CosineProximity(
-        axis=2, reduction=losses_impl.ReductionV2.SUM, name='cosine_loss')
+    cosine_obj = keras.losses.CosineSimilarity(
+        axis=2, reduction=losses_utils.ReductionV2.SUM, name='cosine_loss')
     self.assertEqual(cosine_obj.name, 'cosine_loss')
-    self.assertEqual(cosine_obj.reduction, losses_impl.ReductionV2.SUM)
-    self.assertEqual(cosine_obj.axis, 2)
+    self.assertEqual(cosine_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     self.setup()
-    cosine_obj = keras.losses.CosineProximity()
+    cosine_obj = keras.losses.CosineSimilarity()
     loss = cosine_obj(self.y_true, self.y_pred)
     expected_loss = np.mean(self.expected_loss)
     self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_scalar_weighted(self):
     self.setup()
-    cosine_obj = keras.losses.CosineProximity()
+    cosine_obj = keras.losses.CosineSimilarity()
     sample_weight = 2.3
     loss = cosine_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
     expected_loss = np.mean(self.expected_loss * sample_weight)
@@ -544,7 +543,7 @@ class CosineProximityTest(test.TestCase):
 
   def test_sample_weighted(self):
     self.setup()
-    cosine_obj = keras.losses.CosineProximity()
+    cosine_obj = keras.losses.CosineSimilarity()
     sample_weight = np.asarray([1.2, 3.4])
     loss = cosine_obj(
         self.y_true,
@@ -555,14 +554,14 @@ class CosineProximityTest(test.TestCase):
 
   def test_timestep_weighted(self):
     self.setup()
-    cosine_obj = keras.losses.CosineProximity()
+    cosine_obj = keras.losses.CosineSimilarity()
     np_y_true = self.np_y_true.reshape((2, 3, 1))
     np_y_pred = self.np_y_pred.reshape((2, 3, 1))
     sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape((2, 3))
 
     y_true = self.l2_norm(np_y_true, 2)
     y_pred = self.l2_norm(np_y_pred, 2)
-    expected_loss = -np.sum(np.multiply(y_true, y_pred), axis=(2,))
+    expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(2,))
 
     y_true = constant_op.constant(np_y_true)
     y_pred = constant_op.constant(np_y_pred)
@@ -574,13 +573,13 @@ class CosineProximityTest(test.TestCase):
 
   def test_zero_weighted(self):
     self.setup()
-    cosine_obj = keras.losses.CosineProximity()
+    cosine_obj = keras.losses.CosineSimilarity()
     loss = cosine_obj(self.y_true, self.y_pred, sample_weight=0)
     self.assertAlmostEqual(self.evaluate(loss), 0., 3)
 
   def test_axis(self):
     self.setup(axis=1)
-    cosine_obj = keras.losses.CosineProximity(axis=1)
+    cosine_obj = keras.losses.CosineSimilarity(axis=1)
     loss = cosine_obj(self.y_true, self.y_pred)
     expected_loss = np.mean(self.expected_loss)
     self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
@@ -591,9 +590,9 @@ class BinaryCrossentropyTest(test.TestCase):
 
   def test_config(self):
     bce_obj = keras.losses.BinaryCrossentropy(
-        reduction=losses_impl.ReductionV2.SUM, name='bce_1')
+        reduction=losses_utils.ReductionV2.SUM, name='bce_1')
     self.assertEqual(bce_obj.name, 'bce_1')
-    self.assertEqual(bce_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(bce_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct_unweighted(self):
     y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
@@ -722,7 +721,7 @@ class BinaryCrossentropyTest(test.TestCase):
     logits = constant_op.constant([[100.0, -100.0, 100.0],
                                    [100.0, 100.0, -100.0]])
     bce_obj = keras.losses.BinaryCrossentropy(
-        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+        from_logits=True, reduction=losses_utils.ReductionV2.NONE)
     loss = bce_obj(y_true, logits)
 
     # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x)))
@@ -757,9 +756,9 @@ class CategoricalCrossentropyTest(test.TestCase):
 
   def test_config(self):
     cce_obj = keras.losses.CategoricalCrossentropy(
-        reduction=losses_impl.ReductionV2.SUM, name='bce_1')
+        reduction=losses_utils.ReductionV2.SUM, name='bce_1')
     self.assertEqual(cce_obj.name, 'bce_1')
-    self.assertEqual(cce_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(cce_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct_unweighted(self):
     y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
@@ -823,7 +822,7 @@ class CategoricalCrossentropyTest(test.TestCase):
     y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
     cce_obj = keras.losses.CategoricalCrossentropy(
-        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+        from_logits=True, reduction=losses_utils.ReductionV2.NONE)
     loss = cce_obj(y_true, logits)
     self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3)
 
@@ -914,7 +913,7 @@ class SparseCategoricalCrossentropyTest(test.TestCase):
     y_true = constant_op.constant([[0], [1], [2]])
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
     cce_obj = keras.losses.SparseCategoricalCrossentropy(
-        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+        from_logits=True, reduction=losses_utils.ReductionV2.NONE)
     loss = cce_obj(y_true, logits)
     self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3)
 
@@ -924,9 +923,9 @@ class HingeTest(test.TestCase):
 
   def test_config(self):
     hinge_obj = keras.losses.Hinge(
-        reduction=losses_impl.ReductionV2.SUM, name='hinge_loss')
+        reduction=losses_utils.ReductionV2.SUM, name='hinge_loss')
     self.assertEqual(hinge_obj.name, 'hinge_loss')
-    self.assertEqual(hinge_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(hinge_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     hinge_obj = keras.losses.Hinge()
@@ -1024,9 +1023,9 @@ class SquaredHingeTest(test.TestCase):
 
   def test_config(self):
     sq_hinge_obj = keras.losses.SquaredHinge(
-        reduction=losses_impl.ReductionV2.SUM, name='sq_hinge_loss')
+        reduction=losses_utils.ReductionV2.SUM, name='sq_hinge_loss')
     self.assertEqual(sq_hinge_obj.name, 'sq_hinge_loss')
-    self.assertEqual(sq_hinge_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(sq_hinge_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     sq_hinge_obj = keras.losses.SquaredHinge()
@@ -1133,9 +1132,9 @@ class CategoricalHingeTest(test.TestCase):
 
   def test_config(self):
     cat_hinge_obj = keras.losses.CategoricalHinge(
-        reduction=losses_impl.ReductionV2.SUM, name='cat_hinge_loss')
+        reduction=losses_utils.ReductionV2.SUM, name='cat_hinge_loss')
     self.assertEqual(cat_hinge_obj.name, 'cat_hinge_loss')
-    self.assertEqual(cat_hinge_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(cat_hinge_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     cat_hinge_obj = keras.losses.CategoricalHinge()
@@ -1214,9 +1213,9 @@ class LogLossTest(test.TestCase):
 
   def test_config(self):
     log_loss_obj = keras.losses.LogLoss(
-        reduction=losses_impl.ReductionV2.SUM, name='log')
+        reduction=losses_utils.ReductionV2.SUM, name='log')
     self.assertEqual(log_loss_obj.name, 'log')
-    self.assertEqual(log_loss_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(log_loss_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct(self):
     self.setup()
@@ -1301,9 +1300,9 @@ class LogCoshTest(test.TestCase):
 
   def test_config(self):
     logcosh_obj = keras.losses.LogCosh(
-        reduction=losses_impl.ReductionV2.SUM, name='logcosh_loss')
+        reduction=losses_utils.ReductionV2.SUM, name='logcosh_loss')
     self.assertEqual(logcosh_obj.name, 'logcosh_loss')
-    self.assertEqual(logcosh_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(logcosh_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     self.setup()
@@ -1382,9 +1381,9 @@ class PoissonTest(test.TestCase):
 
   def test_config(self):
     poisson_obj = keras.losses.Poisson(
-        reduction=losses_impl.ReductionV2.SUM, name='poisson')
+        reduction=losses_utils.ReductionV2.SUM, name='poisson')
     self.assertEqual(poisson_obj.name, 'poisson')
-    self.assertEqual(poisson_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(poisson_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     self.setup()
@@ -1463,9 +1462,9 @@ class KLDivergenceTest(test.TestCase):
 
   def test_config(self):
     k_obj = keras.losses.KLDivergence(
-        reduction=losses_impl.ReductionV2.SUM, name='kld')
+        reduction=losses_utils.ReductionV2.SUM, name='kld')
     self.assertEqual(k_obj.name, 'kld')
-    self.assertEqual(k_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(k_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     self.setup()
@@ -1553,9 +1552,9 @@ class HuberLossTest(test.TestCase):
 
   def test_config(self):
     h_obj = keras.losses.Huber(
-        reduction=losses_impl.ReductionV2.SUM, name='huber')
+        reduction=losses_utils.ReductionV2.SUM, name='huber')
     self.assertEqual(h_obj.name, 'huber')
-    self.assertEqual(h_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(h_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct(self):
     self.setup()
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index 5d41a5c6f1a05393b7fef4e3499d49890c12c2c1..b0b4a8dfb9d9811ffdc63e3a29c5b29030809d35 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -34,7 +34,7 @@ from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.losses import binary_crossentropy
 from tensorflow.python.keras.losses import categorical_crossentropy
 from tensorflow.python.keras.losses import categorical_hinge
-from tensorflow.python.keras.losses import cosine_proximity
+from tensorflow.python.keras.losses import cosine_similarity
 from tensorflow.python.keras.losses import hinge
 from tensorflow.python.keras.losses import kullback_leibler_divergence
 from tensorflow.python.keras.losses import logcosh
@@ -109,28 +109,25 @@ class Metric(Layer):
   Example subclass implementation:
 
   ```
-  class BinaryTruePositives(Metric):
-    def __init__(self, name='binary_true_positives', dtype=None):
-      super(BinaryTruePositives, self).__init__(name=name, dtype=dtype)
-      self.true_positives = self.add_weight(
-          'true_positives', initializer=init_ops.zeros_initializer)
+  class BinaryTruePositives(tf.keras.metrics.Metric):
+
+    def __init__(self, name='binary_true_positives'):
+      super(BinaryTruePositives, self).__init__(name=name)
+      self.true_positives = self.add_weight(name='tp', initializer='zeros')
 
     def update_state(self, y_true, y_pred, sample_weight=None):
-      y_true = math_ops.cast(y_true, dtypes.bool)
-      y_pred = math_ops.cast(y_pred, dtypes.bool)
-      y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
-          y_pred, y_true, sample_weight)
-
-      values = math_ops.logical_and(
-          math_ops.equal(y_true, True), math_ops.equal(y_pred, True))
-      values = math_ops.cast(values, self._dtype)
+      y_true = tf.cast(y_true, tf.bool)
+      y_pred = tf.cast(y_pred, tf.bool)
+
+      values = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True))
+      values = tf.cast(values, self.dtype)
       if sample_weight is not None:
-        sample_weight = math_ops.cast(sample_weight, self._dtype)
-        values = math_ops.multiply(values, sample_weight)
-      self.true_positives.assign_add(math_ops.reduce_sum(values))
+        sample_weight = tf.cast(sample_weight, self.dtype)
+        values = tf.multiply(values, sample_weight)
+      return self.true_positives.assign_add(tf.reduce_sum(values))
 
     def result(self):
-      return array_ops.identity(self.true_positives)
+      return tf.identity(self.true_positives)
   ```
   """
 
@@ -203,8 +200,7 @@ class Metric(Layer):
     This function is called between epochs/steps,
     when a metric is evaluated during training.
     """
-    for v in self.variables:
-      K.set_value(v, 0)
+    K.batch_set_value([(v, 0) for v in self.variables])
 
   @abc.abstractmethod
   def update_state(self, *args, **kwargs):
@@ -374,7 +370,7 @@ class Sum(Reduce):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.add_metric(tf.keras.metrics.Sum(name='sum_1')(outputs))
   model.compile('sgd', loss='mse')
   ```
@@ -416,7 +412,7 @@ class Mean(Reduce):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.add_metric(tf.keras.metrics.Mean(name='mean_1')(outputs))
   model.compile('sgd', loss='mse')
   ```
@@ -460,7 +456,7 @@ class MeanRelativeError(Mean):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
     'sgd',
     loss='mse',
@@ -590,7 +586,7 @@ class Accuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Accuracy()])
   ```
   """
@@ -626,7 +622,7 @@ class BinaryAccuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.BinaryAccuracy()])
   ```
   """
@@ -676,7 +672,7 @@ class CategoricalAccuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
     'sgd',
     loss='mse',
@@ -724,7 +720,7 @@ class SparseCategoricalAccuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
       'sgd',
       loss='mse',
@@ -752,7 +748,7 @@ class TopKCategoricalAccuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.TopKCategoricalAccuracy()])
   ```
   """
@@ -785,7 +781,7 @@ class SparseTopKCategoricalAccuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
     'sgd',
     metrics=[tf.keras.metrics.SparseTopKCategoricalAccuracy()])
@@ -864,8 +860,8 @@ class _ConfusionMatrixConditionCount(Metric):
 
   def reset_states(self):
     num_thresholds = len(to_list(self.thresholds))
-    for v in self.variables:
-      K.set_value(v, np.zeros((num_thresholds,)))
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
 
   def get_config(self):
     config = {'thresholds': self.init_thresholds}
@@ -899,7 +895,7 @@ class FalsePositives(_ConfusionMatrixConditionCount):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.FalsePositives()])
   ```
   """
@@ -949,7 +945,7 @@ class FalseNegatives(_ConfusionMatrixConditionCount):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.FalseNegatives()])
   ```
   """
@@ -999,7 +995,7 @@ class TrueNegatives(_ConfusionMatrixConditionCount):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.TrueNegatives()])
   ```
   """
@@ -1049,7 +1045,7 @@ class TruePositives(_ConfusionMatrixConditionCount):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.TruePositives()])
   ```
   """
@@ -1109,7 +1105,7 @@ class Precision(Metric):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Precision()])
   ```
   """
@@ -1187,8 +1183,8 @@ class Precision(Metric):
 
   def reset_states(self):
     num_thresholds = len(to_list(self.thresholds))
-    for v in self.variables:
-      K.set_value(v, np.zeros((num_thresholds,)))
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
 
   def get_config(self):
     config = {
@@ -1235,7 +1231,7 @@ class Recall(Metric):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Recall()])
   ```
   """
@@ -1313,8 +1309,8 @@ class Recall(Metric):
 
   def reset_states(self):
     num_thresholds = len(to_list(self.thresholds))
-    for v in self.variables:
-      K.set_value(v, np.zeros((num_thresholds,)))
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
 
   def get_config(self):
     config = {
@@ -1391,8 +1387,8 @@ class SensitivitySpecificityBase(Metric):
 
   def reset_states(self):
     num_thresholds = len(self.thresholds)
-    for v in self.variables:
-      K.set_value(v, np.zeros((num_thresholds,)))
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
 
 
 @keras_export('keras.metrics.SensitivityAtSpecificity')
@@ -1426,7 +1422,7 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
       'sgd',
       loss='mse',
@@ -1507,7 +1503,7 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
       'sgd',
       loss='mse',
@@ -1602,7 +1598,7 @@ class AUC(Metric):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.AUC()])
   ```
   """
@@ -1809,8 +1805,8 @@ class AUC(Metric):
 
   def reset_states(self):
     num_thresholds = len(self.thresholds)
-    for v in self.variables:
-      K.set_value(v, np.zeros((num_thresholds,)))
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
 
   def get_config(self):
     config = {
@@ -1822,44 +1818,54 @@ class AUC(Metric):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@keras_export('keras.metrics.CosineProximity')
-class CosineProximity(MeanMetricWrapper):
-  """Computes the cosine distance between the labels and predictions.
+@keras_export('keras.metrics.CosineSimilarity')
+class CosineSimilarity(MeanMetricWrapper):
+  """Computes the cosine similarity between the labels and predictions.
+
+  cosine similarity = (a . b) / ||a|| ||b||
+  (https://en.wikipedia.org/wiki/Cosine_similarity)
 
   For example, if `y_true` is [0, 1, 1], and `y_pred` is [1, 0, 1], the cosine
-  proximity is -0.5.
+  similarity is 0.5.
 
-  This metric keeps the average cosine distance between `predictions` and
+  This metric keeps the average cosine similarity between `predictions` and
   `labels` over a stream of data.
 
   Usage:
   ```python
-  m = tf.metrics.CosineProximity()
-  m.update_state([0, 1, 1], [1, 0, 1])
-  print('Final result: ', m.result().numpy())  # Final result: -0.5
+  m = tf.keras.metrics.CosineSimilarity(axis=1)
+  m.update_state([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]])
+  # l2_norm(y_true) = [[0., 1.], [1./1.414], 1./1.414]]]
+  # l2_norm(y_pred) = [[1., 0.], [1./1.414], 1./1.414]]]
+  # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]]
+  # result = mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1))
+         = ((0. + 0.) +  (0.5 + 0.5)) / 2
+
+  print('Final result: ', m.result().numpy())  # Final result: 0.5
   ```
 
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
       'sgd',
       loss='mse',
-      metrics=[tf.metrics.CosineProximity()])
+      metrics=[tf.keras.metrics.CosineSimilarity(axis=1)])
   ```
   """
 
-  def __init__(self, name='cosine_proximity', dtype=None, axis=-1):
-    """Creates a `CosineProximity` instance.
+  def __init__(self, name='cosine_similarity', dtype=None, axis=-1):
+    """Creates a `CosineSimilarity` instance.
 
     Args:
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
       axis: (Optional) Defaults to -1. The dimension along which the cosine
-        proximity is computed.
+        similarity is computed.
     """
-    super(CosineProximity, self).__init__(cosine, name, dtype=dtype, axis=axis)
+    super(CosineSimilarity, self).__init__(
+        cosine_similarity, name, dtype=dtype, axis=axis)
 
 
 @keras_export('keras.metrics.MeanAbsoluteError')
@@ -1871,7 +1877,7 @@ class MeanAbsoluteError(MeanMetricWrapper):
 
   Usage:
   ```python
-  m = tf.metrics.MeanAbsoluteError()
+  m = tf.keras.metrics.MeanAbsoluteError()
   m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
   print('Final result: ', m.result().numpy())  # Final result: 0.75
   ```
@@ -1879,7 +1885,7 @@ class MeanAbsoluteError(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.MeanAbsoluteError()])
   ```
   """
@@ -1907,7 +1913,7 @@ class MeanAbsolutePercentageError(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.MeanAbsolutePercentageError()])
   ```
   """
@@ -1935,7 +1941,7 @@ class MeanSquaredError(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.MeanSquaredError()])
   ```
   """
@@ -1963,7 +1969,7 @@ class MeanSquaredLogarithmicError(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.MeanSquaredLogarithmicError()])
   ```
   """
@@ -1997,7 +2003,7 @@ class Hinge(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.Hinge()])
   ```
   """
@@ -2030,7 +2036,7 @@ class SquaredHinge(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.SquaredHinge()])
   ```
   """
@@ -2057,7 +2063,7 @@ class CategoricalHinge(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.CategoricalHinge()])
   ```
   """
@@ -2081,7 +2087,7 @@ class RootMeanSquaredError(Mean):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.RootMeanSquaredError()])
   ```
   """
@@ -2131,7 +2137,7 @@ class LogCoshError(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.LogCoshError()])
   ```
   """
@@ -2157,7 +2163,7 @@ class Poisson(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.Poisson()])
   ```
   """
@@ -2183,7 +2189,7 @@ class KLDivergence(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.KLDivergence()])
   ```
   """
@@ -2224,7 +2230,7 @@ class MeanIoU(Metric):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
     'sgd',
     loss='mse',
@@ -2330,7 +2336,7 @@ class MeanTensor(Metric):
   Usage:
 
   ```python
-  m = tf.metrics.MeanTensor()
+  m = tf.keras.metrics.MeanTensor()
   m.update_state([0, 1, 2, 3])
   m.update_state([4, 5, 6, 7])
   print('Result: ', m.result().numpy())  # Result: [2, 3, 4, 5]
@@ -2425,8 +2431,8 @@ class MeanTensor(Metric):
 
   def reset_states(self):
     if self._built:
-      for v in self.variables:
-        K.set_value(v, np.zeros(self._shape.as_list()))
+      K.batch_set_value(
+          [(v, np.zeros(self._shape.as_list())) for v in self.variables])
 
 
 @keras_export('keras.metrics.BinaryCrossentropy')
@@ -2458,7 +2464,7 @@ class BinaryCrossentropy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
       'sgd',
       loss='mse',
@@ -2524,7 +2530,7 @@ class CategoricalCrossentropy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
     'sgd',
     loss='mse',
@@ -2598,7 +2604,7 @@ class SparseCategoricalCrossentropy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
     'sgd',
     loss='mse',
@@ -2628,6 +2634,63 @@ class SparseCategoricalCrossentropy(MeanMetricWrapper):
         axis=axis)
 
 
+class SumOverBatchSize(Reduce):
+  """Computes the weighted sum over batch size of the given values.
+
+  For example, if values is [1, 3, 5, 7] then the metric value is 4.
+  If the weights were specified as [1, 1, 0, 0] then the value would be 1.
+
+  This metric creates two variables, `total` and `count` that are used to
+  compute the average of `values`. This average is ultimately returned as sum
+  over batch size which is an idempotent operation that simply divides `total`
+  by `count`.
+
+  If `sample_weight` is `None`, weights default to 1.  Use `sample_weight` of 0
+  to mask values.
+  """
+
+  def __init__(self, name='sum_over_batch_size', dtype=None):
+    super(SumOverBatchSize, self).__init__(
+        reduction=metrics_utils.Reduction.SUM_OVER_BATCH_SIZE,
+        name=name,
+        dtype=dtype)
+
+
+class SumOverBatchSizeMetricWrapper(SumOverBatchSize):
+  """Wraps a function with the `SumOverBatchSizeMetricWrapper` metric."""
+
+  def __init__(self, fn, name=None, dtype=None, **kwargs):
+    """Creates a `SumOverBatchSizeMetricWrapper` instance.
+
+    Args:
+      fn: The metric function to wrap, with signature `fn(y_true, y_pred,
+        **kwargs)`.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      **kwargs: The keyword arguments that are passed on to `fn`.
+    """
+    super(SumOverBatchSizeMetricWrapper, self).__init__(name=name, dtype=dtype)
+    self._fn = fn
+    self._fn_kwargs = kwargs
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    y_true = math_ops.cast(y_true, self._dtype)
+    y_pred = math_ops.cast(y_pred, self._dtype)
+    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
+        y_pred, y_true, sample_weight)
+
+    matches = self._fn(y_true, y_pred, **self._fn_kwargs)
+    return super(SumOverBatchSizeMetricWrapper, self).update_state(
+        matches, sample_weight=sample_weight)
+
+  def get_config(self):
+    config = {}
+    for k, v in six.iteritems(self._fn_kwargs):
+      config[k] = K.eval(v) if is_tensor_or_variable(v) else v
+    base_config = super(SumOverBatchSizeMetricWrapper, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 def accuracy(y_true, y_pred):
   y_pred.get_shape().assert_is_compatible_with(y_true.get_shape())
   if y_true.dtype != y_pred.dtype:
@@ -2691,7 +2754,7 @@ mse = MSE = mean_squared_error
 mae = MAE = mean_absolute_error
 mape = MAPE = mean_absolute_percentage_error
 msle = MSLE = mean_squared_logarithmic_error
-cosine = cosine_proximity
+cosine_proximity = cosine_similarity
 
 
 def clone_metric(metric):
diff --git a/tensorflow/python/keras/metrics_correctness_test.py b/tensorflow/python/keras/metrics_correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..abef3c4d3f1bd78ae70ea9662c3d49f473c0561c
--- /dev/null
+++ b/tensorflow/python/keras/metrics_correctness_test.py
@@ -0,0 +1,322 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests metrics correctness using Keras model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python import tf2
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import layers
+from tensorflow.python.keras import metrics
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_with_all_model_types(exclude_models=['sequential'])
+@keras_parameterized.run_all_keras_modes
+class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase):
+
+  def _get_multi_io_model(self):
+    inp_1 = layers.Input(shape=(1,), name='input_1')
+    inp_2 = layers.Input(shape=(1,), name='input_2')
+    x = layers.Dense(3, kernel_initializer='ones', trainable=False)
+    out_1 = layers.Dense(
+        1, kernel_initializer='ones', name='output_1', trainable=False)
+    out_2 = layers.Dense(
+        1, kernel_initializer='ones', name='output_2', trainable=False)
+
+    branch_a = [inp_1, x, out_1]
+    branch_b = [inp_2, x, out_2]
+    model = testing_utils.get_multi_io_model(branch_a, branch_b)
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics=[metrics.MeanSquaredError(name='mean_squared_error')],
+        weighted_metrics=[
+            metrics.MeanSquaredError(name='mean_squared_error_2')
+        ],
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
+
+  def _custom_generator(self):
+    batch_size = 2
+    num_samples = 4
+    inputs = np.asarray([[1.], [2.], [3.], [4.]])
+    targets = np.asarray([[2.], [4.], [6.], [8.]])
+    w1 = np.asarray([2., 3., 4., 5.])
+    w2 = np.asarray([3.5, 2.5, 1.5, 0.5])
+    i = 0
+    while True:
+      batch_index = i * batch_size % num_samples
+      i += 1
+      start = batch_index
+      end = start + batch_size
+      x = [inputs[start:end], inputs[start:end]]
+      y = [targets[start:end], targets[start:end]]
+      w = [w1[start:end], w2[start:end]]
+      yield x, y, w
+
+  def setUp(self):
+    super(TestMetricsCorrectnessMultiIO, self).setUp()
+    self.x = np.asarray([[1.], [2.], [3.], [4.]])
+    self.y = np.asarray([[2.], [4.], [6.], [8.]])
+    self.weights_1 = np.asarray([2., 3., 4., 5.])
+    self.weights_2 = np.asarray([3.5, 2.5, 1.5, 0.5])
+
+    # y_true = [[2.], [4.], [6.], [8.]], y_pred = [[3.], [6.], [9.], [12.]]
+
+    # Metric `output_1`, `output_2`:
+    #   Total = ((3 - 2)^2 + (6 - 4)^2) + ((9 - 6)^2 + (12 - 8)^2) = 30,
+    #   Count = 2 + 2
+    #   Result = 7.5
+
+    # Weighted metric `output_1`:
+    #   Total = ((3 - 2)^2 * 2  + (6 - 4)^2 * 3) +
+    #           ((9 - 6)^2 * 4 + (12 - 8)^2 * 5)
+    #         = 130
+    #   Count = (2 + 3) + (4 + 5)
+    #   Result = 9.2857141
+
+    # Weighted metric `output_2`:
+    #   Total = ((3 - 2)^2 * 3.5 + (6 - 4)^2 * 2.5) +
+    #           ((9 - 6)^2 * 1.5 + (12 - 8)^2 * 0.5)
+    #         = 35
+    #   Count = (3.5 + 2.5) + (1.5 + 0.5)
+    #   Result = 4.375
+
+    # Loss `output_1`:
+    #   Total = ((3 - 2)^2 * 2  + (6 - 4)^2 * 3) +
+    #           ((9 - 6)^2 * 4 + (12 - 8)^2 * 5)
+    #         = 130
+    #   Count = 2 + 2
+    #   Result = 32.5
+
+    # Loss `output_2`:
+    #   Total = ((3 - 2)^2 * 3.5 + (6 - 4)^2 * 2.5) +
+    #           ((9 - 6)^2 * 1.5 + (12 - 8)^2 * 0.5)
+    #         = 35
+    #   Count = 2 + 2
+    #   Result = 8.75
+
+    # Total loss = 32.5 + 8.75 = 41.25
+
+    wmse = 'mean_squared_error_2'
+    if not tf2.enabled():
+      wmse = 'weighted_' + wmse
+    self.expected_fit_result = {
+        'output_1_mean_squared_error': [7.5, 7.5],
+        'output_2_mean_squared_error': [7.5, 7.5],
+        'output_1_' + wmse: [9.286, 9.286],
+        'output_2_' + wmse: [4.375, 4.375],
+        'loss': [41.25, 41.25],
+        'output_1_loss': [32.5, 32.5],
+        'output_2_loss': [8.75, 8.75],
+    }
+
+    # In the order: 'loss', 'output_1_loss', 'output_2_loss',
+    # 'output_1_mean_squared_error', 'output_1_mean_squared_error_2',
+    # 'output_2_mean_squared_error', 'output_2_mean_squared_error_2'
+    self.expected_batch_result = [41.25, 32.5, 8.75, 7.5, 9.286, 7.5, 4.375]
+
+  def test_fit(self):
+    model = self._get_multi_io_model()
+    history = model.fit([self.x, self.x], [self.y, self.y],
+                        sample_weight={
+                            'output_1': self.weights_1,
+                            'output_2': self.weights_2,
+                        },
+                        batch_size=2,
+                        epochs=2,
+                        shuffle=False)
+    for key, value in self.expected_fit_result.items():
+      self.assertAllClose(history.history[key], value, 1e-3)
+
+  def test_eval(self):
+    model = self._get_multi_io_model()
+    eval_result = model.evaluate([self.x, self.x], [self.y, self.y],
+                                 batch_size=2,
+                                 sample_weight={
+                                     'output_1': self.weights_1,
+                                     'output_2': self.weights_2,
+                                 })
+    self.assertAllClose(eval_result, self.expected_batch_result, 1e-3)
+
+    # Verify that metric value is same with arbitrary weights and batch size.
+    x = np.random.random((50, 1))
+    y = np.random.random((50, 1))
+    w = np.random.random((50,))
+    mse1 = model.evaluate([x, x], [y, y], sample_weight=[w, w], batch_size=5)[3]
+    mse2 = model.evaluate([x, x], [y, y], sample_weight=[w, w],
+                          batch_size=10)[3]
+    self.assertAllClose(mse1, mse2, 1e-3)
+
+  def test_train_on_batch(self):
+    model = self._get_multi_io_model()
+    result = model.train_on_batch([self.x, self.x], [self.y, self.y],
+                                  sample_weight={
+                                      'output_1': self.weights_1,
+                                      'output_2': self.weights_2,
+                                  })
+    self.assertAllClose(result, self.expected_batch_result, 1e-3)
+
+  def test_test_on_batch(self):
+    model = self._get_multi_io_model()
+    result = model.test_on_batch([self.x, self.x], [self.y, self.y],
+                                 sample_weight={
+                                     'output_1': self.weights_1,
+                                     'output_2': self.weights_2,
+                                 })
+    self.assertAllClose(result, self.expected_batch_result, 1e-3)
+
+  def test_fit_generator(self):
+    model = self._get_multi_io_model()
+    history = model.fit_generator(
+        self._custom_generator(), steps_per_epoch=2, epochs=2)
+    for key, value in self.expected_fit_result.items():
+      self.assertAllClose(history.history[key], value, 1e-3)
+
+  def test_eval_generator(self):
+    model = self._get_multi_io_model()
+    eval_result = model.evaluate_generator(self._custom_generator(), steps=2)
+    self.assertAllClose(eval_result, self.expected_batch_result, 1e-3)
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class TestMetricsCorrectnessSingleIO(keras_parameterized.TestCase):
+
+  def _get_model(self):
+    x = layers.Dense(3, kernel_initializer='ones', trainable=False)
+    out = layers.Dense(
+        1, kernel_initializer='ones', name='output', trainable=False)
+    model = testing_utils.get_model_from_layers([x, out], input_shape=(1,))
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics=[metrics.MeanSquaredError(name='mean_squared_error')],
+        weighted_metrics=[
+            metrics.MeanSquaredError(name='mean_squared_error_2')
+        ],
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
+
+  def _custom_generator(self):
+    batch_size = 2
+    num_samples = 4
+    x = np.asarray([[1.], [2.], [3.], [4.]])
+    y = np.asarray([[2.], [4.], [6.], [8.]])
+    w = np.asarray([2., 3., 4., 5.])
+    i = 0
+    while True:
+      batch_index = i * batch_size % num_samples
+      i += 1
+      start = batch_index
+      end = start + batch_size
+      yield x[start:end], y[start:end], w[start:end]
+
+  def setUp(self):
+    super(TestMetricsCorrectnessSingleIO, self).setUp()
+    self.x = np.asarray([[1.], [2.], [3.], [4.]])
+    self.y = np.asarray([[2.], [4.], [6.], [8.]])
+    self.weights = np.asarray([2., 3., 4., 5.])
+
+    # y_true = [[2.], [4.], [6.], [8.]], y_pred = [[3.], [6.], [9.], [12.]]
+
+    # Metric:
+    #   Total = ((3 - 2)^2 + (6 - 4)^2) + ((9 - 6)^2 + (12 - 8)^2) = 30,
+    #   Count = 2 + 2
+    #   Result = 7.5
+
+    # Weighted metric:
+    #   Total = ((3 - 2)^2 * 2  + (6 - 4)^2 * 3) +
+    #           ((9 - 6)^2 * 4 + (12 - 8)^2 * 5)
+    #         = 130
+    #   Count = (2 + 3) + (4 + 5)
+    #   Result = 9.2857141
+
+    # Total loss:
+    #   Total = ((3 - 2)^2 * 2  + (6 - 4)^2 * 3) +
+    #           ((9 - 6)^2 * 4 + (12 - 8)^2 * 5)
+    #         = 130,
+    #   Count = 2 + 2
+    #   Result = 32.5
+
+    wmse = 'mean_squared_error_2'
+    if not tf2.enabled():
+      wmse = 'weighted_' + wmse
+    self.expected_fit_result = {
+        'mean_squared_error': [7.5, 7.5],
+        wmse: [9.286, 9.286],
+        'loss': [32.5, 32.5]
+    }
+
+    # In the order: 'loss', 'mean_squared_error', 'mean_squared_error_2'
+    self.expected_batch_result = [32.5, 7.5, 9.286]
+
+  def test_fit(self):
+    model = self._get_model()
+    history = model.fit(
+        self.x,
+        self.y,
+        sample_weight=self.weights,
+        batch_size=2,
+        epochs=2,
+        shuffle=False)
+    for key, value in self.expected_fit_result.items():
+      self.assertAllClose(history.history[key], value, 1e-3)
+
+  def test_eval(self):
+    model = self._get_model()
+    eval_result = model.evaluate(
+        self.x, self.y, batch_size=2, sample_weight=self.weights)
+    self.assertAllClose(eval_result, self.expected_batch_result, 1e-3)
+
+    # Verify that metric value is same with arbitrary weights and batch size.
+    x = np.random.random((50, 1))
+    y = np.random.random((50, 1))
+    w = np.random.random((50,))
+    mse1 = model.evaluate(x, y, sample_weight=w, batch_size=5)[1]
+    mse2 = model.evaluate(x, y, sample_weight=w, batch_size=10)[1]
+    self.assertAllClose(mse1, mse2, 1e-3)
+
+  def test_train_on_batch(self):
+    model = self._get_model()
+    result = model.train_on_batch(self.x, self.y, sample_weight=self.weights)
+    self.assertAllClose(result, self.expected_batch_result, 1e-3)
+
+  def test_test_on_batch(self):
+    model = self._get_model()
+    result = model.test_on_batch(self.x, self.y, sample_weight=self.weights)
+    self.assertAllClose(result, self.expected_batch_result, 1e-3)
+
+  def test_fit_generator(self):
+    model = self._get_model()
+    history = model.fit_generator(
+        self._custom_generator(), steps_per_epoch=2, epochs=2)
+    for key, value in self.expected_fit_result.items():
+      self.assertAllClose(history.history[key], value, 1e-3)
+
+  def test_eval_generator(self):
+    model = self._get_model()
+    eval_result = model.evaluate_generator(self._custom_generator(), steps=2)
+    self.assertAllClose(eval_result, self.expected_batch_result, 1e-3)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 4143066a1c3ef391538156b8a83784793676f979..ffe093532a4305e5ec79d17fa3b52f0f70b52757 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -37,7 +37,7 @@ from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -131,7 +131,7 @@ class KerasSumTest(test.TestCase):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
     m = metrics.Sum()
-    checkpoint = checkpointable_utils.Checkpoint(sum=m)
+    checkpoint = trackable_utils.Checkpoint(sum=m)
     self.evaluate(variables.variables_initializer(m.variables))
 
     # update state
@@ -149,7 +149,7 @@ class KerasSumTest(test.TestCase):
 
     # restore to a different checkpoint sum object
     restore_sum = metrics.Sum()
-    restore_checkpoint = checkpointable_utils.Checkpoint(sum=restore_sum)
+    restore_checkpoint = trackable_utils.Checkpoint(sum=restore_sum)
     status = restore_checkpoint.restore(save_path)
     restore_update = restore_sum(300.)
     status.assert_consumed().run_restore_ops()
@@ -267,7 +267,7 @@ class KerasMeanTest(test.TestCase):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
     m = metrics.Mean()
-    checkpoint = checkpointable_utils.Checkpoint(mean=m)
+    checkpoint = trackable_utils.Checkpoint(mean=m)
     self.evaluate(variables.variables_initializer(m.variables))
 
     # update state
@@ -285,7 +285,7 @@ class KerasMeanTest(test.TestCase):
 
     # restore to a different checkpoint mean object
     restore_mean = metrics.Mean()
-    restore_checkpoint = checkpointable_utils.Checkpoint(mean=restore_mean)
+    restore_checkpoint = trackable_utils.Checkpoint(mean=restore_mean)
     status = restore_checkpoint.restore(save_path)
     restore_update = restore_mean(300.)
     status.assert_consumed().run_restore_ops()
@@ -453,7 +453,7 @@ class KerasAccuracyTest(test.TestCase):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class CosineProximityTest(test.TestCase):
+class CosineSimilarityTest(test.TestCase):
 
   def l2_norm(self, x, axis):
     epsilon = 1e-12
@@ -467,25 +467,25 @@ class CosineProximityTest(test.TestCase):
 
     y_true = self.l2_norm(self.np_y_true, axis)
     y_pred = self.l2_norm(self.np_y_pred, axis)
-    self.expected_loss = -np.sum(np.multiply(y_true, y_pred), axis=(axis,))
+    self.expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(axis,))
 
     self.y_true = constant_op.constant(self.np_y_true)
     self.y_pred = constant_op.constant(self.np_y_pred)
 
   def test_config(self):
-    cosine_obj = metrics.CosineProximity(
+    cosine_obj = metrics.CosineSimilarity(
         axis=2, name='my_cos', dtype=dtypes.int32)
     self.assertEqual(cosine_obj.name, 'my_cos')
     self.assertEqual(cosine_obj._dtype, dtypes.int32)
 
     # Check save and restore config
-    cosine_obj2 = metrics.CosineProximity.from_config(cosine_obj.get_config())
+    cosine_obj2 = metrics.CosineSimilarity.from_config(cosine_obj.get_config())
     self.assertEqual(cosine_obj2.name, 'my_cos')
     self.assertEqual(cosine_obj2._dtype, dtypes.int32)
 
   def test_unweighted(self):
     self.setup()
-    cosine_obj = metrics.CosineProximity()
+    cosine_obj = metrics.CosineSimilarity()
     self.evaluate(variables.variables_initializer(cosine_obj.variables))
     loss = cosine_obj(self.y_true, self.y_pred)
     expected_loss = np.mean(self.expected_loss)
@@ -493,7 +493,7 @@ class CosineProximityTest(test.TestCase):
 
   def test_weighted(self):
     self.setup()
-    cosine_obj = metrics.CosineProximity()
+    cosine_obj = metrics.CosineSimilarity()
     self.evaluate(variables.variables_initializer(cosine_obj.variables))
     sample_weight = np.asarray([1.2, 3.4])
     loss = cosine_obj(
@@ -506,7 +506,7 @@ class CosineProximityTest(test.TestCase):
 
   def test_axis(self):
     self.setup(axis=1)
-    cosine_obj = metrics.CosineProximity(axis=1)
+    cosine_obj = metrics.CosineSimilarity(axis=1)
     self.evaluate(variables.variables_initializer(cosine_obj.variables))
     loss = cosine_obj(self.y_true, self.y_pred)
     expected_loss = np.mean(self.expected_loss)
diff --git a/tensorflow/python/keras/mixed_precision/experimental/BUILD b/tensorflow/python/keras/mixed_precision/experimental/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f994ab9c70a7491c3252ff781ac48055fe59bb9d
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/BUILD
@@ -0,0 +1,54 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Description:
+#   Contains the Keras Mixed Precision API (TensorFlow version).
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "autocast_variable",
+    srcs = [
+        "autocast_variable.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python/distribute:values",
+    ],
+)
+
+py_test(
+    name = "autocast_variable_test",
+    size = "medium",
+    srcs = ["autocast_variable_test.py"],
+    deps = [
+        ":autocast_variable",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python/distribute:mirrored_strategy",
+        "//tensorflow/python/eager:context",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
new file mode 100644
index 0000000000000000000000000000000000000000..a64b5178316009354c6adecc8213bf7681504e6f
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
@@ -0,0 +1,178 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains AutoCastVariable, a variable which automatically casts itself."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute import values as distribute_values
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+
+
+# TODO(reedwm): Make checkpointable?
+class AutoCastVariable(object):
+  """Variable that will cast itself to a different dtype in applicable contexts.
+
+  This class wraps a floating-point tf.Variable. It emulates the variable
+  interface and delegates to the wrapped variable, but it additionally will cast
+  the wrapped variable under a `Graph._enable_variable_auto_cast(dtype)` context
+  manager.
+
+  For example:
+
+  ```
+  v = tf.Variable(1.0, dtype=tf.float32)
+  v = AutoCastVariable(v)
+  print(tf.identity(v).dtype)  # tf.float32
+  with ops.get_default_graph()._enable_variable_auto_cast(tf.float16):
+    print(tf.identity(v).dtype)  # tf.float16, as v will cast itself to float16
+    print(v.dtype)  # tf.float16, as v.dtype also changes under the ctx manager.
+  ```
+
+  The purpose of this class is to allow Keras layers to create variables in
+  float32, and automatically cast them to float16 or bfloat16 when the layer is
+  called.
+  """
+
+  def __init__(self, variable):
+    """Creates an AutoCastVariable instance.
+
+    Args:
+      variable: A floating-point resource variable to wrap.
+
+    Raises:
+      ValueError: If `variable` is not a floating-point resource variable
+    """
+    if not resource_variable_ops.is_resource_variable(variable):
+      raise ValueError('variable must be of type tf.ResourceVariable, but got: '
+                       '%s' % variable)
+    if not variable.dtype.is_floating:
+      raise ValueError('variable must be a floating point variable but has '
+                       'type: %s' % variable.dtype.name)
+    self._variable = variable
+
+  @property
+  def name(self):
+    return self._variable.name
+
+  def _should_cast(self):
+    """Returns True if this variable should be casted when accessed."""
+    g = ops.get_default_graph()
+    # pylint:disable=protected-access
+    return (g._auto_cast_variable_read_dtype is not None and
+            self.true_dtype != g._auto_cast_variable_read_dtype)
+    # pylint:enable=protected-access
+
+  @property
+  def dtype(self):
+    """The dtype this variable will be casted to when read."""
+    if self._should_cast():
+      return ops.get_default_graph()._auto_cast_variable_read_dtype  # pylint:disable=protected-access
+    else:
+      return self._variable.dtype
+
+  @property
+  def true_dtype(self):
+    """The dtype of the underlying variable, before any casts are done."""
+    return self._variable.dtype
+
+  def value(self):
+    val = self._variable.value()
+    if not self._should_cast():
+      return val
+    # We colocate_with(None) to ignore the existing device constraints, so that
+    # the cast is always done on the variable's device
+    with ops.colocate_with(None, ignore_existing=True):
+      with ops.device(val.device):
+        return math_ops.cast(val, self.dtype)
+
+  def read_value(self):
+    val = self._variable.read_value()
+    if not self._should_cast():
+      return val
+    return math_ops.cast(val, self.dtype)
+
+  def sparse_read(self, indices, name=None):
+    """Reads the value of this variable sparsely, using `gather`."""
+    val = self._variable.sparse_read(indices, name=name)
+    if not self._should_cast():
+      return val
+    return math_ops.cast(val, self.dtype)
+
+  def assign(self, value, use_locking=None, name=None, read_value=True):
+    return self._variable.assign(
+        value, use_locking=use_locking, name=name, read_value=read_value)
+
+  def assign_add(self, delta, use_locking=None, name=None, read_value=True):
+    return self._variable.assign_add(
+        delta, use_locking=use_locking, name=name, read_value=read_value)
+
+  def assign_sub(self, delta, use_locking=None, name=None, read_value=True):
+    return self._variable.assign_sub(
+        delta, use_locking=use_locking, name=name, read_value=read_value)
+
+  # TODO(reedwm): Support assigning variables with tf.assign(), var.scatter_add,
+  # etc.
+
+  def __getattr__(self, name):
+    return getattr(self._variable, name)
+
+  def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
+    """Converts this variable to a tensor."""
+    if not self._should_cast():
+      return ops.internal_convert_to_tensor(self._variable, dtype, name,
+                                            as_ref)
+    # TODO(reedwm): Support as_ref?
+    assert not as_ref
+    if dtype is not None and not dtype.is_compatible_with(self.dtype):
+      raise ValueError(
+          'Incompatible type conversion requested to type {!r} for variable '
+          'of type {!r}'.format(dtype.name, self.dtype.name))
+    val = ops.internal_convert_to_tensor(self._variable,
+                                         self._variable.dtype, name,
+                                         as_ref=False)
+    with ops.colocate_with(None, ignore_existing=True):
+      with ops.device(val.device):
+        return math_ops.cast(val, self.dtype)
+
+  def _should_act_as_resource_variable(self):
+    """Pass resource_variable_ops.is_resource_variable check."""
+    pass
+
+  # TODO(reedwm): Define operator overloads.
+
+
+ops.register_tensor_conversion_function(
+    AutoCastVariable, AutoCastVariable._dense_var_to_tensor)  # pylint:disable=protected-access
+ops.register_dense_tensor_like_type(AutoCastVariable)
+
+
+# We have DistributedVariable subclass to pass
+# isinstance(..., DistributedVariable) checks when wrapping a
+# DistributedVariable.
+# TODO(reedwm): We should not wrap DistributedVariable, but instead have
+# DistributedVariable wrap AutoCastVariable. Subclassing DistributedVariable is
+# messy, because we do not fully implement the interface of DistributedVariable.
+class AutoCastDistributedVariable(AutoCastVariable,
+                                  distribute_values.DistributedVariable):
+  """Version of AutoCastVariable that subclasses DistributedVariable."""
+
+  def __init__(self, variable):
+    if not isinstance(variable, distribute_values.DistributedValues):
+      raise ValueError('variable must be of type DistributedValues, '
+                       'but got: %s' % variable)
+    super(AutoCastDistributedVariable, self).__init__(variable)
diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1caec6a738709768b35aeab9bd18fe67e90982a9
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py
@@ -0,0 +1,245 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for AutoCastVariable."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.mixed_precision.experimental import autocast_variable
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+TESTCASES = ({
+    'testcase_name': 'base',
+    'distribute': False
+}, {
+    'testcase_name': 'distribute',
+    'distribute': True
+})
+
+
+def get_distribute_scope(distribute):
+
+  class DummyContextManager(object):
+
+    def __enter__(self):
+      pass
+
+    def __exit__(self, *args):
+      pass
+
+  if distribute:
+    return mirrored_strategy.MirroredStrategy(['cpu:0']).scope()
+  else:
+    return DummyContextManager()
+
+
+def get_autocast_var(var, distribute):
+  if distribute:
+    return autocast_variable.AutoCastDistributedVariable(var)
+  else:
+    return autocast_variable.AutoCastVariable(var)
+
+
+def get_var(val, dtype):
+  return variables.VariableV1(val, use_resource=True, dtype=dtype)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class AutoCastVariableTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_read(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(1., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+
+      # outside of auto cast scope.
+      self.assertEqual(x.dtype, dtypes.float32)
+      self.assertEqual(x.value().dtype, dtypes.float32)
+      self.assertEqual(x.read_value().dtype, dtypes.float32)
+      self.assertEqual(array_ops.identity(x).dtype, dtypes.float32)
+
+      # within auto cast scope of different dtype
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        self.assertEqual(x.dtype, dtypes.float16)
+        self.assertEqual(x.value().dtype, dtypes.float16)
+        self.assertEqual(x.read_value().dtype, dtypes.float16)
+        self.assertEqual(array_ops.identity(x).dtype, dtypes.float16)
+
+      # within auto cast scope of same dtype
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float32):
+        self.assertEqual(x.dtype, dtypes.float32)
+        self.assertEqual(x.value().dtype, dtypes.float32)
+        self.assertEqual(x.read_value().dtype, dtypes.float32)
+        self.assertEqual(array_ops.identity(x).dtype, dtypes.float32)
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_read_nested_scopes(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(1., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        self.assertEqual(x.dtype, dtypes.float16)
+        self.assertEqual(x.read_value().dtype, dtypes.float16)
+
+        with ops.get_default_graph()._enable_auto_casting_variables(
+            dtypes.float32):
+          self.assertEqual(x.dtype, dtypes.float32)
+          self.assertEqual(x.read_value().dtype, dtypes.float32)
+
+        self.assertEqual(x.dtype, dtypes.float16)
+        self.assertEqual(x.read_value().dtype, dtypes.float16)
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_operator_overloads(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(1., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+
+    v1 = constant_op.constant(2., dtype=dtypes.float32)
+    v2 = constant_op.constant(2., dtype=dtypes.float16)
+
+    # Because autocast variables do not yet define operator overloads, the
+    # operator is defined by the non-variable tensor
+
+    # Test variable as the LHS. Currently, this is not supported with
+    # distributed autocast variables
+    if not distribute:
+      self.assertEqual(self.evaluate(x + v1), 3.)
+
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        self.assertEqual(self.evaluate(x + v2), 3.)
+
+    # Test variable as the RHS
+    self.assertEqual(self.evaluate(v1 + x), 3.)
+
+    with ops.get_default_graph()._enable_auto_casting_variables(
+        dtypes.float16):
+      self.assertEqual(self.evaluate(v2 + x), 3.)
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_assign(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(0., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+
+      # outside of auto cast scope.
+      v1 = constant_op.constant(3.14, dtype=dtypes.float32)
+      v2 = constant_op.constant(3.14, dtype=dtypes.float16)
+
+      def run_and_check():
+        # Assign float32 values
+        self.assertAllClose(3.14, self.evaluate(x.assign(v1)))
+        self.assertAllClose(3.14 * 2, self.evaluate(x.assign_add(v1)))
+        self.assertAllClose(3.14, self.evaluate(x.assign_sub(v1)))
+
+        # Attempt to assign float16 values
+        with self.assertRaisesRegexp(
+            ValueError,
+            'conversion requested dtype float32 for Tensor with dtype float16'):
+          self.evaluate(x.assign(v2))
+        with self.assertRaisesRegexp(
+            ValueError,
+            'conversion requested dtype float32 for Tensor with dtype float16'):
+          self.evaluate(x.assign_add(v2))
+        with self.assertRaisesRegexp(
+            ValueError,
+            'conversion requested dtype float32 for Tensor with dtype float16'):
+          self.evaluate(x.assign_sub(v2))
+
+        # Assign Python floats
+        self.assertAllClose(3.14, self.evaluate(x.assign(3.14)))
+        self.assertAllClose(3.14 * 2, self.evaluate(x.assign_add(3.14)))
+        self.assertAllClose(3.14, self.evaluate(x.assign_sub(3.14)))
+
+      run_and_check()
+      # reset x
+      self.evaluate(x.assign(0.))
+      # within auto cast scope.
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        # assign still expect float32 value even if in float16 scope
+        run_and_check()
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_assign_stays_in_true_dtype(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(1., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+      # small_val is a value such that 1.0 + small_val == 1.0 in fp16, but not
+      # in fp32
+      small_val = np.finfo('float16').eps / 2
+      small_tensor = constant_op.constant(small_val, dtype=dtypes.float32)
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        # Variable should be increased, despite it appearing to be the same
+        # float16 value.
+        self.assertEqual(1. + small_val,
+                         self.evaluate(x.assign(1. + small_tensor)))
+        self.assertEqual(1., self.evaluate(x.value()))
+      self.assertEqual(1. + small_val, self.evaluate(x.value()))
+
+      self.evaluate(x.assign(1.))
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        self.assertEqual(1. + small_val,
+                         self.evaluate(x.assign_add(small_tensor)))
+        self.assertEqual(1., self.evaluate(x.value()))
+      self.assertEqual(1. + small_val, self.evaluate(x.value()))
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_invalid_wrapped_variable(self, distribute):
+    with get_distribute_scope(distribute):
+      # Wrap a non-variable
+      with self.assertRaisesRegexp(ValueError, 'variable must be of type'):
+        x = constant_op.constant([1.], dtype=dtypes.float32)
+        get_autocast_var(x, distribute)
+
+      # Wrap a non-floating point variable
+      with self.assertRaisesRegexp(ValueError,
+                                   'variable must be a floating point'):
+        x = get_var(1, dtypes.int32)
+        get_autocast_var(x, distribute)
+
+    if distribute:
+      # Wrap a non-distributed variable with AutoCastDistributedVariable
+      with self.assertRaisesRegexp(ValueError, 'variable must be of type'):
+        x = get_var(1., dtypes.float32)
+        get_autocast_var(x, distribute)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index 6d8ff9d847bafe8a6632741dd8ccb09295db3057..5220f4e28f4244773fb4a6597fd2939fdd799662 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -35,7 +35,7 @@ from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import data_structures
+from tensorflow.python.training.tracking import data_structures
 
 try:
   import h5py  # pylint:disable=g-import-not-at-top
diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py
index 9bc5aa2be5628d05b97ac58058f0183b5375b7d3..487259864c40abab644272a6c2a341d8335fc9be 100644
--- a/tensorflow/python/keras/models.py
+++ b/tensorflow/python/keras/models.py
@@ -313,15 +313,16 @@ def _in_place_subclassed_model_reset(model):
       continue
     if isinstance(value, Layer):
       attributes_cache[name] = value
-      assert value in model._layers
-      if hasattr(value, '_layers') and value._layers:
+      assert value in model.layers
+      if hasattr(value, 'layers') and value.layers:
         raise ValueError('We do not support the use of nested layers '
                          'in `model_to_estimator` at this time. Found nested '
                          'layer: %s' % value)
     elif isinstance(
         value,
         (list, tuple)) and name not in ('layers', '_layers', 'metrics',
-                                        '_compile_stateful_metric_functions'):
+                                        '_compile_stateful_metric_functions',
+                                        '_output_loss_metrics'):
       # Handle case: list/tuple of layers (also tracked by the Network API).
       if value and all(isinstance(val, Layer) for val in value):
         raise ValueError('We do not support the use of list-of-layers '
@@ -461,7 +462,7 @@ def clone_and_build_model(
       or functions.
     compile_clone: Boolean, whether to compile model clone (default `True`).
     in_place_reset: Boolean, whether to reset the model in place. Only used if
-      the model is not a graph network. If the model is a subclassed model, then
+      the model is a subclassed model. In the case of a subclassed model,
       this argument must be set to `True` (default `False`). To restore the
       original model, use the function
       `in_place_subclassed_model_state_restoration(model)`.
@@ -504,8 +505,8 @@ def clone_and_build_model(
   else:
     if not in_place_reset:
       raise ValueError(
-          'Model is not a graph network (usually means that it is a subclassed '
-          'model). The model cannot be cloned, but there is a workaround where '
+          'This model is a subclassed model. '
+          'Such a model cannot be cloned, but there is a workaround where '
           'the model is reset in-place. To use this, please set the argument '
           '`in_place_reset` to `True`. This will reset the attributes in the '
           'original model. To restore the attributes, call '
diff --git a/tensorflow/python/keras/ops.py b/tensorflow/python/keras/ops.py
index bc14eef505853723dc494e0f8c6b764bf5d297d0..b2d852054448b4887fe9f9f28ad4f99e12ce7680 100644
--- a/tensorflow/python/keras/ops.py
+++ b/tensorflow/python/keras/ops.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import init_ops_v2
-from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.util.tf_export import keras_export
 
 
@@ -56,22 +55,33 @@ keras_export(v1=["keras.initializers.he_uniform"])(
 
 keras_export("keras.initializers.Initializer", v1=[])(
     init_ops_v2.Initializer)
-keras_export("keras.initializers.Zeros", v1=[])(
-    init_ops_v2.Zeros)
-keras_export("keras.initializers.Ones", v1=[])(
-    init_ops_v2.Ones)
-keras_export("keras.initializers.Constant", v1=[])(
-    init_ops_v2.Constant)
+keras_export(
+    "keras.initializers.Zeros", "keras.initializers.zeros", v1=[])(
+        init_ops_v2.Zeros)
+keras_export(
+    "keras.initializers.Ones", "keras.initializers.ones", v1=[])(
+        init_ops_v2.Ones)
+keras_export(
+    "keras.initializers.Constant", "keras.initializers.constant", v1=[])(
+        init_ops_v2.Constant)
 keras_export("keras.initializers.VarianceScaling", v1=[])(
     init_ops_v2.VarianceScaling)
-keras_export("keras.initializers.Orthogonal", v1=[])(
-    init_ops_v2.Orthogonal)
-keras_export("keras.initializers.Identity", v1=[])(
-    init_ops_v2.Identity)
-keras_export("keras.initializers.GlorotUniform", v1=[])(
-    init_ops_v2.GlorotUniform)
-keras_export("keras.initializers.GlorotNormal", v1=[])(
-    init_ops_v2.GlorotNormal)
+keras_export(
+    "keras.initializers.Orthogonal", "keras.initializers.orthogonal", v1=[])(
+        init_ops_v2.Orthogonal)
+keras_export(
+    "keras.initializers.Identity", "keras.initializers.identity", v1=[])(
+        init_ops_v2.Identity)
+keras_export(
+    "keras.initializers.GlorotUniform",
+    "keras.initializers.glorot_uniform",
+    v1=[])(
+        init_ops_v2.GlorotUniform)
+keras_export(
+    "keras.initializers.GlorotNormal",
+    "keras.initializers.glorot_normal",
+    v1=[])(
+        init_ops_v2.GlorotNormal)
 keras_export("keras.initializers.lecun_normal", v1=[])(
     init_ops_v2.lecun_normal)
 keras_export("keras.initializers.lecun_uniform", v1=[])(
@@ -90,6 +100,3 @@ keras_export("keras.initializers.TruncatedNormal", v1=[])(
 
 
 keras_export("keras.backend.name_scope")(ops.name_scope)
-
-keras_export("keras.losses.Reduction", v1=[])(
-    losses_impl.ReductionV2)
diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py
index 4fa7c7361543d5389872ebd70cb0df261325a9c4..965ae8669bbca53ec5bb7e6666a7a7ba7fba1575 100644
--- a/tensorflow/python/keras/optimizer_v2/adam.py
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
@@ -64,7 +64,7 @@ class Adam(optimizer_v2.OptimizerV2):
       $$t := 0 \text{(Initialize timestep)}$$
 
       The update rule for `variable` with gradient `g` uses an optimization
-      described at the end of section2 of the paper:
+      described at the end of section 2 of the paper:
 
       $$t := t + 1$$
       $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
@@ -82,7 +82,7 @@ class Adam(optimizer_v2.OptimizerV2):
       $$t := 0 \text{(Initialize timestep)}$$
 
       The update rule for `variable` with gradient `g` uses an optimization
-      described at the end of section2 of the paper:
+      described at the end of section 2 of the paper:
 
       $$t := t + 1$$
       $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
index bf6dcaad12c51cc549ba5940ed8c74fde68c6255..26369e976462806c39e3f6a3f3c2f7e15c6b10f2 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
@@ -25,7 +25,6 @@ import functools
 
 import six
 
-from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.distribute import values as distributed_values
@@ -44,7 +43,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
@@ -71,7 +70,7 @@ def _deduplicate_indexed_slices(values, indices):
 
 @six.add_metaclass(abc.ABCMeta)
 @keras_export("keras.optimizers.Optimizer")
-class OptimizerV2(checkpointable.Checkpointable):
+class OptimizerV2(trackable.Trackable):
   """Updated base class for optimizers.
 
   This class defines the API to add Ops to train a model.  You never use this
@@ -126,7 +125,26 @@ class OptimizerV2(checkpointable.Checkpointable):
   opt.apply_gradients(capped_grads_and_vars)
   ```
 
+  ### Use with `tf.distribute.Strategy`.
+
+  This optimizer class is `tf.distribute.Strategy` aware, which means it
+  automatically sums gradients across all replicas. To average gradients,
+  you divide your loss by the global batch size, which is done automatically
+  if you use a member of `tf.keras.losses` or `tf.losses`. See the
+  `reduction` argument of your loss which should be set to
+  `tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE` for averaging or
+  `tf.keras.losses.Reduction.SUM` for not.
+
+  If you are not using these and you want to average gradients, you should use
+  `tf.math.reduce_sum` to add up your per-example losses and then divide by the
+  global batch size. Note that when using `tf.distribute.Strategy`, the first
+  component of a tensor's shape is the *replica-local* batch size, which is off
+  by a factor equal to the number of replicas being used to compute a single
+  step. As a result, using `tf.math.reduce_mean` will give the wrong answer,
+  resulting in gradients that can be many times too big.
+
   ### Variable Constraint
+
   All Keras optimizers respect variable constraints. If constraint function is
   passed to any variable, the constraint will be applied to the variable after
   the gradient has been applied to the variable.
@@ -174,15 +192,13 @@ class OptimizerV2(checkpointable.Checkpointable):
   ```
 
   ### Write a customized optimizer.
+  If you intend to create your own optimization algorithm, simply inherit from
+  this class and override the following methods:
 
-  This optimizer class updates variables from gradients and is
-  tf.distribute.Strategy aware. If you intend to create your own optimization
-  algorithm, simply inherit from this class and override the following methods:
     - resource_apply_dense (update variable given gradient tensor is dense)
     - resource_apply_sparse (update variable given gradient tensor is sparse)
     - create_slots (if your optimizer algorithm requires additional variables)
     - get_config (serialization of the optimizer, include all hyper parameters)
-
   """
 
   def __init__(self, name, **kwargs):
@@ -228,9 +244,9 @@ class OptimizerV2(checkpointable.Checkpointable):
     self._weights = []
     self._iterations = None
 
-    # For implementing Checkpointable. Stores information about how to restore
+    # For implementing Trackable. Stores information about how to restore
     # slot variables which have not yet been created
-    # (checkpointable._CheckpointPosition objects).
+    # (trackable._CheckpointPosition objects).
     #  {slot_name :
     #      {_var_key(variable_to_train): [checkpoint_position, ... ], ... },
     #   ... }
@@ -310,7 +326,6 @@ class OptimizerV2(checkpointable.Checkpointable):
     with backprop.GradientTape() as tape:
       tape.watch(var_list)
       loss_value = loss()
-      loss_value = self._scale_loss(loss_value)
     grads = tape.gradient(loss_value, var_list, grad_loss)
 
     if hasattr(self, "clipnorm"):
@@ -329,14 +344,6 @@ class OptimizerV2(checkpointable.Checkpointable):
 
     return grads_and_vars
 
-  @staticmethod
-  def _scale_loss(loss_value):
-    if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
-      num_replicas = distribute_ctx.get_strategy().num_replicas_in_sync
-      if num_replicas > 1:
-        loss_value *= (1. / num_replicas)
-    return loss_value
-
   def get_gradients(self, loss, params):
     """Returns gradients of `loss` with respect to `params`.
 
@@ -351,7 +358,6 @@ class OptimizerV2(checkpointable.Checkpointable):
       ValueError: In case any gradient cannot be computed (e.g. if gradient
         function not implemented).
     """
-    loss = self._scale_loss(loss)
     grads = gradients.gradients(loss, params)
     if None in grads:
       raise ValueError("An operation has `None` for gradient. "
@@ -553,7 +559,7 @@ class OptimizerV2(checkpointable.Checkpointable):
         self._weights.append(self._iterations)
     for name, value in self._hyper.items():
       if isinstance(value, ops.Tensor) or callable(value):
-        pass
+        continue
       else:
         self._hyper[name] = self.add_weight(
             name,
@@ -642,7 +648,8 @@ class OptimizerV2(checkpointable.Checkpointable):
     if callable(value):
       return value()
     if isinstance(value, (ops.Tensor, tf_variables.Variable,
-                          distributed_values.TPUMirroredVariable)):
+                          distributed_values.TPUMirroredVariable,
+                          distributed_values.DistributedVariable)):
       return backend.get_value(value)
     return value
 
@@ -822,7 +829,7 @@ class OptimizerV2(checkpointable.Checkpointable):
       return x.value()
 
   # ---------------
-  # For implementing the checkpointable interface
+  # For implementing the trackable interface
   # ---------------
 
   def _restore_slot_variable(self, slot_name, variable, slot_variable):
@@ -853,8 +860,8 @@ class OptimizerV2(checkpointable.Checkpointable):
     slot variable needs to be restored).
 
     Args:
-      slot_variable_position: A `checkpointable._CheckpointPosition` object
-        indicating the slot variable `Checkpointable` object to be restored.
+      slot_variable_position: A `trackable._CheckpointPosition` object
+        indicating the slot variable `Trackable` object to be restored.
       slot_name: The name of this `Optimizer`'s slot to restore into.
       variable: The variable object this slot is being created for.
     """
@@ -872,7 +879,7 @@ class OptimizerV2(checkpointable.Checkpointable):
         # (aside from double initialization), and makes variable creator scopes
         # behave the same way they do when graph building.
         and not ops.get_default_graph()._variable_creator_stack):  # pylint: disable=protected-access
-      initializer = checkpointable.CheckpointInitialValue(
+      initializer = trackable.CheckpointInitialValue(
           checkpoint_position=slot_variable_position)
       slot_variable = self.add_slot(
           var=variable,
diff --git a/tensorflow/python/keras/optimizers.py b/tensorflow/python/keras/optimizers.py
index b704b885cb967997a7a8735b31f08a1537cf4a1c..1fb8f8d2802d5e067a6c8ed79915147e086cf172 100644
--- a/tensorflow/python/keras/optimizers.py
+++ b/tensorflow/python/keras/optimizers.py
@@ -29,6 +29,7 @@ from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2
 from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2
 from tensorflow.python.keras.optimizer_v2 import adam as adam_v2
 from tensorflow.python.keras.optimizer_v2 import adamax as adamax_v2
+from tensorflow.python.keras.optimizer_v2 import ftrl
 from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2
 from tensorflow.python.keras.optimizer_v2 import nadam as nadam_v2
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
@@ -40,7 +41,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util.tf_export import keras_export
 
 
@@ -710,19 +711,19 @@ class Nadam(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-class TFOptimizer(Optimizer, checkpointable.Checkpointable):
+class TFOptimizer(Optimizer, trackable.Trackable):
   """Wrapper class for native TensorFlow optimizers.
   """
 
   def __init__(self, optimizer, iterations=None):  # pylint: disable=super-init-not-called
     self.optimizer = optimizer
-    self._track_checkpointable(optimizer, name='optimizer')
+    self._track_trackable(optimizer, name='optimizer')
     if iterations is None:
       with K.name_scope(self.__class__.__name__):
         self.iterations = K.variable(0, dtype='int64', name='iterations')
     else:
       self.iterations = iterations
-    self._track_checkpointable(self.iterations, name='global_step')
+    self._track_trackable(self.iterations, name='global_step')
 
   def apply_gradients(self, grads):
     self.optimizer.apply_gradients(grads, global_step=self.iterations)
@@ -806,7 +807,8 @@ def deserialize(config, custom_objects=None):
       'adamax': adamax_v2.Adamax,
       'nadam': nadam_v2.Nadam,
       'rmsprop': rmsprop_v2.RMSprop,
-      'sgd': gradient_descent_v2.SGD
+      'sgd': gradient_descent_v2.SGD,
+      'ftrl': ftrl.Ftrl
   }
 
   # Make deserialization case-insensitive for built-in optimizers.
diff --git a/tensorflow/python/keras/optimizers_test.py b/tensorflow/python/keras/optimizers_test.py
index 606e711483137e3e7eb5336029edcb52b3cfe916..e3ae442cedbe3d1eaee1cf759e434cc6475e6a5a 100644
--- a/tensorflow/python/keras/optimizers_test.py
+++ b/tensorflow/python/keras/optimizers_test.py
@@ -53,9 +53,7 @@ class KerasOptimizersTest(test.TestCase):
     y_train = keras.utils.to_categorical(y_train)
     model = _get_model(x_train.shape[1], 20, y_train.shape[1])
     model.compile(
-        loss='categorical_crossentropy',
-        optimizer=optimizer,
-        metrics=['accuracy'])
+        loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])
     np.testing.assert_equal(
         keras.backend.get_value(model.optimizer.iterations), 0)
     history = model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=0)
diff --git a/tensorflow/python/keras/preprocessing/image_test.py b/tensorflow/python/keras/preprocessing/image_test.py
index 4abaadfcd305f493b163ad710d11c977b3d1adac..f7cbb589dc9de63e4426a0a0338a67f78d7f07d3 100644
--- a/tensorflow/python/keras/preprocessing/image_test.py
+++ b/tensorflow/python/keras/preprocessing/image_test.py
@@ -386,6 +386,8 @@ class TestImage(test.TestCase):
     _ = keras.preprocessing.image.random_shift(x, 0.2, 0.2)
     _ = keras.preprocessing.image.random_shear(x, 2.)
     _ = keras.preprocessing.image.random_zoom(x, (0.5, 0.5))
+    _ = keras.preprocessing.image.apply_channel_shift(x, 2, 2)
+    _ = keras.preprocessing.image.apply_affine_transform(x, 2)
     with self.assertRaises(ValueError):
       keras.preprocessing.image.random_zoom(x, (0, 0, 0))
     _ = keras.preprocessing.image.random_channel_shift(x, 2.)
diff --git a/tensorflow/python/keras/regularizers.py b/tensorflow/python/keras/regularizers.py
index b828fa933bb56ca7cd79336bff6918648f4bf32c..2dabe504b0c5fcdb95223ace15a60c33bfa058e2 100644
--- a/tensorflow/python/keras/regularizers.py
+++ b/tensorflow/python/keras/regularizers.py
@@ -54,6 +54,8 @@ class L1L2(Regularizer):
     self.l2 = K.cast_to_floatx(l2)
 
   def __call__(self, x):
+    if not self.l1 and not self.l2:
+      return K.constant(0.)
     regularization = 0.
     if self.l1:
       regularization += math_ops.reduce_sum(self.l1 * math_ops.abs(x))
diff --git a/tensorflow/python/keras/regularizers_test.py b/tensorflow/python/keras/regularizers_test.py
index 3d6b259d87de8b6533d008a839f0df2226d71ed4..fb2439395bd94e781b9c4b7576c9b249ee44286f 100644
--- a/tensorflow/python/keras/regularizers_test.py
+++ b/tensorflow/python/keras/regularizers_test.py
@@ -18,9 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
+
 from tensorflow.python import keras
-from tensorflow.python.keras import testing_utils
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
@@ -28,50 +30,54 @@ DATA_DIM = 5
 NUM_CLASSES = 2
 
 
-def get_data():
-  (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-      train_samples=10,
-      test_samples=10,
-      input_shape=(DATA_DIM,),
-      num_classes=NUM_CLASSES)
-  y_train = keras.utils.to_categorical(y_train, NUM_CLASSES)
-  y_test = keras.utils.to_categorical(y_test, NUM_CLASSES)
-  return (x_train, y_train), (x_test, y_test)
-
-
-def create_model(kernel_regularizer=None, activity_regularizer=None):
-  model = keras.models.Sequential()
-  model.add(keras.layers.Dense(NUM_CLASSES,
-                               kernel_regularizer=kernel_regularizer,
-                               activity_regularizer=activity_regularizer,
-                               input_shape=(DATA_DIM,)))
-  return model
+class KerasRegularizersTest(test.TestCase, parameterized.TestCase):
 
+  def create_model(self, kernel_regularizer=None, activity_regularizer=None):
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(NUM_CLASSES,
+                                 kernel_regularizer=kernel_regularizer,
+                                 activity_regularizer=activity_regularizer,
+                                 input_shape=(DATA_DIM,)))
+    return model
 
-class KerasRegularizersTest(test.TestCase):
+  def get_data(self):
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=10,
+        test_samples=10,
+        input_shape=(DATA_DIM,),
+        num_classes=NUM_CLASSES)
+    y_train = keras.utils.to_categorical(y_train, NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test, NUM_CLASSES)
+    return (x_train, y_train), (x_test, y_test)
 
-  def test_kernel_regularization(self):
+  @parameterized.named_parameters([
+      ('l1', keras.regularizers.l1()),
+      ('l2', keras.regularizers.l2()),
+      ('l1_l2', keras.regularizers.l1_l2()),
+  ])
+  def test_kernel_regularization(self, regularizer):
     with self.cached_session():
-      (x_train, y_train), _ = get_data()
-      for reg in [keras.regularizers.l1(),
-                  keras.regularizers.l2(),
-                  keras.regularizers.l1_l2()]:
-        model = create_model(kernel_regularizer=reg)
-        model.compile(loss='categorical_crossentropy', optimizer='sgd')
-        assert len(model.losses) == 1
-        model.fit(x_train, y_train, batch_size=10,
-                  epochs=1, verbose=0)
+      (x_train, y_train), _ = self.get_data()
+      model = self.create_model(kernel_regularizer=regularizer)
+      model.compile(loss='categorical_crossentropy', optimizer='sgd')
+      assert len(model.losses) == 1
+      model.fit(x_train, y_train, batch_size=10,
+                epochs=1, verbose=0)
 
-  @test_util.run_deprecated_v1
-  def test_activity_regularization(self):
+  @parameterized.named_parameters([
+      ('l1', keras.regularizers.l1()),
+      ('l2', keras.regularizers.l2()),
+      ('l2_zero', keras.regularizers.l2(0.)),
+  ])
+  @test_util.deprecated_graph_mode_only
+  def test_activity_regularization(self, regularizer):
     with self.cached_session():
-      (x_train, y_train), _ = get_data()
-      for reg in [keras.regularizers.l1(), keras.regularizers.l2()]:
-        model = create_model(activity_regularizer=reg)
-        model.compile(loss='categorical_crossentropy', optimizer='sgd')
-        assert len(model.losses) == 1
-        model.fit(x_train, y_train, batch_size=10,
-                  epochs=1, verbose=0)
+      (x_train, y_train), _ = self.get_data()
+      model = self.create_model(activity_regularizer=regularizer)
+      model.compile(loss='categorical_crossentropy', optimizer='sgd')
+      assert len(model.losses) == 1
+      model.fit(x_train, y_train, batch_size=10,
+                epochs=1, verbose=0)
 
   def test_zero_regularization(self):
     inputs = keras.backend.ones(shape=(10, 10))
diff --git a/tensorflow/python/keras/saving/__init__.py b/tensorflow/python/keras/saving/__init__.py
index bb4db681248e8f25672cacd2d80dc65ea43a4113..b32ae4041c6d5c6111de42c5401095607972281f 100644
--- a/tensorflow/python/keras/saving/__init__.py
+++ b/tensorflow/python/keras/saving/__init__.py
@@ -28,7 +28,7 @@ from tensorflow.python.keras.saving.hdf5_format import save_weights_to_hdf5_grou
 from tensorflow.python.keras.saving.model_config import model_from_config
 from tensorflow.python.keras.saving.model_config import model_from_json
 from tensorflow.python.keras.saving.model_config import model_from_yaml
-from tensorflow.python.keras.saving.saved_model import export
+from tensorflow.python.keras.saving.saved_model import export_saved_model
 from tensorflow.python.keras.saving.saved_model import load_from_saved_model
 from tensorflow.python.keras.saving.saving_utils import trace_model_call
 
diff --git a/tensorflow/python/keras/saving/hdf5_format_test.py b/tensorflow/python/keras/saving/hdf5_format_test.py
index c51eecf4a3cbb5ab091e1306a221fc25fcd37b80..534c78da1756d17d11dbac8cbc47d6c708afb605 100644
--- a/tensorflow/python/keras/saving/hdf5_format_test.py
+++ b/tensorflow/python/keras/saving/hdf5_format_test.py
@@ -40,7 +40,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import training as training_module
-from tensorflow.python.training.checkpointable import util as checkpointable
+from tensorflow.python.training.tracking import util as trackable
 
 try:
   import h5py  # pylint:disable=g-import-not-at-top
@@ -994,7 +994,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def test_incompatible_checkpoint(self):
-    save_path = checkpointable.Checkpoint().save(
+    save_path = trackable.Checkpoint().save(
         os.path.join(self.get_temp_dir(), 'ckpt'))
     m = keras.Model()
     with self.assertRaisesRegexp(AssertionError, 'Nothing to load'):
diff --git a/tensorflow/python/keras/saving/saved_model.py b/tensorflow/python/keras/saving/saved_model.py
index fbf0bf68ef7a6abf2130fe9c8f959e15dc11983b..ffaf02be7d992e1a126d9e131efe317acee5cd10 100644
--- a/tensorflow/python/keras/saving/saved_model.py
+++ b/tensorflow/python/keras/saving/saved_model.py
@@ -25,8 +25,10 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.keras.saving import model_from_json
 from tensorflow.python.keras.saving import saving_utils
+from tensorflow.python.keras.utils import mode_keys
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
@@ -35,46 +37,39 @@ from tensorflow.python.saved_model import constants
 from tensorflow.python.saved_model import model_utils
 from tensorflow.python.saved_model import save as save_lib
 from tensorflow.python.saved_model import utils_impl as saved_model_utils
-from tensorflow.python.training import mode_keys
 from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import graph_view
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
 
-@keras_export('keras.experimental.export')
-def export(
-    model, saved_model_path, custom_objects=None, as_text=None,
-    input_signature=None, serving_only=False):
-  """Saves a `tf.keras.Model` into Tensorflow SavedModel format.
-
-  `save_model` generates new files/folders under the `saved_model_path` folder:
-  1) a checkpoint containing the model weights.
-  2) a saved_model.pb file containing the model's MetaGraphs. The prediction
-     graph is always exported. The evaluaton and training graphs are exported
-     if the following conditions are met:
-     - Evaluation: model loss is defined.
-     - Training: model is compiled with an optimizer defined under `tf.train`.
-       This is because `tf.keras.optimizers.Optimizer` instances cannot be
-       saved to checkpoints.
-  3) Model's json configuration, if model.get_config() has been implemented.
-     This file can be used to reload the model using
-     tf.keras.models.model_from_json(). Note that if any custom objects were
-     used, they should be passed to the `custom_object` argument when loading
-     the model.
-
-  Model limitations:
-  - Sequential and functional models can always be saved.
-  - Subclassed models can only be saved when `serving_only=True`. This is due to
-    the current implementation copying the model in order to export the training
-    and evaluation graphs. Because the topology of subclassed models cannot be
-    determined, the subclassed models cannot be cloned. Subclassed models will
-    be entirely exportable in the future.
-
-  Note that each mode is exported in separate graphs, so different modes do not
-  share variables. To use the train graph with evaluation or prediction graphs,
-  create a new checkpoint if variable values have been updated.
+@keras_export('keras.experimental.export_saved_model')
+def export_saved_model(model,
+                       saved_model_path,
+                       custom_objects=None,
+                       as_text=False,
+                       input_signature=None,
+                       serving_only=False):
+  """Exports a `tf.keras.Model` as a Tensorflow SavedModel.
+
+  Note that at this time, subclassed models can only be saved using
+  `serving_only=True`.
+
+  The exported `SavedModel` is a standalone serialization of Tensorflow objects,
+  and is supported by TF language APIs and the Tensorflow Serving system.
+  To load the model, use the function
+  `tf.keras.experimental.load_from_saved_model`.
+
+  The `SavedModel` contains:
+
+  1. a checkpoint containing the model weights.
+  2. a `SavedModel` proto containing the Tensorflow backend graph. Separate
+     graphs are saved for prediction (serving), train, and evaluation. If
+     the model has not been compiled, then only the graph computing predictions
+     will be exported.
+  3. the model's json config. If the model is subclassed, this will only be
+     included if the model's `get_config()` method is overwritten.
 
   Example:
 
@@ -87,62 +82,48 @@ def export(
   model.summary()
 
   # Save the tf.keras model in the SavedModel format.
-  saved_to_path = tf.keras.experimental.export(
-        model, '/tmp/my_simple_tf_keras_saved_model')
+  path = '/tmp/simple_keras_model'
+  tf.keras.experimental.export_saved_model(model, path)
 
   # Load the saved keras model back.
-  model_prime = tf.keras.experimental.load_from_saved_model(saved_to_path)
-  model_prime.summary()
+  new_model = tf.keras.experimental.load_from_saved_model(path)
+  new_model.summary()
   ```
 
   Args:
     model: A `tf.keras.Model` to be saved. If the model is subclassed, the flag
       `serving_only` must be set to True.
     saved_model_path: a string specifying the path to the SavedModel directory.
-      The SavedModel will be saved to a timestamped folder created within this
-      directory.
     custom_objects: Optional dictionary mapping string names to custom classes
       or functions (e.g. custom loss functions).
-    as_text: whether to write the `SavedModel` proto in text format. Currently
-      unavailable in serving-only mode.
+    as_text: bool, `False` by default. Whether to write the `SavedModel` proto
+      in text format. Currently unavailable in serving-only mode.
     input_signature: A possibly nested sequence of `tf.TensorSpec` objects, used
-      to specify the expected model inputs. `input_signature`'s nested structure
-      should match the expected nested structure of the inputs to the model. If
-      this is not set, this function will attempt to infer the input shapes and
-      dtypes from the model. Note that if the model is subclassed, the tensor
-      inputs to the call function should be nested in the first argument (this
-      is a general requirement for using subclassed models with Keras functions
-      .fit(), .predict(), etc.).
-    serving_only: Export only the outputs produced from calling the model in
-      predict mode. The losses, optimizer, and other training configurations are
-      not saved. If the SavedModel will only be used for serving (rather than
-      retraining), or if the model is subclassed, this can be set to True.
-
-  Returns:
-    String path to the SavedModel folder, a subdirectory of `saved_model_path`.
+      to specify the expected model inputs. See `tf.function` for more details.
+    serving_only: bool, `False` by default. When this is true, only the
+      prediction graph is saved.
 
   Raises:
     NotImplementedError: If the model is a subclassed model, and serving_only is
       False.
     ValueError: If the input signature cannot be inferred from the model.
+    AssertionError: If the SavedModel directory already exists and isn't empty.
   """
-  export_dir = model_utils.get_timestamped_export_dir(saved_model_path)
-
   if serving_only:
     save_lib.save(
-        model, export_dir,
+        model,
+        saved_model_path,
         signatures=saving_utils.trace_model_call(model, input_signature))
   else:
-    _save_v1_format(model, export_dir, custom_objects, as_text, input_signature)
+    _save_v1_format(model, saved_model_path, custom_objects, as_text,
+                    input_signature)
 
   try:
-    _export_model_json(model, export_dir)
+    _export_model_json(model, saved_model_path)
   except NotImplementedError:
     logging.warning('Skipped saving model JSON, subclassed model does not have '
                     'get_config() defined.')
 
-  return export_dir
-
 
 def _export_model_json(model, saved_model_path):
   """Saves model configuration as a json string under assets folder."""
@@ -202,8 +183,8 @@ def _save_v1_format(model, path, custom_objects, as_text, input_signature):
 
   has_saved_vars = False
   if model.optimizer:
-    # TODO(kathywu): Verify this works with v2 optimizer.
-    if isinstance(model.optimizer, optimizers.TFOptimizer):
+    if isinstance(model.optimizer, (optimizers.TFOptimizer,
+                                    optimizer_v2.OptimizerV2)):
       _export_mode(mode_keys.ModeKeys.TRAIN, has_saved_vars, **export_args)
       has_saved_vars = True
       _export_mode(mode_keys.ModeKeys.TEST, has_saved_vars, **export_args)
@@ -220,7 +201,8 @@ def _save_v1_format(model, path, custom_objects, as_text, input_signature):
 
 def _get_var_list(model):
   """Returns list of all checkpointed saveable objects in the model."""
-  return checkpointable_utils.named_saveables(model)
+  var_list, _, _ = graph_view.ObjectGraphView(model).serialize_object_graph()
+  return var_list
 
 
 def create_placeholder(spec):
@@ -287,9 +269,8 @@ def _export_mode(
       clone._make_predict_function()
     g.get_collection_ref(ops.GraphKeys.UPDATE_OPS).extend(clone.state_updates)
 
-    clone_var_list = checkpointable_utils.named_saveables(clone)
-
     with session.Session().as_default():
+      clone_var_list = _get_var_list(clone)
       if has_saved_vars:
         # Confirm all variables in the clone have an entry in the checkpoint.
         status = clone.load_weights(checkpoint_path)
@@ -299,10 +280,10 @@ def _export_mode(
         # not counting optimizer objects. Optimizer objects are ignored because
         # if the model has not trained, the slot variables will not have been
         # created yet.
-        # TODO(b/113179535): Replace with checkpointable equivalence.
+        # TODO(b/113179535): Replace with trackable equivalence.
         _assert_same_non_optimizer_objects(model, model_graph, clone, g)
 
-        # TODO(b/113178242): Use value transfer for checkpointable objects.
+        # TODO(b/113178242): Use value transfer for trackable objects.
         clone.load_weights(checkpoint_path)
 
         # Add graph and variables to SavedModel.
@@ -310,13 +291,13 @@ def _export_mode(
         clone.save_weights(checkpoint_path, save_format='tf', overwrite=True)
         builder._has_saved_variables = True
 
-    # Add graph to the SavedModel builder.
-    builder.add_meta_graph(
-        model_utils.EXPORT_TAG_MAP[mode],
-        signature_def_map=_create_signature_def_map(clone, mode),
-        saver=saver_lib.Saver(clone_var_list),
-        init_op=variables.local_variables_initializer(),
-        train_op=train_op)
+      # Add graph to the SavedModel builder.
+      builder.add_meta_graph(
+          model_utils.EXPORT_TAG_MAP[mode],
+          signature_def_map=_create_signature_def_map(clone, mode),
+          saver=saver_lib.Saver(clone_var_list),
+          init_op=variables.local_variables_initializer(),
+          train_op=train_op)
     return None
 
 
@@ -360,15 +341,15 @@ def _create_signature_def_map(model, mode):
 
 
 def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph):  # pylint: disable=unused-argument
-  """Asserts model and clone contain the same checkpointable objects."""
+  """Asserts model and clone contain the same trackable objects."""
 
   # TODO(fchollet, kathywu): make sure this works in eager mode.
   return True
 
 
 @keras_export('keras.experimental.load_from_saved_model')
-def load_from_saved_model(saved_model_path):
-  """Loads a keras.Model from a SavedModel created by keras export().
+def load_from_saved_model(saved_model_path, custom_objects=None):
+  """Loads a keras Model from a SavedModel created by `export_saved_model()`.
 
   This function reinstantiates model state by:
   1) loading model topology from json (this will eventually come
@@ -386,16 +367,19 @@ def load_from_saved_model(saved_model_path):
   model.summary()
 
   # Save the tf.keras model in the SavedModel format.
-  saved_to_path = tf.keras.experimental.export(
-        model, '/tmp/my_simple_tf_keras_saved_model')
+  path = '/tmp/simple_keras_model'
+  tf.keras.experimental.export_saved_model(model, path)
 
   # Load the saved keras model back.
-  model_prime = tf.keras.experimental.load_from_saved_model(saved_to_path)
-  model_prime.summary()
+  new_model = tf.keras.experimental.load_from_saved_model(path)
+  new_model.summary()
   ```
 
   Args:
     saved_model_path: a string specifying the path to an existing SavedModel.
+    custom_objects: Optional dictionary mapping names
+        (strings) to custom classes or functions to be
+        considered during deserialization.
 
   Returns:
     a keras.Model instance.
@@ -406,7 +390,7 @@ def load_from_saved_model(saved_model_path):
       compat.as_bytes(constants.ASSETS_DIRECTORY),
       compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
   model_json = file_io.read_file_to_string(model_json_filepath)
-  model = model_from_json(model_json)
+  model = model_from_json(model_json, custom_objects=custom_objects)
 
   # restore model weights
   checkpoint_prefix = os.path.join(
diff --git a/tensorflow/python/keras/saving/saved_model_test.py b/tensorflow/python/keras/saving/saved_model_test.py
index 8063b8af4de91f73fcc9a00bb626a88a204b44cc..50ddf1f24c7c3360702ea4d9222b2661886dffff 100644
--- a/tensorflow/python/keras/saving/saved_model_test.py
+++ b/tensorflow/python/keras/saving/saved_model_test.py
@@ -25,6 +25,7 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python import tf2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -32,14 +33,14 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.optimizer_v2 import adadelta
 from tensorflow.python.keras.saving import saved_model as keras_saved_model
+from tensorflow.python.keras.utils import mode_keys
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import loader_impl
 from tensorflow.python.saved_model import model_utils
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.training import mode_keys
 from tensorflow.python.training import training as training_module
 
 
@@ -67,10 +68,10 @@ class TestModelSavingandLoading(test.TestCase):
 
       ref_y = model.predict(x)
 
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.export(model, temp_saved_model)
+      saved_model_dir = self._save_model_dir()
+      keras_saved_model.export_saved_model(model, saved_model_dir)
 
-      loaded_model = keras_saved_model.load_from_saved_model(output_path)
+      loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
       y = loaded_model.predict(x)
       self.assertAllClose(ref_y, y, atol=1e-05)
 
@@ -85,9 +86,9 @@ class TestModelSavingandLoading(test.TestCase):
       x = np.random.random((1, 3))
       ref_y = model.predict(x)
 
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.export(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_from_saved_model(output_path)
+      saved_model_dir = self._save_model_dir()
+      keras_saved_model.export_saved_model(model, saved_model_dir)
+      loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
 
       y = loaded_model.predict(x)
       self.assertAllClose(ref_y, y, atol=1e-05)
@@ -109,9 +110,9 @@ class TestModelSavingandLoading(test.TestCase):
 
       ref_y = model.predict(x)
 
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.export(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_from_saved_model(output_path)
+      saved_model_dir = self._save_model_dir()
+      keras_saved_model.export_saved_model(model, saved_model_dir)
+      loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
 
       y = loaded_model.predict(x)
       self.assertAllClose(ref_y, y, atol=1e-05)
@@ -130,58 +131,56 @@ class TestModelSavingandLoading(test.TestCase):
 
       ref_y = model.predict(x)
 
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.export(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_from_saved_model(output_path)
+      saved_model_dir = self._save_model_dir()
+      keras_saved_model.export_saved_model(model, saved_model_dir)
+      loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
 
       y = loaded_model.predict(x)
       self.assertAllClose(ref_y, y, atol=1e-05)
 
   @test_util.run_in_graph_and_eager_modes
   def test_saving_with_tf_optimizer(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,)))
-      model.add(keras.layers.Dense(3))
-      model.compile(
-          loss='mse',
-          optimizer=training_module.RMSPropOptimizer(0.1),
-          metrics=['acc'])
-
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3))
-      model.train_on_batch(x, y)
-      ref_y = model.predict(x)
-
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.export(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_from_saved_model(output_path)
-      loaded_model.compile(
-          loss='mse',
-          optimizer=training_module.RMSPropOptimizer(0.1),
-          metrics=['acc'])
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
-
-      # test that new updates are the same with both models
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3))
-
-      ref_loss = model.train_on_batch(x, y)
-      loss = loaded_model.train_on_batch(x, y)
-      self.assertAllClose(ref_loss, loss, atol=1e-05)
-
-      ref_y = model.predict(x)
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
-
-      # test saving/loading again
-      temp_saved_model2 = self._save_model_dir('saved_model_2')
-      output_path2 = keras_saved_model.export(
-          loaded_model, temp_saved_model2)
-      loaded_model = keras_saved_model.load_from_saved_model(output_path2)
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, input_shape=(3,)))
+    model.add(keras.layers.Dense(3))
+    model.compile(
+        loss='mse',
+        optimizer=training_module.RMSPropOptimizer(0.1),
+        metrics=['acc'])
+
+    x = np.random.random((1, 3))
+    y = np.random.random((1, 3))
+    model.train_on_batch(x, y)
+    ref_y = model.predict(x)
+
+    saved_model_dir = self._save_model_dir()
+    keras_saved_model.export_saved_model(model, saved_model_dir)
+    loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
+    loaded_model.compile(
+        loss='mse',
+        optimizer=training_module.RMSPropOptimizer(0.1),
+        metrics=['acc'])
+    y = loaded_model.predict(x)
+    self.assertAllClose(ref_y, y, atol=1e-05)
+
+    # test that new updates are the same with both models
+    x = np.random.random((1, 3))
+    y = np.random.random((1, 3))
+
+    ref_loss = model.train_on_batch(x, y)
+    loss = loaded_model.train_on_batch(x, y)
+    self.assertAllClose(ref_loss, loss, atol=1e-05)
+
+    ref_y = model.predict(x)
+    y = loaded_model.predict(x)
+    self.assertAllClose(ref_y, y, atol=1e-05)
+
+    # test saving/loading again
+    saved_model_dir2 = self._save_model_dir('saved_model_2')
+    keras_saved_model.export_saved_model(loaded_model, saved_model_dir2)
+    loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir2)
+    y = loaded_model.predict(x)
+    self.assertAllClose(ref_y, y, atol=1e-05)
 
   def test_saving_subclassed_model_raise_error(self):
     # For now, saving subclassed model should raise an error. It should be
@@ -199,9 +198,9 @@ class TestModelSavingandLoading(test.TestCase):
 
     model = SubclassedModel()
 
-    temp_saved_model = self._save_model_dir()
+    saved_model_dir = self._save_model_dir()
     with self.assertRaises(NotImplementedError):
-      keras_saved_model.export(model, temp_saved_model)
+      keras_saved_model.export_saved_model(model, saved_model_dir)
 
 
 class LayerWithLearningPhase(keras.engine.base_layer.Layer):
@@ -264,10 +263,7 @@ def subclassed_model():
 
 def load_model(sess, path, mode):
   tags = model_utils.EXPORT_TAG_MAP[mode]
-  if mode == mode_keys.ModeKeys.PREDICT:
-    sig_def_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-  else:
-    sig_def_key = mode
+  sig_def_key = model_utils.SIGNATURE_KEY_MAP[mode]
 
   meta_graph_def = loader_impl.load(sess, tags, path)
   inputs = {
@@ -291,61 +287,64 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
       {
           'model_builder': functional_model,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': adadelta.Adadelta,
           'train_before_export': True},
       {
           'model_builder': functional_model,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': training_module.AdadeltaOptimizer,
           'train_before_export': False},
       {
           'model_builder': functional_model,
           'uses_learning_phase': False,
-          'optimizer': None,
+          'optimizer_cls': None,
           'train_before_export': False},
       {
           'model_builder': sequential_model,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': training_module.AdadeltaOptimizer,
           'train_before_export': True},
       {
           'model_builder': sequential_model,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': adadelta.Adadelta,
           'train_before_export': False},
       {
           'model_builder': sequential_model,
           'uses_learning_phase': False,
-          'optimizer': None,
+          'optimizer_cls': None,
           'train_before_export': False},
       {
           'model_builder': sequential_model_without_input_shape,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': training_module.AdadeltaOptimizer,
           'train_before_export': False})
   def testSaveAndLoadSavedModelExport(
-      self, model_builder, uses_learning_phase, optimizer, train_before_export):
-    saved_model_path = self._save_model_dir()
-    with self.session(graph=ops.Graph()):
-      np.random.seed(130)
-      input_arr = np.random.random((1, 3))
-      target_arr = np.random.random((1, 3))
-
-      model = model_builder(uses_learning_phase)
-      if optimizer is not None:
-        model.compile(
-            loss='mse',
-            optimizer=optimizer,
-            metrics=['mae'])
-        if train_before_export:
-          model.train_on_batch(input_arr, target_arr)
-
-        ref_loss, ref_mae = model.evaluate(input_arr, target_arr)
+      self, model_builder, uses_learning_phase, optimizer_cls,
+      train_before_export):
+    optimizer = None if optimizer_cls is None else optimizer_cls()
 
-      ref_predict = model.predict(input_arr)
+    saved_model_dir = self._save_model_dir()
 
-      # Export SavedModel
-      output_path = keras_saved_model.export(model, saved_model_path)
+    np.random.seed(130)
+    input_arr = np.random.random((1, 3))
+    target_arr = np.random.random((1, 3))
+
+    model = model_builder(uses_learning_phase)
+    if optimizer is not None:
+      model.compile(
+          loss='mse',
+          optimizer=optimizer,
+          metrics=['mae'])
+      if train_before_export:
+        model.train_on_batch(input_arr, target_arr)
+
+      ref_loss, ref_mae = model.evaluate(input_arr, target_arr)
+
+    ref_predict = model.predict(input_arr)
+
+    # Export SavedModel
+    keras_saved_model.export_saved_model(model, saved_model_dir)
 
     input_name = model.input_names[0]
     output_name = model.output_names[0]
@@ -353,7 +352,7 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
 
     # Load predict graph, and test predictions
     with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs, _ = load_model(sess, output_path,
+      inputs, outputs, _ = load_model(sess, saved_model_dir,
                                       mode_keys.ModeKeys.PREDICT)
 
       predictions = sess.run(outputs[output_name],
@@ -363,21 +362,25 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
     if optimizer:
       # Load eval graph, and test predictions, loss and metric values
       with session.Session(graph=ops.Graph()) as sess:
-        inputs, outputs, _ = load_model(sess, output_path,
+        inputs, outputs, _ = load_model(sess, saved_model_dir,
                                         mode_keys.ModeKeys.TEST)
 
         # First obtain the loss and predictions, and run the metric update op by
         # feeding in the inputs and targets.
+        metrics_name = 'mae' if tf2.enabled() else 'mean_absolute_error'
+        metrics_update_op_key = 'metrics/' + metrics_name + '/update_op'
+        metrics_value_op_key = 'metrics/' + metrics_name + '/value'
+
         loss, predictions, _ = sess.run(
             (outputs['loss'], outputs['predictions/' + output_name],
-             outputs['metrics/mean_absolute_error/update_op']), {
+             outputs[metrics_update_op_key]), {
                  inputs[input_name]: input_arr,
                  inputs[target_name]: target_arr
              })
 
         # The metric value should be run after the update op, to ensure that it
         # reflects the correct value.
-        metric_value = sess.run(outputs['metrics/mean_absolute_error/value'])
+        metric_value = sess.run(outputs[metrics_value_op_key])
 
         self.assertEqual(int(train_before_export),
                          sess.run(training_module.get_global_step()))
@@ -388,12 +391,12 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
       # Load train graph, and check for the train op, and prediction values
       with session.Session(graph=ops.Graph()) as sess:
         inputs, outputs, meta_graph_def = load_model(
-            sess, output_path, mode_keys.ModeKeys.TRAIN)
+            sess, saved_model_dir, mode_keys.ModeKeys.TRAIN)
         self.assertEqual(int(train_before_export),
                          sess.run(training_module.get_global_step()))
         self.assertIn('loss', outputs)
-        self.assertIn('metrics/mean_absolute_error/update_op', outputs)
-        self.assertIn('metrics/mean_absolute_error/value', outputs)
+        self.assertIn(metrics_update_op_key, outputs)
+        self.assertIn(metrics_value_op_key, outputs)
         self.assertIn('predictions/' + output_name, outputs)
 
         # Train for a step
@@ -414,17 +417,17 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
               atol=1e-05)
 
   def testSaveAndLoadSavedModelWithCustomObject(self):
-    saved_model_path = self._save_model_dir()
+    saved_model_dir = self._save_model_dir()
     with session.Session(graph=ops.Graph()) as sess:
       def relu6(x):
         return keras.backend.relu(x, max_value=6)
       inputs = keras.layers.Input(shape=(1,))
       outputs = keras.layers.Activation(relu6)(inputs)
       model = keras.models.Model(inputs, outputs)
-      output_path = keras_saved_model.export(
-          model, saved_model_path, custom_objects={'relu6': relu6})
+      keras_saved_model.export_saved_model(
+          model, saved_model_dir, custom_objects={'relu6': relu6})
     with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs, _ = load_model(sess, output_path,
+      inputs, outputs, _ = load_model(sess, saved_model_dir,
                                       mode_keys.ModeKeys.PREDICT)
       input_name = model.input_names[0]
       output_name = model.output_names[0]
@@ -490,17 +493,18 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
     model = sequential_model_without_input_shape(True)
     # A Sequential model that hasn't been built should raise an error.
     with self.assertRaisesRegexp(ValueError, 'Please build the model'):
-      keras_saved_model.export(model, '')
+      keras_saved_model.export_saved_model(model, '')
 
-    saved_model_path = self._save_model_dir()
-    output_path = keras_saved_model.export(
-        model, saved_model_path,
-        input_signature=tensor_spec.TensorSpec(shape=(10, 11, 12, 13, 14),
-                                               dtype=dtypes.float32,
-                                               name='spec_input'))
+    saved_model_dir = self._save_model_dir()
+    keras_saved_model.export_saved_model(
+        model,
+        saved_model_dir,
+        input_signature=tensor_spec.TensorSpec(
+            shape=(10, 11, 12, 13, 14), dtype=dtypes.float32,
+            name='spec_input'))
 
     with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs, _ = load_model(sess, output_path,
+      inputs, outputs, _ = load_model(sess, saved_model_dir,
                                       mode_keys.ModeKeys.PREDICT)
       self.assertEqual(5, inputs[next(iter(inputs.keys()))].shape.ndims)
       self.assertEqual(5, outputs[next(iter(outputs.keys()))].shape.ndims)
@@ -517,18 +521,20 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
                                                      dtype=dtypes.float32)]})
   def testServingOnly(self, model_builder, input_signature):
     if context.executing_eagerly():
-      saved_model_path = self._save_model_dir()
+      saved_model_dir = self._save_model_dir()
       input_arr = np.random.random((5, 3)).astype(np.float32)
       model = model_builder()
       ref_predict = model.predict(input_arr)
 
-      output_path = keras_saved_model.export(
-          model, saved_model_path, serving_only=True,
+      keras_saved_model.export_saved_model(
+          model,
+          saved_model_dir,
+          serving_only=True,
           input_signature=input_signature)
 
       # Load predict graph, and test predictions
       with session.Session(graph=ops.Graph()) as sess:
-        inputs, outputs, _ = load_model(sess, output_path,
+        inputs, outputs, _ = load_model(sess, saved_model_dir,
                                         mode_keys.ModeKeys.PREDICT)
         predictions = sess.run(outputs[next(iter(outputs.keys()))],
                                {inputs[next(iter(inputs.keys()))]: input_arr})
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index eff0f39b6d006c60198a607e796e7619b968eaf3..fdc01d1dcd842025c3152e8884c7201d497f9576 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -160,7 +160,7 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
     weights = model.get_weights()
     recovered_model.set_weights(weights)
     output = recovered_model.predict(input_data)
-    np.testing.assert_allclose(output, actual_output, rtol=1e-3)
+    np.testing.assert_allclose(output, actual_output, rtol=2e-3)
 
   # test training mode (e.g. useful for dropout tests)
   # Rebuild the model to avoid the graph being reused between predict() and
@@ -209,7 +209,7 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
     weights = model.get_weights()
     recovered_model.set_weights(weights)
     output = recovered_model.predict(input_data)
-    np.testing.assert_allclose(output, actual_output, rtol=1e-3)
+    np.testing.assert_allclose(output, actual_output, rtol=2e-3)
 
   # for further checks in the caller function
   return actual_output
diff --git a/tensorflow/python/keras/utils/composite_tensor_support_test.py b/tensorflow/python/keras/utils/composite_tensor_support_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..67e7711ebd61079b625890b14520ef18b724ddbb
--- /dev/null
+++ b/tensorflow/python/keras/utils/composite_tensor_support_test.py
@@ -0,0 +1,133 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras composite tensor support."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.layers import Layer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.platform import test
+
+
+# Define test-only Layer classes to validate passing Sparse and Ragged tensors
+# between layers.
+class ToDense(Layer):
+  """Create a dense (standard) tensor from the given input tensor."""
+
+  def __init__(self, default_value, **kwargs):
+    super(ToDense, self).__init__(**kwargs)
+    self._default_value = default_value
+
+  def call(self, inputs):
+    if isinstance(inputs, ragged_tensor.RaggedTensor):
+      return inputs.to_tensor(default_value=self._default_value)
+    elif isinstance(inputs, sparse_tensor.SparseTensor):
+      return sparse_ops.sparse_tensor_to_dense(
+          inputs, default_value=self._default_value)
+    elif isinstance(inputs, ops.Tensor):
+      return inputs
+    else:
+      raise TypeError("Unexpected tensor type %s" % type(inputs).__name__)
+
+
+class ToRagged(Layer):
+  """Create a ragged tensor based on a given dense tensor."""
+
+  def __init__(self, padding, ragged_rank=1, **kwargs):
+    super(ToRagged, self).__init__(**kwargs)
+    self._padding = padding
+    self._ragged_rank = ragged_rank
+
+  def call(self, inputs):
+    return ragged_tensor.RaggedTensor.from_tensor(
+        inputs, padding=self._padding, ragged_rank=self._ragged_rank)
+
+
+class ToSparse(Layer):
+  """Create a sparse tensor based on a given dense tensor."""
+
+  def call(self, inputs):
+    indices = array_ops.where(math_ops.not_equal(inputs, 0))
+    values = array_ops.gather_nd(inputs, indices)
+    shape = array_ops.shape(inputs, out_type=dtypes.int64)
+    return sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class InternalCompositeTest(keras_parameterized.TestCase):
+
+  def test_model_with_internal_ragged_tensors(self):
+    # Create a model that accepts an input, converts it to Ragged, and
+    # converts the ragged tensor back to a dense tensor.
+    layers = [ToRagged(padding=0), ToDense(default_value=-1)]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(None,))
+
+    # Define some training data with additional padding.
+    input_data = np.array([[1, 0, 0], [2, 3, 0]])
+    expected_output = np.array([[1, -1], [2, 3]])
+    output = model.predict(input_data)
+    self.assertAllEqual(expected_output, output)
+
+  def test_model_with_internal_sparse_tensors(self):
+    # Create a model that accepts an input, converts it to Sparse, and
+    # converts the sparse tensor back to a dense tensor.
+    layers = [ToSparse(), ToDense(default_value=-1)]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(None,))
+
+    # Define some training data with additional padding.
+    input_data = np.array([[1, 0, 0], [2, 3, 0]])
+    expected_output = np.array([[1, -1, -1], [2, 3, -1]])
+    output = model.predict(input_data)
+    self.assertAllEqual(expected_output, output)
+
+  def test_training_model_with_internal_ragged_tensors(self):
+
+    # Create a model that implements y=Mx. This is easy to learn and will
+    # demonstrate appropriate gradient passing. (We have to use RaggedTensors
+    # for this test, as ToSparse() doesn't support gradient propagation through
+    # the layer.) TODO(b/124796939): Investigate this.
+    layers = [core.Dense(2), ToRagged(padding=0), ToDense(default_value=-1)]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(1,))
+
+    input_data = np.random.rand(1024, 1)
+    expected_data = np.concatenate((input_data * 3, input_data * .5), axis=-1)
+
+    model.compile(
+        loss="mse",
+        optimizer="adam",
+        run_eagerly=testing_utils.should_run_eagerly())
+    history = model.fit(input_data, expected_data, epochs=10, verbose=0)
+
+    # If the model trained, the loss stored at history[0] should be different
+    # than the one stored at history[-1].
+    self.assertNotEqual(history.history["loss"][-1], history.history["loss"][0])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/utils/losses_utils.py b/tensorflow/python/keras/utils/losses_utils.py
index d42b354fb140bc592ee1127c3789069365371bc4..4b37c741d1c53febed28252c4bb12b77c8c75722 100644
--- a/tensorflow/python/keras/utils/losses_utils.py
+++ b/tensorflow/python/keras/utils/losses_utils.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
@@ -26,7 +27,34 @@ from tensorflow.python.ops import confusion_matrix
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import weights_broadcast_ops
-from tensorflow.python.ops.losses import losses_impl
+from tensorflow.python.util.tf_export import keras_export
+
+
+@keras_export('keras.losses.Reduction', v1=[])
+class ReductionV2(object):
+  """Types of loss reduction.
+
+  Contains the following values:
+
+  * `NONE`: Un-reduced weighted losses with the same shape as input.
+  * `SUM`: Scalar sum of weighted losses.
+  * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
+     Note that when using `tf.distribute.Strategy`, this is the global batch
+     size across all the replicas that are contributing to a single step.
+  """
+
+  NONE = 'none'
+  SUM = 'sum'
+  SUM_OVER_BATCH_SIZE = 'sum_over_batch_size'
+
+  @classmethod
+  def all(cls):
+    return (cls.NONE, cls.SUM, cls.SUM_OVER_BATCH_SIZE)
+
+  @classmethod
+  def validate(cls, key):
+    if key not in cls.all():
+      raise ValueError('Invalid Reduction Key %s.' % key)
 
 
 def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
@@ -140,21 +168,23 @@ def _num_elements(losses):
     return math_ops.cast(array_ops.size(losses, name=scope), dtype=losses.dtype)
 
 
-def _reduce_weighted_loss(
-    weighted_losses, reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE):
+def reduce_weighted_loss(weighted_losses,
+                         reduction=ReductionV2.SUM_OVER_BATCH_SIZE):
   """Reduces the individual weighted loss measurements."""
-  if reduction == losses_impl.ReductionV2.NONE:
+  if reduction == ReductionV2.NONE:
     loss = weighted_losses
   else:
     loss = math_ops.reduce_sum(weighted_losses)
-    if reduction == losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE:
-      loss = _safe_mean(loss, _num_elements(weighted_losses))
+    if reduction == ReductionV2.SUM_OVER_BATCH_SIZE:
+      num_replicas = (  # Used to convert from local to global batch size.
+          distribution_strategy_context.get_strategy().num_replicas_in_sync)
+      loss = _safe_mean(loss, num_replicas * _num_elements(weighted_losses))
   return loss
 
 
 def compute_weighted_loss(losses,
                           sample_weight=None,
-                          reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+                          reduction=ReductionV2.SUM_OVER_BATCH_SIZE,
                           name=None):
   """Computes the weighted loss.
 
@@ -162,8 +192,8 @@ def compute_weighted_loss(losses,
     losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
     sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
       `losses`, or be broadcastable to `losses`.
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
 
   Raises:
@@ -173,15 +203,10 @@ def compute_weighted_loss(losses,
     Weighted loss `Tensor` of the same type as `losses`. If `reduction` is
     `NONE`, this has the same shape as `losses`; otherwise, it is scalar.
   """
-  losses_impl.ReductionV2.validate(reduction)
+  ReductionV2.validate(reduction)
   if sample_weight is None:
     sample_weight = 1.0
   with ops.name_scope(name, 'weighted_loss', (losses, sample_weight)):
-    # Save the `reduction` argument for loss normalization when distributing
-    # to multiple replicas.
-    # TODO(josh11b): Associate it with the returned op for more precision.
-    ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access
-
     # Update dimensions of `sample_weight` to match with `losses` if possible.
     losses, _, sample_weight = squeeze_or_expand_dimensions(
         losses, None, sample_weight)
@@ -203,7 +228,16 @@ def compute_weighted_loss(losses,
     sample_weight.get_shape().assert_is_compatible_with(losses.get_shape())
     weighted_losses = math_ops.multiply(losses, sample_weight)
     # Apply reduction function to the individual weighted losses.
-    loss = _reduce_weighted_loss(weighted_losses, reduction)
+    loss = reduce_weighted_loss(weighted_losses, reduction)
     # Convert the result back to the input type.
     loss = math_ops.cast(loss, input_dtype)
     return loss
+
+
+def scale_loss_for_distribution(loss_value):
+  """Scales and returns the given loss value by the number of replicas."""
+  num_replicas = (
+      distribution_strategy_context.get_strategy().num_replicas_in_sync)
+  if num_replicas > 1:
+    loss_value *= (1. / num_replicas)
+  return loss_value
diff --git a/tensorflow/python/training/mode_keys.py b/tensorflow/python/keras/utils/mode_keys.py
similarity index 70%
rename from tensorflow/python/training/mode_keys.py
rename to tensorflow/python/keras/utils/mode_keys.py
index ef64554bd5783e7e3ac802708099424ff8244fd8..fb6fc3eef7e8967b8a87707569770f7ec1495022 100644
--- a/tensorflow/python/training/mode_keys.py
+++ b/tensorflow/python/keras/utils/mode_keys.py
@@ -12,22 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Model modeKeys for TensorFlow and Estimator."""
+"""Keras model mode constants."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
-class ModeKeys(object):
-  """Standard names for model modes.
-
-  The following standard keys are defined:
-
-  * `TRAIN`: training/fitting mode.
-  * `TEST`: testing/evaluation mode.
-  * `PREDICT`: prediction/inference mode.
-  """
-
-  TRAIN = 'train'
-  TEST = 'test'
-  PREDICT = 'predict'
+# pylint: disable=unused-import
+from tensorflow.python.saved_model.model_utils.mode_keys import KerasModeKeys as ModeKeys
+# pylint: enable=unused-import
diff --git a/tensorflow/python/keras/utils/tf_utils.py b/tensorflow/python/keras/utils/tf_utils.py
index 387fbf576749a5adc1f022639a59feb940ab923d..1c1d30ba4b5c12b36d4e439f3f85037c3f954f16 100644
--- a/tensorflow/python/keras/utils/tf_utils.py
+++ b/tensorflow/python/keras/utils/tf_utils.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 import six
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond as smart_module
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
@@ -313,7 +313,9 @@ def is_symbolic_tensor(tensor):
   """
   if isinstance(tensor, variables.Variable):
     return not context.executing_eagerly()
-  if isinstance(tensor, (ops.Tensor, sparse_tensor.SparseTensor)):
+  if isinstance(tensor, composite_tensor.CompositeTensor):
+    return tensor._is_graph_tensor  # pylint: disable=protected-access
+  if isinstance(tensor, ops.Tensor):
     return hasattr(tensor, 'graph')
   if isinstance(tensor, tuple(_user_convertible_tensor_types)):
     return hasattr(ops.convert_to_tensor(tensor), 'graph')
diff --git a/tensorflow/python/keras/utils/tf_utils_test.py b/tensorflow/python/keras/utils/tf_utils_test.py
index 9c478af4ecbbe1bb976c982e596f82ac56e2045d..902ecf91670d52ff6839f42d345944b9be009f85 100644
--- a/tensorflow/python/keras/utils/tf_utils_test.py
+++ b/tensorflow/python/keras/utils/tf_utils_test.py
@@ -89,6 +89,10 @@ class TestIsSymbolicTensor(test.TestCase):
         self._input = input_
         self.value = ops.convert_to_tensor(42.)
 
+      @property
+      def dtype(self):
+        return self.value.dtype
+
     ops.register_tensor_conversion_function(
         Foo, lambda x, *args, **kwargs: x.value)
     tf_utils.register_symbolic_tensor_type(Foo)
@@ -128,6 +132,16 @@ class TestIsSymbolicTensor(test.TestCase):
     # `Tensor`.
     y = model(ops.convert_to_tensor(7.))
     self.assertIsInstance(y, Foo)
+    # Confirm that (custom) loss sees `Foo` instance, not Tensor.
+    obtained_prediction_box = [None]
+    def custom_loss(y_obs, y_pred):
+      del y_obs
+      obtained_prediction_box[0] = y_pred
+      return y_pred
+    # Apparently `compile` calls the loss function enough to trigger the
+    # side-effect.
+    model.compile('SGD', loss=custom_loss)
+    self.assertIsInstance(obtained_prediction_box[0], Foo)
 
 
 class ConvertInnerNodeDataTest(test.TestCase):
diff --git a/tensorflow/python/keras/utils/vis_utils.py b/tensorflow/python/keras/utils/vis_utils.py
index 9394cc114e9e9b8366acc8de37a90b972bbd96ed..d396851a629b80114496b4a978768d73730aecd2 100644
--- a/tensorflow/python/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/utils/vis_utils.py
@@ -145,6 +145,10 @@ def plot_model(model,
           a string specifying the format of the plot:
           'TB' creates a vertical plot;
           'LR' creates a horizontal plot.
+
+  Returns:
+      A Jupyter notebook Image object if Jupyter is installed.
+      This enables in-line display of the model plots in notebooks.
   """
   dot = model_to_dot(model, show_shapes, show_layer_names, rankdir)
   _, extension = os.path.splitext(to_file)
@@ -152,4 +156,13 @@ def plot_model(model,
     extension = 'png'
   else:
     extension = extension[1:]
+  # Save image to disk.
   dot.write(to_file, format=extension)
+  # Return the image as a Jupyter Image object, to be displayed in-line.
+  # Note that we cannot easily detect whether the code is running in a
+  # notebook, and thus we always return the Image if Jupyter is available.
+  try:
+    from IPython import display
+    return display.Image(filename=to_file)
+  except ImportError:
+    pass
diff --git a/tensorflow/python/keras/wrappers/scikit_learn.py b/tensorflow/python/keras/wrappers/scikit_learn.py
index b5fe7669ab89f5c2cb3e92be60673fd7029d871a..149ad06f57c23990777a854836c7c8beb352799f 100644
--- a/tensorflow/python/keras/wrappers/scikit_learn.py
+++ b/tensorflow/python/keras/wrappers/scikit_learn.py
@@ -303,7 +303,7 @@ class KerasClassifier(BaseWrapper):
     if not isinstance(outputs, list):
       outputs = [outputs]
     for name, output in zip(self.model.metrics_names, outputs):
-      if name == 'acc':
+      if name in ['accuracy', 'acc']:
         return output
     raise ValueError('The model is not configured to compute accuracy. '
                      'You should pass `metrics=["accuracy"]` to '
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 60126bb9fa4788864224926120d59e39421aad5e..51599dcf1bd9754b61805b11ab7870c48ce1362a 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -131,6 +131,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     grpc_enabled = True,
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -1091,12 +1092,16 @@ tf_py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lib",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:summary_ops_v2",
+        "//tensorflow/python:tensor_spec",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python:variables",
         "//tensorflow/python/eager:function",
         "//tensorflow/python/eager:context",
     ],
@@ -1149,6 +1154,19 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "tridiagonal_solve_op_test",
+    size = "medium",
+    srcs = ["tridiagonal_solve_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:linalg_ops",
+    ],
+    shard_count = 5,
+)
+
 tf_py_test(
     name = "unicode_script_op_test",
     size = "small",
@@ -1675,6 +1693,21 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:nn_ops",
     ],
+    tags = ["no_rocm"],
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_test(
+    name = "conv1d_transpose_test",
+    size = "small",
+    srcs = ["conv1d_transpose_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:nn_ops",
+    ],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -1837,7 +1870,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "functional_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["functional_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2172,7 +2205,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
-    shard_count = 4,
+    shard_count = 6,
     tags = ["no_windows_gpu"],
     xla_enable_strict_auto_jit = True,
 )
@@ -2188,12 +2221,12 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
+    shard_count = 3,
     tags = [
         "manual",
         "no_gpu",
         "nogpu",
         "noguitar",
-        "notap",
     ],
     xla_enable_strict_auto_jit = True,
 )
@@ -2578,6 +2611,7 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     flaky = 1,  # create_local_cluster sometimes times out.
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -2737,6 +2771,7 @@ cuda_py_test(
     ],
     shard_count = 2,
     tags = [
+        "no_rocm",
         "optonly",  # flaky timeouts unless optimized
     ],
     xla_enable_strict_auto_jit = True,
@@ -2778,6 +2813,7 @@ cuda_py_test(
     ],
     shard_count = 4,
     tags = [
+        "no_rocm",
         "optonly",  # times out
     ],
     xla_enable_strict_auto_jit = True,
@@ -2841,6 +2877,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -2860,6 +2897,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops_gen",
     ],
     shard_count = 4,
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -2893,6 +2931,34 @@ cuda_py_test(
     xla_enable_strict_auto_jit = True,
 )
 
+cuda_py_test(
+    name = "rnn_cell_test",
+    size = "medium",
+    srcs = ["rnn_cell_test.py"],
+    additional_deps = [
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+    shard_count = 10,
+    xla_enable_strict_auto_jit = True,
+)
+
 cuda_py_test(
     name = "scatter_ops_test",
     size = "medium",  # NOTE: This is not run by default.
@@ -3043,6 +3109,7 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:data_flow_ops",
     ],
+    tags = ["no_oss"],  # b/124474135
     xla_enable_strict_auto_jit = True,
 )
 
@@ -3091,6 +3158,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
     ],
     shard_count = 30,
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -3229,8 +3297,11 @@ cuda_py_test(
     ],
     data = ["//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files"],
     shard_count = 20,
-    tags = ["no_windows"],
-    xla_enable_strict_auto_jit = True,
+    tags = [
+        "no_rocm",  # flaky test
+        "no_windows",
+    ],
+    # TODO(kuny): Add xla_enable_strict_auto_jit = True after b/124377352 is fixed.
 )
 
 cuda_py_test(
@@ -3578,3 +3649,24 @@ cuda_py_test(
     grpc_enabled = True,
     xla_enable_strict_auto_jit = True,
 )
+
+cuda_py_test(
+    name = "critical_section_test",
+    size = "medium",
+    srcs = ["critical_section_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:prefetching_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:critical_section_ops",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
+    ],
+)
diff --git a/tensorflow/python/kernel_tests/benchmark_test.py b/tensorflow/python/kernel_tests/benchmark_test.py
index a91f96cf952252bf162e9e708d7b8e5808aad38c..3fa2054847db635a96caedf4d596020ec2137003 100644
--- a/tensorflow/python/kernel_tests/benchmark_test.py
+++ b/tensorflow/python/kernel_tests/benchmark_test.py
@@ -126,7 +126,7 @@ class BenchmarkTest(test.TestCase):
     self.assertFalse(_ran_somebenchmark_2[0])
     self.assertFalse(_ran_somebenchmark_but_shouldnt[0])
 
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123744455")  # GPU memory is incorrect
   def testReportingBenchmark(self):
     tempdir = test.get_temp_dir()
     try:
diff --git a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
index 6b04e8abf40dc6fc396581e82b59bc6c4dec2a41..e74193049b1df732e5d986340c16329a207cf2fe 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
@@ -896,12 +896,37 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
 
+  @test_util.run_deprecated_v1
+  def testPredictionOnEmptyEnsembleMultiClass(self):
+    """Tests that prediction on empty ensemble does not fail for multiclass."""
+    with self.cached_session() as session:
+      # Create an empty ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto='')
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      feature_0_values = [36, 32]
+      feature_1_values = [11, 27]
+      logits_dimension = 2
+      expected_logits = [[0.0, 0.0], [0.0, 0.0]]
+
+      # Prediction should work fine.
+      predict_op = boosted_trees_ops.predict(
+          tree_ensemble_handle,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=logits_dimension)
+
+      logits = session.run(predict_op)
+      self.assertAllClose(expected_logits, logits)
+
   @test_util.run_deprecated_v1
   def testPredictionMultipleTree(self):
     """Tests the predictions work when we have multiple trees."""
     with self.cached_session() as session:
       tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -1007,6 +1032,158 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
 
+  @test_util.run_deprecated_v1
+  def testPredictionMultipleTreeMultiClass(self):
+    """Tests the predictions work when we have multiple trees."""
+    with self.cached_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge(
+          """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 28
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 0.51
+              }
+              vector: {
+                value: 1.14
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 1.29
+              }
+              vector: {
+                value: 8.79
+              }
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 26
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 50
+              left_id: 3
+              right_id: 4
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: -4.33
+              }
+              vector: {
+                value: 7.0
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 0.2
+              }
+              vector: {
+                value: 5.0
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: -4.1
+              }
+              vector: {
+                value: 6.0
+              }
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 34
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 2.0
+              }
+              vector: {
+                value: -7.0
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 6.3
+              }
+              vector: {
+                value: 5.0
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_weights: 0.2
+        tree_weights: 1.0
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      feature_0_values = [36, 32]
+      feature_1_values = [11, 27]
+
+      # Example 1: tree 0: (0.51, 1.14), tree 1: (0.2, 5.0), tree 2: (6.3, 5.0)
+      #
+      #            logits = (0.1*0.51+0.2*0.2+1*6.3,
+      #                      0.1*1.14+0.2*5.0+1*5)
+      # Example 2: tree 0: (0.51, 1.14), tree 1: (-4.33, 7.0), tree 2: (2.0, -7)
+      #
+      #            logits = (0.1*0.51+0.2*-4.33+1*2.0,
+      #                      0.1*1.14+0.2*7.0+1*-7)
+      logits_dimension = 2
+      expected_logits = [[6.391, 6.114], [1.185, -5.486]]
+
+      # Prediction should work fine.
+      predict_op = boosted_trees_ops.predict(
+          tree_ensemble_handle,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=logits_dimension)
+
+      logits = session.run(predict_op)
+      self.assertAllClose(expected_logits, logits)
+
   @test_util.run_deprecated_v1
   def testCategoricalSplits(self):
     """Tests the predictions work for categorical splits."""
diff --git a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
index b9eb2391b490f659bd20e26a2c5b290ab4bfea1b..3c5433cb8990539d28bac70df2e8d589ffd9bb7a 100644
--- a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
+++ b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
@@ -66,6 +66,36 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
           v_np = np.broadcast_to(x, output_shape)
           self.assertAllEqual(v_tf.eval(), v_np)
 
+  @test_util.run_deprecated_v1
+  def testBroadcastToShapeInnerDim(self):
+    input_shape = [2, 1, 3]
+    output_shape = [2, 5, 3]
+    with self.cached_session(use_gpu=True):
+      x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), output_shape)
+      v_np = np.broadcast_to(x, output_shape)
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  @test_util.run_deprecated_v1
+  def testBroadcastToShapeLargerDim(self):
+    input_shape = [2, 1, 3, 2, 2, 2]
+    output_shape = [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 15, 3, 2, 2, 2]
+    with self.cached_session(use_gpu=True):
+      x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), output_shape)
+      v_np = np.broadcast_to(x, output_shape)
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  @test_util.run_deprecated_v1
+  def testBroadcastToShapeLargerDim2(self):
+    input_shape = [2, 1, 3, 2, 2, 2, 1, 1, 1]
+    output_shape = [1, 1, 1, 2, 5, 3, 2, 2, 2, 3, 3, 3]
+    with self.cached_session(use_gpu=True):
+      x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), output_shape)
+      v_np = np.broadcast_to(x, output_shape)
+      self.assertAllEqual(v_tf.eval(), v_np)
+
   @test_util.run_deprecated_v1
   def testBroadcastToScalar(self):
     with self.session(use_gpu=True):
@@ -78,8 +108,9 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
   def testBroadcastScalarToNonScalar(self):
     with self.session(use_gpu=True):
       x = np.array(1.0, dtype=np.float)
-      v_tf = array_ops.broadcast_to(constant_op.constant(1.0), [2, 3, 4])
-      v_np = np.broadcast_to(x, [2, 3, 4])
+      v_tf = array_ops.broadcast_to(constant_op.constant(1.0), [2, 3, 4,
+                                                                1, 1, 1])
+      v_np = np.broadcast_to(x, [2, 3, 4, 1, 1, 1])
       self.assertAllEqual(v_tf.eval(), v_np)
 
   @test_util.run_deprecated_v1
@@ -130,14 +161,26 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testGradientWithBroadcastAllDimensions(self):
-    x = constant_op.constant([[1, 2, 3], [4, 5, 6]], dtype=dtypes.float32)
-    v = array_ops.broadcast_to(x, [5, 4, 6])
+    x = constant_op.constant([1], dtype=dtypes.float32)
+    v = array_ops.broadcast_to(x, [5, 2, 3])
     out = 2 * v
     with self.cached_session():
       err = gradient_checker.compute_gradient_error(x, x.get_shape(),
                                                     out, out.get_shape())
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
+  def testGradientWithLargeDim(self):
+    input_shape = [2, 1, 3, 2, 2, 2, 1, 1, 1]
+    output_shape = [1, 1, 1, 2, 5, 3, 2, 2, 2, 3, 3, 3]
+    x = constant_op.constant(np.array(np.random.randn(*input_shape),
+                                      dtype=np.float32))
+    v = array_ops.broadcast_to(x, output_shape)
+    out = 2 * v
+    with self.cached_session():
+      err = gradient_checker.compute_gradient_error(x, x.get_shape(),
+                                                    out, out.get_shape())
+    self.assertLess(err, 1e-4)
 
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index d5f3696a9dc8a86e8a6fb75a4c59f9accf279ba9..7d00919cc8a9927c3e8d05b1c92aa89c8fb54ad9 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -889,8 +889,8 @@ class EnsureShapeTest(test.TestCase):
 
   # Dynamic shape check
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA"
-                        )  # Dynamic shapes not supported now with XLA
+  @test_util.disable_xla(
+      "b/123337890")  # Dynamic shapes not supported now with XLA
   def testEnsuresDynamicShape_RaisesError(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = math_ops.divide(placeholder, 3, name="MyDivide")
@@ -904,8 +904,8 @@ class EnsureShapeTest(test.TestCase):
         sess.run(derived, feed_dict={placeholder: feed_val})
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA"
-                        )  # Dynamic shapes not supported now with XLA
+  @test_util.disable_xla(
+      "b/123337890")  # Dynamic shapes not supported now with XLA
   def testEnsuresDynamicShape_RaisesErrorDimUnknown(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = placeholder / 3
diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py
index abb71a672c13dd62eda24f0b0e31c7625ea6727a..2305c0b568ee6220dab8dd9be8b7bda339b9f082 100644
--- a/tensorflow/python/kernel_tests/cholesky_op_test.py
+++ b/tensorflow/python/kernel_tests/cholesky_op_test.py
@@ -163,7 +163,9 @@ class CholeskyOpTest(test.TestCase):
     with self.assertRaises(ValueError):
       linalg_ops.cholesky(tensor3)
 
-  @test_util.disable_xla("This test never passed for XLA")  # all nan on XLA
+  # The below invalid Cholesky call returns an error with TF Classic and just
+  # returns NaNs with XLA.
+  @test_util.disable_xla("b/123337890")
   def testNotInvertibleCPU(self):
     # The input should be invertible.
     with self.session(use_gpu=True):
diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py
index a968b061270ae00ddcb056f73cad3b215e413d1d..7e37785344391364b2e5d8ea54170e68659335dc 100644
--- a/tensorflow/python/kernel_tests/concat_op_test.py
+++ b/tensorflow/python/kernel_tests/concat_op_test.py
@@ -33,7 +33,6 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
-@test_util.disable_all_xla("This test never passed for XLA")
 class ConcatOpTest(test.TestCase):
 
   @test_util.run_deprecated_v1
@@ -642,7 +641,6 @@ class ConcatOpTest(test.TestCase):
         self.assertAllEqual([[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], output)
 
 
-@test_util.disable_all_xla("This test never passed for XLA")
 class ConcatOffsetTest(test.TestCase):
 
   def testBasic(self):
@@ -686,8 +684,7 @@ class ConcatOffsetTest(test.TestCase):
       self.evaluate(off)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla(
-      "This test never passed for XLA")  # Different error message on XLA
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testSizeMismatch(self):
     cdim = constant_op.constant(1, dtypes.int32)
     s0 = constant_op.constant([2, 3, 5], dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 81a530a263458e2e2d1af70e92d3bfb2378e2d00..982ead7e94557d41b9c4415c3fc604b829ad7390 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -654,6 +654,27 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
       self.assertAllEqual([11, 12], self.evaluate(r))
 
+  @test_util.run_gpu_only
+  @test_util.run_deprecated_v1
+  def testCond_Device(self):
+    x = constant_op.constant(-10)
+
+    # True branch function defined outside of device scope
+    def true_fn():
+      return math_ops.exp(x)
+
+    with ops.device("CPU:0"):
+      r = control_flow_ops.cond(
+          constant_op.constant(True), true_fn, lambda: 0.)
+      self.assertIn("cpu", r.device.lower())
+
+    with session.Session() as sess:
+      options = config_pb2.RunOptions(output_partition_graphs=True)
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(r, options=options, run_metadata=run_metadata)
+      # We expect that everything runs on CPU, even if GPU is available.
+      self.assertEqual(len(run_metadata.partition_graphs), 1)
+
   def testCondListOutput(self):
     with self.cached_session() as sess:
       x = constant_op.constant(10)
@@ -1017,6 +1038,31 @@ class ControlFlowTest(test.TestCase):
     r = control_flow_ops.cond(foo()[1], lambda: 1.0, lambda: 2.0)
     self.assertEqual(self.evaluate(r), 1.0)
 
+  @test_util.run_v1_only("Tests Session.run() pruning logic.")
+  def testCondFeedConstantPredicate(self):
+    with self.cached_session() as sess:
+      value = constant_op.constant(37.0)
+      predicate = constant_op.constant(True)
+      cond_output = control_flow_ops.cond(
+          predicate, lambda: constant_op.constant(0.0), lambda: value)
+      result = array_ops.identity(cond_output)
+      self.assertEqual(37.0, sess.run(result, feed_dict={predicate: False}))
+      self.assertEqual(0.0, sess.run(result, feed_dict={predicate: True}))
+      self.assertEqual(0.0, sess.run(result))
+
+  @test_util.run_v1_only("Tests Session.run() pruning logic.")
+  def testCondFeedPlaceholderWithDefaultPredicate(self):
+    with self.cached_session() as sess:
+      value = constant_op.constant(37.0)
+      predicate = array_ops.placeholder_with_default(
+          constant_op.constant(True), [])
+      cond_output = control_flow_ops.cond(
+          predicate, lambda: constant_op.constant(0.0), lambda: value)
+      result = array_ops.identity(cond_output)
+      self.assertAllEqual(37.0, sess.run(result, feed_dict={predicate: False}))
+      self.assertAllEqual(0.0, sess.run(result, feed_dict={predicate: True}))
+      self.assertAllEqual(0.0, sess.run(result))
+
   @test_util.run_in_graph_and_eager_modes
   def testCondAutoControlDeps(self):
 
@@ -1503,6 +1549,26 @@ class ControlFlowTest(test.TestCase):
       result = r[2]
     self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result)
 
+  @test_util.run_gpu_only
+  @test_util.run_deprecated_v1
+  def testWhile_Device(self):
+
+    # Body function defined outside of device scope
+    def body(x):
+      return math_ops.exp(x)
+
+    with ops.device("CPU:0"):
+      r = control_flow_ops.while_loop(
+          lambda x: x < 10, body, [constant_op.constant(-10.)])
+      self.assertIn("cpu", r.device.lower())
+
+    with session.Session() as sess:
+      options = config_pb2.RunOptions(output_partition_graphs=True)
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(r, options=options, run_metadata=run_metadata)
+      # We expect that everything runs on CPU, even if GPU is available.
+      self.assertEqual(len(run_metadata.partition_graphs), 1)
+
   @test_util.disable_control_flow_v2("b/116338794 (buffer_reuse)")
   @test_util.run_v1_only("b/120545219")
   def testBufferForwarding(self):
@@ -1744,6 +1810,8 @@ class ControlFlowTest(test.TestCase):
 
   @test_util.disable_control_flow_v2("b/116328420 (RaggedTensor)")
   def testWhileShapeInferenceRaggedTensor(self):
+    if context.executing_eagerly():
+      self.skipTest("b/116328420")
     i = constant_op.constant(0)
     x = ragged_factory_ops.constant([[1, 2], [3], [4, 5, 6]])
     c = lambda i, _: i < 10
@@ -1787,6 +1855,8 @@ class ControlFlowTest(test.TestCase):
 
   @test_util.disable_control_flow_v2("b/116328420 (RaggedTensor)")
   def testWhileShapeInferenceRaggedTensorRaggedRank2(self):
+    if context.executing_eagerly():
+      self.skipTest("b/116328420")
     i = constant_op.constant(0)
     x = ragged_factory_ops.constant([[[1, 2], [3], [4, 5, 6]],
                                      [[], [8, 9, 10]]])
@@ -2769,8 +2839,6 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(i_val, 3)
       self.assertAllClose(x_val, 1.0)
 
-  @test_util.disable_xla("This test never passed for XLA"
-                        )  # Resource variable issue for ControlFlowV2
   @test_util.run_gpu_only
   def testGpuResourceAccess(self):
     with ops.device(test.gpu_device_name()):
diff --git a/tensorflow/python/kernel_tests/conv1d_test.py b/tensorflow/python/kernel_tests/conv1d_test.py
index e8463323df90bd37d927f88bd41b09bef45de541..4b44bb6c913533b3025692b0eb06d7e2b77bfb9e 100644
--- a/tensorflow/python/kernel_tests/conv1d_test.py
+++ b/tensorflow/python/kernel_tests/conv1d_test.py
@@ -68,7 +68,7 @@ class Conv1DTest(test.TestCase):
       f = constant_op.constant(
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv1d_transpose(
-          x, f, y_shape, stride=stride, padding="VALID")
+          x, f, y_shape, strides=stride, padding="VALID")
       value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
diff --git a/tensorflow/python/kernel_tests/conv1d_transpose_test.py b/tensorflow/python/kernel_tests/conv1d_transpose_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..02ac5af7aae80277d7a93ef0585c1ccb41286bae
--- /dev/null
+++ b/tensorflow/python/kernel_tests/conv1d_transpose_test.py
@@ -0,0 +1,260 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for convolution related functionality in tensorflow.ops.nn."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import nn_ops
+import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+
+
+class Conv1DTransposeTest(test.TestCase):
+
+  def testConv1DTransposeSingleStride(self):
+    with self.cached_session():
+      strides = [1, 1, 1]
+
+      # Input, output: [batch, width, depth]
+      x_shape = [2, 6, 3]
+      y_shape = [2, 6, 2]
+
+      # Filter: [kernel_width, output_depth, input_depth]
+      f_shape = [3, 2, 3]
+
+      x = constant_op.constant(
+          1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+      f = constant_op.constant(
+          1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+      output = nn_ops.conv1d_transpose(
+          x, f, y_shape, strides=strides, padding="SAME")
+      value = self.evaluate(output)
+
+      for n in xrange(y_shape[0]):
+        for w in xrange(y_shape[1]):
+          for c in xrange(y_shape[2]):
+            target = 2 * 3.0
+            w_in = w > 0 and w < y_shape[1] - 1
+            if w_in:
+              target += 3.0
+            self.assertAllClose(target, value[n, w, c])
+
+  def testConv1DTransposeSame(self):
+    with self.cached_session():
+      strides = [1, 2, 1]
+
+      # Input, output: [batch, width, depth]
+      x_shape = [2, 4, 3]
+      y_shape = [2, 8, 2]
+
+      # Filter: [kernel_width, output_depth, input_depth]
+      f_shape = [3, 2, 3]
+
+      x = constant_op.constant(
+          1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+      f = constant_op.constant(
+          1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+      output = nn_ops.conv1d_transpose(
+          x, f, y_shape, strides=strides, padding="SAME")
+      value = self.evaluate(output)
+
+      for n in xrange(x_shape[0]):
+        for k in xrange(f_shape[1]):
+          for w in xrange(y_shape[1]):
+            target = 3.0
+            # We add a case for locations divisible by the stride.
+            w_in = w % strides[1] == 0 and w > 0 and w < y_shape[1] - 1
+            if w_in:
+              target += 3.0
+            self.assertAllClose(target, value[n, w, k])
+
+  def testConv1DTransposeValid(self):
+    with self.cached_session():
+      strides = [1, 2, 1]
+
+      # Input, output: [batch, width, depth]
+      x_shape = [2, 4, 3]
+      y_shape = [2, 9, 2]
+
+      # Filter: [kernel_width, output_depth, input_depth]
+      f_shape = [3, 2, 3]
+
+      x = constant_op.constant(
+          1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+      f = constant_op.constant(
+          1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+      output = nn_ops.conv1d_transpose(
+          x, f, y_shape, strides=strides, padding="VALID")
+      value = self.evaluate(output)
+
+      cache_values = np.zeros(y_shape, dtype=np.float32)
+
+      # The amount of padding added
+      pad = 1
+
+      for n in xrange(x_shape[0]):
+        for k in xrange(f_shape[1]):
+          for w in xrange(pad, y_shape[1] - pad):
+            target = 3.0
+            # We add a case for locations divisible by the stride.
+            w_in = w % strides[1] == 0 and w > pad and w < y_shape[1] - 1 - pad
+            if w_in:
+              target += 3.0
+            cache_values[n, w, k] = target
+
+          # copy values in the border
+          cache_values[n, 0, k] = cache_values[n, 1, k]
+          cache_values[n, -1, k] = cache_values[n, -2, k]
+          cache_values[n, :, k] = cache_values[n, :, k]
+
+    self.assertAllClose(cache_values, value)
+
+  @test_util.run_deprecated_v1
+  def testGradient(self):
+    x_shape = [2, 4, 3]
+    f_shape = [3, 2, 3]
+    y_shape = [2, 8, 2]
+    strides = [1, 2, 1]
+    np.random.seed(1)  # Make it reproducible.
+    x_val = np.random.random_sample(x_shape).astype(np.float64)
+    f_val = np.random.random_sample(f_shape).astype(np.float64)
+    with self.cached_session():
+      x = constant_op.constant(x_val, name="x", dtype=dtypes.float32)
+      f = constant_op.constant(f_val, name="f", dtype=dtypes.float32)
+      output = nn_ops.conv1d_transpose(
+          x, f, y_shape, strides=strides, padding="SAME")
+      err = gradient_checker.compute_gradient_error([x, f], [x_shape, f_shape],
+                                                    output, y_shape)
+    print("conv1d_transpose gradient err = %g " % err)
+    err_tolerance = 0.0005
+    self.assertLess(err, err_tolerance)
+
+  def testConv1DTransposeSingleStrideNCW(self):
+    # `NCW` data format is only supported for CUDA device.
+    if test.is_gpu_available(cuda_only=True):
+      with self.session(use_gpu=True):
+        strides = [1, 1, 1]
+
+        # Input, output: [batch, depth, width]
+        x_shape = [2, 3, 4]
+        y_shape = [2, 2, 4]
+
+        # Filter: [kernel_width, output_depth, input_depth]
+        f_shape = [3, 2, 3]
+
+        x = constant_op.constant(
+            1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+        f = constant_op.constant(
+            1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+
+        output = nn_ops.conv1d_transpose(
+            x, f, y_shape, strides=strides, padding="SAME", data_format="NCW")
+
+        value = self.evaluate(output)
+        for n in xrange(x_shape[0]):
+          for k in xrange(f_shape[1]):
+            for w in xrange(y_shape[2]):
+              target = 2 * 3.0
+              w_in = w > 0 and w < y_shape[2] - 1
+              if w_in:
+                target += 3.0
+              self.assertAllClose(target, value[n, k, w])
+
+  def testConv1DTransposeSameNCW(self):
+    # `NCW` data format is only supported for CUDA device.
+    if test.is_gpu_available(cuda_only=True):
+      with self.session(use_gpu=True):
+        strides = [1, 1, 2]
+
+        # Input, output: [batch, depth, width]
+        x_shape = [2, 3, 4]
+        y_shape = [2, 2, 8]
+
+        # Filter: [kernel_width, output_depth, input_depth]
+        f_shape = [3, 2, 3]
+
+        x = constant_op.constant(
+            1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+        f = constant_op.constant(
+            1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+
+        output = nn_ops.conv1d_transpose(
+            x, f, y_shape, strides=strides, padding="SAME", data_format="NCW")
+
+        value = self.evaluate(output)
+        for n in xrange(x_shape[0]):
+          for k in xrange(f_shape[1]):
+            for w in xrange(y_shape[2]):
+              target = 3.0
+              # We add a case for locations divisible by the stride.
+              w_in = w % strides[2] == 0 and w > 0 and w < y_shape[2] - 1
+              if w_in:
+                target += 3.0
+              self.assertAllClose(target, value[n, k, w])
+
+  def testConv1DTransposeValidNCW(self):
+    # `NCW` data format is only supported for CUDA device.
+    if test.is_gpu_available(cuda_only=True):
+      with self.session(use_gpu=True):
+        strides = [1, 1, 2]
+
+        # Input, output: [batch, depth, width]
+        x_shape = [2, 3, 4]
+        y_shape = [2, 2, 9]
+
+        # Filter: [kernel_width, output_depth, input_depth]
+        f_shape = [3, 2, 3]
+
+        x = constant_op.constant(
+            1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+        f = constant_op.constant(
+            1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+        output = nn_ops.conv1d_transpose(
+            x, f, y_shape, strides=strides, padding="VALID", data_format="NCW")
+
+        value = self.evaluate(output)
+        cache_values = np.zeros(y_shape, dtype=np.float32)
+        # The amount of padding added
+        pad = 1
+        for n in xrange(x_shape[0]):
+          for k in xrange(f_shape[1]):
+            for w in xrange(pad, y_shape[2] - pad):
+              target = 3.0
+              # We add a case for locations divisible by the stride.
+              w_in = w % strides[2] == 0 and w > pad and \
+                     w < y_shape[2] - 1 - pad
+              if w_in:
+                target += 3.0
+              cache_values[n, k, w] = target
+
+            # copy values in the border
+            cache_values[n, k, 0] = cache_values[n, k, 1]
+            cache_values[n, k, -1] = cache_values[n, k, -2]
+            cache_values[n, k, :] = cache_values[n, k, :]
+
+        self.assertAllClose(cache_values, value)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index d9b908be1e7d813b6a37a0e4e26fd008e3d20526..732d870c3ae11f368b76a62be4ef68fe525b7d19 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -574,7 +574,6 @@ class Conv2DTest(test.TestCase):
         padding="VALID")
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2D0x0Padding(self):
     self._VerifyExplicitPaddings(
         tensor_in_sizes=[1, 2, 3, 3],
@@ -589,7 +588,6 @@ class Conv2DTest(test.TestCase):
         padding=[[0, 0], [0, 0]])
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2D1x1Padding(self):
     self._VerifyExplicitPaddings(
         tensor_in_sizes=[1, 2, 3, 2],
@@ -604,7 +602,6 @@ class Conv2DTest(test.TestCase):
         padding=[[1, 1], [1, 1]])
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2D2x2Padding(self):
     self._VerifyExplicitPaddings(
         tensor_in_sizes=[1, 2, 1, 2],
@@ -619,7 +616,6 @@ class Conv2DTest(test.TestCase):
         padding=[[2, 2], [2, 2]])
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2DOnlyBottomPadding(self):
     self._VerifyExplicitPaddings(
         tensor_in_sizes=[1, 2, 3, 3],
@@ -634,7 +630,6 @@ class Conv2DTest(test.TestCase):
         padding=[[0, 3], [0, 0]])
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2DOnlyTopRightPadding(self):
     self._VerifyExplicitPaddings(
         tensor_in_sizes=[1, 2, 3, 3],
@@ -650,7 +645,6 @@ class Conv2DTest(test.TestCase):
         padding=[[1, 0], [0, 2]])
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2DLotsPadding(self):
     self._VerifyExplicitPaddings(
         tensor_in_sizes=[1, 1, 1, 3],
@@ -665,7 +659,6 @@ class Conv2DTest(test.TestCase):
         padding=[[3, 4], [4, 2]])
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2DExplicitPaddingWithDilations(self):
     self._VerifyExplicitPaddings(
         tensor_in_sizes=[1, 3, 2, 1],
@@ -681,7 +674,6 @@ class Conv2DTest(test.TestCase):
         padding=[[2, 1], [1, 2]],
         dilations=[2, 3])
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2DExplicitPaddingWithLayoutOptimizer(self):
     # Test with Grappler's layout optimizer, to ensure the layout optimizer
     # handles explicit padding correctly.
@@ -1349,7 +1341,6 @@ class Conv2DTest(test.TestCase):
         dilations=dilations)
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2D2x2Depth1Padding0x0BackpropInput(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -1372,7 +1363,6 @@ class Conv2DTest(test.TestCase):
             data_format=data_format)
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2D2x2Depth1Padding1x1BackpropInput(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -1405,7 +1395,6 @@ class Conv2DTest(test.TestCase):
             dilations=[2, 2])
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2D2x2Depth1Padding2x2BackpropInput(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -1430,7 +1419,6 @@ class Conv2DTest(test.TestCase):
             dilations=[2, 3])
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2D2x2Depth1Padding_1_8_4_1_BackpropInput(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -1453,7 +1441,6 @@ class Conv2DTest(test.TestCase):
             data_format=data_format)
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2D2x2Depth1Padding_5_0_2_2_BackpropInput(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -1512,7 +1499,6 @@ class Conv2DTest(test.TestCase):
         err=err)
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2D2x2Depth1Padding0x0BackpropFilter(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -1535,7 +1521,6 @@ class Conv2DTest(test.TestCase):
             data_format=data_format)
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2D2x2Depth1Padding1x1BackpropFilter(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -1569,7 +1554,6 @@ class Conv2DTest(test.TestCase):
             dilations=[2, 2])
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2D2x2Depth1Padding2x2BackpropFilter(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -1594,7 +1578,6 @@ class Conv2DTest(test.TestCase):
             dilations=[2, 3])
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2D2x2Depth1Padding_1_8_4_1_BackpropFilter(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -1618,7 +1601,6 @@ class Conv2DTest(test.TestCase):
             data_format=data_format)
 
   @test_util.run_in_graph_and_eager_modes()
-  @test_util.disable_xla("This test never passed for XLA")
   def testConv2D2x2Depth1Padding_5_0_2_2_BackpropFilter(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -1976,7 +1958,6 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testInputGradient1x1PaddingStrideOne(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -1998,7 +1979,6 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             max_err=0.0025)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testFilterGradient1x1PaddingStrideOne(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2019,7 +1999,6 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testInputGradient1x1PaddingStrideTwo(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2040,7 +2019,6 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testFilterGradient1x1PaddingStrideTwo(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2061,7 +2039,6 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testInputGradient2x2PaddingStrideOne(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2082,7 +2059,6 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testFilterGradient2x2PaddingStrideOne(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2104,7 +2080,6 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             max_err=0.003)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testInputGradient1_2_3_4PaddingStride3x2(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2125,7 +2100,6 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testFilterGradient1_2_3_4PaddingStride3x2(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2146,7 +2120,6 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testInputGradient4_3_2_1PaddingStride2x1(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2167,7 +2140,6 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testFilterGradient4_3_2_1PaddingStride2x1(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2188,7 +2160,6 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testInputGradient0_0_0_5PaddingStride1x2(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2209,7 +2180,6 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testFilterGradient0_0_0_5PaddingStride1x2(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2316,7 +2286,7 @@ class Conv2DTest(test.TestCase):
           strides=[1, 1, 1, 1],
           padding=[0, 0, 0, 0])
 
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testOpEdgeCases(self):
     with self.cached_session() as sess:
       # Illegal strides.
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_test.py b/tensorflow/python/kernel_tests/critical_section_test.py
similarity index 84%
rename from tensorflow/contrib/framework/python/ops/critical_section_test.py
rename to tensorflow/python/kernel_tests/critical_section_test.py
index 34fd5018af125335845540dedfdffc984ba02313..7b1519c5e3c77d4676e5084ab06ed49b1a3c42f9 100644
--- a/tensorflow/contrib/framework/python/ops/critical_section_test.py
+++ b/tensorflow/python/kernel_tests/critical_section_test.py
@@ -18,13 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.framework.python.ops import critical_section_ops
+from tensorflow.python.data.experimental.ops import prefetching_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import critical_section_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
@@ -47,7 +49,7 @@ class CriticalSectionTest(test.TestCase):
           return array_ops.identity(c)
 
     num_concurrent = 100
-    r = [cs.execute(fn, 1.0, 2.0) for _ in range(num_concurrent)]
+    r = [cs.execute(lambda: fn(1.0, 2.0)) for _ in range(num_concurrent)]
     self.evaluate(v.initializer)
     r_value = self.evaluate(r)
     self.assertAllClose([2.0 * i for i in range(num_concurrent)],
@@ -73,7 +75,7 @@ class CriticalSectionTest(test.TestCase):
               array_ops.identity(inner_cond), true_fn, lambda: c)
 
         def execute():
-          return cs.execute(fn, 1.0, 2.0)
+          return cs.execute(lambda: fn(1.0, 2.0))
 
         r = [
             control_flow_ops.cond(array_ops.identity(outer_cond),
@@ -91,6 +93,7 @@ class CriticalSectionTest(test.TestCase):
         else:
           self.assertAllClose([0] * num_concurrent, r_value)
 
+  @test_util.run_v1_only("b/123990562 Sees CancelledError on some calls")
   def testCriticalSectionInParallelDoesntDeadlockOnError(self):
     # No eager mode execution of this test because eager does not
     # run fn() in parallel, which is where the deadlock could
@@ -102,12 +105,23 @@ class CriticalSectionTest(test.TestCase):
       error = control_flow_ops.Assert((i % 2) == 1, ["Error"])
       with ops.control_dependencies([error]):
         return v.read_value()
+
     num_concurrent = 2
-    r = [cs.execute(fn, i) for i in range(num_concurrent)]
+
+    @def_function.function(autograph=False)
+    def run_concurrently():
+      return [cs.execute(lambda: fn(i)) for i in range(num_concurrent)]
+
+    if not context.executing_eagerly():
+      run_concurrently = run_concurrently()
+
     self.evaluate(v.initializer)
     for _ in range(100):
       with self.assertRaisesOpError("Error"):
-        self.evaluate(r)
+        if context.executing_eagerly():
+          run_concurrently()
+        else:
+          self.evaluate(run_concurrently)
 
   @test_util.run_in_graph_and_eager_modes
   def testCreateCriticalSectionFnReturnsOp(self):
@@ -122,17 +136,20 @@ class CriticalSectionTest(test.TestCase):
           return control_flow_ops.no_op()
 
     num_concurrent = 100
-    r = [cs.execute(fn_return_op, 1.0, 2.0) for _ in range(num_concurrent)]
+    r = [cs.execute(lambda: fn_return_op(1.0, 2.0))
+         for _ in range(num_concurrent)]
     self.evaluate(v.initializer)
     self.evaluate(r)
     final_v = self.evaluate(v)
     self.assertAllClose(2.0 * num_concurrent, final_v)
 
+  @test_util.run_v1_only("Collections don't exist in TF2")
   def testCollection(self):
     cs = critical_section_ops.CriticalSection(shared_name="cs")
     self.assertIn(
         cs, ops.get_collection(critical_section_ops.CRITICAL_SECTIONS))
-    execute = cs.execute(lambda x: x + 1, 1.0, name="my_execute")
+    add = lambda x: x + 1
+    execute = cs.execute(lambda: add(1.0), name="my_execute")
     execute_op = [
         x for x in execute.graph.get_operations()
         if "my_execute" in x.name and "MutexLock" in x.type
@@ -142,18 +159,21 @@ class CriticalSectionTest(test.TestCase):
         [signature.op for signature in
          ops.get_collection(critical_section_ops.CRITICAL_SECTION_EXECUTIONS)])
 
+  @test_util.run_v1_only("b/123955885 Can't identify deadlocks in eager mode")
   def testRecursiveCriticalSectionAccessIsIllegal(self):
     # This does not work properly in eager mode.  Eager users will
     # just hit a deadlock if they do this.  But at least it'll be easier
     # to debug.
     cs = critical_section_ops.CriticalSection()
+    add = lambda y: y + 1
     def fn(x):
-      return cs.execute(lambda y: y + 1, x)
+      return cs.execute(lambda: add(x))
+
     with self.assertRaisesRegexp(
         ValueError,
         r"attempts to directly access the CriticalSection in which it "
         r"would be running"):
-      cs.execute(fn, 1.0)
+      cs.execute(lambda: fn(1.0))
 
   def testRecursiveCriticalSectionAccessViaCapturedTensorIsProtected(self):
     # This one is subtle; and we're being overly cautious here.  The
@@ -173,24 +193,24 @@ class CriticalSectionTest(test.TestCase):
     # operations are finished before anything runs within the critical section.
     cs = critical_section_ops.CriticalSection(shared_name="cs")
     fn = array_ops.identity
-    to_capture = cs.execute(fn, 1.0)
+    to_capture = cs.execute(lambda: fn(1.0))
     fn_captures = lambda x: x + to_capture
     to_capture_too = array_ops.identity(to_capture)
 
-    ex_0 = cs.execute(fn_captures, 1.0)
+    ex_0 = cs.execute(lambda: fn_captures(1.0))
 
     with ops.control_dependencies([to_capture]):
       # This is OK because to_capture will execute before this next call
-      ex_1 = cs.execute(fn_captures, 1.0)
+      ex_1 = cs.execute(lambda: fn_captures(1.0))
 
     dependency = array_ops.identity(to_capture)
 
     fn_captures_dependency = lambda x: x + dependency
 
-    ex_2 = cs.execute(fn_captures_dependency, 1.0)
+    ex_2 = cs.execute(lambda: fn_captures_dependency(1.0))
 
     with ops.control_dependencies([to_capture_too]):
-      ex_3 = cs.execute(fn_captures_dependency, 1.0)
+      ex_3 = cs.execute(lambda: fn_captures_dependency(1.0))
 
     # Ensure there's no actual deadlock on to_execute.
     self.assertEquals(2.0, self.evaluate(ex_0))
@@ -216,6 +236,8 @@ class CriticalSectionTest(test.TestCase):
         body_implicit_capture,
         [0, 0],
         parallel_iterations=25)
+    # For consistency between eager and graph mode.
+    i_n = array_ops.identity(i_n)
     logging.warn(
         "\n==============\nRunning "
         "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
@@ -241,6 +263,8 @@ class CriticalSectionTest(test.TestCase):
         body_implicit_capture_protected,
         [0, 0],
         parallel_iterations=25)
+    # For consistency between eager and graph mode.
+    i_n = array_ops.identity(i_n)
     logging.warn(
         "\n==============\nRunning "
         "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
@@ -257,13 +281,15 @@ class CriticalSectionTest(test.TestCase):
       # This version is ok because j is an argument to fn and we can
       # ensure there's a control dependency on j.
       fn = lambda x: x + 1
-      return (i + 1, cs.execute(fn, j))
+      return (i + 1, cs.execute(lambda: fn(j)))
 
     (i_n, j_n) = control_flow_ops.while_loop(
         lambda i, _: i < 1000,
         body_args_capture,
         [0, 0],
         parallel_iterations=25)
+    # For consistency between eager and graph mode.
+    i_n = array_ops.identity(i_n)
     logging.warn(
         "\n==============\nRunning "
         "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
@@ -276,20 +302,23 @@ class CriticalSectionTest(test.TestCase):
         "body_args_capture'\n"
         "==============\n")
 
+  @test_util.run_v1_only("b/123955885 Can't identify deadlocks in eager mode")
   def testRecursiveCriticalSectionAccessIsIllegalSameSharedName(self):
     # This does not work properly in eager mode.  Eager users will
     # just hit a deadlock if they do this.  But at least it'll be easier
     # to debug.
     cs = critical_section_ops.CriticalSection(shared_name="cs")
     cs_same = critical_section_ops.CriticalSection(shared_name="cs")
+    add = lambda x: x + 1
     def fn(x):
-      return cs_same.execute(lambda x: x+1, x)
+      return cs_same.execute(lambda: add(x))
     with self.assertRaisesRegexp(
         ValueError,
         r"attempts to directly access the CriticalSection in which it "
         r"would be running"):
-      cs.execute(fn, 1.0)
+      cs.execute(lambda: fn(1.0))
 
+  @test_util.run_v1_only("b/123955885 Can't identify deadlocks in eager mode")
   def testMultipleCSExecutionsRequestSameResource(self):
     cs0 = critical_section_ops.CriticalSection()
     cs1 = critical_section_ops.CriticalSection()
@@ -327,20 +356,32 @@ class CriticalSectionTest(test.TestCase):
     # Note, here v must be a resource variable (or something similar),
     # otherwise it gets hoisted into the while_loop by the time we add
     # control dependencies to the lock_op.
+    def body(i):
+      add_j = lambda j: v + j + 1
+      return cs.execute(lambda: add_j(i))
     out = control_flow_ops.while_loop(
-        lambda i: i < 10, lambda i: cs.execute(lambda j: v + j + 1, i), [0])
+        lambda i: i < 10, body, [0])
     self.evaluate(v.initializer)
     self.assertEqual(10, self.evaluate(out))
 
   @test_util.run_in_graph_and_eager_modes
   def testInsideFunction(self):
+    if test_util.is_gpu_available():
+      self.skipTest(
+          "b/123899495: Colocation errors for critical sections in map on GPU")
     cs = critical_section_ops.CriticalSection()
-    v = resource_variable_ops.ResourceVariable(1)
+    with ops.device("/gpu:0" if test_util.is_gpu_available() else "/cpu:0"):
+      v = resource_variable_ops.ResourceVariable(1)
     def fn():
       return v.read_value()
 
     # map() creates a TensorFlow function.
-    ds = dataset_ops.Dataset.range(1).map(lambda _: cs.execute(fn))
+    ds = dataset_ops.Dataset.range(1)
+    if test_util.is_gpu_available():
+      ds = (ds.apply(prefetching_ops.copy_to_device("/gpu:0"))
+            .apply(prefetching_ops.map_on_gpu(lambda _: cs.execute(fn))))
+    else:
+      ds = ds.map(lambda _: cs.execute(fn))
 
     def get_first():
       if context.executing_eagerly():
diff --git a/tensorflow/python/kernel_tests/decode_raw_op_test.py b/tensorflow/python/kernel_tests/decode_raw_op_test.py
index 008e59ba3e64915d8642243d335701e8adea19c0..bb8d2cf6a051867a28f984378d0db4779b06c0e0 100644
--- a/tensorflow/python/kernel_tests/decode_raw_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_raw_op_test.py
@@ -89,6 +89,32 @@ class DecodeRawOpTest(test.TestCase):
 
       self.assertAllEqual(expected_result, result)
 
+  @test_util.run_deprecated_v1
+  def testToComplex64(self):
+    with self.cached_session():
+      in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
+      decode = parsing_ops.decode_raw(in_bytes, out_type=dtypes.complex64)
+      self.assertEqual([None, None], decode.get_shape().as_list())
+
+      expected_result = np.matrix([[1 + 1j, 2 - 2j, -3 + 3j, -4 - 4j]],
+                                  dtype="<c8")
+      result = decode.eval(feed_dict={in_bytes: [expected_result.tostring()]})
+
+      self.assertAllEqual(expected_result, result)
+
+  @test_util.run_deprecated_v1
+  def testToComplex128(self):
+    with self.cached_session():
+      in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
+      decode = parsing_ops.decode_raw(in_bytes, out_type=dtypes.complex128)
+      self.assertEqual([None, None], decode.get_shape().as_list())
+
+      expected_result = np.matrix([[1 + 1j, 2 - 2j, -3 + 3j, -4 - 4j]],
+                                  dtype="<c16")
+      result = decode.eval(feed_dict={in_bytes: [expected_result.tostring()]})
+
+      self.assertAllEqual(expected_result, result)
+
   @test_util.run_deprecated_v1
   def testEmptyStringInput(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/depthtospace_op_test.py b/tensorflow/python/kernel_tests/depthtospace_op_test.py
index 97d3645b617947c2ced88ac52207ced98c59c877..b7a865cf13e90be8ad29fe491bd03c230eaa3c27 100644
--- a/tensorflow/python/kernel_tests/depthtospace_op_test.py
+++ b/tensorflow/python/kernel_tests/depthtospace_op_test.py
@@ -295,7 +295,7 @@ class DepthToSpaceTest(test.TestCase):
       actual_vals, expected_vals = self.evaluate([actual, expected])
       self.assertTrue(np.array_equal(actual_vals, expected_vals))
 
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123553551")  # Unsupported data format
   def testAgainstTranspose(self):
     self.compareToTranspose(3, 2, 3, 1, 2, "NHWC", False)
     self.compareToTranspose(3, 2, 3, 2, 2, "NHWC", False)
diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py
index b8139918c597fa455ce9b726d165ec685c959fb3..0bf48fd228fda5640e203f74b4717a2cfffd2ba3 100644
--- a/tensorflow/python/kernel_tests/diag_op_test.py
+++ b/tensorflow/python/kernel_tests/diag_op_test.py
@@ -65,7 +65,7 @@ class MatrixDiagTest(test.TestCase):
       array_ops.matrix_diag(0)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testInvalidShapeAtEval(self):
     with self.session(use_gpu=True):
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
@@ -270,7 +270,7 @@ class MatrixDiagPartTest(test.TestCase):
       array_ops.matrix_diag_part(0)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testInvalidShapeAtEval(self):
     with self.session(use_gpu=True):
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 13f9acbe1bf503547f648b7093c43a9c17e7461e..0ab2b4bdfb52665e7ebc9323d73ab41adac0ba4d 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -32,6 +33,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gen_functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -56,7 +58,6 @@ def simple_scoped_fn(a, x):
 
 
 @test_util.with_control_flow_v2
-@test_util.disable_all_xla("This test never passed for XLA")
 class FunctionalOpsTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -660,8 +661,7 @@ class FunctionalOpsTest(test.TestCase):
     self.assertAllEqual(Run(100., True), 5050.)
 
   @test_util.run_v1_only("b/120545219")
-  @test_util.disable_xla(
-      "This test never passed for XLA")  # Different error message
+  @test_util.disable_xla("b/123337890")  # Different error message
   def testWhileError(self):
     for use_gpu in (True, False):
       with ops.Graph().as_default() as g:
@@ -938,7 +938,6 @@ class FunctionalOpsTest(test.TestCase):
 
 # TODO(akshayka): Replace `function.Defun` with tf.contrib.eager.defun` in the
 # below test cases.
-@test_util.disable_all_xla("This test never passed for XLA")
 class PartitionedCallTest(test.TestCase):
 
   @test_util.run_deprecated_v1
@@ -1113,6 +1112,37 @@ class PartitionedCallTest(test.TestCase):
       self.evaluate(op)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+@test_util.with_control_flow_v2
+class FunctionalOpsCaseTest(test.TestCase):
+
+  def testCase(self):
+    @eager_function.defun
+    def two(x):
+      return x * 2
+
+    @eager_function.defun
+    def three(x):
+      return x * 3
+
+    @eager_function.defun
+    def four(x):
+      return x * 4
+
+    def f(branch, x):
+      tmpl = array_ops.zeros_like(x)
+      return array_ops.identity(gen_functional_ops.case(
+          branch, input=[x], Tout=[dtypes.float32],
+          branches=[f.get_concrete_function(tmpl)
+                    for f in (two, three, four)])[0])
+    one = array_ops.ones([])
+    self.assertAllEqual(np.float32(2), self.evaluate(f(0, one)))
+    self.assertAllEqual(np.float32(3), self.evaluate(f(1, one)))
+    self.assertAllEqual(np.float32(4), self.evaluate(f(2, one)))
+    self.assertAllEqual(np.float32(4), self.evaluate(f(-1, one)))  # <0 default
+    self.assertAllEqual(np.float32(4), self.evaluate(f(6, one)))  # >=N default
+
+
 if __name__ == "__main__":
   test.main()
 
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index 76ae2fcb72f606d95a6d4523f08ecad3514eb974..ad8376b48c8f05809b310a432a12a92786aba989 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -34,7 +34,6 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
-@test_util.disable_all_xla("This test never passed for XLA")
 class GatherNdTest(test.TestCase):
 
   def _testSimpleDtype(self, dtype):
@@ -57,7 +56,7 @@ class GatherNdTest(test.TestCase):
     self._testSimpleDtype("|S")  # byte strings in python2 + 3
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testEmptyIndicesAndParamsOKButJustEmptyParamsFails(self):
     with self.session(use_gpu=True):
       params = np.ones((3, 3), dtype=np.float32)
@@ -360,7 +359,6 @@ class GatherNdTest(test.TestCase):
       self.assertAllEqual(expected_grads, ops.convert_to_tensor(grads).eval())
 
 
-@test_util.disable_all_xla("This test never passed for XLA")
 class GatherNdOpBenchmark(test.Benchmark):
 
   def benchmark_gather_nd_op(self):
diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py
index 1d6f7955aa0cb002df7fc9d344066972820c9664..65043d9f4f0e7624fc42beb0ab299baa67d61f44 100644
--- a/tensorflow/python/kernel_tests/gather_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_op_test.py
@@ -467,12 +467,12 @@ class GatherTest(test.TestCase, parameterized.TestCase):
   def testErrors(self):
 
     with self.assertRaisesRegexp(
-        ValueError, r"batch_dims = 2 must be less than ndims\(indices\) = 2"):
+        ValueError, r"batch_dims = 2 must be less than rank\(indices\) = 2"):
       array_ops.gather(
           params=[[1, 2], [3, 4]], indices=[[1, 2], [3, 4]], batch_dims=2)
 
     with self.assertRaisesRegexp(
-        ValueError, r"batch_dims = 1 must be less than ndims\(params\) = 1"):
+        ValueError, r"batch_dims = 1 must be less than rank\(params\) = 1"):
       array_ops.gather(
           params=[1, 2, 3, 4], indices=[[1, 2], [3, 4]], batch_dims=1)
 
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
index 1bed4b5268e8d27a25ab735f7e3e1a6c9e4d5d95..f70d8c4e1cd557c34f07a90a39b102830d82dd0f 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
@@ -114,5 +114,29 @@ class LinearOperatorAdjointTest(
     self.assertEqual("my_operator_adjoint", operator.name)
 
 
+class LinearOperatorAdjointNonSquareTest(
+    linear_operator_test_util.NonSquareLinearOperatorDerivedClassTest):
+  """Tests done in the base class NonSquareLinearOperatorDerivedClassTest."""
+
+  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+    shape_before_adjoint = list(build_info.shape)
+    # We need to swap the last two dimensions because we are taking the adjoint
+    # of this operator
+    shape_before_adjoint[-1], shape_before_adjoint[-2] = (
+        shape_before_adjoint[-2], shape_before_adjoint[-1])
+    matrix = linear_operator_test_util.random_normal(
+        shape_before_adjoint, dtype=dtype)
+
+    lin_op_matrix = matrix
+
+    if use_placeholder:
+      lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
+
+    operator = LinearOperatorAdjoint(
+        linalg.LinearOperatorFullMatrix(lin_op_matrix))
+
+    return operator, linalg.adjoint(matrix)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
index ec78a3ffe0b2ae1ff5c5f6c4d73480f2ad92fd26..12da8659caca2dcbd8e981dd7124b52737bff970 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
@@ -26,15 +26,59 @@ from tensorflow.python.ops.linalg import matmul_registrations  # pylint: disable
 from tensorflow.python.platform import test
 
 # pylint: disable=protected-access
+_ADJOINTS = linear_operator_algebra._ADJOINTS
+_registered_adjoint = linear_operator_algebra._registered_adjoint
 _CHOLESKY_DECOMPS = linear_operator_algebra._CHOLESKY_DECOMPS
-_MATMUL = linear_operator_algebra._MATMUL
 _registered_cholesky = linear_operator_algebra._registered_cholesky
-_registered_matmul = linear_operator_algebra._registered_matmul
 _INVERSES = linear_operator_algebra._INVERSES
 _registered_inverse = linear_operator_algebra._registered_inverse
+_MATMUL = linear_operator_algebra._MATMUL
+_registered_matmul = linear_operator_algebra._registered_matmul
 # pylint: enable=protected-access
 
 
+class AdjointTest(test.TestCase):
+
+  def testRegistration(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+
+      def _matmul(self, a):
+        pass
+
+      def _shape(self):
+        return tensor_shape.TensorShape([1, 1])
+
+      def _shape_tensor(self):
+        pass
+
+    # Register Adjoint to a lambda that spits out the name parameter
+    @linear_operator_algebra.RegisterAdjoint(CustomLinOp)
+    def _adjoint(a):  # pylint: disable=unused-argument,unused-variable
+      return "OK"
+
+    self.assertEqual("OK", CustomLinOp(dtype=None).adjoint())
+
+  def testRegistrationFailures(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+      pass
+
+    with self.assertRaisesRegexp(TypeError, "must be callable"):
+      linear_operator_algebra.RegisterAdjoint(CustomLinOp)("blah")
+
+    # First registration is OK
+    linear_operator_algebra.RegisterAdjoint(CustomLinOp)(lambda a: None)
+
+    # Second registration fails
+    with self.assertRaisesRegexp(ValueError, "has already been registered"):
+      linear_operator_algebra.RegisterAdjoint(CustomLinOp)(lambda a: None)
+
+  def testExactAdjointRegistrationsAllMatch(self):
+    for (k, v) in _ADJOINTS.items():
+      self.assertEqual(v, _registered_adjoint(k[0]))
+
+
 class CholeskyTest(test.TestCase):
 
   def testRegistration(self):
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
index 96e6e3c04c77e2a32d11d72feea02c177cfa3e61..28f8d20f61515328261771684d2571b80b686c64 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
@@ -136,6 +136,27 @@ class SquareLinearOperatorBlockDiagTest(
     self.assertTrue(operator.is_non_singular)
     self.assertFalse(operator.is_self_adjoint)
 
+  def test_block_diag_adjoint_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = block_diag.LinearOperatorBlockDiag(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_non_singular=True,
+            ),
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_non_singular=True,
+            ),
+        ],
+        is_non_singular=True,
+    )
+    adjoint = operator.adjoint()
+    self.assertIsInstance(
+        adjoint,
+        block_diag.LinearOperatorBlockDiag)
+    self.assertEqual(2, len(adjoint.operators))
+
   def test_block_diag_cholesky_type(self):
     matrix = [[1., 0.], [0., 1.]]
     operator = block_diag.LinearOperatorBlockDiag(
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index 4d7a31be87cf5f51d952704ee585d140c3147a3f..5c3220e60f49e872bbf2b4f2f1bb63a2271f7b1d 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -187,6 +187,11 @@ class LinearOperatorDiagTest(
         linalg_lib.LinearOperatorDiag))
     self.assertAllClose([6., 9.], self.evaluate(operator_matmul.diag))
 
+  def test_diag_adjoint_type(self):
+    diag = [1., 3., 5., 8.]
+    operator = linalg.LinearOperatorDiag(diag, is_non_singular=True)
+    self.assertIsInstance(operator.adjoint(), linalg.LinearOperatorDiag)
+
   def test_diag_cholesky_type(self):
     diag = [1., 3., 5., 8.]
     operator = linalg.LinearOperatorDiag(
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
index ea9ee99a582fee6441207a5d9710571bc5fd6804..55eff59e03e83d12e7019922758ef065fe2e2812 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
@@ -259,6 +259,12 @@ class LinearOperatorIdentityTest(
           is_non_singular=None,
       )
 
+  def test_identity_adjoint_type(self):
+    operator = linalg_lib.LinearOperatorIdentity(
+        num_rows=2, is_non_singular=True)
+    self.assertIsInstance(
+        operator.adjoint(), linalg_lib.LinearOperatorIdentity)
+
   def test_identity_cholesky_type(self):
     operator = linalg_lib.LinearOperatorIdentity(
         num_rows=2,
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
index 54ccc0c5f642ad98c04174d01d9fca0c0fc056d6..166188f6cecac1c472d0855069c61fe0e2937b02 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
@@ -192,6 +192,23 @@ class SquareLinearOperatorKroneckerTest(
     with self.assertRaisesRegexp(ValueError, ">=1 operators"):
       kronecker.LinearOperatorKronecker([])
 
+  def test_kronecker_adjoint_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = kronecker.LinearOperatorKronecker(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix, is_non_singular=True),
+            linalg.LinearOperatorFullMatrix(
+                matrix, is_non_singular=True),
+        ],
+        is_non_singular=True,
+    )
+    adjoint = operator.adjoint()
+    self.assertIsInstance(
+        adjoint,
+        kronecker.LinearOperatorKronecker)
+    self.assertEqual(2, len(adjoint.operators))
+
   def test_kronecker_cholesky_type(self):
     matrix = [[1., 0.], [0., 1.]]
     operator = kronecker.LinearOperatorKronecker(
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index 7d4c045f79caedc12c4d9c981313859c6c70183c..e3501294cab41a5c48e563607f163b424f594113 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -505,16 +505,24 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with self.assertRaisesRegexp(
         errors.InvalidArgumentError,
         "Indices in TensorListScatter must all be non-negative."):
-      l = list_ops.tensor_list_scatter(
-          c0, [-1, -2], ops.convert_to_tensor([], dtype=dtypes.int32))
+      l = list_ops.tensor_list_scatter(c0, [-1, -2], element_shape=[])
       self.evaluate(l)
 
+  def testScatterIntoExistingList(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+    l = list_ops.tensor_list_scatter(tensor=[1.], indices=[0], element_shape=[])
+    l = list_ops.tensor_list_scatter(
+        tensor=[2., 3.], indices=[1, 2], element_shape=[], input_handle=l)
+    self.assertAllEqual(
+        list_ops.tensor_list_stack(l, element_dtype=dtypes.float32),
+        [1., 2., 3.])
+
   def testScatterGrad(self):
     with backprop.GradientTape() as tape:
       c0 = constant_op.constant([1.0, 2.0])
       tape.watch(c0)
-      l = list_ops.tensor_list_scatter(
-          c0, [1, 0], ops.convert_to_tensor([], dtype=dtypes.int32))
+      l = list_ops.tensor_list_scatter(c0, [1, 0], element_shape=[])
       t0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
       t1 = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
       self.assertAllEqual(self.evaluate(t0), 2.0)
@@ -527,8 +535,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with backprop.GradientTape() as tape:
       c0 = constant_op.constant([1.0, 2.0])
       tape.watch(c0)
-      l = list_ops.tensor_list_scatter(
-          c0, [1, 0], ops.convert_to_tensor([], dtype=dtypes.int32))
+      l = list_ops.tensor_list_scatter(c0, [1, 0], element_shape=[])
       t0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
       self.assertAllEqual(self.evaluate(t0), 2.0)
       loss = t0 * t0
diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py
index 652260559ffebd40a8b9a0ece79ad4cb4d0b65d2..d7980db3dbef4d175f96d2f1d7e45b25f61498c7 100644
--- a/tensorflow/python/kernel_tests/lookup_ops_test.py
+++ b/tensorflow/python/kernel_tests/lookup_ops_test.py
@@ -38,7 +38,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import saver
 from tensorflow.python.training import server_lib
-from tensorflow.python.training.checkpointable import util as checkpointable
+from tensorflow.python.training.tracking import util as trackable
 
 
 class HashTableTest(test.TestCase):
@@ -1123,22 +1123,24 @@ class InitializeTableFromFileOpTest(test.TestCase):
 
       # Invalid data type
       other_type = constant_op.constant(1)
-      with self.assertRaises(ValueError):
+      with self.assertRaises(Exception) as cm:
         lookup_ops.HashTable(
             lookup_ops.TextFileInitializer(
                 other_type, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
                 dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
             default_value)
+      self.assertTrue(isinstance(cm.exception, (ValueError, TypeError)))
 
       # Non-scalar filename
       filenames = constant_op.constant([vocabulary_file, vocabulary_file])
       if not context.executing_eagerly():
-        with self.assertRaises(ValueError):
+        with self.assertRaises(Exception) as cm:
           lookup_ops.HashTable(
               lookup_ops.TextFileInitializer(
                   filenames, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
                   dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
               default_value)
+        self.assertTrue(isinstance(cm.exception, (ValueError, TypeError)))
       else:
         with self.assertRaises(errors_impl.InvalidArgumentError):
           lookup_ops.HashTable(
@@ -1689,7 +1691,7 @@ class MutableHashTableOpTest(test.TestCase):
     table = lookup_ops.MutableHashTable(
         dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
 
-    checkpoint = checkpointable.Checkpoint(table=table, v0=v0, v1=v1)
+    checkpoint = trackable.Checkpoint(table=table, v0=v0, v1=v1)
     self.evaluate([v0.initializer, v1.initializer])
 
     # Check that the parameter nodes have been initialized.
@@ -1714,7 +1716,7 @@ class MutableHashTableOpTest(test.TestCase):
             constant_op.constant([12, 24], dtypes.int64)))
     self.assertAllEqual(2, self.evaluate(table.size()))
 
-    checkpoint = checkpointable.Checkpoint(table=table, v0=v0, v1=v1)
+    checkpoint = trackable.Checkpoint(table=table, v0=v0, v1=v1)
 
     # Restore the saved values in the parameter nodes.
     checkpoint.restore(save_path).run_restore_ops()
@@ -1788,9 +1790,9 @@ class MutableHashTableOpTest(test.TestCase):
       exported_keys, exported_values = table.export()
       # exported data is in the order of the internal map, i.e. undefined
       sorted_keys = np.sort(self.evaluate(exported_keys))
-      sorted_values = np.sort(self.evaluate(exported_values))
+      sorted_values = np.sort(self.evaluate(exported_values), axis=0)
       self.assertAllEqual([b"brain", b"salad", b"surgery"], sorted_keys)
-      sorted_expected_values = np.sort([[4, 5], [2, 3], [0, 1]])
+      sorted_expected_values = np.sort([[4, 5], [2, 3], [0, 1]], axis=0)
       self.assertAllEqual(sorted_expected_values, sorted_values)
 
   def testMutableHashTableExportInsert(self):
@@ -2510,7 +2512,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
         checkpoint=True,
         initial_num_buckets=32)
 
-    save_checkpoint = checkpointable.Checkpoint(table=save_table)
+    save_checkpoint = trackable.Checkpoint(table=save_table)
 
     self.assertAllEqual(0, self.evaluate(save_table.size()))
     self.evaluate(save_table.insert(keys, values))
@@ -2536,7 +2538,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
     self.assertAllEqual(2, self.evaluate(load_table.size()))
     self.assertAllEqual(64, len(self.evaluate(load_table.export()[0])))
 
-    restore_checkpoint = checkpointable.Checkpoint(table=load_table)
+    restore_checkpoint = trackable.Checkpoint(table=load_table)
 
     # Restore the saved values in the parameter nodes.
     restore_checkpoint.restore(save_path).run_restore_ops()
diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py
index a31db416a2eceaf03f94abdcd8fc1a5af155350b..d2b1d433c780a520fbb5a0168053f6708e74b95a 100644
--- a/tensorflow/python/kernel_tests/map_fn_test.py
+++ b/tensorflow/python/kernel_tests/map_fn_test.py
@@ -49,7 +49,6 @@ def simple_scoped_fn(a, x):
 
 
 @test_util.with_control_flow_v2
-@test_util.disable_all_xla("This test never passed for XLA")
 class MapFnTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -198,17 +197,17 @@ class MapFnTest(test.TestCase):
     y = map_fn.map_fn(lambda e: e, x)
     self.assertIs(None, y.get_shape().dims)
 
-  @test_util.disable_control_flow_v2("b/119323354")
-  @test_util.run_in_graph_and_eager_modes
+  # TODO(b/124383826): this test fails in eager: the iterable is of length 0 so
+  # so the body of the while loop never executes
   @test_util.run_v1_only("b/120545219")
   def testMapEmptyScalar(self):
-    map_return = map_fn.map_fn(lambda x: 1, constant_op.constant([]))
+    map_return = map_fn.map_fn(lambda x: 1,
+                               constant_op.constant([], dtype=dtypes.int32))
     self.assertAllEqual([0], map_return.get_shape().dims)
     self.assertAllEqual([0], self.evaluate(map_return).shape)
 
-  # TODO(akshayka): this test fails in eager: the iterable is of length 0 so
+  # TODO(b/124383826): this test fails in eager: the iterable is of length 0 so
   # so the body of the while loop never executes
-  @test_util.disable_control_flow_v2("b/119323354")
   @test_util.run_v1_only("b/120545219")
   def testMapEmptyTensor(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/pool_test.py b/tensorflow/python/kernel_tests/pool_test.py
index 367c94dd1e689ade820a96322c786d416bd2b1b1..78e786f01ca9c167b5b175fcd833a83281c078de 100644
--- a/tensorflow/python/kernel_tests/pool_test.py
+++ b/tensorflow/python/kernel_tests/pool_test.py
@@ -303,7 +303,6 @@ class PoolingTest(test.TestCase):
     self.assertLess(err, err_tolerance)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")  # Much larger error
   def testGradient1D(self):
     with self.session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index e6d886cadbd7e5c9902ae12c162298a9fb76a20e..66b6c8769fd16ca71acb6a18ab07605476b523b8 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -730,7 +730,7 @@ class PoolingTest(test.TestCase):
         t = nn_ops.max_pool(
             t, ksize=ksize, strides=strides, padding="SAME").eval()
 
-  @test_util.disable_xla("b/123338077")
+  @test_util.disable_xla("b/123338077")  # Passes with XLA
   def testDepthwiseMaxPoolInvalidConfigs(self):
     self._testDepthwiseMaxPoolInvalidConfig(
         [1, 2, 2, 4], [1, 2, 2, 2], [1, 1, 1, 2],
@@ -1234,7 +1234,6 @@ class PoolingTest(test.TestCase):
                      [1, window_rows, window_cols, 1],
                      [1, row_stride, col_stride, 1], padding)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def _testMaxPoolGradDirect(self, input_data, output_backprop,
                              expected_input_backprop, input_sizes, output_sizes,
                              window_rows, window_cols, row_stride, col_stride,
@@ -1376,6 +1375,7 @@ class PoolingTest(test.TestCase):
             use_gpu=use_gpu,
             v2=v2)
 
+  @test_util.disable_xla("b/123923733")  # NaNs handled differently
   def _testMaxPoolGradDirectWithNans2_1(self):
     input_data = [float("nan")] * 16
     output_backprop = [11.0, 12.0, 13.0, 15.0, 16.0, 17.0, 19.0, 20.0, 21.0]
@@ -1450,6 +1450,7 @@ class PoolingTest(test.TestCase):
     else:
       del os.environ["TF_ENABLE_MAXPOOL_NANPROP"]
 
+  @test_util.disable_xla("b/123923733")  # NaNs handled differently
   def _testMaxPoolGradDirectWithNans2_2(self):
     input_data = [float("nan")] * 16
     output_backprop = [
@@ -1650,7 +1651,6 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testMaxPoolGradGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testMaxPoolGradGradValidPadding1_1(data_format, use_gpu)
@@ -1685,7 +1685,6 @@ class PoolingTest(test.TestCase):
         [1, row_stride, col_stride, 1], padding)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testAvgPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testAvgPoolGradValidPadding1_1(data_format, use_gpu)
@@ -1845,7 +1844,7 @@ class PoolingTest(test.TestCase):
             padding="SAME")
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testOpEdgeCases(self):
     with self.session(use_gpu=test.is_gpu_available()) as sess:
       pool_funcs = [nn_ops.max_pool, nn_ops.avg_pool]
@@ -1924,9 +1923,10 @@ if __name__ == "__main__":
     if name_ == "maxpool5":
       setattr(
           PoolingTest, "testMaxPoolGrad_" + name_,
-          test_util.disable_xla("maxpool5 fails while all others pass")(
-              GetMaxPoolGradTest(input_size_, filter_size_, output_size_,
-                                 stride_, padding_)))
+          test_util.disable_xla(
+              "b/123926014: incorrect output with only constants")(
+                  GetMaxPoolGradTest(input_size_, filter_size_, output_size_,
+                                     stride_, padding_)))
     else:
       setattr(
           PoolingTest, "testMaxPoolGrad_" + name_,
diff --git a/tensorflow/python/kernel_tests/random/random_grad_test.py b/tensorflow/python/kernel_tests/random/random_grad_test.py
index aac6eeac06abca3148947901b92b43058fe76e3c..38fa44f37152bbc1cb720594d171142ec7af9007 100644
--- a/tensorflow/python/kernel_tests/random/random_grad_test.py
+++ b/tensorflow/python/kernel_tests/random/random_grad_test.py
@@ -79,7 +79,7 @@ class RandomGammaGradTest(test.TestCase):
     shape = [2, 3]
     alpha = array_ops.ones([2, 2])
     beta = array_ops.ones([1, 2])
-    sample = random_ops.random_gamma(shape, alpha, beta)
+    sample = random_ops.random_gamma(shape, alpha, beta, seed=12345)
     grads_alpha, grads_beta = gradients_impl.gradients(sample, [alpha, beta])
     self.assertAllEqual(grads_alpha.shape, alpha.shape)
     self.assertAllEqual(grads_beta.shape, beta.shape)
@@ -89,7 +89,7 @@ class RandomGammaGradTest(test.TestCase):
     shape = []
     alpha = array_ops.ones([2, 2])
     beta = array_ops.ones([1, 2])
-    sample = random_ops.random_gamma(shape, alpha, beta)
+    sample = random_ops.random_gamma(shape, alpha, beta, seed=12345)
     grads_alpha, grads_beta = gradients_impl.gradients(sample, [alpha, beta])
     self.assertAllEqual(grads_alpha.shape, alpha.shape)
     self.assertAllEqual(grads_beta.shape, beta.shape)
@@ -99,7 +99,7 @@ class RandomGammaGradTest(test.TestCase):
     shape = array_ops.placeholder(dtypes.int32)
     alpha = array_ops.placeholder(dtypes.float32)
     beta = array_ops.placeholder(dtypes.float32)
-    sample = random_ops.random_gamma(shape, alpha, beta)
+    sample = random_ops.random_gamma(shape, alpha, beta, seed=12345)
     grads_alpha, grads_beta = gradients_impl.gradients(sample, [alpha, beta])
 
     alpha_val = np.ones([1, 2])
@@ -129,7 +129,8 @@ class RandomGammaGradTest(test.TestCase):
 
       alpha_val = np.logspace(-2, 3, dtype=np_dtype)
       alpha = constant_op.constant(alpha_val)
-      sample = random_ops.random_gamma([], alpha, np_dtype(1.0), dtype=dtype)
+      sample = random_ops.random_gamma(
+          [], alpha, np_dtype(1.0), dtype=dtype, seed=12345)
       actual = gradients_impl.gradients(sample, alpha)[0]
 
       (sample_val, actual_val) = self.evaluate((sample, actual))
@@ -175,7 +176,8 @@ class RandomGammaGradTest(test.TestCase):
     """
     np_dtype = dtype.as_numpy_dtype
     alpha = constant_op.constant(np.logspace(-2, 3, dtype=np_dtype))
-    sample = random_ops.random_gamma([], alpha, np_dtype(1.0), dtype=dtype)
+    sample = random_ops.random_gamma(
+        [], alpha, np_dtype(1.0), dtype=dtype, seed=12345)
     actual = gradients_impl.gradients(sample, alpha)[0]
 
     sample_sg = array_ops.stop_gradient(sample)
@@ -207,9 +209,9 @@ class RandomGammaGradTest(test.TestCase):
     Here we verify that the rhs is fairly close to one.
     The convergence speed is not great, so we use many samples and loose bounds.
     """
-    num_samples = 1000
+    num_samples = 10000
     alpha = constant_op.constant([0.8, 1e1, 1e3], dtype=dtypes.float32)
-    sample = random_ops.random_gamma([num_samples], alpha)
+    sample = random_ops.random_gamma([num_samples], alpha, seed=12345)
     # We need to average the gradients, which is equivalent to averaging the
     # samples and then doing backprop.
     mean_sample = math_ops.reduce_mean(sample, axis=0)
@@ -234,13 +236,13 @@ class RandomGammaGradTest(test.TestCase):
     We compare the Monte-Carlo estimate of the expectation with the
     true gradient.
     """
-    num_samples = 1000
+    num_samples = 10000
     t = 0.3
     alpha = 0.5
     expected = 1 + 2 * alpha - 2 * t
 
     alpha = constant_op.constant(alpha)
-    sample = random_ops.random_gamma([num_samples], alpha, 1.0)
+    sample = random_ops.random_gamma([num_samples], alpha, 1.0, seed=12345)
     loss = math_ops.reduce_mean(math_ops.square(sample - t))
     dloss_dalpha = gradients_impl.gradients(loss, alpha)[0]
     dloss_dalpha_val = self.evaluate(dloss_dalpha)
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 67a89461f3a885056f47c62af40bf6cfccd60583..5ab8bc3a0089742cfad891e772bd3a4ee447a55e 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -104,7 +104,8 @@ class ReductionUnknownShape(test.TestCase):
       for dtype, reductions in [(dtypes.float32,
                                  (math_ops.reduce_sum, math_ops.reduce_mean,
                                   math_ops.reduce_prod, math_ops.reduce_max,
-                                  math_ops.reduce_min)),
+                                  math_ops.reduce_min,
+                                  math_ops.reduce_euclidean_norm)),
                                 (dtypes.bool, (math_ops.reduce_all,
                                                math_ops.reduce_any))]:
         for reduction in reductions:
@@ -487,6 +488,79 @@ class MeanReductionTest(BaseReductionTest):
         self.assertTrue(np.all(np.isnan(y)))
 
 
+class EuclideanNormReductionTest(BaseReductionTest):
+
+  def _tf_reduce(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_euclidean_norm(x, reduction_axes, keepdims)
+
+  def _np_reduce(self, x, reduction_axes, keepdims):
+    if isinstance(reduction_axes, list) or isinstance(reduction_axes,
+                                                      np.ndarray):
+      reduction_axes = tuple(reduction_axes)
+    if reduction_axes is None or reduction_axes != tuple():
+      np_fro = np.sqrt(
+          np.sum(x * np.conj(x), axis=reduction_axes, keepdims=keepdims))
+    else:
+      np_fro = x
+    if np.issubdtype(x.dtype, np.integer):
+      np_fro = np.floor(np_fro)
+    return np_fro
+
+  @test_util.run_deprecated_v1
+  def testAxesType(self):
+    for dtype in [dtypes.int64, dtypes.int32]:
+      with self.cached_session(use_gpu=True):
+        v = math_ops.reduce_mean([0, 0], constant_op.constant(0, dtype=dtype))
+        tf_v = self.evaluate(v)
+      self.assertAllEqual(tf_v, 0)
+
+  @test_util.run_deprecated_v1
+  def testInfinity(self):
+    for dtype in [np.float32, np.float64]:
+      for special_value_x in [-np.inf, np.inf]:
+        for special_value_y in [-np.inf, np.inf]:
+          np_arr = np.array([special_value_x, special_value_y]).astype(dtype)
+          self._compareAll(np_arr, None)
+
+  @test_util.run_deprecated_v1
+  def testInt32(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.int32)
+      self._compareAllAxes(np_arr)
+
+  @test_util.run_deprecated_v1
+  def testFloat32(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
+      self._compareAllAxes(np_arr)
+
+  @test_util.run_deprecated_v1
+  def testFloat64(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.float64)
+      self._compareAllAxes(np_arr)
+
+  @test_util.run_deprecated_v1
+  def testComplex64(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.complex64)
+      self._compareAllAxes(np_arr)
+
+  @test_util.run_deprecated_v1
+  def testComplex128(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.complex128)
+      self._compareAllAxes(np_arr)
+
+    with self.session(use_gpu=True):
+      for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+        # A large number is needed to get Eigen to die
+        x = array_ops.zeros((0, 9938), dtype=dtype)
+        y = math_ops.reduce_euclidean_norm(x, [0]).eval()
+        self.assertEqual(y.shape, (9938,))
+        self.assertAllEqual(y, np.zeros(9938))
+
+
 class ProdReductionTest(BaseReductionTest):
 
   def _tf_reduce(self, x, reduction_axes, keepdims):
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test_big.py b/tensorflow/python/kernel_tests/reduction_ops_test_big.py
index 1e8524f72a9760af90695b3b24c6dda3e9ba8c4a..2d5cff383e46c3aac83eab6b830859a7614fd803 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test_big.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test_big.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -46,6 +48,7 @@ class BigReductionTest(BaseReductionTest):
   def _tf_reduce_sum(self, x, reduction_axes, keepdims):
     return math_ops.reduce_sum(x, reduction_axes, keepdims)
 
+  @test_util.run_deprecated_v1
   def testFloat32Sum(self):
     # make sure we test all possible kernel invocations
     # logic is the same for all ops, test just float32 for brevity
@@ -64,11 +67,13 @@ class BigReductionTest(BaseReductionTest):
         full_sum = np.ones([], dtype=np.float32) * size_x * size_y
 
         with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-          tf_row_sum = self._tf_reduce_sum(arr, 1, False)
-          tf_col_sum = self._tf_reduce_sum(arr, 0, False)
-          tf_full_sum = self._tf_reduce_sum(arr, [0, 1], False)
+          arr_placeholder = array_ops.placeholder(dtype=np.float32,
+                                                  shape=(size_x, size_y))
+          tf_row_sum = self._tf_reduce_sum(arr_placeholder, 1, False)
+          tf_col_sum = self._tf_reduce_sum(arr_placeholder, 0, False)
+          tf_full_sum = self._tf_reduce_sum(arr_placeholder, [0, 1], False)
           tf_out_row, tf_out_col, tf_out_full = sess.run(
-              [tf_row_sum, tf_col_sum, tf_full_sum])
+              [tf_row_sum, tf_col_sum, tf_full_sum], {arr_placeholder: arr})
         self.assertAllClose(col_sum, tf_out_col)
         self.assertAllClose(row_sum, tf_out_row)
         self.assertAllClose(full_sum, tf_out_full)
@@ -82,12 +87,16 @@ class BigReductionTest(BaseReductionTest):
           sum_xz = np.ones([size_y], dtype=np.float32)
 
           with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-            tf_sum_xz = self._tf_reduce_mean(arr, [0, 2], False)
-            tf_sum_y = self._tf_reduce_mean(arr, 1, False)
-            tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y])
+            arr_placeholder = array_ops.placeholder(
+                dtype=np.float32, shape=(size_x, size_y, size_z))
+            tf_sum_xz = self._tf_reduce_mean(arr_placeholder, [0, 2], False)
+            tf_sum_y = self._tf_reduce_mean(arr_placeholder, 1, False)
+            tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y],
+                                                   {arr_placeholder: arr})
           self.assertAllClose(sum_y, tf_out_sum_y)
           self.assertAllClose(sum_xz, tf_out_sum_xz)
 
+  @test_util.run_deprecated_v1
   def testFloat32Max(self):
     # make sure we test all possible kernel invocations
     # logic is the same for all ops, test just float32 for brevity
@@ -107,11 +116,13 @@ class BigReductionTest(BaseReductionTest):
         full_max = np.max(col_max)
 
         with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-          tf_row_max = self._tf_reduce_max(arr, 1, False)
-          tf_col_max = self._tf_reduce_max(arr, 0, False)
-          tf_full_max = self._tf_reduce_max(arr, [0, 1], False)
+          arr_placeholder = array_ops.placeholder(dtype=np.float32,
+                                                  shape=(size_x, size_y))
+          tf_row_max = self._tf_reduce_max(arr_placeholder, 1, False)
+          tf_col_max = self._tf_reduce_max(arr_placeholder, 0, False)
+          tf_full_max = self._tf_reduce_max(arr_placeholder, [0, 1], False)
           tf_out_row, tf_out_col, tf_out_full = sess.run(
-              [tf_row_max, tf_col_max, tf_full_max])
+              [tf_row_max, tf_col_max, tf_full_max], {arr_placeholder: arr})
         self.assertAllClose(col_max, tf_out_col)
         self.assertAllClose(row_max, tf_out_row)
         self.assertAllClose(full_max, tf_out_full)
@@ -126,12 +137,16 @@ class BigReductionTest(BaseReductionTest):
           sum_xz = np.max(arr, axis=(0, 2))
 
           with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-            tf_sum_xz = self._tf_reduce_max(arr, [0, 2], False)
-            tf_sum_y = self._tf_reduce_max(arr, 1, False)
-            tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y])
+            arr_placeholder = array_ops.placeholder(
+                dtype=np.float32, shape=(size_x, size_y, size_z))
+            tf_sum_xz = self._tf_reduce_max(arr_placeholder, [0, 2], False)
+            tf_sum_y = self._tf_reduce_max(arr_placeholder, 1, False)
+            tf_out_sum_xz, tf_out_sum_y = sess.run(
+                [tf_sum_xz, tf_sum_y], {arr_placeholder: arr})
           self.assertAllClose(sum_y, tf_out_sum_y)
           self.assertAllClose(sum_xz, tf_out_sum_xz)
 
+  @test_util.run_deprecated_v1
   def testBooleanAll(self):
     # make sure we test all possible kernel invocations
     # test operation where T(0) is not the identity
@@ -150,11 +165,13 @@ class BigReductionTest(BaseReductionTest):
         full_sum = np.ones([1], dtype=np.bool).reshape([])
 
         with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-          tf_row_sum = self._tf_reduce_all(arr, 1, False)
-          tf_col_sum = self._tf_reduce_all(arr, 0, False)
-          tf_full_sum = self._tf_reduce_all(arr, [0, 1], False)
+          arr_placeholder = array_ops.placeholder(dtype=np.bool,
+                                                  shape=(size_x, size_y))
+          tf_row_sum = self._tf_reduce_all(arr_placeholder, 1, False)
+          tf_col_sum = self._tf_reduce_all(arr_placeholder, 0, False)
+          tf_full_sum = self._tf_reduce_all(arr_placeholder, [0, 1], False)
           tf_out_row, tf_out_col, tf_out_full = sess.run(
-              [tf_row_sum, tf_col_sum, tf_full_sum])
+              [tf_row_sum, tf_col_sum, tf_full_sum], {arr_placeholder: arr})
         self.assertAllClose(col_sum, tf_out_col)
         self.assertAllClose(row_sum, tf_out_row)
         self.assertAllClose(full_sum, tf_out_full)
@@ -168,9 +185,12 @@ class BigReductionTest(BaseReductionTest):
           sum_xz = np.ones([size_y], dtype=np.bool)
 
           with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-            tf_sum_xz = self._tf_reduce_all(arr, [0, 2], False)
-            tf_sum_y = self._tf_reduce_all(arr, 1, False)
-            tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y])
+            arr_placeholder = array_ops.placeholder(
+                dtype=np.bool, shape=(size_x, size_y, size_z))
+            tf_sum_xz = self._tf_reduce_all(arr_placeholder, [0, 2], False)
+            tf_sum_y = self._tf_reduce_all(arr_placeholder, 1, False)
+            tf_out_sum_xz, tf_out_sum_y = sess.run(
+                [tf_sum_xz, tf_sum_y], {arr_placeholder: arr})
           self.assertAllClose(sum_y, tf_out_sum_y)
           self.assertAllClose(sum_xz, tf_out_sum_xz)
 
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index 3b8924904c5eb20670a2d61fe1f5a3af470eebde..ca02aa6e4e65919f26d330f623ba0e240cfc7f92 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -86,7 +86,7 @@ class ReluTest(test.TestCase):
     self.assertAllClose(np_relu, tf_relu)
     self.assertShapeEqual(np_relu, tf_relu)
 
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123338077")  # Passes with XLA
   def testReluInt8x4BadShape(self):
     if not test.is_gpu_available(cuda_only=True):
       self.skipTest("No GPU available")
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/python/kernel_tests/rnn_cell_test.py
similarity index 67%
rename from tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
rename to tensorflow/python/kernel_tests/rnn_cell_test.py
index ef372b947cedf71e9d44423f10cc43375b467cd9..c732c9be17a010a491d39c0da7d2deedaa7bd3f1 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_cell_test.py
@@ -12,25 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for rnn module."""
+"""Tests for RNN cells."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import itertools
+import os
 
+from absl.testing import parameterized
 import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
-from tensorflow.contrib import rnn as rnn_lib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops as ops_lib
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import layers as keras_layers
+from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
@@ -38,16 +42,18 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn
 from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
+from tensorflow.python.training.tracking import util as trackable_utils
 from tensorflow.python.util import nest
 
 
-class Plus1RNNCell(rnn_lib.RNNCell):
+class Plus1RNNCell(rnn_cell.RNNCell):
   """RNN Cell generating (output, new_state) = (input + 1, state + 1)."""
 
   @property
@@ -62,7 +68,7 @@ class Plus1RNNCell(rnn_lib.RNNCell):
     return (input_ + 1, state + 1)
 
 
-class DummyMultiDimensionalLSTM(rnn_lib.RNNCell):
+class DummyMultiDimensionalLSTM(rnn_cell.RNNCell):
   """LSTM Cell generating (output, new_state) = (input + 1, state + 1).
 
   The input to this cell may have an arbitrary number of dimensions that follow
@@ -97,7 +103,7 @@ class DummyMultiDimensionalLSTM(rnn_lib.RNNCell):
     return (input_ + 1, (h + 1, c + 1))
 
 
-class NestedRNNCell(rnn_lib.RNNCell):
+class NestedRNNCell(rnn_cell.RNNCell):
   """RNN Cell generating (output, new_state) = (input + 1, state + 1).
 
   The input, output and state of this cell is a tuple of two tensors.
@@ -161,18 +167,19 @@ class TestStateSaverWithCounters(TestStateSaver):
   inherits from the TestStateSaver and adds the counters for calls of functions.
   """
 
+  @test_util.run_v1_only("b/124229375")
   def __init__(self, batch_size, state_size):
     super(TestStateSaverWithCounters, self).__init__(batch_size, state_size)
     self._num_state_calls = variables_lib.VariableV1(0)
     self._num_save_state_calls = variables_lib.VariableV1(0)
 
   def state(self, name):
-    with ops_lib.control_dependencies(
+    with ops.control_dependencies(
         [state_ops.assign_add(self._num_state_calls, 1)]):
       return super(TestStateSaverWithCounters, self).state(name)
 
   def save_state(self, name, state):
-    with ops_lib.control_dependencies([state_ops.assign_add(
+    with ops.control_dependencies([state_ops.assign_add(
         self._num_save_state_calls, 1)]):
       return super(TestStateSaverWithCounters, self).save_state(name, state)
 
@@ -191,12 +198,14 @@ class RNNTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def testInvalidSequenceLengthShape(self):
     cell = Plus1RNNCell()
     inputs = [array_ops.placeholder(dtypes.float32, shape=(3, 4))]
     with self.assertRaisesRegexp(ValueError, "must be a vector"):
       rnn.static_rnn(cell, inputs, dtype=dtypes.float32, sequence_length=4)
 
+  @test_util.run_v1_only("b/124229375")
   def testRNN(self):
     cell = Plus1RNNCell()
     batch_size = 2
@@ -224,6 +233,7 @@ class RNNTest(test.TestCase):
                           max_length * np.ones(
                               (batch_size, input_size), dtype=np.float32))
 
+  @test_util.run_v1_only("b/124229375")
   def testDropout(self):
     cell = Plus1RNNCell()
     full_dropout_cell = rnn_cell.DropoutWrapper(
@@ -260,6 +270,7 @@ class RNNTest(test.TestCase):
       for d_v in full_dropout_values[:-1]:  # Add 1.0 to dropped_out (all zeros)
         self.assertAllClose(d_v, np.ones_like(input_value))
 
+  @test_util.run_v1_only("b/124229375")
   def testDynamicCalculation(self):
     cell = Plus1RNNCell()
     sequence_length = array_ops.placeholder(dtypes.int64)
@@ -310,7 +321,7 @@ class RNNTest(test.TestCase):
                                      1.0 * (2 + 1) * np.ones((input_size)))))
 
   def _testScope(self, factory, prefix="prefix", use_outer_scope=True):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -329,6 +340,7 @@ class RNNTest(test.TestCase):
         tf_logging.info(v.name)
       self.assertEqual(len(scope_vars), len(all_vars))
 
+  @test_util.run_v1_only("b/124229375")
   def testScope(self):
 
     def factory(scope):
@@ -358,21 +370,22 @@ class LSTMTest(test.TestCase):
     lstm = rnn_cell.LSTMCell(10)
     input_tensor = array_ops.ones([10, 50])
     lstm.build(input_tensor.get_shape())
-    self.assertEqual(lstm._bias.dtype, dtypes.float32_ref)
+    self.assertEqual(lstm._bias.dtype.base_dtype, dtypes.float32)
 
     # Explicitly pass dtype in constructor
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       lstm = rnn_cell.LSTMCell(10, dtype=dtype)
       input_tensor = array_ops.ones([10, 50])
       lstm.build(input_tensor.get_shape())
-      self.assertEqual(lstm._bias.dtype, dtype._as_ref)
+      self.assertEqual(lstm._bias.dtype.base_dtype, dtype)
 
+  @test_util.run_v1_only("b/124229375")
   def testNoProjNoSharding(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       cell = rnn_cell.LSTMCell(
@@ -389,12 +402,13 @@ class LSTMTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       sess.run(outputs, feed_dict={inputs[0]: input_value})
 
+  @test_util.run_v1_only("b/124229375")
   def testCellClipping(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       cell = rnn_cell.LSTMCell(
@@ -419,12 +433,13 @@ class LSTMTest(test.TestCase):
       # if cell c is clipped to 0, tanh(c) = 0 => m==0
       self.assertAllEqual(value, np.zeros((batch_size, num_units)))
 
+  @test_util.run_v1_only("b/124229375")
   def testNoProjNoShardingSimpleStateSaver(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       state_saver = TestStateSaver(batch_size, 2 * num_units)
@@ -452,12 +467,13 @@ class LSTMTest(test.TestCase):
           })
       self.assertAllEqual(last_state_value, saved_state_value)
 
+  @test_util.run_v1_only("b/124229375")
   def testNoProjNoShardingTupleStateSaver(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       state_saver = TestStateSaver(batch_size, num_units)
@@ -486,12 +502,13 @@ class LSTMTest(test.TestCase):
       self.assertEqual(4, len(last_and_saved_states))
       self.assertAllEqual(last_and_saved_states[:2], last_and_saved_states[2:])
 
+  @test_util.run_v1_only("b/124229375")
   def testNoProjNoShardingNestedTupleStateSaver(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       state_saver = TestStateSaver(
@@ -556,13 +573,14 @@ class LSTMTest(test.TestCase):
         self.assertAllEqual(last_states[i],
                             named_saved_states[flat_state_names[i]])
 
+  @test_util.run_v1_only("b/124229375")
   def testProjNoSharding(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     num_proj = 4
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
@@ -588,7 +606,7 @@ class LSTMTest(test.TestCase):
     num_proj = 4
     max_length = 8
     sequence_length = [4, 6]
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
@@ -630,7 +648,7 @@ class LSTMTest(test.TestCase):
       self.assertEqual(len(outputs_notuple), len(inputs))
       self.assertEqual(len(outputs_tuple), len(inputs))
       self.assertTrue(isinstance(state_tuple, tuple))
-      self.assertTrue(isinstance(state_notuple, ops_lib.Tensor))
+      self.assertTrue(isinstance(state_notuple, ops.Tensor))
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
@@ -651,6 +669,7 @@ class LSTMTest(test.TestCase):
       state_tuple_v = sess.run(state_tuple, feed_dict={inputs[0]: input_value})
       self.assertAllEqual(state_notuple_v, np.hstack(state_tuple_v))
 
+  @test_util.run_v1_only("b/124229375")
   def testProjSharding(self):
     num_units = 3
     input_size = 5
@@ -659,7 +678,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
 
@@ -684,6 +703,7 @@ class LSTMTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       sess.run(outputs, feed_dict={inputs[0]: input_value})
 
+  @test_util.run_v1_only("b/124229375")
   def testDoubleInput(self):
     num_units = 3
     input_size = 5
@@ -692,7 +712,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       inputs = max_length * [
           array_ops.placeholder(dtypes.float64, shape=(None, input_size))
@@ -720,6 +740,7 @@ class LSTMTest(test.TestCase):
       values = sess.run(outputs, feed_dict={inputs[0]: input_value})
       self.assertEqual(values[0].dtype, input_value.dtype)
 
+  @test_util.run_v1_only("b/124229375")
   def testShardNoShardEquivalentOutput(self):
     num_units = 3
     input_size = 5
@@ -728,7 +749,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       inputs = max_length * [
           array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
@@ -774,6 +795,7 @@ class LSTMTest(test.TestCase):
       for (s_noshard, s_shard) in zip(state_values_noshard, state_values_shard):
         self.assertAllClose(s_noshard, s_shard, atol=1e-3)
 
+  @test_util.run_v1_only("b/124229375")
   def testDoubleInputWithDropoutAndDynamicCalculation(self):
     """Smoke test for using LSTM with doubles, dropout, dynamic calculation."""
 
@@ -784,7 +806,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       sequence_length = array_ops.placeholder(dtypes.int64)
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
@@ -828,13 +850,14 @@ class LSTMTest(test.TestCase):
       self.assertEqual(values[0].dtype, input_value.dtype)
       self.assertEqual(state_value[0].dtype, input_value.dtype)
 
+  @test_util.run_v1_only("b/124229375")
   def testSharingWeightsWithReuse(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     num_proj = 4
     max_length = 8
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       initializer_d = init_ops.random_uniform_initializer(
           -1, 1, seed=self._seed + 1)
@@ -878,13 +901,14 @@ class LSTMTest(test.TestCase):
         # Different weights used so outputs should be different.
         self.assertTrue(np.linalg.norm(o1 - o3) > 1e-6)
 
+  @test_util.run_v1_only("b/124229375")
   def testSharingWeightsWithDifferentNamescope(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     num_proj = 4
     max_length = 8
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       inputs = max_length * [
           array_ops.placeholder(dtypes.float32, shape=(None, input_size))
@@ -896,10 +920,10 @@ class LSTMTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=False)
 
-      with ops_lib.name_scope("scope0"):
+      with ops.name_scope("scope0"):
         with variable_scope.variable_scope("share_scope"):
           outputs0, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
-      with ops_lib.name_scope("scope1"):
+      with ops.name_scope("scope1"):
         with variable_scope.variable_scope("share_scope", reuse=True):
           outputs1, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
 
@@ -915,6 +939,7 @@ class LSTMTest(test.TestCase):
       for out0, out1 in zip(outputs0_values, outputs1_values):
         self.assertAllEqual(out0, out1)
 
+  @test_util.run_v1_only("b/124229375")
   def testDynamicRNNAllowsUnknownTimeDimension(self):
     inputs = array_ops.placeholder(dtypes.float32, shape=[1, None, 20])
     cell = rnn_cell.GRUCell(30)
@@ -930,7 +955,7 @@ class LSTMTest(test.TestCase):
     max_length = 8
     sequence_length = [4, 6]
     in_graph_mode = not context.executing_eagerly()
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       if in_graph_mode:
@@ -1006,7 +1031,7 @@ class LSTMTest(test.TestCase):
     max_length = 8
     sequence_length = [4, 6]
     in_graph_mode = not context.executing_eagerly()
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       if in_graph_mode:
@@ -1117,7 +1142,7 @@ class LSTMTest(test.TestCase):
           state_is_tuple=False)
 
     ########### Step 1: Run static graph and generate readouts
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       if in_graph_mode:
         concat_inputs = array_ops.placeholder(
             dtypes.float32, shape=(time_steps, batch_size, input_size))
@@ -1154,8 +1179,8 @@ class LSTMTest(test.TestCase):
             for y in [outputs_static[0], outputs_static[-1], state_static]
         ])
         # Generate gradients of individual variables w.r.t. inputs
-        trainable_variables = ops_lib.get_collection(
-            ops_lib.GraphKeys.TRAINABLE_VARIABLES)
+        trainable_variables = ops.get_collection(
+            ops.GraphKeys.TRAINABLE_VARIABLES)
         assert len(trainable_variables) > 1, (
             "Count of trainable variables: %d" % len(trainable_variables))
         # pylint: disable=bad-builtin
@@ -1177,7 +1202,7 @@ class LSTMTest(test.TestCase):
             static_individual_variable_gradients, feed_dict=feeds)
 
     ########## Step 2: Run dynamic graph and generate readouts
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       if in_graph_mode:
         concat_inputs = array_ops.placeholder(
             dtypes.float32, shape=(time_steps, batch_size, input_size))
@@ -1225,8 +1250,8 @@ class LSTMTest(test.TestCase):
         ])
 
         # Generate gradients of individual variables w.r.t. inputs
-        trainable_variables = ops_lib.get_collection(
-            ops_lib.GraphKeys.TRAINABLE_VARIABLES)
+        trainable_variables = ops.get_collection(
+            ops.GraphKeys.TRAINABLE_VARIABLES)
         assert len(trainable_variables) > 1, (
             "Count of trainable variables: %d" % len(trainable_variables))
         dynamic_individual_variable_gradients = nest.flatten([
@@ -1259,12 +1284,12 @@ class LSTMTest(test.TestCase):
 
     self.assertEqual(len(values_static), len(values_dynamic))
     for (value_static, value_dynamic) in zip(values_static, values_dynamic):
-      self.assertAllEqual(value_static, value_dynamic)
-    self.assertAllEqual(state_value_static, state_value_dynamic)
+      self.assertAllClose(value_static, value_dynamic)
+    self.assertAllClose(state_value_static, state_value_dynamic)
 
     if in_graph_mode:
 
-      self.assertAllEqual(static_grad_values, dynamic_grad_values)
+      self.assertAllClose(static_grad_values, dynamic_grad_values)
 
       self.assertEqual(
           len(static_individual_grad_values),
@@ -1276,14 +1301,14 @@ class LSTMTest(test.TestCase):
       for i, (a, b) in enumerate(
           zip(static_individual_grad_values, dynamic_individual_grad_values)):
         tf_logging.info("Comparing individual gradients iteration %d" % i)
-        self.assertAllEqual(a, b)
+        self.assertAllClose(a, b)
 
       for i, (a, b) in enumerate(
           zip(static_individual_var_grad_values,
               dynamic_individual_var_grad_values)):
         tf_logging.info(
             "Comparing individual variable gradients iteration %d" % i)
-        self.assertAllEqual(a, b)
+        self.assertAllClose(a, b)
 
   @test_util.run_in_graph_and_eager_modes
   def testDynamicEquivalentToStaticRNN(self):
@@ -1337,7 +1362,7 @@ class BidirectionalRNNTest(test.TestCase):
     return input_value, inputs, outputs, state_fw, state_bw, sequence_length
 
   def _testBidirectionalRNN(self, use_shape):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, sequence_length = (
           self._createBidirectionalRNN(use_shape, True))
       variables_lib.global_variables_initializer().run()
@@ -1358,33 +1383,33 @@ class BidirectionalRNNTest(test.TestCase):
       #
       # First sequence in batch is length=2
       # Check that the time=0 forward output is equal to time=1 backward output
-      self.assertEqual(out[0][0][0], out[1][0][3])
-      self.assertEqual(out[0][0][1], out[1][0][4])
-      self.assertEqual(out[0][0][2], out[1][0][5])
+      self.assertAllClose(out[0][0][0], out[1][0][3])
+      self.assertAllClose(out[0][0][1], out[1][0][4])
+      self.assertAllClose(out[0][0][2], out[1][0][5])
       # Check that the time=1 forward output is equal to time=0 backward output
-      self.assertEqual(out[1][0][0], out[0][0][3])
-      self.assertEqual(out[1][0][1], out[0][0][4])
-      self.assertEqual(out[1][0][2], out[0][0][5])
+      self.assertAllClose(out[1][0][0], out[0][0][3])
+      self.assertAllClose(out[1][0][1], out[0][0][4])
+      self.assertAllClose(out[1][0][2], out[0][0][5])
 
       # Second sequence in batch is length=3
       # Check that the time=0 forward output is equal to time=2 backward output
-      self.assertEqual(out[0][1][0], out[2][1][3])
-      self.assertEqual(out[0][1][1], out[2][1][4])
-      self.assertEqual(out[0][1][2], out[2][1][5])
+      self.assertAllClose(out[0][1][0], out[2][1][3])
+      self.assertAllClose(out[0][1][1], out[2][1][4])
+      self.assertAllClose(out[0][1][2], out[2][1][5])
       # Check that the time=1 forward output is equal to time=1 backward output
-      self.assertEqual(out[1][1][0], out[1][1][3])
-      self.assertEqual(out[1][1][1], out[1][1][4])
-      self.assertEqual(out[1][1][2], out[1][1][5])
+      self.assertAllClose(out[1][1][0], out[1][1][3])
+      self.assertAllClose(out[1][1][1], out[1][1][4])
+      self.assertAllClose(out[1][1][2], out[1][1][5])
       # Check that the time=2 forward output is equal to time=0 backward output
-      self.assertEqual(out[2][1][0], out[0][1][3])
-      self.assertEqual(out[2][1][1], out[0][1][4])
-      self.assertEqual(out[2][1][2], out[0][1][5])
+      self.assertAllClose(out[2][1][0], out[0][1][3])
+      self.assertAllClose(out[2][1][1], out[0][1][4])
+      self.assertAllClose(out[2][1][2], out[0][1][5])
       # Via the reasoning above, the forward and backward final state should be
       # exactly the same
       self.assertAllClose(s_fw, s_bw)
 
   def _testBidirectionalRNNWithoutSequenceLength(self, use_shape):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, _ = (
           self._createBidirectionalRNN(use_shape, False))
       variables_lib.global_variables_initializer().run()
@@ -1402,22 +1427,19 @@ class BidirectionalRNNTest(test.TestCase):
       #
       # Both sequences in batch are length=8.  Check that the time=i
       # forward output is equal to time=8-1-i backward output
-      for i in xrange(8):
-        self.assertEqual(out[i][0][0], out[8 - 1 - i][0][3])
-        self.assertEqual(out[i][0][1], out[8 - 1 - i][0][4])
-        self.assertEqual(out[i][0][2], out[8 - 1 - i][0][5])
-      for i in xrange(8):
-        self.assertEqual(out[i][1][0], out[8 - 1 - i][1][3])
-        self.assertEqual(out[i][1][1], out[8 - 1 - i][1][4])
-        self.assertEqual(out[i][1][2], out[8 - 1 - i][1][5])
+      for i in range(8):
+        self.assertAllClose(out[i][0][0:3], out[8 - 1 - i][0][3:6])
+        self.assertAllClose(out[i][1][0:3], out[8 - 1 - i][1][3:6])
       # Via the reasoning above, the forward and backward final state should be
       # exactly the same
       self.assertAllClose(s_fw, s_bw)
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalRNN(self):
     self._testBidirectionalRNN(use_shape=False)
     self._testBidirectionalRNN(use_shape=True)
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalRNNWithoutSequenceLength(self):
     self._testBidirectionalRNNWithoutSequenceLength(use_shape=False)
     self._testBidirectionalRNNWithoutSequenceLength(use_shape=True)
@@ -1472,7 +1494,7 @@ class BidirectionalRNNTest(test.TestCase):
 
   def _testBidirectionalDynamicRNN(self, use_shape, use_state_tuple,
                                    use_time_major, use_sequence_length):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, sequence_length = (
           self._createBidirectionalDynamicRNN(
               use_shape, use_state_tuple, use_time_major, use_sequence_length))
@@ -1534,6 +1556,7 @@ class BidirectionalRNNTest(test.TestCase):
           self.assertAllEqual(out[t, :, 0:3], out[max_length - t - 1, :, 3:6])
         self.assertAllClose(s_fw, s_bw)
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalDynamicRNN(self):
     # Generate 2^5 option values
     # from [True, True, True, True, True] to [False, False, False, False, False]
@@ -1549,7 +1572,7 @@ class BidirectionalRNNTest(test.TestCase):
     # REMARKS: factory(scope) is a function accepting a scope
     #          as an argument, such scope can be None, a string
     #          or a VariableScope instance.
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -1568,6 +1591,7 @@ class BidirectionalRNNTest(test.TestCase):
         tf_logging.info(v.name)
       self.assertEqual(len(scope_vars), len(all_vars))
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalRNNScope(self):
 
     def factory(scope):
@@ -1578,6 +1602,7 @@ class BidirectionalRNNTest(test.TestCase):
     self._testScope(factory, use_outer_scope=False)
     self._testScope(factory, prefix=None, use_outer_scope=False)
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalDynamicRNNScope(self):
 
     def get_factory(use_time_major):
@@ -1606,13 +1631,14 @@ class MultiDimensionalLSTMTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def testMultiDimensionalLSTMAllRNNContainers(self):
     feature_dims = (3, 4, 5)
     input_size = feature_dims
     batch_size = 2
     max_length = 8
     sequence_length = [4, 6]
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       inputs = max_length * [
           array_ops.placeholder(dtypes.float32, shape=(None,) + input_size)
       ]
@@ -1717,13 +1743,14 @@ class NestedLSTMTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def testNestedIOLSTMAllRNNContainers(self):
     input_size = 5
     batch_size = 2
     state_size = 6
     max_length = 8
     sequence_length = [4, 6]
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       state_saver = TestStateSaver(batch_size, state_size)
       single_input = (array_ops.placeholder(
           dtypes.float32, shape=(None, input_size)),
@@ -1868,7 +1895,7 @@ class StateSaverRNNTest(test.TestCase):
     batch_size = 2
     state_saver = TestStateSaver(batch_size, 2 * num_units)
 
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           self._factory(scope=scope, state_saver=state_saver)
@@ -1900,6 +1927,7 @@ class StateSaverRNNTest(test.TestCase):
     have influence on number of calls to save_state and state methods of
     state_saver object (the number of calls should be same.)
     """
+    self.skipTest("b/124196246 Breakage for sess.run([out, ...]): 2 != 1")
 
     num_units = 3
     batch_size = 2
@@ -1935,6 +1963,7 @@ class GRUTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def testDynamic(self):
     time_steps = 8
     num_units = 3
@@ -1945,7 +1974,7 @@ class GRUTest(test.TestCase):
 
     sequence_length = np.random.randint(0, time_steps, size=batch_size)
 
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       concat_inputs = array_ops.placeholder(
           dtypes.float32, shape=(time_steps, batch_size, input_size))
 
@@ -1967,7 +1996,7 @@ class GRUTest(test.TestCase):
       sess.run([outputs_dynamic, state_dynamic], feed_dict=feeds)
 
   def _testScope(self, factory, prefix="prefix", use_outer_scope=True):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -1986,6 +2015,7 @@ class GRUTest(test.TestCase):
         tf_logging.info(v.name)
       self.assertEqual(len(scope_vars), len(all_vars))
 
+  @test_util.run_v1_only("b/124229375")
   def testDynamicScope(self):
     time_steps = 8
     num_units = 3
@@ -2016,8 +2046,9 @@ class RawRNNTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def _testRawRNN(self, max_time):
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       batch_size = 16
       input_depth = 4
       num_units = 3
@@ -2115,6 +2146,7 @@ class RawRNNTest(test.TestCase):
         for i in range(1, len(gradients_val)):
           self.assertAllClose(gradients_dynamic_rnn_val[i], gradients_val[i])
 
+  @test_util.run_v1_only("b/124229375")
   def testRawRNNZeroLength(self):
     # NOTE: Because with 0 time steps, raw_rnn does not have shape
     # information about the input, it is impossible to perform
@@ -2125,8 +2157,9 @@ class RawRNNTest(test.TestCase):
   def testRawRNN(self):
     self._testRawRNN(max_time=10)
 
+  @test_util.run_v1_only("b/124229375")
   def testLoopState(self):
-    with self.session(graph=ops_lib.Graph()):
+    with self.session(graph=ops.Graph()):
       max_time = 10
       batch_size = 16
       input_depth = 4
@@ -2161,8 +2194,9 @@ class RawRNNTest(test.TestCase):
       loop_state = r[-1]
       self.assertEqual([10], loop_state.eval())
 
+  @test_util.run_v1_only("b/124229375")
   def testLoopStateWithTensorArray(self):
-    with self.session(graph=ops_lib.Graph()):
+    with self.session(graph=ops.Graph()):
       max_time = 4
       batch_size = 16
       input_depth = 4
@@ -2204,8 +2238,9 @@ class RawRNNTest(test.TestCase):
       loop_state = loop_state.stack()
       self.assertAllEqual([1, 2, 2 + 2, 4 + 3, 7 + 4], loop_state.eval())
 
+  @test_util.run_v1_only("b/124229375")
   def testEmitDifferentStructureThanCellOutput(self):
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       max_time = 10
       batch_size = 16
       input_depth = 4
@@ -2253,7 +2288,7 @@ class RawRNNTest(test.TestCase):
           np.ones((max_time, batch_size, 1), np.int64), output_vals[1])
 
   def _testScope(self, factory, prefix="prefix", use_outer_scope=True):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -2272,6 +2307,7 @@ class RawRNNTest(test.TestCase):
         tf_logging.info(v.name)
       self.assertEqual(len(scope_vars), len(all_vars))
 
+  @test_util.run_v1_only("b/124229375")
   def testRawRNNScope(self):
     max_time = 10
     batch_size = 16
@@ -2329,7 +2365,7 @@ class DeviceWrapperCell(rnn_cell.RNNCell):
 
   def __call__(self, input_, state, scope=None):
     if self._device is not None:
-      with ops_lib.device(self._device):
+      with ops.device(self._device):
         return self._cell(input_, state, scope=scope)
     else:
       return self._cell(input_, state, scope=scope)
@@ -2353,11 +2389,11 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
     sequence_length = np.random.randint(0, time_steps, size=batch_size)
 
     if input_device is not None:
-      with ops_lib.device(input_device):
+      with ops.device(input_device):
         inputs = constant_op.constant(inputs)
 
     if rnn_device is not None:
-      with ops_lib.device(rnn_device):
+      with ops.device(rnn_device):
         outputs, _ = rnn.dynamic_rnn(
             gpu_cell,
             inputs,
@@ -2443,5 +2479,876 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
     _assert_in("TensorArray", gpu_stats, cpu_stats)
 
 
+class RNNCellTest(test.TestCase, parameterized.TestCase):
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicRNNCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        cell = rnn_cell_impl.BasicRNNCell(2)
+        g, _ = cell(x, m)
+        self.assertEqual([
+            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
+        ], [v.name for v in cell.trainable_variables])
+        self.assertFalse(cell.non_trainable_variables)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x: np.array([[1., 1.]]),
+            m: np.array([[0.1, 0.1]])
+        })
+        self.assertEqual(res[0].shape, (1, 2))
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicRNNCellNotTrainable(self):
+    with self.cached_session() as sess:
+
+      def not_trainable_getter(getter, *args, **kwargs):
+        kwargs["trainable"] = False
+        return getter(*args, **kwargs)
+
+      with variable_scope.variable_scope(
+          "root",
+          initializer=init_ops.constant_initializer(0.5),
+          custom_getter=not_trainable_getter):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        cell = rnn_cell_impl.BasicRNNCell(2)
+        g, _ = cell(x, m)
+        self.assertFalse(cell.trainable_variables)
+        self.assertEqual([
+            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
+        ], [v.name for v in cell.non_trainable_variables])
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x: np.array([[1., 1.]]),
+            m: np.array([[0.1, 0.1]])
+        })
+        self.assertEqual(res[0].shape, (1, 2))
+
+  @test_util.run_v1_only("b/124229375")
+  def testGRUCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        g, _ = rnn_cell_impl.GRUCell(2)(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x: np.array([[1., 1.]]),
+            m: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.175991, 0.175991]])
+      with variable_scope.variable_scope(
+          "other", initializer=init_ops.constant_initializer(0.5)):
+        # Test GRUCell with input_size != num_units.
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        g, _ = rnn_cell_impl.GRUCell(2)(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x: np.array([[1., 1., 1.]]),
+            m: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.156736, 0.156736]])
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicLSTMCell(self):
+    for dtype in [dtypes.float16, dtypes.float32]:
+      np_dtype = dtype.as_numpy_dtype
+      with self.session(graph=ops.Graph()) as sess:
+        with variable_scope.variable_scope(
+            "root", initializer=init_ops.constant_initializer(0.5)):
+          x = array_ops.zeros([1, 2], dtype=dtype)
+          m = array_ops.zeros([1, 8], dtype=dtype)
+          cell = rnn_cell_impl.MultiRNNCell(
+              [
+                  rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
+                  for _ in range(2)
+              ],
+              state_is_tuple=False)
+          self.assertEqual(cell.dtype, None)
+          self.assertEqual("cell-0", cell._checkpoint_dependencies[0].name)
+          self.assertEqual("cell-1", cell._checkpoint_dependencies[1].name)
+          cell.get_config()  # Should not throw an error
+          g, out_m = cell(x, m)
+          # Layer infers the input type.
+          self.assertEqual(cell.dtype, dtype.name)
+          expected_variable_names = [
+              "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
+              rnn_cell_impl._BIAS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
+              rnn_cell_impl._BIAS_VARIABLE_NAME
+          ]
+          self.assertEqual(expected_variable_names,
+                           [v.name for v in cell.trainable_variables])
+          self.assertFalse(cell.non_trainable_variables)
+          sess.run([variables_lib.global_variables_initializer()])
+          res = sess.run([g, out_m], {
+              x: np.array([[1., 1.]]),
+              m: 0.1 * np.ones([1, 8])
+          })
+          self.assertEqual(len(res), 2)
+          variables = variables_lib.global_variables()
+          self.assertEqual(expected_variable_names, [v.name for v in variables])
+          # The numbers in results were not calculated, this is just a
+          # smoke test.
+          self.assertAllClose(res[0], np.array(
+              [[0.240, 0.240]], dtype=np_dtype), 1e-2)
+          expected_mem = np.array(
+              [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]],
+              dtype=np_dtype)
+          self.assertAllClose(res[1], expected_mem, 1e-2)
+        with variable_scope.variable_scope(
+            "other", initializer=init_ops.constant_initializer(0.5)):
+          # Test BasicLSTMCell with input_size != num_units.
+          x = array_ops.zeros([1, 3], dtype=dtype)
+          m = array_ops.zeros([1, 4], dtype=dtype)
+          g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m)
+          sess.run([variables_lib.global_variables_initializer()])
+          res = sess.run(
+              [g, out_m], {
+                  x: np.array([[1., 1., 1.]], dtype=np_dtype),
+                  m: 0.1 * np.ones([1, 4], dtype=np_dtype)
+              })
+          self.assertEqual(len(res), 2)
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicLSTMCellDimension0Error(self):
+    """Tests that dimension 0 in both(x and m) shape must be equal."""
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        num_units = 2
+        state_size = num_units * 2
+        batch_size = 3
+        input_size = 4
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size - 1, state_size])
+        with self.assertRaises(ValueError):
+          g, out_m = rnn_cell_impl.BasicLSTMCell(
+              num_units, state_is_tuple=False)(x, m)
+          sess.run([variables_lib.global_variables_initializer()])
+          sess.run(
+              [g, out_m], {
+                  x: 1 * np.ones([batch_size, input_size]),
+                  m: 0.1 * np.ones([batch_size - 1, state_size])
+              })
+
+  def testBasicLSTMCellStateSizeError(self):
+    """Tests that state_size must be num_units * 2."""
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        num_units = 2
+        state_size = num_units * 3  # state_size must be num_units * 2
+        batch_size = 3
+        input_size = 4
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size, state_size])
+        with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)):
+          g, out_m = rnn_cell_impl.BasicLSTMCell(
+              num_units, state_is_tuple=False)(x, m)
+          sess.run([variables_lib.global_variables_initializer()])
+          sess.run(
+              [g, out_m], {
+                  x: 1 * np.ones([batch_size, input_size]),
+                  m: 0.1 * np.ones([batch_size, state_size])
+              })
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicLSTMCellStateTupleType(self):
+    with self.cached_session():
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m0 = (array_ops.zeros([1, 2]),) * 2
+        m1 = (array_ops.zeros([1, 2]),) * 2
+        cell = rnn_cell_impl.MultiRNNCell(
+            [rnn_cell_impl.BasicLSTMCell(2) for _ in range(2)],
+            state_is_tuple=True)
+        self.assertTrue(isinstance(cell.state_size, tuple))
+        self.assertTrue(
+            isinstance(cell.state_size[0], rnn_cell_impl.LSTMStateTuple))
+        self.assertTrue(
+            isinstance(cell.state_size[1], rnn_cell_impl.LSTMStateTuple))
+
+        # Pass in regular tuples
+        _, (out_m0, out_m1) = cell(x, (m0, m1))
+        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
+        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
+
+        # Pass in LSTMStateTuples
+        variable_scope.get_variable_scope().reuse_variables()
+        zero_state = cell.zero_state(1, dtypes.float32)
+        self.assertTrue(isinstance(zero_state, tuple))
+        self.assertTrue(isinstance(zero_state[0], rnn_cell_impl.LSTMStateTuple))
+        self.assertTrue(isinstance(zero_state[1], rnn_cell_impl.LSTMStateTuple))
+        _, (out_m0, out_m1) = cell(x, zero_state)
+        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
+        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicLSTMCellWithStateTuple(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m0 = array_ops.zeros([1, 4])
+        m1 = array_ops.zeros([1, 4])
+        cell = rnn_cell_impl.MultiRNNCell(
+            [
+                rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
+                for _ in range(2)
+            ],
+            state_is_tuple=True)
+        g, (out_m0, out_m1) = cell(x, (m0, m1))
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(
+            [g, out_m0, out_m1], {
+                x: np.array([[1., 1.]]),
+                m0: 0.1 * np.ones([1, 4]),
+                m1: 0.1 * np.ones([1, 4])
+            })
+        self.assertEqual(len(res), 3)
+        # The numbers in results were not calculated, this is just a smoke test.
+        # Note, however, these values should match the original
+        # version having state_is_tuple=False.
+        self.assertAllClose(res[0], [[0.24024698, 0.24024698]])
+        expected_mem0 = np.array(
+            [[0.68967271, 0.68967271, 0.44848421, 0.44848421]])
+        expected_mem1 = np.array(
+            [[0.39897051, 0.39897051, 0.24024698, 0.24024698]])
+        self.assertAllClose(res[1], expected_mem0)
+        self.assertAllClose(res[2], expected_mem1)
+
+  @test_util.run_v1_only("b/124229375")
+  def testLSTMCell(self):
+    with self.cached_session() as sess:
+      num_units = 8
+      num_proj = 6
+      state_size = num_units + num_proj
+      batch_size = 3
+      input_size = 2
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size, state_size])
+        cell = rnn_cell_impl.LSTMCell(
+            num_units=num_units,
+            num_proj=num_proj,
+            forget_bias=1.0,
+            state_is_tuple=False)
+        output, state = cell(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(
+            [output, state], {
+                x: np.array([[1., 1.], [2., 2.], [3., 3.]]),
+                m: 0.1 * np.ones((batch_size, state_size))
+            })
+        self.assertEqual(len(res), 2)
+        # The numbers in results were not calculated, this is mostly just a
+        # smoke test.
+        self.assertEqual(res[0].shape, (batch_size, num_proj))
+        self.assertEqual(res[1].shape, (batch_size, state_size))
+        # Different inputs so different outputs and states
+        for i in range(1, batch_size):
+          self.assertTrue(
+              float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) > 1e-6)
+          self.assertTrue(
+              float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) > 1e-6)
+
+  @test_util.run_v1_only("b/124229375")
+  def testLSTMCellVariables(self):
+    with self.cached_session():
+      num_units = 8
+      num_proj = 6
+      state_size = num_units + num_proj
+      batch_size = 3
+      input_size = 2
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size, state_size])
+        cell = rnn_cell_impl.LSTMCell(
+            num_units=num_units,
+            num_proj=num_proj,
+            forget_bias=1.0,
+            state_is_tuple=False)
+        cell(x, m)  # Execute to create variables
+      variables = variables_lib.global_variables()
+      self.assertEquals(variables[0].op.name, "root/lstm_cell/kernel")
+      self.assertEquals(variables[1].op.name, "root/lstm_cell/bias")
+      self.assertEquals(variables[2].op.name,
+                        "root/lstm_cell/projection/kernel")
+
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperCheckpointing(self):
+    for wrapper_type in [
+        rnn_cell_impl.DropoutWrapper,
+        rnn_cell_impl.ResidualWrapper,
+        lambda cell: rnn_cell_impl.MultiRNNCell([cell])]:
+      cell = rnn_cell_impl.BasicRNNCell(1)
+      wrapper = wrapper_type(cell)
+      wrapper(array_ops.ones([1, 1]),
+              state=wrapper.zero_state(batch_size=1, dtype=dtypes.float32))
+      self.evaluate([v.initializer for v in cell.variables])
+      checkpoint = trackable_utils.Checkpoint(wrapper=wrapper)
+      prefix = os.path.join(self.get_temp_dir(), "ckpt")
+      self.evaluate(cell._bias.assign([40.]))
+      save_path = checkpoint.save(prefix)
+      self.evaluate(cell._bias.assign([0.]))
+      checkpoint.restore(save_path).assert_consumed().run_restore_ops()
+      self.assertAllEqual([40.], self.evaluate(cell._bias))
+
+  @parameterized.parameters(
+      [rnn_cell_impl.ResidualWrapper, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testResidualWrapper(self, wrapper_type):
+    x = ops.convert_to_tensor(np.array([[1., 1., 1.]]))
+    m = ops.convert_to_tensor(np.array([[0.1, 0.1, 0.1]]))
+    base_cell = rnn_cell_impl.GRUCell(
+        3, kernel_initializer=init_ops.constant_initializer(0.5),
+        bias_initializer=init_ops.constant_initializer(0.5))
+    g, m_new = base_cell(x, m)
+    wrapper_object = wrapper_type(base_cell)
+    (name, dep), = wrapper_object._checkpoint_dependencies
+    wrapper_object.get_config()  # Should not throw an error
+    self.assertIs(dep, base_cell)
+    self.assertEqual("cell", name)
+
+    g_res, m_new_res = wrapper_object(x, m)
+    self.evaluate([variables_lib.global_variables_initializer()])
+    res = self.evaluate([g, g_res, m_new, m_new_res])
+    # Residual connections
+    self.assertAllClose(res[1], res[0] + [1., 1., 1.])
+    # States are left untouched
+    self.assertAllClose(res[2], res[3])
+
+  @parameterized.parameters(
+      [rnn_cell_impl.ResidualWrapper, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testResidualWrapperWithSlice(self, wrapper_type):
+    x = ops.convert_to_tensor(np.array([[1., 1., 1., 1., 1.]]))
+    m = ops.convert_to_tensor(np.array([[0.1, 0.1, 0.1]]))
+    base_cell = rnn_cell_impl.GRUCell(
+        3, kernel_initializer=init_ops.constant_initializer(0.5),
+        bias_initializer=init_ops.constant_initializer(0.5))
+    g, m_new = base_cell(x, m)
+
+    def residual_with_slice_fn(inp, out):
+      inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3])
+      return inp_sliced + out
+
+    g_res, m_new_res = wrapper_type(
+        base_cell, residual_with_slice_fn)(x, m)
+    self.evaluate([variables_lib.global_variables_initializer()])
+    res_g, res_g_res, res_m_new, res_m_new_res = self.evaluate(
+        [g, g_res, m_new, m_new_res])
+    # Residual connections
+    self.assertAllClose(res_g_res, res_g + [1., 1., 1.])
+    # States are left untouched
+    self.assertAllClose(res_m_new, res_m_new_res)
+
+  @test_util.run_v1_only("b/124229375")
+  def testDeviceWrapper(self):
+    with variable_scope.variable_scope(
+        "root", initializer=init_ops.constant_initializer(0.5)):
+      x = array_ops.zeros([1, 3])
+      m = array_ops.zeros([1, 3])
+      wrapped = rnn_cell_impl.GRUCell(3)
+      cell = rnn_cell_impl.DeviceWrapper(wrapped, "/cpu:14159")
+      (name, dep), = cell._checkpoint_dependencies
+      cell.get_config()  # Should not throw an error
+      self.assertIs(dep, wrapped)
+      self.assertEqual("cell", name)
+
+      outputs, _ = cell(x, m)
+      self.assertTrue("cpu:14159" in outputs.device.lower())
+
+  def _retrieve_cpu_gpu_stats(self, run_metadata):
+    cpu_stats = None
+    gpu_stats = None
+    step_stats = run_metadata.step_stats
+    for ds in step_stats.dev_stats:
+      if "cpu:0" in ds.device[-5:].lower():
+        cpu_stats = ds.node_stats
+      if "gpu:0" == ds.device[-5:].lower():
+        gpu_stats = ds.node_stats
+    return cpu_stats, gpu_stats
+
+  def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self):
+    if not test.is_gpu_available():
+      # Can't perform this test w/o a GPU
+      return
+
+    gpu_dev = test.gpu_device_name()
+    with self.session(use_gpu=True) as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 1, 3])
+        cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), gpu_dev)
+        with ops.device("/cpu:0"):
+          outputs, _ = rnn.dynamic_rnn(
+              cell=cell, inputs=x, dtype=dtypes.float32)
+        run_metadata = config_pb2.RunMetadata()
+        opts = config_pb2.RunOptions(
+            trace_level=config_pb2.RunOptions.FULL_TRACE)
+
+        sess.run([variables_lib.global_variables_initializer()])
+        _ = sess.run(outputs, options=opts, run_metadata=run_metadata)
+
+      cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata)
+      self.assertFalse([s for s in cpu_stats if "gru_cell" in s.node_name])
+      self.assertTrue([s for s in gpu_stats if "gru_cell" in s.node_name])
+
+  @test_util.run_v1_only("b/124229375")
+  def testMultiRNNCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 4])
+        multi_rnn_cell = rnn_cell_impl.MultiRNNCell(
+            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+            state_is_tuple=False)
+        _, ml = multi_rnn_cell(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(ml, {
+            x: np.array([[1., 1.]]),
+            m: np.array([[0.1, 0.1, 0.1, 0.1]])
+        })
+        # The numbers in results were not calculated, this is just a smoke test.
+        self.assertAllClose(res, [[0.175991, 0.175991, 0.13248, 0.13248]])
+        self.assertEqual(len(multi_rnn_cell.weights), 2 * 4)
+        self.assertTrue(
+            [x.dtype == dtypes.float32 for x in multi_rnn_cell.weights])
+
+  @test_util.run_v1_only("b/124229375")
+  def testMultiRNNCellWithStateTuple(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m_bad = array_ops.zeros([1, 4])
+        m_good = (array_ops.zeros([1, 2]), array_ops.zeros([1, 2]))
+
+        # Test incorrectness of state
+        with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"):
+          rnn_cell_impl.MultiRNNCell(
+              [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+              state_is_tuple=True)(x, m_bad)
+
+        _, ml = rnn_cell_impl.MultiRNNCell(
+            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+            state_is_tuple=True)(x, m_good)
+
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(
+            ml, {
+                x: np.array([[1., 1.]]),
+                m_good[0]: np.array([[0.1, 0.1]]),
+                m_good[1]: np.array([[0.1, 0.1]])
+            })
+
+        # The numbers in results were not calculated, this is just a
+        # smoke test.  However, these numbers should match those of
+        # the test testMultiRNNCell.
+        self.assertAllClose(res[0], [[0.175991, 0.175991]])
+        self.assertAllClose(res[1], [[0.13248, 0.13248]])
+
+  @parameterized.parameters(
+      [[rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2],
+       [rnn_cell_impl.ResidualWrapper, rnn_cell_impl.ResidualWrapperV2]])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperKerasStyle(self, wrapper, wrapper_v2):
+    """Tests if wrapper cell is instantiated in keras style scope."""
+    wrapped_cell_v2 = wrapper_v2(rnn_cell_impl.BasicRNNCell(1))
+    self.assertTrue(wrapped_cell_v2._keras_style)
+
+    wrapped_cell = wrapper(rnn_cell_impl.BasicRNNCell(1))
+    self.assertFalse(wrapped_cell._keras_style)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperV2VariableNames(self, wrapper):
+    """Tests that variables names do not depend on wrapper in RNN layer."""
+
+    def _rnn_input(apply_wrapper, name):
+      """Creates a RNN layer with/without wrapper and returns built rnn cell."""
+      with base_layer.keras_style_scope():
+        base_cell = rnn_cell_impl.MultiRNNCell(
+            [rnn_cell_impl.BasicRNNCell(1, name="basic_rnn_cell")
+             for _ in range(2)])
+      if apply_wrapper:
+        rnn_cell = wrapper(base_cell)
+      else:
+        rnn_cell = base_cell
+      rnn_layer = keras_layers.RNN(rnn_cell, name=name)
+      inputs = ops.convert_to_tensor([[[1]]], dtype=dtypes.float32)
+      _ = rnn_layer(inputs)
+      return base_cell._cells[0]
+
+    rnn_1 = _rnn_input(True, name="rnn_0")
+    rnn_2 = _rnn_input(False, name="rnn_1")
+
+    for i, cell in enumerate([rnn_1, rnn_2]):
+      var_prefix = "rnn_{}/cell_0/basic_rnn_cell/".format(i)
+      self.assertCountEqual([v.name for v in cell.weights],
+                            (var_prefix + "kernel:0", var_prefix + "bias:0"))
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperWeights(self, wrapper):
+    """Tests that wrapper weights contain wrapped cells weights."""
+
+    with base_layer.keras_style_scope():
+      base_cell = rnn_cell_impl.BasicRNNCell(1, name="basic_rnn_cell")
+    rnn_cell = wrapper(base_cell)
+    rnn_layer = keras_layers.RNN(rnn_cell)
+    inputs = ops.convert_to_tensor([[[1]]], dtype=dtypes.float32)
+    rnn_layer(inputs)
+
+    expected_weights = ["rnn/" + var for var in ("kernel:0", "bias:0")]
+    self.assertEqual(len(rnn_cell.weights), 2)
+    self.assertCountEqual([v.name for v in rnn_cell.weights], expected_weights)
+    self.assertCountEqual([v.name for v in rnn_cell.trainable_variables],
+                          expected_weights)
+    self.assertCountEqual([v.name for v in rnn_cell.non_trainable_variables],
+                          [])
+    self.assertCountEqual([v.name for v in rnn_cell._cell.weights],
+                          expected_weights)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperV2Caller(self, wrapper):
+    """Tests that wrapper V2 is using the LayerRNNCell's caller."""
+
+    with base_layer.keras_style_scope():
+      base_cell = rnn_cell_impl.MultiRNNCell(
+          [rnn_cell_impl.BasicRNNCell(1) for _ in range(2)])
+    rnn_cell = wrapper(base_cell)
+    inputs = ops.convert_to_tensor([[1]], dtype=dtypes.float32)
+    state = ops.convert_to_tensor([[1]], dtype=dtypes.float32)
+    _ = rnn_cell(inputs, [state, state])
+    weights = base_cell._cells[0].weights
+    self.assertLen(weights, expected_len=2)
+    self.assertTrue(all(["_wrapper" in v.name for v in weights]))
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperV2Build(self, wrapper):
+    cell = rnn_cell_impl.LSTMCell(10)
+    wrapper = wrapper(cell)
+    wrapper.build((1,))
+    self.assertTrue(cell.built)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class DropoutWrapperTest(test.TestCase, parameterized.TestCase):
+
+  def _testDropoutWrapper(self,
+                          batch_size=None,
+                          time_steps=None,
+                          parallel_iterations=None,
+                          wrapper_type=None,
+                          scope="root",
+                          **kwargs):
+    if batch_size is None and time_steps is None:
+      # 2 time steps, batch size 1, depth 3
+      batch_size = 1
+      time_steps = 2
+      x = constant_op.constant(
+          [[[2., 2., 2.]], [[1., 1., 1.]]], dtype=dtypes.float32)
+      m = rnn_cell_impl.LSTMStateTuple(
+          *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32)] * 2)
+    else:
+      x = constant_op.constant(
+          np.random.randn(time_steps, batch_size, 3).astype(np.float32))
+      m = rnn_cell_impl.LSTMStateTuple(*[
+          constant_op.
+          constant([[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32)] * 2)
+    outputs, final_state = rnn.dynamic_rnn(
+        cell=wrapper_type(
+            rnn_cell_impl.LSTMCell(
+                3, initializer=init_ops.constant_initializer(0.5)),
+            dtype=x.dtype, **kwargs),
+        time_major=True,
+        parallel_iterations=parallel_iterations,
+        inputs=x,
+        initial_state=m,
+        scope=scope)
+    self.evaluate([variables_lib.global_variables_initializer()])
+    res = self.evaluate([outputs, final_state])
+    self.assertEqual(res[0].shape, (time_steps, batch_size, 3))
+    self.assertEqual(res[1].c.shape, (batch_size, 3))
+    self.assertEqual(res[1].h.shape, (batch_size, 3))
+    return res
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperProperties(self, wrapper_type):
+    cell = rnn_cell_impl.BasicRNNCell(10)
+    wrapper = wrapper_type(cell)
+    # Github issue 15810
+    self.assertEqual(wrapper.wrapped_cell, cell)
+    self.assertEqual(wrapper.state_size, 10)
+    self.assertEqual(wrapper.output_size, 10)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperZeroState(self, wrapper_type):
+    class _Cell(rnn_cell_impl.BasicRNNCell):
+
+      def zero_state(self, batch_size=None, dtype=None):
+        return "wrapped_cell_zero_state"
+    wrapper = wrapper_type(_Cell(10))
+    self.assertEqual(wrapper.zero_state(10, dtypes.float32),
+                     "wrapped_cell_zero_state")
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepAllConstantInput(self, wrapper_type):
+    keep = array_ops.ones([])
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep,
+        wrapper_type=wrapper_type)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    true_full_final_c = np.array(
+        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
+    self.assertAllClose(true_full_output, res[0])
+    self.assertAllClose(true_full_output[1], res[1].h)
+    self.assertAllClose(true_full_final_c, res[1].c)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepAll(self, wrapper_type):
+    keep = variable_scope.get_variable("all", initializer=1.0)
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep,
+        wrapper_type=wrapper_type)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    true_full_final_c = np.array(
+        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
+    self.assertAllClose(true_full_output, res[0])
+    self.assertAllClose(true_full_output[1], res[1].h)
+    self.assertAllClose(true_full_final_c, res[1].c)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperWithSeed(self, wrapper_type):
+    keep_some = 0.5
+    random_seed.set_random_seed(2)
+    ## Use parallel_iterations = 1 in both calls to
+    ## _testDropoutWrapper to ensure the (per-time step) dropout is
+    ## consistent across both calls.  Otherwise the seed may not end
+    ## up being munged consistently across both graphs.
+    res_standard_1 = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        seed=10,
+        parallel_iterations=1,
+        wrapper_type=wrapper_type,
+        scope="root_1")
+    random_seed.set_random_seed(2)
+    res_standard_2 = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        seed=10,
+        parallel_iterations=1,
+        wrapper_type=wrapper_type,
+        scope="root_2")
+    self.assertAllClose(res_standard_1[0], res_standard_2[0])
+    self.assertAllClose(res_standard_1[1].c, res_standard_2[1].c)
+    self.assertAllClose(res_standard_1[1].h, res_standard_2[1].h)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepNoOutput(self, wrapper_type):
+    keep_all = variable_scope.get_variable("all", initializer=1.0)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_none,
+        state_keep_prob=keep_all,
+        wrapper_type=wrapper_type)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    true_full_final_c = np.array(
+        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
+    self.assertAllClose(np.zeros(res[0].shape), res[0])
+    self.assertAllClose(true_full_output[1], res[1].h)
+    self.assertAllClose(true_full_final_c, res[1].c)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepNoStateExceptLSTMCellMemory(self, wrapper_type):
+    keep_all = variable_scope.get_variable("all", initializer=1.0)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
+    # Even though we dropout state, by default DropoutWrapper never
+    # drops out the memory ("c") term of an LSTMStateTuple.
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_all,
+        state_keep_prob=keep_none,
+        wrapper_type=wrapper_type)
+    true_c_state = np.array([[1.713925, 1.713925, 1.713925]], dtype=np.float32)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    self.assertAllClose(true_full_output[0], res[0][0])
+    # Second output is modified by zero input state
+    self.assertGreater(np.linalg.norm(true_full_output[1] - res[0][1]), 1e-4)
+    # h state has been set to zero
+    self.assertAllClose(np.zeros(res[1].h.shape), res[1].h)
+    # c state of an LSTMStateTuple is NEVER modified.
+    self.assertAllClose(true_c_state, res[1].c)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepNoInput(self, wrapper_type):
+    keep_all = variable_scope.get_variable("all", initializer=1.0)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    true_full_final_c = np.array(
+        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
+    # All outputs are different because inputs are zeroed out
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_none,
+        output_keep_prob=keep_all,
+        state_keep_prob=keep_all,
+        wrapper_type=wrapper_type)
+    self.assertGreater(np.linalg.norm(res[0] - true_full_output), 1e-4)
+    self.assertGreater(np.linalg.norm(res[1].h - true_full_output[1]), 1e-4)
+    self.assertGreater(np.linalg.norm(res[1].c - true_full_final_c), 1e-4)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperRecurrentOutput(self, wrapper_type):
+    keep_some = 0.8
+    keep_all = variable_scope.get_variable("all", initializer=1.0)
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_all,
+        variational_recurrent=True,
+        wrapper_type=wrapper_type,
+        input_size=3,
+        batch_size=5,
+        time_steps=7)
+    # Ensure the same dropout pattern for all time steps
+    output_mask = np.abs(res[0]) > 1e-6
+    for m in output_mask[1:]:
+      self.assertAllClose(output_mask[0], m)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperRecurrentStateInputAndOutput(self, wrapper_type):
+    keep_some = 0.9
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        wrapper_type=wrapper_type,
+        input_size=3,
+        batch_size=5,
+        time_steps=7)
+
+    # Smoke test for the state/input masks.
+    output_mask = np.abs(res[0]) > 1e-6
+    for time_step in output_mask:
+      # Ensure the same dropout output pattern for all time steps
+      self.assertAllClose(output_mask[0], time_step)
+      for batch_entry in time_step:
+        # Assert all batch entries get the same mask
+        self.assertAllClose(batch_entry, time_step[0])
+
+    # For state, ensure all batch entries have the same mask
+    state_c_mask = np.abs(res[1].c) > 1e-6
+    state_h_mask = np.abs(res[1].h) > 1e-6
+    for batch_entry in state_c_mask:
+      self.assertAllClose(batch_entry, state_c_mask[0])
+    for batch_entry in state_h_mask:
+      self.assertAllClose(batch_entry, state_h_mask[0])
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperRecurrentStateInputAndOutputWithSeed(
+      self, wrapper_type):
+    keep_some = 0.9
+    random_seed.set_random_seed(2347)
+    np.random.seed(23487)
+    res0 = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        wrapper_type=wrapper_type,
+        input_size=3,
+        batch_size=5,
+        time_steps=7,
+        seed=-234987,
+        scope="root_0")
+    random_seed.set_random_seed(2347)
+    np.random.seed(23487)
+    res1 = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        wrapper_type=wrapper_type,
+        input_size=3,
+        batch_size=5,
+        time_steps=7,
+        seed=-234987,
+        scope="root_1")
+
+    output_mask = np.abs(res0[0]) > 1e-6
+    for time_step in output_mask:
+      # Ensure the same dropout output pattern for all time steps
+      self.assertAllClose(output_mask[0], time_step)
+      for batch_entry in time_step:
+        # Assert all batch entries get the same mask
+        self.assertAllClose(batch_entry, time_step[0])
+
+    # For state, ensure all batch entries have the same mask
+    state_c_mask = np.abs(res0[1].c) > 1e-6
+    state_h_mask = np.abs(res0[1].h) > 1e-6
+    for batch_entry in state_c_mask:
+      self.assertAllClose(batch_entry, state_c_mask[0])
+    for batch_entry in state_h_mask:
+      self.assertAllClose(batch_entry, state_h_mask[0])
+
+    # Ensure seeded calculation is identical.
+    self.assertAllClose(res0[0], res1[0])
+    self.assertAllClose(res0[1].c, res1[1].c)
+    self.assertAllClose(res0[1].h, res1[1].h)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/scan_ops_test.py b/tensorflow/python/kernel_tests/scan_ops_test.py
index 4e15894fb4aa8a90d8dd9914ba25dcfd27d5fe95..2a3021f982149f619eef2d32edd7929e0c8b7603 100644
--- a/tensorflow/python/kernel_tests/scan_ops_test.py
+++ b/tensorflow/python/kernel_tests/scan_ops_test.py
@@ -70,7 +70,6 @@ def handle_options(func, x, axis, exclusive, reverse):
   return x
 
 
-@test_util.disable_all_xla("This test never passed for XLA")
 class CumsumTest(test.TestCase):
 
   valid_dtypes = [
@@ -135,6 +134,7 @@ class CumsumTest(test.TestCase):
         self._compareAll(x, axis)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/123860949")  # The computation is constant folded
   def testLarge(self):
     for dtype in self.valid_dtypes:
       x = np.ones([1000000], dtype=dtype) / 1024
@@ -194,7 +194,6 @@ class CumsumTest(test.TestCase):
           self._compareGradient([5, 10], axis, exclusive, reverse)
 
 
-@test_util.disable_all_xla("This test never passed for XLA")
 class CumprodTest(test.TestCase):
 
   valid_dtypes = [
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 88f7b27b77ee24c732b84a674587b63638b2c903..5bc301b61360584969e391b093a3f488dec06925 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -296,7 +296,7 @@ class StatefulScatterNdTest(test.TestCase):
                                     updates).get_shape().as_list(), shape)
 
   @test_util.run_v1_only("b/120545219")
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testResVarInvalidOutputShape(self):
     res = variables.Variable(
         initial_value=lambda: array_ops.zeros(shape=[], dtype=dtypes.float32),
diff --git a/tensorflow/python/kernel_tests/signal/BUILD b/tensorflow/python/kernel_tests/signal/BUILD
index 554bf38029473bb9ff204a09556a182b378dd549..4caecc85ca5a1ab72648e015666a5666cf5335ab 100644
--- a/tensorflow/python/kernel_tests/signal/BUILD
+++ b/tensorflow/python/kernel_tests/signal/BUILD
@@ -29,6 +29,7 @@ cuda_py_tests(
         "//tensorflow/python:spectral_ops_test_util",
         "//tensorflow/python/ops/signal",
     ],
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -45,7 +46,10 @@ cuda_py_tests(
         "//tensorflow/python/ops/signal",
     ],
     shard_count = 4,
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -73,6 +77,7 @@ cuda_py_tests(
         "//tensorflow/python/ops/signal",
         "//tensorflow/python:spectral_ops_test_util",
     ],
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -130,7 +135,10 @@ cuda_py_tests(
         "//tensorflow/python:spectral_ops_test_util",
         "//tensorflow/python/ops/signal",
     ],
-    tags = ["nomac"],
+    tags = [
+        "no_rocm",
+        "nomac",
+    ],
     xla_enable_strict_auto_jit = True,
 )
 
diff --git a/tensorflow/python/kernel_tests/signal/dct_ops_test.py b/tensorflow/python/kernel_tests/signal/dct_ops_test.py
index e698afde635d58e159340bac20bdd12d8cf3e711..a3ac15bab8a7b8223bd1ea085386b965b7fdd62e 100644
--- a/tensorflow/python/kernel_tests/signal/dct_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/dct_ops_test.py
@@ -134,7 +134,6 @@ class DCTOpsTest(parameterized.TestCase, test.TestCase):
   @parameterized.parameters([
       [[2]], [[3]], [[10]], [[2, 20]], [[2, 3, 25]]])
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def test_random(self, shape):
     """Test randomly generated batches of data."""
     with spectral_ops_test_util.fft_kernel_label_map():
diff --git a/tensorflow/python/kernel_tests/signal/fft_ops_test.py b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
index f3bee87a65f484788aa027fd51f7d006c5dcf24f..4577587fa4e05b1e2fe3353c4a12ac37744bddb6 100644
--- a/tensorflow/python/kernel_tests/signal/fft_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
@@ -159,7 +159,6 @@ class FFTOpsTest(BaseFFTOpsTest):
       raise ValueError("invalid rank")
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testEmpty(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type in (np.complex64, np.complex128):
@@ -170,7 +169,6 @@ class FFTOpsTest(BaseFFTOpsTest):
             self.assertEqual(x.shape, self._tfIFFT(x, rank).shape)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testBasic(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type, tol in ((np.complex64, 1e-4), (np.complex128, 1e-8)):
@@ -180,7 +178,6 @@ class FFTOpsTest(BaseFFTOpsTest):
                 np.mod(np.arange(np.power(4, dims)), 10).reshape(
                     (4,) * dims).astype(np_type), rank, rtol=tol, atol=tol)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testLargeBatch(self):
     if test.is_gpu_available(cuda_only=True):
       rank = 1
@@ -212,7 +209,6 @@ class FFTOpsTest(BaseFFTOpsTest):
                 rank, use_placeholder=True, rtol=tol, atol=tol)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testRandom(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type, tol in ((np.complex64, 1e-4), (np.complex128, 5e-6)):
@@ -228,7 +224,6 @@ class FFTOpsTest(BaseFFTOpsTest):
                           rtol=tol, atol=tol)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testRandom1D(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type in (np.complex64, np.complex128):
@@ -345,7 +340,6 @@ class RFFTOpsTest(BaseFFTOpsTest):
       raise ValueError("invalid rank")
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testEmpty(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -356,7 +350,6 @@ class RFFTOpsTest(BaseFFTOpsTest):
           self.assertEqual(x.shape, self._tfIFFT(x, rank).shape)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testBasic(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -371,7 +364,6 @@ class RFFTOpsTest(BaseFFTOpsTest):
             self._compareBackward(
                 c2r.astype(np.complex64), rank, (size,) * rank)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testLargeBatch(self):
     if test.is_gpu_available(cuda_only=True):
       rank = 1
@@ -405,7 +397,6 @@ class RFFTOpsTest(BaseFFTOpsTest):
                 rank, (size,) * rank,
                 use_placeholder=True)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testFftLength(self):
     if test.is_gpu_available(cuda_only=True):
       with spectral_ops_test_util.fft_kernel_label_map():
@@ -449,7 +440,6 @@ class RFFTOpsTest(BaseFFTOpsTest):
                   use_placeholder=True)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testRandom(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       def gen_real(shape):
@@ -475,7 +465,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
                 gen_complex(complex_dims), rank, (size,) * rank)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123738986")  # More assertions needed.
   def testError(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
diff --git a/tensorflow/python/kernel_tests/signal/spectral_ops_test.py b/tensorflow/python/kernel_tests/signal/spectral_ops_test.py
index a72a836592667889b7c54665c2277eaffd40846c..7b9748c7f260b60d7322a6de68e35970513ac969 100644
--- a/tensorflow/python/kernel_tests/signal/spectral_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/spectral_ops_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
@@ -116,7 +115,6 @@ class SpectralOpsTest(test.TestCase):
       self.assertAllClose(
           expected_inverse_stft, actual_inverse_stft, 1e-4, 1e-4)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def test_shapes(self):
     with spectral_ops_test_util.fft_kernel_label_map(), (
         self.session(use_gpu=True)):
@@ -152,7 +150,6 @@ class SpectralOpsTest(test.TestCase):
       self.assertAllEqual([256], inverse_stft.shape.as_list())
       self.assertAllEqual([expected_length], self.evaluate(inverse_stft).shape)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def test_stft_and_inverse_stft(self):
     """Test that spectral_ops.stft/inverse_stft match a NumPy implementation."""
     # Tuples of (signal_length, frame_length, frame_step, fft_length).
diff --git a/tensorflow/python/kernel_tests/spacetodepth_op_test.py b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
index 7f3c381fa161bd59b1956e880d82e62d6b051b25..69243afb69c90b53d9e470a2d81f4067d1c2191e 100644
--- a/tensorflow/python/kernel_tests/spacetodepth_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
@@ -285,7 +285,7 @@ class SpaceToDepthTest(test.TestCase):
       actual_vals, expected_vals = self.evaluate([actual, expected])
       self.assertTrue(np.array_equal(actual_vals, expected_vals))
 
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123553551")  # Unsupported data format
   def testAgainstTranspose(self):
     self.compareToTranspose(3, 2, 3, 1, 2, "NHWC", False)
     self.compareToTranspose(1, 2, 3, 2, 2, "NHWC", False)
diff --git a/tensorflow/python/kernel_tests/split_op_test.py b/tensorflow/python/kernel_tests/split_op_test.py
index 80004db833c50ef460ed16237f4f775eb80b6877..42b4d1b778296e936def71e5f347ed5019faae26 100644
--- a/tensorflow/python/kernel_tests/split_op_test.py
+++ b/tensorflow/python/kernel_tests/split_op_test.py
@@ -373,7 +373,7 @@ class SplitOpTest(test.TestCase):
     assert s1.shape.as_list() == [1]
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testNonexistentDimTensor(self):
     x = array_ops.placeholder(dtypes.int32)
     values = np.zeros([5, 30])
diff --git a/tensorflow/python/kernel_tests/stack_op_test.py b/tensorflow/python/kernel_tests/stack_op_test.py
index ca3357a0ed8f87cfcccd08a62c5b8526a898b664..04d635cdb1e6f35db10193c74623b5aa1013ee9d 100644
--- a/tensorflow/python/kernel_tests/stack_op_test.py
+++ b/tensorflow/python/kernel_tests/stack_op_test.py
@@ -81,7 +81,7 @@ class StackOpTest(test.TestCase):
     np.random.seed(7)
     with self.session(use_gpu=True):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
-        for dtype in [np.bool, np.float32, np.int32, np.int64]:
+        for dtype in [np.bool, np.float32, np.int16, np.int32, np.int64]:
           data = np.random.randn(*shape).astype(dtype)
           # Stack back into a single tensorflow tensor directly using np array
           c = array_ops.stack(data)
diff --git a/tensorflow/python/kernel_tests/summary_ops_test.py b/tensorflow/python/kernel_tests/summary_ops_test.py
index cd446eb40eb9ff1931a3eb4555f9dd81a77b659f..350e51a2120638b9a5ea12cbe1606ac21a6f00cb 100644
--- a/tensorflow/python/kernel_tests/summary_ops_test.py
+++ b/tensorflow/python/kernel_tests/summary_ops_test.py
@@ -19,22 +19,32 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import time
+import unittest
 
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.core.framework import step_stats_pb2
 from tensorflow.core.framework import summary_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.util import event_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import tf_record
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import summary_ops_v2 as summary_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 
 
-class SummaryOpsTest(test_util.TensorFlowTestCase):
+class SummaryOpsCoreTest(test_util.TensorFlowTestCase):
 
   def testWrite(self):
     logdir = self.get_temp_dir()
@@ -136,44 +146,101 @@ class SummaryOpsTest(test_util.TensorFlowTestCase):
     with context.eager_mode():
       self.assertFalse(summary_ops.write('tag', 42, step=0))
 
-  def testWrite_shouldRecordSummaries(self):
+  def testWrite_recordIf_constant(self):
     logdir = self.get_temp_dir()
     with context.eager_mode():
       with summary_ops.create_file_writer(logdir).as_default():
-        self.assertTrue(summary_ops.write('default_on', 1, step=0))
-        with summary_ops.always_record_summaries():
+        self.assertTrue(summary_ops.write('default', 1, step=0))
+        with summary_ops.record_if(True):
           self.assertTrue(summary_ops.write('set_on', 1, step=0))
-        with summary_ops.never_record_summaries():
+        with summary_ops.record_if(False):
           self.assertFalse(summary_ops.write('set_off', 1, step=0))
     events = events_from_logdir(logdir)
     self.assertEqual(3, len(events))
-    self.assertEqual('default_on', events[1].summary.value[0].tag)
+    self.assertEqual('default', events[1].summary.value[0].tag)
     self.assertEqual('set_on', events[2].summary.value[0].tag)
 
-  def testWrite_shouldRecordSummaries_fromFunction(self):
+  def testWrite_recordIf_constant_fromFunction(self):
     logdir = self.get_temp_dir()
     @def_function.function
-    def f(tag_prefix):
+    def f():
+      with summary_ops.create_file_writer(logdir).as_default():
+        # Use assertAllEqual instead of assertTrue since it works in a defun.
+        self.assertAllEqual(summary_ops.write('default', 1, step=0), True)
+        with summary_ops.record_if(True):
+          self.assertAllEqual(summary_ops.write('set_on', 1, step=0), True)
+        with summary_ops.record_if(False):
+          self.assertAllEqual(summary_ops.write('set_off', 1, step=0), False)
+    with context.eager_mode():
+      f()
+    events = events_from_logdir(logdir)
+    self.assertEqual(3, len(events))
+    self.assertEqual('default', events[1].summary.value[0].tag)
+    self.assertEqual('set_on', events[2].summary.value[0].tag)
+
+  def testWrite_recordIf_callable(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      step = variables.Variable(-1, dtype=dtypes.int64)
+      def record_fn():
+        step.assign_add(1)
+        return int(step % 2) == 0
+      with summary_ops.create_file_writer(logdir).as_default():
+        with summary_ops.record_if(record_fn):
+          self.assertTrue(summary_ops.write('tag', 1, step=step))
+          self.assertFalse(summary_ops.write('tag', 1, step=step))
+          self.assertTrue(summary_ops.write('tag', 1, step=step))
+          self.assertFalse(summary_ops.write('tag', 1, step=step))
+          self.assertTrue(summary_ops.write('tag', 1, step=step))
+    events = events_from_logdir(logdir)
+    self.assertEqual(4, len(events))
+    self.assertEqual(0, events[1].step)
+    self.assertEqual(2, events[2].step)
+    self.assertEqual(4, events[3].step)
+
+  def testWrite_recordIf_callable_fromFunction(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      step = variables.Variable(-1, dtype=dtypes.int64)
+      @def_function.function
+      def record_fn():
+        step.assign_add(1)
+        return math_ops.equal(step % 2, 0)
+      @def_function.function
+      def f():
+        with summary_ops.create_file_writer(logdir).as_default():
+          with summary_ops.record_if(record_fn):
+            return [
+                summary_ops.write('tag', 1, step=step),
+                summary_ops.write('tag', 1, step=step),
+                summary_ops.write('tag', 1, step=step)]
+      self.assertAllEqual(f(), [True, False, True])
+      self.assertAllEqual(f(), [False, True, False])
+    events = events_from_logdir(logdir)
+    self.assertEqual(4, len(events))
+    self.assertEqual(0, events[1].step)
+    self.assertEqual(2, events[2].step)
+    self.assertEqual(4, events[3].step)
+
+  def testWrite_recordIf_tensorInput_fromFunction(self):
+    logdir = self.get_temp_dir()
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(shape=[], dtype=dtypes.int64)])
+    def f(step):
       with summary_ops.create_file_writer(logdir).as_default():
-        default_output = summary_ops.write(tag_prefix + '_default', 1, step=0)
-        with summary_ops.always_record_summaries():
-          on_output = summary_ops.write(tag_prefix + '_on', 1, step=0)
-        with summary_ops.never_record_summaries():
-          off_output = summary_ops.write(tag_prefix + '_off', 1, step=0)
-        return [default_output, on_output, off_output]
+        with summary_ops.record_if(math_ops.equal(step % 2, 0)):
+          return summary_ops.write('tag', 1, step=step)
     with context.eager_mode():
-      self.assertAllEqual([True, True, False], f('default'))
-      with summary_ops.always_record_summaries():
-        self.assertAllEqual([True, True, False], f('on'))
-      with summary_ops.never_record_summaries():
-        self.assertAllEqual([False, True, False], f('off'))
+      self.assertTrue(f(0))
+      self.assertFalse(f(1))
+      self.assertTrue(f(2))
+      self.assertFalse(f(3))
+      self.assertTrue(f(4))
     events = events_from_logdir(logdir)
-    self.assertEqual(6, len(events))
-    self.assertEqual('default_default', events[1].summary.value[0].tag)
-    self.assertEqual('default_on', events[2].summary.value[0].tag)
-    self.assertEqual('on_default', events[3].summary.value[0].tag)
-    self.assertEqual('on_on', events[4].summary.value[0].tag)
-    self.assertEqual('off_on', events[5].summary.value[0].tag)
+    self.assertEqual(4, len(events))
+    self.assertEqual(0, events[1].step)
+    self.assertEqual(2, events[2].step)
+    self.assertEqual(4, events[3].step)
 
   @test_util.also_run_as_tf_function
   def testSummaryScope(self):
@@ -223,6 +290,301 @@ class SummaryOpsTest(test_util.TensorFlowTestCase):
       self.assertEqual('with/slash', tag)
 
 
+class SummaryWriterTest(test_util.TensorFlowTestCase):
+
+  def testWriterInitAndClose(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=1000, flush_millis=1000000)
+      files = gfile.Glob(os.path.join(logdir, '*'))
+      self.assertEqual(1, len(files))
+      file1 = files[0]
+      self.assertEqual(1, len(events_from_file(file1)))  # file_version Event
+      # Calling init() again while writer is open has no effect
+      writer.init()
+      self.assertEqual(1, len(events_from_file(file1)))
+      with writer.as_default():
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(1, len(events_from_file(file1)))
+        # Calling .close() should do an implicit flush
+        writer.close()
+        self.assertEqual(2, len(events_from_file(file1)))
+        # Calling init() on a closed writer should start a new file
+        time.sleep(1.1)  # Ensure filename has a different timestamp
+        writer.init()
+        files = gfile.Glob(os.path.join(logdir, '*'))
+        self.assertEqual(2, len(files))
+        files.remove(file1)
+        file2 = files[0]
+        self.assertEqual(1, len(events_from_file(file2)))  # file_version
+        self.assertEqual(2, len(events_from_file(file1)))  # should be unchanged
+
+  def testSharedName(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      # Create with default shared name (should match logdir)
+      writer1 = summary_ops.create_file_writer(logdir)
+      with writer1.as_default():
+        summary_ops.write('tag', 1, step=1)
+        summary_ops.flush()
+      # Create with explicit logdir shared name (should be same resource/file)
+      shared_name = 'logdir:' + logdir
+      writer2 = summary_ops.create_file_writer(logdir, name=shared_name)
+      with writer2.as_default():
+        summary_ops.write('tag', 1, step=2)
+        summary_ops.flush()
+      # Create with different shared name (should be separate resource/file)
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      writer3 = summary_ops.create_file_writer(logdir, name='other')
+      with writer3.as_default():
+        summary_ops.write('tag', 1, step=3)
+        summary_ops.flush()
+
+    event_files = iter(sorted(gfile.Glob(os.path.join(logdir, '*'))))
+
+    # First file has tags "one" and "two"
+    events = iter(events_from_file(next(event_files)))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual(1, next(events).step)
+    self.assertEqual(2, next(events).step)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Second file has tag "three"
+    events = iter(events_from_file(next(event_files)))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual(3, next(events).step)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_files))
+
+  def testMaxQueue(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      with summary_ops.create_file_writer(
+          logdir, max_queue=1, flush_millis=999999).as_default():
+        get_total = lambda: len(events_from_logdir(logdir))
+        # Note: First tf.Event is always file_version.
+        self.assertEqual(1, get_total())
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(1, get_total())
+        # Should flush after second summary since max_queue = 1
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(3, get_total())
+
+  def testWriterFlush(self):
+    logdir = self.get_temp_dir()
+    get_total = lambda: len(events_from_logdir(logdir))
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=1000, flush_millis=1000000)
+      self.assertEqual(1, get_total())  # file_version Event
+      with writer.as_default():
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(1, get_total())
+        writer.flush()
+        self.assertEqual(2, get_total())
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(2, get_total())
+      # Exiting the "as_default()" should do an implicit flush
+      self.assertEqual(3, get_total())
+
+  def testFlushFunction(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=999999, flush_millis=999999)
+      with writer.as_default(), summary_ops.always_record_summaries():
+        get_total = lambda: len(events_from_logdir(logdir))
+        # Note: First tf.Event is always file_version.
+        self.assertEqual(1, get_total())
+        summary_ops.write('tag', 1, step=0)
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(1, get_total())
+        summary_ops.flush()
+        self.assertEqual(3, get_total())
+        # Test "writer" parameter
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(3, get_total())
+        summary_ops.flush(writer=writer)
+        self.assertEqual(4, get_total())
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(4, get_total())
+        summary_ops.flush(writer=writer._resource)  # pylint:disable=protected-access
+        self.assertEqual(5, get_total())
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  def testEagerMemory(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.create_file_writer(logdir).as_default():
+      summary_ops.write('tag', 1, step=0)
+
+  def testClose_closesOpenFile(self):
+    try:
+      import psutil  # pylint: disable=g-import-not-at-top
+    except ImportError:
+      raise unittest.SkipTest('test requires psutil')
+    proc = psutil.Process()
+    get_open_filenames = lambda: set(info[0] for info in proc.open_files())
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer(logdir)
+      files = gfile.Glob(os.path.join(logdir, '*'))
+      self.assertEqual(1, len(files))
+      eventfile = files[0]
+      self.assertIn(eventfile, get_open_filenames())
+      writer.close()
+      self.assertNotIn(eventfile, get_open_filenames())
+
+  def testDereference_closesOpenFile(self):
+    try:
+      import psutil  # pylint: disable=g-import-not-at-top
+    except ImportError:
+      raise unittest.SkipTest('test requires psutil')
+    proc = psutil.Process()
+    get_open_filenames = lambda: set(info[0] for info in proc.open_files())
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer(logdir)
+      files = gfile.Glob(os.path.join(logdir, '*'))
+      self.assertEqual(1, len(files))
+      eventfile = files[0]
+      self.assertIn(eventfile, get_open_filenames())
+      del writer
+      self.assertNotIn(eventfile, get_open_filenames())
+
+
+class SummaryOpsTest(test_util.TensorFlowTestCase):
+
+  def run_metadata(self, *args, **kwargs):
+    assert context.executing_eagerly()
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir)
+    with writer.as_default():
+      summary_ops.run_metadata(*args, **kwargs)
+    writer.close()
+    events = events_from_logdir(logdir)
+    return events[1].summary
+
+  def run_metadata_graphs(self, *args, **kwargs):
+    assert context.executing_eagerly()
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir)
+    with writer.as_default():
+      summary_ops.run_metadata_graphs(*args, **kwargs)
+    writer.close()
+    events = events_from_logdir(logdir)
+    return events[1].summary
+
+  def create_run_metadata(self):
+    step_stats = step_stats_pb2.StepStats(dev_stats=[
+        step_stats_pb2.DeviceStepStats(
+            device='cpu:0',
+            node_stats=[step_stats_pb2.NodeExecStats(node_name='hello')])
+    ])
+    return config_pb2.RunMetadata(
+        function_graphs=[
+            config_pb2.RunMetadata.FunctionGraphs(
+                pre_optimization_graph=graph_pb2.GraphDef(
+                    node=[node_def_pb2.NodeDef(name='foo')]))
+        ],
+        step_stats=step_stats)
+
+  @test_util.run_v2_only
+  def testRunMetadata_usesNameAsTag(self):
+    meta = config_pb2.RunMetadata()
+
+    with ops.name_scope('foo'):
+      summary = self.run_metadata(name='my_name', data=meta, step=1)
+      first_val = summary.value[0]
+
+    self.assertEqual('foo/my_name', first_val.tag)
+
+  @test_util.run_v2_only
+  def testRunMetadata_summaryMetadata(self):
+    expected_summary_metadata = """
+      plugin_data {
+        plugin_name: "graph_run_metadata"
+        content: "1"
+      }
+    """
+    meta = config_pb2.RunMetadata()
+    summary = self.run_metadata(name='my_name', data=meta, step=1)
+    actual_summary_metadata = summary.value[0].metadata
+    self.assertProtoEquals(expected_summary_metadata, actual_summary_metadata)
+
+  @test_util.run_v2_only
+  def testRunMetadata_wholeRunMetadata(self):
+    expected_run_metadata = """
+      step_stats {
+        dev_stats {
+          device: "cpu:0"
+          node_stats {
+            node_name: "hello"
+          }
+        }
+      }
+      function_graphs {
+        pre_optimization_graph {
+          node {
+            name: "foo"
+          }
+        }
+      }
+    """
+    meta = self.create_run_metadata()
+    summary = self.run_metadata(name='my_name', data=meta, step=1)
+    first_val = summary.value[0]
+
+    actual_run_metadata = config_pb2.RunMetadata.FromString(
+        first_val.tensor.string_val[0])
+    self.assertProtoEquals(expected_run_metadata, actual_run_metadata)
+
+  @test_util.run_v2_only
+  def testRunMetadataGraph_usesNameAsTag(self):
+    meta = config_pb2.RunMetadata()
+
+    with ops.name_scope('foo'):
+      summary = self.run_metadata_graphs(name='my_name', data=meta, step=1)
+      first_val = summary.value[0]
+
+    self.assertEqual('foo/my_name', first_val.tag)
+
+  @test_util.run_v2_only
+  def testRunMetadataGraph_summaryMetadata(self):
+    expected_summary_metadata = """
+      plugin_data {
+        plugin_name: "graph_run_metadata_graph"
+        content: "1"
+      }
+    """
+    meta = config_pb2.RunMetadata()
+    summary = self.run_metadata_graphs(name='my_name', data=meta, step=1)
+    actual_summary_metadata = summary.value[0].metadata
+    self.assertProtoEquals(expected_summary_metadata, actual_summary_metadata)
+
+  @test_util.run_v2_only
+  def testRunMetadataGraph_runMetadataFragment(self):
+    expected_run_metadata = """
+      function_graphs {
+        pre_optimization_graph {
+          node {
+            name: "foo"
+          }
+        }
+      }
+    """
+    meta = self.create_run_metadata()
+
+    summary = self.run_metadata_graphs(name='my_name', data=meta, step=1)
+    first_val = summary.value[0]
+
+    actual_run_metadata = config_pb2.RunMetadata.FromString(
+        first_val.tensor.string_val[0])
+    self.assertProtoEquals(expected_run_metadata, actual_run_metadata)
+
+
 def events_from_file(filepath):
   """Returns all events in a single event file.
 
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index 3b2f476e0cb44561380a61ebd09f7e934e4526c6..056e3b9ff88af197995739a10a080b91aac2ee5e 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -1430,6 +1430,19 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([1.0, -1.0], read_val)
       self.assertAllEqual([[2.0, 3.0], [0.0, 0.0]], grad_val)
 
+  def testScatterIntoExistingList(self):
+    ta = tensor_array_ops.TensorArray(
+        dtype=dtypes.float32, tensor_array_name="foo", size=5)
+
+    ta = ta.scatter(indices=[3, 4], value=array_ops.ones([2]))
+    self.assertAllEqual(ta.stack(), [0., 0., 0., 1., 1.])
+
+    ta = ta.scatter(indices=[1], value=array_ops.ones([1]))
+    self.assertAllEqual(ta.stack(), [0., 1., 0., 1., 1.])
+
+    ta = ta.scatter(indices=[0, 2], value=[5., 6.])
+    self.assertAllEqual(ta.stack(), [5., 1., 6., 1., 1.])
+
   @test_util.run_v1_only("b/118890905")
   def testTensorArrayWriteGatherAndGradients(self):
     with self.session(use_gpu=True) as session:
@@ -1513,7 +1526,7 @@ class TensorArrayTest(test.TestCase):
       if "/task:1/" in d:
         self.assertTrue(
             [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
-      else:
+      elif "/host:CPU" not in d:
         self.assertFalse(
             [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
 
diff --git a/tensorflow/python/kernel_tests/tridiagonal_solve_op_test.py b/tensorflow/python/kernel_tests/tridiagonal_solve_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fd603a9547ee6f318f55f5b790953a51e591b0e
--- /dev/null
+++ b/tensorflow/python/kernel_tests/tridiagonal_solve_op_test.py
@@ -0,0 +1,456 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.linalg.linalg_impl.tridiagonal_solve."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.linalg import linalg_impl
+from tensorflow.python.platform import benchmark
+from tensorflow.python.platform import test
+
+_sample_diags = np.array([[2, 1, 4, 0], [1, 3, 2, 2], [0, 1, -1, 1]])
+_sample_rhs = np.array([1, 2, 3, 4])
+_sample_result = np.array([-9, 5, -4, 4])
+
+
+def _tfconst(array):
+  return constant_op.constant(array, dtypes.float64)
+
+
+def _tf_ones(shape):
+  return array_ops.ones(shape, dtype=dtypes.float64)
+
+
+class TridiagonalSolveOpTest(test.TestCase):
+
+  def _test(self,
+            diags,
+            rhs,
+            expected,
+            diags_format="compact",
+            transpose_rhs=False,
+            conjugate_rhs=False):
+    with self.cached_session(use_gpu=False):
+      result = linalg_impl.tridiagonal_solve(diags, rhs, diags_format,
+                                             transpose_rhs, conjugate_rhs)
+      self.assertAllClose(self.evaluate(result), expected)
+
+  def _testWithLists(self,
+                     diags,
+                     rhs,
+                     expected,
+                     diags_format="compact",
+                     transpose_rhs=False,
+                     conjugate_rhs=False):
+    self._test(
+        _tfconst(diags), _tfconst(rhs), _tfconst(expected), diags_format,
+        transpose_rhs, conjugate_rhs)
+
+  def _assertRaises(self, diags, rhs, diags_format="compact"):
+    with self.assertRaises(ValueError):
+      linalg_impl.tridiagonal_solve(diags, rhs, diags_format)
+
+  # Tests with various dtypes
+
+  def testReal(self):
+    for dtype in dtypes.float32, dtypes.float64:
+      self._test(
+          diags=constant_op.constant(_sample_diags, dtype),
+          rhs=constant_op.constant(_sample_rhs, dtype),
+          expected=constant_op.constant(_sample_result, dtype))
+
+  def testComplex(self):
+    for dtype in dtypes.complex64, dtypes.complex128:
+      self._test(
+          diags=constant_op.constant(_sample_diags, dtype) * (1 + 1j),
+          rhs=constant_op.constant(_sample_rhs, dtype) * (1 - 1j),
+          expected=constant_op.constant(_sample_result, dtype) * (1 - 1j) /
+          (1 + 1j))
+
+  # Tests with small matrix sizes
+
+  def test3x3(self):
+    self._testWithLists(
+        diags=[[2, -1, 0], [1, 3, 1], [0, -1, -2]],
+        rhs=[1, 2, 3],
+        expected=[-3, 2, 7])
+
+  def test2x2(self):
+    self._testWithLists(
+        diags=[[2, 0], [1, 3], [0, 1]], rhs=[1, 4], expected=[-5, 3])
+
+  def test1x1(self):
+    self._testWithLists(diags=[[0], [3], [0]], rhs=[6], expected=[2])
+
+  def test0x0(self):
+    self._test(
+        diags=constant_op.constant(0, shape=(3, 0), dtype=dtypes.float32),
+        rhs=constant_op.constant(0, shape=(0, 1), dtype=dtypes.float32),
+        expected=constant_op.constant(0, shape=(0, 1), dtype=dtypes.float32))
+
+  # Other edge cases
+
+  def testCaseRequiringPivoting(self):
+    # Without partial pivoting (e.g. Thomas algorithm) this would fail.
+    self._testWithLists(
+        diags=[[2, -1, 1, 0], [1, 4, 1, -1], [0, 2, -2, 3]],
+        rhs=[1, 2, 3, 4],
+        expected=[8, -3.5, 0, -4])
+
+  def testCaseRequiringPivotingLastRows(self):
+    self._testWithLists(
+        diags=[[2, 1, -1, 0], [1, -1, 2, 1], [0, 1, -6, 1]],
+        rhs=[1, 2, -1, -2],
+        expected=[5, -2, -5, 3])
+
+  def testNotInvertible(self):
+    with self.assertRaises(errors_impl.InvalidArgumentError):
+      self._testWithLists(
+          diags=[[2, -1, 1, 0], [1, 4, 1, -1], [0, 2, 0, 3]],
+          rhs=[1, 2, 3, 4],
+          expected=[8, -3.5, 0, -4])
+
+  def testDiagonal(self):
+    self._testWithLists(
+        diags=[[0, 0, 0, 0], [1, 2, -1, -2], [0, 0, 0, 0]],
+        rhs=[1, 2, 3, 4],
+        expected=[1, 1, -3, -2])
+
+  def testUpperTriangular(self):
+    self._testWithLists(
+        diags=[[2, 4, -1, 0], [1, 3, 1, 2], [0, 0, 0, 0]],
+        rhs=[1, 6, 4, 4],
+        expected=[13, -6, 6, 2])
+
+  def testLowerTriangular(self):
+    self._testWithLists(
+        diags=[[0, 0, 0, 0], [2, -1, 3, 1], [0, 1, 4, 2]],
+        rhs=[4, 5, 6, 1],
+        expected=[2, -3, 6, -11])
+
+  # Multiple right-hand sides and batching
+
+  def testWithTwoRightHandSides(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.transpose([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.transpose([_sample_result, 2 * _sample_result]))
+
+  def testBatching(self):
+    self._testWithLists(
+        diags=np.array([_sample_diags, -_sample_diags]),
+        rhs=np.array([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.array([_sample_result, -2 * _sample_result]))
+
+  def testBatchingAndTwoRightHandSides(self):
+    rhs = np.transpose([_sample_rhs, 2 * _sample_rhs])
+    expected_result = np.transpose([_sample_result, 2 * _sample_result])
+    self._testWithLists(
+        diags=np.array([_sample_diags, -_sample_diags]),
+        rhs=np.array([rhs, 2 * rhs]),
+        expected=np.array([expected_result, -2 * expected_result]))
+
+  # Various input formats
+
+  def testSequenceFormat(self):
+    self._test(
+        diags=(_tfconst([2, 1, 4]), _tfconst([1, 3, 2, 2]), _tfconst([1, -1,
+                                                                      1])),
+        rhs=_tfconst([1, 2, 3, 4]),
+        expected=_tfconst([-9, 5, -4, 4]),
+        diags_format="sequence")
+
+  def testSequenceFormatWithDummyElements(self):
+    dummy = 20
+    self._test(
+        diags=(_tfconst([2, 1, 4, dummy]), _tfconst([1, 3, 2, 2]),
+               _tfconst([dummy, 1, -1, 1])),
+        rhs=_tfconst([1, 2, 3, 4]),
+        expected=_tfconst([-9, 5, -4, 4]),
+        diags_format="sequence")
+
+  def testSequenceFormatWithBatching(self):
+    self._test(
+        diags=(_tfconst([[2, 1, 4], [-2, -1, -4]]),
+               _tfconst([[1, 3, 2, 2], [-1, -3, -2, -2]]),
+               _tfconst([[1, -1, 1], [-1, 1, -1]])),
+        rhs=_tfconst([[1, 2, 3, 4], [1, 2, 3, 4]]),
+        expected=_tfconst([[-9, 5, -4, 4], [9, -5, 4, -4]]),
+        diags_format="sequence")
+
+  def testMatrixFormat(self):
+    self._testWithLists(
+        diags=[[1, 2, 0, 0], [1, 3, 1, 0], [0, -1, 2, 4], [0, 0, 1, 2]],
+        rhs=[1, 2, 3, 4],
+        expected=[-9, 5, -4, 4],
+        diags_format="matrix")
+
+  def testMatrixFormatWithMultipleRightHandSides(self):
+    self._testWithLists(
+        diags=[[1, 2, 0, 0], [1, 3, 1, 0], [0, -1, 2, 4], [0, 0, 1, 2]],
+        rhs=[[1, -1], [2, -2], [3, -3], [4, -4]],
+        expected=[[-9, 9], [5, -5], [-4, 4], [4, -4]],
+        diags_format="matrix")
+
+  def testMatrixFormatWithBatching(self):
+    self._testWithLists(
+        diags=[[[1, 2, 0, 0], [1, 3, 1, 0], [0, -1, 2, 4], [0, 0, 1, 2]],
+               [[-1, -2, 0, 0], [-1, -3, -1, 0], [0, 1, -2, -4], [0, 0, -1,
+                                                                  -2]]],
+        rhs=[[1, 2, 3, 4], [1, 2, 3, 4]],
+        expected=[[-9, 5, -4, 4], [9, -5, 4, -4]],
+        diags_format="matrix")
+
+  def testRightHandSideAsColumn(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.transpose([_sample_rhs]),
+        expected=np.transpose([_sample_result]),
+        diags_format="compact")
+
+  # Tests with transpose and adjoint
+
+  def testTransposeRhs(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.array([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.array([_sample_result, 2 * _sample_result]),
+        transpose_rhs=True)
+
+  def testConjugateRhs(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.transpose([_sample_rhs * (1 + 1j), _sample_rhs * (1 - 2j)]),
+        expected=np.transpose(
+            [_sample_result * (1 - 1j), _sample_result * (1 + 2j)]),
+        conjugate_rhs=True)
+
+  def testAdjointRhs(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.array([_sample_rhs * (1 + 1j), _sample_rhs * (1 - 2j)]),
+        expected=np.array(
+            [_sample_result * (1 - 1j), _sample_result * (1 + 2j)]),
+        transpose_rhs=True,
+        conjugate_rhs=True)
+
+  def testTransposeRhsWithBatching(self):
+    self._testWithLists(
+        diags=np.array([_sample_diags, -_sample_diags]),
+        rhs=np.array([[_sample_rhs, 2 * _sample_rhs],
+                      [3 * _sample_rhs, 4 * _sample_rhs]]),
+        expected=np.array([[_sample_result, 2 * _sample_result],
+                           [-3 * _sample_result, -4 * _sample_result]]),
+        transpose_rhs=True)
+
+  def testTransposeRhsWithRhsAsVector(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=_sample_rhs,
+        expected=_sample_result,
+        transpose_rhs=True)
+
+  def testConjugateRhsWithRhsAsVector(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=_sample_rhs * (1 + 1j),
+        expected=_sample_result * (1 - 1j),
+        conjugate_rhs=True)
+
+  def testTransposeRhsWithRhsAsVectorAndBatching(self):
+    self._testWithLists(
+        diags=np.array([_sample_diags, -_sample_diags]),
+        rhs=np.array([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.array([_sample_result, -2 * _sample_result]),
+        transpose_rhs=True)
+
+  # Invalid input shapes
+
+  def testInvalidShapesCompactFormat(self):
+
+    def test_raises(diags_shape, rhs_shape):
+      self._assertRaises(_tf_ones(diags_shape), _tf_ones(rhs_shape), "compact")
+
+    test_raises((5, 4, 4), (5, 4))
+    test_raises((5, 3, 4), (4, 5))
+    test_raises((5, 3, 4), (5))
+    test_raises((5), (5, 4))
+
+  def testInvalidShapesSequenceFormat(self):
+
+    def test_raises(diags_tuple_shapes, rhs_shape):
+      diagonals = tuple(_tf_ones(shape) for shape in diags_tuple_shapes)
+      self._assertRaises(diagonals, _tf_ones(rhs_shape), "sequence")
+
+    test_raises(((5, 4), (5, 4)), (5, 4))
+    test_raises(((5, 4), (5, 4), (5, 6)), (5, 4))
+    test_raises(((5, 3), (5, 4), (5, 6)), (5, 4))
+    test_raises(((5, 6), (5, 4), (5, 3)), (5, 4))
+    test_raises(((5, 4), (7, 4), (5, 4)), (5, 4))
+    test_raises(((5, 4), (7, 4), (5, 4)), (3, 4))
+
+  def testInvalidShapesMatrixFormat(self):
+
+    def test_raises(diags_shape, rhs_shape):
+      self._assertRaises(_tf_ones(diags_shape), _tf_ones(rhs_shape), "matrix")
+
+    test_raises((5, 4, 7), (5, 4))
+    test_raises((5, 4, 4), (3, 4))
+    test_raises((5, 4, 4), (5, 3))
+
+  # Tests with placeholders
+
+  def _testWithPlaceholders(self,
+                            diags_shape,
+                            rhs_shape,
+                            diags_feed,
+                            rhs_feed,
+                            expected,
+                            diags_format="compact"):
+    if context.executing_eagerly():
+      return
+    diags = array_ops.placeholder(dtypes.float64, shape=diags_shape)
+    rhs = array_ops.placeholder(dtypes.float64, shape=rhs_shape)
+    x = linalg_impl.tridiagonal_solve(diags, rhs, diags_format)
+    with self.cached_session(use_gpu=False) as sess:
+      result = sess.run(x, feed_dict={diags: diags_feed, rhs: rhs_feed})
+      self.assertAllClose(result, expected)
+
+  def testCompactFormatAllDimsUnknown(self):
+    self._testWithPlaceholders(
+        diags_shape=[None, None],
+        rhs_shape=[None],
+        diags_feed=_sample_diags,
+        rhs_feed=_sample_rhs,
+        expected=_sample_result)
+
+  def testCompactFormatUnknownMatrixSize(self):
+    self._testWithPlaceholders(
+        diags_shape=[3, None],
+        rhs_shape=[4],
+        diags_feed=_sample_diags,
+        rhs_feed=_sample_rhs,
+        expected=_sample_result)
+
+  def testCompactFormatUnknownRhsCount(self):
+    self._testWithPlaceholders(
+        diags_shape=[3, 4],
+        rhs_shape=[4, None],
+        diags_feed=_sample_diags,
+        rhs_feed=np.transpose([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.transpose([_sample_result, 2 * _sample_result]))
+
+  def testCompactFormatUnknownBatchSize(self):
+    self._testWithPlaceholders(
+        diags_shape=[None, 3, 4],
+        rhs_shape=[None, 4],
+        diags_feed=np.array([_sample_diags, -_sample_diags]),
+        rhs_feed=np.array([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.array([_sample_result, -2 * _sample_result]))
+
+  def testMatrixFormatWithUnknownDims(self):
+    if context.executing_eagerly():
+      return
+
+    def test_with_matrix_shapes(matrix_shape):
+      matrix = np.array([[1, 2, 0, 0], [1, 3, 1, 0], [0, -1, 2, 4],
+                         [0, 0, 1, 2]])
+      rhs = np.array([1, 2, 3, 4])
+      x = np.array([-9, 5, -4, 4])
+      self._testWithPlaceholders(
+          diags_shape=matrix_shape,
+          rhs_shape=[None, None],
+          diags_feed=matrix,
+          rhs_feed=np.transpose([rhs, 2 * rhs]),
+          expected=np.transpose([x, 2 * x]),
+          diags_format="matrix")
+
+    test_with_matrix_shapes(matrix_shape=[4, 4])
+    test_with_matrix_shapes(matrix_shape=[None, 4])
+    test_with_matrix_shapes(matrix_shape=[4, None])
+    with self.assertRaises(ValueError):
+      test_with_matrix_shapes(matrix_shape=[None, None])
+
+  def testSequenceFormatWithUnknownDims(self):
+    if context.executing_eagerly():
+      return
+    superdiag = array_ops.placeholder(dtypes.float64, shape=[None])
+    diag = array_ops.placeholder(dtypes.float64, shape=[None])
+    subdiag = array_ops.placeholder(dtypes.float64, shape=[None])
+    rhs = array_ops.placeholder(dtypes.float64, shape=[None])
+
+    x = linalg_impl.tridiagonal_solve((superdiag, diag, subdiag),
+                                      rhs,
+                                      diagonals_format="sequence")
+    with self.cached_session(use_gpu=False) as sess:
+      result = sess.run(
+          x,
+          feed_dict={
+              subdiag: [20, 1, -1, 1],
+              diag: [1, 3, 2, 2],
+              superdiag: [2, 1, 4, 20],
+              rhs: [1, 2, 3, 4]
+          })
+      self.assertAllClose(result, [-9, 5, -4, 4])
+
+  # Benchmark
+
+  class TridiagonalSolveBenchmark(test.Benchmark):
+    sizes = [(100000, 1, 1), (1000000, 1, 1), (10000000, 1, 1), (100000, 10, 1),
+             (100000, 100, 1), (10000, 1, 100), (10000, 1, 1000),
+             (10000, 1, 10000)]
+
+    def _generateData(self, matrix_size, batch_size, num_rhs, seed=42):
+      data = random_ops.random_normal(
+          shape=(batch_size, 3 + num_rhs, matrix_size),
+          dtype=dtypes.float64,
+          seed=seed)
+      diags = array_ops.stack([data[:, 0], data[:, 1], data[:, 2]], axis=-2)
+      rhs = data[:, 3:, :]
+      return diags, rhs
+
+    def benchmarkTridiagonalSolveOp(self):
+      for matrix_size, batch_size, num_rhs in self.sizes:
+        with ops.Graph().as_default(), \
+                session.Session(config=benchmark.benchmark_config()) as sess, \
+                ops.device("/cpu:0"):
+          diags, rhs = self._generateData(matrix_size, batch_size, num_rhs)
+          x = linalg_impl.tridiagonal_solve(diags, rhs, transpose_rhs=True)
+          variables.global_variables_initializer().run()
+          self.run_op_benchmark(
+              sess,
+              control_flow_ops.group(x),
+              min_iters=10,
+              store_memory_usage=False,
+              name=("tridiagonal_solve_matrix_size_{}_batch_size_{}_"
+                    "num_rhs_{}").format(matrix_size, batch_size, num_rhs))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 1b84ec1f69ed55a5c86c7767e986c7bc542e1841..bb50442dca1a5109b793a0daa950e4a2b9abd913 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -26,7 +26,7 @@ from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_contextlib
@@ -554,7 +554,7 @@ class Layer(base_layer.Layer):
 
   def __setattr__(self, value, name):
     # By-pass the automatic dependency tracking performed by the parent Layer.
-    super(checkpointable.Checkpointable, self).__setattr__(value, name)
+    super(trackable.Trackable, self).__setattr__(value, name)
 
 
 def _add_elements_to_collection(elements, collection_list):
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index b2d54a98272be53b69872e900901d9552177a172..7e12dcacd86a2f792743316f65a97806c7028fc0 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -64,7 +64,7 @@ class Dense(keras_layers.Dense, base.Layer):
       `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
     name: String, the name of the layer. Layers with the same name will
       share weights, but to avoid mistakes we require reuse=True in such cases.
-    reuse: Boolean, whether to reuse the weights of a previous layer
+    _reuse: Boolean, whether to reuse the weights of a previous layer
       by the same name.
 
   Properties:
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc
index f681cff6cff35bfd8ed0e3a880d26936a54fabee..77fbfd51bbbace5b043b719de45c474476f69fd4 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.cc
+++ b/tensorflow/python/lib/core/py_seq_tensor.cc
@@ -64,6 +64,19 @@ bool IsPyFloat(PyObject* obj) {
          PyIsInstance(obj, &PyFloatingArrType_Type);  // NumPy float types
 }
 
+// If the input is a zero dimensional PyArray return it converted to a scalar.
+// Otherwise return the input and increment its reference count.
+// Users must Py_DECREF the output of this method.
+PyObject* ZeroDimArrayToScalar(PyObject* obj) {
+  if (PyArray_IsZeroDim(obj) && !PyArray_IsScalar(obj, Generic)) {
+    auto pyarray_obj = reinterpret_cast<PyArrayObject*>(obj);
+    obj = PyArray_ToScalar(PyArray_DATA(pyarray_obj), pyarray_obj);
+  } else {
+    Py_INCREF(obj);
+  }
+  return obj;
+}
+
 // Converts Python object `c` that should hold a Python string into a
 // C++ string in *out.  Returns nullptr on success, or a message on error.
 // Defined below, but forward declared here for use in PyRepr.
@@ -130,6 +143,10 @@ Status SampleElementFromSequence(PyObject* seq, PyObject** elem) {
 Status InferShapeAndType(PyObject* obj, TensorShape* shape, DataType* dtype) {
   std::vector<Safe_PyObjectPtr> refs_to_clean;
   while (true) {
+    // Convert any zero dimensional numpy arrays to scalars first of all.
+    // We also have to make sure a reference to the safe_obj is kept.
+    obj = ZeroDimArrayToScalar(obj);
+    refs_to_clean.push_back(make_safe(obj));
     // We test strings first, in case a string is considered a sequence.
     if (IsPyString(obj)) {
       *dtype = DT_STRING;
@@ -240,7 +257,9 @@ const char ErrorFoundFloat[] =
       }                                                                   \
       PyObject** l = PySequence_Fast_ITEMS(seq.get());                    \
       for (int64 i = 0; i < s; ++i) {                                     \
-        const char* error = CONVERT(l[i], *buf);                          \
+        auto scalar = ZeroDimArrayToScalar(l[i]);                         \
+        const char* error = CONVERT(scalar, *buf);                        \
+        Py_DECREF(scalar);                                                \
         if (TF_PREDICT_FALSE(error != nullptr)) return error;             \
         ++*buf;                                                           \
       }                                                                   \
@@ -253,7 +272,9 @@ const char ErrorFoundFloat[] =
     Tensor result(TYPE_ENUM, shape);                                      \
     if (shape.dims() == 0) { /* Scalar case */                            \
       TYPE value;                                                         \
-      const char* error = CONVERT(obj, &value);                           \
+      auto scalar = ZeroDimArrayToScalar(obj);                            \
+      const char* error = CONVERT(scalar, &value);                        \
+      Py_DECREF(scalar);                                                  \
       if (error != nullptr) return error;                                 \
       result.scalar<TYPE>()() = value;                                    \
     } else {                                                              \
@@ -331,8 +352,25 @@ DEFINE_HELPER(ConvertInt32, int32, DT_INT32, ConvertOneInt32);
 
 template <class T>
 const char* ConvertOneFloat(PyObject* v, T* out) {
+  if (PyErr_Occurred()) {
+    return nullptr;
+  }
   if (TF_PREDICT_TRUE(PyFloat_Check(v))) {
-    *out = PyFloat_AS_DOUBLE(v);
+    double as_double = PyFloat_AsDouble(v);
+    // Handle infinity.
+    if (as_double == std::numeric_limits<double>::infinity()) {
+      *out = std::numeric_limits<T>::infinity();
+      return nullptr;
+    } else if (as_double == -1 * std::numeric_limits<double>::infinity()) {
+      *out = -1 * std::numeric_limits<T>::infinity();
+      return nullptr;
+    }
+    // Check for overflow.
+    if (as_double > std::numeric_limits<T>::max() ||
+        as_double < std::numeric_limits<T>::lowest()) {
+      return ErrorOutOfRangeDouble;
+    }
+    *out = static_cast<T>(as_double);
     return nullptr;
   }
 #if PY_MAJOR_VERSION < 3
@@ -348,6 +386,9 @@ const char* ConvertOneFloat(PyObject* v, T* out) {
   }
   if (PyIsInstance(v, &PyFloatingArrType_Type)) {  // NumPy float types
     Safe_PyObjectPtr as_float = make_safe(PyNumber_Float(v));
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     return ConvertOneFloat<T>(as_float.get(), out);
   }
   if (PyIsInstance(v, &PyIntegerArrType_Type)) {  // NumPy integers
@@ -356,6 +397,9 @@ const char* ConvertOneFloat(PyObject* v, T* out) {
 #else
     Safe_PyObjectPtr as_int = make_safe(PyNumber_Long(v));
 #endif
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     return ConvertOneFloat<T>(as_int.get(), out);
   }
   return ErrorMixedTypes;
diff --git a/tensorflow/python/module/BUILD b/tensorflow/python/module/BUILD
index 8aa3a199b562a382f4e74c0df15c711f7312b035..55909cc2672b4e601f3a6e5607c1efe1b10e06cc 100644
--- a/tensorflow/python/module/BUILD
+++ b/tensorflow/python/module/BUILD
@@ -13,7 +13,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
-        "//tensorflow/python/training/checkpointable:tracking",
+        "//tensorflow/python/training/tracking",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/module/module.py b/tensorflow/python/module/module.py
index 62d1bd46c4180c30930ca25d01981828a8da9893..07455bd7f7cd9455c18f92a5eb4c898735d5e376 100644
--- a/tensorflow/python/module/module.py
+++ b/tensorflow/python/module/module.py
@@ -18,43 +18,66 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 import re
 import sys
 
 import six
 
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variables
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
+NO_MODULE_NAME_SCOPE = "__no_module_name_scope__"
 
-class ModuleMetaclass(type):
+
+class ModuleMetaclass(abc.ABCMeta):
   """Metaclass for `tf.Module`."""
 
   def __new__(mcs, name, bases, clsdict):
+    methods = []
+
     for key, value in clsdict.items():
-      if key in ("__init__", "name_scope"):
+      if key == "name_scope":
+        continue
+
+      elif key.startswith("__") and key != "__call__":
+        # Don't patch methods like `__getattr__` or `__del__`.
         continue
 
       elif tf_inspect.isfunction(value):
-        if getattr(value, "_no_module_name_scope", False):
-          # The function has been annotated to say that no autoscoping should
-          # be applied, so do not patch it.
-          continue
-        clsdict[key] = with_name_scope(value)
+        # We defer patching methods until after the type is created such that we
+        # can trigger the descriptor binding them to the class.
+        methods.append(key)
 
       elif isinstance(value, property):
+        # TODO(tomhennigan) Preserve the type of property subclasses.
         clsdict[key] = property(
             value.fget if not value.fget else with_name_scope(value.fget),
             value.fset if not value.fset else with_name_scope(value.fset),
             value.fdel if not value.fdel else with_name_scope(value.fdel),
             doc=value.__doc__)
 
-    return type.__new__(mcs, name, bases, clsdict)
+    cls = super(ModuleMetaclass, mcs).__new__(mcs, name, bases, clsdict)
+
+    for method_name in methods:
+      # Note: the below is quite subtle, we need to ensure that we're wrapping
+      # the method bound to the class. In some cases (e.g. `wrapt`) this is
+      # important since the method can trigger different behavior when it is
+      # bound (e.g. in wrapt `FunctionWrapper.__get__(None, cls)` produces a
+      # `BoundFunctionWrapper` which in turn populates the `instance` argument
+      # to decorator functions using args[0]).
+      # Equivalent to: `cls.__dict__[method_name].__get__(None, cls)`
+      method = getattr(cls, method_name)
+      method = with_name_scope(method)
+      setattr(cls, method_name, method)
+
+    return cls
 
   def __call__(cls, *args, **kwargs):
     # Call new such that we have an un-initialized module instance that we can
@@ -91,7 +114,7 @@ class ModuleMetaclass(type):
     return module
 
 
-def with_name_scope(unbound_method):
+def wrap_with_name_scope(unbound_method):
   """Patches the given method so it enters the modules name scope."""
   def enter_name_scope(self, *args, **kwargs):
     """Decorator that calls the given function in the module name scope.
@@ -118,11 +141,37 @@ def with_name_scope(unbound_method):
       # for a particular method annotate it with `@no_module_name_scope`.
       return unbound_method(self, *args, **kwargs)
 
-  return tf_decorator.make_decorator(unbound_method, enter_name_scope)
+  return enter_name_scope
+
+
+def wrap_with_name_scope_no_exception(unbound_method):
+  """Patches the given method so it enters the modules name scope."""
+  def enter_name_scope(self, *args, **kwargs):
+    with self.name_scope:
+      # tf.Module enters the module name scope for all methods. To disable this
+      # for a particular method annotate it with `@no_module_name_scope`.
+      return unbound_method(self, *args, **kwargs)
+  return enter_name_scope
 
 
-@tf_export("experimental.Module")
-class Module(six.with_metaclass(ModuleMetaclass, tracking.AutoCheckpointable)):
+def with_name_scope(unbound_method):
+  """Patches the given method so it enters the modules name scope."""
+  if getattr(unbound_method, NO_MODULE_NAME_SCOPE, False):
+    # The function has been annotated to say that no autoscoping should be
+    # applied, so do not patch it.
+    return unbound_method
+
+  if isinstance(unbound_method, def_function.Function):
+    # Autograph cannot convert functions that have try/catch.
+    unbound_method._decorate(wrap_with_name_scope_no_exception)  # pylint: disable=protected-access
+    return unbound_method
+  else:
+    return tf_decorator.make_decorator(unbound_method,
+                                       wrap_with_name_scope(unbound_method))
+
+
+@tf_export("Module")
+class Module(six.with_metaclass(ModuleMetaclass, tracking.AutoTrackable)):
   """Base neural network module class.
 
   A module is a named container for `tf.Variable`s, other `tf.Module`s and
@@ -131,7 +180,7 @@ class Module(six.with_metaclass(ModuleMetaclass, tracking.AutoCheckpointable)):
 
   >>> class Dense(tf.Module):
   ...   def __init__(self, in_features, output_features):
-  ...     super(Linear, self).__init__()
+  ...     super(Dense, self).__init__()
   ...     self.w = tf.Variable(
   ...         tf.random_normal([input_features, output_features]), name='w')
   ...     self.b = tf.Variable(tf.zeros([output_features]), name='b')
@@ -150,7 +199,7 @@ class Module(six.with_metaclass(ModuleMetaclass, tracking.AutoCheckpointable)):
   By subclassing `tf.Module` instead of `object` any variables created inside
   the module are automatically created within the modules name scope:
 
-  >> d.w.name
+  >>> d.w.name
   "dense/w:0"
 
   In eager mode this is useful for debugging, and when used with `@tf.function`
@@ -199,92 +248,44 @@ class Module(six.with_metaclass(ModuleMetaclass, tracking.AutoCheckpointable)):
 
   @property
   def variables(self):
-    """Collection of variables owned by this module and it's submodules.
+    """Sequence of variables owned by this module and it's submodules.
 
     Note: this method uses reflection to find variables on the current instance
     and submodules. For performance reasons you may wish to cache the result
     of calling this method if you don't expect the return value to change.
 
     Returns:
-      A collection of variables for the current module (sorted by attribute
-      name) followed by variables from all submodules recursively (depth first).
-    """
-    return tuple(walk(self, recurse_if=_IS_MODULE, predicate=_IS_VARIABLE))
-
-  @property
-  def owned_variables(self):
-    """Collection of variables that are attributes of the current module.
-
-    See `variables` for a property which returns all variables from the current
-    module and all it's submodules recursively.
-
-    Returns:
-      A collection of variables which are attributes of the current module. Will
-      yield variables inside nested structures (lists etc) but not in other
-      modules.
+      A sequence of variables for the current module (sorted by attribute
+      name) followed by variables from all submodules recursively (breadth
+      first).
     """
-    return tuple(walk(self, predicate=_IS_VARIABLE))
+    return tuple(self._flatten(predicate=_IS_VARIABLE))
 
   @property
   def trainable_variables(self):
-    """Collection of variables owned by this module and it's submodules.
+    """Sequence of variables owned by this module and it's submodules.
 
     Note: this method uses reflection to find variables on the current instance
     and submodules. For performance reasons you may wish to cache the result
     of calling this method if you don't expect the return value to change.
 
     Returns:
-      A collection of variables for the current module (sorted by attribute
-      name) followed by variables from all submodules recursively (depth first).
-    """
-    return tuple(
-        walk(self, recurse_if=_IS_MODULE, predicate=_IS_TRAINABLE_VARIABLE))
-
-  @property
-  def owned_trainable_variables(self):
-    """Collection of variables that are attributes of the current module.
-
-    See `variables` for a property which returns all variables from the current
-    module and all it's submodules recursively.
-
-    Returns:
-      A collection of variables which are attributes of the current module. Will
-      yield variables inside nested structures (lists etc) but not in other
-      modules.
-    """
-    return tuple(walk(self, predicate=_IS_TRAINABLE_VARIABLE))
-
-  @property
-  def owned_submodules(self):
-    """Collection of immediate child modules.
-
-    Child modules are modules which are found as properties of the current
-    module.
-
-    >>> a = tf.experimental.Module()
-    >>> b = tf.experimental.Module()
-    >>> c = tf.experimental.Module()
-    >>> a.b = b
-    >>> b.c = c
-    >>> assert list(a.owned_submodules) == [b]
-    >>> assert list(b.owned_submodules) == [c]
-    >>> assert list(c.owned_submodules) == []
-
-    Returns:
-      A collection of all child modules.
+      A sequence of variables for the current module (sorted by attribute
+      name) followed by variables from all submodules recursively (breadth
+      first).
     """
-    return tuple(walk(self, predicate=_IS_MODULE))
+    return tuple(self._flatten(predicate=_IS_TRAINABLE_VARIABLE))
 
   @property
   def submodules(self):
-    """Collection of all sub-modules.
+    """Sequence of all sub-modules.
 
     Submodules are modules which are properties of this module, or found as
     properties of modules which are properties of this module (and so on).
 
-    >>> a = tf.experimental.Module()
-    >>> b = tf.experimental.Module()
-    >>> c = tf.experimental.Module()
+    >>> a = tf.Module()
+    >>> b = tf.Module()
+    >>> c = tf.Module()
     >>> a.b = b
     >>> b.c = c
     >>> assert list(a.submodules) == [b, c]
@@ -292,9 +293,71 @@ class Module(six.with_metaclass(ModuleMetaclass, tracking.AutoCheckpointable)):
     >>> assert list(c.submodules) == []
 
     Returns:
-      A collection of all submodules.
+      A sequence of all submodules.
+    """
+    return tuple(self._flatten(predicate=_IS_MODULE))
+
+  def _flatten(self,
+               recursive=True,
+               predicate=None,
+               attribute_traversal_key=None,
+               with_path=False):
+    """Flattened attribute values in sorted order by attribute name.
+
+    Modules are flattened by first walking their attributes in name order.
+    Each attribute value is then flattened to find leaf values. If flatten is
+    to be applied `recursive`ly then if the leaf is a `Module` it will also be
+    flattened to find leaves. Finally every leaf value is optionally tested
+    against the given `predicate` and finally yielded.
+
+    >>> class Foo(tf.Module):
+    ...   def __init__(self):
+    ...     super(Foo, self).__init__()
+    ...     self.x = [tf.constant('a'), tf.constant('b')]
+    ...     self.y = {'i': tf.constant('c'), 'j': tf.constant('d')}
+    ...     self.z = tf.constant('e')
+    ...
+    ...   @property
+    ...   def tensors(self):
+    ...     return tuple(self._flatten(predicate=is_tensor, with_path=True))
+
+    >>> foo = Foo()
+    >>> foo.tensors
+    ((('x', 0),   <tf.Tensor: ...'a'>),
+     (('x', 1),   <tf.Tensor: ...'b'>),
+     (('y', 'i'), <tf.Tensor: ...'c'>),
+     (('y', 'j'), <tf.Tensor: ...'d'>),
+     (('z',),     <tf.Tensor: ...'e'>))
+
+    `attribute_traversal_key` controls the order object properties are visited.
+    If not set objects are visited in ascending order by name.
+
+    Args:
+      recursive: Whether to recurse into child modules or not.
+      predicate: (Optional) If set then only values matching predicate are
+        yielded. A value of `None` (the default) means no items will be
+        filtered.
+      attribute_traversal_key: (Optional) Method to rekey object attributes
+        before they are sorted. Contract is the same as `key` argument to
+        builtin `sorted` and only applies to object properties.
+      with_path: (Optional) Whether to include the path to the object as well
+        as the object itself. If `with_path` is `True` then leaves will not be
+        de-duplicated (e.g. if the same leaf instance is reachable via multiple
+        modules then it will be yielded multiple times with different paths).
+
+    Returns:
+      Flat generator for leaves of the current module and optionally all
+      submodules.
     """
-    return tuple(walk(self, recurse_if=_IS_MODULE, predicate=_IS_MODULE))
+    if predicate is None:
+      predicate = lambda _: True
+
+    return _flatten_module(
+        self,
+        recursive=recursive,
+        predicate=predicate,
+        attribute_traversal_key=attribute_traversal_key,
+        with_path=with_path)
 
   @classmethod
   def no_name_scope(cls, method):
@@ -316,7 +379,7 @@ class Module(six.with_metaclass(ModuleMetaclass, tracking.AutoCheckpointable)):
     Returns:
       The method, with a flag indicating no name scope wrapping should occur.
     """
-    setattr(method, "_no_module_name_scope", True)
+    setattr(method, NO_MODULE_NAME_SCOPE, True)
     return method
 
 _IS_VARIABLE = lambda o: isinstance(o, variables.Variable)
@@ -334,77 +397,61 @@ def camel_to_snake(value):
   return _CAMEL_TO_SNAKE_R.sub(r"_\1", value).lower()
 
 
-def walk(o, recurse_if=None, predicate=None):
-  """Flattened attributes of `o` in sorted order by attribute name.
-
-  >>> class Foo(object):
-  ...   def __init__(self, prefix=''):
-  ...     self.z = prefix + 'c'
-  ...     self.a = [prefix + 'a', prefix + 'b']
-
-  >>> tuple(walk(Foo()))
-  ('a', 'b', 'c')
-
-  If `predicate` is not None, then only values matching predicate are returned:
-
-  >>> tuple(walk(Foo(), predicate=lambda v: v != 'a'))
-  ('b', 'c')
-
-  If `recurse_if` is not None then it should be a callable which tests if the
-  given leaf should be expanded:
+# AutoTrackable adds object attributes that users will not expect us to
+# include when flattening (these reference dependencies reachable via other
+# object attributes).
+AUTO_CHECKPOINTABLE_ATTRS = ("_unconditional_checkpoint_dependencies",
+                             "_unconditional_dependency_names")
 
-  >>> is_string = lambda v: isinstance(v, str)
-  >>> is_foo = lambda l: isinstance(l, Foo)
-  >>> o = Foo(prefix='root_')
-  >>> o.b = Foo(prefix='child_')
-  >>> tuple(walk(o, predicate=is_string))
-  ('root_a', 'root_b', 'root_c')
-  >>> tuple(walk(o, recurse_if=is_foo, predicate=is_string))
-  ('root_a', 'root_b', 'root_c', 'child_a', 'child_b', 'child_c')
 
-  Args:
-    o: An object who's attributes are walked.
-    recurse_if: (Optional) Visited items of this type will be walked to extract
-      more leaves. If `None`, it will not recurse into leaves.
-    predicate: (Optional) If set then only values matching predicate are
-      yielded.
-
-  Returns:
-    Attributes of `o` in name order. If `recurse_if` is not `None` then
-    attributes for which `recurse_if(attribute) == True` will be walked
-    recursively. If `predicate` is not `None` then only attributes for which
-    `predicate(attribute) == True` will be yielded.
-  """
-  if predicate is None:
-    predicate = lambda _: True
-  return _walk_internal(
-      o, recurse_if=recurse_if, predicate=predicate, seen=set())
+def _flatten_module(module,
+                    recursive,
+                    predicate,
+                    attribute_traversal_key,
+                    with_path,
+                    module_path=(),
+                    seen=None):
+  """Implementation of `flatten`."""
+  if seen is None:
+    seen = set([id(module)])
 
+  module_dict = vars(module)
+  submodules = []
 
-def _walk_internal(o, recurse_if, predicate, seen):
-  """Implementation of `walk`."""
-  if seen is None:
-    seen = set([id(o)])
+  for key in sorted(module_dict, key=attribute_traversal_key):
+    if key in AUTO_CHECKPOINTABLE_ATTRS:
+      continue
 
-  o_dict = vars(o)
-  to_walk = []
+    for leaf_path, leaf in nest.flatten_with_tuple_paths(module_dict[key]):
+      leaf_path = (key,) + leaf_path
 
-  for key in sorted(o_dict):
-    values = nest.flatten(o_dict[key])
-    for value in values:
-      value_id = id(value)
-      if value_id in seen:
-        continue
+      # TODO(tomhennigan) Handle cycles for `with_path=True` (e.g. `a.a = a`).
+      if not with_path:
+        leaf_id = id(leaf)
+        if leaf_id in seen:
+          continue
+        seen.add(leaf_id)
 
-      seen.add(value_id)
-      if predicate(value):
-        yield value
+      if predicate(leaf):
+        if with_path:
+          yield module_path + leaf_path, leaf
+        else:
+          yield leaf
 
-      if recurse_if is not None and recurse_if(value):
+      if recursive and isinstance(leaf, Module):
         # Walk direct properties first then recurse.
-        to_walk.append(value)
-
-  for value in to_walk:
-    for subvalue in _walk_internal(value, recurse_if, predicate, seen):
+        submodules.append((module_path + leaf_path, leaf))
+
+  for submodule_path, submodule in submodules:
+    subvalues = _flatten_module(
+        submodule,
+        recursive=recursive,
+        predicate=predicate,
+        attribute_traversal_key=attribute_traversal_key,
+        with_path=with_path,
+        module_path=submodule_path,
+        seen=seen)
+
+    for subvalue in subvalues:
       # Predicate is already tested for these values.
       yield subvalue
diff --git a/tensorflow/python/module/module_test.py b/tensorflow/python/module/module_test.py
index 1e4e1959c371c35f6f71bf2b73e4adfa75053c71..52bb97bc5bc6379e93ab586d5290be2be408485b 100644
--- a/tensorflow/python/module/module_test.py
+++ b/tensorflow/python/module/module_test.py
@@ -18,11 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 import collections
 
 from absl.testing import parameterized
+import six
 
 from tensorflow.python.compat import v2_compat
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import ops
 from tensorflow.python.module import module
 from tensorflow.python.ops import variables
@@ -84,6 +87,20 @@ class TestModuleNaming(test.TestCase):
     # `foo` is not a method so we do not re-enter the name scope.
     self.assertEqual(mod.foo(), "")
 
+  def test_property(self):
+    mod = PropertyModule()
+    mod.some_property = None, None  # None, None for the linter.
+    getter_scope_name, setter_scope_name = mod.some_property
+    self.assertEqual(getter_scope_name, "property_module/")
+    self.assertEqual(setter_scope_name, "property_module/")
+
+  def test_property_no_name_scope(self):
+    mod = PropertyModule()
+    mod.no_name_scope_property = None, None  # None, None for the linter.
+    getter_scope_name, setter_scope_name = mod.no_name_scope_property
+    self.assertEqual(getter_scope_name, "")
+    self.assertEqual(setter_scope_name, "")
+
   def test_invalid_name(self):
     msg = ".* is not a valid module name"
     with self.assertRaisesRegexp(ValueError, msg):
@@ -133,6 +150,34 @@ class TestModuleNaming(test.TestCase):
 
     self.assertEqual("", get_name_scope())
 
+  def test_get_attr_doesnt_enter_name_scope(self):
+    scope_names = []
+
+    class GetAttrModule(module.Module):
+
+      def __getattr__(self, name):
+        scope_names.append((name, get_name_scope()))
+        return super(GetAttrModule, self).__getattr__(name)
+
+    mod = GetAttrModule()
+    with self.assertRaises(AttributeError):
+      mod.does_not_exist  # pylint: disable=pointless-statement
+    self.assertIn(("does_not_exist", ""), scope_names)
+
+  def test_get_attribute_doesnt_enter_name_scope(self):
+    scope_names = []
+
+    class GetAttributeModule(module.Module):
+
+      def __getattribute__(self, name):
+        scope_names.append((name, get_name_scope()))
+        return super(GetAttributeModule, self).__getattribute__(name)
+
+    mod = GetAttributeModule()
+    with self.assertRaises(AttributeError):
+      mod.does_not_exist  # pylint: disable=pointless-statement
+    self.assertIn(("does_not_exist", ""), scope_names)
+
 
 class VariableNamingTest(test.TestCase):
 
@@ -151,12 +196,6 @@ class VariableTrackingTest(test.TestCase):
     self.assertEqual(m.child.variables, (m.child.w, m.child.child.w))
     self.assertEqual(m.child.child.variables, (m.child.child.w,))
 
-  def test_owned_variables(self):
-    m = RecursiveModule(3)
-    self.assertEqual(m.owned_variables, (m.w,))
-    self.assertEqual(m.child.owned_variables, (m.child.w,))
-    self.assertEqual(m.child.child.owned_variables, (m.child.child.w,))
-
   def test_trainable_variables(self):
     m = RecursiveModule(3)
     self.assertEqual(m.trainable_variables,
@@ -171,28 +210,9 @@ class VariableTrackingTest(test.TestCase):
     self.assertEqual(len(m.child.trainable_variables), 0)
     self.assertEqual(len(m.child.child.trainable_variables), 0)
 
-  def test_owned_trainable_variables(self):
-    m = RecursiveModule(3)
-    self.assertEqual(m.owned_trainable_variables, (m.w,))
-    self.assertEqual(m.child.owned_trainable_variables, (m.child.w,))
-    self.assertEqual(m.child.child.owned_trainable_variables,
-                     (m.child.child.w,))
-
-  def test_owned_trainable_variables_ignores_non_trainable(self):
-    m = RecursiveModule(3, trainable=False)
-    self.assertEqual(len(m.owned_trainable_variables), 0)
-    self.assertEqual(len(m.child.owned_trainable_variables), 0)
-    self.assertEqual(len(m.child.child.owned_trainable_variables), 0)
-
 
 class ModuleTrackingTest(test.TestCase):
 
-  def test_owned_submodules(self):
-    m = RecursiveModule(3)
-    self.assertEqual(list(m.owned_submodules), [m.child])
-    self.assertEqual(list(m.child.owned_submodules), [m.child.child])
-    self.assertEqual(list(m.child.child.owned_submodules), [])
-
   def test_submodules(self):
     m = RecursiveModule(3)
     self.assertEqual(list(m.submodules), [m.child, m.child.child])
@@ -225,6 +245,43 @@ class CommonErrorsTest(test.TestCase):
         CallsMethodBeforeSuperConstructorModule(allowed_method=True))
 
 
+class ForwardMethodsTest(test.TestCase):
+
+  def testFunctionType(self):
+    mod = ModuleWithFunctionAnnotatedCall()
+    self.assertTrue(isinstance(mod.forward, def_function.Function))
+    self.assertTrue(isinstance(mod.forward_ag, def_function.Function))
+
+  def testEntersNameScope_call(self):
+    mod = ModuleWithFunctionAnnotatedCall()
+    self.assertEqual(mod.forward().numpy(),
+                     b"module_with_function_annotated_call/")
+    self.assertEqual(mod.forward_ag().numpy(),
+                     b"module_with_function_annotated_call/")
+
+  def testEntersNameScope_concreteFunction(self):
+    mod = ModuleWithFunctionAnnotatedCall()
+    self.assertEqual(mod.forward.get_concrete_function()().numpy(),
+                     b"module_with_function_annotated_call/")
+    self.assertEqual(mod.forward_ag.get_concrete_function()().numpy(),
+                     b"module_with_function_annotated_call/")
+
+
+class AbcTest(test.TestCase):
+
+  def testAbstract(self):
+    msg = "Can't instantiate .* abstract methods"
+    with self.assertRaisesRegexp(TypeError, msg):
+      AbstractModule()  # pylint: disable=abstract-class-instantiated
+
+  def testConcrete(self):
+    mod = ConcreteModule()
+    x, scope_name = mod(2.)
+    self.assertEqual(x, 4.)
+    self.assertEqual(scope_name, "concrete_module/")
+    self.assertEqual(get_name_scope(), "")
+
+
 def get_name_scope():
   with ops.name_scope("x") as ns:
     return ns[:-2]
@@ -256,6 +313,20 @@ class RecursiveModule(module.Module):
     self.w = variables.Variable(1.0, trainable=trainable, name="mushroom")
 
 
+@six.add_metaclass(abc.ABCMeta)
+class AbstractModule(module.Module):
+
+  @abc.abstractmethod
+  def __call__(self, x):
+    pass
+
+
+class ConcreteModule(AbstractModule):
+
+  def __call__(self, x):
+    return x ** 2, get_name_scope()
+
+
 class TreeModule(module.Module):
 
   def __init__(self, name=None):
@@ -320,25 +391,110 @@ class CallsMethodBeforeSuperConstructorModule(module.Module):
   def with_name_scope(self):
     pass
 
+
+class ModuleWithFunctionAnnotatedCall(module.Module):
+
+  @def_function.function(autograph=False)
+  def forward(self):
+    return get_name_scope()
+
+  @def_function.function(autograph=True)
+  def forward_ag(self):
+    return get_name_scope()
+
+
+class PropertyModule(module.Module):
+
+  def __init__(self):
+    super(PropertyModule, self).__init__()
+    self._setter_scope_name = None
+
+  @property
+  def some_property(self):
+    getter_scope_name = get_name_scope()
+    return getter_scope_name, self._setter_scope_name
+
+  @some_property.setter
+  def some_property(self, my_property):
+    self._setter_scope_name = get_name_scope()
+
+  @property
+  @module.Module.no_name_scope
+  def no_name_scope_property(self):
+    getter_scope_name = get_name_scope()
+    return getter_scope_name, self._setter_scope_name
+
+  @no_name_scope_property.setter
+  @module.Module.no_name_scope
+  def no_name_scope_property(self, my_property):
+    self._setter_scope_name = get_name_scope()
+
 NamedPair = collections.namedtuple("NamedPair", ("first", "second"))
 mk_index_dict = lambda v: dict(enumerate(v))
 
 
-class WalkTest(parameterized.TestCase, test.TestCase):
+class FlattenTest(parameterized.TestCase, test.TestCase):
 
   @parameterized.parameters(lambda v: NamedPair(*v), list, tuple, mk_index_dict)
-  def test_walk(self, container_type):
+  def test_flatten(self, container_type):
     parent = SimpleModule(container_type=container_type)
     child = parent.c
 
     self.assertEqual(
-        list(module.walk(parent, predicate=IS_MEMBER)),
+        list(parent._flatten(recursive=False, predicate=IS_MEMBER)),
         [parent.a[0], parent.a[1], parent.z])
 
     self.assertEqual(
-        list(module.walk(parent, recurse_if=IS_MODULE, predicate=IS_MEMBER)),
+        list(parent._flatten(predicate=IS_MEMBER)),
         [parent.a[0], parent.a[1], parent.z, child.a[0], child.a[1], child.z])
 
+  def test_attribute_traversal_key(self):
+    mod = LayerModule()
+    self.assertEqual(
+        mod.variables,
+        mod._trainable_variables + mod._non_trainable_variables + [mod._bonus])
+
+  def test_with_path(self):
+    mod = module.Module()
+    mod.w = variables.Variable(1.)
+    mod.encoder = module.Module()
+    mod.encoder.w = [({"k": mod.w}, {"k": mod.w})]
+    mod.decoder = mod.encoder
+
+    state_dict = dict(
+        mod._flatten(with_path=True, predicate=module._IS_VARIABLE))
+
+    self.assertEqual(state_dict,
+                     {("w",): mod.w,
+                      ("encoder", "w", 0, 0, "k"): mod.encoder.w[0][0]["k"],
+                      ("encoder", "w", 0, 1, "k"): mod.encoder.w[0][1]["k"],
+                      ("decoder", "w", 0, 0, "k"): mod.decoder.w[0][0]["k"],
+                      ("decoder", "w", 0, 1, "k"): mod.decoder.w[0][1]["k"]},)
+
+
+class LayerModule(module.Module):
+
+  def __init__(self):
+    super(LayerModule, self).__init__()
+    self._trainable_variables = [
+        variables.Variable(1., name="a"),
+        variables.Variable(2., name="b"),
+    ]
+    self._non_trainable_variables = [
+        variables.Variable(3., name="c"),
+        variables.Variable(4., name="d"),
+    ]
+    self._bonus = variables.Variable(5., name="e")
+
+  @property
+  def variables(self):
+    def key_function(name):
+      indexes = {"_trainable_variables": 0, "_non_trainable_variables": 1}
+      return indexes.get(name, 2), name
+
+    return list(self._flatten(predicate=module._IS_VARIABLE,
+                              attribute_traversal_key=key_function))
+
 
 class MemberType(object):
   """A simple type to search for."""
@@ -358,6 +514,43 @@ class SimpleModule(module.Module):
 IS_MEMBER = lambda v: isinstance(v, MemberType)
 IS_MODULE = lambda v: isinstance(v, module.Module)
 
+
+class CustomMetaclass(type):
+
+  TAG = "__custom_metaclass__"
+
+  def __new__(mcs, name, bases, clsdict):
+    new_type = super(CustomMetaclass, mcs).__new__(mcs, name, bases, clsdict)
+    setattr(new_type, CustomMetaclass.TAG, True)
+    return new_type
+
+
+class CombiningMetaclass(module.ModuleMetaclass, CustomMetaclass):
+
+  TAG = "__combining_metaclass__"
+
+  def __new__(mcs, name, bases, clsdict):
+    new_type = super(CombiningMetaclass, mcs).__new__(mcs, name, bases, clsdict)
+    setattr(new_type, CombiningMetaclass.TAG, True)
+    return new_type
+
+
+@six.add_metaclass(CombiningMetaclass)
+class ModuleWithCustomMetaclass(module.Module):
+
+  def __init__(self):
+    super(ModuleWithCustomMetaclass, self).__init__()
+    self.init_name_scope = get_name_scope()
+
+
+class CustomMetaclassTest(test.TestCase):
+
+  def testSupportsCustomMetaclass(self):
+    m = ModuleWithCustomMetaclass()
+    self.assertEqual(m.init_name_scope, "module_with_custom_metaclass/")
+    self.assertTrue(getattr(ModuleWithCustomMetaclass, CombiningMetaclass.TAG))
+    self.assertTrue(getattr(ModuleWithCustomMetaclass, CustomMetaclass.TAG))
+
 if __name__ == "__main__":
   v2_compat.enable_v2_behavior()
   test.main()
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 45e741ef222b1dcde21b66ab6cdc3db9576a85ce..57572328f6d4a4f8284887f86ce3694bf0551f37 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -839,16 +839,11 @@ def _ScatterNdNonAliasingAddGrad(op, grad):
 def _BroadcastToGrad(op, grad):
   input_value = op.inputs[0]
   broadcast_shape = op.inputs[1]
-  # Assign ids for each position in input_value.
   input_value_shape = array_ops.shape(input_value)
-  input_value_size = array_ops.size(input_value)
-  ids = array_ops.reshape(math_ops.range(input_value_size), input_value_shape)
-  broadcast_ids = array_ops.broadcast_to(ids, broadcast_shape)
-  # Group by ids and sum its gradients.
-  grad_flatten = array_ops.reshape(grad, [-1])
-  broadcast_ids_flatten = array_ops.reshape(broadcast_ids, [-1])
-  updates_grad_flatten = math_ops.unsorted_segment_sum(grad_flatten,
-                                                       broadcast_ids_flatten,
-                                                       input_value_size)
-  updates_grad = array_ops.reshape(updates_grad_flatten, input_value_shape)
+  _, reduction_axes = gen_array_ops.broadcast_gradient_args(broadcast_shape,
+                                                            input_value_shape)
+  updates_grad_reshaped = math_ops.reduce_sum(grad,
+                                              axis=reduction_axes,
+                                              keepdims=True)
+  updates_grad = array_ops.reshape(updates_grad_reshaped, input_value_shape)
   return [updates_grad, None]
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 230ce62db7ece06c682a77f73898b9b4686693d0..8db23c467a85649422d376bcf8b1289aa3937e19 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -67,9 +67,9 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
   Returns:
     A `Tensor`. Has the same type as `input`.
   """
-  if context.executing_eagerly():
+  if context.executing_eagerly() and not hasattr(input, "graph"):
     input = ops.convert_to_tensor(input)
-    in_device = input.device
+    in_device = input.backing_device
     # TODO(ashankar): Does 'identity' need to invoke execution callbacks?
     context_device = context.context().device_name
     if not context_device:
@@ -3278,8 +3278,8 @@ def gather(params,
            batch_dims=0):
   r"""Gather slices from params axis axis according to indices.
 
-  Gather slices from params axis axis according to indices.  `indices` must be
-  an integer tensor of any dimension (usually 0-D or 1-D).
+  Gather slices from params axis `axis` according to `indices`.  `indices` must
+  be an integer tensor of any dimension (usually 0-D or 1-D).
 
   For 0-D (scalar) `indices`:
 
@@ -3336,17 +3336,17 @@ def gather(params,
       to `batch_dims`.  Defaults to the first non-batch dimension. Supports
       negative indexes.
     batch_dims: An `integer`.  The number of batch dimensions.  Must be less
-      than `ndims(inices)`.
+      than `rank(indices)`.
 
   Returns:
     A `Tensor`. Has the same type as `params`.
   """
   del validate_indices
-  if axis is None:
-    axis = batch_dims
   if batch_dims != 0:
     with ops.name_scope(name, "Gather", [params, indices, axis]):
       return _batch_gather(params, indices, batch_dims, axis)
+  if axis is None:
+    axis = batch_dims
   if axis != 0:
     # Note that we do a sparse_read here to avoid snapshotting the entire
     # resource variable and doing a gather, which can be inefficient and lead to
@@ -3393,15 +3393,15 @@ def _batch_gather(params, indices, batch_dims, axis=None):
   This operation assumes that the leading `batch_dims` dimensions of `indices`
   and `params` are batch dimensions; and performs a `tf.gather` operation within
   each batch. (If `batch_dims` is not specified, then it defaults to
-  `ndims(indices) - 1`.)  In the case in which `batch_dims==0`, this operation
+  `rank(indices)-1`.)  In the case in which `batch_dims==0`, this operation
   is equivalent to `tf.gather`.
 
   Args:
     params: A Tensor. The tensor from which to gather values.
     indices: A Tensor. Must be one of the following types: int32, int64. Index
       tensor. Must be in range `[0, params.shape[batch_dims]]`.
-    batch_dims: An integer.  The number of batch dimensions.  Must be less than
-      ndims(inices).  Defaults to `ndims(indices) - 1` if not specified.
+    batch_dims: An integer or none.  The number of batch dimensions.  Must be
+      less than `rank(indices)`.  Defaults to `rank(indices) - 1` if None.
     axis: A `Tensor`. Must be one of the following types: `int32`, `int64`. The
       `axis` in `params` to gather `indices` from. Must be greater than or equal
       to `batch_dims`.  Defaults to the first non-batch dimension. Supports
@@ -3427,10 +3427,10 @@ def _batch_gather(params, indices, batch_dims, axis=None):
   if batch_dims < 0:
     batch_dims += indices_ndims
   if batch_dims < 0 or batch_dims >= indices_ndims:
-    raise ValueError("batch_dims = %d must be less than ndims(indices) = %d" %
+    raise ValueError("batch_dims = %d must be less than rank(indices) = %d" %
                      (batch_dims, indices_ndims))
   if params.shape.ndims is not None and batch_dims >= params.shape.ndims:
-    raise ValueError("batch_dims = %d must be less than ndims(params) = %d" %
+    raise ValueError("batch_dims = %d must be less than rank(params) = %d" %
                      (batch_dims, params.shape.ndims))
 
   # Handle axis by transposing the axis dimension to be the first non-batch
diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py
index c182874c7f2d77b317f42a0cbfadb7435534f747..d154b6759bfbc50ad2e5ea34e4f04b945ef2d397 100644
--- a/tensorflow/python/ops/bitwise_ops_test.py
+++ b/tensorflow/python/ops/bitwise_ops_test.py
@@ -35,7 +35,6 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
     super(BitwiseOpTest, self).__init__(method_name)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testBinaryOps(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
                   dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
@@ -73,7 +72,6 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(truth, popcnt_result)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testInvertOp(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
                   dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
@@ -99,7 +97,6 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
           self.assertAllEqual(inverted, expected)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testShiftsWithPositiveLHS(self):
     dtype_list = [np.int8, np.int16, np.int32, np.int64,
                   np.uint8, np.uint16, np.uint32, np.uint64]
diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py
index f6c37027ae80d95d0cbc5c3df3f68543798b06c2..362c17ec6f40f7ea93802ef651365e7b4d061822 100644
--- a/tensorflow/python/ops/boosted_trees_ops.py
+++ b/tensorflow/python/ops/boosted_trees_ops.py
@@ -43,7 +43,7 @@ from tensorflow.python.ops.gen_boosted_trees_ops import is_boosted_trees_quantil
 # pylint: enable=unused-import
 
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 
 class PruningMode(object):
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index f1f36269cf2bd9bcd3d25638a82d776850bc6bb8..b452b4a0f341738aac1da7c7b78ba99a5a469e70 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -1526,6 +1526,25 @@ def _get_diff_for_monotonic_comparison(x):
     v1=['debugging.is_numeric_tensor', 'is_numeric_tensor'])
 @deprecation.deprecated_endpoints('is_numeric_tensor')
 def is_numeric_tensor(tensor):
+  """Returns `True` if the elements of `tensor` are numbers.
+
+  Specifically, returns `True` if the dtype of `tensor` is one of the following:
+
+  * `tf.float32`
+  * `tf.float64`
+  * `tf.int8`
+  * `tf.int16`
+  * `tf.int32`
+  * `tf.int64`
+  * `tf.uint8`
+  * `tf.qint8`
+  * `tf.qint32`
+  * `tf.quint8`
+  * `tf.complex64`
+
+  Returns `False` if `tensor` is of a non-numeric type or if `tensor` is not
+  a `tf.Tensor` object.
+  """
   return isinstance(tensor, ops.Tensor) and tensor.dtype in NUMERIC_TYPES
 
 
@@ -1702,7 +1721,7 @@ def assert_scalar_v2(tensor, message=None, name=None):
 @tf_export(v1=['debugging.assert_scalar', 'assert_scalar'])
 @deprecation.deprecated_endpoints('assert_scalar')
 def assert_scalar(tensor, name=None, message=None):
-  """Asserts that the given `tensor` is a scalar.
+  """Asserts that the given `tensor` is a scalar (i.e. zero-dimensional).
 
   This function raises `ValueError` unless it can be certain that the given
   `tensor` is a scalar. `ValueError` is also raised if the shape of `tensor` is
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index a237cfff826bf0fb4cacd0c25fe5d361e3d7b26e..767dcb9358e99f866d628b2aac3bd85ec78ef65a 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -126,8 +126,8 @@ def clip_by_norm(t, clip_norm, axes=None, name=None):
   In this case, the L2-norm of the output tensor is `clip_norm`.
 
   As another example, if `t` is a matrix and `axes == [1]`, then each row
-  of the output will have L2-norm equal to `clip_norm`. If `axes == [0]`
-  instead, each column of the output will be clipped.
+  of the output will have L2-norm less than or equal to `clip_norm`. If
+  `axes == [0]` instead, each column of the output will be clipped.
 
   This operation is typically used to clip gradients before applying them with
   an optimizer.
diff --git a/tensorflow/python/ops/collective_ops.py b/tensorflow/python/ops/collective_ops.py
index 98668facd5bc56892fa00f258dfebcbe93c063da..32a71fc25d370f4e96ef4036f4fdee3c670502d2 100644
--- a/tensorflow/python/ops/collective_ops.py
+++ b/tensorflow/python/ops/collective_ops.py
@@ -48,7 +48,7 @@ def all_reduce(t, group_size, group_key, instance_key, merge_op, final_op,
   if not device.canonical_name(t.device):
     raise ValueError('Device assignment required for collective ops')
   if group_size <= 1:
-    raise ValueError('Parameter group_size to add_reduce must be at least 2.')
+    raise ValueError('Parameter group_size to all_reduce must be at least 2.')
   return gen_collective_ops.collective_reduce(t,
                                               group_size=group_size,
                                               group_key=group_key,
@@ -58,6 +58,35 @@ def all_reduce(t, group_size, group_key, instance_key, merge_op, final_op,
                                               subdiv_offsets=subdiv_offsets)
 
 
+def all_gather(t, group_size, group_key, instance_key):
+  """Accumulates tensors collectively, across devices, along first dimension.
+
+  Args:
+    t: the tensor to participate in the accumulation.
+    group_size: the total number of tensors to be collectively accumulated.
+      Each must reside on a different device.
+    group_key: an integer identifying the group of devices.
+    instance_key: an integer identifying the participating group of Ops.
+
+  Returns:
+    An Op implementing the distributed operation.
+
+  Raises:
+    ValueError: if any of the input parameter constraints are not met.
+  """
+  if not device.canonical_name(t.device):
+    raise ValueError('Device assignment required for collective ops')
+  if group_size <= 1:
+    raise ValueError('Parameter group_size to all_gather must be at least 2.')
+  dims = t.shape.as_list()
+  output_shape = [dims[0] * group_size] + dims[1:]
+  return gen_collective_ops.collective_gather(t,
+                                              shape=output_shape,
+                                              group_size=group_size,
+                                              group_key=group_key,
+                                              instance_key=instance_key)
+
+
 def broadcast_send(t, shape, dtype, group_size, group_key, instance_key):
   """Broadcasts one tensor to a group of others, across devices.
 
diff --git a/tensorflow/python/ops/collective_ops_test.py b/tensorflow/python/ops/collective_ops_test.py
index 9c5a39b90e0e163f559524e33f7deb04794c1d0d..c9b376caf8f07236c4dde2bda2ba43c89a0ad8c2 100644
--- a/tensorflow/python/ops/collective_ops_test.py
+++ b/tensorflow/python/ops/collective_ops_test.py
@@ -4,7 +4,7 @@
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -25,8 +25,6 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import collective_ops
 from tensorflow.python.platform import test
 
-# TODO(tucker): Make these ops work in eager mode. b/79776476
-
 
 class CollectiveOpTest(test.TestCase):
 
@@ -114,6 +112,42 @@ class CollectiveOpTest(test.TestCase):
   def testCollectiveBroadcast(self):
     self._testCollectiveBroadcast([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1])
 
+  def _testCollectiveGather(self, t0, t1, expected, set_graph_key):
+    group_key = 1
+    instance_key = 1
+    with self.session(
+        config=config_pb2.ConfigProto(device_count={'CPU': 2})) as sess:
+      with ops.device('/CPU:0'):
+        in0 = constant_op.constant(t0)
+        colred0 = collective_ops.all_gather(in0, 2, group_key, instance_key)
+      with ops.device('/CPU:1'):
+        in1 = constant_op.constant(t1)
+        colred1 = collective_ops.all_gather(in1, 2, group_key, instance_key)
+      run_options = config_pb2.RunOptions()
+      if set_graph_key:
+        run_options.experimental.collective_graph_key = 1
+      results = sess.run([colred0, colred1], options=run_options)
+    self.assertAllClose(results[0], expected, rtol=1e-5, atol=1e-5)
+    self.assertAllClose(results[1], expected, rtol=1e-5, atol=1e-5)
+
+  @test_util.run_deprecated_v1
+  def testCollectiveGather(self):
+    self._testCollectiveGather([0, 1, 2, 3, 4, 5, 6, 7],
+                               [10, 11, 12, 13, 14, 15, 16, 17],
+                               [0, 1, 2, 3, 4, 5, 6, 7,
+                                10, 11, 12, 13, 14, 15, 16, 17],
+                               True)
+    self._testCollectiveGather([[0, 1, 2, 3], [4, 5, 6, 7]],
+                               [[10, 11, 12, 13], [14, 15, 16, 17]],
+                               [[0, 1, 2, 3], [4, 5, 6, 7],
+                                [10, 11, 12, 13], [14, 15, 16, 17]],
+                               True)
+    self._testCollectiveGather([[[0, 1], [2, 3]], [[4, 5], [6, 7]]],
+                               [[[10, 11], [12, 13]], [[14, 15], [16, 17]]],
+                               [[[0, 1], [2, 3]], [[4, 5], [6, 7]],
+                                [[10, 11], [12, 13]], [[14, 15], [16, 17]]],
+                               True)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index e0b83c490caf261ef71ef988d3436f6b2ab615dc..32a5db2c1ae0687f3c9954e943735a7748a2b777 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -3443,6 +3443,12 @@ def while_loop(cond,
         if try_to_pack and not isinstance(loop_vars, (list, _basetuple)):
           packed = True
           loop_vars = (loop_vars,)
+
+      def convert(x):
+        if isinstance(x, tensor_array_ops.TensorArray):
+          return x
+        return ops.convert_to_tensor(x)
+      loop_vars = nest.map_structure(convert, loop_vars)
       if maximum_iterations is not None:
         return loop_vars[1]
       else:
@@ -3619,7 +3625,7 @@ def group(*inputs, **kwargs):
       """A sort key that allows None to be compared to strings."""
       return "" if dev is None else dev
 
-    for dev in sorted(six.iterkeys(ops_on_device), key=device_key):
+    for dev in sorted(ops_on_device, key=device_key):
       deps.append(_GroupControlDeps(dev, ops_on_device[dev]))
 
     with ops.control_dependencies(deps):
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_ops.py b/tensorflow/python/ops/critical_section_ops.py
similarity index 92%
rename from tensorflow/contrib/framework/python/ops/critical_section_ops.py
rename to tensorflow/python/ops/critical_section_ops.py
index 71ab755aa2948c548db89b330bb93c9524412fa6..21872ffff139b3f5b74d044746a83f3ce5ab265b 100644
--- a/tensorflow/contrib/framework/python/ops/critical_section_ops.py
+++ b/tensorflow/python/ops/critical_section_ops.py
@@ -31,6 +31,10 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+__all__ = ["CriticalSection"]
 
 
 # Graph Keys
@@ -66,6 +70,7 @@ def _get_colocation(op):
     return None
 
 
+@tf_export("CriticalSection")
 class CriticalSection(object):
   """Critical section.
 
@@ -179,37 +184,36 @@ class CriticalSection(object):
   def name(self):
     return self._handle.op.name
 
-  def execute(self, fn, *args, **kwargs):
-    """Execute function `fn(*args, **kwargs)` inside the CriticalSection.
+  def execute(self, fn, exclusive_resource_access=True, name=None):
+    """Execute function `fn()` inside the critical section.
+
+    `fn` should not accept any arguments.  To add extra arguments to when
+    calling `fn` in the critical section, create a lambda:
+
+    ```python
+    critical_section.execute(lambda: fn(*my_args, **my_kwargs))
+    ```
 
     Args:
       fn: The function to execute.  Must return at least one tensor.
-      *args: Additional positional arguments to `fn`.
-      **kwargs: Additional keyword arguments to `fn`.
-        Several keywords are reserved for `execute`.  These are:
-
-        - name; The name to use when creating the execute operation.
-        - exclusive_resource_access; Whether the resources required by
-          `fn` should be exclusive to this `CriticalSection`.  Default: `True`.
-          You may want to set this to `False` if you will be accessing a
-          resource in read-only mode in two different CriticalSections.
+      exclusive_resource_access: Whether the resources required by
+        `fn` should be exclusive to this `CriticalSection`.  Default: `True`.
+        You may want to set this to `False` if you will be accessing a
+        resource in read-only mode in two different CriticalSections.
+      name: The name to use when creating the execute operation.
 
     Returns:
-      The tensors returned from `fn(*args, **kwargs)`.
+      The tensors returned from `fn()`.
 
     Raises:
       ValueError: If `fn` attempts to lock this `CriticalSection` in any nested
         or lazy way that may cause a deadlock.
-      ValueError: If `exclusive_resource_access` is not provided (is `True`) and
+      ValueError: If `exclusive_resource_access == True` and
         another `CriticalSection` has an execution requesting the same
-        resources as in `*args`, `**kwargs`, and any additionally captured
-        inputs in `fn`.  Note, even if `exclusive_resource_access` is `True`,
-        if another execution in another `CriticalSection` was created without
-        `exclusive_resource_access=True`, a `ValueError` will be raised.
+        resources as `fn``.  Note, even if `exclusive_resource_access` is
+        `True`, if another execution in another `CriticalSection` was created
+        without `exclusive_resource_access=True`, a `ValueError` will be raised.
     """
-    name = kwargs.pop("name", None)
-    exclusive_resource_access = kwargs.pop("exclusive_resource_access", True)
-
     with ops.name_scope(name, "critical_section_execute", []):
 
       # Ensure that mutex locking only happens *after* all args and
@@ -222,7 +226,7 @@ class CriticalSection(object):
         with ops.get_default_graph()._lock:  # pylint: disable=protected-access
           existing_ops = ops.get_default_graph().get_operations()
           with ops.control_dependencies([lock]):
-            r = fn(*args, **kwargs)
+            r = fn()
           # TODO(ebrevdo): If creating critical sections in a python loop, this
           # makes graph creation time quadratic.  Revisit if this
           # becomes a problem.
@@ -230,7 +234,7 @@ class CriticalSection(object):
                          .difference(existing_ops))
       else:
         with ops.control_dependencies([lock]):
-          r = fn(*args, **kwargs)
+          r = fn()
 
       if not context.executing_eagerly():
         self._add_control_dependencies_to_lock(created_ops, lock.op)
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index 5fc3fdcc3e108d3cc90dd118d9a6274b4f8e4704..a2b7767dcd72e1b71251cd2d8d912bf942e2e4a5 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -185,7 +185,7 @@ def _graph_mode_decorator(f, *args, **kwargs):
                    current_var_scope.local_variables())
   new_vars = after_vars - before_vars
   for v in new_vars:
-    if not isinstance(v, resource_variable_ops.ResourceVariable):
+    if not resource_variable_ops.is_resource_variable(v):
       raise TypeError(
           "All variables used by a function wrapped with @custom_gradient must "
           "be `ResourceVariable`s. Ensure that no `variable_scope` is created "
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index d0291e2095bdb6574c707c7458e4cc335fc4b825..881466cb23a65ec7d1946183574eb3b76f1a08fc 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -558,7 +558,7 @@ def embedding_lookup_sparse_v2(params,
                                combiner=None,
                                max_norm=None,
                                name=None):
-  return embedding_lookup_sparse_v2(
+  return embedding_lookup_sparse(
       params, sp_ids, sp_weights, partition_strategy, name, combiner, max_norm)
 
 
diff --git a/tensorflow/python/ops/gradients_util.py b/tensorflow/python/ops/gradients_util.py
index 4f73423d75e7c878549f5c41b04cb907dd9363bc..64c199ad29d421ceaa7364fe229b846d76ca2f6b 100644
--- a/tensorflow/python/ops/gradients_util.py
+++ b/tensorflow/python/ops/gradients_util.py
@@ -23,7 +23,6 @@ import contextlib
 import warnings
 
 import numpy as np
-import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.core.framework import attr_value_pb2
@@ -912,7 +911,7 @@ def _MultiDeviceAddN(tensor_list, gradient_uid):
   def DeviceKey(dev):
     return "" if dev is None else dev
 
-  for dev in sorted(six.iterkeys(tensors_on_device), key=DeviceKey):
+  for dev in sorted(tensors_on_device, key=DeviceKey):
     tensors = tensors_on_device[dev]
     with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
         tensors[0].op,
diff --git a/tensorflow/python/ops/image_grad_test.py b/tensorflow/python/ops/image_grad_test.py
index c481266dd71c1300612dbc384d240d34b98b3599..f363f1b24a058b12740aabf06e93b562475285fa 100644
--- a/tensorflow/python/ops/image_grad_test.py
+++ b/tensorflow/python/ops/image_grad_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import image_ops
 from tensorflow.python.platform import test
 
 
+@test_util.disable_all_xla("b/124289666")  # align_corners=False unimplemented
 class ResizeNearestNeighborOpTest(test.TestCase):
 
   TYPES = [np.float32, np.float64]
@@ -149,6 +150,7 @@ class ResizeBilinearOpTest(test.TestCase):
     self.assertLess(err, 1e-3)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/124290659")  # align_corners=False unimplemented
   def testCompareGpuVsCpu(self):
     in_shape = [2, 4, 6, 3]
     out_shape = [2, 8, 16, 3]
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index b032d6454da1ac03858cb9e093960a23d30606a2..490e80e09dbf026552f182d065a8dde9d8e5a429 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -2399,6 +2399,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(img_shape, newshape)
         self.assertAllClose(resized, img_np, atol=1e-5)
 
+  @test_util.disable_xla("b/124289666")  # align_corners=False unimplemented
   def testResizeDown(self):
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
@@ -2430,6 +2431,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
               resized = self.evaluate(y)
               self.assertAllClose(resized, expected, atol=1e-5)
 
+  @test_util.disable_xla("b/124289666")  # align_corners=False unimplemented
   def testResizeUpAlignCornersFalse(self):
     img_shape = [1, 3, 2, 1]
     data = [64, 32, 32, 64, 50, 100]
@@ -2553,6 +2555,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
       resized = self.evaluate(y)
       self.assertAllClose(resized, expected, atol=1)
 
+  @test_util.disable_xla("b/124289666")  # align_corners=False unimplemented
   def testCompareNearestNeighbor(self):
     if test.is_gpu_available():
       input_shape = [1, 5, 6, 3]
@@ -3612,7 +3615,8 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
     # If we negate all pixel-values then the total variation is unchanged.
     self._test(-a, tot_var)
 
-    # Scale the pixel-values by a float. This scales the total variation as well.
+    # Scale the pixel-values by a float. This scales the total variation as
+    # well.
     b = 1.1 * a
     self._test(b, 1.1 * tot_var)
 
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index a4cebc8d5891da23e9c1042b478dcabe9b7994a0..035534ef49cc4c715b2101beb98e1d1aa6a72071 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -38,6 +38,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops_impl
@@ -531,7 +532,7 @@ class VarianceScaling(Initializer):
     else:
       scale /= max(1., (fan_in + fan_out) / 2.)
     if self.distribution == "normal" or self.distribution == "truncated_normal":
-    # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
+      # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
       stddev = math.sqrt(scale) / .87962566103423978
       return random_ops.truncated_normal(
           shape, 0.0, stddev, dtype, seed=self.seed)
@@ -605,7 +606,8 @@ class Orthogonal(Initializer):
     num_rows = 1
     for dim in shape[:-1]:
       num_rows *= dim
-    num_cols = shape[-1]
+    num_rows = int(num_rows)
+    num_cols = int(shape[-1])
     if num_rows < num_cols:
       flat_shape = (num_cols, num_rows)
     else:
@@ -1207,6 +1209,8 @@ class Identity(Initializer):
           "Identity matrix initializer can only be used for 2D matrices.")
     if dtype is None:
       dtype = self.dtype
+    if isinstance(full_shape, tensor_shape.TensorShape):
+      full_shape = full_shape.as_list()
     initializer = linalg_ops_impl.eye(*full_shape, dtype=dtype)
     if partition_info is not None:
       initializer = array_ops.slice(initializer, partition_info.var_offset,
@@ -1263,9 +1267,10 @@ class GlorotNormal(VarianceScaling):
   """The Glorot normal initializer, also called Xavier normal initializer.
 
   It draws samples from a truncated normal distribution centered on 0
-  with `stddev = sqrt(2 / (fan_in + fan_out))`
-  where `fan_in` is the number of input units in the weight tensor
-  and `fan_out` is the number of output units in the weight tensor.
+  with standard deviation (after truncation) given by
+  `stddev = sqrt(2 / (fan_in + fan_out))` where `fan_in` is the number
+  of input units in the weight tensor and `fan_out` is the number of
+  output units in the weight tensor.
 
   Args:
     seed: A Python integer. Used to create random seeds. See
@@ -1321,8 +1326,9 @@ def lecun_normal(seed=None):
   """LeCun normal initializer.
 
   It draws samples from a truncated normal distribution centered on 0
-  with `stddev = sqrt(1 / fan_in)`
-  where `fan_in` is the number of input units in the weight tensor.
+  with standard deviation (after truncation) given by
+  `stddev = sqrt(1 / fan_in)` where `fan_in` is the number of
+  input units in the weight tensor.
 
   Arguments:
       seed: A Python integer. Used to seed the random generator.
@@ -1371,8 +1377,9 @@ def he_normal(seed=None):
   """He normal initializer.
 
   It draws samples from a truncated normal distribution centered on 0
-  with `stddev = sqrt(2 / fan_in)`
-  where `fan_in` is the number of input units in the weight tensor.
+  with standard deviation (after truncation) given by
+  `stddev = sqrt(2 / fan_in)` where `fan_in` is the number of
+  input units in the weight tensor.
 
   Arguments:
       seed: A Python integer. Used to seed the random generator.
@@ -1422,7 +1429,7 @@ def _compute_fans(shape):
     shape: Integer shape tuple or TF tensor shape.
 
   Returns:
-    A tuple of scalars (fan_in, fan_out).
+    A tuple of integer scalars (fan_in, fan_out).
   """
   if len(shape) < 1:  # Just to avoid errors for constants.
     fan_in = fan_out = 1
@@ -1434,12 +1441,12 @@ def _compute_fans(shape):
   else:
     # Assuming convolution kernels (2D, 3D, or more).
     # kernel shape: (..., input_depth, depth)
-    receptive_field_size = 1.
+    receptive_field_size = 1
     for dim in shape[:-2]:
       receptive_field_size *= dim
     fan_in = shape[-2] * receptive_field_size
     fan_out = shape[-1] * receptive_field_size
-  return fan_in, fan_out
+  return int(fan_in), int(fan_out)
 
 
 def _assert_float_dtype(dtype):
diff --git a/tensorflow/python/ops/init_ops_test.py b/tensorflow/python/ops/init_ops_test.py
index b3cdec9dd407c26277ed2d710397a0a831d75e16..1205f367bc99c8e07b97d45b6e4ae7089a089e13 100644
--- a/tensorflow/python/ops/init_ops_test.py
+++ b/tensorflow/python/ops/init_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape as tensor_shape_lib
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
@@ -54,114 +55,126 @@ class InitializersTest(test.TestCase):
       self.assertGreater(lim, abs(output.min() - target_min))
 
   def test_uniform(self):
-    tensor_shape = (9, 6, 7)
+    shape = (9, 6, 99)
     with self.cached_session():
-      self._runner(
-          init_ops.RandomUniform(minval=-1, maxval=1, seed=124),
-          tensor_shape,
-          target_mean=0.,
-          target_max=1,
-          target_min=-1)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.RandomUniform(minval=-1, maxval=1, seed=124),
+            tensor_shape,
+            target_mean=0.,
+            target_max=1,
+            target_min=-1)
 
   def test_normal(self):
-    tensor_shape = (8, 12, 99)
+    shape = (8, 12, 99)
     with self.cached_session():
-      self._runner(
-          init_ops.RandomNormal(mean=0, stddev=1, seed=153),
-          tensor_shape,
-          target_mean=0.,
-          target_std=1)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.RandomNormal(mean=0, stddev=1, seed=153),
+            tensor_shape,
+            target_mean=0.,
+            target_std=1)
 
   def test_truncated_normal(self):
-    tensor_shape = (12, 99, 7)
+    shape = (12, 99, 7)
     with self.cached_session():
-      self._runner(
-          init_ops.TruncatedNormal(mean=0, stddev=1, seed=126),
-          tensor_shape,
-          target_mean=0.,
-          target_max=2,
-          target_min=-2)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.TruncatedNormal(mean=0, stddev=1, seed=126),
+            tensor_shape,
+            target_mean=0.,
+            target_max=2,
+            target_min=-2)
 
   def test_constant(self):
-    tensor_shape = (5, 6, 4)
+    shape = (5, 6, 4)
     with self.cached_session():
-      self._runner(
-          init_ops.Constant(2),
-          tensor_shape,
-          target_mean=2,
-          target_max=2,
-          target_min=2)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.Constant(2),
+            tensor_shape,
+            target_mean=2,
+            target_max=2,
+            target_min=2)
 
   def test_lecun_uniform(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(1. / fan_in)
-      self._runner(
-          init_ops.lecun_uniform(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, _ = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(1. / fan_in)
+        self._runner(
+            init_ops.lecun_uniform(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_glorot_uniform_initializer(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, fan_out = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / (fan_in + fan_out))
-      self._runner(
-          init_ops.glorot_uniform_initializer(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, fan_out = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(2. / (fan_in + fan_out))
+        self._runner(
+            init_ops.glorot_uniform_initializer(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_he_uniform(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / fan_in)
-      self._runner(
-          init_ops.he_uniform(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, _ = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(2. / fan_in)
+        self._runner(
+            init_ops.he_uniform(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_lecun_normal(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(1. / fan_in)
-      self._runner(
-          init_ops.lecun_normal(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, _ = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(1. / fan_in)
+        self._runner(
+            init_ops.lecun_normal(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_glorot_normal_initializer(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, fan_out = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / (fan_in + fan_out))
-      self._runner(
-          init_ops.glorot_normal_initializer(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, fan_out = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(2. / (fan_in + fan_out))
+        self._runner(
+            init_ops.glorot_normal_initializer(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_he_normal(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / fan_in)
-      self._runner(
-          init_ops.he_normal(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, _ = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(2. / fan_in)
+        self._runner(
+            init_ops.he_normal(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_Orthogonal(self):
-    tensor_shape = (20, 20)
+    shape = (20, 20)
     with self.cached_session():
-      self._runner(init_ops.Orthogonal(seed=123), tensor_shape, target_mean=0.)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.Orthogonal(seed=123), tensor_shape, target_mean=0.)
 
   def testVariablePlacementWithOrthogonalInitializer(self):
     if not context.context().num_gpus():
@@ -199,31 +212,36 @@ class InitializersTest(test.TestCase):
 
   def test_Identity(self):
     with self.cached_session():
-      tensor_shape = (3, 4, 5)
-      with self.assertRaises(ValueError):
+      shape = (3, 4, 5)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        with self.assertRaises(ValueError):
+          self._runner(
+              init_ops.Identity(),
+              tensor_shape,
+              target_mean=1. / int(tensor_shape[0]),
+              target_max=1.)
+
+      shape = (3, 3)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
         self._runner(
             init_ops.Identity(),
             tensor_shape,
-            target_mean=1. / tensor_shape[0],
+            target_mean=1. / int(tensor_shape[0]),
             target_max=1.)
 
-      tensor_shape = (3, 3)
-      self._runner(
-          init_ops.Identity(),
-          tensor_shape,
-          target_mean=1. / tensor_shape[0],
-          target_max=1.)
-
   def test_Zeros(self):
-    tensor_shape = (4, 5)
+    shape = (4, 5)
     with self.cached_session():
-      self._runner(
-          init_ops.Zeros(), tensor_shape, target_mean=0., target_max=0.)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.Zeros(), tensor_shape, target_mean=0., target_max=0.)
 
   def test_Ones(self):
-    tensor_shape = (4, 5)
+    shape = (4, 5)
     with self.cached_session():
-      self._runner(init_ops.Ones(), tensor_shape, target_mean=1., target_max=1.)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.Ones(), tensor_shape, target_mean=1., target_max=1.)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/linalg/adjoint_registrations.py b/tensorflow/python/ops/linalg/adjoint_registrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..59ec97d2758f80aaa90c52646430b0d9c5e642bd
--- /dev/null
+++ b/tensorflow/python/ops/linalg/adjoint_registrations.py
@@ -0,0 +1,127 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Registrations for LinearOperator.adjoint."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_adjoint
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import linear_operator_block_diag
+from tensorflow.python.ops.linalg import linear_operator_circulant
+from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_kronecker
+
+
+# By default, return LinearOperatorAdjoint which switched the .matmul
+# and .solve methods.
+@linear_operator_algebra.RegisterAdjoint(linear_operator.LinearOperator)
+def _adjoint_linear_operator(linop):
+  return linear_operator_adjoint.LinearOperatorAdjoint(
+      linop,
+      is_non_singular=linop.is_non_singular,
+      is_self_adjoint=linop.is_self_adjoint,
+      is_positive_definite=linop.is_positive_definite,
+      is_square=linop.is_square)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_adjoint.LinearOperatorAdjoint)
+def _adjoint_adjoint_linear_operator(linop):
+  return linop.operator
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_identity.LinearOperatorIdentity)
+def _adjoint_identity(identity_operator):
+  return identity_operator
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_identity.LinearOperatorScaledIdentity)
+def _adjoint_scaled_identity(identity_operator):
+  multiplier = identity_operator.multiplier
+  if multiplier.dtype.is_complex:
+    multiplier = math_ops.conj(multiplier)
+
+  return linear_operator_identity.LinearOperatorScaledIdentity(
+      num_rows=identity_operator._num_rows,  # pylint: disable=protected-access
+      multiplier=multiplier,
+      is_non_singular=identity_operator.is_non_singular,
+      is_self_adjoint=identity_operator.is_self_adjoint,
+      is_positive_definite=identity_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_diag.LinearOperatorDiag)
+def _adjoint_diag(diag_operator):
+  diag = diag_operator.diag
+  if diag.dtype.is_complex:
+    diag = math_ops.conj(diag)
+
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=diag,
+      is_non_singular=diag_operator.is_non_singular,
+      is_self_adjoint=diag_operator.is_self_adjoint,
+      is_positive_definite=diag_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_block_diag.LinearOperatorBlockDiag)
+def _adjoint_block_diag(block_diag_operator):
+    # We take the adjoint of each block on the diagonal.
+  return linear_operator_block_diag.LinearOperatorBlockDiag(
+      operators=[
+          operator.adjoint() for operator in block_diag_operator.operators],
+      is_non_singular=block_diag_operator.is_non_singular,
+      is_self_adjoint=block_diag_operator.is_self_adjoint,
+      is_positive_definite=block_diag_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_kronecker.LinearOperatorKronecker)
+def _adjoint_kronecker(kronecker_operator):
+    # Adjoint of a Kronecker product is the Kronecker product
+    # of adjoints.
+  return linear_operator_kronecker.LinearOperatorKronecker(
+      operators=[
+          operator.adjoint() for operator in kronecker_operator.operators],
+      is_non_singular=kronecker_operator.is_non_singular,
+      is_self_adjoint=kronecker_operator.is_self_adjoint,
+      is_positive_definite=kronecker_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_circulant.LinearOperatorCirculant)
+def _adjoint_circulant(circulant_operator):
+  spectrum = circulant_operator.spectrum
+  if spectrum.dtype.is_complex:
+    spectrum = math_ops.conj(spectrum)
+
+  # Conjugating the spectrum is sufficient to get the adjoint.
+  return linear_operator_circulant.LinearOperatorCirculant(
+      spectrum=spectrum,
+      is_non_singular=circulant_operator.is_non_singular,
+      is_self_adjoint=circulant_operator.is_self_adjoint,
+      is_positive_definite=circulant_operator.is_positive_definite,
+      is_square=True)
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index eebe741337d8eefae44e5206ce990edbf261bdd9..b9f8411c934aabfa30de2d684d5afcb354401509 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import,unused-import
+from tensorflow.python.ops.linalg import adjoint_registrations as _adjoint_registrations
 from tensorflow.python.ops.linalg import cholesky_registrations as _cholesky_registrations
 from tensorflow.python.ops.linalg import inverse_registrations as _inverse_registrations
 from tensorflow.python.ops.linalg import linear_operator_algebra as _linear_operator_algebra
diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index 2259eaa65cd1a857e369ee8673165c76c882df7e..fec2b2713e3709d9104204412a8c52fd062e9336 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_linalg_ops
@@ -329,3 +330,189 @@ def matrix_exponential(input, name=None):  # pylint: disable=redefined-builtin
           result,
           array_ops.concat((batch_shape, array_ops.shape(result)[-2:]), axis=0))
     return array_ops.reshape(result, batch_shape.concatenate(result.shape[-2:]))
+
+
+@tf_export('linalg.tridiagonal_solve')
+def tridiagonal_solve(diagonals,
+                      rhs,
+                      diagonals_format='compact',
+                      transpose_rhs=False,
+                      conjugate_rhs=False,
+                      name=None):
+  r"""Solves tridiagonal systems of equations.
+
+  Solution is computed via Gaussian elemination with partial pivoting.
+
+  The input can be supplied in various formats: `matrix`, `tuple` and `compact`,
+  specified by the `diagonals_format` arg.
+
+  In `matrix` format, `diagonals` must be a tensor of shape `[..., M, M]`, with
+  two inner-most dimensions representing the square tridiagonal matrices.
+  Elements outside of the three diagonals will be ignored.
+
+  In `sequence` format, `diagonals` are supplied as a tuple or list of three
+  tensors of shapes `[..., N]`, `[..., M]`, `[..., N]` representing
+  superdiagonals, diagonals, and subdiagonals, respectively. `N` can be either
+  `M-1` or `M`; in the latter case, the last element of superdiagonal and the
+  first element of subdiagonal will be ignored.
+
+  In `compact` format the three diagonals are brought together into one tensor
+  of shape `[..., 3, M]`, with last two dimensions containing superdiagonals,
+  diagonals, and subdiagonals, in order. Similarly to `sequence` format,
+  elements `diagonals[..., 0, M-1]` and `diagonals[..., 2, 0]` are ignored.
+
+  The `compact` format is recommended as the one with best performance. In case
+  you need to cast a tensor into a compact format manually, use `tf.gather_nd`.
+  An example for a tensor of shape [m, m]:
+
+  ```python
+  rhs = tf.constant([...])
+  matrix = tf.constant([[...]])
+  m = matrix.shape[0]
+  dummy_idx = [0, 0]  # An arbitrary element to use as a dummy
+  indices = [[[i, i + 1] for i in range(m - 1)] + [dummy_idx],  # Superdiagonal
+           [[i, i] for i in range(m)],                          # Diagonal
+           [dummy_idx] + [[i + 1, i] for i in range(m - 1)]]    # Subdiagonal
+  diagonals=tf.gather_nd(matrix, indices)
+  x = tf.linalg.tridiagonal_solve(diagonals, rhs)
+  ```
+
+  Regardless of the `diagonals_format`, `rhs` is a tensor of shape `[..., M]` or
+  `[..., M, K]`. The latter allows to simultaneously solve K systems with the
+  same left-hand sides and K different right-hand sides. If `transpose_rhs`
+  is set to `True` the expected shape is `[..., M]` or `[..., K, M]`.
+
+  The batch dimensions, denoted as `...`, must be the same in `diagonals` and
+  `rhs`.
+
+  The output is a tensor of the same shape as `rhs`: either `[..., M]` or
+  `[..., M, K]`.
+
+  Args:
+    diagonals: A `Tensor` or tuple of `Tensor`s describing left-hand sides. The
+      shape depends of `diagonals_format`, see description above. Must be
+      `float32`, `float64`, `complex64`, or `complex128`.
+    rhs: A `Tensor` of shape [..., M] or [..., M, K] and with the same dtype as
+      `diagonals`.
+    diagonals_format: one of `matrix`, `sequence`, or `compact`. Default is
+      `compact`.
+    transpose_rhs: If `True`, `rhs` is transposed before solving (has no effect
+      if the shape of rhs is [..., M]).
+    conjugate_rhs: If `True`, `rhs` is conjugated before solving.
+    name:  A name to give this `Op` (optional).
+
+  Returns:
+    A `Tensor` of shape [..., M] or [..., M, K] containing the solutions.
+
+  Raises:
+    ValueError: An unsupported type is provided as input, or when the input
+    tensors have incorrect shapes.
+
+  """
+  if diagonals_format == 'compact':
+    return _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs,
+                                             conjugate_rhs, name)
+
+  if diagonals_format == 'sequence':
+    if not isinstance(diagonals, (tuple, list)) or len(diagonals) != 3:
+      raise ValueError('Expected diagonals to be a sequence of length 3.')
+
+    superdiag, maindiag, subdiag = diagonals
+    if (not subdiag.shape[:-1].is_compatible_with(maindiag.shape[:-1]) or
+        not superdiag.shape[:-1].is_compatible_with(maindiag.shape[:-1])):
+      raise ValueError(
+          'Tensors representing the three diagonals must have the same shape,'
+          'except for the last dimension, got {}, {}, {}'.format(
+              subdiag.shape, maindiag.shape, superdiag.shape))
+
+    m = tensor_shape.dimension_value(maindiag.shape[-1])
+
+    def pad_if_necessary(t, name, last_dim_padding):
+      n = tensor_shape.dimension_value(t.shape[-1])
+      if not n or n == m:
+        return t
+      if n == m - 1:
+        paddings = (
+            [[0, 0] for _ in range(len(t.shape) - 1)] + [last_dim_padding])
+        return array_ops.pad(t, paddings)
+      raise ValueError('Expected {} to be have length {} or {}, got {}.'.format(
+          name, m, m - 1, n))
+
+    subdiag = pad_if_necessary(subdiag, 'subdiagonal', [1, 0])
+    superdiag = pad_if_necessary(superdiag, 'superdiagonal', [0, 1])
+
+    diagonals = array_ops.stack((superdiag, maindiag, subdiag), axis=-2)
+    return _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs,
+                                             conjugate_rhs, name)
+
+  if diagonals_format == 'matrix':
+    m1 = tensor_shape.dimension_value(diagonals.shape[-1])
+    m2 = tensor_shape.dimension_value(diagonals.shape[-2])
+    if m1 and m2 and m1 != m2:
+      raise ValueError(
+          'Expected last two dimensions of diagonals to be same, got {} and {}'
+          .format(m1, m2))
+    m = m1 or m2
+    if not m:
+      raise ValueError('The size of the matrix needs to be known for '
+                       'diagonals_format="matrix"')
+
+    # Extract diagonals; use input[..., 0, 0] as "dummy" m-th elements of sub-
+    # and superdiagonal.
+    # gather_nd slices into first indices, whereas we need to slice into the
+    # last two, so transposing back and forth is necessary.
+    dummy_idx = [0, 0]
+    indices = ([[[1, 0], [0, 0], dummy_idx]] + [
+        [[i + 1, i], [i, i], [i - 1, i]] for i in range(1, m - 1)
+    ] + [[dummy_idx, [m - 1, m - 1], [m - 2, m - 1]]])
+    diagonals = array_ops.transpose(
+        array_ops.gather_nd(array_ops.transpose(diagonals), indices))
+    return _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs,
+                                             conjugate_rhs, name)
+
+  raise ValueError('Unrecognized diagonals_format: {}'.format(diagonals_format))
+
+
+def _tridiagonal_solve_compact_format(diagonals,
+                                      rhs,
+                                      transpose_rhs=False,
+                                      conjugate_rhs=False,
+                                      name=None):
+  """Helper function used after the input has been cast to compact form."""
+  diags_rank, rhs_rank = len(diagonals.shape), len(rhs.shape)
+
+  if diags_rank < 2:
+    raise ValueError(
+        'Expected diagonals to have rank at least 2, got {}'.format(diags_rank))
+  if rhs_rank != diags_rank and rhs_rank != diags_rank - 1:
+    raise ValueError('Expected the rank of rhs to be {} or {}, got {}'.format(
+        diags_rank - 1, diags_rank, rhs_rank))
+  if diagonals.shape[-2] != 3:
+    raise ValueError('Expected 3 diagonals got {}'.format(diagonals.shape[-2]))
+  if not diagonals.shape[:-2].is_compatible_with(rhs.shape[:diags_rank - 2]):
+    raise ValueError('Batch shapes {} and {} are incompatible'.format(
+        diagonals.shape[:-2], rhs.shape[:diags_rank - 2]))
+
+  def check_num_lhs_matches_num_rhs():
+    if diagonals.shape[-1] != rhs.shape[-2]:
+      raise ValueError('Expected number of left-hand sided and right-hand '
+                       'sides to be equal, got {} and {}'.format(
+                           diagonals.shape[-1], rhs.shape[-2]))
+
+  if rhs_rank == diags_rank - 1:
+    # Rhs provided as a vector, ignoring transpose_rhs
+    if conjugate_rhs:
+      rhs = math_ops.conj(rhs)
+    rhs = array_ops.expand_dims(rhs, -1)
+    check_num_lhs_matches_num_rhs()
+    return array_ops.squeeze(
+        linalg_ops.tridiagonal_solve(diagonals, rhs, name), -1)
+
+  if transpose_rhs:
+    rhs = array_ops.matrix_transpose(rhs, conjugate=conjugate_rhs)
+  elif conjugate_rhs:
+    rhs = math_ops.conj(rhs)
+
+  check_num_lhs_matches_num_rhs()
+  result = linalg_ops.tridiagonal_solve(diagonals, rhs, name)
+  return array_ops.matrix_transpose(result) if transpose_rhs else result
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 4c99e86dc59a8c39abb57494ae84bcfdc13faa1b..8fa9f63e043a59da5b3ea425883cb954a065c1ee 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -847,6 +847,26 @@ class LinearOperator(object):
 
       return self._solvevec(rhs, adjoint=adjoint)
 
+  def adjoint(self, name="adjoint"):
+    """Returns the adjoint of the current `LinearOperator`.
+
+    Given `A` representing this `LinearOperator`, return `A*`.
+    Note that calling `self.adjoint()` and `self.H` are equivalent.
+
+    Args:
+      name:  A name for this `Op`.
+
+    Returns:
+      `LinearOperator` which represents the adjoint of this `LinearOperator`.
+    """
+    if self.is_self_adjoint is True:  # pylint: disable=g-bool-id-comparison
+      return self
+    with self._name_scope(name):
+      return linear_operator_algebra.adjoint(self)
+
+  # self.H is equivalent to self.adjoint().
+  H = property(adjoint, None)
+
   def inverse(self, name="inverse"):
     """Returns the Inverse of this `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_adjoint.py b/tensorflow/python/ops/linalg/linear_operator_adjoint.py
index 858e224b9adda57b4d472ae2f61b2b6cda74c243..7ee4752d264b73689c172240f10c89e1a52c5b68 100644
--- a/tensorflow/python/ops/linalg/linear_operator_adjoint.py
+++ b/tensorflow/python/ops/linalg/linear_operator_adjoint.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
@@ -83,7 +84,7 @@ class LinearOperatorAdjoint(linear_operator.LinearOperator):
     r"""Initialize a `LinearOperatorAdjoint`.
 
     `LinearOperatorAdjoint` is initialized with an operator `A`.  The `solve`
-    and `matmul` methods effectively flip the `adjoint` argument.  E.g.
+    and `matmul` methods  effectively flip the `adjoint` argument.  E.g.
 
     ```
     A = MyLinearOperator(...)
@@ -175,15 +176,24 @@ class LinearOperatorAdjoint(linear_operator.LinearOperator):
     return self.operator.assert_self_adjoint()
 
   def _shape(self):
-    return self.operator.shape
+    # Rotate last dimension
+    shape = self.operator.shape
+    return shape[:-2].concatenate([shape[-1], shape[-2]])
 
   def _shape_tensor(self):
-    return self.operator.shape_tensor()
+    # Rotate last dimension
+    shape = self.operator.shape_tensor()
+    return array_ops.concat([
+        shape[:-2], [shape[-1], shape[-2]]], axis=-1)
 
   def _matmul(self, x, adjoint=False, adjoint_arg=False):
     return self.operator.matmul(
         x, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
 
+  def _matvec(self, x, adjoint=False, adjoint_arg=False):
+    return self.operator.matvec(
+        x, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
+
   def _determinant(self):
     if self.is_self_adjoint:
       return self.operator.determinant()
@@ -201,7 +211,14 @@ class LinearOperatorAdjoint(linear_operator.LinearOperator):
     return self.operator.solve(
         rhs, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
 
+  def _solvevec(self, rhs, adjoint=False, adjoint_arg=False):
+    return self.operator.solvevec(
+        rhs, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
+
   def _to_dense(self):
     if self.is_self_adjoint:
       return self.operator.to_dense()
     return linalg.adjoint(self.operator.to_dense())
+
+  def _add_to_tensor(self, x):
+    return self.to_dense() + x
diff --git a/tensorflow/python/ops/linalg/linear_operator_algebra.py b/tensorflow/python/ops/linalg/linear_operator_algebra.py
index c1513fdb38c6005c89f6994141797f7df5c65350..0d1eab4b735d64ad988507d6c52fc52202389fd0 100644
--- a/tensorflow/python/ops/linalg/linear_operator_algebra.py
+++ b/tensorflow/python/ops/linalg/linear_operator_algebra.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.util import tf_inspect
 
 
+_ADJOINTS = {}
 _CHOLESKY_DECOMPS = {}
 _MATMUL = {}
 _INVERSES = {}
@@ -46,6 +47,11 @@ def _registered_function(type_list, registry):
   return registry.get(tuple(r[1] for r in registered_combination), None)
 
 
+def _registered_adjoint(type_a):
+  """Get the Adjoint function registered for class a."""
+  return _registered_function([type_a], _ADJOINTS)
+
+
 def _registered_cholesky(type_a):
   """Get the Cholesky function registered for class a."""
   return _registered_function([type_a], _CHOLESKY_DECOMPS)
@@ -61,6 +67,29 @@ def _registered_inverse(type_a):
   return _registered_function([type_a], _INVERSES)
 
 
+def adjoint(lin_op_a, name=None):
+  """Get the adjoint associated to lin_op_a.
+
+  Args:
+    lin_op_a: The LinearOperator to take the adjoint of.
+    name: Name to use for this operation.
+
+  Returns:
+    A LinearOperator that represents the adjoint of `lin_op_a`.
+
+  Raises:
+    NotImplementedError: If no Adjoint method is defined for the LinearOperator
+      type of `lin_op_a`.
+  """
+  adjoint_fn = _registered_adjoint(type(lin_op_a))
+  if adjoint_fn is None:
+    raise ValueError("No adjoint registered for {}".format(
+        type(lin_op_a)))
+
+  with ops.name_scope(name, "Adjoint"):
+    return adjoint_fn(lin_op_a)
+
+
 def cholesky(lin_op_a, name=None):
   """Get the Cholesky factor associated to lin_op_a.
 
@@ -132,6 +161,48 @@ def inverse(lin_op_a, name=None):
     return inverse_fn(lin_op_a)
 
 
+class RegisterAdjoint(object):
+  """Decorator to register an Adjoint implementation function.
+
+  Usage:
+
+  @linear_operator_algebra.RegisterAdjoint(lin_op.LinearOperatorIdentity)
+  def _adjoint_identity(lin_op_a):
+    # Return the identity matrix.
+  """
+
+  def __init__(self, lin_op_cls_a):
+    """Initialize the LinearOperator registrar.
+
+    Args:
+      lin_op_cls_a: the class of the LinearOperator to decompose.
+    """
+    self._key = (lin_op_cls_a,)
+
+  def __call__(self, adjoint_fn):
+    """Perform the Adjoint registration.
+
+    Args:
+      adjoint_fn: The function to use for the Adjoint.
+
+    Returns:
+      adjoint_fn
+
+    Raises:
+      TypeError: if adjoint_fn is not a callable.
+      ValueError: if a Adjoint function has already been registered for
+        the given argument classes.
+    """
+    if not callable(adjoint_fn):
+      raise TypeError(
+          "adjoint_fn must be callable, received: {}".format(adjoint_fn))
+    if self._key in _ADJOINTS:
+      raise ValueError("Adjoint({}) has already been registered to: {}".format(
+          self._key[0].__name__, _ADJOINTS[self._key]))
+    _ADJOINTS[self._key] = adjoint_fn
+    return adjoint_fn
+
+
 class RegisterCholesky(object):
   """Decorator to register a Cholesky implementation function.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index a957c84dc1ca6f26927ae3c39c6cb49caa2b19be..0383098804f49bd768b122e94f6c5ff63b33a9a3 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -278,6 +278,23 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     self._skip_if_tests_to_skip_contains("matmul_with_broadcast")
     self._test_matmul(with_batch=False)
 
+  def test_adjoint(self):
+    self._skip_if_tests_to_skip_contains("adjoint")
+    for use_placeholder in self._use_placeholder_options:
+      for build_info in self._operator_build_infos:
+        for dtype in self._dtypes_to_test:
+          with self.test_session(graph=ops.Graph()) as sess:
+            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+            operator, mat = self._operator_and_matrix(
+                build_info, dtype, use_placeholder=use_placeholder)
+            op_adjoint = operator.adjoint().to_dense()
+            op_adjoint_h = operator.H.to_dense()
+            mat_adjoint = linalg.adjoint(mat)
+            op_adjoint_v, op_adjoint_h_v, mat_adjoint_v = sess.run(
+                [op_adjoint, op_adjoint_h, mat_adjoint])
+            self.assertAC(mat_adjoint_v, op_adjoint_v)
+            self.assertAC(mat_adjoint_v, op_adjoint_h_v)
+
   def test_cholesky(self):
     self._skip_if_tests_to_skip_contains("cholesky")
     for use_placeholder in self._use_placeholder_options:
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index e8652ba44aebbfb0090310c184ee27396b949813..66960304c783965a7bb6f0cea27b07df4e1d842c 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -619,6 +619,8 @@ def norm(tensor,
         result = math_ops.sqrt(
             math_ops.reduce_sum(
                 tensor * math_ops.conj(tensor), axis, keepdims=True))
+        # TODO(rmlarsen): Replace with the following, once gradients are defined
+        # result = math_ops.reduce_euclidean_norm(tensor, axis, keepdims=True)
     else:
       result = math_ops.abs(tensor)
       if ord == 1:
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index 87409eb2ac6f962caacc566c4a6a9fd7e0e3c184..ee01ff7cf687c6ebfb2e7069534f52047fc0d9f7 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -89,28 +89,44 @@ def tensor_list_pop_back(input_handle, element_dtype, name=None):
       name=name)
 
 
-def tensor_list_gather(input_handle, indices, element_dtype, name=None):
+def tensor_list_gather(input_handle,
+                       indices,
+                       element_dtype,
+                       element_shape=None,
+                       name=None):
   return gen_list_ops.tensor_list_gather(
       input_handle=input_handle,
       indices=indices,
-      element_shape=-1,
+      element_shape=_build_element_shape(element_shape),
       element_dtype=element_dtype,
       name=name)
 
 
-def tensor_list_scatter(tensor, indices, element_shape, name=None):
-  return gen_list_ops.tensor_list_scatter_v2(
-      tensor=tensor,
-      indices=indices,
-      element_shape=_build_element_shape(element_shape),
-      num_elements=-1,
-      name=name)
+def tensor_list_scatter(tensor,
+                        indices,
+                        element_shape=None,
+                        input_handle=None,
+                        name=None):
+  if input_handle is not None:
+    return gen_list_ops.tensor_list_scatter_into_existing_list(
+        input_handle=input_handle, tensor=tensor, indices=indices, name=name)
+  else:
+    return gen_list_ops.tensor_list_scatter_v2(
+        tensor=tensor,
+        indices=indices,
+        element_shape=_build_element_shape(element_shape),
+        num_elements=-1,
+        name=name)
 
 
-def tensor_list_stack(input_handle, element_dtype, num_elements=-1, name=None):
+def tensor_list_stack(input_handle,
+                      element_dtype,
+                      num_elements=-1,
+                      element_shape=None,
+                      name=None):
   return gen_list_ops.tensor_list_stack(
       input_handle=input_handle,
-      element_shape=-1,
+      element_shape=_build_element_shape(element_shape),
       element_dtype=element_dtype,
       num_elements=num_elements,
       name=name)
@@ -269,12 +285,12 @@ def _TensorListResizeGrad(op, dlist):
 def _TensorListGatherGrad(op, dtensor):
   """Gradient function for TensorListGather."""
   input_list, indices, _ = op.inputs
-  dlist = gen_list_ops.tensor_list_scatter_v2(
-      tensor=dtensor,
-      indices=indices,
-      element_shape=gen_list_ops.tensor_list_element_shape(
-          input_list, shape_type=dtypes.int32),
-      num_elements=gen_list_ops.tensor_list_length(input_list))
+  element_shape = gen_list_ops.tensor_list_element_shape(
+      input_list, shape_type=dtypes.int32)
+  num_elements = gen_list_ops.tensor_list_length(input_list)
+  dlist = tensor_list_reserve(element_shape, num_elements, dtensor.dtype)
+  dlist = tensor_list_scatter(
+      tensor=dtensor, indices=indices, input_handle=dlist)
   return dlist, None, None
 
 
@@ -295,6 +311,20 @@ def _TensorListScatterGrad(op, dlist):
     return dtensor, None, None
 
 
+@ops.RegisterGradient("TensorListScatterIntoExistingList")
+def _TensorListScatterIntoExistingListGrad(op, dlist):
+  """Gradient function for TensorListScatterIntoExistingList."""
+  _, tensor, indices = op.inputs
+  dtensor = gen_list_ops.tensor_list_gather(
+      dlist,
+      indices,
+      element_shape=array_ops.slice(array_ops.shape(tensor), [1], [-1]),
+      element_dtype=tensor.dtype)
+  zeros = array_ops.zeros_like(tensor)
+  dlist = tensor_list_scatter(zeros, indices, indices, input_handle=dlist)
+  return dlist, dtensor, None
+
+
 def _build_element_shape(shape):
   """Converts shape to a format understood by list_ops for element_shape.
 
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 3cb16eb81e8c0796e199edb9c97acd1c269c832b..f05fbf4dd5665596b2a03d5b580b13877d14218b 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -25,6 +25,7 @@ import sys
 
 import six
 
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -40,6 +41,14 @@ from tensorflow.python.util import nest
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
+# Register printing to the cell output if we are in a Colab or Jupyter Notebook.
+try:
+  get_ipython()  # Exists in an ipython env like Jupyter or Colab
+  pywrap_tensorflow.TFE_Py_EnableInteractivePythonLogging()
+except NameError:
+  pass
+
+
 # The python wrapper for Assert is in control_flow_ops, as the Assert
 # call relies on certain conditionals for its dependencies.  Use
 # control_flow_ops.Assert.
@@ -193,9 +202,8 @@ def print_v2(*inputs, **kwargs):
     (This prints "tensors: [0 1 2 ... 7 8 9] {2: [0 2 4 ... 14 16 18]}" to
     sys.stdout)
 
-  Note: This op is only partially compatible with Jupyter notebooks and colabs.
-    Because it prints to the C++ standard out / standard error, this will go
-    in the notebook kernel's console output, not in the notebook cell output.
+  Note: In Jupyter notebooks and colabs, this operator prints to the notebook
+    cell outputs. It will not write to the notebook kernel's console logs.
 
   Args:
     *inputs: Positional arguments that are the inputs to print. Inputs in the
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index 127565d19d9bebb44c42b3b753917d5626fce660..8f200c6e00b661c4b16732841694979f77b804c1 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -38,10 +38,10 @@ from tensorflow.python.ops import string_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_lookup_ops import *
-# pylint: enable=wildcard-import
-from tensorflow.python.training.checkpointable import base as checkpointable_base
-from tensorflow.python.training.checkpointable import tracking as checkpointable
 from tensorflow.python.training.saver import BaseSaverBuilder
+# pylint: enable=wildcard-import
+from tensorflow.python.training.tracking import base as trackable_base
+from tensorflow.python.training.tracking import tracking as trackable
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
@@ -102,7 +102,7 @@ def _check_table_dtypes(table, key_dtype, value_dtype):
                     (table.value_dtype, value_dtype))
 
 
-class LookupInterface(checkpointable.TrackableResource):
+class LookupInterface(trackable.TrackableResource):
   """Represent a lookup table that persists across different steps."""
 
   def __init__(self, key_dtype, value_dtype):
@@ -165,8 +165,8 @@ class InitializableLookupTableBase(LookupInterface):
     self._default_value = ops.convert_to_tensor(
         default_value, dtype=self._value_dtype)
     self._default_value.get_shape().merge_with(tensor_shape.scalar())
-    if isinstance(initializer, checkpointable_base.Checkpointable):
-      self._initializer = self._track_checkpointable(
+    if isinstance(initializer, trackable_base.Trackable):
+      self._initializer = self._track_trackable(
           initializer, "_initializer")
     self._resource_handle = self.create_resource()
     self._init_op = self.initialize()
@@ -314,7 +314,7 @@ class HashTable(InitializableLookupTableBase):
     return exported_keys, exported_values
 
 
-class TableInitializerBase(checkpointable_base.Checkpointable):
+class TableInitializerBase(trackable_base.Trackable):
   """Base class for lookup table initializers."""
 
   def __init__(self, key_dtype, value_dtype):
@@ -543,8 +543,8 @@ class TextFileInitializer(TableInitializerBase):
     self._vocab_size = vocab_size
     self._delimiter = delimiter
     self._name = name
-    self._filename = self._track_checkpointable(
-        checkpointable.TrackableAsset(filename),
+    self._filename = self._track_trackable(
+        trackable.TrackableAsset(filename),
         "_filename")
 
     super(TextFileInitializer, self).__init__(key_dtype, value_dtype)
diff --git a/tensorflow/python/ops/losses/BUILD b/tensorflow/python/ops/losses/BUILD
index 4aea0265a72dcd2b2358f063fb0a51a5877076e7..9155d890ded7782ef7d64e631540e98e07f34a80 100644
--- a/tensorflow/python/ops/losses/BUILD
+++ b/tensorflow/python/ops/losses/BUILD
@@ -29,6 +29,7 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:util",
         "//tensorflow/python:weights_broadcast_ops",
+        "//tensorflow/python/distribute:distribute_lib",
     ],
 )
 
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 3393e75335af1caca18a79ba2302dee7a46d5662..6cd1d8e5f8baf39f9051f95ead2f47d30826c945 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -34,31 +35,6 @@ from tensorflow.python.util.deprecation import deprecated_argument_lookup
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("losses.Reduction", v1=[])
-class ReductionV2(object):
-  """Types of loss reduction.
-
-  Contains the following values:
-
-  * `NONE`: Un-reduced weighted losses with the same shape as input.
-  * `SUM`: Scalar sum of weighted losses.
-  * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
-  """
-
-  NONE = "none"
-  SUM = "sum"
-  SUM_OVER_BATCH_SIZE = "sum_over_batch_size"
-
-  @classmethod
-  def all(cls):
-    return (cls.NONE, cls.SUM, cls.SUM_OVER_BATCH_SIZE)
-
-  @classmethod
-  def validate(cls, key):
-    if key not in cls.all():
-      raise ValueError("Invalid Reduction Key %s." % key)
-
-
 @tf_export(v1=["losses.Reduction"])
 class Reduction(object):
   """Types of loss reduction.
@@ -69,8 +45,13 @@ class Reduction(object):
   * `SUM`: Scalar sum of weighted losses.
   * `MEAN`: Scalar `SUM` divided by sum of weights. DEPRECATED.
   * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
+     Note that when using `tf.distribute.Strategy`, this is the global batch
+     size across all the replicas that are contributing to a single step.
   * `SUM_OVER_NONZERO_WEIGHTS`: Scalar `SUM` divided by number of non-zero
      weights. DEPRECATED.
+     Note that when using `tf.distribute.Strategy`, this is scaled by the
+     number of replicas that are contributing to a single step to get an
+     approximation to the global batch size.
   * `SUM_BY_NONZERO_WEIGHTS`: Same as `SUM_OVER_NONZERO_WEIGHTS`.
   """
 
@@ -198,11 +179,6 @@ def compute_weighted_loss(
   """
   Reduction.validate(reduction)
   with ops.name_scope(scope, "weighted_loss", (losses, weights)):
-    # Save the `reduction` argument for loss normalization when distributing
-    # to multiple replicas.
-    # TODO(josh11b): Associate it with the returned op for more precision.
-    ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access
-
     with ops.control_dependencies((
         weights_broadcast_ops.assert_broadcastable(weights, losses),)):
       losses = ops.convert_to_tensor(losses)
@@ -214,15 +190,17 @@ def compute_weighted_loss(
         loss = weighted_losses
       else:
         loss = math_ops.reduce_sum(weighted_losses)
+        num_replicas = (  # Used to convert from local to global batch size.
+            distribution_strategy_context.get_strategy().num_replicas_in_sync)
         if reduction == Reduction.MEAN:
-          loss = _safe_mean(
-              loss,
-              math_ops.reduce_sum(array_ops.ones_like(losses) * weights))
+          denom = (num_replicas *
+                   math_ops.reduce_sum(array_ops.ones_like(losses) * weights))
+          loss = _safe_mean(loss, denom)
         elif (reduction == Reduction.SUM_BY_NONZERO_WEIGHTS or
               reduction == Reduction.SUM_OVER_NONZERO_WEIGHTS):
-          loss = _safe_mean(loss, _num_present(losses, weights))
+          loss = _safe_mean(loss, num_replicas * _num_present(losses, weights))
         elif reduction == Reduction.SUM_OVER_BATCH_SIZE:
-          loss = _safe_mean(loss, _num_elements(losses))
+          loss = _safe_mean(loss, num_replicas * _num_elements(losses))
 
       # Convert the result back to the input type.
       loss = math_ops.cast(loss, input_dtype)
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index dc2340983afa21f9236708a77f50875fafd0699b..42495b189d2c7e45ab9100e00f891974564196c2 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -47,6 +47,10 @@ def _ArgMinGrad(op, grad):
   return [None, None]
 
 
+# TODO(rmlarsen): Implement gradient.
+ops.NotDifferentiable("EuclideanNorm")
+
+
 @ops.RegisterGradient("Sum")
 def _SumGrad(op, grad):
   """Gradient for Sum."""
@@ -99,7 +103,7 @@ def _MinOrMaxGrad(op, grad):
   num_selected = array_ops.reshape(
       math_ops.reduce_sum(indicators, op.inputs[1]), output_shape_kept_dims)
 
-  return [math_ops.div(indicators, num_selected) * grad, None]
+  return [math_ops.divide(indicators, num_selected) * grad, None]
 
 
 @ops.RegisterGradient("Max")
@@ -196,7 +200,7 @@ def _SegmentMeanGrad(op, grad):
       array_ops.fill(array_ops.expand_dims(input_rank - 1, 0), 1)
   ], 0)
   ones = array_ops.fill(ones_shape, constant_op.constant(1, dtype=grad.dtype))
-  scaled_grad = math_ops.div(grad, math_ops.segment_sum(ones, op.inputs[1]))
+  scaled_grad = math_ops.divide(grad, math_ops.segment_sum(ones, op.inputs[1]))
   return array_ops.gather(scaled_grad, op.inputs[1]), None
 
 
@@ -260,7 +264,7 @@ def _SegmentMinOrMaxGrad(op, grad):
                                       op.inputs[1])
   # Compute the gradient for each segment. The gradient for the ith segment is
   # divided evenly among the selected elements in that segment.
-  weighted_grads = math_ops.div(grad, num_selected)
+  weighted_grads = math_ops.divide(grad, num_selected)
   gathered_grads = array_ops.gather(weighted_grads, op.inputs[1])
   return array_ops.where(is_selected, gathered_grads, zeros), None
 
@@ -314,7 +318,7 @@ def _UnsortedSegmentMinOrMaxGrad(op, grad):
       math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2])
   # Compute the gradient for each segment. The gradient for the ith segment is
   # divided evenly among the selected elements in that segment.
-  weighted_grads = math_ops.div(grad, num_selected)
+  weighted_grads = math_ops.divide(grad, num_selected)
   gathered_grads, _, _ = _GatherDropNegatives(weighted_grads, None,
                                               zero_clipped_indices,
                                               is_positive)
@@ -946,6 +950,26 @@ def _MulGrad(op, grad):
               math_ops.reduce_sum(gen_math_ops.mul(x, grad), ry), sy))
 
 
+@ops.RegisterGradient("MulNoNan")
+def _MulNoNanGrad(op, grad):
+  """The gradient of scalar multiplication with NaN-suppression."""
+  x = op.inputs[0]
+  y = op.inputs[1]
+  if (isinstance(grad, ops.Tensor) and
+      _ShapesFullySpecifiedAndEqual(x, y, grad)):
+    return gen_math_ops.mul_no_nan(grad, y), gen_math_ops.mul_no_nan(
+        x, grad)
+  assert x.dtype.base_dtype == y.dtype.base_dtype, (x.dtype, " vs. ", y.dtype)
+  sx = array_ops.shape(x)
+  sy = array_ops.shape(y)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
+  return (array_ops.reshape(
+      math_ops.reduce_sum(gen_math_ops.mul_no_nan(grad, y), rx), sx),
+          array_ops.reshape(
+              math_ops.reduce_sum(gen_math_ops.mul_no_nan(x, grad), ry),
+              sy))
+
+
 @ops.RegisterGradient("Div")
 def _DivGrad(op, grad):
   """The gradient for the Div operator."""
@@ -956,10 +980,11 @@ def _DivGrad(op, grad):
   rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   x = math_ops.conj(x)
   y = math_ops.conj(y)
-  return (array_ops.reshape(math_ops.reduce_sum(math_ops.div(grad, y), rx), sx),
+  return (array_ops.reshape(
+      math_ops.reduce_sum(math_ops.divide(grad, y), rx), sx),
           array_ops.reshape(
-              math_ops.reduce_sum(grad * math_ops.div(math_ops.div(-x, y), y),
-                                  ry), sy))
+              math_ops.reduce_sum(
+                  grad * math_ops.divide(math_ops.divide(-x, y), y), ry), sy))
 
 
 @ops.RegisterGradient("FloorDiv")
diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py
index 36b54b62cb44dae96e951279195282678f0dc637..96c24c3c98f2ec6f52317f0f9c46380eb7fe35c5 100644
--- a/tensorflow/python/ops/math_grad_test.py
+++ b/tensorflow/python/ops/math_grad_test.py
@@ -280,6 +280,31 @@ class DivNoNanGradientTest(test.TestCase):
       self.assertAllClose(dy.eval(), np.zeros(y.shape.as_list()))
 
 
+class MulNoNanGradientTest(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def testBasicGradient(self):
+    inputs = constant_op.constant(np.arange(-3, 3), dtype=dtypes.float32)
+    outputs = math_ops.mul_no_nan(inputs, 1 + math_ops.abs(inputs))
+    with self.cached_session():
+      error = gradient_checker.compute_gradient_error(
+          inputs,
+          inputs.get_shape().as_list(), outputs,
+          outputs.get_shape().as_list())
+      self.assertLess(error, 1e-4)
+
+  @test_util.run_deprecated_v1
+  def testGradientWithRhsIsZero(self):
+    x_vals = [0, 1.0, np.nan, np.inf, np.NINF]
+    x = constant_op.constant(x_vals, dtype=dtypes.float32)
+    y = array_ops.zeros_like(x, dtype=dtypes.float32)
+    outputs = math_ops.mul_no_nan(x, y)
+    with self.cached_session():
+      dx, dy = gradients.gradients(outputs, [x, y])
+      self.assertAllClose(dx.eval(), np.zeros(x.shape.as_list()))
+      self.assertAllClose(dy.eval(), x_vals)
+
+
 class XlogyTest(test.TestCase):
 
   def _xlogy_gradients(self, x, y):
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index c620712b1dfe80a23bc7d57acd182cc5ac945911..ddb399222157de4c251198b398d5545450193598 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -12,9 +12,59 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Basic arithmetic operators.
+"""Math Operations.
+
+Note: Functions taking `Tensor` arguments can also take anything accepted by
+`tf.convert_to_tensor`.
+
+Note: Elementwise binary operations in TensorFlow follow [numpy-style
+broadcasting](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html).
+
+TensorFlow provides a variety of math functions including:
+
+* Basic arithmetic operators and trigonometric functions.
+* Special math functions (like: `tf.math.igamma` and `tf.math.zeta`)
+* Complex number functions (like: `tf.math.imag` and `tf.math.angle`)
+* Reductions and scans (like: `tf.math.reduce_mean` and `tf.math.cumsum`)
+* Segment functions (like: `tf.math.segment_sum`)
+
+See: `tf.linalg` for matrix and tensor functions.
+
+<a id=Segmentation></a>
+
+## About Segmentation
+
+TensorFlow provides several operations that you can use to perform common
+math computations on tensor segments.
+Here a segmentation is a partitioning of a tensor along
+the first dimension, i.e. it  defines a mapping from the first dimension onto
+`segment_ids`. The `segment_ids` tensor should be the size of
+the first dimension, `d0`, with consecutive IDs in the range `0` to `k`,
+where `k<d0`.
+In particular, a segmentation of a matrix tensor is a mapping of rows to
+segments.
+
+For example:
+
+```python
+c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+tf.segment_sum(c, tf.constant([0, 0, 1]))
+#  ==>  [[0 0 0 0]
+#        [5 6 7 8]]
+```
+
+The standard `segment_*` functions assert that the segment indices are sorted.
+If you have unsorted indices use the equivalent `unsorted_segment_` function.
+Thses functions take an additional argument `num_segments` so that the output
+tensor can be efficiently allocated.
+
+``` python
+c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 6,  8, 10, 12],
+#       [-1, -2, -3, -4]]
+```
 
-See the [python/math_ops](python/math_ops) guide.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -1005,7 +1055,8 @@ def div(x, y, name=None):
   return _div_python2(x, y, name)
 
 
-@tf_export("div_no_nan")
+@tf_export("math.divide_no_nan", v1=["math.divide_no_nan", "div_no_nan"])
+@deprecation.deprecated_endpoints("div_no_nan")
 @dispatch.add_dispatch_support
 def div_no_nan(x, y, name=None):
   """Computes an unsafe divide which returns 0 if the y is zero.
@@ -1029,6 +1080,31 @@ def div_no_nan(x, y, name=None):
     return gen_math_ops.div_no_nan(x, y, name=name)
 
 
+@tf_export("math.multiply_no_nan")
+@dispatch.add_dispatch_support
+def multiply_no_nan(x, y, name=None):
+  """Computes the product of x and y and returns 0 if the y is zero, even if x is NaN or infinite.
+
+  Args:
+    x: A `Tensor`. Must be one of the following types: `float32`, `float64`.
+    y: A `Tensor` whose dtype is compatible with `x`.
+    name: A name for the operation (optional).
+
+  Returns:
+    The element-wise value of the x times y.
+  """
+
+  with ops.name_scope(name, "multiply_no_nan", [x, y]) as name:
+    x = ops.convert_to_tensor(x, name="x")
+    y = ops.convert_to_tensor(y, name="y", dtype=x.dtype.base_dtype)
+    x_dtype = x.dtype.base_dtype
+    y_dtype = y.dtype.base_dtype
+    if x_dtype != y_dtype:
+      raise TypeError(
+          "x and y must have the same dtype, got %r != %r" % (x_dtype, y_dtype))
+    return gen_math_ops.mul_no_nan(x, y, name=name)
+
+
 # TODO(aselle): This should be removed
 mod = gen_math_ops.floor_mod
 
@@ -1336,6 +1412,47 @@ def reduce_sum(input_tensor, axis=None, keepdims=False, name=None):
           name=name))
 
 
+@tf_export("math.reduce_euclidean_norm")
+def reduce_euclidean_norm(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the Euclidean norm of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  For example:
+
+  ```python
+  x = tf.constant([[1, 2, 3], [1, 1, 1]])
+  tf.reduce_euclidean_norm(x)  # sqrt(17)
+  tf.reduce_euclidean_norm(x, 0)  # [sqrt(2), sqrt(5), sqrt(10)]
+  tf.reduce_euclidean_norm(x, 1)  # [sqrt(14), sqrt(3)]
+  tf.reduce_euclidean_norm(x, 1, keepdims=True)  # [[sqrt(14)], [sqrt(3)]]
+  tf.reduce_euclidean_norm(x, [0, 1])  # sqrt(17)
+  ```
+
+  Args:
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor, of the same dtype as the input_tensor.
+  """
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops.euclidean_norm(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
+
+
 @tf_export(v1=["math.count_nonzero", "count_nonzero"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
@@ -3112,7 +3229,7 @@ def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
   r"""Computes the mean along segments of a tensor.
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   This operator is similar to the unsorted segment sum operator found
@@ -3158,7 +3275,7 @@ def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
   r"""Computes the sum along segments of a tensor divided by the sqrt(N).
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   This operator is similar to the unsorted segment sum operator found
@@ -3205,7 +3322,7 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
   r"""Computes the sum along sparse segments of a tensor.
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
@@ -3292,7 +3409,7 @@ def sparse_segment_mean(data,
   r"""Computes the mean along sparse segments of a tensor.
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
@@ -3337,7 +3454,7 @@ def sparse_segment_mean_v2(data,
   r"""Computes the mean along sparse segments of a tensor.
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index b4832e09c084e7165143f4e918b9ba76842e2311..fb7e4aacc5e72fd73691050afe05575db16e9de8 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -558,6 +558,22 @@ class DivNoNanTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(tf_result, np_result)
 
 
+class MultiplyNoNanTest(test_util.TensorFlowTestCase):
+
+  @test_util.run_deprecated_v1
+  def testBasic(self):
+    for dtype in [np.float32, np.float64]:
+      values = [0, 1, np.nan, np.inf, np.NINF]
+      x = constant_op.constant(values, dtype=dtype)
+      zeros = constant_op.constant(np.zeros((5,)), dtype=dtype)
+      ones = constant_op.constant(np.ones((5,)), dtype=dtype)
+      with self.cached_session(use_gpu=True):
+        tf_result_zeros = math_ops.multiply_no_nan(x, zeros).eval()
+        self.assertAllEqual(tf_result_zeros, zeros)
+        tf_result_ones = math_ops.multiply_no_nan(x, ones).eval()
+        self.assertAllEqual(tf_result_ones, x)
+
+
 class XlogyTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 4c79f819d4136add8738bb4166192d97037b460a..a1fdddc4e4434b9eccb8e98edea45e4a6df4b8fd 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -50,7 +50,7 @@ def _Conv2DBackpropInputGrad(op, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
-          data_format=op.get_attr("data_format")),
+          data_format=op.get_attr("data_format").decode()),
       nn_ops.conv2d(
           grad,
           op.inputs[1],
@@ -58,7 +58,7 @@ def _Conv2DBackpropInputGrad(op, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
-          data_format=op.get_attr("data_format"))
+          data_format=op.get_attr("data_format").decode())
   ]
 
 
@@ -73,7 +73,7 @@ def _Conv2DBackpropFilterGrad(op, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
-          data_format=op.get_attr("data_format")), None,
+          data_format=op.get_attr("data_format").decode()), None,
       nn_ops.conv2d(
           op.inputs[0],
           grad,
@@ -81,13 +81,65 @@ def _Conv2DBackpropFilterGrad(op, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format").decode())
+  ]
+
+
+@ops.RegisterGradient("DepthwiseConv2dNativeBackpropInput")
+def _DepthwiseConv2dNativeBackpropInputGrad(op, grad):
+  """The derivatives for deconvolution.
+
+  Args:
+    op: the Deconvolution op.
+    grad: the tensor representing the gradient w.r.t. the output
+
+  Returns:
+    the gradients w.r.t. the input and the filter
+  """
+  return [
+      None,
+      nn_ops.depthwise_conv2d_native_backprop_filter(
+          grad,
+          array_ops.shape(op.inputs[1]),
+          op.inputs[2],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=op.get_attr("data_format")),
+      nn_ops.depthwise_conv2d_native(
+          grad,
+          op.inputs[1],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=op.get_attr("data_format"))
+  ]
+
+
+@ops.RegisterGradient("DepthwiseConv2dNativeBackpropFilter")
+def _DepthwiseConv2dNativeBackpropFilterGrad(op, grad):
+  return [
+      nn_ops.depthwise_conv2d_native_backprop_input(
+          array_ops.shape(op.inputs[0]),
+          grad,
+          op.inputs[2],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=op.get_attr("data_format")), None,
+      nn_ops.depthwise_conv2d_native(
+          op.inputs[0],
+          grad,
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
           data_format=op.get_attr("data_format"))
   ]
 
 
 @ops.RegisterGradient("Conv3D")
 def _Conv3DGrad(op, grad):
-  data_format = op.get_attr("data_format")
+  data_format = op.get_attr("data_format").decode()
   return [
       nn_ops.conv3d_backprop_input_v2(
           array_ops.shape(op.inputs[0]),
@@ -110,7 +162,7 @@ def _Conv3DGrad(op, grad):
 
 @ops.RegisterGradient("Conv3DBackpropInputV2")
 def _Conv3DBackpropInputGrad(op, grad):
-  data_format = op.get_attr("data_format")
+  data_format = op.get_attr("data_format").decode()
   return [
       None,
       nn_ops.conv3d_backprop_filter_v2(
@@ -133,7 +185,7 @@ def _Conv3DBackpropInputGrad(op, grad):
 
 @ops.RegisterGradient("Conv3DBackpropFilterV2")
 def _Conv3DBackpropFilterGrad(op, grad):
-  data_format = op.get_attr("data_format")
+  data_format = op.get_attr("data_format").decode()
   return [
       nn_ops.conv3d_backprop_input_v2(
           array_ops.shape(op.inputs[0]),
@@ -161,7 +213,7 @@ def _AvgPool3DGrad(op, grad):
       ksize=op.get_attr("ksize"),
       strides=op.get_attr("strides"),
       padding=op.get_attr("padding"),
-      data_format=op.get_attr("data_format"))
+      data_format=op.get_attr("data_format").decode())
 
 
 @ops.RegisterGradient("AvgPool3DGrad")
@@ -172,7 +224,7 @@ def _AvgPool3DGradGrad(op, grad):
               op.get_attr("ksize"),
               op.get_attr("strides"),
               op.get_attr("padding"),
-              data_format=op.get_attr("data_format")))
+              data_format=op.get_attr("data_format").decode()))
 
 
 @ops.RegisterGradient("MaxPool3D")
@@ -184,7 +236,7 @@ def _MaxPool3DGrad(op, grad):
       ksize=op.get_attr("ksize"),
       strides=op.get_attr("strides"),
       padding=op.get_attr("padding"),
-      data_format=op.get_attr("data_format"))
+      data_format=op.get_attr("data_format").decode())
 
 
 @ops.RegisterGradient("MaxPool3DGrad")
@@ -200,7 +252,7 @@ def _MaxPool3DGradGrad(op, grad):
               op.get_attr("ksize"),
               op.get_attr("strides"),
               padding=op.get_attr("padding"),
-              data_format=op.get_attr("data_format")))
+              data_format=op.get_attr("data_format").decode()))
 
 
 @ops.RegisterGradient("MaxPool3DGradGrad")
@@ -216,7 +268,7 @@ def _MaxPool3DGradGradGrad(op, grad):
               op.get_attr("ksize"),
               op.get_attr("strides"),
               padding=op.get_attr("padding"),
-              data_format=op.get_attr("data_format")))
+              data_format=op.get_attr("data_format").decode()))
 
 
 @ops.RegisterGradient("Softmax")
@@ -232,8 +284,8 @@ def _SoftmaxGrad(op, grad_softmax):
 
   Args:
      op: the Softmax op.
-     grad_softmax:  the tensor representing the gradient w.r.t. the
-       softmax output.
+     grad_softmax:  the tensor representing the gradient w.r.t. the softmax
+       output.
 
   Returns:
      gradient w.r.t the input to the softmax
@@ -309,7 +361,6 @@ def _BiasAddGradGrad(op, received_grad):
     data_format = None
 
   shape = array_ops.shape(op.inputs[0])
-  rank = array_ops.rank(op.inputs[0])
   bias_shape = array_ops.shape(received_grad)
 
   if data_format == b"NCHW":
@@ -360,9 +411,9 @@ def _ReluGrad(op, grad):
 def _EluGradGrad(op, grad):
   elu_x = op.inputs[1]
   return (gen_nn_ops.elu_grad(grad, op.outputs[0]),
-          array_ops.where(elu_x < 0, grad * op.inputs[0],
-                          array_ops.zeros(
-                              shape=array_ops.shape(elu_x), dtype=elu_x.dtype)))
+          array_ops.where(
+              elu_x < 0, grad * op.inputs[0],
+              array_ops.zeros(shape=array_ops.shape(elu_x), dtype=elu_x.dtype)))
 
 
 @ops.RegisterGradient("SeluGrad")
@@ -370,11 +421,9 @@ def _SeluGradGrad(op, grad):
   x = op.inputs[1]
   scale_alpha = 1.7580993408473768599402175208123
   return (gen_nn_ops.elu_grad(grad, op.outputs[0]),
-          array_ops.where(x < 0.,
-                          gen_nn_ops.elu_grad(grad,
-                                              op.outputs[0] + scale_alpha),
-                          array_ops.zeros(
-                              shape=array_ops.shape(x), dtype=x.dtype)))
+          array_ops.where(
+              x < 0., gen_nn_ops.elu_grad(grad, op.outputs[0] + scale_alpha),
+              array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype)))
 
 
 @ops.RegisterGradient("Relu6")
@@ -485,10 +534,10 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
     softmax = nn_ops.softmax(logits)
 
     grad += ((grad_grad - array_ops.squeeze(
-        math_ops.matmul(array_ops.expand_dims(grad_grad, 1),
-                        array_ops.expand_dims(softmax, 2)),
-        axis=1)) *
-             softmax)
+        math_ops.matmul(
+            array_ops.expand_dims(grad_grad, 1),
+            array_ops.expand_dims(softmax, 2)),
+        axis=1)) * softmax)
 
   return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits))
 
@@ -787,9 +836,9 @@ def _BaseFusedBatchNormGrad(op, use_v2, *grad):
   Args:
     op: The BatchNormOp for which we need to compute gradients.
     use_v2: Boolean indicating whether to use the V2 version of the fused batch
-            norm gradient.
-    *grad: An argument list for tensors of gradients wrt the outputs
-          with grad[0] as grad_y.
+      norm gradient.
+    *grad: An argument list for tensors of gradients wrt the outputs with
+      grad[0] as grad_y.
 
   Returns:
     grad_x: gradient for x, which is scale * rsqrt(variance + epsilon) *
@@ -876,8 +925,7 @@ def _BatchNormGrad(grad_y,
     epsilon: A small float number added to the variance of x.
     data_format: The data format for input. Either b"NHWC" or b"NCHW".
     is_training: A bool value to indicate the operation is for training
-      (default)
-        or inference.
+      (default) or inference.
 
   Returns:
     A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
@@ -941,9 +989,9 @@ def _FusedBatchNormGradGrad(op, *grad):
 
   Args:
     op: The FusedBatchNormGradOp for which we need to compute gradients.
-    *grad: An argument list for tensors of gradients wrt the outputs
-          with grad[0] as grad_grad_x, grad[1] as grad_grad_scale,
-          grad[2] as grad_grad_offset.
+    *grad: An argument list for tensors of gradients wrt the outputs with
+      grad[0] as grad_grad_x, grad[1] as grad_grad_scale, grad[2] as
+      grad_grad_offset.
 
   Returns:
     A tuple (grad_grad_y, grad_x, grad_scale, None, None), where grad_grad_y
@@ -1009,29 +1057,31 @@ def _TopKGrad(op, grad, _):
   ind_shape = array_ops.shape(op.outputs[1])
 
   # int32 is not supported on GPU hence up-casting
-  ind_lastdim = array_ops.gather(math_ops.cast(
-      ind_shape, dtypes.int64), array_ops.size(ind_shape) - 1)
+  ind_lastdim = array_ops.gather(
+      math_ops.cast(ind_shape, dtypes.int64),
+      array_ops.size(ind_shape) - 1)
   # Flatten indices to 2D.
   ind_2d = array_ops.reshape(op.outputs[1], array_ops.stack([-1, ind_lastdim]))
 
-  in_lastdim = array_ops.gather(math_ops.cast(
-      in_shape, dtypes.int64), array_ops.size(in_shape) - 1)
+  in_lastdim = array_ops.gather(
+      math_ops.cast(in_shape, dtypes.int64),
+      array_ops.size(in_shape) - 1)
   outerdim = array_ops.shape(ind_2d)[0]
   # Compute linear indices (flattened to 1D).
-  ind = array_ops.reshape(ind_2d + math_ops.cast(array_ops.expand_dims(
-      math_ops.range(0, math_ops.cast(outerdim, dtypes.int64)
-                     * in_lastdim, in_lastdim), -1), dtypes.int32), [-1])
+  ind = array_ops.reshape(
+      ind_2d + math_ops.cast(
+          array_ops.expand_dims(
+              math_ops.range(0,
+                             math_ops.cast(outerdim, dtypes.int64) * in_lastdim,
+                             in_lastdim), -1), dtypes.int32), [-1])
 
   # Substitute grad to appropriate locations and fill the rest with zeros,
   # finally reshaping it to the original input shape.
   return [
       array_ops.reshape(
           array_ops.scatter_nd(
-              array_ops.expand_dims(ind, -1),
-              array_ops.reshape(grad, [-1]),
-              [math_ops.reduce_prod(in_shape)]
-          ),
-          in_shape),
+              array_ops.expand_dims(ind, -1), array_ops.reshape(grad, [-1]),
+              [math_ops.reduce_prod(in_shape)]), in_shape),
       array_ops.zeros([], dtype=dtypes.int32)
   ]
 
diff --git a/tensorflow/python/ops/nn_grad_test.py b/tensorflow/python/ops/nn_grad_test.py
index 95e05a977b856505f0b608442e85fda8468ead1f..783656a86932019e373e42b236acfacf96245faf 100644
--- a/tensorflow/python/ops/nn_grad_test.py
+++ b/tensorflow/python/ops/nn_grad_test.py
@@ -23,9 +23,11 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
 
@@ -49,5 +51,111 @@ class Relu6OpTest(test.TestCase):
       self.assertLess(error, 1e-4)
 
 
+class Conv2dOpTest(test.TestCase):
+
+  def run_test(self, x, y):
+    with self.test_session():
+      error = gradient_checker.compute_gradient_error(x,
+                                                      x.get_shape().as_list(),
+                                                      y,
+                                                      y.get_shape().as_list())
+      self.assertLess(error, 1e-3)
+
+  @test_util.run_deprecated_v1
+  def testConv2dGradWRTInput(self):
+    x = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[1, 4, 4, 3], name='input')
+    f = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[2, 2, 3, 2],
+                             name='filter')
+    y = nn_ops.conv2d(x, f, [1, 1, 1, 1], 'SAME')
+    self.run_test(x, y)
+
+  @test_util.run_deprecated_v1
+  def testConv2dGradWRTFilter(self):
+    x = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[1, 4, 4, 3],
+                             name='input')
+    f = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[2, 2, 3, 2], name='filter')
+    y = nn_ops.conv2d(x, f, [1, 1, 1, 1], 'SAME')
+    self.run_test(f, y)
+
+  @test_util.run_deprecated_v1
+  def testConv2dBackpropFilterGrad(self):
+    x = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[1, 4, 4, 3], name='input')
+    f = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[2, 2, 3, 2],
+                             name='filter')
+    strides = [1, 1, 1, 1]
+    padding = 'SAME'
+    out = nn_impl.depthwise_conv2d(x, f, strides, padding)
+
+    grad_wrt_input = gradients_impl.gradients(out, x)[0]
+    self.run_test(f, grad_wrt_input)
+
+    grad_wrt_filter = gradients_impl.gradients(out, f)[0]
+    self.run_test(x, grad_wrt_filter)
+
+
+class DepthwiseConv2dTest(test.TestCase):
+
+  def run_test(self, x, y):
+    with self.test_session():
+      error = gradient_checker.compute_gradient_error(x,
+                                                      x.get_shape().as_list(),
+                                                      y,
+                                                      y.get_shape().as_list())
+      self.assertLess(error, 1e-3)
+
+  @test_util.run_deprecated_v1
+  def testDepthwiseConv2dGradWRTInput(self):
+    x = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[1, 4, 4, 3], name='input')
+    f = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[2, 2, 3, 2],
+                             name='filter')
+    strides = [1, 1, 1, 1]
+    padding = 'SAME'
+    y = nn_impl.depthwise_conv2d(x, f, strides, padding)
+    self.run_test(x, y)
+
+  @test_util.run_deprecated_v1
+  def testDepthwiseConv2dGradWRTFilter(self):
+    x = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[1, 4, 4, 3],
+                             name='input')
+    f = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[2, 2, 3, 2], name='filter')
+    strides = [1, 1, 1, 1]
+    padding = 'SAME'
+    y = nn_impl.depthwise_conv2d(x, f, strides, padding)
+    self.run_test(f, y)
+
+  @test_util.run_deprecated_v1
+  def testDepthwiseConv2dBackpropFilterGrad(self):
+    x = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[1, 4, 4, 3], name='input')
+    f = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[2, 2, 3, 2],
+                             name='filter')
+    strides = [1, 1, 1, 1]
+    padding = 'SAME'
+    out = nn_impl.depthwise_conv2d(x, f, strides, padding)
+
+    grad_wrt_input = gradients_impl.gradients(out, x)[0]
+    self.run_test(f, grad_wrt_input)
+
+    grad_wrt_filter = gradients_impl.gradients(out, f)[0]
+    self.run_test(x, grad_wrt_filter)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index f6ea1c45437b7bf81d81aecaa2d8f1133722a0b7..c8c53d0f051d70f8ec0dcff55d3f84cb252515af 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -65,7 +65,9 @@ def _get_sequence(value, n, channel_index, name):
     return value
   elif current_n == 1:
     value = list((value[0],) * n)
-  elif current_n != n:
+  elif current_n == n:
+    value = list(value)
+  else:
     raise ValueError("{} should be of length 1, {} or {} but was {}".format(
         name, n, n + 2, current_n))
 
@@ -528,7 +530,7 @@ class _WithSpaceToBatch(object):
     if spatial_dims != orig_spatial_dims or any(x < 1 for x in spatial_dims):
       raise ValueError(
           "spatial_dims must be a montonically increasing sequence of positive "
-          "integers")  # pylint: disable=line-too-long
+          "integers")
 
     if data_format is not None and data_format.startswith("NC"):
       expected_input_rank = spatial_dims[-1]
@@ -765,7 +767,6 @@ def convolution(
     data_format=None,
     filters=None,
     dilations=None):
-  # pylint: disable=line-too-long
   """Computes sums of N-D convolutions (actually cross-correlation).
 
   This also supports either output striding via the optional `strides` parameter
@@ -851,6 +852,7 @@ def convolution(
       "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW".
       For N=3, the valid values are "NDHWC" (default) and "NCDHW".
     filters: Alias of filter.
+    dilations: Alias of dilation_rate.
 
   Returns:
     A `Tensor` with the same type as `input` of shape
@@ -881,21 +883,14 @@ def convolution(
   filter = deprecated_argument_lookup("filters", filters, "filter", filter)
   dilation_rate = deprecated_argument_lookup(
       "dilations", dilations, "dilation_rate", dilation_rate)
-  # pylint: enable=line-too-long
-  with ops.name_scope(name, "convolution", [input, filter]) as name:
-    input = ops.convert_to_tensor(input, name="input")  # pylint: disable=redefined-builtin
-    input_shape = input.get_shape()
-    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
-    filter_shape = filter.get_shape()
-    op = Convolution(
-        input_shape,
-        filter_shape,
-        padding,
-        strides=strides,
-        dilation_rate=dilation_rate,
-        name=name,
-        data_format=data_format)
-    return op(input, filter)
+  return convolution_internal(
+      input,
+      filter,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      dilations=dilation_rate,
+      name=name)
 
 
 @tf_export("nn.convolution", v1=[])
@@ -907,14 +902,15 @@ def convolution_v2(
     data_format=None,
     dilations=None,
     name=None):
-  return convolution(
+  return convolution_internal(
       input,  # pylint: disable=redefined-builtin
       filters,
-      padding=padding,
       strides=strides,
-      dilation_rate=dilations,
-      name=name,
-      data_format=data_format)
+      padding=padding,
+      data_format=data_format,
+      dilations=dilations,
+      name=name)
+
 
 convolution_v2.__doc__ = deprecation.rewrite_argument_docstring(
     deprecation.rewrite_argument_docstring(
@@ -922,6 +918,75 @@ convolution_v2.__doc__ = deprecation.rewrite_argument_docstring(
     "filter", "filters")
 
 
+def convolution_internal(
+    input,  # pylint: disable=redefined-builtin
+    filters,
+    strides=None,
+    padding="VALID",
+    data_format=None,
+    dilations=None,
+    name=None):
+  """Internal function which performs rank agnostic convolution."""
+  with ops.name_scope(name, "convolution", [input, filters]) as name:
+    if isinstance(input.shape, tensor_shape.TensorShape) and \
+        input.shape.rank is not None:
+      n = len(input.shape) - 2
+    elif not isinstance(input.shape, tensor_shape.TensorShape) and \
+        input.shape is not None:
+      n = len(input.shape) - 2
+    elif isinstance(filters.shape, tensor_shape.TensorShape) and \
+        filters.shape.rank is not None:
+      n = len(filters.shape) - 2
+    elif not isinstance(filters.shape, tensor_shape.TensorShape) and \
+        filters.shape is not None:
+      n = len(filters.shape) - 2
+    else:
+      raise ValueError("rank of input or filter must be known")
+
+    if not 1 <= n <= 3:
+      raise ValueError(
+          "Input tensor must be of rank 3, 4 or 5 but was {}.".format(n + 2))
+
+    if data_format is None:
+      channel_index = n + 1
+    else:
+      channel_index = 1 if data_format.startswith("NC") else n + 1
+
+    strides = _get_sequence(strides, n, channel_index, "strides")
+    dilations = _get_sequence(dilations, n, channel_index, "dilations")
+
+    conv_ops = {1: conv1d, 2: gen_nn_ops.conv2d, 3: gen_nn_ops.conv3d}
+
+    if all(i == 1 for i in dilations):
+      # fast path if no dilation as gradient only supported on GPU for dilations
+      op = conv_ops[n]
+      return op(
+          input,
+          filters,
+          strides,
+          padding=padding,
+          data_format=data_format,
+          dilations=dilations,
+          name=name)
+    else:
+      if channel_index == 1:
+        strides = strides[2:]
+        dilations = dilations[2:]
+      else:
+        strides = strides[1:-1]
+        dilations = dilations[1:-1]
+
+      op = Convolution(
+          tensor_shape.as_shape(input.shape),
+          tensor_shape.as_shape(filters.shape),
+          padding,
+          strides=strides,
+          dilation_rate=dilations,
+          name=name,
+          data_format=data_format)
+      return op(input, filters)
+
+
 class Convolution(object):
   """Helper class for convolution.
 
@@ -1024,7 +1089,6 @@ def pool(
     name=None,
     data_format=None,
     dilations=None):
-  # pylint: disable=line-too-long
   """Performs an N-D pooling operation.
 
   In the case that `data_format` does not start with "NC", computes for
@@ -1462,6 +1526,250 @@ def _convert_padding(padding):
   return padding, explicit_paddings
 
 
+@tf_export(v1=["nn.conv1d"])
+@deprecation.deprecated_arg_values(
+    None,
+    "`NCHW` for data_format is deprecated, use `NCW` instead",
+    warn_once=True,
+    data_format="NCHW")
+@deprecation.deprecated_arg_values(
+    None,
+    "`NHWC` for data_format is deprecated, use `NWC` instead",
+    warn_once=True,
+    data_format="NHWC")
+def conv1d(
+    value=None,
+    filters=None,
+    stride=None,
+    padding=None,
+    use_cudnn_on_gpu=None,
+    data_format=None,
+    name=None,
+    input=None,  # pylint: disable=redefined-builtin
+    dilations=None):
+  r"""Computes a 1-D convolution given 3-D input and filter tensors.
+
+  Given an input tensor of shape
+    [batch, in_width, in_channels]
+  if data_format is "NWC", or
+    [batch, in_channels, in_width]
+  if data_format is "NCW",
+  and a filter / kernel tensor of shape
+  [filter_width, in_channels, out_channels], this op reshapes
+  the arguments to pass them to conv2d to perform the equivalent
+  convolution operation.
+
+  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
+  For example, if `data_format` does not start with "NC", a tensor of shape
+    [batch, in_width, in_channels]
+  is reshaped to
+    [batch, 1, in_width, in_channels],
+  and the filter is reshaped to
+    [1, filter_width, in_channels, out_channels].
+  The result is then reshaped back to
+    [batch, out_width, out_channels]
+  \(where out_width is a function of the stride and padding as in conv2d\) and
+  returned to the caller.
+
+  Args:
+    value: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
+    filters: A 3D `Tensor`.  Must have the same type as `value`.
+    stride: An int or list of `ints` that has length `1` or `3`.  The number of
+      entries by which the filter is moved right at each step.
+    padding: 'SAME' or 'VALID'
+    use_cudnn_on_gpu: An optional `bool`.  Defaults to `True`.
+    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults to `"NWC"`,
+      the data is stored in the order of [batch, in_width, in_channels].  The
+      `"NCW"` format stores data as [batch, in_channels, in_width].
+    name: A name for the operation (optional).
+    input: Alias for value.
+    dilations: An int or list of `ints` that has length `1` or `3` which
+      defaults to 1. The dilation factor for each dimension of input. If set to
+      k > 1, there will be k-1 skipped cells between each filter element on that
+      dimension. Dilations in the batch and depth dimensions must be 1.
+
+  Returns:
+    A `Tensor`.  Has the same type as input.
+
+  Raises:
+    ValueError: if `data_format` is invalid.
+  """
+  value = deprecation.deprecated_argument_lookup("input", input, "value", value)
+  with ops.name_scope(name, "conv1d", [value, filters]) as name:
+    # Reshape the input tensor to [batch, 1, in_width, in_channels]
+    if data_format is None or data_format == "NHWC" or data_format == "NWC":
+      data_format = "NHWC"
+      spatial_start_dim = 1
+      channel_index = 2
+    elif data_format == "NCHW" or data_format == "NCW":
+      data_format = "NCHW"
+      spatial_start_dim = 2
+      channel_index = 1
+    else:
+      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
+    strides = [1] + _get_sequence(stride, 1, channel_index, "stride")
+    dilations = [1] + _get_sequence(dilations, 1, channel_index, "dilations")
+
+    value = array_ops.expand_dims(value, spatial_start_dim)
+    filters = array_ops.expand_dims(filters, 0)
+    result = gen_nn_ops.conv2d(
+        value,
+        filters,
+        strides,
+        padding,
+        use_cudnn_on_gpu=use_cudnn_on_gpu,
+        data_format=data_format,
+        dilations=dilations,
+        name=name)
+    return array_ops.squeeze(result, [spatial_start_dim])
+
+
+@tf_export("nn.conv1d", v1=[])
+def conv1d_v2(
+    input,  # pylint: disable=redefined-builtin
+    filters,
+    stride,
+    padding,
+    data_format="NWC",
+    dilations=None,
+    name=None):
+  r"""Computes a 1-D convolution given 3-D input and filter tensors.
+
+  Given an input tensor of shape
+    [batch, in_width, in_channels]
+  if data_format is "NWC", or
+    [batch, in_channels, in_width]
+  if data_format is "NCW",
+  and a filter / kernel tensor of shape
+  [filter_width, in_channels, out_channels], this op reshapes
+  the arguments to pass them to conv2d to perform the equivalent
+  convolution operation.
+
+  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
+  For example, if `data_format` does not start with "NC", a tensor of shape
+    [batch, in_width, in_channels]
+  is reshaped to
+    [batch, 1, in_width, in_channels],
+  and the filter is reshaped to
+    [1, filter_width, in_channels, out_channels].
+  The result is then reshaped back to
+    [batch, out_width, out_channels]
+  \(where out_width is a function of the stride and padding as in conv2d\) and
+  returned to the caller.
+
+  Args:
+    input: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
+    filters: A 3D `Tensor`.  Must have the same type as `input`.
+    stride: An int or list of `ints` that has length `1` or `3`.  The number of
+      entries by which the filter is moved right at each step.
+    padding: 'SAME' or 'VALID'
+    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults to `"NWC"`,
+      the data is stored in the order of [batch, in_width, in_channels].  The
+      `"NCW"` format stores data as [batch, in_channels, in_width].
+    dilations: An int or list of `ints` that has length `1` or `3` which
+      defaults to 1. The dilation factor for each dimension of input. If set to
+      k > 1, there will be k-1 skipped cells between each filter element on that
+      dimension. Dilations in the batch and depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`.  Has the same type as input.
+
+  Raises:
+    ValueError: if `data_format` is invalid.
+  """
+  return conv1d(
+      input,  # pylint: disable=redefined-builtin
+      filters,
+      stride,
+      padding,
+      use_cudnn_on_gpu=True,
+      data_format=data_format,
+      name=name,
+      dilations=dilations)
+
+
+@tf_export("nn.conv1d_transpose")
+def conv1d_transpose(
+    input,  # pylint: disable=redefined-builtin
+    filters,
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NWC",
+    dilations=None,
+    name=None):
+  """The transpose of `conv1d`.
+
+  This operation is sometimes called "deconvolution" after [Deconvolutional
+  Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf),
+  but is really the transpose (gradient) of `conv1d` rather than an actual
+  deconvolution.
+
+  Args:
+    input: A 3-D `Tensor` of type `float` and shape
+      `[batch, in_width, in_channels]` for `NWC` data format or
+      `[batch, in_channels, in_width]` for `NCW` data format.
+    filters: A 3-D `Tensor` with the same type as `value` and shape
+      `[filter_width, output_channels, in_channels]`.  `filter`'s
+      `in_channels` dimension must match that of `value`.
+    output_shape: A 1-D `Tensor`, containing three elements, representing the
+      output shape of the deconvolution op.
+    strides: An int or list of `ints` that has length `1` or `3`.  The number of
+      entries by which the filter is moved right at each step.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. `'NWC'` and `'NCW'` are supported.
+    dilations: An int or list of `ints` that has length `1` or `3` which
+      defaults to 1. The dilation factor for each dimension of input. If set to
+      k > 1, there will be k-1 skipped cells between each filter element on that
+      dimension. Dilations in the batch and depth dimensions must be 1.
+    name: Optional name for the returned tensor.
+
+  Returns:
+    A `Tensor` with the same type as `value`.
+
+  Raises:
+    ValueError: If input/output depth does not match `filter`'s shape, if
+      `output_shape` is not at 3-element vector, if `padding` is other than
+      `'VALID'` or `'SAME'`, or if `data_format` is invalid.
+  """
+  with ops.name_scope(name, "conv1d_transpose",
+                      [input, filters, output_shape]) as name:
+    # The format could be either NWC or NCW, map to NHWC or NCHW
+    if data_format is None or data_format == "NWC":
+      data_format = "NHWC"
+      spatial_start_dim = 1
+      channel_index = 2
+    elif data_format == "NCW":
+      data_format = "NCHW"
+      spatial_start_dim = 2
+      channel_index = 1
+    else:
+      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
+
+    # Reshape the input tensor to [batch, 1, in_width, in_channels]
+    strides = [1] + _get_sequence(strides, 1, channel_index, "stride")
+    dilations = [1] + _get_sequence(dilations, 1, channel_index, "dilations")
+
+    input = array_ops.expand_dims(input, spatial_start_dim)
+    filters = array_ops.expand_dims(filters, 0)
+    output_shape = list(output_shape)
+    output_shape = output_shape[: spatial_start_dim] + [1] + \
+                   output_shape[spatial_start_dim:]
+
+    result = gen_nn_ops.conv2d_backprop_input(
+        input_sizes=output_shape,
+        filter=filters,
+        out_backprop=input,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilations=dilations,
+        name=name)
+    return array_ops.squeeze(result, spatial_start_dim)
+
+
 @tf_export("nn.conv2d", v1=[])
 def conv2d_v2(input,  # pylint: disable=redefined-builtin
               filters,
@@ -1503,10 +1811,11 @@ def conv2d_v2(input,  # pylint: disable=redefined-builtin
     filters: A `Tensor`. Must have the same type as `input`.
       A 4-D tensor of shape
       `[filter_height, filter_width, in_channels, out_channels]`
-    strides: A list of `ints`.
-      1-D tensor of length 4.  The stride of the sliding window for each
-      dimension of `input`. The dimension order is determined by the value of
-      `data_format`, see below for details.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `H` and `W` dimension. By default
+      the `N` and `C` dimensions are set to 1. The dimension order is determined
+      by the value of `data_format`, see below for details.
     padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
       padding algorithm to use, or a list indicating the explicit paddings at
       the start and end of each dimension. When explicit padding is used and
@@ -1521,20 +1830,20 @@ def conv2d_v2(input,  # pylint: disable=redefined-builtin
           [batch, height, width, channels].
       Alternatively, the format could be "NCHW", the data storage order of:
           [batch, channels, height, width].
-    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
-      1-D tensor of length 4.  The dilation factor for each dimension of
-      `input`. If set to k > 1, there will be k-1 skipped cells between each
-      filter element on that dimension. The dimension order is determined by the
-      value of `data_format`, see above for details. Dilations in the batch and
-      depth dimensions must be 1.
+    dilations: An int or list of `ints` that has length `1`, `2` or `4`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 4-d tensor
+      must be 1.
     name: A name for the operation (optional).
 
   Returns:
     A `Tensor`. Has the same type as `input`.
   """
   # pylint: enable=line-too-long
-  if dilations is None:
-    dilations = [1, 1, 1, 1]
   return conv2d(input,  # pylint: disable=redefined-builtin
                 filters,
                 strides,
@@ -1588,10 +1897,11 @@ def conv2d(  # pylint: disable=redefined-builtin,dangerous-default-value
     filter: A `Tensor`. Must have the same type as `input`.
       A 4-D tensor of shape
       `[filter_height, filter_width, in_channels, out_channels]`
-    strides: A list of `ints`.
-      1-D tensor of length 4.  The stride of the sliding window for each
-      dimension of `input`. The dimension order is determined by the value of
-      `data_format`, see below for details.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `H` and `W` dimension. By default
+      the `N` and `C` dimensions are set to 1. The dimension order is determined
+      by the value of `data_format`, see below for details.
     padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
       padding algorithm to use, or a list indicating the explicit paddings at
       the start and end of each dimension. When explicit padding is used and
@@ -1607,12 +1917,14 @@ def conv2d(  # pylint: disable=redefined-builtin,dangerous-default-value
           [batch, height, width, channels].
       Alternatively, the format could be "NCHW", the data storage order of:
           [batch, channels, height, width].
-    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
-      1-D tensor of length 4.  The dilation factor for each dimension of
-      `input`. If set to k > 1, there will be k-1 skipped cells between each
-      filter element on that dimension. The dimension order is determined by the
-      value of `data_format`, see above for details. Dilations in the batch and
-      depth dimensions must be 1.
+    dilations: An int or list of `ints` that has length `1`, `2` or `4`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 4-d tensor
+      must be 1.
     name: A name for the operation (optional).
     filters: Alias for filter.
 
@@ -1622,6 +1934,12 @@ def conv2d(  # pylint: disable=redefined-builtin,dangerous-default-value
   filter = deprecation.deprecated_argument_lookup(
       "filters", filters, "filter", filter)
   padding, explicit_paddings = _convert_padding(padding)
+  if data_format is None:
+    data_format = "NHWC"
+  channel_index = 1 if data_format.startswith("NC") else 3
+
+  strides = _get_sequence(strides, 2, channel_index, "strides")
+  dilations = _get_sequence(dilations, 2, channel_index, "dilations")
   return gen_nn_ops.conv2d(input,  # pylint: disable=redefined-builtin
                            filter,
                            strides,
@@ -1633,70 +1951,6 @@ def conv2d(  # pylint: disable=redefined-builtin,dangerous-default-value
                            name=name)
 
 
-@tf_export("nn.conv2d_backprop_filter", v1=[])
-def conv2d_backprop_filter_v2(input,  # pylint: disable=redefined-builtin
-                              filter_sizes,
-                              out_backprop,
-                              strides,
-                              padding,
-                              data_format="NHWC",
-                              dilations=None,
-                              name=None):
-  r"""Computes the gradients of convolution with respect to the filter.
-
-  Args:
-    input: A `Tensor`. Must be one of the following types:
-      `half`, `bfloat16`, `float32`, `float64`.
-      4-D with shape `[batch, in_height, in_width, in_channels]`.
-    filter_sizes: A `Tensor` of type `int32`.
-      An integer vector representing the tensor shape of `filter`,
-      where `filter` is a 4-D
-      `[filter_height, filter_width, in_channels, out_channels]` tensor.
-    out_backprop: A `Tensor`. Must have the same type as `input`.
-      4-D with shape `[batch, out_height, out_width, out_channels]`.
-      Gradients w.r.t. the output of the convolution.
-    strides: A list of `ints`.
-      The stride of the sliding window for each dimension of the input
-      of the convolution. Must be in the same order as the dimension specified
-      with format.
-    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
-      padding algorithm to use, or a list indicating the explicit paddings at
-      the start and end of each dimension. When explicit padding is used and
-      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
-      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
-      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
-      [pad_top, pad_bottom], [pad_left, pad_right]]`.
-    data_format: An optional `string` from: `"NHWC", "NCHW"`.
-      Defaults to `"NHWC"`.
-      Specify the data format of the input and output data. With the
-      default format "NHWC", the data is stored in the order of:
-          [batch, in_height, in_width, in_channels].
-      Alternatively, the format could be "NCHW", the data storage order of:
-          [batch, in_channels, in_height, in_width].
-    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
-      1-D tensor of length 4.  The dilation factor for each dimension of
-      `input`. If set to k > 1, there will be k-1 skipped cells between each
-      filter element on that dimension. The dimension order is determined by
-      the value of `data_format`, see above for details. Dilations in the batch
-      and depth dimensions must be 1.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor`. Has the same type as `input`.
-  """
-  if dilations is None:
-    dilations = [1, 1, 1, 1]
-  return conv2d_backprop_filter(input,  # pylint: disable=redefined-builtin
-                                filter_sizes,
-                                out_backprop,
-                                strides,
-                                padding,
-                                use_cudnn_on_gpu=True,
-                                data_format=data_format,
-                                dilations=dilations,
-                                name=name)
-
-
 @tf_export(v1=["nn.conv2d_backprop_filter"])
 def conv2d_backprop_filter(  # pylint: disable=redefined-builtin,dangerous-default-value
     input,
@@ -1757,70 +2011,6 @@ def conv2d_backprop_filter(  # pylint: disable=redefined-builtin,dangerous-defau
       explicit_paddings, data_format, dilations, name)
 
 
-@tf_export("nn.conv2d_backprop_input", v1=[])
-def conv2d_backprop_input_v2(input_sizes,
-                             filters,
-                             out_backprop,
-                             strides,
-                             padding,
-                             data_format="NHWC",
-                             dilations=None,
-                             name=None):
-  r"""Computes the gradients of convolution with respect to the input.
-
-  Args:
-    input_sizes: A `Tensor` of type `int32`.
-      An integer vector representing the shape of `input`,
-      where `input` is a 4-D `[batch, height, width, channels]` tensor.
-    filters: A `Tensor`. Must be one of the following types:
-      `half`, `bfloat16`, `float32`, `float64`.
-      4-D with shape
-      `[filter_height, filter_width, in_channels, out_channels]`.
-    out_backprop: A `Tensor`. Must have the same type as `filters`.
-      4-D with shape `[batch, out_height, out_width, out_channels]`.
-      Gradients w.r.t. the output of the convolution.
-    strides: A list of `ints`.
-      The stride of the sliding window for each dimension of the input
-      of the convolution. Must be in the same order as the dimension specified
-      with format.
-    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
-      padding algorithm to use, or a list indicating the explicit paddings at
-      the start and end of each dimension. When explicit padding is used and
-      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
-      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
-      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
-      [pad_top, pad_bottom], [pad_left, pad_right]]`.
-    data_format: An optional `string` from: `"NHWC", "NCHW"`.
-      Defaults to `"NHWC"`.
-      Specify the data format of the input and output data. With the
-      default format "NHWC", the data is stored in the order of:
-          [batch, in_height, in_width, in_channels].
-      Alternatively, the format could be "NCHW", the data storage order of:
-          [batch, in_channels, in_height, in_width].
-    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
-      1-D tensor of length 4.  The dilation factor for each dimension of
-      `input`. If set to k > 1, there will be k-1 skipped cells between each
-      filter element on that dimension. The dimension order is determined by
-      the value of `data_format`, see above for details. Dilations in the batch
-      and depth dimensions must be 1.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor`. Has the same type as `filters`.
-  """
-  if dilations is None:
-    dilations = [1, 1, 1, 1]
-  return conv2d_backprop_input(input_sizes,
-                               filters,
-                               out_backprop,
-                               strides,
-                               padding,
-                               use_cudnn_on_gpu=True,
-                               data_format=data_format,
-                               dilations=dilations,
-                               name=name)
-
-
 @tf_export(v1=["nn.conv2d_backprop_input"])
 def conv2d_backprop_input(  # pylint: disable=redefined-builtin,dangerous-default-value
     input_sizes,
@@ -1895,12 +2085,13 @@ def conv2d_transpose(
     data_format="NHWC",
     name=None,
     input=None,  # pylint: disable=redefined-builtin
-    filters=None):
+    filters=None,
+    dilations=None):
   """The transpose of `conv2d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
-  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
-  actually the transpose (gradient) of `conv2d` rather than an actual
+  Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf),
+  but is really the transpose (gradient) of `conv2d` rather than an actual
   deconvolution.
 
   Args:
@@ -1912,14 +2103,25 @@ def conv2d_transpose(
       `in_channels` dimension must match that of `value`.
     output_shape: A 1-D `Tensor` representing the output shape of the
       deconvolution op.
-    strides: A list of ints. The stride of the sliding window for each
-      dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `H` and `W` dimension. By default
+      the `N` and `C` dimensions are set to 0. The dimension order is determined
+      by the value of `data_format`, see below for details.
     padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
       See the "returns" section of `tf.nn.convolution` for details.
     data_format: A string. 'NHWC' and 'NCHW' are supported.
     name: Optional name for the returned tensor.
     input: Alias for value.
     filters: Alias for filter.
+    dilations: An int or list of `ints` that has length `1`, `2` or `4`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 4-d tensor
+      must be 1.
 
   Returns:
     A `Tensor` with the same type as `value`.
@@ -1932,68 +2134,86 @@ def conv2d_transpose(
   filter = deprecated_argument_lookup("filters", filters, "filter", filter)
   with ops.name_scope(name, "conv2d_transpose",
                       [value, filter, output_shape]) as name:
-    if data_format not in ("NCHW", "NHWC"):
-      raise ValueError("data_format has to be either NCHW or NHWC.")
-    value = ops.convert_to_tensor(value, name="value")
-    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
-    axis = 3 if data_format == "NHWC" else 1
-    if not value.get_shape().dims[axis].is_compatible_with(
-        filter.get_shape()[3]):
-      raise ValueError("input channels does not match filter's input channels, "
-                       "{} != {}".format(value.get_shape()[axis],
-                                         filter.get_shape()[3]))
-
-    output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
-    if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)):
-      raise ValueError("output_shape must have shape (4,), got {}".format(
-          output_shape_.get_shape()))
-
-    if isinstance(output_shape, (list, np.ndarray)):
-      # output_shape's shape should be == [4] if reached this point.
-      if not filter.get_shape().dims[2].is_compatible_with(
-          output_shape[axis]):
-        raise ValueError(
-            "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[axis],
-                              filter.get_shape()[2]))
-
-    if padding != "VALID" and padding != "SAME":
-      raise ValueError("padding must be either VALID or SAME:"
-                       " {}".format(padding))
-
-    return gen_nn_ops.conv2d_backprop_input(
-        input_sizes=output_shape_,
-        filter=filter,
-        out_backprop=value,
-        strides=strides,
+    return conv2d_transpose_v2(
+        value,
+        filter,
+        output_shape,
+        strides,
         padding=padding,
         data_format=data_format,
+        dilations=dilations,
         name=name)
 
 
-# pylint: disable=redefined-builtin
 @tf_export("nn.conv2d_transpose", v1=[])
 def conv2d_transpose_v2(
-    input,
+    input,  # pylint: disable=redefined-builtin
     filters,  # pylint: disable=redefined-builtin
     output_shape,
     strides,
     padding="SAME",
     data_format="NHWC",
+    dilations=None,
     name=None):
-  return conv2d_transpose(
-      input,
-      filters,
-      output_shape,
-      strides,
-      padding=padding,
-      data_format=data_format,
-      name=name)
-# pylint: enable=redefined-builtin
-conv2d_transpose_v2.__doc__ = deprecation.rewrite_argument_docstring(
-    deprecation.rewrite_argument_docstring(
-        conv2d_transpose.__doc__, "filter", "filters"),
-    "value", "input")
+  """The transpose of `conv2d`.
+
+  This operation is sometimes called "deconvolution" after [Deconvolutional
+  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
+  actually the transpose (gradient) of `conv2d` rather than an actual
+  deconvolution.
+
+  Args:
+    input: A 4-D `Tensor` of type `float` and shape `[batch, height, width,
+      in_channels]` for `NHWC` data format or `[batch, in_channels, height,
+      width]` for `NCHW` data format.
+    filters: A 4-D `Tensor` with the same type as `value` and shape `[height,
+      width, output_channels, in_channels]`.  `filter`'s `in_channels` dimension
+      must match that of `value`.
+    output_shape: A 1-D `Tensor` representing the output shape of the
+      deconvolution op.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `H` and `W` dimension. By default
+      the `N` and `C` dimensions are set to 0. The dimension order is determined
+      by the value of `data_format`, see below for details.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NHWC' and 'NCHW' are supported.
+    dilations: An int or list of `ints` that has length `1`, `2` or `4`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 4-d tensor
+      must be 1.
+    name: Optional name for the returned tensor.
+
+  Returns:
+    A `Tensor` with the same type as `value`.
+
+  Raises:
+    ValueError: If input/output depth does not match `filter`'s shape, or if
+      padding is other than `'VALID'` or `'SAME'`.
+  """
+  with ops.name_scope(name, "conv2d_transpose",
+                      [input, filter, output_shape]) as name:
+    if data_format is None:
+      data_format = "NHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    strides = _get_sequence(strides, 2, channel_index, "strides")
+    dilations = _get_sequence(dilations, 2, channel_index, "dilations")
+
+    return gen_nn_ops.conv2d_backprop_input(
+        input_sizes=output_shape,
+        filter=filters,
+        out_backprop=input,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilations=dilations,
+        name=name)
 
 
 @tf_export("nn.atrous_conv2d_transpose")
@@ -2006,9 +2226,9 @@ def atrous_conv2d_transpose(value,
   """The transpose of `atrous_conv2d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
-  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
-  actually the transpose (gradient) of `atrous_conv2d` rather than an actual
-  deconvolution.
+  Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf),
+  but is really the transpose (gradient) of `atrous_conv2d` rather than an
+  actual deconvolution.
 
   Args:
     value: A 4-D `Tensor` of type `float`. It needs to be in the default `NHWC`
@@ -2154,7 +2374,7 @@ def conv3d_v2(input,  # pylint: disable=redefined-builtin,missing-docstring
               name=None):
   if dilations is None:
     dilations = [1, 1, 1, 1, 1]
-  return gen_nn_ops.conv3d(input,  # pylint: disable=redefined-builtin
+  return gen_nn_ops.conv3d(input,
                            filters,
                            strides,
                            padding,
@@ -2193,12 +2413,13 @@ def conv3d_transpose(
     data_format="NDHWC",
     name=None,
     input=None,  # pylint: disable=redefined-builtin
-    filters=None):
+    filters=None,
+    dilations=None):
   """The transpose of `conv3d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
-  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
-  actually the transpose (gradient) of `conv3d` rather than an actual
+  Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf),
+  but is really the transpose (gradient) of `conv3d` rather than an actual
   deconvolution.
 
   Args:
@@ -2218,6 +2439,14 @@ def conv3d_transpose(
     name: Optional name for the returned tensor.
     input: Alias of value.
     filters: Alias of filter.
+    dilations: An int or list of `ints` that has length `1`, `3` or `5`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `D`, `H` and `W` dimension.
+      By default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 5-d tensor
+      must be 1.
 
   Returns:
     A `Tensor` with the same type as `value`.
@@ -2228,68 +2457,164 @@ def conv3d_transpose(
   """
   filter = deprecated_argument_lookup("filters", filters, "filter", filter)
   value = deprecated_argument_lookup("input", input, "value", value)
-  with ops.name_scope(name, "conv3d_transpose",
-                      [value, filter, output_shape]) as name:
-    value = ops.convert_to_tensor(value, name="value")
-    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
-    axis = 1 if data_format == "NCDHW" else 4
-    if not value.get_shape().dims[axis].is_compatible_with(
-        filter.get_shape()[4]):
-      raise ValueError("input channels does not match filter's input channels, "
-                       "{} != {}".format(value.get_shape()[axis],
-                                         filter.get_shape()[4]))
+  return conv3d_transpose_v2(
+      value,
+      filter,
+      output_shape,
+      strides,
+      padding=padding,
+      data_format=data_format,
+      dilations=dilations,
+      name=name)
 
-    output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
-    if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(5)):
-      raise ValueError("output_shape must have shape (5,), got {}".format(
-          output_shape_.get_shape()))
 
-    if isinstance(output_shape, (list, np.ndarray)):
-      # output_shape's shape should be == [5] if reached this point.
-      if not filter.get_shape().dims[3].is_compatible_with(
-          output_shape[axis]):
-        raise ValueError(
-            "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[axis],
-                              filter.get_shape()[3]))
+@tf_export("nn.conv3d_transpose", v1=[])
+def conv3d_transpose_v2(input,  # pylint: disable=redefined-builtin
+                        filters,
+                        output_shape,
+                        strides,
+                        padding="SAME",
+                        data_format="NDHWC",
+                        dilations=None,
+                        name=None):
+  """The transpose of `conv3d`.
 
-    if padding != "VALID" and padding != "SAME":
-      raise ValueError("padding must be either VALID or SAME:"
-                       " {}".format(padding))
+  This operation is sometimes called "deconvolution" after [Deconvolutional
+  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
+  actually the transpose (gradient) of `conv2d` rather than an actual
+  deconvolution.
+
+  Args:
+    input: A 5-D `Tensor` of type `float` and shape `[batch, height, width,
+      in_channels]` for `NHWC` data format or `[batch, in_channels, height,
+      width]` for `NCHW` data format.
+    filters: A 5-D `Tensor` with the same type as `value` and shape `[height,
+      width, output_channels, in_channels]`.  `filter`'s `in_channels` dimension
+      must match that of `value`.
+    output_shape: A 1-D `Tensor` representing the output shape of the
+      deconvolution op.
+    strides: An int or list of `ints` that has length `1`, `3` or `5`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `D`, `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 0. The dimension order is
+      determined by the value of `data_format`, see below for details.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NDHWC' and 'NCDHW' are supported.
+    dilations: An int or list of `ints` that has length `1`, `3` or `5`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `D`, `H` and `W` dimension.
+      By default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 5-d tensor
+      must be 1.
+    name: Optional name for the returned tensor.
+
+  Returns:
+    A `Tensor` with the same type as `value`.
+  """
+  with ops.name_scope(name, "conv3d_transpose",
+                      [input, filter, output_shape]) as name:
+    if data_format is None:
+      data_format = "NDHWC"
+    channel_index = 1 if data_format.startswith("NC") else 4
+
+    strides = _get_sequence(strides, 3, channel_index, "strides")
+    dilations = _get_sequence(dilations, 3, channel_index, "dilations")
 
     return gen_nn_ops.conv3d_backprop_input_v2(
-        input_sizes=output_shape_,
-        filter=filter,
-        out_backprop=value,
+        input_sizes=output_shape,
+        filter=filters,
+        out_backprop=input,
         strides=strides,
         padding=padding,
         data_format=data_format,
+        dilations=dilations,
         name=name)
 
 
-# pylint: disable=redefined-builtin
-@tf_export("nn.conv3d_transpose", v1=[])
-def conv3d_transpose_v2(
-    input,
-    filters,
-    output_shape,
-    strides,
-    padding="SAME",
-    data_format="NDHWC",
-    name=None):
-  return conv3d_transpose(
-      input,
-      filters,
-      output_shape,
-      strides,
-      padding=padding,
-      data_format=data_format,
-      name=name)
-# pylint: enable=redefined-builtin
-conv3d_transpose_v2.__doc__ = deprecation.rewrite_argument_docstring(
-    deprecation.rewrite_argument_docstring(
-        conv3d_transpose.__doc__, "filter", "filters"),
-    "value", "input")
+CONV_TRANSPOSE_OPS = (
+    conv1d_transpose,
+    conv2d_transpose_v2,
+    conv3d_transpose_v2,
+)
+
+
+@tf_export("nn.conv_transpose")
+def conv_transpose(input,  # pylint: disable=redefined-builtin
+                   filters,
+                   output_shape,
+                   strides,
+                   padding="SAME",
+                   data_format=None,
+                   dilations=None,
+                   name=None):
+  """The transpose of `convolution`.
+
+  This operation is sometimes called "deconvolution" after [Deconvolutional
+  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
+  actually the transpose (gradient) of `convolution` rather than an actual
+  deconvolution.
+
+  Args:
+    input: An N+2 dimensional `Tensor` of shape
+      `[batch_size] + input_spatial_shape + [in_channels]` if data_format does
+      not start with "NC" (default), or
+      `[batch_size, in_channels] + input_spatial_shape` if data_format starts
+      with "NC". It must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+    filters: An N+2 dimensional `Tensor` with the same type as `input` and
+      shape `spatial_filter_shape + [in_channels, out_channels]`.
+    output_shape: A 1-D `Tensor` representing the output shape of the
+      deconvolution op.
+    strides: An int or list of `ints` that has length `1`, `N` or `N+2`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the spatial dimensions. By default
+      the `N` and `C` dimensions are set to 0. The dimension order is determined
+      by the value of `data_format`, see below for details.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string or None.  Specifies whether the channel dimension of
+      the `input` and output is the last dimension (default, or if `data_format`
+      does not start with "NC"), or the second dimension (if `data_format`
+      starts with "NC").  For N=1, the valid values are "NWC" (default) and
+      "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW".
+      For N=3, the valid values are "NDHWC" (default) and "NCDHW".
+    dilations: An int or list of `ints` that has length `1`, `N` or `N+2`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the spatial dimensions. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details.
+    name: A name for the operation (optional). If not specified "conv_transpose"
+      is used.
+
+  Returns:
+    A `Tensor` with the same type as `value`.
+  """
+  with ops.name_scope(name, "conv_transpose",
+                      [input, filter, output_shape]) as name:
+    if output_shape is not None:
+      n = len(output_shape) - 2
+    else:
+      raise ValueError("output_shape cannot be None")
+
+    if not 1 <= n <= 3:
+      raise ValueError(
+          "output_shape must be of length 3, 4 or 5 but was {}.".format(n + 2))
+
+    op = CONV_TRANSPOSE_OPS[n-1]
+    return op(
+        input,
+        filters,
+        output_shape,
+        strides,
+        padding=padding,
+        data_format=data_format,
+        dilations=dilations,
+        name=name)
 
 
 @tf_export("nn.bias_add")
@@ -2307,13 +2632,21 @@ def bias_add(value, bias, data_format=None, name=None):
     bias: A 1-D `Tensor` with size matching the last dimension of `value`.
       Must be the same type as `value` unless `value` is a quantized type,
       in which case a different quantized type may be used.
-    data_format: A string. 'NHWC' and 'NCHW' are supported.
+    data_format: A string. 'N...C' and 'NC...' are supported.
     name: A name for the operation (optional).
 
   Returns:
     A `Tensor` with the same type as `value`.
   """
   with ops.name_scope(name, "BiasAdd", [value, bias]) as name:
+    if data_format is not None:
+      if data_format.startswith("NC"):
+        data_format = "NCHW"
+      elif data_format.startswith("N") and data_format.endswith("C"):
+        data_format = "NHWC"
+      else:
+        raise ValueError("data_format must be of the form `N...C` or `NC...`")
+
     if not context.executing_eagerly():
       value = ops.convert_to_tensor(value, name="input")
       bias = ops.convert_to_tensor(bias, dtype=value.dtype, name="bias")
@@ -2491,7 +2824,7 @@ def _softmax(logits, compute_op, dim=-1, name=None):
 
   # We need its original shape for shape inference.
   shape = logits.get_shape()
-  is_last_dim = (dim is -1) or (dim == shape.ndims - 1)
+  is_last_dim = (dim == -1) or (dim == shape.ndims - 1)
 
   if is_last_dim:
     return compute_op(logits, name=name)
@@ -2499,7 +2832,7 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   dim_val = dim
   if isinstance(dim, ops.Tensor):
     dim_val = tensor_util.constant_value(dim)
-  if dim_val is not None and (dim_val < -shape.ndims or dim_val >= shape.ndims):
+  if dim_val is not None and not -shape.ndims <= dim_val < shape.ndims:
     raise errors_impl.InvalidArgumentError(
         None, None,
         "Dimension (%d) must be in the range [%d, %d) where %d is the number of"
@@ -3028,8 +3361,71 @@ def sparse_softmax_cross_entropy_with_logits(
         return cost
 
 
-@tf_export("nn.avg_pool")
-def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
+@tf_export("nn.avg_pool", v1=["nn.avg_pool_v2"])
+def avg_pool_v2(input, ksize, strides, padding, data_format=None, name=None):  # pylint: disable=redefined-builtin
+  """Performs the avg pooling on the input.
+
+  Each entry in `output` is the mean of the corresponding size `ksize`
+  window in `value`.
+
+  Args:
+    input:  Tensor of rank N+2, of shape `[batch_size] + input_spatial_shape +
+      [num_channels]` if `data_format` does not start with "NC" (default), or
+      `[batch_size, num_channels] + input_spatial_shape` if data_format starts
+      with "NC". Pooling happens over the spatial dimensions only.
+    ksize: An int or list of `ints` that has length `1`, `N` or `N+2`. The size
+      of the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `N` or `N+2`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. Specifies the channel dimension. For N=1 it can be
+      either "NWC" (default) or "NCW", for N=2 it can be either "NHWC" (default)
+      or "NCHW" and for N=3 either "NDHWC" (default) or "NCDHW".
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` of format specified by `data_format`.
+    The average pooled output tensor.
+  """
+  if input.shape is not None:
+    n = len(input.shape) - 2
+  elif data_format is not None:
+    n = len(data_format) - 2
+  else:
+    raise ValueError(
+        "The input must have a rank or a data format must be given.")
+  if not 1 <= n <= 3:
+    raise ValueError(
+        "Input tensor must be of rank 3, 4 or 5 but was {}.".format(n + 2))
+
+  if data_format is None:
+    channel_index = n + 1
+  else:
+    channel_index = 1 if data_format.startswith("NC") else n + 1
+
+  ksize = _get_sequence(ksize, n, channel_index, "ksize")
+  strides = _get_sequence(strides, n, channel_index, "strides")
+
+  avg_pooling_ops = {
+      1: avg_pool1d,
+      2: gen_nn_ops.avg_pool,
+      3: gen_nn_ops.avg_pool3d
+  }
+
+  op = avg_pooling_ops[n]
+  return op(
+      input,
+      ksize=ksize,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      name=name)
+
+
+@tf_export(v1=["nn.avg_pool", "nn.avg_pool2d"])
+def avg_pool(value, ksize, strides, padding, data_format="NHWC",
+             name=None, input=None):  # pylint: disable=redefined-builtin
   """Performs the average pooling on the input.
 
   Each entry in `output` is the mean of the corresponding size `ksize`
@@ -3038,20 +3434,30 @@ def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
   Args:
     value: A 4-D `Tensor` of shape `[batch, height, width, channels]` and type
       `float32`, `float64`, `qint8`, `quint8`, or `qint32`.
-    ksize: A list or tuple of 4 ints. The size of the window for each dimension
-      of the input tensor.
-    strides: A list or tuple of 4 ints. The stride of the sliding window for
-      each dimension of the input tensor.
+    ksize: An int or list of `ints` that has length `1`, `2` or `4`. The size of
+      the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`. The
+      stride of the sliding window for each dimension of the input tensor.
     padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
       See the "returns" section of `tf.nn.convolution` for details.
     data_format: A string. 'NHWC' and 'NCHW' are supported.
     name: Optional name for the operation.
+    input: Alias for value.
 
   Returns:
     A `Tensor` with the same type as `value`.  The average pooled output tensor.
   """
   with ops.name_scope(name, "AvgPool", [value]) as name:
-    value = ops.convert_to_tensor(value, name="input")
+    value = deprecation.deprecated_argument_lookup(
+        "input", input, "value", value)
+
+    if data_format is None:
+      data_format = "NHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    ksize = _get_sequence(ksize, 2, channel_index, "ksize")
+    strides = _get_sequence(strides, 2, channel_index, "strides")
+
     return gen_nn_ops.avg_pool(
         value,
         ksize=ksize,
@@ -3061,8 +3467,198 @@ def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
         name=name)
 
 
-@tf_export("nn.max_pool")
-def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
+@tf_export("nn.avg_pool2d", v1=[])
+def avg_pool2d(input, ksize, strides, padding, data_format="NHWC", name=None):  # pylint: disable=redefined-builtin
+  """Performs the average pooling on the input.
+
+  Each entry in `output` is the mean of the corresponding size `ksize`
+  window in `value`.
+
+  Args:
+    input: A 4-D `Tensor` of shape `[batch, height, width, channels]` and type
+      `float32`, `float64`, `qint8`, `quint8`, or `qint32`.
+    ksize: An int or list of `ints` that has length `1`, `2` or `4`. The size of
+      the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NHWC' and 'NCHW' are supported.
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` with the same type as `value`.  The average pooled output tensor.
+  """
+  with ops.name_scope(name, "AvgPool2D", [input]) as name:
+    if data_format is None:
+      data_format = "NHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    ksize = _get_sequence(ksize, 2, channel_index, "ksize")
+    strides = _get_sequence(strides, 2, channel_index, "strides")
+
+    return gen_nn_ops.avg_pool(
+        input,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+
+
+@tf_export("nn.avg_pool1d")
+def avg_pool1d(input, ksize, strides, padding, data_format="NWC", name=None):  # pylint: disable=redefined-builtin
+  """Performs the average pooling on the input.
+
+  Each entry in `output` is the mean of the corresponding size `ksize`
+  window in `value`.
+
+  Note internally this op reshapes and uses the underlying 2d operation.
+
+  Args:
+    input: A 3-D `Tensor` of the format specified by `data_format`.
+    ksize: An int or list of `ints` that has length `1` or `3`. The size of the
+      window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1` or `3`. The stride of
+      the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: An optional string from: "NWC", "NCW". Defaults to "NWC".
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of format specified by `data_format`.
+    The max pooled output tensor.
+  """
+  with ops.name_scope(name, "AvgPool1D", [input]) as name:
+    if data_format is None:
+      data_format = "NWC"
+    channel_index = 1 if data_format.startswith("NC") else 2
+    ksize = [1] + _get_sequence(ksize, 1, channel_index, "ksize")
+    strides = [1] + _get_sequence(strides, 1, channel_index, "strides")
+
+    data_format = "NHWC" if data_format == "NWC" else "NCHW"
+    expanding_dim = 1 if data_format == "NWC" else 2
+
+    input = array_ops.expand_dims_v2(input, expanding_dim)
+    result = gen_nn_ops.avg_pool(
+        input,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+    return array_ops.squeeze(result, expanding_dim)
+
+
+@tf_export("nn.avg_pool3d")
+def avg_pool3d(input, ksize, strides, padding, data_format="NDHWC", name=None):  # pylint: disable=redefined-builtin
+  """Performs the average pooling on the input.
+
+  Each entry in `output` is the mean of the corresponding size `ksize`
+  window in `value`.
+
+  Args:
+    input: A 5-D `Tensor` of shape `[batch, height, width, channels]` and type
+      `float32`, `float64`, `qint8`, `quint8`, or `qint32`.
+    ksize: An int or list of `ints` that has length `1`, `3` or `5`. The size of
+      the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `3` or `5`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NDHWC' and 'NCDHW' are supported.
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` with the same type as `value`.  The average pooled output tensor.
+  """
+  with ops.name_scope(name, "AvgPool3D", [input]) as name:
+    if data_format is None:
+      data_format = "NDHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    ksize = _get_sequence(ksize, 3, channel_index, "ksize")
+    strides = _get_sequence(strides, 3, channel_index, "strides")
+
+    return gen_nn_ops.avg_pool3d(
+        input,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+
+
+# pylint: disable=redefined-builtin
+@tf_export("nn.max_pool", v1=["nn.max_pool_v2"])
+def max_pool_v2(input, ksize, strides, padding, data_format=None, name=None):
+  """Performs the max pooling on the input.
+
+  Args:
+    input:  Tensor of rank N+2, of shape `[batch_size] + input_spatial_shape +
+      [num_channels]` if `data_format` does not start with "NC" (default), or
+      `[batch_size, num_channels] + input_spatial_shape` if data_format starts
+      with "NC". Pooling happens over the spatial dimensions only.
+    ksize: An int or list of `ints` that has length `1`, `N` or `N+2`. The size
+      of the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `N` or `N+2`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. Specifies the channel dimension. For N=1 it can be
+      either "NWC" (default) or "NCW", for N=2 it can be either "NHWC" (default)
+      or "NCHW" and for N=3 either "NDHWC" (default) or "NCDHW".
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` of format specified by `data_format`.
+    The max pooled output tensor.
+  """
+  if input.shape is not None:
+    n = len(input.shape) - 2
+  elif data_format is not None:
+    n = len(data_format) - 2
+  else:
+    raise ValueError(
+        "The input must have a rank or a data format must be given.")
+  if not 1 <= n <= 3:
+    raise ValueError(
+        "Input tensor must be of rank 3, 4 or 5 but was {}.".format(n + 2))
+
+  if data_format is None:
+    channel_index = n + 1
+  else:
+    channel_index = 1 if data_format.startswith("NC") else n + 1
+
+  ksize = _get_sequence(ksize, n, channel_index, "ksize")
+  strides = _get_sequence(strides, n, channel_index, "strides")
+
+  max_pooling_ops = {
+      1: max_pool1d,
+      2: gen_nn_ops.max_pool,
+      3: gen_nn_ops.max_pool3d
+  }
+
+  op = max_pooling_ops[n]
+  return op(
+      input,
+      ksize=ksize,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      name=name)
+# pylint: enable=redefined-builtin
+
+
+@tf_export(v1=["nn.max_pool"])
+def max_pool(value,
+             ksize,
+             strides,
+             padding,
+             data_format="NHWC",
+             name=None,
+             input=None):  # pylint: disable=redefined-builtin
   """Performs the max pooling on the input.
 
   Args:
@@ -3075,17 +3671,18 @@ def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
       See the "returns" section of `tf.nn.convolution` for details.
     data_format: A string. 'NHWC', 'NCHW' and 'NCHW_VECT_C' are supported.
     name: Optional name for the operation.
+    input: Alias for value.
 
   Returns:
     A `Tensor` of format specified by `data_format`.
     The max pooled output tensor.
   """
+  value = deprecation.deprecated_argument_lookup("input", input, "value", value)
   with ops.name_scope(name, "MaxPool", [value]) as name:
-    value = ops.convert_to_tensor(value, name="input")
     if data_format is None:
       data_format = "NHWC"
-
     channel_index = 1 if data_format.startswith("NC") else 3
+
     ksize = _get_sequence(ksize, 2, channel_index, "ksize")
     strides = _get_sequence(strides, 2, channel_index, "strides")
 
@@ -3139,8 +3736,44 @@ def max_pool1d(input, ksize, strides, padding, data_format="NWC", name=None):
         data_format=data_format,
         name=name)
     return array_ops.squeeze(result, expanding_dim)
+# pylint: enable=redefined-builtin
+
+
+# pylint: disable=redefined-builtin
+@tf_export("nn.max_pool2d")
+def max_pool2d(input, ksize, strides, padding, data_format="NHWC", name=None):
+  """Performs the max pooling on the input.
+
+  Args:
+    input: A 4-D `Tensor` of the format specified by `data_format`.
+    ksize: An int or list of `ints` that has length `1`, `2` or `4`. The size of
+      the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NHWC', 'NCHW' and 'NCHW_VECT_C' are supported.
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` of format specified by `data_format`.
+    The max pooled output tensor.
+  """
+  with ops.name_scope(name, "MaxPool2d", [input]) as name:
+    if data_format is None:
+      data_format = "NHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
 
+    ksize = _get_sequence(ksize, 2, channel_index, "ksize")
+    strides = _get_sequence(strides, 2, channel_index, "strides")
 
+    return gen_nn_ops.max_pool(
+        input,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
 # pylint: enable=redefined-builtin
 
 
@@ -3187,9 +3820,8 @@ def max_pool3d(input, ksize, strides, padding, data_format="NDHWC", name=None):
 # pylint: enable=redefined-builtin
 
 
-# pylint: disable=redefined-builtin
 @tf_export("nn.max_pool_with_argmax", v1=[])
-def max_pool_with_argmax_v2(input,
+def max_pool_with_argmax_v2(input,  # pylint: disable=redefined-builtin
                             ksize,
                             strides,
                             padding,
@@ -3257,12 +3889,12 @@ def max_pool_with_argmax_v2(input,
 
 
 @tf_export(v1=["nn.max_pool_with_argmax"])
-def max_pool_with_argmax_v1(input,  # pylint: disable=missing-docstring,invalid-name
+def max_pool_with_argmax_v1(input,  # pylint: disable=missing-docstring,redefined-builtin,invalid-name
                             ksize,
                             strides,
                             padding,
                             data_format="NHWC",
-                            Targmax=None,  # pylint: disable=invalid-name
+                            Targmax=None,
                             name=None,
                             output_dtype=None,
                             include_batch_in_index=False):
@@ -3283,7 +3915,6 @@ def max_pool_with_argmax_v1(input,  # pylint: disable=missing-docstring,invalid-
       name=name)
 
 max_pool_with_argmax_v1.__doc__ = gen_nn_ops.max_pool_with_argmax.__doc__
-# pylint: enable=redefined-builtin
 
 
 @ops.RegisterStatistics("Conv2D", "flops")
@@ -3353,7 +3984,7 @@ def xw_plus_b(x, weights, biases, name=None):  # pylint: disable=invalid-name
     return bias_add(mm, biases, name=name)
 
 
-def xw_plus_b_v1(x, weights, biases, name=None):  # pylint: disable=invalid-name
+def xw_plus_b_v1(x, weights, biases, name=None):
   """Computes matmul(x, weights) + biases.
 
   This is a deprecated version of that will soon be removed.
@@ -3407,7 +4038,7 @@ def _get_noise_shape(x, noise_shape):
                              "Rate should be set to `rate = 1 - keep_prob`.",
                              "keep_prob")
 def dropout(x, keep_prob=None, noise_shape=None, seed=None, name=None,
-            rate=None):  # pylint: disable=invalid-name
+            rate=None):
   """Computes dropout.
 
   For each element of `x`, with probability `rate`, outputs `0`, and otherwise
@@ -3457,7 +4088,7 @@ def dropout(x, keep_prob=None, noise_shape=None, seed=None, name=None,
 
 
 @tf_export("nn.dropout", v1=[])
-def dropout_v2(x, rate, noise_shape=None, seed=None, name=None):  # pylint: disable=invalid-name
+def dropout_v2(x, rate, noise_shape=None, seed=None, name=None):
   """Computes dropout.
 
   With probability `rate`, drops elements of `x`. Input that are kept are
@@ -3881,251 +4512,6 @@ def fractional_avg_pool_v2(value,
                                           seed=seed1, seed2=seed2, name=name)
 
 
-@tf_export(v1=["nn.conv1d"])
-@deprecation.deprecated_arg_values(
-    None,
-    "`NCHW` for data_format is deprecated, use `NCW` instead",
-    warn_once=True,
-    data_format="NCHW")
-@deprecation.deprecated_arg_values(
-    None,
-    "`NHWC` for data_format is deprecated, use `NWC` instead",
-    warn_once=True,
-    data_format="NHWC")
-def conv1d(value=None,
-           filters=None,
-           stride=None,
-           padding=None,
-           use_cudnn_on_gpu=None,
-           data_format=None,
-           name=None,
-           input=None):  # pylint: disable=redefined-builtin
-  r"""Computes a 1-D convolution given 3-D input and filter tensors.
-
-  Given an input tensor of shape
-    [batch, in_width, in_channels]
-  if data_format is "NWC", or
-    [batch, in_channels, in_width]
-  if data_format is "NCW",
-  and a filter / kernel tensor of shape
-  [filter_width, in_channels, out_channels], this op reshapes
-  the arguments to pass them to conv2d to perform the equivalent
-  convolution operation.
-
-  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
-  For example, if `data_format` does not start with "NC", a tensor of shape
-    [batch, in_width, in_channels]
-  is reshaped to
-    [batch, 1, in_width, in_channels],
-  and the filter is reshaped to
-    [1, filter_width, in_channels, out_channels].
-  The result is then reshaped back to
-    [batch, out_width, out_channels]
-  \(where out_width is a function of the stride and padding as in conv2d\) and
-  returned to the caller.
-
-  Args:
-    value: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
-    filters: A 3D `Tensor`.  Must have the same type as `value`.
-    stride: An `integer`.  The number of entries by which
-      the filter is moved right at each step.
-    padding: 'SAME' or 'VALID'
-    use_cudnn_on_gpu: An optional `bool`.  Defaults to `True`.
-    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults
-      to `"NWC"`, the data is stored in the order of
-      [batch, in_width, in_channels].  The `"NCW"` format stores
-      data as [batch, in_channels, in_width].
-    name: A name for the operation (optional).
-    input: Alias for value.
-
-  Returns:
-    A `Tensor`.  Has the same type as input.
-
-  Raises:
-    ValueError: if `data_format` is invalid.
-  """
-  value = deprecation.deprecated_argument_lookup("input", input, "value", value)
-  with ops.name_scope(name, "conv1d", [value, filters]) as name:
-    # Reshape the input tensor to [batch, 1, in_width, in_channels]
-    if data_format is None or data_format == "NHWC" or data_format == "NWC":
-      data_format = "NHWC"
-      spatial_start_dim = 1
-      strides = [1, 1, stride, 1]
-    elif data_format == "NCHW" or data_format == "NCW":
-      data_format = "NCHW"
-      spatial_start_dim = 2
-      strides = [1, 1, 1, stride]
-    else:
-      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
-    value = array_ops.expand_dims(value, spatial_start_dim)
-    filters = array_ops.expand_dims(filters, 0)
-    result = gen_nn_ops.conv2d(
-        value,
-        filters,
-        strides,
-        padding,
-        use_cudnn_on_gpu=use_cudnn_on_gpu,
-        data_format=data_format)
-    return array_ops.squeeze(result, [spatial_start_dim])
-
-
-@tf_export("nn.conv1d", v1=[])
-def conv1d_v2(input,  # pylint: disable=redefined-builtin
-              filters,
-              stride,
-              padding,
-              data_format=None,
-              name=None):
-  r"""Computes a 1-D convolution given 3-D input and filter tensors.
-
-  Given an input tensor of shape
-    [batch, in_width, in_channels]
-  if data_format is "NWC", or
-    [batch, in_channels, in_width]
-  if data_format is "NCW",
-  and a filter / kernel tensor of shape
-  [filter_width, in_channels, out_channels], this op reshapes
-  the arguments to pass them to conv2d to perform the equivalent
-  convolution operation.
-
-  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
-  For example, if `data_format` does not start with "NC", a tensor of shape
-    [batch, in_width, in_channels]
-  is reshaped to
-    [batch, 1, in_width, in_channels],
-  and the filter is reshaped to
-    [1, filter_width, in_channels, out_channels].
-  The result is then reshaped back to
-    [batch, out_width, out_channels]
-  \(where out_width is a function of the stride and padding as in conv2d\) and
-  returned to the caller.
-
-  Args:
-    input: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
-    filters: A 3D `Tensor`.  Must have the same type as `input`.
-    stride: An `integer`.  The number of entries by which
-      the filter is moved right at each step.
-    padding: 'SAME' or 'VALID'
-    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults
-      to `"NWC"`, the data is stored in the order of
-      [batch, in_width, in_channels].  The `"NCW"` format stores
-      data as [batch, in_channels, in_width].
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor`.  Has the same type as input.
-
-  Raises:
-    ValueError: if `data_format` is invalid.
-  """
-  return conv1d(input,  # pylint: disable=redefined-builtin
-                filters,
-                stride,
-                padding,
-                use_cudnn_on_gpu=True,
-                data_format=data_format,
-                name=name)
-
-
-def conv1d_transpose(
-    value,
-    filter,  # pylint: disable=redefined-builtin
-    output_shape,
-    stride,
-    padding="SAME",
-    data_format="NWC",
-    name=None):
-  """The transpose of `conv1d`.
-
-  This operation is sometimes called "deconvolution" after [Deconvolutional
-  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
-  actually the transpose (gradient) of `conv1d` rather than an actual
-  deconvolution.
-
-  Args:
-    value: A 3-D `Tensor` of type `float` and shape
-      `[batch, in_width, in_channels]` for `NWC` data format or
-      `[batch, in_channels, in_width]` for `NCW` data format.
-    filter: A 3-D `Tensor` with the same type as `value` and shape
-      `[filter_width, output_channels, in_channels]`.  `filter`'s
-      `in_channels` dimension must match that of `value`.
-    output_shape: A 1-D `Tensor` representing the output shape of the
-      deconvolution op.
-    stride: An `integer`.  The number of entries by which
-      the filter is moved right at each step.
-    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
-      See the "returns" section of `tf.nn.convolution` for details.
-    data_format: A string. 'NHWC' and 'NCHW' are supported.
-    name: Optional name for the returned tensor.
-
-  Returns:
-    A `Tensor` with the same type as `value`.
-
-  Raises:
-    ValueError: If input/output depth does not match `filter`'s shape, or if
-      padding is other than `'VALID'` or `'SAME'`.
-  """
-  with ops.name_scope(name, "conv1d_transpose",
-                      [value, filter, output_shape]) as name:
-    output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
-    if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(3)):
-      raise ValueError("output_shape must have shape (3,), got {}".format(
-          output_shape_.get_shape()))
-
-    # The format could be either NWC or NCW, map to NHWC or NCHW
-    if data_format is None or data_format == "NWC":
-      data_format_2d = "NHWC"
-      axis = 2
-    elif data_format == "NCW":
-      data_format_2d = "NCHW"
-      axis = 1
-    else:
-      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
-
-    if not value.get_shape().dims[axis].is_compatible_with(
-        filter.get_shape()[2]):
-      raise ValueError("input channels does not match filter's input channels, "
-                       "{} != {}".format(value.get_shape()[axis],
-                                         filter.get_shape()[2]))
-
-    if isinstance(output_shape, (list, np.ndarray)):
-      # output_shape's shape should be == [3] if reached this point.
-      if not filter.get_shape().dims[1].is_compatible_with(
-          output_shape[axis]):
-        raise ValueError(
-            "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[axis],
-                              filter.get_shape()[1]))
-
-    if padding != "VALID" and padding != "SAME":
-      raise ValueError("padding must be either VALID or SAME:"
-                       " {}".format(padding))
-
-    # Reshape the input tensor to [batch, 1, in_width, in_channels]
-    if data_format_2d == "NHWC":
-      output_shape_ = array_ops.concat(
-          [output_shape_[:1], [1], output_shape_[1:]], axis=0)
-      spatial_start_dim = 1
-      strides = [1, 1, stride, 1]
-    else:
-      output_shape_ = array_ops.concat(
-          [output_shape_[:2], [1], output_shape_[2:]], axis=0)
-      spatial_start_dim = 2
-      strides = [1, 1, 1, stride]
-    value = array_ops.expand_dims(value, spatial_start_dim)
-    filter = array_ops.expand_dims(filter, 0)  # pylint: disable=redefined-builtin
-
-    result = gen_nn_ops.conv2d_backprop_input(
-        input_sizes=output_shape_,
-        filter=filter,
-        out_backprop=value,
-        strides=strides,
-        padding=padding,
-        data_format=data_format_2d,
-        name=name)
-    return array_ops.squeeze(result, [spatial_start_dim])
-
-
 @ops.RegisterStatistics("Dilation2D", "flops")
 def _calc_dilation2d_flops(graph, node):
   """Calculates the compute resources needed for Dilation2D."""
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 74561349ed1de72037e2e2f3c5d16e4a7cb03ce5..d79e420589f7c8346a30281a88637ea5d8fc16d2 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -24,9 +24,11 @@ from absl.testing import parameterized
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
@@ -41,7 +43,6 @@ from tensorflow.python.ops.nn_impl import _compute_sampled_logits
 from tensorflow.python.platform import test as test_lib
 
 
-@test_util.disable_all_xla("This test never passed for XLA")
 class ZeroFractionTest(test_lib.TestCase):
 
   def _ZeroFraction(self, x):
@@ -1018,11 +1019,10 @@ class LeakyReluTest(test_lib.TestCase):
 class SwishTest(test_lib.TestCase):
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testValues(self):
     np_values = np.array(
-        [np.linspace(-10.0, 0.0, 100),
-         np.linspace(0.0, 10.0, 100)],
+        [np.linspace(-7.0, 0.0, 100),
+         np.linspace(0.0, 7.0, 100)],
         dtype=np.float32)
     tf_values = constant_op.constant(np_values)
     actual_tf_outputs = nn_impl.swish(tf_values)
@@ -1241,5 +1241,206 @@ class DataFormatVectorPermuteTest(test_lib.TestCase):
       self.assertAllEqual(y_val, [[7, 4], [4, 5], [5, 1], [9, 3]])
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class AvgPoolTest(test_lib.TestCase):
+
+  def test1DTensor(self):
+    x = array_ops.ones([3, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool1d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test1DNumpy(self):
+    x = np.ones([3, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool1d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DTensor(self):
+    x = array_ops.ones([3, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DNumpy(self):
+    x = np.ones([3, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DTensor(self):
+    x = array_ops.ones([3, 7, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool3d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DNumpy(self):
+    x = np.ones([3, 7, 6, 6, 5], dtype=np.float32)
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool3d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MaxPoolTest(test_lib.TestCase):
+
+  def test1DTensor(self):
+    x = array_ops.ones([3, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool1d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test1DNumpy(self):
+    x = np.ones([3, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool1d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DTensor(self):
+    x = array_ops.ones([3, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DNumpy(self):
+    x = np.ones([3, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DTensor(self):
+    x = array_ops.ones([3, 7, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool3d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DNumpy(self):
+    x = np.ones([3, 7, 6, 6, 5], dtype=np.float32)
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool3d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def testIncorrectSizeInputSmall(self):
+    x = array_ops.ones([3, 4])
+    with self.assertRaisesRegex(
+        ValueError, "Input tensor must be of rank 3, 4 or 5 but was 2."):
+      nn_ops.max_pool_v2(x, 2, 2, "SAME")
+
+  def testIncorrectSizeInput(self):
+    x = array_ops.ones([3, 4, 1, 2, 1, 2])
+    with self.assertRaisesRegex(
+        ValueError, "Input tensor must be of rank 3, 4 or 5 but was 6."):
+      nn_ops.max_pool_v2(x, 2, 2, "SAME")
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ConvolutionTest(test_lib.TestCase):
+
+  def testUnknownSize(self):
+    x = tensor_spec.TensorSpec(None, dtypes.float32, name="x")
+    k = np.ones([3, 6, 6, 5])
+
+    @def_function.function
+    def F(value):
+      return nn_ops.convolution(value, k, "SAME")
+
+    F.get_concrete_function(x)
+
+
+class ConvTransposeTest(test_lib.TestCase):
+
+  def test1DTensor(self):
+    t = array_ops.ones([2, 4, 3])
+    v = array_ops.ones([2, 5, 3])
+    strides = 2
+
+    y1 = nn_ops.conv1d_transpose(t, v, [2, 8, 5], strides)
+    y2 = nn_ops.conv_transpose(t, v, [2, 8, 5], strides)
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DTensor(self):
+    t = array_ops.ones([2, 4, 4, 3])
+    v = array_ops.ones([2, 2, 5, 3])
+    strides = 2
+
+    y1 = nn_ops.conv2d_transpose_v2(t, v, [2, 8, 8, 5], strides)
+    y2 = nn_ops.conv_transpose(t, v, [2, 8, 8, 5], strides)
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DTensor(self):
+    t = array_ops.ones([2, 4, 4, 4, 3])
+    v = array_ops.ones([2, 2, 2, 5, 3])
+    strides = 2
+
+    y1 = nn_ops.conv3d_transpose_v2(t, v, [2, 8, 8, 8, 5], strides)
+    y2 = nn_ops.conv_transpose(t, v, [2, 8, 8, 8, 5], strides)
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def testIncorrectSizeInputSmall(self):
+    with self.assertRaisesRegex(
+        ValueError, "output_shape must be of length 3, 4 or 5 but was 2."):
+      nn_ops.conv_transpose(None, 2, [2, 3], "SAME")
+
+  def testIncorrectSizeInput(self):
+    with self.assertRaisesRegex(
+        ValueError, "output_shape must be of length 3, 4 or 5 but was 6."):
+      nn_ops.conv_transpose(None, 2, [2, 3, 4, 2, 5, 1], "SAME")
+
+  def testTensorsNoShape(self):
+    with self.assertRaisesRegex(ValueError, "output_shape cannot be None"):
+      nn_ops.conv_transpose(None, None, None, None)
+
+
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/python/ops/parallel_for/BUILD b/tensorflow/python/ops/parallel_for/BUILD
index 05d2e4c7fcd15ec63824531b119bac0e2403f836..0a2f3e25a29c5423915b5e9383867e5f690587b9 100644
--- a/tensorflow/python/ops/parallel_for/BUILD
+++ b/tensorflow/python/ops/parallel_for/BUILD
@@ -115,6 +115,7 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:util",
     ],
+    tags = ["no_rocm"],
 )
 
 cuda_py_test(
@@ -129,6 +130,7 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python/eager:backprop",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -143,6 +145,7 @@ cuda_py_test(
         "//tensorflow/python:util",
     ],
     tags = ["optonly"],  # Too slow in non-opt mode
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
diff --git a/tensorflow/python/ops/parallel_for/math_test.py b/tensorflow/python/ops/parallel_for/math_test.py
index 7a5bef722939c7916c89e3903ac0d24fe355eee9..8a081e194f14ddc1eed4aed846a02706c051a71a 100644
--- a/tensorflow/python/ops/parallel_for/math_test.py
+++ b/tensorflow/python/ops/parallel_for/math_test.py
@@ -161,7 +161,6 @@ class MathTest(PForTestCase):
         math_ops.divide,
         math_ops.div_no_nan,
         math_ops.equal,
-        math_ops.floor_div,
         math_ops.floor_mod,
         math_ops.greater,
         math_ops.greater_equal,
@@ -182,6 +181,10 @@ class MathTest(PForTestCase):
         safe_polygamma,
         safe_zeta,
     ]
+    # FloorDiv fails on XLA due floor's discontinuities exacerbating small
+    # division differences.
+    if not test_util.is_xla_enabled():
+      float_ops += [math_ops.floor_div]
     for op in logical_ops + float_ops:
       x = random_ops.random_uniform([7, 3, 5])
       y = random_ops.random_uniform([3, 5])
diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py
index 019d4f2224363ac6b8371153c54f9d2c99280c7f..b9f7a0ffca5af9b1283da0d7b1f88d9bad8aba2b 100644
--- a/tensorflow/python/ops/parallel_for/pfor.py
+++ b/tensorflow/python/ops/parallel_for/pfor.py
@@ -1903,7 +1903,7 @@ def _convert_cumfoo(pfor_input, _, op_func):
 def _convert_biasadd(pfor_input):
   t, t_stacked, _ = pfor_input.input(0)
   bias, bias_stacked, _ = pfor_input.input(1)
-  data_format = pfor_input.get_attr("data_format")
+  data_format = pfor_input.get_attr("data_format").decode()
   if bias_stacked:
     # BiasAdd only supports 1-D biases, so cast bias to match value and use Add.
     pfor_input.expanddim_inputs_for_broadcast()
@@ -1921,7 +1921,7 @@ def _convert_biasadd(pfor_input):
       shape = array_ops.shape(t)
       flattened_shape = array_ops.concat([[-1], shape[2:]], axis=0)
       t = array_ops.reshape(t, flattened_shape)
-      t = nn_ops.bias_add(t, bias, data_format=b"NCHW")
+      t = nn_ops.bias_add(t, bias, data_format="NCHW")
       t = array_ops.reshape(t, shape)
       return wrap(t, True)
     return wrap(nn_ops.bias_add(t, bias, data_format=data_format), True)
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch.py b/tensorflow/python/ops/ragged/ragged_dispatch.py
index 0c9e6efe8bcee5d4c3d3e9aceda1160709f9c00a..3bda777482bf0965939c0a6a6d1a82c95d669aaf 100644
--- a/tensorflow/python/ops/ragged/ragged_dispatch.py
+++ b/tensorflow/python/ops/ragged/ragged_dispatch.py
@@ -470,7 +470,8 @@ def register_dispatchers():
       _BINARY_ELEMENTWISE_OPS + [x[0] for x in _RAGGED_DISPATCH_OPS])
   for op in op_list:
     _, undecorated_op = tf_decorator.unwrap(op)
-    if not hasattr(undecorated_op, tf_export.API_ATTRS['tensorflow'].names):
+    if not hasattr(undecorated_op,
+                   tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names):
       raise AssertionError('Expected %s to be an exported symbol '
                            '(while adding a RaggedTensor dispatcher)')
 
diff --git a/tensorflow/python/ops/ragged/ragged_map_ops.py b/tensorflow/python/ops/ragged/ragged_map_ops.py
index fbe188bd1a305c1b366461528139bfcbb85b6367..1d342512c0206c8877f8e669c9e3df78d784fd8f 100644
--- a/tensorflow/python/ops/ragged/ragged_map_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_map_ops.py
@@ -12,11 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Functional operations.
-
-See the [Higher Order
-Functions](https://tensorflow.org/api_guides/python/functional_ops) guide.
-"""
+"""Functional operations for RaggedTensors."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/ops/ragged/segment_id_ops.py b/tensorflow/python/ops/ragged/segment_id_ops.py
index 42dc13223b67e2505578baefb783bc81182ec150..31e26e7c9d8b913e538b284654aa80d80c55150b 100644
--- a/tensorflow/python/ops/ragged/segment_id_ops.py
+++ b/tensorflow/python/ops/ragged/segment_id_ops.py
@@ -29,7 +29,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 # For background on "segments" and "segment ids", see:
-# https://www.tensorflow.org/api_guides/python/math_ops#Segmentation
+# https://www.tensorflow.org/api_docs/python/tf/math#Segmentation
 @tf_export("ragged.row_splits_to_segment_ids")
 def row_splits_to_segment_ids(splits, name=None):
   """Generates the segmentation corresponding to a RaggedTensor `row_splits`.
@@ -64,7 +64,7 @@ def row_splits_to_segment_ids(splits, name=None):
 
 
 # For background on "segments" and "segment ids", see:
-# https://www.tensorflow.org/api_guides/python/math_ops#Segmentation
+# https://www.tensorflow.org/api_docs/python/tf/math#Segmentation
 @tf_export("ragged.segment_ids_to_row_splits")
 def segment_ids_to_row_splits(segment_ids, num_segments=None, name=None):
   """Generates the RaggedTensor `row_splits` corresponding to a segmentation.
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index c1f11e89a075e06d03af3999d7640b20747f8eeb..afc9e978d0519645d448c227637544d893c0bd38 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -43,7 +43,7 @@ from tensorflow.python.ops import variables
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_resource_variable_ops import *
 # pylint: enable=wildcard-import
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated
 
@@ -505,8 +505,8 @@ class ResourceVariable(variables.VariableV1):
     if constraint is not None and not callable(constraint):
       raise ValueError("The `constraint` argument must be a callable.")
 
-    if isinstance(initial_value, checkpointable.CheckpointInitialValue):
-      self._maybe_initialize_checkpointable()
+    if isinstance(initial_value, trackable.CheckpointInitialValue):
+      self._maybe_initialize_trackable()
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
@@ -1684,7 +1684,7 @@ def copy_to_graph_uninitialized(var):
       constraint=var._constraint,
       dtype=var.dtype,
       name=var._shared_name)
-  new_variable._maybe_initialize_checkpointable()
+  new_variable._maybe_initialize_trackable()
   # pylint: enable=protected-access
   return new_variable
 
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 603baea281bba362ad28c00a21c0b069a02ae375..cb9377df363351dccffd1155a14bdf0a9f4abcfc 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -50,7 +50,7 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
@@ -1095,8 +1095,8 @@ class _RNNCellWrapperV1(RNNCell):
   def __init__(self, cell):
     super(_RNNCellWrapperV1, self).__init__()
     self._cell = cell
-    if isinstance(cell, checkpointable.Checkpointable):
-      self._track_checkpointable(self._cell, name="cell")
+    if isinstance(cell, trackable.Trackable):
+      self._track_trackable(self._cell, name="cell")
 
   def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs):
     """Calls the wrapped cell and performs the wrapping logic.
@@ -1611,8 +1611,8 @@ class DeviceWrapper(RNNCell):
     """
     super(DeviceWrapper, self).__init__()
     self._cell = cell
-    if isinstance(cell, checkpointable.Checkpointable):
-      self._track_checkpointable(self._cell, name="cell")
+    if isinstance(cell, trackable.Trackable):
+      self._track_trackable(self._cell, name="cell")
     self._device = device
 
   @property
@@ -1678,11 +1678,11 @@ class MultiRNNCell(RNNCell):
 
     self._cells = cells
     for cell_number, cell in enumerate(self._cells):
-      # Add Checkpointable dependencies on these cells so their variables get
+      # Add Trackable dependencies on these cells so their variables get
       # saved with this object when using object-based saving.
-      if isinstance(cell, checkpointable.Checkpointable):
-        # TODO(allenl): Track down non-Checkpointable callers.
-        self._track_checkpointable(cell, name="cell-%d" % (cell_number,))
+      if isinstance(cell, trackable.Trackable):
+        # TODO(allenl): Track down non-Trackable callers.
+        self._track_trackable(cell, name="cell-%d" % (cell_number,))
     self._state_is_tuple = state_is_tuple
     if not state_is_tuple:
       if any(nest.is_sequence(c.state_size) for c in self._cells):
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index a5b31aff91660a6ac79c980dffb543e87fd40dfa..63b5eab56a3a6dc434ef03c3477945a9860bef65 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -391,12 +391,16 @@ def eager_py_func(func, inp, Tout, name=None):
 
 @deprecation.deprecated(
     date=None,
-    instructions="""tf.py_func is deprecated in TF V2. Instead, use
-    tf.py_function, which takes a python function which manipulates tf eager
+    instructions="""tf.py_func is deprecated in TF V2. Instead, there are two
+    options available in V2.
+    - tf.py_function takes a python function which manipulates tf eager
     tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
     an ndarray (just call tensor.numpy()) but having access to eager tensors
     means `tf.py_function`s can use accelerators such as GPUs as well as
     being differentiable using a gradient tape.
+    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
+    (it is not differentiable, and manipulates numpy arrays). It drops the
+    stateful argument making all functions stateful.
     """)
 @tf_export(v1=["py_func"])
 def py_func(func, inp, Tout, stateful=True, name=None):
@@ -467,6 +471,13 @@ def py_func(func, inp, Tout, stateful=True, name=None):
   return _internal_py_func(
       func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name)
 
+@tf_export("numpy_function", v1=[])
+def numpy_function(func, inp, Tout, name=None):
+  return py_func(func, inp, Tout, stateful=True, name=name)
+
+numpy_function.__doc__ = py_func.__doc__.replace(
+    "py_func", "numpy_function")
+
 
 ops.NotDifferentiable("PyFunc")
 ops.NotDifferentiable("PyFuncStateless")
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index 0224e7ebb8aa6d35d20ab43a303887d325e00441..41ba060a4b666ceb7ecdf04431b14dce48cfba4f 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -119,7 +119,6 @@ class LBetaTest(test.TestCase):
           special_math_ops.lbeta(x).get_shape())
 
   @test_util.run_in_graph_and_eager_modes
-  @test_util.disable_xla('This test never passed for XLA')
   def test_length_1_last_dimension_results_in_one(self):
     # If there is only one coefficient, the formula still works, and we get one
     # as the answer, always.
@@ -127,7 +126,9 @@ class LBetaTest(test.TestCase):
     x_b = [0.1]
     with self.session(use_gpu=True):
       self.assertAllClose(
-          1, self.evaluate(math_ops.exp(special_math_ops.lbeta(x_a))))
+          1,
+          self.evaluate(math_ops.exp(special_math_ops.lbeta(x_a))),
+          rtol=3e-6)
       self.assertAllClose(
           1, self.evaluate(math_ops.exp(special_math_ops.lbeta(x_b))))
       self.assertEqual((), special_math_ops.lbeta(x_a).get_shape())
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index ba3bd094923abf2929d9e64e9f9bdb7d60cf4c80..5e217d8ed2f3bbe427c144700e485d1be339545f 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -54,6 +54,7 @@ from tensorflow.python.ops.control_flow_ops import tuple  # pylint: disable=rede
 # pylint: enable=redefined-builtin
 from tensorflow.python.eager import wrap_function
 from tensorflow.python.ops.control_flow_ops import while_loop
+from tensorflow.python.ops.critical_section_ops import *
 from tensorflow.python.ops.data_flow_ops import *
 from tensorflow.python.ops.functional_ops import *
 from tensorflow.python.ops.gradients import *
diff --git a/tensorflow/python/ops/stateful_random_ops.py b/tensorflow/python/ops/stateful_random_ops.py
index 155ad969f67073f151fbdc295570e85af5dc22b1..91ca25254bc28b865c29b42fd5fa8082a9f24241 100644
--- a/tensorflow/python/ops/stateful_random_ops.py
+++ b/tensorflow/python/ops/stateful_random_ops.py
@@ -27,7 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_stateful_random_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.training.checkpointable import \
+from tensorflow.python.training.tracking import \
 tracking
 from tensorflow.python.util.tf_export import tf_export
 
@@ -51,6 +51,7 @@ SEED_SIZE = 16  # in units of SEED_TYPE
 
 
 STATE_TYPE = SEED_TYPE
+ALGORITHM_TYPE = STATE_TYPE
 RNG_ALG_PHILOX = 1
 DEFAULT_ALGORITHM = RNG_ALG_PHILOX
 
@@ -119,26 +120,18 @@ def _make_state_from_seed(seed, algorithm):
     raise ValueError("Unsupported algorithm id: %s" % algorithm)
 
 
-def create_rng_state(seed, algorithm=None):
+@tf_export("random.create_rng_state")
+def create_rng_state(seed, algorithm):
   """Creates a RNG state.
 
   Args:
     seed: an integer or 1-D tensor.
-    algorithm: (optional) an integer representing the RNG algorithm. If None, an
-      algorithm will be auto-selected.
+    algorithm: an integer representing the RNG algorithm.
 
   Returns:
-    a 1-D tensor "rng_state" with:
-    * rng_state[0] is a value that identifies the RNG algorithm;
-    * rng_state[1:] holds the RNG state itself (size dependent on the
-        algorithm).
+    a 1-D tensor whose size depends on the algorithm.
   """
-  if algorithm is None:
-    # TODO(wangpeng): more sophisticated algorithm selection
-    algorithm = DEFAULT_ALGORITHM
-  state = _make_state_from_seed(seed, algorithm)
-  return np.concatenate((np.array([algorithm], dtype=STATE_TYPE), state),
-                        axis=None)
+  return _make_state_from_seed(seed, algorithm)
 
 
 def _shape_tensor(shape):
@@ -151,24 +144,48 @@ def _shape_tensor(shape):
 
 
 @tf_export("random.experimental.Generator")
-class Generator(tracking.AutoCheckpointable):
+class Generator(tracking.AutoTrackable):
   """Random-number generator.
 
   It uses Variable to manage its internal state.
   """
 
   def __init__(self, copy_from=None, seed=None, algorithm=None):
+    """Creates a generator.
+
+    Args:
+      copy_from: (optional) a generator to be copied from.
+      seed: (optional) the seed for the RNG. If None, it will be chosen
+            nondeterministically
+      algorithm: (optional) the RNG algorithm. If None, it will be
+                 auto-selected.
+    """
     if copy_from is None:
       if seed is None:
         seed = non_deterministic_seed()
+      if algorithm is None:
+        # TODO(wangpeng): more sophisticated algorithm selection
+        algorithm = DEFAULT_ALGORITHM
       state = create_rng_state(seed, algorithm)
       self._state_var = variables.Variable(state, dtype=STATE_TYPE)
+      self._alg_var = variables.Variable(initial_value=algorithm,
+                                         dtype=ALGORITHM_TYPE)
     else:
       assert seed is None
-      state = copy_from.state
-      self._state_var = variables.Variable(state, dtype=STATE_TYPE)
+      self._state_var = variables.Variable(copy_from.state, dtype=STATE_TYPE)
+      self._alg_var = variables.Variable(initial_value=copy_from.algorithm,
+                                         dtype=ALGORITHM_TYPE)
 
   def reset(self, seed):
+    """Resets the generator.
+
+    This function is not thread-safe: if it is run concurrently with a call to
+    sampling, the latter might see the new algorithm but the old state or vice
+    versa.
+
+    Args:
+      seed: the seed to reset the RNG to.
+    """
     algorithm = int(self.algorithm)
     state = create_rng_state(seed, algorithm)
     self._state_var.assign(state)
@@ -179,13 +196,14 @@ class Generator(tracking.AutoCheckpointable):
 
   @property
   def algorithm(self):
-    return self._state_var[0]
+    return self._alg_var
 
   # The following functions return a tensor and as a side effect update
   # self._state_var.
   def standard_normal(self, shape, dtype=dtypes.float32):
-    return gen_stateful_random_ops.stateful_standard_normal(
-        self.state.handle, shape, dtype)
+    output = gen_stateful_random_ops.stateful_standard_normal_v2(
+        self.state.handle, self.algorithm, shape, dtype)
+    return output
 
   def normal(self, shape, mean=0.0, stddev=1.0, dtype=dtypes.float32,
              name=None):
@@ -237,5 +255,6 @@ def set_global_generator(generator):
 def reset_global_generator(seed, algorithm=None):
   global global_generator
   if algorithm is None:
-    algorithm = int(global_generator.algorithm)  # preserve the old algorithm
+    # preserve the old algorithm
+    algorithm = int(get_global_generator().algorithm)
   global_generator = Generator(seed=seed, algorithm=algorithm)
diff --git a/tensorflow/python/ops/stateful_random_ops_test.py b/tensorflow/python/ops/stateful_random_ops_test.py
index 92419a0770e302acce205cf9c2c37023da72434c..0436736e004ee7882a9b1835ad4076128a002548 100644
--- a/tensorflow/python/ops/stateful_random_ops_test.py
+++ b/tensorflow/python/ops/stateful_random_ops_test.py
@@ -18,15 +18,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_random_ops
+from tensorflow.python.ops import gen_stateful_random_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import stateful_random_ops as \
 random
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -39,8 +43,8 @@ class StatefulRandomOpsTest(test.TestCase):
                                     random.RNG_ALG_PHILOX)
     self.assertAllEqual(
         list(map(random._uint_to_int,
-                 [random.RNG_ALG_PHILOX, 0xFFAA666677778888,
-                  0xFFFF222233334444] + [0] * (random.PHILOX_STATE_SIZE - 2))),
+                 [0xFFAA666677778888, 0xFFFF222233334444] +
+                 [0] * (random.PHILOX_STATE_SIZE - 2))),
         state)
 
   @test_util.run_v2_only
@@ -112,27 +116,81 @@ class StatefulRandomOpsTest(test.TestCase):
     compare(True, False)
 
   @test_util.run_v2_only
-  def testSameAsOldRandomOps(self):
-    """Tests that the generated numbers are the same as the old random_ops.py .
+  def testCPUSameAsOldRandomOps(self):
+    """Tests that the generated numbers are the same as the old random_ops.py.
+
+    The CPU version.
     """
-    seed1, seed2 = 50, 60
+    seed1, seed2 = 79, 25
     # note how the two seeds for the old op correspond to the seed for the new
     # op
-    random.get_global_generator().reset([0, seed2, seed1])
-    shape = constant_op.constant([2, 3])
-    dtype = dtypes.float32
+    with ops.device("/device:CPU:0"):
+      random.reset_global_generator([0, seed2, seed1])
+    shape = constant_op.constant([4, 7])
+    dtype = dtypes.float64
+
     # create a graph for the old op in order to call it many times
     @def_function.function
     def old():
-      return gen_random_ops.random_standard_normal(
-          shape, dtype=dtype, seed=seed1, seed2=seed2)
+      with ops.device("/device:CPU:0"):
+        return gen_random_ops.random_standard_normal(
+            shape, dtype=dtype, seed=seed1, seed2=seed2)
+
+    def new():
+      with ops.device("/device:CPU:0"):
+        return random.get_global_generator().standard_normal(shape, dtype=dtype)
+
+    for _ in range(100):
+      self.assertAllEqual(old(), new())
+
+  @test_util.run_v2_only
+  @test_util.run_cuda_only
+  def testGPUSameAsOldRandomOps(self):
+    """Tests that the generated numbers are the same as the old random_ops.py.
+
+    The GPU version.
+    """
+    seed1, seed2 = 79, 25
+    with ops.device(test_util.gpu_device_name()):
+      random.reset_global_generator([0, seed2, seed1])
+    shape = constant_op.constant([4, 7])
+    dtype = dtypes.float64
+
+    @def_function.function
+    def old():
+      with ops.device(test_util.gpu_device_name()):
+        return gen_random_ops.random_standard_normal(
+            shape, dtype=dtype, seed=seed1, seed2=seed2)
 
     def new():
-      return random.get_global_generator().standard_normal(shape, dtype=dtype)
+      with ops.device(test_util.gpu_device_name()):
+        return random.get_global_generator().standard_normal(shape, dtype=dtype)
 
     for _ in range(100):
       self.assertAllEqual(old(), new())
 
+  @test_util.run_v2_only
+  def testStatefulStandardNormal(self):
+    """Tests that op 'StatefulStandardNormal' still works.
+    """
+    shape = constant_op.constant([4, 7])
+    dtype = dtypes.float64
+    seed = 1234
+    algorithm = random.RNG_ALG_PHILOX
+    state = random._make_state_from_seed(seed, algorithm)
+    with ops.device("/device:CPU:0"):
+      var1 = variables.Variable(
+          np.concatenate((np.array([algorithm], dtype=random.STATE_TYPE),
+                          state), axis=None),
+          dtype=random.STATE_TYPE)
+      var2 = variables.Variable(state, dtype=random.STATE_TYPE)
+      for _ in range(100):
+        t1 = gen_stateful_random_ops.stateful_standard_normal(
+            var1.handle, shape, dtype)
+        t2 = gen_stateful_random_ops.stateful_standard_normal_v2(
+            var2.handle, algorithm, shape, dtype)
+        self.assertAllEqual(t1, t2)
+
   @test_util.run_v2_only
   def testResetGlobalGeneratorBadWithDefun(self):
     """Demonstrates that reset_global_generator don't work properly with defun.
@@ -145,7 +203,7 @@ class StatefulRandomOpsTest(test.TestCase):
 
     random.reset_global_generator(50)
     with self.assertRaisesWithPredicateMatch(
-        errors_impl.NotFoundError, "Resource .+ does not exist"):
+        AssertionError, "variable.*deleted"):
       a = f()
       random.reset_global_generator(50)
       b = f()
diff --git a/tensorflow/python/ops/summary_ops_v2.py b/tensorflow/python/ops/summary_ops_v2.py
index 168cb975548095be4648a9e705deb797241363c7..6ddecb2948bcf7fa6165155c86edaf0a76b629b6 100644
--- a/tensorflow/python/ops/summary_ops_v2.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -27,6 +27,8 @@ import time
 import six
 
 from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import summary_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -45,11 +47,6 @@ from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
 
-# Dictionary mapping graph keys to a boolean Tensor (or callable returning
-# a boolean Tensor) indicating whether we should record summaries for the
-# graph identified by the key of the dictionary.
-_SHOULD_RECORD_SUMMARIES = {}
-
 # A global dictionary mapping graph keys to a list of summary writer init ops.
 _SUMMARY_WRITER_INIT_OP = {}
 
@@ -61,10 +58,8 @@ _USER_NAME_PATTERNS = re.compile(r"^[a-z]([-a-z0-9]{0,29}[a-z0-9])?$", re.I)
 def _should_record_summaries_internal():
   """Returns boolean Tensor if summaries should/shouldn't be recorded, or None.
   """
-  global _SHOULD_RECORD_SUMMARIES
-  key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  should = _SHOULD_RECORD_SUMMARIES.get(key)
-  return should() if callable(should) else should
+  condition = context.context().recording_summaries
+  return condition() if callable(condition) else condition
 
 
 def _should_record_summaries_v2():
@@ -83,32 +78,28 @@ def should_record_summaries():
   return False if result is None else result
 
 
+@tf_export("summary.record_if", v1=[])
 @tf_contextlib.contextmanager
-def _record_summaries(boolean=True):
+def record_if(condition):
   """Sets summary recording on or off per the provided boolean value.
 
   The provided value can be a python boolean, a scalar boolean Tensor, or
   or a callable providing such a value; if a callable is passed it will be
-  invoked each time should_record_summaries() is called to determine whether
-  summary writing should be enabled.
+  invoked on-demand to determine whether summary writing will occur.
 
   Args:
-    boolean: can be True, False, a bool Tensor, or a callable providing such.
-      Defaults to True.
+    condition: can be True, False, a bool Tensor, or a callable providing such.
 
   Yields:
     Returns a context manager that sets this value on enter and restores the
     previous value on exit.
   """
-  # TODO(nickfelt): make this threadlocal
-  global _SHOULD_RECORD_SUMMARIES
-  key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  old = _SHOULD_RECORD_SUMMARIES.setdefault(key, None)
+  old = context.context().recording_summaries
   try:
-    _SHOULD_RECORD_SUMMARIES[key] = boolean
+    context.context().recording_summaries = condition
     yield
   finally:
-    _SHOULD_RECORD_SUMMARIES[key] = old
+    context.context().recording_summaries = old
 
 
 # TODO(apassos) consider how to handle local step here.
@@ -120,17 +111,17 @@ def record_summaries_every_n_global_steps(n, global_step=None):
     should = lambda: math_ops.equal(global_step % n, 0)
     if not context.executing_eagerly():
       should = should()
-  return _record_summaries(should)
+  return record_if(should)
 
 
 def always_record_summaries():
   """Sets the should_record_summaries Tensor to always true."""
-  return _record_summaries(True)
+  return record_if(True)
 
 
 def never_record_summaries():
   """Sets the should_record_summaries Tensor to always false."""
-  return _record_summaries(False)
+  return record_if(False)
 
 
 @tf_export("summary.SummaryWriter", v1=[])
@@ -195,16 +186,19 @@ class SummaryWriter(object):
       return self._close()
 
 
+@tf_export(v1=["summary.initialize"])
 def initialize(
     graph=None,  # pylint: disable=redefined-outer-name
     session=None):
   """Initializes summary writing for graph execution mode.
 
+  This operation is a no-op when executing eagerly.
+
   This helper method provides a higher-level alternative to using
   `tf.contrib.summary.summary_writer_initializer_op` and
   `tf.contrib.summary.graph`.
 
-  Most users will also want to call `tf.train.create_global_step`
+  Most users will also want to call `tf.compat.v1.train.create_global_step`
   which can happen before or after this function is called.
 
   Args:
@@ -739,3 +733,72 @@ def _choose_step(step):
   if not isinstance(step, ops.Tensor):
     return ops.convert_to_tensor(step, dtypes.int64)
   return step
+
+
+def run_metadata(name, data, step):
+  """Writes entire RunMetadata summary.
+
+  A RunMetadata can contain DeviceStats, partition graphs, and function graphs.
+  Please refer to the proto for definition of each field.
+
+  Args:
+    name: A name for this summary. The summary tag used for TensorBoard will be
+      this name prefixed by any active name scopes.
+    data: A RunMetadata proto to write.
+    step: Required `int64`-castable monotonic step value.
+
+  Returns:
+    True on success, or false if no summary was written because no default
+    summary writer was available.
+  """
+  summary_metadata = summary_pb2.SummaryMetadata()
+  # Hard coding a plugin name. Please refer to go/tb-plugin-name-hardcode for
+  # the rationale.
+  summary_metadata.plugin_data.plugin_name = "graph_run_metadata"
+  # version number = 1
+  summary_metadata.plugin_data.content = "1"
+
+  with summary_scope(name,
+                     "graph_run_metadata_summary",
+                     [data, step]) as (tag, _):
+    return write(
+        tag=tag,
+        tensor=constant_op.constant(
+            data.SerializeToString(), dtype=dtypes.string),
+        step=step,
+        metadata=summary_metadata)
+
+
+def run_metadata_graphs(name, data, step):
+  """Writes graphs from a RunMetadata summary.
+
+  Args:
+    name: A name for this summary. The summary tag used for TensorBoard will be
+      this name prefixed by any active name scopes.
+    data: A RunMetadata proto to write.
+    step: Required `int64`-castable monotonic step value.
+
+  Returns:
+    True on success, or false if no summary was written because no default
+    summary writer was available.
+  """
+  summary_metadata = summary_pb2.SummaryMetadata()
+  # Hard coding a plugin name. Please refer to go/tb-plugin-name-hardcode for
+  # the rationale.
+  summary_metadata.plugin_data.plugin_name = "graph_run_metadata_graph"
+  # version number = 1
+  summary_metadata.plugin_data.content = "1"
+
+  data = config_pb2.RunMetadata(
+      function_graphs=data.function_graphs,
+      partition_graphs=data.partition_graphs)
+
+  with summary_scope(name,
+                     "graph_run_metadata_graph_summary",
+                     [data, step]) as (tag, _):
+    return write(
+        tag=tag,
+        tensor=constant_op.constant(
+            data.SerializeToString(), dtype=dtypes.string),
+        step=step,
+        metadata=summary_metadata)
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index e02175d6feaf1dd74c560bec9e7815cbe20eddc5..ff4f23a0e75157f00167bdd6d9001fc6fa53a0a0 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -26,8 +26,8 @@ from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.checkpointable import util as checkpointable_util
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.training.tracking import util as trackable_util
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.deprecation import deprecated
@@ -232,7 +232,7 @@ def _skip_common_stack_elements(stacktrace, base_case):
   return stacktrace[-1:]
 
 
-class Template(checkpointable.Checkpointable):
+class Template(trackable.Trackable):
   """Wrap a function to aid in variable sharing.
 
   Templates are functions that create variables the first time they are called
@@ -306,8 +306,8 @@ class Template(checkpointable.Checkpointable):
         result = self._func(*args, **kwargs)
       else:
         # The first time we run, restore variables if necessary (via
-        # Checkpointable).
-        with checkpointable_util.capture_dependencies(template=self):
+        # Trackable).
+        with trackable_util.capture_dependencies(template=self):
           result = self._func(*args, **kwargs)
 
       if self._variables_created:
@@ -577,8 +577,8 @@ class EagerTemplate(Template):
         result = self._func(*args, **kwargs)
       else:
         # The first time we run, restore variables if necessary (via
-        # Checkpointable).
-        with checkpointable_util.capture_dependencies(template=self):
+        # Trackable).
+        with trackable_util.capture_dependencies(template=self):
           result = self._func(*args, **kwargs)
 
       if self._variables_created:
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index 3c0f98466b25726d28936d1180fb145700d77926..65dee7797cf82c5a834f38b3d1a9efe95230a707 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -575,18 +575,29 @@ class _GraphTensorArrayV2(object):
   def stack(self, name=None):
     """See TensorArray."""
     with ops.name_scope(name, "TensorArrayV2Stack", [self._flow]):
+      if self._element_shape:
+        element_shape = self._element_shape[0]
+      else:
+        element_shape = tensor_shape.TensorShape(None)
       value = list_ops.tensor_list_stack(
-          input_handle=self._flow, element_dtype=self._dtype)
+          input_handle=self._flow,
+          element_dtype=self._dtype,
+          element_shape=element_shape)
       if self._element_shape and self._element_shape[0].dims is not None:
         value.set_shape([None] + self._element_shape[0].dims)
       return value
 
   def gather(self, indices, name=None):
     """See TensorArray."""
+    if self._element_shape:
+      element_shape = self._element_shape[0]
+    else:
+      element_shape = tensor_shape.TensorShape(None)
     value = list_ops.tensor_list_gather(
         input_handle=self._flow,
         indices=indices,
         element_dtype=self._dtype,
+        element_shape=element_shape,
         name=name)
     if self._element_shape and self._element_shape[0].dims is not None:
       value.set_shape([None] + self._element_shape[0].dims)
@@ -627,7 +638,7 @@ class _GraphTensorArrayV2(object):
         self._merge_element_shape(value.shape[1:])
       element_shape = self._element_shape[0] if self._element_shape else None
       flow_out = list_ops.tensor_list_scatter(
-          tensor=value, indices=indices, element_shape=element_shape)
+          tensor=value, indices=indices, input_handle=self._flow)
       return build_ta_with_new_flow(self, flow_out)
 
   @tf_should_use.should_use_result
@@ -825,7 +836,7 @@ class _EagerTensorArray(object):
     if self._infer_shape:
       if self._element_shape is None:
         self._element_shape = value.shape
-      elif self._element_shape != value.shape:
+      elif not self._element_shape.is_compatible_with(value.shape):
         raise ValueError("Incompatible shape for value (%s), expected (%s)" %
                          (value.shape.as_list(), self._element_shape.as_list()))
 
@@ -860,7 +871,9 @@ class _EagerTensorArray(object):
   def gather(self, indices, name=None):
     """See TensorArray."""
     del name  # not meaningful when executing eagerly.
-    return array_ops.stack([self._maybe_zero(i) for i in indices.numpy()])
+    if isinstance(indices, ops.EagerTensor):
+      indices = indices.numpy()
+    return array_ops.stack([self._maybe_zero(i) for i in indices])
 
   def concat(self, name=None):
     """See TensorArray."""
@@ -893,7 +906,9 @@ class _EagerTensorArray(object):
   def scatter(self, indices, value, name=None):
     """See TensorArray."""
     del name  # not meaningful when executing eagerly.
-    for index, val in zip(indices.numpy(), array_ops.unstack(value)):
+    if isinstance(indices, ops.EagerTensor):
+      indices = indices.numpy()
+    for index, val in zip(indices, array_ops.unstack(value)):
       self._write(index, val)  # pylint: disable=protected-access
     return self.parent()
 
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 7788f28bc30885db5c5e2745a5151a1783a4dc06..219ba7fbb2ef8de3f0ebf020b95b1b3c945cc12d 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -35,7 +35,7 @@ from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import compat
 from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.deprecation import deprecated
@@ -204,7 +204,7 @@ class VariableMetaclass(type):
 
 @tf_export("Variable", v1=[])
 class Variable(six.with_metaclass(VariableMetaclass,
-                                  checkpointable.Checkpointable)):
+                                  trackable.Trackable)):
   """See the [Variables Guide](https://tensorflow.org/guide/variables).
 
   A variable maintains state in the graph across calls to `run()`. You add a
@@ -1018,8 +1018,8 @@ class Variable(six.with_metaclass(VariableMetaclass,
     return self.shape
 
   def _gather_saveables_for_checkpoint(self):
-    """For implementing `Checkpointable`. This object is saveable on its own."""
-    return {checkpointable.VARIABLE_VALUE_KEY: self}
+    """For implementing `Trackable`. This object is saveable on its own."""
+    return {trackable.VARIABLE_VALUE_KEY: self}
 
   def to_proto(self, export_scope=None):
     """Converts a `Variable` to a `VariableDef` protocol buffer.
@@ -1506,8 +1506,8 @@ class RefVariable(VariableV1):
     # Store the graph key so optimizers know how to only retrieve variables from
     # this graph.
     self._graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-    if isinstance(initial_value, checkpointable.CheckpointInitialValue):
-      self._maybe_initialize_checkpointable()
+    if isinstance(initial_value, trackable.CheckpointInitialValue):
+      self._maybe_initialize_trackable()
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD
index 0654104a3436366bb5fe88e2c3415cc957cbfde8..fcab57c12c95cd18fd5e32279a1a42b296a4d130 100644
--- a/tensorflow/python/profiler/BUILD
+++ b/tensorflow/python/profiler/BUILD
@@ -61,6 +61,7 @@ cuda_py_test(
         "no_pip",
         "oss_serial",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -76,6 +77,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["no_pip"],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_library(
@@ -130,6 +132,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["no_pip"],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_library(
diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD
index 994206cd63a915de93bc109e7b217ad997c787a7..0a6ba12094b5e2d4374acbe0d23e8355c3b309c2 100644
--- a/tensorflow/python/profiler/internal/BUILD
+++ b/tensorflow/python/profiler/internal/BUILD
@@ -69,4 +69,5 @@ cuda_py_test(
     tags = [
         "no_pip",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
diff --git a/tensorflow/python/profiler/internal/run_metadata_test.py b/tensorflow/python/profiler/internal/run_metadata_test.py
index f96d721f46e162ee6753377569aacb439cd591d5..9e92a8f5f3670cb30d910456789b6d186f26a66b 100644
--- a/tensorflow/python/profiler/internal/run_metadata_test.py
+++ b/tensorflow/python/profiler/internal/run_metadata_test.py
@@ -50,7 +50,7 @@ def _extract_node(run_meta, node_name):
       dev = dev[dev.find('cpu:'):]
     elif dev.find('gpu:') > 0:
       dev = dev[dev.find('gpu:'):]
-    else:
+    elif '/host:cpu' not in dev:
       assert False, 'Unrecognized device name: %s' % dev
 
     for node_stat in dev_stat.node_stats:
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index bae20aca506f28910ab1d62238bddb25509dc053..fe7a41afb421015a0b57335989a5aad441fbf91d 100755
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -25,6 +25,8 @@ limitations under the License.
 %rename("%s") TFE_ContextHasFunction;
 %rename("%s") TFE_ContextEnableRunMetadata;
 %rename("%s") TFE_ContextDisableRunMetadata;
+%rename("%s") TFE_ContextEnableGraphCollection;
+%rename("%s") TFE_ContextDisableGraphCollection;
 %rename("%s") TFE_ContextExportRunMetadata;
 %rename("%s") TFE_ContextClearCaches;
 %rename("%s") TFE_ContextGetDevicePlacementPolicy;
@@ -34,9 +36,14 @@ limitations under the License.
 %rename("%s") TFE_ContextAsyncWait;
 %rename("%s") TFE_ContextAsyncClearError;
 %rename("%s") TFE_NewProfiler;
+%rename("%s") TFE_ProfilerIsOk;
 %rename("%s") TFE_DeleteProfiler;
 %rename("%s") TFE_ProfilerSerializeToString;
+%rename("%s") TFE_NewProfilerContext;
+%rename("%s") TFE_ProfilerContextSetEagerContext;
+%rename("%s") TFE_DeleteProfilerContext;
 %rename("%s") TFE_StartProfilerServer;
+%rename("%s") TFE_ProfilerClientStartTracing;
 %rename("%s") TFE_OpNameGetAttrType;
 %rename("%s") TFE_Py_InitEagerTensor;
 %rename("%s") TFE_Py_SetEagerTensorProfiler;
@@ -69,6 +76,7 @@ limitations under the License.
 %rename("%s") TFE_DeleteContextOptions;
 %rename("%s") TFE_Py_TensorShapeSlice;
 %rename("%s") TFE_Py_TensorShapeOnDevice;
+%rename("%s") TFE_Py_EnableInteractivePythonLogging;
 %rename("%s") TFE_ContextStartStep;
 %rename("%s") TFE_ContextEndStep;
 %rename("%s") TFE_Py_RegisterVSpace;
@@ -149,6 +157,27 @@ limitations under the License.
   $1 = const_cast<char*>(TFE_GetPythonString($input));
 }
 
+// For const parameters in a function, SWIG pretty much ignores the const.
+// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13
+// Hence the 'const_cast'.
+%typemap(in) const char* service_addr {
+  $1 = const_cast<char*>(TFE_GetPythonString($input));
+}
+
+// For const parameters in a function, SWIG pretty much ignores the const.
+// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13
+// Hence the 'const_cast'.
+%typemap(in) const char* logdir {
+  $1 = const_cast<char*>(TFE_GetPythonString($input));
+}
+
+// For const parameters in a function, SWIG pretty much ignores the const.
+// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13
+// Hence the 'const_cast'.
+%typemap(in) const char* worker_list {
+  $1 = const_cast<char*>(TFE_GetPythonString($input));
+}
+
 %typemap(in) (TFE_Context*) {
   $1 = (TFE_Context*)PyCapsule_GetPointer($input, nullptr);
 
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index 5d08a40d67d075652a95424d03feaacb893ca41b..01d4818879f3e1d8d64b8393b15103214e2b581e 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -261,6 +261,24 @@ tf_py_test(
     ],
 )
 
+py_library(
+    name = "signature_serialization",
+    srcs = [
+        "signature_serialization.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":revived_types",
+        ":signature_constants",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_spec",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:function",
+        "//tensorflow/python/training/tracking:base",
+    ],
+)
+
 py_library(
     name = "save",
     srcs = [
@@ -271,10 +289,11 @@ py_library(
         ":builder",
         ":constants",
         ":function_serialization",
+        ":nested_structure_coder",
         ":revived_types",
-        ":saved_object_graph_py",
         ":signature_constants",
         ":signature_def_utils",
+        ":signature_serialization",
         ":tag_constants",
         ":utils",
         "//tensorflow/core:protos_all_py",
@@ -286,14 +305,16 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:lib",
         "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:tensor_spec",
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:function",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:tracking",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/saving:functional_saver",
+        "//tensorflow/python/training/tracking",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:graph_view",
+        "//tensorflow/python/training/tracking:object_identity",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
@@ -320,14 +341,40 @@ py_library(
     deps = [
         ":constants",
         ":function_deserialization",
+        ":load_v1_in_v2",
         ":loader",
+        ":nested_structure_coder",
         ":revived_types",
-        ":saved_object_graph_py",
         ":utils",
-        "//tensorflow/python:function",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
-        "//tensorflow/python/training/checkpointable:tracking",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/training/tracking",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:graph_view",
+        "//tensorflow/python/training/tracking:util",
+    ],
+)
+
+py_library(
+    name = "load_v1_in_v2",
+    srcs = [
+        "load_v1_in_v2.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":loader",
+        ":signature_serialization",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:saver",
+        "//tensorflow/python/eager:wrap_function",
+        "//tensorflow/python/training/tracking",
     ],
 )
 
@@ -344,7 +391,33 @@ tf_py_test(
         "//tensorflow/python:tensor_spec",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/training/checkpointable:tracking",
+        "//tensorflow/python/training/tracking:tracking",
+    ],
+)
+
+tf_py_test(
+    name = "load_v1_in_v2_test",
+    srcs = ["load_v1_in_v2_test.py"],
+    additional_deps = [
+        ":builder",
+        ":load",
+        ":save",
+        ":signature_def_utils",
+        ":simple_save",
+        ":utils",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:tensor_spec",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/training/tracking:tracking",
+        "//tensorflow/python:variables",
     ],
 )
 
@@ -355,7 +428,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":saved_object_graph_py",
+        "//tensorflow/core:protos_all_py",
     ],
 )
 
@@ -364,7 +437,7 @@ tf_py_test(
     srcs = ["revived_types_test.py"],
     additional_deps = [
         ":revived_types",
-        ":saved_object_graph_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -377,7 +450,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":nested_structure_coder",
-        ":saved_object_graph_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:function",
     ],
@@ -395,27 +468,11 @@ py_library(
     ],
 )
 
-tf_proto_library(
-    name = "struct",
-    srcs = ["struct.proto"],
-    cc_api_version = 2,
-    protodeps = tf_additional_all_protos(),
-    visibility = ["//tensorflow:internal"],
-)
-
-tf_proto_library(
-    name = "saved_object_graph",
-    srcs = ["saved_object_graph.proto"],
-    cc_api_version = 2,
-    protodeps = tf_additional_all_protos() + [":struct"],
-    visibility = ["//tensorflow:internal"],
-)
-
 py_library(
     name = "nested_structure_coder",
     srcs = ["nested_structure_coder.py"],
     deps = [
-        ":struct_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework",
         "@six_archive//:six",
     ],
@@ -426,7 +483,7 @@ tf_py_test(
     srcs = ["nested_structure_coder_test.py"],
     additional_deps = [
         ":nested_structure_coder",
-        ":struct_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework",
         "//tensorflow/python/eager:test",
     ],
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index f37d283a2a2cbb50faf62f1ae24cd69bd0f29d74..37af428dcb97d77f85e0555edcc1ca959a479943 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -95,11 +95,13 @@ class _SavedModelBuilder(object):
 
     self._export_dir = export_dir
     if file_io.file_exists(export_dir):
-      raise AssertionError(
-          "Export directory already exists. Please specify a different export "
-          "directory: %s" % export_dir)
-
-    file_io.recursive_create_dir(self._export_dir)
+      if file_io.list_directory(export_dir):
+        raise AssertionError(
+            "Export directory already exists, and isn't empty. Please choose "
+            "a different export directory, or delete all the contents of the "
+            "specified directory: %s" % export_dir)
+    else:
+      file_io.recursive_create_dir(self._export_dir)
 
     # Boolean to track whether variables and assets corresponding to the
     # SavedModel have been saved. Specifically, the first meta graph to be added
diff --git a/tensorflow/python/saved_model/function_deserialization.py b/tensorflow/python/saved_model/function_deserialization.py
index e82e64260c0e1916777866370b9779c84dafabf0..d1ad37180e334b6d1d8719ba4b4c0819bfa8ba02 100644
--- a/tensorflow/python/saved_model/function_deserialization.py
+++ b/tensorflow/python/saved_model/function_deserialization.py
@@ -24,6 +24,7 @@ import re
 from tensorflow.core.framework import function_pb2
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function as function_lib
+from tensorflow.python.framework import func_graph as func_graph_lib
 from tensorflow.python.framework import function_def_to_graph as function_def_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
@@ -32,39 +33,89 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import nested_structure_coder
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
 
 
 def _is_tensor(t):
   return isinstance(t, (ops.Tensor, resource_variable_ops.ResourceVariable))
 
 
-def _inputs_compatible(args, stored_inputs):
-  """Checks whether function arguments are compatible with parameters."""
-  if len(args) != len(stored_inputs):
-    return False
+def _call_concrete_function(function, inputs):
+  """Calls a restored Function with structured inputs.
 
-  for arg, stored_input in zip(args, stored_inputs):
-    if not function_lib.is_same_structure(arg, stored_input):
-      return False
+  This differs from `function.__call__` in that inputs and outputs are
+  structured and that it casts inputs to tensors if needed.
 
-    flattened_arg = nest.flatten(arg)
-    flattened_stored_input = nest.flatten(stored_input)
+  Note: this does not checks that non-tensor inputs match. That should be
+  done before via `_concrete_function_callable_with`.
 
-    for a, b in zip(flattened_arg, flattened_stored_input):
-      if _is_tensor(a):
-        if not isinstance(b, tensor_spec.TensorSpec):
-          return False
-        if a.dtype != b.dtype or not b.shape.is_compatible_with(a.shape):
-          return False
-      else:
-        if a != b:
-          return False
+  Args:
+    function: ConcreteFunction to call.
+    inputs: Structured inputs compatible with
+        `function.graph.structured_input_signature`.
+
+  Returns:
+    The structured function output.
+  """
+  expected_structure = function.graph.structured_input_signature
+  flatten_inputs = nest.flatten_up_to(expected_structure, inputs)
+  tensor_inputs = []
+  for arg, expected in zip(flatten_inputs, nest.flatten(expected_structure)):
+    if isinstance(expected, tensor_spec.TensorSpec):
+      tensor_inputs.append(
+          ops.convert_to_tensor(arg, dtype_hint=expected.dtype))
+  result = function._call_flat(tensor_inputs)  # pylint: disable=protected-access
+  if isinstance(result, ops.Operation):
+    return None
+  return result
+
+
+def _try_convert_to_tensor_spec(arg, dtype_hint):
+  """Returns None or TensorSpec obtained if `arg` is converted to tensor."""
+  try:
+    # Note: try conversion in a FuncGraph to avoid poluting current context.
+    with func_graph_lib.FuncGraph(name="guess_conversion").as_default():
+      result = ops.convert_to_tensor(arg, dtype_hint=dtype_hint)
+      return tensor_spec.TensorSpec(shape=result.shape, dtype=result.dtype)
+  except (TypeError, ValueError):
+    return None
+
+
+def _concrete_function_callable_with(function, inputs, allow_conversion):
+  """Returns whether concrete `function` can be called with `inputs`."""
+  expected_structure = function.graph.structured_input_signature
+  try:
+    flatten_inputs = nest.flatten_up_to(expected_structure, inputs)
+  except (TypeError, ValueError):
+    return False
+  for arg, expected in zip(flatten_inputs, nest.flatten(expected_structure)):
+    if isinstance(expected, tensor_spec.TensorSpec):
+      if allow_conversion:
+        arg = _try_convert_to_tensor_spec(arg, dtype_hint=expected.dtype)
+      if not _is_tensor(arg) and not isinstance(arg, tensor_spec.TensorSpec):
+        return False
+      if arg.dtype != expected.dtype:
+        return False
+      if not expected.shape.is_compatible_with(arg.shape):
+        return False
+    else:
+      if arg != expected:
+        return False
   return True
 
 
 def _deserialize_function_spec(function_spec_proto, coder):
   """Deserialize a FunctionSpec object from its proto representation."""
-  fullargspec = coder.decode_proto(function_spec_proto.fullargspec)
+  typeless_fullargspec = coder.decode_proto(function_spec_proto.fullargspec)
+  fullargspec = tf_inspect.FullArgSpec(
+      args=typeless_fullargspec.args,
+      varargs=typeless_fullargspec.varargs,
+      varkw=typeless_fullargspec.varkw,
+      defaults=typeless_fullargspec.defaults,
+      kwonlyargs=typeless_fullargspec.kwonlyargs,
+      kwonlydefaults=typeless_fullargspec.kwonlydefaults,
+      annotations=typeless_fullargspec.annotations)
   is_method = function_spec_proto.is_method
   args_to_prepend = coder.decode_proto(function_spec_proto.args_to_prepend)
   kwargs_to_include = coder.decode_proto(function_spec_proto.kwargs_to_include)
@@ -105,18 +156,15 @@ class RestoredFunction(def_function.Function):
     super(RestoredFunction, self).__init__(
         python_function, name, autograph=False)
     self._concrete_functions = concrete_functions
-    # TODO(vbardiovsky): This does not propagate to stateful and stateless
-    # functions of the RestoredFunction, which will have seen only defunned
-    # restored_function_body(*args, **kwargs). Therefore get_concrete_function()
-    # called on RestoredFunction will not work properly.
+    # This does not propagate to stateful and stateless functions of the
+    # RestoredFunction, which will have seen only defunned
+    # restored_function_body(*args, **kwargs). That's why we have to
+    # canonicalize inputs inside restored_function_body.
     self._function_spec = function_spec
 
   def _list_all_concrete_functions_for_serialization(self):
     return self._concrete_functions
 
-  def get_concrete_function(self, *args, **kwargs):
-    raise NotImplementedError()
-
 
 def recreate_function(saved_function, concrete_functions):
   """Creates a `Function` from a `SavedFunction`.
@@ -149,36 +197,40 @@ def recreate_function(saved_function, concrete_functions):
           "Cannot canonicalize input args %r and kwargs %r. Error: %r." %
           (args, kwargs, e))
 
-    debug_considered_signatures = []
-    for concrete_function_name in saved_function.concrete_functions:
-      function_obj = concrete_functions[concrete_function_name]
-      canonicalized_original_inputs = (
-          function_obj.graph.structured_input_signature)
-      debug_considered_signatures.append(canonicalized_original_inputs)
-
-      if _inputs_compatible(canonicalized_inputs,
-                            canonicalized_original_inputs):
-        flattened_inputs = nest.flatten(canonicalized_inputs)
-        filtered_inputs = [t for t in flattened_inputs if _is_tensor(t)]
-
-        result = function_obj._call_flat(filtered_inputs)  # pylint: disable=protected-access
-        if isinstance(result, ops.Operation):
-          return None
-        return result
-
-    raise AssertionError(
+    # First try to find a concrete function that can be called without input
+    # conversions. This allows one to pick a more specific trace in case there
+    # was also a more expensive one that supported tensors.
+    for allow_conversion in [False, True]:
+      for function_name in saved_function.concrete_functions:
+        function = concrete_functions[function_name]
+        if _concrete_function_callable_with(function,
+                                            canonicalized_inputs,
+                                            allow_conversion):
+          return _call_concrete_function(function, canonicalized_inputs)
+
+    available_signatures = [
+        concrete_functions[function_name].graph.structured_input_signature
+        for function_name in saved_function.concrete_functions
+    ]
+    raise ValueError(
         "Could not find matching function to call for canonicalized inputs %r. "
         "Only existing signatures are %r."
-        % (canonicalized_inputs, debug_considered_signatures))
+        % (canonicalized_inputs, available_signatures))
 
   concrete_function_objects = []
   for concrete_function_name in saved_function.concrete_functions:
     concrete_function_objects.append(concrete_functions[concrete_function_name])
 
-  return RestoredFunction(restored_function_body,
-                          restored_function_body.__name__,
-                          function_spec,
-                          concrete_function_objects)
+  restored_function = RestoredFunction(
+      restored_function_body,
+      restored_function_body.__name__,
+      function_spec,
+      concrete_function_objects)
+
+  return tf_decorator.make_decorator(
+      restored_function_body,
+      restored_function,
+      decorator_argspec=function_spec.fullargspec)
 
 
 def load_function_def_library(library):
@@ -199,8 +251,9 @@ def load_function_def_library(library):
   """
   functions = {}
 
+  load_shared_name_suffix = "_load_{}".format(ops.uid())
   for fdef in _sort_function_defs(library):
-    copy = _fix_fdef(fdef, functions)
+    copy = _fix_fdef(fdef, functions, load_shared_name_suffix)
 
     func_graph = function_def_lib.function_def_to_graph(copy)
     for dep in _list_function_deps(fdef):
@@ -250,7 +303,7 @@ def _sort_function_defs(library):
   return [reverse[x] for x in output]
 
 
-def _fix_fdef(orig_fdef, functions):
+def _fix_fdef(orig_fdef, functions, shared_name_suffix):
   """Fixes a FunctionDef proto to be loaded in current context.
 
   In particular, when loading a function library into an eager context, one
@@ -259,6 +312,10 @@ def _fix_fdef(orig_fdef, functions):
   Args:
     orig_fdef: FunctionDef proto to fix. It is not modified.
     functions: map from function name to a ConcreteFunction instance.
+    shared_name_suffix: A unique string for this load which helps to avoid
+      `shared_name` collisions across loads. Two functions from the same load
+      using the same `shared_name` still need to share, but functions from
+      different loads with the same `shared_name` should not.
 
   Returns:
     A fixed copy of the original FunctionDef.
@@ -282,6 +339,12 @@ def _fix_fdef(orig_fdef, functions):
       if attr_value.func.name:
         attr_value.func.name = functions[attr_value.func.name].name
 
+    # TODO(b/124205571): Avoid accidental sharing and destruction of restored
+    # resources. For now uniquify "shared_name" when loading functions to avoid
+    # sharing.
+    if "shared_name" in node_def.attr:
+      node_def.attr["shared_name"].s += compat.as_bytes(shared_name_suffix)
+
   fdef.signature.name = _clean_function_name(fdef.signature.name)
   return fdef
 
diff --git a/tensorflow/python/saved_model/function_serialization.py b/tensorflow/python/saved_model/function_serialization.py
index 95dfec7b692aac0bd10d29ad65e0bcf98099f177..e876eef8b349ac17a42cb284a861784b4d941998 100644
--- a/tensorflow/python/saved_model/function_serialization.py
+++ b/tensorflow/python/saved_model/function_serialization.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.protobuf import saved_object_graph_pb2
 from tensorflow.python.framework import func_graph as func_graph_module
 from tensorflow.python.saved_model import nested_structure_coder
-from tensorflow.python.saved_model import saved_object_graph_pb2
 
 
 def _serialize_function_spec(function_spec, coder):
@@ -49,7 +49,7 @@ def serialize_concrete_function(concrete_function, node_ids, coder):
         "captures tensor %s which is unsupported or not reachable from root. "
         "One reason could be that a stateful object or a variable that the "
         "function depends on is not assigned to an attribute of the serialized "
-        "checkpointable object "
+        "trackable object "
         "(see SaveTest.test_captures_unreachable_variable)."
         % (concrete_function.name, capture))
   concrete_function_proto = saved_object_graph_pb2.SavedConcreteFunction()
diff --git a/tensorflow/python/saved_model/load.py b/tensorflow/python/saved_model/load.py
index 6a1937d88fb3fe0e68f4b6f446ac833633657df4..f37678b093d379c71e4f6313046189515b706415 100644
--- a/tensorflow/python/saved_model/load.py
+++ b/tensorflow/python/saved_model/load.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Import a checkpointable object from a SavedModel."""
+"""Import a trackable object from a SavedModel."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,21 +24,21 @@ import os
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.saved_model import constants
 from tensorflow.python.saved_model import function_deserialization
+from tensorflow.python.saved_model import load_v1_in_v2
 from tensorflow.python.saved_model import loader_impl
 from tensorflow.python.saved_model import nested_structure_coder
 from tensorflow.python.saved_model import revived_types
-from tensorflow.python.saved_model import saved_object_graph_pb2
 from tensorflow.python.saved_model import utils_impl as saved_model_utils
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
-from tensorflow.python.util import compat
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _Loader(object):
@@ -55,59 +55,74 @@ class _Loader(object):
         function_deserialization.load_function_def_library(
             meta_graph.graph_def.library))
     self._load_all()
-    self._setup_functions()
+    # TODO(b/124045874): There are limitations with functions whose captures
+    # trigger other functions to be executed. For now it is only guaranteed to
+    # work if the captures of a function only trigger functions without
+    # captures.
+    self._setup_functions_structures()
+    self._setup_functions_captures()
     self._restore_checkpoint()
 
-  def _setup_concrete_function(self, proto, concrete_function, coder):
-    """Setup captured tensors and outputs for a single concrete function."""
-    bound_inputs = [
-        self._get_tensor_from_node(node_id)
-        for node_id in proto.bound_inputs]
-    bound_variables = [
-        self._nodes[node_id]
-        for node_id in proto.bound_inputs
-        if self._proto.nodes[node_id].WhichOneof("kind") == "variable"
-    ]
-    # TODO(andresp): This is only injecting the captured inputs into the
-    # concrete function, note that we did not modify the FuncGraph
-    # itself.
-    concrete_function._captured_inputs = bound_inputs  # pylint: disable=protected-access
-    concrete_function._func_graph.variables = bound_variables  # pylint: disable=protected-access
-    # By setting the structured_outputs directly, we can rely on this
-    # function_lib.ConcreteFunction object to perform the output repacking
-    # logic. The only limitation of that logic is that it only works
-    # with output that is convertible to Tensors and the conversion
-    # always happens. For example tf.TensorShape([2, 3]) will be
-    # converted to Tensor representing [2, 3].
-    original_outputs = coder.decode_proto(proto.output_signature)
-    # The original_outputs here had Tensors converted to TensorSpecs, so
-    # the restored function's structured_outputs field will not be
-    # exactly the same. Fortunately the repacking logic cares only about
-    # the structure.
-    # TODO(vbardiovsky): Should we just replicate the structures, with
-    # Nones instead of real objects?
-    concrete_function._func_graph.structured_outputs = original_outputs  # pylint: disable=protected-access
-    concrete_function._func_graph.structured_input_signature = (  # pylint: disable=protected-access
-        coder.decode_proto(proto.canonicalized_input_signature))
-
-  def _setup_functions(self):
-    """Setup captures and output structure in restored functions."""
+    for node in self._nodes:
+      if isinstance(node, tracking.TrackableResource):
+        init_op = node.initialize()
+        ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
+
+  def _setup_functions_structures(self):
+    """Setup structure for inputs and outputs of restored functions."""
     coder = nested_structure_coder.StructureCoder()
-    for name, concrete_function_proto in self._proto.concrete_functions.items():
-      self._setup_concrete_function(
-          concrete_function_proto,
-          self._concrete_functions[name],
-          coder)
+    for name, proto in sorted(self._proto.concrete_functions.items()):
+      concrete_function = self._concrete_functions[name]
+      # By setting the structured_outputs directly, we can rely on this
+      # function_lib.ConcreteFunction object to perform the output repacking
+      # logic. The only limitation of that logic is that it only works
+      # with output that is convertible to Tensors and the conversion
+      # always happens. For example tf.TensorShape([2, 3]) will be
+      # converted to Tensor representing [2, 3].
+      original_outputs = coder.decode_proto(proto.output_signature)
+      # The original_outputs here had Tensors converted to TensorSpecs, so
+      # the restored function's structured_outputs field will not be
+      # exactly the same. Fortunately the repacking logic cares only about
+      # the structure.
+      # TODO(vbardiovsky): Should we just replicate the structures, with
+      # Nones instead of real objects?
+      concrete_function._func_graph.structured_outputs = original_outputs  # pylint: disable=protected-access
+      concrete_function._func_graph.structured_input_signature = (  # pylint: disable=protected-access
+          coder.decode_proto(proto.canonicalized_input_signature))
+
+  def _setup_functions_captures(self):
+    """Setup captures and variables in restored functions."""
+    concrete_functions = sorted(self._proto.concrete_functions.items())
+    for name, proto in concrete_functions:
+      concrete_function = self._concrete_functions[name]
+      bound_inputs = [
+          self._get_tensor_from_node(node_id)
+          for node_id in proto.bound_inputs]
+      bound_variables = [
+          self._nodes[node_id]
+          for node_id in proto.bound_inputs
+          if self._proto.nodes[node_id].WhichOneof("kind") == "variable"
+      ]
+      # TODO(andresp): This is only injecting the captured inputs into the
+      # concrete function, note that we did not modify the FuncGraph
+      # itself.
+      concrete_function._captured_inputs = bound_inputs  # pylint: disable=protected-access
+      concrete_function._func_graph.variables = bound_variables  # pylint: disable=protected-access
 
   def _get_tensor_from_node(self, node_id):
-    obj = self._nodes[node_id]
-    if resource_variable_ops.is_resource_variable(obj):
-      return obj.handle
-    elif isinstance(obj, tracking.TrackableAsset):
-      return obj.asset_path.handle
-    elif tensor_util.is_tensor(obj):
-      return obj
-    raise ValueError("Can't convert node %s to tensor" % (type(obj)))
+    """Resolves a node id into a tensor to be captured for a function."""
+    with ops.init_scope():
+      obj = self._nodes[node_id]
+      if resource_variable_ops.is_resource_variable(obj):
+        return obj.handle
+      elif isinstance(obj, tracking.TrackableAsset):
+        return obj.asset_path
+      elif tensor_util.is_tensor(obj):
+        return obj
+      elif isinstance(obj, tracking.TrackableResource):
+        # Note: this executes restored functions in the TrackableResource.
+        return obj.resource_handle
+      raise ValueError("Can't convert node %s to tensor" % (type(obj)))
 
   def _load_all(self):
     """Load all saved objects and wire their properties."""
@@ -131,24 +146,24 @@ class _Loader(object):
   def _restore_checkpoint(self):
     """Load state from checkpoint into the deserialized objects."""
     variables_path = saved_model_utils.get_variables_path(self._export_dir)
-    # TODO(andresp): Clean use of private methods of CheckpointableSaver.
+    # TODO(andresp): Clean use of private methods of TrackableSaver.
     # pylint: disable=protected-access
-    saver = util.CheckpointableSaver(self.get(0))
+    saver = util.TrackableSaver(graph_view.ObjectGraphView(self.get(0)))
     saver._file_prefix_placeholder = constant_op.constant(variables_path)
     load_status = saver.restore(variables_path)
     load_status.assert_existing_objects_matched()
     checkpoint = load_status._checkpoint
 
     # When running in eager mode, the `restore` call above has already run and
-    # restored the state of checkpointables, call `position.restore_ops()` will
+    # restored the state of trackables, call `position.restore_ops()` will
     # return an empty list as there is nothing left to do. In graph mode, that
     # will return the list of ops that must run to restore the object on that
     # position. We have to wire them in the initializers of the objects so that
     # they get initialized properly when using common practices (e.g. the ones
     # used by ManagedSession) without further user action.
     for object_id, obj in dict(checkpoint.object_by_proto_id).items():
-      position = base._CheckpointPosition(checkpoint=checkpoint,
-                                          proto_id=object_id)
+      position = base.CheckpointPosition(checkpoint=checkpoint,
+                                         proto_id=object_id)
       restore_ops = position.restore_ops()
       if restore_ops:
         if resource_variable_ops.is_resource_variable(obj):
@@ -172,6 +187,7 @@ class _Loader(object):
             proto.bare_concrete_function),
         "variable": lambda: self._recreate_variable(proto.variable),
         "constant": lambda: self._recreate_constant(proto.constant),
+        "resource": lambda: self._recreate_resource(proto.resource),
     }
     kind = proto.WhichOneof("kind")
     if kind not in factory:
@@ -186,7 +202,7 @@ class _Loader(object):
       # individually callable by adding a `__call__` method to the classes of
       # the objects instances that have a `__call__` property.
 
-      class _UserObject(tracking.AutoCheckpointable):
+      class _UserObject(tracking.AutoTrackable):
         pass
 
       return _UserObject(), setattr
@@ -217,33 +233,97 @@ class _Loader(object):
         tensor_util.MakeNdarray(tensor_proto))
     return imported_constant, setattr
 
+  def _recreate_resource(self, proto):
+    del proto
+    return _RestoredResource(), setattr
+
+
+# TODO(b/124205571,b/124092991): Solve destruction of resources.
+class _RestoredResource(tracking.TrackableResource):
+  """Restored SavedResource."""
+
+  def create_resource(self):
+    raise RuntimeError()
+
+  def initialize(self):
+    raise RuntimeError()
+
+  def _list_functions_for_serialization(self):
+    # Overwrite this method to avoid the implementation of
+    # base class to re-wrap the polymorphic functions into
+    # another layer of `tf.function`.
+    return {
+        "create_resource": self.create_resource,
+        "initialize": self.initialize,
+    }
+
 
 def _call_attribute(instance, *args, **kwargs):
   return instance.__call__(*args, **kwargs)
 
 
-def _load_saved_object_graph_proto(filename):
-  with file_io.FileIO(filename, "rb") as f:
-    contents = f.read()
-    return saved_object_graph_pb2.SavedObjectGraph.FromString(contents)
+@tf_export("saved_model.load", v1=["saved_model.load_v2"])
+def load(export_dir, tags=None):
+  """Load a SavedModel from `export_dir`.
+
+  Signatures associated with the SavedModel are available as functions:
+
+  ```python
+  imported = tf.saved_model.load(path)
+  f = imported.signatures["serving_default"]
+  print(f(x=tf.constant([[1.]])))
+  ```
 
+  Objects exported with `tf.saved_model.save` additionally have trackable
+  objects and functions assigned to attributes:
 
-def load(export_dir):
-  """Load a SavedModel from `export_dir`."""
+  ```python
+  exported = tf.train.Checkpoint(v=tf.Variable(3.))
+  exported.f = tf.function(
+      lambda x: exported.v * x,
+      input_signature=[tf.TensorSpec(shape=None, dtype=tf.float32)])
+  tf.saved_model.save(exported, path)
+  imported = tf.saved_model.load(path)
+  assert 3. == imported.v.numpy()
+  assert 6. == imported.f(x=tf.constant(2.)).numpy()
+  ```
+
+  Args:
+    export_dir: The SavedModel directory to load from.
+    tags: A tag or sequence of tags identifying the MetaGraph to load. Optional
+      if the SavedModel contains a single MetaGraph, as for those exported from
+      `tf.saved_model.load`.
+
+  Returns:
+    A trackable object with a `signatures` attribute mapping from signature
+    keys to functions. If the SavedModel was exported by `tf.saved_model.load`,
+    it also points to trackable objects and functions which were attached
+    to the exported object.
+
+  Raises:
+    ValueError: If `tags` don't match a MetaGraph in the SavedModel.
+  """
+  if tags is not None:
+    # Supports e.g. tags=SERVING and tags=[SERVING]
+    tags = nest.flatten(tags)
   saved_model_proto = loader_impl.parse_saved_model(export_dir)
-  object_graph_filename = os.path.join(
-      compat.as_bytes(export_dir),
-      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY),
-      compat.as_bytes("object_graph.pb"))
-  if file_io.file_exists(object_graph_filename):
-    object_graph_proto = _load_saved_object_graph_proto(object_graph_filename)
+  if (len(saved_model_proto.meta_graphs) == 1
+      and saved_model_proto.meta_graphs[0].HasField("object_graph_def")):
+    meta_graph_def = saved_model_proto.meta_graphs[0]
+    if (tags is not None
+        and set(tags) != set(meta_graph_def.meta_info_def.tags)):
+      raise ValueError(
+          ("The SavedModel at {} has one MetaGraph with tags {}, but got an "
+           "incompatible argument tags={} to tf.saved_model.load. You may omit "
+           "it, pass 'None', or pass matching tags.")
+          .format(export_dir, meta_graph_def.meta_info_def.tags, tags))
+    object_graph_proto = meta_graph_def.object_graph_def
     with ops.init_scope():
       loader = _Loader(object_graph_proto,
                        saved_model_proto,
                        export_dir)
       root = loader.get(0)
   else:
-    raise NotImplementedError(
-        "Currently only SavedModels exported with `tf.saved_model.save` may be "
-        "imported. Other SavedModels may eventually be supported via load().")
+    with ops.init_scope():
+      root = load_v1_in_v2.load(export_dir, tags)
   return root
diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py
index 3edecd5be2a6a2bdea98ee0c0005dbdd87421dfe..845fc40e5c57e60dc209b5b0b3cb4310b05c80c9 100644
--- a/tensorflow/python/saved_model/load_test.py
+++ b/tensorflow/python/saved_model/load_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for checkpointable object SavedModel loading."""
+"""Tests for trackable object SavedModel loading."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -38,9 +38,11 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.saved_model import load
 from tensorflow.python.saved_model import save
+from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.training import monitored_session
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
+from tensorflow.python.util import tf_inspect
 
 
 @parameterized.named_parameters(
@@ -61,17 +63,17 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     return loaded
 
   def test_structure_import(self, cycles):
-    root = tracking.AutoCheckpointable()
-    root.dep_one = tracking.AutoCheckpointable()
-    root.dep_two = tracking.AutoCheckpointable()
-    root.dep_two.dep = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
+    root.dep_one = tracking.AutoTrackable()
+    root.dep_two = tracking.AutoTrackable()
+    root.dep_two.dep = tracking.AutoTrackable()
     root.dep_three = root.dep_two.dep
     imported = self.cycle(root, cycles)
     self.assertIs(imported.dep_three, imported.dep_two.dep)
     self.assertIsNot(imported.dep_one, imported.dep_two)
 
   def test_variables(self, cycles):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.v1 = variables.Variable(1., trainable=True)
     root.v2 = variables.Variable(2., trainable=False)
     imported = self.cycle(root, cycles)
@@ -81,7 +83,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     self.assertFalse(imported.v2.trainable)
 
   def test_capture_variables(self, cycles):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.weights = variables.Variable(2.)
     root.f = def_function.function(
         lambda x: root.weights * x,
@@ -101,7 +103,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     file1 = self._make_asset("contents 1")
     file2 = self._make_asset("contents 2")
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.asset1 = tracking.TrackableAsset(file1)
     root.asset2 = tracking.TrackableAsset(file2)
 
@@ -120,21 +122,44 @@ class LoadTest(test.TestCase, parameterized.TestCase):
       self.assertEqual("contents 2", f.read())
 
   def test_capture_assets(self, cycles):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.vocab = tracking.TrackableAsset(self._make_asset("contents"))
     root.f = def_function.function(
         lambda: root.vocab.asset_path,
         input_signature=[])
     imported = self.cycle(root, cycles)
-    origin_output = root.f().numpy()
+    original_output = root.f().numpy()
     imported_output = imported.f().numpy()
-    self.assertNotEqual(origin_output, imported_output)
+    self.assertNotEqual(original_output, imported_output)
     with open(imported_output, "r") as f:
       self.assertEqual("contents", f.read())
 
+  def test_capture_assets_in_graph(self, cycles):
+    root = tracking.AutoTrackable()
+    root.vocab = tracking.TrackableAsset(self._make_asset("contents"))
+    root.f = def_function.function(
+        lambda: root.vocab.asset_path,
+        input_signature=[])
+
+    original_output = root.f().numpy()
+
+    if cycles > 1:
+      root = self.cycle(root, cycles - 1)
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    save.save(root, path)
+
+    with ops.Graph().as_default():
+      imported = load.load(path)
+      imported_tensor = imported.f()
+      with monitored_session.MonitoredSession() as sess:
+        imported_output = sess.run(imported_tensor)
+        self.assertNotEqual(original_output, imported_output)
+        with open(imported_output, "r") as f:
+          self.assertEqual("contents", f.read())
+
   def test_dedup_assets(self, cycles):
     vocab = self._make_asset("contents")
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.asset1 = tracking.TrackableAsset(vocab)
     root.asset2 = tracking.TrackableAsset(vocab)
     imported = self.cycle(root, cycles)
@@ -146,7 +171,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     def func(x):
       return 2 * x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func
 
     # Add two traces.
@@ -164,7 +189,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     def func(x):
       return 2 * x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func
 
     imported = self.cycle(root, cycles)
@@ -175,7 +200,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     def func(x):
       return 2 * x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func
 
     imported = self.cycle(
@@ -194,7 +219,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
         lambda x: f(x) + 1.0,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.g = g
     imported = self.cycle(root, cycles)
     imported.g(constant_op.constant([1.0]))
@@ -207,7 +232,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
       else:
         return 7
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(func)
 
     self.assertEqual(20, root.f(constant_op.constant(10), True).numpy())
@@ -227,7 +252,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
       else:
         return array_ops.zeros(shape=x.shape, dtype=dtypes.float32)
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(func)
 
     self.assertAllEqual([0.0, 0.0, 0.0],
@@ -261,17 +286,17 @@ class LoadTest(test.TestCase, parameterized.TestCase):
 
   def test_function_no_return(self, cycles):
 
-    class CheckpointableWithOneVariable(tracking.AutoCheckpointable):
+    class TrackableWithOneVariable(tracking.AutoTrackable):
 
       def __init__(self, initial_value=0.0):
-        super(CheckpointableWithOneVariable, self).__init__()
+        super(TrackableWithOneVariable, self).__init__()
         self.variable = variables.Variable(initial_value)
 
       @def_function.function
       def increase(self, by=1.0):
         self.variable.assign_add(by)
 
-    obj = CheckpointableWithOneVariable(5.0)
+    obj = TrackableWithOneVariable(5.0)
 
     obj.increase(constant_op.constant(10.0))
     self.assertEqual(15.0, obj.variable.numpy())
@@ -295,7 +320,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
       else:
         return 7
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(func)
 
     x = constant_op.constant(10)
@@ -311,8 +336,8 @@ class LoadTest(test.TestCase, parameterized.TestCase):
 
     imported = self.cycle(root, cycles)
 
-    with self.assertRaisesRegexp(AssertionError,
-                                 "Could not find matching function to call.*"):
+    with self.assertRaisesRegexp(ValueError,
+                                 "Could not find matching function to call"):
       imported.f(input2)
 
     self.assertEqual(31, imported.f(input1).numpy())
@@ -327,7 +352,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
       named_tuple = named_tuple_type(a=input1 + input2, b=input1 * input2)
       return [named_tuple, input2, {"x": 0.5}]
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(func)
 
     result = root.f(constant_op.constant(2), constant_op.constant(3))
@@ -357,7 +382,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
       else:
         return 7
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(func)
 
     self.assertEqual(20, root.f(constant_op.constant(10), True).numpy())
@@ -379,7 +404,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
       else:
         return 7
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(func)
 
     x = constant_op.constant(10)
@@ -387,17 +412,17 @@ class LoadTest(test.TestCase, parameterized.TestCase):
 
     imported = self.cycle(root, cycles)
 
-    with self.assertRaisesRegexp(AssertionError,
+    with self.assertRaisesRegexp(ValueError,
                                  "Could not find matching function to call.*"):
       imported.f(x, learning_rate=0.5, epochs=4)
 
     self.assertEqual(7, imported.f(x, learning_rate=0.5, epochs=3).numpy())
 
   def test_member_function(self, cycles):
-    class CheckpointableWithMember(tracking.AutoCheckpointable):
+    class TrackableWithMember(tracking.AutoTrackable):
 
       def __init__(self):
-        super(CheckpointableWithMember, self).__init__()
+        super(TrackableWithMember, self).__init__()
         self._some_value = 20
 
       @def_function.function
@@ -407,7 +432,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
         else:
           return 7 + self._some_value
 
-    root = CheckpointableWithMember()
+    root = TrackableWithMember()
 
     self.assertEqual(20, root.f(constant_op.constant(10), True).numpy())
     self.assertEqual(27, root.f(constant_op.constant(1)).numpy())
@@ -419,7 +444,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(27, imported.f(constant_op.constant(2)).numpy())
 
   def test_side_effect_listing(self, cycles):
-    class M(tracking.AutoCheckpointable):
+    class M(tracking.AutoTrackable):
 
       def __init__(self):
         super(M, self).__init__()
@@ -443,7 +468,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
         lambda x: x*weight + bias,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.weight = weight
     root.bias = bias
     root.g = g
@@ -483,7 +508,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     def h(x):
       return g(x) + bias,
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.weight = weight
     root.bias = bias
     root.g = h
@@ -496,16 +521,16 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     self.assertAllClose(grad, [3.5, 2.0])
 
   def test_callable(self, cycles):
-    class M1(tracking.AutoCheckpointable):
+    class M1(tracking.AutoTrackable):
 
       @def_function.function(
           input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
       def __call__(self, x):
         return x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.m1 = M1()
-    root.m2 = tracking.AutoCheckpointable()
+    root.m2 = tracking.AutoTrackable()
     root.m2.__call__ = def_function.function(
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])(
             lambda x: x*3.0)
@@ -528,9 +553,9 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     func = def_function.function(
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])(
             lambda x: x*3.0)
-    root = tracking.AutoCheckpointable()
-    root.__call__ = tracking.AutoCheckpointable()
-    root.__call__.__call__ = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
+    root.__call__ = tracking.AutoTrackable()
+    root.__call__.__call__ = tracking.AutoTrackable()
     root.__call__.__call__.__call__ = func
 
     imported = self.cycle(root, cycles)
@@ -539,7 +564,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(imported(x).numpy(), 3.0)
 
   def test_load_in_graph_mode(self, cycles):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.v1 = variables.Variable(1.)
     root.v2 = variables.Variable(2.)
     root.f = def_function.function(
@@ -560,7 +585,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
         self.assertEqual(4.0, sess.run(output))
 
   def test_load_in_func_graph(self, cycles):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.v1 = variables.Variable(1.)
     root.v2 = variables.Variable(2.)
     root.f = def_function.function(
@@ -572,7 +597,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     path = tempfile.mkdtemp(prefix=self.get_temp_dir())
     save.save(root, path)
 
-    closure = tracking.AutoCheckpointable()
+    closure = tracking.AutoTrackable()
     @def_function.function
     def func(x):
       if not hasattr(closure, "model"):
@@ -589,7 +614,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     def func(x):
       return 2 * x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func
 
     self.assertAllEqual([2], root.f(constant_op.constant([1])).numpy())
@@ -612,6 +637,36 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual([2, 4, 6],
                         imported.f(constant_op.constant([1, 2, 3])).numpy())
 
+  def test_get_concrete_function(self, cycles):
+
+    @def_function.function
+    def func(x, training=False):
+      if training:
+        return 2 * x
+      else:
+        return 3 * x
+
+    func.get_concrete_function(
+        tensor_spec.TensorSpec([None], dtypes.int32), True)
+    func.get_concrete_function(tensor_spec.TensorSpec([None], dtypes.float32))
+
+    root = tracking.AutoTrackable()
+    root.f = func
+
+    imported = self.cycle(root, cycles)
+
+    concrete = imported.f.get_concrete_function(
+        training=True, x=tensor_spec.TensorSpec([None], dtypes.int32))
+
+    self.assertAllEqual([2, 4, 6, 8],
+                        concrete(x=constant_op.constant([1, 2, 3, 4])).numpy())
+    with self.assertRaisesRegexp(ValueError,
+                                 "Could not find matching function to call"):
+      imported.f.get_concrete_function(
+          tensor_spec.TensorSpec([None], dtypes.int32))
+    imported.f.get_concrete_function(
+        tensor_spec.TensorSpec([None], dtypes.int32), True)
+
   def test_concrete_function(self, cycles):
 
     @def_function.function(
@@ -619,7 +674,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     def func(x):
       return 2 * x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func.get_concrete_function()
 
     self.assertAllEqual([2], root.f(constant_op.constant([1])).numpy())
@@ -640,7 +695,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     def func(x):
       return 2 * x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func.get_concrete_function()
 
     self.assertAllEqual([2], root.f(constant_op.constant([1])).numpy())
@@ -656,7 +711,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     def func(x):
       return 2 * x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func.get_concrete_function(constant_op.constant([1]))
     self.assertAllEqual([4], root.f(constant_op.constant([2])).numpy())
     # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
@@ -669,7 +724,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
         input_signature=[tensor_spec.TensorSpec([None], dtypes.float32)])
     def func(x):
       return x ** 2.
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func.get_concrete_function()
 
     def _compute_gradient(function):
@@ -689,7 +744,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     @def_function.function
     def func(x, y):
       return x * (y + 1.)
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func.get_concrete_function(
         tensor_spec.TensorSpec([], dtypes.float32),
         tensor_spec.TensorSpec([], dtypes.float32))
@@ -706,7 +761,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     def func(*args):
       x, y = args
       return x * (y + 1.)
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func.get_concrete_function(
         tensor_spec.TensorSpec([], dtypes.float32, name="x"),
         tensor_spec.TensorSpec([], dtypes.float32, name="y"))
@@ -727,7 +782,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
       capture.assign_sub(1)
 
     vsave = variables.Variable(1)
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func.get_concrete_function(vsave)
     root.capture = capture
     self.assertEqual(1, vsave.numpy())
@@ -750,7 +805,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     def func(v):
       return v + 1
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.func = func
     root.concrete_func = func.get_concrete_function(
         tensor_spec.TensorSpec(None, dtypes.int32))
@@ -762,7 +817,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(2, imported.concrete_func(one).numpy())
 
   def test_dict(self, cycles):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.variables = dict(a=variables.Variable(1.))
     root.variables["b"] = variables.Variable(2.)
     root.variables["c"] = 1
@@ -777,7 +832,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(100., imported.funcs["conc"]().numpy())
 
   def test_list(self, cycles):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.variables = [variables.Variable(1.)]
     root.variables.append(1)
     root.variables.append(variables.Variable(3.))
@@ -788,7 +843,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(3, len(imported.variables))
 
   def test_functions_list(self, cycles):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     v1 = variables.Variable(1.)
     root.losses = [def_function.function(lambda: math_ops.reduce_sum(v1 ** 2))]
     root.variables = [v1]
@@ -810,7 +865,7 @@ class LoadTest(test.TestCase, parameterized.TestCase):
 
   def test_captured_constant(self, cycles):
     const = array_ops.zeros([100])
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(lambda: const + 1.)
     root.g = def_function.function(lambda: const + 2.)
     self.assertAllClose(array_ops.ones([100]), root.f())
@@ -828,25 +883,231 @@ class LoadTest(test.TestCase, parameterized.TestCase):
     self.assertIs(f_concrete.captured_inputs[0],
                   g_concrete.captured_inputs[0])
 
+  def test_functions_accessed_once(self, cycles):
+
+    class Exported(tracking.AutoTrackable):
+
+      def __init__(self):
+        self._counter = 0
+
+      @property
+      def make_func(self):
+        @def_function.function
+        def f():
+          return constant_op.constant(self._counter)
+        f.get_concrete_function()  # force a trace
+        self._counter += 1
+        return f
+
+    exported = Exported()
+    imported = self.cycle(exported, cycles)
+    self.assertEqual(0, imported.make_func().numpy())
+    self.assertEqual(1, exported.make_func().numpy())
+
+  def test_overwritten_signatures_error(self, cycles):
+    exported = tracking.AutoTrackable()
+    exported.f = def_function.function(lambda: constant_op.constant(1.))
+    imported = self.cycle(
+        exported, cycles,
+        signatures={"key": exported.f.get_concrete_function()})
+    self.assertEqual(1., imported.signatures["key"]()["output_0"].numpy())
+    imported.signatures = {"key1": imported.signatures["key"]}
+    with self.assertRaisesRegexp(ValueError, "signatures"):
+      save.save(imported, tempfile.mkdtemp(prefix=self.get_temp_dir()))
+
+  def test_signature_loading(self, cycles):
+
+    class Exported(tracking.AutoTrackable):
+
+      def __init__(self):
+        self.v = variables.Variable(3.)
+
+      @def_function.function
+      def do(self, x):
+        return self.v * x
+
+    exported = Exported()
+    imported = self.cycle(
+        exported,
+        signatures=exported.do.get_concrete_function(
+            tensor_spec.TensorSpec(None, dtypes.float32)))
+    for _ in range(cycles - 1):
+      imported = self.cycle(imported, signatures=imported.signatures)
+    self.assertEqual(["serving_default"], list(imported.signatures.keys()))
+    imported_function = imported.signatures["serving_default"]
+    two = constant_op.constant(2.)
+    self.assertEqual(6., imported_function(x=two)["output_0"].numpy())
+    imported.v.assign(4.)
+    self.assertEqual(8., imported_function(x=two)["output_0"].numpy())
+    self.assertEqual(8., imported_function(two)["output_0"].numpy())
+    with self.assertRaises(TypeError):
+      # The signatures mapping is immutable
+      imported.signatures["random_key"] = 3
+
+  def test_multiple_argument_signatures_no_positional(self, cycles):
+
+    class Exported(tracking.AutoTrackable):
+
+      @def_function.function
+      def do(self, x, y):
+        return x + y
+
+    exported = Exported()
+    imported = self.cycle(
+        exported, signatures=exported.do.get_concrete_function(
+            tensor_spec.TensorSpec(None, dtypes.float32),
+            tensor_spec.TensorSpec(None, dtypes.float32)))
+    for _ in range(cycles - 1):
+      imported = self.cycle(imported, signatures=imported.signatures)
+    with self.assertRaises(TypeError):
+      imported.signatures["serving_default"](
+          constant_op.constant(1.),
+          y=constant_op.constant(2.))
+    self.assertEqual(
+        {"output_0": 3.},
+        self.evaluate(imported.signatures["serving_default"](
+            x=constant_op.constant(1.),
+            y=constant_op.constant(2.))))
+
+  def _make_model_with_tables(self):
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table1_initializer = lookup_ops.KeyValueTensorInitializer(keys, values)
+    table1 = lookup_ops.HashTable(table1_initializer, default_val)
+
+    table2_file = self._make_asset("test\nfoo\nbrain\n")
+    table2_initializer = lookup_ops.TextFileIdTableInitializer(table2_file)
+    table2 = lookup_ops.HashTable(table2_initializer, default_val)
+
+    def _make_lookup_function(table):
+      signature = [tensor_spec.TensorSpec(None, dtypes.string)]
+      return def_function.function(input_signature=signature)(
+          lambda x: table.lookup(x))  # pylint: disable=unnecessary-lambda
+
+    root = tracking.AutoTrackable()
+    root.table1 = table1
+    root.lookup1 = _make_lookup_function(table1)
+    root.table2 = table2
+    root.lookup2 = _make_lookup_function(table2)
+    return root
+
   def test_table(self, cycles):
-    # TODO(b/123408779): Handle generic TrackableResources and enable this test
-    self.skipTest("Need to handle generic TrackableResources")
-    vocab_path = self._make_asset("alpha\nbeta\ngamma\n")
-    initializer = lookup_ops.TextFileInitializer(
-        vocab_path,
-        key_dtype=dtypes.string,
-        key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
-        value_dtype=dtypes.int64,
-        value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
-    root = util.Checkpoint(table=lookup_ops.HashTable(
-        initializer, default_value=-1))
-    root.table_user = def_function.function(
-        root.table.lookup,
-        input_signature=[tensor_spec.TensorSpec(None, dtypes.string)])
-    self.assertEqual(2, root.table_user(constant_op.constant("gamma")).numpy())
+    root = self._make_model_with_tables()
+    imported = self.cycle(root, cycles, signatures={})
+    keys = constant_op.constant(["brain", "test", "foo", "surgery"])
+    self.assertAllEqual([0, -1, -1, 2], imported.lookup1(keys).numpy())
+    self.assertAllEqual([2, 0, 1, -1], imported.lookup2(keys).numpy())
+
+  def test_table_in_graph(self, cycles):
+    root = self._make_model_with_tables()
+
+    if cycles > 1:
+      root = self.cycle(root, cycles - 1)
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    save.save(root, path)
+    imported = self.cycle(root, 1)
+
+    with ops.Graph().as_default():
+      imported = load.load(path)
+      keys = constant_op.constant(["brain", "test", "foo", "surgery"])
+      output1 = imported.lookup1(keys)
+      output2 = imported.lookup2(keys)
+      with monitored_session.MonitoredSession() as sess:
+        self.assertAllEqual([0, -1, -1, 2], sess.run(output1))
+        self.assertAllEqual([2, 0, 1, -1], sess.run(output2))
+
+  def test_perserve_argspec(self, cycles):
+    def f(a, b, c):  # pylint: disable=unused-argument
+      return None
+
+    original_fullargspec = tf_inspect.getfullargspec(f)
+
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(f)
     imported = self.cycle(root, cycles)
-    self.assertEqual(
-        2, imported.table_user(constant_op.constant("gamma")).numpy())
+
+    restored_fullargspec = tf_inspect.getfullargspec(imported.f)
+    self.assertEqual(original_fullargspec, restored_fullargspec)
+
+  def test_canonicalize_inputs(self, cycles):
+    @def_function.function(autograph=False)
+    def func(a=1, b=2, c=3, training=True):
+      if training:
+        return [a, b, c, training]
+      else:
+        return [c, b, a, training]
+
+    # TODO(b/123501567): Work-around to trigger generic traces of a function
+    # with extra non tensor args.
+    signature = 3*[tensor_spec.TensorSpec(None, dtypes.float32)]
+    @def_function.function(input_signature=signature)
+    def trigger(a, b, c):
+      func(a, b, c, True)
+      func(a, b, c, False)
+
+    trigger.get_concrete_function()
+
+    root = tracking.AutoTrackable()
+    root.f = func
+    root = self.cycle(root, cycles)
+    self.assertAllEqual(root.f(), [1.0, 2.0, 3.0, True])
+    self.assertAllEqual(root.f(-1.0, training=False), [3.0, 2.0, -1.0, False])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 "Could not find matching function"):
+      root.f(["hello", 1.0])
+
+  def test_prefer_specific_trace(self, cycles):
+    @def_function.function(autograph=False)
+    def func(a):
+      if isinstance(a, int):
+        return a
+      else:
+        return a + 1
+
+    self.assertAllEqual(2, func(2).numpy())
+    self.assertAllEqual(3, func(constant_op.constant(2)).numpy())
+
+    root = tracking.AutoTrackable()
+    root.f = func
+    root = self.cycle(root, cycles)
+    self.assertAllEqual(2, root.f(2).numpy())
+    self.assertAllEqual(4, root.f(3).numpy())
+    self.assertAllEqual(3, root.f(constant_op.constant(2)).numpy())
+    self.assertAllEqual(4, root.f(constant_op.constant(3)).numpy())
+
+
+class SingleCycleTests(test.TestCase, parameterized.TestCase):
+
+  def test_load_with_tags(self):
+    root = tracking.AutoTrackable()
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    save.save(root, path)
+    with self.assertRaises(ValueError):
+      load.load(path, tags=[tag_constants.EVAL])
+    load.load(path, tags=[tag_constants.SERVING])
+    load.load(path, tags=tag_constants.SERVING)
+
+  def test_docstring_examples(self):
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    exported = util.Checkpoint(v=variables.Variable(3.))
+    exported.f = def_function.function(
+        lambda x: exported.v * x,
+        input_signature=[
+            tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32)])
+    save.save(exported, path)
+    imported = load.load(path)
+    self.assertEqual(3., imported.v.numpy())
+    self.assertEqual(6., imported.f(x=constant_op.constant(2.)).numpy())
+
+    save.save(exported, path, exported.f.get_concrete_function())
+    imported = load.load(path)
+    f = imported.signatures["serving_default"]
+    self.assertAllEqual(
+        [[-3.]],
+        f(x=constant_op.constant([[-1.]]))["output_0"].numpy())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/load_v1_in_v2.py b/tensorflow/python/saved_model/load_v1_in_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb8ed7cfcce8d01ed5d5726f66a387a6239792f4
--- /dev/null
+++ b/tensorflow/python/saved_model/load_v1_in_v2.py
@@ -0,0 +1,159 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Import a TF v1-style SavedModel when executing eagerly."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+from tensorflow.python.eager import wrap_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.saved_model import loader_impl
+from tensorflow.python.saved_model import signature_serialization
+from tensorflow.python.training import saver as tf_saver
+from tensorflow.python.training.tracking import tracking
+
+
+class _Initializer(tracking.TrackableResource):
+  """Represents an initialization operation restored from a SavedModel.
+
+  Without this object re-export of imported 1.x SavedModels would omit the
+  original SavedModel's initialization procedure.
+
+  Created when `tf.saved_model.load` loads a TF 1.x-style SavedModel with an
+  initialization op. This object holds a function which runs the
+  initialization. It does not require any manual user intervention;
+  `tf.saved_model.save` will see this object and automatically add it to the
+  exported SavedModel, and `tf.saved_model.load` runs the initialization
+  function automatically.
+  """
+
+  def __init__(self, init_fn, asset_paths):
+    super(_Initializer, self).__init__()
+    self._asset_paths = asset_paths
+    self._init_fn = init_fn
+
+  def create_resource(self):
+    return array_ops.placeholder(
+        dtype=dtypes.resource, shape=[], name="unused_resource")
+
+  def initialize(self):
+    self._init_fn(*[path.asset_path for path in self._asset_paths])
+
+
+class _EagerSavedModelLoader(loader_impl.SavedModelLoader):
+  """Loads a SavedModel without using Sessions."""
+
+  def get_meta_graph_def_from_tags(self, tags):
+    """Override to support implicit one-MetaGraph loading with tags=None."""
+    if tags is None:
+      if len(self._saved_model.meta_graphs) != 1:
+        tag_sets = [mg.meta_info_def.tags
+                    for mg in self._saved_model.meta_graphs]
+        raise ValueError(
+            ("Importing a SavedModel with tf.saved_model.load requires a "
+             "'tags=' argument if there is more than one MetaGraph. Got "
+             "'tags=None', but there are {} MetaGraphs in the SavedModel with "
+             "tag sets {}. Pass a 'tags=' argument to load this SavedModel.")
+            .format(len(self._saved_model.meta_graphs), tag_sets))
+      return self._saved_model.meta_graphs[0]
+    return super(_EagerSavedModelLoader, self).get_meta_graph_def_from_tags(
+        tags)
+
+  def load_graph(self, returns, meta_graph_def):
+    """Called from wrap_function to import `meta_graph_def`."""
+    # pylint: disable=protected-access
+    saver, _ = tf_saver._import_meta_graph_with_return_elements(
+        meta_graph_def)
+    # pylint: enable=protected-access
+    returns[0] = saver
+
+  def restore_variables(self, wrapped, saver):
+    """Restores variables from the checkpoint."""
+    if saver is not None:
+      saver_def = saver.saver_def
+      restore_fn = wrapped.prune(
+          feeds=[wrapped.graph.as_graph_element(
+              saver_def.filename_tensor_name)],
+          fetches=[wrapped.graph.as_graph_element(saver_def.restore_op_name)])
+      restore_fn(constant_op.constant(self._variables_path))
+
+  def _extract_signatures(self, wrapped, meta_graph_def):
+    """Creates ConcreteFunctions for signatures in `meta_graph_def`."""
+    signature_functions = {}
+    for signature_key, signature_def in meta_graph_def.signature_def.items():
+      input_names, input_specs = zip(*signature_def.inputs.items())
+      # TODO(allenl): Support optional arguments
+      signature_fn = wrapped.prune(
+          feeds=[wrapped.graph.as_graph_element(inp.name)
+                 for inp in input_specs],
+          fetches={name: wrapped.graph.as_graph_element(out.name)
+                   for name, out in signature_def.outputs.items()})
+      # pylint: disable=protected-access
+      signature_fn._arg_keywords = input_names
+      if len(input_names) == 1:
+        # Allowing positional arguments does not create any ambiguity if there's
+        # only one.
+        signature_fn._num_positional_args = 1
+      else:
+        signature_fn._num_positional_args = 0
+      # pylint: enable=protected-access
+      signature_functions[signature_key] = signature_fn
+    return signature_functions
+
+  def load(self, tags):
+    """Creates an object from the MetaGraph identified by `tags`."""
+    meta_graph_def = self.get_meta_graph_def_from_tags(tags)
+    load_graph_returns = [None]
+    wrapped = wrap_function.wrap_function(
+        functools.partial(self.load_graph, load_graph_returns, meta_graph_def),
+        signature=[])
+    saver, = load_graph_returns
+    self.restore_variables(wrapped, saver)
+    with wrapped.graph.as_default():
+      init_op = loader_impl.get_init_op(meta_graph_def)
+    root = tracking.AutoTrackable()
+    if init_op is not None:
+      asset_feed_tensors = []
+      asset_paths = []
+      for tensor_name, value in loader_impl.get_asset_tensors(
+          self._export_dir, meta_graph_def).items():
+        asset_feed_tensors.append(wrapped.graph.as_graph_element(tensor_name))
+        asset_paths.append(tracking.TrackableAsset(value))
+      init_fn = wrapped.prune(
+          feeds=asset_feed_tensors,
+          fetches=[wrapped.graph.as_graph_element(init_op)])
+      initializer = _Initializer(init_fn, asset_paths)
+      initializer.initialize()
+      root.initializer = initializer
+      root.asset_paths = asset_paths
+    else:
+      root.asset_paths = []
+    signature_functions = self._extract_signatures(wrapped, meta_graph_def)
+
+    root.signatures = signature_serialization.create_signature_map(
+        signature_functions)
+    root.variables = list(wrapped.graph.variables)
+    return root
+
+
+def load(export_dir, tags):
+  """Load a v1-style SavedModel as an object."""
+  loader = _EagerSavedModelLoader(export_dir)
+  return loader.load(tags=tags)
diff --git a/tensorflow/python/saved_model/load_v1_in_v2_test.py b/tensorflow/python/saved_model/load_v1_in_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce2c0cd960e5da06cf7cd28b5db556471c1b891e
--- /dev/null
+++ b/tensorflow/python/saved_model/load_v1_in_v2_test.py
@@ -0,0 +1,208 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for importing a TF v1-style SavedModel when executing eagerly."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.saved_model import builder_impl
+from tensorflow.python.saved_model import load
+from tensorflow.python.saved_model import save
+from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model import simple_save
+from tensorflow.python.saved_model import utils_impl
+
+
+class LoadTest(test.TestCase):
+
+  def _v1_single_metagraph_saved_model(self, use_resource):
+    export_graph = ops.Graph()
+    with export_graph.as_default():
+      start = array_ops.placeholder(
+          shape=[None], dtype=dtypes.float32, name="start")
+      if use_resource:
+        distractor = variables.RefVariable(-1., name="distractor")
+        v = resource_variable_ops.ResourceVariable(3., name="v")
+      else:
+        # "distractor" gets saved in the checkpoint and so used in the restore
+        # function, but not in the pruned function for the signature. This tests
+        # node naming: it needs to be consistent (and ideally always the same as
+        # the node in the original GraphDef) for the resource manager to find
+        # the right variable.
+        distractor = variables.RefVariable(-1., name="distractor")
+        v = variables.RefVariable(3., name="v")
+      local_variable = variables.VariableV1(
+          1.,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES],
+          trainable=False,
+          use_resource=True)
+      output = array_ops.identity(start * v * local_variable, name="output")
+      with session_lib.Session() as session:
+        session.run([v.initializer, distractor.initializer,
+                     local_variable.initializer])
+        path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid()))
+        simple_save.simple_save(
+            session,
+            path,
+            inputs={"start": start},
+            outputs={"output": output},
+            legacy_init_op=local_variable.initializer)
+    return path
+
+  def test_resource_variable_import(self):
+    imported = load.load(self._v1_single_metagraph_saved_model(
+        use_resource=True))
+    fn = imported.signatures["serving_default"]
+    self.assertEqual({"output": 6.},
+                     self.evaluate(fn(constant_op.constant(2.))))
+    self.assertAllEqual([3., 1.], self.evaluate(imported.variables))
+    imported.variables[0].assign(4.)
+    self.assertEqual({"output": 8.},
+                     self.evaluate(fn(start=constant_op.constant(2.))))
+    imported.variables[1].assign(2.)
+    self.assertEqual({"output": 24.},
+                     self.evaluate(fn(start=constant_op.constant(3.))))
+    self.assertTrue(imported.variables[0].trainable)
+    self.assertFalse(imported.variables[1].trainable)
+    with backprop.GradientTape() as tape:
+      output = fn(start=constant_op.constant(4.))
+    self.assertEqual(imported.variables[:1], list(tape.watched_variables()))
+    self.assertEqual(8., tape.gradient(output, imported.variables[0]).numpy())
+
+  def test_ref_variable_import(self):
+    saved = self._v1_single_metagraph_saved_model(use_resource=False)
+    imported = load.load(saved)
+    fn = imported.signatures["serving_default"]
+    self.assertEqual(6., fn(start=constant_op.constant(2.))["output"].numpy())
+
+  def _v1_multi_metagraph_saved_model(self):
+    export_graph = ops.Graph()
+    with export_graph.as_default():
+      start = array_ops.placeholder(
+          shape=[None], dtype=dtypes.float32, name="start")
+      v = resource_variable_ops.ResourceVariable(21.)
+      first_output = array_ops.identity(start * v, name="first_output")
+      second_output = array_ops.identity(v, name="second_output")
+      with session_lib.Session() as session:
+        session.run(v.initializer)
+        path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid()))
+        builder = builder_impl.SavedModelBuilder(path)
+        builder.add_meta_graph_and_variables(
+            session, tags=["first"],
+            signature_def_map={
+                "first_key": signature_def_utils.build_signature_def(
+                    {"first_start": utils_impl.build_tensor_info(start)},
+                    {"first_output": utils_impl.build_tensor_info(
+                        first_output)})})
+        builder.add_meta_graph(
+            tags=["second"],
+            signature_def_map={
+                "second_key": signature_def_utils.build_signature_def(
+                    {"second_start": utils_impl.build_tensor_info(start)},
+                    {"second_output": utils_impl.build_tensor_info(
+                        second_output)})})
+        builder.save()
+    return path
+
+  def test_multi_meta_graph_loading(self):
+    with self.assertRaisesRegexp(ValueError, "2 MetaGraphs"):
+      load.load(self._v1_multi_metagraph_saved_model())
+    first_imported = load.load(self._v1_multi_metagraph_saved_model(),
+                               tags=["first"])
+    self.assertEqual({"first_output": 42.},
+                     self.evaluate(first_imported.signatures["first_key"](
+                         first_start=constant_op.constant(2.))))
+    second_imported = load.load(self._v1_multi_metagraph_saved_model(),
+                                tags=["second"])
+    with self.assertRaisesRegexp(TypeError, "second_start"):
+      second_imported.signatures["second_key"](x=constant_op.constant(2.))
+    with self.assertRaisesRegexp(TypeError, "second_start"):
+      second_imported.signatures["second_key"](
+          second_start=constant_op.constant(2.),
+          x=constant_op.constant(2.))
+    self.assertEqual({"second_output": 21.},
+                     self.evaluate(second_imported.signatures["second_key"](
+                         second_start=constant_op.constant(2.))))
+
+  def _v1_asset_saved_model(self):
+    export_graph = ops.Graph()
+    vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
+    with open(vocab_path, "w") as f:
+      f.write("alpha\nbeta\ngamma\n")
+    with export_graph.as_default():
+      initializer = lookup_ops.TextFileInitializer(
+          vocab_path,
+          key_dtype=dtypes.string,
+          key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
+          value_dtype=dtypes.int64,
+          value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
+      table = lookup_ops.HashTable(
+          initializer, default_value=-1)
+      start = array_ops.placeholder(
+          shape=None, dtype=dtypes.string, name="in")
+      output = table.lookup(start, name="out")
+      with session_lib.Session() as session:
+        session.run([table.initializer])
+        path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid()))
+        simple_save.simple_save(
+            session,
+            path,
+            inputs={"start": start},
+            outputs={"output": output},
+            legacy_init_op=table.initializer)
+    file_io.delete_file(vocab_path)
+    return path
+
+  def test_asset_loading(self):
+    first_path = self._v1_asset_saved_model()
+    imported = load.load(first_path)
+    fn = imported.signatures["serving_default"]
+    self.assertAllClose({"output": [2, 0]},
+                        fn(start=constant_op.constant(["gamma", "alpha"])))
+    second_path = os.path.join(self.get_temp_dir(), "saved_model",
+                               str(ops.uid()))
+    save.save(imported, second_path, signatures=imported.signatures)
+    shutil.rmtree(first_path)
+    second_import = load.load(second_path)
+    fn = second_import.signatures["serving_default"]
+    self.assertAllClose({"output": [2, 0]},
+                        fn(start=constant_op.constant(["gamma", "alpha"])))
+
+    third_path = os.path.join(self.get_temp_dir(), "saved_model",
+                              str(ops.uid()))
+    save.save(second_import, third_path, signatures=second_import.signatures)
+    shutil.rmtree(second_path)
+    third_import = load.load(third_path)
+    fn = third_import.signatures["serving_default"]
+    self.assertAllClose({"output": [2, 0]},
+                        fn(start=constant_op.constant(["gamma", "alpha"])))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index e5be03aae4905f4465ac87590da610a7d46e2ae4..bfabef9174de2b7ae7a330785d735c7193569683 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -88,7 +88,7 @@ def parse_saved_model(export_dir):
 _parse_saved_model = parse_saved_model
 
 
-def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
+def get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
   """Gets the asset tensors, if defined in the meta graph def to load.
 
   Args:
@@ -393,7 +393,7 @@ class SavedModelLoader(object):
     meta_graph_def = self.get_meta_graph_def_from_tags(tags)
     with sess.graph.as_default():
       # Get asset tensors, if any.
-      asset_tensors_dictionary = _get_asset_tensors(
+      asset_tensors_dictionary = get_asset_tensors(
           self._export_dir, meta_graph_def, import_scope=import_scope)
 
       init_op = get_init_op(meta_graph_def, import_scope)
diff --git a/tensorflow/python/saved_model/model_utils/BUILD b/tensorflow/python/saved_model/model_utils/BUILD
index 192a610fd244c0d8950764cdfbf53fb62bd32698..493574a225d16fdada0ce08b569e06bf0aa06e16 100644
--- a/tensorflow/python/saved_model/model_utils/BUILD
+++ b/tensorflow/python/saved_model/model_utils/BUILD
@@ -30,6 +30,7 @@ py_library(
     deps = [
         ":export_output",
         ":export_utils",
+        ":mode_keys",
     ],
 )
 
@@ -70,7 +71,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":export_output",
-        "//tensorflow/python:mode_keys",
+        ":mode_keys",
         "//tensorflow/python:platform",
         "//tensorflow/python:util",
         "//tensorflow/python/saved_model:signature_constants",
@@ -98,3 +99,19 @@ py_test(
         "//tensorflow/python/saved_model:signature_def_utils",
     ],
 )
+
+py_library(
+    name = "mode_keys",
+    srcs = ["mode_keys.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_test(
+    name = "mode_keys_test",
+    srcs = ["mode_keys_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":mode_keys",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/python/saved_model/model_utils/__init__.py b/tensorflow/python/saved_model/model_utils/__init__.py
index 84540badb4b100ab649b4653d9d84b5ebe922cf1..3f54c96def1bc10d334b62b9c4b0f201b2850a07 100644
--- a/tensorflow/python/saved_model/model_utils/__init__.py
+++ b/tensorflow/python/saved_model/model_utils/__init__.py
@@ -25,4 +25,5 @@ from tensorflow.python.saved_model.model_utils.export_utils import EXPORT_TAG_MA
 from tensorflow.python.saved_model.model_utils.export_utils import get_export_outputs
 from tensorflow.python.saved_model.model_utils.export_utils import get_temp_export_dir
 from tensorflow.python.saved_model.model_utils.export_utils import get_timestamped_export_dir
+from tensorflow.python.saved_model.model_utils.export_utils import SIGNATURE_KEY_MAP
 # pylint: enable=wildcard-import
diff --git a/tensorflow/python/saved_model/model_utils/export_test.py b/tensorflow/python/saved_model/model_utils/export_test.py
index ef512150a259514fcc4c801eaa06a99441f1f7a2..c87d2ee6ae703d50c916dbedc7fcc03936880f71 100644
--- a/tensorflow/python/saved_model/model_utils/export_test.py
+++ b/tensorflow/python/saved_model/model_utils/export_test.py
@@ -24,7 +24,6 @@ import time
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
@@ -32,21 +31,7 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model.model_utils import export_output
 from tensorflow.python.saved_model.model_utils import export_utils
-
-
-class LabeledTensorMock(object):
-  """Mock class emulating LabeledTensor."""
-
-  def __init__(self):
-    self.tensor = constant_op.constant([1])
-
-
-def _convert_labeled_tensor_mock_to_tensor(value, *args, **kwargs):
-  return ops.internal_convert_to_tensor(value.tensor, *args, **kwargs)
-
-
-ops.register_tensor_conversion_function(LabeledTensorMock,
-                                        _convert_labeled_tensor_mock_to_tensor)
+from tensorflow.python.saved_model.model_utils.mode_keys import KerasModeKeys
 
 
 class ExportTest(test_util.TensorFlowTestCase):
@@ -251,6 +236,53 @@ class ExportTest(test_util.TensorFlowTestCase):
 
     self.assertDictEqual(expected_signature_defs, signature_defs)
 
+  @test_util.deprecated_graph_mode_only
+  def test_export_outputs_for_mode(self):
+    predictions = {"predictions": constant_op.constant([1.])}
+    loss = {"loss": constant_op.constant([2.])}
+    metrics = {
+        "metrics": (constant_op.constant([3.]), constant_op.constant([4.]))}
+    expected_metrics = {
+        "metrics/value": metrics["metrics"][0],
+        "metrics/update_op": metrics["metrics"][1]
+    }
+
+    def _build_export_output(mode):
+      return export_utils.export_outputs_for_mode(
+          mode, None, predictions, loss, metrics)
+
+    ret = _build_export_output(KerasModeKeys.TRAIN)
+    self.assertIn(signature_constants.DEFAULT_TRAIN_SIGNATURE_DEF_KEY, ret)
+    export_out = ret[signature_constants.DEFAULT_TRAIN_SIGNATURE_DEF_KEY]
+    self.assertIsInstance(export_out, export_output.TrainOutput)
+    self.assertEqual(export_out.predictions, predictions)
+    self.assertEqual(export_out.loss, loss)
+    self.assertEqual(export_out.metrics, expected_metrics)
+
+    ret = _build_export_output(KerasModeKeys.TEST)
+    self.assertIn(signature_constants.DEFAULT_EVAL_SIGNATURE_DEF_KEY, ret)
+    export_out = ret[signature_constants.DEFAULT_EVAL_SIGNATURE_DEF_KEY]
+    self.assertIsInstance(export_out, export_output.EvalOutput)
+    self.assertEqual(export_out.predictions, predictions)
+    self.assertEqual(export_out.loss, loss)
+    self.assertEqual(export_out.metrics, expected_metrics)
+
+    ret = _build_export_output(KerasModeKeys.PREDICT)
+    self.assertIn(signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, ret)
+    export_out = ret[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+    self.assertIsInstance(export_out, export_output.PredictOutput)
+    self.assertEqual(export_out.outputs, predictions)
+
+    classes = constant_op.constant(["class5"])
+    ret = export_utils.export_outputs_for_mode(
+        KerasModeKeys.PREDICT,
+        {"classify": export_output.ClassificationOutput(
+            classes=classes)})
+    self.assertIn("classify", ret)
+    export_out = ret["classify"]
+    self.assertIsInstance(export_out, export_output.ClassificationOutput)
+    self.assertEqual(export_out.classes, classes)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/model_utils/export_utils.py b/tensorflow/python/saved_model/model_utils/export_utils.py
index 4f8933758d92199ca1cbdd4a6f046a90e5a97f21..adb6bf26677e0ff0e465291cdfc08e92a27ee85d 100644
--- a/tensorflow/python/saved_model/model_utils/export_utils.py
+++ b/tensorflow/python/saved_model/model_utils/export_utils.py
@@ -30,21 +30,32 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model.model_utils import export_output as export_output_lib
-from tensorflow.python.training import mode_keys
+from tensorflow.python.saved_model.model_utils import mode_keys
+from tensorflow.python.saved_model.model_utils.mode_keys import KerasModeKeys as ModeKeys
 from tensorflow.python.util import compat
 
 
 # Mapping of the modes to appropriate MetaGraph tags in the SavedModel.
-EXPORT_TAG_MAP = {
-    mode_keys.ModeKeys.PREDICT: [tag_constants.SERVING],
-    mode_keys.ModeKeys.TRAIN: [tag_constants.TRAINING],
-    mode_keys.ModeKeys.TEST: [tag_constants.EVAL],
-}
-
-
-_SINGLE_FEATURE_DEFAULT_NAME = 'feature'
-_SINGLE_RECEIVER_DEFAULT_NAME = 'input'
-_SINGLE_LABEL_DEFAULT_NAME = 'label'
+EXPORT_TAG_MAP = mode_keys.ModeKeyMap(**{
+    ModeKeys.PREDICT: [tag_constants.SERVING],
+    ModeKeys.TRAIN: [tag_constants.TRAINING],
+    ModeKeys.TEST: [tag_constants.EVAL]})
+
+# For every exported mode, a SignatureDef map should be created using the
+# functions `export_outputs_for_mode` and `build_all_signature_defs`. By
+# default, this map will contain a single Signature that defines the input
+# tensors and output predictions, losses, and/or metrics (depending on the mode)
+# The default keys used in the SignatureDef map are defined below.
+SIGNATURE_KEY_MAP = mode_keys.ModeKeyMap(**{
+    ModeKeys.PREDICT: signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
+    ModeKeys.TRAIN: signature_constants.DEFAULT_TRAIN_SIGNATURE_DEF_KEY,
+    ModeKeys.TEST: signature_constants.DEFAULT_EVAL_SIGNATURE_DEF_KEY})
+
+# Default names used in the SignatureDef input map, which maps strings to
+# TensorInfo protos.
+SINGLE_FEATURE_DEFAULT_NAME = 'feature'
+SINGLE_RECEIVER_DEFAULT_NAME = 'input'
+SINGLE_LABEL_DEFAULT_NAME = 'label'
 
 ### Below utilities are specific to SavedModel exports.
 
@@ -80,7 +91,7 @@ def build_all_signature_defs(receiver_tensors,
     ValueError: if export_outputs is not a dict
   """
   if not isinstance(receiver_tensors, dict):
-    receiver_tensors = {_SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors}
+    receiver_tensors = {SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors}
   if export_outputs is None or not isinstance(export_outputs, dict):
     raise ValueError('export_outputs must be a dict and not'
                      '{}'.format(type(export_outputs)))
@@ -100,7 +111,7 @@ def build_all_signature_defs(receiver_tensors,
         six.iteritems(receiver_tensors_alternatives)):
       if not isinstance(receiver_tensors_alt, dict):
         receiver_tensors_alt = {
-            _SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors_alt
+            SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors_alt
         }
       for output_key, export_output in export_outputs.items():
         signature_name = '{}:{}'.format(receiver_name or 'None', output_key or
@@ -262,18 +273,21 @@ def export_outputs_for_mode(
   Raises:
     ValueError: if an appropriate ExportOutput cannot be found for the mode.
   """
-  # TODO(b/113185250): move all model export helper functions into an util file.
-  if mode == mode_keys.ModeKeys.PREDICT:
+  if mode not in SIGNATURE_KEY_MAP:
+    raise ValueError(
+        'Export output type not found for mode: {}. Expected one of: {}.\n'
+        'One likely error is that V1 Estimator Modekeys were somehow passed to '
+        'this function. Please ensure that you are using the new ModeKeys.'
+        .format(mode, SIGNATURE_KEY_MAP.keys()))
+  signature_key = SIGNATURE_KEY_MAP[mode]
+  if mode_keys.is_predict(mode):
     return get_export_outputs(serving_export_outputs, predictions)
-  elif mode == mode_keys.ModeKeys.TRAIN:
-    return {mode: export_output_lib.TrainOutput(
-        loss=loss, predictions=predictions, metrics=metrics)}
-  elif mode == mode_keys.ModeKeys.TEST:
-    return {mode: export_output_lib.EvalOutput(
+  elif mode_keys.is_train(mode):
+    return {signature_key: export_output_lib.TrainOutput(
         loss=loss, predictions=predictions, metrics=metrics)}
   else:
-    raise ValueError(
-        'Export output type not found for mode: {}'.format(mode))
+    return {signature_key: export_output_lib.EvalOutput(
+        loss=loss, predictions=predictions, metrics=metrics)}
 
 
 def get_export_outputs(export_outputs, predictions):
diff --git a/tensorflow/python/saved_model/model_utils/mode_keys.py b/tensorflow/python/saved_model/model_utils/mode_keys.py
new file mode 100644
index 0000000000000000000000000000000000000000..2912de7210f8b8900f7383b537d13bc664f15158
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/mode_keys.py
@@ -0,0 +1,109 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utils for managing different mode strings used by Keras and Estimator models.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+
+class KerasModeKeys(object):
+  """Standard names for model modes.
+
+  The following standard keys are defined:
+
+  * `TRAIN`: training/fitting mode.
+  * `TEST`: testing/evaluation mode.
+  * `PREDICT`: prediction/inference mode.
+  """
+
+  TRAIN = 'train'
+  TEST = 'test'
+  PREDICT = 'predict'
+
+
+# TODO(kathywu): Remove copy in Estimator after nightlies
+class EstimatorModeKeys(object):
+  """Standard names for Estimator model modes.
+
+  The following standard keys are defined:
+
+  * `TRAIN`: training/fitting mode.
+  * `EVAL`: testing/evaluation mode.
+  * `PREDICT`: predication/inference mode.
+  """
+
+  TRAIN = 'train'
+  EVAL = 'eval'
+  PREDICT = 'infer'
+
+
+def is_predict(mode):
+  return mode in [KerasModeKeys.PREDICT, EstimatorModeKeys.PREDICT]
+
+
+def is_eval(mode):
+  return mode in [KerasModeKeys.TEST, EstimatorModeKeys.EVAL]
+
+
+def is_train(mode):
+  return mode in [KerasModeKeys.TRAIN, EstimatorModeKeys.TRAIN]
+
+
+class ModeKeyMap(collections.Mapping):
+  """Map using ModeKeys as keys.
+
+  This class creates an immutable mapping from modes to values. For example,
+  SavedModel export of Keras and Estimator models use this to map modes to their
+  corresponding MetaGraph tags/SignatureDef keys.
+
+  Since this class uses modes, rather than strings, as keys, both "predict"
+  (Keras's PREDICT ModeKey) and "infer" (Estimator's PREDICT ModeKey) map to the
+  same value.
+  """
+
+  def __init__(self, **kwargs):
+    self._internal_dict = {}
+    self._keys = []
+    for key in kwargs:
+      self._keys.append(key)
+      dict_key = self._get_internal_key(key)
+      if dict_key in self._internal_dict:
+        raise ValueError(
+            'Error creating ModeKeyMap. Multiple keys/values found for {} mode.'
+            .format(dict_key))
+      self._internal_dict[dict_key] = kwargs[key]
+
+  def _get_internal_key(self, key):
+    """Return keys used for the internal dictionary."""
+    if is_train(key):
+      return KerasModeKeys.TRAIN
+    if is_eval(key):
+      return KerasModeKeys.TEST
+    if is_predict(key):
+      return KerasModeKeys.PREDICT
+    raise ValueError('Invalid mode key: {}.'.format(key))
+
+  def __getitem__(self, key):
+    return self._internal_dict[self._get_internal_key(key)]
+
+  def __iter__(self):
+    return iter(self._keys)
+
+  def __len__(self):
+    return len(self._keys)
diff --git a/tensorflow/python/saved_model/model_utils/mode_keys_test.py b/tensorflow/python/saved_model/model_utils/mode_keys_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..26795ef8b16a08e6426fa8399a38135dc8a4ac7c
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/mode_keys_test.py
@@ -0,0 +1,65 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ModeKey Tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model.model_utils import mode_keys
+
+
+class ModeKeyMapTest(test.TestCase):
+
+  def test_map(self):
+    mode_map = mode_keys.ModeKeyMap(**{
+        mode_keys.KerasModeKeys.PREDICT: 3,
+        mode_keys.KerasModeKeys.TEST: 1
+    })
+
+    # Test dictionary __getitem__
+    self.assertEqual(3, mode_map[mode_keys.KerasModeKeys.PREDICT])
+    self.assertEqual(3, mode_map[mode_keys.EstimatorModeKeys.PREDICT])
+    self.assertEqual(1, mode_map[mode_keys.KerasModeKeys.TEST])
+    self.assertEqual(1, mode_map[mode_keys.EstimatorModeKeys.EVAL])
+    with self.assertRaises(KeyError):
+      _ = mode_map[mode_keys.KerasModeKeys.TRAIN]
+    with self.assertRaises(KeyError):
+      _ = mode_map[mode_keys.EstimatorModeKeys.TRAIN]
+    with self.assertRaisesRegexp(ValueError, 'Invalid mode'):
+      _ = mode_map['serve']
+
+    # Test common dictionary methods
+    self.assertLen(mode_map, 2)
+    self.assertEqual({1, 3}, set(mode_map.values()))
+    self.assertEqual(
+        {mode_keys.KerasModeKeys.TEST, mode_keys.KerasModeKeys.PREDICT},
+        set(mode_map.keys()))
+
+    # Map is immutable
+    with self.assertRaises(TypeError):
+      mode_map[mode_keys.KerasModeKeys.TEST] = 1
+
+  def test_invalid_init(self):
+    with self.assertRaisesRegexp(ValueError, 'Multiple keys/values found'):
+      _ = mode_keys.ModeKeyMap(**{
+          mode_keys.KerasModeKeys.PREDICT: 3,
+          mode_keys.EstimatorModeKeys.PREDICT: 1
+      })
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/saved_model/nested_structure_coder.py b/tensorflow/python/saved_model/nested_structure_coder.py
index 5cf9a5b155bc27d236b67496159a4ca540bb0c1b..59a2687edafdf7f8b98a6a00670ad0d975bbf1d7 100644
--- a/tensorflow/python/saved_model/nested_structure_coder.py
+++ b/tensorflow/python/saved_model/nested_structure_coder.py
@@ -34,10 +34,10 @@ import collections
 import functools
 import six
 
+from tensorflow.core.protobuf import struct_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
-from tensorflow.python.saved_model import struct_pb2
 from tensorflow.python.util import compat
 
 
@@ -361,10 +361,7 @@ class _TensorShapeCodec(object):
   """Codec for `TensorShape`."""
 
   def can_encode(self, pyobj):
-    return isinstance(pyobj, (tensor_shape.TensorShape,
-                              # TODO(b/121255889): Should not need these.
-                              tensor_shape.TensorShapeV1,
-                              tensor_shape.TensorShapeV2))
+    return isinstance(pyobj, tensor_shape.TensorShape)
 
   def do_encode(self, tensor_shape_value, encode_fn):
     del encode_fn
diff --git a/tensorflow/python/saved_model/nested_structure_coder_test.py b/tensorflow/python/saved_model/nested_structure_coder_test.py
index 863630171965f57806d87c02d0e9cdb1ead7e8a2..1538fbf1271dadaa9bb7d82359f8ea38fcb95a01 100644
--- a/tensorflow/python/saved_model/nested_structure_coder_test.py
+++ b/tensorflow/python/saved_model/nested_structure_coder_test.py
@@ -20,12 +20,12 @@ from __future__ import print_function
 
 import collections
 
+from tensorflow.core.protobuf import struct_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import nested_structure_coder
-from tensorflow.python.saved_model import struct_pb2
 
 
 class NestedStructureTest(test.TestCase):
diff --git a/tensorflow/python/saved_model/revived_types.py b/tensorflow/python/saved_model/revived_types.py
index 8f82039cfd60b8c5d414f7dc3ca2e0b4e85022b1..3140d3d617d458dccb1bb6107b679b927d0d151e 100644
--- a/tensorflow/python/saved_model/revived_types.py
+++ b/tensorflow/python/saved_model/revived_types.py
@@ -19,20 +19,19 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import versions_pb2
-from tensorflow.python.saved_model import saved_object_graph_pb2
+from tensorflow.core.protobuf import saved_object_graph_pb2
 
 
 class VersionedTypeRegistration(object):
   """Holds information about one version of a revived type."""
 
   def __init__(self, object_factory, version, min_producer_version,
-               min_consumer_version, bad_consumers=None, setter=setattr,
-               getter=getattr, attribute_extractor=dir):
+               min_consumer_version, bad_consumers=None, setter=setattr):
     """Identify a revived type version.
 
     Args:
       object_factory: A callable which takes a SavedUserObject proto and returns
-        a checkpointable object. Dependencies are added later via `setter`.
+        a trackable object. Dependencies are added later via `setter`.
       version: An integer, the producer version of this wrapper type. When
         making incompatible changes to a wrapper, add a new
         `VersionedTypeRegistration` with an incremented `version`. The most
@@ -46,11 +45,11 @@ class VersionedTypeRegistration(object):
         with this object. `min_consumer_version` should be set to the lowest
         version number which can successfully load protos saved by this
         object. If no matching registration is available on load, the object
-        will be revived with a generic checkpointable type.
+        will be revived with a generic trackable type.
 
         `min_consumer_version` and `bad_consumers` are a blunt tool, and using
         them will generally break forward compatibility: previous versions of
-        TensorFlow will revive newly saved objects as opaque checkpointable
+        TensorFlow will revive newly saved objects as opaque trackable
         objects rather than wrapped objects. When updating wrappers, prefer
         saving new information but preserving compatibility with previous
         wrapper versions. They are, however, useful for ensuring that
@@ -60,16 +59,8 @@ class VersionedTypeRegistration(object):
         addition to any version less than `min_consumer_version`).
       setter: A callable with the same signature as `setattr` to use when adding
         dependencies to generated objects.
-      getter: A callable with the same signature as `getattr` to use when
-        retrieving items from objects of this type. Used along with
-        `attribute_extractor` to find functions, which are not Checkpointable
-        objects and so not regular dependencies.
-      attribute_extractor: A callable equivalent of the builtin `dir`, used for
-        listing items in this container (if any).
     """
     self.setter = setter
-    self.getter = getter
-    self.attribute_extractor = attribute_extractor
     self.identifier = None  # Set after registration
     self._object_factory = object_factory
     self.version = version
@@ -92,7 +83,7 @@ class VersionedTypeRegistration(object):
             bad_consumers=self._bad_consumers))
 
   def from_proto(self, proto):
-    """Recreate a checkpointable object from a SavedUserObject proto."""
+    """Recreate a trackable object from a SavedUserObject proto."""
     return self._object_factory(proto)
 
   def should_load(self, proto):
@@ -120,7 +111,7 @@ def register_revived_type(identifier, predicate, versions):
   Args:
     identifier: A unique string identifying this class of objects.
     predicate: A Boolean predicate for this registration. Takes a
-      checkpointable object as an argument. If True, `type_registration` may be
+      trackable object as an argument. If True, `type_registration` may be
       used to save and restore the object.
     versions: A list of `VersionedTypeRegistration` objects.
   """
@@ -146,17 +137,8 @@ def register_revived_type(identifier, predicate, versions):
   _TYPE_IDENTIFIERS.append(identifier)
 
 
-def get_attribute_extractors(obj):
-  """Get a `dir` and `getattr` equivalent for use with `obj`."""
-  for identifier in _TYPE_IDENTIFIERS:
-    predicate, versions = _REVIVED_TYPE_REGISTRY[identifier]
-    if predicate(obj):
-      return versions[0].attribute_extractor, versions[0].getter
-  return dir, getattr
-
-
 def serialize(obj):
-  """Create a SavedUserObject from a checkpointable object."""
+  """Create a SavedUserObject from a trackable object."""
   for identifier in _TYPE_IDENTIFIERS:
     predicate, versions = _REVIVED_TYPE_REGISTRY[identifier]
     if predicate(obj):
@@ -166,15 +148,15 @@ def serialize(obj):
 
 
 def deserialize(proto):
-  """Create a checkpointable object from a SavedUserObject proto.
+  """Create a trackable object from a SavedUserObject proto.
 
   Args:
     proto: A SavedUserObject to deserialize.
 
   Returns:
-    A tuple of (checkpointable, assignment_fn) where assignment_fn has the same
+    A tuple of (trackable, assignment_fn) where assignment_fn has the same
     signature as setattr and should be used to add dependencies to
-    `checkpointable` when they are available.
+    `trackable` when they are available.
   """
   _, type_registrations = _REVIVED_TYPE_REGISTRY.get(
       proto.identifier, (None, None))
diff --git a/tensorflow/python/saved_model/revived_types_test.py b/tensorflow/python/saved_model/revived_types_test.py
index ede5922b801e1d6606d3d86059a03eee60433ad8..8bd806f315aa7899d7e816739191a6c1af111912 100644
--- a/tensorflow/python/saved_model/revived_types_test.py
+++ b/tensorflow/python/saved_model/revived_types_test.py
@@ -19,13 +19,13 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import versions_pb2
+from tensorflow.core.protobuf import saved_object_graph_pb2
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import revived_types
-from tensorflow.python.saved_model import saved_object_graph_pb2
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 
-class CustomTestClass(tracking.AutoCheckpointable):
+class CustomTestClass(tracking.AutoTrackable):
 
   def __init__(self, version):
     self.version = version
@@ -56,7 +56,7 @@ revived_types.register_revived_type(
 class RegistrationMatchingTest(test.TestCase):
 
   def test_save_typecheck(self):
-    self.assertIs(revived_types.serialize(tracking.AutoCheckpointable()), None)
+    self.assertIs(revived_types.serialize(tracking.AutoTrackable()), None)
 
   def test_load_identifier_not_found(self):
     nothing_matches = revived_types.deserialize(
diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py
index 7eeb14608f8cbff04500294231dff4800a0dec33..6cdbee4187cd3250c11736606d7003a26cd226b2 100644
--- a/tensorflow/python/saved_model/save.py
+++ b/tensorflow/python/saved_model/save.py
@@ -12,19 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Exports a SavedModel from a Checkpointable Python object."""
+"""Exports a SavedModel from a Trackable Python object."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import collections
-import functools
 import os
 
 from tensorflow.core.framework import versions_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saved_model_pb2
+from tensorflow.core.protobuf import saved_object_graph_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function as defun
@@ -41,19 +41,20 @@ from tensorflow.python.saved_model import constants
 from tensorflow.python.saved_model import function_serialization
 from tensorflow.python.saved_model import nested_structure_coder
 from tensorflow.python.saved_model import revived_types
-from tensorflow.python.saved_model import saved_object_graph_pb2
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model import signature_serialization
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model import utils_impl
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.saving import functional_saver
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 from tensorflow.python.util import compat
-from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
-DEFAULT_SIGNATURE_ATTR = "_default_save_signature"
 _UNCOPIABLE_DTYPES = frozenset((dtypes.resource, dtypes.variant))
 
 
@@ -63,38 +64,88 @@ _CapturedConstant = collections.namedtuple(
     "_CapturedConstant", ["eager_tensor", "graph_tensor"])
 
 
+class _AugmentedGraphView(graph_view.ObjectGraphView):
+  """An extendable graph which also tracks functions attached to objects.
+
+  Extensions through `add_object` appear in the object graph and any checkpoints
+  generated from it, even if they are not dependencies of the node they were
+  attached to in the saving program. For example a `.signatures` attribute is
+  added to exported SavedModel root objects without modifying the root object
+  itself.
+
+  Also tracks functions attached to objects in the graph, through the caching
+  `list_functions` method. Enumerating functions only through this method
+  ensures that we get a consistent view of functions, even if object attributes
+  create new functions every time they are accessed.
+  """
+
+  def __init__(self, root):
+    super(_AugmentedGraphView, self).__init__(root)
+    # Object -> (name -> dep)
+    self._extra_dependencies = object_identity.ObjectIdentityDictionary()
+    self._functions = object_identity.ObjectIdentityDictionary()
+
+  def add_object(self, parent_node, name_in_parent, subgraph_root):
+    """Attach an object to `parent_node`, overriding any existing dependency."""
+    self._extra_dependencies.setdefault(
+        parent_node, {})[name_in_parent] = subgraph_root
+
+  def list_dependencies(self, obj):
+    """Overrides a parent method to include `add_object` objects."""
+    extra_dependencies = self._extra_dependencies.get(obj, {})
+    used_names = set()
+    for name, dep in super(_AugmentedGraphView, self).list_dependencies(obj):
+      used_names.add(name)
+      if name in extra_dependencies:
+        yield base.TrackableReference(name, extra_dependencies[name])
+      else:
+        yield base.TrackableReference(name, dep)
+    for name, dep in extra_dependencies.items():
+      if name in used_names:
+        continue
+      yield base.TrackableReference(name, dep)
+
+  def list_functions(self, obj):
+    obj_functions = self._functions.get(obj, None)
+    if obj_functions is None:
+      obj_functions = obj._list_functions_for_serialization()  # pylint: disable=protected-access
+      self._functions[obj] = obj_functions
+    return obj_functions
+
+
 class _SaveableView(object):
-  """Provides a stable view over a checkpointable root.
+  """Provides a frozen view over a trackable root.
 
   This class helps creating a single stable view over an object to save. The
   saving code should access properties and functions via this class and not via
   the original object as there are cases where an object construct their
-  checkpointable attributes and functions dynamically per call and will yield
+  trackable attributes and functions dynamically per call and will yield
   different objects if invoked more than once.
+
+  Changes to the graph, for example adding objects, must happen in
+  `checkpoint_view` (an `_AugmentedGraphView`) before the `_SaveableView` is
+  constructed. Changes after the `_SaveableView` has been constructed will be
+  ignored.
   """
 
-  def __init__(self, root):
-    checkpointable_objects, node_ids, slot_variables = util.find_objects(root)
-    self.nodes = checkpointable_objects
+  def __init__(self, checkpoint_view):
+    self.checkpoint_view = checkpoint_view
+    trackable_objects, node_ids, slot_variables = (
+        self.checkpoint_view.objects_ids_and_slot_variables())
+    self.nodes = trackable_objects
     self.node_ids = node_ids
-    self.captured_tensor_node_ids = util.ObjectIdentityDictionary()
+    self.captured_tensor_node_ids = object_identity.ObjectIdentityDictionary()
     self.slot_variables = slot_variables
-    self.functions = util.ObjectIdentityDictionary()
     self.concrete_functions = []
 
     # Also add `Function`s as nodes.
     nodes_without_functions = list(self.nodes)
     seen_function_names = set()
-    for obj in nodes_without_functions:
-      self.functions[obj] = self._list_functions(obj)
-      for function in self.functions[obj].values():
+    for node in nodes_without_functions:
+      for function in checkpoint_view.list_functions(node).values():
         if function not in self.node_ids:
           self.node_ids[function] = len(self.nodes)
           self.nodes.append(function)
-          # Avoids recursing into functions to see if other functions are
-          # assigned to attributes. This is sometimes true for concrete
-          # functions but not helpful.
-          self.functions[function] = {}
         if isinstance(function, def_function.Function):
           # Force listing the concrete functions for the side effects:
           #  - populate the cache for functions that have an input_signature
@@ -123,33 +174,16 @@ class _SaveableView(object):
       if isinstance(node, (def_function.Function, defun.ConcreteFunction,
                            _CapturedConstant)):
         continue
-      for child in node._checkpoint_dependencies:  # pylint: disable=protected-access
+      for child in self.checkpoint_view.list_dependencies(node):
         child_proto = object_proto.children.add()
         child_proto.node_id = self.node_ids[child.ref]
         child_proto.local_name = child.name
-      for local_name, ref_function in self.functions[node].items():
+      for local_name, ref_function in (
+          self.checkpoint_view.list_functions(node).items()):
         child_proto = object_proto.children.add()
         child_proto.node_id = self.node_ids[ref_function]
         child_proto.local_name = local_name
 
-  def _list_functions(self, checkpointable_object):
-    """Return a dict of `Function`s of a checkpointable."""
-    functions = dict()
-    attribute_extractor, attribute_getter = (
-        revived_types.get_attribute_extractors(checkpointable_object))
-    for attribute_name in attribute_extractor(checkpointable_object):
-      try:
-        attribute_value = attribute_getter(
-            checkpointable_object, attribute_name, None)
-      except Exception:  # pylint: disable=broad-except
-        # We really don't want to throw an exception just because some object's
-        # attribute accessor is broken.
-        attribute_value = None
-      if isinstance(attribute_value, (def_function.Function,
-                                      defun.ConcreteFunction)):
-        functions[attribute_name] = attribute_value
-    return functions
-
   def map_resources(self):
     """Makes new resource handle ops corresponding to existing resource tensors.
 
@@ -171,7 +205,7 @@ class _SaveableView(object):
     assert not context.executing_eagerly()
     # TODO(allenl): Handle MirroredVariables and other types of variables which
     # may need special casing.
-    object_map = util.ObjectIdentityDictionary()
+    object_map = object_identity.ObjectIdentityDictionary()
     resource_map = {}
     asset_info = _AssetInfo(
         asset_defs=[],
@@ -190,7 +224,7 @@ class _SaveableView(object):
         self.captured_tensor_node_ids[obj.handle] = node_id
       elif isinstance(obj, tracking.TrackableAsset):
         _process_asset(obj, asset_info, resource_map)
-        self.captured_tensor_node_ids[obj.asset_path.handle] = node_id
+        self.captured_tensor_node_ids[obj.asset_path] = node_id
 
     for concrete_function in self.concrete_functions:
       for capture in concrete_function.captured_inputs:
@@ -210,108 +244,6 @@ class _SaveableView(object):
     return object_map, resource_map, asset_info
 
 
-def _get_signature(function):
-  if (isinstance(function, (defun.Function, def_function.Function)) and
-      function._input_signature is not None):  # pylint: disable=protected-access
-    function = function.get_concrete_function()
-  if not isinstance(function, defun.ConcreteFunction):
-    return None
-  return function
-
-
-def _valid_signature(concrete_function):
-  """Returns whether concrete function can be converted to a signature."""
-  if not concrete_function.outputs:
-    # Functions without outputs don't make sense as signatures. We just don't
-    # have any way to run an Operation with no outputs as a SignatureDef in the
-    # 1.x style.
-    return False
-  try:
-    _normalize_outputs(concrete_function.structured_outputs, "unused", "unused")
-  except ValueError:
-    return False
-  return True
-
-
-def _find_function_to_export(saveable_view):
-  """Function to export, None if no suitable function was found."""
-  # If the user did not specify signatures, check the root object for a function
-  # that can be made into a signature.
-  functions = saveable_view.functions[saveable_view.root]
-  signature = functions.get(DEFAULT_SIGNATURE_ATTR, None)
-  if signature is not None:
-    return signature
-
-  # TODO(andresp): Discuss removing this behaviour. It can lead to WTFs when a
-  # user decides to annotate more functions with tf.function and suddenly
-  # serving that model way later in the process stops working.
-  if len(functions) == 1:
-    single_function = list(functions.values())[0]
-    signature = _get_signature(single_function)
-    if signature and  _valid_signature(signature):
-      return signature
-  return None
-
-
-def _canonicalize_signatures(signatures):
-  """Converts `signatures` into a dictionary of concrete functions."""
-  if signatures is None:
-    return {}
-  if not isinstance(signatures, collections.Mapping):
-    signatures = {
-        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signatures}
-  concrete_signatures = {}
-  for signature_key, function in signatures.items():
-    signature_function = _get_signature(function)
-    if signature_function is None:
-      raise ValueError(
-          ("Expected a TensorFlow function to generate a signature for, but "
-           "got {}. Only `tf.functions` with an input signature or "
-           "concrete functions can be used as a signature.").format(function))
-    concrete_signatures[signature_key] = signature_function
-  return concrete_signatures
-
-
-def _is_flat(sequence):
-  sequence_flat = nest.flatten(sequence)
-  try:
-    nest.assert_same_structure(sequence_flat, sequence)
-    return True
-  except ValueError:
-    return False
-  except TypeError:
-    return False
-
-
-def _normalize_outputs(outputs, function_name, signature_key):
-  """Construct an output dictionary from unnormalized function outputs."""
-  if isinstance(outputs, collections.Mapping):
-    for key, value in outputs.items():
-      if not isinstance(value, ops.Tensor):
-        raise ValueError(
-            ("Got a dictionary containing non-Tensor value {} for key {} "
-             "in the output of the function {} used to generate a SavedModel "
-             "signature. Dictionaries outputs for functions used as signatures "
-             "should have one Tensor output per string key.")
-            .format(value, key, compat.as_str_any(function_name)))
-    return outputs
-  else:
-    original_outputs = outputs
-    if not isinstance(outputs, collections.Sequence):
-      outputs = [outputs]
-    if not _is_flat(outputs):
-      raise ValueError(
-          ("Got non-flat outputs '{}' from '{}' for SavedModel "
-           "signature '{}'. Signatures have one Tensor per output, so "
-           "to have predictable names Python functions used to generate "
-           "these signatures should avoid outputting Tensors in nested "
-           "structures.")
-          .format(original_outputs, function_name, signature_key))
-    return {("output_{}".format(output_index)): output
-            for output_index, output
-            in enumerate(outputs)}
-
-
 def _tensor_dict_to_tensorinfo(tensor_dict):
   return {key: utils_impl.build_tensor_info_internal(value)
           for key, value in tensor_dict.items()}
@@ -340,14 +272,12 @@ def _map_captures_to_created_tensors(
   for exterior, interior in original_captures.items():
     mapped_resource = resource_map.get(exterior, None)
     if mapped_resource is None:
-      if exterior.dtype == dtypes.resource:
-        raise AssertionError(
-            ("Tried to export a function which references untracked stateful "
-             "object {}. Stateful TensorFlow objects (e.g. tf.Variable) must "
-             "be tracked by the main object. Objects may be tracked by "
-             "assigning them to an attribute of another tracked object, or to "
-             "an attribute of the main object directly.")
-            .format(interior))
+      raise AssertionError(
+          ("Tried to export a function which references untracked object {}."
+           "TensorFlow objects (e.g. tf.Variable) captured by functions must "
+           "be tracked by assigning them to an attribute of a tracked object "
+           "or assigned to an attribute of the main object directly.")
+          .format(interior))
     export_captures.append(mapped_resource)
   return export_captures
 
@@ -441,8 +371,8 @@ def _generate_signatures(signature_functions, resource_map):
 
   Args:
     signature_functions: A dictionary mapping string keys to concrete TensorFlow
-      functions (e.g. from `_canonicalize_signatures`) which will be used to
-      generate SignatureDefs.
+      functions (e.g. from `signature_serialization.canonicalize_signatures`)
+      which will be used to generate SignatureDefs.
     resource_map: A dictionary mapping from resource tensors in the eager
       context to resource tensors in the Graph being exported. This dictionary
       is used to re-bind resources captured by functions to tensors which will
@@ -473,10 +403,8 @@ def _generate_signatures(signature_functions, resource_map):
     mapped_inputs, exterior_argument_placeholders = (
         _map_function_arguments_to_created_inputs(
             argument_inputs, signature_key, function.name))
-    outputs = _normalize_outputs(
-        _call_function_with_mapped_captures(
-            function, mapped_inputs, resource_map),
-        function.name, signature_key)
+    outputs = _call_function_with_mapped_captures(
+        function, mapped_inputs, resource_map)
     signatures[signature_key] = signature_def_utils.build_signature_def(
         _tensor_dict_to_tensorinfo(exterior_argument_placeholders),
         _tensor_dict_to_tensorinfo(outputs),
@@ -492,10 +420,13 @@ def _trace_resource_initializers(accessible_objects):
     obj.initialize()
     return constant_op.constant(1.)  # Dummy control output
 
+  def _wrap_obj_initializer(obj):
+    return lambda: _wrap_initializer(obj)
+
   for obj in accessible_objects:
     if isinstance(obj, tracking.TrackableResource):
       resource_initializers.append(def_function.function(
-          functools.partial(_wrap_initializer, obj),
+          _wrap_obj_initializer(obj),
           # All inputs are captures.
           input_signature=[]).get_concrete_function())
   return resource_initializers
@@ -535,14 +466,13 @@ def _process_asset(trackable_asset, asset_info, resource_map):
   asset_def.filename = path
   asset_def.tensor_info.name = asset_path_initializer.name
   asset_info.asset_defs.append(asset_def)
-  asset_info.asset_initializers_by_resource[original_variable.handle] = (
+  asset_info.asset_initializers_by_resource[original_variable] = (
       asset_variable.initializer)
   asset_info.asset_index[trackable_asset] = len(asset_info.asset_defs) - 1
-  resource_map[original_variable.handle] = asset_variable.handle
+  resource_map[original_variable] = asset_variable
 
 
-def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions,
-                         object_saver):
+def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions):
   """Generates a MetaGraph which calls `signature_functions`.
 
   Args:
@@ -550,7 +480,6 @@ def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions,
     saveable_view: The _SaveableView being exported.
     signature_functions: A dictionary mapping signature keys to concrete
       functions containing signatures to add to the MetaGraph.
-    object_saver: A CheckpointableSaver to add to the MetaGraph.
 
   Returns:
     An _AssetInfo, which contains information to help creating the SavedModel.
@@ -575,6 +504,8 @@ def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions,
         resource_initializer_ops.append(
             _call_function_with_mapped_captures(
                 resource_initializer_function, [], resource_map))
+    resource_initializer_ops.extend(
+        asset_info.asset_initializers_by_resource.values())
     with ops.control_dependencies(resource_initializer_ops):
       init_op = control_flow_ops.no_op()
     # Add the same op to the main_op collection and to the init_op
@@ -590,7 +521,9 @@ def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions,
   # gathering from the eager context so Optimizers save the right set of
   # variables, but want any operations associated with the save/restore to be in
   # the exported graph (thus the `to_graph` argument).
-  saver = object_saver.freeze(object_map=object_map, to_graph=exported_graph)
+  saver = functional_saver.Saver(
+      saveable_view.checkpoint_view.frozen_saveable_objects(
+          object_map=object_map, to_graph=exported_graph))
 
   with exported_graph.as_default():
     signatures = _generate_signatures(signature_functions, resource_map)
@@ -609,9 +542,9 @@ def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions,
   return asset_info, exported_graph
 
 
-def _write_object_graph(saveable_view, export_dir, asset_file_def_index):
+def _serialize_object_graph(saveable_view, asset_file_def_index):
   """Save a SavedObjectGraph proto for `root`."""
-  # SavedObjectGraph is similar to the CheckpointableObjectGraph proto in the
+  # SavedObjectGraph is similar to the TrackableObjectGraph proto in the
   # checkpoint. It will eventually go into the SavedModel.
   proto = saved_object_graph_pb2.SavedObjectGraph()
   saveable_view.fill_object_graph_proto(proto)
@@ -626,14 +559,7 @@ def _write_object_graph(saveable_view, export_dir, asset_file_def_index):
 
   for obj, obj_proto in zip(saveable_view.nodes, proto.nodes):
     _write_object_proto(obj, obj_proto, asset_file_def_index)
-
-  extra_asset_dir = os.path.join(
-      compat.as_bytes(export_dir),
-      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY))
-  file_io.recursive_create_dir(extra_asset_dir)
-  object_graph_filename = os.path.join(
-      extra_asset_dir, compat.as_bytes("object_graph.pb"))
-  file_io.write_string_to_file(object_graph_filename, proto.SerializeToString())
+  return proto
 
 
 def _write_object_proto(obj, proto, asset_file_def_index):
@@ -654,6 +580,8 @@ def _write_object_proto(obj, proto, asset_file_def_index):
         function_serialization.serialize_bare_concrete_function(obj))
   elif isinstance(obj, _CapturedConstant):
     proto.constant.operation = obj.graph_tensor.op.name
+  elif isinstance(obj, tracking.TrackableResource):
+    proto.resource.SetInParent()
   else:
     registered_type_proto = revived_types.serialize(obj)
     if registered_type_proto is None:
@@ -665,10 +593,11 @@ def _write_object_proto(obj, proto, asset_file_def_index):
     proto.user_object.CopyFrom(registered_type_proto)
 
 
-@tf_export("saved_model.save", v1=["saved_model.experimental.save"])
+@tf_export("saved_model.save",
+           v1=["saved_model.save", "saved_model.experimental.save"])
 def save(obj, export_dir, signatures=None):
   # pylint: disable=line-too-long
-  """Exports the Checkpointable object `obj` to [SavedModel format](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
+  """Exports the Trackable object `obj` to [SavedModel format](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
 
   Example usage:
 
@@ -712,7 +641,11 @@ def save(obj, export_dir, signatures=None):
   which case outputs will be numbered, or a dictionary mapping string keys to
   `Tensor`, in which case the keys will be used to name outputs.
 
-  Since `tf.keras.Model` objects are also Checkpointable, this function can be
+  Signatures are available in objects returned by `tf.saved_model.load` as a
+  `.signatures` attribute. This is a reserved attribute: `tf.saved_model.save`
+  on an object with a custom `.signatures` attribute will raise an exception.
+
+  Since `tf.keras.Model` objects are also Trackable, this function can be
   used to export Keras models. For example, exporting with a signature
   specified:
 
@@ -798,7 +731,7 @@ def save(obj, export_dir, signatures=None):
   prior to the TensorFlow 2.0 release.
 
   Args:
-    obj: A checkpointable object to export.
+    obj: A trackable object to export.
     export_dir: A directory in which to write the SavedModel.
     signatures: Optional, either a `tf.function` with an input signature
       specified or the result of `f.get_concrete_function` on a
@@ -811,7 +744,7 @@ def save(obj, export_dir, signatures=None):
       `tf.saved_model.signature_constants` module.
 
   Raises:
-    ValueError: If `obj` is not checkpointable.
+    ValueError: If `obj` is not trackable.
 
   @compatibility(eager)
   Not supported when graph building. From TensorFlow 1.x,
@@ -832,30 +765,37 @@ def save(obj, export_dir, signatures=None):
             "tf.enable_eager_execution() must run first when calling it from "
             "TensorFlow 1.x.")
   # pylint: enable=line-too-long
-  if not isinstance(obj, base.Checkpointable):
+  if not isinstance(obj, base.Trackable):
     raise ValueError(
-        "Expected a Checkpointable object for export, got {}.".format(obj))
+        "Expected a Trackable object for export, got {}.".format(obj))
 
-  # Use _SaveableView to provide a stable listing of properties and functions.
+  checkpoint_graph_view = _AugmentedGraphView(obj)
+  if signatures is None:
+    signatures = signature_serialization.find_function_to_export(
+        checkpoint_graph_view)
+
+  signatures = signature_serialization.canonicalize_signatures(signatures)
+  signature_serialization.validate_saveable_view(checkpoint_graph_view)
+  signature_map = signature_serialization.create_signature_map(signatures)
+  checkpoint_graph_view.add_object(
+      parent_node=checkpoint_graph_view.root,
+      name_in_parent=signature_serialization.SIGNATURE_ATTRIBUTE_NAME,
+      subgraph_root=signature_map)
+
+  # Use _SaveableView to provide a frozen listing of properties and functions.
   # Note we run this twice since, while constructing the view the first time
   # there can be side effects of creating variables.
-  _ = _SaveableView(obj)
-  saveable_view = _SaveableView(obj)
-
-  if signatures is None:
-    signatures = _find_function_to_export(saveable_view)
-
-  signatures = _canonicalize_signatures(signatures)
+  _ = _SaveableView(checkpoint_graph_view)
+  saveable_view = _SaveableView(checkpoint_graph_view)
 
   # TODO(allenl): Factor out some subset of SavedModelBuilder which is 2.x
   # compatible (no sessions) and share it with this export API rather than
   # making a SavedModel proto and writing it directly.
   saved_model = saved_model_pb2.SavedModel()
   meta_graph_def = saved_model.meta_graphs.add()
-  # TODO(andresp): Should this be using saveable_view?
-  object_saver = util.CheckpointableSaver(obj)
+  object_saver = util.TrackableSaver(checkpoint_graph_view)
   asset_info, exported_graph = _fill_meta_graph_def(
-      meta_graph_def, saveable_view, signatures, object_saver)
+      meta_graph_def, saveable_view, signatures)
   saved_model.saved_model_schema_version = (
       constants.SAVED_MODEL_SCHEMA_VERSION)
   # So far we've just been generating protocol buffers with no I/O. Now we write
@@ -868,8 +808,10 @@ def save(obj, export_dir, signatures=None):
   path = os.path.join(
       compat.as_bytes(export_dir),
       compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
+  object_graph_proto = _serialize_object_graph(
+      saveable_view, asset_info.asset_index)
+  meta_graph_def.object_graph_def.CopyFrom(object_graph_proto)
   file_io.write_string_to_file(path, saved_model.SerializeToString())
-  _write_object_graph(saveable_view, export_dir, asset_info.asset_index)
   # Clean reference cycles so repeated export()s don't make work for the garbage
   # collector. Before this point we need to keep references to captured
   # constants in the saved graph.
diff --git a/tensorflow/python/saved_model/save_test.py b/tensorflow/python/saved_model/save_test.py
index cbca51aea7915762f4d87df8a700cceaa322d513..ca1d5738ed7a7b0d0bf8ee2488fbedba258e2c3c 100644
--- a/tensorflow/python/saved_model/save_test.py
+++ b/tensorflow/python/saved_model/save_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for checkpointable object SavedModel save."""
+"""Tests for trackable object SavedModel save."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -41,8 +41,9 @@ from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import save
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
+from tensorflow.python.util import compat
 
 
 class _ModelWithOptimizer(util.Checkpoint):
@@ -86,7 +87,7 @@ def _import_and_infer(
 class SaveTest(test.TestCase):
 
   def test_method_save_signature(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(
         lambda x: 2. * x,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
@@ -98,7 +99,7 @@ class SaveTest(test.TestCase):
         _import_and_infer(save_dir, {"x": 1.}))
 
   def test_method_save_concrete(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(
         lambda z: {"out": 2. * z})
     root.f(constant_op.constant(1.))
@@ -114,7 +115,7 @@ class SaveTest(test.TestCase):
             save_dir, {"z": 1.}, signature_key="non_default_key"))
 
   def test_non_concrete_error(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(lambda x: 2. * x)
     root.f(constant_op.constant(1.))
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
@@ -123,7 +124,7 @@ class SaveTest(test.TestCase):
       save.save(root, save_dir, root.f)
 
   def test_captures_unreachable_variable(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     unreachable_variable = variables.Variable([5.0, 2.0])
     root.reachable_variable = variables.Variable([1.0, 3.0])
 
@@ -142,7 +143,7 @@ class SaveTest(test.TestCase):
       save.save(root, save_dir)
 
   def test_nested_inputs(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(
         lambda x: 2. * x[0],
         input_signature=([tensor_spec.TensorSpec(None, dtypes.float32),
@@ -155,7 +156,7 @@ class SaveTest(test.TestCase):
       root.f.get_concrete_function()
 
   def test_nested_outputs(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(lambda x: (2. * x, (3. * x, 4. * x)))
     root.f(constant_op.constant(1.))
     to_save = root.f.get_concrete_function(constant_op.constant(1.))
@@ -176,7 +177,7 @@ class SaveTest(test.TestCase):
       save.save(root, save_dir, to_save)
 
   def test_variable(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.v1 = variables.Variable(3.)
     root.v2 = variables.Variable(2.)
     root.f = def_function.function(
@@ -213,7 +214,7 @@ class SaveTest(test.TestCase):
                                     {"x": [[3., 4.]], "y": [2.]}))
 
   def test_single_function_default_signature(self):
-    model = tracking.AutoCheckpointable()
+    model = tracking.AutoTrackable()
     model.f = def_function.function(lambda: 3., input_signature=())
     model.f()
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
@@ -222,7 +223,7 @@ class SaveTest(test.TestCase):
                         _import_and_infer(save_dir, {}))
 
   def test_single_function_no_signature(self):
-    model = tracking.AutoCheckpointable()
+    model = tracking.AutoTrackable()
     model.f = def_function.function(lambda: 3.)
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     save.save(model, save_dir)
@@ -303,6 +304,14 @@ class SaveTest(test.TestCase):
       self.assertNotIn("T", complex_node.attr)
       self.assertNotIn("Tout", complex_node.attr)
 
+  def test_signature_attribute_reserved(self):
+    root = util.Checkpoint(signatures=variables.Variable(1.))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    with self.assertRaisesRegexp(ValueError, "del obj.signatures"):
+      save.save(root, save_dir)
+    del root.signatures
+    save.save(root, save_dir)
+
 
 class AssetTests(test.TestCase):
 
@@ -312,6 +321,18 @@ class AssetTests(test.TestCase):
     with open(self._vocab_path, "w") as f:
       f.write("alpha\nbeta\ngamma\n")
 
+  def test_asset_path_returned(self):
+    root = tracking.AutoTrackable()
+    root.path = tracking.TrackableAsset(self._vocab_path)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    root.get_asset = def_function.function(lambda: root.path.asset_path)
+    save.save(root, save_dir, signatures=root.get_asset.get_concrete_function())
+    second_dir = os.path.join(self.get_temp_dir(), "second_dir")
+    file_io.rename(save_dir, second_dir)
+    imported_path = _import_and_infer(second_dir, {})["output_0"]
+    self.assertIn(compat.as_str_any(second_dir),
+                  compat.as_str_any(imported_path))
+
   def test_table(self):
     initializer = lookup_ops.TextFileInitializer(
         self._vocab_path,
@@ -341,7 +362,7 @@ class AssetTests(test.TestCase):
         _import_and_infer(second_dir, {"keys": ["gamma", "beta"]}))
 
   def test_unused_asset(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(
         lambda x: 2. * x,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
diff --git a/tensorflow/python/saved_model/saved_model.py b/tensorflow/python/saved_model/saved_model.py
index fcde6b47e4ff10dbd84801e08597591a10818d51..9c926d789f4199666e2ffb68bdc9134751ba17e8 100644
--- a/tensorflow/python/saved_model/saved_model.py
+++ b/tensorflow/python/saved_model/saved_model.py
@@ -29,6 +29,7 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model import utils
+from tensorflow.python.saved_model.load import load
 from tensorflow.python.saved_model.save import save
 # pylint: enable=unused-import
 # pylint: disable=wildcard-import
diff --git a/tensorflow/python/saved_model/signature_constants.py b/tensorflow/python/saved_model/signature_constants.py
index 0efe1763430eade223801b63f958405212eebe34..525d18d18e186c3a9bc551150a7fe5fcd60f0356 100644
--- a/tensorflow/python/saved_model/signature_constants.py
+++ b/tensorflow/python/saved_model/signature_constants.py
@@ -136,6 +136,9 @@ tf_export(
 ################################################################################
 # Train/Eval API constants.
 # Not exported while export_all_saved_models is experimental.
+DEFAULT_TRAIN_SIGNATURE_DEF_KEY = "train"
+
+DEFAULT_EVAL_SIGNATURE_DEF_KEY = "eval"
 
 SUPERVISED_TRAIN_METHOD_NAME = "tensorflow/supervised/training"
 
diff --git a/tensorflow/python/saved_model/signature_def_utils_impl.py b/tensorflow/python/saved_model/signature_def_utils_impl.py
index f6e6e1d13ecdea684f14dcaaa39f1c66f72ac352..2e0a0afeec630eb97467d6967d989dd9bf5ce898 100644
--- a/tensorflow/python/saved_model/signature_def_utils_impl.py
+++ b/tensorflow/python/saved_model/signature_def_utils_impl.py
@@ -30,7 +30,6 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export(
-    'saved_model.build_signature_def',
     v1=[
         'saved_model.build_signature_def',
         'saved_model.signature_def_utils.build_signature_def'
@@ -63,7 +62,6 @@ def build_signature_def(inputs=None, outputs=None, method_name=None):
 
 
 @tf_export(
-    'saved_model.regression_signature_def',
     v1=[
         'saved_model.regression_signature_def',
         'saved_model.signature_def_utils.regression_signature_def'
@@ -112,7 +110,6 @@ def regression_signature_def(examples, predictions):
 
 
 @tf_export(
-    'saved_model.classification_signature_def',
     v1=[
         'saved_model.classification_signature_def',
         'saved_model.signature_def_utils.classification_signature_def'
@@ -172,7 +169,6 @@ def classification_signature_def(examples, classes, scores):
 
 
 @tf_export(
-    'saved_model.predict_signature_def',
     v1=[
         'saved_model.predict_signature_def',
         'saved_model.signature_def_utils.predict_signature_def'
@@ -270,7 +266,6 @@ def _supervised_signature_def(
 
 
 @tf_export(
-    'saved_model.is_valid_signature',
     v1=[
         'saved_model.is_valid_signature',
         'saved_model.signature_def_utils.is_valid_signature'
diff --git a/tensorflow/python/saved_model/signature_serialization.py b/tensorflow/python/saved_model/signature_serialization.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cfd5b14eff25644b55dbed0c78f5d4ff24b6891
--- /dev/null
+++ b/tensorflow/python/saved_model/signature_serialization.py
@@ -0,0 +1,263 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helpers for working with signatures in tf.saved_model.save."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function as defun
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.saved_model import revived_types
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.training.tracking import base
+from tensorflow.python.util import compat
+from tensorflow.python.util import nest
+
+
+DEFAULT_SIGNATURE_ATTR = "_default_save_signature"
+SIGNATURE_ATTRIBUTE_NAME = "signatures"
+
+
+def _get_signature(function):
+  if (isinstance(function, (defun.Function, def_function.Function)) and
+      function._input_signature is not None):  # pylint: disable=protected-access
+    function = function.get_concrete_function()
+  if not isinstance(function, defun.ConcreteFunction):
+    return None
+  return function
+
+
+def _valid_signature(concrete_function):
+  """Returns whether concrete function can be converted to a signature."""
+  if not concrete_function.outputs:
+    # Functions without outputs don't make sense as signatures. We just don't
+    # have any way to run an Operation with no outputs as a SignatureDef in the
+    # 1.x style.
+    return False
+  try:
+    _normalize_outputs(concrete_function.structured_outputs, "unused", "unused")
+  except ValueError:
+    return False
+  return True
+
+
+def find_function_to_export(saveable_view):
+  """Function to export, None if no suitable function was found."""
+  # If the user did not specify signatures, check the root object for a function
+  # that can be made into a signature.
+  functions = saveable_view.list_functions(saveable_view.root)
+  signature = functions.get(DEFAULT_SIGNATURE_ATTR, None)
+  if signature is not None:
+    return signature
+
+  # TODO(andresp): Discuss removing this behaviour. It can lead to WTFs when a
+  # user decides to annotate more functions with tf.function and suddenly
+  # serving that model way later in the process stops working.
+  possible_signatures = []
+  for function in functions.values():
+    concrete = _get_signature(function)
+    if concrete is not None and _valid_signature(concrete):
+      possible_signatures.append(concrete)
+  if len(possible_signatures) == 1:
+    single_function = possible_signatures[0]
+    signature = _get_signature(single_function)
+    if signature and  _valid_signature(signature):
+      return signature
+  return None
+
+
+def canonicalize_signatures(signatures):
+  """Converts `signatures` into a dictionary of concrete functions."""
+  if signatures is None:
+    return {}
+  if not isinstance(signatures, collections.Mapping):
+    signatures = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signatures}
+  concrete_signatures = {}
+  for signature_key, function in signatures.items():
+    signature_function = _get_signature(function)
+    if signature_function is None:
+      raise ValueError(
+          ("Expected a TensorFlow function to generate a signature for, but "
+           "got {}. Only `tf.functions` with an input signature or "
+           "concrete functions can be used as a signature.").format(function))
+
+    # Re-wrap the function so that it returns a dictionary of Tensors. This
+    # matches the format of 1.x-style signatures.
+    # pylint: disable=cell-var-from-loop
+    @def_function.function
+    def signature_wrapper(**kwargs):
+      structured_outputs = signature_function(**kwargs)
+      return _normalize_outputs(
+          structured_outputs, signature_function.name, signature_key)
+    # TODO(b/123902469): Use ConcreteFunction.structured_inputs once their names
+    # always match keyword arguments.
+    tensor_spec_signature = {}
+    for keyword, tensor in zip(
+        signature_function._arg_keywords,  # pylint: disable=protected-access
+        signature_function.inputs):
+      keyword = compat.as_str(keyword)
+      tensor_spec_signature[keyword] = tensor_spec.TensorSpec.from_tensor(
+          tensor, name=keyword)
+    final_concrete = signature_wrapper.get_concrete_function(
+        **tensor_spec_signature)
+    # pylint: disable=protected-access
+    if len(final_concrete._arg_keywords) == 1:
+      # If there is only one input to the signature, a very common case, then
+      # ordering is unambiguous and we can let people pass a positional
+      # argument. Since SignatureDefs are unordered (protobuf "map") multiple
+      # arguments means we need to be keyword-only.
+      final_concrete._num_positional_args = 1
+    else:
+      final_concrete._num_positional_args = 0
+    # pylint: enable=protected-access
+    concrete_signatures[signature_key] = final_concrete
+    # pylint: enable=cell-var-from-loop
+  return concrete_signatures
+
+
+def _is_flat(sequence):
+  sequence_flat = nest.flatten(sequence)
+  try:
+    nest.assert_same_structure(sequence_flat, sequence)
+    return True
+  except ValueError:
+    return False
+  except TypeError:
+    return False
+
+
+def _normalize_outputs(outputs, function_name, signature_key):
+  """Construct an output dictionary from unnormalized function outputs."""
+  if isinstance(outputs, collections.Mapping):
+    for key, value in outputs.items():
+      if not isinstance(value, ops.Tensor):
+        raise ValueError(
+            ("Got a dictionary containing non-Tensor value {} for key {} "
+             "in the output of the function {} used to generate a SavedModel "
+             "signature. Dictionaries outputs for functions used as signatures "
+             "should have one Tensor output per string key.")
+            .format(value, key, compat.as_str_any(function_name)))
+    return outputs
+  else:
+    original_outputs = outputs
+    if not isinstance(outputs, collections.Sequence):
+      outputs = [outputs]
+    if not _is_flat(outputs):
+      raise ValueError(
+          ("Got non-flat outputs '{}' from '{}' for SavedModel "
+           "signature '{}'. Signatures have one Tensor per output, so "
+           "to have predictable names Python functions used to generate "
+           "these signatures should avoid outputting Tensors in nested "
+           "structures.")
+          .format(original_outputs, function_name, signature_key))
+    return {("output_{}".format(output_index)): output
+            for output_index, output
+            in enumerate(outputs)}
+
+
+# _SignatureMap is immutable to ensure that users do not expect changes to be
+# reflected in the SavedModel. Using public APIs, tf.saved_model.load() is the
+# only way to create a _SignatureMap and there is no way to modify it. So we can
+# safely ignore/overwrite ".signatures" attributes attached to objects being
+# saved if they contain a _SignatureMap. A ".signatures" attribute containing
+# any other type (e.g. a regular dict) will raise an exception asking the user
+# to first "del obj.signatures" if they want it overwritten.
+class _SignatureMap(collections.Mapping, base.Trackable):
+  """A collection of SavedModel signatures."""
+
+  def __init__(self):
+    self._signatures = {}
+
+  def _add_signature(self, name, concrete_function):
+    """Adds a signature to the _SignatureMap."""
+    # Ideally this object would be immutable, but restore is streaming so we do
+    # need a private API for adding new signatures to an existing object.
+    self._signatures[name] = concrete_function
+
+  def __getitem__(self, key):
+    return self._signatures[key]
+
+  def __iter__(self):
+    return iter(self._signatures)
+
+  def __len__(self):
+    return len(self._signatures)
+
+  def __repr__(self):
+    return "_SignatureMap({})".format(self._signatures)
+
+  def _list_functions_for_serialization(self):
+    return {
+        key: value for key, value in self.items()
+        if isinstance(value, (def_function.Function, defun.ConcreteFunction))
+    }
+
+
+revived_types.register_revived_type(
+    "signature_map",
+    lambda obj: isinstance(obj, _SignatureMap),
+    versions=[revived_types.VersionedTypeRegistration(
+        # Standard dependencies are enough to reconstruct the trackable
+        # items in dictionaries, so we don't need to save any extra information.
+        object_factory=lambda proto: _SignatureMap(),
+        version=1,
+        min_producer_version=1,
+        min_consumer_version=1,
+        setter=_SignatureMap._add_signature  # pylint: disable=protected-access
+    )])
+
+
+def create_signature_map(signatures):
+  """Creates an object containing `signatures`."""
+  signature_map = _SignatureMap()
+  for name, func in signatures.items():
+    # This true of any signature that came from canonicalize_signatures. Here as
+    # a sanity check on saving; crashing on load (e.g. in _add_signature) would
+    # be more problematic in case future export changes violated these
+    # assertions.
+    assert isinstance(func, defun.ConcreteFunction)
+    assert isinstance(func.structured_outputs, collections.Mapping)
+    # pylint: disable=protected-access
+    if len(func._arg_keywords) == 1:
+      assert 1 == func._num_positional_args
+    else:
+      assert 0 == func._num_positional_args
+    signature_map._add_signature(name, func)
+    # pylint: enable=protected-access
+  return signature_map
+
+
+def validate_saveable_view(saveable_view):
+  """Performs signature-related sanity checks on `saveable_view`."""
+  for name, dep in saveable_view.list_dependencies(
+      saveable_view.root):
+    if name == SIGNATURE_ATTRIBUTE_NAME:
+      if not isinstance(dep, _SignatureMap):
+        raise ValueError(
+            ("Exporting an object {} which has an attribute named "
+             "'{signatures}'. This is a reserved attribute used to store "
+             "SavedModel signatures in objects which come from "
+             "`tf.saved_model.load`. Delete this attribute "
+             "(e.g. 'del obj.{signatures}') before saving if this shadowing is "
+             "acceptable.").format(
+                 saveable_view.root,
+                 signatures=SIGNATURE_ATTRIBUTE_NAME))
+      break
diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py
index a82007fd545ca9e088411bcd5234477b8801e995..2e7b2080574e875233181a1476eb328a07e718c5 100644
--- a/tensorflow/python/saved_model/utils_impl.py
+++ b/tensorflow/python/saved_model/utils_impl.py
@@ -50,7 +50,7 @@ def build_tensor_info(tensor):
   Args:
     tensor: Tensor or SparseTensor whose name, dtype and shape are used to
         build the TensorInfo. For SparseTensors, the names of the three
-        constitutent Tensors are used.
+        constituent Tensors are used.
 
   Returns:
     A TensorInfo protocol buffer constructed based on the supplied argument.
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index b8ad31e88bd2998be1f4920845fdd3005cbb7cca..e483155dcfbc9e93c8b8aa28e83b6122ec99822e 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -13,21 +13,26 @@ load("//tensorflow:tensorflow.bzl", "py_binary")
 # Transitive dependencies of this target will be included in the pip package.
 py_library(
     name = "tools_pip",
+    data = [
+        ":freeze_graph",
+        ":import_pb_to_tensorboard",
+        ":inspect_checkpoint",
+        ":optimize_for_inference",
+        ":print_selective_registration_header",
+        ":saved_model_cli",
+        ":strip_unused",
+        # Include the TF upgrade script to users can run it directly after install TF
+        "//tensorflow/tools/compatibility:tf_upgrade_v2",
+    ],
     deps = [
-        ":freeze_graph_lib",
-        ":import_pb_to_tensorboard_lib",
-        ":inspect_checkpoint_lib",
-        ":optimize_for_inference_lib",
-        ":print_selective_registration_header_lib",
-        ":saved_model_cli_lib",
         ":saved_model_utils",
-        ":strip_unused_lib",
         # The following py_library are needed because
         # py_binary may not depend on them when --define=no_tensorflow_py_deps=true
         # is specified. See https://github.com/tensorflow/tensorflow/issues/22390
+        ":freeze_graph_lib",
+        ":optimize_for_inference_lib",
         ":selective_registration_header_lib",
-        # Include the TF upgrade script to users can run it directly after install TF
-        "//tensorflow/tools/compatibility:tf_upgrade_v2_lib",
+        ":strip_unused_lib",
     ],
 )
 
@@ -74,6 +79,13 @@ py_binary(
     name = "freeze_graph",
     srcs = ["freeze_graph.py"],
     srcs_version = "PY2AND3",
+    deps = [":freeze_graph_main_lib"],
+)
+
+py_library(
+    name = "freeze_graph_main_lib",
+    srcs = ["freeze_graph.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":freeze_graph_lib",
     ],
@@ -204,6 +216,13 @@ py_binary(
     name = "optimize_for_inference",
     srcs = ["optimize_for_inference.py"],
     srcs_version = "PY2AND3",
+    deps = [":optimize_for_inference_main_lib"],
+)
+
+py_library(
+    name = "optimize_for_inference_main_lib",
+    srcs = ["optimize_for_inference.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":optimize_for_inference_lib",
         "//tensorflow/core:protos_all_py",
diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl
index 5095f9fdbe531992b742a13b3d237f00c9b34b20..a1dd37f42f2d29cfe964e9155e31d4d335028019 100644
--- a/tensorflow/python/tools/api/generator/api_init_files.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files.bzl
@@ -15,6 +15,7 @@ TENSORFLOW_API_INIT_FILES = [
     "debugging/__init__.py",
     "distribute/__init__.py",
     "distribute/cluster_resolver/__init__.py",
+    "distribute/experimental/__init__.py",
     "dtypes/__init__.py",
     "errors/__init__.py",
     "experimental/__init__.py",
@@ -27,7 +28,8 @@ TENSORFLOW_API_INIT_FILES = [
     "linalg/__init__.py",
     "lite/__init__.py",
     "lite/constants/__init__.py",
-    "losses/__init__.py",
+    "lite/experimental/__init__.py",
+    "lite/experimental/nn/__init__.py",
     "math/__init__.py",
     "nest/__init__.py",
     "nn/__init__.py",
diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
index 01bf12ebe70dcfe90f8040af899898ee147e0ae0..43ef09b9fa8e12e2e5bd16e11e117a1909a5bf11 100644
--- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
@@ -16,6 +16,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "debugging/__init__.py",
     "distribute/__init__.py",
     "distribute/cluster_resolver/__init__.py",
+    "distribute/experimental/__init__.py",
     "distributions/__init__.py",
     "dtypes/__init__.py",
     "errors/__init__.py",
@@ -33,6 +34,8 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "linalg/__init__.py",
     "lite/__init__.py",
     "lite/constants/__init__.py",
+    "lite/experimental/__init__.py",
+    "lite/experimental/nn/__init__.py",
     "logging/__init__.py",
     "losses/__init__.py",
     "manip/__init__.py",
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index 06a6e7dc086f27344bc24184eab13aa20208fbbb..ab82ee9fd410e646c0c1f9b302d47bb3021bb514 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -240,7 +240,7 @@ def freeze_graph_with_def_protos(input_graph_def,
 
 
 def _parse_input_graph_proto(input_graph, input_binary):
-  """Parser input tensorflow graph into GraphDef proto."""
+  """Parses input tensorflow graph into GraphDef proto."""
   if not gfile.Exists(input_graph):
     print("Input graph file '" + input_graph + "' does not exist!")
     return -1
@@ -255,7 +255,7 @@ def _parse_input_graph_proto(input_graph, input_binary):
 
 
 def _parse_input_meta_graph_proto(input_graph, input_binary):
-  """Parser input tensorflow graph into MetaGraphDef proto."""
+  """Parses input tensorflow graph into MetaGraphDef proto."""
   if not gfile.Exists(input_graph):
     print("Input meta graph file '" + input_graph + "' does not exist!")
     return -1
@@ -271,7 +271,7 @@ def _parse_input_meta_graph_proto(input_graph, input_binary):
 
 
 def _parse_input_saver_proto(input_saver, input_binary):
-  """Parser input tensorflow Saver into SaverDef proto."""
+  """Parses input tensorflow Saver into SaverDef proto."""
   if not gfile.Exists(input_saver):
     print("Input saver file '" + input_saver + "' does not exist!")
     return -1
diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a76c6205648d75146782684c171cc345e7fff394
--- /dev/null
+++ b/tensorflow/python/tpu/BUILD
@@ -0,0 +1,334 @@
+# Description: Operations defined for Cloud TPUs
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_library",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+    "tf_py_test",
+)
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+
+licenses(["notice"])  # Apache 2.0
+
+package(
+    default_visibility = [
+        "//cloud/vmm/testing/tests/tpu:__subpackages__",
+        "//knowledge/cerebra/sense/im2query:__subpackages__",
+        "//learning/brain:__subpackages__",
+        "//learning/deepmind:__subpackages__",
+        "//medical/pathology:__subpackages__",
+        "//tensorflow:__subpackages__",
+        "//vr/perception:__subpackages__",
+    ],
+)
+
+py_library(
+    name = "tpu_py",
+    srcs = ["ops/tpu_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:tpu_ops_gen",
+    ],
+)
+
+py_library(
+    name = "async_checkpoint",
+    srcs = ["async_checkpoint.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:summary_ops_v2",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+)
+
+py_library(
+    name = "tpu_estimator",
+    srcs = [
+        "_tpu_estimator_embedding.py",
+        "error_handling.py",
+        "tpu_config.py",
+        "tpu_context.py",
+        "tpu_estimator.py",
+        "util.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":async_checkpoint",
+        ":feature_column",
+        ":functional",
+        ":tpu_embedding",
+        ":tpu_lib",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:function",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:session",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:summary_ops_v2",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator:util",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "functional",
+    srcs = ["functional.py"],
+    srcs_version = "PY2AND3",
+    visibility = [
+        "//visibility:public",
+    ],
+    deps = [
+        "//tensorflow/python:tpu_ops_gen",
+    ],
+)
+
+py_library(
+    name = "tpu",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_column",
+        ":tpu_embedding",
+        ":tpu_estimator",
+        ":tpu_lib",
+    ],
+)
+
+py_library(
+    name = "tpu_lib",
+    srcs = [
+        "__init__.py",
+        "bfloat16.py",
+        "device_assignment.py",
+        "session_support.py",
+        "tensor_tracer.py",
+        "topology.py",
+        "tpu.py",
+        "tpu_feed.py",
+        "tpu_function.py",
+        "tpu_optimizer.py",
+        "tpu_sharding.py",
+        "tpu_system_metadata.py",
+        "training_loop.py",
+        "xla.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":datasets",
+        ":functional",
+        ":tpu_py",
+        "//tensorflow/compiler/xla/experimental/xla_sharding",
+        "//tensorflow/compiler/xla/python_api:xla_shape",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/core/protobuf/tpu:compilation_result_proto_py",
+        "//tensorflow/core/protobuf/tpu:dynamic_padding_proto_py",
+        "//tensorflow/core/protobuf/tpu:optimization_parameters_proto_py",
+        "//tensorflow/core/protobuf/tpu:topology_proto_py",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_configuration_proto_py",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_output_layout_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:control_flow_util",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tpu_ops_gen",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/ops/losses",
+        "//tensorflow/python/tpu/profiler",
+    ],
+)
+
+py_library(
+    name = "datasets",
+    srcs = [
+        "datasets.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+tf_py_test(
+    name = "datasets_test",
+    size = "medium",
+    srcs = ["datasets_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        ":datasets",
+    ],
+    grpc_enabled = True,
+    shard_count = 4,
+    tags = ["no_oss"],
+)
+
+tf_py_test(
+    name = "tpu_test",
+    size = "small",
+    srcs = ["tpu_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:layers",
+    ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
+)
+
+tf_py_test(
+    name = "tpu_sharding_test",
+    size = "small",
+    srcs = ["tpu_sharding_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+    ],
+)
+
+tf_py_test(
+    name = "bfloat16_test",
+    size = "small",
+    srcs = ["bfloat16_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+    ],
+)
+
+tf_py_test(
+    name = "tpu_infeed_test",
+    size = "small",
+    srcs = ["tpu_infeed_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+tf_py_test(
+    name = "tpu_config_test",
+    size = "small",
+    srcs = ["tpu_config_test.py"],
+    additional_deps = [
+        ":tpu_estimator",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+tf_py_test(
+    name = "tpu_estimator_signals_test",
+    size = "small",
+    srcs = ["tpu_estimator_signals_test.py"],
+    additional_deps = [
+        ":tpu_estimator",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+    # TODO(jhseu): Remove. Fails in OSS on Python 3.
+    tags = ["no_oss"],
+)
+
+tf_py_test(
+    name = "topology_test",
+    size = "medium",
+    srcs = ["topology_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+py_library(
+    name = "tpu_embedding",
+    srcs = [
+        "tpu_embedding.py",
+        "tpu_embedding_gradient.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tpu_lib",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_configuration_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:partitioned_variables",
+        "//tensorflow/python:tpu_ops_gen",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "feature_column",
+    srcs = ["feature_column.py"],
+    deps = [
+        ":tpu_lib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
+    ],
+)
+
+tf_py_test(
+    name = "feature_column_test",
+    srcs = [
+        "feature_column_test.py",
+    ],
+    additional_deps = [
+        ":feature_column",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
+    ],
+    main = "feature_column_test.py",
+)
diff --git a/tensorflow/python/tpu/__init__.py b/tensorflow/python/tpu/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dffd7064b19f353aed6afa3ad383564643a4a90
--- /dev/null
+++ b/tensorflow/python/tpu/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Ops related to Tensor Processing Units."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/python/tpu/_tpu_estimator_embedding.py b/tensorflow/python/tpu/_tpu_estimator_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..08e0e968a648ebdd51de1719cb8c4784c8fdf286
--- /dev/null
+++ b/tensorflow/python/tpu/_tpu_estimator_embedding.py
@@ -0,0 +1,334 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""Tooling for support TPU embedding in TPUEstimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.feature_column import feature_column as core_fc
+from tensorflow.python.feature_column import feature_column_lib as core_fc_lib
+from tensorflow.python.tpu import feature_column as tpu_fc
+from tensorflow.python.tpu import tpu_embedding
+
+# pylint: disable=protected-access
+_TPU_EMBEDDING_COLUMN_CLASSES = (tpu_fc._TPUEmbeddingColumn,
+                                 tpu_fc._TPUSharedEmbeddingColumn)
+_EMBEDDING_COLUMN_CLASSES = (core_fc._EmbeddingColumn,
+                             core_fc_lib.EmbeddingColumn,
+                             core_fc._SharedEmbeddingColumn)
+_SUPPORTED_FEATURE_COLUMNS = (core_fc._NumericColumn, core_fc_lib.NumericColumn)
+
+# pylint: enable=protected-access
+
+_TABLE_NAME_PREFIX = 'tbl_'
+_LEN_TABLE_NAME_PREFIX = len(_TABLE_NAME_PREFIX)
+
+
+def _get_table_name_from_embedding_var_name(embedding_var_name):
+  return '{}{}'.format(_TABLE_NAME_PREFIX, embedding_var_name)
+
+
+def _get_embedding_var_name_from_table_name(table_name):
+  return table_name[_LEN_TABLE_NAME_PREFIX:]
+
+
+def _get_embedding_variable_name(scope_name, var_name):
+  return '{}/{}'.format(scope_name, var_name)
+
+
+def _get_slot_variable_names(scope_name, var_name, optimization_parameters):
+  """Return embedding variable names which are consistent with CPU runs."""
+  if isinstance(optimization_parameters, tpu_embedding.AdagradParameters):
+    return tpu_embedding.AdagradSlotVariableName(
+        '{}/{}/Adagrad'.format(scope_name, var_name)
+    )
+  elif isinstance(optimization_parameters, tpu_embedding.AdamParameters):
+    return tpu_embedding.AdamSlotVariableNames(
+        '{}/{}/Adam/m'.format(scope_name, var_name),
+        '{}/{}/Adam/v'.format(scope_name, var_name)
+    )
+  elif isinstance(optimization_parameters,
+                  tpu_embedding.StochasticGradientDescentParameters):
+    return None
+  else:
+    raise ValueError('Support to infer full variable name '
+                     'for optimization_parameter {} has not been added.'
+                     .format(optimization_parameters))
+
+
+def get_full_variable_names(
+    graph, table_to_config_dict, optimization_parameters):
+  """Return embedding variable names and slot variables which are consistent with CPU runs."""
+  collection = graph.get_collection_ref(tpu_fc._TPU_FC_TO_SCOPE)  # pylint: disable=protected-access
+  if not collection:
+    raise RuntimeError(
+        'Embedding feature column did not capture any thing. Make sure the '
+        'feature columns passed to TPUEstimator constructor is properly '
+        'used in model_fn.')
+
+  embedding_variable_name_by_table = {}
+  slot_variable_names_by_table = {}
+  for table_name in table_to_config_dict:
+    embedding_var_name = _get_embedding_var_name_from_table_name(table_name)
+    (scope_name, var_name) = collection[0][embedding_var_name]
+    embedding_variable_name_by_table[table_name] = (
+        _get_embedding_variable_name(scope_name, var_name))
+    slot_variable_names_by_table[table_name] = _get_slot_variable_names(
+        scope_name, var_name, optimization_parameters)
+
+  graph.clear_collection(tpu_fc._TPU_FC_TO_SCOPE)  # pylint: disable=protected-access
+  return embedding_variable_name_by_table, slot_variable_names_by_table
+
+
+def get_tpu_embedding_config_from_feature_columns(feature_columns):
+  """Create configs for TPUEmbedding from a list of feature columns.
+
+  This function will place one embedding tensor per table and the return is
+  intended to be used as input to TPUEmbedding.
+
+  Args:
+    feature_columns: a list of supported feature columns.
+
+  Returns:
+    A pair of dicts, the first maps tables to their config, the second maps
+    features to tables.
+  """
+
+  allowed = (tpu_fc._TPUEmbeddingColumn, tpu_fc._TPUSharedEmbeddingColumn)  # pylint: disable=protected-access
+
+  for column in feature_columns:
+    if not isinstance(column, allowed):
+      raise TypeError(
+          'Unsupported feature column {}. Supported types are {}.'.format(
+              type(column), allowed))
+
+  table_to_config = {}
+  feature_to_table = {}
+  for column in feature_columns:
+    feature_name = column.get_feature_key_name()
+    table_name = _get_table_name_from_embedding_var_name(
+        column.get_embedding_var_name())
+    if feature_name in feature_to_table:
+      raise ValueError(
+          'Feature column {} is used with multiple embeddings and this is '
+          'not supported.'.format(feature_name))
+    feature_to_table[feature_name] = table_name
+    vocabulary_size, dimension = column.get_embedding_table_size()
+    table_to_config[table_name] = tpu_embedding.TableConfig(
+        vocabulary_size=vocabulary_size,
+        dimension=dimension,
+        initializer=column.get_initializer(),
+        combiner=column.get_combiner())
+
+  return table_to_config, feature_to_table
+
+
+def _get_tpu_embedding_optimization_parameters(embedding_config_spec):
+  """Get tpu_embedding._OptimizationParameters from EmbeddingConfigSpec."""
+  if embedding_config_spec.optimizer_type == 'adagrad':
+    return tpu_embedding.AdagradParameters(
+        embedding_config_spec.learning_rate,
+        embedding_config_spec.adagrad_initial_accumulator,
+        embedding_config_spec.use_gradient_accumulation)
+  elif embedding_config_spec.optimizer_type == 'sgd':
+    return tpu_embedding.StochasticGradientDescentParameters(
+        embedding_config_spec.learning_rate,
+        embedding_config_spec.use_gradient_accumulation)
+  elif embedding_config_spec.optimizer_type == 'adam':
+    return tpu_embedding.AdamParameters(
+        embedding_config_spec.learning_rate,
+        embedding_config_spec.adam_parameters.beta1,
+        embedding_config_spec.adam_parameters.beta2,
+        embedding_config_spec.adam_parameters.epsilon,
+        use_gradient_accumulation=embedding_config_spec
+        .use_gradient_accumulation)
+  else:
+    raise ValueError('optimizer_type must be adagrad or sgd or adam for now.')
+
+
+AdamParameters = collections.namedtuple('AdamParameters',
+                                        ['beta1', 'beta2', 'epsilon'])
+
+
+# TODO(shizhiw): Improve the API to support more optimizer parameters in API.
+class EmbeddingConfigSpec(
+    collections.namedtuple('EmbeddingConfigSpec', [
+        'feature_columns', 'learning_rate', 'optimizer_type',
+        'adagrad_initial_accumulator', 'clipping_limit',
+        'use_gradient_accumulation', 'adam_parameters'
+    ])):
+  """Class to keep track of embedding config specification."""
+
+  def __new__(cls,
+              feature_columns,
+              learning_rate,
+              optimizer_type='adagrad',
+              adagrad_initial_accumulator=None,
+              clipping_limit=None,
+              use_gradient_accumulation=False,
+              adam_parameters=None):
+    """Creates an EmbeddingConfigSpec instance.
+
+    Args:
+      feature_columns: All `FeatureColumn`s used by model.
+      learning_rate: embedding optimizer learning rate.
+      optimizer_type: (String) Name of the optimizer for embedding gradients
+        updates. Must be either 'adagrad' ( `tf.train.AdagradOptimizer`, default
+        value), 'sgd' (`tf.train.GradientDescentOptimizer`), or 'adam'
+        (`tf.contrib.opt.LazyAdamOptimizer`) for lazy Adam. This optimizer will
+        be applied to all embedding variables specified by `feature_columns`.
+      adagrad_initial_accumulator: Initial accumulator for Adagrad. Used when
+        optimizer_type is 'adagrad'. Default is `0.1`.
+      clipping_limit: (Optional) Clipping limit (absolute value).
+      use_gradient_accumulation: (Experimental) Whether to accumulate the
+        gradients across TPU embedding mini-batches. Gradient accumulation does
+        not affect SGD and therefore this is applicable only for Adagrad.
+      adam_parameters: AdamParameters. Used when optimizer_type is 'adam'.
+        Default is 0.9 for beta1, 0.999 for beta2 and 1e-8 for epsilon.
+
+    Returns:
+      An EmbeddingConfigSpec instance.
+
+    Raises:
+      ValueError: If the feature_columns are not specified.
+      TypeError: If the feature columns are not of ths correct type (one of
+        _SUPPORTED_FEATURE_COLUMNS, _TPU_EMBEDDING_COLUMN_CLASSES OR
+        _EMBEDDING_COLUMN_CLASSES).
+      ValueError: If use_gradient_accumulation is True for SGD.
+      ValueError: If `optimizer_type` is not one of "adagrad" or "sgd" or
+        "adam".
+    """
+    if not feature_columns:
+      raise ValueError('`feature_columns` cannot be `None` or empty.')
+
+    # It is unknown at this moment, whether the TPUEstimator is running in CPU
+    # or TPU mode. So allow non-TPU embedding columns also.
+    supported_classes = tuple(
+        list(_SUPPORTED_FEATURE_COLUMNS) + list(_TPU_EMBEDDING_COLUMN_CLASSES) +
+        list(_EMBEDDING_COLUMN_CLASSES))
+
+    for column in feature_columns:
+      if not isinstance(column, supported_classes):
+        raise TypeError(
+            'All feature columns must be supported types in {}. Got {}'.format(
+                supported_classes, type(column)))
+
+    if optimizer_type == 'adagrad':
+      if adagrad_initial_accumulator is None:
+        adagrad_initial_accumulator = 0.1
+      if adagrad_initial_accumulator <= 0:
+        raise ValueError('Adagrad initial_accumulator must be positive')
+    elif optimizer_type == 'sgd':
+      if use_gradient_accumulation:
+        raise ValueError('Gradient accumulation makes sense for Adagrad only.')
+    elif optimizer_type == 'adam':
+      if adam_parameters is None:
+        adam_parameters = AdamParameters(0.9, 0.999, 1e-8)
+      if adam_parameters.beta1 < 0. or adam_parameters.beta1 >= 1.:
+        raise ValueError('beta1 must be between 0. and 1; got {}.'.format(
+            adam_parameters.beta1))
+      if adam_parameters.beta2 < 0. or adam_parameters.beta2 >= 1.:
+        raise ValueError('beta2 must be between 0. and 1; got {}.'.format(
+            adam_parameters.beta2))
+      if adam_parameters.epsilon <= 0.:
+        raise ValueError('epsilon must be positive; got {}.'.format(
+            adam_parameters.epsilon))
+    else:
+      raise ValueError('optimizer_type must be adagrad or sgd or adam for now.')
+
+    return super(EmbeddingConfigSpec, cls).__new__(
+        cls,
+        feature_columns=feature_columns,
+        learning_rate=learning_rate,
+        optimizer_type=optimizer_type,
+        adagrad_initial_accumulator=adagrad_initial_accumulator,
+        clipping_limit=clipping_limit,
+        use_gradient_accumulation=use_gradient_accumulation,
+        adam_parameters=adam_parameters)
+
+
+class EmbeddingConfig(object):
+  """This is the internal immutable object for embedding config.
+
+  `_EmbeddingConfig` is responsible to _translate_ user provided
+  `EmbeddingConfigSpec` to internal data structures, mostly constructor
+  arguments of `TPUEmbedding`.
+  """
+
+  def __init__(self, embedding_config_spec, train_batch_size, eval_batch_size,
+               num_hosts, num_cores, master):
+    self._embedding_config_spec = embedding_config_spec
+    self._train_batch_size = train_batch_size
+    self._eval_batch_size = eval_batch_size
+    self._num_hosts = num_hosts
+    self._num_cores = num_cores
+    self._master = master
+
+    self._table_to_config_dict, self._feature_to_table_dict = (
+        get_tpu_embedding_config_from_feature_columns(
+            embedding_config_spec.feature_columns))
+    self._optimization_parameters = _get_tpu_embedding_optimization_parameters(
+        self._embedding_config_spec)
+    self._mode_to_tpu_embedding_dict = {}
+    self.dummy_table_variables = None
+
+  def has_embedding_tables(self):
+    return bool(self._table_to_config_dict)
+
+  def _create_tpu_embedding(self, mode):
+    """Create tpu_embedding.TPUEmbedding based on mode."""
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      batch_size = self._train_batch_size
+    else:
+      batch_size = self._eval_batch_size
+
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      tpu_embedding_mode = tpu_embedding.TRAINING
+    elif (mode == model_fn_lib.ModeKeys.EVAL or
+          mode == model_fn_lib.ModeKeys.PREDICT):
+      tpu_embedding_mode = tpu_embedding.INFERENCE
+    else:
+      raise ValueError('Mode {} is not supported.'.format(mode))
+
+    tpu_embedding_ = tpu_embedding.TPUEmbedding(
+        self._table_to_config_dict,
+        self._feature_to_table_dict,
+        batch_size,
+        tpu_embedding_mode,
+        self._master,
+        self._optimization_parameters,
+    )
+    return tpu_embedding_
+
+  def get_tpu_embedding(self, mode):
+    if mode not in self._mode_to_tpu_embedding_dict:
+      self._mode_to_tpu_embedding_dict[mode] = (
+          self._create_tpu_embedding(mode))
+    return self._mode_to_tpu_embedding_dict[mode]
+
+
+def split_inputs(ctx, features, labels):
+  """Splits the dense and sparse tensors inside the features and labels."""
+  sparse_features = collections.OrderedDict()
+  if ctx.embedding_config:
+    tpu_embedding_ = ctx.embedding_config.tpu_embedding
+    for feature_key in tpu_embedding_.feature_to_table_dict:
+      sparse_features[feature_key] = features.pop(feature_key)
+
+  return features, labels, sparse_features
diff --git a/tensorflow/python/tpu/async_checkpoint.py b/tensorflow/python/tpu/async_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b09ce173a64ba3f93ec019c8fd65dc4710f0fcf
--- /dev/null
+++ b/tensorflow/python/tpu/async_checkpoint.py
@@ -0,0 +1,212 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Hook for asynchronous checkpointing.
+
+This hook dispatches checkpoint writing operations in a separate thread to
+allow execution to continue on the main thread.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import threading
+import time
+
+from tensorflow.core.util.event_pb2 import SessionLog
+from tensorflow.python.framework import meta_graph
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import training_util
+from tensorflow.python.training.session_run_hook import SessionRunArgs
+from tensorflow.python.training.summary_io import SummaryWriterCache
+
+
+class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
+  """Saves checkpoints every N steps or seconds."""
+
+  def __init__(self,
+               checkpoint_dir,
+               save_secs=None,
+               save_steps=None,
+               saver=None,
+               checkpoint_basename="model.ckpt",
+               scaffold=None,
+               listeners=None):
+    """Initializes a `CheckpointSaverHook`.
+
+    Args:
+      checkpoint_dir: `str`, base directory for the checkpoint files.
+      save_secs: `int`, save every N secs.
+      save_steps: `int`, save every N steps.
+      saver: `Saver` object, used for saving.
+      checkpoint_basename: `str`, base name for the checkpoint files.
+      scaffold: `Scaffold`, use to get saver object.
+      listeners: List of `CheckpointSaverListener` subclass instances. Used for
+        callbacks that run immediately before or after this hook saves the
+        checkpoint.
+
+    Raises:
+      ValueError: One of `save_steps` or `save_secs` should be set.
+      ValueError: At most one of `saver` or `scaffold` should be set.
+    """
+    logging.info("Create AsyncCheckpointSaverHook.")
+    if saver is not None and scaffold is not None:
+      raise ValueError("You cannot provide both saver and scaffold.")
+    self._saver = saver
+    self._save_thread = None
+    self._write_graph_thread = None
+    self._checkpoint_dir = checkpoint_dir
+    self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
+    self._scaffold = scaffold
+    self._timer = basic_session_run_hooks.SecondOrStepTimer(
+        every_secs=save_secs, every_steps=save_steps)
+    self._listeners = listeners or []
+    self._steps_per_run = 1
+    self._summary_writer = None
+    self._global_step_tensor = None
+
+    self._last_checkpoint_step = None
+
+  def _set_steps_per_run(self, steps_per_run):
+    self._steps_per_run = steps_per_run
+
+  def begin(self):
+    self._summary_writer = SummaryWriterCache.get(self._checkpoint_dir)
+    self._global_step_tensor = training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
+    if self._global_step_tensor is None:
+      raise RuntimeError(
+          "Global step should be created to use CheckpointSaverHook.")
+    for l in self._listeners:
+      l.begin()
+
+  def after_create_session(self, session, coord):
+    global_step = session.run(self._global_step_tensor)
+
+    # We do write graph and saver_def at the first call of before_run.
+    # We cannot do this in begin, since we let other hooks to change graph and
+    # add variables in begin. Graph is finalized after all begin calls.
+    def _write_graph_fn(self):
+      training_util.write_graph(
+          ops.get_default_graph().as_graph_def(add_shapes=True),
+          self._checkpoint_dir, "graph.pbtxt")
+    self._write_graph_thread = threading.Thread(target=_write_graph_fn,
+                                                args=[self])
+    self._write_graph_thread.start()
+
+    saver_def = self._get_saver().saver_def if self._get_saver() else None
+    graph = ops.get_default_graph()
+    meta_graph_def = meta_graph.create_meta_graph_def(
+        graph_def=graph.as_graph_def(add_shapes=True), saver_def=saver_def)
+    self._summary_writer.add_graph(graph)
+    self._summary_writer.add_meta_graph(meta_graph_def)
+    # The checkpoint saved here is the state at step "global_step".
+    self._save(session, global_step)
+    self._timer.update_last_triggered_step(global_step)
+
+  def before_run(self, run_context):  # pylint: disable=unused-argument
+    return SessionRunArgs(self._global_step_tensor)
+
+  def after_run(self, run_context, run_values):
+    global_step = run_context.session.run(self._global_step_tensor)
+    if self._timer.should_trigger_for_step(global_step):
+      self._timer.update_last_triggered_step(global_step)
+      logging.info("Triggering checkpoint. %s", global_step)
+      if self._save(run_context.session, global_step):
+        run_context.request_stop()
+
+  def end(self, session):
+    if self._save_thread:
+      logging.info("Waiting for any pending checkpoints to finish.")
+      self._save_thread.join()
+    if self._write_graph_thread:
+      logging.info("Waiting for any pending write_graph to finish.")
+      self._write_graph_thread.join()
+
+    last_step = session.run(self._global_step_tensor)
+
+    if self._last_checkpoint_step != last_step:
+      self._save(session, last_step, asynchronous=False)
+
+    for l in self._listeners:
+      l.end(session, last_step)
+
+  def _save(self, session, step, asynchronous=True):
+    """Saves the latest checkpoint, returns should_stop."""
+
+    # Skip saving on step 0
+    if step == 0:
+      return
+
+    def _save_fn():
+      """Run the saver process."""
+      logging.info("Saving checkpoints for %d into %s.", step, self._save_path)
+
+      start_time = time.time()
+      for l in self._listeners:
+        l.before_save(session, step)
+
+      self._get_saver().save(session, self._save_path, global_step=step)
+      self._summary_writer.add_session_log(
+          SessionLog(
+              status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path),
+          step)
+
+      for l in self._listeners:
+        l.after_save(session, step)
+
+      end_time = time.time()
+      logging.info("Checkpoint actual writing time: (%.3f sec)",
+                   end_time - start_time)
+      logging.info("Checkpoint finished for %d into %s.", step, self._save_path)
+
+    if not asynchronous:
+      self._last_checkpoint_step = step
+      _save_fn()
+      return
+
+    if self._save_thread is not None:
+      self._save_thread.join(timeout=0.1)
+      if self._save_thread.is_alive():
+        logging.info("Saver thread still in progress, skipping checkpoint.")
+        return
+
+    self._last_checkpoint_step = step
+    self._save_thread = threading.Thread(target=_save_fn)
+    self._save_thread.start()
+
+  def _get_saver(self):
+    if self._saver is not None:
+      return self._saver
+    elif self._scaffold is not None:
+      return self._scaffold.saver
+
+    # Get saver from the SAVERS collection if present.
+    collection_key = ops.GraphKeys.SAVERS
+    savers = ops.get_collection(collection_key)
+    if not savers:
+      raise RuntimeError(
+          "No items in collection {}. Please add a saver to the collection "
+          "or provide a saver or scaffold.".format(collection_key))
+    elif len(savers) > 1:
+      raise RuntimeError(
+          "More than one item in collection {}. "
+          "Please indicate which one to use by passing it to the constructor."
+          .format(collection_key))
+
+    self._saver = savers[0]
+    return savers[0]
diff --git a/tensorflow/python/tpu/bfloat16.py b/tensorflow/python/tpu/bfloat16.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa74f651aa63c72d14eb78c8af479263810e9b7d
--- /dev/null
+++ b/tensorflow/python/tpu/bfloat16.py
@@ -0,0 +1,77 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper context for running models with bfloat16."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import tf_contextlib
+
+
+def _get_custom_getter():
+  """Returns a custom getter that this class's methods must be called under.
+
+  All methods of this class must be called under a variable scope that was
+  passed this custom getter. Example:
+
+  ```python
+  network = ConvNetBuilder(...)
+  with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
+    network.conv(...)
+    # Call more methods of network here
+  ```
+
+  Currently, this custom getter only does anything if self.use_tf_layers is
+  True. In that case, it causes variables to be stored as dtype
+  self.variable_type, then casted to the requested dtype, instead of directly
+  storing the variable as the requested dtype.
+  """
+
+  def inner_custom_getter(getter, *args, **kwargs):
+    """Custom getter that forces variables to have type self.variable_type."""
+    cast_to_bfloat16 = False
+    requested_dtype = kwargs['dtype']
+    if requested_dtype == dtypes.bfloat16:
+      # Only change the variable dtype if doing so does not decrease variable
+      # precision.
+      kwargs['dtype'] = dtypes.float32
+      cast_to_bfloat16 = True
+    var = getter(*args, **kwargs)
+    # This if statement is needed to guard the cast, because batch norm
+    # assigns directly to the return value of this custom getter. The cast
+    # makes the return value not a variable so it cannot be assigned. Batch
+    # norm variables are always in fp32 so this if statement is never
+    # triggered for them.
+    if cast_to_bfloat16:
+      var = math_ops.cast(var, dtypes.bfloat16)
+    return var
+
+  return inner_custom_getter
+
+
+@tf_contextlib.contextmanager
+def bfloat16_scope():
+  """Scope class for bfloat16 variables so that the model uses custom getter.
+
+  This enables variables to be read as bfloat16 type when using get_variable.
+  """
+  with variable_scope.variable_scope(
+      '', custom_getter=_get_custom_getter()) as varscope:
+    yield varscope
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py b/tensorflow/python/tpu/bfloat16_test.py
similarity index 96%
rename from tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
rename to tensorflow/python/tpu/bfloat16_test.py
index 26fd3768278cacd076e5fee8bdad75d0486678d0..3308e01700770c6ab4b0616a83b1207e49c82777 100644
--- a/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
+++ b/tensorflow/python/tpu/bfloat16_test.py
@@ -19,11 +19,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import bfloat16
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import variable_scope
-
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import bfloat16
 
 
 class BFloat16ScopeTest(test.TestCase):
diff --git a/tensorflow/python/tpu/datasets.py b/tensorflow/python/tpu/datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc0cd41d210ac6f8de1b20ebf744ee1e1dd04137
--- /dev/null
+++ b/tensorflow/python/tpu/datasets.py
@@ -0,0 +1,191 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Library of Cloud TPU helper functions for data loading."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import interleave_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import functional_ops
+
+
+def _TextLineDataset(filename):
+  buffer_size = 8 * 1024 * 1024  # 8 MiB per file
+  dataset = readers.TextLineDataset(filename, buffer_size=buffer_size)
+  return dataset
+
+
+def _TFRecordDataset(filename):
+  buffer_size = 8 * 1024 * 1024  # 8 MiB per file
+  dataset = readers.TFRecordDataset(filename, buffer_size=buffer_size)
+  return dataset
+
+
+_FILETYPE_MAP = {
+    'tfrecord': _TFRecordDataset,
+    'textline': _TextLineDataset,
+    'text': _TextLineDataset,
+}
+
+
+def StreamingFilesDataset(files,
+                          filetype=None,
+                          file_reader_job=None,
+                          worker_job=None,
+                          num_epochs=None,
+                          filename_shuffle_buffer_size=None,
+                          num_parallel_reads=None,
+                          batch_transfer_size=None,
+                          sloppy=None):
+  """StreamingFilesDataset constructs a dataset to stream from workers (GCE VM).
+
+  Because Cloud TPUs are allocated over the network, a Cloud TPU cannot read
+  files local to your GCE VM. In order to train using files stored on your local
+  VM (e.g. on local SSD for extreme performance), use the StreamingFilesDataset
+  helper to generate a dataset to feed your Cloud TPU with files from your GCE
+  VM.
+
+  The resulting dataset may return an OutOfRangeError if there are no files
+  found as a result of the fileglob expansion.
+
+  Note: StreamingFilesDataset assumes that the session is using a
+  TPUClusterResolver and has therefore a worker and a coordinator job. File
+  loading will be done on the coordinator job.
+
+  Args:
+    files: A string glob to match files, or a `tf.data.Dataset` generating file
+      names.
+    filetype: A string (one of 'tfrecord', or 'textline') or a single-argument
+      TensorFlow function that when given a filename returns a dataset.
+    file_reader_job: An optional string that corresponds to the job that should
+      perform the file reads.
+    worker_job: An optional string that corresponds to the job that should
+      process the tensors (i.e. your GPU or TPU worker).
+    num_epochs: The number of epochs through the training set that should be
+      generated. By default, it will repeat infinitely.
+    filename_shuffle_buffer_size: An optional integer whose value controls the
+      shuffling of the file names. If you would like to read from the files in
+      the same order, set to 0 or False.
+    num_parallel_reads: An optional integer controlling the number of files to
+      read from concurrently. (Set to 1 for no parallelism.)
+    batch_transfer_size: An optional integer controlling the batching used to
+      amortize the remote function invocation overhead. Set to a very large
+      number to increase throughput. Set to a very small number to reduce memory
+      consumption. Set to False to skip batching.
+    sloppy: (Optional.) If `False`, read input data while maintaining a
+      deterministic order. (This may have significant performance impacts.)
+      sloppy defaults to: True.
+  Returns:
+    A `tf.data.Dataset` with an infinite stream of elements generated by a
+    parallel interleaving of the set of files matched (or generated) by `files`
+    with a type is the output of the dataset specified by `filetype`.
+
+  Raises:
+    ValueError: if any argument is not of the expected type.
+  """
+  if filetype is None:
+    filetype = 'tfrecord'
+
+  if isinstance(filetype, str):
+    if filetype not in _FILETYPE_MAP:
+      raise ValueError('Unexpected filetype: %s' % filetype)
+    reader_fn = _FILETYPE_MAP[filetype]
+  elif callable(filetype):
+    reader_fn = filetype
+  else:
+    raise ValueError('filetype should be a string or a callable')
+
+  file_reader_job = file_reader_job or 'coordinator'
+
+  worker_job = worker_job or 'worker'
+
+  if filename_shuffle_buffer_size is None:
+    filename_shuffle_buffer_size = 4096
+
+  num_parallel_reads = num_parallel_reads or 8
+
+  if batch_transfer_size is None:
+    batch_transfer_size = 256
+
+  if sloppy is None:
+    sloppy = True
+
+  with ops.device('/job:%s' % file_reader_job):
+    if isinstance(files, str):
+      source_dataset = dataset_ops.Dataset.list_files(files)
+    elif isinstance(files, dataset_ops.DatasetV2):
+      source_dataset = files
+    else:
+      raise ValueError('files was not a string or a dataset: %s' % files)
+
+    if filename_shuffle_buffer_size:
+      source_dataset = source_dataset.shuffle(
+          buffer_size=filename_shuffle_buffer_size)
+
+    source_dataset = source_dataset.apply(
+        interleave_ops.parallel_interleave(
+            reader_fn, cycle_length=num_parallel_reads, sloppy=sloppy))
+
+    source_dataset = source_dataset.repeat(num_epochs)
+
+    if batch_transfer_size:
+      source_dataset = source_dataset.batch(batch_transfer_size)
+
+    source_dataset = source_dataset.prefetch(1)
+
+    source_iterator = dataset_ops.make_one_shot_iterator(source_dataset)
+    source_handle = source_iterator.string_handle()
+
+  @function.Defun(dtypes.string)
+  def LoadingFunc(h):
+    remote_iterator = iterator_ops.Iterator.from_string_handle(
+        h, source_dataset.output_types, source_dataset.output_shapes)
+    return remote_iterator.get_next()
+
+  def MapFn(unused_input):
+    if isinstance(source_dataset.output_types, dtypes.DType):
+      output_types = [source_dataset.output_types]
+    elif isinstance(source_dataset.output_types, (list, tuple)):
+      output_types = source_dataset.output_types
+    else:
+      raise ValueError('source dataset has invalid output types')
+    remote_calls = functional_ops.remote_call(
+        args=[source_handle],
+        Tout=output_types,
+        f=LoadingFunc,
+        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)
+    if len(remote_calls) == 1:
+      return remote_calls[0]
+    else:
+      return remote_calls
+
+  with ops.device('/job:%s' % worker_job):
+    output_dataset = dataset_ops.Dataset.range(2).repeat().map(
+        MapFn, num_parallel_calls=4 if sloppy else None)
+    output_dataset = output_dataset.prefetch(1)
+
+    if batch_transfer_size:
+      # Undo the batching used during the transfer.
+      output_dataset = output_dataset.apply(batching.unbatch()).prefetch(1)
+
+  return output_dataset
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/python/tpu/datasets_test.py
similarity index 99%
rename from tensorflow/contrib/tpu/python/tpu/datasets_test.py
rename to tensorflow/python/tpu/datasets_test.py
index 8a94f527bb6dffa48e71e6500ae5e9e9589fbf5c..416dd9496cc18af8354d4d961b54a50f3db99a24 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py
+++ b/tensorflow/python/tpu/datasets_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.tpu.python.tpu import datasets
 from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
@@ -31,6 +30,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import datasets
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 
diff --git a/tensorflow/python/tpu/device_assignment.py b/tensorflow/python/tpu/device_assignment.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd8a7a348cf18fdeb91b769570029fdef68c1bf3
--- /dev/null
+++ b/tensorflow/python/tpu/device_assignment.py
@@ -0,0 +1,313 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Library of TPU helper functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.tpu.topology import Topology
+
+
+SINGLE_CORE_ASSIGNMENT = [[[0, 0, 0]]]
+
+
+def _compute_task_and_cores_to_replicas(core_assignment, topology):
+  """Computes a nested dict which maps task and logical core to replicas."""
+  task_and_cores_to_replicas = {}
+  for replica in xrange(core_assignment.shape[0]):
+    for logical_core in xrange(core_assignment.shape[1]):
+      coordinates = core_assignment[replica, logical_core, :]
+      task_id = topology.task_ordinal_at_coordinates(coordinates)
+      if task_id not in task_and_cores_to_replicas:
+        task_and_cores_to_replicas[task_id] = {}
+      if logical_core not in task_and_cores_to_replicas[task_id]:
+        task_and_cores_to_replicas[task_id][logical_core] = set()
+
+      task_and_cores_to_replicas[task_id][logical_core].add(replica)
+
+  task_to_sorted_replica_id = {}
+
+  for task, core_to_replicas in task_and_cores_to_replicas.items():
+    core_to_sorted_replicas = {}
+    for core, replicas in core_to_replicas.items():
+      core_to_sorted_replicas[core] = sorted(replicas)
+
+    task_to_sorted_replica_id[task] = core_to_sorted_replicas
+  return task_to_sorted_replica_id
+
+
+class DeviceAssignment(object):
+  """Mapping from logical cores in a computation to the physical TPU topology.
+
+  Prefer to use the `device_assignment()` helper to construct a
+  `DeviceAssignment`; it is easier if less flexible than constructing a
+  `DeviceAssignment` directly.
+  """
+
+  def __init__(self, topology, core_assignment):
+    """Constructs a `DeviceAssignment` object.
+
+    Args:
+      topology: A `Topology` object that describes the physical TPU topology.
+      core_assignment: A logical to physical core mapping, represented as a
+        rank 3 numpy array. See the description of the `core_assignment`
+        property for more details.
+
+    Raises:
+      ValueError: If `topology` is not `Topology` object.
+      ValueError: If `core_assignment` is not a rank 3 numpy array.
+    """
+    if not isinstance(topology, Topology):
+      raise ValueError("topology must be a Topology object, got {}".format(
+          type(topology)))
+    core_assignment = np.asarray(core_assignment, dtype=np.int32)
+
+    self._topology = topology
+
+    if core_assignment.ndim != 3:
+      raise ValueError("core_assignment must be a rank 3 numpy array, "
+                       "got shape {}".format(core_assignment.shape))
+
+    self._num_replicas = core_assignment.shape[0]
+    self._num_cores_per_replica = core_assignment.shape[1]
+
+    if core_assignment.shape[-1] != topology.mesh_rank:
+      raise ValueError(
+          "minor dimension of core_assignment must have size equal to topology "
+          "rank ({}), got shape {}".format(topology.mesh_rank,
+                                           core_assignment.shape))
+
+    self._core_assignment = core_assignment
+    self._task_and_cores_to_replicas = _compute_task_and_cores_to_replicas(
+        self._core_assignment, topology)
+
+  @property
+  def topology(self):
+    """A `Topology` that describes the TPU topology."""
+    return self._topology
+
+  @property
+  def num_cores_per_replica(self):
+    """The number of cores per replica."""
+    return self._num_cores_per_replica
+
+  @property
+  def num_replicas(self):
+    """The number of replicas of the computation."""
+    return self._num_replicas
+
+  @property
+  def core_assignment(self):
+    """The logical to physical core mapping.
+
+    Returns:
+      An integer numpy array of rank 3, with shape
+      `[num_replicas, num_cores_per_replica, topology_rank]`. Maps
+      (replica, logical core) pairs to physical topology coordinates.
+    """
+    return self._core_assignment
+
+  def coordinates(self, replica, logical_core):
+    """Returns the physical topology coordinates of a logical core."""
+    return tuple(self.core_assignment[replica, logical_core, :])
+
+  def lookup_replicas(self, task_id, logical_core):
+    """Lookup replica ids by task number and logical core.
+
+    Args:
+      task_id: TensorFlow task number.
+      logical_core: An integer, identifying a logical core.
+    Returns:
+      A sorted list of the replicas that are attached to that task and
+      logical_core.
+    Raises:
+      ValueError: If no replica exists in the task which contains the logical
+      core.
+    """
+    try:
+      return self._task_and_cores_to_replicas[task_id][logical_core]
+    except KeyError:
+      raise ValueError(
+          "Can not find any replica in task: {} contains logical_core: {} ".
+          format(task_id, logical_core))
+
+  def tpu_ordinal(self, replica=0, logical_core=0):
+    """Returns the ordinal of the TPU device assigned to a logical core."""
+    coordinates = self.coordinates(replica, logical_core)
+    return self._topology.tpu_device_ordinal_at_coordinates(coordinates)
+
+  def host_device(self, replica=0, logical_core=0, job=None):
+    """Returns the CPU device attached to a logical core."""
+    coordinates = self.coordinates(replica, logical_core)
+    return self._topology.cpu_device_name_at_coordinates(coordinates, job=job)
+
+  def tpu_device(self, replica=0, logical_core=0, job=None):
+    """Returns the name of the TPU device assigned to a logical core."""
+    coordinates = self.coordinates(replica, logical_core)
+    return self._topology.tpu_device_name_at_coordinates(coordinates, job=job)
+
+
+def device_assignment(topology,
+                      computation_shape=None,
+                      computation_stride=None,
+                      num_replicas=1):
+  """Computes a device_assignment of a computation across a TPU topology.
+
+  Attempts to choose a compact grid of cores for locality.
+
+  Returns a `DeviceAssignment` that describes the cores in the topology assigned
+  to each core of each replica.
+
+  `computation_shape` and `computation_stride` values should be powers of 2 for
+  optimal packing.
+
+  Args:
+    topology: A `Topology` object that describes the TPU cluster topology.
+      To obtain a TPU topology, evaluate the `Tensor` returned by
+      `initialize_system` using `Session.run`. Either a serialized
+      `TopologyProto` or a `Topology` object may be passed. Note: you must
+      evaluate the `Tensor` first; you cannot pass an unevaluated `Tensor` here.
+    computation_shape: A rank 1 int32 numpy array with size equal to the
+      topology rank, describing the shape of the computation's block of cores.
+      If None, the `computation_shape` is `[1] * topology_rank`.
+    computation_stride: A rank 1 int32 numpy array of size `topology_rank`,
+      describing the inter-core spacing of the `computation_shape` cores in the
+      TPU topology. If None, the `computation_stride` is `[1] * topology_rank`.
+    num_replicas: The number of computation replicas to run. The replicas will
+      be packed into the free spaces of the topology.
+
+  Returns:
+    A DeviceAssignment object, which describes the mapping between the logical
+    cores in each computation replica and the physical cores in the TPU
+    topology.
+
+  Raises:
+    ValueError: If `topology` is not a valid `Topology` object.
+    ValueError: If `computation_shape` or `computation_stride` are not 1D int32
+      numpy arrays with shape [3] where all values are positive.
+    ValueError: If computation's replicas cannot fit into the TPU topology.
+  """
+  # Deserialize the Topology proto, if it is a string.
+  if isinstance(topology, bytes):
+    topology = Topology(serialized=topology)
+
+  if not isinstance(topology, Topology):
+    raise ValueError("`topology` is not a Topology object; got {}".format(
+        type(topology)))
+
+  topology_rank = len(topology.mesh_shape)
+  mesh_shape = topology.mesh_shape
+  if computation_shape is None:
+    computation_shape = np.array([1] * topology_rank, dtype=np.int32)
+  else:
+    computation_shape = np.asarray(computation_shape, dtype=np.int32)
+
+  if computation_stride is None:
+    computation_stride = np.array([1] * topology_rank, dtype=np.int32)
+  else:
+    computation_stride = np.asarray(computation_stride, dtype=np.int32)
+
+  if computation_shape.shape != (topology_rank,):
+    raise ValueError("computation_shape must have shape [{}]; got {}".format(
+        topology_rank, computation_shape.shape))
+  if computation_stride.shape != (topology_rank,):
+    raise ValueError("computation_stride must have shape [{}]; got {}".format(
+        topology_rank, computation_stride.shape))
+
+  if any(computation_shape < 1):
+    raise ValueError(
+        "computation_shape must be positive; got computation_shape={}".format(
+            computation_shape))
+  if any(computation_stride < 1):
+    raise ValueError(
+        "computation_stride must be positive; got computation_stride={}".format(
+            computation_stride))
+
+  # Computes the physical size of one computation instance.
+  computation_footprint = computation_shape * computation_stride
+  if any(computation_footprint > mesh_shape):
+    raise ValueError(
+        "computation footprint {} does not fit in TPU topology shape {}".format(
+            computation_footprint, mesh_shape))
+
+  # Computes how many copies of the computation footprint fit in the mesh.
+  block_counts = mesh_shape // computation_footprint
+
+  replica_counts = block_counts * computation_stride
+  max_replicas = np.prod(replica_counts)
+  if num_replicas > max_replicas:
+    raise ValueError(
+        "requested {} replicas but only {} replicas with shape {} and "
+        "computation_stride {} fit in a TPU mesh of shape {}".format(
+            num_replicas, max_replicas, computation_shape, computation_stride,
+            mesh_shape))
+
+  def ceil_of_ratio(n, m):
+    return (n + m - 1) // m
+
+  replica_shape = [0] * topology_rank
+  if num_replicas > 0:
+    remaining_replicas = num_replicas
+    remaining_dims = topology_rank
+
+    # Choose dimensions as close to an equal cube as possible, in order of
+    # increasing dimension size. By visiting dimensions in increasing size, we
+    # assign the most constrained dimension first, so we won't make infeasible
+    # choices.
+    #
+    # As a secondary sort order, visit the dimensions in reverse order. This
+    # means we try to use both cores on the same chip in preference to two cores
+    # on different chips.
+    for x, ni in sorted(((x, -i) for (i, x) in enumerate(replica_counts))):
+      i = -ni
+      target_size = int(math.ceil(remaining_replicas**(1.0 / remaining_dims)))
+      replica_shape[i] = min(target_size, x)
+      remaining_replicas = ceil_of_ratio(remaining_replicas, replica_shape[i])
+      remaining_dims -= 1
+
+    assert remaining_replicas == 1 and remaining_dims == 0
+
+  # Assigns an offset to each replica such that no two replicas overlap.
+  replica_offsets = np.full([num_replicas, topology_rank], -1, dtype=np.int32)
+  for replica in xrange(num_replicas):
+    # Chooses a replica number in each axis.
+    t = replica
+    pos = []
+    for dim in replica_shape[::-1]:
+      pos.append(t % dim)
+      t //= dim
+    replica_pos = np.array(pos[::-1], dtype=np.int32)
+
+    # Determines where that replica starts in each axis.
+    outer = replica_pos // computation_stride
+    inner = replica_pos % computation_stride
+    replica_offsets[replica, :] = outer * computation_footprint + inner
+
+  # Computes a complete logical core -> physical core mapping for each replica.
+  indices = [
+      np.arange(0, computation_shape[i] * computation_stride[i],
+                computation_stride[i]) for i in xrange(topology_rank)
+  ]
+  indices = np.concatenate(
+      [i[..., np.newaxis] for i in np.meshgrid(*indices, indexing="ij")],
+      axis=-1)
+  indices = indices.reshape((-1, topology_rank))
+  assignment = indices + replica_offsets[:, np.newaxis, :]
+  return DeviceAssignment(topology, core_assignment=assignment)
diff --git a/tensorflow/python/tpu/error_handling.py b/tensorflow/python/tpu/error_handling.py
new file mode 100644
index 0000000000000000000000000000000000000000..52e1ea42370d653d1de7c12eee4b456ec7ce921c
--- /dev/null
+++ b/tensorflow/python/tpu/error_handling.py
@@ -0,0 +1,132 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""ErrorRendezvous handler for collecting errors from multiple threads."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import sys
+import threading
+import time
+
+import six
+
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import tf_logging as logging
+
+_UNINTERESTING_ERRORS = (errors.CancelledError,)
+
+
+class ErrorRendezvous(object):
+  """Resolve errors from multiple threads during TPU execution.
+
+  TPU errors can occur on the infeed or outfeed threads as well as the main
+  training thread.
+
+  Depending on which thread "wins" and receives the session error first, we may
+  end up showing users a confusing and non-actionable error message (session
+  cancelled) instead of a root cause (e.g. a bad filename).
+
+  The rendezvous object provides a location to capture these errors until all
+  threads terminate.  At that point we can choose the most informative error
+  to report.
+  """
+
+  def __init__(self, num_sources):
+    # string -> (message, traceback)
+    self._errors = {}
+    self._num_sources = num_sources
+    self._session_cancel_timer = None
+
+  def record_error(self, source, exc_info, session=None):
+    """Report an exception from the given source.
+
+    If a session is passed, a timer will be registered to close it after a few
+    seconds.  This is necessary to ensure the main training loop does not hang
+    if an infeed/oufeed error occurs.  We sleep a few seconds to allow a more
+    interesting error from another thread to propagate.
+
+    Args:
+      source: string, source of the error
+      exc_info: Output from `sys.exc_info` (type, value, traceback)
+      session: Session to close after delay.
+    """
+    _, value, _ = exc_info
+    self._errors[source] = exc_info
+    logging.info('Error recorded from %s: %s', source, value)
+
+    if session is not None and self._session_cancel_timer is None:
+
+      def _cancel_session():
+        time.sleep(5)
+        try:
+          session.close()
+        except:  # pylint: disable=bare-except
+          pass
+
+      self._session_cancel_timer = threading.Thread(target=_cancel_session,)
+      self._session_cancel_timer.daemon = True
+      self._session_cancel_timer.start()
+
+  def record_done(self, source):
+    """Mark execution source `source` as done.
+
+    If an error was originally reported from `source` it is left intact.
+
+    Args:
+      source: `str`, source being recorded
+    """
+    logging.info('%s marked as finished', source)
+    if source not in self._errors:
+      self._errors[source] = None
+
+  @contextlib.contextmanager
+  def catch_errors(self, source, session=None):
+    """Context manager to report any errors within a block."""
+    try:
+      yield
+    except Exception:  # pylint: disable=broad-except
+      self.record_error(source, sys.exc_info(), session)
+
+  def raise_errors(self, timeout_sec=0):
+    """Wait for up to `timeout` seconds for all error sources to finish.
+
+    Preferentially raise "interesting" errors (errors not in the
+    _UNINTERESTING_ERRORS) set.
+
+    Args:
+      timeout_sec: Seconds to wait for other error sources.
+    """
+    for _ in range(timeout_sec):
+      if len(self._errors) == self._num_sources:
+        break
+      time.sleep(1)
+
+    kept_errors = [(k, v) for (k, v) in self._errors.items() if v is not None]
+
+    # First check for any interesting errors, then fall back on the session
+    # cancelled errors etc.
+    for k, (typ, value, traceback) in kept_errors:
+      if isinstance(value, _UNINTERESTING_ERRORS):
+        continue
+      else:
+        logging.warn('Reraising captured error')
+        six.reraise(typ, value, traceback)
+
+    for k, (typ, value, traceback) in kept_errors:
+      logging.warn('Reraising captured error')
+      six.reraise(typ, value, traceback)
diff --git a/tensorflow/python/tpu/feature_column.py b/tensorflow/python/tpu/feature_column.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f7e93910c834831b5aa79440de3660f974a2310
--- /dev/null
+++ b/tensorflow/python/tpu/feature_column.py
@@ -0,0 +1,435 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPU Feature Column Library."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu import tpu_function
+# pylint: disable=protected-access
+
+
+_TPU_FC_TO_SCOPE = '_tpu_feature_column_scope'
+_SUPPORTED_CATEGORICAL_COLUMNS = (fc._IdentityCategoricalColumn,
+                                  fc._VocabularyFileCategoricalColumn,
+                                  fc._VocabularyListCategoricalColumn,
+                                  fc._WeightedCategoricalColumn,
+                                  fc_lib.IdentityCategoricalColumn,
+                                  fc_lib.VocabularyFileCategoricalColumn,
+                                  fc_lib.VocabularyListCategoricalColumn,
+                                  fc_lib.WeightedCategoricalColumn)
+
+
+def embedding_column(categorical_column,
+                     dimension,
+                     combiner='mean',
+                     initializer=None):
+  """TPU embedding_column for `tf.feature_column.embedding_column`.
+
+  Note that the interface for TPU embedding_column is different from the non-TPU
+  version. The following args available for the non-TPU version are NOT
+  supported: ckpt_to_load_from, tensor_name_in_ckp, max_norm and trainable.
+
+  Args:
+    categorical_column: A categorical_column returned from
+        categorical_column_with_identity,  weighted_categorical_column,
+        categorical_column_with_vocabulary_list or
+        categorical_column_with_vocabulary_file.
+    dimension: An integer specifying dimension of the embedding, must be > 0.
+    combiner: A string specifying how to reduce if there are multiple entries
+      in a single row. For more information, see
+      `tf.feature_column.embedding_column`.
+    initializer: A variable initializer function to be used in embedding
+      variable initialization. If not specified, defaults to
+      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+      `1/sqrt(dimension)`.
+
+  Returns:
+    A  _TPUEmbeddingColumn.
+
+  Raises:
+    ValueError: if `dimension` not > 0.
+    ValueError: if `initializer` is specified but not callable.
+  """
+  if not isinstance(categorical_column, _SUPPORTED_CATEGORICAL_COLUMNS):
+    raise TypeError(
+        'categorical_column for tpu '
+        ' embedding_column must be type %s, got %s.' % (' or '.join([
+            cc.__name__ for cc in _SUPPORTED_CATEGORICAL_COLUMNS
+        ]), type(categorical_column)))
+  if (dimension is None) or (dimension < 1):
+    raise ValueError('Invalid dimension {}.'.format(dimension))
+
+  if (initializer is not None) and (not callable(initializer)):
+    raise ValueError('initializer must be callable if specified. '
+                     'Embedding of column_name: {}'.format(
+                         categorical_column.name))
+  if initializer is None:
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1 / math.sqrt(dimension))
+
+  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access
+
+  def _creator(weight_collections, scope):
+    embedding_column_layer = fc._EmbeddingColumnLayer(
+        embedding_shape=embedding_shape,
+        initializer=initializer,
+        weight_collections=weight_collections,
+        trainable=True,
+        name='embedding_column_layer')
+    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable
+
+  column = _TPUEmbeddingColumn(
+      categorical_column=categorical_column,
+      dimension=dimension,
+      combiner=combiner,
+      layer_creator=_creator,
+      ckpt_to_load_from=None,
+      tensor_name_in_ckpt=None,
+      max_norm=None,
+      trainable=True)
+  # For Embedding column, the initializer is hidden inside the creator Fn, which
+  # is not accessiable later. So, we attach it to a speicial field. Also note
+  # that non-TPU Embedding column and non-TPU shared Embedding column handle the
+  # initializer differently. See shared_embedding_columns for details.
+  column._tpu_initializer = initializer
+  return column
+
+
+def shared_embedding_columns(categorical_columns,
+                             dimension,
+                             combiner='mean',
+                             initializer=None,
+                             shared_embedding_collection_name=None):
+  """List of dense columns that convert from sparse, categorical input."""
+  for categorical_column in categorical_columns:
+    if not isinstance(categorical_column, _SUPPORTED_CATEGORICAL_COLUMNS):
+      raise TypeError(
+          'categorical_column for tpu '
+          ' shared_embedding_columns must be type %s, got %s.' % (' or '.join([
+              cc.__name__ for cc in _SUPPORTED_CATEGORICAL_COLUMNS
+          ]), type(categorical_column)))
+  columns = fc_lib.shared_embedding_columns(
+      categorical_columns,
+      dimension,
+      combiner=combiner,
+      initializer=initializer,
+      shared_embedding_collection_name=shared_embedding_collection_name,
+      ckpt_to_load_from=None,
+      tensor_name_in_ckpt=None,
+      max_norm=None,
+      trainable=True)
+
+  # Use the initializer and shared_embedding_collection_name to create TPU
+  # version
+  initializer = columns[0].initializer
+  shared_embedding_collection_name = columns[0].shared_embedding_collection_name
+  tpu_columns = []
+
+  # Create the state (_SharedEmbeddingColumnLayer) here.
+  for categorical_column in categorical_columns:
+    column = _TPUSharedEmbeddingColumn(
+        categorical_column=categorical_column,
+        dimension=dimension,
+        combiner=combiner,
+        initializer=initializer,
+        shared_embedding_collection_name=shared_embedding_collection_name,
+        ckpt_to_load_from=None,
+        tensor_name_in_ckpt=None,
+        max_norm=None,
+        trainable=True)
+    tpu_columns.append(column)
+
+  return tpu_columns
+
+
+class _TPUBaseEmbeddingColumn(object):
+  """Base class for TPU Embedding Column."""
+
+  def __init__(self, categorical_column):
+    self._tpu_categorical_column = categorical_column
+
+  def get_combiner(self):
+    """Returns the embedding combiner."""
+    raise NotImplementedError('not implemented')
+
+  def get_embedding_table_size(self):
+    """Returns the embedding table size, tuple of vocab size and dimension."""
+    raise NotImplementedError('not implemented')
+
+  def get_feature_key_name(self):
+    """Returns the feature key name in the features dict."""
+    raise NotImplementedError('not impl')
+
+  def get_weight_key_name(self):
+    """Return the key name for weights."""
+    raise NotImplementedError('not impl')
+
+  def get_embedding_var_name(self):
+    """Returns the embedding variable name.
+
+    Feature key name and embedding variable name are usually one-to-one mapping.
+    But for shared embedding columns, it is many-to-one mapping.
+    """
+    raise NotImplementedError('not impl')
+
+  def get_initializer(self):
+    """Returns the initializer."""
+    raise NotImplementedError('not impl')
+
+  def is_categorical_column_weighted(self):
+    """Check if the categorical column of the embedding column is weighted."""
+    raise NotImplementedError('not impl')
+
+
+class _TPUEmbeddingColumn(_TPUBaseEmbeddingColumn, fc._EmbeddingColumn):
+  """Core Embedding Column."""
+
+  def __new__(cls,
+              categorical_column,
+              dimension,
+              combiner='mean',
+              layer_creator=None,
+              ckpt_to_load_from=None,
+              tensor_name_in_ckpt=None,
+              max_norm=None,
+              trainable=True):
+    # Note, args ckpt_to_load_from, tensor_name_in_ckpt, max_norm and trainable
+    # are not supported on TPU. They are solely for matching the signature of
+    # __new__ of parent class fc._EmbeddingColumn.
+    return fc._EmbeddingColumn.__new__(
+        cls,
+        categorical_column,
+        dimension,
+        combiner=combiner,
+        layer_creator=layer_creator,
+        ckpt_to_load_from=ckpt_to_load_from,
+        tensor_name_in_ckpt=tensor_name_in_ckpt,
+        max_norm=max_norm,
+        trainable=trainable)
+
+  def __init__(self,
+               categorical_column,
+               dimension,
+               combiner='mean',
+               layer_creator=None,
+               ckpt_to_load_from=None,
+               tensor_name_in_ckpt=None,
+               max_norm=None,
+               trainable=True):
+    _TPUBaseEmbeddingColumn.__init__(self, categorical_column)
+    self._key = None
+
+  def get_combiner(self):
+    return self.combiner
+
+  def get_embedding_table_size(self):
+    """Returns num_ids and width."""
+    return (self.categorical_column._num_buckets, self.dimension)
+
+  def get_feature_key_name(self):
+    """get_feature_key_name."""
+    if self.is_categorical_column_weighted():
+      return self.categorical_column.categorical_column.name
+    return self.categorical_column.name
+
+  def get_weight_key_name(self):
+    """get_weight_key_name."""
+    if self.is_categorical_column_weighted():
+      return self.categorical_column.weight_feature_key
+    return None
+
+  def get_embedding_var_name(self):
+    """get_embedding_var_name."""
+    return self.categorical_column.name
+
+  def get_initializer(self):
+    return self._tpu_initializer
+
+  def is_categorical_column_weighted(self):
+    """Check if the categorical column of the embedding column is weighted."""
+    if isinstance(
+        self.categorical_column,
+        (
+            fc._WeightedCategoricalColumn,  # pylint: disable=protected-access
+            fc_lib.WeightedCategoricalColumn)):
+      return True
+    return False
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    if tpu.under_tpu_inference_context():
+      def host_computation():
+        return fc._EmbeddingColumn._get_dense_tensor(
+            self, inputs, weight_collections, trainable)
+      return tpu.outside_compilation(host_computation)
+
+    if _is_running_on_cpu():
+      return fc._EmbeddingColumn._get_dense_tensor(
+          self, inputs, weight_collections, trainable)
+
+    # TPU mode
+    # Get the embeddings from the LazyBuilder.
+    tensor = inputs.get(self.get_feature_key_name())
+
+    # Add to collection for _create_tpu_embedding_variables_and_ops
+    _record_variable_scope_and_name(self.get_embedding_var_name(),
+                                    'embedding_weights')
+
+    return tensor
+
+
+class _TPUSharedEmbeddingColumn(_TPUBaseEmbeddingColumn,
+                                fc._SharedEmbeddingColumn):
+  """Core Shared Embedding Column."""
+
+  def __new__(cls,
+              categorical_column,
+              dimension,
+              combiner='mean',
+              initializer=None,
+              shared_embedding_collection_name=None,
+              ckpt_to_load_from=None,
+              tensor_name_in_ckpt=None,
+              max_norm=None,
+              trainable=True):
+    return fc._SharedEmbeddingColumn.__new__(
+        cls,
+        categorical_column,
+        dimension,
+        combiner=combiner,
+        initializer=initializer,
+        shared_embedding_collection_name=shared_embedding_collection_name,
+        ckpt_to_load_from=ckpt_to_load_from,
+        tensor_name_in_ckpt=tensor_name_in_ckpt,
+        max_norm=max_norm,
+        trainable=trainable)
+
+  def __init__(self,
+               categorical_column,
+               dimension,
+               combiner='mean',
+               initializer=None,
+               shared_embedding_collection_name=None,
+               ckpt_to_load_from=None,
+               tensor_name_in_ckpt=None,
+               max_norm=None,
+               trainable=True):
+
+    _TPUBaseEmbeddingColumn.__init__(self, categorical_column)
+    self._key = None
+
+  def get_combiner(self):
+    return self.combiner
+
+  def get_embedding_table_size(self):
+    """Returns num_ids and width."""
+    return (self.categorical_column._num_buckets, self.dimension)
+
+  def get_feature_key_name(self):
+    """get_feature_key_name."""
+    if self.is_categorical_column_weighted():
+      return self.categorical_column.categorical_column.name
+    return self.categorical_column.name
+
+  def get_weight_key_name(self):
+    """get_weight_key_name."""
+    if self.is_categorical_column_weighted():
+      return self.categorical_column.weight_feature_key
+    return None
+
+  def get_embedding_var_name(self):
+    """get_embedding_var_name."""
+    return self.shared_embedding_collection_name
+
+  def get_initializer(self):
+    return self.initializer
+
+  def is_categorical_column_weighted(self):
+    """Check if the categorical column of the embedding column is weighted."""
+    if isinstance(
+        self.categorical_column,
+        (
+            fc._WeightedCategoricalColumn,  # pylint: disable=protected-access
+            fc_lib.WeightedCategoricalColumn)):
+      return True
+    return False
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    if tpu.under_tpu_inference_context():
+      def host_computation():
+        return fc._SharedEmbeddingColumn._get_dense_tensor(
+            self, inputs, weight_collections, trainable)
+      return tpu.outside_compilation(host_computation)
+
+    if _is_running_on_cpu():
+      return fc._SharedEmbeddingColumn._get_dense_tensor(
+          self, inputs, weight_collections, trainable)
+
+    # TPU mode
+    # Get the embeddings from the LazyBuilder.
+    tensor = inputs.get(self.get_feature_key_name())
+
+    # Add to collection for _create_tpu_embedding_variables_and_ops
+    _record_variable_scope_and_name(
+        self.get_embedding_var_name(),
+        'embedding_weights',
+        is_shared_embedding=True)
+    return tensor
+
+
+def _record_variable_scope_and_name(embedding_var_name,
+                                    embedding_var_name_in_fc,
+                                    is_shared_embedding=False):
+  """Add embedding variable name and scope to collection."""
+  g = ops.get_default_graph()
+  collection = g.get_collection_ref(_TPU_FC_TO_SCOPE)
+  if not collection:
+    collection.append({})
+
+  var_def_dict = collection[0]
+
+  captured_scope = variable_scope.get_variable_scope()
+  captured_scope_name = captured_scope.name
+
+  if embedding_var_name in var_def_dict:
+    if (var_def_dict[embedding_var_name][0] != captured_scope_name
+        and not is_shared_embedding):
+      raise ValueError(
+          'For embedding var name {}, the variable scope name is different, '
+          'got {}; expected {}'.format(embedding_var_name,
+                                       captured_scope_name,
+                                       var_def_dict[embedding_var_name][0]))
+    if var_def_dict[embedding_var_name][1] != embedding_var_name_in_fc:
+      raise ValueError(
+          'For embedding var name {}, the embedding name is different, '
+          'got {}; expected {}'.format(embedding_var_name,
+                                       embedding_var_name_in_fc,
+                                       var_def_dict[embedding_var_name][1]))
+  else:
+    var_def_dict[embedding_var_name] = (captured_scope_name,
+                                        embedding_var_name_in_fc)
+
+
+def _is_running_on_cpu():
+  """Returns True if the current context is CPU model."""
+  return tpu_function.get_tpu_context().number_of_shards is None
diff --git a/tensorflow/contrib/tpu/python/tpu/feature_column_test.py b/tensorflow/python/tpu/feature_column_test.py
similarity index 98%
rename from tensorflow/contrib/tpu/python/tpu/feature_column_test.py
rename to tensorflow/python/tpu/feature_column_test.py
index 75164cce4c261cc541dd6b01ee22699d286d9621..6feeb0198de76efc373e8be11765508220935605 100644
--- a/tensorflow/contrib/tpu/python/tpu/feature_column_test.py
+++ b/tensorflow/python/tpu/feature_column_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ===================================================================
-"""Tests for contrib.tpu.python.tpu.feature_column."""
+"""Tests for python.tpu.feature_column."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tpu.python.tpu import feature_column as tpu_fc
 from tensorflow.python.client import session
 from tensorflow.python.feature_column import feature_column as fc
 from tensorflow.python.feature_column import feature_column_lib as fc_lib
@@ -31,6 +30,7 @@ from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import feature_column as tpu_fc
 
 
 def _initialized_session():
diff --git a/tensorflow/python/tpu/functional.py b/tensorflow/python/tpu/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..045ec523bbbd1a16f616fbf2c3b11b66053968f5
--- /dev/null
+++ b/tensorflow/python/tpu/functional.py
@@ -0,0 +1,23 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Functional operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.tpu.ops import tpu_ops
+
+TPUPartitionedCall = tpu_ops.tpu_partitioned_call  # pylint: disable=invalid-name
diff --git a/tensorflow/python/tpu/ops/tpu_ops.py b/tensorflow/python/tpu/ops/tpu_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6e411cbc92488f65f80527b410d39fbad3a37e8
--- /dev/null
+++ b/tensorflow/python/tpu/ops/tpu_ops.py
@@ -0,0 +1,418 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Operations for TPUs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import platform
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu_function
+
+if platform.system() != "Windows":
+  # pylint: disable=wildcard-import,unused-import,g-import-not-at-top
+  from tensorflow.python.ops import gen_tpu_ops
+  from tensorflow.python.ops.gen_tpu_ops import *
+  # pylint: enable=wildcard-import,unused-import,g-import-not-at-top
+
+  def _create_default_group_assignment():
+    num_shards = tpu_function.get_tpu_context().number_of_shards
+    if num_shards is None:
+      logging.warning(
+          "cross_replica_sum should be used within a tpu_shard_context, but "
+          "got unset number_of_shards. Assuming 1.")
+      num_shards = 1
+    group_assignment = [list(range(num_shards))]
+    return group_assignment
+
+  def all_to_all(x,
+                 concat_dimension,
+                 split_dimension,
+                 split_count,
+                 group_assignment=None,
+                 name=None):
+    """Exchange data across TPU replicas.
+
+    Args:
+      x: The local tensor.
+      concat_dimension: The dimension number to concatenate.
+      split_dimension: The dimension number to split.
+      split_count: The number of splits, this number must equal to the sub-group
+        size(group_assignment.get_shape()[1])
+      group_assignment: Optional 2d int32 lists with shape [num_groups,
+        num_replicas_per_group]. `group_assignment[i]` represents the replica
+        ids in the ith subgroup.
+      name: Optional op name.
+
+    Returns:
+      A `Tensor` which is concatenated by data from different replicas.
+    """
+    if group_assignment is None:
+      group_assignment = _create_default_group_assignment()
+    return gen_tpu_ops.all_to_all(
+        x,
+        group_assignment,
+        concat_dimension=concat_dimension,
+        split_dimension=split_dimension,
+        split_count=split_count,
+        name=name)
+
+  @ops.RegisterGradient("AllToAll")
+  def _all_to_all_grad(op, grad):
+    # The gradient of a all-to-all is also a all-to-all but the
+    # split_dimension and concat_dimension is swapped.
+    # The graident with respect to group_assignment is None.
+    return [
+        gen_tpu_ops.all_to_all(
+            grad,
+            op.inputs[1],
+            concat_dimension=op.get_attr("split_dimension"),
+            split_dimension=op.get_attr("concat_dimension"),
+            split_count=op.get_attr("split_count")), None
+    ]
+
+  def cross_replica_sum(x, group_assignment=None, name=None):
+    """Sum the input tensor across replicas according to group_assignment.
+
+    Args:
+      x: The local tensor to the sum.
+      group_assignment: Optional 2d int32 lists with shape [num_groups,
+        num_replicas_per_group]. `group_assignment[i]` represents the replica
+        ids in the ith subgroup.
+      name: Optional op name.
+
+    Returns:
+      A `Tensor` which is summed across replicas.
+    """
+    if group_assignment is None:
+      group_assignment = _create_default_group_assignment()
+
+    return gen_tpu_ops.cross_replica_sum(x, group_assignment, name=name)
+
+  def collective_permute(x, source_target_pairs, name=None):
+    """Permute the input tensor across replicas given source_target_pairs.
+
+    For each source_target_pair <a, b>, we send replica a's input to replica b.
+    Each replica id must only appear once in the source column. Also it must
+    only appear once in the target column.
+    For the replica id not in the target column, this op returns a zero tensor
+    with the same shape and dtype of the input x.
+
+    For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
+    source_target_pairs=`[[0,1],[1,2],[2,3]]` gets the outputs:
+    `[0, A, B, C]`.
+
+    Args:
+      x: The local tensor to be permuted.
+      source_target_pairs: 2d int lists with shape [num_pairs, 2].
+        source_target_pairs[i][0] represents the source replica id and
+        source_target_pairs[i][1] represents the target replica id.
+      name: Optional op name.
+
+    Returns:
+      A `Tensor` which is permuted.
+    """
+    return gen_tpu_ops.collective_permute(x, source_target_pairs, name=name)
+
+  @ops.RegisterGradient("CollectivePermute")
+  def _collective_permute_grad(op, grad):
+    # The gradient of a collective permute operation is also a collective
+    # permute, but with source/target pairs reversed. The gradient with respect
+    # to input argument `source_target_pairs` is `None`.
+    source_target_pairs = op.inputs[1][:, ::-1]
+    return [gen_tpu_ops.collective_permute(grad, source_target_pairs), None]
+
+  @ops.RegisterGradient("CrossReplicaSum")
+  def _cross_replica_sum_grad(op, grad):
+    # The gradient of a cross replica sum is also a cross-replica sum.
+    # The gradient with respect to group_assignment is None.
+    return [gen_tpu_ops.cross_replica_sum(grad, op.inputs[1]), None]
+
+  # This extra type checking exists to give a more helpful error message in
+  # the common case that uint8 and int64 values are infed. Remove when both
+  # types are supported.
+
+  _SUPPORTED_INFEED_DTYPES = set([
+      dtypes.bool, dtypes.int32, dtypes.int64, dtypes.bfloat16, dtypes.float32,
+      dtypes.complex64, dtypes.uint32
+  ])
+
+  @ops.RegisterGradient("TPUEmbeddingActivations")
+  def _embedding_activations_grad(activations_op, grad_wrt_activations):
+    """Saves the gradient of embedding activations ops in a graph collection."""
+    g = ops.get_default_graph()
+    table_id = activations_op.get_attr("table_id")
+    lookup_id = activations_op.get_attr("lookup_id")
+    table_gradients = g.get_collection_ref(
+        "tpu_embedding_gradients_table_%d" % table_id)
+
+    if not table_gradients:
+      raise RuntimeError(
+          "Gradients for TPUEmbedding have been generated in non-training mode."
+          "This is not expected. Consider putting your Optimizer.minimize code "
+          "behind the training mode condition check. For Estimator, you can "
+          "do \n\n"
+          "    if mode == tf.estimator.ModeKeys.TRAIN:\n"
+          "        train_op = opt.minimize(loss)\n"
+          "\n")
+
+    table_gradients[lookup_id] = array_ops.identity(grad_wrt_activations)
+    return [
+        # RegisterGradient requires that value be returned for all inputs. Since
+        # the first argument (tpu_gradient_variable_{table_name}) has shape [1],
+        # we will return zeros(shape=[1]). The actual gradient w.r.t. the
+        # embedding activations (grad_wrt_activations) has the same shape as the
+        # activations returned by  embedding_activations.
+        array_ops.zeros(arg.shape, dtype=dtypes.float32)
+        for arg in activations_op.inputs
+    ]
+
+  def infeed_dequeue(dtype, shape, name=None):
+    """A placeholder op for a value that will be fed into the computation.
+
+    Args:
+      dtype: A `tf.DType`. The type of elements in the tensor.
+      shape: A `tf.TensorShape` or list of `ints`. The shape of the tensor.
+      name: A name for the operation (optional).
+
+    Returns:
+      A `Tensor` of type `dtype`.
+      A tensor that will be provided using the infeed mechanism.
+
+    Raises:
+      TypeError: If 'dtype` is not a supported infeed type.
+    """
+    if dtype not in _SUPPORTED_INFEED_DTYPES:
+      raise TypeError(
+          "{} is not a supported TPU infeed type. Supported types are: "
+          "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
+
+    return gen_tpu_ops.infeed_dequeue(dtype, shape, name=name)
+
+  # pylint: disable=redefined-outer-name
+  def infeed_dequeue_tuple(dtypes, shapes, name=None):
+    """A placeholder op for values fed into the TPU simultaneously as a tuple.
+
+    Args:
+      dtypes: A list of `tf.DType`s that has length `>= 1`.
+        The element types of each element in `outputs`.
+      shapes: A list of shapes (each a `tf.TensorShape` or list of `ints`).
+        The shapes of each tensor in `outputs`.
+      name: A name for the operation (optional).
+
+    Returns:
+      A list of `Tensor` objects of type `dtypes`.
+      A list of tensors that will be provided using the infeed mechanism.
+
+    Raises:
+      TypeError: If a type in 'dtypes` is not a supported infeed type.
+    """
+    for dtype in dtypes:
+      if dtype not in _SUPPORTED_INFEED_DTYPES:
+        raise TypeError(
+            "{} is not a supported TPU infeed type. Supported types are: "
+            "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
+    return gen_tpu_ops.infeed_dequeue_tuple(dtypes, shapes, name=name)
+  # pylint: enable=redefined-outer-name
+
+  # pylint: disable=protected-access
+  def send_tpu_embedding_gradients(inputs,
+                                   config,
+                                   learning_rates=None,
+                                   name=None):
+    """A placeholder op for feeding per-sample gradients to the embedding layer.
+
+    Args:
+      inputs: A TensorList of gradients with which to update embedding tables.
+          This argument has the same length and shapes as the return value of
+          RecvTPUEmbeddingActivations, but contains gradients of the model's
+          loss with respect to the embedding activations. The embedding tables
+          are updated from these gradients via the optimizers specified in the
+          TPU embedding configuration given to tpu.initialize_system.
+      config: Serialized TPUEmbeddingConfiguration proto.
+      learning_rates: A TensorList of float32 scalars, one for each dynamic
+          learning rate tag: see the comments in
+          //third_party/tensorflow/core/protobuf/tpu/
+                                               optimization_parameters.proto.
+          Multiple tables can share the same dynamic learning rate tag as
+          specified in the configuration. If the learning rates for all tables
+          are constant, this list should be empty.
+      name: A name for the operation (optional).
+
+    Returns:
+      A SendTPUEmbeddingGradients operation.
+    """
+    if learning_rates is None:
+      learning_rates = []
+    return gen_tpu_ops.send_tpu_embedding_gradients(
+        inputs=inputs, learning_rates=learning_rates, config=config, name=name)
+
+  send_tpu_embedding_gradients.__doc__ = (
+      gen_tpu_ops.send_tpu_embedding_gradients.__doc__)
+
+  # pylint: disable=protected-access
+  def enqueue_tpu_embedding_integer_batch(batch,
+                                          device_ordinal,
+                                          mode_override=None,
+                                          name=None):
+    """A placeholder op for enqueueing embedding IDs to the TPU.
+
+    Args:
+      batch: A list of 1D tensors, one for each embedding table, containing the
+        indices into the tables.
+      device_ordinal: The TPU device to use. Should be >= 0 and less than the
+        number of TPU cores in the task on which the node is placed.
+      mode_override: A string input that overrides the mode specified in the
+        TPUEmbeddingConfiguration. Supported values are {'unspecified',
+        'inference', 'training', 'backward_pass_only'}. When set to
+        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
+        otherwise mode_override is used (optional).
+      name: A name for the operation (optional).
+
+    Returns:
+      An EnqueueTPUEmbeddingIntegerBatch operation.
+    """
+    if mode_override is None:
+      mode_override = "unspecified"
+    return gen_tpu_ops.enqueue_tpu_embedding_integer_batch(
+        batch=batch,
+        device_ordinal=device_ordinal,
+        mode_override=mode_override,
+        name=name)
+
+  enqueue_tpu_embedding_integer_batch.__doc__ = (
+      gen_tpu_ops.enqueue_tpu_embedding_integer_batch.__doc__)
+
+  # pylint: disable=protected-access
+  def enqueue_tpu_embedding_sparse_batch(sample_indices,
+                                         embedding_indices,
+                                         aggregation_weights,
+                                         device_ordinal,
+                                         combiners=None,
+                                         mode_override=None,
+                                         name=None):
+    """A placeholder op for enqueueing embedding IDs to the TPU.
+
+    Args:
+      sample_indices: A list of rank 1 Tensors specifying the training example
+        and feature to which the corresponding embedding_indices and
+        aggregation_weights values belong. sample_indices[i] must equal b * nf +
+        f, where nf is the number of features from the corresponding table, f is
+        in [0, nf), and b is in [0, batch size).
+      embedding_indices: A list of rank 1 Tensors, indices into the embedding
+        tables.
+      aggregation_weights: A list of rank 1 Tensors containing per sample --
+        i.e. per (training example, feature) -- aggregation weights.
+      device_ordinal: The TPU device to use. Should be >= 0 and less than the
+        number of TPU cores in the task on which the node is placed.
+      combiners: A list of string scalars, one for each embedding table that
+        specify how to normalize the embedding activations after weighted
+        summation. Supported combiners are 'mean', 'sum', or 'sqrtn'. It is
+        invalid to have the sum of the weights be 0 for 'mean' or the sum of the
+        squared weights be 0 for 'sqrtn'. If combiners isn't passed, the default
+        is to use 'sum' for all tables (optional).
+      mode_override: A string input that overrides the mode specified in the
+        TPUEmbeddingConfiguration. Supported values are {'unspecified',
+        'inference', 'training', 'backward_pass_only'}. When set to
+        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
+        otherwise mode_override is used (optional).
+      name: A name for the operation (optional).
+
+    Returns:
+      An EnqueueTPUEmbeddingSparseBatch operation.
+    """
+    if mode_override is None:
+      mode_override = "unspecified"
+    return gen_tpu_ops.enqueue_tpu_embedding_sparse_batch(
+        sample_indices=sample_indices,
+        embedding_indices=embedding_indices,
+        aggregation_weights=aggregation_weights,
+        device_ordinal=device_ordinal,
+        combiners=combiners,
+        mode_override=mode_override,
+        name=name)
+
+  enqueue_tpu_embedding_sparse_batch.__doc__ = (
+      gen_tpu_ops.enqueue_tpu_embedding_sparse_batch.__doc__)
+
+  # pylint: disable=protected-access
+  def enqueue_tpu_embedding_sparse_tensor_batch(sample_indices,
+                                                embedding_indices,
+                                                aggregation_weights,
+                                                table_ids,
+                                                device_ordinal,
+                                                combiners=None,
+                                                mode_override=None,
+                                                name=None):
+    """A placeholder op for enqueueing embedding IDs to the TPU.
+
+    Args:
+      sample_indices: A list of rank 1 Tensors specifying the training example
+        to which the corresponding embedding_indices and aggregation_weights
+        values belong. It corresponds to sp_ids.indices[:,0] in
+        embedding_lookup_sparse().
+      embedding_indices: A list of rank 1 Tensors, indices into the embedding
+        tables. It corresponds to sp_ids.values in embedding_lookup_sparse().
+      aggregation_weights: A list of rank 1 Tensors containing per training
+        example aggregation weights. It corresponds to sp_weights.values in
+        embedding_lookup_sparse().
+      table_ids: A list of integers specifying the identifier of the embedding
+        table (offset of TableDescriptor in the TPUEmbeddingConfiguration) to
+        lookup the corresponding input. The ith input is looked up using
+        table_ids[i]. The size of the table_ids list must be equal to that of
+        sample_indices, embedding_indices and aggregation_weights.
+      device_ordinal: The TPU device to use. Should be >= 0 and less than the
+        number of TPU cores in the task on which the node is placed.
+      combiners: A list of string scalars, one for each embedding table that
+        specify how to normalize the embedding activations after weighted
+        summation. Supported combiners are 'mean', 'sum', or 'sqrtn'. It is
+        invalid to have the sum of the weights be 0 for 'mean' or the sum of the
+        squared weights be 0 for 'sqrtn'. If combiners isn't passed, the default
+        is to use 'sum' for all tables (optional).
+      mode_override: A string input that overrides the mode specified in the
+        TPUEmbeddingConfiguration. Supported values are {'unspecified',
+        'inference', 'training', 'backward_pass_only'}. When set to
+        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
+        otherwise mode_override is used (optional).
+      name: A name for the operation (optional).
+
+    Returns:
+      An EnqueueTPUEmbeddingSparseTensorBatch operation.
+    """
+    if mode_override is None:
+      mode_override = "unspecified"
+    return gen_tpu_ops.enqueue_tpu_embedding_sparse_tensor_batch(
+        sample_indices=sample_indices,
+        embedding_indices=embedding_indices,
+        aggregation_weights=aggregation_weights,
+        table_ids=table_ids,
+        device_ordinal=device_ordinal,
+        combiners=combiners,
+        mode_override=mode_override,
+        name=name)
+
+  enqueue_tpu_embedding_sparse_tensor_batch.__doc__ = (
+      gen_tpu_ops.enqueue_tpu_embedding_sparse_tensor_batch.__doc__)
+
+else:
+  # We have already built the appropriate libraries into the binary via CMake
+  # if we have built contrib, so we don't need this
+  pass
diff --git a/tensorflow/python/tpu/ops/tpu_ordinal_selector_op.py b/tensorflow/python/tpu/ops/tpu_ordinal_selector_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f2dce26cd5dd1183d51491715186f57fbe95fab
--- /dev/null
+++ b/tensorflow/python/tpu/ops/tpu_ordinal_selector_op.py
@@ -0,0 +1,20 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Operations to select TPU core to run."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/python/tpu/profiler/BUILD b/tensorflow/python/tpu/profiler/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..bfe79454cc16db7dedadcddb1aad20c3d7792528
--- /dev/null
+++ b/tensorflow/python/tpu/profiler/BUILD
@@ -0,0 +1,27 @@
+licenses(["notice"])  # Apache 2.0
+
+package(
+    default_visibility = [
+        "//tensorflow:__subpackages__",
+    ],
+)
+
+py_library(
+    name = "profiler",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tpu_profiler_analysis_pb2_grpc",
+        "//tensorflow/core/profiler:profiler_analysis_proto_py",
+        "//tensorflow/core/profiler:protos_all_py",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "tpu_profiler_analysis_pb2_grpc",
+    srcs = ["tpu_profiler_analysis_pb2_grpc.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = ["//tensorflow/core/profiler:profiler_analysis_proto_py"],
+)
diff --git a/tensorflow/python/tpu/profiler/__init__.py b/tensorflow/python/tpu/profiler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c183aaf53543f7bb38475525d1777048925ff62
--- /dev/null
+++ b/tensorflow/python/tpu/profiler/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Classes for TPU trace events."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.core.profiler.trace_events_pb2 import *
+from tensorflow.core.profiler.profiler_analysis_pb2 import *
+# pylint: enable=wildcard-import,unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = ['Trace', 'Resource', 'Device', 'TraceEvent']
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py b/tensorflow/python/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
similarity index 100%
rename from tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
rename to tensorflow/python/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
diff --git a/tensorflow/python/tpu/session_support.py b/tensorflow/python/tpu/session_support.py
new file mode 100644
index 0000000000000000000000000000000000000000..3df7185be0828df13f7088689e9891f985450182
--- /dev/null
+++ b/tensorflow/python/tpu/session_support.py
@@ -0,0 +1,438 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Operations for handling session logging and shutdown notifications."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+import time
+from google.protobuf import text_format
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.util import event_pb2
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu.ops import tpu_ops
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
+
+_WATCHDOG = None
+
+
+class CoordinatorShutdownException(Exception):
+  """Raised when the coordinator needs to shutdown."""
+  pass
+
+
+def _clone_session(session, graph=None):
+  return session_lib.Session(
+      target=session.sess_str,
+      config=session._config,  # pylint: disable=protected-access
+      graph=graph if graph else session.graph)
+
+
+def _make_heartbeat_op(session, device, request_ph):
+  """Return a heartbeat op or None if heartbeats are not supported by device."""
+  try:
+    # Test if we can connect in a isolated graph + session
+    with ops.Graph().as_default():
+      with _clone_session(session) as temp_session:
+        with ops.device(device):
+          heartbeat_op = tpu_ops.worker_heartbeat('')
+          options = config_pb2.RunOptions(timeout_in_ms=5000)
+          temp_session.run(heartbeat_op, options=options)
+  except errors.InvalidArgumentError as _:
+    logging.warning('Error running heartbeat on %s', device)
+    return None
+  except errors.DeadlineExceededError as _:
+    logging.warning('Timeout connecting to %s when testing heartbeat', device)
+    return None
+
+  # If we successfully connected and pinged the worker, go ahead and construct
+  # the operation.
+  with ops.device(device):
+    return tpu_ops.worker_heartbeat(request_ph)
+
+
+class WorkerHeartbeatManager(object):
+  """Manages the status/heartbeat monitor for a set of workers."""
+
+  def __init__(self, session, devices, heartbeat_ops, request_placeholder):
+    """Construct a new WorkerHeartbeatManager.
+
+    (Prefer using `WorkerHeartbeatManager.from_devices` when possible.)
+
+    Args:
+      session: `tf.Session`, session to use for heartbeat operations.
+      devices: `list[string]` Set of devices to connect to.
+      heartbeat_ops: `list[tf.Operation]` Heartbeat operations.
+      request_placeholder: `tf.Placeholder[String]` Placeholder used to specify
+        the WorkerHeartbeatRequest protocol buffer.
+    """
+    self._session = session
+    self._devices = devices
+    self._ops = heartbeat_ops
+    self._request_placeholder = request_placeholder
+
+  @staticmethod
+  def from_devices(session, devices):
+    """Construct a heartbeat manager for the given devices."""
+    if not devices:
+      logging.error('Trying to create heartbeat manager with no devices?')
+
+    logging.info('Creating heartbeat manager for %s', devices)
+    request_placeholder = array_ops.placeholder(
+        name='worker_heartbeat_request', dtype=dtypes.string)
+
+    heartbeat_ops = []
+    kept_devices = []
+    for device in devices:
+      heartbeat_op = _make_heartbeat_op(session, device, request_placeholder)
+      if heartbeat_op is not None:
+        kept_devices.append(device)
+        heartbeat_ops.append(heartbeat_op)
+      else:
+        logging.warning('Heartbeat support not available for %s', device)
+
+    return WorkerHeartbeatManager(session, kept_devices, heartbeat_ops,
+                                  request_placeholder)
+
+  def num_workers(self):
+    return len(self._devices)
+
+  def configure(self, message):
+    """Configure heartbeat manager for all devices.
+
+    Args:
+      message: `event_pb2.WorkerHeartbeatRequest`
+    Returns: `None`
+    """
+    logging.info('Configuring worker heartbeat: %s',
+                 text_format.MessageToString(message))
+    self._session.run(self._ops,
+                      {self._request_placeholder: message.SerializeToString()})
+
+  def ping(self, request=None, timeout_in_ms=5000):
+    """Ping all workers, returning the parsed status results."""
+    if request is None:
+      request = event_pb2.WorkerHeartbeatRequest()
+
+    options = config_pb2.RunOptions(timeout_in_ms=timeout_in_ms)
+    results = self._session.run(
+        self._ops,
+        feed_dict={self._request_placeholder: request.SerializeToString()},
+        options=options)
+    parsed_results = [
+        event_pb2.WorkerHeartbeatResponse.FromString(res_pb)
+        for res_pb in results
+    ]
+    logging.debug('Ping results: %s', parsed_results)
+    return parsed_results
+
+  def lame_workers(self):
+    """Ping all workers, returning manager containing lame workers (or None)."""
+    ping_results = self.ping()
+    lame_workers = []
+
+    for ping_response, device, op in zip(ping_results, self._devices,
+                                         self._ops):
+      if ping_response.health_status != event_pb2.OK:
+        lame_workers.append((device, op))
+
+    if not lame_workers:
+      return None
+
+    bad_devices, bad_ops = zip(*lame_workers)
+    return WorkerHeartbeatManager(self._session, bad_devices, bad_ops,
+                                  self._request_placeholder)
+
+  def __repr__(self):
+    return 'HeartbeatManager(%s)' % ','.join(self._devices)
+
+  def shutdown(self, timeout_ms=10000):
+    """Shutdown all workers after `shutdown_timeout_secs`."""
+    logging.info('Shutting down %s.', self)
+    req = event_pb2.WorkerHeartbeatRequest(
+        watchdog_config=event_pb2.WatchdogConfig(timeout_ms=timeout_ms),
+        shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR)
+    self.configure(req)
+
+    # Wait for workers to shutdown.  This isn't strictly required
+    # but it avoids triggering multiple checkpoints with the same lame worker.
+    logging.info('Waiting %dms for worker shutdown.', timeout_ms)
+    time.sleep(timeout_ms / 1000)
+
+
+def all_worker_devices(session):
+  """Return a list of devices for each worker in the system."""
+  devices = session.list_devices()
+  return [
+      device.name
+      for device in devices
+      if ':CPU:' in device.name and 'coordinator' not in device.name
+  ]
+
+
+class WatchdogManager(threading.Thread):
+  """Configures worker watchdog timer and handles periodic pings.
+
+  Usage:
+    # Ping workers every minute, shutting down workers if they haven't received
+    # a ping after 1 hour.
+    watchdog_manager = WatchdogManager(
+      ping_interval=60, shutdown_timeout=3600
+    )
+
+    # Use as a context manager, resetting watchdog on context exit:
+    with watchdog_manager:
+      session.run(...)
+
+    # Or setup globally; watchdog will remain active until program exit.
+    watchdog_manager.configure_and_run()
+  """
+
+  def __init__(self,
+               session,
+               devices=None,
+               ping_interval=60,
+               shutdown_timeout=3600):
+    """Initialize a watchdog manager.
+
+    Args:
+      session: Session connected to worker devices.  A cloned session and graph
+        will be created for managing worker pings.
+      devices: Set of devices to monitor.  If none, all workers will be
+        monitored.
+      ping_interval: Time, in seconds, between watchdog pings.
+      shutdown_timeout: Time, in seconds, before watchdog timeout.
+    """
+    threading.Thread.__init__(self)
+    self.ping_interval = ping_interval
+    self.shutdown_timeout = shutdown_timeout
+    self.daemon = True
+    self._config = session._config  # pylint: disable=protected-access
+    self._target = session.sess_str
+    self._running = False
+    self._devices = devices
+
+    self._graph = None
+    self._session = None
+    self._worker_manager = None
+
+  def _reset_manager(self):
+    """Reset the graph, session and worker manager."""
+    self._graph = ops.Graph()
+    self._session = session_lib.Session(
+        target=self._target,
+        graph=self._graph,
+        config=self._config,
+    )
+
+    if self._devices is None:
+      self._devices = all_worker_devices(self._session)
+
+    with self._graph.as_default():
+      self._worker_manager = WorkerHeartbeatManager.from_devices(
+          self._session, self._devices)
+
+    self._worker_manager.configure(
+        event_pb2.WorkerHeartbeatRequest(
+            watchdog_config=event_pb2.WatchdogConfig(
+                timeout_ms=self.shutdown_timeout * 1000,),
+            shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
+
+  def configure_and_run(self):
+    logging.info(
+        'Enabling watchdog timer with %d second timeout '
+        'and %d second ping interval.', self.shutdown_timeout,
+        self.ping_interval)
+    self._reset_manager()
+    self._running = True
+    self.start()
+
+  def stop(self):
+    logging.info('Stopping worker watchdog.')
+    self._worker_manager.configure(
+        event_pb2.WorkerHeartbeatRequest(
+            watchdog_config=event_pb2.WatchdogConfig(timeout_ms=-1,),
+            shutdown_mode=event_pb2.NOT_CONFIGURED))
+    self._running = False
+    self.join()
+
+  def __enter__(self):
+    self.configure_and_run()
+
+  def __exit__(self, exc_type, exc_val, exc_tb):
+    self.stop()
+
+  def run(self):
+    # Don't fetch logs or adjust timing: just ping the watchdog.
+    #
+    # If we hit an exception, reset our session as it is likely broken.
+    while self._running:
+      try:
+        self._worker_manager.ping(request=None)
+        time.sleep(self.ping_interval)
+      except errors.OpError as e:
+        # Catch any TF errors that occur so we don't stop sending heartbeats
+        logging.debug('Caught error while sending heartbeat: %s', e)
+        self._reset_manager()
+
+
+def start_worker_watchdog(session,
+                          devices=None,
+                          ping_interval=60,
+                          shutdown_timeout=3600):
+  """Start global worker watchdog to shutdown workers on coordinator exit."""
+  global _WATCHDOG
+  if _WATCHDOG is None:
+    # Ensure we can send a few pings before we timeout!
+    ping_interval = min(shutdown_timeout / 10., ping_interval)
+    _WATCHDOG = WatchdogManager(session, devices, ping_interval,
+                                shutdown_timeout)
+    _WATCHDOG.configure_and_run()
+
+
+class GracefulShutdownHook(session_run_hook.SessionRunHook):
+  """Session hook that watches for shutdown events.
+
+  If a shutdown is indicated, `saver.save(checkpoint_prefix)` is executed, and a
+  SystemShutdown exception is raised to terminate the main session.  If `saver`
+  is None the `SAVERS` collection will be read to find a saver.
+
+  `on_shutdown_hooks` is an optional list of functions that should be called
+  after checkpointing.  The function is called with (`run_context`,
+  `all_workers`, `lame_workers`).
+
+  If `heartbeat_group` is not specified, it will default to all CPU workers
+  in the system.
+  """
+
+  def __init__(self, checkpoint_prefix, saver=None, on_shutdown_hooks=None):
+    self._saver = saver
+    self._checkpoint_prefix = checkpoint_prefix
+    self._on_shutdown_hooks = on_shutdown_hooks if on_shutdown_hooks else []
+
+    # Worker heartbeats are managed independently of the main training graph.
+    self._graph = ops.Graph()
+    self._workers = None
+    self._session = None
+    self._heartbeat_supported = False
+
+  def after_create_session(self, training_session, coord):  # pylint: disable=unused-argument
+    # N.B. We have to pull the global step here to avoid it being unavailable
+    # at checkpoint time; the graph has been frozen at that point.
+    if training_util.get_global_step() is None and self.saver() is not None:
+      raise ValueError(
+          'Saver defined but no global step.  Run `get_or_create_global_step()`'
+          ' in your model definition to allow checkpointing.')
+
+    with self._graph.as_default():
+      logging.info('Installing graceful shutdown hook.')
+      self._session = _clone_session(training_session, self._graph)
+      self._workers = WorkerHeartbeatManager.from_devices(
+          self._session, all_worker_devices(self._session))
+      self._heartbeat_supported = self._workers.num_workers() > 0
+      if self._heartbeat_supported:
+        self._workers.configure(
+            event_pb2.WorkerHeartbeatRequest(
+                shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
+      else:
+        logging.warn(
+            'No workers support hearbeats. Failure handling will be disabled.')
+
+  def saver(self):
+    if self._saver:
+      return self._saver
+
+    savers = ops.get_collection(ops.GraphKeys.SAVERS)
+    if not savers:
+      return None
+
+    if not isinstance(savers, list):
+      return savers
+
+    if len(savers) > 1:
+      logging.error(
+          'Multiple savers in the SAVERS collection.  On-demand checkpointing '
+          'will be disabled. Pass an explicit `saver` to the constructor to '
+          'override this behavior.')
+      return None
+
+    return savers[0]
+
+  def after_run(self, run_context, run_values):
+    del run_values
+
+    if not self._heartbeat_supported:
+      return
+
+    lame_workers = self._workers.lame_workers()
+    if lame_workers:
+      logging.info('ShutdownHook: lame workers found: %s', lame_workers)
+
+      if self.saver():
+        logging.info('ShutdownHook: saving checkpoint to %s',
+                     self._checkpoint_prefix)
+        self.saver().save(
+            run_context.session,
+            self._checkpoint_prefix,
+            global_step=training_util.get_global_step(),
+            write_state=True,
+        )
+      else:
+        logging.info('ShutdownHook: no Saver defined.')
+
+      for fn in self._on_shutdown_hooks:
+        fn(run_context, self._workers, lame_workers)
+
+
+class RestartComputation(object):
+  """Restart the entire computation.
+
+  This hook shuts down all workers and returns control to the top-level by
+  throwing a CoordinatorShutdownException.
+  """
+
+  def __init__(self, timeout_ms=10000):
+    self.timeout_ms = timeout_ms
+
+  def __call__(self, run_context, all_workers, lame_workers):
+    del run_context, lame_workers
+    all_workers.shutdown(timeout_ms=self.timeout_ms)
+
+    logging.info('Terminating coordinator.')
+    raise CoordinatorShutdownException()
+
+
+class ShutdownLameWorkers(object):
+  """Shutdown lamed workers.
+
+  Processing will continue normally (typically by waiting for the down
+  workers to be restarted).
+  """
+
+  def __init__(self, timeout_ms=10000):
+    self.timeout_in_ms = timeout_ms
+
+  def __call__(self, run_context, all_workers, lame_workers):
+    lame_workers.shutdown(timeout_ms=self.timeout_in_ms)
diff --git a/tensorflow/python/tpu/tensor_tracer.py b/tensorflow/python/tpu/tensor_tracer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d41d8768a3acbfd3521e04967b59e356c0f2943
--- /dev/null
+++ b/tensorflow/python/tpu/tensor_tracer.py
@@ -0,0 +1,1638 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========================================================================
+"""A utility to trace tensor values on TPU."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import os.path
+import re
+import sys
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import graph_io
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu.ops import tpu_ops
+
+_TRACER_LOG_PREFIX = ' [>>>TT>>>]'
+_DEVICE_TYPE_TPU = 'tpu'
+_DEVICE_TYPE_CPU = 'cpu'
+_TRACE_MODE_NAN_INF = 'nan-inf'
+_TRACE_MODE_PART_TENSOR = 'part-tensor'
+_TRACE_MODE_PART_TENSOR_SIZE = 3
+_TRACE_MODE_FULL_TENSOR = 'full-tensor'
+_TRACE_MODE_NORM = 'norm'
+_TRACE_MODE_MAX_ABS = 'max-abs'
+_SUBMODE_BRIEF = 'brief'
+_SUBMODE_DETAILED = 'detailed'
+_REASON_OUTSIDE_OP_RANGE = 'not-traced-outside-op-range'
+_REASON_UNSAFE_OP = 'not-traced-unsafe-op'
+_REASON_WHILELOOP_OP = 'not-traced-special-whileloop-op'
+_REASON_UNSAFE_SCALAR = 'not-traced-unsafe-scalar'
+_REASON_LESS_INTERESTING_OP = 'not-traced-less-interesting-op'
+_REASON_DEVICE_MISMATCH = 'not-traced-device-mismatch'
+_REASON_DYNAMIC_SHAPE = 'not-traced-dynamic-shape'
+_REASON_SCALAR_GET_TRACED = 'traced-scalar'
+_REASON_TENSOR_GET_TRACED = 'traced-tensor'
+_REASON_USER_INCLUDED = 'traced-user-included'
+_REASON_USER_EXCLUDED = 'not-traced-user-excluded'
+_REASON_NOT_EXECUTED = 'not-traced-not-in-exec-path'
+_REASON_NON_NUMERIC_TENSOR = 'not-traced-non-numeric-tensor'
+_REASON_FEEDS_WHILELOOP_OP = 'not-traced-feeds-special-whileloop-op'
+_MARKER_SECTION_BEGIN = '!!!!!!! section-begin:'
+_MARKER_SECTION_END = '!!!!!!! section-end:'
+_SECTION_NAME_CONFIG = 'configuration'
+_SECTION_NAME_REASON = 'reason'
+_SECTION_NAME_OP_LIST = 'op-list'
+_SECTION_NAME_TENSOR_LIST = 'tensor-list'
+_SECTION_NAME_CACHE_INDEX_MAP = 'cache-index-map'
+_SECTION_NAME_GRAPH = 'graph'
+_FIELD_NAME_VERSION = 'version:'
+_FIELD_NAME_DEVICE = 'device:'
+_FIELD_NAME_TRACE_MODE = 'trace-mode:'
+_FIELD_NAME_SUBMODE = 'submode:'
+_FIELD_NAME_NUM_REPLICAS = 'num-replicas:'
+_FIELD_NAME_NUM_REPLICAS_PER_HOST = 'num-replicas-per-host:'
+_FIELD_NAME_NUM_HOSTS = 'num-hosts:'
+_FIELD_NAME_NUM_OPS = 'number-of-ops:'
+_FIELD_NAME_NUM_TENSORS = 'number-of-tensors:'
+_FIELD_NAME_NUM_CACHE_INDICES = 'number-of-indices:'
+_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED = 'topological-sort-succeed:'
+_FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS'
+_FLAG_SINGLE_QUOTE_PAT = re.compile(r"\s*--([^=]+)='([^']*)'")
+_FLAG_DOUBLE_QUOTE_PAT = re.compile(r'\s*--([^=]+)="([^"]*)"')
+_FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)')
+_FLAG_NO_EQUAL_PAT = re.compile(r'\s*--([^=]+)\s*')
+_FLAG_NAME_ENABLE = 'enable'
+_FLAG_NAME_TRACE_MODE = 'trace_mode'
+_FLAG_NAME_USE_COMPACT_TRACE = 'compact_trace'
+_FLAG_NAME_SUBMODE = 'submode'
+_FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS = 'include_less_interesting_ops'
+_FLAG_NAME_EXCLUDED_OPNAMES = 'excluded_opnames'
+_FLAG_NAME_EXCLUDED_OPTYPES = 'excluded_optypes'
+_FLAG_NAME_INCLUDED_OPNAMES = 'included_opnames'
+_FLAG_NAME_INCLUDED_OPTYPES = 'included_optypes'
+_FLAG_NAME_TRACE_DIR = 'trace_dir'
+_FLAG_NAME_REPORT_FILE = 'report_file'
+_FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR = 'use_test_undeclared_outputs_dir'
+_FLAG_NAME_OP_RANGE = 'op_range'
+# Folder to dump the pre (before tensor tracer updates) and post graphs (after
+# tensor tracer updates).
+_FLAG_DUMP_BEFORE_AFTER_GRAPHS = 'dump_graphs'
+_OP_RANGE_PAT = re.compile(r'(\d+):(\d+)')
+_OUTPUT_STREAM_ESCAPE = 'file://'
+_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR = 'TEST_UNDECLARED_OUTPUTS_DIR'
+_TENSOR_TRACER_COLLECTION = 'tensor_tracer_variables'
+_TENSOR_TRACER_CHECKPOINT = 'tensor_tracer_checkpoint'
+_TRACE_FILE_NAME = 'trace.all'
+_COMPACT_TRACE_FILE_PREFIX = 'compact_trace.'
+_COMPACT_TRACE_ENTRY_INIT_VALUE = -1.0
+_TENSOR_TRACER_STORAGE = 'tensor_tracer_storage'
+_TENSOR_VALUES_CACHE = 'tensor_values_cache'
+_REPLICA_ID_TAG = '#replica-id: '
+
+
+def tensor_tracepoint(tensor, checkpoint_name):
+  """Adds a checkpoint with the given checkpoint name for the given tensor.
+
+  The tensor will be added to the list of tensors that will be traced by the
+  tensor tracer.
+
+  Args:
+     tensor: the tensor object for which the tracing is requested.
+     checkpoint_name: a string name for the checkpoint. This name has to be a
+     unique name if used within model comparison. The tensors that have the same
+     checkpoint identifier is compared in model comparison.
+  Returns:
+    The provided tensor.
+  """
+
+  tensor.graph.get_collection(_TENSOR_TRACER_COLLECTION)
+  tensor.graph.add_to_collection(_TENSOR_TRACER_COLLECTION,
+                                 (tensor, checkpoint_name))
+  return tensor
+
+
+def keras_layer_tracepoint(layer, checkpoint_name):
+  """An interface for adding the tensor outputs of a keras layer.
+
+  Encapsulates tensor_tracepoint.
+
+  Args:
+     layer: A keras layer.
+     checkpoint_name: a string name for the checkpoint. This name has to be a
+     unique name if used within model comparison. The tensors that have the same
+     checkpoint identifier is compared in model comparison.
+
+  Returns:
+    The provided layer.
+  """
+  try:
+    outputs = layer.output
+    if tensor_util.is_tensor(outputs):
+      tensor_tracepoint(outputs, '%s' % (checkpoint_name))
+    else:
+      idx = 0
+      for output_tensor in outputs:
+        if tensor_util.is_tensor(outputs):
+          tensor_tracepoint(output_tensor, '%s_%d' % (checkpoint_name, idx))
+        idx += 1
+  except AttributeError:
+    pass
+  except RuntimeError:
+    pass
+  return layer
+
+
+def _trace_files_need_precreated(output_dir):
+  """Return True if trace files must be pre-created by users."""
+
+  if not output_dir.startswith('/'):
+    return False
+  if len(output_dir) < 5:
+    return False
+  if output_dir[2] != 'n':
+    return False
+  if output_dir[3] != 's':
+    return False
+  if output_dir[1] != 'c':
+    return False
+  if output_dir[4] != '/':
+    return False
+  return True
+
+
+def _get_tensor_values_cache(graph=None):
+  """Returns the variable that implements tensor-value caching."""
+
+  graph = graph or ops.get_default_graph()
+  collection = graph.get_collection(_TENSOR_TRACER_STORAGE)
+  if len(collection) == 1:
+    return collection[0]
+  elif not collection:
+    raise RuntimeError('%s has not been created'%_TENSOR_VALUES_CACHE)
+  else:
+    raise RuntimeError('Multiple %s created'%_TENSOR_VALUES_CACHE)
+  return None
+
+
+def _create_tensor_values_cache(graph, num_tensors):
+  """Creates a variable as the cache to store intermediate tensor values."""
+  graph = graph or ops.get_default_graph()
+  # Create in proper graph and base name_scope.
+  with graph.as_default() as g, g.name_scope(None):
+    return variable_scope.get_variable(
+        _TENSOR_VALUES_CACHE,
+        shape=[num_tensors],
+        dtype=dtypes.float32,
+        initializer=init_ops.constant_initializer(
+            _COMPACT_TRACE_ENTRY_INIT_VALUE),
+        trainable=False,
+        use_resource=True,
+        collections=[_TENSOR_TRACER_STORAGE, ops.GraphKeys.GLOBAL_VARIABLES])
+
+
+class TensorTracer(object):
+  """A software construct for tracing tensor values in a TF graph on TPU.
+
+  This utility is disabled by default. It can be enabled by setting
+  the TENSOR_TRACER_FLAGS env variable as:
+    export TENSOR_TRACER_FLAGS="--enable=1"
+  If it is enabled, it will trace the output tensor values of
+  selected Ops in the graph. It has two outputs: (1) the traces and (2)
+  a report. The traces are dumped to a specified local file on the TPU
+  host. The report is printed to the log.info of the TPU job.
+  By passing options via the env variable, users can change:
+     (1) the trace mode (e.g., detecting NaN/Inf, printing partial or
+         full tensor values)
+     (2) which Ops to be traced (via op.name or op.type)
+     (3) output trace file path.
+  """
+  # The set of graphs that are rewritten by tensor tracer.
+  _traced_graphs = set()
+  @staticmethod
+  def _match_next_flag(flags, pos):
+    """Returns the match for the next TensorTracer flag.
+
+    Args:
+       flags: a string that contains the flags.
+       pos: where in flags to start the search.
+
+    Returns:
+       A pair where the first element is the regular-expression
+       match found and the second element indicates if the match
+       has a value.
+    """
+
+    match = _FLAG_DOUBLE_QUOTE_PAT.match(flags, pos)
+    if match:
+      return match, True
+    match = _FLAG_SINGLE_QUOTE_PAT.match(flags, pos)
+    if match:
+      return match, True
+    match = _FLAG_NO_QUOTE_PAT.match(flags, pos)
+    if match:
+      return match, True
+    match = _FLAG_NO_EQUAL_PAT.match(flags, pos)
+    if match:
+      # The flag is found but is not given a value.
+      return match, False
+    # The flag is not found.
+    return None, False
+
+  @staticmethod
+  def validate_flag_names():
+    """Validates if the TensorTrace flags passed are valid."""
+    valid_flag_names = [_FLAG_NAME_ENABLE, _FLAG_NAME_TRACE_MODE,
+                        _FLAG_NAME_USE_COMPACT_TRACE,
+                        _FLAG_NAME_SUBMODE,
+                        _FLAG_NAME_EXCLUDED_OPNAMES,
+                        _FLAG_NAME_EXCLUDED_OPTYPES,
+                        _FLAG_NAME_INCLUDED_OPNAMES,
+                        _FLAG_NAME_INCLUDED_OPTYPES,
+                        _FLAG_NAME_TRACE_DIR,
+                        _FLAG_NAME_REPORT_FILE,
+                        _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR,
+                        _FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS,
+                        _FLAG_NAME_OP_RANGE,
+                        _FLAG_DUMP_BEFORE_AFTER_GRAPHS]
+    tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR)
+    if not tensor_tracer_flags:
+      return
+    pos = 0
+    while True:
+      match, _ = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
+      if not match:
+        break
+      flag_name = match.group(1)
+      if flag_name not in valid_flag_names:
+        raise ValueError(
+            'The flag name "%s" passed via the environment variable "%s" '
+            'is invalid. Valid flag names are:'
+            '\n%s'%(flag_name, _FLAGS_ENV_VAR, valid_flag_names))
+      pos = match.end()
+
+  @staticmethod
+  def print_flag_values():
+    """Prints all TensorTracer flags passed via environment variables."""
+
+    tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR)
+    if not tensor_tracer_flags:
+      return 'Env variable "%s" is not set'%_FLAGS_ENV_VAR
+    result = 'Env variable "%s" is set to "%s"\n'%(_FLAGS_ENV_VAR,
+                                                   tensor_tracer_flags)
+    result += 'Individual flag value:\n'
+    pos = 0
+    while True:
+      match, has_value = TensorTracer._match_next_flag(
+          tensor_tracer_flags, pos)
+      if not match:
+        break
+      flag_name = match.group(1)
+      if has_value:
+        flag_value = match.group(2)
+      else:
+        flag_value = None
+      result += '  %s: %s\n'%(flag_name, flag_value)
+      pos = match.end()
+    result += '\n'
+    return result
+
+  @staticmethod
+  def get_flag_value(wanted_flag_name):
+    """Returns the value of a TensorTracer flags.
+
+    Args:
+      wanted_flag_name: the name the the flag we are looking for.
+
+    Returns:
+      A pair where the first element indicates if the flag is
+      found and the second element is the value of the flag.
+
+    Raises:
+      RuntimeError: If supposedly deadcode is reached.
+    """
+
+    tensor_tracer_flags = os.getenv(_FLAGS_ENV_VAR)
+    if not tensor_tracer_flags:
+      return False, None
+    pos = 0
+    while True:
+      match, has_value = TensorTracer._match_next_flag(
+          tensor_tracer_flags, pos)
+      if not match:
+        return False, None
+      flag_name = match.group(1)
+      if has_value:
+        flag_value = match.group(2)
+      else:
+        flag_value = None
+      if flag_name == wanted_flag_name:
+        return True, flag_value
+      pos = match.end()
+    raise RuntimeError('Should not reach here.')
+
+  @staticmethod
+  def flag_value_to_re_list(flag_name):
+    """Converts list of strings to compiled RE."""
+
+    re_list = []
+    found, flag_value = TensorTracer.get_flag_value(flag_name)
+    if not found or not flag_value:
+      return re_list
+    list_of_values = flag_value.split()
+    for v in list_of_values:
+      r = re.compile(v)
+      re_list.append(r)
+    return re_list
+
+  @staticmethod
+  def _is_flag_on(flag_name):
+    """Returns True if the given flag is on."""
+
+    found, flag_value = TensorTracer.get_flag_value(flag_name)
+    if not found:
+      return False
+    if flag_value is None:
+      return True
+    # Depends on the flag value.
+    flag_value = flag_value.lower()
+    enabled = flag_value in ['1', 't', 'true', 'y', 'yes']
+    return enabled
+
+  @staticmethod
+  def is_enabled():
+    """Returns True if TensorTracer is enabled."""
+
+    return TensorTracer._is_flag_on(_FLAG_NAME_ENABLE)
+
+  @staticmethod
+  def use_test_undeclared_outputs_dir():
+    """Decides the output directory of the report and trace files.
+
+    Args:
+       None.
+
+    Returns:
+       True if the output files should be written to the
+       test-undeclared-outputs-directory defined via an
+       env variable.
+    """
+
+    return TensorTracer._is_flag_on(
+        _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR)
+
+  @staticmethod
+  def use_compact_trace():
+    return TensorTracer._is_flag_on(
+        _FLAG_NAME_USE_COMPACT_TRACE)
+
+  @staticmethod
+  def check_device_type(device_type):
+    """Checks if the given device type is valid."""
+
+    if device_type not in [_DEVICE_TYPE_TPU, _DEVICE_TYPE_CPU]:
+      raise ValueError('Invalid device_type "%s"'%device_type)
+
+  @staticmethod
+  def check_trace_mode(trace_mode):
+    """Checks if the given trace mode is valid."""
+
+    valid_trace_modes = [_TRACE_MODE_NAN_INF, _TRACE_MODE_PART_TENSOR,
+                         _TRACE_MODE_FULL_TENSOR, _TRACE_MODE_NORM,
+                         _TRACE_MODE_MAX_ABS]
+    if trace_mode not in valid_trace_modes:
+      raise ValueError('Invalid trace mode "%s" given to the Tensor_Tracer.'
+                       'Valid trace modes are: %s'%(trace_mode,
+                                                    valid_trace_modes))
+
+  @staticmethod
+  def check_submode(submode):
+    """Checks if the given submode is valid."""
+
+    if not submode:
+      return
+    valid_submodes = [_SUBMODE_DETAILED, _SUBMODE_BRIEF]
+    if submode not in valid_submodes:
+      raise ValueError('Invalid submode "%s" given to the Tensor_Tracer.'
+                       'Valid submodes are: %s'%(submode,
+                                                 valid_submodes))
+
+  @staticmethod
+  def loop_cond_op(op):
+    return op.type in ('LoopCond', 'RefLoopCond')
+
+  @staticmethod
+  def while_loop_op(op):
+    """Returns true if op is one of the special ops of in a while loop.
+
+    Args:
+       op: A tf.Operation.
+
+    Returns:
+       True if the given op is one of [Switch, Merge, Enter, Exit,
+       NextIteration, LoopCond], which are all building blocks for TF while
+       loops.
+    """
+    return  (control_flow_util.IsLoopSwitch(op) or
+             control_flow_util.IsLoopMerge(op) or
+             control_flow_util.IsLoopEnter(op) or
+             control_flow_util.IsLoopExit(op) or
+             TensorTracer.loop_cond_op(op) or
+             op.type in ('RefNextIteration', 'NextIteration'))
+
+  @staticmethod
+  def unsafe_op(op):
+    """Returns True if this op is not safe to be traced."""
+
+    if control_flow_util.IsInCond(op):
+      return True
+    # Reasons for not including following op types:
+    #    Assign: cause incorrect result with CPU tracing.
+    if op.type in ['Assign']:
+      return True
+    return False
+
+  @staticmethod
+  def device_mismatch(device_type, op):
+    if device_type == _DEVICE_TYPE_TPU:
+      # pylint: disable=protected-access
+      return tpu._TPU_REPLICATE_ATTR not in op.node_def.attr
+      # pylint: enable=protected-access
+    return False
+
+  @staticmethod
+  def unsafe_scalar_trace(op):
+    """Return true if scalar output tensor from Op is not safe to be traced."""
+
+    # Tracing the following causes cycle in the graph on TPU.
+    if op.type in ['LoopCond', 'Enter', 'Merge', 'Const',
+                   'Switch', 'Less', 'ReadVariableOp']:
+      return True
+    # Tracing the following will cause casting-issue
+    # with the norm tracing mode or other compilation issues on CPU.
+    if op.type in ['VarHandleOp', 'IteratorToStringHandle',
+                   'IteratorGetNext', 'OneShotIterator',
+                   'IteratorV2', 'MakeIterator',
+                   'BatchDatasetV2', 'MapDataset',
+                   'FixedLengthRecordDataset', 'TakeDataset', 'ZipDataset',
+                   'Placeholder', 'PlaceholderWithDefault', 'StridedSlice']:
+      return True
+    return False
+
+  @staticmethod
+  def less_interesting_op(op):
+    """Returns True if the given Op is not an interesting one to be traced."""
+
+    found, _ = TensorTracer.get_flag_value(
+        _FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS)
+    if found:
+      # users force to include all ops.
+      return False
+    # Following ops are highly unlikey to cause bugs.
+    return op.type in ['Const', 'Identity', 'Cast', 'Shape']
+
+  @staticmethod
+  def reason(op_idx, details):
+    """Returns reason why the Op at op_idx is traced or not."""
+
+    return '%d %s'%(op_idx, details)
+
+  @staticmethod
+  def topological_sort(g):
+    """Performs topological sort on the given graph.
+
+    Args:
+       g: the graph.
+
+    Returns:
+       A pair where the first element indicates if the topological
+       sort succeeded (True if there is no cycle found; False if a
+       cycle is found) and the second element is either the sorted
+       list of nodes or the cycle of nodes found.
+    """
+
+    def visit(op, cycle, permanently_marked_ops,
+              temporarily_marked_ops, sorted_ops):
+      """Recursively visits all Ops in a graph.
+
+      Args:
+         op: the current Op being visited.
+         cycle: a cycle of Ops found.
+         permanently_marked_ops: the set of Ops that were already visited.
+         temporarily_marked_ops: the set of Ops that we have visited during
+                                 the current descent.
+         sorted_ops: the list of Ops sorted in topological order.
+      """
+
+      if cycle:
+        return
+      if op in permanently_marked_ops:
+        return
+      if op in temporarily_marked_ops:
+        cycle = temporarily_marked_ops
+        return
+      temporarily_marked_ops.add(op)
+      for i in range(len(op.outputs)):
+        out_tensor = op.outputs[i]
+        for consumer_op in out_tensor.consumers():
+          visit(consumer_op, cycle, permanently_marked_ops,
+                temporarily_marked_ops, sorted_ops)
+      # pylint: disable=protected-access
+      for ctrl_output_op in op._control_outputs:
+        # pylint: enable=protected-access
+        visit(ctrl_output_op, cycle, permanently_marked_ops,
+              temporarily_marked_ops, sorted_ops)
+      temporarily_marked_ops.remove(op)
+      permanently_marked_ops.add(op)
+      sorted_ops.insert(0, op)
+
+    graph_cycle = set([])
+    sorted_ops = []
+    permanently_marked_ops = set([])
+    temporarily_marked_ops = set([])
+    unsorted_ops = g.get_operations()
+    for op in unsorted_ops:
+      visit(op, graph_cycle, permanently_marked_ops,
+            temporarily_marked_ops, sorted_ops)
+    if graph_cycle:
+      return (False, graph_cycle)
+    else:
+      assert len(unsorted_ops) == len(sorted_ops)
+      return (True, sorted_ops)
+
+  @staticmethod
+  def _make_op_and_tensor_maps(op_list):
+    """Creates various maps and lists from op_list.
+
+    Args:
+       op_list: a list of Ops
+
+    Returns:
+       opname_idx_map: a map from Op's name to its index in op_list.
+       tensor_list: a list of output tensors of the Ops in op_list.
+       tensorname_idx_map: a map from output tensor name to its index
+                           in tensor_list.
+    """
+
+    opname_idx_map = {}
+    tensor_list = []
+    tensorname_idx_map = {}
+    for op_id, op in enumerate(op_list):
+      if op.name in opname_idx_map:
+        raise ValueError('Duplicated Op name: %s'%op.name)
+      opname_idx_map[op.name] = op_id
+      for output_tensor in op.outputs:
+        if output_tensor.name not in tensorname_idx_map:
+          tensor_list.append(output_tensor)
+          tensorname_idx_map[output_tensor.name] = len(tensor_list)-1
+    return (opname_idx_map, tensor_list, tensorname_idx_map)
+
+  def __init__(self):
+    """Initializes a TensorTracer.
+
+    Sets the various member fields from the flags (if given) or the defaults.
+    """
+    self._version = 'use-outside-compilation'
+    self._device_type = None
+    TensorTracer.validate_flag_names()
+    found, self._trace_mode = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_MODE)
+    if not found or not self._trace_mode:
+      self._trace_mode = _TRACE_MODE_NAN_INF
+    TensorTracer.check_trace_mode(self._trace_mode)
+    found, self._submode = TensorTracer.get_flag_value(_FLAG_NAME_SUBMODE)
+    if not found or not self._submode:
+      self._submode = _SUBMODE_DETAILED
+    TensorTracer.check_submode(self._submode)
+    self._part_tensor_size = _TRACE_MODE_PART_TENSOR_SIZE
+    self._instrument_records = {}
+    self._set_trace_dir()
+    self._set_report_file()
+    self._set_op_range()
+    self._set_excluded_opnames()
+    self._set_excluded_optypes()
+    self._set_included_opnames()
+    self._set_included_optypes()
+    self._num_replicas = None
+    self._num_replicas_per_host = None
+    self._num_hosts = None
+    self._replica_id = None
+    _, self._graph_dump_path = TensorTracer.get_flag_value(
+        _FLAG_DUMP_BEFORE_AFTER_GRAPHS)
+
+  def _add_replica_id_to_graph(self):
+    """Adds nodes for computing the replica ID to the graph."""
+
+    if self._num_replicas:
+      with ops.control_dependencies(None):
+        # Uses None as dependency to run outside of TPU graph rewrites.
+        self._replica_id = tpu_ops.tpu_replicated_input(
+            list(range(self._num_replicas)),
+            name='tt_replica_id')
+    else:
+      self._replica_id = 'unknown'
+
+  def _set_trace_dir(self):
+    found, self._trace_dir = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_DIR)
+    if found and self._trace_dir \
+       and TensorTracer.use_test_undeclared_outputs_dir():
+      raise ValueError('Cannot not use --%s and --%s at the same time'
+                       %(_FLAG_NAME_TRACE_DIR,
+                         _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR))
+    if TensorTracer.use_test_undeclared_outputs_dir():
+      self._trace_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR)
+
+  def _set_report_file(self):
+    """Sets the path of the output report file."""
+
+    found, self._report_file_path = TensorTracer.get_flag_value(
+        _FLAG_NAME_REPORT_FILE)
+    if found and self._report_file_path \
+       and TensorTracer.use_test_undeclared_outputs_dir():
+      if os.path.isabs(self._report_file_path):
+        raise ValueError('If use_test_undeclared_outputs_dir is set,'
+                         'report_file_path cannot be an absolute path (%s)'
+                         %self._report_file_path)
+      outputs_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR)
+      self._report_file_path = os.path.join(outputs_dir,
+                                            self._report_file_path)
+    if not self._report_file_path:
+      self._report_file = None
+      return
+    try:
+      self._report_file = gfile.Open(self._report_file_path, 'w')
+    except IOError as e:
+      raise e
+
+  def _close_report_file(self):
+    if self._report_file:
+      self._report_file.close()
+
+  def _set_op_range(self):
+    """Sets the index range of the Ops that we will consider tracing."""
+
+    found, op_range = TensorTracer.get_flag_value(_FLAG_NAME_OP_RANGE)
+    if not found or not op_range:
+      self._op_range = (-1, -1)  # this means including all ops.
+      return
+    match = _OP_RANGE_PAT.match(op_range)
+    if not match:
+      self._op_range = (-1, -1)  # this means including all ops.
+      return
+    self._op_range = (int(match.group(1)), int(match.group(2)))
+
+  def _inside_op_range(self, idx):
+    """Return True if the given index is inside the selected range."""
+
+    if idx < self._op_range[0]:
+      return False
+    return self._op_range[1] < 0 or idx <= self._op_range[1]
+
+  def _set_excluded_opnames(self):
+    self._excluded_opname_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_EXCLUDED_OPNAMES)
+
+  def _set_excluded_optypes(self):
+    self._excluded_optype_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_EXCLUDED_OPTYPES)
+
+  def _set_included_opnames(self):
+    self._included_opname_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_INCLUDED_OPNAMES)
+
+  def _set_included_optypes(self):
+    self._included_optype_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_INCLUDED_OPTYPES)
+
+  def _is_user_included_op(self, op):
+    for opname_re in self._included_opname_re_list:
+      if opname_re.match(op.name):
+        return True
+    for optype_re in self._included_optype_re_list:
+      if optype_re.match(op.type):
+        return True
+    return False
+
+  def _is_user_excluded_op(self, op):
+    for opname_re in self._excluded_opname_re_list:
+      if opname_re.match(op.name):
+        return True
+    for optype_re in self._excluded_optype_re_list:
+      if optype_re.match(op.type):
+        return True
+    return False
+
+  def _use_tensor_values_cache(self):
+    """Returns True if immediate tensors should be first saved to a cache."""
+
+    if self._trace_mode not in set([_TRACE_MODE_NAN_INF,
+                                    _TRACE_MODE_NORM, _TRACE_MODE_MAX_ABS]):
+      return False
+    if self._trace_dir and _trace_files_need_precreated(self._trace_dir):
+      return True
+    if TensorTracer.use_compact_trace():
+      return True
+    return False
+
+  def _save_tensor_value_to_cache_op(self, graph, cache_idx, updates):
+    """Returns an Op that will save the given updates to an entry in the cache."""
+
+    cache = _get_tensor_values_cache(graph)
+    indices = constant_op.constant([cache_idx])
+    return state_ops.scatter_update(cache, indices, updates).op
+
+  def _write_report(self, content):
+    """Writes the given content to the report."""
+
+    line = '%s %s'%(_TRACER_LOG_PREFIX, content)
+    if self._report_file:
+      self._report_file.write(line)
+    else:
+      logging.info(line)
+
+  def _write_config_section(self):
+    """Writes the config section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_CONFIG))
+    self._write_report('%s %s\n'%(_FIELD_NAME_VERSION, self._version))
+    self._write_report('%s %s\n'%(_FIELD_NAME_DEVICE, self._device_type))
+    self._write_report('%s %s\n'%(_FIELD_NAME_TRACE_MODE, self._trace_mode))
+    self._write_report('%s %s\n'%(_FIELD_NAME_SUBMODE, self._submode))
+    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS, self._num_replicas))
+    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS_PER_HOST,
+                                  self._num_replicas_per_host))
+    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_HOSTS, self._num_hosts))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_CONFIG))
+
+  def _write_reason_section(self):
+    """Writes the reason section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_REASON))
+    for key in sorted(self._instrument_records):
+      self._write_report('"%s" %s\n'%(key, self._instrument_records[key]))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_REASON))
+
+  def _write_op_list_section(self, op_list):
+    """Writes the Op-list section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_OP_LIST))
+    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_OPS, len(op_list)))
+    for i in range(0, len(op_list)):
+      op = op_list[i]
+      line = '%d "%s" %s'%(i, op.name, op.type)
+      for out_tensor in op.outputs:
+        if out_tensor.name not in self._tensorname_idx_map:
+          raise ValueError(
+              'out_tensor %s is not in tensorname_idx_map'%out_tensor.name)
+        line += ' %d'%self._tensorname_idx_map[out_tensor.name]
+      line += '\n'
+      self._write_report(line)
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_OP_LIST))
+
+  def _write_tensor_list_section(self, tensor_list, opname_idx_map):
+    """Writes the tensor-list section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
+                                  _SECTION_NAME_TENSOR_LIST))
+    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_TENSORS, len(tensor_list)))
+    for i in range(0, len(tensor_list)):
+      tensor = tensor_list[i]
+      line = '%d "%s"'%(i, tensor.name)
+      for consumer_op in tensor.consumers():
+        if consumer_op.name not in opname_idx_map:
+          raise ValueError(
+              'consumer_op %s is not in opname_idx_map'%consumer_op.name)
+        line += ' %d'%opname_idx_map[consumer_op.name]
+      line += '\n'
+      self._write_report(line)
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
+                                  _SECTION_NAME_TENSOR_LIST))
+
+  def _write_cache_index_map_section(self):
+    """Writes the mapping from cache index to tensor index to the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
+                                  _SECTION_NAME_CACHE_INDEX_MAP))
+    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_CACHE_INDICES,
+                                  len(self._cache_idx_to_tensor_idx)))
+    for cache_idx in range(0, len(self._cache_idx_to_tensor_idx)):
+      tensor_idx = self._cache_idx_to_tensor_idx[cache_idx]
+      line = '%d %d\n'%(cache_idx, tensor_idx)
+      self._write_report(line)
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
+                                  _SECTION_NAME_CACHE_INDEX_MAP))
+
+  def _write_graph_section(self, succeed, sorted_or_cycle):
+    """Writes the graph section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_GRAPH))
+    self._write_report('%s %s\n'%(_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED,
+                                  succeed))
+    l = list(sorted_or_cycle)
+    for i in range(0, len(l)):
+      self._write_report('%d "%s"\n'%(i, l[i].name))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_GRAPH))
+
+  def _preprocess_traced_tensor(self, tensor):
+    """Computes NAN/Norm/Max on TPUs before sending to CPU.
+
+    Args:
+      tensor: The tensor to be traced.
+    Returns:
+      A tensor that should be input to the trace_function.
+    Raises:
+      RuntimeError: If the trace mode is invalid.
+    """
+
+    def _detect_nan_inf(tensor):
+      """Trace function for detecting any NaN/Inf in the tensor."""
+
+      if tensor.dtype.is_floating:
+        mask = math_ops.reduce_any(
+            gen_math_ops.logical_or(
+                gen_math_ops.is_nan(tensor), gen_math_ops.is_inf(tensor)))
+        output_tensor = control_flow_ops.cond(mask,
+                                              lambda: constant_op.constant(1.0),
+                                              lambda: constant_op.constant(0.0))
+      else:
+        output_tensor = constant_op.constant(0.0)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    def _show_norm(tensor):
+      tensor = math_ops.cast(tensor, dtypes.float32)
+      output_tensor = linalg_ops.norm(tensor)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    def _show_max_abs(tensor):
+      tensor = math_ops.cast(tensor, dtypes.float32)
+      output_tensor = math_ops.reduce_max(math_ops.abs(tensor))
+      zero = constant_op.constant(0, dtypes.float32)
+      output_tensor = gen_math_ops.maximum(zero, output_tensor)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    if self._trace_mode == _TRACE_MODE_NAN_INF:
+      return _detect_nan_inf(tensor)
+    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
+      return tensor
+    if self._trace_mode == _TRACE_MODE_FULL_TENSOR:
+      return tensor
+    if self._trace_mode == _TRACE_MODE_NORM:
+      return _show_norm(tensor)
+    if self._trace_mode == _TRACE_MODE_MAX_ABS:
+      return _show_max_abs(tensor)
+    raise RuntimeError(
+        'Tensor trace fun for %s is not yet implemented' % self._trace_mode)
+
+  def _make_tensor_trace_fun(self, tensor_name):
+    """Makes the tensor tracing function called by outside compilation.
+
+    Args:
+      tensor_name: name of the tensor being traced.
+
+    Returns:
+      A function to be passed as the first argument to outside compilation.
+
+    Raises:
+      RuntimeError: If the trace mode is invalid.
+    """
+
+    def _print_tensor(tensor_name, num_elements, tensor, output_tensor):
+      """Prints a tensor value to a file.
+
+      Args:
+        tensor_name: name of the tensor being traced.
+        num_elements: number of elements to print (-1 means print all).
+        tensor: the tensor needs to be returned.
+        output_tensor: the tensor needs to be printed.
+
+      Returns:
+        The same tensor passed via the "tensor" argument.
+
+      Raises:
+        ValueError: If tensor_name is not already in
+                    self._tensorname_idx_map.
+      """
+
+      if self._submode == _SUBMODE_BRIEF:
+        if tensor_name not in self._tensorname_idx_map:
+          raise ValueError(
+              'Tensor name %s is not in the tensorname_idx_map'%tensor_name)
+        msg = '%d'%self._tensorname_idx_map[tensor_name]
+      else:
+        msg = '"%s"'%tensor_name
+
+      if self._trace_dir:
+        output_path = os.path.join(self._trace_dir, _TRACE_FILE_NAME)
+        output_stream = _OUTPUT_STREAM_ESCAPE + output_path
+      else:
+        output_stream = sys.stderr
+      return logging_ops.print_v2(msg, array_ops.shape(output_tensor),
+                                  '@', self._replica_id,
+                                  '\n', output_tensor, '\n',
+                                  summarize=num_elements,
+                                  output_stream=output_stream)
+
+    def _show_part_tensor(tensor):
+      """Trace function for printing part of the tensor."""
+
+      return _print_tensor(tensor_name, self._part_tensor_size,
+                           tensor, tensor)
+
+    def _show_full_tensor(tensor):
+      """Trace function for printing the entire tensor."""
+
+      return _print_tensor(tensor_name, -1, tensor, tensor)
+
+    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
+      return _show_part_tensor
+    # The input tensor has a shape of "[1]" for _TRACE_MODE_NAN_INF,
+    # _TRACE_MODE_NORM, and _TRACE_MODE_MAX_ABS, as related computations are
+    # performed within TPUs and only their results are transferred to CPU.
+    # Simply, print the full tensor for these trace modes.
+    if self._trace_mode in [
+        _TRACE_MODE_NAN_INF, _TRACE_MODE_NORM, _TRACE_MODE_FULL_TENSOR,
+        _TRACE_MODE_MAX_ABS
+    ]:
+      return _show_full_tensor
+
+    raise RuntimeError('Tensor trace fun for %s is not yet implemented'
+                       %self._trace_mode)
+
+  def _skip_op(self, op_id, op, user_included, user_excluded,
+               in_exec_path=True):
+    """Returns True if we should not trace Op."""
+
+    if TensorTracer.while_loop_op(op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_WHILELOOP_OP)
+      return True
+    if TensorTracer.unsafe_op(op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_UNSAFE_OP)
+      return True
+    if TensorTracer.device_mismatch(self._device_type, op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_DEVICE_MISMATCH)
+      return True
+    if not in_exec_path:
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_NOT_EXECUTED)
+      return True
+
+    if not self._inside_op_range(op_id):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_OUTSIDE_OP_RANGE)
+      return True
+    if TensorTracer.less_interesting_op(op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_LESS_INTERESTING_OP)
+      return True
+    if user_included:
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_USER_INCLUDED)
+      return False
+    if user_excluded:
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_USER_EXCLUDED)
+      return True
+    return False
+
+  def _skip_tensor(self, op_id, out_tensor, user_included,
+                   user_excluded):
+    """Returns True if we should not trace out_tensor."""
+
+    # Skips a tensor if the tensor has a non-numeric type.
+    #   Note: we cannot use check_ops.is_numeric_tensor(out_tensor)
+    #         because it also excludes tensors with dtypes, bool, and
+    #         float32_ref, which we actually want to trace.
+    non_numeric_tensor_types = set([dtypes.variant, dtypes.resource,
+                                    dtypes.string])
+    if out_tensor.dtype in non_numeric_tensor_types:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_NON_NUMERIC_TENSOR)
+      return True
+    # Skip a tensor if it feeds a special while loop op.
+    if [consumer for consumer in out_tensor.consumers() if
+        TensorTracer.while_loop_op(consumer)]:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_FEEDS_WHILELOOP_OP)
+      return True
+    if user_included:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_USER_INCLUDED)
+      return False
+    if user_excluded:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_USER_EXCLUDED)
+      return True
+    if not out_tensor.get_shape().is_fully_defined():
+      # If trace mode is nan-inf, norm or max, then the tensor will be reduced
+      # to a scalar before the outside compilation call.
+      if self._trace_mode in [
+          _TRACE_MODE_NAN_INF, _TRACE_MODE_NORM, _TRACE_MODE_MAX_ABS
+      ]:
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_TENSOR_GET_TRACED)
+        return False
+      else:
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_DYNAMIC_SHAPE)
+        return True
+    rank = len(out_tensor.shape)
+    if rank < 1:
+      # scalar
+      if TensorTracer.unsafe_scalar_trace(out_tensor.op):
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_UNSAFE_SCALAR)
+        return True
+      else:
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_SCALAR_GET_TRACED)
+        return False
+    else:
+      # tensor
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_TENSOR_GET_TRACED)
+      return False
+
+  def _filter_execution_path_operations(self, operations, fetches):
+    """Returns the set of ops in the execution path to compute given fetches."""
+
+    # If no fetch provided, then return all operations.
+    if fetches is None:
+      return set(operations)
+    # Convert to list, if a single element is provided.
+    if not isinstance(fetches, (list, tuple)):
+      fetches = [fetches]
+    # If a tensor is given as fetch, convert it to op.
+    op_fetches = []
+    for fetch in fetches:
+      if isinstance(fetch, ops.Operation):
+        op_fetches.append(fetch)
+      elif isinstance(fetch, ops.Tensor):
+        op_fetches.append(fetch.op)
+      else:
+        raise RuntimeError('Given fetch:%s is neither a tensor nor an op.'
+                           %fetch)
+
+    execution_path_operations = set(op_fetches)
+    traverse_stack = list(op_fetches)
+    while True:
+      if not traverse_stack:
+        break
+      head_op = traverse_stack.pop()
+      input_ops = [tensor_input.op for tensor_input in head_op.inputs]
+      input_ops.extend(head_op.control_inputs)
+
+      for input_op in input_ops:
+        if input_op not in execution_path_operations:
+          # Filter out loop condition operations, tracing them causes a cycle.
+          # Trace only the loop-body.
+          if TensorTracer.loop_cond_op(input_op):
+            continue
+          execution_path_operations.add(input_op)
+          traverse_stack.append(input_op)
+    return execution_path_operations
+
+  def _determine_traced_tensors(self, graph, ops_in_exec_path):
+    """Determines the tensors that will be traced."""
+
+    self._traced_tensorname_to_cache_idx_map = {}
+    self._cache_idx_to_tensor_idx = []
+    operations = graph.get_operations()
+    checkpoint_operations = self._get_checkpoints(graph)
+    for op_id, op in enumerate(operations):
+      if checkpoint_operations and op.name not in checkpoint_operations:
+        continue
+      user_included = self._is_user_included_op(op)
+      user_excluded = self._is_user_excluded_op(op)
+      in_exec_path = op in ops_in_exec_path
+      if self._skip_op(op_id, op, user_included, user_excluded, in_exec_path):
+        continue
+      for i in range(len(op.outputs)):
+        out_tensor = op.outputs[i]
+        if self._skip_tensor(op_id, out_tensor, user_included,
+                             user_excluded):
+          continue
+        tensor_name = out_tensor.name
+        if tensor_name in self._traced_tensorname_to_cache_idx_map:
+          raise ValueError(
+              'Tensor name %s should not be already in '
+              'traced_tensorname_to_cache_idx_map'%tensor_name)
+        if tensor_name not in self._tensorname_idx_map:
+          raise ValueError(
+              'Tensor name %s is not in the tensorname_idx_map'%tensor_name)
+        tensor_idx = self._tensorname_idx_map[tensor_name]
+        cache_idx = len(self._traced_tensorname_to_cache_idx_map)
+        self._traced_tensorname_to_cache_idx_map[tensor_name] = cache_idx
+        self._cache_idx_to_tensor_idx.append(tensor_idx)
+        if len(self._traced_tensorname_to_cache_idx_map) != len(
+            self._cache_idx_to_tensor_idx):
+          raise RuntimeError('len(self._traced_tensorname_to_cache_idx_map) != '
+                             'len(self._cache_idx_to_tensor_idx')
+
+  def _check_trace_files(self):
+    """Checks if any requirements for trace files are satisfied."""
+
+    if not self._trace_dir:
+      # traces will be written to stderr. No need to check trace files.
+      return
+    if _trace_files_need_precreated(self._trace_dir):
+      for replica_id in range(0, self._num_replicas):
+        trace_file_path = os.path.join(
+            self._trace_dir,
+            _COMPACT_TRACE_FILE_PREFIX) + '%d'%replica_id
+        if not gfile.Exists(trace_file_path):
+          raise RuntimeError(
+              '%s must be pre-created with the '
+              'appropriate properties.'%trace_file_path)
+    else:
+      if not gfile.Exists(self._trace_dir):
+        gfile.MkDir(self._trace_dir)
+        if not gfile.Exists(self._trace_dir):
+          raise RuntimeError('Failed to create %s'%self._trace_dir)
+
+  def _pre_tracing(self, graph, fetches):
+    """Work needs to be done prior to TPU or CPU tracing."""
+
+    self._check_trace_files()
+    operations = graph.get_operations()
+    (opname_idx_map, tensor_list, self._tensorname_idx_map) = (
+        TensorTracer._make_op_and_tensor_maps(operations))
+    self._write_config_section()
+    self._write_op_list_section(operations)
+    self._write_tensor_list_section(tensor_list, opname_idx_map)
+    # Filter out the operations that won't be executed.
+    # if fetches=None, then ops_in_exec_path = set(operations)
+    ops_in_exec_path = self._filter_execution_path_operations(operations,
+                                                              fetches)
+    self._determine_traced_tensors(graph, ops_in_exec_path)
+    self._write_cache_index_map_section()
+    # Does the topological sort before adding any nodes to the graph.
+    (succeed, sorted_or_cycle) = TensorTracer.topological_sort(graph)
+    if self._use_tensor_values_cache():
+      _create_tensor_values_cache(graph,
+                                  len(self._cache_idx_to_tensor_idx))
+    return (ops_in_exec_path, succeed, sorted_or_cycle)
+
+  def _post_tracing(self, succeed, sorted_or_cycle):
+    """Work needs to be done after TPU or CPU tracing."""
+
+    self._write_reason_section()
+    self._write_graph_section(succeed, sorted_or_cycle)
+    self._close_report_file()
+
+  def _get_checkpoints(self, graph):
+    """Returns the list of Ops that produce the tensors traced with API.
+
+    Args:
+      graph: the graph of Ops.
+
+    Returns:
+      A set of operation names which should be traced.
+    """
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
+                                  _TENSOR_TRACER_CHECKPOINT))
+    checkpoint_operations = set()
+    tensor_tracer_variables = graph.get_collection(_TENSOR_TRACER_COLLECTION)
+    for (tensor, checkpoint_name) in tensor_tracer_variables:
+      self._write_report('%s %s\n'%(tensor.name, checkpoint_name))
+      checkpoint_operations.add(tensor.op.name)
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
+                                  _TENSOR_TRACER_CHECKPOINT))
+    return checkpoint_operations
+
+  def _generate_flush_cache_op(self, graph, start_replica, on_tpu):
+    """Generates an Op that will flush the cache to file.
+
+    Args:
+      graph: the graph of Ops
+      start_replica: the ID of the first replica being flushed by this Op.
+      on_tpu: if the graph is executed on TPU.
+
+    Returns:
+      The Op to flush the cache to file.
+    """
+    def _make_flush_fun(replica_id):
+      """Makes a function for flushing the cache for the given replica."""
+
+      def _fun():
+        """A function that flushes the cache to a file."""
+
+        def _flush_fun(cache):
+          """Flushes the cache to a file."""
+
+          if isinstance(replica_id, str):
+            replica_id_str = replica_id
+          else:
+            replica_id_str = '%d'%replica_id
+          if self._trace_dir:
+            output_path = os.path.join(self._trace_dir,
+                                       _COMPACT_TRACE_FILE_PREFIX) \
+                                       + replica_id_str
+            output_stream = _OUTPUT_STREAM_ESCAPE + output_path
+          else:
+            output_stream = sys.stderr
+          new_step_line = _REPLICA_ID_TAG + replica_id_str
+          print_op = logging_ops.print_v2(
+              new_step_line, '\n',
+              cache, '\n',
+              summarize=-1,
+              output_stream=output_stream)
+          with ops.control_dependencies([print_op]):
+            return constant_op.constant(0).op
+
+        cache = _get_tensor_values_cache(graph)
+        if on_tpu:
+          flush_op = tpu.outside_compilation(_flush_fun, cache.value())
+        else:
+          flush_op = _flush_fun(cache.value())
+        with ops.control_dependencies([flush_op]):
+          reset_value = constant_op.constant(_COMPACT_TRACE_ENTRY_INIT_VALUE,
+                                             dtype=cache.dtype,
+                                             shape=cache.shape)
+          assign_op = state_ops.assign(cache, reset_value).op
+          with ops.control_dependencies([assign_op]):
+            return flush_op.outputs[0]
+
+      return _fun
+
+    def _f(replica_id):
+      return _make_flush_fun(replica_id)
+    def _eq(x):
+      return math_ops.equal(x, self._replica_id)
+    def _do_nothing():
+      return constant_op.constant(0)
+
+    return control_flow_ops.case({\
+                                  _eq(start_replica): _f(start_replica), \
+                                  _eq(start_replica+1): _f(start_replica+1), \
+                                  _eq(start_replica+2): _f(start_replica+2), \
+                                  _eq(start_replica+3): _f(start_replica+3), \
+                                  _eq(start_replica+4): _f(start_replica+4), \
+                                  _eq(start_replica+5): _f(start_replica+5), \
+                                  _eq(start_replica+6): _f(start_replica+6), \
+                                  _eq(start_replica+7): _f(start_replica+7), \
+    },
+                                 default=_do_nothing,
+                                 exclusive=True).op
+
+  def _flush_tensor_values_cache(self, graph, tensor_fetches, op_fetches,
+                                 on_tpu):
+    """Flushes the intermediate tensor values in the graph to the cache.
+
+    Args:
+      graph: the graph of Ops
+      tensor_fetches: list of tensor results returned by the model_fn.
+      op_fetches: list of ops that are returned by the model_fn, e.g., train_op.
+      on_tpu: if the graph is executed on TPU.
+
+    Returns:
+      An identical copy of tensor_fetches.
+    """
+    # Add a dependency to op and tensor fetches to make sure that all tracing
+    # ops are executed before flushing trace results.
+    with ops.control_dependencies(op_fetches +
+                                  [tensor.op for tensor in tensor_fetches]):
+      flush_cache_op_list = []
+      for host in range(self._num_hosts):
+        start_replica = host * 8
+        flush_op = self._generate_flush_cache_op(graph, start_replica, on_tpu)
+        flush_cache_op_list.append(flush_op)
+      return control_flow_ops.tuple(tensor_fetches,
+                                    control_inputs=flush_cache_op_list)
+
+  def _process_tensor_fetches(self, tensor_fetches):
+    """Check that tensor_fetches is not empty and have valid tensors."""
+    # If none or empty list.
+    if tensor_fetches is None:
+      raise RuntimeError('tensor_fetches provided to tensor_tracer cannot be '
+                         'None.')
+    if not isinstance(tensor_fetches, (list, tuple)):
+      tensor_fetches = [tensor_fetches]
+    elif not tensor_fetches:
+      raise RuntimeError('tensor_fetches provided to tensor_tracer cannot be '
+                         'empty list.')
+    fetches = []
+    for fetch in tensor_fetches:
+      if isinstance(fetch, ops.Tensor):
+        fetches.append(fetch)
+      else:
+        raise RuntimeError('Given tensor_fetch:%s is not a tensor.' % fetch)
+    return fetches
+
+  def _process_op_fetches(self, op_fetches):
+    """Check that op_fetches have valid ops."""
+    if op_fetches is None:
+      return []
+
+    if not isinstance(op_fetches, (list, tuple)):
+      op_fetches = [op_fetches]
+
+    fetches = []
+    for fetch in op_fetches:
+      if isinstance(fetch, ops.Operation):
+        fetches.append(fetch)
+      else:
+        logging.warning('Ignoring the given op_fetch:%s, which is not an op.' %
+                        fetch)
+    return fetches
+
+  def _convert_fetches_to_input_format(self, input_fetches, current_fetches):
+    """Changes current_fetches' format, so that it matches input_fetches."""
+    if isinstance(input_fetches, ops.Tensor):
+      if len(current_fetches) != 1:
+        raise RuntimeError('Tensor tracer input/output fetches do not match.')
+      return current_fetches[0]
+    else:
+      if len(current_fetches) != len(current_fetches):
+        raise RuntimeError('Tensor tracer input/output fetches do not match.')
+      elif isinstance(input_fetches, tuple):
+        return tuple(current_fetches)
+      else:
+        return current_fetches
+
+  def _get_op_control_flow_context(self, op):
+    """Returns the control flow of the given op.
+
+    Args:
+      op: tf.Operation for which the control flow context is requested.
+    Returns:
+      op_control_flow_context: which the is control flow context of the given
+      op. If the operation type is LoopExit, returns the outer control flow
+      context.
+    """
+    # pylint: disable=protected-access
+    op_control_flow_context = op._control_flow_context
+    # pylint: enable=protected-access
+    if control_flow_util.IsLoopExit(op):
+      op_control_flow_context = op_control_flow_context.outer_context
+    return op_control_flow_context
+
+  def _trace_execution(self, graph,
+                       tensor_fetches,
+                       op_fetches=None,
+                       on_tpu=True):
+    """Commong tracing function for both CPU and TPUs.
+
+    The caller function should set _device_type, _num_replicas,
+    _num_replicas_per_host, _num_hosts and _replica_id before calling
+    _trace_execution.
+
+
+    Args:
+      graph: the graph of Ops executed on the TPU.
+      tensor_fetches: a (list,tuple,or a single object) of tensor fetches
+        returned by model_fn given to session.run. Function must be provided
+        with as least one tensor to fetch.
+      op_fetches: A list of op fetches returned by model_fn given to
+        session.run. op_fetches and tensor_fetches are used to determine the
+        nodes that will be executed. Can be None.
+      on_tpu: True if executing on TPU.
+
+    Returns:
+      tensor_fetches: an exact copy of tensor_fetches that has additional
+                      dependencies.
+    Raises:
+      RuntimeError: If tensor_fetches is None or empty.
+    """
+    def _cast_unsupported_dtypes(tensor):
+      """Casts tensor to a supported type."""
+
+      if tensor.dtype.__eq__(dtypes.int64):
+        # outside-compilation doesn't support int64 input yet.
+        return math_ops.cast(tensor, dtypes.int32)
+      if tensor.dtype.__eq__(dtypes.bfloat16) or tensor.dtype.__eq__(
+          dtypes.float16):
+        # Since host can't handle bf16, convert tensor to f32.
+        return math_ops.cast(tensor, dtypes.float32)
+      return tensor
+
+    TensorTracer.check_device_type(self._device_type)
+    # Check in_tensor_fetches, and op_fetches and convert them to lists.
+    processed_t_fetches = self._process_tensor_fetches(tensor_fetches)
+    op_fetches = self._process_op_fetches(op_fetches)
+    all_fetches = op_fetches + [tensor.op for tensor in processed_t_fetches]
+
+    # Filter the set of ops that will be executed, and topological sort.
+    (exec_op_set, succeed, sorted_or_cycle) = self._pre_tracing(graph,
+                                                                all_fetches)
+
+    tensor_fetch_set = set(processed_t_fetches)
+    tracing_ops = []
+
+    # pylint: disable=protected-access
+    current_control_flow_context = graph._get_control_flow_context()
+    # pylint: enable=protected-access
+
+    # Trace ops only if they are in the execution path.
+    for op in exec_op_set:
+      for i in range(len(op.outputs)):
+        out_tensor = op.outputs[i]
+        tensor_name = out_tensor.name
+        if tensor_name not in self._traced_tensorname_to_cache_idx_map:
+          continue
+        # Create the list of consumers before calling _preprocess_traced_tensor.
+        # Otherwise, adding control input below, will introduce a cycle in the
+        # graph.
+        consumers = out_tensor.consumers()
+        # Not all consumers may be in the exec path. Filter out the consumers
+        # to keep the graph simpler.
+        consumers = [cop for cop in consumers if cop in exec_op_set]
+
+        # If there is no consumer of the tensor, there is no need to trace it;
+        # unless the tensor itself is one of the fetches.
+        is_a_fetched_tensor = out_tensor in tensor_fetch_set
+        if (not consumers) and (not is_a_fetched_tensor):
+          continue
+
+        op_control_flow_context = self._get_op_control_flow_context(op)
+        # pylint: disable=protected-access
+        graph._set_control_flow_context(op_control_flow_context)
+        # pylint: enable=protected-access
+        processed_out_tensor = self._preprocess_traced_tensor(out_tensor)
+
+        if on_tpu:
+          processed_out_tensor = _cast_unsupported_dtypes(processed_out_tensor)
+
+        if self._use_tensor_values_cache():
+          cache_idx = self._traced_tensorname_to_cache_idx_map[tensor_name]
+          trace_op = self._save_tensor_value_to_cache_op(graph,
+                                                         cache_idx,
+                                                         processed_out_tensor)
+        elif on_tpu:
+          trace_op = tpu.outside_compilation(
+              self._make_tensor_trace_fun(tensor_name), processed_out_tensor)
+        else:
+          trace_fun = self._make_tensor_trace_fun(tensor_name)
+          trace_op = trace_fun(processed_out_tensor)
+
+        if is_a_fetched_tensor:
+          tracing_ops.append(trace_op)
+          continue
+        # Add it to all consumers, as some consumers may not be executed if they
+        # are in a control flow.
+        for consumer_op in consumers:
+          # pylint: disable=protected-access
+          consumer_op._add_control_input(trace_op)
+          # pylint: enable=protected-access
+
+    # pylint: disable=protected-access
+    graph._set_control_flow_context(current_control_flow_context)
+    # pylint: enable=protected-access
+    if tracing_ops:
+      # If we are tracing a fetched tensor, their dependency is stored in
+      # tracing_ops.
+      processed_t_fetches = control_flow_ops.tuple(processed_t_fetches,
+                                                   control_inputs=tracing_ops)
+    if self._use_tensor_values_cache():
+      processed_t_fetches = self._flush_tensor_values_cache(graph,
+                                                            processed_t_fetches,
+                                                            op_fetches,
+                                                            on_tpu=on_tpu)
+    self._post_tracing(succeed, sorted_or_cycle)
+    # processed_t_fetches is a list at this point. Convert it to the same
+    # format as given in tensor_fetches.
+    return self._convert_fetches_to_input_format(tensor_fetches,
+                                                 processed_t_fetches)
+
+  def trace_tpu(self, graph,
+                tensor_fetches,
+                op_fetches=None,
+                num_replicas=None,
+                num_replicas_per_host=None,
+                num_hosts=None):
+    """Traces the tensors generated by TPU Ops in a TF graph.
+
+    Args:
+      graph: the graph of Ops executed on the TPU.
+      tensor_fetches: a (list,tuple,or a single object) of tensor fetches
+        returned by model_fn given to session.run. Function must be provided
+        with as least one tensor to fetch.
+      op_fetches: A list of op fetches returned by model_fn given to
+        session.run. op_fetches and tensor_fetches are used to determine the
+        nodes that will be executed. Can be None.
+      num_replicas: number of replicas used on the TPU.
+      num_replicas_per_host: number of replicas per TPU host.
+      num_hosts: total number of TPU hosts.
+
+    Returns:
+      tensor_fetches: an exact copy of tensor_fetches that has additional
+                      dependencies.
+    Raises:
+      RuntimeError: If num_replicas_per_host > 8.
+      RuntimeError: If tensor_fetches is None or empty.
+    """
+
+    if graph in TensorTracer._traced_graphs:
+      logging.warning('Graph is already rewritten with tensor tracer, ignoring '
+                      'multiple calls.')
+      return tensor_fetches
+    else:
+      TensorTracer._traced_graphs.add(graph)
+    self._device_type = _DEVICE_TYPE_TPU
+    self._num_replicas = num_replicas
+    self._num_replicas_per_host = num_replicas_per_host
+    self._num_hosts = num_hosts
+    if self._num_replicas is not None:
+      if self._num_replicas_per_host is None:
+        self._num_replicas_per_host = 8
+      if self._num_hosts is None:
+        self._num_hosts = num_replicas // self._num_replicas_per_host + \
+            (num_replicas % self._num_replicas_per_host > 0)
+
+    if self._num_replicas_per_host > 8:
+      # Checks for the assumption in _generate_flush_cache_op().
+      raise RuntimeError('num_replicas_per_host (%d) is '
+                         'greater than 8'%self._num_replicas_per_host)
+    if self._graph_dump_path:
+      graph_io.write_graph(graph, self._graph_dump_path,
+                           'graph_before_tt.pbtxt')
+    with graph.as_default():
+      self._add_replica_id_to_graph()
+      tensor_fetches = self._trace_execution(graph, tensor_fetches, op_fetches,
+                                             on_tpu=True)
+    if self._graph_dump_path:
+      graph_io.write_graph(graph, self._graph_dump_path,
+                           'graph_after_tt.pbtxt')
+    return tensor_fetches
+
+  def trace_cpu(self, graph, tensor_fetches, op_fetches=None):
+    """Traces the tensors generated by CPU Ops in a TF graph.
+
+    Args:
+      graph: the graph of Ops executed on the CPU.
+      tensor_fetches: a (list,tuple,or a single object) of tensor fetches
+        returned by model_fn given to session.run. Function must be provided
+        with as least one tensor to fetch.
+      op_fetches: A list of op fetches returned by model_fn given to
+        session.run. op_fetches and tensor_fetches are used to determine the
+        nodes that will be executed. Can be None.
+
+    Returns:
+      tensor_fetches: an exact copy of tensor_fetches that has additional
+                      dependencies.
+    Raises:
+      RuntimeError: If tensor_fetches is None or empty.
+    """
+
+    if graph in TensorTracer._traced_graphs:
+      logging.warning('Graph is already rewritten with tensor tracer, ignoring '
+                      'multiple calls.')
+      return tensor_fetches
+    else:
+      TensorTracer._traced_graphs.add(graph)
+
+    self._device_type = _DEVICE_TYPE_CPU
+    self._num_replicas = 1
+    self._num_replicas_per_host = 1
+    self._num_hosts = 1
+    self._replica_id = 0
+    if self._graph_dump_path:
+      graph_io.write_graph(graph, self._graph_dump_path,
+                           'graph_before_tt.pbtxt')
+    with graph.as_default():
+      tensor_fetches = self._trace_execution(graph, tensor_fetches, op_fetches,
+                                             on_tpu=False)
+    if self._graph_dump_path:
+      graph_io.write_graph(graph, self._graph_dump_path,
+                           'graph_after_tt.pbtxt')
+    return tensor_fetches
+
+
diff --git a/tensorflow/python/tpu/topology.py b/tensorflow/python/tpu/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..00ee21e694d15d2e795b0b35289e1e116b9e76cf
--- /dev/null
+++ b/tensorflow/python/tpu/topology.py
@@ -0,0 +1,220 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Defines the `Topology` class, that describes a TPU fabric topology."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.protobuf.tpu import topology_pb2
+
+
+def _tpu_device_name(job, task, device):
+  """Returns the device name for the TPU `device` on `task` of `job`."""
+  if job is None:
+    return "/task:%d/device:TPU:%d" % (task, device)
+  else:
+    return "/job:%s/task:%d/device:TPU:%d" % (job, task, device)
+
+
+def _tpu_host_device_name(job, task):
+  """Returns the device name for the CPU device on `task` of `job`."""
+  if job is None:
+    return "/task:%d/device:CPU:0" % task
+  else:
+    return "/job:%s/task:%d/device:CPU:0" % (job, task)
+
+
+class Topology(object):
+  """Describes a set of TPU devices.
+
+  Represents both the shape of the physical mesh, and the mapping between
+  TensorFlow TPU devices to physical mesh coordinates.
+  """
+
+  def __init__(self, serialized=None, mesh_shape=None, device_coordinates=None):
+    """Builds a Topology object.
+
+    If `serialized` is not `None`, the topology is parsed from `serialized` and
+    the other arguments are ignored. Otherwise, the topology is computed from
+    `mesh_shape` and `device_coordinates`.
+
+    Args:
+      serialized: A serialized `TopologyProto`, or `None`. If not `None`, the
+        serialized proto is parsed to discover the topology.
+      mesh_shape: A sequence of 3 positive integers, or `None`. If not `None`,
+        the shape of the TPU topology, in number of cores. Ignored if
+        `serialized` is not `None`.
+      device_coordinates: A rank 3 numpy array that describes the mapping from
+        TensorFlow TPU devices to TPU fabric coordinates, or `None`. Ignored
+        if `serialized is not `None`.
+
+    Raises:
+      ValueError: If `serialized` does not describe a well-formed topology.
+      ValueError: If `serialized` is `None` and `mesh_shape` is not a sequence
+        of 3 positive integers.
+      ValueError: If `serialized` is `None` and `device_coordinates` is not a
+        rank 3 numpy int32 array that describes a valid coordinate mapping.
+    """
+
+    self._serialized = serialized
+
+    if serialized:
+      self._parse_topology(serialized)
+    else:
+      self._mesh_shape = np.asarray(mesh_shape, dtype=np.int32)
+      self._device_coordinates = np.asarray(device_coordinates, np.int32)
+      if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
+        raise ValueError("`mesh_shape` must be a sequence of 3 positive "
+                         "entries; got {}".format(self._mesh_shape))
+
+      if (len(self._device_coordinates.shape) != 3 or
+          self._device_coordinates.shape[2] != len(self._mesh_shape)):
+        raise ValueError("`device_coordinates` must be a rank 3 int32 array "
+                         "with minor dimension equal to the mesh shape rank")
+
+    self._topology_tasks, self._topology_devices = self._invert_topology()
+
+  def _parse_topology(self, serialized):
+    """Parses a serialized `TopologyProto` into `self`."""
+    proto = topology_pb2.TopologyProto()
+    proto.ParseFromString(serialized)
+
+    self._mesh_shape = np.array(proto.mesh_shape, dtype=np.int32)
+    if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
+      raise ValueError("`mesh_shape` must be a vector of size 3 with positive "
+                       "entries; got {}".format(self._mesh_shape))
+
+    if proto.num_tasks < 0:
+      raise ValueError("`num_tasks` must be >= 0; got {}".format(
+          proto.num_tasks))
+    if proto.num_tpu_devices_per_task < 0:
+      raise ValueError("`num_tpu_devices_per_task` must be >= 0; got {}".format(
+          proto.num_tpu_devices_per_task))
+
+    expected_coordinates_size = (
+        proto.num_tasks * proto.num_tpu_devices_per_task * len(
+            proto.mesh_shape))
+    if len(proto.device_coordinates) != expected_coordinates_size:
+      raise ValueError("`device_coordinates` must have shape num_tasks ({}) * "
+                       "num_tpu_devices_per_task ({}) * len(mesh_shape) ({}); "
+                       "got shape {}".format(proto.num_tasks,
+                                             proto.num_tpu_devices_per_task,
+                                             proto.mesh_shape,
+                                             len(proto.device_coordinates)))
+
+    coords = np.array(proto.device_coordinates, dtype=np.int32)
+    if any(coords < 0):
+      raise ValueError("`device_coordinates` must be >= 0")
+    coords = coords.reshape((proto.num_tasks, proto.num_tpu_devices_per_task,
+                             len(proto.mesh_shape)))
+    self._device_coordinates = coords
+
+  def _invert_topology(self):
+    """Inverts a [task,device,axis] topology to [x,y,z] -> task/device maps."""
+    tasks = np.full(list(self.mesh_shape), -1, dtype=np.int32)
+    devices = np.full(list(self.mesh_shape), -1, dtype=np.int32)
+    for task in xrange(self.device_coordinates.shape[0]):
+      for device in xrange(self.device_coordinates.shape[1]):
+        x, y, z = self.device_coordinates[task, device, :]
+        tasks[x, y, z] = task
+        devices[x, y, z] = device
+    return tasks, devices
+
+  @property
+  def mesh_shape(self):
+    """A rank 1 int32 array describing the shape of the TPU topology."""
+    return self._mesh_shape
+
+  @property
+  def mesh_rank(self):
+    """Returns the number of dimensions in the mesh."""
+    return len(self._mesh_shape)
+
+  @property
+  def device_coordinates(self):
+    """Describes the mapping from TPU devices to topology coordinates.
+
+    Returns:
+      A rank 3 int32 array with shape `[tasks, devices, axis]`.
+      `tasks` is the number of tasks in the TPU cluster, `devices` is the number
+      of TPU devices per task, and `axis` is the number of axes in the TPU
+      cluster topology. Each entry gives the `axis`-th coordinate in the
+      topology of a task/device pair. TPU topologies are 3-dimensional, with
+      dimensions `(x, y, core number)`.
+    """
+    return self._device_coordinates
+
+  def task_ordinal_at_coordinates(self, device_coordinates):
+    """Returns the TensorFlow task number attached to `device_coordinates`.
+
+    Args:
+      device_coordinates: An integer sequence describing a device's physical
+        coordinates in the TPU fabric.
+
+    Returns:
+      Returns the TensorFlow task number that contains the TPU device with those
+      physical coordinates.
+    """
+    return self._topology_tasks[tuple(device_coordinates)]
+
+  def tpu_device_ordinal_at_coordinates(self, device_coordinates):
+    """Returns the TensorFlow device number at `device_coordinates`.
+
+    Args:
+      device_coordinates: An integer sequence describing a device's physical
+        coordinates in the TPU fabric.
+
+    Returns:
+      Returns the TensorFlow device number within the task corresponding to
+      attached to the device with those physical coordinates.
+    """
+    return self._topology_devices[tuple(device_coordinates)]
+
+  def cpu_device_name_at_coordinates(self, device_coordinates, job=None):
+    """Returns the CPU device attached to a logical core."""
+    return _tpu_host_device_name(
+        job, self._topology_tasks[tuple(device_coordinates)])
+
+  def tpu_device_name_at_coordinates(self, device_coordinates, job=None):
+    """Returns the name of the TPU device assigned to a logical core."""
+    return _tpu_device_name(job,
+                            self._topology_tasks[tuple(device_coordinates)],
+                            self._topology_devices[tuple(device_coordinates)])
+
+  @property
+  def num_tasks(self):
+    """Returns the number of TensorFlow tasks in the TPU slice."""
+    return self._device_coordinates.shape[0]
+
+  @property
+  def num_tpus_per_task(self):
+    """Returns the number of TPU devices per task in the TPU slice."""
+    return self._device_coordinates.shape[1]
+
+  def serialized(self):
+    """Returns the serialized form of the topology."""
+    if self._serialized is None:
+      proto = topology_pb2.TopologyProto()
+      proto.mesh_shape[:] = list(self._mesh_shape)
+      proto.num_tasks = self._device_coordinates.shape[0]
+      proto.num_tpu_devices_per_task = self._device_coordinates.shape[1]
+      proto.device_coordinates.extend(list(self._device_coordinates.flatten()))
+      self._serialized = proto.SerializeToString()
+
+    return self._serialized
diff --git a/tensorflow/contrib/tpu/python/tpu/topology_test.py b/tensorflow/python/tpu/topology_test.py
similarity index 96%
rename from tensorflow/contrib/tpu/python/tpu/topology_test.py
rename to tensorflow/python/tpu/topology_test.py
index fafe3254d84551d3d7ed8a9d3346849411714f97..9e1b7de859703f6859017d57cd73b4fbda7237b4 100644
--- a/tensorflow/contrib/tpu/python/tpu/topology_test.py
+++ b/tensorflow/python/tpu/topology_test.py
@@ -19,9 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import topology
-
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import topology
 
 
 class TopologyTest(test.TestCase):
diff --git a/tensorflow/python/tpu/tpu.py b/tensorflow/python/tpu/tpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..02489a9b10e81d818f6d02d72e9e8c13988805c3
--- /dev/null
+++ b/tensorflow/python/tpu/tpu.py
@@ -0,0 +1,1576 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+
+"""Library of TPU helper functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.framework import attr_value_pb2
+from tensorflow.core.protobuf.tpu import dynamic_padding_pb2 as dynamic_padding
+from tensorflow.python.compat import compat as api_compat
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu_function
+from tensorflow.python.tpu import xla
+from tensorflow.python.tpu.ops import tpu_ops
+from tensorflow.python.util import compat
+from tensorflow.python.util import nest
+
+
+# Operations that indicate some error in the users graph, e.g. a placeholder
+# that's introduced outside of the infeed.
+_BLACKLISTED_OPS = set([
+    "Placeholder",
+])
+
+# XLA doesn't currently support reading of intermediate tensors, thus some ops
+# are not supported.
+_UNSUPPORTED_OPS = set([
+    "AudioSummary",
+    "AudioSummaryV2",
+    "HistogramSummary",
+    "ImageSummary",
+    "MergeSummary",
+    "Print",
+    "ScalarSummary",
+    "TensorSummary",
+    "TensorSummaryV2",
+    ])
+
+_MAX_WARNING_LINES = 5
+
+_TPU_REPLICATE_ATTR = "_tpu_replicate"
+_TPU_COMPILATION_STATUS_ATTR = "_tpu_compilation_status"
+_OUTSIDE_COMPILATION_ATTR = "_xla_outside_compilation"
+
+
+def _tpu_system_device_name(job):
+  """Returns the device name for the TPU_SYSTEM device of `job`."""
+  if job is None:
+    return "/device:TPU_SYSTEM:0"
+  else:
+    return "/job:%s/device:TPU_SYSTEM:0" % job
+
+
+def initialize_system(embedding_config=None, job=None):
+  """Initializes a distributed TPU system for use with TensorFlow.
+
+  Args:
+    embedding_config: If not None, a `TPUEmbeddingConfiguration` proto
+      describing the desired configuration of the hardware embedding lookup
+      tables. If embedding_config is None, no hardware embeddings can be used.
+    job: The job (the XXX in TensorFlow device specification /job:XXX) that
+      contains the TPU devices that will be initialized. If job=None it is
+      assumed there is only one job in the TensorFlow flock, and an error will
+      be returned if this assumption does not hold.
+  Returns:
+    A serialized `TopologyProto` that describes the TPU system. Note:
+      the topology must be evaluated using `Session.run` before it can be used.
+  """
+  config_string = ("" if embedding_config is None else
+                   embedding_config.SerializeToString())
+  with ops.device(_tpu_system_device_name(job)):
+    return tpu_ops.configure_distributed_tpu(embedding_config=config_string)
+
+
+def shutdown_system(job=None):
+  """Shuts down a running a distributed TPU system."""
+  with ops.device(_tpu_system_device_name(job)):
+    shutdown_distributed_tpu = tpu_ops.shutdown_distributed_tpu()
+  return shutdown_distributed_tpu
+
+
+def core(num):
+  """Returns the device name for a core in a replicated TPU computation.
+
+  Args:
+    num: the virtual core number within each replica to which operators should
+    be assigned.
+  Returns:
+    A device name, suitable for passing to `tf.device()`.
+  """
+  return "device:TPU_REPLICATED_CORE:{}".format(num)
+
+
+class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
+  """A `ControlFlowContext` for nodes inside a TPU computation.
+
+  The primary role of `TPUReplicateContext` is to mark operators inside a
+  tpu.replicate() computation with the attribute "_tpu_replicate=XYZ", where XYZ
+  is a unique name.
+
+  We use a `ControlFlowContext` to perform the annotation since it integrates
+  with Tensorflow constructs like ResourceVariables. For example, if a
+  `ResourceVariable` is constructed inside a tpu.replicate() block, the
+  `ResourceVariable` implementation can use
+  `with ops.control_dependencies(None)` to build the variable's definition
+  outside the replicated computation.
+  """
+
+  def __init__(self, name, num_replicas, pivot):
+    """Builds a new TPUReplicateContext.
+
+    Args:
+      name: a unique name for the context, used to populate the `_tpu_replicate`
+        attribute.
+      num_replicas: an integer that gives the number of replicas for the
+        computation.
+      pivot: a pivot node. Nodes in the TPUReplicateContext that do not have any
+        inputs will have a control dependency on the pivot node. This ensures
+        that nodes are correctly included in any enclosing control flow
+        contexts.
+    """
+    super(TPUReplicateContext, self).__init__()
+    self._num_replicas = num_replicas
+    self._outer_device_function_stack = None
+    self._oc_dev_fn_stack = None
+    self._outside_compilation_cluster = None
+    self._outside_compilation_counter = 0
+    self._in_gradient_colocation = None
+    self._gradient_colocation_stack = []
+    self._host_compute_core = []
+    self._name = name
+    self._name_as_bytes = compat.as_bytes(name)
+    self._unsupported_ops = []
+    self._pivot = pivot
+    self._replicated_vars = {}
+
+  def get_replicated_var_handle(self, name, vars_):
+    """Returns a variable handle for replicated TPU variable 'var'.
+
+    This is a method used by an experimental replicated variable implementation
+    and is not intended as a public API.
+
+    Args:
+      name: The common name of the variable.
+      vars_: The replicated TPU variables.
+
+    Returns:
+      The handle of the TPU replicated input node.
+    """
+    handle = self._replicated_vars.get(name)
+    if handle is not None:
+      return handle
+
+    # Builds a TPUReplicatedInput node for the variable, if one does not already
+    # exist. The TPUReplicatedInput node must belong to the enclosing
+    # control-flow scope of the TPUReplicateContext.
+    # TODO(phawkins): consider changing the contract of the TPU encapsulation
+    # so the TPUReplicatedInput nodes go inside the TPUReplicateContext scope
+    # instead.
+
+    # pylint: disable=protected-access
+    graph = ops.get_default_graph()
+    saved_context = graph._get_control_flow_context()
+    graph._set_control_flow_context(self.outer_context)
+    handle = tpu_ops.tpu_replicated_input(
+        [v.handle for v in vars_], name=name + "/handle")
+    graph._set_control_flow_context(saved_context)
+    # pylint: enable=protected-access
+    self._replicated_vars[name] = handle
+    return handle
+
+  def report_unsupported_operations(self):
+    if self._unsupported_ops:
+      op_str = "\n".join(["  %s (%s)" % (op.type, op.name)
+                          for op in self._unsupported_ops[:_MAX_WARNING_LINES]])
+      logging.warning("%d unsupported operations found: \n%s",
+                      len(self._unsupported_ops), op_str)
+      if len(self._unsupported_ops) > _MAX_WARNING_LINES:
+        logging.warning("... and %d more" %
+                        (len(self._unsupported_ops) - _MAX_WARNING_LINES))
+
+  def EnterGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      self._gradient_colocation_stack.append(op)
+      if not self._outside_compilation_cluster:
+        try:
+          outside_attr = op.get_attr(_OUTSIDE_COMPILATION_ATTR)
+          if self._in_gradient_colocation:
+            raise NotImplementedError(
+                "Cannot nest gradient colocation operations outside compilation"
+            )
+          if gradient_uid == "__unsupported__":
+            raise NotImplementedError(
+                "No gradient_uid calling gradient within outside_compilation")
+          # When we take the gradient of an op X in an outside_compilation
+          # cluster C in a forward computation we would like to put the ops
+          # corresponding to the gradient of X into a new outside_compilation
+          # cluster C'. However, if we take the gradient of X twice, the second
+          # one should get yet another new outside_compilation cluster C''.
+          #
+          # The mechanism we adopt is to use a 'root_cluster' which is the
+          # cluster that X was in before we took gradients, and a 'gradient_uid'
+          # which is different for every invocation of gradients, and put the
+          # gradient of X in cluster 'root_cluster.gradient_uid'.
+          #
+          # When taking a gradient of a gradient, some ops will be colocated
+          # with Op in the forward pass (e.g., cluster root_cluster) and some in
+          # the backward pass (e.g., cluster root_cluster.initial_gradient_uid).
+          # We need all of the grad-of-grad ops to be in the same cluster to
+          # avoid cyclic dependencies between clusters. We adopt a heuristic
+          # that puts any op clustered with root_cluster.<xxx> in
+          # root_cluster.gradient_uid, even if xxx was initial_gradient_uid.
+          self._in_gradient_colocation = op
+          parts = outside_attr.split(".")
+          cluster = parts[0] + "." + gradient_uid
+          self._EnterOutsideCompilationScope(cluster=cluster)
+        except ValueError:
+          # The attr was not present: do nothing.
+          pass
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      if not self._gradient_colocation_stack:
+        raise errors.InternalError(
+            op.node_def, op,
+            "Badly nested gradient colocation: empty stack when popping Op " +
+            op.name)
+      last_op = self._gradient_colocation_stack.pop()
+      if op is last_op:
+        if op is self._in_gradient_colocation:
+          self._in_gradient_colocation = None
+          self._ExitOutsideCompilationScope()
+      else:
+        raise errors.InternalError(
+            op.node_def, op, "Badly nested gradient colocation, expected " +
+            last_op + ", got " + op.name)
+
+  def _EnterOutsideCompilationScope(self, cluster=None):
+
+    class FakeOp(object):
+      """A helper class to determine the current device.
+
+      Supports only the type and device set/get methods needed to run the
+      graph's _apply_device_function method.
+      """
+
+      def __init__(self):
+        self._device = ""
+
+      @property
+      def type(self):
+        return "FakeOp"
+
+      @property
+      def device(self):
+        return self._device
+
+      def _set_device(self, device):
+        if isinstance(device, pydev.DeviceSpec):
+          self._device = device.to_string()
+        else:
+          self._device = device
+
+    if self._outside_compilation_cluster:
+      raise NotImplementedError("Cannot nest outside_compilation clusters")
+    if cluster:
+      self._outside_compilation_cluster = cluster
+    else:
+      self._outside_compilation_cluster = str(self._outside_compilation_counter)
+      self._outside_compilation_counter += 1
+    graph = ops.get_default_graph()
+    fake_op = FakeOp()
+    graph._apply_device_functions(fake_op)  # pylint: disable=protected-access
+    device = pydev.DeviceSpec.from_string(fake_op.device)
+    if (device.device_type == "TPU_REPLICATED_CORE" and
+        device.device_index is not None):
+      self._host_compute_core.append(self._outside_compilation_cluster + ":" +
+                                     str(device.device_index))
+    self._oc_dev_fn_stack = graph._device_function_stack  # pylint: disable=protected-access
+    graph._device_function_stack = self._outer_device_function_stack  # pylint: disable=protected-access
+
+  def _ExitOutsideCompilationScope(self):
+    if not self._outside_compilation_cluster:
+      raise NotImplementedError(
+          "Attempted to exit outside_compilation scope when not in scope")
+    self._outside_compilation_cluster = None
+    graph = ops.get_default_graph()
+    graph._device_function_stack = self._oc_dev_fn_stack  # pylint: disable=protected-access
+
+  def Enter(self):
+    if not self._outer_device_function_stack:
+      # Capture the device function stack at the time of first entry
+      # since that is the stack that will be used outside_compilation.
+      graph = ops.get_default_graph()
+      # pylint: disable=protected-access
+      self._outer_device_function_stack = graph._device_function_stack.copy()
+      # pylint: enable=protected-access
+    super(TPUReplicateContext, self).Enter()
+
+  def HostComputeCore(self):
+    return self._host_compute_core
+
+  def _RemoveExternalControlEdges(self, op):
+    """Remove any external control dependency on this op."""
+    internal_control_inputs = []
+    external_control_inputs = []
+    for x in op.control_inputs:
+      # pylint: disable=protected-access
+      is_internal_op = False
+      ctxt = x._get_control_flow_context()
+      while ctxt is not None:
+        if ctxt == self:
+          is_internal_op = True
+          break
+        ctxt = ctxt._outer_context
+      if is_internal_op:
+        internal_control_inputs.append(x)
+      else:
+        external_control_inputs.append(x)
+      # pylint: enable=protected-access
+    # pylint: disable=protected-access
+    op._remove_all_control_inputs()
+    op._add_control_inputs(internal_control_inputs)
+    # pylint: enable=protected-access
+    return internal_control_inputs, external_control_inputs
+
+  def AddOp(self, op):
+    # pylint: disable=protected-access
+    if op.type in _BLACKLISTED_OPS:
+      logging.error("Operation of type %s (%s) is not supported on the TPU. "
+                    "Execution will fail if this op is used in the graph. " %
+                    (op.type, op.name))
+
+    if op.type in _UNSUPPORTED_OPS:
+      self._unsupported_ops.append(op)
+
+    if any(x.dtype._is_ref_dtype for x in op.inputs):
+      raise NotImplementedError(
+          "Non-resource Variables are not supported inside TPU computations "
+          "(operator name: %s)" % op.name)
+    if _TPU_REPLICATE_ATTR in op.node_def.attr:
+      raise ValueError("TPU computations cannot be nested")
+    op._set_attr(_TPU_REPLICATE_ATTR,
+                 attr_value_pb2.AttrValue(s=self._name_as_bytes))
+    if self._outside_compilation_cluster:
+      op._set_attr(
+          _OUTSIDE_COMPILATION_ATTR,
+          attr_value_pb2.AttrValue(
+              s=compat.as_bytes(self._outside_compilation_cluster)))
+    if self._num_replicas > 1 or not self._outside_compilation_cluster:
+      # Prevent feeding or fetching anything that is being compiled,
+      # and any replicated outside_compilation Op.
+      op.graph.prevent_feeding(op)
+      op.graph.prevent_fetching(op)
+
+    # Remove any control edges from outer control flow contexts. These may cause
+    # mismatched frame errors.
+    (internal_control_inputs,
+     external_control_inputs) = self._RemoveExternalControlEdges(op)
+
+    if not op.inputs:
+      # Add a control edge from the control pivot to this op.
+      if not internal_control_inputs:
+        # pylint: disable=protected-access
+        op._add_control_input(self.GetControlPivot())
+        # pylint: enable=protected-access
+    else:
+      for index in xrange(len(op.inputs)):
+        x = op.inputs[index]
+        real_x = self.AddValue(x)
+        if real_x != x:
+          op._update_input(index, real_x)  # pylint: disable=protected-access
+
+    if external_control_inputs:
+      # Use an identity to pull control inputs as data inputs. Note that we
+      # ignore ops which don't have outputs. TODO(phawkins): fix that.
+      with ops.control_dependencies(None):
+        self.Enter()
+        external_control_inputs = [
+            array_ops.identity(x.outputs[0]).op
+            for x in external_control_inputs
+            if x.outputs
+        ]
+        self.Exit()
+      # pylint: disable=protected-access
+      op._add_control_inputs(external_control_inputs)
+      # pylint: enable=protected-access
+
+    # Mark op's outputs as seen by this context and any outer contexts.
+    output_names = [x.name for x in op.outputs]
+    context = self
+    while context is not None:
+      # pylint: disable=protected-access
+      context._values.update(output_names)
+      context = context._outer_context
+      # pylint: enable=protected-access
+
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
+  def AddValue(self, val):
+    """Add `val` to the current context and its outer context recursively."""
+    if val.name in self._values:
+      # Use the real value if it comes from outer context.
+      result = self._external_values.get(val.name)
+      return val if result is None else result
+
+    result = val
+    self._values.add(val.name)
+    if self._outer_context:
+      result = self._outer_context.AddValue(val)
+      self._values.add(result.name)
+
+    self._external_values[val.name] = result
+
+    return result
+
+  def AddInnerOp(self, op):
+    self.AddOp(op)
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
+  @property
+  def grad_state(self):
+    # Define the gradient loop state associated with the TPUReplicateContext to
+    # be None as the TPUReplicateContext does not get nested nor does the
+    # grad_state outside the TPUReplicateContext affect the graph inside so the
+    # grad_state should be as if this is the top-level gradient state.
+    return None
+
+  @property
+  def back_prop(self):
+    """Forwards to the enclosing while context, if any."""
+    if self.GetWhileContext():
+      return self.GetWhileContext().back_prop
+    return False
+
+  def GetControlPivot(self):
+    return self._pivot
+
+
+def outside_compilation(computation, *args, **kwargs):
+  """Builds part of a computation outside any current TPU replicate scope.
+
+  Args:
+    computation: A Python function that builds the computation to
+      place on the host.
+    *args: the positional arguments for the computation.
+    **kwargs: the keyword arguments for the computation.
+
+  Returns:
+    The Tensors returned by computation.
+  """
+  args = [] if args is None else args
+  graph = ops.get_default_graph()
+
+  # If we are in a TPUReplicateContext, signal that we are now
+  # outside_compilation
+  initial_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._EnterOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  retval = computation(*args, **kwargs)
+
+  # If we are in a TPUReplicateContext, signal that we are no longer
+  # outside_compilation
+  final_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  if initial_context is not final_context:
+    raise NotImplementedError(
+        "Control-flow context cannot be different at start and end of an "
+        "outside_compilation scope")
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._ExitOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  return retval
+
+
+def replicate(computation,
+              inputs=None,
+              infeed_queue=None,
+              device_assignment=None,
+              name=None,
+              maximum_shapes=None):
+  """Builds a graph operator that runs a replicated TPU computation.
+
+  Args:
+    computation: A Python function that builds the computation to replicate.
+    inputs: A list of lists of input tensors or `None` (equivalent to
+      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
+      have the same number of inputs. Each input can be a nested structure
+      containing values that are convertible to tensors. Note that passing an
+      N-dimension list of compatible values will result in a N-dimention list of
+      scalar tensors rather than a single Rank-N tensors. If you need different
+      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to computation.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each replica of the computation uses
+      only one core, and there is either only one replica, or the number of
+      replicas is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+    maximum_shapes: A nested structure of tf.TensorShape representing the shape
+      to which the respective component of each input element in each replica
+      should be padded. Any unknown dimensions (e.g. tf.Dimension(None) in a
+      tf.TensorShape or -1 in a tensor-like object) will be padded to the
+      maximum size of that dimension over all replicas. Note that if the input
+      dimension is already static, we won't do padding on it and we require the
+      maximum_shapes to have the same value or None on that dimension. The
+      structure of `maximum_shapes` needs to be the same as `inputs[0]`.
+  Returns:
+    A list of outputs, indexed by `[replica_num]` each output can be a nested
+    structure same as what computation() returns with a few exceptions.
+
+    Exceptions include:
+      1) None output: a NoOp would be returned which control-depends on
+         computation.
+      2) Single value output: A tuple containing the value would be returned.
+      3) Operation-only outputs: a NoOp would be returned which
+         control-depends on computation.
+      TODO(b/121383831): Investigate into removing these special cases.
+
+  Raises:
+    ValueError: If all replicas do not have equal numbers of input tensors.
+    ValueError: If the number of inputs per replica does not match
+      the number of formal parameters to `computation`.
+    ValueError: If the static `inputs` dimensions don't match with the values
+      given in `maximum_shapes`.
+    ValueError: If the structure of inputs per replica does not match
+      the structure of `maximum_shapes`.
+  """
+  return split_compile_and_replicate(
+      computation,
+      inputs,
+      infeed_queue,
+      device_assignment,
+      name,
+      maximum_shapes=maximum_shapes)[1]
+
+
+def _pad_all_input(inputs, padded_shapes):
+  """Pad all input tensors given padded_shapes.
+
+  The real shape tensors will be concatenated with the padded original inputs.
+
+  Args:
+    inputs: The original inputs.
+    padded_shapes: A list of padded shapes for each input.
+
+  Returns:
+    The padded inputs and a PaddingMap list which maps the padded input
+    dimension to the real shape argument index.
+  """
+  input_shape_tensors = []
+  for core_idx, inputs_per_core in enumerate(inputs):
+    for idx, input_tensor in enumerate(inputs_per_core):
+      if core_idx == 0:
+        input_shape_tensors.append([])
+      input_shape_tensors[idx].append(array_ops.shape(input_tensor))
+
+  maximum_shapes = []
+  for shapes_per_input in input_shape_tensors:
+    maximum_shapes.append(
+        math_ops.reduce_max(array_ops.stack(shapes_per_input), axis=0))
+
+  padded_inputs = []
+  real_shapes = []
+  padding_maps = []
+  for core_idx, inputs_per_core in enumerate(inputs):
+    padded_inputs.append([])
+    real_shapes.append([])
+    real_shape_idx = len(inputs_per_core) - 1
+    for idx, input_tensor in enumerate(inputs_per_core):
+      input_shape_tensor = input_shape_tensors[idx][core_idx]
+      input_shape = input_tensor.get_shape()
+      padded_shape = padded_shapes[idx]
+
+      # The static shape of inputs should be compatible with the given padded
+      # shapes.
+      input_shape.assert_is_compatible_with(padded_shape)
+
+      if input_shape.is_fully_defined():
+        # Do nothing if the shape of the whole tensor is already static.
+        padded_inputs[core_idx].append(input_tensor)
+      else:
+        # Only pad the non static shape dimension.
+        for i, s in enumerate(input_shape):
+          if s.value is None:
+            if core_idx == 0:
+              real_shape_idx += 1
+              padding_map = dynamic_padding.PaddingMap()
+              padding_map.arg_index = idx
+              padding_map.shape_index = i
+              padding_map.padding_arg_index = real_shape_idx
+              padding_maps.append(padding_map)
+            real_shapes[core_idx].append(
+                math_ops.cast(input_shape_tensor[i], dtypes.uint32))
+
+        paddings = []
+        for i, s in enumerate(padded_shape):
+          if input_shape[i].value:
+            # Don't pad if input shape is already static.
+            padding = [0, 0]
+          else:
+            if s.value:
+              # Pad to the given maximum value.
+              padding = [0, s.value - input_shape_tensor[i]]
+            else:
+              # If maximum value is not given, then pad to the maximum dimension
+              # among all the cores.
+              padding = [0, maximum_shapes[idx][i] - input_shape_tensor[i]]
+          paddings.append(padding)
+
+        padded_input = array_ops.pad(input_tensor, paddings)
+        padded_inputs[core_idx].append(padded_input)
+
+  num_replicas = len(padded_inputs)
+  for i in range(num_replicas):
+    padded_inputs[i].extend(real_shapes[i])
+
+  return padded_inputs, padding_maps
+
+
+def split_compile_and_replicate(computation,
+                                inputs=None,
+                                infeed_queue=None,
+                                device_assignment=None,
+                                name=None,
+                                use_tpu=True,
+                                maximum_shapes=None):
+  """Builds graph operators that runs compilation and replicated computation.
+
+  This is a lower level interface than replicate that returns a separate compile
+  and execute output tensor. In the generated graph the compile op feeds into
+  the execute op and no additional compilation is incurred when running the
+  compile op before the execute op. The compile op returns additional
+  information about the compilation but does not return the compiled program.
+
+  Args:
+    computation: A Python function that builds the computation to replicate.
+    inputs: A list of lists of input tensors or `None` (equivalent to
+      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
+      have the same number of inputs. Each input can be a nested structure
+      containing values that are convertible to tensors. Note that passing an
+      N-dimension list of compatible values will result in a N-dimention list of
+      scalar tensors rather than a single Rank-N tensors. If you need different
+      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to computation.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each replica of the computation uses
+      only one core, and there is either only one replica, or the number of
+      replicas is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+    use_tpu: When false, the input `computation` is executed on the XLA CPU/GPU
+      backends. Currently, only supports a default placement (computation is
+      placed on GPU if one is available, and on CPU if not).
+    maximum_shapes: A nested structure of tf.TensorShape representing the shape
+      to which the respective component of each input element in each replica
+      should be padded. Any unknown dimensions (e.g. tf.Dimension(None) in a
+      tf.TensorShape or -1 in a tensor-like object) will be padded to the
+      maximum size of that dimension over all replicas. Note that if the input
+      dimension is already static, we won't do padding on it and we require the
+      maximum_shapes to have the same value or None on that dimension. The
+      structure of `maximum_shapes` needs to be the same as `inputs[0]`.
+
+  Returns:
+    A list of lists with the first list corresponding to the compile op and the
+    second a list of output tensors, indexed by `[replica_num][output_num]`.
+  Raises:
+    ValueError: If all replicas do not have equal numbers of input tensors.
+    ValueError: If the number of inputs per replica does not match
+      the number of formal parameters to `computation`.
+    ValueError: If the static `inputs` dimensions don't match with the values
+      given in `maximum_shapes`.
+    ValueError: If the structure of inputs per replica does not match
+      the structure of `maximum_shapes`.
+  """
+  del name
+  inputs = [[]] if inputs is None else inputs
+
+  metadata_kwargs = {}
+  if device_assignment is not None:
+    # Turn the Numpy array into a flattened list so we can pass it as an
+    # operator attribute.
+    metadata_kwargs = {
+        "topology":
+            device_assignment.topology.serialized(),
+        "device_assignment":
+            device_assignment.core_assignment.flatten().tolist()
+    }
+    # TODO(phawkins): remove this case after the forward compatibility window
+    # expires on 2018-10-5.
+    if api_compat.forward_compatible(2018, 10, 5):
+      metadata_kwargs["num_cores_per_replica"] = (
+          device_assignment.num_cores_per_replica)
+    else:
+      metadata_kwargs["computation_shape"] = [
+          device_assignment.num_cores_per_replica
+      ]
+
+  if ((not isinstance(inputs, list)) or
+      any(not isinstance(inp, (list, tuple)) for inp in inputs)):
+    raise TypeError("tpu.replicate() inputs must be a list of lists/tuples")
+
+  num_replicas = len(inputs)
+
+  # No replicas? Nothing to do.
+  if num_replicas == 0:
+    return []
+
+  # Checks all replicas have the same structure.
+  for i in xrange(1, num_replicas):
+    nest.assert_same_structure(inputs[0], inputs[i])
+
+  # Flatten inputs.
+  flat_inputs = [
+      nest.flatten(per_replica_input) for per_replica_input in inputs
+  ]
+  # Converts inputs to Tensors.
+  flat_inputs = [[ops.convert_to_tensor(x) for x in inp] for inp in flat_inputs]
+
+  # Verifies that all replicas have matching numbers and types of inputs
+  flat_input_types = [x.dtype for x in flat_inputs[0]]
+  input_arity = len(inputs[0])
+  flat_input_arity = len(flat_input_types)
+  for i in range(num_replicas):
+    if len(inputs[i]) != input_arity:
+      raise ValueError("Replicas must have the same number of inputs. "
+                       "Replica 0 had {} inputs, replica {} had {} "
+                       "inputs.".format(input_arity, i, len(inputs[i])))
+
+    types = [x.dtype for x in flat_inputs[i]]
+    if types != flat_input_types:
+      raise ValueError("Replicas must have matching input types. Replica 0 had "
+                       "input types {}, replica {} had input types {}".format(
+                           flat_input_types, i, types))
+
+  arg_error = xla.check_function_argument_count(
+      computation, input_arity, infeed_queue)
+  if arg_error is not None:
+    if infeed_queue is None:
+      raise TypeError(
+          "Supplied computation cannot be called with the specified inputs. "
+          "You specified %d inputs: %s, but the computation needs %s" % (
+              input_arity, str([i.name for i in inputs[0]]), arg_error))
+    else:
+      raise TypeError(
+          "Supplied computation cannot be called with the specified inputs. "
+          "You specified %d inputs: %s and %d additional inputs from infeed,"
+          " but the computation needs %s" % (input_arity, str(
+              [i.name
+               for i in inputs[0]]), infeed_queue.number_of_tuple_elements,
+                                             arg_error))
+
+  if maximum_shapes:
+    if infeed_queue:
+      raise ValueError(
+          "Dynamic input shapes are not supported with infeed queues")
+
+    # Make sure maximum_shapes has the same structure as inputs.
+    nest.assert_same_structure(inputs[0], maximum_shapes, check_types=False)
+
+    # Flatten padded shapes.
+    flat_maximum_shapes = nest.flatten(maximum_shapes)
+    flat_maximum_shapes = [
+        tensor_shape.TensorShape(s) for s in flat_maximum_shapes
+    ]
+
+    flat_inputs, padding_maps = _pad_all_input(flat_inputs, flat_maximum_shapes)
+
+    serialized_padding_maps = []
+    for padding_map in padding_maps:
+      serialized_padding_maps.append(padding_map.SerializeToString())
+    metadata_kwargs["padding_map"] = serialized_padding_maps
+
+  metadata_kwargs["step_marker_location"] = getattr(
+      computation, "step_marker_location", "STEP_MARK_AT_ENTRY")
+
+  graph = ops.get_default_graph()
+
+  # Fan-in: Builds a TPUReplicatedInput node for each input.
+  flat_replicated_inputs = []
+  for i in range(0, len(flat_inputs[0])):
+    replicas = [flat_inputs[replica][i] for replica in xrange(num_replicas)]
+    flat_replicated_inputs.append(
+        tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
+
+  cluster_name = graph.unique_name("cluster")
+  pivot = control_flow_ops.no_op(name=cluster_name + "/pivot")
+  context = TPUReplicateContext(
+      name=cluster_name, num_replicas=num_replicas, pivot=pivot)
+  try:
+    context.Enter()
+
+    metadata = tpu_ops.tpu_replicate_metadata(
+        num_replicas=num_replicas, use_tpu=use_tpu, **metadata_kwargs)
+
+    with tpu_function.tpu_shard_context(
+        num_replicas), ops.control_dependencies([metadata]):
+
+      # Add identity ops so even unused inputs are "consumed" by the
+      # computation. This is to avoid orphaned TPUReplicatedInput nodes.
+      # TODO(phawkins): consider instead pruning unused TPUReplicatedInput
+      # and eliding trivial TPUReplicatedInput/TPUReplicatedOutput pairs.
+      flat_replicated_inputs = [
+          array_ops.identity(x, name="replicated_input_{}".format(i))
+          for i, x in enumerate(flat_replicated_inputs)
+      ]
+      for i in flat_replicated_inputs:
+        # pylint: disable=protected-access
+        # Add an attribute to the identity node so that they could be removed in
+        # encapsulate TPU computation pass if unused. However we don't remove
+        # inputs when dynamic padding is enabled.
+        # TODO(rxsang): Use other ways except argument index in padding_map so
+        # outside compilation can work with dynamic padding correctly.
+        if maximum_shapes is None:
+          i.op._set_attr("_tpu_input_identity",
+                         attr_value_pb2.AttrValue(b=True))
+        # pylint: enable=protected-access
+
+      # Unflatten the computation inputs to match original input structure.
+      computation_inputs = nest.pack_sequence_as(
+          structure=inputs[0],
+          flat_sequence=flat_replicated_inputs[:flat_input_arity])
+
+      # If there is an infeed queue, adds the dequeued values to the
+      # computation's inputs.
+      if infeed_queue is not None:
+        infeed_queue.set_number_of_shards(num_replicas)
+        for t in infeed_queue.generate_dequeue_op():
+          computation_inputs.append(t)
+
+      # Only resource variables work inside a TPU computation, so turn on
+      # resource variables for the computation.
+      # TODO(phawkins): consider removing this code. It will
+      # be less confusing to clients if they knowingly choose to use resource
+      # variables.
+      # Partitioned variables is not supported (b/112311320).
+      vscope = variable_scope.get_variable_scope()
+      saved_use_resource = vscope.use_resource
+      saved_custom_getter = vscope.custom_getter
+
+      def custom_getter(getter, name, *args, **kwargs):
+        """Variables on TPU have a few restrictions."""
+        partitioner = kwargs["partitioner"]
+        if partitioner is not None:
+          kwargs["partitioner"] = None
+          logging.warning(
+              "Partitioned variables are not supported on TPU. Got "
+              "`partitioner` that is {} for variable {}. "
+              "Setting `partitioner` to `None`."
+              .format(partitioner, name))
+        if saved_custom_getter is None:
+          return getter(name, *args, **kwargs)
+        else:
+          return saved_custom_getter(getter, name, *args, **kwargs)
+
+      vscope.set_use_resource(True)
+      vscope.set_custom_getter(custom_getter)
+
+      outputs = computation(*computation_inputs)
+
+      vscope.set_use_resource(saved_use_resource)
+      vscope.set_custom_getter(saved_custom_getter)
+
+    outputs_is_flat = xla.is_flat(outputs)
+    if outputs_is_flat:
+      output_tensors, control_deps = _postprocess_flat_outputs(outputs)
+    else:
+      output_tensors, control_deps = _postprocess_non_flat_outputs(outputs)
+
+    # tensor_tracer imports tpu.py. Local import to tensor_tracer to avoid
+    # import-cycle
+    # pylint: disable=g-import-not-at-top
+    from tensorflow.python.tpu import tensor_tracer
+    # pylint: enable=g-import-not-at-top
+    if tensor_tracer.TensorTracer.is_enabled():
+      tt = tensor_tracer.TensorTracer()
+      output_tensors = tt.trace_tpu(ops.get_default_graph(),
+                                    output_tensors, control_deps,
+                                    num_replicas)
+
+    context.ExitResult(output_tensors)
+  finally:
+    context.report_unsupported_operations()
+    context.Exit()
+    host_compute_core = context.HostComputeCore()
+
+  if host_compute_core:
+    attr_value = attr_value_pb2.AttrValue()
+    attr_value.list.s.extend([compat.as_bytes(x) for x in host_compute_core])
+    metadata._set_attr("host_compute_core", attr_value)  # pylint: disable=protected-access
+
+  with ops.control_dependencies([metadata]):
+    if use_tpu:
+      compile_status = tpu_ops.tpu_compilation_result()
+      op = compile_status.op
+      attr_value = attr_value_pb2.AttrValue(s=compat.as_bytes(cluster_name))
+      op._set_attr(_TPU_COMPILATION_STATUS_ATTR, attr_value)  # pylint: disable=protected-access
+    else:
+      compile_status = control_flow_ops.no_op(name="compilation_status")
+
+  if not output_tensors:
+    # Returns a list of NoOps dependent on the replication Op, indexed by
+    # [replica_num].
+    return [
+        compile_status,
+        [
+            control_flow_ops.group(control_deps, name="shard_%d" % i)
+            for i in range(num_replicas)
+        ]
+    ]
+
+  # Fan-out: Builds a TPUReplicatedOutput node for each output.
+  replicated_outputs = [[] for i in xrange(num_replicas)]
+  for i, t in enumerate(output_tensors):
+    # Fan-out: Builds a TPUReplicatedOutput node for each output.
+    ys = tpu_ops.tpu_replicated_output(
+        t, num_replicas, name="output{}".format(i))
+
+    # Wraps the outputs in identity operators so the names of any possible
+    # `fetch` nodes are preserved by the replication rewrite.
+    with ops.control_dependencies(control_deps):
+      for replica in xrange(num_replicas):
+        replicated_outputs[replica].append(
+            array_ops.identity(
+                ys[replica], name="output_%d_shard_%d" % (i, replica)))
+
+  if not outputs_is_flat:
+    replicated_outputs = [
+        nest.pack_sequence_as(outputs, replica_outs)
+        for replica_outs in replicated_outputs
+    ]
+
+  return [compile_status, replicated_outputs]
+
+
+def _postprocess_flat_outputs(outputs):
+  """Validates non-flat outputs, add backs device assignments and other attrs.
+
+  Args:
+    outputs: Output from `computation` inside `tpu.rewrite`.
+
+  Returns:
+    Tensors and Operations extracted from outputs.
+  """
+  # Following code segment is to preserve legacy behavior. Previously we only
+  # supported flat outputs and thus for consistency it was nice to convert even
+  # single element into a tuple. But now that we support arbitrary output
+  # structure, this is no longer necessary.
+  # TODO(b/121383831): Migrate all legacy use cases and delete this special
+  # case.
+  # If the computation returns `None`, make it an empty tuple.
+  if outputs is None:
+    outputs = tuple()
+  # If the computation only returned one value, makes it a tuple.
+  if not isinstance(outputs, collections.Sequence):
+    outputs = (outputs,)
+
+  # Append `no_op` here so that fetching any return value of this function
+  # will trigger TPUExecute node.
+  outputs += (control_flow_ops.no_op(),)
+  try:
+    with ops.device(core(0)):
+      outputs = [
+          o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
+          for o in outputs
+      ]
+  except Exception as e:
+    raise ValueError(
+        "TPU function return values must all either be Operations or "
+        "convertible to Tensors. Got '%s'" % str(e))
+
+  # Separates the returned Operations and Tensors.
+  output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
+  output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)]
+
+  if outputs != output_tensors + output_operations:
+    raise ValueError(
+        "TPU functions must return zero-or more Tensor values followed by "
+        "zero or more Operations.")
+
+  # Wraps outputs in Identity ops. Otherwise a replicated input copied
+  # straight to an output would bypass the replicate(). This would be bad
+  # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
+  # be rewritten away, leading to a runtime error.
+  # TODO(phawkins): extend the rewrite to elide these nodes instead.
+  new_output_tensors = []
+  for t in output_tensors:
+    with ops.device(t.device if t.device else core(0)):
+      o = array_ops.identity(t)
+      # pylint: disable=protected-access
+      o.op._set_attr("_tpu_output_identity", attr_value_pb2.AttrValue(b=True))
+      # pylint: enable=protected-access
+      new_output_tensors.append(o)
+  return new_output_tensors, output_operations
+
+
+def _postprocess_non_flat_outputs(outputs):
+  """Validates non-flat outputs, add backs device assignments and other attrs.
+
+  Args:
+    outputs: Output from `computation` inside `tpu.rewrite`.
+
+  Returns:
+    Tensors extracted from outputs and an empty list because Operations are not
+    allowed in non-flat outputs..
+  """
+
+  # Flatten output items.
+  flat_outputs = nest.flatten(outputs)
+
+  # Convert all non-Operation outputs to Tensors.
+  for i, o in enumerate(flat_outputs):
+    if isinstance(o, ops.Operation):
+      raise ValueError(
+          "tpu.rewrite does not support Operation as return value in non-flat "
+          "output structure. You can set returned Operations as control "
+          "dependencies of returned Tensors so Operations are triggered when "
+          'Tensors are evaluated. Operation found: "%s"' % o.name)
+
+    try:
+      o = ops.convert_to_tensor(o)
+    except Exception as e:
+      raise ValueError(
+          "TPU function return values must all either be Operations or "
+          'convertible to Tensors. Got error: "%s"' % str(e))
+
+    # Wraps outputs in Identity ops. Otherwise a replicated input copied
+    # straight to an output would bypass the replicate(). This would be bad
+    # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
+    # be rewritten away, leading to a runtime error.
+    # TODO(phawkins): extend the rewrite to elide these nodes instead.
+    with ops.device(core(0)):
+      o = array_ops.identity(o)
+      # pylint: disable=protected-access
+      o.op._set_attr("_tpu_output_identity", attr_value_pb2.AttrValue(b=True))
+      # pylint: enable=protected-access
+      flat_outputs[i] = array_ops.identity(o)
+
+  # All flat_outputs are Tensors, and no Operations.
+  return flat_outputs, []
+
+
+def split_compile_and_shard(computation,
+                            inputs=None,
+                            num_shards=1,
+                            input_shard_axes=None,
+                            outputs_from_all_shards=True,
+                            output_shard_axes=None,
+                            infeed_queue=None,
+                            device_assignment=None,
+                            name=None):
+  """Shards `computation` for parallel execution.
+
+  `inputs` must be a list of Tensors or None (equivalent to an empty list), each
+  of which has a corresponding split axis (from `input_shard_axes`). Each input
+  is split into `num_shards` pieces along the corresponding axis, and
+  computation is applied to each shard in parallel.
+
+  Tensors are broadcast to all shards if they are lexically captured by
+  `computation`. e.g.,
+
+  x = tf.constant(7)
+  def computation():
+    return x + 3
+  ... = shard(computation, ...)
+
+  If `outputs_from_all_shards` is true, the outputs from all shards of
+  `computation` are concatenated back together along their `output_shards_axes`.
+  Otherwise, each output is taken from an arbitrary shard.
+
+  Inputs and outputs of the computation must be at least rank-1 Tensors.
+
+  Args:
+    computation: A Python function that builds a computation to apply to each
+      shard of the input.
+    inputs: A list of input tensors or None (equivalent to an empty list). Each
+      input tensor has a corresponding shard axes, given by `input_shard_axes`,
+      which must have size divisible by `num_shards`.
+    num_shards: The number of shards.
+    input_shard_axes: A list of dimensions along which to shard `inputs`, or
+      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
+      there must be one dimension per input.
+    outputs_from_all_shards: Boolean or list of boolean. For each output, if
+      `True`, outputs from all shards are concatenated along the corresponding
+      `output_shard_axes` entry. Otherwise, each output is taken
+      from an arbitrary shard. If the argument is a boolean, the argument's
+      value is used for each output.
+    output_shard_axes: A list of dimensions along which to concatenate the
+      outputs of `computation`, or `None`. `None` means "concatenate all outputs
+      along dimension 0". If not `None`, there must be one dimension per output.
+      Ignored if `outputs_from_all_shards` is False.
+    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
+      of `computation`.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each shard of the computation uses
+      only one core, and there is either only one shard, or the number of shards
+      is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+  Returns:
+    A tuple of (compile op, [output tensors]).
+  Raises:
+    ValueError: If num_shards <= 0
+    ValueError: If len(input_shard_axes) != len(inputs)
+    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
+  """
+  # TODO(phawkins): consider adding support for broadcasting Tensors passed as
+  # inputs.
+
+  if num_shards <= 0:
+    raise ValueError("num_shards must be a positive integer.")
+
+  inputs = [] if inputs is None else inputs
+  if not isinstance(inputs, list):
+    raise TypeError("tpu.shard()'s inputs must be a list of Tensors or None.")
+
+  # Converts inputs to Tensors.
+  inputs = [ops.convert_to_tensor(x) for x in inputs]
+
+  if input_shard_axes is None:
+    input_shard_axes = [0] * len(inputs)
+  if len(inputs) != len(input_shard_axes):
+    raise ValueError("Length of input_shard_axes must be equal to the number "
+                     "of inputs.")
+
+  if inputs:
+    # Splits the `inputs` along the corresponding `input_shard_axes`, giving
+    # lists with layout [input][shard]
+    split_inputs = [
+        array_ops.split(x, num_shards, axis=axis)
+        for (axis, x) in zip(input_shard_axes, inputs)]
+
+    # Transposes the input lists to have layout [shard][input]
+    transposed_inputs = [list(i) for i in zip(*split_inputs)]
+  else:
+    transposed_inputs = [[]] * num_shards
+
+  compile_op, outputs = split_compile_and_replicate(
+      computation,
+      transposed_inputs,
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)
+
+  # There must be at least one shard since num_shards > 0.
+  # TODO(b/36647078) remove disable when pylint bug is fixed.
+  # pylint: disable=indexing-exception
+  if isinstance(outputs[0], ops.Operation):
+    # pylint: enable=indexing-exception
+    # There were no outputs from the computation and replicate returned a list
+    # of NoOps with control dependencies on the computation. Return the first
+    # one so it can be used as a control dependency or fetch node.
+    # TODO(b/36647078) remove disable when pylint bug is fixed.
+    # pylint: disable=indexing-exception
+    return compile_op, [outputs[0]]
+    # pylint: enable=indexing-exception
+
+  # TODO(b/36647078) remove disable when pylint bug is fixed.
+  # pylint: disable=indexing-exception
+  num_outputs = len(outputs[0])
+  # pylint: enable=indexing-exception
+
+  if output_shard_axes is None:
+    output_shard_axes = [0] * num_outputs
+  if num_outputs != len(output_shard_axes):
+    raise ValueError("Length of output_shard_axes must be equal to the number "
+                     "of outputs.")
+
+  if isinstance(outputs_from_all_shards, bool):
+    outputs_from_all_shards = [outputs_from_all_shards] * num_outputs
+
+  if num_outputs != len(outputs_from_all_shards):
+    raise ValueError("Length of outputs_from_all_shards must be equal to the "
+                     "number of outputs.")
+
+  results = []
+  for (axis, all_shards, x) in zip(output_shard_axes, outputs_from_all_shards,
+                                   zip(*outputs)):
+    if all_shards:
+      # Concatenate all of the outputs together (use stack for scalars).
+      shape = x[0].shape
+      is_scalar = shape is not None and (shape.ndims == 0)
+      results.append((array_ops.stack(list(x)) if is_scalar
+                      else array_ops.concat(list(x), axis=axis)))
+    else:
+      # TODO(phawkins): use a smarter policy, e.g., round-robin across shards.
+      results.append(x[0])
+
+  return compile_op, results
+
+
+def shard(computation,
+          inputs=None,
+          num_shards=1,
+          input_shard_axes=None,
+          outputs_from_all_shards=True,
+          output_shard_axes=None,
+          infeed_queue=None,
+          device_assignment=None,
+          name=None):
+  """Shards `computation` for parallel execution.
+
+  `inputs` must be a list of Tensors or None (equivalent to an empty list), each
+  of which has a corresponding split axis (from `input_shard_axes`). Each input
+  is split into `num_shards` pieces along the corresponding axis, and
+  computation is applied to each shard in parallel.
+
+  Tensors are broadcast to all shards if they are lexically captured by
+  `computation`. e.g.,
+
+  x = tf.constant(7)
+  def computation():
+    return x + 3
+  ... = shard(computation, ...)
+
+  TODO(phawkins): consider adding support for broadcasting Tensors passed
+  as inputs.
+
+  If `outputs_from_all_shards` is true, the outputs from all shards of
+  `computation` are concatenated back together along their `output_shards_axes`.
+  Otherwise, each output is taken from an arbitrary shard.
+
+  Inputs and outputs of the computation must be at least rank-1 Tensors.
+
+  Args:
+    computation: A Python function that builds a computation to apply to each
+      shard of the input.
+    inputs: A list of input tensors or None (equivalent to an empty list). Each
+      input tensor has a corresponding shard axes, given by `input_shard_axes`,
+      which must have size divisible by `num_shards`.
+    num_shards: The number of shards.
+    input_shard_axes: A list of dimensions along which to shard `inputs`, or
+      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
+      there must be one dimension per input.
+    outputs_from_all_shards: Boolean or list of boolean. For each output, if
+      `True`, outputs from all shards are concatenated along the corresponding
+      `output_shard_axes` entry. Otherwise, each output is taken
+      from an arbitrary shard. If the argument is a boolean, the argument's
+      value is used for each output.
+    output_shard_axes: A list of dimensions along which to concatenate the
+      outputs of `computation`, or `None`. `None` means "concatenate all outputs
+      along dimension 0". If not `None`, there must be one dimension per output.
+      Ignored if `outputs_from_all_shards` is False.
+    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
+      of `computation`.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each shard of the computation uses
+      only one core, and there is either only one shard, or the number of shards
+      is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+  Returns:
+    A list of output tensors.
+  Raises:
+    ValueError: If num_shards <= 0
+    ValueError: If len(input_shard_axes) != len(inputs)
+    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
+  """
+  return split_compile_and_shard(
+      computation,
+      inputs=inputs,
+      num_shards=num_shards,
+      input_shard_axes=input_shard_axes,
+      outputs_from_all_shards=outputs_from_all_shards,
+      output_shard_axes=output_shard_axes,
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)[1]
+
+
+def batch_parallel(computation,
+                   inputs=None,
+                   num_shards=1,
+                   infeed_queue=None,
+                   device_assignment=None,
+                   name=None):
+  """Shards `computation` along the batch dimension for parallel execution.
+
+  Convenience wrapper around shard().
+
+  `inputs` must be a list of Tensors or None (equivalent to an empty list).
+  Each input is split into `num_shards` pieces along the 0-th dimension, and
+  computation is applied to each shard in parallel.
+
+  Tensors are broadcast to all shards if they are lexically captured by
+  `computation`. e.g.,
+
+  x = tf.constant(7)
+  def computation():
+    return x + 3
+  ... = shard(computation, ...)
+
+  The outputs from all shards are concatenated back together along their 0-th
+  dimension.
+
+  Inputs and outputs of the computation must be at least rank-1 Tensors.
+
+  Args:
+    computation: A Python function that builds a computation to apply to each
+      shard of the input.
+    inputs: A list of input tensors or None (equivalent to an empty list). The
+      0-th dimension of each Tensor must have size divisible by `num_shards`.
+    num_shards: The number of shards.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to `computation`.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each shard of the computation uses
+      only one core, and there is either only one shard, or the number of shards
+      is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+  Returns:
+    A list of output tensors.
+  Raises:
+    ValueError: If `num_shards <= 0`
+  """
+  return shard(
+      computation,
+      inputs,
+      num_shards=num_shards,
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)
+
+
+def rewrite(computation,
+            inputs=None,
+            infeed_queue=None,
+            device_assignment=None,
+            name=None):
+  """Rewrites `computation` for execution on a TPU system.
+
+  Args:
+    computation: A Python function that builds a computation to apply to the
+      input. If the function takes n inputs, 'inputs' should be a list of n
+      tensors.
+
+      `computation` may return a list of operations and tensors. Tensors must
+      come before operations in the returned list.  The return value of
+      `rewrite` is a list of tensors corresponding to the tensors from the
+      output of `computation`.
+
+      All `Operation`s constructed during `computation` will be executed when
+      evaluating any of the returned output tensors, not just the ones returned.
+    inputs: A list of input tensors or `None` (equivalent to an empty list).
+      Each input can be a nested structure containing values that are
+      convertible to tensors. Note that passing an N-dimension list of
+      compatible values will result in a N-dimention list of scalar tensors
+      rather than a single Rank-N tensors. If you need different behavior,
+      convert part of inputs to tensors with `tf.convert_to_tensor`.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to `computation`.
+    device_assignment: if not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. May be omitted for a single-core computation, in which
+      case the core attached to task 0, TPU device 0 is used.
+    name: (Deprecated) Does nothing.
+  Returns:
+    Same data structure as if computation(*inputs) is called directly with some
+    exceptions for correctness. Exceptions include:
+      1) None output: a NoOp would be returned which control-depends on
+         computation.
+      2) Single value output: A tuple containing the value would be returned.
+      3) Operation-only outputs: a NoOp would be returned which
+         control-depends on computation.
+      TODO(b/121383831): Investigate into removing these special cases.
+  """
+  # TODO(b/36647078) remove disable when pylint bug is fixed.
+  # pylint: disable=indexing-exception
+  return replicate(
+      computation,
+      None if inputs is None else [inputs],
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)[0]
+  # pylint: enable=indexing-exception
+
+  # Operations that indicate some error in the user's inference graph.
+_BLACKLISTED_INFERENCE_OPS = set([
+    "ReadVariableOp",
+    "AssignVariableOp",
+    "AssignAddVariableOp",
+    "AssignSubVariableOp",
+    "VarHandleOp",
+    "Variable",
+    "VariableV2",
+])
+
+
+def under_tpu_inference_context():
+  """Check if it is currently under `tpu.rewrite_for_inference()`."""
+  graph = ops.get_default_graph()
+
+  context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  while context:
+    if isinstance(context, _TPUInferenceContext):
+      return True
+    context = context.outer_context
+
+  return False
+
+
+class _TPUInferenceContext(control_flow_ops.XLAControlFlowContext):
+  """A `ControlFlowContext` for nodes inside a TPU inference computation.
+
+  The primary role of `TPUReplicateContext` is to sanity check operators inside
+  a tpu.rewrite_for_inference() computation.
+  """
+
+  def __init__(self, name):
+    super(_TPUInferenceContext, self).__init__()
+    self._name = name
+
+  def AddOp(self, op):
+    self._AddOpInternal(op)
+
+  def _AddOpInternal(self, op):
+    # pylint: disable=protected-access
+    if op.type in _BLACKLISTED_INFERENCE_OPS:
+      raise NotImplementedError(
+          "Operation of type %s (%s) is not supported on the TPU for inference."
+          " Execution will fail if this op is used in the graph. Make sure your"
+          " variables are using variable_scope." % (op.type, op.name))
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
+  def AddValue(self, val):
+    result = val
+    if self._outer_context:
+      result = self._outer_context.AddValue(val)
+    return result
+
+  def AddInnerOp(self, op):
+    self._AddOpInternal(op)
+
+  @property
+  def grad_state(self):
+    return None
+
+
+def validate_inference_rewrite_for_variables(graph):
+  """Validates whether rewrite_for_inference() 'worked' for variables.
+
+     The rewrite_for_inference() method is supposed to append GuaranteeConstOps
+     after ReadVariableOps, but this mechanism works only if you are using
+     tf.get_variable() to create and access variables in your tpu computation.
+     This validation method can be called immediately after calling
+     tpu.rewrite_for_inference() to check whether GuaranteeConstOps where added
+     to the graph.
+
+     Typical usages:
+       tpu.validate_inference_rewrite_for_variables(tf.get_default_graph())
+
+       tpu.validate_inference_rewrite_for_variables(sess.graph)
+
+  Args:
+    graph: The graph which needs to be validated.
+  Raises:
+    RuntimeError: if validation failed.
+  """
+  if not any(x.type == "GuaranteeConst" for x in graph.get_operations()):
+    raise RuntimeError(
+        "No GuaranteeConst ops found in the graph after running "
+        "tpu.rewrite_for_inference(...). Please check that you are using "
+        "tf.get_variable() to create and access variables in your tpu "
+        "computation.")
+
+
+def rewrite_for_inference(computation,
+                          inputs=None,
+                          infeed_queue=None,
+                          device_assignment=None,
+                          name=None):
+  """Rewrites `computation` for inference on a TPU system.
+
+     Other than 'rewriting' the computation to run on a TPU, if using variables
+     in your computation, it moves the ReadVariableOps outside the TPU
+     computation, and adds GuaranteeConst ops just after the ReadVariableOps.
+     This mechanism works only if you are using tf.get_variable() to create and
+     access variables in your tpu computation. You can validate whether this
+     worked, by calling validate_inference_rewrite_for_variables() method
+     immediately after this method to check whether GuaranteeConstOps where
+     added to the graph.
+
+  Args:
+    computation: A Python function that builds a computation to apply to the
+      input. If the function takes n inputs, 'inputs' should be a list of n
+      tensors. If the function returns m outputs, rewrite will return a list of
+      m tensors.
+    inputs: A list of input tensors or `None` (equivalent to an empty list).
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to `computation`.
+    device_assignment: if not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. May be omitted for a single-core computation, in which
+      case the core attached to task 0, TPU device 0 is used.
+    name: The name of the operator.
+  Returns:
+    A list of output tensors.
+  """
+
+  def guarantee_const_getter(getter, name, *args, **kwargs):
+    with ops.control_dependencies(None):
+      return array_ops.guarantee_const(
+          getter(name, *args, **kwargs), name=name + "/GuaranteeConst")
+
+  def wrapped_computation(*args, **kwargs):
+    """Execute computation under `_TPUInferenceContext`."""
+    context = _TPUInferenceContext(
+        name=ops.get_default_graph().unique_name("rewrite_for_inference"))
+    try:
+      context.Enter()
+
+      vscope = variable_scope.get_variable_scope()
+      prev_custom_getter = vscope.custom_getter
+      prev_caching_device = vscope.caching_device
+      vscope.set_custom_getter(guarantee_const_getter)
+      vscope.set_caching_device(lambda op: op.device)
+
+      result = computation(*args, **kwargs)
+
+      vscope.set_custom_getter(prev_custom_getter)
+      vscope.set_caching_device(prev_caching_device)
+    finally:
+      context.Exit()
+    return result
+
+  # pylint: disable=undefined-variable
+  return rewrite(
+      wrapped_computation,
+      inputs=inputs,
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)
+  # pylint: enable=undefined-variable
diff --git a/tensorflow/python/tpu/tpu_config.py b/tensorflow/python/tpu/tpu_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9038a95221249b84ba0b64f66ec7fb24e1877a1
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_config.py
@@ -0,0 +1,276 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+
+"""A RunConfig subclass with TPU support."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import json
+import os
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.estimator import run_config as run_config_lib
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import util as util_lib
+
+# pylint: disable=protected-access
+_TF_CONFIG_ENV = run_config_lib._TF_CONFIG_ENV
+_SERVICE_KEY = run_config_lib._SERVICE_KEY
+_TPU_WORKER_JOB_NAME = 'tpu_worker_job_name'
+# pylint: enable=protected-access
+
+
+class InputPipelineConfig(object):
+  r"""Please see the definition of these values in TPUConfig."""
+  PER_SHARD_V1 = 1
+  PER_HOST_V1 = 2
+  PER_HOST_V2 = 3
+  BROADCAST = 4
+
+
+class TPUConfig(
+    collections.namedtuple('TPUConfig', [
+        'iterations_per_loop',
+        'num_shards',
+        'num_cores_per_replica',
+        'per_host_input_for_training',
+        'tpu_job_name',
+        'initial_infeed_sleep_secs',
+        'input_partition_dims',
+    ])):
+  r"""TPU related configuration required by `TPUEstimator`.
+
+  Args:
+    iterations_per_loop: This is the number of train steps running in TPU
+      system before returning to CPU host for each `Session.run`. This means
+      global step is increased `iterations_per_loop` times in one `Session.run`.
+      It is recommended to be set as number of global steps for next checkpoint.
+    num_shards: (Deprecated, ignored by TPUEstimator).
+      The number of model replicas in the system. For non-model-parallelism
+      case, this number equals the total number of TPU cores. For
+      model-parallelism, the total number of TPU cores equals
+      num_cores_per_replica * num_shards.
+    num_cores_per_replica: Defaults to `None`, which disables model parallelism.
+      An integer which describes the number of TPU cores per model replica. This
+      is required by model-parallelism which enables partitioning
+      the model to multiple cores. Currently num_cores_per_replica must be
+      1, 2, 4, or 8.
+    per_host_input_for_training: If `True`, `PER_HOST_V1`, or `PER_HOST_V2`,
+      `input_fn` is invoked once on each host. With the per-core input pipeline
+      configuration, it is invoked once for each core.
+      With a global batch size `train_batch_size` in `TPUEstimator` constructor,
+      the batch size for each shard is `train_batch_size` // #hosts in the
+      `True` or `PER_HOST_V1` mode. In `PER_HOST_V2` mode, it is
+      `train_batch_size` // #cores. In `BROADCAST` mode, `input_fn` is only
+      invoked once on host 0 and the tensors are broadcasted to all other
+      replicas. The batch size equals to train_batch_size`. With the per-core
+      input pipeline configuration, the shard batch size is also
+      `train_batch_size` // #cores.
+      Note: per_host_input_for_training==PER_SHARD_V1 only supports mode.TRAIN.
+    tpu_job_name: The name of the TPU job. Typically, this name is auto-inferred
+      within TPUEstimator, however when using ClusterSpec propagation in more
+      esoteric cluster configurations, you may need to specify the job name as a
+      string.
+    initial_infeed_sleep_secs: The number of seconds the infeed thread should
+      wait before enqueueing the first batch. This helps avoid timeouts for
+      models that require a long compilation time.
+    input_partition_dims: A nested list to describe the partition dims
+      for all the tensors from input_fn(). The structure of
+      input_partition_dims must match the structure of `features` and
+      `labels` from input_fn(). The total number of partitions must match
+      `num_cores_per_replica`. For example, if input_fn() returns two tensors:
+      images with shape [N, H, W, C] and labels [N].
+      input_partition_dims = [[1, 2, 2, 1], None] will split the images to 4
+      pieces and feed into 4 TPU cores. labels tensor are directly broadcasted
+      to all the TPU cores since the partition dims is `None`.
+      Current limitations: This feature is only supported with the PER_HOST_V2
+      input mode.
+
+    Raises:
+      ValueError: If `num_cores_per_replica` is not 1, 2, 4, 8 or 16.
+  """
+
+  def __new__(cls,
+              iterations_per_loop=2,
+              num_shards=None,
+              num_cores_per_replica=None,
+              per_host_input_for_training=True,
+              tpu_job_name=None,
+              initial_infeed_sleep_secs=None,
+              input_partition_dims=None):
+
+    # Check iterations_per_loop.
+    util_lib.check_positive_integer(iterations_per_loop,
+                                    'TPUConfig iterations_per_loop')
+
+    # Check num_shards.
+    if num_shards is not None:
+      util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards')
+
+    if input_partition_dims is not None:
+      if len(input_partition_dims) != 1 and len(input_partition_dims) != 2:
+        raise ValueError(
+            'input_partition_dims must be a list/tuple with one or two'
+            ' elements.')
+
+      if per_host_input_for_training is not InputPipelineConfig.PER_HOST_V2:
+        raise ValueError(
+            'input_partition_dims is only supported in PER_HOST_V2 mode.')
+
+      if num_cores_per_replica is None:
+        raise ValueError(
+            'input_partition_dims requires setting num_cores_per_replica.')
+
+    # Check num_cores_per_replica
+    if num_cores_per_replica is not None:
+      if num_cores_per_replica not in [1, 2, 4, 8, 16]:
+        raise ValueError(
+            'num_cores_per_replica must be 1, 2, 4, 8, or 16; got {}'.format(
+                str(num_cores_per_replica)))
+
+    # per_host_input_for_training may be True, False, or integer in [1..3].
+    # Map legacy values (True, False) to numeric values.
+    if per_host_input_for_training is False:
+      per_host_input_for_training = InputPipelineConfig.PER_SHARD_V1
+    elif per_host_input_for_training is True:
+      per_host_input_for_training = InputPipelineConfig.PER_HOST_V1
+
+    # Check initial_infeed_sleep_secs.
+    if initial_infeed_sleep_secs:
+      util_lib.check_positive_integer(initial_infeed_sleep_secs,
+                                      'TPUConfig initial_infeed_sleep_secs')
+
+    tpu_job_name = tpu_job_name or _get_tpu_job_name_from_tf_config()
+
+    return super(TPUConfig, cls).__new__(
+        cls,
+        iterations_per_loop=iterations_per_loop,
+        num_shards=num_shards,
+        num_cores_per_replica=num_cores_per_replica,
+        per_host_input_for_training=per_host_input_for_training,
+        tpu_job_name=tpu_job_name,
+        initial_infeed_sleep_secs=initial_infeed_sleep_secs,
+        input_partition_dims=input_partition_dims)
+
+
+class RunConfig(run_config_lib.RunConfig):
+  """RunConfig with TPU support."""
+
+  def __init__(self,
+               tpu_config=None,
+               evaluation_master=None,
+               master=None,
+               cluster=None,
+               **kwargs):
+    """Constructs a RunConfig.
+
+    Args:
+      tpu_config: the TPUConfig that specifies TPU-specific configuration.
+      evaluation_master: a string. The address of the master to use for eval.
+        Defaults to master if not set.
+      master: a string. The address of the master to use for training.
+      cluster: a ClusterResolver
+      **kwargs: keyword config parameters.
+
+    Raises:
+      ValueError: if cluster is not None and the provided session_config has a
+        cluster_def already.
+    """
+    super(RunConfig, self).__init__(**kwargs)
+    self._tpu_config = tpu_config or TPUConfig()
+    self._cluster = cluster
+
+    # If user sets master and/or evaluation_master explicitly, including empty
+    # string '', take it. Otherwise, take the values set by parent class.
+    if master is not None:
+      if cluster is not None:
+        raise ValueError('Both master and cluster are set.')
+      self._master = master
+    else:
+      if cluster:
+        self._master = cluster.master()
+
+    if evaluation_master is not None:
+      self._evaluation_master = evaluation_master
+    elif (not self._evaluation_master and
+          self.task_type != run_config_lib.TaskType.EVALUATOR):
+      # If the task type is EVALUATOR, it means some cluster manager sets the
+      # TF_CONFIG. In that case, we respect the configuration in TF_CONFIG.
+      #
+      # Otherwise, it means user executes the code without external cluster
+      # manager. For that, we optimize the user experience by setting
+      # evaluation_master to master, unless user overwrites it.
+      self._evaluation_master = self._master
+
+    # Set the ClusterSpec to use
+    if cluster:
+      self._cluster_spec = cluster.cluster_spec()
+
+      # Merge the cluster_def into the ConfigProto.
+      if self._session_config is None:  # pylint: disable=access-member-before-definition
+        self._session_config = config_pb2.ConfigProto(
+            allow_soft_placement=True, isolate_session_state=True)
+      if self._session_config.HasField('cluster_def'):
+        raise ValueError(
+            'You cannot provide a ClusterResolver and '
+            'session_config.cluster_def.')
+      if self._cluster_spec:
+        self._session_config.cluster_def.CopyFrom(
+            self._cluster_spec.as_cluster_def())
+
+  def _maybe_overwrite_session_config_for_distributed_training(self):
+    # Overrides the parent class session_config overwrite for between-graph. TPU
+    # runs with in-graph, which should not have device filter. Doing nothing
+    # ("pass") basically disables it.
+    pass
+
+  @property
+  def evaluation_master(self):
+    return self._evaluation_master
+
+  @property
+  def master(self):
+    return self._master
+
+  @property
+  def tpu_config(self):
+    return self._tpu_config
+
+  @property
+  def cluster(self):
+    return self._cluster
+
+  def replace(self, **kwargs):
+    if 'tpu_config' not in kwargs:
+      return super(RunConfig, self).replace(**kwargs)
+
+    tpu_config = kwargs.pop('tpu_config')
+    new_instance = super(RunConfig, self).replace(**kwargs)
+    new_instance._tpu_config = tpu_config  # pylint: disable=protected-access
+    return new_instance
+
+
+def _get_tpu_job_name_from_tf_config():
+  """Extracts the TPU job name from TF_CONFIG env variable."""
+  # TODO(xiejw): Extends this to support both TF_CONFIG env variable and cluster
+  # spec propagation.
+  tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
+  tpu_job_name = tf_config.get(_SERVICE_KEY, {}).get(_TPU_WORKER_JOB_NAME)
+  if tpu_job_name:
+    logging.info('Load TPU job name from TF_CONFIG: %s', tpu_job_name)
+  return tpu_job_name
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py b/tensorflow/python/tpu/tpu_config_test.py
similarity index 98%
rename from tensorflow/contrib/tpu/python/tpu/tpu_config_test.py
rename to tensorflow/python/tpu/tpu_config_test.py
index b2fe0a688861503ae0bc55208f5dfc4d664419fd..22fb3032169851e5ee58d6b40bef52ece8593ba1 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py
+++ b/tensorflow/python/tpu/tpu_config_test.py
@@ -20,10 +20,10 @@ from __future__ import print_function
 
 import json
 
-from tensorflow.contrib.tpu.python.tpu import tpu_config as tpu_config_lib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu_config as tpu_config_lib
 
 
 def _set_tf_config_env_variable(tf_config):
diff --git a/tensorflow/python/tpu/tpu_context.py b/tensorflow/python/tpu/tpu_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..2511e427a26fbfd58b3b3c7a2179f2cd74626ce2
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_context.py
@@ -0,0 +1,763 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPU system metadata and associated tooling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from contextlib import contextmanager
+import copy
+
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import _tpu_estimator_embedding
+from tensorflow.python.tpu import device_assignment as tpu_device_assignment
+from tensorflow.python.tpu import tpu_config
+from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
+
+
+_DEFAULT_JOB_NAME = 'tpu_worker'
+_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
+_LOCAL_MASTERS = ('', 'local')
+_NUM_CORES_TO_COMPUTATION_SHAPE = {
+    1: [1, 1, 1],
+    2: [1, 1, 2],
+    4: [1, 2, 2],
+    8: [2, 2, 2],
+    16: [4, 2, 2],
+}
+
+
+class TPUContext(object):
+  """A context that holds the current configuration of the TPU computation."""
+
+  def __init__(self,
+               internal_ctx,
+               input_device=None,
+               invocation_index=None,
+               call_from_input_fn=True):
+    self._internal_ctx = internal_ctx
+    self._input_device = input_device
+    self._invocation_index = invocation_index
+    self._call_from_input_fn = call_from_input_fn
+
+  def current_input_fn_deployment(self):
+    """The configuration of the current input_fn invocation.
+
+    The configuration depends on `TPUConfig.per_host_input_for_training`. See
+    `TPUConfig` for details.
+
+    Only set in params dict of input_fn
+
+    Returns:
+      A tuple of
+        1. Device spec string: String, is the current CPU host where the
+           input_fn is invoked.
+        2. Current invocation index: Int, 0-based index of the input_fn
+           invocation. See next item for details.
+        3. Total invocation count: Int, the total number of times to invoke the
+           input_fn on all CPU hosts. Each invocation will be passed with a new
+           `TPUContext` instance with current invocation index set properly.
+        4. Total number of replicas consumed by current_invocation: Int, the
+           number of replicas fed by the data returned by current input_fn. For
+           example, for per_core input pipeline deployment
+           and non-model-parallelism, total invocation count is equal to
+           the number of cores in the system and num replicas consumed by
+           current invocation is 1. For per-host v2 input pipeline deployment,
+           total invocation count is equal to the number of hosts in the system
+           and num replicas consumed by current invocation is equal to number of
+           cores per host.
+
+    Raises:
+      RuntimeError: If this method must not be called from input_fn.
+    """
+    if not self._call_from_input_fn:
+      raise RuntimeError('This TPUContext instance must not be called from'
+                         ' model_fn.')
+
+    if self._internal_ctx.is_input_sharded_per_core():
+      total_invocation_count = (self._internal_ctx.num_hosts
+                                * self._internal_ctx.num_of_replicas_per_host)
+      replicas_consumed = 1
+    elif self._internal_ctx.is_input_broadcast_with_iterators():
+      total_invocation_count = 1
+      replicas_consumed = self._internal_ctx.num_replicas
+    else:
+      total_invocation_count = self._internal_ctx.num_hosts
+      replicas_consumed = self._internal_ctx.num_of_replicas_per_host
+    return (self._input_device, self._invocation_index,
+            total_invocation_count, replicas_consumed)
+
+  @property
+  def num_replicas(self):
+    """The total number of replicas.
+
+    For non-model-parallelism, num_replicas should be the total num of TPU
+    cores in the system.
+
+    Returns:
+      The number of replicas.
+    """
+    return self._internal_ctx.num_replicas
+
+  @property
+  def num_hosts(self):
+    """The number of hosts for the TPU system."""
+    return self._internal_ctx.num_hosts
+
+  @property
+  def current_host(self):
+    """The current host index for the TPU system."""
+    return self._invocation_index
+
+  @property
+  def num_of_replicas_per_host(self):
+    """The number of replicas for each host."""
+    if self._internal_ctx.model_parallelism_enabled:
+      raise ValueError(
+          'num_of_replicas_per_host is not supported for model_parallelism')
+    return self._internal_ctx.num_of_replicas_per_host
+
+  @property
+  def device_assignment(self):
+    """Returns device_assignment object."""
+    if self._call_from_input_fn:
+      raise RuntimeError('This TPUContext instance must not be called from'
+                         ' input_fn.')
+    return self._internal_ctx.device_assignment
+
+  def device_for_replica(self, replica_id):
+    """Returns the tuple of (CPU device and device ordinal) for replica.
+
+    This should be used for full replicate for non-model-parallelism.
+
+    Args:
+       replica_id: Int, the replica index.
+
+    Returns:
+       A tuple of device spec for CPU device and int device ordinal.
+    """
+    # Note that: For the non-model parallelism, the mapping could be
+    # a random permutation. The order should not matter in most cases
+    # as far as model is replicated to all cores in the system.
+    return self._internal_ctx.device_for_replica(replica_id)
+
+  @property
+  def tpu_host_placement_function(self):
+    """Returns the TPU host place function.
+
+    The place function takes host_id as the input and returns the TF device
+    for the correspoding host.
+    """
+
+    def _placement_function(host_id):
+      """Return the host device given host_id."""
+      return self._internal_ctx.tpu_host_placement_function(host_id=host_id)
+
+    return _placement_function
+
+
+class _InternalTPUContext(object):
+  """A context holds immutable states of TPU computation.
+
+  This immutable object holds TPUEstimator config, train/eval batch size, and
+  `TPUEstimator.use_tpu`, which is expected to be passed around. It also
+  provides utility functions, based on the current state, to determine other
+  information commonly required by TPU computation, such as TPU device names,
+  TPU hosts, shard batch size, etc.
+
+  if eval_on_tpu is False, then execution of eval on TPU is disabled.
+  if eval_on_tpu is True, but use_tpu is False, a warning is issued,
+  and TPU execution is disabled for all modes.
+
+  N.B. As `mode` is not immutable state in Estimator, but essential to
+  distinguish between TPU training and evaluation, a common usage for
+  _InternalTPUContext with `mode` is as follows:
+  ```
+  with _ctx.with_mode(mode) as ctx:
+    if ctx.is_running_on_cpu():
+       ...
+  ```
+  """
+
+  def __init__(self,
+               config,
+               train_batch_size,
+               eval_batch_size,
+               predict_batch_size,
+               use_tpu,
+               eval_on_tpu=True,
+               embedding_config_spec=None):
+    self._config = config
+    self._train_batch_size = train_batch_size
+    self._eval_batch_size = eval_batch_size
+    self._predict_batch_size = predict_batch_size
+    self._use_tpu = use_tpu
+    logging.info('_TPUContext: eval_on_tpu %s', eval_on_tpu)
+    if not use_tpu and eval_on_tpu:
+      logging.warning('eval_on_tpu ignored because use_tpu is False.')
+
+    self._eval_on_tpu = eval_on_tpu
+    self._model_parallelism_enabled = (
+        use_tpu and config.tpu_config.num_cores_per_replica)
+    self._mode = None
+    num_cores_per_replica = config.tpu_config.num_cores_per_replica
+    if self._model_parallelism_enabled:
+      self._computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
+          num_cores_per_replica]
+    else:
+      self._computation_shape = None
+    self._lazy_tpu_system_metadata_dict = {}  # key by master address
+    self._lazy_device_assignment_dict = {}  # key by master address
+    self._lazy_validation_dict = {}  # key by ModeKeys
+    self._embedding_config_spec = embedding_config_spec
+    self._lazy_embedding_config_dict = {}  # key by master address
+
+  def _assert_mode(self):
+    if self._mode is None:
+      raise RuntimeError(
+          '`mode` needs to be set via contextmanager `with_mode`.')
+    return self._mode
+
+  @contextmanager
+  def with_mode(self, mode):
+    # NOTE(xiejw): Shallow copy is enough. It will share he lazy dictionaries,
+    # such as _lazy_tpu_system_metadata_dict between new copy and the original
+    # one. Note that all lazy states stored in properties _lazy_foo are sort of
+    # immutable as they should be same for the process lifetime.
+    new_ctx = copy.copy(self)
+    new_ctx._mode = mode  # pylint: disable=protected-access
+    yield new_ctx
+
+  @property
+  def mode(self):
+    return self._assert_mode()
+
+  def _get_master_address(self):
+    mode = self._assert_mode()
+    config = self._config
+    master = (
+        config.master
+        if mode != model_fn_lib.ModeKeys.EVAL else config.evaluation_master)
+    return master
+
+  def _get_tpu_system_metadata(self):
+    """Gets the (maybe cached) TPU system metadata."""
+    master = self._get_master_address()
+    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
+    if tpu_system_metadata is not None:
+      return tpu_system_metadata
+
+    cluster_def = None
+    if (self._config.session_config and
+        self._config.session_config.cluster_def.job):
+      cluster_def = self._config.session_config.cluster_def
+
+    # pylint: disable=protected-access
+    tpu_system_metadata = (
+        tpu_system_metadata_lib._query_tpu_system_metadata(
+            master,
+            cluster_def=cluster_def,
+            query_topology=self.model_parallelism_enabled))
+
+    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
+    return tpu_system_metadata
+
+  def _get_device_assignment(self):
+    """Gets the (maybe cached) TPU device assignment."""
+    master = self._get_master_address()
+    device_assignment = self._lazy_device_assignment_dict.get(master)
+    if device_assignment is not None:
+      return device_assignment
+
+    tpu_system_metadata = self._get_tpu_system_metadata()
+
+    device_assignment = tpu_device_assignment.device_assignment(
+        tpu_system_metadata.topology,
+        computation_shape=self._computation_shape,
+        num_replicas=self.num_replicas)
+
+    logging.info('num_cores_per_replica: %s',
+                 str(self._config.tpu_config.num_cores_per_replica))
+    logging.info('computation_shape: %s', str(self._computation_shape))
+    logging.info('num_replicas: %d', self.num_replicas)
+    logging.info('device_assignment.topology.device_coordinates: %s',
+                 str(device_assignment.topology.device_coordinates))
+    logging.info('device_assignment.core_assignment: %s',
+                 str(device_assignment.core_assignment))
+
+    self._lazy_device_assignment_dict[master] = device_assignment
+    return device_assignment
+
+  @property
+  def embedding_config(self):
+    """Returns the embedding config based on current mode."""
+    master = self._get_master_address()
+    if master in self._lazy_embedding_config_dict:
+      embedding_config = self._lazy_embedding_config_dict[master]
+    else:
+      embedding_config = None
+      if self._use_tpu and self._embedding_config_spec:
+        embedding_config = _tpu_estimator_embedding.EmbeddingConfig(
+            self._embedding_config_spec, self._train_batch_size,
+            self._eval_batch_size, self.num_hosts, self.num_cores, master)
+        if not embedding_config.has_embedding_tables():
+          embedding_config = None
+      self._lazy_embedding_config_dict[master] = embedding_config
+
+    if embedding_config is not None:
+      mode = self._assert_mode()
+      # Dynamically attach tpu_embedding based on mode. With
+      # this, we could keep embedding_config immutable but call site always
+      # accesses the unified API '.tpu_embedding'.
+      embedding_config.tpu_embedding = embedding_config.get_tpu_embedding(mode)
+    return embedding_config
+
+  @property
+  def model_parallelism_enabled(self):
+    return self._model_parallelism_enabled
+
+  @property
+  def input_partition_dims(self):
+    return self._config.tpu_config.input_partition_dims
+
+  @property
+  def device_assignment(self):
+    return (self._get_device_assignment()
+            if self._model_parallelism_enabled else None)
+
+  @property
+  def num_of_cores_per_host(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_of_cores_per_host
+
+  @property
+  def num_cores(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_cores
+
+  @property
+  def num_of_replicas_per_host(self):
+    """Return the number of replicas per host."""
+    if self.model_parallelism_enabled:
+      return self.num_replicas // self.num_hosts
+    else:
+      return self.num_of_cores_per_host
+
+  @property
+  def num_replicas(self):
+    num_cores_in_system = self.num_cores
+
+    if self.model_parallelism_enabled:
+      num_cores_per_replica = self._config.tpu_config.num_cores_per_replica
+      if num_cores_per_replica > num_cores_in_system:
+        raise ValueError(
+            'The num of cores required by the model parallelism, specified by '
+            'TPUConfig.num_cores_per_replica, is larger than the total num of '
+            'TPU cores in the system. num_cores_per_replica: {}, num cores '
+            'in the system: {}'.format(num_cores_per_replica,
+                                       num_cores_in_system))
+
+      if num_cores_in_system % num_cores_per_replica != 0:
+        raise RuntimeError(
+            'The num of cores in the system ({}) is not divisible by the num '
+            'of cores ({}) required by the model parallelism, specified by '
+            'TPUConfig.num_cores_per_replica. This should never happen!'.format(
+                num_cores_in_system, num_cores_per_replica))
+
+      return num_cores_in_system // num_cores_per_replica
+    else:
+      return num_cores_in_system
+
+  @property
+  def num_hosts(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_hosts
+
+  @property
+  def config(self):
+    return self._config
+
+  def is_input_sharded_per_core(self):
+    """Return true if input_fn is invoked per-core (other than per-host)."""
+    mode = self._assert_mode()
+    return (mode == model_fn_lib.ModeKeys.TRAIN and
+            (self._config.tpu_config.per_host_input_for_training is
+             tpu_config.InputPipelineConfig.PER_SHARD_V1))
+
+  def is_input_per_host_with_iterators(self):
+    """Return true if input_fn should be run in the per-host v2 config."""
+    return (self._config.tpu_config.per_host_input_for_training is
+            tpu_config.InputPipelineConfig.PER_HOST_V2)
+
+  def is_input_broadcast_with_iterators(self):
+    """Return true if input_fn should be run in the full_replicae config."""
+    return (self._config.tpu_config.per_host_input_for_training is
+            tpu_config.InputPipelineConfig.BROADCAST)
+
+  def is_running_on_cpu(self, is_export_mode=False):
+    """Determines whether the input_fn and model_fn should be invoked on CPU.
+
+    This API also validates user provided configuration, such as batch size,
+    according the lazy initialized TPU system metadata.
+
+    Args:
+      is_export_mode: Indicates whether the current mode is for exporting the
+        model, when mode == PREDICT. Only with this bool, we could
+        tell whether user is calling the Estimator.predict or
+        Estimator.export_savedmodel, which are running on TPU and CPU
+        respectively. Parent class Estimator does not distinguish these two.
+
+    Returns:
+      bool, whether current input_fn or model_fn should be running on CPU.
+
+    Raises:
+      ValueError: any configuration is invalid.
+    """
+
+    is_running_on_cpu = self._is_running_on_cpu(is_export_mode)
+    if not is_running_on_cpu:
+      self._validate_tpu_configuration()
+    return is_running_on_cpu
+
+  def _is_running_on_cpu(self, is_export_mode):
+    """Determines whether the input_fn and model_fn should be invoked on CPU."""
+    mode = self._assert_mode()
+
+    if not self._use_tpu:
+      return True
+
+    if mode == model_fn_lib.ModeKeys.EVAL and not self._eval_on_tpu:
+      logging.info('_is_running_on_cpu: eval_on_tpu disabled')
+      return True
+
+    if is_export_mode:
+      return True
+
+    return False
+
+  @property
+  def global_batch_size(self):
+    mode = self._assert_mode()
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      return self._train_batch_size
+    elif mode == model_fn_lib.ModeKeys.EVAL:
+      return self._eval_batch_size
+    elif mode == model_fn_lib.ModeKeys.PREDICT:
+      return self._predict_batch_size
+    else:
+      return None
+
+  @property
+  def batch_size_for_input_fn(self):
+    """Returns the shard batch size for `input_fn`."""
+    global_batch_size = self.global_batch_size
+
+    if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()):
+      return global_batch_size
+
+    # On TPU
+    if self.is_input_sharded_per_core() or (
+        self.is_input_per_host_with_iterators()):
+      return global_batch_size // self.num_replicas
+    else:
+      return global_batch_size // self.num_hosts
+
+  @property
+  def batch_size_for_model_fn(self):
+    """Returns the shard batch size for `model_fn`."""
+    global_batch_size = self.global_batch_size
+
+    if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()):
+      return global_batch_size
+
+    # On TPU. always sharded per shard.
+    return global_batch_size // self.num_replicas
+
+  @property
+  def master_job(self):
+    """Returns the job name to use to place TPU computations on.
+
+    Returns:
+      A string containing the job name, or None if no job should be specified.
+
+    Raises:
+      ValueError: If the user needs to specify a tpu_job_name, because we are
+        unable to infer the job name automatically, or if the user-specified job
+        names are inappropriate.
+    """
+    run_config = self._config
+    # If the user specifies the tpu_job_name, use that.
+    if run_config.tpu_config.tpu_job_name:
+      return run_config.tpu_config.tpu_job_name
+
+    # The tpu job is determined by the run_config. Right now, this method is
+    # required as tpu_config is not part of the RunConfig.
+    mode = self._assert_mode()
+    master = (
+        run_config.evaluation_master
+        if mode == model_fn_lib.ModeKeys.EVAL else run_config.master)
+    if master in _LOCAL_MASTERS:
+      return None
+
+    if (not run_config.session_config or
+        not run_config.session_config.cluster_def.job):
+      return _DEFAULT_JOB_NAME
+    cluster_def = run_config.session_config.cluster_def
+    job_names = set([job.name for job in cluster_def.job])
+    if _DEFAULT_JOB_NAME in job_names:
+      # b/37868888 tracks allowing ClusterSpec propagation to reuse job names.
+      raise ValueError('Currently, tpu_worker is not an allowed job name.')
+    if len(job_names) == 1:
+      return cluster_def.job[0].name
+    if len(job_names) == 2:
+      if _DEFAULT_COORDINATOR_JOB_NAME in job_names:
+        job_names.remove(_DEFAULT_COORDINATOR_JOB_NAME)
+        return job_names.pop()
+      # TODO(b/67716447): Include more sophisticated heuristics.
+    raise ValueError(
+        'Could not infer TPU job name. Please specify a tpu_job_name as part '
+        'of your TPUConfig.')
+
+  @property
+  def tpu_host_placement_function(self):
+    """Returns the TPU host place function."""
+
+    master = self.master_job
+
+    def _placement_function(_sentinal=None, replica_id=None, host_id=None):  # pylint: disable=invalid-name
+      """Return the host device given replica_id or host_id."""
+      assert _sentinal is None
+      if replica_id is not None and host_id is not None:
+        raise RuntimeError(
+            'replica_id and host_id can have only one non-None value.')
+
+      if master is None:
+        return '/replica:0/task:0/device:CPU:0'
+      else:
+        if replica_id is not None:
+          if self.model_parallelism_enabled:
+            return self.device_assignment.host_device(
+                replica=replica_id, job=master)
+          else:
+            host_id = replica_id / self.num_of_cores_per_host
+
+        return '/job:%s/task:%d/device:CPU:0' % (master, host_id)
+
+    return _placement_function
+
+  @property
+  def tpu_device_placement_function(self):
+    """Returns a TPU device placement Fn."""
+    master = self.master_job
+    job_device = '' if master is None else ('/job:%s' % master)
+
+    def _placement_function(i):
+      if self.model_parallelism_enabled:
+        return self.device_assignment.tpu_device(replica=i, job=master)
+      else:
+        num_of_cores_per_host = self.num_of_cores_per_host
+        host_id = i / num_of_cores_per_host
+        ordinal_id = i % num_of_cores_per_host
+        return '%s/task:%d/device:TPU:%d' % (job_device, host_id, ordinal_id)
+
+    return _placement_function
+
+  def tpu_ordinal_function(self, host_id):
+    """Returns the TPU ordinal fn."""
+
+    def _tpu_ordinal_function(shard_index_in_host):
+      """Return the TPU ordinal associated with a shard.
+
+      Required because the enqueue ops are placed on CPU.
+
+      Args:
+        shard_index_in_host: the shard index
+
+      Returns:
+        The ordinal of the TPU device the shard's infeed should be placed on.
+      """
+      if self.model_parallelism_enabled:
+        # We put both enqueue/dequeue ops at tpu.core(0) in each replica.
+        replica = self.device_assignment.lookup_replicas(host_id,
+                                                         0)[shard_index_in_host]
+        return self.device_assignment.tpu_ordinal(replica=replica)
+      else:
+        return shard_index_in_host % self.num_of_cores_per_host
+
+    return _tpu_ordinal_function
+
+  def _validate_tpu_configuration(self):
+    """Validates the configuration based on the TPU system metadata."""
+    mode = self._assert_mode()
+    if self._lazy_validation_dict.get(mode):
+      return
+
+    # All following information is obtained from TPU system metadata.
+    num_cores = self.num_cores
+    num_replicas = self.num_replicas
+    num_hosts = self.num_hosts
+
+    if not num_cores:
+      tpu_system_metadata = self._get_tpu_system_metadata()
+      raise RuntimeError(
+          'Cannot find any TPU cores in the system. Please double check '
+          'Tensorflow master address and TPU worker(s). Available devices '
+          'are {}.'.format(tpu_system_metadata.devices))
+
+    if self._config.tpu_config.num_shards:
+      user_provided_num_replicas = self._config.tpu_config.num_shards
+      if user_provided_num_replicas != num_replicas:
+        message = (
+            'TPUConfig.num_shards is not set correctly. According to TPU '
+            'system metadata for Tensorflow master ({}): num_replicas should '
+            'be ({}), got ({}). For non-model-parallelism, num_replicas should '
+            'be the total num of TPU cores in the system. For '
+            'model-parallelism, the total number of TPU cores should be '
+            'num_cores_per_replica * num_replicas. Please set it '
+            'accordingly or leave it as `None`'.format(
+                self._get_master_address(), num_replicas,
+                user_provided_num_replicas))
+
+        raise ValueError(message)
+
+    if self._config.tpu_config.num_cores_per_replica:
+      num_cores_per_replica = self._config.tpu_config.num_cores_per_replica
+      num_cores_per_host = self._get_tpu_system_metadata().num_of_cores_per_host
+      if num_cores_per_replica > num_cores_per_host:
+        raise ValueError(
+            'The num of cores required by the model parallelism, specified by '
+            'TPUConfig.num_cores_per_replica, is larger than the '
+            'num_cores_per_host. num_cores_per_replica: {}, '
+            'num_cores_per_host: {}'.format(num_cores_per_replica,
+                                            num_cores_per_host))
+
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      if (self._train_batch_size % num_replicas != 0 and
+          not self.is_input_broadcast_with_iterators()):
+        raise ValueError(
+            'train batch size {} must be divisible by number of replicas {}'
+            .format(self._train_batch_size, num_replicas))
+
+    elif mode == model_fn_lib.ModeKeys.EVAL:
+      if self._eval_batch_size is None:
+        raise ValueError(
+            'eval_batch_size in TPUEstimator constructor cannot be `None`'
+            'if .evaluate is running on TPU.')
+      if (self._eval_batch_size % num_replicas != 0 and
+          not self.is_input_broadcast_with_iterators()):
+        raise ValueError(
+            'eval batch size {} must be divisible by number of replicas {}'
+            .format(self._eval_batch_size, num_replicas))
+      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
+        raise ValueError(
+            'TPUEstimator.evaluate should be running on single TPU'
+            ' instead of a Pod.')
+    else:
+      assert mode == model_fn_lib.ModeKeys.PREDICT
+      if self._predict_batch_size is None:
+        raise ValueError(
+            'predict_batch_size in TPUEstimator constructor should not be '
+            '`None` if .predict is running on TPU.')
+      if (self._predict_batch_size % num_replicas != 0 and
+          not self.is_input_broadcast_with_iterators()):
+        raise ValueError(
+            'predict batch size {} must be divisible by number of replicas {}'
+            .format(self._predict_batch_size, num_replicas))
+      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
+        raise ValueError(
+            'TPUEstimator.predict should be running on single TPU worker. '
+            'got {}.'.format(num_hosts))
+
+    # Record the state "validated" into lazy dictionary.
+    self._lazy_validation_dict[mode] = True
+
+  def device_for_replica(self, replica_id):
+    """Returns the tuple of (CPU device and device ordinal) for replica.
+
+    This should be used for full replicate for non-model-parallelism.
+
+    Args:
+       replica_id: Int, the replica index.
+
+    Returns:
+       A tuple of device spec for CPU device and int device ordinal.
+    """
+    master = self.master_job
+
+    if self.model_parallelism_enabled:
+      return (self.device_assignment.host_device(
+          replica=replica_id, job=master),
+              self.device_assignment.tpu_ordinal(replica=replica_id))
+
+    job_device = '' if master is None else ('/job:%s' % master)
+
+    num_of_replicas_per_host = self.num_of_replicas_per_host
+    host_id = replica_id / num_of_replicas_per_host
+    ordinal_id = replica_id % num_of_replicas_per_host
+
+    host_device = '%s/task:%d/device:CPU:0' % (job_device, host_id)
+    return (host_device, ordinal_id)
+
+
+class _OneCoreTPUContext(_InternalTPUContext):
+  """Special _InternalTPUContext for one core usage."""
+
+  def __init__(self, config, train_batch_size, eval_batch_size,
+               predict_batch_size, use_tpu):
+
+    super(_OneCoreTPUContext, self).__init__(
+        config, train_batch_size, eval_batch_size,
+        predict_batch_size, use_tpu)
+
+  def _get_tpu_system_metadata(self):
+    """Gets the (maybe cached) TPU system metadata."""
+    master = self._get_master_address()
+    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
+    if tpu_system_metadata is not None:
+      return tpu_system_metadata
+
+    tpu_system_metadata = (
+        tpu_system_metadata_lib._TPUSystemMetadata(  # pylint: disable=protected-access
+            num_cores=1,
+            num_hosts=1,
+            num_of_cores_per_host=1,
+            topology=None,
+            devices=[]))
+
+    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
+    return tpu_system_metadata
+
+
+def _get_tpu_context(config, train_batch_size, eval_batch_size,
+                     predict_batch_size, use_tpu, eval_on_tpu,
+                     embedding_config_spec):
+  """Returns an instance of `_InternalTPUContext`."""
+
+  if (config.tpu_config.num_shards == 1 and
+      config.tpu_config.num_cores_per_replica is None):
+    if embedding_config_spec is not None:
+      raise ValueError('Setting TPUConfig.num_shards==1 is unsupported '
+                       'when embedding_config_spec is not None.')
+    logging.warning(
+        'Setting TPUConfig.num_shards==1 is an unsupported behavior. '
+        'Please fix as soon as possible (leaving num_shards as None.)')
+    return _OneCoreTPUContext(config, train_batch_size, eval_batch_size,
+                              predict_batch_size, use_tpu)
+
+  return _InternalTPUContext(config, train_batch_size, eval_batch_size,
+                             predict_batch_size, use_tpu, eval_on_tpu,
+                             embedding_config_spec)
diff --git a/tensorflow/python/tpu/tpu_embedding.py b/tensorflow/python/tpu/tpu_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7f17395f11899450968ec08c8440b10b5386e45
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_embedding.py
@@ -0,0 +1,1102 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TPU embedding APIs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+import math
+import re
+import six
+
+from tensorflow.core.protobuf.tpu import optimization_parameters_pb2
+from tensorflow.core.protobuf.tpu import tpu_embedding_configuration_pb2 as elc
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
+from tensorflow.python.tpu.ops import tpu_ops
+
+TRAINING = elc.TPUEmbeddingConfiguration.TRAINING
+INFERENCE = elc.TPUEmbeddingConfiguration.INFERENCE
+
+
+class TableConfig(
+    collections.namedtuple(
+        'TableConfig',
+        ['vocabulary_size', 'dimension', 'initializer', 'combiner'])):
+  """Embedding table configuration."""
+
+  def __new__(cls,
+              vocabulary_size,
+              dimension,
+              initializer=None,
+              combiner='mean'):
+    """Embedding table configuration.
+
+    Args:
+      vocabulary_size: Number of vocabulary (/rows) in the table.
+      dimension: The embedding dimension.
+      initializer: A variable initializer function to be used in embedding
+        variable initialization. If not specified, defaults to
+        `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+        `1/sqrt(dimension)`.
+      combiner: A string specifying how to reduce if there are multiple entries
+        in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
+        'mean' the default. 'sqrtn' often achieves good accuracy, in particular
+        with bag-of-words columns. For more information, see
+        `tf.nn.embedding_lookup_sparse`.
+
+    Returns:
+      `TableConfig`.
+
+    Raises:
+      ValueError: if `vocabulary_size` is not positive integer.
+      ValueError: if `dimension` is not positive integer.
+      ValueError: if `initializer` is specified and is not callable.
+      ValueError: if `combiner` is not supported.
+    """
+    if not isinstance(vocabulary_size, int) or vocabulary_size < 1:
+      raise ValueError('Invalid vocabulary_size {}.'.format(vocabulary_size))
+
+    if not isinstance(dimension, int) or dimension < 1:
+      raise ValueError('Invalid dimension {}.'.format(dimension))
+
+    if (initializer is not None) and (not callable(initializer)):
+      raise ValueError('initializer must be callable if specified.')
+    if initializer is None:
+      initializer = init_ops.truncated_normal_initializer(
+          mean=0.0, stddev=1 / math.sqrt(dimension))
+
+    if combiner not in ('mean', 'sum', 'sqrtn'):
+      raise ValueError('Invalid combiner {}'.format(combiner))
+
+    return super(TableConfig, cls).__new__(cls, vocabulary_size, dimension,
+                                           initializer, combiner)
+
+
+AdamSlotVariableNames = collections.namedtuple(
+    'AdamSlotVariableNames', ['m', 'v'])
+
+AdagradSlotVariableName = collections.namedtuple(
+    'AdagradSlotVariableName', ['accumulator'])
+
+AdamSlotVariables = collections.namedtuple(
+    'AdamSlotVariables', ['m', 'v'])
+
+AdagradSlotVariable = collections.namedtuple(
+    'AdagradSlotVariable', ['accumulator'])
+
+VariablesAndOps = collections.namedtuple(
+    'VariablesAndOps',
+    ['embedding_variables_by_table', 'slot_variables_by_table',
+     'load_ops', 'retrieve_ops']
+)
+
+
+# TODO(shizhiw): Factor `use_gradient_accumulation` and
+# `pipeline_execution_with_tensor_core` out of `_OptimizationParameters`.
+class _OptimizationParameters(object):
+  """Parameters common to all optimizations."""
+
+  def __init__(self, learning_rate, use_gradient_accumulation,
+               pipeline_execution_with_tensor_core):
+    self.learning_rate = learning_rate
+    self.use_gradient_accumulation = use_gradient_accumulation
+    self.pipeline_execution_with_tensor_core = (
+        pipeline_execution_with_tensor_core)
+
+
+class AdagradParameters(_OptimizationParameters):
+  """Optimization parameters for Adagrad."""
+
+  def __init__(self, learning_rate, initial_accumulator,
+               use_gradient_accumulation=False,
+               pipeline_execution_with_tensor_core=True):
+    """Optimization parameters for Adagrad.
+
+    Args:
+      learning_rate: used for updating embedding table.
+      initial_accumulator: initial accumulator for Adagrad.
+      use_gradient_accumulation: setting this to `True` makes embedding
+         gradients calculation more accurate but slower. Please see
+         `optimization_parameters.proto` for details.
+         for details.
+      pipeline_execution_with_tensor_core: setting this to `True` makes training
+        faster, but trained model will be different if step N and step N+1
+        involve the same set of embedding ID. Please see
+        `tpu_embedding_configuration.proto` for details.
+    """
+    super(AdagradParameters, self).__init__(learning_rate,
+                                            use_gradient_accumulation,
+                                            pipeline_execution_with_tensor_core)
+    self.initial_accumulator = initial_accumulator
+
+
+class AdamParameters(_OptimizationParameters):
+  """Optimization parameters for Adam."""
+
+  def __init__(self, learning_rate,
+               beta1=0.9,
+               beta2=0.999,
+               epsilon=1e-08,
+               lazy_adam=True,
+               sum_inside_sqrt=True,
+               use_gradient_accumulation=False,
+               pipeline_execution_with_tensor_core=True):
+    """Optimization parameters for Adam.
+
+    Args:
+      learning_rate: a floating point value. The learning rate.
+      beta1: A float value.
+        The exponential decay rate for the 1st moment estimates.
+      beta2: A float value.
+        The exponential decay rate for the 2nd moment estimates.
+      epsilon: A small constant for numerical stability.
+      lazy_adam: Use lazy Adam instead of Adam. Lazy Adam trains faster.
+        Please see `optimization_parameters.proto` for details.
+      sum_inside_sqrt: This improves training speed. Please see
+        `optimization_parameters.proto` for details.
+      use_gradient_accumulation: setting this to `True` makes embedding
+        gradients calculation more accurate but slower. Please see
+        `optimization_parameters.proto` for details.
+        for details.
+      pipeline_execution_with_tensor_core: setting this to `True` makes training
+        faster, but trained model will be different if step N and step N+1
+        involve the same set of embedding ID. Please see
+        `tpu_embedding_configuration.proto` for details.
+    """
+    super(AdamParameters, self).__init__(learning_rate,
+                                         use_gradient_accumulation,
+                                         pipeline_execution_with_tensor_core)
+    self.beta1 = beta1
+    self.beta2 = beta2
+    self.epsilon = epsilon
+    self.lazy_adam = lazy_adam
+    self.sum_inside_sqrt = sum_inside_sqrt
+
+
+class StochasticGradientDescentParameters(_OptimizationParameters):
+  """Optimization parameters for stochastic gradient descent.
+
+  Args:
+    learning_rate: a floating point value. The learning rate.
+    use_gradient_accumulation: setting this to `True` makes embedding
+      gradients calculation more accurate but slower. Please see
+         `optimization_parameters.proto` for details.
+    pipeline_execution_with_tensor_core: setting this to `True` makes training
+      faster, but trained model will be different if step N and step N+1
+      involve the same set of embedding ID. Please see
+      `tpu_embedding_configuration.proto` for details.
+    """
+
+  def __init__(self, learning_rate, use_gradient_accumulation=False,
+               pipeline_execution_with_tensor_core=True):
+    super(StochasticGradientDescentParameters, self).__init__(
+        learning_rate, use_gradient_accumulation,
+        pipeline_execution_with_tensor_core)
+
+
+class TPUEmbedding(object):
+  """API for using TPU for embedding.
+
+    Example:
+    ```
+    table_config_user = tpu_embedding.TableConfig(
+        vocabulary_size=4, dimension=2,
+        initializer=initializer, combiner='mean')
+    table_to_config_dict = {'video': table_config_video,
+                          'user': table_config_user}
+    feature_to_table_dict = {'watched': 'video',
+                             'favorited': 'video',
+                             'friends': 'user'}
+    batch_size = 4
+    num_hosts = 1
+    optimization_parameters = tpu_embedding.AdagradParameters(1., 1.)
+    mode = tpu_embedding.TRAINING
+    embedding = tpu_embedding.TPUEmbedding(
+        table_to_config_dict, feature_to_table_dict,
+        batch_size, num_hosts, mode, optimization_parameters)
+
+    batch_size_per_core = embedding.batch_size_per_core
+    sparse_features_list = []
+    for host in hosts:
+      with ops.device(host):
+        for _ in range(embedding.num_cores_per_host):
+          sparse_features = {}
+          sparse_features['watched'] = sparse_tensor.SparseTensor(...)
+          sparse_features['favorited'] = sparse_tensor.SparseTensor(...)
+          sparse_features['friends'] = sparse_tensor.SparseTensor(...)
+          sparse_features_list.append(sparse_features)
+
+    enqueue_ops = embedding.generate_enqueue_ops(sparse_features_list)
+    embedding_variables_and_ops = embedding.create_variables_and_ops()
+
+    def computation():
+      activations = embedding.get_activations()
+      loss = compute_loss(activations)
+
+      base_optimizer = gradient_descent.GradientDescentOptimizer(
+          learning_rate=1)
+      cross_shard_optimizer = tpu_optimizer.CrossShardOptimizer(
+          base_optimizer)
+
+      train_op = cross_shard_optimizer.minimize(loss)
+      gradients = (
+          tpu_embedding_gradient.get_gradients_through_compute_gradients(
+              cross_shard_optimizer, loss, activations)
+      send_gradients_op = embedding.generate_send_gradients_op(gradients)
+      with ops.control_dependencies([train_op, send_gradients_op]):
+        loss = array_ops.identity(loss)
+
+    loss = tpu.shard(computation,
+                     num_shards=embedding.num_cores)
+
+    with self.test_session() as sess:
+      sess.run(tpu.initialize_system(embedding_config=
+                                     embedding.config_proto))
+      sess.run(variables.global_variables_initializer())
+      sess.run(embedding_variables_and_ops.load_ops())
+      sess.run(enqueue_ops)
+      loss_val = sess.run(loss)
+    ```
+  """
+
+  # TODO(shizhiw): Instead of `feature_to_table_dict` which maps to table
+  # name, consider `feature_to_config_dict` which maps to `FeatureConfig`.
+  # `FeatureConfig` could have fields other than table name. For example, it
+  # could have a field to indicate that the feature should not be used to
+  # update embedding table (cr/204852758, cr/204940540). Also, this can support
+  # different combiners for different features within the same table.
+  # TODO(shizhiw, b/118512626): Remove `batch_size` from `__init__` and move it
+  # to `FeatureConfig`?
+
+  # TODO(shizhiw): will it be cleaner to make `table_to_config_dict` and
+  # `feature_to_table_dict` lists of `TableSpec` and `FeatureSpec` respectively?
+
+  # TODO(shizhiw): Consider adding `input_fn` as an option to remove boilerplate
+  # for-loops around construction of inputs.
+
+  # `optimization_parameter` applies to all tables. If the need arises,
+  # we can add `optimization_parameters` to `TableConfig` to override this
+  # global setting.
+  def __init__(self,
+               table_to_config_dict,
+               feature_to_table_dict,
+               batch_size,
+               mode,
+               master,
+               optimization_parameters=None):
+    """API for using TPU for embedding lookups.
+
+    Args:
+      table_to_config_dict: A dictionary mapping from string of table name to
+        `TableConfig`. Table refers to an embedding table, e.g. `params`
+        argument to `tf.nn.embedding_lookup_sparse()`.
+      feature_to_table_dict: A dictionary mapping from string of feature name
+        to string of table name. Feature refers to ids to lookup in embedding
+        table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
+      batch_size: An `int` representing the global batch size.
+      mode: `TRAINING` or `INFERENCE`.
+      master: A `string` representing the TensorFlow master to use.
+      optimization_parameters: `AdagradParameters`, `AdamParameters`,
+        `Stochasticgradientdescentparameters`. Must be set in training and must
+        be `None` in inference.
+
+    Raises:
+      ValueError: if any input is invalid.
+    """
+    _validate_table_to_config_dict(table_to_config_dict)
+    # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
+    self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)
+    self._combiners = _create_combiners(self._table_to_config_dict)
+
+    _validate_feature_to_table_dict(table_to_config_dict, feature_to_table_dict)
+    self._feature_to_table_dict = _create_ordered_dict(feature_to_table_dict)
+    self._table_to_features_dict = _create_table_to_features_dict(
+        self._feature_to_table_dict)
+
+    self._batch_size = batch_size
+
+    self._master = master
+    self._tpu_system_metadata = (
+        tpu_system_metadata_lib._query_tpu_system_metadata(self._master))  # pylint: disable=protected-access
+    if self._tpu_system_metadata.num_cores == 0:
+      raise ValueError('TPUEmbedding needs TPUs, but master {} does not have '
+                       'TPUs.'.format(self._master))
+    self._num_hosts = self._tpu_system_metadata.num_hosts
+    self._hosts = [device.name for device in self._tpu_system_metadata.devices
+                   if 'device:CPU:' in device.name]
+    self._num_cores_per_host = self._tpu_system_metadata.num_of_cores_per_host
+    self._num_cores = self._tpu_system_metadata.num_cores
+
+    _validate_batch_size(self._batch_size, self._num_cores)
+    self._batch_size_per_core = self._batch_size // self._num_cores
+
+    # TODO(shizhiw): remove `mode`?
+    if mode == TRAINING:
+      _validate_optimization_parameters(optimization_parameters)
+      self._optimization_parameters = optimization_parameters
+    elif mode == INFERENCE:
+      if optimization_parameters is not None:
+        raise ValueError('`optimization_parameters` should be `None` '
+                         'for inference mode.')
+      self._optimization_parameters = (
+          StochasticGradientDescentParameters(1.))
+    else:
+      raise ValueError('`mode` only supports {} and {}; got {}.'
+                       .format(TRAINING, INFERENCE, mode))
+    self._mode = mode
+
+    # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
+    # and create special handler for inference that inherits from
+    # StochasticGradientDescentHandler with more user-friendly error message
+    # on get_slot().
+    self._optimizer_handler = _get_optimization_handler(
+        self._optimization_parameters)
+
+    self._config_proto = self._create_config_proto()
+
+  @property
+  def hosts(self):
+    """A list of device names for CPU hosts.
+
+    Returns:
+      A list of device names for CPU hosts.
+    """
+    return copy.copy(self._hosts)
+
+  # TODO(shizhiw): change to num_tensor_cores_per_host to be more explicit and
+  # to be consistent with `tpu_embedding_configuration.proto`.
+  @property
+  def num_cores_per_host(self):
+    """Number of TPU cores on a CPU host.
+
+    Returns:
+      Number of TPU cores on a CPU host.
+    """
+    return self._num_cores_per_host
+
+  @property
+  def num_cores(self):
+    """Total number of TPU cores on all hosts.
+
+    Returns:
+      Total number of TPU cores on all hosts.
+    """
+    return self._num_cores
+
+  @property
+  def batch_size_per_core(self):
+    """Batch size for each TPU core.
+
+    The sparse tensors in `sparse_features_list` to `generate_enqueue_ops`
+       must have batch dimension equal to this.
+
+    Returns:
+      Batch size for each TPU core.
+    """
+    return self._batch_size_per_core
+
+  @property
+  def config_proto(self):
+    """Create embedding config proto for `tpu.initialize_system()`.
+
+    Returns:
+      an `TPUEmbeddingConfiguration` proto describing the desired
+         configuration of the hardware embedding lookup tables, which
+         is passed to `tpu.initialize_system()`.
+    """
+    return self._config_proto
+
+  @property
+  def table_to_config_dict(self):
+    return copy.copy(self._table_to_config_dict)
+
+  @property
+  def feature_to_table_dict(self):
+    return copy.copy(self._feature_to_table_dict)
+
+  @property
+  def table_to_features_dict(self):
+    return copy.copy(self._table_to_features_dict)
+
+  @property
+  def optimization_parameters(self):
+    return self._optimization_parameters
+
+  def _create_config_proto(self):
+    """Create `TPUEmbeddingConfiguration`."""
+    config_proto = elc.TPUEmbeddingConfiguration()
+    for table in self._table_to_config_dict:
+      table_descriptor = config_proto.table_descriptor.add()
+      table_descriptor.name = table
+
+      table_config = self._table_to_config_dict[table]
+      table_descriptor.vocabulary_size = table_config.vocabulary_size
+      table_descriptor.dimension = table_config.dimension
+
+      features_for_table = self._table_to_features_dict[table]
+      table_descriptor.num_features = len(features_for_table)
+
+      table_descriptor.optimization_parameters.learning_rate.constant = (
+          self._optimization_parameters.learning_rate)
+      table_descriptor.optimization_parameters.gradient_accumulation_status = (
+          optimization_parameters_pb2.GradientAccumulationStatus.ENABLED
+          if self._optimization_parameters.use_gradient_accumulation else
+          optimization_parameters_pb2.GradientAccumulationStatus.DISABLED)
+      self._optimizer_handler.set_optimization_parameters(table_descriptor)
+
+    config_proto.mode = self._mode
+    config_proto.batch_size_per_tensor_core = self._batch_size_per_core
+    config_proto.num_hosts = self._num_hosts
+    config_proto.num_tensor_cores = self._num_cores
+    config_proto.sharding_strategy = elc.TPUEmbeddingConfiguration.DIV_DEFAULT
+    config_proto.pipeline_execution_with_tensor_core = (
+        self._optimization_parameters.pipeline_execution_with_tensor_core)
+
+    return config_proto
+
+  def create_variables_and_ops(self, embedding_variable_name_by_table=None,
+                               slot_variable_names_by_table=None):
+    """Create embedding and slot variables, with ops to load and retrieve them.
+
+    Args:
+      embedding_variable_name_by_table: A dictionary mapping from string of
+        table name to string of embedding variable name. If `None`,
+        defaults from `get_default_slot_variable_names()` will be used.
+      slot_variable_names_by_table: A dictionary mapping from string of table
+        name to `AdamSlotVariableNames`, `AdagradSlotVariableNames` etc. If
+        `None`, defaults from `get_default_slot_variable_names()` will be used.
+
+    Returns:
+      `tpu_embedding.VariablesAndOps` with:
+        A dictionary mapping from string of table name to embedding variables,
+        A dictionary mapping from string of table name to AdagradSlotVariable,
+         AdamSlotVariables etc with slot variables,
+        A function which returns a list of ops to load embedding and slot
+         variables from TPU to CPU.
+        A function which returns a list of ops to retrieve embedding and slot
+         variables from TPU to CPU.
+    """
+    embedding_variables_by_table = {}
+    slot_variables_by_table = {}
+    load_op_fns = []
+    retrieve_op_fns = []
+    for table in self._table_to_config_dict:
+      if embedding_variable_name_by_table:
+        embedding_variable_name = embedding_variable_name_by_table[table]
+      else:
+        embedding_variable_name = table
+      if slot_variable_names_by_table:
+        slot_variable_names = slot_variable_names_by_table[table]
+      else:
+        slot_variable_names = (
+            self._optimizer_handler.get_default_slot_variable_names(table))
+
+      device_fn = _create_device_fn(self._hosts)
+      with ops.device(device_fn):
+        table_variables = _create_partitioned_variables(
+            name=embedding_variable_name,
+            num_hosts=self._num_hosts,
+            vocabulary_size=self._table_to_config_dict[table].vocabulary_size,
+            embedding_dimension=self._table_to_config_dict[table].dimension,
+            initializer=self._table_to_config_dict[table].initializer,
+            collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+        embedding_variables_by_table[table] = table_variables
+
+        slot_variables_for_table, load_ops_fn, retrieve_ops_fn = (
+            self._optimizer_handler.create_variables_and_ops(
+                table, slot_variable_names, self._num_hosts,
+                self._table_to_config_dict[table], table_variables)
+        )
+        slot_variables_by_table[table] = slot_variables_for_table
+        load_op_fns.append(load_ops_fn)
+        retrieve_op_fns.append(retrieve_ops_fn)
+
+    def load_ops():
+      """Calls and returns the load ops for each embedding table.
+
+      Returns:
+        A list of ops to load embedding and slot variables from CPU to TPU.
+      """
+      load_ops_list = []
+      for load_op_fn in load_op_fns:
+        load_ops_list.extend(load_op_fn())
+      return load_ops_list
+
+    def retrieve_ops():
+      """Calls and returns the retrieve ops for each embedding table.
+
+      Returns:
+        A list of ops to retrieve embedding and slot variables from TPU to CPU.
+      """
+      retrieve_ops_list = []
+      for retrieve_op_fn in retrieve_op_fns:
+        retrieve_ops_list.extend(retrieve_op_fn())
+      return retrieve_ops_list
+
+    return VariablesAndOps(embedding_variables_by_table,
+                           slot_variables_by_table,
+                           load_ops, retrieve_ops)
+
+  def generate_enqueue_ops(self, sparse_features_list):
+    """Generate enqueue ops.
+
+    Args:
+      sparse_features_list: a list of dictionary mapping from string
+        of feature names to sparse tensor. Each dictionary is for one
+        TPU core. Dictionaries for the same core should be contiguous
+        on the list.
+
+    Returns:
+      Ops to enqueue to TPU for embedding.
+    """
+    self._validate_generate_enqueue_ops_sparse_features_list(
+        sparse_features_list)
+    return [
+        self._generate_enqueue_op(
+            sparse_features, device_ordinal=i % self._num_cores_per_host)
+        for i, sparse_features in enumerate(sparse_features_list)
+    ]
+
+  def _validate_generate_enqueue_ops_sparse_features_list(
+      self, sparse_features_list):
+    """Validate `sparse_features_list`."""
+    if len(sparse_features_list) != self._num_cores:
+      raise ValueError('Length of `sparse_features_list` should match the '
+                       'number of cores; '
+                       '`len(sparse_features_list)` is {}, '
+                       'number of cores is {}.'.format(
+                           len(sparse_features_list), self._num_cores))
+
+    feature_set = set(self._feature_to_table_dict.keys())
+    contiguous_device = None
+    for i, sparse_features in enumerate(sparse_features_list):
+      used_feature_set = set(sparse_features.keys())
+
+      # Check features are valid.
+      missing_feature_set = feature_set - used_feature_set
+      if missing_feature_set:
+        raise ValueError('`sparse_features_list[{}]` misses a feature that is '
+                         'in `feature_to_config_dict`: {}.'.format(
+                             i, missing_feature_set))
+
+      extra_feature_set = used_feature_set - feature_set
+      if extra_feature_set:
+        raise ValueError('`sparse_features_list[{}]` has a feature that is not '
+                         'in `feature_to_config_dict`: {}.'.format(
+                             i, extra_feature_set))
+
+      device = None
+      device_feature = None
+      for feature, tensor in six.iteritems(sparse_features):
+        if not isinstance(tensor, sparse_tensor.SparseTensor):
+          raise ValueError('`sparse_features_list[{}]` has a feature that is '
+                           'not mapped to `SparseTensor`. '
+                           '`feature`: {}, type: {}'.format(
+                               i, feature, type(tensor)))
+
+        # Check all features are on the same device.
+        if device is None:
+          device = tensor.op.device
+          device_feature = feature
+        else:
+          if device != tensor.op.device:
+            raise ValueError('Devices are different between features in '
+                             '`sparse_features_list[{}]`; '
+                             'devices: {}, {}; features: {}, {}.'.format(
+                                 i, device, tensor.op.device, feature,
+                                 device_feature))
+
+      if i % self._num_cores_per_host:
+        if device != contiguous_device:
+          raise ValueError('We expect the `sparse_features` which are on the '
+                           'same host to be contiguous in '
+                           '`sparse_features_list`, '
+                           '`sparse_features_list[{}]` is on device {}, '
+                           'but is expected to be on device {}.'.format(
+                               i, device, contiguous_device))
+      else:
+        contiguous_device = device
+
+  def _generate_enqueue_op(self, sparse_features, device_ordinal):
+    with ops.colocate_with(list(sparse_features.values())[0]):
+      sample_idcs, embedding_idcs, aggregation_weights = (
+          self._format_for_tpu_embedding_sparse_batch(sparse_features))
+      return tpu_ops.enqueue_tpu_embedding_sparse_batch(
+          sample_idcs,
+          embedding_idcs,
+          aggregation_weights,
+          combiners=self._combiners,
+          device_ordinal=device_ordinal)
+
+  def _format_for_tpu_embedding_sparse_batch(self, sparse_features):
+    """Format sparse features for `enqueue_tpu_embedding_sparse_batch()`.
+
+    Args:
+      sparse_features: a `Dict` of `SparseTensor`s for embedding.
+
+    Returns:
+      Arguments for `enqueue_tpu_embedding_sparse_batch()`.
+    """
+
+    sample_idcs, embedding_idcs, aggregation_weights = list(), list(), list()
+    for table in self._table_to_features_dict:
+      sample_t, indices_t, weights_t = list(), list(), list()
+
+      features = self._table_to_features_dict[table]
+      for i, feature in enumerate(features):
+        tensor = sparse_features[feature]
+        sample_indices = tensor.indices[:, 0]
+        embedding_indices = tensor.values
+        weights = array_ops.ones_like(embedding_indices)
+        sample_t.append(i * self._batch_size_per_core + sample_indices)
+        indices_t.append(embedding_indices)
+        weights_t.append(weights)
+
+      sample_idcs.append(
+          math_ops.cast(array_ops.concat(sample_t, axis=0), dtype=dtypes.int32))
+      embedding_idcs.append(
+          math_ops.cast(
+              array_ops.concat(indices_t, axis=0), dtype=dtypes.int32))
+      aggregation_weights.append(
+          math_ops.cast(
+              array_ops.concat(weights_t, axis=0), dtype=dtypes.float32))
+
+    return sample_idcs, embedding_idcs, aggregation_weights
+
+  def get_activations(self):
+    """Get activations for features.
+
+    This should be called within `computation` that is passed to
+      `tpu.replicate` and friends.
+
+    Returns:
+      A dictionary mapping from `String` of feature name to `Tensor`
+        of activation.
+    """
+    recv_activations = tpu_ops.recv_tpu_embedding_activations(
+        num_outputs=len(self._table_to_config_dict),
+        config=self._config_proto.SerializeToString())
+
+    activations = collections.OrderedDict()
+    for table_id, table in enumerate(self._table_to_features_dict):
+      features = self._table_to_features_dict[table]
+      for lookup_id, feature in enumerate(features):
+        start_row = lookup_id * self._batch_size_per_core
+        end_row = start_row + self._batch_size_per_core
+        activations[feature] = recv_activations[table_id][start_row:end_row, :]
+    return activations
+
+  def generate_send_gradients_op(self, feature_to_gradient_dict):
+    """Send gradient to TPU embedding.
+
+    Args:
+      feature_to_gradient_dict: dict mapping feature names to gradient wrt
+        activations.
+
+    Returns:
+      SendTPUEmbeddingGradients Op.
+
+    Raises:
+      RuntimeError: If `mode` is not `TRAINING`.
+    """
+    if self._mode != TRAINING:
+      raise RuntimeError('Only in training mode gradients need to '
+                         'be sent to TPU embedding; got mode {}.'
+                         .format(self._mode))
+    gradients = []
+    for table in self._table_to_features_dict:
+      features = self._table_to_features_dict[table]
+      table_gradients = [
+          feature_to_gradient_dict[feature] for feature in features
+      ]
+      concat_table_grads = array_ops.concat(table_gradients, axis=0)
+      gradients.append(concat_table_grads)
+    return tpu_ops.send_tpu_embedding_gradients(
+        inputs=gradients, config=self.config_proto.SerializeToString())
+
+
+def _validate_table_to_config_dict(table_to_config_dict):
+  """Validate `table_to_config_dict`."""
+  for k, v in six.iteritems(table_to_config_dict):
+    if not isinstance(v, TableConfig):
+      raise ValueError('Value of `table_to_config_dict` must be of type '
+                       '`TableConfig`, got {} for {}.'.format(type(v), k))
+
+
+def _validate_feature_to_table_dict(table_to_config_dict,
+                                    feature_to_table_dict):
+  """Validate `feature_to_table_dict`."""
+  used_table_set = set(feature_to_table_dict.values())
+  table_set = set(table_to_config_dict.keys())
+
+  unused_table_set = table_set - used_table_set
+  if unused_table_set:
+    raise ValueError('`table_to_config_dict` specifies table that is not '
+                     'used in `feature_to_table_dict`: {}.'
+                     .format(unused_table_set))
+
+  extra_table_set = used_table_set - table_set
+  if extra_table_set:
+    raise ValueError('`feature_to_table_dict` refers to a table that is not '
+                     'specified in `table_to_config_dict`: {}.'
+                     .format(extra_table_set))
+
+
+def _validate_batch_size(batch_size, num_cores):
+  if batch_size % num_cores:
+    raise ValueError('`batch_size` is not a multiple of number of '
+                     'cores. `batch_size`={}, `_num_cores`={}.'.format(
+                         batch_size, num_cores))
+
+
+def _validate_optimization_parameters(optimization_parameters):
+  if not isinstance(optimization_parameters, _OptimizationParameters):
+    raise ValueError('`optimization_parameters` must inherit from '
+                     '`_OptimizationPramaters`. '
+                     '`type(optimization_parameters)`={}'.format(
+                         type(optimization_parameters)))
+
+
+class _OptimizerHandler(object):
+  """Interface class for handling optimizer specific logic."""
+
+  def __init__(self, optimization_parameters):
+    self._optimization_parameters = optimization_parameters
+
+  def set_optimization_parameters(self, table_descriptor):
+    raise NotImplementedError()
+
+  def get_default_slot_variable_names(self, table):
+    raise NotImplementedError()
+
+  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
+                               table_config, table_variables):
+    raise NotImplementedError()
+
+
+class _AdagradHandler(_OptimizerHandler):
+  """Handles Adagrad specific logic."""
+
+  def __init__(self, optimization_parameters):
+    super(_AdagradHandler, self).__init__(optimization_parameters)
+    self._table_to_accumulator_variables_dict = {}
+
+  def set_optimization_parameters(self, table_descriptor):
+    table_descriptor.optimization_parameters.adagrad.SetInParent()
+
+  def get_default_slot_variable_names(self, table):
+    return AdagradSlotVariableName('{}/{}'.format(table, 'Adagrad'))
+
+  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
+                               table_config, table_variables):
+    accumulator_initializer = init_ops.constant_initializer(
+        self._optimization_parameters.initial_accumulator)
+    accumulator_variables = _create_partitioned_variables(
+        name=slot_variable_names.accumulator,
+        num_hosts=num_hosts,
+        vocabulary_size=table_config.vocabulary_size,
+        embedding_dimension=table_config.dimension,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+        initializer=accumulator_initializer)
+    slot_variables = AdagradSlotVariable(accumulator_variables)
+
+    def load_ops_fn():
+      """Returns the retrieve ops for AdaGrad embedding tables.
+
+      Returns:
+        A list of ops to load embedding and slot variables from CPU to TPU.
+      """
+      load_op_list = []
+      for host_id, table_variable, accumulator_variable in (zip(
+          range(num_hosts), table_variables, accumulator_variables)):
+        with ops.colocate_with(table_variable):
+          load_parameters_op = (
+              tpu_ops.load_tpu_embedding_adagrad_parameters(
+                  parameters=table_variable,
+                  accumulators=accumulator_variable,
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+        load_op_list.append(load_parameters_op)
+      return load_op_list
+
+    def retrieve_ops_fn():
+      """Returns the retrieve ops for AdaGrad embedding tables.
+
+      Returns:
+        A list of ops to retrieve embedding and slot variables from TPU to CPU.
+      """
+      retrieve_op_list = []
+      for host_id, table_variable, accumulator_variable in (zip(
+          range(num_hosts), table_variables, accumulator_variables)):
+        with ops.colocate_with(table_variable):
+          retrieved_table, retrieved_accumulator = (
+              tpu_ops.retrieve_tpu_embedding_adagrad_parameters(
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+          retrieve_parameters_op = control_flow_ops.group(
+              state_ops.assign(table_variable, retrieved_table),
+              state_ops.assign(accumulator_variable, retrieved_accumulator))
+        retrieve_op_list.append(retrieve_parameters_op)
+      return retrieve_op_list
+
+    return slot_variables, load_ops_fn, retrieve_ops_fn
+
+
+class _AdamHandler(_OptimizerHandler):
+  """Handles Adam specific logic."""
+
+  def __init__(self, optimization_parameters):
+    super(_AdamHandler, self).__init__(optimization_parameters)
+    self._table_to_m_variables_dict = {}
+    self._table_to_v_variables_dict = {}
+
+  def set_optimization_parameters(self, table_descriptor):
+    table_descriptor.optimization_parameters.adam.beta1 = (
+        self._optimization_parameters.beta1)
+    table_descriptor.optimization_parameters.adam.beta2 = (
+        self._optimization_parameters.beta2)
+    table_descriptor.optimization_parameters.adam.epsilon = (
+        self._optimization_parameters.epsilon)
+    table_descriptor.optimization_parameters.adam.use_non_lazy_adam = (
+        not self._optimization_parameters.lazy_adam)
+    table_descriptor.optimization_parameters.adam.use_sum_inside_sqrt = (
+        self._optimization_parameters.sum_inside_sqrt)
+
+  def get_default_slot_variable_names(self, table):
+    return AdamSlotVariableNames('{}/{}/m'.format(table, 'Adam'),
+                                 '{}/{}/v'.format(table, 'Adam'))
+
+  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
+                               table_config, table_variables):
+    m_initializer = init_ops.zeros_initializer()
+    m_variables = _create_partitioned_variables(
+        name=slot_variable_names.m,
+        num_hosts=num_hosts,
+        vocabulary_size=table_config.vocabulary_size,
+        embedding_dimension=table_config.dimension,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+        initializer=m_initializer)
+    v_initializer = init_ops.zeros_initializer()
+    v_variables = _create_partitioned_variables(
+        name=slot_variable_names.v,
+        num_hosts=num_hosts,
+        vocabulary_size=table_config.vocabulary_size,
+        embedding_dimension=table_config.dimension,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+        initializer=v_initializer)
+    slot_variables = AdamSlotVariables(m_variables, v_variables)
+
+    def load_ops_fn():
+      """Returns the retrieve ops for AdaGrad embedding tables.
+
+      Returns:
+        A list of ops to load embedding and slot variables from CPU to TPU.
+      """
+      load_op_list = []
+      for host_id, table_variable, m_variable, v_variable in (zip(
+          range(num_hosts), table_variables,
+          m_variables, v_variables)):
+        with ops.colocate_with(table_variable):
+          load_parameters_op = (
+              tpu_ops.load_tpu_embedding_adam_parameters(
+                  parameters=table_variable,
+                  momenta=m_variable,
+                  velocities=v_variable,
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+
+      load_op_list.append(load_parameters_op)
+      return load_op_list
+
+    def retrieve_ops_fn():
+      """Returns the retrieve ops for Adam embedding tables.
+
+      Returns:
+        A list of ops to retrieve embedding and slot variables from TPU to CPU.
+      """
+
+      retrieve_op_list = []
+      for host_id, table_variable, m_variable, v_variable in (zip(
+          range(num_hosts), table_variables,
+          m_variables, v_variables)):
+        with ops.colocate_with(table_variable):
+          retrieved_table, retrieved_m, retrieved_v = (
+              tpu_ops.retrieve_tpu_embedding_adam_parameters(
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+          retrieve_parameters_op = control_flow_ops.group(
+              state_ops.assign(table_variable, retrieved_table),
+              state_ops.assign(m_variable, retrieved_m),
+              state_ops.assign(v_variable, retrieved_v))
+
+        retrieve_op_list.append(retrieve_parameters_op)
+      return retrieve_op_list
+
+    return slot_variables, load_ops_fn, retrieve_ops_fn
+
+
+class _StochasticGradientDescentHandler(_OptimizerHandler):
+  """Handles stochastic gradient descent specific logic."""
+
+  def set_optimization_parameters(self, table_descriptor):
+    (table_descriptor.optimization_parameters.stochastic_gradient_descent
+     .SetInParent())
+
+  def get_default_slot_variable_names(self, table):
+    return None
+
+  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
+                               table_config, table_variables):
+    del table_config
+
+    def load_ops_fn():
+      """Returns the retrieve ops for AdaGrad embedding tables.
+
+      Returns:
+        A list of ops to load embedding and slot variables from CPU to TPU.
+      """
+      load_op_list = []
+      for host_id, table_variable in (zip(
+          range(num_hosts), table_variables)):
+        with ops.colocate_with(table_variable):
+          load_parameters_op = (
+              tpu_ops
+              .load_tpu_embedding_stochastic_gradient_descent_parameters(
+                  parameters=table_variable,
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+
+        load_op_list.append(load_parameters_op)
+      return load_op_list
+
+    def retrieve_ops_fn():
+      """Returns the retrieve ops for SGD embedding tables.
+
+      Returns:
+        A list of ops to retrieve embedding and slot variables from TPU to CPU.
+      """
+
+      retrieve_op_list = []
+      for host_id, table_variable in (zip(
+          range(num_hosts), table_variables)):
+        with ops.colocate_with(table_variable):
+          retrieved_table = (
+              tpu_ops
+              .retrieve_tpu_embedding_stochastic_gradient_descent_parameters(
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+          retrieve_parameters_op = control_flow_ops.group(
+              state_ops.assign(table_variable, retrieved_table))
+
+        retrieve_op_list.append(retrieve_parameters_op)
+      return retrieve_op_list
+
+    return None, load_ops_fn, retrieve_ops_fn
+
+
+def _get_optimization_handler(optimization_parameters):
+  if isinstance(optimization_parameters, AdagradParameters):
+    return _AdagradHandler(optimization_parameters)
+  elif isinstance(optimization_parameters, AdamParameters):
+    return _AdamHandler(optimization_parameters)
+  elif isinstance(optimization_parameters, StochasticGradientDescentParameters):
+    return _StochasticGradientDescentHandler(optimization_parameters)
+  else:
+    return NotImplementedError()
+
+
+def _create_ordered_dict(d):
+  """Create an OrderedDict from Dict."""
+  return collections.OrderedDict((k, d[k]) for k in sorted(d))
+
+
+def _create_combiners(table_to_config_dict):
+  return [table_to_config_dict[t].combiner for t in table_to_config_dict]
+
+
+def _create_table_to_features_dict(feature_to_table_dict):
+  """Create mapping from table to a list of its features."""
+  table_to_features_dict_tmp = {}
+  for feature, table in six.iteritems(feature_to_table_dict):
+    if table in table_to_features_dict_tmp:
+      table_to_features_dict_tmp[table].append(feature)
+    else:
+      table_to_features_dict_tmp[table] = [feature]
+
+  table_to_features_dict = collections.OrderedDict()
+  for table in sorted(table_to_features_dict_tmp):
+    table_to_features_dict[table] = sorted(table_to_features_dict_tmp[table])
+  return table_to_features_dict
+
+
+def _create_device_fn(hosts):
+  """Create device_fn() to use with _create_partitioned_variables()."""
+
+  def device_fn(op):
+    """Returns the `device` for `op`."""
+    part_match = re.match(r'.*/part_(\d+)(/|$)', op.name)
+
+    if part_match:
+      idx = int(part_match.group(1))
+    else:
+      raise RuntimeError('Internal Error: '
+                         'Expected %s to contain /part_*.' % op.name)
+
+    device = hosts[idx]
+    return device
+
+  return device_fn
+
+
+def _create_partitioned_variables(name,
+                                  num_hosts,
+                                  vocabulary_size,
+                                  embedding_dimension,
+                                  initializer,
+                                  collections=None):  # pylint: disable=redefined-outer-name
+  """Creates ParitionedVariables based on `num_hosts` for `table`."""
+  # TODO(shizhiw): automatically place embedding lookup elsewhere?
+  if vocabulary_size < num_hosts:
+    raise ValueError('`vocabulary_size`({}) is smaller than `num_hosts`({}). '
+                     'As TPU embedding is not optimized for small tables, '
+                     'please consider other ways for this embedding lookup.')
+
+  return list(variable_scope.get_variable(
+      name,
+      shape=(vocabulary_size, embedding_dimension),
+      partitioner=partitioned_variables.fixed_size_partitioner(num_hosts),
+      dtype=dtypes.float32,
+      initializer=initializer,
+      collections=collections,
+      trainable=False))
diff --git a/tensorflow/python/tpu/tpu_embedding_gradient.py b/tensorflow/python/tpu/tpu_embedding_gradient.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7de661cc35ff5439f9ce5a88fc5642cdeb07daf
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_embedding_gradient.py
@@ -0,0 +1,153 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""Optional helper for gradient handling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.tpu.ops import tpu_ops
+
+
+def get_gradients_through_compute_gradients(optimizer, loss, activations):
+  """Compute gradients to send to TPU embedding.
+
+  Args:
+    optimizer: a subclass of optimizer.Optimizer, usually CrossShardOptimizer.
+      Used to call compute_gradients().
+    loss: a Tensor to call optimizer.compute_gradients() on.
+    activations: an OrderedDict mapping feature_name to Tensors of activations.
+
+  Returns:
+    An OrderedDict mapping from feature name Strings to Tensors of gradients of
+      the loss wrt the activations of the features.
+  """
+  activation_list = activations.values()
+  grads_and_vars = optimizer.compute_gradients(loss, activation_list)
+  grads = [grad for grad, _ in grads_and_vars]
+  feature_to_gradient_dict = collections.OrderedDict(
+      zip(activations.keys(), grads))
+  return feature_to_gradient_dict
+
+
+def create_dummy_table_variables(tpu_embedding):
+  """Create dummy embedding table variables.
+
+  The sole purpose of these dummy variables are to trigger gradient
+  calcuation wrt them so that the gradients wrt activation can be captured
+  and later sent to TPU embedding.
+
+  Args:
+    tpu_embedding: TPUEmbedding, dummy table variables will be created for use
+      with tpu_embedding.
+
+  Returns:
+    A tuple of dummy variables and their initializer.
+
+  Raises:
+    RuntimeError: if collection to store gradients already exists and is not
+    empty.
+  """
+  dummy_table_variables = collections.OrderedDict()
+  for table_id, table in enumerate(tpu_embedding.table_to_features_dict):
+    dummy_table_variables[table] = (
+        # Explicitly specifying collections prevents this variable from
+        # being added to the GLOBAL_VARIABLES collection, so that Saver()
+        # ignores it.
+        # But Tensorflow optimizer creates slot variable for these dummy
+        # variable, e.g. tpu_embedding_dummy_table_variable_mlp_user/Adam{_1},
+        # which will be in GLOBAL_VARIABLES collection,
+        variable_scope.get_variable(
+            'tpu_embedding_dummy_table_variable_{}'.format(table),
+            dtype=dtypes.float32,
+            shape=[1],
+            use_resource=True,
+            trainable=True,
+            collections=['tpu_embedding_dummy_table_variables']))
+
+    g = ops.get_default_graph()
+    table_gradients = g.get_collection_ref(
+        'tpu_embedding_gradients_table_{}'.format(table_id))
+    if table_gradients:
+      raise RuntimeError(
+          'tpu_embedding_gradients_table_{} is not empty.'.format(table_id))
+    table_gradients.extend(
+        [None] * len(tpu_embedding.table_to_features_dict[table]))
+
+  return (dummy_table_variables,
+          variables.variables_initializer(
+              dummy_table_variables.values(),
+              name='tpu_embedding_dummy_table_variables_init'))
+
+
+def hook_dummy_table_variables_to_activations(tpu_embedding, activations,
+                                              dummy_table_variables):
+  """Have activations depend on dummy table variables for gradient intercept.
+
+  Args:
+    tpu_embedding: TPUEmbedding, activations and dummy_table_variables are from
+      tpu_embedding.
+    activations: An OrderedDict of feature name String to activation tensors.
+    dummy_table_variables: An OrderedDict of table name String to dummy table
+      variables.
+
+  Returns:
+    An OrderedDict of feature name String to activation tensors, which can be
+      used just as the activations input.
+  """
+  new_activations = collections.OrderedDict()
+  for feature in activations:
+    table = tpu_embedding.feature_to_table_dict[feature]
+    new_activations[feature] = tpu_ops.tpu_embedding_activations(
+        dummy_table_variables[table],
+        activations[feature],
+        table_id=tpu_embedding.table_to_config_dict.keys().index(table),
+        lookup_id=tpu_embedding.table_to_features_dict[table].index(feature))
+  return new_activations
+
+
+def get_gradients_through_dummy_table_variables(tpu_embedding):
+  """Get gradients wrt the activations of each feature.
+
+  Args:
+    tpu_embedding: TPUEmbedding, create dummy table variable to be used with
+      tpu_embedding.
+
+  Returns:
+    An OrderedDict mapping feature name to gradient.
+
+  Raises:
+    ValueError: if some gradients are not defined.
+  """
+  g = ops.get_default_graph()
+  feature_to_gradient_dict = collections.OrderedDict()
+  for table_id, table in enumerate(tpu_embedding.table_to_config_dict):
+    table_gradients = g.get_collection(
+        'tpu_embedding_gradients_table_{}'.format(table_id))
+    if any(gradient is None for gradient in table_gradients):
+      raise ValueError(
+          'Table {} with id {} has undefined gradients: this is probably '
+          'because the model asked TPUEmbedding to compute activations that '
+          'were not used.'.format(table, table_id))
+    for feature, gradient in zip(tpu_embedding.table_to_features_dict[table],
+                                 table_gradients):
+      feature_to_gradient_dict[feature] = gradient
+  return feature_to_gradient_dict
diff --git a/tensorflow/python/tpu/tpu_estimator.py b/tensorflow/python/tpu/tpu_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea6b297ac3b993720e63a532649f5e174e91c9cf
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_estimator.py
@@ -0,0 +1,3760 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPUEstimator class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+import os
+import signal
+import sys
+import threading
+import time
+
+import numpy as np
+import six
+from six.moves import queue as Queue  # pylint: disable=redefined-builtin
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.framework import variable_pb2
+from tensorflow.core.framework.summary_pb2 import Summary
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf.tpu import compilation_result_pb2 as tpu_compilation_result
+from tensorflow.python.client import session as tf_session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest as data_nest
+from tensorflow.python.estimator import estimator as estimator_lib
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export_output as export_output_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as contrib_summary
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.summary import summary
+from tensorflow.python.tpu import _tpu_estimator_embedding
+from tensorflow.python.tpu import error_handling
+from tensorflow.python.tpu import functional as tpu_functional
+from tensorflow.python.tpu import session_support
+from tensorflow.python.tpu import tensor_tracer
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu import tpu_config
+from tensorflow.python.tpu import tpu_context
+from tensorflow.python.tpu import tpu_embedding_gradient
+from tensorflow.python.tpu import tpu_feed
+from tensorflow.python.tpu import tpu_function
+from tensorflow.python.tpu import training_loop
+from tensorflow.python.tpu import util as util_lib
+from tensorflow.python.tpu._tpu_estimator_embedding import AdamParameters  # pylint: disable=unused-import
+from tensorflow.python.tpu._tpu_estimator_embedding import EmbeddingConfigSpec  # pylint: disable=unused-import
+from tensorflow.python.tpu.ops import tpu_ops
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import evaluation
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training
+from tensorflow.python.training import training_util
+from tensorflow.python.util import function_utils
+from tensorflow.python.util import nest
+from tensorflow.python.util import tf_inspect
+
+_INITIAL_LOSS = 1e7
+_ZERO_LOSS = 0.
+_TPU_ESTIMATOR = 'tpu_estimator'
+_ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
+_BATCH_SIZE_KEY = 'batch_size'
+_CTX_KEY = 'context'
+_USE_TPU_KEY = 'use_tpu'
+_CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
+_ONE_GIGABYTE = 1024 * 1024 * 1024
+_TPU_ENQUEUE_OPS = '_tpu_enqueue_ops'
+_TPU_TRAIN_OP = '_tpu_train_op'
+_REWRITE_FOR_INFERENCE_MODE = '_rewrite_for_inference'
+_KEY_WHEN_PREDICTIONS_IS_A_TENSOR = '_key_when_predictions_is_a_tensor'
+
+# Ideally _USE_TPU_KEY should be reserved as well. However there are already
+# models that make use of this key, thus it can not be reserved now to prevent
+# breakage. In the long run, we would like to mitigate this by migrating models
+# off of using _USE_TPU_KEY.
+_RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY, _CTX_KEY]
+
+# TODO(b/65703635): Flip the value and remove all dead code. Currently, this is
+# only used for per-core based deployments. For per-host based pipelines, if a
+# user returns a Dataset instance it will be automatically wrapped in a
+# tf.while_loop (This can be disabled by returning features and labels
+# explicitly).
+_WRAP_INPUT_FN_INTO_WHILE_LOOP = False
+
+ops.register_proto_function(
+    '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR),
+    proto_type=variable_pb2.VariableDef,
+    to_proto=resource_variable_ops._to_proto_fn,  # pylint: disable=protected-access
+    from_proto=resource_variable_ops._from_proto_fn)  # pylint: disable=protected-access
+
+
+def _is_iterable(obj):
+  """A Python 2 and 3 compatible util to check whether `obj` is iterable."""
+  try:
+    iter(obj)
+    return True
+  except TypeError:
+    return False
+
+
+class CatchInvalidHostcallFunctions(control_flow_ops.XLAControlFlowContext):
+
+  def AddOp(self, op):
+    if op.type in [
+        'AudioSummary', 'AudioSummaryV2', 'HistogramSummary', 'ImageSummary',
+        'MergeSummary', 'ScalarSummary', 'TensorSummary', 'TensorSummaryV2'
+    ]:
+      raise ValueError('Use tf.contrib.summary inside of host_calls.')
+
+
+def _create_global_step(graph):
+  graph = graph or ops.get_default_graph()
+  if training.get_global_step(graph) is not None:
+    raise ValueError('"global_step" already exists.')
+  # Create in proper graph and base name_scope.
+  with graph.as_default() as g, g.name_scope(None):
+    return variable_scope.get_variable(
+        ops.GraphKeys.GLOBAL_STEP,
+        shape=[],
+        dtype=dtypes.int64,
+        initializer=init_ops.zeros_initializer(),
+        trainable=False,
+        use_resource=True,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP])
+
+
+def _create_or_get_iterations_per_loop():
+  """Creates or gets the iterations_per_loop variable.
+
+  In TPUEstimator, the user provided computation, the model_fn, is wrapped
+  inside a tf.while_loop for peak performance. The iterations of the loop are
+  specified by this variable, which adjusts its value on the CPU after each TPU
+  program execution and before the next TPU execution.
+
+  The purpose of using a variable, rather then a constant, is to allow
+  TPUEstimator adapt the TPU training iterations according to the final steps
+  specified by users. For example, if the user sets the iterations_per_loop as 4
+  in TPUConfig and steps as 10 in TPUEstimator.train(), the iterations_per_loop
+  variable will have the following value before each TPU training.
+
+      - 1-th TPU execution: iterations_per_loop = 4
+      - 2-th TPU execution: iterations_per_loop = 4
+      - 3-th TPU execution: iterations_per_loop = 2
+
+  As model_fn increases the global step once per train_op invocation, the global
+  step is 10 after all TPU executions, matching the steps=10 inputs passed in by
+  users.
+
+  Returns:
+    A TF non-trainable resource variable.
+
+  Raises:
+    RuntimeError: If multi iterations_per_loop variables were found.
+  """
+  graph = ops.get_default_graph()
+  collection_name = '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR)
+  iter_vars = graph.get_collection(collection_name)
+  if len(iter_vars) == 1:
+    return iter_vars[0]
+  elif len(iter_vars) > 1:
+    raise RuntimeError('Multiple iterations_per_loop_var in collection.')
+
+  with ops.colocate_with(training_util.get_global_step()):
+    with variable_scope.variable_scope(
+        _TPU_ESTIMATOR, reuse=variable_scope.AUTO_REUSE):
+      return variable_scope.get_variable(
+          _ITERATIONS_PER_LOOP_VAR,
+          initializer=init_ops.zeros_initializer(),
+          shape=[],
+          dtype=dtypes.int32,
+          trainable=False,
+          collections=[collection_name, ops.GraphKeys.LOCAL_VARIABLES],
+          use_resource=True)
+
+
+def _sync_variables_ops(ctx):
+  """Create varriables synchronization ops.
+
+  Gets the variables back from TPU nodes. This means the variables updated
+  by TPU will now be *synced* to host memory.
+  In BROADCAST mode, we skip this sync since the variables are ususally too
+  big to transmit via RPC.
+
+  Args:
+    ctx: A `_InternalTPUContext` instance with mode.
+
+  Returns:
+    A list of sync ops.
+  """
+
+  if not ctx.is_input_broadcast_with_iterators():
+    return [
+        array_ops.check_numerics(v.read_value(),
+                                 'Gradient for %s is NaN' % v.name).op
+        for v in variables.trainable_variables()
+    ]
+  else:
+    return [control_flow_ops.no_op()]
+
+
+def _increase_eval_step_op(iterations_per_loop):
+  """Returns an op to increase the eval step for TPU evaluation.
+
+  Args:
+    iterations_per_loop: Tensor. The number of eval steps running in TPU system
+      before returning to CPU host for each `Session.run`.
+
+  Returns:
+    An operation
+  """
+  eval_step = evaluation._get_or_create_eval_step()  # pylint: disable=protected-access
+  # Estimator evaluate increases 1 by default. So, we increase the difference.
+  return state_ops.assign_add(
+      eval_step,
+      math_ops.cast(iterations_per_loop - 1, dtype=eval_step.dtype),
+      use_locking=True)
+
+
+def _extract_key_names(tensor_or_dict):
+  if isinstance(tensor_or_dict, dict):
+    return sorted(tensor_or_dict.keys())
+  return []
+
+
+class _SIGNAL(object):
+  """Signal used to control the thread of infeed/outfeed.
+
+  All preserved signals must be negative numbers. Positive numbers are used to
+  indicate the number of iterations for next training/evaluation loop.
+  """
+  NEXT_BATCH = -1
+  STOP = -2
+
+
+class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+  """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
+
+  See `EstimatorSpec` for `mode`, `predictions`, `loss`, `train_op`, and
+  `export_outputs`.
+
+  For evaluation, `eval_metrics `is a tuple of `metric_fn` and `tensors`, where
+  `metric_fn` runs on CPU to generate metrics and `tensors` represents the
+  `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`.
+  To be precise, TPU evaluation expects a slightly different signature from the
+  `tf.estimator.Estimator`. While `EstimatorSpec.eval_metric_ops` expects a
+  dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`.
+  The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The
+  `tensors` usually specify the model logits, which are transferred back from
+  TPU system to CPU host. All tensors must have be batch-major, i.e., the batch
+  size is the first dimension. Once all tensors are available at CPU host from
+  all shards, they are concatenated (on CPU) and passed as positional arguments
+  to the `metric_fn` if `tensors` is list or keyword arguments if `tensors` is
+  a dict. `metric_fn` takes the `tensors` and returns a dict from metric string
+  name to the result of calling a metric function, namely a `(metric_tensor,
+  update_op)` tuple. See `TPUEstimator` for MNIST example how to specify the
+  `eval_metrics`.
+
+  `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This
+  function should not capture any Tensors in `model_fn`.
+
+  `host_call` is a tuple of a `function` and a list or dictionary of `tensors`
+  to pass to that function and returns a list of Tensors. `host_call` currently
+  works for train() and evaluate(). The Tensors returned by the function is
+  executed on the CPU on every step, so there is communication overhead when
+  sending tensors from TPU to CPU. To reduce the overhead, try reducing the
+  size of the tensors. The `tensors` are concatenated along their major (batch)
+  dimension, and so must be >= rank 1. The `host_call` is useful for writing
+  summaries with `tf.contrib.summary.create_file_writer`.
+  """
+
+  def __new__(cls,
+              mode,
+              predictions=None,
+              loss=None,
+              train_op=None,
+              eval_metrics=None,
+              export_outputs=None,
+              scaffold_fn=None,
+              host_call=None,
+              training_hooks=None,
+              evaluation_hooks=None,
+              prediction_hooks=None):
+    """Creates a validated `TPUEstimatorSpec` instance."""
+    host_calls = {}
+    if eval_metrics is not None:
+      host_calls['eval_metrics'] = eval_metrics
+    if host_call is not None:
+      host_calls['host_call'] = host_call
+    _OutfeedHostCall.validate(host_calls)
+
+    training_hooks = tuple(training_hooks or [])
+    evaluation_hooks = tuple(evaluation_hooks or [])
+    prediction_hooks = tuple(prediction_hooks or [])
+
+    for hook in training_hooks + evaluation_hooks + prediction_hooks:
+      if not isinstance(hook, session_run_hook.SessionRunHook):
+        raise TypeError('All hooks must be SessionRunHook instances, given: {}'
+                        .format(hook))
+
+    return super(TPUEstimatorSpec, cls).__new__(
+        cls,
+        mode=mode,
+        predictions=predictions,
+        loss=loss,
+        train_op=train_op,
+        eval_metrics=eval_metrics,
+        export_outputs=export_outputs,
+        scaffold_fn=scaffold_fn,
+        host_call=host_call,
+        training_hooks=training_hooks,
+        evaluation_hooks=evaluation_hooks,
+        prediction_hooks=prediction_hooks)
+
+  def as_estimator_spec(self):
+    """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
+    host_calls = {}
+    if self.eval_metrics is not None:
+      host_calls['eval_metrics'] = self.eval_metrics
+    if self.host_call is not None:
+      host_calls['host_call'] = self.host_call
+    host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls)
+    eval_metric_ops = None
+    if self.eval_metrics is not None:
+      eval_metric_ops = host_call_ret['eval_metrics']
+    hooks = None
+    if self.host_call is not None:
+      hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
+    loss = self.loss
+    if tensor_tracer.TensorTracer.is_enabled() \
+       and self.train_op is not None:
+      tt = tensor_tracer.TensorTracer()
+      loss = tt.trace_cpu(ops.get_default_graph(), loss, self.train_op)
+
+    hooks = tuple(hooks or [])
+    scaffold = self.scaffold_fn() if self.scaffold_fn else None
+    return model_fn_lib.EstimatorSpec(
+        mode=self.mode,
+        predictions=self.predictions,
+        loss=loss,
+        train_op=self.train_op,
+        eval_metric_ops=eval_metric_ops,
+        export_outputs=self.export_outputs,
+        scaffold=scaffold,
+        training_hooks=self.training_hooks + hooks,
+        evaluation_hooks=self.evaluation_hooks + hooks,
+        prediction_hooks=self.prediction_hooks + hooks)
+
+
+class _OpQueueContext(object):
+  """Manages work queue and thread for a infeed/outfeed thread."""
+
+  def __init__(self, name, target, args):
+    self._name = name
+    self._queue = Queue.Queue()
+    args = (self,) + args
+    self._thread = threading.Thread(name=name, target=target, args=args)
+    self._thread.daemon = True
+    self._thread.start()
+
+  def stop(self):
+    self._queue.put(_SIGNAL.STOP)
+
+  def send_next_batch_signal(self, iterations):
+    self._queue.put(iterations)
+
+  def read_iteration_counts(self):
+    while True:
+      iterations = self._queue.get(block=True)
+      logging.debug('%s read iterations %s', self._name, iterations)
+      if iterations == _SIGNAL.STOP:
+        logging.info('%s received shutdown signal, stopping.', self._name)
+        return
+      yield iterations
+
+  def join(self):
+    logging.info('Shutting down %s thread.', self._name)
+    self.stop()
+    self._thread.join()
+
+
+class _OpSignalOnceQueueContext(_OpQueueContext):
+  """Manages work queue and thread for a infeed/outfeed thread.
+
+  This subclass only signals once.
+  """
+
+  def __init__(self, name, target, args):
+    super(_OpSignalOnceQueueContext, self).__init__(name, target, args)
+    self._has_signaled = False
+
+  def send_next_batch_signal(self, iterations):
+    if not self._has_signaled:
+      self._queue.put(iterations)
+      self._has_signaled = True
+
+
+class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
+  """A Session hook setting up the TPU initialization, infeed, and outfeed.
+
+  This hook does two major things:
+  1. initialize and shutdown TPU system.
+  2. launch and join the threads for infeed enqueue and (optional) outfeed
+     dequeue.
+  """
+
+  def __init__(self,
+               ctx,
+               enqueue_ops,
+               dequeue_ops,
+               tpu_compile_op,
+               run_infeed_loop_on_coordinator=True,
+               rendezvous=None,
+               master=None,
+               session_config=None,
+               tpu_init_ops=None):
+    self._master_job = ctx.master_job
+    self._enqueue_ops = enqueue_ops
+    self._dequeue_ops = dequeue_ops
+    self._rendezvous = rendezvous
+    self._master = master
+    self._session_config = session_config
+    self._init_ops = list(tpu_init_ops or [])
+    if ctx.embedding_config is None:
+      self._embedding_layer_config = None
+    else:
+      self._embedding_layer_config = (
+          ctx.embedding_config.tpu_embedding.config_proto)
+    self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
+    self._initial_infeed_sleep_secs = (
+        ctx.config.tpu_config.initial_infeed_sleep_secs)
+
+    self._feed_error = None
+    self._finished = False
+    # When using model parallelism, the TPU is pre-initialized at startup to
+    # fetch mesh information.  We skip re-initializing it here to avoid
+    # suspected issues due to the mesh layout changing on the second
+    # initialization.
+    self._should_initialize_tpu = not ctx.model_parallelism_enabled
+    self._tpu_compile_op = tpu_compile_op
+
+  def begin(self):
+    logging.info('TPU job name %s', self._master_job)
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+    if self._should_initialize_tpu:
+      self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
+    else:
+      self._finalize_ops = []
+
+    summary_writer_init_ops = contrib_summary.summary_writer_initializer_op()
+    self._init_ops.extend(summary_writer_init_ops)
+    # Get all the writer resources from the initializer, so we know what to
+    # flush.
+    for op in summary_writer_init_ops:
+      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
+
+  def _run_infeed(self, queue_ctx, session):
+    logging.info('Starting infeed thread controller.')
+    if self._initial_infeed_sleep_secs:
+      logging.info('Infeed thread sleeping for %d seconds.',
+                   self._initial_infeed_sleep_secs)
+      time.sleep(self._initial_infeed_sleep_secs)
+      logging.info('Infeed thread starting after sleep')
+
+    with self._rendezvous.catch_errors(source='infeed', session=session):
+      if self._run_infeed_loop_on_coordinator:
+        for count, steps in enumerate(queue_ctx.read_iteration_counts()):
+          for i in xrange(steps):
+            logging.debug('Infeed enqueue for iteration (%d, %d)', count, i)
+            session.run(self._enqueue_ops)
+      else:
+        for _ in queue_ctx.read_iteration_counts():
+          session.run(self._enqueue_ops)
+      logging.info('Infeed thread finished, shutting down.')
+
+  def _run_outfeed(self, queue_ctx, session):
+    logging.info('Starting outfeed thread controller.')
+    with self._rendezvous.catch_errors(source='outfeed', session=session):
+      for count, steps in enumerate(queue_ctx.read_iteration_counts()):
+        for i in xrange(steps):
+          logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
+          session.run(self._dequeue_ops)
+      logging.info('Outfeed thread finished, shutting down.')
+
+  def _create_infeed_controller(self, name, target, args):
+    return _OpQueueContext(name=name, target=target, args=args)
+
+  def _assertCompilationSucceeded(self, result, coord):
+    proto = tpu_compilation_result.CompilationResultProto()
+    proto.ParseFromString(result)
+    if proto.status_error_message:
+      logging.error('Compilation failed: {}'.format(proto.status_error_message))
+      coord.request_stop()
+    else:
+      logging.info('Compilation succeeded')
+
+  def after_create_session(self, session, coord):
+    if self._should_initialize_tpu:
+      logging.info('Init TPU system')
+      start = time.time()
+      with ops.Graph().as_default():
+        with tf_session.Session(
+            self._master, config=self._session_config) as sess:
+          sess.run(
+              tpu.initialize_system(
+                  job=self._master_job,
+                  embedding_config=self._embedding_layer_config))
+      logging.info('Initialized TPU in %d seconds', time.time() - start)
+
+    session.run(self._init_ops,
+                options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
+
+    if os.environ.get('TPU_SPLIT_COMPILE_AND_EXECUTE', '') == '1':
+      logging.info('Compiling user program: this may take a while...')
+      self._assertCompilationSucceeded(session.run(self._tpu_compile_op), coord)
+
+    self._infeed_controller = self._create_infeed_controller(
+        name='InfeedController', target=self._run_infeed, args=(session,))
+
+    self._outfeed_controller = _OpQueueContext(
+        name='OutfeedController', target=self._run_outfeed, args=(session,))
+
+    # Enable the worker watchdog to terminate workers on coordinator exit.
+    watchdog_timeout = int(os.environ.get('TF_TPU_WATCHDOG_TIMEOUT', '0'))
+    if watchdog_timeout > 0:
+      session_support.start_worker_watchdog(session,
+                                            shutdown_timeout=watchdog_timeout)
+
+  def before_run(self, run_context):
+    self._feed_error = None
+
+    iterations = run_context.session.run(self._iterations_per_loop_var)
+
+    logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
+    self._infeed_controller.send_next_batch_signal(iterations)
+
+    logging.info('Dequeue next (%d) batch(es) of data from outfeed.',
+                 iterations)
+    self._outfeed_controller.send_next_batch_signal(iterations)
+
+  def end(self, session):
+    self._finished = True
+    logging.info('Stop infeed thread controller')
+    self._infeed_controller.join()
+    self._rendezvous.record_done('infeed')
+
+    logging.info('Stop output thread controller')
+    self._outfeed_controller.join()
+    self._rendezvous.record_done('outfeed')
+
+    logging.info('Shutdown TPU system.')
+    session.run(self._finalize_ops)
+
+
+class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
+
+  def __init__(self, ctx, enqueue_ops, dequeue_ops, tpu_compile_op,
+               rendezvous=None, master=None, session_config=None):
+    super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
+        ctx,
+        enqueue_ops,
+        dequeue_ops,
+        tpu_compile_op=tpu_compile_op,
+        run_infeed_loop_on_coordinator=False,
+        rendezvous=rendezvous,
+        master=master,
+        session_config=session_config)
+
+  def _create_infeed_controller(self, name, target, args):
+    return _OpSignalOnceQueueContext(name=name, target=target, args=args)
+
+
+class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at a specified step.
+
+  This hook is similar to the `session_run_hook._StopAfterNEvalsHook` with
+  following differences for TPU training:
+
+  1. This hook sets the variable for iterations_per_loop, which is used by
+     `TPUInfeedOutfeedSessionHook` to control the iterations for infeed/outfeed.
+     As the hook execution order is not guaranteed, the variable update is
+     handled in `after_create_session` and `after_run` as
+     `TPUInfeedOutfeedSessionHook` reads the variable value in `before_run`.
+
+  2. For each training loop (session.run), the global step could be increased
+     multiple times on TPU. The global step tensor value will be explicitly read
+     again in `after_run` to ensure the latest value is retrieved to avoid race
+     condition.
+  """
+
+  def __init__(self, iterations, num_steps=None, last_step=None):
+    """Initializes a `StopAtStepHook`.
+
+    Args:
+      iterations: The number of iterations to run optimizer per training loop.
+      num_steps: Number of steps to execute.
+      last_step: Step after which to stop.
+
+    Raises:
+      ValueError: If one of the arguments is invalid.
+    """
+    if num_steps is None and last_step is None:
+      raise ValueError('One of num_steps or last_step must be specified.')
+    if num_steps is not None and last_step is not None:
+      raise ValueError('Only one of num_steps or last_step can be specified.')
+    self._num_steps = num_steps
+    self._last_step = last_step
+    self._iterations = iterations
+
+  def _next_iterations(self, global_step, last_step):
+    gap = last_step - global_step
+    return min(gap, self._iterations)
+
+  def begin(self):
+    self._global_step_tensor = training_util.get_global_step()
+    if self._global_step_tensor is None:
+      raise RuntimeError('Global step should be created.')
+
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  def after_create_session(self, session, coord):
+    global_step = session.run(self._global_step_tensor)
+    if self._last_step is None:
+      self._last_step = global_step + self._num_steps
+
+    iterations = self._next_iterations(global_step, self._last_step)
+
+    self._iterations_per_loop_var.load(iterations, session=session)
+
+  def after_run(self, run_context, run_values):
+    # Global step cannot be retrieved via SessionRunArgs and before_run due to
+    # race condition.
+    global_step = run_context.session.run(self._global_step_tensor)
+    if global_step >= self._last_step:
+      run_context.request_stop()
+    else:
+      iterations = self._next_iterations(global_step, self._last_step)
+      self._iterations_per_loop_var.load(
+          iterations, session=run_context.session)
+
+
+class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at a specified step."""
+
+  def __init__(self, num_steps):
+    """Initializes a `_SetEvalIterationsHook`.
+
+    Args:
+      num_steps: Number of steps to execute.
+    """
+    self._num_steps = num_steps
+
+  def begin(self):
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  def after_create_session(self, session, coord):
+    self._iterations_per_loop_var.load(self._num_steps, session=session)
+
+
+class _StoppingPredictHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop according to the stopping signal in prediction."""
+
+  def __init__(self, scalar_stopping_signal):
+    self._scalar_stopping_signal = scalar_stopping_signal
+
+  def begin(self):
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  def after_create_session(self, session, coord):
+    # This is not necessary as we do not run infeed enqueue and outfeed dequeue
+    # in side threads for prediction model. But it makes the
+    # TPUInfeedOutfeedSessionHook prints nice message.
+    self._iterations_per_loop_var.load(1, session=session)
+
+  def before_run(self, run_context):
+    return session_run_hook.SessionRunArgs(self._scalar_stopping_signal)
+
+  def after_run(self, run_context, run_values):
+    _ = run_context
+    scalar_stopping_signal = run_values.results
+    if _StopSignals.should_stop(scalar_stopping_signal):
+      # NOTE(xiejw): In prediction, stopping signals are inserted for each
+      # batch. And we append one more batch to signal the system it should stop.
+      # The data flow might look like
+      #
+      #  batch   0: images, labels, stop = 0  (user provided)
+      #  batch   1: images, labels, stop = 0  (user provided)
+      #  ...
+      #  batch  99: images, labels, stop = 0  (user provided)
+      #  batch 100: images, labels, stop = 1  (TPUEstimator appended)
+      #
+      # where the final batch (id = 100) is appended by TPUEstimator, so we
+      # should drop it before returning the predictions to user.
+      # To achieve that, we throw the OutOfRangeError in after_run. Once
+      # Monitored Session sees this error in SessionRunHook.after_run, the
+      # "current" prediction, i.e., batch with id=100, will be discarded
+      # immediately
+      raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.')
+
+
+def generate_per_core_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, host_device, host_id):
+  """Generates infeed enqueue ops for per-core input_fn on a single host."""
+  captured_infeed_queue = _CapturedObject()
+  tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
+
+  def enqueue_ops_fn():
+    """A fn returns enqueue_ops."""
+    num_cores_per_host = ctx.num_of_cores_per_host
+    per_host_sharded_inputs = []
+    for core_ordinal in range(num_cores_per_host):
+      with ops.name_scope('ordinal_%d' % (core_ordinal)):
+        user_context = tpu_context.TPUContext(
+            internal_ctx=ctx,
+            input_device=host_device,
+            invocation_index=host_id * ctx.num_of_cores_per_host + core_ordinal)
+        inputs = _Inputs.from_input_fn(input_fn(user_context))
+        if inputs.is_dataset:
+          raise TypeError(
+              '`input_fn` returning `Dataset`  is not yet supported in '
+              'per-Core input pipeline deployment yet. Please set '
+              'TPUConfig.per_host_input_for_training to True or return '
+              '`features` and `labels` from `input_fn`')
+        features, labels = inputs.features_and_labels()
+
+        inputs_structure_recorder.validate_and_record_structure(
+            features, labels)
+        flattened_inputs = (
+            inputs_structure_recorder.flatten_features_and_labels(
+                features, labels))
+        per_host_sharded_inputs.append(flattened_inputs)
+
+    infeed_queue = tpu_feed.InfeedQueue(
+        number_of_tuple_elements=len(per_host_sharded_inputs[0]))
+    captured_infeed_queue.capture(infeed_queue)
+
+    per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+        per_host_sharded_inputs, tpu_ordinal_function=tpu_ordinal_function_impl)
+    return per_host_enqueue_ops
+
+  return enqueue_ops_fn, captured_infeed_queue
+
+
+def generate_per_host_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, batch_axis, device, host_id):
+  """Generates infeed enqueue ops for per-host input_fn on a single host."""
+  captured_infeed_queue = _CapturedObject()
+
+  dataset_initializer = None
+
+  with ops.device(device):
+    user_context = tpu_context.TPUContext(
+        internal_ctx=ctx, input_device=device, invocation_index=host_id)
+    inputs = _Inputs.from_input_fn(input_fn(user_context))
+
+    is_dataset = inputs.is_dataset
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      if not is_dataset:
+        raise TypeError(
+            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
+            '`features` and `labels`.')
+      if batch_axis is not None:
+        raise TypeError('For mode PREDICT, batch_axis is not supported yet.')
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True)
+
+    if is_dataset:
+      dataset_initializer = inputs.dataset_initializer()
+
+    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
+
+  def enqueue_ops_fn():
+    """A Fn returning the TPU infeed enqueue ops.
+
+    By providing as a Fn, it can be invoked inside the tf.while_loop such that
+    the input pipeline for multiple iterations can be executed by one
+    Session.run call.
+
+    Returns:
+      list of dict of ops.
+    """
+    with ops.device(device):
+      num_of_replicas_per_host = ctx.num_of_replicas_per_host
+      # Convert user input to features and labels.  If the user returns a
+      # dataset, it is initialized and the features and labels extracted via
+      # `dataset.iterator.get_next()`
+      features, labels = inputs.features_and_labels()
+      signals = inputs.signals()
+
+      inputs_structure_recorder.validate_and_record_structure(features, labels)
+      unsharded_tensor_list = (
+          inputs_structure_recorder.flatten_features_and_labels(
+              features, labels, signals))
+
+      infeed_queue = tpu_feed.InfeedQueue(
+          tuple_types=[t.dtype for t in unsharded_tensor_list],
+          tuple_shapes=[t.shape for t in unsharded_tensor_list],
+          shard_dimensions=batch_axis)
+      captured_infeed_queue.capture(infeed_queue)
+      infeed_queue.set_number_of_shards(num_of_replicas_per_host)
+      per_host_enqueue_ops = (
+          infeed_queue.split_inputs_and_generate_enqueue_ops(
+              unsharded_tensor_list,
+              placement_function=lambda x: device,
+              tpu_ordinal_function=tpu_ordinal_function_impl))
+      if signals is None:
+        return per_host_enqueue_ops
+      else:
+        return {
+            'ops': per_host_enqueue_ops,
+            'signals': signals,
+        }
+
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
+
+
+def generate_per_host_v2_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, device, host_id):
+  """Generates infeed enqueue ops for per-host input_fn on a single host."""
+  captured_infeed_queue = _CapturedObject()
+  dataset_initializer = None
+
+  with ops.device(device):
+    user_context = tpu_context.TPUContext(
+        internal_ctx=ctx, input_device=device, invocation_index=host_id)
+    inputs = _Inputs.from_input_fn(input_fn(user_context))
+
+    is_dataset = inputs.is_dataset
+    if not is_dataset:
+      raise TypeError('`input_fn` must return a `Dataset` for the PER_HOST_V2 '
+                      'input pipeline configuration.')
+
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True,
+          num_invocations_per_step=ctx.num_of_replicas_per_host)
+
+    dataset_initializer = inputs.dataset_initializer()
+    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
+
+  def enqueue_ops_fn():
+    """Generates the per_host enqueue ops."""
+    control_deps = []
+    per_host_sharded_inputs = []
+    sparse_features_list = []
+    num_replicas_per_host = ctx.num_of_replicas_per_host
+    cached_signals = None
+    with ops.device(device):
+      if not inputs.is_dataset:
+        raise TypeError('`input_fn` must return a `Dataset` for this mode.')
+      for _ in range(num_replicas_per_host):
+        # Use control dependencies to ensure a deterministic ordering.
+        with ops.control_dependencies(control_deps):
+          features, labels = inputs.features_and_labels()  # Calls get_next()
+          signals = inputs.signals()
+
+          # All the replicas share the replica 0's stopping singal.
+          # This avoids inconsistent state among different model replcias.
+          if cached_signals:
+            signals['stopping'] = cached_signals['stopping']
+          else:
+            cached_signals = signals
+
+        features, labels, sparse_features = (
+            _tpu_estimator_embedding.split_inputs(ctx, features, labels))
+        sparse_features_list.append(sparse_features)
+
+        inputs_structure_recorder.validate_and_record_structure(
+            features, labels)
+        flattened_inputs = (
+            inputs_structure_recorder.flatten_features_and_labels(
+                features, labels, signals))
+        control_deps.extend(flattened_inputs)
+        per_host_sharded_inputs.append(flattened_inputs)
+
+      if inputs_structure_recorder.flattened_input_dims:
+        input_partition_dims = inputs_structure_recorder.flattened_input_dims
+        if signals:
+          input_partition_dims += [None] * len(signals)
+        # pylint: disable=protected-access
+        infeed_queue = tpu_feed._PartitionedInfeedQueue(
+            number_of_tuple_elements=len(per_host_sharded_inputs[0]),
+            host_id=host_id,
+            input_partition_dims=input_partition_dims,
+            device_assignment=ctx.device_assignment)
+        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+            per_host_sharded_inputs)
+      else:
+        infeed_queue = tpu_feed.InfeedQueue(
+            number_of_tuple_elements=len(per_host_sharded_inputs[0]))
+        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+            per_host_sharded_inputs,
+            tpu_ordinal_function=tpu_ordinal_function_impl)
+      captured_infeed_queue.capture(infeed_queue)
+
+    if ctx.embedding_config:
+      per_host_enqueue_ops.extend(
+          ctx.embedding_config.tpu_embedding.generate_enqueue_ops(
+              sparse_features_list))
+
+    if signals is None:
+      return per_host_enqueue_ops
+    else:
+      return {
+          'ops': per_host_enqueue_ops,
+          'signals': signals,
+      }
+
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
+
+
+def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
+                                      num_hosts):
+  """Generates infeed enqueue ops for one input_fn on all the hosts."""
+  captured_infeed_queue = _CapturedObject()
+  dataset_initializer = None
+  device_0 = ctx.tpu_host_placement_function(host_id=0)
+  with ops.device(device_0):
+    user_context = tpu_context.TPUContext(
+        internal_ctx=ctx, input_device=device_0, invocation_index=0)
+    inputs = _Inputs.from_input_fn(input_fn(user_context))
+
+    is_dataset = inputs.is_dataset
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      if not is_dataset:
+        raise TypeError(
+            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
+            '`features` and `labels`.')
+
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True)
+
+    if is_dataset:
+      dataset_initializer = inputs.dataset_initializer()
+    num_replicas_per_host = ctx.num_of_replicas_per_host
+
+  def tpu_ordinal_function_impl(replica_id):
+    if ctx.device_assignment:
+      return ctx.device_assignment.tpu_ordinal(replica=replica_id)
+    else:
+      return replica_id % num_replicas_per_host
+
+  def device_function_impl(replica_id):
+    return ctx.tpu_host_placement_function(replica_id=replica_id)
+
+  def enqueue_ops_fn():
+    """Generates enqueue ops for all the hosts."""
+    broadcasted_inputs = []
+    flattened_inputs = None  # Cache result from input_fn.
+    signals = None
+    for host_id in xrange(num_hosts):
+      with ops.device(ctx.tpu_host_placement_function(host_id=host_id)):
+        for _ in xrange(ctx.num_of_replicas_per_host):
+          # Note: input_fn is only called once at host 0 for the first replica.
+          # The features and labels returned from that invocation are
+          # broadcasted to other replicas(including the replicas on other
+          # hosts).
+          if flattened_inputs is None:
+            features, labels = inputs.features_and_labels()  # Calls get_next()
+            signals = inputs.signals()
+
+            inputs_structure_recorder.validate_and_record_structure(
+                features, labels)
+            flattened_inputs = (
+                inputs_structure_recorder.flatten_features_and_labels(
+                    features, labels, signals))
+          broadcasted_inputs.append(flattened_inputs)
+
+    infeed_queue = tpu_feed.InfeedQueue(
+        number_of_tuple_elements=len(broadcasted_inputs[0]))
+    captured_infeed_queue.capture(infeed_queue)
+    enqueue_ops = infeed_queue.generate_enqueue_ops(
+        broadcasted_inputs,
+        tpu_ordinal_function=tpu_ordinal_function_impl,
+        placement_function=device_function_impl)
+
+    if signals is None:
+      return enqueue_ops
+    else:
+      return {
+          'ops': enqueue_ops,
+          'signals': signals,
+      }
+
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
+
+
+class _InputPipeline(object):
+  """`_InputPipeline` handles invoking `input_fn` and piping to infeed queue.
+
+  `_InputPipeline` abstracts the per-core/per-host `input_fn` invocation from
+  call site.  To be precise, based on the configuration in
+  `_InternalTPUContext`,  it invokes `input_fn` for all cores (usually
+  multi-host TPU training) or for one host (usually for single-host TPU
+  evaluation), and sends all `features` and `labels` returned by `input_fn` to
+  TPU infeed. For per-core invocation, `features` and `labels` are piped to
+  infeed directly, one tuple for each core. For per-host invocation,  `features`
+  and `labels` are split at host (with respect to `batch_axis`) and piped to all
+  cores accordingly.
+
+  In addition, flatten/unflatten are handled by `_InputPipeline` also.  Model
+  inputs returned by the `input_fn` can have one of the following forms:
+  1. features
+  2. (features, labels)
+  3. ((arbitrarily nested structure of features), labels)
+
+  Internally, form 1 is reformed to `(features, None)` as features and labels
+  are passed separately to underlying methods. For TPU training, TPUEstimator
+  may expect multiple `features` and `labels` tuples one for each core.
+
+  TPUEstimator allows various different structures for inputs (namely `features`
+  and `labels`).  Both `features` and `labels` can be any nested sturcture
+  supported by TF nest (namely, dict, tuples, namedtuples or any nested
+  structure of such of Tensors).  `labels` could be `None` as well.
+
+  These are flattened before they are passed to the infeed/outfeed library
+  as that expectes flattend lists.
+  """
+
+  class InputsStructureRecorder(object):
+    """The recorder to record inputs structure."""
+
+    def __init__(self, input_partition_dims=None):
+      # Holds the structure of inputs
+      self._feature_structure = {}
+      self._flattened_input_dims = None
+
+      if input_partition_dims:
+        # This should have been validated in TPUConfig.
+        assert len(input_partition_dims) <= 2, 'must have 1 or 2 elements.'
+        if len(input_partition_dims) == 2:
+          self._feature_dims, self._label_dims = input_partition_dims
+        else:
+          self._feature_dims = input_partition_dims[0]
+          self._label_dims = None
+
+        assert self._feature_dims is not None, ('input_partition_dims[0] must '
+                                                'not be None')
+      else:
+        self._feature_dims = None
+        self._label_dims = None
+
+      # Internal state.
+      self._initialized = False
+
+    @property
+    def flattened_input_dims(self):
+      assert self._initialized, 'InputsStructureRecorder is not initialized.'
+      return self._flattened_input_dims
+
+    def has_labels(self):
+      return 'labels' in self._feature_structure
+
+    def _flatten_input_dims(self, feature_dims, feature_dims_names, label_dims,
+                            label_dims_names, label_names, has_labels):
+      """Flatten input dims with the same order as flattened input tensors."""
+      flattened_input_dims = []
+      if feature_dims_names:
+        # We need a fixed ordering for matching the tensors in features.
+        flattened_input_dims.extend(
+            [feature_dims[name] for name in feature_dims_names])
+      else:
+        flattened_input_dims.append(feature_dims)
+
+      if label_dims_names:
+        # We need a fixed ordering for matching the tensors in labels.
+        flattened_input_dims.extend(
+            [label_dims[name] for name in label_dims_names])
+      else:
+        if label_names:
+          num_tensors_in_label = len(label_names)
+        else:
+          num_tensors_in_label = int(has_labels)
+        # Setting `None` in input_partition_dims[1] will apply `None` to
+        # all the tensors in labels, regardless of internal structure.
+        flattened_input_dims.extend([label_dims] * num_tensors_in_label)
+
+      return flattened_input_dims
+
+    def validate_and_record_structure(self, features, labels):
+      """Validates and records the structure of `features` and `labels`."""
+      # Extract structure.
+      has_labels = labels is not None
+      feature_names = _extract_key_names(features)
+      label_names = _extract_key_names(labels)
+
+      if not self._initialized:
+        # Record structure.
+        self._initialized = True
+        if self._feature_dims is not None:
+          feature_dims_names = _extract_key_names(self._feature_dims)
+          if feature_dims_names != feature_names:
+            raise ValueError(
+                'TPUConfig.input_partition_dims[0] mismatched feature'
+                ' keys. Expected {}, got {}'.format(feature_names,
+                                                    feature_dims_names))
+
+          label_dims_names = _extract_key_names(self._label_dims)
+          if self._label_dims is not None and label_dims_names != label_names:
+            raise ValueError(
+                'TPUConfig.input_partition_dims[1] mismatched label'
+                ' keys. Expected {}, got {}'.format(label_names,
+                                                    label_dims_names))
+
+          self._flattened_input_dims = self._flatten_input_dims(
+              self._feature_dims, feature_dims_names, self._label_dims,
+              label_dims_names, label_names, has_labels)
+
+    def flatten_features_and_labels(self, features, labels, signals=None):
+      """Flattens the `features` and `labels` to a single tensor list."""
+      self._feature_structure['features'] = features
+      if labels is not None:
+        self._feature_structure['labels'] = labels
+      if signals is not None:
+        self._feature_structure['signals'] = signals
+      return data_nest.flatten(self._feature_structure)
+
+    def unflatten_features_and_labels(self, flattened_inputs):
+      """Restores the flattened inputs to original features and labels form.
+
+      Args:
+        flattened_inputs: Flattened inputs for each shard.
+
+      Returns:
+        A tuple of (`features`, `labels`), where `labels` could be None.
+        Each one, if present, should have identical structure (single tensor vs
+        dict) as the one returned by input_fn.
+
+      Raises:
+        ValueError: If the number of expected tensors from `flattened_inputs`
+          mismatches the recorded structure.
+      """
+
+      unflattened_inputs = data_nest.pack_sequence_as(self._feature_structure,
+                                                      flattened_inputs)
+      return _Inputs(
+          unflattened_inputs['features'],
+          unflattened_inputs.get('labels'),
+          signals=unflattened_inputs.get('signals'))
+
+  def __init__(self, input_fn, batch_axis, ctx):
+    """Constructor.
+
+    Args:
+      input_fn: input fn for train or eval.
+      batch_axis: A python tuple of int values describing how each tensor
+        produced by the Estimator `input_fn` should be split across the TPU
+        compute shards.
+      ctx: A `_InternalTPUContext` instance with mode.
+
+    Raises:
+      ValueError: If both `sharded_features` and `num_cores` are `None`.
+    """
+    self._inputs_structure_recorder = _InputPipeline.InputsStructureRecorder(
+        ctx.input_partition_dims)
+
+    self._sharded_per_core = ctx.is_input_sharded_per_core()
+    self._input_fn = input_fn
+    self._infeed_queue = None
+    self._ctx = ctx
+    self._batch_axis = batch_axis
+
+  def generate_infeed_enqueue_ops_and_dequeue_fn(self):
+    """Generates infeed enqueue ops and dequeue_fn."""
+    # While tf.while_loop is called, the body function, which invokes
+    # `enqueue_fn` passed in, is called to construct the graph. So, input_fn
+    # structure is recorded.
+    enqueue_ops, all_hooks, run_infeed_loop_on_coordinator = (
+        self._invoke_input_fn_and_record_structure())
+
+    self._validate_input_pipeline()
+
+    def dequeue_fn():
+      """dequeue_fn is used by TPU to retrieve the tensors."""
+      # In the model-parallel case, both the host-side and device-side
+      # computations must agree on the core on which infeed takes place. We
+      # choose to perform infeed on logical core 0 of each replica.
+      values = self._infeed_queue.generate_dequeue_op(tpu_device=0)
+      # The unflatten process uses the structure information recorded above.
+      return self._inputs_structure_recorder.unflatten_features_and_labels(
+          values)
+
+    return (enqueue_ops, dequeue_fn, all_hooks, run_infeed_loop_on_coordinator)
+
+  def _invoke_input_fn_and_record_structure(self):
+    """Deploys the input pipeline and record input structure."""
+    enqueue_ops = []
+    infeed_queues = []
+    all_dataset_initializers = []
+    num_hosts = self._ctx.num_hosts
+    tpu_host_placement_fn = self._ctx.tpu_host_placement_function
+
+    run_infeed_loop_on_coordinator = True
+
+    if self._sharded_per_core:
+      # Per-Core input pipeline deployment.
+      # Invoke input pipeline for each core and placed on the corresponding
+      # host.
+      for host_id in range(num_hosts):
+        host_device = tpu_host_placement_fn(host_id=host_id)
+        with ops.device(host_device):
+          with ops.name_scope('input_pipeline_task%d' % (host_id)):
+            enqueue_ops_fn, captured_infeed_queue = (
+                generate_per_core_enqueue_ops_fn_for_host(
+                    self._ctx, self._input_fn, self._inputs_structure_recorder,
+                    host_device, host_id))
+
+            if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
+              run_infeed_loop_on_coordinator = False
+              enqueue_ops.append(
+                  _wrap_computation_in_while_loop(
+                      device=host_device, op_fn=enqueue_ops_fn))
+            else:
+              enqueue_ops.append(enqueue_ops_fn())
+            # Infeed_queue_getter must be called after enqueue_ops_fn is called.
+            infeed_queues.append(captured_infeed_queue.get())
+
+    elif self._ctx.is_input_broadcast_with_iterators():
+      # Only calls input_fn in host 0.
+      host_device = tpu_host_placement_fn(host_id=0)
+      enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
+          generate_broadcast_enqueue_ops_fn(self._ctx, self._input_fn,
+                                            self._inputs_structure_recorder,
+                                            num_hosts))
+      if dataset_initializer:
+        all_dataset_initializers.append(dataset_initializer)
+        run_infeed_loop_on_coordinator = False
+        wrap_fn = (
+            _wrap_computation_in_while_loop
+            if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
+            _wrap_computation_in_while_loop_with_stopping_signals)
+        enqueue_ops.append(wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
+      else:
+        enqueue_ops.append(enqueue_ops_fn())
+      infeed_queues.append(captured_infeed_queue.get())
+    else:
+      for host_id in range(num_hosts):
+        host_device = tpu_host_placement_fn(host_id=host_id)
+        with ops.device(host_device):
+          with ops.name_scope('input_pipeline_task%d' % (host_id)):
+            if self._ctx.is_input_per_host_with_iterators():
+              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
+                  generate_per_host_v2_enqueue_ops_fn_for_host(
+                      self._ctx, self._input_fn,
+                      self._inputs_structure_recorder, host_device, host_id))
+            else:
+              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
+                  generate_per_host_enqueue_ops_fn_for_host(
+                      self._ctx, self._input_fn,
+                      self._inputs_structure_recorder, self._batch_axis,
+                      host_device, host_id))
+
+            # NOTE(xiejw): We dispatch here based on the return type of the
+            # users `input_fn`.
+            #
+            # 1. If input_fn returns a Dataset instance, we initialize the
+            # iterator outside of tf.while_loop, and call the iterator.get_next
+            # inside tf.while_loop.  This should be always safe.
+            #
+            # 2. If input_fn returns (features, labels), it is too late to wrap
+            # them inside tf.while_loop, as resource initialization cannot be
+            # handled in TF control flow properly. In this case, we will use
+            # python loop to enqueue the data into TPU system.  This may be
+            # slow compared to the previous case.
+            if dataset_initializer:
+              all_dataset_initializers.append(dataset_initializer)
+              run_infeed_loop_on_coordinator = False
+              wrap_fn = (
+                  _wrap_computation_in_while_loop
+                  if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
+                  _wrap_computation_in_while_loop_with_stopping_signals)
+              enqueue_ops.append(
+                  wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
+            else:
+              enqueue_ops.append(enqueue_ops_fn())
+            infeed_queues.append(captured_infeed_queue.get())
+    # infeed_queue is used to generate dequeue ops. The only thing it uses for
+    # dequeue is dtypes and types. So, any one can be used. Here, grab the
+    # first one.
+    self._infeed_queue = infeed_queues[0]
+    return enqueue_ops, [
+        util_lib.MultiHostDatasetInitializerHook(all_dataset_initializers)
+    ], run_infeed_loop_on_coordinator
+
+  def _validate_input_pipeline(self):
+    """Validates the input pipeline.
+
+    Perform some sanity checks to log user friendly information. We should
+    error out to give users better error message. But, if
+    _WRAP_INPUT_FN_INTO_WHILE_LOOP is False (legacy behavior), we cannot break
+    user code, so, log a warning.
+
+    Raises:
+      RuntimeError: If the validation failed.
+    """
+    if ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS):
+      err_msg = ('Input pipeline contains one or more QueueRunners. '
+                 'It could be slow and not scalable. Please consider '
+                 'converting your input pipeline to use `tf.data` instead (see '
+                 'https://www.tensorflow.org/guide/datasets for '
+                 'instructions.')
+      if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
+        raise RuntimeError(err_msg)
+      else:
+        logging.warn(err_msg)
+
+
+def call_computation(computation,
+                     experimental_exported_model_uses_all_cores=True):
+  """Call computation.
+
+  computation uses a single-core for TPU inference. If
+  `experimental_exported_model_uses_all_cores` is `True`, this function will
+  round-robin
+  computation among all TPU cores visible to the host; otherwise, it will use
+  a single core.
+
+  Args:
+    computation: A Python function that takes no inputs and builds computation
+      graph. If `computation` returns m outputs, this function will return a
+      list of m Tensors.
+    experimental_exported_model_uses_all_cores: Whether to round-robin among all
+      cores visible to the host, or to use a single core.
+
+  Returns:
+    A list of output tensors.
+  """
+  if experimental_exported_model_uses_all_cores:
+    # Using `TPUPartitionedCall` makes it possible to target a different
+    # TPU core with every `Session.run()` call. Note that the entire inference
+    # graph executes on a single core, and that invocations of this graph
+    # will round-robin among the cores attached to a host.
+    @function.Defun(capture_resource_var_by_value=False)
+    def tpu_subgraph():
+      return computation()
+
+    return tpu_functional.TPUPartitionedCall(
+        args=tpu_subgraph.captured_inputs,
+        device_ordinal=tpu_ops.tpu_ordinal_selector(),
+        Tout=[o.type for o in tpu_subgraph.definition.signature.output_arg],
+        f=tpu_subgraph)
+  else:
+    return computation()
+
+
+class _ModelFnWrapper(object):
+  """A `model_fn` wrapper.
+
+  This makes calling model_fn on CPU and TPU easier and more consistent and
+  performs necessary check and mutation required by TPU training and evaluation.
+
+  In addition, this wrapper manages converting the `model_fn` to a single TPU
+  train and eval step.
+  """
+
+  def __init__(self, model_fn, config, params, ctx):
+    self._model_fn = model_fn
+    self._config = config
+    self._params = params
+    self._ctx = ctx
+
+  def call_without_tpu(self, features, labels, is_export_mode):
+    return self._call_model_fn(features, labels, is_export_mode=is_export_mode)
+
+  def _add_embedding_features(self, features, hook_dummy_table_variables):
+    """Add embedding features, optionally add hook to intercept gradient."""
+    if self._ctx.embedding_config:
+      tpu_embedding_ = self._ctx.embedding_config.tpu_embedding
+      embedding_activations = tpu_embedding_.get_activations()
+      if hook_dummy_table_variables:
+        new_embedding_activations = (
+            tpu_embedding_gradient.hook_dummy_table_variables_to_activations(
+                tpu_embedding_, embedding_activations,
+                self._ctx.embedding_config.dummy_table_variables))
+        features.update(new_embedding_activations)
+      else:
+        features.update(embedding_activations)
+
+  def convert_to_single_tpu_train_step(self, dequeue_fn):
+    """Converts user provided model_fn` as a single train step on TPU.
+
+    The user provided `model_fn` takes input tuple
+    (features, labels) and produces the EstimatorSpec with train_op and loss for
+    train `mode`. This usually represents a single train computation on CPU.
+
+    For TPU training, a train (computation) step is first wrapped in a
+    tf.while_loop control flow to repeat for many times and then replicated to
+    all TPU shards. Besides the input should be taken from TPU infeed rather
+    than input pipeline (input_fn) directly. To fit TPU loop and replicate
+    pattern, the original train computation should be reformed, which is the
+    returned `train_step`.
+
+    Args:
+      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
+        infeed dequeue channel.
+
+    Returns:
+      A tuple of train_fn, host_calls, and captured scaffold_fn. The train_fn
+      representing the train step for TPU.
+    """
+
+    host_call = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+    captured_training_hooks = _CapturedObject()
+
+    def train_step(loss):
+      """Training step function for use inside a while loop."""
+      del loss  # unused; required in function signature.
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
+      self._add_embedding_features(features, True)
+
+      estimator_spec = self._verify_estimator_spec(
+          self._call_model_fn(features, labels))
+      loss, train_op = estimator_spec.loss, estimator_spec.train_op
+
+      if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+        captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
+      else:
+        captured_scaffold_fn.capture(None)
+
+      captured_training_hooks.capture(estimator_spec.training_hooks)
+
+      if self._ctx.embedding_config is None:
+        apply_sparse_grads = []
+      else:
+        tpu_embedding_ = self._ctx.embedding_config.tpu_embedding
+        gradients = (
+            tpu_embedding_gradient.get_gradients_through_dummy_table_variables(
+                tpu_embedding_)
+        )
+        apply_sparse_grads = [
+            tpu_embedding_.generate_send_gradients_op(gradients)
+        ]
+
+      # We must run train_op to update the variables prior to running the
+      # outfeed.
+      with ops.control_dependencies([train_op] + apply_sparse_grads):
+        host_call_outfeed_ops = []
+        if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)  # pylint: disable=protected-access
+            and estimator_spec.host_call is not None):
+          host_call.record({'host_call': estimator_spec.host_call})
+          host_call_outfeed_ops = host_call.create_enqueue_op()
+        with ops.control_dependencies(host_call_outfeed_ops):
+          return array_ops.identity(loss)
+
+    return (train_step, host_call, captured_scaffold_fn,
+            captured_training_hooks)
+
+  def convert_to_single_tpu_eval_step(self, dequeue_fn):
+    """Converts user provided model_fn` as a single eval step on TPU.
+
+    Similar to training, the user provided `model_fn` takes input tuple
+    (features, labels) and produces the TPUEstimatorSpec with eval_metrics for
+    eval `mode`. This usually represents a single evaluation computation on CPU.
+
+    For TPU evaluation, a eval (computation) step is first wrapped in a
+    tf.while_loop control flow to repeat for many times and then replicated to
+    all TPU shards. Besides the input and output are slightly different. Input,
+    features and labels, should be taken from TPU infeed rather than input
+    pipeline (input_fn) directly. Output is managed in two stages.  First, the
+    model outputs as the result of evaluation computation, usually model logits,
+    should be transferred from TPU system to CPU. Then, all model outputs are
+    concatenated first on CPU and sent to the metric_fn for metrics computation.
+    To fit TPU evaluation pattern, the original eval computation should be
+    reformed, which is the returned `eval_step`.
+
+    Args:
+      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
+        infeed dequeue channel.
+
+    Returns:
+      A tuple of eval_fn, host_calls, and captured scaffold_fn. The eval_fn
+      representing the eval step for TPU.
+    """
+    host_calls = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+    captured_eval_hooks = _CapturedObject()
+
+    def eval_step(total_loss):
+      """Evaluation step function for use inside a while loop."""
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
+      self._add_embedding_features(features, False)
+
+      tpu_estimator_spec = self._call_model_fn(features, labels)
+      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+        raise RuntimeError(
+            'estimator_spec used by TPU evaluation must have type'
+            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
+
+      loss = tpu_estimator_spec.loss
+      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+      captured_eval_hooks.capture(tpu_estimator_spec.evaluation_hooks)
+
+      to_record = {}
+      if tpu_estimator_spec.eval_metrics:
+        to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics
+      if tpu_estimator_spec.host_call is not None:
+        # We assume that evaluate won't update global step, so we don't wrap
+        # this host_call.
+        to_record['host_call'] = tpu_estimator_spec.host_call
+      host_calls.record(to_record)
+
+      with ops.control_dependencies(host_calls.create_enqueue_op()):
+        return math_ops.add(total_loss, loss)
+
+    return eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
+
+  def convert_to_single_tpu_predict_step(self, dequeue_fn):
+    """Converts user provided model_fn` as a single predict step on TPU.
+
+    Args:
+      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
+        infeed dequeue channel.
+
+    Returns:
+      A tuple of predict_fn, host_calls, and captured scaffold_fn. The
+      predict_fn representing the predict step for TPU.
+    """
+    host_calls = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+    captured_predict_hooks = _CapturedObject()
+
+    def predict_step(unused_scalar_stopping_signal):
+      """Evaluation step function for use inside a while loop."""
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
+      stopping_signals = inputs.signals()
+
+      assert stopping_signals is not None, (
+          'Internal Error: `signals` is missing.')
+
+      tpu_estimator_spec = self._call_model_fn(
+          features, labels, is_export_mode=False)
+      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+        raise RuntimeError(
+            'estimator_spec used by TPU prediction must have type'
+            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
+
+      self._verify_tpu_spec_predictions(tpu_estimator_spec.predictions)
+
+      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+      captured_predict_hooks.capture(tpu_estimator_spec.prediction_hooks)
+      to_record = {}
+      identity_fn = lambda **kwargs: kwargs
+      to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions]
+      to_record['signals'] = [identity_fn, stopping_signals]
+      if tpu_estimator_spec.host_call is not None:
+        to_record['host_call'] = tpu_estimator_spec.host_call
+      host_calls.record(to_record)
+
+      with ops.control_dependencies(host_calls.create_enqueue_op()):
+        return _StopSignals.as_scalar_stopping_signal(stopping_signals)
+
+    return (predict_step, host_calls, captured_scaffold_fn,
+            captured_predict_hooks)
+
+  def _verify_tpu_spec_predictions(self, predictions):
+    """Validates TPUEstimatorSpec.predictions dict."""
+    # TODO(xiejw): Adds validation for prediction dictionrary.
+    # TODO(xiejw): Adds support for single tensor as predictions.
+    if not isinstance(predictions, dict):
+      raise TypeError('TPUEstimatorSpec.predictions must be dict of Tensors.')
+
+    for (key, tensor) in predictions.items():
+      if tensor.shape.dims[0].value is None:
+        raise ValueError(
+            'The tensor with key ({}) in TPUEstimatorSpec.predictions has '
+            'dynamic shape (should be static). Tensor: {}'.format(key, tensor))
+    return predictions
+
+  def _validate_model_features_and_labels(self, features, labels,
+                                          is_export_mode):
+    """Validates that the features and labels for the model function are valid.
+
+    A valid features/labels object is the one with:
+    - Type: A tensor or any nested structure of tensors supported by TF nest,
+        namely nested dictionary, tuple, namedtuple, or sequence of tensors.
+    - Static shape if is_export_mode is False.
+
+    Args:
+      features: the features that would be input to the model function.
+      labels: the labels that would be input to the model function.
+      is_export_mode: boolean value specifying if in export mode.
+
+    Raises:
+      TypeError: If features/labels are not of the correct type.
+      ValueError: If features/labels have dynamic shape.
+    """
+
+    def validate(obj, obj_name):
+      """Helper validate function."""
+      if is_export_mode or self._ctx.is_running_on_cpu(is_export_mode):
+        return
+      if isinstance(obj, ops.Tensor):
+        if not obj.get_shape().is_fully_defined():
+          raise ValueError(
+              'The {} to the model returned by input_fn must have static shape.'
+              ' Tensor: {}'.format(obj_name, obj))
+      else:
+        for tensor in data_nest.flatten(obj):
+          if not tensor.get_shape().is_fully_defined():
+            raise ValueError(
+                ('The {} to the model returned by input_fn must have static '
+                 'shape. Tensor: {}').format(obj_name, tensor))
+
+    validate(features, 'features')
+    if labels is not None:
+      validate(labels, 'labels')
+
+  def _call_model_fn(self, features, labels, is_export_mode=False):
+    """Calls the model_fn with required parameters."""
+    self._validate_model_features_and_labels(features, labels, is_export_mode)
+    model_fn_args = function_utils.fn_args(self._model_fn)
+    kwargs = {}
+
+    # Makes deep copy with `config` and params` in case user mutates them.
+    config = copy.deepcopy(self._config)
+    params = copy.deepcopy(self._params)
+
+    if 'labels' in model_fn_args:
+      kwargs['labels'] = labels
+    elif labels is not None:
+      raise ValueError(
+          'model_fn does not take labels, but input_fn returns labels.')
+    if 'mode' in model_fn_args:
+      kwargs['mode'] = self._ctx.mode
+    if 'config' in model_fn_args:
+      kwargs['config'] = config
+    if 'params' in model_fn_args:
+      kwargs['params'] = params
+
+    if 'params' not in model_fn_args:
+      raise ValueError('model_fn ({}) does not include params argument, '
+                       'required by TPUEstimator to pass batch size as '
+                       'params[\'batch_size\']'.format(self._model_fn))
+
+    if is_export_mode:
+      batch_size_for_model_fn = None
+    else:
+      batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
+
+    if batch_size_for_model_fn is not None:
+      _add_item_to_params(params, _BATCH_SIZE_KEY, batch_size_for_model_fn)
+
+    running_on_cpu = self._ctx.is_running_on_cpu(is_export_mode)
+    _add_item_to_params(params, _USE_TPU_KEY, not running_on_cpu)
+
+    if not running_on_cpu:
+      user_context = tpu_context.TPUContext(
+          internal_ctx=self._ctx, call_from_input_fn=False)
+      _add_item_to_params(params, _CTX_KEY, user_context)
+
+    estimator_spec = self._model_fn(features=features, **kwargs)
+    if (running_on_cpu and
+        isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)):  # pylint: disable=protected-access
+      # The estimator_spec will be passed to `Estimator` directly, which expects
+      # type `EstimatorSpec`.
+      return estimator_spec.as_estimator_spec()
+    else:
+      return estimator_spec
+
+  def _verify_estimator_spec(self, estimator_spec):
+    """Validates the estimator_spec."""
+    if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+      return estimator_spec
+
+    err_msg = '{} returned by EstimatorSpec is not supported in TPUEstimator.'
+    if estimator_spec.training_chief_hooks:
+      raise ValueError(
+          err_msg.format('training_chief_hooks') + 'If you want' +
+          ' to pass training hooks, please pass via training_hooks.')
+
+    if estimator_spec.scaffold:
+      logging.warning('EstimatorSpec.Scaffold is ignored by TPU train/eval. '
+                      'Please use TPUEstimatorSpec.')
+    return estimator_spec
+
+
+class _OutfeedHostCall(object):
+  """Support for `eval_metrics` and `host_call` in TPUEstimatorSpec."""
+
+  def __init__(self, ctx):
+    self._ctx = ctx
+    self._names = []
+    # All of these are dictionaries of lists keyed on the name.
+    self._host_fns = {}
+    self._tensor_keys = collections.defaultdict(list)
+    self._tensors = collections.defaultdict(list)
+    self._tensor_dtypes = collections.defaultdict(list)
+    self._tensor_shapes = collections.defaultdict(list)
+
+  @staticmethod
+  def validate(host_calls):
+    """Validates the `eval_metrics` and `host_call` in `TPUEstimatorSpec`."""
+
+    for name, host_call in host_calls.items():
+      if not isinstance(host_call, (tuple, list)):
+        raise ValueError('{} should be tuple or list'.format(name))
+      if len(host_call) != 2:
+        raise ValueError('{} should have two elements.'.format(name))
+      if not callable(host_call[0]):
+        raise TypeError('{}[0] should be callable.'.format(name))
+      if not isinstance(host_call[1], (tuple, list, dict)):
+        raise ValueError('{}[1] should be tuple or list, or dict.'.format(name))
+
+      if isinstance(host_call[1], (tuple, list)):
+        fullargspec = tf_inspect.getfullargspec(host_call[0])
+        fn_args = function_utils.fn_args(host_call[0])
+        # wrapped_hostcall_with_global_step uses varargs, so we allow that.
+        if fullargspec.varargs is None and len(host_call[1]) != len(fn_args):
+          raise RuntimeError(
+              'In TPUEstimatorSpec.{}, length of tensors {} does not match '
+              'method args of the function, which takes {}.'.format(
+                  name, len(host_call[1]), len(fn_args)))
+
+  @staticmethod
+  def create_cpu_hostcall(host_calls):
+    """Runs on the host_call on CPU instead of TPU when use_tpu=False."""
+
+    _OutfeedHostCall.validate(host_calls)
+    ret = {}
+    for name, host_call in host_calls.items():
+      host_fn, tensors = host_call
+      if isinstance(tensors, (tuple, list)):
+        ret[name] = host_fn(*tensors)
+      else:
+        # Must be dict.
+        try:
+          ret[name] = host_fn(**tensors)
+        except TypeError as e:
+          logging.warning(
+              'Exception while calling %s: %s. It is likely the tensors '
+              '(%s[1]) do not match the '
+              'function\'s arguments', name, e, name)
+          raise
+    return ret
+
+  def record(self, host_calls):
+    """Records the host_call structure."""
+
+    for name, host_call in host_calls.items():
+      host_fn, tensor_list_or_dict = host_call
+      self._names.append(name)
+      self._host_fns[name] = host_fn
+
+      if isinstance(tensor_list_or_dict, dict):
+        for (key, tensor) in six.iteritems(tensor_list_or_dict):
+          self._tensor_keys[name].append(key)
+          self._tensors[name].append(tensor)
+          self._tensor_dtypes[name].append(tensor.dtype)
+          self._tensor_shapes[name].append(tensor.shape)
+      else:
+        # List or tuple.
+        self._tensor_keys[name] = None
+        for tensor in tensor_list_or_dict:
+          self._tensors[name].append(tensor)
+          self._tensor_dtypes[name].append(tensor.dtype)
+          self._tensor_shapes[name].append(tensor.shape)
+
+  def create_enqueue_op(self):
+    """Create the op to enqueue the recorded host_calls.
+
+    Returns:
+      A list of enqueue ops, which is empty if there are no host calls.
+    """
+    if not self._names:
+      return []
+
+    tensors = []
+    # TODO(jhseu): Consider deduping tensors.
+    for name in self._names:
+      tensors.extend(self._tensors[name])
+
+    with ops.device(tpu.core(0)):
+      return [tpu_ops.outfeed_enqueue_tuple(tensors)]
+
+  def create_tpu_hostcall(self):
+    """Sends the tensors through outfeed and runs the host_fn on CPU.
+
+    The tensors are concatenated along dimension 0 to form a global tensor
+    across all shards. The concatenated function is passed to the host_fn and
+    executed on the first host.
+
+    Returns:
+      A dictionary mapping name to the return type of the host_call by that
+      name.
+
+    Raises:
+      RuntimeError: If outfeed tensor is scalar.
+    """
+    if not self._names:
+      return {}
+
+    ret = {}
+    # For each i, dequeue_ops[i] is a list containing the tensors from all
+    # shards. This list is concatenated later.
+    dequeue_ops = []
+    tensor_dtypes = []
+    tensor_shapes = []
+    for name in self._names:
+      for _ in self._tensors[name]:
+        dequeue_ops.append([])
+      for dtype in self._tensor_dtypes[name]:
+        tensor_dtypes.append(dtype)
+      for shape in self._tensor_shapes[name]:
+        tensor_shapes.append(shape)
+
+    # Outfeed ops execute on each replica's first logical core. Note: we must
+    # constraint it such that we have at most one outfeed dequeue and enqueue
+    # per replica.
+    for i in xrange(self._ctx.num_replicas):
+      host_device, ordinal_id = self._ctx.device_for_replica(i)
+      with ops.device(host_device):
+        outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
+            dtypes=tensor_dtypes,
+            shapes=tensor_shapes,
+            device_ordinal=ordinal_id)
+        for j, item in enumerate(outfeed_tensors):
+          dequeue_ops[j].append(item)
+
+    # Deconstruct dequeue ops.
+    flat_dequeue_ops = []
+    for l in dequeue_ops:
+      flat_dequeue_ops.extend(l)
+
+    dequeue_ops_by_name = {}
+    pos = 0
+    for name in self._names:
+      dequeue_ops_by_name[name] = dequeue_ops[pos:pos +
+                                              len(self._tensors[name])]
+      pos += len(self._tensors[name])
+
+    def _call_host_fn(fn, *args, **kw):
+      context = CatchInvalidHostcallFunctions()
+      context.Enter()
+      result = fn(*args, **kw)
+      context.Exit()
+      context.ExitResult(result)
+      return result
+
+    # It is assumed evaluation always happens on single host TPU system. So,
+    # place all ops on tpu host if possible.
+    #
+    # TODO(jhseu): Evaluate whether this is right for summaries.
+    with ops.device(self._ctx.tpu_host_placement_function(replica_id=0)):
+      for name in self._names:
+        dequeue_ops = dequeue_ops_by_name[name]
+        for i, item in enumerate(dequeue_ops):
+          if dequeue_ops[i][0].shape.ndims == 0:
+            raise RuntimeError(
+                'All tensors outfed from TPU should preserve batch size '
+                'dimension, but got scalar {}'.format(dequeue_ops[i][0]))
+          # TODO(xiejw): Make the specification of the outfeed combinaton
+          # function more explicit and well-documented.  We may want to give the
+          # user the option of concatenating along any axis.
+          if (self._ctx.config.tpu_config.per_host_input_for_training is
+              tpu_config.InputPipelineConfig.BROADCAST):
+            # If the infeed is in BROADCAST mode (each core recieving the same
+            # input), then we assume that the cores also produce identical
+            # copies of the same output, and we simply take the output from
+            # the first core.  This mode is used by Mesh-TensorFlow.
+            with ops.control_dependencies(dequeue_ops[i]):
+              dequeue_ops[i] = array_ops.identity(dequeue_ops[i][0])
+          else:
+            # Assume that the input has been batch-split and that axis 0 of the
+            # output tensors represents the batch size.  Concatenate along
+            # the axis 0 to re-combine the batch.
+            dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0)
+
+        if self._tensor_keys[name] is not None:
+          # The user-provided eval_metrics[1] is a dict.
+          dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops))
+          try:
+            ret[name] = _call_host_fn(self._host_fns[name], **dequeue_ops)
+          except TypeError as e:
+            logging.warning(
+                'Exception while calling %s: %s. It is likely the tensors '
+                '(%s[1]) do not match the '
+                'function\'s arguments', name, e, name)
+            raise
+        else:
+          ret[name] = _call_host_fn(self._host_fns[name], *dequeue_ops)
+
+    # force all dequeue operations to be run if not consumed by the host calls
+    ret['__force_dequeue'] = control_flow_ops.group(*flat_dequeue_ops)
+    return ret
+
+
+class _OutfeedHostCallHook(session_run_hook.SessionRunHook):
+  """Hook to run host calls when use_tpu=False."""
+
+  def __init__(self, tensors):
+    self._tensors = tensors
+
+  def begin(self):
+    # We duplicate this code from the TPUInfeedOutfeedSessionHook rather than
+    # create a separate hook to guarantee execution order, because summaries
+    # need to be initialized before the outfeed thread starts.
+    # TODO(jhseu): Make a wrapper hook instead?
+    self._init_ops = contrib_summary.summary_writer_initializer_op()
+    # Get all the writer resources from the initializer, so we know what to
+    # flush.
+    self._finalize_ops = []
+    for op in self._init_ops:
+      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
+
+  def after_create_session(self, session, coord):
+    session.run(self._init_ops)
+
+  def before_run(self, run_context):
+    return basic_session_run_hooks.SessionRunArgs(self._tensors)
+
+  def end(self, session):
+    session.run(self._finalize_ops)
+
+
+class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
+  """Calculate and report global_step/sec and examples/sec during runtime."""
+
+  def __init__(self,
+               batch_size,
+               every_n_steps=100,
+               every_n_secs=None,
+               output_dir=None,
+               summary_writer=None):
+    self._batch_size = batch_size
+    super(ExamplesPerSecondHook, self).__init__(
+        every_n_steps=every_n_steps,
+        every_n_secs=every_n_secs,
+        output_dir=output_dir,
+        summary_writer=summary_writer)
+
+  def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
+    global_step_per_sec = elapsed_steps / elapsed_time
+    examples_per_sec = self._batch_size * global_step_per_sec
+    if self._summary_writer is not None:
+      global_step_summary = Summary(value=[
+          Summary.Value(tag='global_step/sec', simple_value=global_step_per_sec)
+      ])
+      example_summary = Summary(value=[
+          Summary.Value(tag='examples/sec', simple_value=examples_per_sec)
+      ])
+      self._summary_writer.add_summary(global_step_summary, global_step)
+      self._summary_writer.add_summary(example_summary, global_step)
+    logging.info('global_step/sec: %g', global_step_per_sec)
+    logging.info('examples/sec: %g', examples_per_sec)
+
+
+class InstallSignalHandlerHook(session_run_hook.SessionRunHook):
+  """Change SIGINT (CTRL^C) handler to force quit the process.
+
+  The default behavior often results in hanging processes.
+  The original handler is restored after training/evaluation.
+  """
+
+  def __init__(self):
+    self._signal_fn = signal.getsignal(signal.SIGINT)
+
+  def before_run(self, run_context):
+    signal.signal(signal.SIGINT, signal.SIG_DFL)
+
+  def end(self, session):
+    signal.signal(signal.SIGINT, self._signal_fn)
+
+
+class TPUEstimator(estimator_lib.Estimator):
+  """Estimator with TPU support.
+
+  TPUEstimator also supports training on CPU and GPU. You don't need to define
+  a separate `tf.estimator.Estimator`.
+
+  TPUEstimator handles many of the details of running on TPU devices, such as
+  replicating inputs and models for each core, and returning to host
+  periodically to run hooks.
+
+  TPUEstimator transforms a global batch size in params to a per-shard batch
+  size when calling the `input_fn` and `model_fn`. Users should specify
+  global batch size in constructor, and then get the batch size for each shard
+  in `input_fn` and `model_fn` by `params['batch_size']`.
+
+  - For training, `model_fn` gets per-core batch size; `input_fn` may get
+    per-core or per-host batch size depending on `per_host_input_for_training`
+    in `TPUConfig` (See docstring for TPUConfig for details).
+
+  - For evaluation and prediction, `model_fn` gets per-core batch size and
+    `input_fn` get per-host batch size.
+
+  Evaluation
+  ==========
+
+  `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics`
+  for TPU evaluation. If eval_on_tpu is False, the evaluation will execute on
+  CPU or GPU; in this case the following discussion on TPU evaluation does not
+  apply.
+
+  `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where
+  `tensors` could be a list of any nested structure of `Tensor`s (See
+  `TPUEstimatorSpec` for details).  `metric_fn` takes the `tensors` and returns
+  a dict from metric string name to the result of calling a metric function,
+  namely a `(metric_tensor, update_op)` tuple.
+
+  One can set `use_tpu` to `False` for testing. All training, evaluation, and
+  predict will be executed on CPU. `input_fn` and `model_fn` will receive
+  `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`.
+
+  Current limitations:
+  --------------------
+
+  1. TPU evaluation only works on a single host (one TPU worker) except
+     BROADCAST mode.
+
+  2. `input_fn` for evaluation should **NOT** raise an end-of-input exception
+     (`OutOfRangeError` or `StopIteration`). And all evaluation steps and all
+     batches should have the same size.
+
+  Example (MNIST):
+  ----------------
+
+  ```
+  # The metric Fn which runs on CPU.
+  def metric_fn(labels, logits):
+    predictions = tf.argmax(logits, 1)
+    return {
+      'accuracy': tf.metrics.precision(
+          labels=labels, predictions=predictions),
+    }
+
+  # Your model Fn which runs on TPU (eval_metrics is list in this example)
+  def model_fn(features, labels, mode, config, params):
+    ...
+    logits = ...
+
+    if mode = tf.estimator.ModeKeys.EVAL:
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode,
+          loss=loss,
+          eval_metrics=(metric_fn, [labels, logits]))
+
+  # or specify the eval_metrics tensors as dict.
+  def model_fn(features, labels, mode, config, params):
+    ...
+    final_layer_output = ...
+
+    if mode = tf.estimator.ModeKeys.EVAL:
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode,
+          loss=loss,
+          eval_metrics=(metric_fn, {
+              'labels': labels,
+              'logits': final_layer_output,
+          }))
+  ```
+
+  Prediction
+  ==========
+
+  Prediction on TPU is an experimental feature to support large batch inference.
+  It is not designed for latency-critical system. In addition, due to some
+  usability issues, for prediction with small dataset, CPU `.predict`, i.e.,
+  creating a new `TPUEstimator` instance with `use_tpu=False`, might be more
+  convenient.
+
+  Note: In contrast to TPU training/evaluation, the `input_fn` for prediction
+  *should* raise an end-of-input exception (`OutOfRangeError` or
+  `StopIteration`), which serves as the stopping signal to `TPUEstimator`. To be
+  precise, the ops created by `input_fn` produce one batch of the data.
+  The `predict()` API processes one batch at a time. When reaching the end of
+  the data source, an end-of-input exception should be raised by one of these
+  operations. The user usually does not need to do this manually. As long as the
+  dataset is not repeated forever, the `tf.data` API will raise an end-of-input
+  exception automatically after the last batch has been produced.
+
+  Note: Estimator.predict returns a Python generator. Please consume all the
+  data from the generator so that TPUEstimator can shutdown the TPU system
+  properly for user.
+
+  Current limitations:
+  --------------------
+  1. TPU prediction only works on a single host (one TPU worker).
+
+  2. `input_fn` must return a `Dataset` instance rather than `features`. In
+  fact, .train() and .evaluate() also support Dataset as return value.
+
+  Example (MNIST):
+  ----------------
+  ```
+  height = 32
+  width = 32
+  total_examples = 100
+
+  def predict_input_fn(params):
+    batch_size = params['batch_size']
+
+    images = tf.random_uniform(
+        [total_examples, height, width, 3], minval=-1, maxval=1)
+
+    dataset = tf.data.Dataset.from_tensor_slices(images)
+    dataset = dataset.map(lambda images: {'image': images})
+
+    dataset = dataset.batch(batch_size)
+    return dataset
+
+  def model_fn(features, labels, params, mode):
+     # Generate predictions, called 'output', from features['image']
+
+    if mode == tf.estimator.ModeKeys.PREDICT:
+      return tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode,
+          predictions={
+              'predictions': output,
+              'is_padding': features['is_padding']
+          })
+
+  tpu_est = TPUEstimator(
+      model_fn=model_fn,
+      ...,
+      predict_batch_size=16)
+
+  # Fully consume the generator so that TPUEstimator can shutdown the TPU
+  # system.
+  for item in tpu_est.predict(input_fn=input_fn):
+    # Filter out item if the `is_padding` is 1.
+    # Process the 'predictions'
+  ```
+
+  Exporting
+  =========
+
+  `export_savedmodel` exports 2 metagraphs, one with `tag_constants.SERVING`,
+  and another with `tag_constants.SERVING` and `tag_constants.TPU`.
+  At serving time, these tags are used to select metagraph to load.
+
+  Before running the graph on TPU, TPU system needs to be initialized. If
+  TensorFlow Serving model-server is used, this is done automatically. If
+  not, please call `session.run(tpu.initialize_system())`.
+
+  `tpu.outside_compilation` can be used to wrap TPU incompatible ops in
+  `model_fn`.
+
+  Example:
+  ----------------
+
+  ```
+  def model_fn(features, labels, mode, config, params):
+    ...
+    logits = ...
+    export_outputs = {
+      'logits': export_output_lib.PredictOutput(
+        {'logits': logits})
+    }
+
+    def host_call(logits):
+      class_ids = math_ops.argmax(logits)
+      classes = string_ops.as_string(class_ids)
+      export_outputs['classes'] =
+        export_output_lib.ClassificationOutput(classes=classes)
+
+    tpu.outside_compilation(host_call, logits)
+
+    ...
+  ```
+
+  """
+
+  def __init__(self,
+               model_fn=None,
+               model_dir=None,
+               config=None,
+               params=None,
+               use_tpu=True,
+               train_batch_size=None,
+               eval_batch_size=None,
+               predict_batch_size=None,
+               batch_axis=None,
+               eval_on_tpu=True,
+               export_to_tpu=True,
+               export_to_cpu=True,
+               warm_start_from=None,
+               experimental_exported_model_uses_all_cores=False,
+               experimental_export_device_assignment=False,
+               experimental_embedding_config_spec=None):
+    """Constructs an `TPUEstimator` instance.
+
+    Args:
+      model_fn: Model function as required by `Estimator` which returns
+        EstimatorSpec or TPUEstimatorSpec. `training_hooks`, 'evaluation_hooks',
+        and `prediction_hooks` must not capure any TPU Tensor inside the
+        model_fn.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model. If `None`, the model_dir in
+        `config` will be used if set. If both are set, they must be same. If
+        both are `None`, a temporary directory will be used.
+      config: An `tpu_config.RunConfig` configuration object. Cannot be `None`.
+      params: An optional `dict` of hyper parameters that will be passed into
+        `input_fn` and `model_fn`.  Keys are names of parameters, values are
+        basic python types. There are reserved keys for `TPUEstimator`,
+        including 'batch_size'.
+      use_tpu: A bool indicating whether TPU support is enabled. Currently, -
+        TPU training and evaluation respect this bit, but eval_on_tpu can
+        override execution of eval. See below. - Predict still happens on CPU.
+      train_batch_size: An int representing the global training batch size.
+        TPUEstimator transforms this global batch size to a per-shard batch
+        size, as params['batch_size'], when calling `input_fn` and `model_fn`.
+        Cannot be `None` if `use_tpu` is `True`. Must be divisible by total
+        number of replicas.
+      eval_batch_size: An int representing evaluation batch size. Must be
+        divisible by total number of replicas.
+      predict_batch_size: An int representing the prediction batch size. Must be
+        divisible by total number of replicas.
+      batch_axis: A python tuple of int values describing how each tensor
+        produced by the Estimator `input_fn` should be split across the TPU
+        compute shards. For example, if your input_fn produced (images, labels)
+        where the images tensor is in `HWCN` format, your shard dimensions would
+        be [3, 0], where 3 corresponds to the `N` dimension of your images
+        Tensor, and 0 corresponds to the dimension along which to split the
+        labels to match up with the corresponding images. If None is supplied,
+        and per_host_input_for_training is True, batches will be sharded based
+        on the major dimension. If tpu_config.per_host_input_for_training is
+        False or `PER_HOST_V2`, batch_axis is ignored.
+      eval_on_tpu: If False, evaluation runs on CPU or GPU. In this case, the
+        model_fn must return `EstimatorSpec` when called with `mode` as `EVAL`.
+      export_to_tpu: If True, `export_savedmodel()` exports a metagraph for
+        serving on TPU. Note that unsupported export modes such as EVAL will be
+        ignored. For those modes, only a CPU model will be exported.
+        Currently, export_to_tpu only supports PREDICT.
+      export_to_cpu: If True, `export_savedmodel()` exports a metagraph for
+        serving on CPU.
+      warm_start_from: Optional string filepath to a checkpoint or SavedModel to
+        warm-start from, or a `tf.estimator.WarmStartSettings` object to fully
+        configure warm-starting.  If the string filepath is provided instead of
+        a `WarmStartSettings`, then all variables are warm-started, and it is
+        assumed that vocabularies and Tensor names are unchanged.
+      experimental_exported_model_uses_all_cores: Whether to round-robin among
+        all cores visible to the host which is serving the saved model, or to
+        use a single core. This is a temporary flag to enable using all TPU
+        cores for inference with TPUPartitionedCall(). Once outside compilation
+        is supported in TPUPartitionedCall(), this flag will be enabled by
+        default.
+      experimental_export_device_assignment: Whether to include the device
+        assignment in the exported model. Doing so is useful in case of model
+        parallel inference but will tie the exported model to the TPU topology
+        used to export the model.
+      experimental_embedding_config_spec: Optional EmbeddingConfigSpec instance
+        to support using TPU embedding. IT IS STILL WORK IN PROGRESS, SO PLEASE
+        DO NOT USE.
+
+    Raises:
+      ValueError: `params` has reserved keys already.
+    """
+    if config is None or not isinstance(config, tpu_config.RunConfig):
+      raise ValueError(
+          '`config` must be provided with type `tpu_config.RunConfig`')
+
+    if params is not None and any(k in params for k in _RESERVED_PARAMS_KEYS):
+      raise ValueError('{} are reserved keys but existed in params {}.'.format(
+          _RESERVED_PARAMS_KEYS, params))
+
+    if use_tpu:
+      # Perform some very basic validations. More validations will be found in
+      # _InternalTPUContext.
+      if train_batch_size is None:
+        raise ValueError('`train_batch_size` cannot be `None`')
+      util_lib.check_positive_integer(train_batch_size, 'train_batch_size')
+
+      if (config.tpu_config.per_host_input_for_training is
+          tpu_config.InputPipelineConfig.PER_SHARD_V1 and
+          config.tpu_config.num_cores_per_replica):
+        raise ValueError(
+            'Model parallelism only supports per host input for training. '
+            'Please adjust TPURunconfig.per_host_input_for_training.')
+
+      if eval_batch_size is not None:
+        util_lib.check_positive_integer(eval_batch_size, 'eval_batch_size')
+
+      if predict_batch_size is not None:
+        util_lib.check_positive_integer(predict_batch_size,
+                                        'predict_batch_size')
+
+    # Verifies the model_fn signature according to Estimator framework.
+    estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
+    # We cannot store config and params in this constructor as parent
+    # constructor might change them, such as assigning a temp dir for
+    # config.model_dir.
+    model_function = self._augment_model_fn(model_fn, batch_axis)
+
+    # Overwrite log_step_count_steps to disable TensorLoggingHook and
+    # StepCounterHook from being created in Estimator. TPUEstimator already
+    # added equivalent hooks in _augment_model_fn above.
+    self._log_every_n_steps = config.log_step_count_steps
+    config = config.replace(log_step_count_steps=None)
+
+    # Passing non-None params as wrapped model_fn has it.
+    params = params or {}
+    super(TPUEstimator, self).__init__(
+        model_fn=model_function,
+        model_dir=model_dir,
+        config=config,
+        params=params,
+        warm_start_from=warm_start_from)
+    self._iterations_per_training_loop = (
+        self._config.tpu_config.iterations_per_loop)
+
+    # All properties passed to _InternalTPUContext are immutable.
+    # pylint: disable=protected-access
+    self._ctx = tpu_context._get_tpu_context(
+        self._config, train_batch_size, eval_batch_size, predict_batch_size,
+        use_tpu, eval_on_tpu, experimental_embedding_config_spec)
+
+    self._export_to_cpu = export_to_cpu
+    self._export_to_tpu = export_to_tpu
+    self._experimental_exported_model_uses_all_cores = (
+        experimental_exported_model_uses_all_cores)
+    self._experimental_export_device_assignment = (
+        experimental_export_device_assignment)
+    if (experimental_exported_model_uses_all_cores and
+        experimental_export_device_assignment):
+      raise ValueError('experimental_exported_model_uses_all_cores and '
+                       'experimental_export_device_assignment is not supported '
+                       'at the same time.')
+
+    self._is_input_fn_invoked = None
+    self._rendezvous = {}
+
+  def _add_meta_graph_for_mode(self,
+                               builder,
+                               input_receiver_fn_map,
+                               checkpoint_path,
+                               save_variables=True,
+                               mode=model_fn_lib.ModeKeys.PREDICT,
+                               export_tags=None,
+                               check_variables=True):
+    if self._export_to_tpu and mode != model_fn_lib.ModeKeys.PREDICT:
+      logging.warning('TPUEstimator only handles mode PREDICT for exporting '
+                      'when `export_to_tpu` is `True`; Mode {} will be ignored '
+                      'for TPU.'.format(mode))
+
+    if not self._export_to_cpu and not self._export_to_tpu:
+      raise ValueError('One of export_to_cpu and export_to_tpu must be true.')
+
+    if self._export_to_cpu:
+      (super(TPUEstimator, self)._add_meta_graph_for_mode(
+          builder,
+          input_receiver_fn_map,
+          checkpoint_path,
+          save_variables,
+          mode=mode,
+          export_tags=export_tags,
+          check_variables=check_variables))
+
+    if self._export_to_tpu and mode == model_fn_lib.ModeKeys.PREDICT:
+      input_receiver_fn_map = {
+          _REWRITE_FOR_INFERENCE_MODE: input_receiver_fn_map[mode]
+      }
+      export_tags = [tag_constants.SERVING, tag_constants.TPU]
+      mode = _REWRITE_FOR_INFERENCE_MODE
+
+      # See b/110052256 for why `check_variables` is `False`.
+      if not self._export_to_cpu:
+        check_variables = save_variables = True
+      else:
+        check_variables = save_variables = False
+      (super(TPUEstimator, self)._add_meta_graph_for_mode(
+          builder,
+          input_receiver_fn_map,
+          checkpoint_path,
+          save_variables=save_variables,
+          mode=mode,
+          export_tags=export_tags,
+          check_variables=check_variables))
+
+  def _call_model_fn(self, features, labels, mode, config):
+    if mode == _REWRITE_FOR_INFERENCE_MODE:
+      return self._call_model_fn_for_inference(features, labels, mode, config)
+    else:
+      return super(TPUEstimator, self)._call_model_fn(features, labels, mode,
+                                                      config)
+
+  def _call_model_fn_for_inference(self, features, labels, mode, config):
+    """Wraps `_call_model_fn` for `export_savedmodel`."""
+    if mode != _REWRITE_FOR_INFERENCE_MODE:
+      raise ValueError('mode must be {}; '
+                       'got {}.'.format(_REWRITE_FOR_INFERENCE_MODE, mode))
+
+    computation, capture = self._build_computation_for_inference(
+        features, labels, mode, config)
+    tensors = call_computation(
+        computation,
+        experimental_exported_model_uses_all_cores=self
+        ._experimental_exported_model_uses_all_cores)
+    estimator_spec, export_outputs_dict, predictions_dict, none_indices = (
+        capture.get())
+    predictions_list = tensors[:len(predictions_dict)]
+    export_outputs_list_without_none = tensors[len(predictions_dict):]
+
+    # Reinsert `None`s which we've taken out in
+    # `_build_computation_for_inference()`.
+    export_outputs_list = []
+    while none_indices or export_outputs_list_without_none:
+      if none_indices and none_indices[0] == len(export_outputs_list):
+        export_outputs_list.append(None)
+        none_indices.pop(0)
+      else:
+        export_outputs_list.append(export_outputs_list_without_none.pop(0))
+
+    # Reconstruct `export_outputs` with updated tensors.
+    new_export_outputs_dict = nest.pack_sequence_as(export_outputs_dict,
+                                                    export_outputs_list)
+    export_outputs = estimator_spec.export_outputs
+    new_export_outputs = collections.OrderedDict(
+        (k, _clone_export_output_with_tensors(export_outputs[k], v))
+        for k, v in six.iteritems(new_export_outputs_dict))
+    # Reconstruct `predictions` with updated tensors.
+    new_predictions = nest.pack_sequence_as(predictions_dict, predictions_list)
+    if (len(new_predictions) == 1 and
+        _KEY_WHEN_PREDICTIONS_IS_A_TENSOR in new_predictions):
+      new_predictions = new_predictions[_KEY_WHEN_PREDICTIONS_IS_A_TENSOR]
+
+    return estimator_spec._replace(
+        export_outputs=new_export_outputs, predictions=new_predictions)
+
+  def _build_computation_for_inference(self, features, labels, mode, config):
+    capture = _CapturedObject()
+
+    def computation():
+      """Computation to be passed to `TPUPartitionedCall()`."""
+      tpu_computation, tpu_capture = self._build_tpu_computation_for_inference(
+          features, labels, mode, config)
+
+      if self._experimental_export_device_assignment:
+        # Export the device assignment as part of the model. This is useful for
+        # model parallel usecases where the model relies on the mapping between
+        # logical and physical devices.
+        with self._ctx.with_mode(mode) as ctx:
+          device_assignment = ctx.device_assignment
+      else:
+        device_assignment = None
+
+      if self._experimental_exported_model_uses_all_cores:
+        tensors_on_cpu = tpu.rewrite(
+            tpu_computation, device_assignment=device_assignment)
+      else:
+        tensors_on_cpu = tpu.rewrite_for_inference(
+            tpu_computation, device_assignment=device_assignment)
+
+      (estimator_spec, export_outputs_dict, export_outputs_list,
+       predictions_dict) = (
+           tpu_capture.get())
+      predictions_list = tensors_on_cpu[:len(predictions_dict)]
+      export_outputs_tpu_on_cpu_list = tensors_on_cpu[len(predictions_dict):]
+
+      # Reconstruct tensors used in export_outputs, with TPU tensors replaced
+      # with their CPU counterpart returned from `rewrite_for_inference()`.
+      # `function.Defun()` does not like `None`s in return values, so we leave
+      # `None`s out but record their positions for later reconstruction.
+      export_outputs_list_without_none = []
+      none_indices = []
+      for i, t in enumerate(export_outputs_list):
+        if t is None:
+          none_indices.append(i)
+        else:
+          export_outputs_list_without_none.append(
+              export_outputs_tpu_on_cpu_list.pop(0))
+
+      capture.capture((estimator_spec, export_outputs_dict, predictions_dict,
+                       none_indices))
+      return predictions_list + export_outputs_list_without_none
+
+    return computation, capture
+
+  def _build_tpu_computation_for_inference(self, features, labels, mode,
+                                           config):
+    capture = _CapturedObject()
+
+    def computation():
+      """Compute tpu tensors used in export_outputs.
+
+      Passed to rewrite_for_inference so that model_fn will be called under
+      the rewriting contexts. Only tpu tensors are returned, but export_outputs
+      and scaffold are captured.
+
+      Returns:
+         A list of Tensors used in export_outputs and not marked for
+         outside_compilation.
+      """
+      # We should only call model fn once and it should be inside `computation`
+      # so that building the graph will happen under `rewrite_for_inference`.
+      mode = model_fn_lib.ModeKeys.PREDICT
+      estimator_spec = self._call_model_fn(features, labels, mode, config)
+
+      # We pick the TPU tensors out from `export_output` and later return them
+      # from `computation` for rewriting.
+      export_outputs_dict = collections.OrderedDict(
+          (k, _export_output_to_tensors(v))
+          for k, v in six.iteritems(estimator_spec.export_outputs))
+      export_outputs_list = nest.flatten(export_outputs_dict)
+      export_outputs_tpu_list = [
+          t for t in export_outputs_list if t is not None
+      ]
+
+      if isinstance(estimator_spec.predictions, dict):
+        predictions_dict = collections.OrderedDict(
+            (k, v) for k, v in six.iteritems(estimator_spec.predictions))
+      else:
+        predictions_dict = {
+            _KEY_WHEN_PREDICTIONS_IS_A_TENSOR: estimator_spec.predictions
+        }
+      predictions_list = nest.flatten(predictions_dict)
+
+      # We cannot return everything we want through the return values, so
+      # capture the rest here for later use.
+      capture.capture((estimator_spec, export_outputs_dict, export_outputs_list,
+                       predictions_dict))
+      return predictions_list + export_outputs_tpu_list
+
+    return computation, capture
+
+  def _create_global_step(self, graph):
+    """Creates a global step suitable for TPUs.
+
+    Args:
+      graph: The graph in which to create the global step.
+
+    Returns:
+      A global step `Tensor`.
+
+    Raises:
+      ValueError: if the global step tensor is already defined.
+    """
+    return _create_global_step(graph)
+
+  def _convert_train_steps_to_hooks(self, steps, max_steps):
+    with self._ctx.with_mode(model_fn_lib.ModeKeys.TRAIN) as ctx:
+      if ctx.is_running_on_cpu():
+        return super(TPUEstimator, self)._convert_train_steps_to_hooks(
+            steps, max_steps)
+
+    # On TPU.
+    if steps is None and max_steps is None:
+      raise ValueError(
+          'For TPU training, one of `steps` or `max_steps` must be set. '
+          'Cannot be both `None`.')
+
+    # Estimator.train has explicit positiveness check.
+    if steps is not None:
+      util_lib.check_positive_integer(steps, 'Train steps')
+    if max_steps is not None:
+      util_lib.check_positive_integer(max_steps, 'Train max_steps')
+
+    return [
+        _TPUStopAtStepHook(self._iterations_per_training_loop, steps, max_steps)
+    ]
+
+  def _convert_eval_steps_to_hooks(self, steps):
+    with self._ctx.with_mode(model_fn_lib.ModeKeys.EVAL) as ctx:
+      if ctx.is_running_on_cpu():
+        return super(TPUEstimator, self)._convert_eval_steps_to_hooks(steps)
+
+    if steps is None:
+      raise ValueError('Evaluate `steps` must be set on TPU. Cannot be `None`.')
+
+    util_lib.check_positive_integer(steps, 'Eval steps')
+
+    return [
+        evaluation._StopAfterNEvalsHook(  # pylint: disable=protected-access
+            num_evals=steps),
+        _SetEvalIterationsHook(steps)
+    ]
+
+  def _call_input_fn(self, input_fn, mode):
+    """Calls the input function.
+
+    Args:
+      input_fn: The input function.
+      mode: ModeKeys
+
+    Returns:
+      In TPU mode, returns an input_fn to be called later in model_fn.
+      Otherwise, calls the input_fn and returns either fatures or
+        (features, labels).
+
+    Raises:
+      ValueError: if input_fn takes invalid arguments or does not have `params`.
+    """
+    input_fn_args = function_utils.fn_args(input_fn)
+    config = self.config  # a deep copy.
+    kwargs = {}
+    if 'params' in input_fn_args:
+      kwargs['params'] = self.params  # a deep copy.
+    else:
+      raise ValueError('input_fn ({}) does not include params argument, '
+                       'required by TPUEstimator to pass batch size as '
+                       'params["batch_size"]'.format(input_fn))
+    if 'config' in input_fn_args:
+      kwargs['config'] = config
+
+    if 'mode' in input_fn_args:
+      kwargs['mode'] = mode
+
+    # Records the fact input_fn has been invoked.
+    self._is_input_fn_invoked = True
+
+    with self._ctx.with_mode(mode) as ctx:
+      # Setting the batch size in params first. This helps user to have same
+      # input_fn for use_tpu=True/False.
+      batch_size_for_input_fn = ctx.batch_size_for_input_fn
+      if batch_size_for_input_fn is not None:
+        _add_item_to_params(kwargs['params'], _BATCH_SIZE_KEY,
+                            batch_size_for_input_fn)
+
+      # For export_savedmodel, input_fn is never passed to Estimator. So,
+      # `is_export_mode` must be False.
+      if ctx.is_running_on_cpu(is_export_mode=False):
+        with ops.device('/device:CPU:0'):
+          return input_fn(**kwargs)
+
+      # For TPU computation, input_fn should be invoked in a tf.while_loop for
+      # performance. While constructing the tf.while_loop, the structure of
+      # inputs returned by the `input_fn` needs to be recorded. The structure
+      # includes whether features or labels is dict or single Tensor, dict keys,
+      # tensor shapes, and dtypes. The recorded structure is used to create the
+      # infeed dequeue ops, which must be wrapped and passed as a Fn, called
+      # inside the TPU computation, as the TPU computation is wrapped inside a
+      # tf.while_loop also. So, we either pass input_fn to model_fn or pass
+      # dequeue_fn to model_fn. Here, `input_fn` is passed directly as
+      # `features` in `model_fn` signature.
+      def _input_fn(ctx):
+        _add_item_to_params(kwargs['params'], _CTX_KEY, ctx)
+        return input_fn(**kwargs)
+
+      return _input_fn
+
+  def _validate_features_in_predict_input(self, result):
+    """Skip the validation.
+
+    For TPUEstimator, we do not need to check the result type. `_InputPipeline`
+    has stronger check. Parent class's check generates confusing warning msg.
+
+    Args:
+      result: `features` returned by input_fn.
+    """
+    pass
+
+  def train(self,
+            input_fn,
+            hooks=None,
+            steps=None,
+            max_steps=None,
+            saving_listeners=None):
+    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+    self._rendezvous[model_fn_lib.ModeKeys.TRAIN] = rendezvous
+    try:
+      return super(TPUEstimator, self).train(
+          input_fn=input_fn,
+          hooks=hooks,
+          steps=steps,
+          max_steps=max_steps,
+          saving_listeners=saving_listeners)
+    except Exception:  # pylint: disable=broad-except
+      rendezvous.record_error('training_loop', sys.exc_info())
+    finally:
+      rendezvous.record_done('training_loop')
+      rendezvous.raise_errors()
+
+  def evaluate(self,
+               input_fn,
+               steps=None,
+               hooks=None,
+               checkpoint_path=None,
+               name=None):
+    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+    self._rendezvous[model_fn_lib.ModeKeys.EVAL] = rendezvous
+    try:
+      return super(TPUEstimator, self).evaluate(
+          input_fn,
+          steps=steps,
+          hooks=hooks,
+          checkpoint_path=checkpoint_path,
+          name=name)
+    except Exception:  # pylint: disable=broad-except
+      rendezvous.record_error('evaluation_loop', sys.exc_info())
+    finally:
+      rendezvous.record_done('evaluation_loop')
+      rendezvous.raise_errors()
+
+  def predict(self,
+              input_fn,
+              predict_keys=None,
+              hooks=None,
+              checkpoint_path=None,
+              yield_single_examples=True):
+    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+    self._rendezvous[model_fn_lib.ModeKeys.PREDICT] = rendezvous
+    try:
+      for result in super(TPUEstimator, self).predict(
+          input_fn=input_fn,
+          predict_keys=predict_keys,
+          hooks=hooks,
+          checkpoint_path=checkpoint_path,
+          yield_single_examples=yield_single_examples):
+        yield result
+    except Exception:  # pylint: disable=broad-except
+      rendezvous.record_error('prediction_loop', sys.exc_info())
+    finally:
+      rendezvous.record_done('prediction_loop')
+      rendezvous.raise_errors()
+
+    rendezvous.record_done('prediction_loop')
+    rendezvous.raise_errors()
+
+  def _augment_model_fn(self, model_fn, batch_axis):
+    """Returns a new model_fn, which wraps the TPU support."""
+
+    def _model_fn(features, labels, mode, config, params):
+      """A Estimator `model_fn` for TPUEstimator."""
+      with self._ctx.with_mode(mode) as ctx:
+        model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx)
+
+        # `input_fn` is called in `train()`, `evaluate()`, and `predict()`,
+        # but not in `export_savedmodel()`.
+        if self._is_input_fn_invoked:
+          is_export_mode = False
+        else:
+          is_export_mode = True
+
+        # Clear the bit.
+        self._is_input_fn_invoked = None
+
+        # examples_hook is added to training_hooks for both CPU and TPU
+        # execution.
+        if self._log_every_n_steps is not None:
+          examples_hook = ExamplesPerSecondHook(
+              ctx.global_batch_size,
+              # pylint:disable=g-long-ternary
+              output_dir=(self.model_dir
+                          if not config or config.save_summary_steps
+                          else None),
+              # pylint:enable=g-long-ternary
+              every_n_steps=self._log_every_n_steps)
+
+        if ctx.is_running_on_cpu(is_export_mode=is_export_mode):
+          logging.info('Running %s on CPU', mode)
+          estimator_spec = model_fn_wrapper.call_without_tpu(
+              features, labels, is_export_mode=is_export_mode)
+          if self._log_every_n_steps is not None:
+            estimator_spec = estimator_spec._replace(
+                training_hooks=estimator_spec.training_hooks + (examples_hook,))
+          return estimator_spec
+
+        assert labels is None, '`labels` passed to `model_fn` must be `None`.'
+        # TPUEstimator._call_input_fn passes `input_fn` as features to here.
+        assert callable(features), '`input_fn` is not callable.'
+        input_fn = features
+
+        tpu_init_ops = []
+        if ctx.embedding_config and mode == model_fn_lib.ModeKeys.TRAIN:
+          dummy_table_variables, dummy_table_variables_init = (
+              tpu_embedding_gradient.create_dummy_table_variables(
+                  ctx.embedding_config.tpu_embedding))
+          ctx.embedding_config.dummy_table_variables = dummy_table_variables
+          tpu_init_ops.append(dummy_table_variables_init)
+
+        input_holders = _InputPipeline(input_fn, batch_axis, ctx)
+        enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
+            input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
+
+        graph = ops.get_default_graph()
+        for enqueue_op in enqueue_ops:
+          if isinstance(enqueue_op, list):
+            graph.get_collection_ref(_TPU_ENQUEUE_OPS).extend(enqueue_op)
+          else:
+            graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op)
+
+        if mode == model_fn_lib.ModeKeys.TRAIN:
+          compile_op, loss, host_call, scaffold, training_hooks = (
+              _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
+          if ctx.embedding_config:
+            g = ops.get_default_graph()
+            table_to_config_dict = (
+                ctx.embedding_config.tpu_embedding.table_to_config_dict)
+            optimization_parameters = (
+                ctx.embedding_config.tpu_embedding.optimization_parameters)
+            embedding_variable_name_by_table, slot_variable_names_by_table = (
+                _tpu_estimator_embedding.get_full_variable_names(
+                    g, table_to_config_dict, optimization_parameters
+                )
+            )
+            embedding_variables_and_ops = (
+                ctx.embedding_config.tpu_embedding.create_variables_and_ops(
+                    embedding_variable_name_by_table,
+                    slot_variable_names_by_table
+                ))
+            tpu_init_ops.extend(embedding_variables_and_ops.load_ops())
+
+          host_ops = host_call.create_tpu_hostcall()
+          if host_ops is None:
+            host_ops = []
+
+          shutdown_hooks = []
+          shutdown_mode = os.environ.get('TF_TPU_GRACEFUL_SHUTDOWN_MODE',
+                                         'shutdown_worker')
+          if shutdown_mode:
+            if shutdown_mode == 'shutdown_worker':
+              finalizer_hooks = [
+                  session_support.ShutdownLameWorkers(timeout_ms=60 * 1000),
+              ]
+            elif shutdown_mode == 'shutdown_computation':
+              finalizer_hooks = [
+                  session_support.RestartComputation(timeout_ms=60 * 1000),
+              ]
+            else:
+              raise ValueError(
+                  'Unknown TF_TPU_GRACEFUL_SHUTDOWN_MODE "%s"' % shutdown_mode)
+
+            shutdown_hooks.append(
+                session_support.GracefulShutdownHook(
+                    checkpoint_prefix=self.model_dir + '/model.ckpt',
+                    on_shutdown_hooks=finalizer_hooks))
+
+          with ops.control_dependencies([loss]):
+            global_step = array_ops.identity(training.get_global_step())
+          hooks = input_hooks + shutdown_hooks
+          hooks.extend([
+              TPUInfeedOutfeedSessionHook(
+                  ctx,
+                  enqueue_ops,
+                  host_ops,
+                  tpu_compile_op=compile_op,
+                  run_infeed_loop_on_coordinator=(
+                      run_infeed_loop_on_coordinator),
+                  rendezvous=self._rendezvous[mode],
+                  master=self._config.master,
+                  session_config=self._session_config,
+                  tpu_init_ops=tpu_init_ops),
+              InstallSignalHandlerHook()
+          ])
+          if self._log_every_n_steps is not None:
+            logging_hook_frequency = (  # Divide and round up
+                (self._log_every_n_steps +
+                 self._config.tpu_config.iterations_per_loop - 1) //
+                self._config.tpu_config.iterations_per_loop)
+            hooks.append(
+                training.LoggingTensorHook({
+                    'loss': array_ops.identity(loss),
+                    'step': global_step,
+                },
+                                           every_n_iter=logging_hook_frequency))
+            examples_hook._set_steps_per_run(  # pylint: disable=protected-access
+                self._config.tpu_config.iterations_per_loop)
+            hooks.append(examples_hook)
+
+          if training_hooks:
+            hooks.extend(training_hooks)
+
+          chief_hooks = []
+          if (self._config.save_checkpoints_secs or
+              self._config.save_checkpoints_steps):
+            checkpoint_hook = training.CheckpointSaverHook(
+                self.model_dir,
+                save_secs=self._config.save_checkpoints_secs,
+                save_steps=self._config.save_checkpoints_steps,
+                scaffold=scaffold)
+            checkpoint_hook._set_steps_per_run(  # pylint: disable=protected-access
+                self._config.tpu_config.iterations_per_loop)
+            chief_hooks.append(checkpoint_hook)
+
+          summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
+          with ops.control_dependencies([loss]):
+            update_ops = _sync_variables_ops(ctx)
+            if ctx.embedding_config:
+              update_ops.extend(embedding_variables_and_ops.retrieve_ops())
+
+          # Validate the TPU training graph to catch basic errors
+          _validate_tpu_training_graph()
+
+          train_op = control_flow_ops.group(*update_ops)
+          graph.add_to_collection(_TPU_TRAIN_OP, train_op)
+
+          return model_fn_lib.EstimatorSpec(
+              mode,
+              loss=loss,
+              training_chief_hooks=chief_hooks,
+              training_hooks=hooks,
+              train_op=train_op,
+              scaffold=scaffold)
+
+        if mode == model_fn_lib.ModeKeys.EVAL:
+          compile_op, total_loss, host_calls, scaffold, eval_hooks = (
+              _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
+          iterations_per_loop_var = _create_or_get_iterations_per_loop()
+          mean_loss = math_ops.div(
+              total_loss,
+              math_ops.cast(iterations_per_loop_var, dtype=total_loss.dtype))
+
+          with ops.control_dependencies([mean_loss]):
+            # After TPU evaluation computation is done (the mean_loss tensor),
+            # reads all variables back from TPU and updates the eval step
+            # counter properly
+            internal_ops_to_run = _sync_variables_ops(ctx)
+            internal_ops_to_run.append(
+                _increase_eval_step_op(iterations_per_loop_var))
+
+          host_call_ret = host_calls.create_tpu_hostcall()
+          eval_metric_ops = {}
+          eval_update_ops = []
+
+          eval_metrics = host_call_ret.get('eval_metrics', {})
+          if eval_metrics:
+            # Creates a dummy metric update_op for all metrics. Estimator
+            # expects all metrics in `eval_metric_ops` have update_op and calls
+            # them one by one. The real metric update_ops are invoked in a
+            # separated thread. So, here give Estimator the dummy op for all
+            # metrics.
+            with ops.control_dependencies(internal_ops_to_run):
+              dummy_update_op = control_flow_ops.no_op()
+
+            for k, v in eval_metrics.items():
+              eval_metric_ops[k] = (v[0], dummy_update_op)
+              eval_update_ops.append(v[1])
+          else:
+            # If no eval metrics are passed, create an identity node for the
+            # loss and add `internal_ops_to_run` to its dependencies. So
+            # `internal_ops_to_run` can be executed.
+            with ops.control_dependencies(internal_ops_to_run):
+              mean_loss = array_ops.identity(mean_loss)
+
+          if 'host_call' not in host_call_ret:
+            host_ops = []
+          else:
+            host_ops = host_call_ret['host_call']
+          hooks = [
+              TPUInfeedOutfeedSessionHook(
+                  ctx,
+                  enqueue_ops,
+                  eval_update_ops + host_ops,
+                  tpu_compile_op=compile_op,
+                  run_infeed_loop_on_coordinator=(
+                      run_infeed_loop_on_coordinator),
+                  rendezvous=self._rendezvous[mode],
+                  master=self._config.evaluation_master,
+                  session_config=self._session_config,
+                  tpu_init_ops=tpu_init_ops)
+          ] + input_hooks
+
+          if eval_hooks:
+            hooks.extend(eval_hooks)
+
+          return model_fn_lib.EstimatorSpec(
+              mode,
+              loss=mean_loss,
+              evaluation_hooks=hooks,
+              eval_metric_ops=eval_metric_ops,
+              scaffold=scaffold)
+
+        # Predict
+        assert mode == model_fn_lib.ModeKeys.PREDICT
+
+        (compile_op, dummy_predict_op, host_calls,
+         scaffold, prediction_hooks) = _predict_on_tpu_system(
+             ctx, model_fn_wrapper, dequeue_fn)
+        with ops.control_dependencies([dummy_predict_op]):
+          internal_ops_to_run = _sync_variables_ops(ctx)
+          with ops.control_dependencies(internal_ops_to_run):
+            dummy_predict_op = control_flow_ops.no_op()
+
+        # In train and evaluation, the main TPU program is passed to monitored
+        # training session to run. Infeed enqueue and outfeed dequeue are
+        # executed in side threads. This is not the configuration for
+        # prediction mode.
+        #
+        # For prediction, the Estimator executes the EstimatorSpec.predictions
+        # directly and yield the element (via generator) to call site. So, the
+        # outfeed based prediction must be passed to MonitoredSession directly.
+        # Other parts of the TPU execution are organized as follows.
+        #
+        # 1. All outfeed based Tensors must be grouped with predictions Tensors
+        #    to form a single invocation. This avoid the issue we might trigger
+        #    multiple outfeeds incorrectly. To achieve this, `host_call` is
+        #    placed in control_dependencies of `stopping_signals`, and
+        #    `stopping_signals` is passed into _StoppingPredictHook, which sets
+        #    the `stopping_signals` as SessionRunArgs. MonitoredSession merges
+        #    all SessionRunArgs with the fetch in session.run together.
+        #
+        # 2. The TPU program (dummy_predict_op) and enqueue_ops (infeed Enqueue)
+        #    are grouped together. They will be launched once and only once in
+        #    side threads and they quit naturally according to the SAME stopping
+        #    condition.
+        enqueue_ops.append(dummy_predict_op)
+
+        host_call_ret = host_calls.create_tpu_hostcall()
+        if 'host_call' not in host_call_ret:
+          host_ops = []
+        else:
+          host_ops = host_call_ret['host_call']
+
+        predictions = host_call_ret['predictions']
+        _verify_cross_hosts_transfer_size(
+            predictions,
+            message=(
+                'The estimated size for TPUEstimatorSpec.predictions is too '
+                'large.'))
+        signals = host_call_ret['signals']
+
+        with ops.control_dependencies(host_ops):
+          host_ops = []  # Empty, we do do not need it anymore.
+          scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal(
+              signals)
+          predictions = _PaddingSignals.slice_tensor_or_dict(
+              predictions, signals)
+
+        hooks = [
+            _StoppingPredictHook(scalar_stopping_signal),
+            TPUInfeedOutfeedSessionHookForPrediction(
+                ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode],
+                tpu_compile_op=compile_op,
+                master=self._config.master,
+                session_config=self._session_config),
+        ] + input_hooks
+
+        if prediction_hooks:
+          hooks.extend(prediction_hooks)
+
+        return model_fn_lib.EstimatorSpec(
+            mode,
+            prediction_hooks=hooks,
+            predictions=predictions,
+            scaffold=scaffold)
+
+    return _model_fn
+
+
+def _export_output_to_tensors(export_output):
+  """Get a list of `Tensors` used in `export_output`.
+
+  Args:
+    export_output: an `ExportOutput` object such as `ClassificationOutput`,
+      `RegressionOutput`, or `PredictOutput`.
+
+  Returns:
+    a list of tensors used in export_output.
+
+  Raises:
+    ValueError: if `export_output` is not one of `ClassificationOutput`,
+        `RegressionOutput`, or `PredictOutput`.
+  """
+  if isinstance(export_output, export_output_lib.ClassificationOutput):
+    return [export_output.scores, export_output.classes]
+  elif isinstance(export_output, export_output_lib.RegressionOutput):
+    return [export_output.value]
+  elif isinstance(export_output, export_output_lib.PredictOutput):
+    return list(export_output.outputs.values())
+  else:
+    raise ValueError(
+        '`export_output` must be have type `ClassificationOutput`, '
+        '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output))
+
+
+def _clone_export_output_with_tensors(export_output, tensors):
+  """Clones `export_output` but with new `tensors`.
+
+  Args:
+    export_output: an `ExportOutput` object such as `ClassificationOutput`,
+      `RegressionOutput`, or `PredictOutput`.
+    tensors: a list of `Tensors` used to construct a new `export_output`.
+
+  Returns:
+    A dict similar to `export_output` but with `tensors`.
+
+  Raises:
+    ValueError: if `export_output` is not one of `ClassificationOutput`,
+        `RegressionOutput`, or `PredictOutput`.
+  """
+  if isinstance(export_output, export_output_lib.ClassificationOutput):
+    if len(tensors) != 2:
+      raise ValueError('tensors must be of length 2; '
+                       'got {}.'.format(len(tensors)))
+    return export_output_lib.ClassificationOutput(*tensors)
+  elif isinstance(export_output, export_output_lib.RegressionOutput):
+    if len(tensors) != 1:
+      raise ValueError('tensors must be of length 1; '
+                       'got {}'.format(len(tensors)))
+    return export_output_lib.RegressionOutput(*tensors)
+  elif isinstance(export_output, export_output_lib.PredictOutput):
+    return export_output_lib.PredictOutput(
+        dict(zip(export_output.outputs.keys(), tensors)))
+  else:
+    raise ValueError(
+        '`export_output` must be have type `ClassificationOutput`, '
+        '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output))
+
+
+def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
+  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
+  iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  (single_tpu_eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
+  ) = model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn)
+
+  def multi_tpu_eval_steps_on_single_shard():
+    return training_loop.repeat(iterations_per_loop_var, single_tpu_eval_step,
+                                [_ZERO_LOSS])
+
+  (compile_op, loss,) = tpu.split_compile_and_shard(
+      multi_tpu_eval_steps_on_single_shard,
+      inputs=[],
+      num_shards=ctx.num_replicas,
+      outputs_from_all_shards=False,
+      device_assignment=ctx.device_assignment)
+
+  loss = loss[0]
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return compile_op, loss, host_calls, scaffold, captured_eval_hooks.get()
+
+
+def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
+  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
+  iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  (single_tpu_train_step, host_call, captured_scaffold_fn,
+   captured_training_hooks) = (
+       model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
+
+  @tpu_function.on_device_training_loop
+  def multi_tpu_train_steps_on_single_shard():
+    return training_loop.repeat(iterations_per_loop_var, single_tpu_train_step,
+                                [_INITIAL_LOSS])
+
+  (compile_op, loss,) = tpu.split_compile_and_shard(
+      multi_tpu_train_steps_on_single_shard,
+      inputs=[],
+      num_shards=ctx.num_replicas,
+      outputs_from_all_shards=False,
+      device_assignment=ctx.device_assignment)
+
+  loss = loss[0]
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return compile_op, loss, host_call, scaffold, captured_training_hooks.get()
+
+
+def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
+  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
+  (single_tpu_predict_step, host_calls, captured_scaffold_fn,
+   captured_predict_hooks
+  ) = model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn)
+
+  @tpu_function.on_device_training_loop
+  def multi_tpu_predict_steps_on_single_shard():
+
+    def cond(scalar_stopping_signal):
+      return math_ops.logical_not(
+          _StopSignals.should_stop(scalar_stopping_signal))
+
+    inputs = [_StopSignals.NON_STOPPING_SIGNAL]
+    outputs = training_loop.while_loop(
+        cond, single_tpu_predict_step, inputs=inputs, name=b'loop')
+    return outputs
+
+  (compile_op, dummy_predict_op,) = tpu.split_compile_and_shard(
+      multi_tpu_predict_steps_on_single_shard,
+      inputs=[],
+      num_shards=ctx.num_replicas,
+      outputs_from_all_shards=False,
+      device_assignment=ctx.device_assignment)
+
+  dummy_predict_op = dummy_predict_op[0]
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return (compile_op, dummy_predict_op, host_calls, scaffold,
+          captured_predict_hooks.get())
+
+
+def _wrap_computation_in_while_loop(device, op_fn):
+  """Wraps the ops generated by `op_fn` in tf.while_loop."""
+
+  def computation(i):
+    with ops.control_dependencies(op_fn()):
+      return i + 1
+
+  iterations_per_loop_var = _create_or_get_iterations_per_loop()
+  # By setting parallel_iterations=1, the parallel execution in while_loop is
+  # basically turned off.
+  with ops.device(device):
+    iterations = array_ops.identity(iterations_per_loop_var)
+    return control_flow_ops.while_loop(
+        lambda i: i < iterations,
+        computation, [constant_op.constant(0)],
+        parallel_iterations=1)
+
+
+def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn):
+  """Wraps the ops generated by `op_fn` in tf.while_loop."""
+
+  def cond(scalar_stopping_signal):
+    return math_ops.logical_not(
+        _StopSignals.should_stop(scalar_stopping_signal))
+
+  def computation(unused_scalar_stopping_signal):
+    return_value = op_fn()
+    execute_ops = return_value['ops']
+    signals = return_value['signals']
+    with ops.control_dependencies(execute_ops):
+      return _StopSignals.as_scalar_stopping_signal(signals)
+
+  # By setting parallel_iterations=1, the parallel execution in while_loop is
+  # basically turned off.
+  with ops.device(device):
+    return control_flow_ops.while_loop(
+        cond,
+        computation, [_StopSignals.NON_STOPPING_SIGNAL],
+        parallel_iterations=1)
+
+
+def _validate_tpu_training_graph():
+  """Validate graph before running distributed training.
+
+  Raises:
+    ValueError: If the graph seems invalid for running on device
+  """
+  operations = ops.get_default_graph().get_operations()
+
+  # Check if there is atleast one CrossReplicaSum operation in the graph
+  # This should be introduced by using the CrossShardOptimizer wrapper
+  cross_replica_sum_ops = [
+      o for o in operations if o.type == _CROSS_REPLICA_SUM_OP
+  ]
+  if not cross_replica_sum_ops:
+    raise ValueError(
+        'CrossShardOptimizer must be used for model training on TPUs.')
+
+
+class _CapturedObject(object):
+  """A placeholder to capture an object.
+
+  This is useful when we need to capture a Python object in the Tensorflow
+  control flow body function and use it outside the control flow.
+  """
+
+  def __init__(self):
+    self._object = None
+    self._captured = False
+
+  def capture(self, o):
+    if self._captured:
+      raise RuntimeError(
+          'InternalError: Object can capture only once. Please file bug.')
+
+    self._captured = True
+    self._object = o
+
+  def get(self):
+    if not self._captured:
+      raise RuntimeError(
+          'InternalError: Object is not captured properly before `get`. '
+          'Please file bug.')
+    return self._object
+
+
+def _get_scaffold(captured_scaffold_fn):
+  """Retrieves the Scaffold from `captured_scaffold_fn`."""
+  with _CapturingContext(message='Inside scaffold_fn'):
+    scaffold_fn = captured_scaffold_fn.get()
+    if scaffold_fn:
+      scaffold = scaffold_fn()
+      if scaffold is None:
+        raise ValueError(
+            'TPUEstimatorSpec.scaffold_fn returns None, which is not allowed')
+    else:
+      scaffold = None
+
+  if scaffold:
+    wrapped_finalize = scaffold.finalize
+
+    def _finalize():
+      with _CapturingContext('Inside Scaffold.finalize'):
+        wrapped_finalize()
+
+    scaffold.finalize = _finalize
+  return scaffold
+
+
+class _CapturingContext(control_flow_ops.ControlFlowContext):
+  """Tracks references to Tensors defined in TPU replication."""
+
+  def __init__(self, message):
+    control_flow_ops.ControlFlowContext.__init__(self)
+    self._message = message
+
+  def to_control_flow_context_def(self, context_def, export_scope=None):
+    # pylint: disable=useless-super-delegation
+    # NOTE(slebedev): the method is required by `ControlFlowContext`.
+    super(_CapturingContext, self).to_control_flow_context_def(
+        context_def, export_scope)
+
+  def AddOp(self, op):  # pylint: disable=invalid-name
+    for c in op.inputs:
+      if tpu._TPU_REPLICATE_ATTR in c.op.node_def.attr:  # pylint: disable=protected-access
+        raise ValueError('{}: Op {} depends on TPU computation {}, '
+                         'which is not allowed.'.format(self._message, op, c))
+
+  def __enter__(self):
+    # pylint: disable=protected-access
+    self._g = ops.get_default_graph()
+    self._old = self._g._get_control_flow_context()
+    self._g._set_control_flow_context(self)
+    # pylint: enable=protected-access
+
+  def __exit__(self, _, __, ___):  # pylint: disable=invalid-name
+    self._g._set_control_flow_context(self._old)  # pylint: disable=protected-access
+
+
+class _Inputs(object):
+  """A data structure representing the input_fn returned values.
+
+  This also supports the returned value from input_fn as `Dataset`.
+  """
+
+  def __init__(self, features=None, labels=None, dataset=None, signals=None):
+    if dataset is not None and (features is not None or labels is not None or
+                                signals is not None):
+      raise RuntimeError('Internal Error: Either (features and labels) or '
+                         'dataset should be provided, not both. Please file '
+                         'bug')
+
+    self._features = features
+    self._labels = labels
+    self._signals = signals
+
+    self._dataset = dataset
+    self._iterator = None
+
+  @staticmethod
+  def from_input_fn(return_values):
+    """Returns an `_Inputs` instance according to `input_fn` return value."""
+    if isinstance(return_values, dataset_ops.DatasetV2):
+      dataset = return_values
+      return _Inputs(dataset=dataset)
+
+    features, labels = _Inputs._parse_inputs(return_values)
+    return _Inputs(features, labels)
+
+  @staticmethod
+  def _parse_inputs(return_values):
+    if isinstance(return_values, tuple):
+      features, labels = return_values
+    else:
+      features, labels = return_values, None
+    return features, labels
+
+  @property
+  def is_dataset(self):
+    """Returns True if the return value from input_fn is Dataset."""
+    return self._dataset is not None
+
+  def dataset_initializer(self):
+    """Returns the dataset's initializer.
+
+    The initializer must be run before calling `features_and_labels`.
+    """
+    self._iterator = dataset_ops.make_initializable_iterator(self._dataset)
+    return self._iterator.initializer
+
+  def features_and_labels(self):
+    """Gets `features` and `labels`."""
+    if self.is_dataset:
+      if self._iterator is None:
+        raise RuntimeError('Internal error: Must run dataset_initializer '
+                           'before calling features_and_labels(). Please file '
+                           'a bug!')
+      return _Inputs._parse_inputs(self._iterator.get_next())
+
+    return (self._features, self._labels)
+
+  def signals(self):
+    return self._signals
+
+  @property
+  def dataset(self):
+    return self._dataset
+
+
+class _InputsWithStoppingSignals(_Inputs):
+  """Inputs with `_StopSignals` inserted into the dataset."""
+
+  def __init__(self,
+               dataset,
+               batch_size,
+               add_padding=False,
+               num_invocations_per_step=1):
+
+    assert dataset is not None
+    user_provided_dataset = dataset.map(
+        _InputsWithStoppingSignals.insert_stopping_signal(
+            stop=False, batch_size=batch_size, add_padding=add_padding))
+    if num_invocations_per_step == 1:
+      final_batch_dataset = dataset.take(1).map(
+          _InputsWithStoppingSignals.insert_stopping_signal(
+              stop=True, batch_size=batch_size, add_padding=add_padding))
+    else:
+      # We append (2 * num_invocations_per_step - 1) batches for exhausting the
+      # user_provided_dataset and stop properly.
+      # For example, if num_invocations_per_step is 2, we append 3 additional
+      # padding batches: b1, b2, b3.
+      # If user_provided_dataset contains two batches: a1, a2
+      # Step 1: [a1, a2]
+      # Step 2: [b1, b2] -> STOP
+      # If user_provided_dataset contains three batches: a1, a2, a3.
+      # The training loops:
+      # Step 1: [a1, a2]
+      # Step 2: [a3, b1]
+      # Step 3: [b2, b3] -> STOP.
+      final_batch_dataset = dataset.take(1).map(
+          _InputsWithStoppingSignals.insert_stopping_signal(
+              stop=True, batch_size=batch_size, add_padding=add_padding))
+      final_batch_dataset = final_batch_dataset.repeat(
+          2 * num_invocations_per_step - 1)
+
+      def _set_mask(data_dict):
+        signals = data_dict['signals']
+        signals['padding_mask'] = array_ops.ones_like(signals['padding_mask'])
+        data_dict['signals'] = signals
+        return data_dict
+
+      # Mask out the extra batch.
+      final_batch_dataset = final_batch_dataset.map(_set_mask)
+
+    dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2)
+
+    super(_InputsWithStoppingSignals, self).__init__(dataset=dataset)
+    self._current_inputs = None
+
+  def features_and_labels(self):
+    if self._current_inputs is not None:
+      raise RuntimeError(
+          'Internal Error: The previous inputs have not been properly '
+          'consumed. First call features_and_labels, then call signals.')
+
+    inputs_with_signals = self._iterator.get_next()
+    features = inputs_with_signals['features']
+    labels = inputs_with_signals.get('labels')
+
+    self._current_inputs = inputs_with_signals
+    return features, labels
+
+  def signals(self):
+    """Returns the `Signals` from `_Inputs`."""
+    if self._current_inputs is None:
+      raise RuntimeError(
+          'Internal Error: The current inputs have not been properly '
+          'generated. First call features_and_labels, then call signals.')
+    signals = self._current_inputs['signals']
+    self._current_inputs = None
+    return signals
+
+  @staticmethod
+  def insert_stopping_signal(stop, batch_size, add_padding=False):
+    """Inserts stopping_signal into dataset via _map_fn.
+
+    Here we change the data structure in the dataset, such that the return value
+    is a dictionary now and `features`, `labels`, and `signals` are three
+    distinguished keys in that dict. This provides a better structure, which
+    eases the process to decompose the inputs (see `features_and_labels`).
+
+    Args:
+      stop: bool, state of current stopping signals.
+      batch_size: int, batch size.
+      add_padding: bool, whether to pad the tensor to full batch size.
+
+    Returns:
+      A map_fn passed to dataset.map API.
+    """
+
+    def _map_fn(*args):
+      """The map fn to insert signals."""
+      if len(args) == 1:
+        # Unpack the single Tensor/dict argument as features. This is required
+        # for the input_fn returns no labels.
+        args = args[0]
+      features, labels = _Inputs._parse_inputs(args)
+      new_input_dict = {}
+
+      if add_padding:
+        padding_mask, features, labels = (
+            _PaddingSignals.pad_features_and_labels(features, labels,
+                                                    batch_size))
+
+        new_input_dict['features'] = features
+        if labels is not None:
+          new_input_dict['labels'] = labels
+
+      else:
+        new_input_dict['features'] = features
+        if labels is not None:
+          new_input_dict['labels'] = labels
+        padding_mask = None
+
+      new_input_dict['signals'] = _StopSignals(
+          stop=stop, batch_size=batch_size,
+          padding_mask=padding_mask).as_dict()
+
+      return new_input_dict
+
+    return _map_fn
+
+
+class _StopSignals(object):
+  """Signals class holding all logic to handle TPU stopping condition."""
+
+  NON_STOPPING_SIGNAL = False
+  STOPPING_SIGNAL = True
+
+  def __init__(self, stop, batch_size, padding_mask=None):
+    self._stop = stop
+    self._batch_size = batch_size
+    self._padding_mask = padding_mask
+
+  def as_dict(self):
+    """Returns the signals as Python dict."""
+    shape = [self._batch_size, 1]
+    dtype = dtypes.bool
+
+    if self._stop:
+      stopping = array_ops.ones(shape=shape, dtype=dtype)
+    else:
+      stopping = array_ops.zeros(shape=shape, dtype=dtype)
+
+    signals = {'stopping': stopping}
+    if self._padding_mask is not None:
+      signals['padding_mask'] = self._padding_mask
+    return signals
+
+  @staticmethod
+  def as_scalar_stopping_signal(signals):
+    return array_ops.identity(signals['stopping'][0][0])
+
+  @staticmethod
+  def should_stop(scalar_stopping_signal):
+    """Detects whether scalar_stopping_signal indicates stopping."""
+    if isinstance(scalar_stopping_signal, ops.Tensor):
+      # STOPPING_SIGNAL is a constant True. Here, the logical_and is just the TF
+      # way to express the bool check whether scalar_stopping_signal is True.
+      return math_ops.logical_and(scalar_stopping_signal,
+                                  _StopSignals.STOPPING_SIGNAL)
+    else:
+      # For non Tensor case, it is used in SessionRunHook. So, we cannot modify
+      # the graph anymore. Here, we use pure Python.
+      return bool(scalar_stopping_signal)
+
+
+class _PaddingSignals(object):
+  """Signals class holding all logic to handle padding."""
+
+  @staticmethod
+  def pad_features_and_labels(features, labels, batch_size):
+    """Pads out the batch dimension of features and labels."""
+    real_batch_size = array_ops.shape(
+        _PaddingSignals._find_any_tensor(features))[0]
+
+    batch_size_tensor = constant_op.constant(batch_size, dtypes.int32)
+
+    check_greater = check_ops.assert_greater_equal(
+        batch_size_tensor,
+        real_batch_size,
+        data=(batch_size_tensor, real_batch_size),
+        message='The real batch size should not be greater than batch_size.')
+
+    with ops.control_dependencies([check_greater]):
+      missing_count = batch_size_tensor - real_batch_size
+
+    def pad_single_tensor(tensor):
+      """Pads out the batch dimension of a tensor to the complete batch_size."""
+      rank = len(tensor.shape)
+      assert rank > 0
+      padding = array_ops.stack([[0, missing_count]] + [[0, 0]] * (rank - 1))
+      padded_shape = (batch_size,) + tuple(tensor.shape[1:])
+      padded_tensor = array_ops.pad(tensor, padding)
+      padded_tensor.set_shape(padded_shape)
+      return padded_tensor
+
+    def nest_pad(tensor_or_dict):
+      return nest.map_structure(pad_single_tensor, tensor_or_dict)
+
+    features = nest_pad(features)
+    if labels is not None:
+      labels = nest_pad(labels)
+
+    padding_mask = _PaddingSignals._padding_mask(real_batch_size, missing_count,
+                                                 batch_size)
+
+    return padding_mask, features, labels
+
+  @staticmethod
+  def slice_tensor_or_dict(tensor_or_dict, signals):
+    """Slice the real Tensors according to padding mask in signals."""
+
+    padding_mask = signals['padding_mask']
+    batch_size = array_ops.shape(padding_mask)[0]
+
+    def verify_batch_size(tensor):
+      check_batch_size = math_ops.equal(batch_size, tensor.shape[0])
+      with ops.control_dependencies([check_batch_size]):
+        return array_ops.identity(tensor)
+
+    def slice_single_tensor(tensor):
+      rank = len(tensor.shape)
+      assert rank > 0
+      real_batch_size = batch_size - math_ops.reduce_sum(padding_mask)
+      return verify_batch_size(tensor)[0:real_batch_size]
+
+    # As we split the Tensors to all TPU cores and concat them back, it is
+    # important to ensure the real data is placed before padded ones, i.e.,
+    # order is preserved. By that, the sliced padding mask should have all 0's.
+    # If this assertion failed, # the slice logic here would not hold.
+    sliced_padding_mask = slice_single_tensor(padding_mask)
+    assert_padding_mask = math_ops.equal(
+        math_ops.reduce_sum(sliced_padding_mask), 0)
+
+    with ops.control_dependencies([assert_padding_mask]):
+      should_stop = _StopSignals.should_stop(
+          _StopSignals.as_scalar_stopping_signal(signals))
+
+    is_full_batch = math_ops.equal(math_ops.reduce_sum(padding_mask), 0)
+
+    def slice_fn(tensor):
+      # If the current batch is full batch or part of stopping signals, we do
+      # not need to slice to save performance.
+      return control_flow_ops.cond(
+          math_ops.logical_or(should_stop, is_full_batch),
+          (lambda: verify_batch_size(tensor)),
+          (lambda: slice_single_tensor(tensor)))
+
+    return nest.map_structure(slice_fn, tensor_or_dict)
+
+  @staticmethod
+  def _find_any_tensor(batch_features):
+    tensors = [
+        x for x in nest.flatten(batch_features) if isinstance(x, ops.Tensor)
+    ]
+    if not tensors:
+      raise ValueError('Cannot find any Tensor in features dict.')
+    return tensors[0]
+
+  @staticmethod
+  def _padding_mask(real_batch_size, missing_count, batch_size):
+    padding_mask = array_ops.concat([
+        array_ops.zeros((real_batch_size,), dtype=dtypes.int32),
+        array_ops.ones((missing_count,), dtype=dtypes.int32)
+    ],
+                                    axis=0)
+    padding_mask.set_shape((batch_size,))
+    return padding_mask
+
+
+def _verify_cross_hosts_transfer_size(tensor_dict, message):
+  total_size = 0
+  tensor_structure = {}
+  for key, tensor in tensor_dict.items():
+    shape = tensor.shape
+    size = np.product(shape) * tensor.dtype.size
+    tensor_structure[key] = shape
+    total_size += size
+  if total_size >= _ONE_GIGABYTE:
+    raise ValueError(
+        '{} The transfer size is larger than the protobuf limit. Please '
+        'consider to use Tensors with smaller shapes or reduce batch '
+        'size. Given:\n'
+        '{}'.format(
+            message, '\n'.join([
+                ' -- Key: {}, Shape: {}'.format(k, v)
+                for k, v in tensor_structure.items()
+            ])))
+
+
+def _add_item_to_params(params, key, value):
+  """Adds a new item into `params`."""
+  if hasattr(params, 'set_hparam'):
+    # For HParams, we need to use special API.
+    if key in params:
+      params.set_hparam(key, value)
+    else:
+      params.add_hparam(key, value)
+  else:
+    # Now params is Python dict.
+    params[key] = value
+
+
+def export_estimator_savedmodel(estimator,
+                                export_dir_base,
+                                serving_input_receiver_fn,
+                                assets_extra=None,
+                                as_text=False,
+                                checkpoint_path=None,
+                                strip_default_attrs=False):
+  """Export `Estimator` trained model for TPU inference.
+
+  Args:
+    estimator: `Estimator` with which model has been trained.
+    export_dir_base: A string containing a directory in which to create
+      timestamped subdirectories containing exported SavedModels.
+    serving_input_receiver_fn: A function that takes no argument and returns a
+      `ServingInputReceiver` or `TensorServingInputReceiver`.
+    assets_extra: A dict specifying how to populate the assets.extra directory
+      within the exported SavedModel, or `None` if no extra assets are needed.
+    as_text: whether to write the SavedModel proto in text format.
+    checkpoint_path: The checkpoint path to export.  If `None` (the default),
+      the most recent checkpoint found within the model directory is chosen.
+    strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+      removed from the NodeDefs.
+
+  Returns:
+    The string path to the exported directory.
+  """
+  # `TPUEstimator` requires `tpu_config.RunConfig`, so we cannot use
+  # `estimator.config`.
+  config = tpu_config.RunConfig(model_dir=estimator.model_dir)
+  est = TPUEstimator(
+      estimator._model_fn,  # pylint: disable=protected-access
+      config=config,
+      params=estimator.params,
+      use_tpu=True,
+      train_batch_size=2048,  # Does not matter.
+      eval_batch_size=2048,  # Does not matter.
+  )
+  return est.export_savedmodel(export_dir_base, serving_input_receiver_fn,
+                               assets_extra, as_text, checkpoint_path,
+                               strip_default_attrs)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py b/tensorflow/python/tpu/tpu_estimator_signals_test.py
similarity index 99%
rename from tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
rename to tensorflow/python/tpu/tpu_estimator_signals_test.py
index e3ea983abfd24d03c964fbc647b56262e15e0a96..ca3eeaa9c9ace9bdbf6a3c6efa8b84eeecc7a60f 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
+++ b/tensorflow/python/tpu/tpu_estimator_signals_test.py
@@ -20,12 +20,12 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tpu.python.tpu import tpu_estimator
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu_estimator
 
 
 def make_input_fn(num_samples):
diff --git a/tensorflow/python/tpu/tpu_feed.py b/tensorflow/python/tpu/tpu_feed.py
new file mode 100644
index 0000000000000000000000000000000000000000..de1adc80e6015a8418bd3ea470d1a17561eb542b
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_feed.py
@@ -0,0 +1,919 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+
+"""Helper library for handling infeed between hosts and TPUs.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.compiler.xla.experimental.xla_sharding import xla_sharding
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu import tpu_sharding
+from tensorflow.python.tpu.ops import tpu_ops
+
+from tensorflow.python.util import nest
+
+
+def partition_or_replicate_on_host(tensor, dims):
+  """Partitions or replicates the input tensor.
+
+    The ops inside this function are placed on the host side.
+
+  Args:
+    tensor: The input tensor which will be partioned or replicated.
+    dims: A list of integer describes how to partition the input tensor.
+
+  Returns:
+    An iterator of `Tensor`s or a list of partioned tensors.
+  """
+  if dims is None:
+    return itertools.repeat(tensor)
+  dims = np.array(dims)
+  output = [tensor]
+  shape_list = np.array(tensor.shape.as_list())
+  quotients, remainders = np.divmod(shape_list, dims)
+  for axis, (quotient, remainder, dim, original_size) in enumerate(
+      zip(quotients, remainders, dims, shape_list)):
+    if dim <= 1:
+      continue
+    if remainder > 0:
+      # For each dimension, when it cannot be evenly partitioned, XLA assumes
+      # tensors are partitioned in a greedy manner by using
+      # ceil_ratio(size/dim) first. E.g. 2D tensor with shape (5, 14) and dims
+      # are (2, 4). Since 5 % 2 = 1 and 14 % 4 = 2, [5, 14] =>
+      # [[(3, 4), (3, 4), (2, 4), (2, 2)],
+      # [(2, 4), (2, 4), (2, 4), (2, 2)]]
+      ceil_ratio = quotient + 1
+      num_full_slots, left_over = np.divmod(original_size, ceil_ratio)
+      num_or_size_splits = [ceil_ratio] * num_full_slots + [left_over]
+      if len(num_or_size_splits) < dim:
+        num_or_size_splits += [0] * (dim - len(num_or_size_splits))
+      new_output = []
+      for x in output:
+        new_output.append(
+            array_ops.split(
+                x, num_or_size_splits=num_or_size_splits, axis=axis))
+      output = new_output
+    else:
+      output = [array_ops.split(x, dim, axis=axis) for x in output]
+    output = nest.flatten(output)
+  return output
+
+
+def _tag_sharding_attribute_for_dequeued_tensor(tensor, dims):
+  """Tags appropriate XLA sharding attribute to the dequeued tensor.
+
+  Args:
+    tensor: The dequeued tensor on TPU.
+    dims: A list of integer describes how the tensor is partitioned.
+
+  Returns:
+    The same tensor with the xla_sharding attribute.
+  """
+  if dims is None:
+    return xla_sharding.replicate(tensor)
+  elif np.prod(dims) == 1:
+    return xla_sharding.assign_device(tensor, 0)
+  else:
+    tile_assignment = np.arange(np.prod(dims)).reshape(dims)
+    return xla_sharding.tile(tensor=tensor, tile_assignment=tile_assignment)
+
+
+def tag_sharding_attribute_for_dequeued_tensors(dequeues, dims):
+  """Tags appropriate XLA sharding attribute to the dequeued tensors.
+
+  Args:
+    dequeues: A list of dequeued tensors on TPU.
+    dims: A list of integer describes how the tensor is partitioned.
+
+  Returns:
+    The same dequeues with appropriate xla_sharding attribute.
+  """
+  nest.assert_shallow_structure(dequeues, dims)
+  return nest.map_structure_up_to(
+      dequeues, _tag_sharding_attribute_for_dequeued_tensor, dequeues, dims)
+
+
+class InfeedQueue(object):
+  """A helper object to build a device infeed queue.
+
+  The InfeedQueue builds the host-side and device-side Ops to enqueue and
+  dequeue elements, respectively, and ensures that their types and
+  shapes match.
+  """
+
+  def __init__(self,
+               number_of_tuple_elements=None,
+               tuple_types=None,
+               tuple_shapes=None,
+               shard_dimensions=None,
+               name=None):
+    """Creates a new InfeedQueue with the given configuration.
+
+    The configuration need not be fully specified at creation since it
+    can be modified subsequently by methods that set the values
+    explicitly or infer them from the shapes of inputs.
+
+    Args:
+      number_of_tuple_elements: the number of Tensors fed atomically through the
+        queue, must be present unless it can be inferred from other arguments.
+      tuple_types: if not None, a list of types of the elements of the queue.
+      tuple_shapes: if not None, a list of shapes of the elements of the queue.
+      shard_dimensions: if not None, a list of dimensions on which the
+        elements of the queue should be sharded during automatic
+        parallelization.
+      name: the name of the queue.
+
+    Raises:
+      ValueError: if number_of_tuple_elements <= 0; or
+        number_of_tuple_arguments, tuple_types, tuple_shapes, and
+        shard_dimensions are all None; or the length of tuple_types,
+        tuple_shapes, or shard_dimensions is not equal to
+        number_of_tuple_elements; or any element of shard_dimensions
+        can't be converted to a Dimension.
+      TypeError: if any element of tuple_types or tuple_shapes can't
+        be converted to a dtype or TensorShape, respectively.
+    """
+    self._frozen = False
+    self._generated_enqueue_ops = False
+    self._generated_dequeue_op = False
+    self._name = "InfeedQueue" if name is None else name
+    if number_of_tuple_elements is None:
+      if tuple_types is not None:
+        number_of_tuple_elements = len(tuple_types)
+      elif tuple_shapes is not None:
+        number_of_tuple_elements = len(tuple_shapes)
+      elif shard_dimensions is not None:
+        number_of_tuple_elements = len(shard_dimensions)
+      else:
+        raise ValueError(
+            "number of tuple elements cannot be inferred from InfeedQueue "
+            "constructor")
+    if number_of_tuple_elements <= 0:
+      raise ValueError("number_of_tuple_elements %d must be > 0" %
+                       number_of_tuple_elements)
+    # Make an empty sharding policy for each tuple element.
+    self._sharding_policies = [
+        tpu_sharding.ShardingPolicy()
+        for _ in xrange(number_of_tuple_elements)
+    ]
+    if tuple_types is not None:
+      self.set_tuple_types(tuple_types)
+    else:
+      self._tuple_types = None
+    if tuple_shapes is not None:
+      self.set_tuple_shapes(tuple_shapes)
+    else:
+      self._tuple_shapes = None
+    if shard_dimensions is not None:
+      self.set_shard_dimensions(shard_dimensions)
+    self._validate()
+
+  def _validate(self):
+    """Checks that the configuration is self-consistent.
+
+    Raises:
+      ValueError: if the shapes and sharding policies don't match.
+    """
+    if self.tuple_shapes is not None:
+      for (policy, shape) in zip(self._sharding_policies, self._tuple_shapes):
+        # Raise an error if the policy is incompatible with the shape.
+        _ = policy.get_sharded_shape(shape)
+
+  @property
+  def number_of_tuple_elements(self):
+    """Returns the number of InfeedQueue tuple elements."""
+    return len(self._sharding_policies)
+
+  @property
+  def tuple_types(self):
+    """Returns the types of the InfeedQueue tuple elements."""
+    return self._tuple_types
+
+  def set_tuple_types(self, tuple_types):
+    """Sets the type of each element of the queue.
+
+    tuple_types must be a list of length
+    self.number_of_tuple_elements, and each element must be
+    convertible to a dtype.
+
+    Args:
+      tuple_types: the types of each queue element.
+
+    Raises:
+      ValueError: if tuple_types is not of length
+        self.number_of_tuple_elements.
+      TypeError: if an element of tuple_types cannot be converted to a
+        dtype.
+    """
+    if len(tuple_types) != self.number_of_tuple_elements:
+      raise ValueError("tuple_types is %s, but must be a list of length %d" %
+                       (str(tuple_types), self.number_of_tuple_elements))
+    if self._frozen:
+      for (frozen, updated) in zip(self._tuple_types, tuple_types):
+        if frozen != updated:
+          raise ValueError(
+              "Trying to update InfeedQueue with frozen configuration with an "
+              "incompatible type. Frozen types are %s, updated types are %s" % (
+                  str(self._tuple_types), str(tuple_types)))
+    else:
+      try:
+        self._tuple_types = [dtypes.as_dtype(t) for t in tuple_types]
+      except (TypeError) as e:
+        raise TypeError(
+            "tuple_types is %s, but must be a list of elements each "
+            "convertible to dtype: got error %s" % (str(tuple_types), str(e)))
+
+  @property
+  def tuple_shapes(self):
+    """Returns the shapes of the InfeedQueue tuple elements."""
+    return self._tuple_shapes
+
+  def set_tuple_shapes(self, tuple_shapes):
+    """Sets the shape of each element of the queue.
+
+    tuple_shapes must be a list of length
+    self.number_of_tuple_elements, and each element must be
+    convertible to a TensorShape.
+
+    Args:
+      tuple_shapes: the shapes of each queue element.
+
+    Raises:
+      ValueError: if tuple_shapes is not of length
+        self.number_of_tuple_elements.
+      TypeError: if an element of tuple_shapes cannot be converted to
+        a TensorShape.
+    """
+    if len(tuple_shapes) != self.number_of_tuple_elements:
+      raise ValueError("tuple_shapes is %s, but must be a list of length %d" %
+                       (str(tuple_shapes), self.number_of_tuple_elements))
+    try:
+      tuple_shapes = [tensor_shape.as_shape(shape) for shape in tuple_shapes]
+    except (ValueError, TypeError) as e:
+      raise TypeError(
+          "tuple_shapes is %s, but must be a list of elements each "
+          "convertible to TensorShape: got error %s" % (str(tuple_shapes),
+                                                        str(e)))
+    if self._frozen:
+      for (frozen, updated) in zip(self._tuple_shapes, tuple_shapes):
+        if frozen != updated:
+          raise ValueError(
+              "Trying to update InfeedQueue with frozen configuration with an "
+              "incompatible shape. Frozen shapes are %s, updated shapes are %s"
+              % (str(self._tuple_shapes), str(tuple_shapes)))
+    else:
+      self._tuple_shapes = tuple_shapes
+    self._validate()
+
+  @property
+  def sharding_policies(self):
+    """Returns the sharding policies of the InfeedQueue tuple elements."""
+    return self._sharding_policies
+
+  @property
+  def shard_dimensions(self):
+    """Gets the shard dimension of each tuple element.
+
+    Returns:
+      A list of length number_of_tuple_elements, where each list entry
+      is the shard dimension of that tuple element or None if the
+      shard dimension has not been set.
+    """
+    # The number of shards is always the same for all the policies.
+    return [policy.shard_dimension for policy in self._sharding_policies]
+
+  def set_shard_dimensions(self, shard_dimensions):
+    """Sets the shard_dimension of each element of the queue.
+
+    shard_dimensions must be a list of length
+    self.number_of_tuple_elements, and each element must be
+    convertible to a Dimension compatible with self.tuple_shapes.
+
+    Args:
+      shard_dimensions: the dimensions of each queue element.
+
+    Raises:
+      ValueError: if shard_dimensions is not of length
+        self.number_of_tuple_elements; or an element of
+        shard_dimensions cannot be converted to a Dimension; or an
+        element of shard_dimensions is a Dimension that is out of
+        range for the corresponding tuple element shape.
+    """
+    if len(shard_dimensions) != self.number_of_tuple_elements:
+      raise ValueError("shard_dimensions is %s, but must be a list of length %d"
+                       % (str(shard_dimensions),
+                          self.number_of_tuple_elements))
+    for (policy, dimension) in zip(self._sharding_policies, shard_dimensions):
+      policy.set_shard_dimension(dimension)
+    self._validate()
+
+  @property
+  def number_of_shards(self):
+    """Gets the number of shards to use for the InfeedQueue.
+
+    Returns:
+      Number of shards or None if the number of shards has not been set.
+    """
+    # The number of shards is always the same for all the policies.
+    return self._sharding_policies[0].number_of_shards
+
+  def set_number_of_shards(self, number_of_shards):
+    """Sets the number of shards to use for the InfeedQueue.
+
+    Args:
+      number_of_shards: number of ways to shard the InfeedQueue.
+
+    Raises:
+      ValueError: if number_of_shards is not > 0; or the policies have
+        been frozen and number_of_shards was already set to something
+        else.
+    """
+    for policy in self._sharding_policies:
+      policy.set_number_of_shards(number_of_shards)
+    self._validate()
+
+  def set_configuration_from_input_tensors(self, input_tensors):
+    """Sets the shapes and types of the queue tuple elements.
+
+    input_tensors is a list of Tensors whose types and shapes are used
+    to set the queue configuration.
+
+    Args:
+      input_tensors: list of Tensors of the same types and shapes as
+        the desired queue Tuple.
+
+    Raises:
+      ValueError: if input_tensors is not a list of length
+        self.number_of_tuple_elements
+    """
+    if len(input_tensors) != self.number_of_tuple_elements:
+      raise ValueError("input_tensors is %s, but should be a list of %d Tensors"
+                       % (str(input_tensors), self.number_of_tuple_elements))
+    self.set_tuple_shapes([t.shape for t in input_tensors])
+    self.set_tuple_types([t.dtype for t in input_tensors])
+
+  def set_configuration_from_sharded_input_tensors(self, input_tensors):
+    """Sets the shapes and types of the queue tuple elements.
+
+    input_tensors is a list of lists of Tensors whose types and shapes are used
+    to set the queue configuration. The length of the outer list is the number
+    of shards required, and each inner list is the tuple of Tensors to use to
+    determine the types and shapes of the corresponding shard. This method
+    depends on the shard dimension, and calling it freezes the shard policy.
+
+    Args:
+      input_tensors: list of lists of Tensors. The outer list length corresponds
+        to the desired number of shards, and each inner list is the size
+        and shape of the desired configuration of the corresponding shard.
+
+    Raises:
+      ValueError: if any inner list is not a list of length
+        self.number_of_tuple_elements; or the inner lists do not combine to
+        form a consistent unsharded shape.
+      TypeError: if the types of the Tensors in the inner lists do not match.
+    """
+    if not self._frozen:
+      # Unset the tuple shapes in case the configuration becomes
+      # transiently inconsistent.
+      self._tuple_shapes = None
+    number_of_shards = len(input_tensors)
+    self.set_number_of_shards(number_of_shards)
+    for t in input_tensors:
+      if len(t) != self.number_of_tuple_elements:
+        raise ValueError(
+            "input_tensors is %s but must be a list of lists, where each inner"
+            " list has length number_of_tuple_elements=%d" % (
+                str(input_tensors), self.number_of_tuple_elements))
+    # Transpose the inputs to make a list of shard shapes for each tuple
+    # element.
+    sharded_shapes = [[t[i].shape for t in input_tensors]
+                      for i in xrange(self.number_of_tuple_elements)]
+    # For each tuple, get the unsharded shape using that tuple's policy.
+    unsharded_shapes = [
+        policy.get_unsharded_shape(s)
+        for (policy, s) in zip(self._sharding_policies, sharded_shapes)
+    ]
+    self.set_tuple_shapes(unsharded_shapes)
+    for i in xrange(1, self.number_of_shards):
+      for (t1, t2) in zip(input_tensors[0], input_tensors[i]):
+        if t1.dtype != t2.dtype:
+          raise TypeError(
+              "types of the tuple elements of input_tensors %s are not "
+              "consistent" % str(input_tensors))
+    self.set_tuple_types([t.dtype for t in input_tensors[0]])
+
+  def freeze(self):
+    """Freezes the InfeedQueue so it can no longer be modified.
+
+    The configuration is implicitly frozen before any host-side or
+    device-side Ops are generated. The configuration cannot be frozen
+    until the types and shapes of the tuple elements have been set.
+
+    Raises:
+      ValueError: if the types or shapes of the tuple elements have not been
+      set.
+    """
+    self._frozen = True
+    if self._tuple_types is None:
+      raise ValueError(
+          "Can't freeze an InfeedQueue without setting all tuple types.")
+    if self._tuple_shapes is None:
+      raise ValueError(
+          "Can't freeze an InfeedQueue without setting all tuple shapes.")
+    for shape in self._tuple_shapes:
+      if shape.dims is None:
+        raise ValueError(
+            "Can't freeze an InfeedQueue without setting all tuple shapes.")
+    for policy in self._sharding_policies:
+      policy.freeze()
+    self._validate()
+
+  def generate_dequeue_op(self, tpu_device=0):
+    """Generates the device-side Op to dequeue a tuple from the queue.
+
+    Implicitly freezes the queue configuration if it is not already
+    frozen, which will raise errors if the shapes and types have not
+    been fully specified.
+
+    Args:
+      tpu_device: The TPU device ordinal where the infeed instruction should be
+        placed. If None, no explicit placement will be performed, and it is up
+        to the user to call this API from within a proper TPU device scope.
+        The XLA code will fail if the TPU dequeue instruction is not bound to
+        any device.
+
+    Returns:
+      A list of Outputs corresponding to a shard of infeed dequeued
+      into XLA, suitable for use within a replicated block.
+
+    Raises:
+      ValueError: if the types or shapes of the tuple elements have not been
+      set; or if a dequeue op has already been generated.
+    """
+    self.freeze()
+    if self._generated_dequeue_op:
+      raise ValueError("Can't generate two dequeue Ops from the same queue")
+    self._generated_dequeue_op = True
+    full_name = "%s/dequeue" % self._name
+    sharded_shapes = [
+        policy.get_sharded_shape(shape)
+        for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
+    ]
+    if tpu_device is not None:
+      with ops.device(tpu.core(tpu_device)):
+        return tpu_ops.infeed_dequeue_tuple(
+            dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+    else:
+      return tpu_ops.infeed_dequeue_tuple(
+          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+
+  def _generate_enqueue_op(self,
+                           inputs,
+                           name_prefix,
+                           index,
+                           device=None,
+                           tpu_ordinal=-1):
+    """Generate a host-side Op to enqueue a tuple to the queue.
+
+    If device is None the inputs are all required to have the same
+    device specification, and the enqueue Op is colocated with
+    inputs[0]. Otherwise the enqueue Op is placed on 'device'.
+
+    Args:
+      inputs: a list of Tensors with the types and shapes of the tuple elements.
+      name_prefix: the base name for the Op.
+      index: the shard index, used to uniquify the Op name.
+      device: device to place the Op on, or None if it should be
+        colocated with the inputs.
+      tpu_ordinal: ordinal of the TPU device on the host to use for
+      infeed if device is a CPU device. Should be set to -1 if device
+      is a TPU device.
+
+    Returns:
+      An Op corresponding to a shard of infeed enqueued at the host,
+      suitable for use within a replicated block.
+
+    Raises:
+      ValueError: if device is None and inputs do not all have the
+        same device specification.
+    """
+    full_name = "%s/%d" % (name_prefix, index)
+    shapes = [t.shape for t in inputs]
+    if device is None:
+      devices = [t.device for t in inputs]
+      for i in xrange(1, self.number_of_tuple_elements):
+        if devices[0] != devices[i]:
+          raise ValueError(
+              "input devices for shard %d are %s, but should all be the same" %
+              (index, str(devices)))
+      with ops.colocate_with(inputs[0]):
+        return tpu_ops.infeed_enqueue_tuple(
+            inputs=inputs,
+            shapes=shapes,
+            name=full_name,
+            device_ordinal=tpu_ordinal)
+    else:
+      with ops.device(device):
+        return tpu_ops.infeed_enqueue_tuple(
+            inputs=inputs,
+            shapes=shapes,
+            name=full_name,
+            device_ordinal=tpu_ordinal)
+
+  def generate_enqueue_ops(self,
+                           sharded_inputs,
+                           tpu_ordinal_function=None,
+                           placement_function=None):
+    """Generates the host-side Ops to enqueue the shards of a tuple.
+
+    sharded_inputs is a list, one for each shard, of lists of
+    Tensors. sharded_inputs[0] is the tuple of Tensors to use to feed
+    shard 0 if the queue. Returns the host-side Ops that must be run to
+    enqueue the sharded tuple. The Op for shard i is colocated with the inputs
+    for shard i.
+
+    Implicitly freezes the queue configuration if it is not already
+    frozen. If the configuration has already been frozen, and is not
+    compatible with the types and shapes of sharded_inputs, an error
+    will be raised.
+
+    Args:
+      sharded_inputs: a list of lists of Tensors. The length of the outer list
+        determines the number of shards. Each inner list indicates the types
+        and shapes of the tuples in the corresponding shard.
+      tpu_ordinal_function: if not None, a function that takes the
+        shard index as input and returns the ordinal of the TPU device
+        the shard's infeed should be placed on. tpu_ordinal_function must be
+        set if the inputs are placed on CPU devices.
+      placement_function: if not None, a function that takes the shard index as
+        input and returns the host device where the enqueue op should be placed
+        on.
+
+    Returns:
+      A list of host-side Ops, one for each shard, that when executed together
+      will enqueue a full-size element of infeed.
+
+    Raises:
+      ValueError: if the queue configuration has previously been frozen and the
+        shapes of the elements of sharded_inputs are not compatible with the
+        frozen configuration; or if the shapes of the elements of sharded_inputs
+        don't form a consistent unsharded tuple; or if the elements of a tuple
+        have different device constraints.
+      TypeError: if the queue configuration has previously been frozen and the
+        types of the elements of sharded_inputs are not compatible with the
+        frozen configuration; or if the types of the elements of sharded_inputs
+        don't form a consistent unsharded tuple.
+    """
+    self.set_configuration_from_sharded_input_tensors(sharded_inputs)
+    self.freeze()
+    if self._generated_enqueue_ops:
+      raise ValueError("Can't generate two enqueue Ops from the same queue")
+    self._generated_enqueue_ops = True
+    if tpu_ordinal_function is None:
+      tpu_ordinal_function = lambda index: -1
+    name_prefix = "%s/enqueue" % self._name
+    return [
+        self._generate_enqueue_op(
+            shard,
+            name_prefix,
+            index,
+            tpu_ordinal=tpu_ordinal_function(index),
+            device=placement_function(index) if placement_function else None)
+        for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards))
+    ]
+
+  # TODO(misard) Generalize this to the case of systems that don't
+  # have 8 devices per host, and figure out what to do with
+  # model-parallelism.
+  def _default_placement_function(self, index):
+    return "/task:%d/device:CPU:0" % (index / 8)
+
+  def _default_ordinal_function(self, index):
+    return index % 8
+
+  # TODO(b/36470756) remove this from tutorials once we have a better story
+  # for automatic placement of input pipelines.
+  def split_inputs_and_generate_enqueue_ops(self,
+                                            inputs,
+                                            device_assignment=None,
+                                            placement_function=None,
+                                            tpu_ordinal_function=None):
+    """POORLY-PERFORMING ON MULTI-HOST SYSTEMS.
+
+    Generates the host-side Ops to enqueue a tuple.
+
+    This method performs poorly because it takes an entire input on a single
+    host, splits it, and distributes it to all of the cores. It is present only
+    to simplify tutorial examples.
+
+    inputs is a list of Tensors to use to feed the queue. Each input is split
+    into self.number_of_shards shards. Returns an Op for each shard to enqueue
+    the shard. The Op for shard i is placed on device placement_function(i).
+
+    Implicitly freezes the queue configuration if it is not already
+    frozen. If the configuration has already been frozen, and is not
+    compatible with the types and shapes of inputs, an error
+    will be raised.
+
+    Args:
+      inputs: a list of Tensors which indicates the types and shapes of the
+        queue tuple.
+     device_assignment: if not `None`, a TPU `DeviceAssignment`. If
+        device_assignment is not `None`, but `placement_function` and
+        `ordinal_function` are None, then `device_assignment` will be used to
+        place infeeds on the first k TPU shards, where k is the number of shards
+        in the queue. If all three are `None`, then default placement and
+        ordinal functions are used.
+      placement_function: if not None, a function that takes the shard
+        index as input and returns a device string indicating which
+        device the shard's infeed should be placed on. If placement_function
+        and tpu_ordinal_function are None, inputs are sharded round-robin
+        across the devices in the system.
+      tpu_ordinal_function: if not None, a function that takes the
+        shard index as input and returns the ordinal of the TPU device
+        the shard's infeed should be placed on. If placement_function
+        and tpu_ordinal_function are None, inputs are sharded round-robin
+        across the devices in the system.
+
+    Returns:
+      A list of host-side Ops, one for each shard, that when executed together
+      will enqueue a full-size element of infeed.
+
+    Raises:
+      ValueError: if the queue configuration has previously been frozen and the
+        shapes of the elements of inputs are not compatible with the frozen
+        configuration.
+      TypeError: if the queue configuration has previously been frozen and the
+        types of the elements of inputs are not compatible with the frozen
+        configuration.
+    """
+    if device_assignment is None:
+      if placement_function is None:
+        placement_function = self._default_placement_function
+      if tpu_ordinal_function is None:
+        tpu_ordinal_function = self._default_ordinal_function
+    else:
+
+      def _placement_function_from_map(index):
+        return device_assignment.host_device(replica=index)
+
+      def _ordinal_function_from_map(index):
+        return device_assignment.tpu_ordinal(replica=index)
+
+      if placement_function is None:
+        placement_function = _placement_function_from_map
+      if tpu_ordinal_function is None:
+        tpu_ordinal_function = _ordinal_function_from_map
+    self.set_configuration_from_input_tensors(inputs)
+    self.freeze()
+    if self._generated_enqueue_ops:
+      raise ValueError("Can't generate two enqueue Ops from the same queue")
+    self._generated_enqueue_ops = True
+    split_name_prefix = "%s/split" % self._name
+    if self.number_of_shards == 1:
+      transposed_sharded_inputs = [[inp] for inp in inputs]
+    else:
+
+      def split_fn(inp, num_shards, axis, name):
+        with ops.colocate_with(inp):
+          return array_ops.split(inp, num_shards, axis=axis, name=name)
+
+      transposed_sharded_inputs = [
+          split_fn(
+              inp,
+              self.number_of_shards,
+              axis=policy.shard_dimension,
+              name="%s/%d" % (split_name_prefix, index))
+          for (inp, policy, index) in zip(inputs, self._sharding_policies,
+                                          xrange(self.number_of_tuple_elements))
+      ]
+    sharded_inputs = [[shard[i] for shard in transposed_sharded_inputs]
+                      for i in xrange(self.number_of_shards)]
+    name_prefix = "%s/enqueue" % self._name
+    return [
+        self._generate_enqueue_op(
+            shard,
+            name_prefix,
+            index,
+            device=placement_function(index),
+            tpu_ordinal=tpu_ordinal_function(index))
+        for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards))
+    ]
+
+
+class _PartitionedInfeedQueue(InfeedQueue):
+  """A helper object to build a device infeed queue with input partition.
+
+  Args:
+    number_of_tuple_elements: the number of Tensors fed atomically through the
+      queue, must be present unless it can be inferred from other arguments.
+    device_assignment: A TPU `DeviceAssignment` which is used to place all the
+      partitions to different TPU infeed queues.
+    host_id: The id of the host machine.
+    input_partition_dims: A nested list/tuple of integers. Each inner
+      list/tuple describes how to partition the corresponding input tensor.
+    tuple_types: If not None, a list of types of the elements of the queue.
+    tuple_shapes: If not None, a list of shapes of the elements of the queue.
+    name: The name of the queue.
+  """
+
+  def __init__(self,
+               number_of_tuple_elements,
+               device_assignment,
+               host_id,
+               input_partition_dims=None,
+               tuple_types=None,
+               tuple_shapes=None,
+               name=None):
+    super(_PartitionedInfeedQueue, self).__init__(
+        number_of_tuple_elements=number_of_tuple_elements,
+        tuple_types=tuple_types,
+        tuple_shapes=None,
+        shard_dimensions=None,
+        name="PartitionedInfeedQueue" if name is None else name)
+    self._input_partition_dims = input_partition_dims
+    self._host_id = host_id
+    self._device_assignment = device_assignment
+
+  def generate_dequeue_op(self, tpu_device=0):
+    """Generate TPU dequeue ops.
+
+    Args:
+      tpu_device: The TPU device ordinal where the infeed instruction should be
+        placed.
+
+    Returns:
+      A list of Outputs corresponding to a partition of infeed dequeued
+      into XLA, suitable for use within a replicated block.
+
+    Raises:
+      ValueError: if the types or shapes of the tuple elements have not been
+      set; or if a dequeue op has already been generated.
+    """
+    self.freeze()
+    if self._generated_dequeue_op:
+      raise ValueError("Can't generate two dequeue Ops from the same queue")
+    self._generated_dequeue_op = True
+    full_name = "%s/dequeue" % self._name
+    sharded_shapes = [
+        policy.get_sharded_shape(shape)
+        for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
+    ]
+    with ops.device(tpu.core(tpu_device)):
+      values = tpu_ops.infeed_dequeue_tuple(
+          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+    return tag_sharding_attribute_for_dequeued_tensors(
+        values, self._input_partition_dims)
+
+  def generate_enqueue_ops(self, per_host_sharded_inputs):
+    """Generates the host-side Ops to enqueue the partitioned inputs.
+
+    per_host_sharded_inputs is a list, one for each replica, of lists of
+    Tensors. sharded_inputs[i] is the tuple of Tensors to use to feed
+    replica i.
+    sharded_inputs[i][j] is partitioned by self._input_partition_dims[j].
+
+    For example, if sharded_inputs[i][j] is a 2-D Tensor:
+    [[A, B, C, D],
+     [E ,F, G, H]]
+    self._input_partition_dims[j] is [2, 4].
+
+    sharded_inputs[i][j] will be partitioned and flattened into:
+    [A, B, C, D, E, F, G, H] and fed into the logical core ids:
+    [0, 1, 2, 3, 4, 5, 6, 7] respectively.
+
+    Args:
+      per_host_sharded_inputs: a list of lists of Tensors. The length of the
+        outer list determines the number of shards. Each inner list indicates
+        the types and shapes of the tuples in the corresponding shard.
+
+    Returns:
+      A list of host-side Ops, one for each shard, that when executed together
+      will enqueue a full-size element of infeed.
+
+    Raises:
+      ValueError: if the queue configuration has previously been frozen and the
+        shapes of the elements of sharded_inputs are not compatible with the
+        frozen configuration; or if the shapes of the elements of sharded_inputs
+        don't form a consistent unsharded tuple; or if the elements of a tuple
+        have different device constraints; or if the partition dims are invalid.
+      TypeError: if the queue configuration has previously been frozen and the
+        types of the elements of sharded_inputs are not compatible with the
+        frozen configuration; or if the types of the elements of sharded_inputs
+        don't form a consistent unsharded tuple.
+    """
+    self.set_configuration_from_sharded_input_tensors(per_host_sharded_inputs)
+    number_of_replicas_per_host = len(per_host_sharded_inputs)
+    number_of_tuple_elements = len(per_host_sharded_inputs[0])
+
+    assert len(self._input_partition_dims) == number_of_tuple_elements
+    per_host_enqueue_ops = []
+
+    for replica_index in range(number_of_replicas_per_host):
+      flattened_inputs = per_host_sharded_inputs[replica_index]
+      inputs_part_dims_flat = nest.flatten_up_to(flattened_inputs,
+                                                 self._input_partition_dims)
+      inputs_parted_iters = [
+          iter(self._check_dims_and_partition_or_replicate_on_host(x, dims))
+          for x, dims in zip(per_host_sharded_inputs[replica_index],
+                             inputs_part_dims_flat)
+      ]
+
+      for logical_core in xrange(self._device_assignment.num_cores_per_replica):
+        # Places different partitions to different logic cores.
+        replica_id = self._device_assignment.lookup_replicas(
+            self._host_id, logical_core)[replica_index]
+        ordinal = self._device_assignment.tpu_ordinal(
+            replica=replica_id, logical_core=logical_core)
+        infeed_inputs = []
+        for it in inputs_parted_iters:
+          input_for_device = next(it, None)
+          if input_for_device is not None:
+            infeed_inputs.append(input_for_device)
+
+        if infeed_inputs:
+          per_host_enqueue_ops.append(
+              tpu_ops.infeed_enqueue_tuple(
+                  inputs=infeed_inputs,
+                  shapes=[x.shape for x in infeed_inputs],
+                  name="enqueue/replica_{0}/input_{1}".format(
+                      replica_index, logical_core),
+                  device_ordinal=ordinal))
+    return per_host_enqueue_ops
+
+  def _check_input_partition_dims(self, tensor, dims):
+    """Checks that input partition dims are valid for the `Tensor`.
+
+    Args:
+      tensor: Input tensor for partitioning.
+      dims: A list of integer describes how to partition the input tensor.
+
+    Raises:
+      ValueError: If the tensor can't be partitioned by dims or the
+        num_cores_per_replica doesn't match the number of
+        partitions(dims.prod()).
+    """
+    # No partitioning specified, so don't perform further checks.
+    if dims is None:
+      return
+
+    dims = np.array(dims)
+
+    if (dims < 1).any():
+      raise ValueError("All input partition dims must be >= 1.")
+
+    # No partitioning, so don't perform further checks.
+    if dims.prod() == 1:
+      return
+
+    if dims.prod() != self._device_assignment.num_cores_per_replica:
+      raise ValueError(
+          "The product of each input parition dim should equal to "
+          "num_cores_per_replica. (dim = {}, num_cores_per_replica "
+          "= {})".format(dims, self._device_assignment.num_cores_per_replica))
+    if dims.shape[0] != tensor.shape.ndims:
+      raise ValueError(
+          "Input partition dims must have the same number of dimensions "
+          "as the `Tensor` to be partitioned. (tensor shape = {}, input "
+          "partition dims = {}).".format(tensor.shape.as_list(), dims))
+
+    tensor.shape.assert_is_fully_defined()
+
+  def _check_dims_and_partition_or_replicate_on_host(self, tensor, dims):
+    """Checks dims and partitions or replicates the input tensor.
+
+      The ops inside this function are placed on the host side.
+
+    Args:
+      tensor: The input tensor which will be partioned or replicated.
+      dims: A list of integer describes how to partition the input tensor.
+
+    Returns:
+      An iterator of `Tensor`s or a list of partioned tensors.
+    """
+    self._check_input_partition_dims(tensor, dims)
+    return partition_or_replicate_on_host(tensor, dims)
diff --git a/tensorflow/python/tpu/tpu_function.py b/tensorflow/python/tpu/tpu_function.py
new file mode 100644
index 0000000000000000000000000000000000000000..422c7d3b26ffb4ad1b72450c4803ac2eb87cea3b
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_function.py
@@ -0,0 +1,66 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper library for functions used during TPU compilation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+
+
+class TpuContext(object):
+  """A context object holding state about the TPU computation being built."""
+
+  def __init__(self):
+    """Creates a new TpuContext."""
+    self._number_of_shards = None
+
+  @property
+  def number_of_shards(self):
+    return self._number_of_shards
+
+  def set_number_of_shards(self, number_of_shards):
+    self._number_of_shards = number_of_shards
+
+
+# The Tpu context holds the number of shards when a sharded computation is
+# being built, or None if no computation is being built.
+_current_tpu_context = TpuContext()
+
+
+@contextlib.contextmanager
+def tpu_shard_context(number_of_shards):
+  if _current_tpu_context.number_of_shards is not None:
+    raise NotImplementedError("tpu_shard_context cannot be nested.")
+  try:
+    _current_tpu_context.set_number_of_shards(number_of_shards)
+    yield
+  finally:
+    _current_tpu_context.set_number_of_shards(None)
+
+
+def get_tpu_context():
+  return _current_tpu_context
+
+
+# Decorator function for tpu computation func that was passed to tpu.rewrite()
+# if there is an embedded training loop in this func, trace tools will generate
+# step markers for each iteration.
+def on_device_training_loop(func):
+  # Value for this attribute is from xla.DebugOptions.StepMarkerLocation.
+  setattr(func, "step_marker_location", "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP")
+  return func
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_infeed_test.py b/tensorflow/python/tpu/tpu_infeed_test.py
similarity index 99%
rename from tensorflow/contrib/tpu/python/tpu/tpu_infeed_test.py
rename to tensorflow/python/tpu/tpu_infeed_test.py
index a41ff60d0af6c89fa9825d557aceefc9f6b8098d..3e90979157f891a989209fea4e56ff7090dde837 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_infeed_test.py
+++ b/tensorflow/python/tpu/tpu_infeed_test.py
@@ -19,11 +19,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import tpu_feed
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu_feed
 
 
 class InfeedTest(test.TestCase):
diff --git a/tensorflow/python/tpu/tpu_optimizer.py b/tensorflow/python/tpu/tpu_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..22c409eaa1cd4d499b72dbfbf429324d5f641e7c
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_optimizer.py
@@ -0,0 +1,203 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Optimizer that implements cross-shard gradient reduction for TPU."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu_function
+from tensorflow.python.tpu.ops import tpu_ops
+from tensorflow.python.training import optimizer
+
+
+class CrossShardOptimizer(optimizer.Optimizer):
+  """An optimizer that averages gradients across TPU shards."""
+
+  def __init__(self,
+               opt,
+               reduction=losses.Reduction.MEAN,
+               name="CrossShardOptimizer",
+               group_assignment=None):
+    """Construct a new cross-shard optimizer.
+
+    Args:
+      opt: An existing `Optimizer` to encapsulate.
+      reduction: The reduction to apply to the shard losses.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "CrossShardOptimizer".
+      group_assignment: Optional 2d int32 lists with shape
+        [num_groups, num_replicas_per_group] which describles how to apply
+        optimizer to subgroups.
+
+    Raises:
+      ValueError: If reduction is not a valid cross-shard reduction.
+    """
+    if reduction not in (losses.Reduction.SUM, losses.Reduction.MEAN):
+      raise ValueError("Unsupported reduction: %s." % reduction)
+
+    super(CrossShardOptimizer, self).__init__(False, name)
+    self._opt = opt
+    self._reduction = reduction
+    self._group_assignment = group_assignment
+
+  def _verify_and_get_subgroup_size(self, group_assignment, num_shards):
+    """Verify group_assignment and get the subgroup size".
+
+    Args:
+      group_assignment: list of group ids for applying the optimizer
+        to subgroups.
+      num_shards: The number of TPU shards.
+
+    Returns:
+      The size of one subgroup in group_assignment.
+
+    Raises:
+      ValueError: If group_assignment is invalid.
+    """
+    if not group_assignment:
+      return None
+    if not (isinstance(group_assignment, list) and
+            all(isinstance(i, list) for i in group_assignment)):
+      raise ValueError("group_assignment must be a list of list. Got {}".format(
+          group_assignment))
+
+    replica_ids = set()
+    for g in group_assignment:
+      for i in g:
+        replica_ids.add(i)
+
+    if set(range(num_shards)) != replica_ids:
+      raise ValueError("group_assignment must be a permutation of range({0})."
+                       " Got group_assignment={1}".format(
+                           num_shards, group_assignment))
+
+    subgroup_size_list = [len(group) for group in group_assignment]
+    if all(subgroup_size_list[0] == size for size in subgroup_size_list):
+      return subgroup_size_list[0]
+    else:
+      raise ValueError("The size of each subgroup in group_assignment must "
+                       "be equal. Got group_assignment={}".format(
+                           self._group_assignment))
+
+  def compute_gradients(self, loss, var_list=None, **kwargs):
+    """Compute gradients of "loss" for the variables in "var_list".
+
+    This simply wraps the compute_gradients() from the real optimizer. The
+    gradients will be aggregated in the apply_gradients() so that user can
+    modify the gradients like clipping with per replica global norm if needed.
+    The global norm with aggregated gradients can be bad as one replica's huge
+    gradients can hurt the gradients from other replicas.
+
+    Args:
+      loss: A Tensor containing the value to minimize.
+      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+        `loss`.  Defaults to the list of variables collected in the graph
+        under the key `GraphKey.TRAINABLE_VARIABLES`.
+      **kwargs: Keyword arguments for compute_gradients().
+
+    Returns:
+      A list of (gradient, variable) pairs.
+
+    Raises:
+      ValueError: If not within a tpu_shard_context or group_assignment is
+        invalid.
+    """
+    num_shards = tpu_function.get_tpu_context().number_of_shards
+    if num_shards is None:
+      logging.warning(
+          "CrossShardOptimizer should be used within a tpu_shard_context, but "
+          "got unset number_of_shards. Assuming 1.")
+      num_shards = 1
+
+    subgroup_size = self._verify_and_get_subgroup_size(self._group_assignment,
+                                                       num_shards)
+
+    if num_shards > 1 and self._reduction == losses.Reduction.MEAN:
+      if self._group_assignment:
+        scale = 1.0 / subgroup_size
+      else:
+        scale = 1.0 / num_shards
+      loss *= scale
+
+    return self._opt.compute_gradients(loss, var_list=var_list, **kwargs)
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to variables.
+
+    Calls tpu_ops.cross_replica_sum() to sum gradient contributions across
+    replicas, and then applies the real optimizer.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        compute_gradients().
+      global_step: Optional Variable to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the Optimizer constructor.
+
+    Returns:
+      An `Operation` that applies the gradients. If `global_step` was not None,
+      that operation also increments `global_step`.
+
+    Raises:
+      ValueError: If the grads_and_vars is malformed.
+    """
+    summed_grads_and_vars = []
+    for (grad, var) in grads_and_vars:
+      if grad is None:
+        summed_grads_and_vars.append((grad, var))
+      else:
+        with ops.colocate_with(grad):
+          summed_grads_and_vars.append((tpu_ops.cross_replica_sum(
+              grad, self._group_assignment), var))
+    return self._opt.apply_gradients(summed_grads_and_vars, global_step, name)
+
+  def get_slot(self, *args, **kwargs):
+    """Return a slot named "name" created for "var" by the Optimizer.
+
+    This simply wraps the get_slot() from the actual optimizer.
+
+    Args:
+      *args: Arguments for get_slot().
+      **kwargs: Keyword arguments for get_slot().
+
+    Returns:
+      The `Variable` for the slot if it was created, `None` otherwise.
+    """
+    return self._opt.get_slot(*args, **kwargs)
+
+  def get_slot_names(self, *args, **kwargs):
+    """Return a list of the names of slots created by the `Optimizer`.
+
+    This simply wraps the get_slot_names() from the actual optimizer.
+
+    Args:
+      *args: Arguments for get_slot().
+      **kwargs: Keyword arguments for get_slot().
+
+    Returns:
+      A list of strings.
+    """
+    return self._opt.get_slot_names(*args, **kwargs)
+
+  def variables(self):
+    """Forwarding the variables from the underlying optimizer."""
+    return self._opt.variables()
diff --git a/tensorflow/python/tpu/tpu_sharding.py b/tensorflow/python/tpu/tpu_sharding.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5af03f33ca8f13af517007672e9ce0e12be6205
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_sharding.py
@@ -0,0 +1,253 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper library for sharding during TPU compilation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import tensor_shape
+
+_DEFAULT_NUMBER_OF_SHARDS = 1
+_DEFAULT_SHARD_DIMENSION = 0
+
+
+# TODO(b/36777903) change other parts of tpu.py to use this class.
+class ShardingPolicy(object):
+  """An object use to hold the sharding policy for a Tensor.
+  """
+
+  def __init__(self):
+    self._number_of_shards = None
+    self._shard_dimension = None
+    self._frozen = False
+
+  def __str__(self):
+    if self.number_of_shards is None or self.shard_dimension is None:
+      return "ShardingPolicy(unset)"
+    else:
+      return ("ShardingPolicy(%d shards dimension %d)" %
+              (self.number_of_shards, self.shard_dimension))
+
+  def _fill_default_values(self):
+    if self._number_of_shards is None:
+      self._number_of_shards = _DEFAULT_NUMBER_OF_SHARDS
+    if self._shard_dimension is None:
+      self._shard_dimension = tensor_shape.as_dimension(
+          _DEFAULT_SHARD_DIMENSION)
+
+  def freeze(self):
+    """Prevents further modification to the sharding policy.
+
+    Any values that have not been set when freeze is called are set to
+    defaults. If the ShardingPolicy is already frozen, this is a NoOp.
+    """
+    if not self._frozen:
+      self._fill_default_values()
+      self._frozen = True
+
+  @property
+  def number_of_shards(self):
+    """Returns the number of shards in the policy or None if unspecified."""
+    return self._number_of_shards
+
+  def set_number_of_shards(self, number_of_shards):
+    """Sets the number of shards for the current policy.
+
+    If the policy has been frozen then number_of_shards must match the
+    existing setting.
+
+    Args:
+      number_of_shards: The number of shards to use in the policy.
+
+    Raises:
+      ValueError: If the policy has been frozen and number_of_shards
+        differs from the frozen value; or number_of_shards <= 0.
+    """
+    if self._frozen:
+      if self._number_of_shards != number_of_shards:
+        raise ValueError(
+            "Can't set sharding policy to use %d shards since it has been "
+            "frozen to use %d." % (number_of_shards, self._number_of_shards))
+    else:
+      if number_of_shards > 0:
+        self._number_of_shards = number_of_shards
+      else:
+        raise ValueError(
+            "Can't set sharding policy to use %s shards; value must be >0",
+            str(number_of_shards))
+
+  @property
+  def shard_dimension(self):
+    """Returns the shard dimension of the policy or None if unspecified."""
+    return self._shard_dimension
+
+  def set_shard_dimension(self, shard_dimension):
+    """Sets the shard dimension for the current policy.
+
+    If the policy has been frozen then shard_dimension must match the
+    existing setting.
+
+    Args:
+      shard_dimension: The shard dimension to use in the policy.
+
+    Raises:
+      ValueError: If the policy has been frozen and shard_dimension
+        differs from the frozen value, or shard_dimension can't be
+        interpreted as a Dimension.
+    """
+    if self._frozen:
+      if self._shard_dimension != shard_dimension:
+        raise ValueError(
+            "Can't set shard dimension to %d since it has been frozen to "
+            "use %d." % (shard_dimension, self._shard_dimension))
+    else:
+      self._shard_dimension = tensor_shape.as_dimension(shard_dimension)
+
+  def merge(self, other):
+    """Merges the policy of another policy into the current policy.
+
+    Args:
+      other: The policy to merge into this one.
+
+    Raises:
+      ValueError: If this policy has been frozen and the merge conflicts with
+      the frozen policy.
+    """
+    if other.number_of_shards is not None:
+      self.set_number_of_shards(other.number_of_shards)
+    if other.shard_dimension is not None:
+      self.set_shard_dimension(other.shard_dimension)
+
+  def get_sharded_shape(self, shape, shard_index=None):
+    """Returns the shape of a shard of a full Tensor.
+
+    When given the shape of a 'full-size' Tensor, returns the shape of
+    the sub-Tensor after it has been sharded. Freezes the policy if it
+    has not yet been frozen.
+
+    Args:
+      shape: The shape of the full-size Tensor to be sharded.
+      shard_index: The index of the shard whose shape should be returned.
+        shard_index can be None for sharding policies that use the same
+        shape for every shard.
+      freeze_config:
+
+    Returns:
+      The shape of the sharded version of the Tensor.
+
+    Raises:
+      ValueError: If shard_index is None when shards are of different
+        shapes; or shard_index is not None and
+        !(0<=shard_index<number_of_shards); or shape does not have at
+        least self.shard_dimension+1 dimensions; or the value of
+        shape's shard dimension is not a multiple of
+        self.number_of_shards
+    """
+    if self._shard_dimension is None or self._number_of_shards is None:
+      # Don't raise an error if the config is unset.
+      return None
+    if shard_index is not None:
+      if shard_index < 0 or shard_index >= self.number_of_shards:
+        raise ValueError("shard_index %d, but must be in [0,%d)." %
+                         (shard_index, self._number_of_shards))
+    shape = tensor_shape.as_shape(shape)
+    if self._number_of_shards == 1:
+      # Don't do anything when there's only one shard.
+      return shape
+    ndims = shape.ndims
+    if ndims is None:
+      raise ValueError("shape must be a specified shape not Unknown")
+    if ndims <= self._shard_dimension:
+      raise ValueError("shape %s does not contain shard_dimension %d" %
+                       (shape.as_list(), self._shard_dimension))
+    dims = shape.as_list()
+    if dims[self._shard_dimension] is None:
+      raise ValueError("shape %s must have a fixed size for dimension %d "
+                       "that is known at graph construction time." %
+                       (shape.as_list(), self._shard_dimension))
+    if (dims[self._shard_dimension] % self._number_of_shards) != 0:
+      raise ValueError("shape %s cannot be sharded %d ways along dimension %d" %
+                       (shape.as_list(), self._number_of_shards,
+                        self._shard_dimension))
+    dims[self._shard_dimension] /= self._number_of_shards
+    return tensor_shape.as_shape(dims)
+
+  def _unshard_shape(self, shape):
+    """Return the unsharded shape that would generate a given sharded shape.
+
+    Args:
+      shape: the sharded shape to unshard
+
+    Returns:
+      The unsharded shape.
+
+    Raises:
+      ValueError: if shape is unknown or does not contain
+        self.shard_dimension
+      TypeError: if shape is not convertible to a TensorShape
+    """
+    shape = tensor_shape.as_shape(shape)
+    if self._number_of_shards == 1:
+      # Don't do anything when there's only one shard.
+      return shape
+    ndims = shape.ndims
+    if ndims is None:
+      raise ValueError("shape must be a specified shape not Unknown")
+    if ndims <= self._shard_dimension:
+      raise ValueError("shape %s does not contain shard_dimension %d" %
+                       (shape.as_list(), self._shard_dimension))
+    dims = shape.as_list()
+    dims[self._shard_dimension] *= self._number_of_shards
+    return tensor_shape.as_shape(dims)
+
+  def get_unsharded_shape(self, shapes):
+    """Returns the shape of an unsharded Tensor given a list of shards.
+
+    When given a list of shapes of shards, returns the shape of the
+    unsharded Tensor that would generate the shards. Sets defaults for the
+    policy if number_of_shards or shard_dimension is None.
+
+    Args:
+      shapes: The shapes of the Tensor shards to be combined.
+
+    Returns:
+      The shape of the unsharded version of the Tensor.
+
+    Raises:
+      ValueError: if shapes is not a list of length
+        self.number_of_shards; or any element of shapes is not a valid
+        shape consistent with the sharding policy; or the list of
+        shapes is not a valid sharding of a full shape.
+      TypeError: if an element of shapes is not convertible to a
+        TensorShape
+    """
+    self._fill_default_values()
+    if len(shapes) != self.number_of_shards:
+      raise ValueError(
+          "shapes is %s but must be a list of length number_of_shards=%d" % (
+              str(shapes), self.number_of_shards))
+    unsharded_shapes = [self._unshard_shape(s) for s in shapes]
+    for i in xrange(self.number_of_shards - 1):
+      if not unsharded_shapes[i].is_compatible_with(
+          unsharded_shapes[self.number_of_shards - 1]):
+        raise ValueError(
+            "sharded shapes %s are not consistent shards of a full shape "
+            "sharded %d ways along dimension %d" % (
+                str(shapes), self.number_of_shards, self.shard_dimension))
+    return unsharded_shapes[0]
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_sharding_test.py b/tensorflow/python/tpu/tpu_sharding_test.py
similarity index 98%
rename from tensorflow/contrib/tpu/python/tpu/tpu_sharding_test.py
rename to tensorflow/python/tpu/tpu_sharding_test.py
index b0a5511d2d7683a5e0f527e49651df236c7a68d4..21d2a0897a0ff938359a4ca29e077678778ddc56 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_sharding_test.py
+++ b/tensorflow/python/tpu/tpu_sharding_test.py
@@ -19,10 +19,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import tpu_sharding
 
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu_sharding
 
 
 class ShardingTest(test.TestCase):
diff --git a/tensorflow/python/tpu/tpu_system_metadata.py b/tensorflow/python/tpu/tpu_system_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..866895922a42e0db8c46f515a817a31fc54d3401
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_system_metadata.py
@@ -0,0 +1,156 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPU system metadata and associated tooling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu
+
+_PINGING_MASTER_TIMEOUT_IN_MS = 60 * 1000  # 1 min
+_RETRY_TIMES = 120
+_INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS = 300 * 1000  # 5 mins
+
+_TPU_DEVICE_REG = re.compile(r'.*task:(\d+)/.*device:TPU:(\d+)$')
+
+# _TPUSystemMetadata is used by TPUEstimator to hold TPU configuration,
+# including num_cores and num_hosts.
+_TPUSystemMetadata = collections.namedtuple('_TPUSystemMetadata', [
+    'num_cores',
+    'num_hosts',
+    'num_of_cores_per_host',
+    'topology',
+    'devices',
+])
+
+
+def _query_tpu_system_metadata(master_address, cluster_def=None,
+                               query_topology=False):
+  """Automatically detects the TPU system metadata in the system."""
+  tpu_core_count = 0
+  devices = []
+  device_dict = collections.defaultdict(list)
+
+  # TODO(b/120564445): Replace with standard library for retries.
+  retry_count = 1
+  while True:
+    logging.info('Querying Tensorflow master (%s) for TPU system metadata.',
+                 master_address)
+    try:
+      with ops.Graph().as_default():
+        with session_lib.Session(
+            master_address,
+            config=get_session_config_with_timeout(
+                _PINGING_MASTER_TIMEOUT_IN_MS,
+                cluster_def)) as sess:
+          devices = sess.list_devices()
+          for device in devices:
+            match = _TPU_DEVICE_REG.match(device.name)
+            if match:
+              host_id = match.group(1)
+              core_id = match.group(2)
+              device_dict[host_id].append(core_id)
+              tpu_core_count += 1
+          break
+    except errors.DeadlineExceededError:
+      msg = ('Failed to connect to the Tensorflow master. The TPU worker may '
+             'not be ready (still scheduling) or the Tensorflow master address '
+             'is incorrect: got (%s).' %
+             (master_address))
+
+      # TODO(xiejw): For local or grpc master we might not need retry logic
+      # here.
+      if retry_count <= _RETRY_TIMES:
+        logging.warning('%s', msg)
+        logging.warning('Retrying (%d/%d).', retry_count, _RETRY_TIMES)
+        retry_count += 1
+      else:
+        raise ValueError(msg)
+
+  num_of_cores_per_host = 0
+  if tpu_core_count:
+    num_cores_per_host_set = set(
+        [len(core_ids) for core_ids in device_dict.values()])
+    if len(num_cores_per_host_set) != 1:
+      raise RuntimeError(
+          'TPU cores on each host is not same. This should not happen!. '
+          'devices: {}'.format(devices))
+    num_of_cores_per_host = num_cores_per_host_set.pop()
+
+  topology = None
+  if query_topology:
+    if not tpu_core_count:
+      raise RuntimeError(
+          'Cannot find any TPU cores in the system (master address {}). '
+          'This usually means the master address is incorrect or the '
+          'TPU worker has some problems. Available devices: {}'.format(
+              master_address, devices))
+
+    topology = _obtain_topology(master_address, cluster_def)
+
+  metadata = _TPUSystemMetadata(
+      num_cores=tpu_core_count,
+      num_hosts=len(device_dict),
+      num_of_cores_per_host=num_of_cores_per_host,
+      topology=topology,
+      devices=devices)
+
+  if tpu_core_count:
+    logging.info('Found TPU system:')
+    logging.info('*** Num TPU Cores: %d', metadata.num_cores)
+    logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
+    logging.info('*** Num TPU Cores Per Worker: %d',
+                 metadata.num_of_cores_per_host)
+    for device in metadata.devices:
+      logging.info('*** Available Device: %s', device)
+  else:
+    logging.info('Failed to find TPU: %s', metadata)
+  return metadata
+
+
+def _obtain_topology(master_address, cluster_def):
+  """Obtains TPU fabric topology."""
+  try:
+    logging.info('Initializing TPU system (master: %s) to fetch topology '
+                 'for model parallelism. This might take a while.',
+                 master_address)
+    with ops.Graph().as_default():
+      session_config = get_session_config_with_timeout(
+          _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, cluster_def)
+      with session_lib.Session(
+          master_address, config=session_config) as sess:
+        topology = sess.run(tpu.initialize_system())
+        return topology
+  except errors.DeadlineExceededError:
+    raise ValueError(
+        'Fail to initialize TPU system with master (%s). '
+        'Please double check the TPU system is functional.' % (
+            master_address))
+
+
+def get_session_config_with_timeout(timeout_in_secs, cluster_def):
+  """Returns a session given a timeout and a cluster configuration."""
+  config = config_pb2.ConfigProto(
+      operation_timeout_in_ms=timeout_in_secs, cluster_def=cluster_def)
+  return config
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_test.py b/tensorflow/python/tpu/tpu_test.py
similarity index 94%
rename from tensorflow/contrib/tpu/python/tpu/tpu_test.py
rename to tensorflow/python/tpu/tpu_test.py
index 6bdaa528f9f946ae4b9813d554409da2406b1f8d..69b03811daa7055e20471474767242788eecdc0a 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_test.py
+++ b/tensorflow/python/tpu/tpu_test.py
@@ -19,18 +19,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.contrib.tpu.python.tpu import tpu_feed
-from tensorflow.contrib.tpu.python.tpu import training_loop
-
 from tensorflow.python.framework import dtypes
 from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
-
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu import tpu_feed
+from tensorflow.python.tpu import training_loop
 
 
 class TPUContextTest(test.TestCase):
diff --git a/tensorflow/python/tpu/training_loop.py b/tensorflow/python/tpu/training_loop.py
new file mode 100644
index 0000000000000000000000000000000000000000..cffeb7e915a06a513e96e1ed60beabf6b79b6518
--- /dev/null
+++ b/tensorflow/python/tpu/training_loop.py
@@ -0,0 +1,222 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Library for constructing a training loop, suitable for TPUs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.tpu import tensor_tracer
+from tensorflow.python.tpu import tpu_function
+from tensorflow.python.tpu import xla
+
+
+def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
+  """Builds a training loop for TPUs.
+
+  The set of loop-carried tensors corresponds to `inputs`.  Both
+  `condition` and `body` take the current value of the loop-carried
+  tensors. 'body' additionally takes a tuple of infeed from
+  infeed_queue if infeed_queue is not None. `condition` must return a
+  single boolean value that determines whether iteration
+  continues. `body` must return an updated list of values for the
+  loop-carried tensors.
+
+  Args:
+    condition: a Python function that builds the loop condition.
+    body: a Python function that builds the loop body.
+    inputs: a list of initial values passed into the training loop, or
+      None (equivalent to an empty list).
+    infeed_queue: if not None, the infeed queue from which to append a tuple
+      of arguments as inputs to condition.
+    name: (Deprecated) Does nothing.
+
+  Returns:
+    The final values of the loop-carried tensors.
+
+  Raises:
+    TypeError: if body or condition has the wrong signature.
+  """
+  del name
+  # Converts inputs to Tensors.
+  inputs = [] if inputs is None else [ops.convert_to_tensor(x) for
+                                      x in inputs]
+  input_types = [x.dtype for x in inputs]
+  input_arity = len(inputs)
+
+  body_arg_error = xla.check_function_argument_count(
+      body, input_arity, infeed_queue)
+  if body_arg_error is not None:
+    if infeed_queue is None:
+      raise TypeError(
+          "Supplied loop body function cannot be called with the specified "
+          "inputs. You specified %d inputs: %s, but the loop body needs %s" % (
+              input_arity, str([i.name for i in inputs]), body_arg_error))
+    else:
+      raise TypeError(
+          "Supplied loop body function cannot be called with the specified "
+          "inputs. You specified %d inputs: %s and %d additional inputs from "
+          "infeed, but the computation needs %s" % (input_arity, str(
+              [i.name for i in inputs]), infeed_queue.number_of_tuple_elements,
+                                                    body_arg_error))
+  condition_arg_error = xla.check_function_argument_count(
+      condition, input_arity, None)
+  if condition_arg_error is not None:
+    if infeed_queue is None:
+      raise TypeError(
+          "Supplied loop condition function cannot be called with the "
+          "specified inputs. You specified %d inputs: %s, but the loop "
+          "condition needs %s" % (input_arity, str([i.name for i in inputs]),
+                                  condition_arg_error))
+    else:
+      raise TypeError(
+          "Supplied loop condition function cannot be called with the "
+          "specified inputs. You specified %d inputs: %s, but the loop "
+          "condition needs %s. Note that infeed is not passed to the loop "
+          "condition." % (input_arity, str([i.name for i in inputs]),
+                          condition_arg_error))
+
+  def condition_wrapper(*inputs):
+    # Discards the dummy output added for arity-0 loops.
+    if input_arity == 0:
+      inputs = []
+    return condition(*inputs)
+
+  def body_wrapper(*inputs):
+    """Wrapper around `body` that handles infeed queues and control deps."""
+    inputs = list(inputs)
+
+    # Discards the dummy output added for arity-0 loops.
+    if input_arity == 0:
+      inputs = []
+
+    # Runs `body` with the dequeue_ops appended.
+    if infeed_queue:
+      number_of_shards = tpu_function.get_tpu_context().number_of_shards
+      if number_of_shards is None:
+        raise ValueError("Can't build training loop with infeed when there is "
+                         "no tpu_shard_context. Are you building a loop or "
+                         "graph directly rather than from inside tpu.rewrite, "
+                         "tpu.batch_parallel, tpu.shard, or tpu.replicate?")
+      infeed_queue.set_number_of_shards(number_of_shards)
+      dequeue_ops = [d for d in infeed_queue.generate_dequeue_op()]
+    else:
+      dequeue_ops = []
+    outputs = body(*(inputs + dequeue_ops))
+
+    # If the computation only returned one value, make it a tuple.
+    if not isinstance(outputs, (list, tuple)):
+      outputs = (outputs,)
+
+    outputs = [
+        o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
+        for o in outputs
+    ]
+
+    # Separates the returned Operations and Tensors.
+    output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
+    output_tensors = [o for o in outputs
+                      if not isinstance(o, ops.Operation)]
+
+    if outputs != output_tensors + output_operations:
+      raise ValueError(
+          "TPU training loop body must return zero or more Tensor values "
+          "followed by zero or more Operations.")
+
+    output_types = [op.dtype for op in output_tensors]
+    if input_types != output_types:
+      raise TypeError(
+          "Mismatch between input types and output types for training loop "
+          "body: {} vs {}".format(input_types, output_types))
+
+    # Add the dequeue operations to output_operations to ensure they are run
+    # by the loop, even if the programmer's loop body does not use them.
+    output_operations += dequeue_ops
+
+    # Add a dummy output, if needed.
+    if not output_tensors:
+      output_tensors = array_ops.constant(0)
+
+    if output_operations:
+      # TODO(phawkins): in principle this is too restrictive since it serializes
+      # the training loop steps. In practice it does not matter since this loop
+      # will be compiled by XLA.
+      output_tensors = control_flow_ops.tuple(output_tensors,
+                                              control_inputs=output_operations)
+
+    if tensor_tracer.TensorTracer.is_enabled():
+      num_replicas = tpu_function.get_tpu_context().number_of_shards
+      if num_replicas is None:
+        num_replicas = 1
+      tt = tensor_tracer.TensorTracer()
+      output_tensors = tt.trace_tpu(ops.get_default_graph(),
+                                    output_tensors, None,
+                                    num_replicas)
+    return output_tensors
+
+  # If the body has arity 0, add a dummy loop-carried value to which we can add
+  # control dependencies from any side-effecting operations.
+  if input_arity == 0:
+    inputs = [array_ops.constant(0)]
+  return control_flow_ops.while_loop(
+      condition_wrapper, body_wrapper, inputs, name="", parallel_iterations=1)
+
+
+def repeat(n, body, inputs=None, infeed_queue=None, name=None):
+  """Builds a training loop that executes a fixed number of iterations.
+
+  The set of loop-carried tensors correspond to `inputs`.
+  `body` must be a function that takes and returns the values of the
+  loop-carried tensors.
+
+  Args:
+    n: the number of loop iterations
+    body: a Python function that builds the loop body.
+    inputs: a list of initial values passed into the training loop or
+      None (equivalent to an empty list).
+    infeed_queue: if not None, the infeed queue from which to append a tuple
+      of arguments as inputs to condition.
+    name: (Deprecated) Does nothing.
+  Returns:
+    The final values of the loop-carried tensors.
+  Raises:
+    ValueError: if there is a type error.
+  """
+  def _convert_to_list(xs):
+    if not isinstance(xs, (list, tuple)):
+      return [xs]
+    else:
+      return list(xs)
+
+  def cond(i, *args):
+    del args
+    return i < n
+
+  def body_wrapper(i, *args):
+    return [i + 1] + _convert_to_list(body(*args))
+
+  inputs = [0] if inputs is None else [0] + _convert_to_list(inputs)
+  outputs = while_loop(
+      cond, body_wrapper, inputs=inputs, infeed_queue=infeed_queue, name=name)
+  outputs = _convert_to_list(outputs)
+  if len(outputs) == 1:
+    # Returns the Op rather than an empty list.
+    return outputs[0].op
+  else:
+    return outputs[1:]
diff --git a/tensorflow/python/tpu/util.py b/tensorflow/python/tpu/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfb8ce1d1821da05c853bb0d10b1db3a857ccb1b
--- /dev/null
+++ b/tensorflow/python/tpu/util.py
@@ -0,0 +1,51 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+
+"""Utilities for the functionalities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+import six
+
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import training
+
+def check_positive_integer(value, name):
+  """Checks whether `value` is a positive integer."""
+  if not isinstance(value, six.integer_types):
+    raise TypeError('{} must be int, got {}'.format(name, type(value)))
+
+  if value <= 0:
+    raise ValueError('{} must be positive, got {}'.format(name, value))
+
+
+# TODO(b/118302029) Remove this copy of MultiHostDatasetInitializerHook after we
+# release a tensorflow_estimator with MultiHostDatasetInitializerHook in
+# python/estimator/util.py.
+class MultiHostDatasetInitializerHook(training.SessionRunHook):
+  """Creates a SessionRunHook that initializes all passed iterators."""
+
+  def __init__(self, dataset_initializers):
+    self._initializers = dataset_initializers
+
+  def after_create_session(self, session, coord):
+    del coord
+    start = time.time()
+    session.run(self._initializers)
+    logging.info('Initialized dataset iterators in %d seconds',
+                 time.time() - start)
diff --git a/tensorflow/python/tpu/xla.py b/tensorflow/python/tpu/xla.py
new file mode 100644
index 0000000000000000000000000000000000000000..58476fae3d132aeeac7c23f176e23ea609478b15
--- /dev/null
+++ b/tensorflow/python/tpu/xla.py
@@ -0,0 +1,106 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""XLA utility functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.util import tf_inspect
+
+
+def is_flat(outputs):
+  """Checks if outputs is a flat structure.
+
+    Following structures and values are considered flat:
+    1) None
+    2) A single object
+    3) A list or tuple of Tensors/Operations
+
+    The only structures that this function understands are sequences and
+    dictionaries.  E.g. this means that if outputs contains a single
+    user-defined Object, it is considered to be flat. Errors are raised later on
+    if that Object cannot be converted to a Tensor.
+
+  Args:
+    outputs: Output from `computation` inside `xla.compile`.
+
+  Returns:
+    A boolean indicates whether outputs is flat.
+  """
+  # If outputs is a list or tuple, check if it has any nested structure. If
+  # there is, then outputs is non-flat.
+  if isinstance(outputs, collections.Sequence):
+    for o in outputs:
+      if isinstance(o, collections.Sequence) or isinstance(o, dict):
+        return False
+
+  # If outputs is a dict, it is non-flat.
+  if isinstance(outputs, dict):
+    return False
+
+  # Getting here means either outputs itself is a single non-structured value
+  # or it is a flat list of single non-structured values.
+  return True
+
+
+def check_function_argument_count(func, input_arity, infeed_queue):
+  """Validate the number of input arguments to an XLA function.
+
+  Args:
+    func: the Python function that will be called to generate the body of an XLA
+      computation graph.
+    input_arity: the number of explicit arguments supplied by the caller.
+    infeed_queue: if not None, the infeed queue that will supply
+      additional arguments to the function.
+
+  Returns:
+    None if function can be called with the supplied number of
+      arguments, or an error string if it cannot.
+  """
+  def format_error(complaint, quantity):
+    return '%s %d argument%s' % (complaint, quantity, ''
+                                 if quantity == 1 else 's')
+
+  num_args_supplied = input_arity
+  if infeed_queue is not None:
+    num_args_supplied += infeed_queue.number_of_tuple_elements
+  arg_spec = tf_inspect.getargspec(func)
+  num_func_args = len(arg_spec.args)
+  if arg_spec.defaults is None:
+    num_func_defaults = 0
+  else:
+    num_func_defaults = len(arg_spec.defaults)
+  min_func_args = num_func_args - num_func_defaults
+  if num_args_supplied < min_func_args:
+    # The required number of arguments is not enough to call the function.
+    if num_func_defaults == 0 and arg_spec.varargs is None:
+      return format_error('exactly', num_func_args)
+    else:
+      return format_error('at least', min_func_args)
+  if arg_spec.varargs is None and num_args_supplied > num_func_args:
+    # The required number of arguments is too many to call the function.
+    if num_func_defaults == 0:
+      return format_error('exactly', num_func_args)
+    else:
+      return format_error('at most', num_func_args)
+  # Reaching here means either
+  # 1) There are varargs, func can accept any number of arguments greater than
+  # the minimum.
+  # 2) Number of supplied arguments falls in range of acceptable argument count
+  # of func.
+  return None
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 81e03c735851ae67e3b332c956f7c91446da01d6..46ec3be54ec6851bd096d59f7298b6202608c53b 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -52,7 +52,7 @@ class AdamOptimizer(optimizer.Optimizer):
     $$t := 0 \text{(Initialize timestep)}$$
 
     The update rule for `variable` with gradient `g` uses an optimization
-    described at the end of section2 of the paper:
+    described at the end of section 2 of the paper:
 
     $$t := t + 1$$
     $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
diff --git a/tensorflow/python/training/checkpoint_management_test.py b/tensorflow/python/training/checkpoint_management_test.py
index 8606ec4a206ffbce85cf4071934deeb5a545b055..053298d1a592df821cd56e15d9026f6386f0e502 100644
--- a/tensorflow/python/training/checkpoint_management_test.py
+++ b/tensorflow/python/training/checkpoint_management_test.py
@@ -38,7 +38,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_module
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import util
 
 
 class LatestCheckpointWithRelativePaths(test.TestCase):
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 41a42bd2695eaecaee271409afa03653ca6b4014..7d7e95c7e60fe379ded98c60ca89a71a288b3e50 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -41,7 +41,8 @@ from tensorflow.python.training import queue_runner
 from tensorflow.python.training import saver as training_saver
 from tensorflow.python.training import session_manager as sm
 from tensorflow.python.training import session_run_hook
-from tensorflow.python.training.checkpointable import util as checkpointable_util
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import util as trackable_util
 from tensorflow.python.util import function_utils
 from tensorflow.python.util.tf_export import tf_export
 
@@ -227,10 +228,10 @@ class Scaffold(object):
     if self._saver is None:
       self._saver = training_saver._get_saver_or_default()  # pylint: disable=protected-access
     # pylint: enable=g-long-lambda
-    if isinstance(self._saver, checkpointable_util.Checkpoint):
+    if isinstance(self._saver, trackable_util.Checkpoint):
       self._saver = training_saver.Saver(
-          var_list=checkpointable_util.CheckpointableSaver(
-              self._saver).gather_objects(),
+          var_list=graph_view.ObjectGraphView(
+              self._saver).frozen_saveable_objects(),
           sharded=True)
     else:
       self._saver.build()
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 3742ebb807f4c245aef956144f7888d7b0560375..4361f07e196050c87338d0f7102f530d2c2c9be7 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -24,7 +24,6 @@ import abc
 
 import six
 
-from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
@@ -40,7 +39,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import slot_creator
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -215,10 +214,10 @@ def _get_processor(v):
 
 @tf_export(v1=["train.Optimizer"])
 class Optimizer(
-    # Optimizers inherit from CheckpointableBase rather than Checkpointable
+    # Optimizers inherit from Trackable rather than AutoTrackable
     # since they do most of their dependency management themselves (slot
     # variables are special-cased, and non-slot variables are keyed to graphs).
-    checkpointable.Checkpointable):
+    trackable.Trackable):
   """Base class for optimizers.
 
   This class defines the API to add Ops to train a model.  You never use this
@@ -334,9 +333,9 @@ class Optimizer(
     #   ... }
     self._slots = {}
     self._non_slot_dict = {}
-    # For implementing Checkpointable. Stores information about how to restore
+    # For implementing Trackable. Stores information about how to restore
     # slot variables which have not yet been created
-    # (checkpointable._CheckpointPosition objects).
+    # (trackable._CheckpointPosition objects).
     #  {slot_name :
     #      {_var_key(variable_to_train): [checkpoint_position, ... ], ... },
     #   ... }
@@ -461,12 +460,6 @@ class Optimizer(
           tape.watch(var_list)
         loss_value = loss()
 
-        # Scale loss if using a "mean" loss reduction and multiple replicas.
-        # Have to be careful to call distribute_lib.get_loss_reduction()
-        # *after* loss() is evaluated, so we know what loss reduction it uses.
-        # TODO(josh11b): Test that we handle weight decay in a reasonable way.
-        loss_value = self._scale_loss(loss_value)
-
       if var_list is None:
         var_list = tape.watched_variables()
       # TODO(jhseu): Figure out why GradientTape's gradients don't require loss
@@ -481,9 +474,6 @@ class Optimizer(
           "`loss` passed to Optimizer.compute_gradients should "
           "be a function when eager execution is enabled.")
 
-    # Scale loss if using a "mean" loss reduction and multiple replicas.
-    loss = self._scale_loss(loss)
-
     if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP,
                               Optimizer.GATE_GRAPH]:
       raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
@@ -518,14 +508,6 @@ class Optimizer(
          if g is not None and v.dtype != dtypes.resource])
     return grads_and_vars
 
-  @staticmethod
-  def _scale_loss(loss_value):
-    if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
-      num_replicas = distribute_ctx.get_strategy().num_replicas_in_sync
-      if num_replicas > 1:
-        loss_value *= (1. / num_replicas)
-    return loss_value
-
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
     """Apply gradients to variables.
 
@@ -814,7 +796,7 @@ class Optimizer(
     key = (name, graph)
     v = self._non_slot_dict.get(key, None)
     if v is None:
-      self._maybe_initialize_checkpointable()
+      self._maybe_initialize_trackable()
       distribution_strategy = distribute_ctx.get_strategy()
       with distribution_strategy.extended.colocate_vars_with(colocate_with):
         if eager:
@@ -827,19 +809,19 @@ class Optimizer(
             use_resource=resource_variable_ops.is_resource_variable(
                 colocate_with))
       # Restore this variable by name if necessary, but don't add a
-      # Checkpointable dependency. Optimizers return the current graph's
+      # Trackable dependency. Optimizers return the current graph's
       # non-slot variables from _checkpoint_dependencies explicitly rather
       # than unconditionally adding dependencies (since there may be multiple
       # non-slot variables with the same name in different graphs, trying to
       # save all of them would result in errors).
-      self._handle_deferred_dependencies(name=name, checkpointable=v)
+      self._handle_deferred_dependencies(name=name, trackable=v)
       self._non_slot_dict[key] = v
 
     return v
 
   @property
   def _checkpoint_dependencies(self):
-    """From Checkpointable. Gather graph-specific non-slot variables to save."""
+    """From Trackable. Gather graph-specific non-slot variables to save."""
     current_graph_non_slot_variables = []
     current_graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
     for (name, _), variable_object in sorted(self._non_slot_dict.items(),
@@ -847,13 +829,13 @@ class Optimizer(
                                              key=lambda item: item[0][0]):
       if variable_object._graph_key == current_graph_key:  # pylint: disable=protected-access
         current_graph_non_slot_variables.append(
-            checkpointable.CheckpointableReference(
+            trackable.TrackableReference(
                 name=name, ref=variable_object))
     return (super(Optimizer, self)._checkpoint_dependencies
             + current_graph_non_slot_variables)
 
   def _lookup_dependency(self, name):
-    """From Checkpointable. Find a non-slot variable in the current graph."""
+    """From Trackable. Find a non-slot variable in the current graph."""
     unconditional = super(Optimizer, self)._lookup_dependency(name)
     if unconditional is not None:
       return unconditional
@@ -1158,7 +1140,7 @@ class Optimizer(
     return named_slots[_var_key(var)]
 
   # --------------
-  # For implementing the Checkpointable interface.
+  # For implementing the Trackable interface.
   # --------------
 
   def _restore_slot_variable(self, slot_name, variable, slot_variable):
@@ -1189,8 +1171,8 @@ class Optimizer(
     slot variable needs to be restored).
 
     Args:
-      slot_variable_position: A `checkpointable._CheckpointPosition` object
-        indicating the slot variable `Checkpointable` object to be restored.
+      slot_variable_position: A `trackable._CheckpointPosition` object
+        indicating the slot variable `Trackable` object to be restored.
       slot_name: The name of this `Optimizer`'s slot to restore into.
       variable: The variable object this slot is being created for.
     """
@@ -1208,7 +1190,7 @@ class Optimizer(
         # (aside from double initialization), and makes variable creator scopes
         # behave the same way they do when graph building.
         and not ops.get_default_graph()._variable_creator_stack):  # pylint: disable=protected-access
-      initializer = checkpointable.CheckpointInitialValue(
+      initializer = trackable.CheckpointInitialValue(
           checkpoint_position=slot_variable_position)
       slot_variable = self._get_or_make_slot(
           var=variable,
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index ee0e2154e803af64462d027e690d4796fa251dd1..215fc3965248c6582993bae3844cc0e3067acce3 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -17,7 +17,7 @@
 """Save and restore variables.
 
 Symbols in this file are deprecated. See replacements in
-tensorflow/python/training/checkpointable and tensorflow/python/training/saving.
+tensorflow/python/training/trackable and tensorflow/python/training/saving.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -29,10 +29,9 @@ import time
 import uuid
 
 import numpy as np
-
-from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.core.protobuf import trackable_object_graph_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
@@ -51,9 +50,9 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.saving import saveable_object
 from tensorflow.python.training.saving import saveable_object_util
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -1552,12 +1551,13 @@ def export_meta_graph(filename=None,
     RuntimeError: If called with eager execution enabled.
 
   @compatibility(eager)
-  Exporting/importing meta graphs is not supported. No graph exists when eager
-  execution is enabled.
+  Exporting/importing meta graphs is not supported unless both `graph_def` and
+  `graph` are provided. No graph exists when eager execution is enabled.
   @end_compatibility
   """
   # pylint: enable=line-too-long
-  if context.executing_eagerly():
+  if context.executing_eagerly() and not (graph_def is not None and
+                                          graph is not None):
     raise RuntimeError("Exporting/importing meta graphs is not supported when "
                        "eager execution is enabled. No graph exists when eager "
                        "execution is enabled.")
@@ -1604,9 +1604,9 @@ def object_graph_key_mapping(checkpoint_path):
   """
   reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_path)
   object_graph_string = reader.get_tensor(
-      checkpointable.OBJECT_GRAPH_PROTO_KEY)
+      trackable.OBJECT_GRAPH_PROTO_KEY)
   object_graph_proto = (
-      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+      trackable_object_graph_pb2.TrackableObjectGraph())
   object_graph_proto.ParseFromString(object_graph_string)
   names_to_keys = {}
   for node in object_graph_proto.nodes:
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index dec23c50e8c069d4f2dd18c49ecdabb447f4872b..9b2a1da7c29723b589b67484bd2e1d880ef1363d 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -73,9 +73,9 @@ from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import saver as saver_module
 from tensorflow.python.training import saver_test_utils
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import base as checkpointable_base
-from tensorflow.python.training.checkpointable import tracking as checkpointable_tracking
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import base as trackable_base
+from tensorflow.python.training.tracking import tracking as trackable_tracking
+from tensorflow.python.training.tracking import util as trackable_utils
 from tensorflow.python.util import compat
 
 
@@ -2775,15 +2775,15 @@ class ScopedGraphTest(test.TestCase):
       self.assertEqual(2.0, self.evaluate(var_dict2["variable2:0"]))
 
 
-class _OwnsAVariableSimple(checkpointable_base.Checkpointable):
-  """A Checkpointable object which can be saved using a tf.train.Saver."""
+class _OwnsAVariableSimple(trackable_base.Trackable):
+  """A Trackable object which can be saved using a tf.train.Saver."""
 
   def __init__(self):
     self.non_dep_variable = variable_scope.get_variable(
         name="non_dep_variable", initializer=6., use_resource=True)
 
   def _gather_saveables_for_checkpoint(self):
-    return {checkpointable_base.VARIABLE_VALUE_KEY: self.non_dep_variable}
+    return {trackable_base.VARIABLE_VALUE_KEY: self.non_dep_variable}
 
   # The Saver sorts by name before parsing, so we need a name property.
   @property
@@ -2808,8 +2808,8 @@ class _MirroringSaveable(
         self._mirrored_variable.assign(tensor))
 
 
-class _OwnsMirroredVariables(checkpointable_base.Checkpointable):
-  """A Checkpointable object which returns a more complex SaveableObject."""
+class _OwnsMirroredVariables(trackable_base.Trackable):
+  """A Trackable object which returns a more complex SaveableObject."""
 
   def __init__(self):
     self.non_dep_variable = variable_scope.get_variable(
@@ -2823,7 +2823,7 @@ class _OwnsMirroredVariables(checkpointable_base.Checkpointable):
           primary_variable=self.non_dep_variable,
           mirrored_variable=self.mirrored,
           name=name)
-    return {checkpointable_base.VARIABLE_VALUE_KEY: _saveable_factory}
+    return {trackable_base.VARIABLE_VALUE_KEY: _saveable_factory}
 
   # The Saver sorts by name before parsing, so we need a name property.
   @property
@@ -2831,11 +2831,11 @@ class _OwnsMirroredVariables(checkpointable_base.Checkpointable):
     return self.non_dep_variable.name
 
 
-class NonLayerCheckpointable(checkpointable_tracking.AutoCheckpointable):
+class NonLayerTrackable(trackable_tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
         self, name="a_variable", shape=[])
 
 
@@ -2846,19 +2846,19 @@ class MyModel(training.Model):
     super(MyModel, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
     return ret
 
 
-class CheckpointableCompatibilityTests(test.TestCase):
+class TrackableCompatibilityTests(test.TestCase):
 
   # TODO(allenl): Track down python3 reference cycles in these tests.
   @test_util.run_in_graph_and_eager_modes
-  def testNotSaveableButIsCheckpointable(self):
+  def testNotSaveableButIsTrackable(self):
     v = _OwnsAVariableSimple()
     test_dir = self.get_temp_dir()
     prefix = os.path.join(test_dir, "ckpt")
@@ -2923,13 +2923,13 @@ class CheckpointableCompatibilityTests(test.TestCase):
     model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
     optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model, optimizer_step=optimizer_step)
     train_op = optimizer.minimize(
         functools.partial(model, input_value),
         global_step=optimizer_step)
-    self.evaluate(checkpointable_utils.gather_initializers(
-        root_checkpointable))
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
     self.evaluate(train_op)
     # A regular variable, a slot variable, and a non-slot Optimizer variable
     # with known values to check when loading.
@@ -2938,24 +2938,24 @@ class CheckpointableCompatibilityTests(test.TestCase):
         var=model._named_dense.bias, name="m").assign([2.]))
     beta1_power, _ = optimizer._get_beta_accumulators()
     self.evaluate(beta1_power.assign(3.))
-    return root_checkpointable
+    return root_trackable
 
-  def _set_sentinels(self, root_checkpointable):
-    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+  def _set_sentinels(self, root_trackable):
+    self.evaluate(root_trackable.model._named_dense.bias.assign([101.]))
     self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")
         .assign([102.]))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    beta1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.evaluate(beta1_power.assign(103.))
 
-  def _check_sentinels(self, root_checkpointable):
+  def _check_sentinels(self, root_trackable):
     self.assertAllEqual(
-        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+        [1.], self.evaluate(root_trackable.model._named_dense.bias))
     self.assertAllEqual([2.], self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.assertAllEqual(3., self.evaluate(beta1_power))
 
   def testVariableNotFoundErrorRaised(self):
@@ -3012,13 +3012,13 @@ class CheckpointableCompatibilityTests(test.TestCase):
     save_graph = ops_lib.Graph()
     with save_graph.as_default(), self.session(graph=save_graph) as sess:
       root = self._initialized_model()
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      object_saver = trackable_utils.Checkpoint(root=root)
       save_path = object_saver.save(file_prefix=checkpoint_prefix)
 
       # An incompatible object-based checkpoint to check error messages
       var = resource_variable_ops.ResourceVariable(1., name="a")
       self.evaluate(var.initializer)
-      second_saver = checkpointable_utils.CheckpointableSaver(var)
+      second_saver = trackable_utils.Checkpoint(v=var)
       second_path = second_saver.save(file_prefix=os.path.join(
           checkpoint_directory, "second"))
 
@@ -3046,7 +3046,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
     save_graph = ops_lib.Graph()
     with save_graph.as_default(), self.session(graph=save_graph):
       root = self._initialized_model()
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      object_saver = trackable_utils.Checkpoint(root=root)
       save_path = object_saver.save(file_prefix=checkpoint_prefix)
 
     with context.eager_mode():
diff --git a/tensorflow/python/training/saving/BUILD b/tensorflow/python/training/saving/BUILD
index 67ccd59b88c289a11791c9098a2014c48e6c33fb..adb50f9aa6b2e3c19c9d616cf996d37ae503d7a4 100644
--- a/tensorflow/python/training/saving/BUILD
+++ b/tensorflow/python/training/saving/BUILD
@@ -49,7 +49,7 @@ py_library(
     deps = [
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:variables",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/training/saving/saveable_object_util.py b/tensorflow/python/training/saving/saveable_object_util.py
index b8cc66249bffd7c5c21280969a5d27b8c3b89da7..eeec19e5886d48828b85ab7aa6931db38a561613 100644
--- a/tensorflow/python/training/saving/saveable_object_util.py
+++ b/tensorflow/python/training/saving/saveable_object_util.py
@@ -26,8 +26,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.saving import saveable_object
+from tensorflow.python.training.tracking import base as trackable
 
 
 # Op names which identify variable reads which should be saved.
@@ -137,7 +137,7 @@ def saveable_objects_for_op(op, name):
   if not isinstance(name, six.string_types):
     raise TypeError(
         "names_to_saveables must be a dict mapping string names to "
-        "checkpointable operations. Name is not a string: %s" % name)
+        "trackable operations. Name is not a string: %s" % name)
   if isinstance(op, saveable_object.SaveableObject):
     yield op
   elif isinstance(op, (list, tuple, variables.PartitionedVariable)):
@@ -165,11 +165,11 @@ def saveable_objects_for_op(op, name):
         yield ResourceVariableSaveable(
             variable, variable._save_slice_info.spec, name)
     # pylint: enable=protected-access
-  elif isinstance(op, checkpointable.Checkpointable) and not isinstance(
+  elif isinstance(op, trackable.Trackable) and not isinstance(
       op, variables.Variable):
     # pylint: disable=protected-access
     for attr, factory in op._gather_saveables_for_checkpoint().items():
-      if attr == checkpointable.VARIABLE_VALUE_KEY:
+      if attr == trackable.VARIABLE_VALUE_KEY:
         # Keep original name for classes masquerading as variables.
         full_name = name
       else:
@@ -250,15 +250,18 @@ def op_list_to_dict(op_list, convert_variable_to_tensor=True):
         names_to_saveables[name].append(var)
       else:
         names_to_saveables[name] = [var]
-    elif (isinstance(var, checkpointable.Checkpointable)
+    elif (isinstance(var, trackable.Trackable)
           and not isinstance(var, variables.Variable)):
-      checkpointable_saveables = [
+      trackable_saveables = [
           (factory() if callable(factory) else factory)
           for factory in var._gather_saveables_for_checkpoint().values()]
       names_to_saveables.update(
-          op_list_to_dict(checkpointable_saveables))
+          op_list_to_dict(trackable_saveables))
     else:
-      if context.executing_eagerly():
+      # Variables (reference and resource) have an _in_graph_mode property
+      # indicating whether they were created in a graph building context. We
+      # also get Tensors when graph building, which do not have this property.
+      if not getattr(var, "_in_graph_mode", True):
         if not isinstance(var, resource_variable_ops.ResourceVariable):
           raise ValueError(
               "Can only save/restore ResourceVariables when eager execution "
@@ -323,7 +326,7 @@ def validate_and_slice_inputs(names_to_saveables):
 
   Raises:
     TypeError: If any of the keys are not strings or any of the
-      values are not one of Tensor or Variable or a checkpointable operation.
+      values are not one of Tensor or Variable or a trackable operation.
     ValueError: If the same operation is given in more than one value
       (this also applies to slices of SlicedVariables).
   """
diff --git a/tensorflow/python/training/checkpointable/BUILD b/tensorflow/python/training/tracking/BUILD
similarity index 88%
rename from tensorflow/python/training/checkpointable/BUILD
rename to tensorflow/python/training/tracking/BUILD
index a39462732f591cb49bb4ee07a45a9efe732f589e..40a6e93e09c8a5c603e183f87a15652c08eaca44 100644
--- a/tensorflow/python/training/checkpointable/BUILD
+++ b/tensorflow/python/training/tracking/BUILD
@@ -29,6 +29,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/training/saving:saveable_object",
+        "@six_archive//:six",
     ],
 )
 
@@ -94,6 +95,29 @@ tf_py_test(
     ],
 )
 
+py_library(
+    name = "object_identity",
+    srcs = ["object_identity.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_library(
+    name = "graph_view",
+    srcs = ["graph_view.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base",
+        ":object_identity",
+        ":tracking",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/training/saving:saveable_object",
+        "//tensorflow/python/training/saving:saveable_object_util",
+    ],
+)
+
 py_library(
     name = "util",
     srcs = ["util.py"],
@@ -101,17 +125,19 @@ py_library(
     deps = [
         ":base",
         ":data_structures",
+        ":graph_view",
+        ":object_identity",
         ":tracking",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:io_ops_gen",
+        "//tensorflow/python:lib",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:saver",
         "//tensorflow/python:session",
@@ -122,7 +148,6 @@ py_library(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/training/saving:functional_saver",
-        "//tensorflow/python/training/saving:saveable_object",
         "//tensorflow/python/training/saving:saveable_object_util",
     ],
 )
@@ -132,10 +157,12 @@ tf_py_test(
     srcs = ["util_test.py"],
     additional_deps = [
         ":base",
+        ":graph_view",
         ":tracking",
         ":util",
         "@absl_py//absl/testing:parameterized",
         "@six_archive//:six",
+        "//tensorflow/python/keras/optimizer_v2",
         "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
@@ -149,8 +176,8 @@ tf_py_test(
         "//tensorflow/python:session",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:template",
-        "//tensorflow/python:training",
         "//tensorflow/python:training_util",
+        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
@@ -158,6 +185,7 @@ tf_py_test(
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
+        "//tensorflow/python:variables",
     ],
     tags = ["notsan"],  # b/74395663
 )
@@ -167,6 +195,7 @@ tf_xla_py_test(
     srcs = ["util_xla_test.py"],
     tags = [
         "no_pip",
+        "no_rocm",
         "nomac",
         "notsan",  # b/74395663
     ],
@@ -190,6 +219,7 @@ tf_py_test(
     srcs = ["util_with_v1_optimizers_test.py"],
     additional_deps = [
         ":base",
+        ":graph_view",
         ":tracking",
         ":util",
         "@absl_py//absl/testing:parameterized",
@@ -218,5 +248,8 @@ tf_py_test(
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
     ],
-    tags = ["notsan"],  # b/74395663
+    tags = [
+        "no_windows",  # b/124401331
+        "notsan",  # b/74395663
+    ],
 )
diff --git a/tensorflow/python/training/checkpointable/base.py b/tensorflow/python/training/tracking/base.py
similarity index 81%
rename from tensorflow/python/training/checkpointable/base.py
rename to tensorflow/python/training/tracking/base.py
index 9fb251e455c3980dbb78682087b4a736266da2b6..4a8960d3cabdac1852f42e89a6c2e61d0ca4577e 100644
--- a/tensorflow/python/training/checkpointable/base.py
+++ b/tensorflow/python/training/tracking/base.py
@@ -44,18 +44,18 @@ OBJECT_GRAPH_PROTO_KEY = "_CHECKPOINTABLE_OBJECT_GRAPH"
 
 
 # A key indicating a variable's value in an object's checkpointed Tensors
-# (Checkpointable._gather_saveables_for_checkpoint). If this is the only key and
+# (Trackable._gather_saveables_for_checkpoint). If this is the only key and
 # the object has no dependencies, then its value may be restored on object
 # creation (avoiding double assignment when executing eagerly).
 VARIABLE_VALUE_KEY = "VARIABLE_VALUE"
 OBJECT_CONFIG_JSON_KEY = "OBJECT_CONFIG_JSON"
 
-CheckpointableReference = collections.namedtuple(
-    "CheckpointableReference",
+TrackableReference = collections.namedtuple(
+    "TrackableReference",
     [
         # The local name for this dependency.
         "name",
-        # The Checkpointable object being referenced.
+        # The Trackable object being referenced.
         "ref"
     ])
 
@@ -187,34 +187,34 @@ class PythonStringStateSaveable(PythonStateSaveable):
     return control_flow_ops.no_op()
 
 
-class _CheckpointPosition(object):
-  """Indicates a position within a `_Checkpoint`."""
+class CheckpointPosition(object):
+  """Indicates a position within a `_CheckpointRestoreCoordinator`."""
 
   def __init__(self, checkpoint, proto_id):
     """Specify an object within a checkpoint.
 
     Args:
-      checkpoint: A _Checkpoint object.
-      proto_id: The index of this object in CheckpointableObjectGraph.nodes.
+      checkpoint: A _CheckpointRestoreCoordinator object.
+      proto_id: The index of this object in TrackableObjectGraph.nodes.
     """
     self._checkpoint = checkpoint
     self._proto_id = proto_id
 
-  def restore(self, checkpointable):
-    """Restore this value into `checkpointable`."""
+  def restore(self, trackable):
+    """Restore this value into `trackable`."""
     with ops.init_scope():
-      if self.bind_object(checkpointable):
+      if self.bind_object(trackable):
         # This object's correspondence with a checkpointed object is new, so
         # process deferred restorations for it and its dependencies.
-        restore_ops = checkpointable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
+        restore_ops = trackable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
         if restore_ops:
           self._checkpoint.new_restore_ops(restore_ops)
 
-  def bind_object(self, checkpointable):
+  def bind_object(self, trackable):
     """Set a checkpoint<->object correspondence and process slot variables.
 
     Args:
-      checkpointable: The object to record a correspondence for.
+      trackable: The object to record a correspondence for.
     Returns:
       True if this is a new assignment, False if this object has already been
       mapped to a checkpointed `Object` proto.
@@ -222,14 +222,14 @@ class _CheckpointPosition(object):
       AssertionError: If another object is already bound to the `Object` proto.
     """
     checkpoint = self.checkpoint
-    checkpoint.all_python_objects.add(checkpointable)
+    checkpoint.all_python_objects.add(trackable)
     current_assignment = checkpoint.object_by_proto_id.get(self._proto_id, None)
     if current_assignment is None:
-      checkpoint.object_by_proto_id[self._proto_id] = checkpointable
+      checkpoint.object_by_proto_id[self._proto_id] = trackable
       for deferred_slot_restoration in (
           checkpoint.deferred_slot_restorations.pop(self._proto_id, ())):
-        checkpointable._create_or_restore_slot_variable(  # pylint: disable=protected-access
-            slot_variable_position=_CheckpointPosition(
+        trackable._create_or_restore_slot_variable(  # pylint: disable=protected-access
+            slot_variable_position=CheckpointPosition(
                 checkpoint=checkpoint,
                 proto_id=deferred_slot_restoration.slot_variable_id),
             variable=deferred_slot_restoration.original_variable,
@@ -244,15 +244,15 @@ class _CheckpointPosition(object):
           checkpoint.deferred_slot_restorations.setdefault(
               slot_restoration.optimizer_id, []).append(
                   _DeferredSlotVariableRestoration(
-                      original_variable=checkpointable,
+                      original_variable=trackable,
                       slot_variable_id=slot_restoration.slot_variable_id,
                       slot_name=slot_restoration.slot_name))
         else:
           optimizer_object._create_or_restore_slot_variable(  # pylint: disable=protected-access
-              slot_variable_position=_CheckpointPosition(
+              slot_variable_position=CheckpointPosition(
                   checkpoint=checkpoint,
                   proto_id=slot_restoration.slot_variable_id),
-              variable=checkpointable,
+              variable=trackable,
               slot_name=slot_restoration.slot_name)
       return True  # New assignment
     else:
@@ -260,14 +260,14 @@ class _CheckpointPosition(object):
       # we don't need to do anything besides check that the mapping is
       # consistent (if the dependency DAG is not a tree then there are
       # multiple paths to the same object).
-      if current_assignment is not checkpointable:
+      if current_assignment is not trackable:
         logging.warning(
             ("Inconsistent references when loading the checkpoint into this "
-             "object graph. Either the Checkpointable object references in the "
+             "object graph. Either the Trackable object references in the "
              "Python program have changed in an incompatible way, or the "
              "checkpoint was generated in an incompatible program.\n\nTwo "
              "checkpoint references resolved to different objects (%s and %s).")
-            % (current_assignment, checkpointable))
+            % (current_assignment, trackable))
       return False  # Not a new assignment
 
   def is_simple_variable(self):
@@ -306,7 +306,7 @@ class _CheckpointPosition(object):
 
   def _gather_ops_or_named_saveables(self):
     """Looks up or creates SaveableObjects which don't have cached ops."""
-    saveables = self.checkpointable._gather_saveables_for_checkpoint()  # pylint: disable=protected-access
+    saveables = self.trackable._gather_saveables_for_checkpoint()  # pylint: disable=protected-access
     # Name saveables based on the name this object had when it was checkpointed.
     named_saveables = {}
     python_saveables = []
@@ -325,15 +325,16 @@ class _CheckpointPosition(object):
       # the SaveableObject itself has been cached. If not, we'll make it, and
       # either way we'll extract new ops from it (or if it has Python state to
       # restore, we'll run that).
-      if self._checkpoint.saveable_object_cache is None:
+      saveables_cache = self._checkpoint.graph_view.saveables_cache
+      if saveables_cache is None:
         # No SaveableObject caching when executing eagerly.
         saveable = None
       else:
         # If we've already created and cached a SaveableObject for this
         # attribute, we can re-use it to avoid re-creating some ops when graph
         # building.
-        saveable_list = self._checkpoint.saveable_object_cache.get(
-            self.checkpointable, {}).get(serialized_tensor.name, (None,))
+        saveable_list = saveables_cache.get(
+            self.trackable, {}).get(serialized_tensor.name, (None,))
         if len(saveable_list) == 1:
           # Almost every attribute will have exactly one SaveableObject.
           saveable, = saveable_list
@@ -347,7 +348,7 @@ class _CheckpointPosition(object):
         # the SaveableObject.
         if serialized_tensor.checkpoint_key not in saveable.name:
           saveable = None
-          del self._checkpoint.saveable_object_cache[self.checkpointable]
+          del saveables_cache[self.trackable]
           break
       if saveable is None:
         # If there was no cached SaveableObject, we should check if the Python
@@ -360,15 +361,15 @@ class _CheckpointPosition(object):
           # checkpoint was loaded.
           if not serialized_tensor.optional_restore:
             self._checkpoint.unused_attributes.setdefault(
-                self.checkpointable, []).append(serialized_tensor.name)
+                self.trackable, []).append(serialized_tensor.name)
           continue
         if callable(saveable_factory):
           saveable = saveable_factory(name=serialized_tensor.checkpoint_key)
         else:
           saveable = saveable_factory
-        if self._checkpoint.saveable_object_cache is not None:
-          self._checkpoint.saveable_object_cache.setdefault(
-              self.checkpointable, {})[serialized_tensor.name] = [saveable]
+        if saveables_cache is not None:
+          saveables_cache.setdefault(
+              self.trackable, {})[serialized_tensor.name] = [saveable]
       if isinstance(saveable, PythonStateSaveable):
         python_saveables.append(saveable)
       else:
@@ -378,7 +379,7 @@ class _CheckpointPosition(object):
   def restore_ops(self):
     """Create or fetch restore ops for this object's attributes.
 
-    Requires that the `Checkpointable` Python object has been bound to an object
+    Requires that the `Trackable` Python object has been bound to an object
     ID in the checkpoint.
 
     Returns:
@@ -397,7 +398,7 @@ class _CheckpointPosition(object):
     return self._checkpoint
 
   @property
-  def checkpointable(self):
+  def trackable(self):
     return self._checkpoint.object_by_proto_id[self._proto_id]
 
   @property
@@ -435,11 +436,11 @@ _SlotVariableRestoration = collections.namedtuple(
 def no_automatic_dependency_tracking(method):
   """Disables automatic dependency tracking on attribute assignment.
 
-  Use to decorate any method of a Checkpointable object. Attribute assignment in
+  Use to decorate any method of a Trackable object. Attribute assignment in
   that method will not add dependencies (also respected in Model). Harmless if
   used in a class which does not do automatic dependency tracking (which means
   it's safe to use in base classes which may have subclasses which also inherit
-  from Checkpointable).
+  from Trackable).
 
   Args:
     method: The method to decorate.
@@ -452,46 +453,47 @@ def no_automatic_dependency_tracking(method):
     previous_value = getattr(self, "_setattr_tracking", True)
     self._setattr_tracking = False  # pylint: disable=protected-access
     try:
-      method(self, *args, **kwargs)
+      result = method(self, *args, **kwargs)
     finally:
       self._setattr_tracking = previous_value  # pylint: disable=protected-access
+    return result
 
   return tf_decorator.make_decorator(
       target=method, decorator_func=_method_wrapper)
 
 
-class Checkpointable(object):
-  """Base class for `Checkpointable` objects without automatic dependencies.
+class Trackable(object):
+  """Base class for `Trackable` objects without automatic dependencies.
 
   This class has no __setattr__ override for performance reasons. Dependencies
   must be added explicitly. Unless attribute assignment is performance-critical,
-  use `AutoCheckpointable` instead. Use `Checkpointable` for `isinstance`
+  use `AutoTrackable` instead. Use `Trackable` for `isinstance`
   checks.
   """
 
-  # Checkpointable does not do automatic dependency tracking, but uses the
+  # Trackable does not do automatic dependency tracking, but uses the
   # no_automatic_dependency_tracking decorator so it can avoid adding
-  # dependencies if a subclass is Checkpointable / inherits from Model (both of
+  # dependencies if a subclass is Trackable / inherits from Model (both of
   # which have __setattr__ overrides).
   @no_automatic_dependency_tracking
-  def _maybe_initialize_checkpointable(self):
+  def _maybe_initialize_trackable(self):
     """Initialize dependency management.
 
     Not __init__, since most objects will forget to call it.
     """
     if hasattr(self, "_unconditional_checkpoint_dependencies"):
       # __init__ already called. This check means that we don't need
-      # Checkpointable.__init__() in the constructor of every TensorFlow object.
+      # Trackable.__init__() in the constructor of every TensorFlow object.
       return
-    # A list of CheckpointableReference objects. Some classes implementing
-    # `Checkpointable`, notably `Optimizer`s, may override the
+    # A list of TrackableReference objects. Some classes implementing
+    # `Trackable`, notably `Optimizer`s, may override the
     # _checkpoint_dependencies property with conditional dependencies
     # (e.g. based on the current graph when saving).
     self._unconditional_checkpoint_dependencies = []
-    # Maps names -> Checkpointable objects
+    # Maps names -> Trackable objects
     self._unconditional_dependency_names = {}
-    # Restorations for other Checkpointable objects on which this object may
-    # eventually depend. Maps local name -> _CheckpointPosition list. Optimizers
+    # Restorations for other Trackable objects on which this object may
+    # eventually depend. Maps local name -> CheckpointPosition list. Optimizers
     # tack on conditional dependencies, and so need separate management of
     # deferred dependencies too.
     self._unconditional_deferred_dependencies = {}
@@ -529,8 +531,8 @@ class Checkpointable(object):
     May be overridden to include conditional dependencies.
 
     Returns:
-      A list of `CheckpointableReference` objects indicating named
-      `Checkpointable` dependencies which should be saved along with this
+      A list of `TrackableReference` objects indicating named
+      `Trackable` dependencies which should be saved along with this
       object.
     """
     return self._unconditional_checkpoint_dependencies
@@ -539,13 +541,13 @@ class Checkpointable(object):
   def _deferred_dependencies(self):
     """A dictionary with deferred dependencies.
 
-    Stores restorations for other Checkpointable objects on which this object
+    Stores restorations for other Trackable objects on which this object
     may eventually depend. May be overridden by sub-classes (e.g. Optimizers use
     conditional dependencies based the current graph, and so need separate
     management of deferred dependencies too).
 
     Returns:
-      A dictionary mapping from local name to a list of _CheckpointPosition
+      A dictionary mapping from local name to a list of CheckpointPosition
       objects.
     """
     return self._unconditional_deferred_dependencies
@@ -558,7 +560,7 @@ class Checkpointable(object):
     Args:
       name: The local name of the dependency.
     Returns:
-      A `Checkpointable` object, or `None` if no dependency by this name was
+      A `Trackable` object, or `None` if no dependency by this name was
       found.
     """
     return self._unconditional_dependency_names.get(name, None)
@@ -567,9 +569,9 @@ class Checkpointable(object):
       self, name, shape=None, dtype=dtypes.float32,
       initializer=None, getter=None, overwrite=False,
       **kwargs_for_getter):
-    """Restore-on-create for a variable be saved with this `Checkpointable`.
+    """Restore-on-create for a variable be saved with this `Trackable`.
 
-    If the user has requested that this object or another `Checkpointable` which
+    If the user has requested that this object or another `Trackable` which
     depends on this object be restored from a checkpoint (deferred loading
     before variable object creation), `initializer` may be ignored and the value
     from the checkpoint used instead.
@@ -591,7 +593,7 @@ class Checkpointable(object):
     Raises:
       ValueError: If the variable name is not unique.
     """
-    self._maybe_initialize_checkpointable()
+    self._maybe_initialize_trackable()
     with ops.init_scope():
       if context.executing_eagerly():
         # If this is a variable with a single Tensor stored in the checkpoint,
@@ -607,11 +609,11 @@ class Checkpointable(object):
               isinstance(initializer, CheckpointInitialValue)
               and (initializer.restore_uid
                    > checkpoint_initializer.restore_uid))):
-        # If multiple Checkpointable objects are "creating" the same variable
+        # If multiple Trackable objects are "creating" the same variable
         # via the magic of custom getters, the one with the highest restore UID
         # (the one called last) has to make the final initializer. If another
         # custom getter interrupts this process by overwriting the initializer,
-        # then we'll catch that when we call _track_checkpointable. So this is
+        # then we'll catch that when we call _track_trackable. So this is
         # "best effort" to set the initializer with the highest restore UID.
         initializer = checkpoint_initializer
         shape = None
@@ -623,12 +625,12 @@ class Checkpointable(object):
     # assign again. It will add this variable to our dependencies, and if there
     # is a non-trivial restoration queued, it will handle that. This also
     # handles slot variables.
-    if not overwrite or isinstance(new_variable, Checkpointable):
-      return self._track_checkpointable(new_variable, name=name,
-                                        overwrite=overwrite)
+    if not overwrite or isinstance(new_variable, Trackable):
+      return self._track_trackable(new_variable, name=name,
+                                   overwrite=overwrite)
     else:
       # TODO(allenl): Some variable types are not yet supported. Remove this
-      # fallback once all get_variable() return types are Checkpointable.
+      # fallback once all get_variable() return types are Trackable.
       return new_variable
 
   def _preload_simple_restoration(self, name, shape):
@@ -667,46 +669,46 @@ class Checkpointable(object):
     return CheckpointInitialValue(
         checkpoint_position=checkpoint_position, shape=shape)
 
-  def _track_checkpointable(self, checkpointable, name, overwrite=False):
-    """Declare a dependency on another `Checkpointable` object.
+  def _track_trackable(self, trackable, name, overwrite=False):
+    """Declare a dependency on another `Trackable` object.
 
     Indicates that checkpoints for this object should include variables from
-    `checkpointable`.
+    `trackable`.
 
-    Variables in a checkpoint are mapped to `Checkpointable`s based on the names
+    Variables in a checkpoint are mapped to `Trackable`s based on the names
     provided when the checkpoint was written. To avoid breaking existing
     checkpoints when modifying a class, neither variable names nor dependency
-    names (the names passed to `_track_checkpointable`) may change.
+    names (the names passed to `_track_trackable`) may change.
 
     Args:
-      checkpointable: A `Checkpointable` which this object depends on.
-      name: A local name for `checkpointable`, used for loading checkpoints into
+      trackable: A `Trackable` which this object depends on.
+      name: A local name for `trackable`, used for loading checkpoints into
         the correct objects.
       overwrite: Boolean, whether silently replacing dependencies is OK. Used
         for __setattr__, where throwing an error on attribute reassignment would
         be inappropriate.
 
     Returns:
-      `checkpointable`, for convenience when declaring a dependency and
+      `trackable`, for convenience when declaring a dependency and
       assigning to a member variable in one statement.
 
     Raises:
-      TypeError: If `checkpointable` does not inherit from `Checkpointable`.
+      TypeError: If `trackable` does not inherit from `Trackable`.
       ValueError: If another object is already tracked by this name.
     """
-    self._maybe_initialize_checkpointable()
-    if not isinstance(checkpointable, Checkpointable):
+    self._maybe_initialize_trackable()
+    if not isinstance(trackable, Trackable):
       raise TypeError(
-          ("Checkpointable._track_checkpointable() passed type %s, not a "
-           "Checkpointable.") % (type(checkpointable),))
-    new_reference = CheckpointableReference(name=name, ref=checkpointable)
+          ("Trackable._track_trackable() passed type %s, not a "
+           "Trackable.") % (type(trackable),))
+    new_reference = TrackableReference(name=name, ref=trackable)
     current_object = self._lookup_dependency(name)
     if (current_object is not None
-        and current_object is not checkpointable):
+        and current_object is not trackable):
       if not overwrite:
         raise ValueError(
-            ("Called Checkpointable._track_checkpointable() with name='%s', "
-             "but a Checkpointable with this name is already declared as a "
+            ("Called Trackable._track_trackable() with name='%s', "
+             "but a Trackable with this name is already declared as a "
              "dependency. Names must be unique (or overwrite=True).") % (name,))
       # This is a weird thing to do, but we're not going to stop people from
       # using __setattr__.
@@ -717,20 +719,20 @@ class Checkpointable(object):
     elif current_object is None:
       self._unconditional_checkpoint_dependencies.append(new_reference)
       self._handle_deferred_dependencies(
-          name=name, checkpointable=checkpointable)
-    self._unconditional_dependency_names[name] = checkpointable
-    return checkpointable
+          name=name, trackable=trackable)
+    self._unconditional_dependency_names[name] = trackable
+    return trackable
 
-  def _handle_deferred_dependencies(self, name, checkpointable):
-    """Pop and load any deferred checkpoint restores into `checkpointable`.
+  def _handle_deferred_dependencies(self, name, trackable):
+    """Pop and load any deferred checkpoint restores into `trackable`.
 
-    This method does not add a new dependency on `checkpointable`, but it does
+    This method does not add a new dependency on `trackable`, but it does
     check if any outstanding/deferred dependencies have been queued waiting for
     this dependency to be added (matched based on `name`). If so,
-    `checkpointable` and its dependencies are restored. The restorations are
+    `trackable` and its dependencies are restored. The restorations are
     considered fulfilled and so are deleted.
 
-    `_track_checkpointable` is more appropriate for adding a
+    `_track_trackable` is more appropriate for adding a
     normal/unconditional dependency, and includes handling for deferred
     restorations. This method allows objects such as `Optimizer` to use the same
     restoration logic while managing conditional dependencies themselves, by
@@ -740,25 +742,25 @@ class Checkpointable(object):
 
     Args:
       name: The name of the dependency within this object (`self`), used to
-        match `checkpointable` with values saved in a checkpoint.
-      checkpointable: The Checkpointable object to restore (inheriting from
-        `Checkpointable`).
+        match `trackable` with values saved in a checkpoint.
+      trackable: The Trackable object to restore (inheriting from
+        `Trackable`).
     """
-    self._maybe_initialize_checkpointable()
-    checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
+    self._maybe_initialize_trackable()
+    trackable._maybe_initialize_trackable()  # pylint: disable=protected-access
     deferred_dependencies_list = self._deferred_dependencies.pop(name, ())
     for checkpoint_position in sorted(
         deferred_dependencies_list,
         key=lambda restore: restore.checkpoint.restore_uid,
         reverse=True):
-      checkpoint_position.restore(checkpointable)
+      checkpoint_position.restore(trackable)
 
     # Pass on any name-based restores queued in this object.
     for name_based_restore in sorted(
         self._name_based_restores,
         key=lambda checkpoint: checkpoint.restore_uid,
         reverse=True):
-      checkpointable._name_based_attribute_restore(name_based_restore)  # pylint: disable=protected-access
+      trackable._name_based_attribute_restore(name_based_restore)  # pylint: disable=protected-access
 
   def _restore_from_checkpoint_position(self, checkpoint_position):
     """Restore this object and its dependencies (may be deferred)."""
@@ -771,7 +773,7 @@ class Checkpointable(object):
     while visit_queue:
       current_position = visit_queue.popleft()
       restore_ops.extend(nest.flatten(
-          current_position.checkpointable  # pylint: disable=protected-access
+          current_position.trackable  # pylint: disable=protected-access
           ._single_restoration_from_checkpoint_position(
               checkpoint_position=current_position,
               visit_queue=visit_queue)))
@@ -780,7 +782,7 @@ class Checkpointable(object):
   def _single_restoration_from_checkpoint_position(
       self, checkpoint_position, visit_queue):
     """Restore this object, and either queue its dependencies or defer them."""
-    self._maybe_initialize_checkpointable()
+    self._maybe_initialize_trackable()
     checkpoint = checkpoint_position.checkpoint
     # If the UID of this restore is lower than our current update UID, we don't
     # need to actually restore the object. However, we should pass the
@@ -791,7 +793,7 @@ class Checkpointable(object):
     else:
       restore_ops = ()
     for child in checkpoint_position.object_proto.children:
-      child_position = _CheckpointPosition(
+      child_position = CheckpointPosition(
           checkpoint=checkpoint,
           proto_id=child.node_id)
       local_object = self._lookup_dependency(child.local_name)
@@ -801,7 +803,7 @@ class Checkpointable(object):
         self._deferred_dependencies.setdefault(child.local_name, []).append(
             child_position)
       else:
-        if child_position.bind_object(checkpointable=local_object):
+        if child_position.bind_object(trackable=local_object):
           # This object's correspondence is new, so dependencies need to be
           # visited. Delay doing it so that we get a breadth-first dependency
           # resolution order (shallowest paths first). The caller is responsible
@@ -817,7 +819,7 @@ class Checkpointable(object):
     or variables easily converted to `SaveableObject`s (as in `tf.train.Saver`'s
     `var_list` constructor argument).
 
-    `SaveableObjects` have a name set, which Checkpointable needs to generate
+    `SaveableObjects` have a name set, which Trackable needs to generate
     itself. So rather than returning `SaveableObjects` directly, this method
     should return a dictionary of callables which take `name` arguments and
     return `SaveableObjects` with that name.
@@ -849,15 +851,25 @@ class Checkpointable(object):
       """Serializes `self.get_config()` for saving."""
       dereferenced_self = weak_self()
       if dereferenced_self:
-        try:
-          return json.dumps(dereferenced_self,
-                            default=serialization.get_json_type,
-                            sort_keys=True).encode("utf8")
-        except TypeError:
-          # Even if get_config worked objects may have produced garbage.
-          return ""
+        return json.dumps(
+            dereferenced_self,
+            default=serialization.get_json_type,
+            sort_keys=True).encode("utf8")
       else:
         return ""
     return {OBJECT_CONFIG_JSON_KEY: functools.partial(
         PythonStringStateSaveable,
         state_callback=_state_callback)}
+
+  def _list_functions_for_serialization(self):
+    """Lists the functions of this trackable to serialize.
+
+    Internal sub-classes can override this with specific logic. E.g.
+    `AutoTrackable` provides an implementation that returns the `attr`
+    that return functions.
+
+    Returns:
+        A dictionary mapping attribute names to `Function` or
+        `ConcreteFunction`.
+    """
+    return dict()
diff --git a/tensorflow/python/training/checkpointable/base_test.py b/tensorflow/python/training/tracking/base_test.py
similarity index 75%
rename from tensorflow/python/training/checkpointable/base_test.py
rename to tensorflow/python/training/tracking/base_test.py
index 5863f5a5604b465cb33a3388c16656b3fe065b5d..4a74417e3ba9a081ad2a6c7150e63ffd3aa898fa 100644
--- a/tensorflow/python/training/checkpointable/base_test.py
+++ b/tensorflow/python/training/tracking/base_test.py
@@ -22,29 +22,29 @@ import os
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import util
 
 
 class InterfaceTests(test.TestCase):
 
   def testOverwrite(self):
-    root = base.Checkpointable()
-    leaf = base.Checkpointable()
-    root._track_checkpointable(leaf, name="leaf")
+    root = base.Trackable()
+    leaf = base.Trackable()
+    root._track_trackable(leaf, name="leaf")
     (current_name, current_dependency), = root._checkpoint_dependencies
     self.assertIs(leaf, current_dependency)
     self.assertEqual("leaf", current_name)
-    duplicate_name_dep = base.Checkpointable()
+    duplicate_name_dep = base.Trackable()
     with self.assertRaises(ValueError):
-      root._track_checkpointable(duplicate_name_dep, name="leaf")
-    root._track_checkpointable(duplicate_name_dep, name="leaf", overwrite=True)
+      root._track_trackable(duplicate_name_dep, name="leaf")
+    root._track_trackable(duplicate_name_dep, name="leaf", overwrite=True)
     (current_name, current_dependency), = root._checkpoint_dependencies
     self.assertIs(duplicate_name_dep, current_dependency)
     self.assertEqual("leaf", current_name)
 
   def testAddVariableOverwrite(self):
-    root = base.Checkpointable()
+    root = base.Trackable()
     a = root._add_variable_with_custom_getter(
         name="v", shape=[], getter=variable_scope.get_variable)
     self.assertEqual([root, a], util.list_objects(root))
@@ -61,15 +61,15 @@ class InterfaceTests(test.TestCase):
             getter=variable_scope.get_variable)
 
   def testAssertConsumedWithUnusedPythonState(self):
-    has_config = base.Checkpointable()
+    has_config = base.Trackable()
     has_config.get_config = lambda: {}
     saved = util.Checkpoint(obj=has_config)
     save_path = saved.save(os.path.join(self.get_temp_dir(), "ckpt"))
-    restored = util.Checkpoint(obj=base.Checkpointable())
+    restored = util.Checkpoint(obj=base.Trackable())
     restored.restore(save_path).assert_consumed()
 
   def testAssertConsumedFailsWithUsedPythonState(self):
-    has_config = base.Checkpointable()
+    has_config = base.Trackable()
     attributes = {
         "foo_attr": functools.partial(
             base.PythonStringStateSaveable,
@@ -78,24 +78,11 @@ class InterfaceTests(test.TestCase):
     has_config._gather_saveables_for_checkpoint = lambda: attributes
     saved = util.Checkpoint(obj=has_config)
     save_path = saved.save(os.path.join(self.get_temp_dir(), "ckpt"))
-    restored = util.Checkpoint(obj=base.Checkpointable())
+    restored = util.Checkpoint(obj=base.Trackable())
     status = restored.restore(save_path)
     with self.assertRaisesRegexp(AssertionError, "foo_attr"):
       status.assert_consumed()
 
-  def testBuggyGetConfig(self):
-
-    class NotSerializable(object):
-      pass
-
-    class GetConfigRaisesError(base.Checkpointable):
-
-      def get_config(self):
-        return NotSerializable()
-
-    util.Checkpoint(obj=GetConfigRaisesError()).save(
-        os.path.join(self.get_temp_dir(), "ckpt"))
-
 
 if __name__ == "__main__":
   ops.enable_eager_execution()
diff --git a/tensorflow/python/training/checkpointable/data_structures.py b/tensorflow/python/training/tracking/data_structures.py
similarity index 84%
rename from tensorflow/python/training/checkpointable/data_structures.py
rename to tensorflow/python/training/tracking/data_structures.py
index c86846f8e23d7e2de8e062e2c16e38a8bfcab423..7f1cc50710452ec7262be03434f4d1dc450b5442 100644
--- a/tensorflow/python/training/checkpointable/data_structures.py
+++ b/tensorflow/python/training/tracking/data_structures.py
@@ -1,4 +1,4 @@
-"""Checkpointable data structures."""
+"""Trackable data structures."""
 # Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,18 +24,20 @@ import sys
 
 import six
 
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function as defun
 from tensorflow.python.ops import variables
 from tensorflow.python.saved_model import revived_types
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import layer_utils
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import layer_utils
 
 
 class NoDependency(object):
-  """Allows attribute assignment to `Checkpointable` objects with no dependency.
+  """Allows attribute assignment to `Trackable` objects with no dependency.
 
   Example usage:
   ```python
-  obj = Checkpointable()
+  obj = Trackable()
   obj.has_dependency = tf.Variable(0., name="dep")
   obj.no_dependency = NoDependency(tf.Variable(1., name="nodep"))
   assert obj.no_dependency.name == "nodep:0"
@@ -57,16 +59,22 @@ class NoDependency(object):
 
 def _wrap_or_unwrap(value):
   """Wraps basic data structures, unwraps NoDependency objects."""
+  # pylint: disable=unidiomatic-typecheck
+  # Exact type checking to avoid mucking up custom logic in list/dict
+  # subclasses, e.g. collections.Counter.
   if isinstance(value, NoDependency):
     return value.value
-  if isinstance(value, base.Checkpointable):
-    return value  # Skip conversion for already checkpointable objects.
-  elif isinstance(value, dict):
+  if isinstance(value, base.Trackable):
+    return value  # Skip conversion for already trackable objects.
+  elif type(value) == dict:
     return _DictWrapper(value)
-  elif isinstance(value, list):
+  elif type(value) == collections.OrderedDict:
+    return _DictWrapper(value)
+  elif type(value) == list:
     return _ListWrapper(value)
   else:
     return value
+  # pylint: enable=unidiomatic-typecheck
   # TODO(allenl): Handle other common data structures. Tuples will require
   # special casing (tuple subclasses are not weak referenceable, so replacement
   # with a wrapper that subclasses tuple on attribute assignment works poorly,
@@ -75,19 +83,19 @@ def _wrap_or_unwrap(value):
   # come up with names. Dictionaries should look like lists.
 
 
-def sticky_attribute_assignment(checkpointable, name, value):
+def sticky_attribute_assignment(trackable, name, value):
   """Adds dependencies, generally called from __setattr__.
 
-  This behavior is shared between Checkpointable and Model.
+  This behavior is shared between Trackable and Model.
 
-  Respects NoDependency indicators, but otherwise makes checkpointable objects
+  Respects NoDependency indicators, but otherwise makes trackable objects
   out of common data structures and tracks objects by their attribute names.
 
   Args:
-    checkpointable: The object to add dependencies to (generally the one having
+    trackable: The object to add dependencies to (generally the one having
       an attribute assigned).
     name: The attribute name being assigned.
-    value: The value being assigned. Not necessarily a checkpointable object.
+    value: The value being assigned. Not necessarily a trackable object.
 
   Returns:
     The value which should be stored in the attribute (unwrapped from a
@@ -100,18 +108,18 @@ def sticky_attribute_assignment(checkpointable, name, value):
   value = _wrap_or_unwrap(value)
   if not add_dependency:
     return value
-  if isinstance(value, base.Checkpointable):
-    checkpointable._track_checkpointable(  # pylint: disable=protected-access
+  if isinstance(value, base.Trackable):
+    trackable._track_trackable(  # pylint: disable=protected-access
         value, name=name,
-        # Allow the user to switch the Checkpointable which is tracked by this
+        # Allow the user to switch the Trackable which is tracked by this
         # name, since assigning a new variable to an attribute has
         # historically been fine (e.g. Adam did this).
         overwrite=True)
   return value
 
 
-class CheckpointableDataStructure(base.Checkpointable):
-  """Base class for data structures which contain checkpointable objects."""
+class TrackableDataStructure(base.Trackable):
+  """Base class for data structures which contain trackable objects."""
 
   def __init__(self):
     self.trainable = True
@@ -120,14 +128,14 @@ class CheckpointableDataStructure(base.Checkpointable):
   def _track_value(self, value, name):
     """Add a dependency on `value`."""
     value = sticky_attribute_assignment(
-        checkpointable=self, value=value, name=name)
+        trackable=self, value=value, name=name)
     if isinstance(value, variables.Variable):
       self._extra_variables.append(value)
-    if not isinstance(value, base.Checkpointable):
+    if not isinstance(value, base.Trackable):
       raise ValueError(
-          ("Only checkpointable objects (such as Layers or Optimizers) may be "
+          ("Only trackable objects (such as Layers or Optimizers) may be "
            "stored in a List object. Got %s, which does not inherit from "
-           "Checkpointable.") % (value,))
+           "Trackable.") % (value,))
     if hasattr(value, "_use_resource_variables"):
       # In subclassed models, legacy layers (tf.layers) must always use
       # resource variables.
@@ -136,7 +144,7 @@ class CheckpointableDataStructure(base.Checkpointable):
 
   @property
   def _values(self):
-    """An iterable/sequence which may contain checkpointable objects."""
+    """An iterable/sequence which may contain trackable objects."""
     raise NotImplementedError("Abstract method")
 
   @property
@@ -146,7 +154,7 @@ class CheckpointableDataStructure(base.Checkpointable):
     # they're wrapping if out of sync.
     collected = []
     for obj in self._values:
-      if (isinstance(obj, CheckpointableDataStructure)
+      if (isinstance(obj, TrackableDataStructure)
           or layer_utils.is_layer(obj)
           or layer_utils.has_weights(obj)):
         collected.append(obj)
@@ -213,19 +221,19 @@ class CheckpointableDataStructure(base.Checkpointable):
     return id(self)
 
   def __eq__(self, other):
-    # Similar to Tensors, checkpointable data structures use object-identity
+    # Similar to Tensors, trackable data structures use object-identity
     # equality to support set/dict membership.
     return self is other
 
 
-class List(CheckpointableDataStructure, collections.Sequence):
-  """An append-only sequence type which is checkpointable.
+class List(TrackableDataStructure, collections.Sequence):
+  """An append-only sequence type which is trackable.
 
   Maintains checkpoint dependencies on its contents (which must also be
-  checkpointable), and forwards any `Layer` metadata such as updates and losses.
+  trackable), and forwards any `Layer` metadata such as updates and losses.
 
   Note that `List` is purely a container. It lets a `tf.keras.Model` or
-  other checkpointable object know about its contents, but does not call any
+  other trackable object know about its contents, but does not call any
   `Layer` instances which are added to it. To indicate a sequence of `Layer`
   instances which should be called sequentially, use `tf.keras.Sequential`.
 
@@ -246,7 +254,7 @@ class List(CheckpointableDataStructure, collections.Sequence):
       return aggregation
   ```
 
-  This kind of wrapping is necessary because `Checkpointable` objects do not
+  This kind of wrapping is necessary because `Trackable` objects do not
   (yet) deeply inspect regular Python data structures, so for example assigning
   a regular list (`self.layer_list = [layers.Dense(3)]`) does not create a
   checkpoint dependency and does not add the `Layer` instance's weights to its
@@ -282,12 +290,12 @@ class List(CheckpointableDataStructure, collections.Sequence):
     return self
 
   def append(self, value):
-    """Add a new checkpointable value."""
+    """Add a new trackable value."""
     value = self._track_value(value, self._name_element(len(self._storage)))
     self._storage.append(value)
 
   def extend(self, values):
-    """Add a sequence of checkpointable values."""
+    """Add a sequence of trackable values."""
     for value in values:
       self.append(value)
 
@@ -318,7 +326,7 @@ class List(CheckpointableDataStructure, collections.Sequence):
     return self * n
 
   def __radd__(self, other):
-    return self + other
+    return self.__class__(other) + self
 
   def __getitem__(self, key):
     return self._storage[key]
@@ -348,7 +356,7 @@ class _ListWrapper(List, collections.MutableSequence,
   occupied, meaning both elements get the same names at different times) and
   refuses to save.
 
-  On assignment to an attribute of a Model or Checkpointable object, Python
+  On assignment to an attribute of a Model or Trackable object, Python
   lists are replaced with _ListWrapper. Wrapping a list in a
   `tf.contrib.checkpoint.NoDependency` object prevents this.
   """
@@ -408,7 +416,7 @@ class _ListWrapper(List, collections.MutableSequence,
     if self._non_append_mutation:
       raise ValueError(
           ("Unable to save the object %s (a list wrapper constructed to track "
-           "checkpointable TensorFlow objects). A list element was replaced "
+           "trackable TensorFlow objects). A list element was replaced "
            "(__setitem__, __setslice__), deleted (__delitem__, __delslice__), "
            "or moved (sort). In order to support restoration on object "
            "creation, tracking is exclusively for append-only data structures."
@@ -418,7 +426,7 @@ class _ListWrapper(List, collections.MutableSequence,
     if self._external_modification:
       raise ValueError(
           ("Unable to save the object %s (a list wrapper constructed to track "
-           "checkpointable TensorFlow objects). The wrapped list was modified "
+           "trackable TensorFlow objects). The wrapped list was modified "
            "outside the wrapper (its final value was %s, its value when a "
            "checkpoint dependency was added was %s), which breaks restoration "
            "on object creation.\n\nIf you don't need this list checkpointed, "
@@ -447,7 +455,7 @@ class _ListWrapper(List, collections.MutableSequence,
         value_now = self._storage[i] if i < len_now else None
         value_before = storage_copy[i] if i < len_before else None
 
-        if isinstance(value_before, base.Checkpointable):
+        if isinstance(value_before, base.Trackable):
           self._non_append_mutation = True
 
         if value_now is not None and value_now != value_before:
@@ -455,20 +463,20 @@ class _ListWrapper(List, collections.MutableSequence,
                                                self._name_element(i))
 
     else:
-      if isinstance(self._storage[key], base.Checkpointable):
+      if isinstance(self._storage[key], base.Trackable):
         self._non_append_mutation = True
       self._storage[key] = self._track_value(value, self._name_element(key))
 
     self._update_snapshot()
 
   def append(self, value):
-    """Add a new checkpointable value."""
+    """Add a new trackable value."""
     self._check_external_modification()
     super(_ListWrapper, self).append(value)
     self._update_snapshot()
 
   def extend(self, values):
-    """Add a sequence of checkpointable values."""
+    """Add a sequence of trackable values."""
     self._check_external_modification()
     super(_ListWrapper, self).extend(values)
     self._update_snapshot()
@@ -512,25 +520,31 @@ class _ListWrapper(List, collections.MutableSequence,
     del self._storage[slice(i, j)]
 
   def _track_value(self, value, name):
-    """Allows storage of non-checkpointable objects."""
+    """Allows storage of non-trackable objects."""
     try:
       value = super(_ListWrapper, self)._track_value(value=value, name=name)
     except ValueError:
-      # Even if this value isn't checkpointable, we need to make sure
+      # Even if this value isn't trackable, we need to make sure
       # NoDependency objects get unwrapped.
       value = sticky_attribute_assignment(
-          checkpointable=self, value=value, name=name)
+          trackable=self, value=value, name=name)
     return value
 
   def __repr__(self):
     return "ListWrapper(%s)" % (repr(self._storage),)
 
+  def _list_functions_for_serialization(self):
+    return {
+        str(key): value for key, value in enumerate(self)
+        if _is_function(value)
+    }
+
 
-class Mapping(CheckpointableDataStructure, collections.Mapping):
-  """An append-only checkpointable mapping data structure with string keys.
+class Mapping(TrackableDataStructure, collections.Mapping):
+  """An append-only trackable mapping data structure with string keys.
 
   Maintains checkpoint dependencies on its contents (which must also be
-  checkpointable), named based on its keys.
+  trackable), named based on its keys.
 
   Note that once a key has been added, it may not be deleted or replaced. If
   names may not be unique, see `tf.contrib.checkpoint.UniqueNameTracker`.
@@ -607,7 +621,7 @@ class Mapping(CheckpointableDataStructure, collections.Mapping):
 # patching all of the "wrapped" dict's methods instead of creating a wrapper
 # object is an option, but not a very attractive one (replacing methods without
 # creating reference cycles is difficult, and then dicts would need to be
-# special cased everywhere as being checkpointable).
+# special cased everywhere as being trackable).
 class _DictWrapper(Mapping, collections.MutableMapping):
   """Wraps built-in dicts to support restore-on-create for variables.
 
@@ -663,7 +677,7 @@ class _DictWrapper(Mapping, collections.MutableMapping):
       raise ValueError(
           "Unable to save the object %s (a dictionary wrapper constructed "
           "automatically on attribute assignment). The wrapped dictionary "
-          "contains a non-string key which maps to a checkpointable object or "
+          "contains a non-string key which maps to a trackable object or "
           "mutable data structure.\n\nIf you don't need this dictionary "
           "checkpointed, wrap it in a tf.contrib.checkpoint.NoDependency "
           "object; it will be automatically un-wrapped and subsequently "
@@ -672,7 +686,7 @@ class _DictWrapper(Mapping, collections.MutableMapping):
       raise ValueError(
           "Unable to save the object %s (a dictionary wrapper constructed "
           "automatically on attribute assignment). A key mapping to a "
-          "checkpointable object was overwritten or deleted, which would "
+          "trackable object was overwritten or deleted, which would "
           "cause problems for restoration.\n\nIf you don't need this "
           "dictionary checkpointed, wrap it in a "
           "tf.contrib.checkpoint.NoDependency object; it will be automatically "
@@ -713,7 +727,7 @@ class _DictWrapper(Mapping, collections.MutableMapping):
     self._last_wrapped_dict_snapshot = dict(self)
 
   def _track_value(self, value, name):
-    """Allows storage of non-checkpointable objects."""
+    """Allows storage of non-trackable objects."""
     if isinstance(name, six.string_types):
       string_key = True
     else:
@@ -723,15 +737,15 @@ class _DictWrapper(Mapping, collections.MutableMapping):
       no_dependency = isinstance(value, NoDependency)
       value = super(_DictWrapper, self)._track_value(value=value, name=name)
       if not (string_key or no_dependency):
-        # A non-string key maps to a checkpointable value. This data structure
+        # A non-string key maps to a trackable value. This data structure
         # is not saveable.
         self._non_string_key = True
       return value
     except ValueError:
-      # Even if this value isn't checkpointable, we need to make sure
+      # Even if this value isn't trackable, we need to make sure
       # NoDependency objects get unwrapped.
       return sticky_attribute_assignment(
-          checkpointable=self, value=value, name=name)
+          trackable=self, value=value, name=name)
 
   def _name_element(self, key):
     """Don't throw errors for non-string keys."""
@@ -750,31 +764,32 @@ class _DictWrapper(Mapping, collections.MutableMapping):
     else:
       value = _wrap_or_unwrap(value)
       existing_dependency = None
-      if not no_dep and isinstance(value, base.Checkpointable):
+      if not no_dep and isinstance(value, base.Trackable):
         # Non-string keys are OK as long as we have no reason to add a
         # dependency on the value (either because the value is not
-        # checkpointable, or because it was wrapped in a NoDependency object).
+        # trackable, or because it was wrapped in a NoDependency object).
         self._non_string_key = True
-    current_value = self._storage.setdefault(key, value)
-    if current_value is not value:
-      if ((not no_dep and isinstance(value, base.Checkpointable))
-          # We don't want to just check that the existing object is
-          # checkpointable, since it may have been wrapped in a NoDependency
-          # object.
-          or existing_dependency is not None):
-        # A checkpointable object was replaced under the same key; this means
-        # that restoring would be error-prone, so we'll throw an exception on
-        # save.
-        self._non_append_mutation = True
-      self._storage[key] = value
+    if key in self._storage:
+      previous_value = self._storage[key]
+      if previous_value is not value:
+        if ((not no_dep and isinstance(value, base.Trackable))
+            # We don't want to just check that the existing object is
+            # trackable, since it may have been wrapped in a NoDependency
+            # object.
+            or existing_dependency is not None):
+          # A trackable object was replaced under the same key; this means
+          # that restoring would be error-prone, so we'll throw an exception on
+          # save.
+          self._non_append_mutation = True
+    self._storage[key] = value
 
     self._update_snapshot()
 
   def __delitem__(self, key):
     self._check_external_modification()
     existing_value = self[key]
-    if isinstance(existing_value, base.Checkpointable):
-      # Deleting tracked checkpointable values means restoring is problematic,
+    if isinstance(existing_value, base.Trackable):
+      # Deleting tracked trackable values means restoring is problematic,
       # so we'll throw an exception on save.
       self._non_append_mutation = True
     del self._storage[key]
@@ -793,19 +808,27 @@ class _DictWrapper(Mapping, collections.MutableMapping):
     for key, value in dict(*args, **kwargs).items():
       self[key] = value
 
+  def _list_functions_for_serialization(self):
+    return {
+        key: value for key, value in self.items()
+        if _is_function(value)
+    }
+
+
+def _is_function(x):
+  return isinstance(x, (def_function.Function, defun.ConcreteFunction))
+
 revived_types.register_revived_type(
-    "checkpointable_dict_wrapper",
+    "trackable_dict_wrapper",
     lambda obj: isinstance(obj, _DictWrapper),
     versions=[revived_types.VersionedTypeRegistration(
-        # Standard dependencies are enough to reconstruct the checkpointable
+        # Standard dependencies are enough to reconstruct the trackable
         # items in dictionaries, so we don't need to save any extra information.
         object_factory=lambda proto: _DictWrapper({}),
         version=1,
         min_producer_version=1,
         min_consumer_version=1,
-        setter=operator.setitem,
-        getter=_DictWrapper.get,
-        attribute_extractor=lambda obj: obj.keys())])
+        setter=operator.setitem)])
 
 
 def _set_list_item(list_object, index_string, value):
@@ -815,21 +838,12 @@ def _set_list_item(list_object, index_string, value):
   list_object[item_index] = value
 
 
-def _list_getter(obj, item, default=None):
-  index = int(item)
-  if index < len(obj):
-    return obj[index]
-  return default
-
-
 revived_types.register_revived_type(
-    "checkpointable_list_wrapper",
+    "trackable_list_wrapper",
     lambda obj: isinstance(obj, _ListWrapper),
     versions=[revived_types.VersionedTypeRegistration(
         object_factory=lambda proto: _ListWrapper([]),
         version=1,
         min_producer_version=1,
         min_consumer_version=1,
-        setter=_set_list_item,
-        getter=_list_getter,
-        attribute_extractor=lambda obj: [str(i) for i in range(len(obj))])])
+        setter=_set_list_item)])
diff --git a/tensorflow/python/training/checkpointable/data_structures_test.py b/tensorflow/python/training/tracking/data_structures_test.py
similarity index 92%
rename from tensorflow/python/training/checkpointable/data_structures_test.py
rename to tensorflow/python/training/tracking/data_structures_test.py
index 72045876b61b5331f8c576e6a611839a101973e2..a03614bd60e60d66c781d321e4029f7cc0c4f5fe 100644
--- a/tensorflow/python/training/checkpointable/data_structures_test.py
+++ b/tensorflow/python/training/tracking/data_structures_test.py
@@ -34,9 +34,9 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 
 
 class HasList(training.Model):
@@ -145,12 +145,12 @@ class ListTests(test.TestCase):
     model.l2.append(second_layer)
     self.assertEqual([first_layer, second_layer], model.layers)
 
-  def testNotCheckpointable(self):
-    class NotCheckpointable(object):
+  def testNotTrackable(self):
+    class NotTrackable(object):
       pass
 
     with self.assertRaises(ValueError):
-      data_structures.List([NotCheckpointable()])
+      data_structures.List([NotTrackable()])
 
   def testCallNotImplemented(self):
     with self.assertRaisesRegexp(TypeError, "not callable"):
@@ -287,8 +287,8 @@ class ListWrapperTest(test.TestCase):
   def testListWrapperBasic(self):
     # _ListWrapper, unlike List, compares like the built-in list type (since it
     # is used to automatically replace lists).
-    a = tracking.AutoCheckpointable()
-    b = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
+    b = tracking.AutoTrackable()
     self.assertEqual([a, a],
                      [a, a])
     self.assertEqual(data_structures._ListWrapper([a, a]),
@@ -321,7 +321,7 @@ class ListWrapperTest(test.TestCase):
     self.assertEqual([a, a], [a] + data_structures._ListWrapper([a]))
     self.assertIsInstance(data_structures._ListWrapper([a]), list)
 
-  def testAcceptsNonCheckpointableContent(self):
+  def testAcceptsNonTrackableContent(self):
     l = data_structures._ListWrapper([1, 2, 3])
     self.assertEqual(l, [1, 2, 3])
 
@@ -360,14 +360,14 @@ class ListWrapperTest(test.TestCase):
     self.assertEqual(l, [1, 2, 4])
     self.assertUnableToSave(l, "Unable to save .*__delslice__")
 
-  def testSetSlice_canSaveForNonCheckpointableItems(self):
+  def testSetSlice_canSaveForNonTrackableItems(self):
     l = data_structures._ListWrapper([1, 2, 3, 4])
     l[:] = 2, 8, 9, 0
     self.assertEqual(l, [2, 8, 9, 0])
-    l._maybe_initialize_checkpointable()  # pylint: disable=protected-access
+    l._maybe_initialize_trackable()  # pylint: disable=protected-access
     self.assertEqual(len(l._checkpoint_dependencies), 0)  # pylint: disable=protected-access
 
-  def testSetSlice_cannotSaveIfCheckpointableModified(self):
+  def testSetSlice_cannotSaveIfTrackableModified(self):
     v1 = resource_variable_ops.ResourceVariable(1.)
     v2 = resource_variable_ops.ResourceVariable(1.)
     l = data_structures._ListWrapper([1, 2, v1, v2])
@@ -391,12 +391,12 @@ class ListWrapperTest(test.TestCase):
     self.assertEqual(l, [1, 2, 3, 4])
     # Regardless of being a no-op for the input list, we still refuse to save.
     # This is intentional since otherwise we would end up with a hard to debug
-    # case for users (e.g. sometimes sort on a ListWrapper is checkpointable and
+    # case for users (e.g. sometimes sort on a ListWrapper is trackable and
     # other times it is not).
     self.assertUnableToSave(l, "Unable to save .*sort")
 
   def assertUnableToSave(self, l, msg):
-    l._maybe_initialize_checkpointable()  # pylint: disable=protected-access
+    l._maybe_initialize_trackable()  # pylint: disable=protected-access
     with self.assertRaisesRegexp(ValueError, msg):
       return l._checkpoint_dependencies  # pylint: disable=protected-access
 
@@ -466,7 +466,7 @@ class MappingTests(test.TestCase):
 
   def testLayerCollectionWithExternalMutation(self):
     d = {}
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.wrapper = d
     self.assertEqual([], root.wrapper.layers)
     self.assertEqual([], root.wrapper.trainable_weights)
@@ -484,7 +484,7 @@ class MappingTests(test.TestCase):
     self.assertEqual(2, len(has_mappings))
     self.assertNotIn(data_structures.Mapping(), has_mappings)
     # In contrast to Mapping, dict wrappers are not hashable
-    a = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
     a.d = {}
     self.assertEqual({}, a.d)
     self.assertFalse({} != a.d)  # pylint: disable=g-explicit-bool-comparison
@@ -493,7 +493,7 @@ class MappingTests(test.TestCase):
       set([a.d])
 
   def testDictWrapperBadKeys(self):
-    a = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
     a.d = {}
     a.d[1] = data_structures.List()
     model = training.Model()
@@ -503,7 +503,7 @@ class MappingTests(test.TestCase):
       model.save_weights(save_path)
 
   def testDictWrapperNoDependency(self):
-    a = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
     a.d = data_structures.NoDependency({})
     a.d[1] = [3]
     self.assertEqual([a], util.list_objects(a))
@@ -513,8 +513,8 @@ class MappingTests(test.TestCase):
     model.save_weights(save_path)
     model.load_weights(save_path)
 
-  def testNonStringKeyNotCheckpointableValue(self):
-    a = tracking.AutoCheckpointable()
+  def testNonStringKeyNotTrackableValue(self):
+    a = tracking.AutoTrackable()
     a.d = {}
     a.d["a"] = [3]
     a.d[1] = data_structures.NoDependency([3])
@@ -525,18 +525,18 @@ class MappingTests(test.TestCase):
     model.save_weights(save_path)
     model.load_weights(save_path)
 
-  def testNonAppendNotCheckpointable(self):
+  def testNonAppendNotTrackable(self):
     # Non-append mutations (deleting or overwriting values) are OK when the
     # values aren't tracked.
-    a = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
     a.d = {}
     a.d["a"] = [3]
     a.d[1] = 3
     a.d[1] = 2
     self.assertEqual(2, a.d[1])
     del a.d[1]
-    a.d[2] = data_structures.NoDependency(tracking.AutoCheckpointable())
-    second = tracking.AutoCheckpointable()
+    a.d[2] = data_structures.NoDependency(tracking.AutoTrackable())
+    second = tracking.AutoTrackable()
     a.d[2] = data_structures.NoDependency(second)
     self.assertIs(second, a.d[2])
     self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a))
@@ -598,7 +598,7 @@ class MappingTests(test.TestCase):
     self.assertEqual({1: 3}, new_dict)
 
   def testListShallowCopy(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     orig_list = [[1.]]
     root.a = orig_list
     copied = copy.copy(root.a)
@@ -615,7 +615,7 @@ class MappingTests(test.TestCase):
       util.list_objects(copy.copy(root.a))
 
   def testListDeepCopy(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     orig_list = [[1.]]
     root.a = orig_list
     copied = copy.deepcopy(root.a)
@@ -632,7 +632,7 @@ class MappingTests(test.TestCase):
       util.list_objects(copy.deepcopy(root.a))
 
   def testDictShallowCopy(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     orig_dict = {"a": [1.]}
     root.a = orig_dict
     copied = copy.copy(root.a)
@@ -649,7 +649,7 @@ class MappingTests(test.TestCase):
       util.list_objects(copy.copy(root.a))
 
   def testDictDeepCopy(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     orig_dict = {"a": [1.]}
     root.a = orig_dict
     copied = copy.deepcopy(root.a)
@@ -665,9 +665,9 @@ class MappingTests(test.TestCase):
     with self.assertRaises(ValueError):
       util.list_objects(copy.deepcopy(root.a))
 
-  def testShallowCopyCheckpointable(self):
-    original = tracking.AutoCheckpointable()
-    original_sub = tracking.AutoCheckpointable()
+  def testShallowCopyTrackable(self):
+    original = tracking.AutoTrackable()
+    original_sub = tracking.AutoTrackable()
     original.a = [[1.]]
     original.b = {"a": original_sub}
     shallow_copied = copy.copy(original)
@@ -679,16 +679,16 @@ class MappingTests(test.TestCase):
     self.assertIn(shallow_copied.b, shallow_deps)
     self.assertIn(shallow_copied.b["a"], shallow_deps)
 
-  def testDeepCopyCheckpointable(self):
-    original = tracking.AutoCheckpointable()
-    original_sub = tracking.AutoCheckpointable()
+  def testDeepCopyTrackable(self):
+    original = tracking.AutoTrackable()
+    original_sub = tracking.AutoTrackable()
     original.a = [[1.]]
     original.b = {"a": original_sub}
     deep_copied = copy.deepcopy(original)
     self.assertIsNot(original, deep_copied)
     self.assertIsNot(original_sub, deep_copied.b["a"])
     self.assertEqual([[1.]], deep_copied.a)
-    self.assertIsInstance(deep_copied.b["a"], tracking.AutoCheckpointable)
+    self.assertIsInstance(deep_copied.b["a"], tracking.AutoTrackable)
     deps = util.list_objects(deep_copied)
     self.assertIn(deep_copied.a, deps)
     self.assertIn(deep_copied.b, deps)
@@ -700,5 +700,17 @@ class MappingTests(test.TestCase):
     self.assertIsInstance(result, dict)
     self.assertEqual({1: 2, 3: 4}, result)
 
+  def testListAddOrder(self):
+    self.assertEqual([1., 2.],
+                     data_structures._ListWrapper([1.])
+                     + data_structures._ListWrapper([2.]))
+    self.assertEqual([1., 2.],
+                     data_structures._ListWrapper([1.])
+                     + [2.])
+    self.assertEqual([1., 2.],
+                     [1.]
+                     + data_structures._ListWrapper([2.]))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/tracking/graph_view.py b/tensorflow/python/training/tracking/graph_view.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba2387870182cef6e578c7b947f07a4957fdf22c
--- /dev/null
+++ b/tensorflow/python/training/tracking/graph_view.py
@@ -0,0 +1,431 @@
+"""Manages a graph of Trackable objects."""
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import weakref
+
+from tensorflow.core.protobuf import trackable_object_graph_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.training import optimizer as optimizer_v1
+from tensorflow.python.training.saving import saveable_object as saveable_object_lib
+from tensorflow.python.training.saving import saveable_object_util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import tracking
+
+
+_ESCAPE_CHAR = "."  # For avoiding conflicts with user-specified names.
+
+# Keyword for identifying that the next bit of a checkpoint variable name is a
+# slot name. Checkpoint names for slot variables look like:
+#
+#   <path to variable>/<_OPTIMIZER_SLOTS_NAME>/<path to optimizer>/<slot name>
+#
+# Where <path to variable> is a full path from the checkpoint root to the
+# variable being slotted for.
+_OPTIMIZER_SLOTS_NAME = _ESCAPE_CHAR + "OPTIMIZER_SLOT"
+# Keyword for separating the path to an object from the name of an
+# attribute in checkpoint names. Used like:
+#   <path to variable>/<_OBJECT_ATTRIBUTES_NAME>/<name of attribute>
+_OBJECT_ATTRIBUTES_NAME = _ESCAPE_CHAR + "ATTRIBUTES"
+
+
+def _escape_local_name(name):
+  # We need to support slashes in local names for compatibility, since this
+  # naming scheme is being patched in to things like Layer.add_variable where
+  # slashes were previously accepted. We also want to use slashes to indicate
+  # edges traversed to reach the variable, so we escape forward slashes in
+  # names.
+  return (name.replace(_ESCAPE_CHAR, _ESCAPE_CHAR + _ESCAPE_CHAR)
+          .replace(r"/", _ESCAPE_CHAR + "S"))
+
+
+def _object_prefix_from_path(path_to_root):
+  return "/".join(
+      (_escape_local_name(trackable.name)
+       for trackable in path_to_root))
+
+
+def _slot_variable_naming_for_optimizer(optimizer_path):
+  """Make a function for naming slot variables in an optimizer."""
+  # Name slot variables:
+  #
+  #   <variable name>/<_OPTIMIZER_SLOTS_NAME>/<optimizer path>/<slot name>
+  #
+  # where <variable name> is exactly the checkpoint name used for the original
+  # variable, including the path from the checkpoint root and the local name in
+  # the object which owns it. Note that we only save slot variables if the
+  # variable it's slotting for is also being saved.
+
+  optimizer_identifier = "/%s/%s/" % (_OPTIMIZER_SLOTS_NAME, optimizer_path)
+
+  def _name_slot_variable(variable_path, slot_name):
+    """With an optimizer specified, name a slot variable."""
+    return (variable_path
+            + optimizer_identifier
+            + _escape_local_name(slot_name))
+
+  return _name_slot_variable
+
+
+def _serialize_slot_variables(trackable_objects, node_ids, object_names):
+  """Gather and name slot variables."""
+  non_slot_objects = list(trackable_objects)
+  slot_variables = object_identity.ObjectIdentityDictionary()
+  for trackable in non_slot_objects:
+    if (isinstance(trackable, optimizer_v1.Optimizer)
+        # TODO(b/110718070): Fix Keras imports.
+        or hasattr(trackable, "_create_or_restore_slot_variable")):
+      naming_scheme = _slot_variable_naming_for_optimizer(
+          optimizer_path=object_names[trackable])
+      slot_names = trackable.get_slot_names()
+      for slot_name in slot_names:
+        for original_variable_node_id, original_variable in enumerate(
+            non_slot_objects):
+          try:
+            slot_variable = trackable.get_slot(
+                original_variable, slot_name)
+          except (AttributeError, KeyError):
+            slot_variable = None
+          if slot_variable is None:
+            continue
+          slot_variable._maybe_initialize_trackable()  # pylint: disable=protected-access
+          if slot_variable._checkpoint_dependencies:  # pylint: disable=protected-access
+            # TODO(allenl): Gather dependencies of slot variables.
+            raise NotImplementedError(
+                "Currently only variables with no dependencies can be saved as "
+                "slot variables. File a feature request if this limitation "
+                "bothers you.")
+          if slot_variable in node_ids:
+            raise NotImplementedError(
+                "A slot variable was re-used as a dependency of a "
+                "Trackable object. This is not currently allowed. File a "
+                "feature request if this limitation bothers you.")
+          checkpoint_name = naming_scheme(
+              variable_path=object_names[original_variable],
+              slot_name=slot_name)
+          object_names[slot_variable] = checkpoint_name
+          slot_variable_node_id = len(trackable_objects)
+          node_ids[slot_variable] = slot_variable_node_id
+          trackable_objects.append(slot_variable)
+          slot_variable_proto = (
+              trackable_object_graph_pb2.TrackableObjectGraph
+              .TrackableObject.SlotVariableReference(
+                  slot_name=slot_name,
+                  original_variable_node_id=original_variable_node_id,
+                  slot_variable_node_id=slot_variable_node_id))
+          slot_variables.setdefault(trackable, []).append(
+              slot_variable_proto)
+  return slot_variables
+
+
+class ObjectGraphView(object):
+  """Gathers and serializes an object graph."""
+
+  def __init__(self, root, saveables_cache=None):
+    """Configure the graph view.
+
+    Args:
+      root: A `Trackable` object whose variables (including the variables
+        of dependencies, recursively) should be saved. May be a weak reference.
+      saveables_cache: A dictionary mapping `Trackable` objects ->
+        attribute names -> SaveableObjects, used to avoid re-creating
+        SaveableObjects when graph building.
+    """
+    self._root_ref = root
+    self._saveables_cache = saveables_cache
+
+  def list_dependencies(self, obj):
+    # pylint: disable=protected-access
+    obj._maybe_initialize_trackable()
+    return obj._checkpoint_dependencies
+    # pylint: enable=protected-access
+
+  @property
+  def saveables_cache(self):
+    """Maps Trackable objects -> attribute names -> list(SaveableObjects).
+
+    Used to avoid re-creating SaveableObjects when graph building. None when
+    executing eagerly.
+
+    Returns:
+      The cache (an object-identity dictionary), or None if caching is disabled.
+    """
+    return self._saveables_cache
+
+  @property
+  def root(self):
+    if isinstance(self._root_ref, weakref.ref):
+      derefed = self._root_ref()
+      assert derefed is not None
+      return derefed
+    else:
+      return self._root_ref
+
+  def _breadth_first_traversal(self):
+    """Find shortest paths to all dependencies of self.root."""
+    bfs_sorted = []
+    to_visit = collections.deque([self.root])
+    path_to_root = object_identity.ObjectIdentityDictionary()
+    path_to_root[self.root] = ()
+    while to_visit:
+      current_trackable = to_visit.popleft()
+      if isinstance(current_trackable, tracking.NotTrackable):
+        raise NotImplementedError(
+            ("The object %s does not support object-based saving. File a "
+             "feature request if this limitation bothers you. In the meantime, "
+             "you can remove the dependency on this object and save everything "
+             "else.")
+            % (current_trackable,))
+      bfs_sorted.append(current_trackable)
+      for name, dependency in self.list_dependencies(current_trackable):
+        if dependency not in path_to_root:
+          path_to_root[dependency] = (
+              path_to_root[current_trackable] + (
+                  base.TrackableReference(name, dependency),))
+          to_visit.append(dependency)
+    return bfs_sorted, path_to_root
+
+  def _add_attributes_to_object_graph(
+      self, trackable_objects, object_graph_proto, node_ids, object_names,
+      object_map):
+    """Create SaveableObjects and corresponding SerializedTensor protos."""
+    named_saveable_objects = []
+    if self._saveables_cache is None:
+      # No SaveableObject caching. Either we're executing eagerly, or building a
+      # static save which is specialized to the current Python state.
+      feed_additions = None
+    else:
+      # If we are caching SaveableObjects, we need to build up a feed_dict with
+      # functions computing volatile Python state to be saved with the
+      # checkpoint.
+      feed_additions = {}
+    for checkpoint_id, (trackable, object_proto) in enumerate(
+        zip(trackable_objects, object_graph_proto.nodes)):
+      assert node_ids[trackable] == checkpoint_id
+      object_name = object_names[trackable]
+      if object_map is None:
+        object_to_save = trackable
+      else:
+        object_to_save = object_map.get(trackable, trackable)
+      if self._saveables_cache is not None:
+        cached_attributes = self._saveables_cache.setdefault(object_to_save, {})
+      else:
+        cached_attributes = None
+
+      for name, saveable_factory in (
+          object_to_save._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
+        attribute = object_proto.attributes.add()
+        attribute.name = name
+        attribute.checkpoint_key = "%s/%s/%s" % (
+            object_name, _OBJECT_ATTRIBUTES_NAME, _escape_local_name(name))
+        if cached_attributes is None:
+          saveables = None
+        else:
+          saveables = cached_attributes.get(name, None)
+          if saveables is not None:
+            for saveable in saveables:
+              if attribute.checkpoint_key not in saveable.name:
+                # The checkpoint key for this SaveableObject is different. We
+                # need to re-create it.
+                saveables = None
+                del cached_attributes[name]
+                break
+        if saveables is None:
+          if callable(saveable_factory):
+            maybe_saveable = saveable_factory(name=attribute.checkpoint_key)
+          else:
+            maybe_saveable = saveable_factory
+          if isinstance(maybe_saveable, saveable_object_lib.SaveableObject):
+            saveables = (maybe_saveable,)
+          else:
+            # Figure out the name-based Saver's name for this variable. If it's
+            # already a SaveableObject we'd just get the checkpoint key back, so
+            # we leave full_name blank.
+            saver_dict = saveable_object_util.op_list_to_dict(
+                [maybe_saveable], convert_variable_to_tensor=False)
+            full_name, = saver_dict.keys()
+            saveables = tuple(saveable_object_util.saveable_objects_for_op(
+                op=maybe_saveable, name=attribute.checkpoint_key))
+            for saveable in saveables:
+              saveable.full_name = full_name
+          for saveable in saveables:
+            if attribute.checkpoint_key not in saveable.name:
+              raise AssertionError(
+                  ("The object %s produced a SaveableObject with name '%s' for "
+                   "attribute '%s'. Expected a name containing '%s'.")
+                  % (trackable, name, saveable.name,
+                     attribute.checkpoint_key))
+          if cached_attributes is not None:
+            cached_attributes[name] = saveables
+
+        optional_restore = None
+        for saveable in saveables:
+          if optional_restore is None:
+            optional_restore = saveable.optional_restore
+          else:
+            optional_restore = optional_restore and saveable.optional_restore
+
+          if hasattr(saveable, "full_name"):
+            attribute.full_name = saveable.full_name
+          if isinstance(saveable, base.PythonStateSaveable):
+            if feed_additions is None:
+              assert self._saveables_cache is None
+              # If we're not caching saveables, then we're either executing
+              # eagerly or building a static save/restore (e.g. for a
+              # SavedModel). In either case, we should embed the current Python
+              # state in the graph rather than relying on a feed dict.
+              saveable = saveable.freeze()
+            else:
+              saveable_feed_dict = saveable.feed_dict_additions()
+              for new_feed_key in saveable_feed_dict.keys():
+                if new_feed_key in feed_additions:
+                  raise AssertionError(
+                      ("The object %s tried to feed a value for the Tensor %s "
+                       "when saving, but another object is already feeding a "
+                       "value.")
+                      % (trackable, new_feed_key))
+              feed_additions.update(saveable_feed_dict)
+          named_saveable_objects.append(saveable)
+        if optional_restore is None:
+          optional_restore = False
+        attribute.optional_restore = optional_restore
+
+    return named_saveable_objects, feed_additions
+
+  def _fill_object_graph_proto(self, trackable_objects,
+                               node_ids,
+                               slot_variables,
+                               object_graph_proto=None):
+    """Name non-slot `Trackable`s and add them to `object_graph_proto`."""
+    if object_graph_proto is None:
+      object_graph_proto = (
+          trackable_object_graph_pb2.TrackableObjectGraph())
+    for checkpoint_id, trackable in enumerate(trackable_objects):
+      assert node_ids[trackable] == checkpoint_id
+      object_proto = object_graph_proto.nodes.add()
+      object_proto.slot_variables.extend(slot_variables.get(trackable, ()))
+      for child in self.list_dependencies(trackable):
+        child_proto = object_proto.children.add()
+        child_proto.node_id = node_ids[child.ref]
+        child_proto.local_name = child.name
+    return object_graph_proto
+
+  def _serialize_gathered_objects(self, trackable_objects, path_to_root,
+                                  object_map=None):
+    """Create SaveableObjects and protos for gathered objects."""
+    object_names = object_identity.ObjectIdentityDictionary()
+    for obj, path in path_to_root.items():
+      object_names[obj] = _object_prefix_from_path(path)
+    node_ids = object_identity.ObjectIdentityDictionary()
+    for node_id, node in enumerate(trackable_objects):
+      node_ids[node] = node_id
+    slot_variables = _serialize_slot_variables(
+        trackable_objects=trackable_objects,
+        node_ids=node_ids,
+        object_names=object_names)
+    object_graph_proto = self._fill_object_graph_proto(
+        trackable_objects=trackable_objects,
+        node_ids=node_ids,
+        slot_variables=slot_variables)
+    named_saveable_objects, feed_additions = (
+        self._add_attributes_to_object_graph(
+            trackable_objects=trackable_objects,
+            object_graph_proto=object_graph_proto,
+            node_ids=node_ids,
+            object_names=object_names,
+            object_map=object_map))
+    return named_saveable_objects, object_graph_proto, feed_additions
+
+  def serialize_object_graph(self):
+    """Determine checkpoint keys for variables and build a serialized graph.
+
+    Non-slot variables are keyed based on a shortest path from the root saveable
+    to the object which owns the variable (i.e. the one which called
+    `Trackable._add_variable` to create it).
+
+    Slot variables are keyed based on a shortest path to the variable being
+    slotted for, a shortest path to their optimizer, and the slot name.
+
+    Returns:
+      A tuple of (named_variables, object_graph_proto, feed_additions):
+        named_variables: A dictionary mapping names to variable objects.
+        object_graph_proto: A TrackableObjectGraph protocol buffer
+          containing the serialized object graph and variable references.
+        feed_additions: A dictionary mapping from Tensors to values which should
+          be fed when saving.
+
+    Raises:
+      ValueError: If there are invalid characters in an optimizer's slot names.
+    """
+    trackable_objects, path_to_root = self._breadth_first_traversal()
+    return self._serialize_gathered_objects(
+        trackable_objects, path_to_root)
+
+  def frozen_saveable_objects(self, object_map=None, to_graph=None):
+    """Creates SaveableObjects with the current object graph frozen."""
+    trackable_objects, path_to_root = self._breadth_first_traversal()
+    if to_graph:
+      target_context = to_graph.as_default
+    else:
+      target_context = ops.NullContextmanager
+    with target_context():
+      named_saveable_objects, graph_proto, _ = self._serialize_gathered_objects(
+          trackable_objects,
+          path_to_root,
+          object_map)
+      with ops.device("/cpu:0"):
+        object_graph_tensor = constant_op.constant(
+            graph_proto.SerializeToString(), dtype=dtypes.string)
+      named_saveable_objects.append(
+          base.NoRestoreSaveable(
+              tensor=object_graph_tensor,
+              name=base.OBJECT_GRAPH_PROTO_KEY))
+    return named_saveable_objects
+
+  def objects_ids_and_slot_variables(self):
+    """Traverse the object graph and list all accessible objects.
+
+    Looks for `Trackable` objects which are dependencies of
+    `root_trackable`. Includes slot variables only if the variable they are
+    slotting for and the optimizer are dependencies of `root_trackable`
+    (i.e. if they would be saved with a checkpoint).
+
+    Returns:
+      A tuple of (trackable objects, object -> node id, slot variables)
+    """
+    trackable_objects, path_to_root = self._breadth_first_traversal()
+    object_names = object_identity.ObjectIdentityDictionary()
+    for obj, path in path_to_root.items():
+      object_names[obj] = _object_prefix_from_path(path)
+    node_ids = object_identity.ObjectIdentityDictionary()
+    for node_id, node in enumerate(trackable_objects):
+      node_ids[node] = node_id
+    slot_variables = _serialize_slot_variables(
+        trackable_objects=trackable_objects,
+        node_ids=node_ids,
+        object_names=object_names)
+    return trackable_objects, node_ids, slot_variables
+
+  def list_objects(self):
+    """Traverse the object graph and list all accessible objects."""
+    trackable_objects, _, _ = self.objects_ids_and_slot_variables()
+    return trackable_objects
diff --git a/tensorflow/python/training/checkpointable/layer_utils.py b/tensorflow/python/training/tracking/layer_utils.py
similarity index 89%
rename from tensorflow/python/training/checkpointable/layer_utils.py
rename to tensorflow/python/training/tracking/layer_utils.py
index 9d45c4883e70d140650660971656478c216530c9..818563c32fa6ed726156781704b869978409652c 100644
--- a/tensorflow/python/training/checkpointable/layer_utils.py
+++ b/tensorflow/python/training/tracking/layer_utils.py
@@ -21,6 +21,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.training.tracking import object_identity
+
 
 def is_layer(obj):
   """Implicit check for Layer-like objects."""
@@ -36,15 +38,21 @@ def has_weights(obj):
 
 
 def filter_empty_layer_containers(layer_list):
-  """Filter out empty Layer-like containers."""
+  """Filter out empty Layer-like containers and uniquify."""
+  existing = object_identity.ObjectIdentitySet()
+  to_visit = layer_list[::-1]
   filtered = []
-  for obj in layer_list:
+  while to_visit:
+    obj = to_visit.pop()
+    if obj in existing:
+      continue
+    existing.add(obj)
     if is_layer(obj):
       filtered.append(obj)
     elif hasattr(obj, "layers"):
-      # Checkpointable data structures will not show up in ".layers" lists, but
+      # Trackable data structures will not show up in ".layers" lists, but
       # the layers they contain will.
-      filtered.extend(obj.layers)
+      to_visit.extend(obj.layers[::-1])
   return filtered
 
 
diff --git a/tensorflow/python/training/tracking/object_identity.py b/tensorflow/python/training/tracking/object_identity.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4eef5b34b55dbf41bea09c5ac6ec7aadaac70ad
--- /dev/null
+++ b/tensorflow/python/training/tracking/object_identity.py
@@ -0,0 +1,156 @@
+"""Utilities for collecting objects based on "is" comparison."""
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import weakref
+
+
+class _ObjectIdentityWrapper(object):
+  """Wraps an object, mapping __eq__ on wrapper to "is" on wrapped.
+
+  Since __eq__ is based on object identity, it's safe to also define __hash__
+  based on object ids. This lets us add unhashable types like trackable
+  _ListWrapper objects to object-identity collections.
+  """
+
+  def __init__(self, wrapped):
+    self._wrapped = wrapped
+
+  @property
+  def unwrapped(self):
+    return self._wrapped
+
+  def __eq__(self, other):
+    if isinstance(other, _ObjectIdentityWrapper):
+      return self._wrapped is other._wrapped  # pylint: disable=protected-access
+    return self._wrapped is other
+
+  def __hash__(self):
+    # Wrapper id() is also fine for weakrefs. In fact, we rely on
+    # id(weakref.ref(a)) == id(weakref.ref(a)) and weakref.ref(a) is
+    # weakref.ref(a) in _WeakObjectIdentityWrapper.
+    return id(self._wrapped)
+
+
+class _WeakObjectIdentityWrapper(_ObjectIdentityWrapper):
+
+  def __init__(self, wrapped):
+    super(_WeakObjectIdentityWrapper, self).__init__(weakref.ref(wrapped))
+
+  @property
+  def unwrapped(self):
+    return self._wrapped()
+
+
+class ObjectIdentityDictionary(collections.MutableMapping):
+  """A mutable mapping data structure which compares using "is".
+
+  This is necessary because we have trackable objects (_ListWrapper) which
+  have behavior identical to built-in Python lists (including being unhashable
+  and comparing based on the equality of their contents by default).
+  """
+
+  def __init__(self):
+    self._storage = {}
+
+  def _wrap_key(self, key):
+    return _ObjectIdentityWrapper(key)
+
+  def __getitem__(self, key):
+    return self._storage[self._wrap_key(key)]
+
+  def __setitem__(self, key, value):
+    self._storage[self._wrap_key(key)] = value
+
+  def __delitem__(self, key):
+    del self._storage[self._wrap_key(key)]
+
+  def __len__(self):
+    return len(self._storage)
+
+  def __iter__(self):
+    for key in self._storage:
+      yield key.unwrapped
+
+
+class ObjectIdentityWeakKeyDictionary(ObjectIdentityDictionary):
+  """Like weakref.WeakKeyDictionary, but compares objects with "is"."""
+
+  def _wrap_key(self, key):
+    return _WeakObjectIdentityWrapper(key)
+
+  def __len__(self):
+    # Iterate, discarding old weak refs
+    return len(list(self._storage))
+
+  def __iter__(self):
+    keys = self._storage.keys()
+    for key in keys:
+      unwrapped = key.unwrapped
+      if unwrapped is None:
+        del self[key]
+      else:
+        yield unwrapped
+
+
+class ObjectIdentitySet(collections.MutableSet):
+  """Like the built-in set, but compares objects with "is"."""
+
+  def __init__(self, *args):
+    self._storage = set([self._wrap_key(obj) for obj in list(*args)])
+
+  def _wrap_key(self, key):
+    return _ObjectIdentityWrapper(key)
+
+  def __contains__(self, key):
+    return self._wrap_key(key) in self._storage
+
+  def discard(self, key):
+    self._storage.discard(self._wrap_key(key))
+
+  def add(self, key):
+    self._storage.add(self._wrap_key(key))
+
+  def __len__(self):
+    return len(self._storage)
+
+  def __iter__(self):
+    keys = list(self._storage)
+    for key in keys:
+      yield key.unwrapped
+
+
+class ObjectIdentityWeakSet(ObjectIdentitySet):
+  """Like weakref.WeakSet, but compares objects with "is"."""
+
+  def _wrap_key(self, key):
+    return _WeakObjectIdentityWrapper(key)
+
+  def __len__(self):
+    # Iterate, discarding old weak refs
+    return len([_ for _ in self])
+
+  def __iter__(self):
+    keys = list(self._storage)
+    for key in keys:
+      unwrapped = key.unwrapped
+      if unwrapped is None:
+        self.discard(key)
+      else:
+        yield unwrapped
diff --git a/tensorflow/python/training/checkpointable/tracking.py b/tensorflow/python/training/tracking/tracking.py
similarity index 63%
rename from tensorflow/python/training/checkpointable/tracking.py
rename to tensorflow/python/training/tracking/tracking.py
index f1ef5cca490cf979a924946a39b18b2c782b1228..c8024e14ab14e0fa6a254458de184131d0d5ab91 100644
--- a/tensorflow/python/training/checkpointable/tracking.py
+++ b/tensorflow/python/training/tracking/tracking.py
@@ -1,4 +1,4 @@
-"""Dependency tracking for checkpointable objects."""
+"""Dependency tracking for trackable objects."""
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,12 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function as defun
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import data_structures
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import data_structures
 from tensorflow.python.util import tf_contextlib
 
 
@@ -30,21 +30,21 @@ from tensorflow.python.util import tf_contextlib
 _RESOURCE_TRACKER_STACK = []
 
 
-class NotCheckpointable(object):
+class NotTrackable(object):
   """Marks instances of child classes as unsaveable using an object-based API.
 
-  Useful for marking objects which would otherwise look checkpointable because
-  of inheritance (e.g. through `Layer`) as not checkpointable. Inheriting from
-  `NotCheckpointable` does not prevent an object from being assigned to any
+  Useful for marking objects which would otherwise look trackable because
+  of inheritance (e.g. through `Layer`) as not trackable. Inheriting from
+  `NotTrackable` does not prevent an object from being assigned to any
   attributes, but will throw an error on save/restore.
   """
   pass
 
 
-class AutoCheckpointable(base.Checkpointable):
+class AutoTrackable(base.Trackable):
   """Manages dependencies on other objects.
 
-  `Checkpointable` objects may have dependencies: other `Checkpointable` objects
+  `Trackable` objects may have dependencies: other `Trackable` objects
   which should be saved if the object declaring the dependency is saved. A
   correctly saveable program has a dependency graph such that if changing a
   global variable affects an object (e.g. changes the behavior of any of its
@@ -52,32 +52,32 @@ class AutoCheckpointable(base.Checkpointable):
   the variable.
 
   Dependency edges have names, and are created implicitly when a
-  `Checkpointable` object is assigned to an attribute of another
-  `Checkpointable` object. For example:
+  `Trackable` object is assigned to an attribute of another
+  `Trackable` object. For example:
 
   ```
-  obj = Checkpointable()
+  obj = Trackable()
   obj.v = ResourceVariable(0.)
   ```
 
-  The `Checkpointable` object `obj` now has a dependency named "v" on a
+  The `Trackable` object `obj` now has a dependency named "v" on a
   variable.
 
-  `Checkpointable` objects may specify `Tensor`s to be saved and restored
+  `Trackable` objects may specify `Tensor`s to be saved and restored
   directly (e.g. a `Variable` indicating how to save itself) rather than through
   dependencies on other objects. See
-  `Checkpointable._gather_saveables_for_checkpoint` for details.
+  `Trackable._gather_saveables_for_checkpoint` for details.
   """
 
   def __setattr__(self, name, value):
-    """Support self.foo = checkpointable syntax."""
+    """Support self.foo = trackable syntax."""
     if getattr(self, "_setattr_tracking", True):
       value = data_structures.sticky_attribute_assignment(
-          checkpointable=self, value=value, name=name)
-    super(AutoCheckpointable, self).__setattr__(name, value)
+          trackable=self, value=value, name=name)
+    super(AutoTrackable, self).__setattr__(name, value)
 
   def __delattr__(self, name):
-    self._maybe_initialize_checkpointable()
+    self._maybe_initialize_trackable()
     if name in self._unconditional_dependency_names:
       del self._unconditional_dependency_names[name]
       for index, (dep_name, _) in enumerate(
@@ -85,12 +85,27 @@ class AutoCheckpointable(base.Checkpointable):
         if dep_name == name:
           del self._unconditional_checkpoint_dependencies[index]
           break
-    super(AutoCheckpointable, self).__delattr__(name)
+    super(AutoTrackable, self).__delattr__(name)
 
   def _no_dependency(self, value):
-    """Override to allow CheckpointableBase to disable dependency tracking."""
+    """Override to allow TrackableBase to disable dependency tracking."""
     return data_structures.NoDependency(value)
 
+  def _list_functions_for_serialization(self):
+    """Return a dict of `Function`s of a trackable."""
+    functions = dict()
+    for attribute_name in dir(self):
+      try:
+        attribute_value = getattr(self, attribute_name, None)
+      except Exception:  # pylint: disable=broad-except
+        # We really don't want to throw an exception just because some object's
+        # attribute accessor is broken.
+        attribute_value = None
+      if isinstance(attribute_value, (def_function.Function,
+                                      defun.ConcreteFunction)):
+        functions[attribute_name] = attribute_value
+    return functions
+
 
 class ResourceTracker(object):
   """An object that tracks a list of resources."""
@@ -135,7 +150,7 @@ def resource_tracker_scope(resource_tracker):
     _RESOURCE_TRACKER_STACK = old
 
 
-class TrackableResource(base.Checkpointable):
+class TrackableResource(base.Trackable):
   """Base class for all resources that need to be tracked."""
 
   def __init__(self):
@@ -161,29 +176,34 @@ class TrackableResource(base.Checkpointable):
       self._resource_handle = self.create_resource()
     return self._resource_handle
 
+  def _list_functions_for_serialization(self):
+    @def_function.function(input_signature=[], autograph=False)
+    def _creator():
+      resource = self.create_resource()
+      return resource
+
+    @def_function.function(input_signature=[], autograph=False)
+    def _initializer():
+      self.initialize()
+      return 1  # Dummy return
+
+    return {
+        "create_resource": _creator,
+        "initialize": _initializer,
+    }
+
 
-class TrackableAsset(base.Checkpointable):
+class TrackableAsset(base.Trackable):
   """Base class for asset files which need to be tracked."""
 
   def __init__(self, path):
     """Record the full path to the asset."""
-    # We use a variable here so that @tf.functions do not capture a literal
-    # value. The init_scope prevents functions from capturing `path` in an
+    # The init_scope prevents functions from capturing `path` in an
     # initialization graph, since it is transient and should not end up in a
-    # serialized function body. When serialized in a SavedModel, the variable
-    # will be set during the loading process to its location in the assets/
-    # directory.
+    # serialized function body.
     with ops.init_scope():
-      if context.executing_eagerly():
-        self._path = self._no_dependency(
-            resource_variable_ops.ResourceVariable(
-                path, dtype=dtypes.string,
-                name="asset_path"))
-      else:
-        # Adding a variable is too disruptive when v1-style graph building,
-        # since things may get fed and local variable initializers would then
-        # need to be run.
-        self._path = path
+      self._path = ops.internal_convert_to_tensor(path, dtype=dtypes.string,
+                                                  name="asset_path")
 
   @property
   def asset_path(self):
diff --git a/tensorflow/python/training/checkpointable/tracking_test.py b/tensorflow/python/training/tracking/tracking_test.py
similarity index 86%
rename from tensorflow/python/training/checkpointable/tracking_test.py
rename to tensorflow/python/training/tracking/tracking_test.py
index 87c6603ca256c23c7e79148520b2b4b36daff86d..37e14174368f074c1e9f430d2001a5e3f57008c6 100644
--- a/tensorflow/python/training/checkpointable/tracking_test.py
+++ b/tensorflow/python/training/tracking/tracking_test.py
@@ -25,35 +25,35 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 from tensorflow.python.util import nest
 
 
 class InterfaceTests(test.TestCase):
 
   def testMultipleAssignment(self):
-    root = tracking.AutoCheckpointable()
-    root.leaf = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
+    root.leaf = tracking.AutoTrackable()
     root.leaf = root.leaf
-    duplicate_name_dep = tracking.AutoCheckpointable()
+    duplicate_name_dep = tracking.AutoTrackable()
     with self.assertRaisesRegexp(ValueError, "already declared"):
-      root._track_checkpointable(duplicate_name_dep, name="leaf")
+      root._track_trackable(duplicate_name_dep, name="leaf")
     # No error; we're overriding __setattr__, so we can't really stop people
     # from doing this while maintaining backward compatibility.
     root.leaf = duplicate_name_dep
-    root._track_checkpointable(duplicate_name_dep, name="leaf", overwrite=True)
+    root._track_trackable(duplicate_name_dep, name="leaf", overwrite=True)
     self.assertIs(duplicate_name_dep, root._lookup_dependency("leaf"))
     (_, dep_object), = root._checkpoint_dependencies
     self.assertIs(duplicate_name_dep, dep_object)
 
   def testNoDependency(self):
-    root = tracking.AutoCheckpointable()
-    hasdep = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
+    hasdep = tracking.AutoTrackable()
     root.hasdep = hasdep
-    nodep = tracking.AutoCheckpointable()
+    nodep = tracking.AutoTrackable()
     root.nodep = data_structures.NoDependency(nodep)
     self.assertEqual(1, len(root._checkpoint_dependencies))
     self.assertIs(root._checkpoint_dependencies[0].ref, root.hasdep)
@@ -66,14 +66,14 @@ class InterfaceTests(test.TestCase):
       def __init__(self):
         super(NoDependencyModel, self).__init__()
         self.a = []
-        self.b = tracking.AutoCheckpointable()
+        self.b = tracking.AutoTrackable()
 
     nodeps = NoDependencyModel()
     self.assertEqual([nodeps], util.list_objects(nodeps))
 
   def testRemoveDependency(self):
-    root = tracking.AutoCheckpointable()
-    root.a = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
+    root.a = tracking.AutoTrackable()
     self.assertEqual(1, len(root._checkpoint_dependencies))
     self.assertEqual(1, len(root._unconditional_checkpoint_dependencies))
     self.assertIs(root.a, root._checkpoint_dependencies[0].ref)
@@ -81,16 +81,16 @@ class InterfaceTests(test.TestCase):
     self.assertFalse(hasattr(root, "a"))
     self.assertEqual(0, len(root._checkpoint_dependencies))
     self.assertEqual(0, len(root._unconditional_checkpoint_dependencies))
-    root.a = tracking.AutoCheckpointable()
+    root.a = tracking.AutoTrackable()
     self.assertEqual(1, len(root._checkpoint_dependencies))
     self.assertEqual(1, len(root._unconditional_checkpoint_dependencies))
     self.assertIs(root.a, root._checkpoint_dependencies[0].ref)
 
   def testListBasic(self):
-    a = tracking.AutoCheckpointable()
-    b = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
+    b = tracking.AutoTrackable()
     a.l = [b]
-    c = tracking.AutoCheckpointable()
+    c = tracking.AutoTrackable()
     a.l.append(c)
     a_deps = util.list_objects(a)
     self.assertIn(b, a_deps)
@@ -102,10 +102,10 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testMutationDirtiesList(self):
-    a = tracking.AutoCheckpointable()
-    b = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
+    b = tracking.AutoTrackable()
     a.l = [b]
-    c = tracking.AutoCheckpointable()
+    c = tracking.AutoTrackable()
     a.l.insert(0, c)
     checkpoint = util.Checkpoint(a=a)
     with self.assertRaisesRegexp(ValueError, "A list element was replaced"):
@@ -113,11 +113,11 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testOutOfBandEditDirtiesList(self):
-    a = tracking.AutoCheckpointable()
-    b = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
+    b = tracking.AutoTrackable()
     held_reference = [b]
     a.l = held_reference
-    c = tracking.AutoCheckpointable()
+    c = tracking.AutoTrackable()
     held_reference.append(c)
     checkpoint = util.Checkpoint(a=a)
     with self.assertRaisesRegexp(ValueError, "The wrapped list was modified"):
@@ -125,25 +125,25 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testNestedLists(self):
-    a = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
     a.l = []
-    b = tracking.AutoCheckpointable()
+    b = tracking.AutoTrackable()
     a.l.append([b])
-    c = tracking.AutoCheckpointable()
+    c = tracking.AutoTrackable()
     a.l[0].append(c)
     a_deps = util.list_objects(a)
     self.assertIn(b, a_deps)
     self.assertIn(c, a_deps)
     a.l[0].append(1)
-    d = tracking.AutoCheckpointable()
+    d = tracking.AutoTrackable()
     a.l[0].append(d)
     a_deps = util.list_objects(a)
     self.assertIn(d, a_deps)
     self.assertIn(b, a_deps)
     self.assertIn(c, a_deps)
     self.assertNotIn(1, a_deps)
-    e = tracking.AutoCheckpointable()
-    f = tracking.AutoCheckpointable()
+    e = tracking.AutoTrackable()
+    f = tracking.AutoTrackable()
     a.l1 = [[], [e]]
     a.l1[0].append(f)
     a_deps = util.list_objects(a)
@@ -198,7 +198,7 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testAssertions(self):
-    a = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
     a.l = {"k": [numpy.zeros([2, 2])]}
     self.assertAllEqual(nest.flatten({"k": [numpy.zeros([2, 2])]}),
                         nest.flatten(a.l))
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/tracking/util.py
similarity index 62%
rename from tensorflow/python/training/checkpointable/util.py
rename to tensorflow/python/training/tracking/util.py
index 129ad55f961794b387a38ecc28e539fca5826477..f5f70a443c4eb84d7783410909ee0698f03ab3c8 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/tracking/util.py
@@ -1,4 +1,4 @@
-"""Utilities for saving/loading Checkpointable objects."""
+"""Utilities for saving/loading Trackable objects."""
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,11 +18,10 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
-import collections
 import os
 import weakref
 
-from tensorflow.core.protobuf import checkpointable_object_graph_pb2
+from tensorflow.core.protobuf import trackable_object_graph_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import context
@@ -32,6 +31,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_io_ops as io_ops
@@ -39,45 +39,29 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import optimizer as optimizer_v1
 from tensorflow.python.training import saver as v1_saver_lib
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.training.saving import functional_saver
-from tensorflow.python.training.saving import saveable_object as saveable_object_lib
 from tensorflow.python.training.saving import saveable_object_util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import graph_view as graph_view_lib
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import tracking
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
 
-_ESCAPE_CHAR = "."  # For avoiding conflicts with user-specified names.
-
-# Keyword for identifying that the next bit of a checkpoint variable name is a
-# slot name. Checkpoint names for slot variables look like:
-#
-#   <path to variable>/<_OPTIMIZER_SLOTS_NAME>/<path to optimizer>/<slot name>
-#
-# Where <path to variable> is a full path from the checkpoint root to the
-# variable being slotted for.
-_OPTIMIZER_SLOTS_NAME = _ESCAPE_CHAR + "OPTIMIZER_SLOT"
-# Keyword for separating the path to an object from the name of an
-# attribute in checkpoint names. Used like:
-#   <path to variable>/<_OBJECT_ATTRIBUTES_NAME>/<name of attribute>
-_OBJECT_ATTRIBUTES_NAME = _ESCAPE_CHAR + "ATTRIBUTES"
-
-
 class _CheckpointRestoreCoordinator(object):
   """Holds the status of an object-based checkpoint load."""
 
   def __init__(self, object_graph_proto, save_path, save_path_tensor,
-               restore_op_cache, saveable_object_cache):
+               restore_op_cache, graph_view):
     """Specify the checkpoint being loaded.
 
     Args:
-      object_graph_proto: The CheckpointableObjectGraph protocol buffer
+      object_graph_proto: The TrackableObjectGraph protocol buffer
         associated with this checkpoint.
       save_path: A string, the path to the checkpoint, as returned by
         `tf.train.latest_checkpoint`.
@@ -87,10 +71,8 @@ class _CheckpointRestoreCoordinator(object):
         `_CheckpointRestoreCoordinator`s for the same Python objects, used to
         look up restore ops by name to avoid re-creating them across multiple
         `restore()` calls.
-      saveable_object_cache: A mapping of checkpointable objects -> attribute
-        names -> list(`SaveableObject`s), used when `SaveableObjects` must be
-        referenced every restore (e.g. for Python state); otherwise they would
-        create their own ops every restore.
+      graph_view: A graph_view_lib.ObjectGraphView object for the restored
+        objects.
     """
     self.object_graph_proto = object_graph_proto
     self.restore_uid = ops.uid()
@@ -98,7 +80,7 @@ class _CheckpointRestoreCoordinator(object):
     # not loaded into any object, for error checking.
     self.unused_attributes = weakref.WeakKeyDictionary()
     # Dictionary mapping from an id in the protocol buffer flat array to
-    # Checkpointable Python objects. This mapping may be deferred if a
+    # Trackable Python objects. This mapping may be deferred if a
     # checkpoint is restored before all dependencies have been tracked. Uses
     # weak references so that partial restorations don't create reference cycles
     # (as objects with deferred dependencies will generally have references to
@@ -108,7 +90,7 @@ class _CheckpointRestoreCoordinator(object):
     # use them (for example because of inconsistent references when
     # loading). Used to make status assertions fail when loading checkpoints
     # that don't quite match.
-    self.all_python_objects = _ObjectIdentityWeakSet()
+    self.all_python_objects = object_identity.ObjectIdentityWeakSet()
     self.save_path_tensor = save_path_tensor
     self.save_path_string = save_path
     self.dtype_map = pywrap_tensorflow.NewCheckpointReader(
@@ -119,7 +101,7 @@ class _CheckpointRestoreCoordinator(object):
     # this checkpoint.
     self.restore_ops = []
     self.restore_ops_by_name = restore_op_cache
-    self.saveable_object_cache = saveable_object_cache
+    self.graph_view = graph_view
     self.new_restore_ops_callback = None
     # A mapping from optimizer proto ids to lists of slot variables to be
     # restored when the optimizer is tracked. Only includes slot variables whose
@@ -195,7 +177,7 @@ class _NameBasedRestoreCoordinator(object):
     self.unused_attributes = weakref.WeakKeyDictionary()
     self.restore_uid = ops.uid()
 
-  def globally_named_object_attributes(self, checkpointable):
+  def globally_named_object_attributes(self, trackable):
     """Create globally named SaveableObjects from attributes.
 
     If an object's attribute has no global name specified (default construction
@@ -204,13 +186,13 @@ class _NameBasedRestoreCoordinator(object):
     fail; see `NameBasedSaverStatus`).
 
     Args:
-      checkpointable: An object to save.
+      trackable: An object to save.
 
     Yields:
-      SaveableObjects for `checkpointable`'s attributes.
+      SaveableObjects for `trackable`'s attributes.
     """
     for attribute_name, saveable_factory in (
-        checkpointable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
+        trackable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
       if callable(saveable_factory):
         try:
           # This saveable object factory does not have a default name= argument,
@@ -219,7 +201,7 @@ class _NameBasedRestoreCoordinator(object):
           # fails.
           saveable = saveable_factory()
         except TypeError:
-          self.unused_attributes.setdefault(checkpointable, []).append(
+          self.unused_attributes.setdefault(trackable, []).append(
               attribute_name)
           continue
       else:
@@ -232,14 +214,14 @@ class _NameBasedRestoreCoordinator(object):
             op=op, name=name):
           yield saveable_object
 
-  def eager_restore(self, checkpointable):
-    """Runs restore ops for `checkpointable`'s attributes."""
+  def eager_restore(self, trackable):
+    """Runs restore ops for `trackable`'s attributes."""
     # When graph building, we don't add any restore ops to the graph until
     # run_restore_ops/initialize_or_restore on the status object for name-based
     # checkpoints.
     assert context.executing_eagerly()
     for saveable in self.globally_named_object_attributes(
-        checkpointable):
+        trackable):
       restored_tensors = []
       tensor_missing = False
       for spec in saveable.specs:
@@ -299,10 +281,10 @@ def _default_getter(name, shape, dtype, initializer=None,
     )
 
 
-def add_variable(checkpointable, name, shape=None, dtype=dtypes.float32,
+def add_variable(trackable, name, shape=None, dtype=dtypes.float32,
                  initializer=None):
-  """Add a variable to a Checkpointable with no scope influence."""
-  return checkpointable._add_variable_with_custom_getter(  # pylint: disable=protected-access
+  """Add a variable to a Trackable with no scope influence."""
+  return trackable._add_variable_with_custom_getter(  # pylint: disable=protected-access
       name=name, shape=shape, dtype=dtype,
       initializer=initializer, getter=_default_getter)
 
@@ -325,7 +307,7 @@ def object_metadata(save_path):
     save_path: The path to the checkpoint, as returned by `save` or
       `tf.train.latest_checkpoint`.
   Returns:
-    A parsed `tf.contrib.checkpoint.CheckpointableObjectGraph` protocol buffer.
+    A parsed `tf.contrib.checkpoint.TrackableObjectGraph` protocol buffer.
   Raises:
     ValueError: If an object graph was not found in the checkpoint.
   """
@@ -340,503 +322,44 @@ def object_metadata(save_path):
          'saver and does not contain an object dependency graph.') % (
              save_path, base.OBJECT_GRAPH_PROTO_KEY))
   object_graph_proto = (
-      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+      trackable_object_graph_pb2.TrackableObjectGraph())
   object_graph_proto.ParseFromString(object_graph_string)
   return object_graph_proto
 
 
-class _ObjectIdentityWrapper(object):
-  """Wraps an object, mapping __eq__ on wrapper to "is" on wrapped.
-
-  Since __eq__ is based on object identity, it's safe to also define __hash__
-  based on object ids. This lets us add unhashable types like checkpointable
-  _ListWrapper objects to object-identity collections.
-  """
-
-  def __init__(self, wrapped):
-    self._wrapped = wrapped
-
-  @property
-  def unwrapped(self):
-    return self._wrapped
-
-  def __eq__(self, other):
-    if isinstance(other, _ObjectIdentityWrapper):
-      return self._wrapped is other._wrapped  # pylint: disable=protected-access
-    return self._wrapped is other
-
-  def __hash__(self):
-    # Wrapper id() is also fine for weakrefs. In fact, we rely on
-    # id(weakref.ref(a)) == id(weakref.ref(a)) and weakref.ref(a) is
-    # weakref.ref(a) in _WeakObjectIdentityWrapper.
-    return id(self._wrapped)
-
-
-class _WeakObjectIdentityWrapper(_ObjectIdentityWrapper):
-
-  def __init__(self, wrapped):
-    super(_WeakObjectIdentityWrapper, self).__init__(weakref.ref(wrapped))
-
-  @property
-  def unwrapped(self):
-    return self._wrapped()
-
-
-class ObjectIdentityDictionary(collections.MutableMapping):
-  """A mutable mapping data structure which compares using "is".
-
-  This is necessary because we have checkpointable objects (_ListWrapper) which
-  have behavior identical to built-in Python lists (including being unhashable
-  and comparing based on the equality of their contents by default).
-  """
-
-  def __init__(self):
-    self._storage = {}
-
-  def _wrap_key(self, key):
-    return _ObjectIdentityWrapper(key)
-
-  def __getitem__(self, key):
-    return self._storage[self._wrap_key(key)]
-
-  def __setitem__(self, key, value):
-    self._storage[self._wrap_key(key)] = value
-
-  def __delitem__(self, key):
-    del self._storage[self._wrap_key(key)]
-
-  def __len__(self):
-    return len(self._storage)
-
-  def __iter__(self):
-    for key in self._storage:
-      yield key.unwrapped
-
-
-class _ObjectIdentityWeakKeyDictionary(ObjectIdentityDictionary):
-  """Like weakref.WeakKeyDictionary, but compares objects with "is"."""
-
-  def _wrap_key(self, key):
-    return _WeakObjectIdentityWrapper(key)
-
-  def __len__(self):
-    # Iterate, discarding old weak refs
-    return len(list(self._storage))
-
-  def __iter__(self):
-    keys = self._storage.keys()
-    for key in keys:
-      unwrapped = key.unwrapped
-      if unwrapped is None:
-        del self[key]
-      else:
-        yield unwrapped
-
-
-class _ObjectIdentitySet(collections.MutableSet):
-  """Like the built-in set, but compares objects with "is"."""
-
-  def __init__(self, *args):
-    self._storage = set([self._wrap_key(obj) for obj in list(*args)])
-
-  def _wrap_key(self, key):
-    return _ObjectIdentityWrapper(key)
-
-  def __contains__(self, key):
-    return self._wrap_key(key) in self._storage
-
-  def discard(self, key):
-    self._storage.discard(self._wrap_key(key))
-
-  def add(self, key):
-    self._storage.add(self._wrap_key(key))
-
-  def __len__(self):
-    return len(self._storage)
-
-  def __iter__(self):
-    keys = list(self._storage)
-    for key in keys:
-      yield key.unwrapped
-
-
-class _ObjectIdentityWeakSet(_ObjectIdentitySet):
-  """Like weakref.WeakSet, but compares objects with "is"."""
-
-  def _wrap_key(self, key):
-    return _WeakObjectIdentityWrapper(key)
-
-  def __len__(self):
-    # Iterate, discarding old weak refs
-    return len([_ for _ in self])
-
-  def __iter__(self):
-    keys = list(self._storage)
-    for key in keys:
-      unwrapped = key.unwrapped
-      if unwrapped is None:
-        self.discard(key)
-      else:
-        yield unwrapped
-
-
-def _breadth_first_checkpointable_traversal(root_checkpointable):
-  """Find shortest paths to all variables owned by dependencies of root."""
-  bfs_sorted = []
-  to_visit = collections.deque([root_checkpointable])
-  path_to_root = ObjectIdentityDictionary()
-  path_to_root[root_checkpointable] = ()
-  while to_visit:
-    current_checkpointable = to_visit.popleft()
-    if isinstance(current_checkpointable, tracking.NotCheckpointable):
-      raise NotImplementedError(
-          ("The object %s does not support object-based saving. File a feature "
-           "request if this limitation bothers you. In the meantime, you can "
-           "remove the dependency on this object and save everything else.")
-          % (current_checkpointable,))
-    current_checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
-    bfs_sorted.append(current_checkpointable)
-    for child_checkpointable in (
-        current_checkpointable._checkpoint_dependencies):  # pylint: disable=protected-access
-      if child_checkpointable.ref not in path_to_root:
-        path_to_root[child_checkpointable.ref] = (
-            path_to_root[current_checkpointable] + (child_checkpointable,))
-        to_visit.append(child_checkpointable.ref)
-  return bfs_sorted, path_to_root
-
-
-def _escape_local_name(name):
-  # We need to support slashes in local names for compatibility, since this
-  # naming scheme is being patched in to things like Layer.add_variable where
-  # slashes were previously accepted. We also want to use slashes to indicate
-  # edges traversed to reach the variable, so we escape forward slashes in
-  # names.
-  return (name.replace(_ESCAPE_CHAR, _ESCAPE_CHAR + _ESCAPE_CHAR)
-          .replace(r"/", _ESCAPE_CHAR + "S"))
-
-
-def _object_prefix_from_path(path_to_root):
-  return "/".join(
-      (_escape_local_name(checkpointable.name)
-       for checkpointable in path_to_root))
-
-
-def _slot_variable_naming_for_optimizer(optimizer_path):
-  """Make a function for naming slot variables in an optimizer."""
-  # Name slot variables:
-  #
-  #   <variable name>/<_OPTIMIZER_SLOTS_NAME>/<optimizer path>/<slot name>
-  #
-  # where <variable name> is exactly the checkpoint name used for the original
-  # variable, including the path from the checkpoint root and the local name in
-  # the object which owns it. Note that we only save slot variables if the
-  # variable it's slotting for is also being saved.
-
-  optimizer_identifier = "/%s/%s/" % (_OPTIMIZER_SLOTS_NAME, optimizer_path)
-
-  def _name_slot_variable(variable_path, slot_name):
-    """With an optimizer specified, name a slot variable."""
-    return (variable_path
-            + optimizer_identifier
-            + _escape_local_name(slot_name))
-
-  return _name_slot_variable
-
-
-def _serialize_slot_variables(checkpointable_objects, node_ids, object_names):
-  """Gather and name slot variables."""
-  non_slot_objects = list(checkpointable_objects)
-  slot_variables = ObjectIdentityDictionary()
-  for checkpointable in non_slot_objects:
-    if (isinstance(checkpointable, optimizer_v1.Optimizer)
-        # TODO(b/110718070): Fix Keras imports.
-        or hasattr(checkpointable, "_create_or_restore_slot_variable")):
-      naming_scheme = _slot_variable_naming_for_optimizer(
-          optimizer_path=object_names[checkpointable])
-      slot_names = checkpointable.get_slot_names()
-      for slot_name in slot_names:
-        for original_variable_node_id, original_variable in enumerate(
-            non_slot_objects):
-          try:
-            slot_variable = checkpointable.get_slot(
-                original_variable, slot_name)
-          except (AttributeError, KeyError):
-            slot_variable = None
-          if slot_variable is None:
-            continue
-          slot_variable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
-          if slot_variable._checkpoint_dependencies:  # pylint: disable=protected-access
-            # TODO(allenl): Gather dependencies of slot variables.
-            raise NotImplementedError(
-                "Currently only variables with no dependencies can be saved as "
-                "slot variables. File a feature request if this limitation "
-                "bothers you.")
-          if slot_variable in node_ids:
-            raise NotImplementedError(
-                "A slot variable was re-used as a dependency of a "
-                "Checkpointable object. This is not currently allowed. File a "
-                "feature request if this limitation bothers you.")
-          checkpoint_name = naming_scheme(
-              variable_path=object_names[original_variable],
-              slot_name=slot_name)
-          object_names[slot_variable] = checkpoint_name
-          slot_variable_node_id = len(checkpointable_objects)
-          node_ids[slot_variable] = slot_variable_node_id
-          checkpointable_objects.append(slot_variable)
-          slot_variable_proto = (
-              checkpointable_object_graph_pb2.CheckpointableObjectGraph
-              .CheckpointableObject.SlotVariableReference(
-                  slot_name=slot_name,
-                  original_variable_node_id=original_variable_node_id,
-                  slot_variable_node_id=slot_variable_node_id))
-          slot_variables.setdefault(checkpointable, []).append(
-              slot_variable_proto)
-  return slot_variables
-
-
-def _add_attributes_to_object_graph(
-    checkpointable_objects, object_graph_proto, node_ids, object_names,
-    saveables_cache, object_map):
-  """Create SaveableObjects and corresponding SerializedTensor protos."""
-  named_saveable_objects = []
-  if saveables_cache is None:
-    # No SaveableObject caching. Either we're executing eagerly, or building a
-    # static save which is specialized to the current Python state.
-    feed_additions = None
-  else:
-    # If we are caching SaveableObjects, we need to build up a feed_dict with
-    # functions computing volatile Python state to be saved with the checkpoint.
-    feed_additions = {}
-  for checkpoint_id, (checkpointable, object_proto) in enumerate(
-      zip(checkpointable_objects, object_graph_proto.nodes)):
-    assert node_ids[checkpointable] == checkpoint_id
-    object_name = object_names[checkpointable]
-    if object_map:
-      object_to_save = object_map.get(checkpointable, checkpointable)
-    else:
-      object_to_save = checkpointable
-    if saveables_cache is not None:
-      cached_attributes = saveables_cache.setdefault(object_to_save, {})
-    else:
-      cached_attributes = None
-
-    for name, saveable_factory in (
-        object_to_save._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
-      attribute = object_proto.attributes.add()
-      attribute.name = name
-      attribute.checkpoint_key = "%s/%s/%s" % (
-          object_name, _OBJECT_ATTRIBUTES_NAME, _escape_local_name(name))
-      if cached_attributes is None:
-        saveables = None
-      else:
-        saveables = cached_attributes.get(name, None)
-        if saveables is not None:
-          for saveable in saveables:
-            if attribute.checkpoint_key not in saveable.name:
-              # The checkpoint key for this SaveableObject is different. We need
-              # to re-create it.
-              saveables = None
-              del cached_attributes[name]
-              break
-      if saveables is None:
-        if callable(saveable_factory):
-          maybe_saveable = saveable_factory(name=attribute.checkpoint_key)
-        else:
-          maybe_saveable = saveable_factory
-        if isinstance(maybe_saveable, saveable_object_lib.SaveableObject):
-          saveables = (maybe_saveable,)
-        else:
-          # Figure out the name-based Saver's name for this variable. If it's
-          # already a SaveableObject we'd just get the checkpoint key back, so
-          # we leave full_name blank.
-          saver_dict = saveable_object_util.op_list_to_dict(
-              [maybe_saveable], convert_variable_to_tensor=False)
-          full_name, = saver_dict.keys()
-          saveables = tuple(saveable_object_util.saveable_objects_for_op(
-              op=maybe_saveable, name=attribute.checkpoint_key))
-          for saveable in saveables:
-            saveable.full_name = full_name
-        for saveable in saveables:
-          if attribute.checkpoint_key not in saveable.name:
-            raise AssertionError(
-                ("The object %s produced a SaveableObject with name '%s' for "
-                 "attribute '%s'. Expected a name containing '%s'.")
-                % (checkpointable, name, saveable.name,
-                   attribute.checkpoint_key))
-        if cached_attributes is not None:
-          cached_attributes[name] = saveables
-
-      optional_restore = None
-      for saveable in saveables:
-        if optional_restore is None:
-          optional_restore = saveable.optional_restore
-        else:
-          optional_restore = optional_restore and saveable.optional_restore
-
-        if hasattr(saveable, "full_name"):
-          attribute.full_name = saveable.full_name
-        if isinstance(saveable, base.PythonStateSaveable):
-          if feed_additions is None:
-            assert saveables_cache is None
-            # If we're not caching saveables, then we're either executing
-            # eagerly or building a static save/restore (e.g. for a
-            # SavedModel). In either case, we should embed the current Python
-            # state in the graph rather than relying on a feed dict.
-            saveable = saveable.freeze()
-          else:
-            saveable_feed_dict = saveable.feed_dict_additions()
-            for new_feed_key in saveable_feed_dict.keys():
-              if new_feed_key in feed_additions:
-                raise AssertionError(
-                    ("The object %s tried to feed a value for the Tensor %s "
-                     "when saving, but another object is already feeding a "
-                     "value.")
-                    % (checkpointable, new_feed_key))
-            feed_additions.update(saveable_feed_dict)
-        named_saveable_objects.append(saveable)
-      if optional_restore is None:
-        optional_restore = False
-      attribute.optional_restore = optional_restore
-
-  return named_saveable_objects, feed_additions
-
-
-def fill_object_graph_proto(checkpointable_objects,
-                            node_ids,
-                            slot_variables,
-                            object_graph_proto=None):
-  """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
-  if object_graph_proto is None:
-    object_graph_proto = (
-        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
-  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
-    assert node_ids[checkpointable] == checkpoint_id
-    object_proto = object_graph_proto.nodes.add()
-    object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
-    for child in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
-      child_proto = object_proto.children.add()
-      child_proto.node_id = node_ids[child.ref]
-      child_proto.local_name = child.name
-  return object_graph_proto
-
-
-def _serialize_gathered_objects(
-    checkpointable_objects, path_to_root, saveables_cache, object_map):
-  """Create SaveableObjects and protos for gathered objects."""
-  object_names = ObjectIdentityDictionary()
-  for obj, path in path_to_root.items():
-    object_names[obj] = _object_prefix_from_path(path)
-  node_ids = ObjectIdentityDictionary()
-  for node_id, node in enumerate(checkpointable_objects):
-    node_ids[node] = node_id
-  slot_variables = _serialize_slot_variables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names)
-  object_graph_proto = fill_object_graph_proto(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      slot_variables=slot_variables)
-  named_saveable_objects, feed_additions = _add_attributes_to_object_graph(
-      checkpointable_objects=checkpointable_objects,
-      object_graph_proto=object_graph_proto,
-      node_ids=node_ids,
-      object_names=object_names,
-      saveables_cache=saveables_cache,
-      object_map=object_map)
-  return named_saveable_objects, object_graph_proto, feed_additions
-
-
-def _serialize_object_graph(root_checkpointable, saveables_cache):
-  """Determine checkpoint keys for variables and build a serialized graph.
-
-  Non-slot variables are keyed based on a shortest path from the root saveable
-  to the object which owns the variable (i.e. the one which called
-  `Checkpointable._add_variable` to create it).
-
-  Slot variables are keyed based on a shortest path to the variable being
-  slotted for, a shortest path to their optimizer, and the slot name.
-
-  Args:
-    root_checkpointable: A `Checkpointable` object whose variables (including
-      the variables of dependencies, recursively) should be saved.
-    saveables_cache: A dictionary mapping `Checkpointable` objects -> attribute
-      names -> SaveableObjects, used to avoid re-creating SaveableObjects when
-      graph building.
-
-  Returns:
-    A tuple of (named_variables, object_graph_proto, feed_additions):
-      named_variables: A dictionary mapping names to variable objects.
-      object_graph_proto: A CheckpointableObjectGraph protocol buffer containing
-        the serialized object graph and variable references.
-      feed_additions: A dictionary mapping from Tensors to values which should
-        be fed when saving.
-
-  Raises:
-    ValueError: If there are invalid characters in an optimizer's slot names.
-  """
-  checkpointable_objects, path_to_root = (
-      _breadth_first_checkpointable_traversal(root_checkpointable))
-  return _serialize_gathered_objects(
-      checkpointable_objects, path_to_root, saveables_cache, object_map=None)
-
-
-def named_saveables(root_checkpointable):
-  """Gather list of all SaveableObjects in the Checkpointable object."""
-  return _serialize_object_graph(root_checkpointable, None)[0]
-
-
-def find_objects(root_checkpointable):
-  """Find and number objects which are dependencies of `root_checkpointable`."""
-  checkpointable_objects, path_to_root = (
-      _breadth_first_checkpointable_traversal(root_checkpointable))
-  object_names = ObjectIdentityDictionary()
-  for obj, path in path_to_root.items():
-    object_names[obj] = _object_prefix_from_path(path)
-  node_ids = ObjectIdentityDictionary()
-  for node_id, node in enumerate(checkpointable_objects):
-    node_ids[node] = node_id
-  slot_variables = _serialize_slot_variables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names)
-  return checkpointable_objects, node_ids, slot_variables
-
-
-def list_objects(root_checkpointable):
+def list_objects(root_trackable):
   """Traverse the object graph and list all accessible objects.
 
-  Looks for `Checkpointable` objects which are dependencies of
-  `root_checkpointable`. Includes slot variables only if the variable they are
-  slotting for and the optimizer are dependencies of `root_checkpointable`
+  Looks for `Trackable` objects which are dependencies of
+  `root_trackable`. Includes slot variables only if the variable they are
+  slotting for and the optimizer are dependencies of `root_trackable`
   (i.e. if they would be saved with a checkpoint).
 
   Args:
-    root_checkpointable: A `Checkpointable` object whose dependencies should be
+    root_trackable: A `Trackable` object whose dependencies should be
       flattened.
   Returns:
     A flat list of objects.
   """
-  checkpointable_objects, _, _ = find_objects(root_checkpointable)
-  return checkpointable_objects
+  return graph_view_lib.ObjectGraphView(root_trackable).list_objects()
 
 
-def gather_initializers(root_checkpointable):
+def gather_initializers(root_trackable):
   """Traverse the object graph and find initialization ops.
 
-  Looks for `Checkpointable` objects which are dependencies of
-  `root_checkpointable` and which have an `initializer` property. Includes
+  Looks for `Trackable` objects which are dependencies of
+  `root_trackable` and which have an `initializer` property. Includes
   initializers for slot variables only if the variable they are slotting for and
-  the optimizer are dependencies of `root_checkpointable` (i.e. if they would be
+  the optimizer are dependencies of `root_trackable` (i.e. if they would be
   saved with a checkpoint).
 
   Args:
-    root_checkpointable: A `Checkpointable` object to gather initializers for.
+    root_trackable: A `Trackable` object to gather initializers for.
   Returns:
     A list of initialization ops.
   """
-  checkpointable_objects = list_objects(root_checkpointable)
-  return [c.initializer for c in checkpointable_objects
+  trackable_objects = list_objects(root_trackable)
+  return [c.initializer for c in trackable_objects
           if hasattr(c, "initializer") and c.initializer is not None]
 
 
@@ -846,7 +369,7 @@ def capture_dependencies(template):
 
   Requires that `template.variable_scope` is active.
 
-  This scope is intended as a compatibility measure, allowing a checkpointable
+  This scope is intended as a compatibility measure, allowing a trackable
   object to add dependencies on variables created in a block of code which is
   not aware of object-based saving (and instead uses variable names
   heavily). This is how `Template` objects add dependencies on variables and
@@ -860,17 +383,17 @@ def capture_dependencies(template):
   """
   name_prefix = template.variable_scope.name
 
-  def _checkpointable_custom_creator(next_creator, name, initial_value,
-                                     checkpointable_parent=None, **kwargs):
-    """A variable creation hook which adds Checkpointable dependencies.
+  def _trackable_custom_creator(next_creator, name, initial_value,
+                                trackable_parent=None, **kwargs):
+    """A variable creation hook which adds Trackable dependencies.
 
     Set for example during a `Template`'s first wrapped function
-    execution. Ensures that (a) `template` depends on any checkpointable
+    execution. Ensures that (a) `template` depends on any trackable
     objects using their own `capture_dependencies` scope inside this scope which
     create variables, and (b) that any variables not in a more deeply nested
     scope are added as dependencies directly.
 
-    The `checkpointable_parent` argument is passed between custom creators but
+    The `trackable_parent` argument is passed between custom creators but
     ignored when the variable object itself is created. This argument indicates
     (if not `None`) that a more deeply nested scope has already added the
     variable as a dependency, and that parent scopes should add a dependency on
@@ -884,8 +407,8 @@ def capture_dependencies(template):
         but scopes opened within this scope are respected.
       initial_value: See `variable_scope.variable_creator_scope`. Taken
         explicitly so the argument can be re-named and used with
-        `Checkpointable._add_variable_with_custom_getter`.
-      checkpointable_parent: If not None, a more deeply nested checkpointable
+        `Trackable._add_variable_with_custom_getter`.
+      trackable_parent: If not None, a more deeply nested trackable
         object and its name prefix which were passed to `capture_dependencies`
         to add a dependency on (rather than depending on the variable directly).
       **kwargs: Passed through to the next creator.
@@ -902,28 +425,28 @@ def capture_dependencies(template):
           **inner_kwargs)
     if name is not None and name.startswith(name_prefix):
       scope_stripped_name = name[len(name_prefix) + 1:]
-      if not checkpointable_parent:
+      if not trackable_parent:
         return template._add_variable_with_custom_getter(  # pylint: disable=protected-access
             initializer=initial_value,
             name=scope_stripped_name,
             getter=_call_next_creator_renaming_initializer,
-            # Disable error checking for Checkpointable. Exceptions are instead
+            # Disable error checking for Trackable. Exceptions are instead
             # raised if necessary when the object-based saver tries to
             # save/restore the object.
             overwrite=True,
-            checkpointable_parent=(template, name_prefix),
+            trackable_parent=(template, name_prefix),
             **kwargs)
       else:
-        parent_object, parent_name_prefix = checkpointable_parent
-        template._track_checkpointable(  # pylint: disable=protected-access
+        parent_object, parent_name_prefix = trackable_parent
+        template._track_trackable(  # pylint: disable=protected-access
             parent_object,
             name=parent_name_prefix[len(name_prefix) + 1:],
             overwrite=True)
     return next_creator(
         name=name, initial_value=initial_value,
-        checkpointable_parent=(template, name_prefix), **kwargs)
+        trackable_parent=(template, name_prefix), **kwargs)
 
-  with variable_scope.variable_creator_scope(_checkpointable_custom_creator):
+  with variable_scope.variable_creator_scope(_trackable_custom_creator):
     yield
 
 
@@ -999,10 +522,10 @@ class CheckpointLoadStatus(_LoadStatus):
   See `Saver.restore` for usage examples.
   """
 
-  def __init__(self, checkpoint, feed_dict, root_checkpointable):
+  def __init__(self, checkpoint, feed_dict, graph_view):
     self._checkpoint = checkpoint
     self._feed_dict = feed_dict
-    self._root_checkpointable = root_checkpointable
+    self._graph_view = graph_view
 
   def assert_consumed(self):
     """Asserts that all objects in the checkpoint have been created/matched.
@@ -1017,8 +540,8 @@ class CheckpointLoadStatus(_LoadStatus):
     """
     self.assert_existing_objects_matched()
     for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
-      checkpointable = self._checkpoint.object_by_proto_id.get(node_id, None)
-      if checkpointable is None:
+      trackable = self._checkpoint.object_by_proto_id.get(node_id, None)
+      if trackable is None:
         raise AssertionError("Unresolved object in checkpoint: %s" % (node,))
     if self._checkpoint.slot_restorations:
       # Sanity check; this collection should be clear if everything has been
@@ -1033,7 +556,7 @@ class CheckpointLoadStatus(_LoadStatus):
     return self
 
   def assert_existing_objects_matched(self):
-    """Asserts that checkpointable Python objects have been matched.
+    """Asserts that trackable Python objects have been matched.
 
     Note that this is a weaker assertion than `assert_consumed`. It will only
     fail for existing Python objects which are (transitive) dependencies of the
@@ -1050,22 +573,23 @@ class CheckpointLoadStatus(_LoadStatus):
         of the root object but does not have a value in the checkpoint.
     """
     for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
-      checkpointable = self._checkpoint.object_by_proto_id.get(node_id, None)
-      if (checkpointable is not None
-          and checkpointable._update_uid < self._checkpoint.restore_uid):  # pylint: disable=protected-access
+      trackable = self._checkpoint.object_by_proto_id.get(node_id, None)
+      if (trackable is not None
+          and trackable._update_uid < self._checkpoint.restore_uid):  # pylint: disable=protected-access
         raise AssertionError(
             "Object not assigned a value from checkpoint: %s" % (node,))
-    for checkpointable_object in list_objects(self._root_checkpointable):
+    for trackable_object in self._graph_view.list_objects():
       # Remove data structures that do not contain any variables from
       # restoration checks.
-      if (isinstance(checkpointable_object,
-                     data_structures.CheckpointableDataStructure) and
-          not checkpointable_object._checkpoint_dependencies):
+      if (isinstance(trackable_object,
+                     data_structures.TrackableDataStructure) and
+          not trackable_object._checkpoint_dependencies):
         continue
-      self._checkpoint.all_python_objects.add(checkpointable_object)
+      self._checkpoint.all_python_objects.add(trackable_object)
     unused_python_objects = (
-        _ObjectIdentitySet(self._checkpoint.all_python_objects)
-        - _ObjectIdentitySet(self._checkpoint.object_by_proto_id.values()))
+        object_identity.ObjectIdentitySet(self._checkpoint.all_python_objects)
+        - object_identity.ObjectIdentitySet(
+            self._checkpoint.object_by_proto_id.values()))
     if unused_python_objects:
       raise AssertionError(
           ("Some Python objects were not bound to checkpointed values, likely "
@@ -1075,12 +599,14 @@ class CheckpointLoadStatus(_LoadStatus):
 
   def assert_nontrivial_match(self):
     """Raises an exception if only the root object matched."""
-    for checkpointable_object in list_objects(self._root_checkpointable):
-      self._checkpoint.all_python_objects.add(checkpointable_object)
+    for trackable_object in self._graph_view.list_objects():
+      self._checkpoint.all_python_objects.add(trackable_object)
     if len(self._checkpoint.object_by_proto_id) <= 1:
       unused_python_objects = (
-          _ObjectIdentitySet(self._checkpoint.all_python_objects)
-          - _ObjectIdentitySet(self._checkpoint.object_by_proto_id.values()))
+          object_identity.ObjectIdentitySet(
+              self._checkpoint.all_python_objects)
+          - object_identity.ObjectIdentitySet(
+              self._checkpoint.object_by_proto_id.values()))
       if unused_python_objects:
         raise AssertionError(
             ("Nothing except the root object matched a checkpointed value. "
@@ -1090,7 +616,7 @@ class CheckpointLoadStatus(_LoadStatus):
       else:
         raise AssertionError(
             "Nothing to load. No dependencies have been added to %s yet." % (
-                self._root_checkpointable,))
+                self._graph_view.root,))
     return self
 
   def run_restore_ops(self, session=None):
@@ -1120,8 +646,8 @@ class CheckpointLoadStatus(_LoadStatus):
       return  # Initialization and restoration ops are run eagerly
     if session is None:
       session = ops.get_default_session()
-    all_objects = list_objects(self._root_checkpointable)
-    already_initialized_objects = _ObjectIdentitySet(
+    all_objects = self._graph_view.list_objects()
+    already_initialized_objects = object_identity.ObjectIdentitySet(
         self._checkpoint.object_by_proto_id.values())
     initializers_for_non_restored_variables = [
         c.initializer for c in all_objects
@@ -1143,9 +669,9 @@ class InitializationOnlyStatus(_LoadStatus):
   otherwise.
   """
 
-  def __init__(self, root_checkpointable, restore_uid):
+  def __init__(self, graph_view, restore_uid):
     self._restore_uid = restore_uid
-    self._root_checkpointable = root_checkpointable
+    self._graph_view = graph_view
 
   def assert_consumed(self):
     """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
@@ -1193,9 +719,9 @@ class InitializationOnlyStatus(_LoadStatus):
       return  # run eagerly
     if session is None:
       session = ops.get_default_session()
-    checkpointable_objects = list_objects(self._root_checkpointable)
+    trackable_objects = self._graph_view.list_objects()
     initializers = [
-        c.initializer for c in checkpointable_objects
+        c.initializer for c in trackable_objects
         if hasattr(c, "initializer") and c.initializer is not None
         and (getattr(c, "_update_uid", self._restore_uid - 1)
              < self._restore_uid)]
@@ -1218,9 +744,9 @@ class NameBasedSaverStatus(_LoadStatus):
   # interferes with isinstance checks.
   @deprecation.deprecated(
       date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS)
-  def __init__(self, checkpoint, root_checkpointable):
+  def __init__(self, checkpoint, graph_view):
     self._checkpoint = checkpoint
-    self._root_checkpointable = root_checkpointable
+    self._graph_view = graph_view
 
   def assert_consumed(self):
     """Raises an exception if any variables/objects are unmatched."""
@@ -1229,11 +755,11 @@ class NameBasedSaverStatus(_LoadStatus):
       raise AssertionError(
           "Some objects had attributes which were not restored: %s"
           % (unused_attributes,))
-    for checkpointable in list_objects(self._root_checkpointable):
+    for trackable in self._graph_view.list_objects():
       # pylint: disable=protected-access
-      checkpointable._maybe_initialize_checkpointable()
-      if checkpointable._update_uid < self._checkpoint.restore_uid:
-        raise AssertionError("Object not restored: %s" % (checkpointable,))
+      trackable._maybe_initialize_trackable()
+      if trackable._update_uid < self._checkpoint.restore_uid:
+        raise AssertionError("Object not restored: %s" % (trackable,))
       # pylint: enable=protected-access
     return self
 
@@ -1255,19 +781,19 @@ class NameBasedSaverStatus(_LoadStatus):
 
   def _gather_saveable_objects(self):
     """Walk the object graph, using global names for SaveableObjects."""
-    objects = list_objects(self._root_checkpointable)
+    objects = self._graph_view.list_objects()
     saveable_objects = []
-    for checkpointable in objects:
+    for trackable in objects:
       # pylint: disable=protected-access
-      checkpointable._maybe_initialize_checkpointable()
-      if checkpointable._update_uid < self._checkpoint.restore_uid:
-        checkpointable._update_uid = self._checkpoint.restore_uid
+      trackable._maybe_initialize_trackable()
+      if trackable._update_uid < self._checkpoint.restore_uid:
+        trackable._update_uid = self._checkpoint.restore_uid
       else:
         continue
       # pylint: enable=protected-access
       saveable_objects.extend(
           self._checkpoint.globally_named_object_attributes(
-              checkpointable))
+              trackable))
     return saveable_objects
 
   def run_restore_ops(self, session=None):
@@ -1303,36 +829,32 @@ class _SessionWithFeedDictAdditions(session_lib.SessionInterface):
         fetches=fetches, feed_dict=feed_dict, **kwargs)
 
 
-class CheckpointableSaver(object):
-  """Saves and restores a `Checkpointable` object and its dependencies.
+class TrackableSaver(object):
+  """Saves and restores a `Trackable` object and its dependencies.
 
-  See `Checkpointable` for details of dependency management. `Saver` wraps
+  See `Trackable` for details of dependency management. `Saver` wraps
   `tf.train.Saver` for saving, including extra information about the graph of
   dependencies between Python objects. When restoring, it uses this information
   about the save-time dependency graph to more robustly match objects with their
   checkpointed values. When executing eagerly, it supports restoring variables
   on object creation (see `Saver.restore`).
 
-  Values in a checkpoint are mapped to `Checkpointable` Python objects
+  Values in a checkpoint are mapped to `Trackable` Python objects
   (`Variable`s, `Optimizer`s, `Layer`s) based on the names provided when the
   checkpoint was written. To avoid breaking existing checkpoints when modifying
-  a class, dependency names (the names of attributes to which `Checkpointable`
+  a class, dependency names (the names of attributes to which `Trackable`
   objects are assigned) may not change. These names are local to objects, in
   contrast to the `Variable.name`-based save/restore from `tf.train.Saver`, and
   so allow additional program transformations.
   """
 
-  def __init__(self, root_checkpointable):
+  def __init__(self, graph_view):
     """Configure saving.
 
     Args:
-      root_checkpointable: The root of the object graph to save/restore. This
-        object and all of its dependencies are saved in the checkpoint. When
-        restoring, objects are matched and restored starting from this root.
+      graph_view: A `GraphView` object containing a description of the object
+        graph to save.
     """
-    # Allow passing in a weak reference to avoid reference cycles when
-    # `Checkpointable` objects save themselves.
-    self._root_checkpointable_ref = root_checkpointable
     # The file prefix placeholder is created lazily when graph building (and not
     # at all when executing eagerly) to avoid creating ops in the constructor
     # (when they may never be necessary).
@@ -1346,34 +868,13 @@ class CheckpointableSaver(object):
 
     # Op caching for restore, shared between _CheckpointRestoreCoordinators
     self._restore_op_cache = {}
-
-    if context.executing_eagerly():
-      # SaveableObjects are always recreated when executing eagerly.
-      self._saveable_object_cache = None
-    else:
-      # Maps Checkpointable objects -> attribute names -> list(SaveableObjects),
-      # to avoid re-creating SaveableObjects when graph building.
-      self._saveable_object_cache = _ObjectIdentityWeakKeyDictionary()
-
-  @property
-  def _root_checkpointable(self):
-    if isinstance(self._root_checkpointable_ref, weakref.ref):
-      derefed = self._root_checkpointable_ref()
-      assert derefed is not None
-      return derefed
-    else:
-      return self._root_checkpointable_ref
+    self._graph_view = graph_view
 
   def _gather_saveables(
-      self, object_graph_tensor=None, saveable_object_cache=None):
+      self, object_graph_tensor=None):
     """Wraps _serialize_object_graph to include the object graph proto."""
-    assert ((object_graph_tensor is None and saveable_object_cache is None)
-            or (object_graph_tensor is not None
-                and saveable_object_cache is not None))
     (named_saveable_objects, graph_proto,
-     feed_additions) = _serialize_object_graph(
-         self._root_checkpointable,
-         saveables_cache=saveable_object_cache)
+     feed_additions) = self._graph_view.serialize_object_graph()
     if object_graph_tensor is None:
       with ops.device("/cpu:0"):
         object_graph_tensor = constant_op.constant(
@@ -1388,52 +889,16 @@ class CheckpointableSaver(object):
             name=base.OBJECT_GRAPH_PROTO_KEY))
     return named_saveable_objects, graph_proto, feed_additions
 
-  def gather_objects(self, object_map=None, to_graph=None):
-    """Creates SaveableObjects with the current object graph frozen."""
-    checkpointable_objects, path_to_root = (
-        _breadth_first_checkpointable_traversal(self._root_checkpointable))
-    if to_graph:
-      target_context = to_graph.as_default
-    else:
-      target_context = ops.NullContextmanager
-    with target_context():
-      named_saveable_objects, graph_proto, _ = _serialize_gathered_objects(
-          checkpointable_objects,
-          path_to_root,
-          saveables_cache=None,
-          object_map=object_map)
-      with ops.device("/cpu:0"):
-        object_graph_tensor = constant_op.constant(
-            graph_proto.SerializeToString(), dtype=dtypes.string)
-      named_saveable_objects.append(
-          base.NoRestoreSaveable(
-              tensor=object_graph_tensor,
-              name=base.OBJECT_GRAPH_PROTO_KEY))
-    return named_saveable_objects
-
-  def freeze(self, object_map=None, to_graph=None):
-    named_saveable_objects = self.gather_objects(
-        object_map=object_map, to_graph=to_graph)
-    return functional_saver.Saver(named_saveable_objects)
-
   def _save_cached_when_graph_building(
       self,
       file_prefix,
-      object_graph_tensor=None,
-      saveable_object_cache=None):
+      object_graph_tensor=None):
     """Create or retrieve save ops.
 
-    When graph building, `saveable_object_cache` will typically be non-`None`,
-    meaning that existing `SaveableObject`s are re-used across calls to
-    `_prepare_save` even if the object graph has grown. This avoids
-    unnecessarily re-creating save ops.
-
     Args:
       file_prefix: The prefix for saved checkpoint files.
       object_graph_tensor: A `Tensor` to which the current object graph will be
         fed.
-      saveable_object_cache: A dictionary; if specified, used to cache
-        `SaveableObject`s.
 
     Returns:
       A two-element tuple with a filename tensor and a feed_dict of tensors to
@@ -1443,14 +908,14 @@ class CheckpointableSaver(object):
     """
     (named_saveable_objects, graph_proto,
      feed_additions) = self._gather_saveables(
-         object_graph_tensor=object_graph_tensor,
-         saveable_object_cache=saveable_object_cache)
+         object_graph_tensor=object_graph_tensor)
     if (self._last_save_object_graph != graph_proto
         # When executing eagerly, we need to re-create SaveableObjects each time
         # save() is called so they pick up new Tensors passed to their
         # constructors. That means the Saver needs to be copied with a new
         # var_list.
-        or context.executing_eagerly()):
+        or context.executing_eagerly()
+        or ops.inside_function()):
       saver = functional_saver.Saver(named_saveable_objects)
       with ops.device("/cpu:0"):
         self._cached_save_operation = saver.save(file_prefix)
@@ -1461,7 +926,7 @@ class CheckpointableSaver(object):
     """Save a training checkpoint.
 
     The saved checkpoint includes variables created by this object and any
-    Checkpointable objects it depends on at the time `Saver.save()` is called.
+    Trackable objects it depends on at the time `Saver.save()` is called.
 
     Args:
       file_prefix: A prefix to use for the checkpoint filenames
@@ -1470,8 +935,8 @@ class CheckpointableSaver(object):
       checkpoint_number: An integer variable or Tensor, used to number
         checkpoints. Typically this value is saved along with other variables in
         training checkpoints, which will happen automatically if it was created
-        by `root_checkpointable` or one of its dependencies (via
-        `Checkpointable._add_variable`).
+        by `root_trackable` or one of its dependencies (via
+        `Trackable._add_variable`).
       session: The session to evaluate variables in. Ignored when executing
         eagerly. If not provided when graph building, the default session is
         used.
@@ -1480,10 +945,11 @@ class CheckpointableSaver(object):
       The full path to the checkpoint.
     """
     feed_dict = {}
-    graph_building = not context.executing_eagerly()
+    use_session = (not context.executing_eagerly()
+                   and not ops.inside_function())
     if checkpoint_number:
       file_prefix = "%s-%d" % (file_prefix, checkpoint_number)
-    if graph_building:
+    if use_session:
       if self._object_graph_feed_tensor is None:
         with ops.device("/cpu:0"):
           self._object_graph_feed_tensor = constant_op.constant(
@@ -1502,28 +968,26 @@ class CheckpointableSaver(object):
     file_io.recursive_create_dir(os.path.dirname(file_prefix))
     save_path, new_feed_additions = self._save_cached_when_graph_building(
         file_prefix=file_prefix_tensor,
-        object_graph_tensor=object_graph_tensor,
-        saveable_object_cache=self._saveable_object_cache)
+        object_graph_tensor=object_graph_tensor)
     if new_feed_additions:
       feed_dict.update(new_feed_additions)
-    if not graph_building:
+    if not use_session:
       session = None
     elif session is None:
       session = ops.get_default_session()
 
     if session:
-      save_path = session.run(save_path, feed_dict=feed_dict)
+      return session.run(save_path, feed_dict=feed_dict)
     else:
-      save_path = save_path.numpy()
-    return save_path
+      return save_path
 
   def restore(self, save_path):
     """Restore a training checkpoint.
 
-    Restores `root_checkpointable` and any objects that it tracks
+    Restores `root_trackable` and any objects that it tracks
     (transitive). Either assigns values immediately if variables to restore have
     been created already, or defers restoration until the variables are
-    created. Dependencies added to the `root_checkpointable` passed to the
+    created. Dependencies added to the `root_trackable` passed to the
     constructor after this call will be matched if they have a corresponding
     object in the checkpoint.
 
@@ -1576,7 +1040,7 @@ class CheckpointableSaver(object):
       object is returned which runs restore ops from a name-based saver.
     """
     if save_path is None:
-      return InitializationOnlyStatus(self._root_checkpointable, ops.uid())
+      return InitializationOnlyStatus(self._graph_view, ops.uid())
     reader = pywrap_tensorflow.NewCheckpointReader(save_path)
     graph_building = not context.executing_eagerly()
     if graph_building:
@@ -1592,15 +1056,15 @@ class CheckpointableSaver(object):
       restore_coordinator = _NameBasedRestoreCoordinator(
           save_path=save_path, dtype_map=dtype_map)
       if not graph_building:
-        for existing_checkpointable in list_objects(self._root_checkpointable):
+        for existing_trackable in self._graph_view.list_objects():
           # pylint: disable=protected-access
-          existing_checkpointable._maybe_initialize_checkpointable()
-          existing_checkpointable._name_based_restores.add(restore_coordinator)
-          existing_checkpointable._name_based_attribute_restore(
+          existing_trackable._maybe_initialize_trackable()
+          existing_trackable._name_based_restores.add(restore_coordinator)
+          existing_trackable._name_based_attribute_restore(
               restore_coordinator)
           # pylint: enable=protected-access
       return NameBasedSaverStatus(
-          restore_coordinator, root_checkpointable=self._root_checkpointable)
+          restore_coordinator, graph_view=self._graph_view)
 
     if graph_building:
       if self._file_prefix_placeholder is None:
@@ -1613,25 +1077,25 @@ class CheckpointableSaver(object):
         file_prefix_tensor = constant_op.constant(save_path)
       file_prefix_feed_dict = None
     object_graph_proto = (
-        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+        trackable_object_graph_pb2.TrackableObjectGraph())
     object_graph_proto.ParseFromString(object_graph_string)
     checkpoint = _CheckpointRestoreCoordinator(
         object_graph_proto=object_graph_proto,
         save_path=save_path,
         save_path_tensor=file_prefix_tensor,
         restore_op_cache=self._restore_op_cache,
-        saveable_object_cache=self._saveable_object_cache)
-    base._CheckpointPosition(  # pylint: disable=protected-access
-        checkpoint=checkpoint, proto_id=0).restore(self._root_checkpointable)
+        graph_view=self._graph_view)
+    base.CheckpointPosition(checkpoint=checkpoint, proto_id=0).restore(
+        self._graph_view.root)
     load_status = CheckpointLoadStatus(
         checkpoint,
-        root_checkpointable=self._root_checkpointable,
+        graph_view=self._graph_view,
         feed_dict=file_prefix_feed_dict)
     return load_status
 
 
-def frozen_saver(root_checkpointable):
-  """Creates a static `tf.train.Saver` from a checkpointable object.
+def frozen_saver(root_trackable):
+  """Creates a static `tf.train.Saver` from a trackable object.
 
   The returned `Saver` saves object-based checkpoints, but these checkpoints
   will no longer reflect structural changes to the object graph, only changes to
@@ -1645,21 +1109,34 @@ def frozen_saver(root_checkpointable):
   object graph and the current Python object graph.
 
   Args:
-    root_checkpointable: A checkpointable object to save.
+    root_trackable: A trackable object to save.
 
   Returns:
-    A `tf.train.Saver` which saves object-based checkpoints for the object graph
-    frozen at the time `frozen_saver` was called.
+    A saver which saves object-based checkpoints for the object graph frozen at
+    the time `frozen_saver` was called.
   """
-  return CheckpointableSaver(root_checkpointable).freeze()
+  named_saveable_objects = graph_view_lib.ObjectGraphView(
+      root_trackable).frozen_saveable_objects()
+  return functional_saver.Saver(named_saveable_objects)
+
+
+def saver_with_op_caching(obj):
+  """A TrackableSaver with a SaveableObject cache when graph building."""
+  if context.executing_eagerly():
+    saveables_cache = None
+  else:
+    saveables_cache = object_identity.ObjectIdentityWeakKeyDictionary()
+  return TrackableSaver(graph_view_lib.ObjectGraphView(
+      weakref.ref(obj),
+      saveables_cache=saveables_cache))
 
 
 @tf_export("train.Checkpoint")
-class Checkpoint(tracking.AutoCheckpointable):
-  """Groups checkpointable objects, saving and restoring them.
+class Checkpoint(tracking.AutoTrackable):
+  """Groups trackable objects, saving and restoring them.
 
   `Checkpoint`'s constructor accepts keyword arguments whose values are types
-  that contain checkpointable state, such as `tf.train.Optimizer`
+  that contain trackable state, such as `tf.train.Optimizer`
   implementations, `tf.Variable`, `tf.keras.Layer` implementations, or
   `tf.keras.Model` implementations. It saves these values with a checkpoint, and
   maintains a `save_counter` for numbering checkpoints.
@@ -1751,23 +1228,23 @@ class Checkpoint(tracking.AutoCheckpointable):
 
     Args:
       **kwargs: Keyword arguments are set as attributes of this object, and are
-        saved with the checkpoint. Values must be checkpointable objects.
+        saved with the checkpoint. Values must be trackable objects.
     Raises:
-      ValueError: If objects in `kwargs` are not checkpointable.
+      ValueError: If objects in `kwargs` are not trackable.
     """
     super(Checkpoint, self).__init__()
     for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
-      if not isinstance(v, (base.Checkpointable, def_function.Function)):
+      if not isinstance(v, (base.Trackable, def_function.Function)):
         raise ValueError(
-            ("`Checkpoint` was expecting a checkpointable object (an object "
-             "derived from `CheckpointableBase`), got %s. If you believe this "
-             "object should be checkpointable (i.e. it is part of the "
+            ("`Checkpoint` was expecting a trackable object (an object "
+             "derived from `TrackableBase`), got %s. If you believe this "
+             "object should be trackable (i.e. it is part of the "
              "TensorFlow Python API and manages state), please open an issue.")
             % (v,))
       setattr(self, k, v)
     self._save_counter = None  # Created lazily for restore-on-create.
     self._save_assign_op = None
-    self._saver = CheckpointableSaver(weakref.ref(self))
+    self._saver = saver_with_op_caching(self)
 
   def _maybe_create_save_counter(self):
     """Create a save counter if it does not yet exist."""
@@ -1784,7 +1261,7 @@ class Checkpoint(tracking.AutoCheckpointable):
     """Writes a training checkpoint.
 
     The checkpoint includes variables created by this object and any
-    checkpointable objects it depends on at the time `Checkpoint.write()` is
+    trackable objects it depends on at the time `Checkpoint.write()` is
     called.
 
     `write` does not number checkpoints, increment `save_counter`, or update the
@@ -1802,9 +1279,18 @@ class Checkpoint(tracking.AutoCheckpointable):
     Returns:
       The full path to the checkpoint (i.e. `file_prefix`).
     """
-    return compat.as_str(self._saver.save(
+    output = self._saver.save(
         file_prefix=file_prefix,
-        session=session))
+        session=session)
+    if tensor_util.is_tensor(output):
+      if context.executing_eagerly():
+        return compat.as_str(output.numpy())
+      else:
+        # Function building
+        return output
+    else:
+      # Graph + Session, so we already session.ran it.
+      return compat.as_str(output)
 
   @property
   def save_counter(self):
@@ -1822,7 +1308,7 @@ class Checkpoint(tracking.AutoCheckpointable):
     """Saves a training checkpoint and provides basic checkpoint management.
 
     The saved checkpoint includes variables created by this object and any
-    checkpointable objects it depends on at the time `Checkpoint.save()` is
+    trackable objects it depends on at the time `Checkpoint.save()` is
     called.
 
     `save` is a basic convenience wrapper around the `write` method,
@@ -1845,6 +1331,14 @@ class Checkpoint(tracking.AutoCheckpointable):
     """
     graph_building = not context.executing_eagerly()
     if graph_building:
+      if ops.inside_function():
+        raise NotImplementedError(
+            "Calling tf.train.Checkpoint.save() from a function is not "
+            "supported, as save() modifies saving metadata in ways not "
+            "supported by TensorFlow Operations. Consider using "
+            "tf.train.Checkpoint.write(), a lower-level API which does not "
+            "update metadata. tf.train.latest_checkpoint and related APIs will "
+            "not see this checkpoint.")
       if session is None:
         session = ops.get_default_session()
       if self._save_counter is None:
@@ -1866,7 +1360,8 @@ class Checkpoint(tracking.AutoCheckpointable):
     checkpoint_management.update_checkpoint_state_internal(
         save_dir=os.path.dirname(file_prefix),
         model_checkpoint_path=file_path,
-        all_model_checkpoint_paths=[file_path])
+        all_model_checkpoint_paths=[file_path],
+        save_relative_paths=True)
     return file_path
 
   def restore(self, save_path):
@@ -1878,7 +1373,7 @@ class Checkpoint(tracking.AutoCheckpointable):
     restore have been created already, or defers restoration until the variables
     are created. Dependencies added after this call will be matched if they have
     a corresponding object in the checkpoint (the restore request will queue in
-    any checkpointable object waiting for the expected dependency to be added).
+    any trackable object waiting for the expected dependency to be added).
 
     When graph building, restoration ops are added to the graph but not run
     immediately.
diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/tracking/util_test.py
similarity index 81%
rename from tensorflow/python/training/checkpointable/util_test.py
rename to tensorflow/python/training/tracking/util_test.py
index cef1075e93ca8a2aecfb5af9362308dabb82e47c..e2878915f8e15d49162b1d97969496c794fc110d 100644
--- a/tensorflow/python/training/checkpointable/util_test.py
+++ b/tensorflow/python/training/tracking/util_test.py
@@ -46,16 +46,17 @@ from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
-class NonLayerCheckpointable(tracking.AutoCheckpointable):
+class NonLayerTrackable(tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
         self, name="a_variable", shape=[])
 
 
@@ -67,8 +68,8 @@ class MyModel(training.Model):
     super(MyModel, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
@@ -77,21 +78,30 @@ class MyModel(training.Model):
 
 class InterfaceTests(test.TestCase):
 
+  def testLayerDeduplication(self):
+    model = training.Model()
+    layer_one = core.Dense(1)
+    layer_two = core.Dense(1)
+    model.other_path = [layer_one, layer_two]
+    model.l2 = layer_two
+    model.l1 = layer_one
+    self.assertEqual([layer_one, layer_two], model.layers)
+
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testAddVariable(self):
-    obj = NonLayerCheckpointable()
+    obj = NonLayerTrackable()
     with self.assertRaisesRegexp(ValueError, "do not specify shape"):
-      checkpointable_utils.add_variable(
+      trackable_utils.add_variable(
           obj, name="shape_specified_twice", shape=[], initializer=1)
-    constant_initializer = checkpointable_utils.add_variable(
+    constant_initializer = trackable_utils.add_variable(
         obj, name="constant_initializer", initializer=1)
     with variable_scope.variable_scope("some_variable_scope"):
-      ones_initializer = checkpointable_utils.add_variable(
+      ones_initializer = trackable_utils.add_variable(
           obj,
           name="ones_initializer",
           shape=[2],
           initializer=init_ops.ones_initializer(dtype=dtypes.float32))
-    bare_initializer = checkpointable_utils.add_variable(
+    bare_initializer = trackable_utils.add_variable(
         obj,
         name="bare_initializer",
         shape=[2, 2],
@@ -102,12 +112,12 @@ class InterfaceTests(test.TestCase):
     # naming conflicts within an object.
     other_duplicate = resource_variable_ops.ResourceVariable(
         name="duplicate", initial_value=1.)
-    duplicate = checkpointable_utils.add_variable(
+    duplicate = trackable_utils.add_variable(
         obj, name="duplicate", shape=[])
     with self.assertRaisesRegexp(ValueError, "'duplicate'.*already declared"):
-      checkpointable_utils.add_variable(obj, name="duplicate", shape=[])
+      trackable_utils.add_variable(obj, name="duplicate", shape=[])
 
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
+    self.evaluate(trackable_utils.gather_initializers(obj))
     self.assertEqual("constant_initializer:0", constant_initializer.name)
     self.assertEqual(1, self.evaluate(constant_initializer))
     self.assertEqual("some_variable_scope/ones_initializer:0",
@@ -125,8 +135,8 @@ class InterfaceTests(test.TestCase):
       # The .name attribute may be globally influenced, but the checkpoint name
       # won't be (tested below).
       self.assertEqual("duplicate_1:0", duplicate.name)
-    named_variables, _, _ = checkpointable_utils._serialize_object_graph(
-        obj, saveables_cache=None)
+    named_variables, _, _ = (
+        graph_view.ObjectGraphView(obj).serialize_object_graph())
     expected_checkpoint_names = (
         "a_variable/.ATTRIBUTES/VARIABLE_VALUE",
         "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE",
@@ -139,20 +149,20 @@ class InterfaceTests(test.TestCase):
 
   def testInitNotCalled(self):
 
-    class NoInit(tracking.AutoCheckpointable):
+    class NoInit(tracking.AutoTrackable):
 
       def __init__(self):
         pass
 
-    # __init__ for Checkpointable will be called implicitly.
-    checkpointable_utils.add_variable(NoInit(), "var", shape=[])
+    # __init__ for Trackable will be called implicitly.
+    trackable_utils.add_variable(NoInit(), "var", shape=[])
 
   def testShapeDtype(self):
-    root = tracking.AutoCheckpointable()
-    v1 = checkpointable_utils.add_variable(
+    root = tracking.AutoTrackable()
+    v1 = trackable_utils.add_variable(
         root, name="v1", initializer=3., dtype=dtypes.float64)
     self.assertEqual(dtypes.float64, v1.dtype)
-    v2 = checkpointable_utils.add_variable(
+    v2 = trackable_utils.add_variable(
         root,
         name="v2",
         shape=[3],
@@ -166,34 +176,34 @@ class InterfaceTests(test.TestCase):
       checkpoint_directory = self.get_temp_dir()
       checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
       dense = core.Dense(1)
-      checkpoint = checkpointable_utils.Checkpoint(dense=dense)
+      checkpoint = trackable_utils.Checkpoint(dense=dense)
       dense(constant_op.constant([[1.]]))
       save_path = checkpoint.save(checkpoint_prefix)
 
-    objects = checkpointable_utils.object_metadata(save_path)
+    objects = trackable_utils.object_metadata(save_path)
     all_variable_names = []
     for obj in objects.nodes:
       for attribute in obj.attributes:
         all_variable_names.append(attribute.full_name)
     self.assertIn("dense/kernel", all_variable_names)
 
-  def testNotCheckpointable(self):
+  def testNotTrackable(self):
 
     class CallsFunctionalStuff(
-        tracking.NotCheckpointable, tracking.AutoCheckpointable):
+        tracking.NotTrackable, tracking.AutoTrackable):
       pass
 
     test_dir = self.get_temp_dir()
     prefix = os.path.join(test_dir, "ckpt")
-    checkpoint = checkpointable_utils.Checkpoint(x=CallsFunctionalStuff())
+    checkpoint = trackable_utils.Checkpoint(x=CallsFunctionalStuff())
     with self.assertRaises(NotImplementedError):
       checkpoint.save(prefix)
 
     class CallsFunctionalStuffOtherMRO(
-        tracking.AutoCheckpointable, tracking.NotCheckpointable):
+        tracking.AutoTrackable, tracking.NotTrackable):
       pass
 
-    checkpoint_reversed = checkpointable_utils.Checkpoint(
+    checkpoint_reversed = trackable_utils.Checkpoint(
         x=CallsFunctionalStuffOtherMRO())
     with self.assertRaises(NotImplementedError):
       checkpoint_reversed.save(prefix)
@@ -220,8 +230,8 @@ class _MirroringSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
         self._mirrored_variable.assign(tensor))
 
 
-class _OwnsMirroredVariables(base.Checkpointable):
-  """A Checkpointable object which returns a more complex SaveableObject."""
+class _OwnsMirroredVariables(base.Trackable):
+  """A Trackable object which returns a more complex SaveableObject."""
 
   def __init__(self):
     self.non_dep_variable = variable_scope.get_variable(
@@ -254,7 +264,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     other_model = MyModel()
     optimizer = adam.Adam(0.001)
     step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model, step=step)
 
     with backprop.GradientTape() as tape:
@@ -271,12 +281,11 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     gradients = tape.gradient(loss, variables)
     optimizer.apply_gradients(zip(gradients, variables))
 
-    self.evaluate(checkpointable_utils.gather_initializers(
-        root_checkpointable))
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
     self.evaluate(train_op)
-    named_variables, serialized_graph, _ = (
-        checkpointable_utils._serialize_object_graph(
-            root_checkpointable, saveables_cache=None))
+    named_variables, serialized_graph, _ = graph_view.ObjectGraphView(
+        root_trackable).serialize_object_graph()
     expected_slot_keys = (
         "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
         "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
@@ -352,7 +361,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testMoreComplexSaveableReturned(self):
     v = _OwnsMirroredVariables()
-    checkpoint = checkpointable_utils.Checkpoint(v=v)
+    checkpoint = trackable_utils.Checkpoint(v=v)
     test_dir = self.get_temp_dir()
     prefix = os.path.join(test_dir, "ckpt")
     self.evaluate(v.non_dep_variable.assign(42.))
@@ -389,7 +398,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testSaveRestore(self):
     model = MyModel()
     optimizer = adam.Adam(0.001)
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model)
     input_value = constant_op.constant([[3.]])
     with backprop.GradientTape() as tape:
@@ -397,31 +406,31 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     variables = model.trainable_variables
     gradients = tape.gradient(loss, variables)
     train_op = optimizer.apply_gradients(zip(gradients, variables))
-    root_checkpointable.save_counter  # pylint: disable=pointless-statement
-    self.evaluate(checkpointable_utils.gather_initializers(
-        root_checkpointable))
+    root_trackable.save_counter  # pylint: disable=pointless-statement
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
     self.evaluate(train_op)
     prefix = os.path.join(self.get_temp_dir(), "ckpt")
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
     m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
     self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
-    save_path = root_checkpointable.save(file_prefix=prefix)
+    save_path = root_trackable.save(file_prefix=prefix)
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
-    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    self.evaluate(state_ops.assign(root_trackable.save_counter, 3))
     optimizer_variables = self.evaluate(
         sorted(optimizer.variables(), key=lambda v: v.name))
     self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
     # Immediate restoration
-    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status = root_trackable.restore(save_path=save_path).assert_consumed()
     status.run_restore_ops()
     self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
-    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual(1, self.evaluate(root_trackable.save_counter))
     self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
     if not context.executing_eagerly():
       return  # Restore-on-create is only supported when executing eagerly
     on_create_model = MyModel()
     on_create_optimizer = adam.Adam(0.001)
-    on_create_root = checkpointable_utils.Checkpoint(
+    on_create_root = trackable_utils.Checkpoint(
         optimizer=on_create_optimizer, model=on_create_model)
     # Deferred restoration
     status = on_create_root.restore(save_path=save_path)
@@ -462,7 +471,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     for training_continuation in range(3):
       model = MyModel()
       optimizer = adam.Adam(0.001)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model)
       root.restore(checkpoint_management.latest_checkpoint(
           checkpoint_directory))
@@ -488,7 +497,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
         with ops.Graph().as_default():
           model = MyModel()
           optimizer = adam.Adam(0.001)
-          root = checkpointable_utils.Checkpoint(
+          root = trackable_utils.Checkpoint(
               optimizer=optimizer, model=model)
           input_value = constant_op.constant([[3.]])
           with backprop.GradientTape() as tape:
@@ -535,7 +544,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       with test_util.device(use_gpu=True):
         model = MyModel()
         optimizer = adam.Adam(0.001)
-        root = checkpointable_utils.Checkpoint(
+        root = trackable_utils.Checkpoint(
             optimizer=optimizer, model=model)
         manager = checkpoint_management.CheckpointManager(
             root, checkpoint_directory, max_to_keep=1)
@@ -560,12 +569,12 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       directory = self.get_temp_dir()
       prefix = os.path.join(directory, "ckpt")
       v = resource_variable_ops.ResourceVariable(0, dtype=dtypes.int64)
-      checkpoint = checkpointable_utils.Checkpoint(v=v)
+      checkpoint = trackable_utils.Checkpoint(v=v)
       self.evaluate(v.assign(3))
       # Create the save counter so assert_consumed doesn't complain about it not
       # existing in the checkpoint on restore.
       self.evaluate(checkpoint.save_counter.assign(12))
-      saver = checkpointable_utils.frozen_saver(checkpoint)
+      saver = trackable_utils.frozen_saver(checkpoint)
       with ops.device("cpu:0"):
         prefix_tensor = constant_op.constant(prefix)
       save_path = self.evaluate(saver.save(prefix_tensor))
@@ -577,14 +586,14 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       # Restore using another frozen saver on an identical object graph
       del v, checkpoint, saver
       v = resource_variable_ops.ResourceVariable(0, dtype=dtypes.int64)
-      checkpoint = checkpointable_utils.Checkpoint(v=v)
-      saver = checkpointable_utils.frozen_saver(checkpoint)
+      checkpoint = trackable_utils.Checkpoint(v=v)
+      saver = trackable_utils.frozen_saver(checkpoint)
       self.evaluate(saver.restore(prefix_tensor))
       self.assertEqual(3, self.evaluate(v))
 
       # Restore as an object-based checkpoint
       del v, checkpoint, saver
-      checkpoint = checkpointable_utils.Checkpoint()
+      checkpoint = trackable_utils.Checkpoint()
       status = checkpoint.restore(save_path)
       v = resource_variable_ops.ResourceVariable(0, dtype=dtypes.int64)
       if context.executing_eagerly():
@@ -600,7 +609,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     directory = self.get_temp_dir()
     prefix = os.path.join(directory, "ckpt")
     step = resource_variable_ops.ResourceVariable(0, dtype=dtypes.int64)
-    checkpoint = checkpointable_utils.Checkpoint(step=step)
+    checkpoint = trackable_utils.Checkpoint(step=step)
     self.evaluate(step.initializer)
     for i in range(5):
       path = checkpoint.write("%s-%d" % (prefix, self.evaluate(step)))
@@ -621,7 +630,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
         model = MyModel()
         # Don't actually train so we can test variable values
         optimizer = adam.Adam(0.)
-        root = checkpointable_utils.Checkpoint(
+        root = trackable_utils.Checkpoint(
             optimizer=optimizer, model=model)
         checkpoint_path = checkpoint_management.latest_checkpoint(
             checkpoint_directory)
@@ -653,11 +662,11 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   # pylint: enable=cell-var-from-loop
 
   def _get_checkpoint_name(self, name):
-    root = tracking.AutoCheckpointable()
-    checkpointable_utils.add_variable(
+    root = tracking.AutoTrackable()
+    trackable_utils.add_variable(
         root, name=name, shape=[1, 2], dtype=dtypes.float64)
-    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
-        root, saveables_cache=None)
+    (named_variable,), _, _ = graph_view.ObjectGraphView(
+        root).serialize_object_graph()
     with ops.name_scope("root/" + named_variable.name):
       pass  # Make sure we can use this as an op name if we prefix it.
     return named_variable.name
@@ -674,23 +683,23 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testNumberedPath(self):
-    root = tracking.AutoCheckpointable()
-    leaf = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
+    leaf = tracking.AutoTrackable()
     root.leaf = leaf
-    checkpointable_utils.add_variable(leaf, name="v", shape=[])
-    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
-        root, saveables_cache=None)
+    trackable_utils.add_variable(leaf, name="v", shape=[])
+    (named_variable,), _, _ = graph_view.ObjectGraphView(
+        root).serialize_object_graph()
     self.assertEqual(r"leaf/v/.ATTRIBUTES/VARIABLE_VALUE", named_variable.name)
 
   @test_util.run_in_graph_and_eager_modes
   def testLocalNameValidation(self):
-    root = tracking.AutoCheckpointable()
-    leaf = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
+    leaf = tracking.AutoTrackable()
     # Dots are escaped, which avoids conflicts with reserved names.
-    root._track_checkpointable(leaf, name=".ATTRIBUTES")
-    checkpointable_utils.add_variable(checkpointable=leaf, name="a", shape=[])
-    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
-        root, saveables_cache=None)
+    root._track_trackable(leaf, name=".ATTRIBUTES")
+    trackable_utils.add_variable(trackable=leaf, name="a", shape=[])
+    (named_variable,), _, _ = graph_view.ObjectGraphView(
+        root).serialize_object_graph()
     self.assertEqual("..ATTRIBUTES/a/.ATTRIBUTES/VARIABLE_VALUE",
                      named_variable.name)
 
@@ -712,7 +721,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       optimizer = adam.Adam(learning_rate=0.05)
       checkpoint_directory = self.get_temp_dir()
       checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-      checkpoint = checkpointable_utils.Checkpoint(
+      checkpoint = trackable_utils.Checkpoint(
           model=model, optimizer=optimizer)
       for _ in range(2):
         checkpoint.save(checkpoint_prefix)
@@ -726,13 +735,13 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testLateDependencyTracking(self):
 
-    class Dependency(tracking.AutoCheckpointable):
+    class Dependency(tracking.AutoTrackable):
 
       def build(self):
-        self.var = checkpointable_utils.add_variable(
+        self.var = trackable_utils.add_variable(
             self, "var", initializer=0.)
 
-    class LateDependencies(tracking.AutoCheckpointable):
+    class LateDependencies(trackable_utils.Checkpoint):
 
       def add_dep(self):
         self.dep = Dependency()
@@ -743,11 +752,9 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     self.evaluate(state_ops.assign(original.dep.var, 123.))
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = checkpointable_utils.CheckpointableSaver(
-        original).save(checkpoint_prefix)
+    save_path = original.save(checkpoint_prefix)
     load_into = LateDependencies()
-    status = checkpointable_utils.CheckpointableSaver(
-        load_into).restore(save_path)
+    status = load_into.restore(save_path)
     status.assert_existing_objects_matched()
     with self.assertRaises(AssertionError):
       status.assert_consumed()
@@ -759,13 +766,13 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testDepAfterVar(self):
 
-    class Dependency(tracking.AutoCheckpointable):
+    class Dependency(tracking.AutoTrackable):
 
       def build(self):
-        self.var = checkpointable_utils.add_variable(
+        self.var = trackable_utils.add_variable(
             self, "var", initializer=0.)
 
-    class DepAfterVar(tracking.AutoCheckpointable):
+    class DepAfterVar(trackable_utils.Checkpoint):
 
       def add_dep(self):
         dep = Dependency()
@@ -777,12 +784,10 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     self.evaluate(state_ops.assign(dep_after_var.dep.var, -14.))
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = checkpointable_utils.CheckpointableSaver(dep_after_var).save(
-        checkpoint_prefix)
+    save_path = dep_after_var.save(checkpoint_prefix)
 
     loaded_dep_after_var = DepAfterVar()
-    status = checkpointable_utils.CheckpointableSaver(
-        loaded_dep_after_var).restore(save_path)
+    status = loaded_dep_after_var.restore(save_path)
     loaded_dep_after_var.add_dep()
     status.assert_consumed()
     status.run_restore_ops()
@@ -792,8 +797,8 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
-    root = tracking.AutoCheckpointable()
-    root.var = checkpointable_utils.add_variable(
+    root = trackable_utils.Checkpoint()
+    root.var = trackable_utils.add_variable(
         root, name="var", initializer=0.)
     optimizer = adam.Adam(0.1)
     variables = [root.var]
@@ -802,29 +807,25 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     # Note that `optimizer` has not been added as a dependency of
     # `root`. Create a one-off grouping so that slot variables for `root.var`
     # get initialized too.
-    self.evaluate(checkpointable_utils.gather_initializers(
-        checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
+    self.evaluate(trackable_utils.gather_initializers(
+        trackable_utils.Checkpoint(root=root, optimizer=optimizer)))
     self.evaluate(train_op)
     self.evaluate(state_ops.assign(root.var, 12.))
-    no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "no_slots"))
+    no_slots_path = root.save(os.path.join(checkpoint_directory, "no_slots"))
     root.optimizer = optimizer
     self.evaluate(state_ops.assign(root.var, 13.))
     self.evaluate(state_ops.assign(
         optimizer.get_slot(slot_name="m", var=root.var),
         14.))
-    slots_path = checkpointable_utils.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "with_slots"))
-    new_root = tracking.AutoCheckpointable()
+    slots_path = root.save(os.path.join(checkpoint_directory, "with_slots"))
+    new_root = trackable_utils.Checkpoint()
     # Load the slot-containing checkpoint (deferred), then immediately overwrite
     # the non-slot variable (also deferred).
-    slot_status = checkpointable_utils.CheckpointableSaver(
-        new_root).restore(slots_path)
-    no_slot_status = checkpointable_utils.CheckpointableSaver(
-        new_root).restore(no_slots_path)
+    slot_status = new_root.restore(slots_path)
+    no_slot_status = new_root.restore(no_slots_path)
     with self.assertRaises(AssertionError):
       no_slot_status.assert_consumed()
-    new_root.var = checkpointable_utils.add_variable(
+    new_root.var = trackable_utils.add_variable(
         new_root, name="var", shape=[])
     no_slot_status.assert_consumed()
     no_slot_status.run_restore_ops()
@@ -861,24 +862,21 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testOverlappingRestores(self):
     checkpoint_directory = self.get_temp_dir()
-    save_root = tracking.AutoCheckpointable()
-    save_root.dep = tracking.AutoCheckpointable()
-    save_root.dep.var = checkpointable_utils.add_variable(
+    save_root = trackable_utils.Checkpoint()
+    save_root.dep = tracking.AutoTrackable()
+    save_root.dep.var = trackable_utils.add_variable(
         save_root.dep, name="var", initializer=0.)
     self.evaluate(state_ops.assign(save_root.dep.var, 12.))
-    saver = checkpointable_utils.CheckpointableSaver(save_root)
-    first_path = saver.save(os.path.join(checkpoint_directory, "first"))
+    first_path = save_root.save(os.path.join(checkpoint_directory, "first"))
     self.evaluate(state_ops.assign(save_root.dep.var, 13.))
-    second_path = saver.save(os.path.join(checkpoint_directory, "second"))
-
-    first_root = tracking.AutoCheckpointable()
-    second_root = tracking.AutoCheckpointable()
-    first_status = checkpointable_utils.CheckpointableSaver(
-        first_root).restore(first_path)
-    second_status = checkpointable_utils.CheckpointableSaver(
-        second_root).restore(second_path)
-    load_dep = tracking.AutoCheckpointable()
-    load_dep.var = checkpointable_utils.add_variable(
+    second_path = save_root.save(os.path.join(checkpoint_directory, "second"))
+
+    first_root = trackable_utils.Checkpoint()
+    second_root = trackable_utils.Checkpoint()
+    first_status = first_root.restore(first_path)
+    second_status = second_root.restore(second_path)
+    load_dep = tracking.AutoTrackable()
+    load_dep.var = trackable_utils.add_variable(
         load_dep, name="var", shape=[])
     first_root.dep = load_dep
     first_status.assert_consumed()
@@ -891,14 +889,12 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
 
     # Try again with the order of the restore() reversed. The last restore
     # determines the final value.
-    first_root = tracking.AutoCheckpointable()
-    second_root = tracking.AutoCheckpointable()
-    second_status = checkpointable_utils.CheckpointableSaver(
-        second_root).restore(second_path)
-    first_status = checkpointable_utils.CheckpointableSaver(
-        first_root).restore(first_path)
-    load_dep = tracking.AutoCheckpointable()
-    load_dep.var = checkpointable_utils.add_variable(
+    first_root = trackable_utils.Checkpoint()
+    second_root = trackable_utils.Checkpoint()
+    second_status = second_root.restore(second_path)
+    first_status = first_root.restore(first_path)
+    load_dep = tracking.AutoTrackable()
+    load_dep.var = trackable_utils.add_variable(
         load_dep, name="var", shape=[])
     first_root.dep = load_dep
     first_status.assert_consumed()
@@ -913,24 +909,22 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testAmbiguousLoad(self):
     # Not OK to split one checkpoint object into two
     checkpoint_directory = self.get_temp_dir()
-    save_root = tracking.AutoCheckpointable()
-    save_root.dep_one = tracking.AutoCheckpointable()
-    save_root.dep_two = tracking.AutoCheckpointable()
-    dep_three = tracking.AutoCheckpointable()
+    save_root = trackable_utils.Checkpoint()
+    save_root.dep_one = tracking.AutoTrackable()
+    save_root.dep_two = tracking.AutoTrackable()
+    dep_three = tracking.AutoTrackable()
     save_root.dep_one.dep_three = dep_three
     save_root.dep_two.dep_three = dep_three
-    checkpointable_utils.add_variable(dep_three, name="var", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(save_root))
-    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
-        os.path.join(checkpoint_directory, "ckpt"))
-    load_root = tracking.AutoCheckpointable()
-    status = checkpointable_utils.CheckpointableSaver(load_root).restore(
-        save_path)
-    load_root.dep_one = tracking.AutoCheckpointable()
-    load_root.dep_two = tracking.AutoCheckpointable()
-    load_root.dep_one.dep_three = tracking.AutoCheckpointable()
-    load_root.dep_two.dep_three = tracking.AutoCheckpointable()
-    checkpointable_utils.add_variable(
+    trackable_utils.add_variable(dep_three, name="var", initializer=0.)
+    self.evaluate(trackable_utils.gather_initializers(save_root))
+    save_path = save_root.save(os.path.join(checkpoint_directory, "ckpt"))
+    load_root = trackable_utils.Checkpoint()
+    status = load_root.restore(save_path)
+    load_root.dep_one = tracking.AutoTrackable()
+    load_root.dep_two = tracking.AutoTrackable()
+    load_root.dep_one.dep_three = tracking.AutoTrackable()
+    load_root.dep_two.dep_three = tracking.AutoTrackable()
+    trackable_utils.add_variable(
         load_root.dep_one.dep_three, name="var", initializer=0.)
     with self.assertRaises(AssertionError):
       status.assert_consumed()
@@ -941,24 +935,23 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testObjectsCombined(self):
     # Currently fine to load two checkpoint objects into one Python object
     checkpoint_directory = self.get_temp_dir()
-    save_root = tracking.AutoCheckpointable()
-    save_root.dep_one = tracking.AutoCheckpointable()
-    save_root.dep_two = tracking.AutoCheckpointable()
-    checkpointable_utils.add_variable(
+    save_root = trackable_utils.Checkpoint()
+    save_root.dep_one = tracking.AutoTrackable()
+    save_root.dep_two = tracking.AutoTrackable()
+    trackable_utils.add_variable(
         save_root.dep_one, name="var1", initializer=32., dtype=dtypes.float64)
-    checkpointable_utils.add_variable(
+    trackable_utils.add_variable(
         save_root.dep_two, name="var2", initializer=64., dtype=dtypes.float64)
-    self.evaluate(checkpointable_utils.gather_initializers(save_root))
-    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
-        os.path.join(checkpoint_directory, "ckpt"))
-    load_root = tracking.AutoCheckpointable()
-    load_root.dep_one = tracking.AutoCheckpointable()
+    self.evaluate(trackable_utils.gather_initializers(save_root))
+    save_path = save_root.save(os.path.join(checkpoint_directory, "ckpt"))
+    load_root = trackable_utils.Checkpoint()
+    load_root.dep_one = tracking.AutoTrackable()
     load_root.dep_two = load_root.dep_one
-    v1 = checkpointable_utils.add_variable(
+    v1 = trackable_utils.add_variable(
         load_root.dep_one, name="var1", shape=[], dtype=dtypes.float64)
-    v2 = checkpointable_utils.add_variable(
+    v2 = trackable_utils.add_variable(
         load_root.dep_one, name="var2", shape=[], dtype=dtypes.float64)
-    status = checkpointable_utils.CheckpointableSaver(load_root).restore(
+    status = load_root.restore(
         save_path).assert_consumed().assert_existing_objects_matched()
     status.run_restore_ops()
     self.assertEqual(32., self.evaluate(v1))
@@ -968,31 +961,29 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testDependencyLoop(self):
     # Note: this test creates garbage during eager execution because it
     # purposefully creates a reference cycle.
-    first = tracking.AutoCheckpointable()
-    second = tracking.AutoCheckpointable()
+    first = trackable_utils.Checkpoint()
+    second = trackable_utils.Checkpoint()
     first.second = second
     second.first = first
-    first.v = checkpointable_utils.add_variable(
+    first.v = trackable_utils.add_variable(
         first, "v1", initializer=[3., 1., 4.])
-    second.v = checkpointable_utils.add_variable(
+    second.v = trackable_utils.add_variable(
         second, "v2", initializer=[1., 1., 2., 3.])
-    self.evaluate(checkpointable_utils.gather_initializers(first))
+    self.evaluate(trackable_utils.gather_initializers(first))
     checkpoint_directory = self.get_temp_dir()
-    save_path = checkpointable_utils.CheckpointableSaver(first).save(
-        os.path.join(checkpoint_directory, "ckpt"))
+    save_path = first.save(os.path.join(checkpoint_directory, "ckpt"))
 
     # Test deferred loading
-    first_load = tracking.AutoCheckpointable()
-    status = checkpointable_utils.CheckpointableSaver(
-        first_load).restore(save_path)
-    second_load = tracking.AutoCheckpointable()
+    first_load = trackable_utils.Checkpoint()
+    status = first_load.restore(save_path)
+    second_load = tracking.AutoTrackable()
     first_load.second = second_load
     second_load.first = first_load
     with self.assertRaises(AssertionError):
       status.assert_consumed()
-    first_load.v = checkpointable_utils.add_variable(
+    first_load.v = trackable_utils.add_variable(
         first_load, "v1", shape=[3])
-    second_load.v = checkpointable_utils.add_variable(
+    second_load.v = trackable_utils.add_variable(
         second_load, "v2", shape=[4])
     status.assert_consumed()
     status.run_restore_ops()
@@ -1004,8 +995,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     self.assertAllEqual([2., 7., 1.], self.evaluate(first_load.v))
     self.evaluate(second_load.v.assign([2., 7., 1., 8.]))
     self.assertAllEqual([2., 7., 1., 8.], self.evaluate(second_load.v))
-    status = checkpointable_utils.CheckpointableSaver(first_load).restore(
-        save_path).assert_consumed()
+    status = first_load.restore(save_path).assert_consumed()
     status.run_restore_ops()
     self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
     self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
@@ -1014,18 +1004,16 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testRestoreOnAssign(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    first = tracking.AutoCheckpointable()
+    first = trackable_utils.Checkpoint()
     first.var1 = variables_lib.Variable(0., name="outside_var")
     first.var2 = variables_lib.Variable(0., name="blah")
     self.evaluate(first.var1.assign(4.))
     self.evaluate(first.var2.assign(8.))
-    save_path = checkpointable_utils.CheckpointableSaver(first).save(
-        checkpoint_prefix)
+    save_path = first.save(checkpoint_prefix)
 
-    second = tracking.AutoCheckpointable()
+    second = trackable_utils.Checkpoint()
     second.var2 = variables_lib.Variable(0., name="blah")
-    status = checkpointable_utils.CheckpointableSaver(
-        second).restore(save_path)
+    status = second.restore(save_path)
     recreated_var1 = variables_lib.Variable(0., name="outside_var")
     status.run_restore_ops()
     self.assertEqual(8., self.evaluate(second.var2))
@@ -1042,27 +1030,26 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.AutoCheckpointable()
+        obj = trackable_utils.Checkpoint()
         obj.var = variables_lib.Variable(0., name="v")
         obj.opt = adam.Adam(0.1)
         variables = [obj.var]
         gradients = [1.]
         obj.opt.apply_gradients(zip(gradients, variables))
-        self.evaluate(checkpointable_utils.gather_initializers(obj))
-        saver = checkpointable_utils.CheckpointableSaver(obj)
-        saver.save(checkpoint_prefix)
+        self.evaluate(trackable_utils.gather_initializers(obj))
+        obj.save(checkpoint_prefix)
         graph.finalize()
-        saver.save(checkpoint_prefix)
+        obj.save(checkpoint_prefix)
 
   @test_util.run_in_graph_and_eager_modes
   def testCheckpointState(self):
     # No checkpoints are deleted by default
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    obj = tracking.AutoCheckpointable()
+    obj = tracking.AutoTrackable()
     obj.var = variable_scope.get_variable(name="v", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
-    saver = checkpointable_utils.Checkpoint(obj=obj)
+    self.evaluate(trackable_utils.gather_initializers(obj))
+    saver = trackable_utils.Checkpoint(obj=obj)
     for _ in range(10):
       saver.save(checkpoint_prefix)
     expected_filenames = ["checkpoint"]
@@ -1079,10 +1066,10 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testCheckpointStateChangingVarList(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    obj = tracking.AutoCheckpointable()
+    obj = tracking.AutoTrackable()
     obj.var = variable_scope.get_variable(name="v", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
-    checkpoint = checkpointable_utils.Checkpoint(obj=obj)
+    self.evaluate(trackable_utils.gather_initializers(obj))
+    checkpoint = trackable_utils.Checkpoint(obj=obj)
     looped_variables = []
     for iteration in range(10):
       new_variable = resource_variable_ops.ResourceVariable(iteration)
@@ -1132,23 +1119,22 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.AutoCheckpointable()
+        obj = trackable_utils.Checkpoint()
         obj.var = variables_lib.Variable(0., name="v")
         obj.opt = adam.Adam(0.1)
         variables = [obj.var]
         gradients = [1.]
         obj.opt.apply_gradients(zip(gradients, variables))
-        self.evaluate(checkpointable_utils.gather_initializers(obj))
-        saver = checkpointable_utils.CheckpointableSaver(obj)
-        save_path = saver.save(checkpoint_prefix)
-        saver.restore(save_path)
+        self.evaluate(trackable_utils.gather_initializers(obj))
+        save_path = obj.save(checkpoint_prefix)
+        obj.restore(save_path)
         graph.finalize()
-        saver.restore(save_path)
+        obj.restore(save_path)
 
   @test_util.run_in_graph_and_eager_modes
   def test_sequential(self):
     model = sequential.Sequential()
-    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    checkpoint = trackable_utils.Checkpoint(model=model)
     model.add(core.Dense(4))
     second_dense = core.Dense(5)
     model.add(second_dense)
@@ -1165,7 +1151,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     self.assertAllEqual([1., 2., 3., 4., 5.], self.evaluate(second_dense.bias))
 
     deferred_sequential = sequential.Sequential()
-    deferred_sequential_checkpoint = checkpointable_utils.Checkpoint(
+    deferred_sequential_checkpoint = trackable_utils.Checkpoint(
         model=deferred_sequential)
     status = deferred_sequential_checkpoint.restore(save_path)
     deferred_sequential.add(core.Dense(4))
@@ -1185,9 +1171,9 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.Adam(0.001)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           model=model)  # Do not save the optimizer with the checkpoint.
-      optimizer_checkpoint = checkpointable_utils.Checkpoint(
+      optimizer_checkpoint = trackable_utils.Checkpoint(
           optimizer=optimizer)
 
       checkpoint_path = checkpoint_management.latest_checkpoint(
@@ -1221,7 +1207,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.Adam(0.001)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model)
       status = root.restore(save_path=model_save_path)
       input_value = constant_op.constant([[3.]])
@@ -1245,9 +1231,9 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.Adam(0.001, beta_1=1.0)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model)
-      opt_root = checkpointable_utils.Checkpoint(
+      opt_root = trackable_utils.Checkpoint(
           optimizer=optimizer)
       status = root.restore(save_path=model_save_path)
       init_only_optimizer_status = opt_root.restore(save_path=None)
@@ -1268,9 +1254,9 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       self.assertEqual(42., self.evaluate(optimizer.beta_1))
 
   @test_util.run_in_graph_and_eager_modes
-  def test_restore_after_adding_empty_checkpointable_data_structure(self):
-    model = NonLayerCheckpointable()
-    checkpoint = checkpointable_utils.Checkpoint(model=model)
+  def test_restore_after_adding_empty_trackable_data_structure(self):
+    model = NonLayerTrackable()
+    checkpoint = trackable_utils.Checkpoint(model=model)
     checkpoint.restore(None).initialize_or_restore()
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
@@ -1278,20 +1264,43 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
 
     del model, checkpoint
 
-    model = NonLayerCheckpointable()
+    model = NonLayerTrackable()
     model.dict = {"a": 1}
     model.list = {"b": 1}
-    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    checkpoint = trackable_utils.Checkpoint(model=model)
     load_status = checkpoint.restore(save_path)
     load_status.assert_existing_objects_matched().run_restore_ops()
 
-
-class _ManualScope(tracking.AutoCheckpointable):
+  @test_util.run_in_graph_and_eager_modes
+  def test_write_checkpoint_from_function(self):
+    checkpoint_prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    save_checkpoint = trackable_utils.Checkpoint(
+        v=variables_lib.Variable(1.))
+
+    @def_function.function
+    def _write_checkpoint():
+      save_path = save_checkpoint.write(checkpoint_prefix)
+      return save_path
+
+    self.evaluate([save_checkpoint.v.initializer])
+    self.evaluate(_write_checkpoint())
+    load_checkpoint = trackable_utils.Checkpoint(
+        v=variables_lib.Variable(0.))
+    load_checkpoint.restore(checkpoint_prefix).run_restore_ops()
+    self.assertEqual(1., self.evaluate(load_checkpoint.v))
+    self.evaluate(save_checkpoint.v.assign(3.))
+    self.evaluate(_write_checkpoint())
+    self.evaluate(save_checkpoint.v.assign(0.))
+    load_checkpoint.restore(checkpoint_prefix).run_restore_ops()
+    self.assertEqual(3., self.evaluate(load_checkpoint.v))
+
+
+class _ManualScope(tracking.AutoTrackable):
 
   def __call__(self):
     with variable_scope.variable_scope("ManualScope") as vs:
       self.variable_scope = vs
-      with checkpointable_utils.capture_dependencies(template=self):
+      with trackable_utils.capture_dependencies(template=self):
         return self._build()
 
   def _build(self):
@@ -1301,7 +1310,7 @@ class _ManualScope(tracking.AutoCheckpointable):
 class TemplateTests(parameterized.TestCase, test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
-  def test_checkpointable_save_restore(self):
+  def test_trackable_save_restore(self):
 
     def _templated():
       v = variable_scope.get_variable(
@@ -1318,12 +1327,12 @@ class TemplateTests(parameterized.TestCase, test.TestCase):
     six.assertCountEqual(
         self,
         [v1_save, v2_save, manual_scope, manual_scope_v, save_template],
-        checkpointable_utils.list_objects(save_template))
+        trackable_utils.list_objects(save_template))
     manual_dep, = manual_scope._checkpoint_dependencies
     self.assertEqual("in_manual_scope", manual_dep.name)
     self.assertIs(manual_scope_v, manual_dep.ref)
     optimizer = adam.Adam(0.0)
-    save_root = checkpointable_utils.Checkpoint(
+    save_root = trackable_utils.Checkpoint(
         my_template=save_template, optimizer=optimizer)
     optimizer.minimize(v1_save.read_value,
                        var_list=[v1_save])
@@ -1337,7 +1346,7 @@ class TemplateTests(parameterized.TestCase, test.TestCase):
 
     load_template = template.make_template("s2", _templated)
     load_optimizer = adam.Adam(0.0)
-    load_root = checkpointable_utils.Checkpoint(
+    load_root = trackable_utils.Checkpoint(
         my_template=load_template, optimizer=load_optimizer)
     status = load_root.restore(save_path)
     var, var_plus_one, var2, _, _ = load_template()
@@ -1353,7 +1362,7 @@ class TemplateTests(parameterized.TestCase, test.TestCase):
     self.assertAllEqual([14.], self.evaluate(var2))
 
   @test_util.run_in_graph_and_eager_modes
-  def test_checkpointable_save_restore_nested(self):
+  def test_trackable_save_restore_nested(self):
 
     def _inner_template():
       v = variable_scope.get_variable(
@@ -1370,7 +1379,7 @@ class TemplateTests(parameterized.TestCase, test.TestCase):
 
     with variable_scope.variable_scope("ignored"):
       save_template = template.make_template("s1", _outer_template)
-      save_root = checkpointable_utils.Checkpoint(my_template=save_template)
+      save_root = trackable_utils.Checkpoint(my_template=save_template)
       (inner_template_one, inner_template_two), _ = save_template()
     self.evaluate(inner_template_one.variables[0].assign([20.]))
     self.evaluate(inner_template_two.variables[0].assign([25.]))
@@ -1379,7 +1388,7 @@ class TemplateTests(parameterized.TestCase, test.TestCase):
     save_path = save_root.save(checkpoint_prefix)
 
     load_template = template.make_template("s2", _outer_template)
-    load_root = checkpointable_utils.Checkpoint(my_template=load_template)
+    load_root = trackable_utils.Checkpoint(my_template=load_template)
     status = load_root.restore(save_path)
     (inner_template_one, inner_template_two), (v1, v2, v3) = load_template()
     outer_template_dependencies = load_root.my_template._checkpoint_dependencies
@@ -1404,15 +1413,15 @@ class CheckpointCompatibilityTests(test.TestCase):
     input_value = constant_op.constant([[3.]])
     model = MyModel()
     optimizer = adam.Adam(0.001)
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model)
     with backprop.GradientTape() as tape:
       loss = model(input_value)
     variables = model.trainable_variables
     gradients = tape.gradient(loss, variables)
     train_op = optimizer.apply_gradients(zip(gradients, variables))
-    self.evaluate(checkpointable_utils.gather_initializers(
-        root_checkpointable))
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
     self.evaluate(train_op)
     # A regular variable, a slot variable, and a non-slot Optimizer variable
     # with known values to check when loading.
@@ -1420,24 +1429,24 @@ class CheckpointCompatibilityTests(test.TestCase):
     self.evaluate(optimizer.get_slot(
         var=model._named_dense.bias, slot_name="m").assign([2.]))
     self.evaluate(optimizer.beta_1.assign(3.))
-    return root_checkpointable
+    return root_trackable
 
-  def _set_sentinels(self, root_checkpointable):
-    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+  def _set_sentinels(self, root_trackable):
+    self.evaluate(root_trackable.model._named_dense.bias.assign([101.]))
     self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, slot_name="m")
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, slot_name="m")
         .assign([102.]))
-    self.evaluate(root_checkpointable.optimizer.beta_1.assign(103.))
+    self.evaluate(root_trackable.optimizer.beta_1.assign(103.))
 
-  def _check_sentinels(self, root_checkpointable):
+  def _check_sentinels(self, root_trackable):
     self.assertAllEqual(
-        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+        [1.], self.evaluate(root_trackable.model._named_dense.bias))
     self.assertAllEqual([2.], self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, slot_name="m")))
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, slot_name="m")))
     self.assertAllEqual(3.,
-                        self.evaluate(root_checkpointable.optimizer.beta_1))
+                        self.evaluate(root_trackable.optimizer.beta_1))
 
   def _write_name_based_checkpoint(self):
     checkpoint_directory = self.get_temp_dir()
@@ -1461,7 +1470,8 @@ class CheckpointCompatibilityTests(test.TestCase):
       self._set_sentinels(root)
       with self.assertRaises(AssertionError):
         self._check_sentinels(root)
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      object_saver = trackable_utils.TrackableSaver(
+          graph_view.ObjectGraphView(root))
       self._set_sentinels(root)
       status = object_saver.restore(save_path)
       if context.executing_eagerly():
@@ -1533,7 +1543,7 @@ class PythonMetadataTests(test.TestCase):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     dense = core.Dense(1)
-    checkpoint = checkpointable_utils.Checkpoint(dense=dense)
+    checkpoint = trackable_utils.Checkpoint(dense=dense)
     dense(constant_op.constant([[1.]]))
     checkpoint.restore(None).initialize_or_restore()
     save_path = checkpoint.save(checkpoint_prefix)
@@ -1553,7 +1563,7 @@ class PythonMetadataTests(test.TestCase):
       return json.loads(layer_json.decode("utf-8"))
 
     layer_data = _get_dense_node_from_object_graph(
-        checkpointable_utils.object_metadata(save_path))
+        trackable_utils.object_metadata(save_path))
     self.assertEqual("Dense", layer_data["class_name"])
     self.assertEqual(1, layer_data["config"]["units"])
 
@@ -1563,7 +1573,7 @@ class PythonMetadataTests(test.TestCase):
     dense.units = 42
     save_path = checkpoint.save(checkpoint_prefix)
     layer_data = _get_dense_node_from_object_graph(
-        checkpointable_utils.object_metadata(save_path))
+        trackable_utils.object_metadata(save_path))
     self.assertEqual("Dense", layer_data["class_name"])
     self.assertEqual(42, layer_data["config"]["units"])
 
diff --git a/tensorflow/python/training/checkpointable/util_with_v1_optimizers_test.py b/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py
similarity index 87%
rename from tensorflow/python/training/checkpointable/util_with_v1_optimizers_test.py
rename to tensorflow/python/training/tracking/util_with_v1_optimizers_test.py
index bd80fa60f0b27f16da01002ab5088495f0a43edb..793929380ff210ff3f8fdb873d66b81ad6556f27 100644
--- a/tensorflow/python/training/checkpointable/util_with_v1_optimizers_test.py
+++ b/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py
@@ -43,15 +43,16 @@ from tensorflow.python.training import adam
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
-class NonLayerCheckpointable(tracking.AutoCheckpointable):
+class NonLayerTrackable(tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
         self, name="a_variable", shape=[])
 
 
@@ -63,8 +64,8 @@ class MyModel(training.Model):
     super(MyModel, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
@@ -82,7 +83,7 @@ class CheckpointingTests(test.TestCase):
     other_model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
     optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model, optimizer_step=optimizer_step)
     if context.executing_eagerly():
       optimizer.minimize(
@@ -97,12 +98,11 @@ class CheckpointingTests(test.TestCase):
       optimizer.minimize(
           other_model(input_value),
           global_step=optimizer_step)
-      self.evaluate(checkpointable_utils.gather_initializers(
-          root_checkpointable))
+      self.evaluate(trackable_utils.gather_initializers(
+          root_trackable))
       self.evaluate(train_op)
-    named_variables, serialized_graph, _ = (
-        checkpointable_utils._serialize_object_graph(
-            root_checkpointable, saveables_cache=None))
+    named_variables, serialized_graph, _ = graph_view.ObjectGraphView(
+        root_trackable).serialize_object_graph()
     expected_checkpoint_names = (
         # Created in the root node, so no prefix.
         "optimizer_step",
@@ -190,7 +190,7 @@ class CheckpointingTests(test.TestCase):
   def testSaveRestore(self):
     model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model)
     input_value = constant_op.constant([[3.]])
     if context.executing_eagerly():
@@ -199,24 +199,24 @@ class CheckpointingTests(test.TestCase):
     else:
       train_op = optimizer.minimize(model(input_value))
       # TODO(allenl): Make initialization more pleasant when graph building.
-      root_checkpointable.save_counter  # pylint: disable=pointless-statement
-      self.evaluate(checkpointable_utils.gather_initializers(
-          root_checkpointable))
+      root_trackable.save_counter  # pylint: disable=pointless-statement
+      self.evaluate(trackable_utils.gather_initializers(
+          root_trackable))
       self.evaluate(train_op)
     prefix = os.path.join(self.get_temp_dir(), "ckpt")
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
     m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
     self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
-    save_path = root_checkpointable.save(file_prefix=prefix)
+    save_path = root_trackable.save(file_prefix=prefix)
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
-    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    self.evaluate(state_ops.assign(root_trackable.save_counter, 3))
     optimizer_variables = self.evaluate(optimizer.variables())
     self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
     # Immediate restoration
-    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status = root_trackable.restore(save_path=save_path).assert_consumed()
     status.run_restore_ops()
     self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
-    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual(1, self.evaluate(root_trackable.save_counter))
     self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
     if not context.executing_eagerly():
       return  # Restore-on-create is only supported when executing eagerly
@@ -226,7 +226,7 @@ class CheckpointingTests(test.TestCase):
         # Preserve beta1_power and beta2_power when appying gradients so we can
         # test that they've been restored correctly.
         beta1=1.0, beta2=1.0)
-    on_create_root = checkpointable_utils.Checkpoint(
+    on_create_root = trackable_utils.Checkpoint(
         optimizer=on_create_optimizer, model=on_create_model)
     # Deferred restoration
     status = on_create_root.restore(save_path=save_path)
@@ -266,7 +266,7 @@ class CheckpointingTests(test.TestCase):
     for training_continuation in range(3):
       model = MyModel()
       optimizer = adam.AdamOptimizer(0.001)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model,
           optimizer_step=training_util.get_or_create_global_step())
       root.restore(checkpoint_management.latest_checkpoint(
@@ -297,7 +297,7 @@ class CheckpointingTests(test.TestCase):
       with strategy.scope():
         model = MyModel()
         optimizer = adam.AdamOptimizer(0.001)
-        root = checkpointable_utils.Checkpoint(
+        root = trackable_utils.Checkpoint(
             optimizer=optimizer, model=model,
             optimizer_step=training_util.get_or_create_global_step())
         root.restore(checkpoint_management.latest_checkpoint(
@@ -328,7 +328,7 @@ class CheckpointingTests(test.TestCase):
         with strategy.scope():
           model = MyModel()
           optimizer = adam.AdamOptimizer(0.001)
-          root = checkpointable_utils.Checkpoint(
+          root = trackable_utils.Checkpoint(
               optimizer=optimizer, model=model,
               optimizer_step=training_util.get_or_create_global_step())
           status = root.restore(checkpoint_management.latest_checkpoint(
@@ -355,7 +355,7 @@ class CheckpointingTests(test.TestCase):
         with ops.Graph().as_default():
           model = MyModel()
           optimizer = adam.AdamOptimizer(0.001)
-          root = checkpointable_utils.Checkpoint(
+          root = trackable_utils.Checkpoint(
               optimizer=optimizer, model=model,
               global_step=training_util.get_or_create_global_step())
           input_value = constant_op.constant([[3.]])
@@ -394,7 +394,7 @@ class CheckpointingTests(test.TestCase):
       with test_util.device(use_gpu=True):
         model = MyModel()
         optimizer = adam.AdamOptimizer(0.001)
-        root = checkpointable_utils.Checkpoint(
+        root = trackable_utils.Checkpoint(
             optimizer=optimizer, model=model,
             global_step=training_util.get_or_create_global_step())
         manager = checkpoint_management.CheckpointManager(
@@ -427,7 +427,7 @@ class CheckpointingTests(test.TestCase):
         model = MyModel()
         # Don't actually train so we can test variable values
         optimizer = adam.AdamOptimizer(0.)
-        root = checkpointable_utils.Checkpoint(
+        root = trackable_utils.Checkpoint(
             optimizer=optimizer, model=model,
             global_step=training_util.get_or_create_global_step())
         checkpoint_path = checkpoint_management.latest_checkpoint(
@@ -461,10 +461,10 @@ class CheckpointingTests(test.TestCase):
   # pylint: enable=cell-var-from-loop
 
   def _get_checkpoint_name(self, name):
-    root = tracking.AutoCheckpointable()
-    checkpointable_utils.add_variable(
+    root = tracking.AutoTrackable()
+    trackable_utils.add_variable(
         root, name=name, shape=[1, 2], dtype=dtypes.float64)
-    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
+    (named_variable,), _, _ = trackable_utils._serialize_object_graph(
         root, saveables_cache=None)
     with ops.name_scope("root/" + named_variable.name):
       pass  # Make sure we can use this as an op name if we prefix it.
@@ -488,7 +488,7 @@ class CheckpointingTests(test.TestCase):
       optimizer = adam.AdamOptimizer(learning_rate=0.05)
       checkpoint_directory = self.get_temp_dir()
       checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-      checkpoint = checkpointable_utils.Checkpoint(
+      checkpoint = trackable_utils.Checkpoint(
           model=model, optimizer=optimizer)
       for _ in range(2):
         checkpoint.save(checkpoint_prefix)
@@ -503,8 +503,8 @@ class CheckpointingTests(test.TestCase):
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
-    root = tracking.AutoCheckpointable()
-    root.var = checkpointable_utils.add_variable(
+    root = trackable_utils.Checkpoint()
+    root.var = trackable_utils.add_variable(
         root, name="var", initializer=0.)
     optimizer = adam.AdamOptimizer(0.1)
     if context.executing_eagerly():
@@ -514,28 +514,24 @@ class CheckpointingTests(test.TestCase):
       # Note that `optimizer` has not been added as a dependency of
       # `root`. Create a one-off grouping so that slot variables for `root.var`
       # get initialized too.
-      self.evaluate(checkpointable_utils.gather_initializers(
-          checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
+      self.evaluate(trackable_utils.gather_initializers(
+          trackable_utils.Checkpoint(root=root, optimizer=optimizer)))
       self.evaluate(train_op)
     self.evaluate(state_ops.assign(root.var, 12.))
-    no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "no_slots"))
+    no_slots_path = root.save(os.path.join(checkpoint_directory, "no_slots"))
     root.optimizer = optimizer
     self.evaluate(state_ops.assign(root.var, 13.))
     self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
                                    14.))
-    slots_path = checkpointable_utils.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "with_slots"))
-    new_root = tracking.AutoCheckpointable()
+    slots_path = root.save(os.path.join(checkpoint_directory, "with_slots"))
+    new_root = trackable_utils.Checkpoint()
     # Load the slot-containing checkpoint (deferred), then immediately overwrite
     # the non-slot variable (also deferred).
-    slot_status = checkpointable_utils.CheckpointableSaver(
-        new_root).restore(slots_path)
-    no_slot_status = checkpointable_utils.CheckpointableSaver(
-        new_root).restore(no_slots_path)
+    slot_status = new_root.restore(slots_path)
+    no_slot_status = new_root.restore(no_slots_path)
     with self.assertRaises(AssertionError):
       no_slot_status.assert_consumed()
-    new_root.var = checkpointable_utils.add_variable(
+    new_root.var = trackable_utils.add_variable(
         new_root, name="var", shape=[])
     no_slot_status.assert_consumed()
     no_slot_status.run_restore_ops()
@@ -572,15 +568,14 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.AutoCheckpointable()
+        obj = trackable_utils.Checkpoint()
         obj.var = variable_scope.get_variable(name="v", initializer=0.)
         obj.opt = adam.AdamOptimizer(0.1)
         obj.opt.minimize(obj.var.read_value())
-        self.evaluate(checkpointable_utils.gather_initializers(obj))
-        saver = checkpointable_utils.CheckpointableSaver(obj)
-        saver.save(checkpoint_prefix)
+        self.evaluate(trackable_utils.gather_initializers(obj))
+        obj.save(checkpoint_prefix)
         before_ops = graph.get_operations()
-        saver.save(checkpoint_prefix)
+        obj.save(checkpoint_prefix)
         self.assertEqual(before_ops, graph.get_operations())
 
   def testManyRestoresGraph(self):
@@ -590,16 +585,15 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.AutoCheckpointable()
+        obj = trackable_utils.Checkpoint()
         obj.var = variable_scope.get_variable(name="v", initializer=0.)
         obj.opt = adam.AdamOptimizer(0.1)
         obj.opt.minimize(obj.var.read_value())
-        self.evaluate(checkpointable_utils.gather_initializers(obj))
-        saver = checkpointable_utils.CheckpointableSaver(obj)
-        save_path = saver.save(checkpoint_prefix)
-        saver.restore(save_path)
+        self.evaluate(trackable_utils.gather_initializers(obj))
+        save_path = obj.save(checkpoint_prefix)
+        obj.restore(save_path)
         before_ops = graph.get_operations()
-        saver.restore(save_path)
+        obj.restore(save_path)
         self.assertEqual(before_ops, graph.get_operations())
 
   def testMultipleGraphsNonSlotVariables(self):
@@ -612,11 +606,11 @@ class CheckpointingTests(test.TestCase):
       first_session = session_lib.Session(graph=first_graph)
       with first_graph.as_default(), first_session.as_default():
         first_variable = resource_variable_ops.ResourceVariable([1.])
-        first_root_checkpointable = checkpointable_utils.Checkpoint(
+        first_root_trackable = trackable_utils.Checkpoint(
             optimizer=optimizer, variable=first_variable)
         train_op = optimizer.minimize(first_variable.read_value)
-        self.evaluate(checkpointable_utils.gather_initializers(
-            first_root_checkpointable))
+        self.evaluate(trackable_utils.gather_initializers(
+            first_root_trackable))
         self.evaluate(train_op)
         self.evaluate(first_variable.assign([1.]))
         self.evaluate(optimizer.get_slot(
@@ -628,23 +622,23 @@ class CheckpointingTests(test.TestCase):
       second_graph = ops.Graph()
       with second_graph.as_default(), session_lib.Session(graph=second_graph):
         second_variable = resource_variable_ops.ResourceVariable([1.])
-        second_root_checkpointable = checkpointable_utils.Checkpoint(
+        second_root_trackable = trackable_utils.Checkpoint(
             optimizer=optimizer, variable=second_variable)
         train_op = optimizer.minimize(second_variable.read_value)
-        second_root_checkpointable.restore(None).initialize_or_restore()
+        second_root_trackable.restore(None).initialize_or_restore()
         self.evaluate(train_op)
         self.evaluate(second_variable.assign([4.]))
         self.evaluate(optimizer.get_slot(
             var=second_variable, name="m").assign([5.]))
         beta1_power, _ = optimizer._get_beta_accumulators()
         self.evaluate(beta1_power.assign(6.))
-        save_path = second_root_checkpointable.save(checkpoint_prefix)
+        save_path = second_root_trackable.save(checkpoint_prefix)
         self.evaluate(second_variable.assign([7.]))
         self.evaluate(optimizer.get_slot(
             var=second_variable, name="m").assign([8.]))
         beta1_power, _ = optimizer._get_beta_accumulators()
         self.assertAllEqual(6., self.evaluate(beta1_power))
-        status = second_root_checkpointable.restore(save_path)
+        status = second_root_trackable.restore(save_path)
         status.assert_consumed().run_restore_ops()
         self.assertAllEqual([4.], self.evaluate(second_variable))
         self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
@@ -668,10 +662,10 @@ class CheckpointingTests(test.TestCase):
     with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.AdamOptimizer(0.001)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           model=model,  # Do not save the optimizer with the checkpoint.
           global_step=training_util.get_or_create_global_step())
-      optimizer_checkpoint = checkpointable_utils.Checkpoint(
+      optimizer_checkpoint = trackable_utils.Checkpoint(
           optimizer=optimizer)
 
       checkpoint_path = checkpoint_management.latest_checkpoint(
@@ -695,7 +689,7 @@ class CheckpointingTests(test.TestCase):
     with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.AdamOptimizer(0.001)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model,
           global_step=training_util.get_or_create_global_step())
       status = root.restore(save_path=model_save_path)
@@ -717,10 +711,10 @@ class CheckpointingTests(test.TestCase):
     with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.AdamOptimizer(0.001, beta1=1.0)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model,
           global_step=training_util.get_or_create_global_step())
-      opt_root = checkpointable_utils.Checkpoint(
+      opt_root = trackable_utils.Checkpoint(
           optimizer=optimizer)
       status = root.restore(save_path=model_save_path)
       init_only_optimizer_status = opt_root.restore(save_path=None)
@@ -739,12 +733,12 @@ class CheckpointingTests(test.TestCase):
       self.assertEqual(42., self.evaluate(optimizer.variables()[0]))
 
 
-class _ManualScope(tracking.AutoCheckpointable):
+class _ManualScope(tracking.AutoTrackable):
 
   def __call__(self):
     with variable_scope.variable_scope("ManualScope") as vs:
       self.variable_scope = vs
-      with checkpointable_utils.capture_dependencies(template=self):
+      with trackable_utils.capture_dependencies(template=self):
         return self._build()
 
   def _build(self):
@@ -754,7 +748,7 @@ class _ManualScope(tracking.AutoCheckpointable):
 class TemplateTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
-  def test_checkpointable_save_restore(self):
+  def test_trackable_save_restore(self):
 
     def _templated():
       v = variable_scope.get_variable(
@@ -771,12 +765,12 @@ class TemplateTests(test.TestCase):
     six.assertCountEqual(
         self,
         [v1_save, v2_save, manual_scope, manual_scope_v, save_template],
-        checkpointable_utils.list_objects(save_template))
+        trackable_utils.list_objects(save_template))
     manual_dep, = manual_scope._checkpoint_dependencies
     self.assertEqual("in_manual_scope", manual_dep.name)
     self.assertIs(manual_scope_v, manual_dep.ref)
     optimizer = adam.AdamOptimizer(0.0)
-    save_root = checkpointable_utils.Checkpoint(
+    save_root = trackable_utils.Checkpoint(
         my_template=save_template, optimizer=optimizer)
     optimizer.minimize(v1_save.read_value)
     self.evaluate([v.initializer for v in save_template.variables])
@@ -789,7 +783,7 @@ class TemplateTests(test.TestCase):
 
     load_template = template.make_template("s2", _templated)
     load_optimizer = adam.AdamOptimizer(0.0)
-    load_root = checkpointable_utils.Checkpoint(
+    load_root = trackable_utils.Checkpoint(
         my_template=load_template, optimizer=load_optimizer)
     status = load_root.restore(save_path)
     var, var_plus_one, var2, _, _ = load_template()
@@ -812,13 +806,13 @@ class CheckpointCompatibilityTests(test.TestCase):
     model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
     optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model, optimizer_step=optimizer_step)
     train_op = optimizer.minimize(
         functools.partial(model, input_value),
         global_step=optimizer_step)
-    self.evaluate(checkpointable_utils.gather_initializers(
-        root_checkpointable))
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
     self.evaluate(train_op)
     # A regular variable, a slot variable, and a non-slot Optimizer variable
     # with known values to check when loading.
@@ -827,24 +821,24 @@ class CheckpointCompatibilityTests(test.TestCase):
         var=model._named_dense.bias, name="m").assign([2.]))
     beta1_power, _ = optimizer._get_beta_accumulators()
     self.evaluate(beta1_power.assign(3.))
-    return root_checkpointable
+    return root_trackable
 
-  def _set_sentinels(self, root_checkpointable):
-    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+  def _set_sentinels(self, root_trackable):
+    self.evaluate(root_trackable.model._named_dense.bias.assign([101.]))
     self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")
         .assign([102.]))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    beta1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.evaluate(beta1_power.assign(103.))
 
-  def _check_sentinels(self, root_checkpointable):
+  def _check_sentinels(self, root_trackable):
     self.assertAllEqual(
-        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+        [1.], self.evaluate(root_trackable.model._named_dense.bias))
     self.assertAllEqual([2.], self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.assertAllEqual(3., self.evaluate(beta1_power))
 
   def _write_name_based_checkpoint(self):
@@ -869,7 +863,8 @@ class CheckpointCompatibilityTests(test.TestCase):
       self._set_sentinels(root)
       with self.assertRaises(AssertionError):
         self._check_sentinels(root)
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      object_saver = trackable_utils.TrackableSaver(
+          graph_view.ObjectGraphView(root))
       self._set_sentinels(root)
       status = object_saver.restore(save_path)
       if context.executing_eagerly():
diff --git a/tensorflow/python/training/checkpointable/util_xla_test.py b/tensorflow/python/training/tracking/util_xla_test.py
similarity index 85%
rename from tensorflow/python/training/checkpointable/util_xla_test.py
rename to tensorflow/python/training/tracking/util_xla_test.py
index 4e96a7514a24be19b857eab7032846e7578cc55c..4e8dd0a6fd3231b335758a7e0fb05c7db37ac95c 100644
--- a/tensorflow/python/training/checkpointable/util_xla_test.py
+++ b/tensorflow/python/training/tracking/util_xla_test.py
@@ -25,15 +25,15 @@ from tensorflow.python.keras.layers import core
 from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.platform import test
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
-class NonLayerCheckpointable(tracking.AutoCheckpointable):
+class NonLayerTrackable(tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
         self, name="a_variable", shape=[])
 
 
@@ -44,8 +44,8 @@ class Subclassed(training.Model):
     super(Subclassed, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
@@ -62,7 +62,7 @@ class CheckpointingTests(xla_test.XLATestCase):
       with self.test_scope():
         model = Subclassed()
         optimizer = adam.Adam(0.001)
-        root = checkpointable_utils.Checkpoint(
+        root = trackable_utils.Checkpoint(
             optimizer=optimizer, model=model)
         manager = checkpoint_management.CheckpointManager(
             root, checkpoint_directory, max_to_keep=2)
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index ae71a628c1f9e1e7e86a25cbcacab0bd400ed279..5a1527888977ec0e2d88dfe313ee4767bf59cba4 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -68,7 +68,7 @@ from tensorflow.python.training.basic_session_run_hooks import FinalOpsHook
 from tensorflow.python.training.basic_session_run_hooks import FeedFnHook
 from tensorflow.python.training.basic_session_run_hooks import ProfilerHook
 from tensorflow.python.training.basic_loops import basic_train_loop
-from tensorflow.python.training.checkpointable.util import Checkpoint
+from tensorflow.python.training.tracking.util import Checkpoint
 from tensorflow.python.training.checkpoint_utils import init_from_checkpoint
 from tensorflow.python.training.checkpoint_utils import list_variables
 from tensorflow.python.training.checkpoint_utils import load_checkpoint
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index a43ec48589f278dc4aeb85c461b2ccfbab71510a..9a1241cff3f0879166f46c3e56f54b0658f2ac7d 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -79,7 +79,7 @@ def _get_attrs_items(obj):
 def _sorted(dict_):
   """Returns a sorted list of the dict keys, with error if keys not sortable."""
   try:
-    return sorted(_six.iterkeys(dict_))
+    return sorted(dict_)
   except TypeError:
     raise TypeError("nest only supports dicts with sortable keys.")
 
@@ -124,7 +124,7 @@ def _sequence_like(instance, args):
     # ordered and plain dicts (e.g., flattening a dict but using a
     # corresponding `OrderedDict` to pack it back).
     result = dict(zip(_sorted(instance), args))
-    return type(instance)((key, result[key]) for key in _six.iterkeys(instance))
+    return type(instance)((key, result[key]) for key in instance)
   elif _is_namedtuple(instance) or _is_attrs(instance):
     return type(instance)(*args)
   elif _is_composite_tensor(instance):
@@ -184,6 +184,20 @@ is_sequence = _pywrap_tensorflow.IsSequence
 is_sequence_or_composite = _pywrap_tensorflow.IsSequenceOrComposite
 
 
+@tf_export("nest.is_nested")
+def is_nested(seq):
+  """Returns true if its input is a collections.Sequence (except strings).
+
+  Args:
+    seq: an input sequence.
+
+  Returns:
+    True if the sequence is a not a string and is a collections.Sequence or a
+    dict.
+  """
+  return is_sequence(seq)
+
+
 @tf_export("nest.flatten")
 def flatten(structure, expand_composites=False):
   """Returns a flat list from a given nested structure.
@@ -258,7 +272,7 @@ def assert_same_structure(nest1, nest2, check_types=True,
         size. Note that namedtuples with identical name and fields are always
         considered to have the same shallow structure. Two types will also be
         considered the same if they are both list subtypes (which allows "list"
-        and "_ListWrapper" from checkpointable dependency tracking to compare
+        and "_ListWrapper" from trackable dependency tracking to compare
         equal).
     expand_composites: If true, then composite tensors such as `tf.SparseTensor`
         and `tf.RaggedTensor` are expanded into their component tensors.
@@ -626,7 +640,7 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
   The following code will not raise an exception:
   ```python
     shallow_tree = ["a", "b"]
-    input_tree = ["c", ["d", "e"]]
+    input_tree = ["c", ["d", "e"], "f"]
     assert_shallow_structure(shallow_tree, input_tree)
   ```
 
@@ -761,6 +775,100 @@ def flatten_up_to(shallow_tree, input_tree, check_types=True):
   return list(v for _, v in _yield_flat_up_to(shallow_tree, input_tree))
 
 
+def flatten_with_tuple_paths_up_to(shallow_tree, input_tree, check_types=True):
+  """Flattens `input_tree` up to `shallow_tree`.
+
+  Any further depth in structure in `input_tree` is retained as elements in the
+  partially flattened output.
+
+  Returns a list of (path, value) pairs, where value a leaf node in the
+  flattened tree, and path is the tuple path of that leaf in input_tree.
+
+  If `shallow_tree` and `input_tree` are not sequences, this returns a
+  single-element list: `[((), input_tree)]`.
+
+  Use Case:
+
+  Sometimes we may wish to partially flatten a nested sequence, retaining some
+  of the nested structure. We achieve this by specifying a shallow structure,
+  `shallow_tree`, we wish to flatten up to.
+
+  The input, `input_tree`, can be thought of as having the same structure as
+  `shallow_tree`, but with leaf nodes that are themselves tree structures.
+
+  Examples:
+
+  ```python
+  input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]]
+  shallow_tree = [[True, True], [False, True]]
+
+  flattened_input_tree = flatten_with_tuple_paths_up_to(shallow_tree,
+                                                        input_tree)
+  flattened_shallow_tree = flatten_with_tuple_paths_up_to(shallow_tree,
+                                                          shallow_tree)
+
+  # Output is:
+  # [((0, 0), [2, 2]),
+  #  ((0, 1), [3, 3]),
+  #  ((1, 0), [4, 9]),
+  #  ((1, 1), [5, 5])]
+  #
+  # [((0, 0), True),
+  #  ((0, 1), True),
+  #  ((1, 0), False),
+  #  ((1, 1), True)]
+  ```
+
+  ```python
+  input_tree = [[('a', 1), [('b', 2), [('c', 3), [('d', 4)]]]]]
+  shallow_tree = [['level_1', ['level_2', ['level_3', ['level_4']]]]]
+
+  input_tree_flattened_as_shallow_tree = flatten_up_to(shallow_tree, input_tree)
+  input_tree_flattened = flatten(input_tree)
+
+  # Output is:
+  # [((0, 0), ('a', 1)),
+  #  ((0, 1, 0), ('b', 2)),
+  #  ((0, 1, 1, 0), ('c', 3)),
+  #  ((0, 1, 1, 1), ('d', 4))]
+  # ['a', 1, 'b', 2, 'c', 3, 'd', 4]
+  ```
+
+  Non-Sequence Edge Cases:
+
+  ```python
+  flatten_with_tuple_paths_up_to(0, 0)  # Output: [(), 0]
+
+  flatten_with_tuple_paths_up_to(0, [0, 1, 2])  # Output: [(), [0, 1, 2]]
+
+  flatten_with_tuple_paths_up_to([0, 1, 2], 0)  # Output: TypeError
+
+  flatten_with_tuple_paths_up_to([0, 1, 2], [0, 1, 2])
+  # Output: [((0,) 0), ((1,), 1), ((2,), 2)]
+  ```
+
+  Args:
+    shallow_tree: a possibly pruned structure of input_tree.
+    input_tree: an arbitrarily nested structure or a scalar object.
+      Note, numpy arrays are considered scalars.
+    check_types: bool. If True, check that each node in shallow_tree has the
+      same type as the corresponding node in input_tree.
+
+  Returns:
+    A Python list, the partially flattened version of `input_tree` according to
+    the structure of `shallow_tree`.
+
+  Raises:
+    TypeError: If `shallow_tree` is a sequence but `input_tree` is not.
+    TypeError: If the sequence types of `shallow_tree` are different from
+      `input_tree`.
+    ValueError: If the sequence lengths of `shallow_tree` are different from
+      `input_tree`.
+  """
+  assert_shallow_structure(shallow_tree, input_tree, check_types=check_types)
+  return list(_yield_flat_up_to(shallow_tree, input_tree))
+
+
 def map_structure_up_to(shallow_tree, func, *inputs, **kwargs):
   """Applies a function or op to a number of partially flattened inputs.
 
@@ -781,6 +889,14 @@ def map_structure_up_to(shallow_tree, func, *inputs, **kwargs):
 
   Examples:
 
+  ```python
+  shallow_tree = [None, None]
+  inp_val = [1, 2, 3]
+  out = map_structure_up_to(shallow_tree, lambda x: 2 * x, inp_val)
+
+  # Output is: [2, 4]
+  ```
+
   ```python
   ab_tuple = collections.namedtuple("ab_tuple", "a, b")
   op_tuple = collections.namedtuple("op_tuple", "add, mul")
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 71034ffcb6bf3cf7e2b47cf9615500774dc4957f..0540f71f7a98b3fd574c98ae5d0406a4b5d94ff5 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -231,17 +231,17 @@ class NestTest(parameterized.TestCase, test.TestCase):
                             ["and", "goodbye", "again"])
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
-  def testIsSequence(self):
-    self.assertFalse(nest.is_sequence("1234"))
-    self.assertTrue(nest.is_sequence([1, 3, [4, 5]]))
-    self.assertTrue(nest.is_sequence(((7, 8), (5, 6))))
-    self.assertTrue(nest.is_sequence([]))
-    self.assertTrue(nest.is_sequence({"a": 1, "b": 2}))
-    self.assertFalse(nest.is_sequence(set([1, 2])))
+  def testIsNested(self):
+    self.assertFalse(nest.is_nested("1234"))
+    self.assertTrue(nest.is_nested([1, 3, [4, 5]]))
+    self.assertTrue(nest.is_nested(((7, 8), (5, 6))))
+    self.assertTrue(nest.is_nested([]))
+    self.assertTrue(nest.is_nested({"a": 1, "b": 2}))
+    self.assertFalse(nest.is_nested(set([1, 2])))
     ones = array_ops.ones([2, 3])
-    self.assertFalse(nest.is_sequence(ones))
-    self.assertFalse(nest.is_sequence(math_ops.tanh(ones)))
-    self.assertFalse(nest.is_sequence(np.ones((4, 5))))
+    self.assertFalse(nest.is_nested(ones))
+    self.assertFalse(nest.is_nested(math_ops.tanh(ones)))
+    self.assertFalse(nest.is_nested(np.ones((4, 5))))
 
   @parameterized.parameters({"mapping_type": _CustomMapping},
                             {"mapping_type": dict})
@@ -686,6 +686,244 @@ class NestTest(parameterized.TestCase, test.TestCase):
     flattened_shallow_tree = nest.flatten_up_to(shallow_tree, shallow_tree)
     self.assertEqual(flattened_shallow_tree, shallow_tree)
 
+  def testFlattenWithTuplePathsUpTo(self):
+    def get_paths_and_values(shallow_tree, input_tree):
+      path_value_pairs = nest.flatten_with_tuple_paths_up_to(shallow_tree,
+                                                             input_tree)
+      paths = [p for p, _ in path_value_pairs]
+      values = [v for _, v in path_value_pairs]
+      return paths, values
+
+    # Shallow tree ends at scalar.
+    input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]]
+    shallow_tree = [[True, True], [False, True]]
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths,
+                     [(0, 0), (0, 1), (1, 0), (1, 1)])
+    self.assertEqual(flattened_input_tree, [[2, 2], [3, 3], [4, 9], [5, 5]])
+    self.assertEqual(flattened_shallow_tree_paths,
+                     [(0, 0), (0, 1), (1, 0), (1, 1)])
+    self.assertEqual(flattened_shallow_tree, [True, True, False, True])
+
+    # Shallow tree ends at string.
+    input_tree = [[("a", 1), [("b", 2), [("c", 3), [("d", 4)]]]]]
+    shallow_tree = [["level_1", ["level_2", ["level_3", ["level_4"]]]]]
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    input_tree_flattened_paths = [p for p, _ in
+                                  nest.flatten_with_tuple_paths(input_tree)]
+    input_tree_flattened = nest.flatten(input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [(0, 0), (0, 1, 0), (0, 1, 1, 0), (0, 1, 1, 1, 0)])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [("a", 1), ("b", 2), ("c", 3), ("d", 4)])
+
+    self.assertEqual(input_tree_flattened_paths,
+                     [(0, 0, 0), (0, 0, 1),
+                      (0, 1, 0, 0), (0, 1, 0, 1),
+                      (0, 1, 1, 0, 0), (0, 1, 1, 0, 1),
+                      (0, 1, 1, 1, 0, 0), (0, 1, 1, 1, 0, 1)])
+    self.assertEqual(input_tree_flattened, ["a", 1, "b", 2, "c", 3, "d", 4])
+
+    # Make sure dicts are correctly flattened, yielding values, not keys.
+    input_tree = {"a": 1, "b": {"c": 2}, "d": [3, (4, 5)]}
+    shallow_tree = {"a": 0, "b": 0, "d": [0, 0]}
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a",), ("b",), ("d", 0), ("d", 1)])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [1, {"c": 2}, 3, (4, 5)])
+
+    # Namedtuples.
+    ab_tuple = collections.namedtuple("ab_tuple", "a, b")
+    input_tree = ab_tuple(a=[0, 1], b=2)
+    shallow_tree = ab_tuple(a=0, b=1)
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a",), ("b",)])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [[0, 1], 2])
+
+    # Nested dicts, OrderedDicts and namedtuples.
+    input_tree = collections.OrderedDict(
+        [("a", ab_tuple(a=[0, {"b": 1}], b=2)),
+         ("c", {"d": 3, "e": collections.OrderedDict([("f", 4)])})])
+    shallow_tree = input_tree
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a", "a", 0),
+                      ("a", "a", 1, "b"),
+                      ("a", "b"),
+                      ("c", "d"),
+                      ("c", "e", "f")])
+    self.assertEqual(input_tree_flattened_as_shallow_tree, [0, 1, 2, 3, 4])
+    shallow_tree = collections.OrderedDict([("a", 0), ("c", {"d": 3, "e": 1})])
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a",),
+                      ("c", "d"),
+                      ("c", "e")])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [ab_tuple(a=[0, {"b": 1}], b=2),
+                      3,
+                      collections.OrderedDict([("f", 4)])])
+    shallow_tree = collections.OrderedDict([("a", 0), ("c", 0)])
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a",), ("c",)])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [ab_tuple(a=[0, {"b": 1}], b=2),
+                      {"d": 3, "e": collections.OrderedDict([("f", 4)])}])
+
+    ## Shallow non-list edge-case.
+    # Using iterable elements.
+    input_tree = ["input_tree"]
+    shallow_tree = "shallow_tree"
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    input_tree = ["input_tree_0", "input_tree_1"]
+    shallow_tree = "shallow_tree"
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    # Test case where len(shallow_tree) < len(input_tree)
+    input_tree = {"a": "A", "b": "B", "c": "C"}
+    shallow_tree = {"a": 1, "c": 2}
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [("a",), ("c",)])
+    self.assertEqual(flattened_input_tree, ["A", "C"])
+    self.assertEqual(flattened_shallow_tree_paths, [("a",), ("c",)])
+    self.assertEqual(flattened_shallow_tree, [1, 2])
+
+    # Using non-iterable elements.
+    input_tree = [0]
+    shallow_tree = 9
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    input_tree = [0, 1]
+    shallow_tree = 9
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    ## Both non-list edge-case.
+    # Using iterable elements.
+    input_tree = "input_tree"
+    shallow_tree = "shallow_tree"
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    # Using non-iterable elements.
+    input_tree = 0
+    shallow_tree = 0
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    ## Input non-list edge-case.
+    # Using iterable elements.
+    input_tree = "input_tree"
+    shallow_tree = ["shallow_tree"]
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ.format(type(input_tree))):
+      (flattened_input_tree_paths,
+       flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_shallow_tree_paths, [(0,)])
+    self.assertEqual(flattened_shallow_tree, shallow_tree)
+
+    input_tree = "input_tree"
+    shallow_tree = ["shallow_tree_9", "shallow_tree_8"]
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ.format(type(input_tree))):
+      (flattened_input_tree_paths,
+       flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_shallow_tree_paths, [(0,), (1,)])
+    self.assertEqual(flattened_shallow_tree, shallow_tree)
+
+    # Using non-iterable elements.
+    input_tree = 0
+    shallow_tree = [9]
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ.format(type(input_tree))):
+      (flattened_input_tree_paths,
+       flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_shallow_tree_paths, [(0,)])
+    self.assertEqual(flattened_shallow_tree, shallow_tree)
+
+    input_tree = 0
+    shallow_tree = [9, 8]
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ.format(type(input_tree))):
+      (flattened_input_tree_paths,
+       flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_shallow_tree_paths, [(0,), (1,)])
+    self.assertEqual(flattened_shallow_tree, shallow_tree)
+
   def testMapStructureUpTo(self):
     # Named tuples.
     ab_tuple = collections.namedtuple("ab_tuple", "a, b")
diff --git a/tensorflow/python/util/serialization.py b/tensorflow/python/util/serialization.py
index cff864c0304b02aaa6339efb403388c65ab6fec4..2164ba4dbf22b46e7fad3ac45a164ddbdd2f01c0 100644
--- a/tensorflow/python/util/serialization.py
+++ b/tensorflow/python/util/serialization.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 import numpy as np
 
 from tensorflow.python.framework import tensor_shape
@@ -61,4 +63,7 @@ def get_json_type(obj):
   if isinstance(obj, tensor_shape.TensorShape):
     return obj.as_list()
 
+  if isinstance(obj, collections.Mapping):
+    return dict(obj)
+
   raise TypeError('Not JSON Serializable:', obj)
diff --git a/tensorflow/python/util/tf_decorator.py b/tensorflow/python/util/tf_decorator.py
index f018e1a1bd35f0111cacc20e678c0466bfd5f2e3..f5ce5bd42a5b1c844931131ea9a969431fa2ef49 100644
--- a/tensorflow/python/util/tf_decorator.py
+++ b/tensorflow/python/util/tf_decorator.py
@@ -59,7 +59,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools as _functools
 import traceback as _traceback
 
 
@@ -95,6 +94,11 @@ def make_decorator(target,
     decorator_func.__name__ = target.__name__
   if hasattr(target, '__module__'):
     decorator_func.__module__ = target.__module__
+  if hasattr(target, '__dict__'):
+    # Copy dict entries from target which are not overridden by decorator_func.
+    for name in target.__dict__:
+      if name not in decorator_func.__dict__:
+        decorator_func.__dict__[name] = target.__dict__[name]
   if hasattr(target, '__doc__'):
     decorator_func.__doc__ = decorator.__doc__
   decorator_func.__wrapped__ = target
@@ -207,8 +211,8 @@ class TFDecorator(object):
     else:
       self.__doc__ = ''
 
-  def __get__(self, obj, objtype):
-    return _functools.partial(self.__call__, obj)
+  def __get__(self, instance, owner):
+    return self._decorated_target.__get__(instance, owner)
 
   def __call__(self, *args, **kwargs):
     return self._decorated_target(*args, **kwargs)
diff --git a/tensorflow/python/util/tf_decorator_test.py b/tensorflow/python/util/tf_decorator_test.py
index 9198f0b3fad1590bedac71b30cf332e35cb489fe..48d735189cdb0acb394747aa3a99864393ccda7b 100644
--- a/tensorflow/python/util/tf_decorator_test.py
+++ b/tensorflow/python/util/tf_decorator_test.py
@@ -170,6 +170,17 @@ class TfDecoratorTest(test.TestCase):
     self.assertEqual('Return parameters.',
                      TestDecoratedClass().return_params.__doc__)
 
+  def testTarget__get__IsProxied(self):
+    class Descr(object):
+
+      def __get__(self, instance, owner):
+        return self
+
+    class Foo(object):
+      foo = tf_decorator.TFDecorator('Descr', Descr())
+
+    self.assertIsInstance(Foo.foo, Descr)
+
 
 def test_wrapper(*args, **kwargs):
   return test_function(*args, **kwargs)
@@ -199,6 +210,20 @@ class TfMakeDecoratorTest(test.TestCase):
     decorator = getattr(decorated, '_tf_decorator')
     self.assertEqual('test decorator doc', decorator.decorator_doc)
 
+  def testUpdatesDictWithMissingEntries(self):
+    test_function.foobar = True
+    decorated = tf_decorator.make_decorator(test_function, test_wrapper)
+    self.assertTrue(decorated.foobar)
+    del test_function.foobar
+
+  def testUpdatesDict_doesNotOverridePresentEntries(self):
+    test_function.foobar = True
+    test_wrapper.foobar = False
+    decorated = tf_decorator.make_decorator(test_function, test_wrapper)
+    self.assertFalse(decorated.foobar)
+    del test_function.foobar
+    del test_wrapper.foobar
+
   def testSetsTFDecoratorArgSpec(self):
     argspec = tf_inspect.ArgSpec(
         args=['a', 'b', 'c'],
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 980c92aaca1496fd23a5eb7a92289adf67e72014..03c8d6f535f09a40cd4e0ae3fe52279eda22f607 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -84,6 +84,7 @@ cc_library(
         "stream_executor_internal.h",
     ],
     deps = [
+        ":allocator_stats",
         ":dnn_proto_cc",
         ":host_or_device_scalar",
         ":stream_executor_headers",
@@ -128,6 +129,7 @@ cc_library(
         "trace_listener.h",
     ],
     deps = [
+        ":allocator_stats",
         ":device_memory",
         ":dnn_proto_cc",
         ":host_or_device_scalar",
@@ -190,6 +192,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":allocator_stats",
         ":dnn_proto_cc",
         ":host_or_device_scalar",
         ":stream_executor_headers",
@@ -386,6 +389,7 @@ cc_library(
         "trace_listener.h",
     ],
     deps = [
+        ":allocator_stats",
         ":dnn_proto_cc",
         ":host_or_device_scalar",
         ":stream_executor_headers",
@@ -497,6 +501,7 @@ cc_library(
         "stream_executor_internal.h",
     ],
     deps = [
+        ":allocator_stats",
         ":device_description",
         ":device_memory",
         ":device_options",
@@ -510,6 +515,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
@@ -527,6 +533,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":allocator_stats",
         ":dnn_proto_cc",
         ":platform",
         ":stream_executor_headers",
@@ -561,6 +568,7 @@ cc_library(
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -601,6 +609,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":allocator_stats",
         ":dnn_proto_cc",
         ":host_or_device_scalar",
         "//tensorflow/core:lib",
@@ -638,6 +647,19 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "allocator_stats",
+    srcs = [
+        "allocator_stats.cc",
+    ],
+    hdrs = ["allocator_stats.h"],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
 tf_cc_test(
     name = "stream_test",
     size = "small",
diff --git a/tensorflow/stream_executor/allocator_stats.cc b/tensorflow/stream_executor/allocator_stats.cc
new file mode 100644
index 0000000000000000000000000000000000000000..440d6f46a3cbd8740c55a239865ce5f89b96b4f3
--- /dev/null
+++ b/tensorflow/stream_executor/allocator_stats.cc
@@ -0,0 +1,32 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/allocator_stats.h"
+#include "absl/strings/str_format.h"
+
+namespace stream_executor {
+
+string AllocatorStats::DebugString() const {
+  return absl::StrFormat(
+      "Limit:        %20lld\n"
+      "InUse:        %20lld\n"
+      "MaxInUse:     %20lld\n"
+      "NumAllocs:    %20lld\n"
+      "MaxAllocSize: %20lld\n",
+      this->bytes_limit ? *this->bytes_limit : 0, this->bytes_in_use,
+      this->peak_bytes_in_use, this->num_allocs, this->largest_alloc_size);
+}
+
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/allocator_stats.h b/tensorflow/stream_executor/allocator_stats.h
new file mode 100644
index 0000000000000000000000000000000000000000..786ceb0fdd6fdea829d095923dc774d63a5de625
--- /dev/null
+++ b/tensorflow/stream_executor/allocator_stats.h
@@ -0,0 +1,50 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ALLOCATOR_STATS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ALLOCATOR_STATS_H_
+
+#include <string>
+
+#include "absl/types/optional.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace stream_executor {
+
+// Runtime statistics collected by an allocator. Exactly the same as
+// tensorflow::AllocatorStats, but independently defined to preserve the mutual
+// independence of StreamExecutor and TensorFlow.
+struct AllocatorStats {
+  int64 num_allocs;          // Number of allocations.
+  int64 bytes_in_use;        // Number of bytes in use.
+  int64 peak_bytes_in_use;   // The peak bytes in use.
+  int64 largest_alloc_size;  // The largest single allocation seen.
+
+  // The upper limit of bytes of user allocatable device memory, if such a limit
+  // is known.
+  absl::optional<int64> bytes_limit;
+
+  AllocatorStats()
+      : num_allocs(0),
+        bytes_in_use(0),
+        peak_bytes_in_use(0),
+        largest_alloc_size(0) {}
+
+  string DebugString() const;
+};
+
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ALLOCATOR_STATS_H_
diff --git a/tensorflow/stream_executor/build_defs.bzl b/tensorflow/stream_executor/build_defs.bzl
index 1d8a7ec62a958c3f7c6f72f451a334286b8a8ae4..b2da13e5ae708e40d8cb86ce6f1e2b8f1f0bb7a4 100644
--- a/tensorflow/stream_executor/build_defs.bzl
+++ b/tensorflow/stream_executor/build_defs.bzl
@@ -17,5 +17,5 @@ def tf_additional_cudnn_plugin_deps():
 # Returns whether any GPU backend is configuered.
 def if_gpu_is_configured(x):
     if cuda_is_configured() or rocm_is_configured():
-      return x
+        return x
     return []
diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD
index 0e23cc3eb4074b615e742d3974810a9081430a1e..03dd92e19c1b1429d6cfa742c111b22d4d802e87 100644
--- a/tensorflow/stream_executor/cuda/BUILD
+++ b/tensorflow/stream_executor/cuda/BUILD
@@ -87,13 +87,17 @@ cc_library(
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
         "@local_config_cuda//cuda:cuda_headers",
-        "//tensorflow/core:cuda",
         "//tensorflow/stream_executor:device_options",
         "//tensorflow/stream_executor/gpu:gpu_driver_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "//tensorflow/stream_executor/platform:dso_loader",
-    ] + tf_additional_cuda_driver_deps()),
+    ] + tf_additional_cuda_driver_deps()) + select({
+        # include dynamic loading implementation only when if_cuda_is_configured and build dynamically
+        "//tensorflow:using_cuda_nvcc_with_dynamic_build": ["cudart_stub"],
+        "//tensorflow:using_cuda_clang_with_dynamic_build": ["cudart_stub"],
+        "//conditions:default": ["//tensorflow/core:cuda"],
+    }),
 )
 
 cc_library(
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index 080c26fdfea52452eb0ab3461e0d67bf7a507c28..5ef821a4f121c58d1c2f64b602345063165ac58e 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -837,6 +837,10 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 
 /* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
                                              uint64 bytes) {
+  if (bytes == 0) {
+    return nullptr;
+  }
+
   ScopedActivateContext activated{context};
   CUdeviceptr result = 0;
   CUresult res = tensorflow::wrap::cuMemAlloc(&result, bytes);
diff --git a/tensorflow/stream_executor/cuda/cuda_driver_wrapper.h b/tensorflow/stream_executor/cuda/cuda_driver_wrapper.h
index 657eea81716e6cbf5f158ab29bf6bd9149d46403..0de27d5cd3c858b0990618d7ec74be31088918f5 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver_wrapper.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver_wrapper.h
@@ -44,22 +44,22 @@ namespace wrap {
 #define TO_STR_(x) #x
 #define TO_STR(x) TO_STR_(x)
 
-#define STREAM_EXECUTOR_LIBCUDA_WRAP(cudaSymbolName)                        \
-  template <typename... Args>                                               \
-  auto cudaSymbolName(Args... args)->decltype(::cudaSymbolName(args...)) {  \
-    using FuncPtrT = std::add_pointer<decltype(::cudaSymbolName)>::type;    \
-    static FuncPtrT loaded = []() -> FuncPtrT {                             \
-      static const char *kName = TO_STR(cudaSymbolName);                    \
-      void *f;                                                              \
-      auto s = stream_executor::port::Env::Default()->GetSymbolFromLibrary( \
-          stream_executor::internal::CachedDsoLoader::GetLibcudaDsoHandle() \
-              .ValueOrDie(),                                                \
-          kName, &f);                                                       \
-      CHECK(s.ok()) << "could not find " << kName                           \
-                    << " in libcuda DSO; dlerror: " << s.error_message();   \
-      return reinterpret_cast<FuncPtrT>(f);                                 \
-    }();                                                                    \
-    return loaded(args...);                                                 \
+#define STREAM_EXECUTOR_LIBCUDA_WRAP(cudaSymbolName)                           \
+  template <typename... Args>                                                  \
+  auto cudaSymbolName(Args... args)->decltype(::cudaSymbolName(args...)) {     \
+    using FuncPtrT = std::add_pointer<decltype(::cudaSymbolName)>::type;       \
+    static FuncPtrT loaded = []() -> FuncPtrT {                                \
+      static const char *kName = TO_STR(cudaSymbolName);                       \
+      void *f;                                                                 \
+      auto s = stream_executor::port::Env::Default()->GetSymbolFromLibrary(    \
+          stream_executor::internal::CachedDsoLoader::GetCudaDriverDsoHandle() \
+              .ValueOrDie(),                                                   \
+          kName, &f);                                                          \
+      CHECK(s.ok()) << "could not find " << kName                              \
+                    << " in libcuda DSO; dlerror: " << s.error_message();      \
+      return reinterpret_cast<FuncPtrT>(f);                                    \
+    }();                                                                       \
+    return loaded(args...);                                                    \
   }
 #endif
 
diff --git a/tensorflow/stream_executor/cuda/cudart_stub.cc b/tensorflow/stream_executor/cuda/cudart_stub.cc
index cb35719183bc076a014d5c090379c3a3756b21be..8878700c5ea9b48f0bad2038d803e61c71313dad 100644
--- a/tensorflow/stream_executor/cuda/cudart_stub.cc
+++ b/tensorflow/stream_executor/cuda/cudart_stub.cc
@@ -21,19 +21,19 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/dso_loader.h"
 
 namespace {
-void *GetDsoHandle() {
-  static auto handle = [] {
-    void *result = nullptr;
-    using DsoLoader = stream_executor::internal::DsoLoader;
-    DsoLoader::GetLibcudartDsoHandle(&result).IgnoreError();
-    return result;
+void* GetDsoHandle() {
+  static auto handle = []() -> void* {
+    auto handle_or =
+        stream_executor::internal::DsoLoader::GetCudaRuntimeDsoHandle();
+    if (!handle_or.ok()) return nullptr;
+    return handle_or.ValueOrDie();
   }();
   return handle;
 }
 
 template <typename T>
-T LoadSymbol(const char *symbol_name) {
-  void *symbol = nullptr;
+T LoadSymbol(const char* symbol_name) {
+  void* symbol = nullptr;
   auto env = stream_executor::port::Env::Default();
   env->GetSymbolFromLibrary(GetDsoHandle(), symbol_name, &symbol).IgnoreError();
   return reinterpret_cast<T>(symbol);
@@ -41,28 +41,28 @@ T LoadSymbol(const char *symbol_name) {
 cudaError_t GetSymbolNotFoundError() {
   return cudaErrorSharedObjectSymbolNotFound;
 }
-const char *GetSymbolNotFoundStrError() {
+const char* GetSymbolNotFoundStrError() {
   return "cudaErrorSharedObjectSymbolNotFound";
 }
 }  // namespace
 
 // Code below is auto-generated.
 extern "C" {
-cudaError_t CUDART_CB cudaFree(void *devPtr) {
-  using FuncPtr = cudaError_t (*)(void *devPtr);
+cudaError_t CUDART_CB cudaFree(void* devPtr) {
+  using FuncPtr = cudaError_t (*)(void* devPtr);
   static auto func_ptr = LoadSymbol<FuncPtr>("cudaFree");
   if (!func_ptr) return GetSymbolNotFoundError();
   return func_ptr(devPtr);
 }
 
-cudaError_t CUDART_CB cudaGetDevice(int *device) {
-  using FuncPtr = cudaError_t (*)(int *device);
+cudaError_t CUDART_CB cudaGetDevice(int* device) {
+  using FuncPtr = cudaError_t (*)(int* device);
   static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDevice");
   if (!func_ptr) return GetSymbolNotFoundError();
   return func_ptr(device);
 }
 
-cudaError_t CUDART_CB cudaGetDeviceProperties(cudaDeviceProp *prop,
+cudaError_t CUDART_CB cudaGetDeviceProperties(cudaDeviceProp* prop,
                                               int device) {
   using FuncPtr = cudaError_t (*)(cudaDeviceProp * prop, int device);
   static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceProperties");
@@ -70,8 +70,8 @@ cudaError_t CUDART_CB cudaGetDeviceProperties(cudaDeviceProp *prop,
   return func_ptr(prop, device);
 }
 
-const char *CUDART_CB cudaGetErrorString(cudaError_t error) {
-  using FuncPtr = const char *(*)(cudaError_t error);
+const char* CUDART_CB cudaGetErrorString(cudaError_t error) {
+  using FuncPtr = const char* (*)(cudaError_t error);
   static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorString");
   if (!func_ptr) return GetSymbolNotFoundStrError();
   return func_ptr(error);
@@ -86,29 +86,36 @@ cudaError_t CUDART_CB cudaSetDevice(int device) {
 
 cudaError_t CUDART_CB cudaStreamAddCallback(cudaStream_t stream,
                                             cudaStreamCallback_t callback,
-                                            void *userData,
+                                            void* userData,
                                             unsigned int flags) {
   using FuncPtr =
       cudaError_t (*)(cudaStream_t stream, cudaStreamCallback_t callback,
-                      void *userData, unsigned int flags);
+                      void* userData, unsigned int flags);
   static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAddCallback");
   if (!func_ptr) return GetSymbolNotFoundError();
   return func_ptr(stream, callback, userData, flags);
 }
 
-cudaError_t CUDART_CB cudaGetDeviceCount(int *count) {
-  using FuncPtr = cudaError_t (*)(int *count);
+cudaError_t CUDART_CB cudaGetDeviceCount(int* count) {
+  using FuncPtr = cudaError_t (*)(int* count);
   static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceCount");
   if (!func_ptr) return GetSymbolNotFoundError();
   return func_ptr(count);
 }
 
 cudaError_t CUDART_CB cudaPointerGetAttributes(
-    struct cudaPointerAttributes *attributes, const void *ptr) {
+    struct cudaPointerAttributes* attributes, const void* ptr) {
   using FuncPtr = cudaError_t (*)(struct cudaPointerAttributes * attributes,
-                                  const void *ptr);
+                                  const void* ptr);
   static auto func_ptr = LoadSymbol<FuncPtr>("cudaPointerGetAttributes");
   if (!func_ptr) return GetSymbolNotFoundError();
   return func_ptr(attributes, ptr);
 }
+
+cudaError_t CUDART_CB cudaGetLastError() {
+  using FuncPtr = cudaError_t (*)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetLastError");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr();
+}
 }  // extern "C"
diff --git a/tensorflow/stream_executor/gpu/BUILD b/tensorflow/stream_executor/gpu/BUILD
index 78fe478679bb1abf20fc66120739c828caac7585..e6812389e3eafc40365861722be3edb414dd05c7 100644
--- a/tensorflow/stream_executor/gpu/BUILD
+++ b/tensorflow/stream_executor/gpu/BUILD
@@ -25,7 +25,7 @@ filegroup(
 
 cc_library(
     name = "gpu_activation_header",
-    hdrs = if_gpu_is_configured(["gpu_activation.h"]),
+    hdrs = ["gpu_activation.h"],
     deps = ["//tensorflow/stream_executor/platform"],
 )
 
@@ -33,13 +33,13 @@ cc_library(
     name = "gpu_activation",
     srcs = if_gpu_is_configured(["gpu_activation.cc"]),
     hdrs = if_gpu_is_configured(["gpu_activation.h"]),
-    deps = [
+    deps = if_gpu_is_configured([
         ":gpu_activation_header",
         ":gpu_driver_header",
         "//tensorflow/stream_executor",
         "//tensorflow/stream_executor:stream_executor_internal",
         "//tensorflow/stream_executor/platform",
-    ],
+    ]),
 )
 
 cc_library(
diff --git a/tensorflow/stream_executor/logging.proto b/tensorflow/stream_executor/logging.proto
index 68021d2b3157ceeaabd0d0a2065bc946913f64c4..3351752788c610cdcb2baa15b83f9fd0f7492b5c 100644
--- a/tensorflow/stream_executor/logging.proto
+++ b/tensorflow/stream_executor/logging.proto
@@ -15,6 +15,18 @@ message ComputeCapability {
   int32 minor = 2;
 }
 
+// NOTE: this proto is temporarily duplicated in other places, outside of
+// stream_executor. The plan is to move all custom logging (tensorflow::Logger
+// related) behavior out of StreamExecutor. There are two reasons:
+// * Technical: stream_executor is part of libtensorflow_framework.so. It's
+//   extremely hard to have a single definition of the protos in the .so, and
+//   let the callers call into those definitions. The complication lives in
+//   cc_proto_library where we have a header-only version and impl version.
+// * Functional: we want to log autotuning stats from the callers. The
+//   autotuning stats are not available in SE.
+//
+// TODO(timshen): remove this proto once both XLA and TF log autotuning
+// results.
 message CudaInfo {
   CudnnVersion cudnn_version = 1;
   ComputeCapability compute_capability = 2;
diff --git a/tensorflow/stream_executor/platform/default/dso_loader.cc b/tensorflow/stream_executor/platform/default/dso_loader.cc
index 8592455860376ab1b7d764f52b61696f32eb7728..a2a9ad67534921c3349dbccaa2f6843bfadc787f 100644
--- a/tensorflow/stream_executor/platform/default/dso_loader.cc
+++ b/tensorflow/stream_executor/platform/default/dso_loader.cc
@@ -12,298 +12,175 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/stream_executor/platform/default/dso_loader.h"
 
-// TODO(jhen): Replace hardcoded, platform specific path strings in GetXXXPath()
-// with a function in e.g. cuda.h.
-
-#include <limits.h>
 #include <stdlib.h>
-#include <initializer_list>
-#include <vector>
 
 #include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "cuda/cuda_config.h"
 #include "tensorflow/core/platform/load_library.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/path.h"
-#include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
-#include "tensorflow/stream_executor/platform/default/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-#if !defined(PLATFORM_GOOGLE)
-#include "absl/strings/string_view.h"
-#include "cuda/cuda_config.h"
-#endif
-
 namespace stream_executor {
 namespace internal {
 
+namespace {
 string GetCudaVersion() { return TF_CUDA_VERSION; }
 string GetCudnnVersion() { return TF_CUDNN_VERSION; }
 
-/* static */ port::Status DsoLoader::GetCublasDsoHandle(void** dso_handle) {
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "cublas", GetCudaVersion()),
-                                  GetCudaLibraryDirPath()),
-                      dso_handle);
+port::StatusOr<void*> GetDsoHandle(const string& name, const string& version) {
+  auto filename = port::Env::Default()->FormatLibraryFileName(name, version);
+  void* dso_handle;
+  port::Status status =
+      port::Env::Default()->LoadLibrary(filename.c_str(), &dso_handle);
+  if (status.ok()) {
+    LOG(INFO) << "Successfully opened dynamic library " << filename;
+    return dso_handle;
+  }
+
+  auto message = absl::StrCat("Could not dlopen library '", filename,
+                              "'; dlerror: ", status.error_message());
+#if !defined(PLATFORM_WINDOWS)
+  if (const char* ld_library_path = getenv("LD_LIBRARY_PATH")) {
+    message += absl::StrCat("; LD_LIRARY_PATH: ", ld_library_path);
+  }
+#endif
+  LOG(INFO) << message;
+  return port::Status(port::error::FAILED_PRECONDITION, message);
+}
+}  // namespace
+
+namespace DsoLoader {
+port::StatusOr<void*> GetCudaDriverDsoHandle() {
+#if defined(PLATFORM_WINDOWS)
+  return GetDsoHandle("nvcuda", "");
+#elif defined(__APPLE__)
+  // On Mac OS X, CUDA sometimes installs libcuda.dylib instead of
+  // libcuda.1.dylib.
+  auto handle_or = GetDsoHandle("cuda", "");
+  if (handle_or.ok()) {
+    return handle_or;
+  }
+#endif
+  return GetDsoHandle("cuda", "1");
 }
 
-/* static */ port::Status DsoLoader::GetCudnnDsoHandle(void** dso_handle) {
-  // libcudnn is versioned differently than the other libraries and may have a
-  // different version number than other CUDA libraries.  See b/22397368 for
-  // some details about the complications surrounding this.
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "cudnn", GetCudnnVersion()),
-                                  GetCudaLibraryDirPath()),
-                      dso_handle);
+port::StatusOr<void*> GetCudaRuntimeDsoHandle() {
+  return GetDsoHandle("cudart", GetCudaVersion());
 }
 
-/* static */ port::Status DsoLoader::GetCufftDsoHandle(void** dso_handle) {
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "cufft", GetCudaVersion()),
-                                  GetCudaLibraryDirPath()),
-                      dso_handle);
+port::StatusOr<void*> GetCublasDsoHandle() {
+  return GetDsoHandle("cublas", GetCudaVersion());
 }
 
-/* static */ port::Status DsoLoader::GetCurandDsoHandle(void** dso_handle) {
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "curand", GetCudaVersion()),
-                                  GetCudaLibraryDirPath()),
-                      dso_handle);
+port::StatusOr<void*> GetCufftDsoHandle() {
+  return GetDsoHandle("cufft", GetCudaVersion());
 }
 
-/* static */ port::Status DsoLoader::GetLibcudaDsoHandle(void** dso_handle) {
-#if defined(PLATFORM_WINDOWS)
-  return GetDsoHandle(
-      FindDsoPath(port::Env::Default()->FormatLibraryFileName("nvcuda", ""),
-                  GetCudaDriverLibraryPath()),
-      dso_handle);
-#else
-  port::Status status = GetDsoHandle(
-      FindDsoPath(port::Env::Default()->FormatLibraryFileName("cuda", "1"),
-                  GetCudaDriverLibraryPath()),
-      dso_handle);
-#if defined(__APPLE__)
-  // On Mac OS X, CUDA sometimes installs libcuda.dylib instead of
-  // libcuda.1.dylib.
-  return status.ok()
-             ? status
-             : GetDsoHandle(
-                   FindDsoPath(
-                       port::Env::Default()->FormatLibraryFileName("cuda", ""),
-                       GetCudaDriverLibraryPath()),
-                   dso_handle);
-#else
-  return status;
-#endif
-#endif
+port::StatusOr<void*> GetCurandDsoHandle() {
+  return GetDsoHandle("curand", GetCudaVersion());
 }
 
-/* static */ port::Status DsoLoader::GetLibcuptiDsoHandle(void** dso_handle) {
+port::StatusOr<void*> GetCuptiDsoHandle() {
 #if defined(ANDROID_TEGRA)
   // On Android devices the CUDA version number is not added to the library
   // name.
-  return GetDsoHandle(
-      FindDsoPath(port::Env::Default()->FormatLibraryFileName("cupti", ""),
-                  GetCudaCuptiLibraryPath()),
-      dso_handle);
+  return GetDsoHandle("cupti", "");
 #else
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "cupti", GetCudaVersion()),
-                                  GetCudaCuptiLibraryPath()),
-                      dso_handle);
+  return GetDsoHandle("cupti", GetCudaVersion());
 #endif
 }
 
-/* static */ port::Status DsoLoader::GetLibcudartDsoHandle(void** dso_handle) {
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "cudart", GetCudaVersion()),
-                                  GetCudaLibraryDirPath()),
-                      dso_handle);
+port::StatusOr<void*> GetCudnnDsoHandle() {
+  return GetDsoHandle("cudnn", GetCudnnVersion());
 }
 
-static mutex& GetRpathMutex() {
-  static mutex* mu = new mutex;
-  return *mu;
+port::StatusOr<void*> GetRocblasDsoHandle() {
+  return GetDsoHandle("rocblas", "");
 }
 
-/* static */ void DsoLoader::RegisterRpath(absl::string_view path) {
-  mutex_lock lock{GetRpathMutex()};
-  GetRpaths()->emplace_back(path);
+port::StatusOr<void*> GetMiopenDsoHandle() {
+  return GetDsoHandle("MIOpen", "");
 }
 
-/* static */ port::Status DsoLoader::GetDsoHandle(absl::string_view path,
-                                                  void** dso_handle,
-                                                  LoadKind load_kind) {
-  if (load_kind != LoadKind::kLocal) {
-    return port::Status(port::error::INVALID_ARGUMENT,
-                        "Only LoadKind::kLocal is currently supported");
-  }
-  string path_string(path);
-  port::Status s =
-      port::Env::Default()->LoadLibrary(path_string.c_str(), dso_handle);
-  if (!s.ok()) {
-#if !defined(PLATFORM_WINDOWS)
-    char* ld_library_path = getenv("LD_LIBRARY_PATH");
-#endif
-    LOG(INFO) << "Couldn't open CUDA library " << path
-#if !defined(PLATFORM_WINDOWS)
-              << ". LD_LIBRARY_PATH: "
-              << (ld_library_path != nullptr ? ld_library_path : "")
-#endif
-        ;
-    return port::Status(port::error::FAILED_PRECONDITION,
-                        absl::StrCat("could not dlopen DSO: ", path,
-                                     "; dlerror: ", s.error_message()));
-  }
-  LOG(INFO) << "successfully opened CUDA library " << path << " locally";
-  return port::Status::OK();
+port::StatusOr<void*> GetRocfftDsoHandle() {
+  return GetDsoHandle("rocfft", "");
 }
 
-/* static */ string DsoLoader::GetBinaryDirectory(bool strip_executable_name) {
-  string exe_path = port::Env::Default()->GetExecutablePath();
-  return strip_executable_name ? string(port::Dirname(exe_path)) : exe_path;
+port::StatusOr<void*> GetRocrandDsoHandle() {
+  return GetDsoHandle("rocrand", "");
 }
 
-// Creates a heap-allocated vector for initial rpaths.
-// Ownership is transferred to the caller.
-static std::vector<string>* CreatePrimordialRpaths() {
-  auto rpaths = new std::vector<string>;
-#if defined(__APPLE__)
-  rpaths->push_back("driver/driver_sh.runfiles/local_config_cuda/cuda/lib");
-#else
-  rpaths->push_back("driver/driver_sh.runfiles/local_config_cuda/cuda/lib64");
-#endif
-  return rpaths;
-}
+port::StatusOr<void*> GetHipDsoHandle() { return GetDsoHandle("hip_hcc", ""); }
 
-/* static */ std::vector<string>* DsoLoader::GetRpaths() {
-  static std::vector<string>* rpaths = CreatePrimordialRpaths();
-  return rpaths;
-}
+}  // namespace DsoLoader
 
-/* static */ bool DsoLoader::TrySymbolicDereference(string* candidate) {
-#if defined(PLATFORM_WINDOWS)
-  return false;
-#else
-  char buf[PATH_MAX];
-  char* result = realpath(candidate->c_str(), buf);
-  if (result == nullptr) {
-    return false;
-  }
-  VLOG(3) << "realpath resolved candidate path \"" << *candidate << "\" to \""
-          << result << "\"";
-  *candidate = result;
-  return true;
-#endif
+namespace CachedDsoLoader {
+port::StatusOr<void*> GetCudaDriverDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCudaDriverDsoHandle());
+  return *result;
 }
 
-/* static */ string DsoLoader::FindDsoPath(absl::string_view library_name,
-                                           absl::string_view runfiles_relpath) {
-  // Keep a record of the paths we attempted so we can dump out meaningful
-  // diagnostics if no path is found.
-  std::vector<string> attempted;
-
-  using StringPieces = std::vector<absl::string_view>;
-  string candidate;
-
-  // Otherwise, try binary-plus-rpath locations.
-  string binary_directory =
-      GetBinaryDirectory(true /* = strip_executable_name */);
-  mutex_lock lock{GetRpathMutex()};
-  for (const string& rpath : *GetRpaths()) {
-    candidate =
-        port::Join(StringPieces{binary_directory, rpath, library_name}, "/");
-    if (TrySymbolicDereference(&candidate)) {
-      return candidate;
-    }
-  }
-  attempted.push_back(candidate);
-
-  return string(library_name);
+port::StatusOr<void*> GetCudaRuntimeDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCudaRuntimeDsoHandle());
+  return *result;
 }
 
-/* static */ string DsoLoader::GetCudaLibraryDirPath() {
-#if defined(__APPLE__)
-  return "external/local_config_cuda/cuda/lib";
-#else
-  return "external/local_config_cuda/cuda/lib64";
-#endif
+port::StatusOr<void*> GetCublasDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCublasDsoHandle());
+  return *result;
 }
 
-/* static */ string DsoLoader::GetCudaDriverLibraryPath() {
-#if defined(__APPLE__)
-  return "external/local_config_cuda/cuda/driver/lib";
-#elif defined(PLATFORM_WINDOWS)
-  return "";
-#else
-  return "external/local_config_cuda/cuda/driver/lib64";
-#endif
+port::StatusOr<void*> GetCurandDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCurandDsoHandle());
+  return *result;
 }
 
-/* static */ string DsoLoader::GetCudaCuptiLibraryPath() {
-#if defined(__APPLE__)
-  return "external/local_config_cuda/cuda/extras/CUPTI/lib";
-#else
-  return "external/local_config_cuda/cuda/extras/CUPTI/lib64";
-#endif
+port::StatusOr<void*> GetCufftDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCufftDsoHandle());
+  return *result;
 }
 
-// -- CachedDsoLoader
-
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetCublasDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetCublasDsoHandle);
-  return result;
+port::StatusOr<void*> GetCuptiDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCuptiDsoHandle());
+  return *result;
 }
 
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetCurandDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetCurandDsoHandle);
-  return result;
+port::StatusOr<void*> GetCudnnDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCudnnDsoHandle());
+  return *result;
 }
 
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetCudnnDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetCudnnDsoHandle);
+port::StatusOr<void*> GetRocblasDsoHandle() {
+  static auto result = new auto(DsoLoader::GetRocblasDsoHandle());
   return result;
 }
 
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetCufftDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetCufftDsoHandle);
+port::StatusOr<void*> GetMiopenDsoHandle() {
+  static auto result = new auto(DsoLoader::GetMiopenDsoHandle());
   return result;
 }
 
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetLibcudaDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetLibcudaDsoHandle);
+port::StatusOr<void*> GetRocfftDsoHandle() {
+  static auto result = new auto(DsoLoader::GetRocfftDsoHandle());
   return result;
 }
 
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetLibcuptiDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetLibcuptiDsoHandle);
+port::StatusOr<void*> GetRocrandDsoHandle() {
+  static auto result = new auto(DsoLoader::GetRocrandDsoHandle());
   return result;
 }
 
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetLibcudartDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetLibcudartDsoHandle);
+port::StatusOr<void*> GetHipDsoHandle() {
+  static auto result = new auto(DsoLoader::GetHipDsoHandle());
   return result;
 }
 
-/* static */ port::StatusOr<void*> CachedDsoLoader::FetchHandleResult(
-    std::function<port::Status(void**)> load_dso) {
-  void* handle;
-  auto status = load_dso(&handle);
-  if (!status.ok()) {
-    return status;
-  }
-  return handle;
-}
-
+}  // namespace CachedDsoLoader
 }  // namespace internal
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/platform/default/dso_loader.h b/tensorflow/stream_executor/platform/default/dso_loader.h
index 92c0db70374ceda2f68189d8f34b130775794725..89f23324dcdcfc0ca3d9d8c1382b566fcd1fd79e 100644
--- a/tensorflow/stream_executor/platform/default/dso_loader.h
+++ b/tensorflow/stream_executor/platform/default/dso_loader.h
@@ -31,87 +31,42 @@ limitations under the License.
 namespace stream_executor {
 namespace internal {
 
-// Permits StreamExecutor code to dynamically load a pre-determined set of
-// relevant DSOs via dlopen.
-//
-// Thread-safe.
-class DsoLoader {
- public:
-  // The following methods either load the DSO of interest and return a dlopen
-  // handle or error status in the canonical namespace.
-
-  static port::Status GetCublasDsoHandle(void** dso_handle);
-  static port::Status GetCudnnDsoHandle(void** dso_handle);
-  static port::Status GetCufftDsoHandle(void** dso_handle);
-  static port::Status GetCurandDsoHandle(void** dso_handle);
-  static port::Status GetLibcudaDsoHandle(void** dso_handle);
-  static port::Status GetLibcuptiDsoHandle(void** dso_handle);
-  static port::Status GetLibcudartDsoHandle(void** dso_handle);
-
-  // Registers a new binary-relative path to use as a dlopen search path.
-  static void RegisterRpath(absl::string_view path);
-
- private:
-  // Registered rpaths (singleton vector) and a mutex that guards it.
-  static std::vector<string>* GetRpaths();
-
-  // Descriptive boolean wrapper to indicate whether symbols are made available
-  // to resolve in later-loaded libraries.
-  enum class LoadKind { kLocal, kGlobal };
-
-  // Loads a DSO from the given "path" (which can technically be any dlopen-able
-  // name). If the load kind is global, the symbols in the loaded DSO are
-  // visible to subsequent DSO loading operations.
-  static port::Status GetDsoHandle(absl::string_view path, void** dso_handle,
-                                   LoadKind load_kind = LoadKind::kLocal);
-
-  // Returns the binary directory (or binary path) associated with the currently
-  // executing program. If strip_executable_name is true, the executable file is
-  // stripped off of the path.
-  static string GetBinaryDirectory(bool strip_executable_name);
-
-  // Invokes realpath on the original path; updates candidate and returns true
-  // if it succeeds (i.e. a file exists at the path); otherwise, returns false.
-  static bool TrySymbolicDereference(string* candidate);
-
-  // Attempts to find a path to the DSO of interest, otherwise returns the
-  // bare library name:
-  // Arguments:
-  //   library_name: the filename in tree; e.g. libOpenCL.so.1.0.0
-  //   runfiles_relpath: where to look for the library relative to the runfiles
-  //      root; e.g. third_party/gpus/cuda/lib64
-  static string FindDsoPath(absl::string_view library_name,
-                            absl::string_view runfiles_relpath);
-
-  // Return platform dependent paths for DSOs
-  static string GetCudaLibraryDirPath();
-  static string GetCudaDriverLibraryPath();
-  static string GetCudaCuptiLibraryPath();
-
-  SE_DISALLOW_COPY_AND_ASSIGN(DsoLoader);
-};
+namespace DsoLoader {
+// The following methods either load the DSO of interest and return a dlopen
+// handle or error status.
+port::StatusOr<void*> GetCudaDriverDsoHandle();
+port::StatusOr<void*> GetCudaRuntimeDsoHandle();
+port::StatusOr<void*> GetCublasDsoHandle();
+port::StatusOr<void*> GetCufftDsoHandle();
+port::StatusOr<void*> GetCurandDsoHandle();
+port::StatusOr<void*> GetCuptiDsoHandle();
+port::StatusOr<void*> GetCudnnDsoHandle();
+
+port::StatusOr<void*> GetRocblasDsoHandle();
+port::StatusOr<void*> GetMiopenDsoHandle();
+port::StatusOr<void*> GetRocfftDsoHandle();
+port::StatusOr<void*> GetRocrandDsoHandle();
+port::StatusOr<void*> GetHipDsoHandle();
+}  // namespace DsoLoader
 
 // Wrapper around the DsoLoader that prevents us from dlopen'ing any of the DSOs
 // more than once.
-class CachedDsoLoader {
- public:
-  // Cached versions of the corresponding DsoLoader methods above.
-  static port::StatusOr<void*> GetCublasDsoHandle();
-  static port::StatusOr<void*> GetCudnnDsoHandle();
-  static port::StatusOr<void*> GetCufftDsoHandle();
-  static port::StatusOr<void*> GetCurandDsoHandle();
-  static port::StatusOr<void*> GetLibcudaDsoHandle();
-  static port::StatusOr<void*> GetLibcuptiDsoHandle();
-  static port::StatusOr<void*> GetLibcudartDsoHandle();
-
- private:
-  // Fetches a DSO handle via "load_dso" and returns the StatusOr form of the
-  // result.
-  static port::StatusOr<void*> FetchHandleResult(
-      std::function<port::Status(void**)> load_dso);
-
-  SE_DISALLOW_COPY_AND_ASSIGN(CachedDsoLoader);
-};
+namespace CachedDsoLoader {
+// Cached versions of the corresponding DsoLoader methods above.
+port::StatusOr<void*> GetCudaDriverDsoHandle();
+port::StatusOr<void*> GetCudaRuntimeDsoHandle();
+port::StatusOr<void*> GetCublasDsoHandle();
+port::StatusOr<void*> GetCufftDsoHandle();
+port::StatusOr<void*> GetCurandDsoHandle();
+port::StatusOr<void*> GetCuptiDsoHandle();
+port::StatusOr<void*> GetCudnnDsoHandle();
+
+port::StatusOr<void*> GetRocblasDsoHandle();
+port::StatusOr<void*> GetMiopenDsoHandle();
+port::StatusOr<void*> GetRocfftDsoHandle();
+port::StatusOr<void*> GetRocrandDsoHandle();
+port::StatusOr<void*> GetHipDsoHandle();
+}  // namespace CachedDsoLoader
 
 }  // namespace internal
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/BUILD b/tensorflow/stream_executor/rocm/BUILD
index dd08b36308c58e0f4cc941a6c33c8e2a147559b5..5190b551f801d75f809277351b72b9ca6b9cc522 100644
--- a/tensorflow/stream_executor/rocm/BUILD
+++ b/tensorflow/stream_executor/rocm/BUILD
@@ -33,7 +33,7 @@ filegroup(
 cc_library(
     name = "rocm_diagnostics",
     srcs = if_rocm_is_configured(["rocm_diagnostics.cc"]),
-    hdrs = [],
+    hdrs = if_rocm_is_configured(["rocm_diagnostics.h"]),
     deps = if_rocm_is_configured([
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
@@ -47,7 +47,7 @@ cc_library(
 cc_library(
     name = "rocm_driver",
     srcs = if_rocm_is_configured(["rocm_driver.cc"]),
-    hdrs = [],
+    hdrs = if_rocm_is_configured(["rocm_driver_wrapper.h"]),
     deps = if_rocm_is_configured([
         ":rocm_diagnostics",
         "@com_google_absl//absl/base",
@@ -57,6 +57,7 @@ cc_library(
         "//tensorflow/stream_executor/gpu:gpu_driver_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
         "@local_config_rocm//rocm:rocm_headers",
     ]),
 )
@@ -140,55 +141,62 @@ cc_library(
     deps = ["//tensorflow/stream_executor:platform"],
 )
 
-# FIXME: enable in future PRs
-#cc_library(
-#    name = "rocblas_plugin",
-#    srcs = ["rocm_blas.cc"],
-#    hdrs = ["rocm_blas.h"],
-#    visibility = ["//visibility:public"],
-#    deps = [
-#        ":rocm_gpu_executor",
-#        ":rocm_platform_id",
-#        "//third_party/eigen3",
-#        "//tensorflow/core:lib_internal",
-#        "//tensorflow/stream_executor",
-#        "//tensorflow/stream_executor:event",
-#        "//tensorflow/stream_executor:host_or_device_scalar",
-#        "//tensorflow/stream_executor:plugin_registry",
-#        "//tensorflow/stream_executor:scratch_allocator",
-#        "//tensorflow/stream_executor:timer",
-#        "//tenosrflow/stream_executor/gpu:gpu_activation_header",
-#        "//tenosrflow/stream_executor/gpu:gpu_stream_header",
-#        "//tenosrflow/stream_executor/gpu:gpu_timer_header",
-#        "//tensorflow/stream_executor/lib",
-#        "//tensorflow/stream_executor/platform",
-#        "//tensorflow/stream_executor/platform:dso_loader",
-#        "@com_google_absl//absl/strings",
-#        "@local_config_rocm//rocm:rocm_headers",
-#    ] + if_static(["@local_config_rocm//rocm:rocblas"]),
-#    alwayslink = True,
-#)
+cc_library(
+    name = "rocblas_plugin",
+    srcs = if_rocm_is_configured(["rocm_blas.cc"]),
+    hdrs = if_rocm_is_configured(["rocm_blas.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        ":rocm_gpu_executor",
+        ":rocm_platform_id",
+        "//third_party/eigen3",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:host_or_device_scalar",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/gpu:gpu_activation",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/gpu:gpu_timer_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+        "@com_google_absl//absl/strings",
+        "@local_config_rocm//rocm:rocm_headers",
+    ] + if_static([
+        "@local_config_rocm//rocm:rocblas",
+    ])),
+    alwayslink = True,
+)
 
-# FIXME: enable in future PRs
-#cc_library(
-#    name = "rocfft_plugin",
-#    srcs = ["rocm_fft.cc"],
-#    hdrs = [],
-#    visibility = ["//visibility:public"],
-#    deps = [
-#        ":rocm_platform_id",
-#        "//tensorflow/stream_executor:event",
-#        "//tensorflow/stream_executor:fft",
-#        "//tensorflow/stream_executor:plugin_registry",
-#        "//tensorflow/stream_executor:scratch_allocator",
-#        "//tenosrflow/stream_executor/gpu:gpu_stream_header",
-#        "//tensorflow/stream_executor/lib",
-#        "//tensorflow/stream_executor/platform",
-#        "//tensorflow/stream_executor/platform:dso_loader",
-#        "@local_config_rocm//rocm:rocm_headers",
-#    ] + if_static(["@local_config_rocm//rocm:rocfft"]),
-#    alwayslink = True,
-#)
+cc_library(
+    name = "rocfft_plugin",
+    srcs = if_rocm_is_configured(["rocm_fft.cc"]),
+    hdrs = if_rocm_is_configured(["rocm_fft.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        ":rocm_platform_id",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:fft",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor/gpu:gpu_activation",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+        "@local_config_rocm//rocm:rocm_headers",
+    ] + if_static([
+        "@local_config_rocm//rocm:rocfft",
+    ])),
+    alwayslink = True,
+)
 
 # FIXME: enable in future PRs
 #cc_library(
@@ -229,26 +237,30 @@ cc_library(
 #    alwayslink = True,
 #)
 
-# FIXME: enable in future PRs
-#cc_library(
-#    name = "rocrand_plugin",
-#    srcs = ["rocm_rng.cc"],
-#    hdrs = [],
-#    deps = [
-#        ":rocm_gpu_executor",
-#        ":rocm_platform_id",
-#        "@local_config_rocm//rocm:rocm_headers",
-#        "//tensorflow/stream_executor:event",
-#        "//tensorflow/stream_executor:plugin_registry",
-#        "//tensorflow/stream_executor:rng",
-#        "//tenosrflow/stream_executor/gpu:gpu_activation_header",
-#        "//tenosrflow/stream_executor/gpu:gpu_stream_header",
-#        "//tensorflow/stream_executor/lib",
-#        "//tensorflow/stream_executor/platform",
-#        "//tensorflow/stream_executor/platform:dso_loader",
-#    ] + if_static(["@local_config_rocm//rocm:curand"]),
-#    alwayslink = True,
-#)
+cc_library(
+    name = "rocrand_plugin",
+    srcs = if_rocm_is_configured(["rocm_rng.cc"]),
+    hdrs = if_rocm_is_configured([]),
+    deps = if_rocm_is_configured([
+        ":rocm_gpu_executor",
+        ":rocm_platform_id",
+        "@local_config_rocm//rocm:rocm_headers",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:rng",
+        "//tensorflow/stream_executor/gpu:gpu_activation_header",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
+        "//tensorflow/stream_executor/gpu:gpu_rng_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + if_static([
+        "@local_config_rocm//rocm:hiprand",
+    ])),
+    alwayslink = True,
+)
 
 cc_library(
     name = "all_runtime",
@@ -257,9 +269,9 @@ cc_library(
     deps = if_rocm_is_configured([
         # FIXME: enable in future PRs
         #":miopen_plugin",
-        #":rocfft_plugin",
-        #":rocblas_plugin",
-        #":rocrand_plugin",
+        ":rocfft_plugin",
+        ":rocblas_plugin",
+        ":rocrand_plugin",
         ":rocm_driver",
         ":rocm_platform",
     ]),
diff --git a/tensorflow/stream_executor/rocm/rocm_blas.cc b/tensorflow/stream_executor/rocm/rocm_blas.cc
new file mode 100644
index 0000000000000000000000000000000000000000..82b966644c573c542c3c59320b0cfe7149ee3dbb
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_blas.cc
@@ -0,0 +1,2374 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "rocm/include/rocblas.h"
+
+#include "tensorflow/stream_executor/rocm/rocm_blas.h"
+
+#define EIGEN_USE_GPU
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include <assert.h>
+#include <complex>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/gpu/gpu_timer.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/status_macros.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+#include "tensorflow/stream_executor/scratch_allocator.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+namespace stream_executor {
+namespace gpu {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kRocBlasPlugin);
+
+namespace wrap {
+
+#ifdef PLATFORM_GOOGLE
+#define STREAM_EXECUTOR_ROCBLAS_WRAP(__name)                       \
+  struct WrapperShim__##__name {                                   \
+    static const char *kName;                                      \
+    template <typename... Args>                                    \
+    rocblas_status operator()(GpuExecutor *parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};              \
+      return ::__name(args...);                                    \
+    }                                                              \
+  } __name;                                                        \
+  const char *WrapperShim__##__name::kName = #__name;
+
+#define STREAM_EXECUTOR_ROCBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_ROCBLAS_WRAP(__name)
+
+#else
+
+#define STREAM_EXECUTOR_ROCBLAS_WRAP(__name)                              \
+  struct DynLoadShim__##__name {                                          \
+    static const char *kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void *GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetRocblasDsoHandle();          \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void *f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in rocblas DSO; dlerror: " << s.error_message(); \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    rocblas_status operator()(GpuExecutor *parent, Args... args) {        \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char *DynLoadShim__##__name::kName = #__name;
+
+#define STREAM_EXECUTOR_ROCBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_ROCBLAS_WRAP(__name)
+
+#endif
+
+#define ROCBLAS_BLAS_ROUTINE_EACH(__macro)                                     \
+  __macro(rocblas_snrm2) __macro(rocblas_dnrm2) /*  __macro(rocblas_scnrm2)    \
+                                                  __macro(rocblas_dznrm2) */   \
+      __macro(rocblas_sdot)                                                    \
+          __macro(rocblas_ddot) /*  __macro(rocblas_cdotu)                     \
+                                  __macro(rocblas_cdotc)                       \
+                                  __macro(rocblas_zdotu)                       \
+                                  __macro(rocblas_zdotc)                    */ \
+      __macro(rocblas_sscal)                                                   \
+          __macro(rocblas_dscal) /*  __macro(rocblas_cscal)                    \
+                                   __macro(rocblas_csscal)                     \
+                                   __macro(rocblas_zscal)                      \
+                                   __macro(rocblas_zdscal) */                  \
+      __macro(rocblas_saxpy)                                                   \
+          __macro(rocblas_daxpy) /*  __macro(rocblas_caxpy)                    \
+                                   __macro(rocblas_zaxpy) */                   \
+      __macro(rocblas_scopy)                                                   \
+          __macro(rocblas_dcopy) /*  __macro(rocblas_ccopy)                    \
+                                   __macro(rocblas_zcopy) */                   \
+      __macro(rocblas_sswap)                                                   \
+          __macro(rocblas_dswap) /*  __macro(rocblas_cswap)                    \
+                                   __macro(rocblas_zswap) */                   \
+      __macro(rocblas_isamax)                                                  \
+          __macro(rocblas_idamax) /*  __macro(rocblas_icamax)                  \
+                                    __macro(rocblas_izamax) */                 \
+      __macro(rocblas_isamin)                                                  \
+          __macro(rocblas_idamin) /*  __macro(rocblas_icamin)                  \
+                                    __macro(rocblas_izamin) */                 \
+      __macro(rocblas_sasum)                                                   \
+          __macro(rocblas_dasum) /*  __macro(rocblas_scasum)                   \
+                                   __macro(rocblas_dzasum)                     \
+                                   __macro(rocblas_srot)                       \
+                                   __macro(rocblas_drot)                       \
+                                   __macro(rocblas_crot)                       \
+                                   __macro(rocblas_csrot)                      \
+                                   __macro(rocblas_zrot)                       \
+                                   __macro(rocblas_zdrot)                      \
+                                   __macro(rocblas_srotg)                      \
+                                   __macro(rocblas_drotg)                      \
+                                   __macro(rocblas_Crotg)                      \
+                                   __macro(rocblas_crotg)                      \
+                                   __macro(rocblas_zrotm)                      \
+                                   __macro(rocblas_drotm)                      \
+                                   __macro(rocblas_srotmg)                     \
+                                   __macro(rocblas_drotmg) */                  \
+      __macro(rocblas_sgemv)                                                   \
+          __macro(rocblas_dgemv) /*  __macro(rocblas_cgemv)                    \
+                                   __macro(rocblas_zgemv)                      \
+                                   __macro(rocblas_sgbmv)                      \
+                                   __macro(rocblas_dgbmv)                      \
+                                   __macro(rocblas_cgbmv)                      \
+                                   __macro(rocblas_zgbmv)                      \
+                                   __macro(rocblas_strmv)                      \
+                                   __macro(rocblas_dtrmv)                      \
+                                   __macro(rocblas_ctrmv)                      \
+                                   __macro(rocblas_ztrmv)                      \
+                                   __macro(rocblas_stbmv)                      \
+                                   __macro(rocblas_dtbmv)                      \
+                                   __macro(rocblas_ctbmv)                      \
+                                   __macro(rocblas_ztbmv)                      \
+                                   __macro(rocblas_stpmv)                      \
+                                   __macro(rocblas_dtpmv)                      \
+                                   __macro(rocblas_ctpmv)                      \
+                                   __macro(rocblas_ztpmv)                      \
+                                   __macro(rocblas_strsv)                      \
+                                   __macro(rocblas_dtrsv)                      \
+                                   __macro(rocblas_ctrsv)                      \
+                                   __macro(rocblas_ztrsv)                      \
+                                   __macro(rocblas_stpsv)                      \
+                                   __macro(rocblas_dtpsv)                      \
+                                   __macro(rocblas_ctpsv)                      \
+                                   __macro(rocblas_ztpsv)                      \
+                                   __macro(rocblas_stbsv)                      \
+                                   __macro(rocblas_dtbsv)                      \
+                                   __macro(rocblas_ctbsv)                      \
+                                   __macro(rocblas_ztbsv)                      \
+                                   __macro(rocblas_ssymv)                      \
+                                   __macro(rocblas_dsymv)                      \
+                                   __macro(rocblas_csymv)                      \
+                                   __macro(rocblas_zsymv)                      \
+                                   __macro(rocblas_chemv)                      \
+                                   __macro(rocblas_zhemv)                      \
+                                   __macro(rocblas_ssbmv)                      \
+                                   __macro(rocblas_dsbmv)                      \
+                                   __macro(rocblas_chbmv)                      \
+                                   __macro(rocblas_zhbmv)                      \
+                                   __macro(rocblas_sspmv)                      \
+                                   __macro(rocblas_dspmv)                      \
+                                   __macro(rocblas_chpmv)                      \
+                                   __macro(rocblas_zhpmv) */                   \
+      __macro(rocblas_sger)                                                    \
+          __macro(rocblas_dger) /*  __macro(rocblas_cgeru)                     \
+                                  __macro(rocblas_cgerc)                       \
+                                  __macro(rocblas_zgeru)                       \
+                                  __macro(rocblas_zgerc)                    */ \
+      __macro(rocblas_ssyr)                                                    \
+          __macro(rocblas_dsyr) /*  __macro(rocblas_csyr)                      \
+                                  __macro(rocblas_zsyr)                        \
+                                  __macro(rocblas_cher)                        \
+                                  __macro(rocblas_zher)                        \
+                                  __macro(rocblas_sspr)                        \
+                                  __macro(rocblas_dspr)                        \
+                                  __macro(rocblas_chpr)                        \
+                                  __macro(rocblas_zhpr)                        \
+                                  __macro(rocblas_ssyr2)                       \
+                                  __macro(rocblas_dsyr2)                       \
+                                  __macro(rocblas_csyr2)                       \
+                                  __macro(rocblas_zsyr2)                       \
+                                  __macro(rocblas_cher2)                       \
+                                  __macro(rocblas_zher2)                       \
+                                  __macro(rocblas_sspr2)                       \
+                                  __macro(rocblas_dspr2)                       \
+                                  __macro(rocblas_chpr2)                       \
+                                  __macro(rocblas_zhpr2)                    */ \
+      __macro(rocblas_sgemm) __macro(rocblas_dgemm)                            \
+          __macro(rocblas_hgemm) /*  __macro(rocblas_cgemm)                    \
+                                   __macro(rocblas_zgemm)                      \
+                                   __macro(rocblas_ssyrk)                      \
+                                   __macro(rocblas_dsyrk)                      \
+                                   __macro(rocblas_csyrk)                      \
+                                   __macro(rocblas_zsyrk)                      \
+                                   __macro(rocblas_cherk)                      \
+                                   __macro(rocblas_zherk)                      \
+                                   __macro(rocblas_ssyr2k)                     \
+                                   __macro(rocblas_dsyr2k)                     \
+                                   __macro(rocblas_csyr2k)                     \
+                                   __macro(rocblas_zsyr2k)                     \
+                                   __macro(rocblas_cher2k)                     \
+                                   __macro(rocblas_zher2k)                     \
+                                   __macro(rocblas_ssyrkx)                     \
+                                   __macro(rocblas_dsyrkx)                     \
+                                   __macro(rocblas_csyrkx)                     \
+                                   __macro(rocblas_zsyrkx)                     \
+                                   __macro(rocblas_cherkx)                     \
+                                   __macro(rocblas_zherkx)                     \
+                                   __macro(rocblas_ssymm)                      \
+                                   __macro(rocblas_dsymm)                      \
+                                   __macro(rocblas_csymm)                      \
+                                   __macro(rocblas_zsymm)                      \
+                                   __macro(rocblas_chemm)                      \
+                                   __macro(rocblas_zhemm) */                   \
+      __macro(rocblas_strsm)                                                   \
+          __macro(rocblas_dtrsm) /*  __macro(rocblas_ctrsm)                    \
+                                   __macro(rocblas_ztrsm)                      \
+                                   __macro(rocblas_strmm)                      \
+                                   __macro(rocblas_dtrmm)                      \
+                                   __macro(rocblas_ctrmm)                      \
+                                   __macro(rocblas_ztrmm) */                   \
+      __macro(rocblas_sgeam)                                                   \
+          __macro(rocblas_dgeam) /*  __macro(rocblas_cgeam)                    \
+                                   __macro(rocblas_zgeam)                      \
+                                   __macro(rocblas_sdgmm)                      \
+                                   __macro(rocblas_ddgmm)                      \
+                                   __macro(rocblas_cdgmm)                      \
+                                   __macro(rocblas_zdgmm) */
+
+STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_create_handle)
+STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_destroy_handle)
+STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_set_stream)
+// STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_set_pointer_mode)
+// STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_get_pointer_mode)
+// STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_sgemm_batched)
+STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_hgemm_strided_batched)
+STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_sgemm_strided_batched)
+// STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_dgemm_batched)
+STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_dgemm_strided_batched)
+// STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_cgemm_batched)
+// STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_zgemm_batched)
+ROCBLAS_BLAS_ROUTINE_EACH(STREAM_EXECUTOR_ROCBLAS_V2_WRAP)
+
+}  // namespace wrap
+
+static string ToString(rocblas_status status) {
+  switch (status) {
+    case rocblas_status_success:
+      return "rocblas_status_success";
+    case rocblas_status_invalid_handle:
+      return "rocblas_status_invalid_handle";
+    case rocblas_status_not_implemented:
+      return "rocblas_status_not_implemented";
+    case rocblas_status_invalid_pointer:
+      return "rocblas_status_invalid_pointer";
+    case rocblas_status_invalid_size:
+      return "rocblas_status_invalid_size";
+    case rocblas_status_memory_error:
+      return "rocblas_status_memory_error";
+    case rocblas_status_internal_error:
+      return "rocblas_status_internal_error";
+    default:
+      return absl::StrCat("<invalid rocBLAS status: ", status, ">");
+  }
+}
+
+bool ROCMBlas::Init() {
+  rocblas_status ret = wrap::rocblas_create_handle(parent_, &blas_);
+  if (ret != rocblas_status_success) {
+    LOG(ERROR) << "failed to create rocBLAS handle: " << ToString(ret);
+    return false;
+  }
+
+  return true;
+}
+
+ROCMBlas::ROCMBlas(gpu::GpuExecutor *parent)
+    : parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {}
+
+ROCMBlas::~ROCMBlas() {
+  if (blas_ != nullptr) {
+    wrap::rocblas_destroy_handle(parent_, blas_);
+  }
+}
+
+bool ROCMBlas::SetStream(Stream *stream) {
+  CHECK(stream != nullptr);
+  CHECK(AsGpuStreamValue(stream) != nullptr);
+  CHECK(blas_ != nullptr);
+  rocblas_status ret =
+      wrap::rocblas_set_stream(parent_, blas_, AsGpuStreamValue(stream));
+  if (ret != rocblas_status_success) {
+    LOG(ERROR) << "failed to set stream for rocBLAS calls: " << ToString(ret);
+    return false;
+  }
+
+  return true;
+}
+
+namespace {
+
+// Helper functions transforming blas arguments into rocBLAS arguments.
+
+rocblas_operation ROCMBlasTranspose(blas::Transpose trans) {
+  switch (trans) {
+    case blas::Transpose::kNoTranspose:
+      return rocblas_operation_none;
+    case blas::Transpose::kTranspose:
+      return rocblas_operation_transpose;
+    case blas::Transpose::kConjugateTranspose:
+      return rocblas_operation_conjugate_transpose;
+    default:
+      LOG(FATAL) << "Invalid value of blas::Transpose.";
+  }
+}
+
+rocblas_fill ROCMBlasUpperLower(blas::UpperLower uplo) {
+  switch (uplo) {
+    case blas::UpperLower::kUpper:
+      return rocblas_fill_upper;
+    case blas::UpperLower::kLower:
+      return rocblas_fill_lower;
+    default:
+      LOG(FATAL) << "Invalid value of blas::UpperLower.";
+  }
+}
+
+rocblas_diagonal ROCMBlasDiagonal(blas::Diagonal diag) {
+  switch (diag) {
+    case blas::Diagonal::kUnit:
+      return rocblas_diagonal_unit;
+    case blas::Diagonal::kNonUnit:
+      return rocblas_diagonal_non_unit;
+    default:
+      LOG(FATAL) << "Invalid value of blas::Diagonal.";
+  }
+}
+
+rocblas_side ROCMBlasSide(blas::Side side) {
+  switch (side) {
+    case blas::Side::kLeft:
+      return rocblas_side_left;
+    case blas::Side::kRight:
+      return rocblas_side_right;
+    default:
+      LOG(FATAL) << "Invalid value of blas::Side.";
+  }
+}
+
+}  // namespace
+
+template <typename FuncT, typename... Args>
+bool ROCMBlas::DoBlasInternalImpl(FuncT rocblas_func, Stream *stream,
+                                  bool pointer_mode_host, bool err_on_failure,
+                                  Args... args) {
+  mutex_lock lock{mu_};
+
+  CHECK(blas_ != nullptr);
+  if (!SetStream(stream)) {
+    return false;
+  }
+
+  rocblas_status ret = rocblas_func(parent_, blas_, args...);
+  if (err_on_failure && ret != rocblas_status_success) {
+    LOG(ERROR) << "failed to run ROCBLAS routine " << rocblas_func.kName << ": "
+               << ToString(ret);
+  }
+  return ret == rocblas_status_success;
+}
+
+bool ROCMBlas::DoBlasAsum(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<float> &x, int incx,
+                          DeviceMemory<float> *result) {
+  return DoBlasInternal(wrap::rocblas_sasum, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasAsum(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<double> &x, int incx,
+                          DeviceMemory<double> *result) {
+  return DoBlasInternal(wrap::rocblas_dasum, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasAsum(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          DeviceMemory<float> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the ASUM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasAsum(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          DeviceMemory<double> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the ASUM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
+                          const DeviceMemory<float> &x, int incx,
+                          DeviceMemory<float> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_saxpy, stream,
+                        true /* = pointer_mode_host */, elem_count, &alpha,
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,
+                          const DeviceMemory<double> &x, int incx,
+                          DeviceMemory<double> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_daxpy, stream,
+                        true /* = pointer_mode_host */, elem_count, &alpha,
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the AXPY operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the AXPY operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasCopy(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<float> &x, int incx,
+                          DeviceMemory<float> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_scopy, stream,
+                        true /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasCopy(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<double> &x, int incx,
+                          DeviceMemory<double> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_dcopy, stream,
+                        true /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasCopy(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the COPY operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasCopy(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the COPY operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasDot(Stream *stream, uint64 elem_count,
+                         const DeviceMemory<float> &x, int incx,
+                         const DeviceMemory<float> &y, int incy,
+                         DeviceMemory<float> *result) {
+  return DoBlasInternal(
+      wrap::rocblas_sdot, stream, false /* = pointer_mode_host */, elem_count,
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasDot(Stream *stream, uint64 elem_count,
+                         const DeviceMemory<double> &x, int incx,
+                         const DeviceMemory<double> &y, int incy,
+                         DeviceMemory<double> *result) {
+  return DoBlasInternal(
+      wrap::rocblas_ddot, stream, false /* = pointer_mode_host */, elem_count,
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasDotc(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the DOT operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasDotc(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the DOT operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasDotu(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the DOT operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasDotu(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the DOT operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<float> &x, int incx,
+                          DeviceMemory<float> *result) {
+  return DoBlasInternal(wrap::rocblas_snrm2, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<double> &x, int incx,
+                          DeviceMemory<double> *result) {
+  return DoBlasInternal(wrap::rocblas_dnrm2, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          DeviceMemory<float> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the NRM2 operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          DeviceMemory<double> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the NRM2 operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRot(Stream *stream, uint64 elem_count,
+                         DeviceMemory<float> *x, int incx,
+                         DeviceMemory<float> *y, int incy, float c, float s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROT operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRot(Stream *stream, uint64 elem_count,
+                         DeviceMemory<double> *x, int incx,
+                         DeviceMemory<double> *y, int incy, double c,
+                         double s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROT operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRot(Stream *stream, uint64 elem_count,
+                         DeviceMemory<std::complex<float>> *x, int incx,
+                         DeviceMemory<std::complex<float>> *y, int incy,
+                         float c, float s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROT operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRot(Stream *stream, uint64 elem_count,
+                         DeviceMemory<std::complex<double>> *x, int incx,
+                         DeviceMemory<std::complex<double>> *y, int incy,
+                         double c, double s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROT operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotg(Stream *stream, DeviceMemory<float> *a,
+                          DeviceMemory<float> *b, DeviceMemory<float> *c,
+                          DeviceMemory<float> *s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTG operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a,
+                          DeviceMemory<double> *b, DeviceMemory<double> *c,
+                          DeviceMemory<double> *s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTG operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
+                          DeviceMemory<std::complex<float>> *b,
+                          DeviceMemory<float> *c,
+                          DeviceMemory<std::complex<float>> *s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTG operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,
+                          DeviceMemory<std::complex<double>> *b,
+                          DeviceMemory<double> *c,
+                          DeviceMemory<std::complex<double>> *s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTG operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotm(Stream *stream, uint64 elem_count,
+                          DeviceMemory<float> *x, int incx,
+                          DeviceMemory<float> *y, int incy,
+                          const DeviceMemory<float> &param) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTM operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotm(Stream *stream, uint64 elem_count,
+                          DeviceMemory<double> *x, int incx,
+                          DeviceMemory<double> *y, int incy,
+                          const DeviceMemory<double> &param) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTM operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,
+                           DeviceMemory<float> *d2, DeviceMemory<float> *x1,
+                           const DeviceMemory<float> &y1,
+                           DeviceMemory<float> *param) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTMG operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,
+                           DeviceMemory<double> *d2, DeviceMemory<double> *x1,
+                           const DeviceMemory<double> &y1,
+                           DeviceMemory<double> *param) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTMG operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
+                          DeviceMemory<float> *x, int incx) {
+  return DoBlasInternal(wrap::rocblas_sscal, stream,
+                        true /* = pointer_mode_host */, elem_count, &alpha,
+                        GpuMemoryMutable(x), incx);
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
+                          DeviceMemory<double> *x, int incx) {
+  return DoBlasInternal(wrap::rocblas_dscal, stream,
+                        true /* = pointer_mode_host */, elem_count, &alpha,
+                        GpuMemoryMutable(x), incx);
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the SCAL operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the SCAL operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count,
+                          std::complex<float> alpha,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the SCAL operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count,
+                          std::complex<double> alpha,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the SCAL operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSwap(Stream *stream, uint64 elem_count,
+                          DeviceMemory<float> *x, int incx,
+                          DeviceMemory<float> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_sswap, stream,
+                        true /* = pointer_mode_host */, elem_count,
+                        GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasSwap(Stream *stream, uint64 elem_count,
+                          DeviceMemory<double> *x, int incx,
+                          DeviceMemory<double> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_dswap, stream,
+                        true /* = pointer_mode_host */, elem_count,
+                        GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasSwap(Stream *stream, uint64 elem_count,
+                          DeviceMemory<std::complex<float>> *x, int incx,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SWAP operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSwap(Stream *stream, uint64 elem_count,
+                          DeviceMemory<std::complex<double>> *x, int incx,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SWAP operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasIamax(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<float> &x, int incx,
+                           DeviceMemory<int> *result) {
+  return DoBlasInternal(wrap::rocblas_isamax, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasIamax(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<double> &x, int incx,
+                           DeviceMemory<int> *result) {
+  return DoBlasInternal(wrap::rocblas_idamax, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasIamax(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<std::complex<float>> &x, int incx,
+                           DeviceMemory<int> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the AMAX operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasIamax(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<std::complex<double>> &x,
+                           int incx, DeviceMemory<int> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the AMAX operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasIamin(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<float> &x, int incx,
+                           DeviceMemory<int> *result) {
+  return DoBlasInternal(
+      wrap::rocblas_isamin, stream, false /* = pointer_mode_host */, elem_count,
+      GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasIamin(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<double> &x, int incx,
+                           DeviceMemory<int> *result) {
+  return DoBlasInternal(
+      wrap::rocblas_idamin, stream, false /* = pointer_mode_host */, elem_count,
+      GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasIamin(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<std::complex<float>> &x, int incx,
+                           DeviceMemory<int> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the AMIN operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasIamin(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<std::complex<double>> &x,
+                           int incx, DeviceMemory<int> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the AMIN operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, uint64 kl, uint64 ku, float alpha,
+                          const DeviceMemory<float> &a, int lda,
+                          const DeviceMemory<float> &x, int incx, float beta,
+                          DeviceMemory<float> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GBMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, uint64 kl, uint64 ku, double alpha,
+                          const DeviceMemory<double> &a, int lda,
+                          const DeviceMemory<double> &x, int incx, double beta,
+                          DeviceMemory<double> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GBMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, uint64 kl, uint64 ku,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GBMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, uint64 kl, uint64 ku,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GBMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, float alpha, const DeviceMemory<float> &a,
+                          int lda, const DeviceMemory<float> &x, int incx,
+                          float beta, DeviceMemory<float> *y, int incy) {
+  return DoBlasInternal(
+      wrap::rocblas_sgemv, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(trans), m, n, &alpha, GpuMemory(a), lda, GpuMemory(x),
+      incx, &beta, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, double alpha, const DeviceMemory<double> &a,
+                          int lda, const DeviceMemory<double> &x, int incx,
+                          double beta, DeviceMemory<double> *y, int incy) {
+  return DoBlasInternal(
+      wrap::rocblas_dgemv, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(trans), m, n, &alpha, GpuMemory(a), lda, GpuMemory(x),
+      incx, &beta, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,
+                         const DeviceMemory<float> &x, int incx,
+                         const DeviceMemory<float> &y, int incy,
+                         DeviceMemory<float> *a, int lda) {
+  return DoBlasInternal(
+      wrap::rocblas_sger, stream, true /* = pointer_mode_host */, m, n, &alpha,
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(a), lda);
+}
+
+bool ROCMBlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,
+                         const DeviceMemory<double> &x, int incx,
+                         const DeviceMemory<double> &y, int incy,
+                         DeviceMemory<double> *a, int lda) {
+  return DoBlasInternal(
+      wrap::rocblas_dger, stream, true /* = pointer_mode_host */, m, n, &alpha,
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(a), lda);
+}
+
+bool ROCMBlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the GER operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the GER operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the GERU operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the GERU operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          uint64 k, std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HBMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          uint64 k, std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HBMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HEMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HEMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         float alpha,
+                         const DeviceMemory<std::complex<float>> &x, int incx,
+                         DeviceMemory<std::complex<float>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         double alpha,
+                         const DeviceMemory<std::complex<double>> &x, int incx,
+                         DeviceMemory<std::complex<double>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER2 operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER2 operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &ap,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &ap,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         float alpha,
+                         const DeviceMemory<std::complex<float>> &x, int incx,
+                         DeviceMemory<std::complex<float>> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPR operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         double alpha,
+                         const DeviceMemory<std::complex<double>> &x, int incx,
+                         DeviceMemory<std::complex<double>> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPR operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPR2 operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPR2 operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          uint64 k, float alpha, const DeviceMemory<float> &a,
+                          int lda, const DeviceMemory<float> &x, int incx,
+                          float beta, DeviceMemory<float> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SBMV operation "
+             << "for the \"complex<float>\" dataype";
+
+  return false;
+}
+
+bool ROCMBlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          uint64 k, double alpha, const DeviceMemory<double> &a,
+                          int lda, const DeviceMemory<double> &x, int incx,
+                          double beta, DeviceMemory<double> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SBMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          float alpha, const DeviceMemory<float> &ap,
+                          const DeviceMemory<float> &x, int incx, float beta,
+                          DeviceMemory<float> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          double alpha, const DeviceMemory<double> &ap,
+                          const DeviceMemory<double> &x, int incx, double beta,
+                          DeviceMemory<double> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         float alpha, const DeviceMemory<float> &x, int incx,
+                         DeviceMemory<float> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPR operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         double alpha, const DeviceMemory<double> &x, int incx,
+                         DeviceMemory<double> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPR operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          float alpha, const DeviceMemory<float> &x, int incx,
+                          const DeviceMemory<float> &y, int incy,
+                          DeviceMemory<float> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPR2 operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          double alpha, const DeviceMemory<double> &x, int incx,
+                          const DeviceMemory<double> &y, int incy,
+                          DeviceMemory<double> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPR2 operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          float alpha, const DeviceMemory<float> &a, int lda,
+                          const DeviceMemory<float> &x, int incx, float beta,
+                          DeviceMemory<float> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          double alpha, const DeviceMemory<double> &a, int lda,
+                          const DeviceMemory<double> &x, int incx, double beta,
+                          DeviceMemory<double> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         float alpha, const DeviceMemory<float> &x, int incx,
+                         DeviceMemory<float> *a, int lda) {
+  return DoBlasInternal(wrap::rocblas_ssyr, stream,
+                        true /* = pointer_mode_host */,
+                        ROCMBlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
+                        GpuMemoryMutable(a), lda);
+}
+
+bool ROCMBlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         double alpha, const DeviceMemory<double> &x, int incx,
+                         DeviceMemory<double> *a, int lda) {
+  return DoBlasInternal(wrap::rocblas_dsyr, stream,
+                        true /* = pointer_mode_host */,
+                        ROCMBlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
+                        GpuMemoryMutable(a), lda);
+}
+
+bool ROCMBlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          float alpha, const DeviceMemory<float> &x, int incx,
+                          const DeviceMemory<float> &y, int incy,
+                          DeviceMemory<float> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2 operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          double alpha, const DeviceMemory<double> &x, int incx,
+                          const DeviceMemory<double> &y, int incy,
+                          DeviceMemory<double> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2 operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<std::complex<float>> &a,
+                          int lda, DeviceMemory<std::complex<float>> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<std::complex<double>> &a,
+                          int lda, DeviceMemory<std::complex<double>> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBSV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBSV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<std::complex<float>> &a,
+                          int lda, DeviceMemory<std::complex<float>> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBSV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<std::complex<double>> &a,
+                          int lda, DeviceMemory<std::complex<double>> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBSV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<float> &ap, DeviceMemory<float> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<double> &ap,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<float>> &ap,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<double>> &ap,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<float> &ap, DeviceMemory<float> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPSV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<double> &ap,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPSV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<float>> &ap,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPSV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<double>> &ap,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPSV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          float alpha, const DeviceMemory<Eigen::half> &a,
+                          int lda, const DeviceMemory<Eigen::half> &b, int ldb,
+                          float beta, DeviceMemory<Eigen::half> *c, int ldc) {
+  VLOG(1) << port::Printf(
+      "doing rocBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
+      "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
+      "c=%p ldc=%d",
+      static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
+      a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
+  if (transa == blas::Transpose::kNoTranspose) {
+    if (lda < static_cast<int64>(m)) {
+      LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
+                      "precondition violation";
+    }
+  } else {
+    if (lda < static_cast<int64>(k)) {
+      LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
+                   << ") (transpose case); precondition violation";
+    }
+  }
+  if (transb == blas::Transpose::kNoTranspose) {
+    if (ldb < static_cast<int64>(k)) {
+      LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
+                   << ") (no transpose case); precondition violation";
+    }
+  } else {
+    if (ldb < static_cast<int64>(n)) {
+      LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
+                      "precondition violation";
+    }
+  }
+  const Eigen::half alpha_half(alpha);
+  const Eigen::half beta_half(beta);
+  return DoBlasInternal(
+      wrap::rocblas_hgemm, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(transa), ROCMBlasTranspose(transb), m, n, k,
+      reinterpret_cast<const rocblas_half *>(&alpha_half),
+      reinterpret_cast<const rocblas_half *>(GpuMemory(a)), lda,
+      reinterpret_cast<const rocblas_half *>(GpuMemory(b)), ldb,
+      reinterpret_cast<const rocblas_half *>(&beta_half),
+      reinterpret_cast<rocblas_half *>(GpuMemoryMutable(c)), ldc);
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          float alpha, const DeviceMemory<float> &a, int lda,
+                          const DeviceMemory<float> &b, int ldb, float beta,
+                          DeviceMemory<float> *c, int ldc) {
+  VLOG(1) << port::Printf(
+      "doing rocBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
+      "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
+      "c=%p ldc=%d",
+      static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
+      a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
+  if (transa == blas::Transpose::kNoTranspose) {
+    if (lda < static_cast<int64>(m)) {
+      LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
+                      "precondition violation";
+    }
+  } else {
+    if (lda < static_cast<int64>(k)) {
+      LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
+                   << ") (transpose case); precondition violation";
+    }
+  }
+  if (transb == blas::Transpose::kNoTranspose) {
+    if (ldb < static_cast<int64>(k)) {
+      LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
+                   << ") (no transpose case); precondition violation";
+    }
+  } else {
+    if (ldb < static_cast<int64>(n)) {
+      LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
+                      "precondition violation";
+    }
+  }
+  return DoBlasInternal(
+      wrap::rocblas_sgemm, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(transa), ROCMBlasTranspose(transb), m, n, k, &alpha,
+      GpuMemory(a), lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          double alpha, const DeviceMemory<double> &a, int lda,
+                          const DeviceMemory<double> &b, int ldb, double beta,
+                          DeviceMemory<double> *c, int ldc) {
+  return DoBlasInternal(
+      wrap::rocblas_dgemm, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(transa), ROCMBlasTranspose(transb), m, n, k, &alpha,
+      GpuMemory(a), lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &b, int ldb,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &b, int ldb,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha,
+    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x,
+    int incx, float beta, DeviceMemory<float> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha,
+    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x,
+    int incx, double beta, DeviceMemory<double> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
+    std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a,
+    int lda, const DeviceMemory<std::complex<float>> &x, int incx,
+    std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
+    std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a,
+    int lda, const DeviceMemory<std::complex<double>> &x, int incx,
+    std::complex<double> beta, DeviceMemory<std::complex<double>> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
+    int lda, const DeviceMemory<Eigen::half> &b, int ldb, float beta,
+    DeviceMemory<Eigen::half> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
+    const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
+    int ldc, blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
+    const DeviceMemory<double> &b, int ldb, double beta,
+    DeviceMemory<double> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<float> alpha,
+    const DeviceMemory<std::complex<float>> &a, int lda,
+    const DeviceMemory<std::complex<float>> &b, int ldb,
+    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<double> alpha,
+    const DeviceMemory<std::complex<double>> &a, int lda,
+    const DeviceMemory<std::complex<double>> &b, int ldb,
+    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+template <typename T>
+bool ROCMBlas::DoBlasGemvWithProfilingImpl(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, const T &alpha,
+    const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
+    const T &beta, DeviceMemory<T> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  // ROCM TODO: properly implement the interface
+  return false;
+}
+
+template <typename T, typename ParamType>
+bool ROCMBlas::DoBlasGemmWithProfilingImpl(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
+    int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
+    DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
+  // ROCM TODO: properly implement the interface
+  return false;
+}
+
+template <typename InT, typename OutT, typename CompT>
+bool ROCMBlas::DoBlasGemmWithAlgorithmImpl(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const CompT &alpha, const DeviceMemory<InT> &a, int lda,
+    const DeviceMemory<InT> &b, int ldb, const CompT &beta,
+    DeviceMemory<OutT> *c, int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  // ROCM TODO: properly implement the interface
+  return false;
+}
+
+bool ROCMBlas::GetBlasGemmAlgorithms(
+    std::vector<blas::AlgorithmType> *out_algorithms) {
+  // ROCM TODO: properly implement the interface
+  return true;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<int> &alpha,
+    const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b, int ldb,
+    const HostOrDeviceScalar<int> &beta, DeviceMemory<int32> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"int8\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
+    const DeviceMemory<Eigen::half> &a, int lda,
+    const DeviceMemory<Eigen::half> &b, int ldb,
+    const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"half\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<float> &alpha,
+    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
+    int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<double> &alpha,
+    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
+    int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
+    const DeviceMemory<std::complex<float>> &a, int lda,
+    const DeviceMemory<std::complex<float>> &b, int ldb,
+    const HostOrDeviceScalar<std::complex<float>> &beta,
+    DeviceMemory<std::complex<float>> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
+    const DeviceMemory<std::complex<double>> &a, int lda,
+    const DeviceMemory<std::complex<double>> &b, int ldb,
+    const HostOrDeviceScalar<std::complex<double>> &beta,
+    DeviceMemory<std::complex<double>> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+template <typename T>
+struct EigenHalfToRocBlasHalf {
+  using type = T;
+};
+
+template <>
+struct EigenHalfToRocBlasHalf<Eigen::half> {
+  using type = rocblas_half;
+};
+
+template <typename T, typename FuncT>
+port::Status ROCMBlas::DoBlasGemmBatchedInternal(
+    FuncT rocblas_func, Stream *stream, blas::Transpose transa,
+    blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
+    const port::ArraySlice<DeviceMemory<T> *> &a_ptrs_to_wrappers, int lda,
+    const port::ArraySlice<DeviceMemory<T> *> &b_ptrs_to_wrappers, int ldb,
+    T beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
+    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+  // MAPPED_T will be same as T for all types except Eigen::Half
+  // for T = Eigen::half, MAPPED_T = rocblas_half
+  using MAPPED_T = typename EigenHalfToRocBlasHalf<T>::type;
+
+  // Alocate local vectors to hold device pointers to matrices
+  std::vector<MAPPED_T *> a_raw_ptrs, b_raw_ptrs, c_raw_ptrs;
+  for (int i = 0; i < batch_count; ++i) {
+    // static_cast does work when converting Eigen::half* to rocblas_half*,
+    // hence the use od reinterpret_cast
+    a_raw_ptrs.push_back(
+        reinterpret_cast<MAPPED_T *>(a_ptrs_to_wrappers[i]->opaque()));
+    b_raw_ptrs.push_back(
+        reinterpret_cast<MAPPED_T *>(b_ptrs_to_wrappers[i]->opaque()));
+    c_raw_ptrs.push_back(
+        reinterpret_cast<MAPPED_T *>(c_ptrs_to_wrappers[i]->opaque()));
+  }
+
+  //  batch_count <= 1 is base case, no definable matrix stride, set it same as
+  //  ld*
+  long long bsa = lda;
+  long long bsb = ldb;
+  long long bsc = ldc;
+  bool bsa_is_constant = true;
+  bool bsb_is_constant = true;
+  bool bsc_is_constant = true;
+
+  if (batch_count > 1) {
+    // Remember first stride; if any other stride is different that this one,
+    // KABLAM
+    bsa = a_raw_ptrs[1] - a_raw_ptrs[0];
+    bsb = b_raw_ptrs[1] - b_raw_ptrs[0];
+    bsc = c_raw_ptrs[1] - c_raw_ptrs[0];
+
+    //  Loop to verify that batched strides are constant
+    //  All the test cases from batch_matmul_op_test.py seem to satisfy this
+    //  requirement of a constant stride.  If this can be proven globally, then
+    //  this loop check can be safely removed
+    for (int i = 1; i < batch_count - 1; ++i) {
+      long long iterative_bsa = a_raw_ptrs[i + 1] - a_raw_ptrs[i];
+      if (iterative_bsa != bsa) {
+        bsa_is_constant = false;
+        break;
+      }
+
+      long long iterative_bsb = b_raw_ptrs[i + 1] - b_raw_ptrs[i];
+      if (iterative_bsb != bsb) {
+        bsb_is_constant = false;
+        break;
+      }
+
+      long long iterative_bsc = c_raw_ptrs[i + 1] - c_raw_ptrs[i];
+      if (iterative_bsc != bsc) {
+        bsc_is_constant = false;
+        break;
+      }
+    }
+  }
+
+  assert(!(ldc < m || bsc < ldc * n));
+
+  if (ROCMBlasTranspose(transa) == rocblas_operation_none)
+    assert(!(lda < m || bsa < lda * k));
+  else
+    assert(!(lda < k || bsa < lda * m));
+
+  if (ROCMBlasTranspose(transb) == rocblas_operation_none)
+    assert(!(ldb < k || bsb < ldb * n));
+  else
+    assert(!(ldb < n || bsc < ldc * k));
+
+  MAPPED_T *alpha_ptr = reinterpret_cast<MAPPED_T *>(&alpha);
+  MAPPED_T *beta_ptr = reinterpret_cast<MAPPED_T *>(&beta);
+
+  if (bsa_is_constant && bsb_is_constant && bsc_is_constant) {
+    bool ok = DoBlasInternal(
+        rocblas_func, stream, true /* = pointer_mode_host */,
+        ROCMBlasTranspose(transa), ROCMBlasTranspose(transb), m, n, k,
+        GpuComplex(alpha_ptr), a_raw_ptrs[0], lda, bsa, b_raw_ptrs[0], ldb, bsb,
+        GpuComplex(beta_ptr), c_raw_ptrs[0], ldc, bsc, batch_count);
+
+    if (ok) {
+      return port::Status::OK();
+    }
+  }
+
+  return port::Status(port::error::INTERNAL,
+                      "failed BLAS call, see log for details");
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb, float beta,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &c, int ldc,
+    int batch_count, ScratchAllocator *scratch_allocator) {
+  const Eigen::half alpha_half(alpha);
+  const Eigen::half beta_half(beta);
+
+  port::Status status = DoBlasGemmBatchedInternal(
+      wrap::rocblas_hgemm_strided_batched, stream, transa, transb, m, n, k,
+      alpha_half, a, lda, b, ldb, beta_half, c, ldc, batch_count,
+      scratch_allocator);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+
+  return status.ok();
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha,
+    const port::ArraySlice<DeviceMemory<float> *> &a_array, int lda,
+    const port::ArraySlice<DeviceMemory<float> *> &b_array, int ldb, float beta,
+    const port::ArraySlice<DeviceMemory<float> *> &c_array, int ldc,
+    int batch_count, ScratchAllocator *scratch_allocator) {
+  port::Status status = DoBlasGemmBatchedInternal(
+      wrap::rocblas_sgemm_strided_batched, stream, transa, transb, m, n, k,
+      alpha, a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count,
+      scratch_allocator);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return status.ok();
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, double alpha,
+    const port::ArraySlice<DeviceMemory<double> *> &a_array, int lda,
+    const port::ArraySlice<DeviceMemory<double> *> &b_array, int ldb,
+    double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array,
+    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+  port::Status status = DoBlasGemmBatchedInternal(
+      wrap::rocblas_dgemm_strided_batched, stream, transa, transb, m, n, k,
+      alpha, a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count,
+      scratch_allocator);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return status.ok();
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<float> alpha,
+    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a_array,
+    int lda,
+    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b_array,
+    int ldb, std::complex<float> beta,
+    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array,
+    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMMBatched operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<double> alpha,
+    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a_array,
+    int lda,
+    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b_array,
+    int ldb, std::complex<double> beta,
+    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array,
+    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMMBatched operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHemm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &b, int ldb,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HEMM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHemm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &b, int ldb,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HEMM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          float alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          float beta, DeviceMemory<std::complex<float>> *c,
+                          int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HERK operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          double alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          double beta, DeviceMemory<std::complex<double>> *c,
+                          int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HERK operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           std::complex<float> alpha,
+                           const DeviceMemory<std::complex<float>> &a, int lda,
+                           const DeviceMemory<std::complex<float>> &b, int ldb,
+                           float beta, DeviceMemory<std::complex<float>> *c,
+                           int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER2K operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           std::complex<double> alpha,
+                           const DeviceMemory<std::complex<double>> &a, int lda,
+                           const DeviceMemory<std::complex<double>> &b, int ldb,
+                           double beta, DeviceMemory<std::complex<double>> *c,
+                           int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER2K operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          float alpha, const DeviceMemory<float> &a, int lda,
+                          const DeviceMemory<float> &b, int ldb, float beta,
+                          DeviceMemory<float> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMM operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          double alpha, const DeviceMemory<double> &a, int lda,
+                          const DeviceMemory<double> &b, int ldb, double beta,
+                          DeviceMemory<double> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMM operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &b, int ldb,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &b, int ldb,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          float alpha, const DeviceMemory<float> &a, int lda,
+                          float beta, DeviceMemory<float> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYRK operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          double alpha, const DeviceMemory<double> &a, int lda,
+                          double beta, DeviceMemory<double> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYRK operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYRK operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYRK operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           float alpha, const DeviceMemory<float> &a, int lda,
+                           const DeviceMemory<float> &b, int ldb, float beta,
+                           DeviceMemory<float> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2K operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           double alpha, const DeviceMemory<double> &a, int lda,
+                           const DeviceMemory<double> &b, int ldb, double beta,
+                           DeviceMemory<double> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2K operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           std::complex<float> alpha,
+                           const DeviceMemory<std::complex<float>> &a, int lda,
+                           const DeviceMemory<std::complex<float>> &b, int ldb,
+                           std::complex<float> beta,
+                           DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2K operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           std::complex<double> alpha,
+                           const DeviceMemory<std::complex<double>> &a, int lda,
+                           const DeviceMemory<std::complex<double>> &b, int ldb,
+                           std::complex<double> beta,
+                           DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2K operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n, float alpha,
+                          const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMM operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n, double alpha,
+                          const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMM operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          DeviceMemory<std::complex<float>> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          DeviceMemory<std::complex<double>> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n, float alpha,
+                          const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *b, int ldb) {
+  return DoBlasInternal(
+      wrap::rocblas_strsm, stream, true /* = pointer_mode_host */,
+      ROCMBlasSide(side), ROCMBlasUpperLower(uplo), ROCMBlasTranspose(transa),
+      ROCMBlasDiagonal(diag), m, n, &alpha, const_cast<float *>(GpuMemory(a)),
+      lda, GpuMemoryMutable(b), ldb);
+}
+
+bool ROCMBlas::DoBlasTrsm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n, double alpha,
+                          const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *b, int ldb) {
+  return DoBlasInternal(
+      wrap::rocblas_dtrsm, stream, true /* = pointer_mode_host */,
+      ROCMBlasSide(side), ROCMBlasUpperLower(uplo), ROCMBlasTranspose(transa),
+      ROCMBlasDiagonal(diag), m, n, &alpha, const_cast<double *>(GpuMemory(a)),
+      lda, GpuMemoryMutable(b), ldb);
+}
+
+bool ROCMBlas::DoBlasTrsm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          DeviceMemory<std::complex<float>> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          DeviceMemory<std::complex<double>> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
+    int lda, int64 stride_a, const DeviceMemory<Eigen::half> &b, int ldb,
+    int64 stride_b, float beta, DeviceMemory<Eigen::half> *c, int ldc,
+    int64 stride_c, int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"Eigen::half\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
+    int64 stride_a, const DeviceMemory<float> &b, int ldb, int64 stride_b,
+    float beta, DeviceMemory<float> *c, int ldc, int64 stride_c,
+    int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
+    int64 stride_a, const DeviceMemory<double> &b, int ldb, int64 stride_b,
+    double beta, DeviceMemory<double> *c, int ldc, int64 stride_c,
+    int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<float> alpha,
+    const DeviceMemory<std::complex<float>> &a, int lda, int64 stride_a,
+    const DeviceMemory<std::complex<float>> &b, int ldb, int64 stride_b,
+    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+    int64 stride_c, int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<double> alpha,
+    const DeviceMemory<std::complex<double>> &a, int lda, int64 stride_a,
+    const DeviceMemory<std::complex<double>> &b, int ldb, int64 stride_b,
+    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+    int64 stride_c, int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+}  // namespace gpu
+
+void initialize_rocblas() {
+  auto rocBlasAlreadyRegistered = PluginRegistry::Instance()->HasFactory(
+      rocm::kROCmPlatformId, PluginKind::kBlas, gpu::kRocBlasPlugin);
+
+  if (!rocBlasAlreadyRegistered) {
+    port::Status status =
+        PluginRegistry::Instance()
+            ->RegisterFactory<PluginRegistry::BlasFactory>(
+                rocm::kROCmPlatformId, gpu::kRocBlasPlugin, "rocBLAS",
+                [](internal::StreamExecutorInterface *parent)
+                    -> blas::BlasSupport * {
+                  gpu::GpuExecutor *rocm_executor =
+                      dynamic_cast<gpu::GpuExecutor *>(parent);
+                  if (rocm_executor == nullptr) {
+                    LOG(ERROR)
+                        << "Attempting to initialize an instance of the "
+                           "rocBLAS "
+                        << "support library with a non-ROCM StreamExecutor";
+                    return nullptr;
+                  }
+
+                  gpu::ROCMBlas *blas = new gpu::ROCMBlas(rocm_executor);
+                  if (!blas->Init()) {
+                    // Note: Init() will log a more specific error.
+                    delete blas;
+                    return nullptr;
+                  }
+                  return blas;
+                });
+
+    if (!status.ok()) {
+      LOG(ERROR) << "Unable to register rocBLAS factory: "
+                 << status.error_message();
+    }
+
+    PluginRegistry::Instance()->SetDefaultFactory(
+        rocm::kROCmPlatformId, PluginKind::kBlas, gpu::kRocBlasPlugin);
+  }
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_rocblas,
+                            { stream_executor::initialize_rocblas(); });
diff --git a/tensorflow/stream_executor/rocm/rocm_blas.h b/tensorflow/stream_executor/rocm/rocm_blas.h
new file mode 100644
index 0000000000000000000000000000000000000000..75c68481cee5f6123c80e9751c06392f1835a5ce
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_blas.h
@@ -0,0 +1,159 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// ROCM-specific support for BLAS functionality -- this wraps the rocBLAS
+// library capabilities, and is only included into ROCM implementation code --
+// it will not introduce rocm headers into other code.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_BLAS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_BLAS_H_
+
+#include "tensorflow/stream_executor/blas.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+
+namespace stream_executor {
+
+class Stream;
+
+namespace gpu {
+
+// Opaque and unique identifier for the rocBLAS plugin.
+extern const PluginId kRocBlasPlugin;
+
+class GpuExecutor;
+
+// BLAS plugin for ROCM platform via rocBLAS library.
+//
+// This satisfies the platform-agnostic BlasSupport interface.
+//
+// Note that the rocBLAS handle that this encapsulates is implicitly tied to the
+// context (and, as a result, the device) that the parent GpuExecutor is tied
+// to. This simply happens as an artifact of creating the rocBLAS handle when a
+// ROCM context is active.
+//
+// Thread-safe post-initialization.
+class ROCMBlas : public blas::BlasSupport {
+ public:
+  explicit ROCMBlas(GpuExecutor *parent);
+
+  // Allocates a rocBLAS handle.
+  bool Init();
+
+  // Releases the rocBLAS handle, if present.
+  ~ROCMBlas() override;
+
+  TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES
+
+ private:
+  // Tells rocBLAS to enqueue the BLAS operation onto a particular Stream.
+  //
+  // rocBLAS is stateful, and only be associated with one stream (in order to
+  // enqueue dispatch) at a given time. As a result, this generally must be
+  // invoked before calling into rocBLAS.
+  bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // A helper function that calls the real rocBLAS function together with error
+  // handling.
+  //
+  // rocblas_func:       rocBLAS function pointer.
+  // rocblas_name:       rocBLAS function name.
+  // stream:             Stream to enqueue the BLAS operation onto.
+  // pointer_mode_host:  Indicate if the pointer to a scalar value is from host
+  //                     (true) or device (false).
+  // err_on_failure:     Whether to print an error if the rocBLAS function
+  // fails. args:               Arguments of rocBLAS function.
+  template <typename FuncT, typename... Args>
+  bool DoBlasInternalImpl(FuncT rocblas_func, Stream *stream,
+                          bool pointer_mode_host, bool err_on_failure,
+                          Args... args);
+
+  // Convenience functions that call DoBlasInternalImpl with different values
+  // for err_on_failure.
+  template <typename FuncT, typename... Args>
+  bool DoBlasInternal(FuncT rocblas_func, Stream *stream,
+                      bool pointer_mode_host, Args... args) {
+    return DoBlasInternalImpl(rocblas_func, stream, pointer_mode_host,
+                              /*err_on_failure=*/true, args...);
+  }
+  template <typename FuncT, typename... Args>
+  bool DoBlasInternalFailureOK(FuncT rocblas_func, Stream *stream,
+                               bool pointer_mode_host, Args... args) {
+    return DoBlasInternalImpl(rocblas_func, stream, pointer_mode_host,
+                              /*err_on_failure=*/false, args...);
+  }
+
+  // A helper function to implement DoBlasGemmBatched interfaces for generic
+  // types.
+  template <typename T, typename FuncT>
+  port::Status DoBlasGemmBatchedInternal(
+      FuncT rocblas_func, Stream *stream, blas::Transpose transa,
+      blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
+      const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda,
+      const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta,
+      const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc,
+      int batch_count, ScratchAllocator *scratch_allocator);
+
+  // Helper function for implementing DoBlasGemmWithAlgorithm.
+  //
+  // We take alpha and beta by const reference because T might be Eigen::half,
+  // and we want to avoid pulling in a dependency on Eigen.  When we pass the
+  // references to rocBLAS, we essentially reinterpret_cast to __half, which is
+  // safe because Eigen::half inherits from __half.
+  template <typename InT, typename OutT, typename CompT>
+  bool DoBlasGemmWithAlgorithmImpl(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, const CompT &alpha, const DeviceMemory<InT> &a,
+      int lda, const DeviceMemory<InT> &b, int ldb, const CompT &beta,
+      DeviceMemory<OutT> *c, int ldc, blas::ComputationType computation_type,
+      blas::AlgorithmType algorithm,
+      blas::ProfileResult *output_profile_result);
+
+  // Helper function for implementing DoBlasGemmWithProfiling.
+  template <typename T, typename ParamType>
+  bool DoBlasGemmWithProfilingImpl(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
+      int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
+      DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result);
+
+  // Helper function for implementing DoBlasGemvWithProfiling.
+  template <typename T>
+  bool DoBlasGemvWithProfilingImpl(Stream *stream, blas::Transpose trans,
+                                   uint64 m, uint64 n, const T &alpha,
+                                   const DeviceMemory<T> &a, int lda,
+                                   const DeviceMemory<T> &x, int incx,
+                                   const T &beta, DeviceMemory<T> *y, int incy,
+                                   blas::ProfileResult *output_profile_result);
+
+  // mutex that guards the rocBLAS handle for this device.
+  mutex mu_;
+
+  // GpuExecutor which instantiated this ROCMBlas.
+  // Immutable post-initialization.
+  GpuExecutor *parent_;
+
+  // rocBLAS library handle on the device.
+  rocblas_handle blas_ GUARDED_BY(mu_);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ROCMBlas);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_BLAS_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_diagnostics.cc b/tensorflow/stream_executor/rocm/rocm_diagnostics.cc
index c6da7f9e3da9143466454466d4c402772ac69870..812974a9debb88e7db924680089acc5dc2ccc5a2 100644
--- a/tensorflow/stream_executor/rocm/rocm_diagnostics.cc
+++ b/tensorflow/stream_executor/rocm/rocm_diagnostics.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
-#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/numbers.h"
 #include "tensorflow/stream_executor/lib/process_state.h"
@@ -38,9 +37,10 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/str_util.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
 
 namespace stream_executor {
-namespace gpu {
+namespace rocm {
 
 string DriverVersionToString(DriverVersion version) {
   return absl::StrFormat("%d.%d.%d", std::get<0>(version), std::get<1>(version),
@@ -95,6 +95,12 @@ port::StatusOr<DriverVersion> StringToDriverVersion(const string& value) {
   return result;
 }
 
+}  // namespace rocm
+}  // namespace stream_executor
+
+namespace stream_executor {
+namespace gpu {
+
 // -- class Diagnostician
 
 string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
@@ -133,11 +139,11 @@ void Diagnostician::LogDiagnosticInformation() {
   }
   port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
   LOG(INFO) << "librocm reported version is: "
-            << DriverVersionStatusToString(dso_version);
+            << rocm::DriverVersionStatusToString(dso_version);
 
   port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
   LOG(INFO) << "kernel reported version is: "
-            << DriverVersionStatusToString(kernel_version);
+            << rocm::DriverVersionStatusToString(kernel_version);
 
   if (kernel_version.ok() && dso_version.ok()) {
     WarnOnDsoKernelMismatch(dso_version, kernel_version);
@@ -175,7 +181,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
       // TODO(b/22689637): Eliminate the explicit namespace if possible.
       auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
       auto result = static_cast<port::StatusOr<DriverVersion>*>(data);
-      *result = StringToDriverVersion(stripped_dso_version);
+      *result = rocm::StringToDriverVersion(stripped_dso_version);
       return 1;
     }
     return 0;
@@ -205,7 +211,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
   // TODO(b/22689637): Eliminate the explicit namespace if possible.
   auto stripped_kernel_version =
       port::StripSuffixString(kernel_version, ".ld64");
-  return StringToDriverVersion(stripped_kernel_version);
+  return rocm::StringToDriverVersion(stripped_kernel_version);
 }
 
 void Diagnostician::WarnOnDsoKernelMismatch(
@@ -214,12 +220,12 @@ void Diagnostician::WarnOnDsoKernelMismatch(
   if (kernel_version.ok() && dso_version.ok() &&
       dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
     LOG(INFO) << "kernel version seems to match DSO: "
-              << DriverVersionToString(kernel_version.ValueOrDie());
+              << rocm::DriverVersionToString(kernel_version.ValueOrDie());
   } else {
     LOG(ERROR) << "kernel version "
-               << DriverVersionStatusToString(kernel_version)
+               << rocm::DriverVersionStatusToString(kernel_version)
                << " does not match DSO version "
-               << DriverVersionStatusToString(dso_version)
+               << rocm::DriverVersionStatusToString(dso_version)
                << " -- cannot find working devices in this configuration";
   }
 }
diff --git a/tensorflow/stream_executor/rocm/rocm_diagnostics.h b/tensorflow/stream_executor/rocm/rocm_diagnostics.h
new file mode 100644
index 0000000000000000000000000000000000000000..233c6bdade68e19e02a30c92e92d5961d9ca260b
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_diagnostics.h
@@ -0,0 +1,41 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DIAGNOSTICS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DIAGNOSTICS_H_
+
+#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
+
+namespace stream_executor {
+namespace rocm {
+
+// e.g. DriverVersion{346, 3, 4}
+using DriverVersion = gpu::DriverVersion;
+
+// Converts a parsed driver version to string form.
+string DriverVersionToString(DriverVersion version);
+
+// Converts a parsed driver version or status value to natural string form.
+string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
+
+// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
+port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
+
+using Diagnostician = gpu::Diagnostician;
+
+}  // namespace rocm
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DIAGNOSTICS_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_driver.cc b/tensorflow/stream_executor/rocm/rocm_driver.cc
index 39d52d2830429fd6fcb93d251b5b86700e785fdd..1b0b91426aae0ed7cca445143186a9b5daeced6a 100644
--- a/tensorflow/stream_executor/rocm/rocm_driver.cc
+++ b/tensorflow/stream_executor/rocm/rocm_driver.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h"
 
 bool FLAGS_gpuexec_rocm_driver_inject_init_error = false;
 bool FLAGS_gpuexec_rocm_sync_around_driver_calls = false;
@@ -143,7 +144,7 @@ string MemorySpaceString(MemorySpace memory_space) {
 // HIP driver (e.g., this value is not our cached view of the current device).
 static int CurrentDeviceOrDie() {
   int current = -1;
-  hipError_t result = hipGetDevice(&current);
+  hipError_t result = tensorflow::wrap::hipGetDevice(&current);
   if (result != hipSuccess) {
     LOG(FATAL) << "failed to query current device: " << ToString(result);
   }
@@ -154,7 +155,7 @@ namespace {
 
 // Call hipDeviceSynchronize and crash if it doesn't succeed.
 void SynchronizeOrDie() {
-  auto res = hipDeviceSynchronize();
+  auto res = tensorflow::wrap::hipDeviceSynchronize();
   if (res != hipSuccess) {
     LOG(FATAL) << "Synchronize found " << ToString(res)
                << " :: " << port::CurrentStackTrace();
@@ -197,7 +198,8 @@ ScopedActivateContext::ScopedActivateContext(GpuContext* context) {
           << tls->current_device_ordinal << " to " << context->device_ordinal();
 
   // Set the device and update thread local.
-  CHECK_EQ(hipSuccess, hipSetDevice(context->device_ordinal()));
+  CHECK_EQ(hipSuccess,
+           tensorflow::wrap::hipSetDevice(context->device_ordinal()));
   tls->current_device_ordinal = context->device_ordinal();
 }
 
@@ -225,7 +227,8 @@ ScopedActivateContext::~ScopedActivateContext() {
           << to_restore_->device_ordinal();
 
   // Set context and update thread local.
-  CHECK_EQ(hipSuccess, hipSetDevice(to_restore_->device_ordinal()));
+  CHECK_EQ(hipSuccess,
+           tensorflow::wrap::hipSetDevice(to_restore_->device_ordinal()));
   tls->current_device_ordinal = to_restore_->device_ordinal();
 }
 
@@ -261,7 +264,8 @@ string ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer) {
 // in the process of querying.
 string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
   hipPointerAttribute_t from_pointerAttributes;
-  hipError_t result = hipPointerGetAttributes(&from_pointerAttributes, from);
+  hipError_t result =
+      tensorflow::wrap::hipPointerGetAttributes(&from_pointerAttributes, from);
   if (result != hipSuccess) {
     LOG(ERROR) << "could not retrieve source pointer's device: "
                << ToString(result);
@@ -269,7 +273,7 @@ string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
   }
 
   hipPointerAttribute_t to_pointerAttributes;
-  result = hipPointerGetAttributes(&to_pointerAttributes, to);
+  result = tensorflow::wrap::hipPointerGetAttributes(&to_pointerAttributes, to);
   if (result != hipSuccess) {
     LOG(ERROR) << "could not retrieve destination pointer's device: "
                << ToString(result);
@@ -289,7 +293,7 @@ static port::Status InternalInit() {
   if (FLAGS_gpuexec_rocm_driver_inject_init_error) {
     LOG(ERROR) << "injecting ROCM init error; initialization will fail";
   } else {
-    res = hipInit(0 /* = flags */);
+    res = tensorflow::wrap::hipInit(0 /* = flags */);
   }
 
   if (res == hipSuccess) {
@@ -322,7 +326,7 @@ static port::Status InternalInit() {
 
 /* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
                                                hipDevice_t* device) {
-  hipError_t res = hipDeviceGet(device, device_ordinal);
+  hipError_t res = tensorflow::wrap::hipDeviceGet(device, device_ordinal);
   if (res == hipSuccess) {
     return port::Status::OK();
   }
@@ -336,7 +340,8 @@ static port::Status InternalInit() {
                                            string* device_name) {
   static const size_t kCharLimit = 64;
   absl::InlinedVector<char, 4> chars(kCharLimit);
-  hipError_t res = hipDeviceGetName(chars.begin(), kCharLimit - 1, device);
+  hipError_t res =
+      tensorflow::wrap::hipDeviceGetName(chars.begin(), kCharLimit - 1, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to get device name for " << device << ": "
                << ToString(res);
@@ -382,7 +387,8 @@ bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
 
 /* static */ bool GpuDriver::FuncSetCacheConfig(hipFunction_t function,
                                                 hipFuncCache_t cache_config) {
-  hipError_t res = hipFuncSetCacheConfig(function, cache_config);
+  hipError_t res =
+      tensorflow::wrap::hipFuncSetCacheConfig(function, cache_config);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to set ROCM kernel cache config. kernel: " << function
                << ", config: " << cache_config << ", result: " << ToString(res);
@@ -396,7 +402,8 @@ bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
 GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   hipSharedMemConfig shared_mem_config;
   ScopedActivateContext activation{context};
-  hipError_t result = hipDeviceGetSharedMemConfig(&shared_mem_config);
+  hipError_t result =
+      tensorflow::wrap::hipDeviceGetSharedMemConfig(&shared_mem_config);
   if (result != hipSuccess) {
     LOG(ERROR) << "failed to get ROCM device shared memory config. "
                << "Context device ID: " << context->device_ordinal()
@@ -411,7 +418,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
     GpuContext* context, hipSharedMemConfig shared_mem_config) {
   ScopedActivateContext activation{context};
-  hipError_t result = hipDeviceSetSharedMemConfig(shared_mem_config);
+  hipError_t result =
+      tensorflow::wrap::hipDeviceSetSharedMemConfig(shared_mem_config);
   if (result != hipSuccess) {
     LOG(ERROR) << "failed to set ROCM device shared memory config. "
                << "Context device ID: " << context->device_ordinal()
@@ -435,7 +443,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
           << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
           << " bdx: " << block_dim_x << " bdy: " << block_dim_y
           << " bdz: " << block_dim_z << " smem: " << shared_mem_bytes;
-  hipError_t res = hipModuleLaunchKernel(
+  hipError_t res = tensorflow::wrap::hipModuleLaunchKernel(
       function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y,
       block_dim_z, shared_mem_bytes, stream, kernel_params, extra);
   if (res != hipSuccess) {
@@ -471,7 +479,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
         ScopedActivateContext activation{context};
         void* hsaco_data = const_cast<char*>(hsaco_contents);
 
-        hipError_t res = hipModuleLoadData(module, hsaco_data);
+        hipError_t res =
+            tensorflow::wrap::hipModuleLoadData(module, hsaco_data);
 
         if (res != hipSuccess) {
           LOG(ERROR) << "failed to load HSACO: " << ToString(res);
@@ -491,7 +500,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                     hipDeviceptr_t location,
                                                     uint8 value, size_t size) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemset(location, value, size);
+  hipError_t res = tensorflow::wrap::hipMemset(location, value, size);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
     return false;
@@ -512,8 +521,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
     LOG(ERROR) << "failed to memset memory";
     return false;
   }
-  hipError_t res =
-      hipMemset(pointer, static_cast<int>(value), uint32_count * 4);
+  hipError_t res = tensorflow::wrap::hipMemset(pointer, static_cast<int>(value),
+                                               uint32_count * 4);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
     return false;
@@ -527,7 +536,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                      size_t uint32_count,
                                                      GpuStreamHandle stream) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemsetAsync(location, value, uint32_count, stream);
+  hipError_t res =
+      tensorflow::wrap::hipMemsetAsync(location, value, uint32_count, stream);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
     return false;
@@ -552,7 +562,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
     LOG(ERROR) << "failed to memset memory";
     return false;
   }
-  hipError_t res = hipMemsetAsync(pointer, value, uint32_count * 4, stream);
+  hipError_t res = tensorflow::wrap::hipMemsetAsync(pointer, value,
+                                                    uint32_count * 4, stream);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
     return false;
@@ -565,8 +576,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                GpuStreamHandle stream,
                                                StreamCallback callback,
                                                void* data) {
-  hipError_t res = hipStreamAddCallback(stream, (hipStreamCallback_t)callback,
-                                        data, 0 /* = flags */);
+  hipError_t res = tensorflow::wrap::hipStreamAddCallback(
+      stream, (hipStreamCallback_t)callback, data, 0 /* = flags */);
   if (res != hipSuccess) {
     LOG(ERROR) << "unable to add host callback: " << ToString(res);
     return false;
@@ -580,7 +591,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                hipFunction_t* function) {
   ScopedActivateContext activated{context};
   CHECK(module != nullptr && kernel_name != nullptr);
-  hipError_t res = hipModuleGetFunction(function, module, kernel_name);
+  hipError_t res =
+      tensorflow::wrap::hipModuleGetFunction(function, module, kernel_name);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to get kernel \"" << kernel_name
                << "\" from module: " << ToString(res);
@@ -598,7 +610,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   ScopedActivateContext activated{context};
   CHECK(module != nullptr && symbol_name != nullptr &&
         (dptr != nullptr || bytes != nullptr));
-  hipError_t res = hipModuleGetGlobal(dptr, bytes, module, symbol_name);
+  hipError_t res =
+      tensorflow::wrap::hipModuleGetGlobal(dptr, bytes, module, symbol_name);
   if (res != hipSuccess) {
     // symbol may not be found in the current module, but it may reside in
     // another module.
@@ -613,7 +626,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ void GpuDriver::UnloadModule(GpuContext* context,
                                           hipModule_t module) {
   ScopedActivateContext activated{context};
-  hipError_t res = hipModuleUnload(module);
+  hipError_t res = tensorflow::wrap::hipModuleUnload(module);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to unload module " << module
                << "; leaking: " << ToString(res);
@@ -623,7 +636,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ bool GpuDriver::CreateStream(GpuContext* context,
                                           GpuStreamHandle* stream) {
   ScopedActivateContext activated{context};
-  hipError_t res = hipStreamCreateWithFlags(
+  hipError_t res = tensorflow::wrap::hipStreamCreateWithFlags(
       stream, hipStreamDefault);  // switch to hipStreamNonBlocking?
   if (res != hipSuccess) {
     LOG(ERROR) << "could not allocate ROCM stream for device "
@@ -643,7 +656,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   }
 
   ScopedActivateContext activated{context};
-  hipError_t res = hipStreamDestroy(*stream);
+  hipError_t res = tensorflow::wrap::hipStreamDestroy(*stream);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to destroy ROCM stream for device "
                << context->device_ordinal() << ": " << ToString(res);
@@ -658,7 +671,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                              uint64 bytes) {
   ScopedActivateContext activated{context};
   hipDeviceptr_t result = 0;
-  hipError_t res = hipMalloc(&result, bytes);
+  hipError_t res = tensorflow::wrap::hipMallocVanilla(&result, bytes);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to allocate "
                << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
@@ -675,7 +688,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                               void* location) {
   ScopedActivateContext activation{context};
   hipDeviceptr_t pointer = absl::bit_cast<hipDeviceptr_t>(location);
-  hipError_t res = hipFree(pointer);
+  hipError_t res = tensorflow::wrap::hipFree(pointer);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to free device memory at " << location
                << "; result: " << ToString(res);
@@ -704,7 +717,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   ScopedActivateContext activation{context};
   void* host_mem = nullptr;
   // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
-  hipError_t res = hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
+  hipError_t res = tensorflow::wrap::hipHostMallocVanilla(
+      &host_mem, bytes, hipHostMallocPortable);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to alloc " << bytes
                << " bytes on host: " << ToString(res);
@@ -715,7 +729,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ void GpuDriver::HostDeallocate(GpuContext* context,
                                             void* location) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipHostFree(location);
+  hipError_t res = tensorflow::wrap::hipHostFree(location);
   if (res != hipSuccess) {
     LOG(ERROR) << "error deallocating host memory at " << location << ": "
                << ToString(res);
@@ -726,7 +740,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                           uint64 bytes) {
   ScopedActivateContext activation{context};
   // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
-  hipError_t res = hipHostRegister(location, bytes, hipHostRegisterPortable);
+  hipError_t res = tensorflow::wrap::hipHostRegister(location, bytes,
+                                                     hipHostRegisterPortable);
   if (res != hipSuccess) {
     LOG(ERROR) << "error registering host memory at " << location << ": "
                << ToString(res);
@@ -738,7 +753,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ bool GpuDriver::HostUnregister(GpuContext* context,
                                             void* location) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipHostUnregister(location);
+  hipError_t res = tensorflow::wrap::hipHostUnregister(location);
   if (res != hipSuccess) {
     LOG(ERROR) << "error unregistering host memory at " << location << ": "
                << ToString(res);
@@ -755,7 +770,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   }
 
   ScopedActivateContext activated{context};
-  hipError_t res = hipEventDestroy(*event);
+  hipError_t res = tensorflow::wrap::hipEventDestroy(*event);
   *event = nullptr;
 
   switch (res) {
@@ -779,7 +794,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                  GpuEventHandle event,
                                                  GpuStreamHandle stream) {
   ScopedActivateContext activated{context};
-  hipError_t res = hipEventRecord(event, stream);
+  hipError_t res = tensorflow::wrap::hipEventRecord(event, stream);
   switch (res) {
     case hipSuccess:
       return port::Status::OK();
@@ -800,7 +815,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::StatusOr<hipError_t> GpuDriver::QueryEvent(
     GpuContext* context, GpuEventHandle event) {
   ScopedActivateContext activated{context};
-  hipError_t res = hipEventQuery(event);
+  hipError_t res = tensorflow::wrap::hipEventQuery(event);
   if (res != hipSuccess && res != hipErrorNotReady) {
     return port::Status{
         port::error::INTERNAL,
@@ -817,12 +832,13 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   ScopedActivateContext activated{context};
   // The stop event must have completed in order for hipEventElapsedTime to
   // work.
-  hipError_t res = hipEventSynchronize(stop);
+  hipError_t res = tensorflow::wrap::hipEventSynchronize(stop);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
     return false;
   }
-  res = hipEventElapsedTime(elapsed_milliseconds, start, stop);
+  res =
+      tensorflow::wrap::hipEventElapsedTime(elapsed_milliseconds, start, stop);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to get elapsed time between events: "
                << ToString(res);
@@ -836,7 +852,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                GpuStreamHandle stream,
                                                GpuEventHandle event) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipStreamWaitEvent(stream, event, 0 /* = flags */);
+  hipError_t res =
+      tensorflow::wrap::hipStreamWaitEvent(stream, event, 0 /* = flags */);
   if (res != hipSuccess) {
     LOG(ERROR) << "could not wait stream on event: " << ToString(res);
     return false;
@@ -847,7 +864,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 
 /* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipDeviceSynchronize();
+  hipError_t res = tensorflow::wrap::hipDeviceSynchronize();
   if (res != hipSuccess) {
     LOG(ERROR) << "could not synchronize on ROCM device: " << ToString(res)
                << " :: " << port::CurrentStackTrace();
@@ -861,7 +878,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                        GpuStreamHandle stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
-  hipError_t res = hipStreamSynchronize(stream);
+  hipError_t res = tensorflow::wrap::hipStreamSynchronize(stream);
   if (res != hipSuccess) {
     port::Status status = port::InternalError(
         absl::StrCat("could not synchronize on ROCM stream: ", ToString(res)));
@@ -877,7 +894,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                           GpuStreamHandle stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
-  hipError_t res = hipStreamQuery(stream);
+  hipError_t res = tensorflow::wrap::hipStreamQuery(stream);
   if (res == hipSuccess) {
     return true;
   }
@@ -891,7 +908,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::Status GpuDriver::SynchronousMemcpyD2H(
     GpuContext* context, void* host_dst, hipDeviceptr_t gpu_src, uint64 size) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemcpyDtoH(host_dst, gpu_src, size);
+  hipError_t res = tensorflow::wrap::hipMemcpyDtoH(host_dst, gpu_src, size);
   if (res != hipSuccess) {
     return port::InternalError(
         absl::StrFormat("failed to synchronous memcpy from device to host: %s; "
@@ -908,7 +925,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
     GpuContext* context, hipDeviceptr_t gpu_dst, const void* host_src,
     uint64 size) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemcpyHtoD(gpu_dst, const_cast<void*>(host_src), size);
+  hipError_t res = tensorflow::wrap::hipMemcpyHtoD(
+      gpu_dst, const_cast<void*>(host_src), size);
   if (res != hipSuccess) {
     return port::InternalError(absl::StrFormat(
         "failed to synchronous memcpy from host to device: %s; Gpu dst: %p;"
@@ -924,7 +942,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
     GpuContext* context, hipDeviceptr_t gpu_dst, hipDeviceptr_t gpu_src,
     uint64 size) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemcpyDtoD(gpu_dst, gpu_src, size);
+  hipError_t res = tensorflow::wrap::hipMemcpyDtoD(gpu_dst, gpu_src, size);
   if (res != hipSuccess) {
     return port::InternalError(absl::StrFormat(
         "failed to synchronous memcpy from host to device: %s; Gpu dst: %p; "
@@ -942,7 +960,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                    uint64 size,
                                                    GpuStreamHandle stream) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
+  hipError_t res =
+      tensorflow::wrap::hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
   if (res != hipSuccess) {
     LOG(ERROR) << absl::StrFormat(
         "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
@@ -963,8 +982,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                    uint64 size,
                                                    GpuStreamHandle stream) {
   ScopedActivateContext activation{context};
-  hipError_t res =
-      hipMemcpyHtoDAsync(gpu_dst, const_cast<void*>(host_src), size, stream);
+  hipError_t res = tensorflow::wrap::hipMemcpyHtoDAsync(
+      gpu_dst, const_cast<void*>(host_src), size, stream);
   if (res != hipSuccess) {
     LOG(ERROR) << absl::StrFormat(
         "failed to enqueue async memcpy from host to device: %s; Gpu dst: %p; "
@@ -984,7 +1003,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                    uint64 size,
                                                    GpuStreamHandle stream) {
   ScopedActivateContext activation{context};
-  hipError_t result = hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
+  hipError_t result =
+      tensorflow::wrap::hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
   if (result != hipSuccess) {
     LOG(ERROR) << absl::StrFormat(
         "failed to enqueue async memcpy from device to device: %s"
@@ -1021,7 +1041,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   }
 
   ScopedActivateContext activated{context};
-  hipError_t res = hipEventCreateWithFlags(event, hipflags);
+  hipError_t res = tensorflow::wrap::hipEventCreateWithFlags(event, hipflags);
 
   if (res == hipSuccess) {
     return port::Status::OK();
@@ -1037,7 +1057,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 
 /* static */ int GpuDriver::GetDeviceCount() {
   int device_count = 0;
-  hipError_t res = hipGetDeviceCount(&device_count);
+  hipError_t res = tensorflow::wrap::hipGetDeviceCount(&device_count);
   if (res != hipSuccess) {
     LOG(ERROR) << "could not retrieve ROCM device count: " << ToString(res);
     return 0;
@@ -1061,7 +1081,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 
 /* static */ port::Status GpuDriver::GetPointerAddressRange(
     hipDeviceptr_t dptr, hipDeviceptr_t* base, size_t* size) {
-  hipError_t result = hipMemGetAddressRange(base, size, dptr);
+  hipError_t result = tensorflow::wrap::hipMemGetAddressRange(base, size, dptr);
   if (result == hipSuccess) {
     return port::Status::OK();
   } else if (result == hipErrorNotFound) {
@@ -1106,7 +1126,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::StatusOr<hipDevice_t> GpuDriver::GetPointerDevice(
     hipDeviceptr_t pointer) {
   hipPointerAttribute_t pointerAttributes;
-  hipError_t result = hipPointerGetAttributes(&pointerAttributes, pointer);
+  hipError_t result =
+      tensorflow::wrap::hipPointerGetAttributes(&pointerAttributes, pointer);
   if (result != hipSuccess) {
     return port::Status{
         port::error::INTERNAL,
@@ -1114,7 +1135,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   }
 
   hipDevice_t device;
-  result = hipDeviceGet(&device, pointerAttributes.device);
+  result = tensorflow::wrap::hipDeviceGet(&device, pointerAttributes.device);
   if (result != hipSuccess) {
     return port::Status{
         port::error::INTERNAL,
@@ -1127,7 +1148,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
                                                       hipDevice_t device) {
   hipDeviceProp_t props;
-  hipError_t result = hipGetDeviceProperties(&props, device);
+  hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
   if (result == hipSuccess) {
     *version = props.gcnArch;
     return port::Status::OK();
@@ -1145,7 +1166,8 @@ template <typename T>
 static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
                                             hipDeviceAttribute_t attribute) {
   int value = -1;
-  hipError_t result = hipDeviceGetAttribute(&value, attribute, device);
+  hipError_t result =
+      tensorflow::wrap::hipDeviceGetAttribute(&value, attribute, device);
   if (result != hipSuccess) {
     return port::Status{
         port::error::NOT_FOUND,
@@ -1199,22 +1221,24 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
 /* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
                                            hipDevice_t device) {
   int value;
-  hipError_t res =
-      hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, device);
+  hipError_t res = tensorflow::wrap::hipDeviceGetAttribute(
+      &value, hipDeviceAttributeMaxGridDimX, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
     return false;
   }
   *x = value;
 
-  res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, device);
+  res = tensorflow::wrap::hipDeviceGetAttribute(
+      &value, hipDeviceAttributeMaxGridDimY, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
     return false;
   }
   *y = value;
 
-  res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, device);
+  res = tensorflow::wrap::hipDeviceGetAttribute(
+      &value, hipDeviceAttributeMaxGridDimZ, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
     return false;
@@ -1224,7 +1248,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
 }
 
 /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
-  hipError_t res = hipDriverGetVersion(driver_version);
+  hipError_t res = tensorflow::wrap::hipDriverGetVersion(driver_version);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query driver version: " << ToString(res);
     return false;
@@ -1235,7 +1259,8 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
 
 /* static */ bool GpuDriver::GetDeviceProperties(
     hipDeviceProp_t* device_properties, int device_ordinal) {
-  hipError_t res = hipGetDeviceProperties(device_properties, device_ordinal);
+  hipError_t res = tensorflow::wrap::hipGetDeviceProperties(device_properties,
+                                                            device_ordinal);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query device properties: " << ToString(res);
     return false;
@@ -1268,7 +1293,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
   ScopedActivateContext activation{context};
   size_t free = 0;
   size_t total = 0;
-  hipError_t res = hipMemGetInfo(&free, &total);
+  hipError_t res = tensorflow::wrap::hipMemGetInfo(&free, &total);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query device memory info: " << ToString(res);
     return false;
@@ -1282,7 +1307,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
 /* static */ bool GpuDriver::GetDeviceTotalMemory(hipDevice_t device,
                                                   uint64* result) {
   size_t value = -1;
-  hipError_t res = hipDeviceTotalMem(&value, device);
+  hipError_t res = tensorflow::wrap::hipDeviceTotalMem(&value, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query total available memory: " << ToString(res);
     return false;
@@ -1297,7 +1322,8 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
   static const int kBufferSize = 64;
   absl::InlinedVector<char, 4> chars(kBufferSize);
   chars[kBufferSize - 1] = '\0';
-  hipError_t res = hipDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
+  hipError_t res = tensorflow::wrap::hipDeviceGetPCIBusId(
+      chars.begin(), kBufferSize - 1, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
     return pci_bus_id;
@@ -1313,7 +1339,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
   }
 
   int can_access_peer = -1;
-  hipError_t res = hipDeviceCanAccessPeer(
+  hipError_t res = tensorflow::wrap::hipDeviceCanAccessPeer(
       &can_access_peer, from->device_ordinal(), to->device_ordinal());
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
@@ -1330,8 +1356,8 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
   }
 
   ScopedActivateContext activated{from};
-  hipError_t result =
-      hipDeviceEnablePeerAccess(to->device_ordinal(), 0 /* = flags */);
+  hipError_t result = tensorflow::wrap::hipDeviceEnablePeerAccess(
+      to->device_ordinal(), 0 /* = flags */);
   if (result != hipSuccess && result != hipErrorPeerAccessAlreadyEnabled) {
     return port::Status{
         port::error::INTERNAL,
diff --git a/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h b/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..27495c2cbc04e7b6171814e142414a9817c26226
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h
@@ -0,0 +1,146 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file wraps rocm driver calls with dso loader so that we don't need to
+// have explicit linking to librocm. All TF rocm driver usage should route
+// through this wrapper.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
+
+#include "rocm/include/hip/hip_runtime.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+#if defined(TENSORFLOW_USE_ROCM)
+
+#endif
+
+namespace tensorflow {
+namespace wrap {
+#ifdef PLATFORM_GOOGLE
+// Use static linked library
+#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                          \
+  template <typename... Args>                                            \
+  auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) { \
+    return ::hipSymbolName(args...);                                     \
+  }
+
+// This macro wraps a global identifier, given by hipSymbolName, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#else
+#define TO_STR_(x) #x
+#define TO_STR(x) TO_STR_(x)
+
+// hipMalloc and hipHostMalloc are defined as funtion templates in the
+// HIP header files, and hence their names get mangled and the attempt
+// to resolve their name when trying to dynamically load them will fail
+// Updating the HIP header files to make them C functions is underway.
+// Until that change flows through, we will workaround the issue by
+// creating dummy wrappers for them here
+
+hipError_t hipMallocVanilla(void** ptr, size_t size) {
+  return hipErrorNotInitialized;
+}
+
+hipError_t hipHostMallocVanilla(void** ptr, size_t size, unsigned int flags) {
+  return hipErrorNotInitialized;
+}
+
+#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                             \
+  template <typename... Args>                                               \
+  auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) {    \
+    using FuncPtrT = std::add_pointer<decltype(::hipSymbolName)>::type;     \
+    static FuncPtrT loaded = []() -> FuncPtrT {                             \
+      static const char *kName = TO_STR(hipSymbolName);                     \
+      void *f;                                                              \
+      auto s = stream_executor::port::Env::Default()->GetSymbolFromLibrary( \
+          stream_executor::internal::CachedDsoLoader::GetHipDsoHandle()     \
+              .ValueOrDie(),                                                \
+          kName, &f);                                                       \
+      CHECK(s.ok()) << "could not find " << kName                           \
+                    << " in HIP DSO; dlerror: " << s.error_message();       \
+      return reinterpret_cast<FuncPtrT>(f);                                 \
+    }();                                                                    \
+    return loaded(args...);                                                 \
+  }
+#endif
+
+// clang-format off
+#define HIP_ROUTINE_EACH(__macro)                   \
+  __macro(hipDeviceCanAccessPeer)                   \
+  __macro(hipDeviceEnablePeerAccess)                \
+  __macro(hipDeviceGet)                             \
+  __macro(hipDeviceGetAttribute)                    \
+  __macro(hipDeviceGetName)                         \
+  __macro(hipDeviceGetPCIBusId)                     \
+  __macro(hipDeviceGetSharedMemConfig)              \
+  __macro(hipDeviceSetSharedMemConfig)              \
+  __macro(hipDeviceSynchronize)                     \
+  __macro(hipDeviceTotalMem)                        \
+  __macro(hipDriverGetVersion)                      \
+  __macro(hipEventCreateWithFlags)                  \
+  __macro(hipEventElapsedTime)                      \
+  __macro(hipEventDestroy)                          \
+  __macro(hipEventQuery)                            \
+  __macro(hipEventRecord)                           \
+  __macro(hipEventSynchronize)                      \
+  __macro(hipFree)                                  \
+  __macro(hipFuncSetCacheConfig)                    \
+  __macro(hipGetDevice)                             \
+  __macro(hipGetDeviceCount)                        \
+  __macro(hipGetDeviceProperties)                   \
+  __macro(hipHostFree)                              \
+  __macro(hipHostRegister)                          \
+  __macro(hipHostUnregister)                        \
+  __macro(hipInit)                                  \
+  __macro(hipMemGetAddressRange)                    \
+  __macro(hipMemGetInfo)                            \
+  __macro(hipMemcpyDtoD)                            \
+  __macro(hipMemcpyDtoDAsync)                       \
+  __macro(hipMemcpyDtoH)                            \
+  __macro(hipMemcpyDtoHAsync)                       \
+  __macro(hipMemcpyHtoD)                            \
+  __macro(hipMemcpyHtoDAsync)                       \
+  __macro(hipMemset)                                \
+  __macro(hipMemsetAsync)                           \
+  __macro(hipModuleGetFunction)                     \
+  __macro(hipModuleGetGlobal)                       \
+  __macro(hipModuleLaunchKernel)                    \
+  __macro(hipModuleLoadData)                        \
+  __macro(hipModuleUnload)                          \
+  __macro(hipPointerGetAttributes)                  \
+  __macro(hipSetDevice)                             \
+  __macro(hipStreamAddCallback)                     \
+  __macro(hipStreamCreateWithFlags)                 \
+  __macro(hipStreamDestroy)                         \
+  __macro(hipStreamQuery)                           \
+  __macro(hipStreamSynchronize)                     \
+  __macro(hipStreamWaitEvent)                       \
+// clang-format on
+
+HIP_ROUTINE_EACH(STREAM_EXECUTOR_HIP_WRAP)
+#undef HIP_ROUTINE_EACH
+#undef STREAM_EXECUTOR_HIP_WRAP
+#undef TO_STR
+#undef TO_STR_
+}  // namespace wrap
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_fft.cc b/tensorflow/stream_executor/rocm/rocm_fft.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2af973309c01ec67de0d7022e5a8cefd18c5063a
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_fft.cc
@@ -0,0 +1,618 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/rocm/rocm_fft.h"
+
+#include <complex>
+
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kRocFftPlugin);
+
+namespace wrap {
+
+#ifdef PLATFORM_GOOGLE
+// This macro wraps a global identifier, given by __name, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#define STREAM_EXECUTOR_ROCFFT_WRAP(__name)                      \
+  struct WrapperShim__##__name {                                 \
+    template <typename... Args>                                  \
+    hipfftResult operator()(GpuExecutor *parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};            \
+      return ::__name(args...);                                  \
+    }                                                            \
+  } __name;
+
+#else
+
+#define STREAM_EXECUTOR_ROCFFT_WRAP(__name)                               \
+  struct DynLoadShim__##__name {                                          \
+    static const char *kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void *GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetRocfftDsoHandle();           \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void *f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in rocfft DSO; dlerror: " << s.error_message();  \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    hipfftResult operator()(GpuExecutor *parent, Args... args) {          \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char *DynLoadShim__##__name::kName = #__name;
+
+#endif
+
+#define ROCFFT_ROUTINE_EACH(__macro)                                           \
+  __macro(hipfftDestroy) __macro(hipfftSetStream) __macro(hipfftPlan1d)        \
+      __macro(hipfftPlan2d) __macro(hipfftPlan3d) __macro(hipfftPlanMany)      \
+          __macro(hipfftCreate) __macro(hipfftSetAutoAllocation)               \
+              __macro(hipfftSetWorkArea) __macro(hipfftGetSize1d)              \
+                  __macro(hipfftMakePlan1d) __macro(hipfftGetSize2d)           \
+                      __macro(hipfftMakePlan2d) __macro(hipfftGetSize3d)       \
+                          __macro(hipfftMakePlan3d) __macro(hipfftGetSizeMany) \
+                              __macro(hipfftMakePlanMany)                      \
+                                  __macro(hipfftExecD2Z)                       \
+                                      __macro(hipfftExecZ2D)                   \
+                                          __macro(hipfftExecC2C)               \
+                                              __macro(hipfftExecC2R)           \
+                                                  __macro(hipfftExecZ2Z)       \
+                                                      __macro(hipfftExecR2C)
+
+ROCFFT_ROUTINE_EACH(STREAM_EXECUTOR_ROCFFT_WRAP)
+
+}  // namespace wrap
+
+namespace {
+
+// A helper function transforming gpu_fft arguments into rocFFT arguments.
+hipfftType ROCMFftType(fft::Type type) {
+  switch (type) {
+    case fft::Type::kC2CForward:
+    case fft::Type::kC2CInverse:
+      return HIPFFT_C2C;
+    case fft::Type::kC2R:
+      return HIPFFT_C2R;
+    case fft::Type::kR2C:
+      return HIPFFT_R2C;
+    case fft::Type::kZ2ZForward:
+    case fft::Type::kZ2ZInverse:
+      return HIPFFT_Z2Z;
+    case fft::Type::kZ2D:
+      return HIPFFT_Z2D;
+    case fft::Type::kD2Z:
+      return HIPFFT_D2Z;
+    default:
+      LOG(FATAL) << "Invalid value of fft::Type.";
+  }
+}
+
+// Associates the given stream with the given rocFFT plan.
+bool SetStream(GpuExecutor *parent, hipfftHandle plan, Stream *stream) {
+  auto ret = wrap::hipfftSetStream(parent, plan, AsGpuStreamValue(stream));
+  if (ret != HIPFFT_SUCCESS) {
+    LOG(ERROR) << "failed to run rocFFT routine hipfftSetStream: " << ret;
+    return false;
+  }
+  return true;
+}
+
+}  // namespace
+
+port::Status ROCMFftPlan::Initialize(
+    GpuExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
+    uint64 *input_embed, uint64 input_stride, uint64 input_distance,
+    uint64 *output_embed, uint64 output_stride, uint64 output_distance,
+    fft::Type type, int batch_count, ScratchAllocator *scratch_allocator) {
+  if (IsInitialized()) {
+    LOG(FATAL) << "Try to repeatedly initialize.";
+  }
+  is_initialized_ = true;
+  int elem_count_[3], input_embed_[3], output_embed_[3];
+  for (int i = 0; i < rank; ++i) {
+    elem_count_[i] = elem_count[i];
+    if (input_embed) {
+      input_embed_[i] = input_embed[i];
+    }
+    if (output_embed) {
+      output_embed_[i] = output_embed[i];
+    }
+  }
+  parent_ = parent;
+  fft_type_ = type;
+  if (batch_count == 1 && input_embed == nullptr && output_embed == nullptr) {
+    hipfftResult_t ret;
+    if (scratch_allocator == nullptr) {
+      switch (rank) {
+        case 1:
+          // hipfftPlan1d
+          ret = wrap::hipfftPlan1d(parent, &plan_, elem_count_[0],
+                                   ROCMFftType(type), 1 /* = batch */);
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to create rocFFT 1d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to create rocFFT 1d plan."};
+          }
+          return port::Status::OK();
+        case 2:
+          // hipfftPlan2d
+          ret = wrap::hipfftPlan2d(parent, &plan_, elem_count_[0],
+                                   elem_count_[1], ROCMFftType(type));
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to create rocFFT 2d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to create rocFFT 2d plan."};
+          }
+          return port::Status::OK();
+        case 3:
+          // hipfftPlan3d
+          ret =
+              wrap::hipfftPlan3d(parent, &plan_, elem_count_[0], elem_count_[1],
+                                 elem_count_[2], ROCMFftType(type));
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to create rocFFT 3d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to create rocFFT 3d plan."};
+          }
+          return port::Status::OK();
+        default:
+          LOG(ERROR) << "Invalid rank value for hipfftPlan. "
+                        "Requested 1, 2, or 3, given: "
+                     << rank;
+          return port::Status{port::error::INVALID_ARGUMENT,
+                              "hipfftPlan only takes rank 1, 2, or 3."};
+      }
+    } else {
+      ret = wrap::hipfftCreate(parent, &plan_);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to create rocFFT plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to create rocFFT plan."};
+      }
+      ret = wrap::hipfftSetAutoAllocation(parent, plan_, 0);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set auto allocation for rocFFT plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to set auto allocation for rocFFT plan."};
+      }
+      size_t size_in_bytes;
+      switch (rank) {
+        case 1:
+          ret = wrap::hipfftMakePlan1d(parent, plan_, elem_count_[0],
+                                       ROCMFftType(type), /*batch=*/1,
+                                       &size_in_bytes);
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to make rocFFT 1d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to make rocFFT 1d plan."};
+          }
+          break;
+        case 2:
+          ret = wrap::hipfftMakePlan2d(parent, plan_, elem_count_[0],
+                                       elem_count_[1], ROCMFftType(type),
+                                       &size_in_bytes);
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to make rocFFT 2d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to make rocFFT 2d plan."};
+          }
+          break;
+        case 3:
+          ret = wrap::hipfftMakePlan3d(parent, plan_, elem_count_[0],
+                                       elem_count_[1], elem_count_[2],
+                                       ROCMFftType(type), &size_in_bytes);
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to make rocFFT 3d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to make rocFFT 3d plan."};
+          }
+          break;
+        default:
+          LOG(ERROR) << "Invalid rank value for hipfftPlan. "
+                        "Requested 1, 2, or 3, given: "
+                     << rank;
+          return port::Status{port::error::INVALID_ARGUMENT,
+                              "hipfftPlan only takes rank 1, 2, or 3."};
+      }
+      // TODO(yangzihao): refactor this code and the one with the same function
+      // in the batch mode.
+      if (size_in_bytes != 0) {
+        auto allocated =
+            scratch_allocator->AllocateBytes(stream, size_in_bytes);
+        if (!allocated.ok() || (scratch_ = allocated.ValueOrDie()) == nullptr) {
+          LOG(ERROR) << "failed to allocate work area.";
+          return allocated.status();
+        }
+      }
+      // Connect work area with allocated space.
+      ret = wrap::hipfftSetWorkArea(parent, plan_, scratch_.opaque());
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set work area for rocFFT plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to set work area for rocFFT plan."};
+      }
+      return port::Status::OK();
+    }
+  } else {
+    // For either multiple batches or rank higher than 3, use hipfftPlanMany().
+    if (scratch_allocator == nullptr) {
+      auto ret = wrap::hipfftPlanMany(
+          parent, &plan_, rank, elem_count_,
+          input_embed ? input_embed_ : nullptr, input_stride, input_distance,
+          output_embed ? output_embed_ : nullptr, output_stride,
+          output_distance, ROCMFftType(type), batch_count);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to create rocFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to create rocFFT bacthed plan."};
+      }
+    } else {
+      auto ret = wrap::hipfftCreate(parent, &plan_);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to create rocFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to create rocFFT bacthed plan."};
+      }
+      ret = wrap::hipfftSetAutoAllocation(parent, plan_, 0);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set auto allocation for rocFFT batched plan:"
+                   << ret;
+        return port::Status{
+            port::error::INTERNAL,
+            "Failed to set auto allocation for rocFFT bacthed plan."};
+      }
+      size_t size_in_bytes;
+      ret = wrap::hipfftMakePlanMany(
+          parent, plan_, rank, elem_count_,
+          input_embed ? input_embed_ : nullptr, input_stride, input_distance,
+          output_embed ? output_embed_ : nullptr, output_stride,
+          output_distance, ROCMFftType(type), batch_count, &size_in_bytes);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to make rocFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to make rocFFT bacthed plan."};
+      }
+      if (size_in_bytes != 0) {
+        auto allocated =
+            scratch_allocator->AllocateBytes(stream, size_in_bytes);
+        if (!allocated.ok() || (scratch_ = allocated.ValueOrDie()) == nullptr) {
+          LOG(ERROR) << "failed to allocate work area.";
+          return allocated.status();
+        }
+      }
+      // Connect work area with allocated space.
+      ret = wrap::hipfftSetWorkArea(parent, plan_, scratch_.opaque());
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set work area for rocFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to set work area for rocFFT bacthed plan."};
+      }
+    }
+  }
+  return port::Status::OK();
+}
+
+port::Status ROCMFftPlan::Initialize(GpuExecutor *parent, Stream *stream,
+                                     int rank, uint64 *elem_count,
+                                     fft::Type type,
+                                     ScratchAllocator *scratch_allocator) {
+  return Initialize(parent_, stream, rank, elem_count,
+                    /*input_embed=*/nullptr, /*input_stride=*/0,
+                    /*input_distance=*/0,
+                    /*output_embed=*/nullptr, /*output_stride=*/0,
+                    /*output_distance=*/0, type, 1, scratch_allocator);
+}
+
+ROCMFftPlan::~ROCMFftPlan() { wrap::hipfftDestroy(parent_, plan_); }
+
+int ROCMFftPlan::GetFftDirection() const {
+  if (!IsInitialized()) {
+    LOG(FATAL) << "Try to get fft direction before initialization.";
+  } else {
+    switch (fft_type_) {
+      case fft::Type::kC2CForward:
+      case fft::Type::kZ2ZForward:
+      case fft::Type::kR2C:
+      case fft::Type::kD2Z:
+        return HIPFFT_FORWARD;
+      case fft::Type::kC2CInverse:
+      case fft::Type::kZ2ZInverse:
+      case fft::Type::kC2R:
+      case fft::Type::kZ2D:
+        return HIPFFT_BACKWARD;
+      default:
+        LOG(FATAL) << "Invalid value of fft::Type.";
+    }
+  }
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create1dPlan(Stream *stream, uint64 num_x,
+                                                 fft::Type type,
+                                                 bool in_place_fft) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[1] = {num_x};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, 1, elem_count, type, /*scratch_allocator=*/nullptr);
+  // TODO(yangzihao): In the future, send error msg back to TensorFlow
+  // so it can fail gracefully,
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize hipfft 1d plan: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create1dPlanWithScratchAllocator(
+    Stream *stream, uint64 num_x, fft::Type type, bool in_place_fft,
+    ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[1] = {num_x};
+  port::Status status = fft_plan_ptr->Initialize(parent_, stream, 1, elem_count,
+                                                 type, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL)
+        << "failed to initialize hipfft 1d plan with customized allocator: "
+        << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create2dPlan(Stream *stream, uint64 num_x,
+                                                 uint64 num_y, fft::Type type,
+                                                 bool in_place_fft) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[2] = {num_x, num_y};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, 1, elem_count, type, /*scratch_allocator=*/nullptr);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize hipfft 2d plan: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create2dPlanWithScratchAllocator(
+    Stream *stream, uint64 num_x, uint64 num_y, fft::Type type,
+    bool in_place_fft, ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[2] = {num_x, num_y};
+  port::Status status = fft_plan_ptr->Initialize(parent_, stream, 2, elem_count,
+                                                 type, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL)
+        << "failed to initialize hipfft 2d plan with customized allocator: "
+        << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create3dPlan(Stream *stream, uint64 num_x,
+                                                 uint64 num_y, uint64 num_z,
+                                                 fft::Type type,
+                                                 bool in_place_fft) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[3] = {num_x, num_y, num_z};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, 3, elem_count, type, /*scratch_allocator=*/nullptr);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize hipfft 3d plan: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create3dPlanWithScratchAllocator(
+    Stream *stream, uint64 num_x, uint64 num_y, uint64 num_z, fft::Type type,
+    bool in_place_fft, ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[3] = {num_x, num_y, num_z};
+  port::Status status = fft_plan_ptr->Initialize(parent_, stream, 3, elem_count,
+                                                 type, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL)
+        << "failed to initialize hipfft 3d plan with customized allocator: "
+        << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::CreateBatchedPlan(
+    Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed,
+    uint64 input_stride, uint64 input_distance, uint64 *output_embed,
+    uint64 output_stride, uint64 output_distance, fft::Type type,
+    bool in_place_fft, int batch_count) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, rank, elem_count, input_embed, input_stride,
+      input_distance, output_embed, output_stride, output_distance, type,
+      batch_count, /*scratch_allocator=*/nullptr);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize batched hipfft plan: "
+               << status.error_message();
+  }
+
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::CreateBatchedPlanWithScratchAllocator(
+    Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed,
+    uint64 input_stride, uint64 input_distance, uint64 *output_embed,
+    uint64 output_stride, uint64 output_distance, fft::Type type,
+    bool in_place_fft, int batch_count, ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, rank, elem_count, input_embed, input_stride,
+      input_distance, output_embed, output_stride, output_distance, type,
+      batch_count, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize batched hipfft plan with customized "
+                  "allocator: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+void ROCMFft::UpdatePlanWithScratchAllocator(
+    Stream *stream, fft::Plan *plan, ScratchAllocator *scratch_allocator) {
+  LOG(ERROR) << "update plan with scratch allocator not implemented";
+}
+
+template <typename FuncT, typename InputT, typename OutputT>
+bool ROCMFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT hipfftExec,
+                            const DeviceMemory<InputT> &input,
+                            DeviceMemory<OutputT> *output) {
+  ROCMFftPlan *rocm_fft_plan = dynamic_cast<ROCMFftPlan *>(plan);
+  if (rocm_fft_plan == nullptr) {
+    LOG(ERROR) << "the passed-in plan is not a ROCMFftPlan object.";
+    return false;
+  }
+
+  if (!SetStream(parent_, rocm_fft_plan->GetPlan(), stream)) {
+    return false;
+  }
+
+  auto ret = hipfftExec(parent_, rocm_fft_plan->GetPlan(),
+                        GpuComplex(const_cast<InputT *>(GpuMemory(input))),
+                        GpuComplex(GpuMemoryMutable(output)));
+
+  if (ret != HIPFFT_SUCCESS) {
+    LOG(ERROR) << "failed to run rocFFT routine: " << ret;
+    return false;
+  }
+
+  return true;
+}
+
+template <typename FuncT, typename InputT, typename OutputT>
+bool ROCMFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
+                                         FuncT hipfftExec,
+                                         const DeviceMemory<InputT> &input,
+                                         DeviceMemory<OutputT> *output) {
+  ROCMFftPlan *rocm_fft_plan = dynamic_cast<ROCMFftPlan *>(plan);
+  if (rocm_fft_plan == nullptr) {
+    LOG(ERROR) << "the passed-in plan is not a ROCMFftPlan object.";
+    return false;
+  }
+
+  if (!SetStream(parent_, rocm_fft_plan->GetPlan(), stream)) {
+    return false;
+  }
+
+  auto ret = hipfftExec(parent_, rocm_fft_plan->GetPlan(),
+                        GpuComplex(const_cast<InputT *>(GpuMemory(input))),
+                        GpuComplex(GpuMemoryMutable(output)),
+                        rocm_fft_plan->GetFftDirection());
+
+  if (ret != HIPFFT_SUCCESS) {
+    LOG(ERROR) << "failed to run rocFFT routine: " << ret;
+    return false;
+  }
+
+  return true;
+}
+
+#define STREAM_EXECUTOR_ROCM_DEFINE_FFT(__type, __fft_type1, __fft_type2,    \
+                                        __fft_type3)                         \
+  bool ROCMFft::DoFft(Stream *stream, fft::Plan *plan,                       \
+                      const DeviceMemory<std::complex<__type>> &input,       \
+                      DeviceMemory<std::complex<__type>> *output) {          \
+    return DoFftWithDirectionInternal(                                       \
+        stream, plan, wrap::hipfftExec##__fft_type1, input, output);         \
+  }                                                                          \
+  bool ROCMFft::DoFft(Stream *stream, fft::Plan *plan,                       \
+                      const DeviceMemory<__type> &input,                     \
+                      DeviceMemory<std::complex<__type>> *output) {          \
+    return DoFftInternal(stream, plan, wrap::hipfftExec##__fft_type2, input, \
+                         output);                                            \
+  }                                                                          \
+  bool ROCMFft::DoFft(Stream *stream, fft::Plan *plan,                       \
+                      const DeviceMemory<std::complex<__type>> &input,       \
+                      DeviceMemory<__type> *output) {                        \
+    return DoFftInternal(stream, plan, wrap::hipfftExec##__fft_type3, input, \
+                         output);                                            \
+  }
+
+STREAM_EXECUTOR_ROCM_DEFINE_FFT(float, C2C, R2C, C2R)
+STREAM_EXECUTOR_ROCM_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
+
+#undef STREAM_EXECUTOR_ROCM_DEFINE_FFT
+
+}  // namespace gpu
+
+void initialize_rocfft() {
+  auto rocFftAlreadyRegistered = PluginRegistry::Instance()->HasFactory(
+      rocm::kROCmPlatformId, PluginKind::kFft, gpu::kRocFftPlugin);
+
+  if (!rocFftAlreadyRegistered) {
+    port::Status status =
+        PluginRegistry::Instance()->RegisterFactory<PluginRegistry::FftFactory>(
+            rocm::kROCmPlatformId, gpu::kRocFftPlugin, "rocFFT",
+            [](internal::StreamExecutorInterface *parent) -> fft::FftSupport * {
+              gpu::GpuExecutor *rocm_executor =
+                  dynamic_cast<gpu::GpuExecutor *>(parent);
+              if (rocm_executor == nullptr) {
+                LOG(ERROR)
+                    << "Attempting to initialize an instance of the rocFFT "
+                    << "support library with a non-ROCM StreamExecutor";
+                return nullptr;
+              }
+
+              return new gpu::ROCMFft(rocm_executor);
+            });
+    if (!status.ok()) {
+      LOG(ERROR) << "Unable to register rocFFT factory: "
+                 << status.error_message();
+    }
+
+    PluginRegistry::Instance()->SetDefaultFactory(
+        rocm::kROCmPlatformId, PluginKind::kFft, gpu::kRocFftPlugin);
+  }
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_rocfft,
+                            { stream_executor::initialize_rocfft(); });
diff --git a/tensorflow/stream_executor/rocm/rocm_fft.h b/tensorflow/stream_executor/rocm/rocm_fft.h
new file mode 100644
index 0000000000000000000000000000000000000000..7086d8a4b129a5807fdbde6d9ace6ee437edc3ce
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_fft.h
@@ -0,0 +1,132 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// ROCM-specific support for FFT functionality -- this wraps the rocFFT library
+// capabilities, and is only included into ROCM implementation code -- it will
+// not introduce rocm headers into other code.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_FFT_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_FFT_H_
+
+#include "rocm/include/rocfft/hipfft.h"
+#include "tensorflow/stream_executor/fft.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/scratch_allocator.h"
+
+namespace stream_executor {
+
+class Stream;
+
+namespace gpu {
+
+class GpuExecutor;
+
+// Opaque and unique indentifier for the rocFFT plugin.
+extern const PluginId kRocFftPlugin;
+
+// ROCMFftPlan uses deferred initialization. Only a single call of
+// Initialize() is allowed to properly create hipfft plan and set member
+// variable is_initialized_ to true. Newly added interface that uses member
+// variables should first check is_initialized_ to make sure that the values of
+// member variables are valid.
+class ROCMFftPlan : public fft::Plan {
+ public:
+  ROCMFftPlan()
+      : parent_(nullptr),
+        plan_(),
+        fft_type_(fft::Type::kInvalid),
+        scratch_(nullptr),
+        is_initialized_(false) {}
+  ~ROCMFftPlan() override;
+
+  // Get FFT direction in hipFFT based on FFT type.
+  int GetFftDirection() const;
+  hipfftHandle GetPlan() const {
+    if (IsInitialized()) {
+      return plan_;
+    } else {
+      LOG(FATAL) << "Try to get hipfftHandle value before initialization.";
+    }
+  }
+
+  // Initialize function for batched plan
+  port::Status Initialize(GpuExecutor *parent, Stream *stream, int rank,
+                          uint64 *elem_count, uint64 *input_embed,
+                          uint64 input_stride, uint64 input_distance,
+                          uint64 *output_embed, uint64 output_stride,
+                          uint64 output_distance, fft::Type type,
+                          int batch_count, ScratchAllocator *scratch_allocator);
+
+  // Initialize function for 1d,2d, and 3d plan
+  port::Status Initialize(GpuExecutor *parent, Stream *stream, int rank,
+                          uint64 *elem_count, fft::Type type,
+                          ScratchAllocator *scratch_allocator);
+
+ protected:
+  bool IsInitialized() const { return is_initialized_; }
+
+ private:
+  GpuExecutor *parent_;
+  hipfftHandle plan_;
+  fft::Type fft_type_;
+  DeviceMemory<uint8> scratch_;
+  bool is_initialized_;
+};
+
+// FFT support for ROCM platform via rocFFT library.
+//
+// This satisfies the platform-agnostic FftSupport interface.
+//
+// Note that the hipFFT handle that this encapsulates is implicitly tied to the
+// context (and, as a result, the device) that the parent GpuExecutor is tied
+// to. This simply happens as an artifact of creating the hipFFT handle when a
+// ROCM context is active.
+//
+// Thread-safe. The ROCM context associated with all operations is the ROCM
+// context of parent_, so all context is explicit.
+class ROCMFft : public fft::FftSupport {
+ public:
+  explicit ROCMFft(GpuExecutor *parent) : parent_(parent) {}
+  ~ROCMFft() override {}
+
+  TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES
+
+ private:
+  GpuExecutor *parent_;
+
+  // Two helper functions that execute dynload::hipfftExec?2?.
+
+  // This is for complex to complex FFT, when the direction is required.
+  template <typename FuncT, typename InputT, typename OutputT>
+  bool DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
+                                  FuncT hipfft_exec,
+                                  const DeviceMemory<InputT> &input,
+                                  DeviceMemory<OutputT> *output);
+
+  // This is for complex to real or real to complex FFT, when the direction
+  // is implied.
+  template <typename FuncT, typename InputT, typename OutputT>
+  bool DoFftInternal(Stream *stream, fft::Plan *plan, FuncT hipfft_exec,
+                     const DeviceMemory<InputT> &input,
+                     DeviceMemory<OutputT> *output);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ROCMFft);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_FFT_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
index 0e38556ee7d71e4fdcc788a6683b9d1a01e8b6dd..0f6af695dfdf7ee8623478fbc2250c2fb7d39e64 100644
--- a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
+++ b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "absl/base/casts.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
-#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
 #include "tensorflow/stream_executor/gpu/gpu_driver.h"
 #include "tensorflow/stream_executor/gpu/gpu_event.h"
 #include "tensorflow/stream_executor/gpu/gpu_executor.h"
@@ -41,6 +40,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
 #include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
@@ -655,7 +655,7 @@ port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
 blas::BlasSupport* GpuExecutor::CreateBlas() {
   PluginRegistry* registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::BlasFactory> status =
-      registry->GetFactory<PluginRegistry::BlasFactory>(kROCmPlatformId,
+      registry->GetFactory<PluginRegistry::BlasFactory>(rocm::kROCmPlatformId,
                                                         plugin_config_.blas());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve BLAS factory: "
@@ -669,7 +669,7 @@ blas::BlasSupport* GpuExecutor::CreateBlas() {
 dnn::DnnSupport* GpuExecutor::CreateDnn() {
   PluginRegistry* registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::DnnFactory> status =
-      registry->GetFactory<PluginRegistry::DnnFactory>(kROCmPlatformId,
+      registry->GetFactory<PluginRegistry::DnnFactory>(rocm::kROCmPlatformId,
                                                        plugin_config_.dnn());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve DNN factory: "
@@ -683,7 +683,7 @@ dnn::DnnSupport* GpuExecutor::CreateDnn() {
 fft::FftSupport* GpuExecutor::CreateFft() {
   PluginRegistry* registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::FftFactory> status =
-      registry->GetFactory<PluginRegistry::FftFactory>(kROCmPlatformId,
+      registry->GetFactory<PluginRegistry::FftFactory>(rocm::kROCmPlatformId,
                                                        plugin_config_.fft());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve FFT factory: "
@@ -697,7 +697,7 @@ fft::FftSupport* GpuExecutor::CreateFft() {
 rng::RngSupport* GpuExecutor::CreateRng() {
   PluginRegistry* registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::RngFactory> status =
-      registry->GetFactory<PluginRegistry::RngFactory>(kROCmPlatformId,
+      registry->GetFactory<PluginRegistry::RngFactory>(rocm::kROCmPlatformId,
                                                        plugin_config_.rng());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve RNG factory: "
@@ -878,12 +878,10 @@ DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
   {
     int driver_version = 0;
     (void)GpuDriver::GetDriverVersion(&driver_version);
-    string augmented_driver_version =
-        absl::StrFormat("%d (%s)", driver_version, "__FIXME__");
-    // FIXME:
-    // uncomment the line below once the "DriverVersionStatusToString"
-    // routine is moved from the "cuda" namespace to the "gpu" naemspace
-    // DriverVersionStatusToString(Diagnostician::FindDsoVersion()).c_str());
+    string augmented_driver_version = absl::StrFormat(
+        "%d (%s)", driver_version,
+        rocm::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
+            .c_str());
     builder.set_driver_version(augmented_driver_version);
   }
 
diff --git a/tensorflow/stream_executor/rocm/rocm_platform.cc b/tensorflow/stream_executor/rocm/rocm_platform.cc
index 113371dd5531a0b99351134907040bb8541ed94d..ce091658da4db4087f2b1078ad46b67afce5695e 100644
--- a/tensorflow/stream_executor/rocm/rocm_platform.cc
+++ b/tensorflow/stream_executor/rocm/rocm_platform.cc
@@ -94,7 +94,7 @@ port::StatusOr<StreamExecutor*> ROCmPlatform::FirstExecutorForBus(
       absl::StrFormat("Executor for bus %d not found.", bus_ordinal)};
 }
 
-Platform::Id ROCmPlatform::id() const { return kROCmPlatformId; }
+Platform::Id ROCmPlatform::id() const { return rocm::kROCmPlatformId; }
 
 int ROCmPlatform::VisibleDeviceCount() const {
   // Throw away the result - it logs internally, and this [containing] function
diff --git a/tensorflow/stream_executor/rocm/rocm_platform_id.cc b/tensorflow/stream_executor/rocm/rocm_platform_id.cc
index daa42ab022af805d40c8d61973f87540167b9c1a..bb07858a96babaed11c991c59ff9644e0933ac6b 100644
--- a/tensorflow/stream_executor/rocm/rocm_platform_id.cc
+++ b/tensorflow/stream_executor/rocm/rocm_platform_id.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
 
 namespace stream_executor {
-namespace gpu {
+namespace rocm {
 
 PLATFORM_DEFINE_ID(kROCmPlatformId);
 
-}  // namespace gpu
+}  // namespace rocm
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_platform_id.h b/tensorflow/stream_executor/rocm/rocm_platform_id.h
index 71c760b827717081a1618d3629015a5766fb1504..a17d4f97bbcb91e883f89d107da40aebcb6fba95 100644
--- a/tensorflow/stream_executor/rocm/rocm_platform_id.h
+++ b/tensorflow/stream_executor/rocm/rocm_platform_id.h
@@ -19,16 +19,16 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform.h"
 
 namespace stream_executor {
-namespace gpu {
+namespace rocm {
 
 // Opaque and unique identifier for the ROCm platform.
 // This is needed so that plugins can refer to/identify this platform without
 // instantiating a ROCmPlatform object.
 // This is broken out here to avoid a circular dependency between ROCmPlatform
-// and GpuExecutor.
+// and ROCmExecutor.
 extern const Platform::Id kROCmPlatformId;
 
-}  // namespace gpu
+}  // namespace rocm
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_rng.cc b/tensorflow/stream_executor/rocm/rocm_rng.cc
index 2048c8ff644379ff19ffd65944f44e9d7f31a8da..99bfc49d10fb0dd71aaf4ead6ece0f9336920545 100644
--- a/tensorflow/stream_executor/rocm/rocm_rng.cc
+++ b/tensorflow/stream_executor/rocm/rocm_rng.cc
@@ -14,15 +14,17 @@ limitations under the License.
 ==============================================================================*/
 
 #include "rocm/include/hiprand/hiprand.h"
+#include "tensorflow/stream_executor/gpu/gpu_rng.h"
+
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/gpu/gpu_activation.h"
 #include "tensorflow/stream_executor/gpu/gpu_executor.h"
 #include "tensorflow/stream_executor/gpu/gpu_helpers.h"
-#include "tensorflow/stream_executor/gpu/gpu_rng.h"
 #include "tensorflow/stream_executor/gpu/gpu_stream.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/rng.h"
 #include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
@@ -60,7 +62,9 @@ PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kGpuRandPlugin);
 
 namespace wrap {
 
-#define PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(__name)                     \
+#ifdef PLATFORM_GOOGLE
+
+#define STREAM_EXECUTOR_HIPRAND_WRAP(__name)                        \
   struct WrapperShim__##__name {                                    \
     template <typename... Args>                                     \
     hiprandStatus_t operator()(GpuExecutor* parent, Args... args) { \
@@ -69,15 +73,47 @@ namespace wrap {
     }                                                               \
   } __name;
 
-PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandCreateGenerator);
-PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandDestroyGenerator);
-PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetStream);
-PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateUniform);
-PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateUniformDouble);
-PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetPseudoRandomGeneratorSeed);
-PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetGeneratorOffset);
-PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateNormal);
-PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateNormalDouble);
+#else
+
+#define STREAM_EXECUTOR_HIPRAND_WRAP(__name)                              \
+  struct DynLoadShim__##__name {                                          \
+    static const char* kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void* GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetRocrandDsoHandle();          \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void* f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in rocrand DSO; dlerror: " << s.error_message(); \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    hiprandStatus operator()(GpuExecutor* parent, Args... args) {         \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char* DynLoadShim__##__name::kName = #__name;
+
+#endif
+
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandCreateGenerator);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandDestroyGenerator);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandSetStream);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandGenerateUniform);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandGenerateUniformDouble);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandSetPseudoRandomGeneratorSeed);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandSetGeneratorOffset);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandGenerateNormal);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandGenerateNormalDouble);
 
 }  // namespace wrap
 
@@ -245,40 +281,45 @@ bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
 }
 
 }  // namespace gpu
-}  // namespace stream_executor
 
-namespace se = ::stream_executor;
-
-REGISTER_MODULE_INITIALIZER(register_hiprand, {
-  se::port::Status status =
-      se::PluginRegistry::Instance()
-          ->RegisterFactory<se::PluginRegistry::RngFactory>(
-              se::gpu::kROCmPlatformId, se::gpu::kGpuRandPlugin, "hipRAND",
-              [](se::internal::StreamExecutorInterface* parent)
-                  -> se::rng::RngSupport* {
-                se::gpu::GpuExecutor* rocm_executor =
-                    dynamic_cast<se::gpu::GpuExecutor*>(parent);
-                if (rocm_executor == nullptr) {
-                  LOG(ERROR)
-                      << "Attempting to initialize an instance of the hipRAND "
-                      << "support library with a non-ROCM StreamExecutor";
-                  return nullptr;
-                }
-
-                se::gpu::GpuRng* rng = new se::gpu::GpuRng(rocm_executor);
-                if (!rng->Init()) {
-                  // Note: Init() will log a more specific error.
-                  delete rng;
-                  return nullptr;
-                }
-                return rng;
-              });
-
-  if (!status.ok()) {
-    LOG(ERROR) << "Unable to register hipRAND factory: "
-               << status.error_message();
+void initialize_rocrand() {
+  auto rocRandAlreadyRegistered = PluginRegistry::Instance()->HasFactory(
+      rocm::kROCmPlatformId, PluginKind::kRng, gpu::kGpuRandPlugin);
+
+  if (!rocRandAlreadyRegistered) {
+    port::Status status =
+        PluginRegistry::Instance()->RegisterFactory<PluginRegistry::RngFactory>(
+            rocm::kROCmPlatformId, gpu::kGpuRandPlugin, "rocRAND",
+            [](internal::StreamExecutorInterface* parent) -> rng::RngSupport* {
+              gpu::GpuExecutor* rocm_executor =
+                  dynamic_cast<gpu::GpuExecutor*>(parent);
+              if (rocm_executor == nullptr) {
+                LOG(ERROR)
+                    << "Attempting to initialize an instance of the hipRAND "
+                    << "support library with a non-ROCM StreamExecutor";
+                return nullptr;
+              }
+
+              gpu::GpuRng* rng = new gpu::GpuRng(rocm_executor);
+              if (!rng->Init()) {
+                // Note: Init() will log a more specific error.
+                delete rng;
+                return nullptr;
+              }
+              return rng;
+            });
+
+    if (!status.ok()) {
+      LOG(ERROR) << "Unable to register rocRAND factory: "
+                 << status.error_message();
+    }
+
+    PluginRegistry::Instance()->SetDefaultFactory(
+        rocm::kROCmPlatformId, PluginKind::kRng, gpu::kGpuRandPlugin);
   }
+}
+
+}  // namespace stream_executor
 
-  se::PluginRegistry::Instance()->SetDefaultFactory(
-      se::gpu::kROCmPlatformId, se::PluginKind::kRng, se::gpu::kGpuRandPlugin);
-});
+REGISTER_MODULE_INITIALIZER(register_rocrand,
+                            { stream_executor::initialize_rocrand(); });
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index e234e5d7e1dc8183e9f3c4217be23ed82b797f55..36eabda459cdec634de8542961d24942e943d4d1 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -27,6 +27,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/types/optional.h"
+#include "tensorflow/stream_executor/allocator_stats.h"
 #include "tensorflow/stream_executor/device_description.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/device_options.h"
@@ -367,6 +369,11 @@ class StreamExecutorInterface {
   // as a platform.
   virtual void *GpuContextHack() { return nullptr; }
 
+  // Return allocator statistics.
+  virtual absl::optional<AllocatorStats> GetAllocatorStats() {
+    return absl::nullopt;
+  }
+
  private:
   SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutorInterface);
 };
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index c680a02e9c956092d643d688c8620220c7c2a9c8..aae1efc7c4fa730da620b30c7536a8774a94bce2 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -871,6 +871,10 @@ bool StreamExecutor::UnregisterTraceListener(TraceListener *listener) {
   return true;
 }
 
+absl::optional<AllocatorStats> StreamExecutor::GetAllocatorStats() {
+  return implementation_->GetAllocatorStats();
+}
+
 template <typename TraceCallT, typename... ArgsT>
 void StreamExecutor::SubmitTrace(TraceCallT trace_call, ArgsT &&... args) {
   if (tracing_enabled_) {
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 508273e112a2242eb030d34410782cc7b9caf02d..09fe0a5c5df30a03c27e075902c847e2517c57e2 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/base/macros.h"
+#include "absl/types/optional.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
@@ -485,6 +486,9 @@ class StreamExecutor {
   // previously registered.
   bool UnregisterTraceListener(TraceListener* listener);
 
+  // Return allocator statistics.
+  absl::optional<AllocatorStats> GetAllocatorStats();
+
  private:
   template <typename BeginCallT, typename CompleteCallT,
             typename ReturnT, typename... BeginArgsT>
@@ -856,7 +860,7 @@ DeviceMemory<T> StreamExecutor::AllocateSubBuffer(DeviceMemory<T> *parent,
   }
   CreateAllocRecord(opaque, sizeof(T) * element_count);
   return DeviceMemory<T>(DeviceMemoryBase(opaque, sizeof(T) * element_count,
-                                    true /* = is_sub_buffer */));
+                                          true /* = is_sub_buffer */));
 }
 
 template <typename... Params, typename... Args>
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index fab1584cbb764392e86bd97367ff364344d1b87c..6163236791df5b6c38f4ed5e6d9b9e8df7302155 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -45,6 +45,7 @@ load(
     "//third_party/ngraph:build_defs.bzl",
     "if_ngraph",
 )
+
 def register_extension_info(**kwargs):
     pass
 
@@ -97,6 +98,11 @@ def tf_android_core_proto_headers(core_proto_sources_relative):
         for p in core_proto_sources_relative
     ])
 
+# Wrapper for portable protos which currently just creates an empty rule.
+def tf_portable_proto_library(name, proto_deps, **kwargs):
+    _ignore = [kwargs]
+    native.cc_library(name = name, deps = proto_deps)
+
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
 def clean_dep(dep):
@@ -146,6 +152,12 @@ def if_android(a):
         "//conditions:default": [],
     })
 
+def if_emscripten(a):
+    return select({
+        clean_dep("//tensorflow:emscripten"): a,
+        "//conditions:default": [],
+    })
+
 def if_ios(a):
     return select({
         clean_dep("//tensorflow:ios"): a,
@@ -306,9 +318,19 @@ def tf_opts_nortti_if_android():
 
 # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
 
+def tf_opts_nortti_if_emscripten():
+    return if_emscripten([
+        "-fno-rtti",
+        "-DGOOGLE_PROTOBUF_NO_RTTI",
+        "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
+    ])
+
 def tf_features_nomodules_if_android():
     return if_android(["-use_header_modules"])
 
+def tf_features_nomodules_if_emscripten():
+    return if_emscripten(["-use_header_modules"])
+
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate a library for that file.
 def tf_gen_op_libs(op_lib_names, deps = None, is_external = True):
@@ -364,16 +386,13 @@ def tf_binary_additional_srcs():
         ],
     )
 
-def _linux_kernel_dso_name(kernel_build_target):
-    """Given a build target, construct the dso name for linux."""
-    parts = kernel_build_target.split(":")
-    return "%s:libtfkernel_%s.so" % (parts[0], parts[1])
-
 # Helper functions to add kernel dependencies to tf binaries when using dynamic
 # kernel linking.
-def tf_binary_dynamic_kernel_dsos(kernels):
+def tf_binary_dynamic_kernel_dsos():
     return if_dynamic_kernels(
-        extra_deps = [_linux_kernel_dso_name(k) for k in kernels],
+        extra_deps = [
+            "//tensorflow/core/kernels:libtfkernel_all_kernels.so",
+        ],
         otherwise = [],
     )
 
@@ -397,9 +416,9 @@ def tf_cc_shared_object(
     native.cc_binary(
         name = name,
         srcs = srcs + framework_so,
-        deps = deps + tf_binary_dynamic_kernel_deps(kernels),
+        deps = deps,
         linkshared = 1,
-        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        data = data + tf_binary_dynamic_kernel_dsos(),
         linkopts = linkopts + _rpath_linkopts(name) + select({
             clean_dep("//tensorflow:darwin"): [
                 "-Wl,-install_name,@rpath/" + name.split("/")[-1],
@@ -430,6 +449,11 @@ def tf_cc_binary(
         copts = tf_copts(),
         kernels = [],
         **kwargs):
+    if kernels:
+        added_data_deps = tf_binary_dynamic_kernel_dsos()
+    else:
+        added_data_deps = []
+
     native.cc_binary(
         name = name,
         copts = copts,
@@ -439,7 +463,7 @@ def tf_cc_binary(
                 clean_dep("//third_party/mkl:intel_binary_blob"),
             ],
         ),
-        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        data = depset(data + added_data_deps),
         linkopts = linkopts + _rpath_linkopts(name),
         **kwargs
     )
@@ -591,6 +615,7 @@ def tf_gen_op_wrappers_cc(
             clean_dep("//tensorflow/core:core_cpu"),
             clean_dep("//tensorflow/core:framework"),
             clean_dep("//tensorflow/core:lib"),
+            clean_dep("//tensorflow/core:ops"),
             clean_dep("//tensorflow/core:protos_all_cc"),
         ]) + if_android([
             clean_dep("//tensorflow/core:android_tensorflow_lib"),
@@ -607,6 +632,7 @@ def tf_gen_op_wrappers_cc(
             clean_dep("//tensorflow/core:core_cpu"),
             clean_dep("//tensorflow/core:framework"),
             clean_dep("//tensorflow/core:lib"),
+            clean_dep("//tensorflow/core:ops"),
             clean_dep("//tensorflow/core:protos_all_cc"),
         ]) + if_android([
             clean_dep("//tensorflow/core:android_tensorflow_lib"),
@@ -785,7 +811,7 @@ def tf_cc_test(
                 clean_dep("//third_party/mkl:intel_binary_blob"),
             ],
         ),
-        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        data = data + tf_binary_dynamic_kernel_dsos(),
         exec_compatible_with = tf_exec_compatible_with(kwargs),
         # Nested select() statements seem not to be supported when passed to
         # linkstatic, and we already have a cuda select() passed in to this
@@ -907,7 +933,7 @@ def tf_cuda_only_cc_test(
         args = args,
         copts = _cuda_copts() + rocm_copts() + tf_copts(),
         features = if_cuda(["-use_header_modules"]),
-        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        data = data + tf_binary_dynamic_kernel_dsos(),
         deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_cuda_is_configured([
             clean_dep("//tensorflow/core:cuda"),
             clean_dep("//tensorflow/core:gpu_lib"),
@@ -986,7 +1012,7 @@ def tf_cc_test_mkl(
                 ],
             }) + _rpath_linkopts(src_to_test_name(src)),
             deps = deps + tf_binary_dynamic_kernel_deps(kernels) + mkl_deps(),
-            data = data + tf_binary_dynamic_kernel_dsos(kernels),
+            data = data + tf_binary_dynamic_kernel_dsos(),
             exec_compatible_with = tf_exec_compatible_with({"tags": tags}),
             linkstatic = linkstatic,
             tags = tags,
@@ -1040,7 +1066,7 @@ def tf_java_test(
     native.java_test(
         name = name,
         srcs = srcs,
-        deps = deps + tf_binary_additional_srcs() + tf_binary_dynamic_kernel_dsos(kernels) + tf_binary_dynamic_kernel_deps(kernels),
+        deps = deps + tf_binary_additional_srcs() + tf_binary_dynamic_kernel_dsos() + tf_binary_dynamic_kernel_deps(kernels),
         *args,
         **kwargs
     )
@@ -1440,7 +1466,7 @@ def cc_header_only_library(name, deps = [], includes = [], extra_deps = [], **kw
 
 def tf_custom_op_library_additional_deps():
     return [
-      "@protobuf_archive//:protobuf_headers",
+        "@protobuf_archive//:protobuf_headers",
         clean_dep("//third_party/eigen3"),
         clean_dep("//tensorflow/core:framework_headers_lib"),
     ] + if_windows(["//tensorflow/python:pywrap_tensorflow_import_lib"])
@@ -1450,8 +1476,8 @@ def tf_custom_op_library_additional_deps():
 # exporting symbols from _pywrap_tensorflow.dll on Windows.
 def tf_custom_op_library_additional_deps_impl():
     return [
-      "@protobuf_archive//:protobuf",
-      "@nsync//:nsync_cpp",
+        "@protobuf_archive//:protobuf",
+        "@nsync//:nsync_cpp",
         # for //third_party/eigen3
         clean_dep("//third_party/eigen3"),
         # for //tensorflow/core:framework_headers_lib
@@ -1760,7 +1786,7 @@ def py_test(deps = [], data = [], kernels = [], **kwargs):
         data = data + select({
             "//conditions:default": [],
             clean_dep("//tensorflow:no_tensorflow_py_deps"): ["//tensorflow/tools/pip_package:win_pip_package_marker"],
-        }) + tf_binary_dynamic_kernel_dsos(kernels),
+        }) + tf_binary_dynamic_kernel_dsos(),
         exec_compatible_with = tf_exec_compatible_with(kwargs),
         **kwargs
     )
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
index 078f1028fd4e98c11481c6bb9e08303dd3c54c19..2e8ece122c572f3430afd073179c4d61f532303a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
@@ -32,6 +32,12 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_BOOL
     }
+    field {
+      name: "collective_nccl"
+      number: 7
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
     reserved_range {
       start: 2
       end: 3
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
index d2ee0c4db668d0a1aa6190573f56f210a06e2653..9c7de2c5719350d6c30ac27c08712f326b014e3c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
@@ -155,6 +155,12 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_BOOL
       }
+      field {
+        name: "collective_nccl"
+        number: 7
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
       reserved_range {
         start: 2
         end: 3
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-critical-section.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-critical-section.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..024a20834635e2fc75ad7e6a393fc8f092d9631a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-critical-section.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.CriticalSection"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.critical_section_ops.CriticalSection\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'shared_name\', \'critical_section_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "execute"
+    argspec: "args=[\'self\', \'fn\', \'exclusive_resource_access\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
index a2cc07483a4e10918891f555ca9459fb7503bb32..6c528dd16200e870ca860312defcb43155247979 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
@@ -84,6 +84,18 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_STRING
       }
+      field {
+        name: "timestamped_allocator"
+        number: 5
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
+      field {
+        name: "pending_cap"
+        number: 6
+        label: LABEL_OPTIONAL
+        type: TYPE_INT32
+      }
       nested_type {
         name: "VirtualDevices"
         field {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.pbtxt
index d71c2358c93e9597726665fdf8f92e648b2ea772..b453f7e9903bf66d19b06974da016c8c2971372d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.pbtxt
@@ -44,6 +44,13 @@ tf_proto {
       type: TYPE_MESSAGE
       type_name: ".tensorflow.AssetFileDef"
     }
+    field {
+      name: "object_graph_def"
+      number: 7
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.SavedObjectGraph"
+    }
     nested_type {
       name: "MetaInfoDef"
       field {
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.experimental.-module.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-module.pbtxt
similarity index 61%
rename from tensorflow/tools/api/golden/v2/tensorflow.experimental.-module.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-module.pbtxt
index c364b0217a7ed10282dc8fc28797f3be1b92f867..8d599d73b84fc51b8bf3001f6773011e42f09456 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.experimental.-module.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-module.pbtxt
@@ -1,8 +1,8 @@
-path: "tensorflow.experimental.Module"
+path: "tensorflow.Module"
 tf_class {
   is_instance: "<class \'tensorflow.python.module.module.Module\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.AutoCheckpointable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "name"
@@ -12,18 +12,6 @@ tf_class {
     name: "name_scope"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "owned_submodules"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "owned_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "owned_variables"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "submodules"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.-function-graphs.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.-function-graphs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2e2f583d21a30fd1e97fc20dadd58d3a62a5141
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.-function-graphs.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.RunMetadata.FunctionGraphs"
+tf_proto {
+  descriptor {
+    name: "FunctionGraphs"
+    field {
+      name: "partition_graphs"
+      number: 1
+      label: LABEL_REPEATED
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.GraphDef"
+    }
+    field {
+      name: "pre_optimization_graph"
+      number: 2
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.GraphDef"
+    }
+    field {
+      name: "post_optimization_graph"
+      number: 3
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.GraphDef"
+    }
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.pbtxt
index 1287940326c0196e76fff2cf6363622226092504..777b889745fefd69e628ea26d3ca0a0fefc743b4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.pbtxt
@@ -23,5 +23,36 @@ tf_proto {
       type: TYPE_MESSAGE
       type_name: ".tensorflow.GraphDef"
     }
+    field {
+      name: "function_graphs"
+      number: 4
+      label: LABEL_REPEATED
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.RunMetadata.FunctionGraphs"
+    }
+    nested_type {
+      name: "FunctionGraphs"
+      field {
+        name: "partition_graphs"
+        number: 1
+        label: LABEL_REPEATED
+        type: TYPE_MESSAGE
+        type_name: ".tensorflow.GraphDef"
+      }
+      field {
+        name: "pre_optimization_graph"
+        number: 2
+        label: LABEL_OPTIONAL
+        type: TYPE_MESSAGE
+        type_name: ".tensorflow.GraphDef"
+      }
+      field {
+        name: "post_optimization_graph"
+        number: 3
+        label: LABEL_OPTIONAL
+        type: TYPE_MESSAGE
+        type_name: ".tensorflow.GraphDef"
+      }
+    }
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt
index d11e927bd55cea52d0dbdfd4b28b2c1bc24fdaa5..60518ffadc833b0ab07e25c9b738aed7f7e08f20 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.TensorShape"
 tf_class {
-  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShapeV1\'>"
+  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShape\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dims"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
index 341ace07663032a836da9c4c6b5f9fccccfb7add..9a4363829c20d10aeaed28abfa2146355ba12f46 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.Variable"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.variables.VariableV1\'>"
   is_instance: "<class \'tensorflow.python.ops.variables.Variable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "SaveSliceInfo"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.config.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.config.pbtxt
index d7e4529594df24666844bbce8cff729c5fa8fa67..09076a8302610d769e811d8e34333d946d51bc8c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.config.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.config.pbtxt
@@ -4,4 +4,12 @@ tf_module {
     name: "experimental_connect_to_host"
     argspec: "args=[\'remote_host\', \'job_name\'], varargs=None, keywords=None, defaults=[\'None\', \'worker\'], "
   }
+  member_method {
+    name: "experimental_list_devices"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_run_functions_eagerly"
+    argspec: "args=[\'run_eagerly\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
index f7d388d33d050eac2c9f14682bc7068c745a46bc..1c55f81bed19929276cc17d31d333ff53712bc8f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
@@ -39,6 +39,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -79,6 +83,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
index d73168b070e374a749a00f74b24b77a715d2f37e..5488449044a021ea58c92a028b109f701f6f1b79 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt
index 682a2b91b6187783eef74a4cd3672ae2ae2d47fb..87af1123d77a22a362abced5605beddf12395723 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.data.Iterator"
 tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.iterator_ops.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "initializer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
index 51224cd6b45f0a1efdfbb3ba6a3ca377d37fd00b..0927dd01a3f4629f6caf39f097085547638f0cf1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
index a10add1b7e38f9875e699903b3e3c103d73e647e..bab1e399210ea20ffcaaa417a4f709d74c5e8a00 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
index 71b597c19c512879b8f18b34843b160efecc6bec..68cf02333f05aa7f068686773ff3ce5d2f0d3ca5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
index 20646e87b5fbe23d89ad31ca632a64bf958339f6..6d3f88eded4a055b7047886c3245f28866ec59ee 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
index 86c5ff5b0bd7b42d61a92a44c8888852a48677be..bc4943e73788c8c59669e8f2fe1145e3d1c1fb01 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
index abc98a74b64ab274ed8b2fc43876b7102f1c7201..0e3999c0f4618f2b09a23ea0a32b5ae89990258d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
@@ -142,7 +142,7 @@ tf_module {
   }
   member_method {
     name: "make_csv_dataset"
-    argspec: "args=[\'file_pattern\', \'batch_size\', \'column_names\', \'column_defaults\', \'label_name\', \'select_columns\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'header\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'num_parallel_reads\', \'sloppy\', \'num_rows_for_inference\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \',\', \'True\', \'\', \'True\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'False\', \'100\', \'None\'], "
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'column_names\', \'column_defaults\', \'label_name\', \'select_columns\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'header\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'num_parallel_reads\', \'sloppy\', \'num_rows_for_inference\', \'compression_type\', \'ignore_errors\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \',\', \'True\', \'\', \'True\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'False\', \'100\', \'None\', \'False\'], "
   }
   member_method {
     name: "make_saveable_from_iterator"
@@ -152,6 +152,10 @@ tf_module {
     name: "map_and_batch"
     argspec: "args=[\'map_func\', \'batch_size\', \'num_parallel_batches\', \'drop_remainder\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "map_and_batch_with_legacy_function"
+    argspec: "args=[\'map_func\', \'batch_size\', \'num_parallel_batches\', \'drop_remainder\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "parallel_interleave"
     argspec: "args=[\'map_func\', \'cycle_length\', \'block_length\', \'sloppy\', \'buffer_output_elements\', \'prefetch_input_elements\'], varargs=None, keywords=None, defaults=[\'1\', \'False\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-cross-device-ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-cross-device-ops.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2ea23432416ee5f012e6f0c725d5f57841400f7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-cross-device-ops.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.distribute.CrossDeviceOps"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a38c4b21d563f0ef67eb2b2614fae678f0d97dce
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.HierarchicalCopyAllReduce"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.HierarchicalCopyAllReduce\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.AllReduceCrossDeviceOps\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_packs\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-nccl-all-reduce.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-nccl-all-reduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bdc09bcd84b96ee8475d3ef87ec5be686fc449ec
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-nccl-all-reduce.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.NcclAllReduce"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.NcclAllReduce\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.AllReduceCrossDeviceOps\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_packs\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5d7943af2dc82e59b4ccd34816f9ecb6fd77dae9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.distribute.OneDeviceStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.one_device_strategy.OneDeviceStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'device\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduction-to-one-device.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduction-to-one-device.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5ade9f86ba24779061bce3aa3e6f019d26741aa
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduction-to-one-device.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.distribute.ReductionToOneDevice"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.ReductionToOneDevice\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduce_to_device\', \'accumulation_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..619c3744d6cae858eb501392e962fbb94751355e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.distribute.experimental.MultiWorkerMirroredStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.collective_all_reduce_strategy.CollectiveAllReduceStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4cfa2f4ed2e8e346e847318e409dcf16b5f34f8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.distribute.experimental.ParameterServerStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.parameter_server_strategy.ParameterServerStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8859fbd38c43577c5f6040d717d11fb23941e4e2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.distribute.experimental"
+tf_module {
+  member {
+    name: "MultiWorkerMirroredStrategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ParameterServerStrategy"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
index 5b5c9e2df134dc2fd73d3aabcabe8ced18d0c61f..430e81166067cb9599b03fba43e066e099d928fc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.distribute"
 tf_module {
+  member {
+    name: "CrossDeviceOps"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "HierarchicalCopyAllReduce"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "InputContext"
     mtype: "<type \'type\'>"
@@ -12,10 +20,22 @@ tf_module {
     name: "MirroredStrategy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "NcclAllReduce"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OneDeviceStrategy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ReduceOp"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "ReductionToOneDevice"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ReplicaContext"
     mtype: "<type \'type\'>"
@@ -36,6 +56,10 @@ tf_module {
     name: "cluster_resolver"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "get_loss_reduction"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt
index bf7c1abcd89b29c29f3487cab58cfdf28103119c..d53752780090c2d621dcabfc8c31e4f1192bd7c7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.ModeKeys"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.model_fn.ModeKeys\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.mode_keys.EstimatorModeKeys\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "EVAL"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt
index 52874dd9b9316d9815c5aef51e272e6ffddb5224..ce486807a47031999ec4c9082bbf4b2e4af910ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.ClassificationOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ClassificationOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ClassificationOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt
index 964c315e9730effac38d60f7242527e71cbf9846..9775b2cca812d3fd47a3e821f032ed67c62d0078 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.ExportOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt
index bb82bc9e58627318b897f0610c7d852db7f98c07..a4389fb998e86cc291bb02d2ae04d220f1e152cc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.PredictOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.PredictOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.PredictOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "outputs"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt
index 8522834433f214e5d646ef6265b1047fb7f2cc4f..fc7f8447893c99f4f68bf12f0790e8a549232dc3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.RegressionOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.RegressionOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.RegressionOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "value"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt
index a7ee6d3e07d4387c4fc0bd9a5f6d9614473ca73c..0c3f04e468c4c817cd474deb42149aee3021aa43 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt
@@ -1,9 +1,5 @@
 path: "tensorflow.experimental"
 tf_module {
-  member {
-    name: "Module"
-    mtype: "<class \'tensorflow.python.module.module.ModuleMetaclass\'>"
-  }
   member_method {
     name: "function_executor_type"
     argspec: "args=[\'executor_type\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.feature_column.pbtxt
index f06e7989537eef2b0e6fa4b720e90614366b41ee..79ed45cfba325e0749c823765a181f413c0e2617 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.feature_column.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.feature_column.pbtxt
@@ -48,6 +48,26 @@ tf_module {
     name: "numeric_column"
     argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "sequence_categorical_column_with_hash_bucket"
+    argspec: "args=[\'key\', \'hash_bucket_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_identity"
+    argspec: "args=[\'key\', \'num_buckets\', \'default_value\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_vocabulary_file"
+    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_vocabulary_list"
+    argspec: "args=[\'key\', \'vocabulary_list\', \'dtype\', \'default_value\', \'num_oov_buckets\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\', \'0\'], "
+  }
+  member_method {
+    name: "sequence_numeric_column"
+    argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'0.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
   member_method {
     name: "shared_embedding_columns"
     argspec: "args=[\'categorical_columns\', \'dimension\', \'combiner\', \'initializer\', \'shared_embedding_collection_name\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
index 283cc6a735695b0b2d16af28f7688a7a077f19be..5e1371815469974b91b1a4de16fa20d19404eee8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -135,7 +135,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
index 95e405aebaf61e3ccae268b474a006a3bca51343..4bbe98b789935318a901d84502cb763a60ddc92b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -140,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
index 712e3ad6fcefc0f9f6a8ecc892e1dec81b673b45..c64a90890f13e4b14678005cb71460e427272416 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
@@ -198,7 +198,7 @@ tf_module {
   }
   member_method {
     name: "get_session"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'op_input_list\'], varargs=None, keywords=None, defaults=[\'()\'], "
   }
   member_method {
     name: "get_uid"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
index 2e0f77eda85780cec26b103ba11276ccdfd90189..66bacd809408029e40ba8009dd9ea6889bb24e0e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.callbacks.TensorBoard"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.callbacks.TensorBoard\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks_v1.TensorBoard\'>"
   is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index 1bfd51cdcc3e783fcd24a80e189d3d73bf3a928d..2f3cb0b7c51e119da6a122dd6672109789c1e73c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.PeepholeLSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a75f44fcc3a1ecc65b27cc52d61256b6e69e0af
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt
@@ -0,0 +1,184 @@
+path: "tensorflow.keras.experimental.SequenceFeatures"
+tf_class {
+  is_instance: "<class \'tensorflow.python.feature_column.sequence_feature_column.SequenceFeatures\'>"
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2._BaseFeaturesLayer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt
index 721c18890acc28de66b529bc8f9e1430bc925bda..65b82a3f3222c51c4a419918ad1e74dd52152aed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt
@@ -20,12 +20,16 @@ tf_module {
     name: "PeepholeLSTMCell"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SequenceFeatures"
+    mtype: "<type \'type\'>"
+  }
   member_method {
-    name: "export"
-    argspec: "args=[\'model\', \'saved_model_path\', \'custom_objects\', \'as_text\', \'input_signature\', \'serving_only\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'model\', \'saved_model_path\', \'custom_objects\', \'as_text\', \'input_signature\', \'serving_only\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "load_from_saved_model"
-    argspec: "args=[\'saved_model_path\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'saved_model_path\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
index 8a0b8eb46f006497472c1e9ce539e91db19bd260..0c6c0a34b9f606398831c2a82e9b049fed96957a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Activation"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Activation\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
index abb3c236948a7f46d64cad92ae922324446f9a99..15bf03977dbc03660971fc7343cb0388d8696326 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ActivityRegularization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.ActivityRegularization\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
index b27db4e7f23499fd27430059f1cb556f341547b3..b265384d5980f4c4982ced19af0208427da56817 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Add\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 50998ac9d63c9492523720d7dcc8041fd9efcab5..3a0882daf1eefaf17f893c3b565784bdd60ac689 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.AlphaDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.AlphaDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index be17aeafb5ae383cba58b854808f6c9bc0e9696d..d2ee310d68e5da9f07f9cb6656165c97fccaa469 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 7f21b444bc8832189b11cd8ff206e034bc89170c..1da079f39e25b020406dabca46966d57dfb4451d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 2ac86f152fad454fc0b09e2cb8814f23ad997c20..d96751ccf916aa9a9522f341a3befbc987f25125 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
index f6b1dd2f7e4244218b7c64868b773142c79695d6..3819e525a99b67337f1d3a36b30f6ee0dfefa03b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Average\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 3da1f43a92a3fb5a146bcf8fd16f26783487f129..47f6b397a70109e9cdb833eae0cbec8a753831f7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index a7be5ac81814b28c93407cd5d1ca7c3f60822f0b..4b8cadca5344acd7fe279d0132184d666a94f448 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index c5c29bead383da6b9c0c7436fb089e27413e72f3..5c66da42e64b3cf57c2e03002bb95bc33eeb3ac2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
index 3af3c2a501d6b46821d2ef1b4e6a06e2820d4764..203fea1d9ff251a45826b13cbbb557c67a549622 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV1\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
index 880d18e1aae53512b2f587b5c8914babcd68566f..95eb6f69ecc4fa313da9eca9b6bee185b647be03 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Bidirectional\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
index 1eb0cf1a188b88d55b82297da715624c9e5a58f2..09410135435100779208ff1c3db9198fdc88b178 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Concatenate\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index d9394e60f532465c1852b2cac46ca4cbd9125583..941b6aa3dd3316d15edede430ef25da913bf0ce6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -196,7 +196,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
index a0f6dc8097adfb896a8f3aa3b642c2997e257cf3..4bf8336fb32013258ec17edb810a68e7c0d3cfb3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 037b92f861b14720a1a638884752a4d3e1dbd9f9..221addf20aec476c7533284f9d7437cb0e1a6bad 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
index 6a0d027d47d999f5770e59299fc1206249bf9b43..1c95fcc8b543c3da318119f7c9ed64952ba5b819 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 66b5bd75fc16c37aecaa65ef12fb2311925c252a..994a507bfb20a675ef2b4f8f5a9b2284ee0c21ad 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
index e73133ff0731821407084cc1cd6160b2e9bf3d9c..ae251b529bd6243733263acad98367a1a64e6530 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
index 7af6b2b3c398473398a9d2e227a42ec96451b301..1d73eecc37c30b0ce6ef723f56691cc502a5698e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index baff492dfbd3c9ab6f2c269cb89632768e6b6c92..d37ec0f5603237cb5c397dd8193f51d5e50db0ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
index 63d30a61851cdae8daa8a5dc70fa733fd6b2ff11..bb3c37d573fc2477a7327ea4dfec45a7a66e7c22 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 7a29cbbec35b885792828828354ac8f9a29579b6..fc29f1c913adbfde33827d8095181763a43126d8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
index 87c75c02243cd646502e12e2947555ad7c6913e9..2658fb4e9984969c60257c17361799e2f56b1bb7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
index f69104ddfef17c8b5df36f4bc3e9b0ea3a986295..58567eafa4134b747b4ca7ed9b71aa77d3a4c2e3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
index aa05471933cc97a872480e0ac45213b49a882189..42be76f4189d6fe299acb6fda744e268881de6f0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
index d61f1ddc1d506ae2db992aaacbdc634964d53292..11092f225d1c350ef81653555fb3cec8f6bda85a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index e2d05f8298ef4779f3c2678b24ea9f938a3889cd..e618a111b6f60cd73fa64ab47d4d4422580a73ab 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -116,7 +116,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index f650f48423b4496fe21abd215ac494018921df00..cf8a67b54983f17773e59de2e0661fd9896d2420 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -116,7 +116,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6f7f3033ecf8e226b961dabfe59e751639e5b98
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt
@@ -0,0 +1,184 @@
+path: "tensorflow.keras.layers.DenseFeatures"
+tf_class {
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2.DenseFeatures\'>"
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2._BaseFeaturesLayer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'features\', \'cols_to_output_tensors\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
index 06e8b6b314183b884e635f3b78e5bd5368e0962f..339c9f52b7ff45ebebce128980b3ed9fc47bfc5b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 9fdf6f66d1160a49da302ffa8eeeade3009de048..c2992de5a8f9757266fe681943e7a6b84039a893 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
index cbe102065071a00596fb4b8f764b410737c638a6..b37f4c845a45418ba8fc47c89718debb97de8f88 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Dot\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
index 0efba09b272c8ffb2220ccfaad830c7fff98568c..d21f577721c3052507969c208870b9d69dd3313d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
index b34c499eb2e603aa8e2a6c9c84ec752a41efd0de..f1e086b5dc50a3e47bbf2e9d3d130cade60b8902 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ELU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ELU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
index 51dd853127f549c8ff370391f11cf7b8021af469..eb3496aa48fa43066f4f394ba2039416b4157f33 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Embedding"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.embeddings.Embedding\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
index dcd18a9cedd53565fdf38d9787335e0afea9ad3d..bbe324c77255fe5ccba650566d4a455196ce49ac 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index f029907ee86943fb8c04eada819e9cbfd6d01009..dd93e32ddcecb70203798c65483638a328019d3a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRUCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
index 278ae06cba9bd60b16716fdd0a38e87df2ee303e..9f25b3c6574e08b96e4947210c3c7d174e7293f3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -179,7 +179,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 15cbcfe8edffa92ef0514248e9dbc523dc6a49bd..e24862632e73c9030dbce521df445da2521f322d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 865b898c4cc54253d85442f2db2f3f624ecfb817..b1b5759881792810b4b11e279bd699652c256334 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianNoise"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianNoise\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 3e17aca17cc4e636ea3f6235f04cd4b7f468ae28..9c4087a06f1776c3372bcc8524f1ea48af1b7447 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index b160687a2a610714f5dd6c0cc7c7c92408d386df..d56429442c626340b0f38becf452668ff1c4535f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 70e8d51a5a782d5f473e1350d16d942998e58fbb..089145ead9f0aa35f4a6e2d41b5dfb8425c68557 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 809dc8554b38af9486035f1f3b03aa58392812de..2bc02b6f69f2468449c8d551d42e3b12e219964a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 3fbce8cb714355c0898dcbdc6797394410e90253..c2510d129460b8819da832b3354321db49868a29 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index 70e4103ea1abd5bda90811d127230105ae7bb941..845d6b17015a4168426b430add12d303fe14cf5a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 000bf54c4523307d791db76d73e1cbe71cb46e4d..f6fa8659dc4cf316968dd9f48572532fc0b4e5aa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 8ffbf07f9bc32cc9a3a83e2f57f5fe4d78fffd3c..1285e21f714e3cac3288e4d5c8b883c5ec909d7c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 3803d2b0a8765f4832df34fc4876256d4dd2ea86..a1417e46ceb713440bfcb16d7f29d415883a3633 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 28668224e01e04bbc4c14259d84a11ec72c826a8..ff4da8ba542732f48388428f3dcf9452d41a3320 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index b83ed67723afd5544ec19c599437a57909d780c5..7140d5718ba61f508a1d00729a4777745994bee4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index e689d69140e36a94c731f4c3b4578919d31343f5..4edeb9788d88000634ee1aadab9fea69fb2c83c6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
index bb6eddae7168b576bdaca91b6f7951fe7b65ee1f..48609567d5bed7246d0162ddfb8da92090c5640f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.InputLayer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.input_layer.InputLayer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 5fb3f9dd3aa9a3f3761c2bfadbcc19f46fe2303c..7398613812d0b5dc7d3f9bb62fdda0bd08d11b60 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 8eb6dd9f4a1799c9a6d90fff42490e29417cc24d..a8f60e83b9bc92c2e0759d120c84b5a61b7431aa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -179,7 +179,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
index 376bec0814880e3fa0091a41cd9a4ba0dcc4ab60..88f1f8b06d1e887385ce44903ada3dfb3c934e21 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Lambda"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Lambda\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
index c5f91a6338cf5c0b8f017f6a6a87d0016ffbe999..c95f9159cdf2549cce9d645d9cb9b4802ea87018 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -105,7 +105,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index bde888735916a018647f681968241a583e0271ef..80d1c32dede3d72a4bb5f150d9c9d51143b7f5ff 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LeakyReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.LeakyReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 16945f2c12a7be4eba8a67a9a58587d756888d12..b050302861485258c0043d97bb325860f770fb06 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index f05741ffceb6a855f56731086619dcc621c8d71e..3bb780cdb0ae3b2f2c8b95f1e41c524f12d49162 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
index 7885db4ed291afb8ea627cebe1dbae45723d4b2f..690208be83a58bf107f02fa342812c9d29319183 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Masking"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Masking\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 9380d26cf4c7b1c93a4c0ce2681e792381c42deb..02f3186dc60056844a43821411e4438eb123fb88 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 8eb8218df3f3532c6c108e9f43f6ff5a708c7fbc..f2e9a3b13665024e8102f1db84293b73fa094d42 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 0c96f86ed36d7cf99c396f863de6d9ef8f90adc5..868faa03919a0d656394a691200e622d987f4be1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 0c6b230eb79aac1e719949b3f8331423b621d47b..8e1662630c18d8662145226c6a54a05f49cab24b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index eb7ca52fba97174a1c6869ae003beec8ffc328df..ab96640936137bbcd3e73b1dbb56060dfe4943d8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index e724e9088f82f7ff7152cc4393af4a8f582136c6..4f492f50fbd7b571b5fc853e6523c9cb56d73e82 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
index dafbd09ee28ef6a5ec933cddf3c246cf2f4f4480..702f2e88d0b32776ab2e45bf968843e8df1e5578 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Maximum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
index 3122fbec1c7cad161d71fdf9970995adcedfdad7..fef939090d8d1e4ffc17b290754b16b87a07f38a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Minimum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
index 0527cda1f026e1ff9075e827c2902c45fd22db9d..1e2db3fb135ded60a33f8f4f3ffa92322d38dbd6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Multiply\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 814e5a5d545f0d4b1276ef1639eddb72004b4d1b..1450047d7dd716b97a824c6718bef26373214114 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.PReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.PReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
index aa1731afb82698cb44375407fc717bf32ef634d6..d5d3e0333bfc5ef8bf56a153d04ccc9c9df85dc7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Permute"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Permute\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
index 9d7dd85fe0eef3733a86b9e918396e882f5812d2..f1151f43c9df50850e61b5ab6e9ea7b7bc0184d9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RNN"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -110,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
index e9bba298bb028851e6e9b9a17ff40a671d9132f2..0874240fa6f60bd8c345fc1c2219f9f7bf843831 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
index 3c783eb5129028b3eb5160c75dde2859541cfd32..de9f8fb5939fd11ccdde97e383e1f4a7f1d99e73 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RepeatVector"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.RepeatVector\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
index b8e0882541c51209cca112c54197bbce305bd1b2..a125754f43a31806b94b4775b4339e9c610a8ee0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Reshape"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Reshape\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 310f369ed6c7a9931af56016ae09db5d4bca15d5..01fac3a2595838bdd17b8ae5b0dd93a7ed70d495 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index df19d781c21e403b51d451d772cfba66a7383be7..80628d76d8224af71ebc12a1d735481ea6c0c654 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index bf909509bd4b25507291839fff1ee0eaccee630f..41e96fde4b2a06c67fd2007f0b597104b4cf5b93 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 5d66bc6fb6334d99242009f59ccbe0e7aa2d4e89..f48b0b3517f04dcba9ace267b4ffcfc5abb70c23 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 88e9300de912f3b12712bbc311ac156803ef35c4..5e799329c032a1e72a61378623e2844fa7e89401 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.SimpleRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNNCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 9d81c6d4bc3139952e9f41113d05547b215cf571..60893bbf1aa5a508cb4fedabc8cd90927b488f71 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -167,7 +167,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
index 712eb0c6ec3b706a9e396a532f58916140a2c606..c96405a434c9ba3cb770b3dcaf6cf551ac4f200c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Softmax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.Softmax\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index dfc4ca27052f919ea3866a489e525ae1202795f1..153b7bc876738ef180901080bef3f8fdf2dd80f9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 5e4f727f71d8be2496bee1abdb87c7050f1ca02e..44e08117c3130df83572dd8409e5af273ac5b290 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 9d893cb30a066c4732ccab9d1520f5047a4d3a01..6e9f624999444a54b5d7dd2e372f9d5d470a6011 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index a2ed954e4c0ce1d474b8c71b41ce1d585d42d665..57da4c0ba9d62aad702a6204b3ce41b48bf6d161 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.StackedRNNCells"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.StackedRNNCells\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
index 8a0818e78ac766a624bdcce85591fe13e1d4ceac..27eb794485a8c1c46cf918782b97724e9774eef4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Subtract\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index b5591b48265d3d09459bf9bb114a4a3149984eb9..733070e50bc283729ecbd91b1b79af9521eba678 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ThresholdedReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ThresholdedReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
index 210e4fd4e6f0b2e8ba75d22e83134e2267fbece5..009ecca9a7f490f4792c3c0649e3a9003d0e247d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.TimeDistributed\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -111,7 +111,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index da2213a84fe2f6bc683630e5f8760acdf3239b19..f465aa64b91f644d5d2f292f0a67a3300ed3f488 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index e2c303d506e0f8f99d8ca89f29979a5999382378..049da3deddc0fa56d4ae2bfd6552452473d0b7d4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 396e774c8a4a10c4996c56c208fc4f4d432e3135..1d50c891154dd86e4ebe5b481a137e4c7ce248a0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
index 8b6418d514e61536f314da88f1586cea4f29cfc5..6604ac05d910abd89e3c04f6a8194adac582963e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Wrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -110,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index e8fda4c71ada65aecef59eb8012120488b0f17c7..2c8d52765d55357fb2f02f5268b089b27fa20429 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index 50c52d270b684bcea5105e4c9813cc62103403f9..bf9f43c1dbfbddb0ddd19b26d554df86ba493d25 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 84c6b78a2b4405fe0e2a1fbb3bba0635db06a21d..a78cfa826bfdf5f49947cd9ae2f9879bf36328e2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
index 476c597fe60ced2338a38aca4c5ddd5f21b0ccf2..9b5598eed07c9f04feb0d90820381abc12dbb456 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.experimental.LayerNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.normalization.LayerNormalization\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
index ad74ab3f120f4f7ed56850d7852ca09ddaaa7d90..cc0fdabee1db04ec48f1c4e23b765a2cf89d4ad8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
@@ -124,6 +124,10 @@ tf_module {
     name: "Dense"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DenseFeatures"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DepthwiseConv2D"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-similarity.pbtxt
similarity index 73%
rename from tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-proximity.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-similarity.pbtxt
index 4952a76291c00bfdd73eed5412e7421887d1bab2..aa14c44fa3628236033e952b69f3a160c49a36fc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-proximity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-similarity.pbtxt
@@ -1,11 +1,12 @@
-path: "tensorflow.keras.losses.CosineProximity"
+path: "tensorflow.keras.losses.CosineSimilarity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.losses.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.CosineSimilarity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'cosine_similarity\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
index 4240ff69a5ef27e247e2ab04e98caeac89670dfb..7e90b36ead9b0a8b58a621afaf379eebc2cfdc8d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
@@ -13,7 +13,7 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member {
-    name: "CosineProximity"
+    name: "CosineSimilarity"
     mtype: "<type \'type\'>"
   }
   member {
@@ -108,6 +108,10 @@ tf_module {
     name: "cosine_proximity"
     argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
+  member_method {
+    name: "cosine_similarity"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
   member_method {
     name: "deserialize"
     argspec: "args=[\'name\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt
index beaa6f0596f5c41ccd278cef1e59c876d4728983..6e00a3a355269a0ccc5d69b3fcea106c4908e115 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.AUC\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
index 5f0079762fce727000e4fdeeda3dfb012840f3ed..18cde2f32aa10100c63d81470cb6e0fd2e61d4f0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
index a0a3ae890036161d5ec8da0af6b81495c99f9146..c5d2fc9c5397ae20dbd0c7f8f7ce7801c63c3997 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
index b6ce9e042027eb5e1186de4ff7c01230fb0e038f..a8662459c9ce52da3a42e9c5e47c52b6deb6ab06 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
index 587ffddac96220dad543be2e63a0fc202d60cb10..998c4cbb1fad2352cfb9a510ba6e9b153860fcf8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
index 8cd173c7eb9f77d2731b790b068ce3a768f1c586..04f1794aba61aae085a7580806e524eea8b2a791 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
index 99414dd54ec1fd383f2f1a672f311a3b99213a00..19442b5028dda68548c19c74e0828abf4fd54534 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt
similarity index 95%
rename from tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-proximity.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt
index b059e039deaaf699414c779584eff50b5b760537..678c7b0681fe4281893fba70b4652233a91e2a0c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-proximity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt
@@ -1,12 +1,12 @@
-path: "tensorflow.keras.metrics.CosineProximity"
+path: "tensorflow.keras.metrics.CosineSimilarity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.CosineSimilarity\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -90,7 +90,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_proximity\', \'None\', \'-1\'], "
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_similarity\', \'None\', \'-1\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
index 5432f7f4006b165fefb9aa028bf7d36d8cbc38f1..5a94569660fdc31f1889b5ca64f1483970cb5235 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
index 75541bf285d8989f867aabc7c7025e56cce1d05d..9033d9e655b2f2b80836153c23d9927315360de6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
index f45a57df41e56a52f1a4b784a8c8d140f6670b25..dedc64f1375b66b90f655f280c1a56ba165cfa17 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt
index 25bcf8591d45bf2237845a914122f5f9ca9c0ad7..af8366b60876cb31f840c5f5007e67980be8dc3a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt
index fb2df7f459753b08426449f026129d0960d21eaf..a7e072e21cc94492ed27186f44b92863cd791d62 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
index 11538c6c09dc4f0ed7de48fbf95dcc253adb1e24..75173ad17a9c1fa02451287adad10870a60d653b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
index adec3feffa1f07d4d65b63281f1a87c7469eb4cb..7be81b63bbe01b8534bd64d163e735d735ff88f3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt
index 47b550a9e73bdd33c5d592ff86f237d821866e09..21e44ed988494119662e5e1a5101edbe4d7a35fd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanIoU\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt
index a1ec7cb51e545077737051ff3f06509882e4d32a..8ef17fc34566e8ab6c5cc73781b40cb0f7396067 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
index af960fa54329d7d80d23f24df798509b2d12bb85..363f532ba410f1ebae5f105769a0e69c2e2d0166 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
index 417f92f2734329e2382d6d6bc1ff7c399c990704..712f10cd3051fe3de82472cb0eef2ec5fb53b6dd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt
index 2a5218fed4cadf155f29781ff7341675b64f22fb..fff91d2b44c6e1e7c1fa0339c737c4a44b9566b6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanTensor\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
index b089109baa5ea49b588e6e9a54b56256d4e4e3a2..cffb444835c58c28953f85c61a8f2d98f2e74716 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt
index 2216043cec13086660c370d835209ff39ddaa324..ce746ab350bfa0534bf7f9ac7d6e8255c7749894 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.metrics.Metric"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt
index 8e91098f95bffa13fdc97406590d1ae71edd6887..570b77408cbaa2b7a0089f9de8a528e604799abe 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
index 9aeaa5627a9805579d6a6c4e09336a4d7994d1c5..83535d56cfc37932be785684825bed0e29a4fa5e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
index 748cec08668c461fcf80df6a50fd5192f99073b9..9ec2bbc4a3c8709f162dc0407408b2fe29b695a8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
index 4dd60bb772aa93b637fca287baf3f17104a2e16d..e2bdbd54e22756b823716c149cf0f24661acc812 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
index 97aeb680be1c5c412a16e2a73e3f1bc2ca8ce6c0..172c40eb2777d5504968de225718c270a0ce4e99 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
index 5a7bef4714d5aa37dcd1c11048ffd370e8841f0b..8a24088257a423c18f347eb256915bda10459e1f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
index c9687681680c13480d92105a5eb10ef0fe5bf89e..0cadc9dcd99c03d81907ad5b1c03fd3cba25f833 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
index ea0f2a760ab7e34eb5f5b8c2ba63eea96ce63777..c135b8f680061a1e79fedd9d705d0fb54344823b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
index 85f80b062efe3d2d91104b211c8d9d75127c8c0e..4f6818797e1e55362a35d37e70a05bbb3b8d65b2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
index 15895daf42bbfbbeda419d767844fd840ab4178e..90bbb087fafcdcde5dee048c45adbc45e3be2e55 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt
index 5ca1c6c8396e3d79a8f6250f34e7137870a23ac6..30ef19e02cfc99d117e6a396beeaf6422a105013 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
index 9cf1bf21cb5acd9d81c581eecfc2e64b78bd9e70..e59476a2410f859dff7171162a2cab123d5e853d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
index 4bc9383f6ffc90972416fa031d5515a149e70425..6627a460c7522358a6f44d415a2ad8ce59b97427 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
index 2eae4df0ae344656bb637bc27e806876304a86f1..8c3c2cb03a8b28db6212e29e0cb9b7b61fca7174 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
index 92dddacf2cc55ad0aec7ce012c7f6f5ef91578e2..3f7fe4a2f4033faf1e4f79705fe78475866e80f9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
@@ -29,7 +29,7 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member {
-    name: "CosineProximity"
+    name: "CosineSimilarity"
     mtype: "<type \'type\'>"
   }
   member {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
index eb1ab1d9dd61b36ed8662e25700f12f82aadb502..63100a2176869e0a4ff30355c3df7ceaefaef65e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -135,7 +135,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
index c69cf281742360d9ed4d1f7cbd35219cf04b1149..5c9ba04296ee39e1fdf014396d166d91ef5f714c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -140,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt
index 0a56293e804f583a949ecb413da0ba613e0bc876..8471803624634eb2d3bacd79e236e51d4488a764 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adadelta"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adadelta.Adadelta\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt
index 14d0894e5622021c4961228d431d01516b752055..0466ea65fa3ccaab1459841def55d6f907c7b14c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adagrad"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adagrad.Adagrad\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt
index fdb1ea838c04f296c3d0ee7596d73052b6f4b6d9..9762fad5d0fb7690e041b853eba65bee1583ad14 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adam"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt
index ece63ec168dac58f58286dbd9fd8a8151d0dc2dc..f477a60d237f5801b8ac8713c150cd83f2b3d768 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adamax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adamax.Adamax\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad42c6b75b32947635ec1098a3d639e011ec3765
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.keras.optimizers.Ftrl"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.ftrl.Ftrl\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'name\', \'l2_shrinkage_regularization_strength\'], varargs=None, keywords=kwargs, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'Ftrl\', \'0.0\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt
index f952f88b6d203488ea0ec4f1794d7de79a25853a..3ffb4bb8b4dea5840013e830efc7eec6699f71ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Nadam"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.nadam.Nadam\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt
index 27bae902b0cb7f1f4e09737a83fadd95a83cc163..9639c71ce415f5a942485fdc0d40f32c24f16b7d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.optimizers.Optimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
index e523443a0099b57942c73cafcd8a919503e8db38..2a7603d69b4f55d23e03e6e3d4fa5e60aeaac4c6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.RMSprop"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
index d2721f8e92088c216ab748cae45e415553b9d4c1..41635553347f5f1c04c221574ce7e5c6ac05275d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.SGD"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.pbtxt
index 97f7c4b8c9c35926484b9dc9d7ec818d8360b4ab..7a333834c267e59f7a09c4936b8ed59776be7ee5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.pbtxt
@@ -16,6 +16,10 @@ tf_module {
     name: "Adamax"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Ftrl"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Nadam"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
index 6d826a8f8e47e53bffd5f759c4af02c5f9d5b15c..d012bd97efe8748463fb7e465ad3ce7c015d841e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
index 9505c90aac52c3329861bf01ce3c40c50e557b10..90a27e5d66af196074c8c8ca44830ad6ce073da3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
index 5b1b8f78dc5d98c059756122df24340bcbc2790b..d653a0cec41e6c6e459b54c9126db20d13750112 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
index ef4c57b6942f17bdd1524d3eb773fbfabd5a82a6..32f9345ea40915c27682e4e7274e4a5852a72078 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
index b5ee2e7302d034c4b9d9ced7a1159c87297b1a06..0e7adfe26b66ec5e877fc13090311ffb35cfd6d7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
index 57f6d7c7c0114d3ab5cd9cf4066979a6837d8e9d..5296597dc55bbba51d5c11ad3256f65580e37374 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
index 88c616bd17987acc2e766a26c4b14c62d6d4a3b1..5ae9568e6421919c97ea7ec2300d33ccddc53e02 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
index b70a907907e5cb7e7a509e1712675bc9c9bd4cfc..aa0da6d68ca1eb32dc47fc3a378c953fde165c79 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
index 33e8765ce6edfe3a14b7d6ff88be9a2ec2f07b32..516f0faea98550eb3d85fcbf0185aeddbea74ca4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
index 1ac13b57912cd815c1b8de9b461d6cae2364ed9d..d92af8f326484f99dbf08f9695e257a67e1697f2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
index 77faa3c2b9da7eb1c7f9cb086948997f6b2af02c..614643fc9945a4f1c1cc23b6dadafe3a47d4a82c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
index 0b2631491b0d727b262df4ac05eef7bc64eedba3..31022d3049e91d5026625dbf89c79ba424d5949f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
index 0a3414d20cfd554d1c5eb82d613fbc938f8f3600..03bbf39022d366afa67f62e5299a0ad26ba7d4db 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
index ffc5cf1c8b76e84d30f6a7af22773b45feb1a02b..63a301e3e6eb5370a832b35a5399752d3fb68fa9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
index ff2cf2ba90732bcb042c7f5ebc8a50483d37c8e7..d81a3368ced00d6090ebeab1ffec85e19953846b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
index 09c8a31a7be1162866e4457fda84a921de283377..48d93d503e8939dd6bc2896014146a1151ae1c46 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
index 549e13a7ac6b595dfa665096aa15d7bb20df65e7..2f1f1c1e3fd5c1528c7848cf93f8ff123a0cb743 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
index 169ecdece5ddc92908ed027c7f470d08cbd5a5e0..bd7549af4c420f69ea2992a9c9dfa39471d083d9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-adjoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-adjoint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37344f70311bd225856ce72c52dfd0ac1fb09075
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-adjoint.pbtxt
@@ -0,0 +1,150 @@
+path: "tensorflow.linalg.LinearOperatorAdjoint"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_adjoint.LinearOperatorAdjoint\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "operator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'operator\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index c7a50969b54e5efc4d338caa79dea76d86bffe8a..ddef774a75157401354d29b75c7a00fbedfd9ec5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_block_diag.LinearOperatorBlockDiag\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
index 3900c752c8527f68af2496f99083d80fc9d18106..97a6b1a475b9555f6f5ded273050b5751625b78b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index 7b876099af6a28d9fca2e5c55aeae5e4610f82a6..e2bfe7e7d852e825f826f9f785c40b9550f706be 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index 5bddba8e798618f5b1d0cdc61ddff9725a495fe0..8885526669065e5a5506bfe1bf93076f4584f9d9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
index 62ba8bb59e8af14447fe570ba28c5d0eba7f6af8..2a017fcb8987d46d8e24d2d21b43ae9962ad2075 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_composition.LinearOperatorComposition\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
index 0803feeabd12acb7988459fe6da2748e19b70a5f..31dcf7b0a6b4699e7009746fe62ec5551ee3e11b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_diag.LinearOperatorDiag\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index 6def32864b9cc660b94d628ccd53dc48a566ea81..0ad39b4ba6006a1efa6b16e650ef3140516775f6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_full_matrix.LinearOperatorFullMatrix\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
index dbf1ac82d33b81c63e5c356ac736f63262797ff0..f66a5a833a42c06f32696abc0a3114aa89f73a7e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_identity.BaseLinearOperatorIdentity\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -60,6 +64,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt
index 6a3fe4dd66397bbff4b0b9ca6d195adeb64e3337..a7eb144d83aaeb2997d44b703b46de9a01c3a478 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_inversion.LinearOperatorInversion\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index 85d902b977ceddd405abb1154a086d7bd29e7848..c983f8c6e6aa53716d1c2d07f219baccda99bd04 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_kronecker.LinearOperatorKronecker\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 638d82a599248e547bcae86ebd6d8d8dc3f6aa4b..813aec2a137ccaaea9718b7d0254ed0d60500247 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_low_rank_update.LinearOperatorLowRankUpdate\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "base_operator"
     mtype: "<type \'property\'>"
@@ -83,6 +87,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index ab1b04bd3cb1b215b848019b6c578ce091f8f828..0bb7a15e1342aeb4be94e9a40e1e6b1828e397b8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_lower_triangular.LinearOperatorLowerTriangular\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index 961969aac58b78e4edd53b47f2932f71f2d21fd5..7747c985404e54f93d012aba86a39503a855c76d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_identity.BaseLinearOperatorIdentity\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -64,6 +68,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
index e76738a9648123414159fdc9666a99b0577aa46e..590782bbc1d57ed4efb1cfb68b145b49d64c4545 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_zeros.LinearOperatorZeros\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
index b35cd69da474a9665652f04f12b34a8d9f33fa8a..ed6bfdff288220fc0bcdf9fb6c4c78abfe5e43b2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
@@ -2,6 +2,10 @@ path: "tensorflow.linalg.LinearOperator"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -58,6 +62,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
index 5e49b75c3131b989c765ab03659fb225cc23e26e..53564e33596c1789493f7eaa00ca74d491b041db 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "LinearOperator"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearOperatorAdjoint"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearOperatorBlockDiag"
     mtype: "<type \'type\'>"
@@ -200,4 +204,8 @@ tf_module {
     name: "triangular_solve"
     argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "tridiagonal_solve"
+    argspec: "args=[\'diagonals\', \'rhs\', \'diagonals_format\', \'transpose_rhs\', \'conjugate_rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'compact\', \'False\', \'False\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-optimize.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-optimize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fedb5ee9fa4a31f25133bef55b980c18ed74fb79
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-optimize.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.lite.Optimize"
+tf_class {
+  is_instance: "<enum \'Optimize\'>"
+  member {
+    name: "OPTIMIZE_FOR_LATENCY"
+    mtype: "<enum \'Optimize\'>"
+  }
+  member {
+    name: "OPTIMIZE_FOR_SIZE"
+    mtype: "<enum \'Optimize\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-representative-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-representative-dataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d14b69531d183faa35d19f379d6b20c29b02e6e8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-representative-dataset.pbtxt
@@ -0,0 +1,9 @@
+path: "tensorflow.lite.RepresentativeDataset"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.lite.RepresentativeDataset\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'input_gen\', \'output_gen\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34b4133d0ca6edb929c780babd0652187d41c76b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
@@ -0,0 +1,210 @@
+path: "tensorflow.lite.experimental.nn.TFLiteLSTMCell"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.experimental.examples.lstm.rnn_cell.TFLiteLSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "zero_state"
+    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2fff2b8606c98ddffabbd3d27e7a7848d1fce86a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
@@ -0,0 +1,210 @@
+path: "tensorflow.lite.experimental.nn.TfLiteRNNCell"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.experimental.examples.lstm.rnn_cell.TfLiteRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "zero_state"
+    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ce858c46ba304b7cc3ce6b257518cdcc9aac646
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.pbtxt
@@ -0,0 +1,15 @@
+path: "tensorflow.lite.experimental.nn"
+tf_module {
+  member {
+    name: "TFLiteLSTMCell"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TfLiteRNNCell"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "dynamic_rnn"
+    argspec: "args=[\'cell\', \'inputs\', \'sequence_length\', \'initial_state\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d585ea6643b731fa4767301f13654fc699d6e23
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.lite.experimental"
+tf_module {
+  member {
+    name: "nn"
+    mtype: "<type \'module\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
index 154dd00821794ef4a5118e98d67e32beca38bebf..b508c3255e3817c14d4cfb6adf0e67606c744e2f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
@@ -12,6 +12,14 @@ tf_module {
     name: "OpsSet"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "Optimize"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "RepresentativeDataset"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TFLiteConverter"
     mtype: "<type \'type\'>"
@@ -24,6 +32,10 @@ tf_module {
     name: "constants"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "toco_convert"
     argspec: "args=[\'input_data\', \'input_tensors\', \'output_tensors\'], varargs=args, keywords=kwargs, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
index 4e211be968d30b98a4ae0a52617a85b2ec829961..6fea38d03acaf1c6c3ec60109b6e16f0a2d3f11f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
@@ -120,6 +120,10 @@ tf_module {
     name: "divide"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "divide_no_nan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -260,6 +264,10 @@ tf_module {
     name: "multiply"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "multiply_no_nan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "negative"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -300,6 +308,10 @@ tf_module {
     name: "reduce_any"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "reduce_euclidean_norm"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "reduce_logsumexp"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nest.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nest.pbtxt
index ad5360e33589ff2d9229c2ad6da83f6bea101669..70bb6d760bc7a6c55bbdfd5c05cde4c08769786d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nest.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nest.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "flatten"
     argspec: "args=[\'structure\', \'expand_composites\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "is_nested"
+    argspec: "args=[\'seq\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "map_structure"
     argspec: "args=[\'func\'], varargs=structure, keywords=kwargs, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
index 249b7ed2c6411bb19ab59fb89f067e00148c2393..c36d997d2e97619babd8eb4d161db41326907359 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
@@ -22,12 +22,24 @@ tf_module {
   }
   member_method {
     name: "avg_pool"
-    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "avg_pool1d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\'], "
+  }
+  member_method {
+    name: "avg_pool2d"
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "avg_pool3d"
     argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
   }
+  member_method {
+    name: "avg_pool_v2"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "batch_norm_with_global_normalization"
     argspec: "args=[\'t\', \'m\', \'v\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\', \'input\', \'mean\', \'variance\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
@@ -54,7 +66,11 @@ tf_module {
   }
   member_method {
     name: "conv1d"
-    argspec: "args=[\'value\', \'filters\', \'stride\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'value\', \'filters\', \'stride\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\', \'input\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv1d_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d"
@@ -70,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "conv2d_transpose"
-    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\', \'filters\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'SAME\', \'NHWC\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\', \'filters\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'SAME\', \'NHWC\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv3d"
@@ -86,7 +102,11 @@ tf_module {
   }
   member_method {
     name: "conv3d_transpose"
-    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\', \'filters\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'SAME\', \'NDHWC\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\', \'filters\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'SAME\', \'NDHWC\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "convolution"
@@ -234,16 +254,24 @@ tf_module {
   }
   member_method {
     name: "max_pool"
-    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "max_pool1d"
     argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\'], "
   }
+  member_method {
+    name: "max_pool2d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
   member_method {
     name: "max_pool3d"
     argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
   }
+  member_method {
+    name: "max_pool_v2"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "max_pool_with_argmax"
     argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'Targmax\', \'name\', \'output_dtype\', \'include_batch_in_index\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\', \'None\', \'False\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index 4251206cda782be1a3a4c7f78fc0df705df88596..95136152775dafedc6e276e15d426de595a4d983 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index 20af24633a45759c5b6e7b7758d09d87a024cacd..912f78fac15eaeaeb7c260ab3e4c57e059befac3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 3205c6a4dcaaa00591cd957021a463b77835343e..58d004b3d5d10332065216b4a816febb673a4853 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index f273e11a0835e4e37b8469327988f0a637dcfea6..a7b63a7c2b43f3364431081e0c71e366e8bc0c8a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index e43547b15428634f0f84ff0e01abdf4585e9d5db..3f17805af25df993332b594c05be14d4bcaa5b7b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 99381cd7e167223cea0fe4eaebbcff736be66054..055485f3e90c0e48bb4f8d0ffc88b8dbca11b635 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index 1fbde9df17cb83bffd46c82f11c99d2926859f77..23272f442279cf25b3833d19e94f646fe2144830 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index 8ba92fcc8dc89958b8395aa986c358a03fedd66d..a9f7e85b1488dc49a52c3ac4d5a7ed55bf605ab5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index 4003e87b8994af5ca38f754279e5906260456566..ecf43616741383e3565186c6f9a29ac17b9a2c4d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index 56d06053f15f51e88cc879ec1d67b03d25513946..103fdd0c1a5a1e4a2a601a17a5577de53a911d49 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -32,6 +32,10 @@ tf_module {
     name: "ConfigProto"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "CriticalSection"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DType"
     mtype: "<type \'type\'>"
@@ -136,6 +140,10 @@ tf_module {
     name: "MetaGraphDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "Module"
+    mtype: "<class \'tensorflow.python.module.module.ModuleMetaclass\'>"
+  }
   member {
     name: "NameAttrList"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -1460,6 +1468,10 @@ tf_module {
     name: "is_strictly_increasing"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "is_tensor"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "is_variable_initialized"
     argspec: "args=[\'variable\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt
index 0b2a15f52160191b05d69cc6f1262ddb4dab21b0..080afef7645fb4cf7a700b2da062311fd14871f7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt
@@ -532,6 +532,10 @@ tf_module {
     name: "CacheDataset"
     argspec: "args=[\'input_dataset\', \'filename\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "Case"
+    argspec: "args=[\'branch_index\', \'input\', \'Tout\', \'branches\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "Cast"
     argspec: "args=[\'x\', \'DstT\', \'Truncate\'], varargs=None, keywords=None, defaults=None"
@@ -568,6 +572,10 @@ tf_module {
     name: "CollectiveBcastSend"
     argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "CollectiveGather"
+    argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "CollectiveReduce"
     argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'merge_op\', \'final_op\', \'subdiv_offsets\', \'wait_for\'], varargs=None, keywords=None, defaults=None"
@@ -972,6 +980,10 @@ tf_module {
     name: "Erfc"
     argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "EuclideanNorm"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "Exit"
     argspec: "args=[\'data\'], varargs=None, keywords=None, defaults=None"
@@ -1912,6 +1924,10 @@ tf_module {
     name: "Mul"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "MulNoNan"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "MultiDeviceIterator"
     argspec: "args=[\'devices\', \'shared_name\', \'container\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
@@ -2644,10 +2660,18 @@ tf_module {
     name: "RequantizationRange"
     argspec: "args=[\'input\', \'input_min\', \'input_max\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "RequantizationRangePerChannel"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'clip_value_max\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "Requantize"
     argspec: "args=[\'input\', \'input_min\', \'input_max\', \'requested_output_min\', \'requested_output_max\', \'out_type\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "RequantizePerChannel"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'requested_output_min\', \'requested_output_max\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "Reshape"
     argspec: "args=[\'tensor\', \'shape\'], varargs=None, keywords=None, defaults=None"
@@ -3056,6 +3080,10 @@ tf_module {
     name: "ShapeN"
     argspec: "args=[\'input\', \'out_type\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "ShardDataset"
+    argspec: "args=[\'input_dataset\', \'num_shards\', \'index\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "ShardedFilename"
     argspec: "args=[\'basename\', \'shard\', \'num_shards\'], varargs=None, keywords=None, defaults=None"
@@ -3768,6 +3796,10 @@ tf_module {
     name: "TensorListScatter"
     argspec: "args=[\'tensor\', \'indices\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "TensorListScatterIntoExistingList"
+    argspec: "args=[\'input_handle\', \'tensor\', \'indices\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "TensorListScatterV2"
     argspec: "args=[\'tensor\', \'indices\', \'element_shape\', \'num_elements\'], varargs=None, keywords=None, defaults=None"
@@ -3848,6 +3880,10 @@ tf_module {
     name: "Transpose"
     argspec: "args=[\'x\', \'perm\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "TridiagonalSolve"
+    argspec: "args=[\'diagonals\', \'rhs\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "TruncateDiv"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
index 3929003fa1ff0902b55adcdca1274b1c1b1de2e8..5216f4e2ed063c72999654de022db37bbedb6a63 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
@@ -164,6 +164,10 @@ tf_module {
     name: "load"
     argspec: "args=[\'sess\', \'tags\', \'export_dir\', \'import_scope\'], varargs=None, keywords=saver_kwargs, defaults=[\'None\'], "
   }
+  member_method {
+    name: "load_v2"
+    argspec: "args=[\'export_dir\', \'tags\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "main_op_with_restore"
     argspec: "args=[\'restore_op_name\'], varargs=None, keywords=None, defaults=None"
@@ -180,6 +184,10 @@ tf_module {
     name: "regression_signature_def"
     argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "save"
+    argspec: "args=[\'obj\', \'export_dir\', \'signatures\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "simple_save"
     argspec: "args=[\'session\', \'export_dir\', \'inputs\', \'outputs\', \'legacy_init_op\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
index 7ed9cd77a01c2eadb5ea43a02306d60d505127a0..3879645d60249b18664b77125917d2066a063662 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
@@ -44,6 +44,10 @@ tf_module {
     name: "image"
     argspec: "args=[\'name\', \'tensor\', \'max_outputs\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'graph\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "merge"
     argspec: "args=[\'inputs\', \'collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt
index 65a2b605d532c4a14d3d444a44a723c543af5026..6ed8f934f307b88c26993176b1838d202d187b17 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdadeltaOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adadelta.AdadeltaOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
index 179272d8a8a298ac374c19641068aca739bb9626..c57b3d8ed5a78b0417fd0e927e447cb3c0d9dd96 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdagradDAOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adagrad_da.AdagradDAOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt
index 15c2ef46c127543cb94690aade3c79b6e75981c2..897df3ed231d5e91c417e78c48e062a591308cf8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdagradOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adagrad.AdagradOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt
index 9c902e582f35ca44a6825727637fa3d76011e33c..cb8b5d366c4c5e853e99ca79737886b62f9503ce 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdamOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adam.AdamOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
index 42dcdac9e77a8efac875e4985f6a8f744e838ddb..629bc13612ab567006436bd95fee49c4e3acdefe 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.train.Checkpoint"
 tf_class {
-  is_instance: "<class \'tensorflow.python.training.checkpointable.util.Checkpoint\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.AutoCheckpointable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.util.Checkpoint\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "save_counter"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt
index f41d9f12d9fe65b128d216551870ec8c95834a6c..1d1aceb0138d264501758a26eba75791d5b9f735 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.FtrlOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.ftrl.FtrlOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt
index 7399750385f960133aa5cf071c57dc9fc716a18d..b998e848c2ba6a585d3820549d1d873bf04538cf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.GradientDescentOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.gradient_descent.GradientDescentOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt
index 9bbaa14a6fd54dfcad37560142bebc7b3118601c..2de61d67f717786152515d414bed6ccd574aa58f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.MomentumOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.momentum.MomentumOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt
index 448e17a44891781b1d6b0fe8e627cb91d098f1e1..8baa56902581d8ac405f95992daa29ae4a9fd1e6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.train.Optimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
index eb1782e9cad73708de24f6565237830a29cfaf8b..626b75335461fc13a0bdc73b220d7e562a5a6c46 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.ProximalAdagradOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.proximal_adagrad.ProximalAdagradOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
index eb9a86183e10775379efb84c693f7aa7ba573f2d..9c0dca030d2fa4c75315fb60df4cc2019271b41e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.ProximalGradientDescentOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.proximal_gradient_descent.ProximalGradientDescentOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
index 2cf4c2e7ea4879c48c1b3a43302f7fa4e9f689cb..61ae458c01750493d87bc53f3be5c660ab912f5d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.RMSPropOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.rmsprop.RMSPropOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt
index ecce08220d6bd9815fecd26a95f8ac6f745d9e33..b812d6f1ef4ae8f4173c2b23a010935dcc6cabcf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.SyncReplicasOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.sync_replicas_optimizer.SyncReplicasOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-critical-section.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-critical-section.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..024a20834635e2fc75ad7e6a393fc8f092d9631a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-critical-section.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.CriticalSection"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.critical_section_ops.CriticalSection\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'shared_name\', \'critical_section_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "execute"
+    argspec: "args=[\'self\', \'fn\', \'exclusive_resource_access\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.experimental.-module.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-module.pbtxt
similarity index 61%
rename from tensorflow/tools/api/golden/v1/tensorflow.experimental.-module.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.-module.pbtxt
index c364b0217a7ed10282dc8fc28797f3be1b92f867..8d599d73b84fc51b8bf3001f6773011e42f09456 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.experimental.-module.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-module.pbtxt
@@ -1,8 +1,8 @@
-path: "tensorflow.experimental.Module"
+path: "tensorflow.Module"
 tf_class {
   is_instance: "<class \'tensorflow.python.module.module.Module\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.AutoCheckpointable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "name"
@@ -12,18 +12,6 @@ tf_class {
     name: "name_scope"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "owned_submodules"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "owned_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "owned_variables"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "submodules"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt
index bee19520b7736967533c6d30a1862e3c48d03fc2..60518ffadc833b0ab07e25c9b738aed7f7e08f20 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt
@@ -1,7 +1,6 @@
 path: "tensorflow.TensorShape"
 tf_class {
-  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShapeV2\'>"
-  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShapeV1\'>"
+  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShape\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dims"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
index a80726d3bbc400b1ce8e640819ad370c3589be6c..03fd32fdebf80745cb264afd81d08bf7054aebaf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.Variable"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.variables.Variable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "SaveSliceInfo"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.config.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.config.pbtxt
index d7e4529594df24666844bbce8cff729c5fa8fa67..09076a8302610d769e811d8e34333d946d51bc8c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.config.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.config.pbtxt
@@ -4,4 +4,12 @@ tf_module {
     name: "experimental_connect_to_host"
     argspec: "args=[\'remote_host\', \'job_name\'], varargs=None, keywords=None, defaults=[\'None\', \'worker\'], "
   }
+  member_method {
+    name: "experimental_list_devices"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_run_functions_eagerly"
+    argspec: "args=[\'run_eagerly\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
index 951b2df05aca88cef88e256a30dc76f70f18a355..195c10445d4991616c11f66dcda2ad4b744940f2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
index f1573512438b3f40db7653bf94fd4ad282a40acd..043584c23305975ac90ceb25572ed07b65552406 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
index 690da98b1ac2097c4241ba3218caa3b476dbf397..76d15f46344812927b76e5edaf5c59d547eaa62d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
index fe0bc1a4db5d4a5e78ec7479e414545b522ec2df..a6c7a2d7c58fafa08b5d9993eb83a0c68b01b691 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
index 261129b132189ef504678058f11651dd22bdce8c..ae177ebc6b8d0c18edccbad21b897fda6e6722e5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
index 0b34bbc94269280d6cca77bca789fb74f76629be..c15c73f3f78e5abe27395b9598005588a33b569d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
index 0e61890eee42a8b5b0df7bda0f99d189c4911eb9..567f48d1aa5d82529bd9d044405b951e447c5567 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
index abc98a74b64ab274ed8b2fc43876b7102f1c7201..48d90028646da07d95478abc9a3ddb5e918599e6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
@@ -142,7 +142,7 @@ tf_module {
   }
   member_method {
     name: "make_csv_dataset"
-    argspec: "args=[\'file_pattern\', \'batch_size\', \'column_names\', \'column_defaults\', \'label_name\', \'select_columns\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'header\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'num_parallel_reads\', \'sloppy\', \'num_rows_for_inference\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \',\', \'True\', \'\', \'True\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'False\', \'100\', \'None\'], "
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'column_names\', \'column_defaults\', \'label_name\', \'select_columns\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'header\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'num_parallel_reads\', \'sloppy\', \'num_rows_for_inference\', \'compression_type\', \'ignore_errors\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \',\', \'True\', \'\', \'True\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'False\', \'100\', \'None\', \'False\'], "
   }
   member_method {
     name: "make_saveable_from_iterator"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-cross-device-ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-cross-device-ops.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2ea23432416ee5f012e6f0c725d5f57841400f7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-cross-device-ops.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.distribute.CrossDeviceOps"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a38c4b21d563f0ef67eb2b2614fae678f0d97dce
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.HierarchicalCopyAllReduce"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.HierarchicalCopyAllReduce\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.AllReduceCrossDeviceOps\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_packs\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-nccl-all-reduce.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-nccl-all-reduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bdc09bcd84b96ee8475d3ef87ec5be686fc449ec
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-nccl-all-reduce.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.NcclAllReduce"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.NcclAllReduce\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.AllReduceCrossDeviceOps\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_packs\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5d7943af2dc82e59b4ccd34816f9ecb6fd77dae9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.distribute.OneDeviceStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.one_device_strategy.OneDeviceStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'device\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduction-to-one-device.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduction-to-one-device.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5ade9f86ba24779061bce3aa3e6f019d26741aa
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduction-to-one-device.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.distribute.ReductionToOneDevice"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.ReductionToOneDevice\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduce_to_device\', \'accumulation_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..619c3744d6cae858eb501392e962fbb94751355e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.distribute.experimental.MultiWorkerMirroredStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.collective_all_reduce_strategy.CollectiveAllReduceStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4cfa2f4ed2e8e346e847318e409dcf16b5f34f8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.distribute.experimental.ParameterServerStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.parameter_server_strategy.ParameterServerStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8859fbd38c43577c5f6040d717d11fb23941e4e2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.distribute.experimental"
+tf_module {
+  member {
+    name: "MultiWorkerMirroredStrategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ParameterServerStrategy"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
index 5b5c9e2df134dc2fd73d3aabcabe8ced18d0c61f..7339bee6cd85ba9d474e55b3952468113d28a27c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.distribute"
 tf_module {
+  member {
+    name: "CrossDeviceOps"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "HierarchicalCopyAllReduce"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "InputContext"
     mtype: "<type \'type\'>"
@@ -12,10 +20,22 @@ tf_module {
     name: "MirroredStrategy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "NcclAllReduce"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OneDeviceStrategy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ReduceOp"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "ReductionToOneDevice"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ReplicaContext"
     mtype: "<type \'type\'>"
@@ -36,9 +56,9 @@ tf_module {
     name: "cluster_resolver"
     mtype: "<type \'module\'>"
   }
-  member_method {
-    name: "get_loss_reduction"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
   }
   member_method {
     name: "get_replica_context"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt
index f3dfe7296f77fa295e7f02718a2a8bb4cb3ea199..d53752780090c2d621dcabfc8c31e4f1192bd7c7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.ModeKeys"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.mode_keys.ModeKeysV2\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.mode_keys.EstimatorModeKeys\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "EVAL"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt
index 52874dd9b9316d9815c5aef51e272e6ffddb5224..ce486807a47031999ec4c9082bbf4b2e4af910ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.ClassificationOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ClassificationOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ClassificationOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "classes"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt
index 964c315e9730effac38d60f7242527e71cbf9846..9775b2cca812d3fd47a3e821f032ed67c62d0078 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.ExportOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt
index bb82bc9e58627318b897f0610c7d852db7f98c07..a4389fb998e86cc291bb02d2ae04d220f1e152cc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.PredictOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.PredictOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.PredictOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "outputs"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt
index 8522834433f214e5d646ef6265b1047fb7f2cc4f..fc7f8447893c99f4f68bf12f0790e8a549232dc3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.RegressionOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.RegressionOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.RegressionOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "value"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt
index a7ee6d3e07d4387c4fc0bd9a5f6d9614473ca73c..0c3f04e468c4c817cd474deb42149aee3021aa43 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt
@@ -1,9 +1,5 @@
 path: "tensorflow.experimental"
 tf_module {
-  member {
-    name: "Module"
-    mtype: "<class \'tensorflow.python.module.module.ModuleMetaclass\'>"
-  }
   member_method {
     name: "function_executor_type"
     argspec: "args=[\'executor_type\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
index d6d5fc903f6e4483d87bcc648250bb300d6e0f61..4e4fd78b598c91b98a121e3751f1e61d67f14419 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
@@ -40,6 +40,26 @@ tf_module {
     name: "numeric_column"
     argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "sequence_categorical_column_with_hash_bucket"
+    argspec: "args=[\'key\', \'hash_bucket_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_identity"
+    argspec: "args=[\'key\', \'num_buckets\', \'default_value\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_vocabulary_file"
+    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_vocabulary_list"
+    argspec: "args=[\'key\', \'vocabulary_list\', \'dtype\', \'default_value\', \'num_oov_buckets\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\', \'0\'], "
+  }
+  member_method {
+    name: "sequence_numeric_column"
+    argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'0.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
   member_method {
     name: "shared_embeddings"
     argspec: "args=[\'categorical_columns\', \'dimension\', \'combiner\', \'initializer\', \'shared_embedding_collection_name\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
index 283cc6a735695b0b2d16af28f7688a7a077f19be..5e1371815469974b91b1a4de16fa20d19404eee8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -135,7 +135,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
index 95e405aebaf61e3ccae268b474a006a3bca51343..4bbe98b789935318a901d84502cb763a60ddc92b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -140,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
index 2e0f77eda85780cec26b103ba11276ccdfd90189..abf4286b691b4f8a231df030c292a0f14a3d6f65 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'batch_size\', \'write_graph\', \'write_grads\', \'write_images\', \'embeddings_freq\', \'embeddings_layer_names\', \'embeddings_metadata\', \'embeddings_data\', \'update_freq\'], varargs=None, keywords=None, defaults=[\'./logs\', \'0\', \'32\', \'True\', \'False\', \'False\', \'0\', \'None\', \'None\', \'None\', \'epoch\'], "
+    argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'write_graph\', \'write_images\', \'update_freq\'], varargs=None, keywords=kwargs, defaults=[\'./logs\', \'0\', \'True\', \'False\', \'epoch\'], "
   }
   member_method {
     name: "on_batch_begin"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index 1bfd51cdcc3e783fcd24a80e189d3d73bf3a928d..2f3cb0b7c51e119da6a122dd6672109789c1e73c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.PeepholeLSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a75f44fcc3a1ecc65b27cc52d61256b6e69e0af
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt
@@ -0,0 +1,184 @@
+path: "tensorflow.keras.experimental.SequenceFeatures"
+tf_class {
+  is_instance: "<class \'tensorflow.python.feature_column.sequence_feature_column.SequenceFeatures\'>"
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2._BaseFeaturesLayer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt
index 721c18890acc28de66b529bc8f9e1430bc925bda..65b82a3f3222c51c4a419918ad1e74dd52152aed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt
@@ -20,12 +20,16 @@ tf_module {
     name: "PeepholeLSTMCell"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SequenceFeatures"
+    mtype: "<type \'type\'>"
+  }
   member_method {
-    name: "export"
-    argspec: "args=[\'model\', \'saved_model_path\', \'custom_objects\', \'as_text\', \'input_signature\', \'serving_only\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'model\', \'saved_model_path\', \'custom_objects\', \'as_text\', \'input_signature\', \'serving_only\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "load_from_saved_model"
-    argspec: "args=[\'saved_model_path\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'saved_model_path\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b03cbb8eb804ad80ce5c2d6e43fe07d4ac9db9cd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.constant"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Constant\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..02f8c252bdafc6ae5e0db1162ba2185c04981b63
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.keras.initializers.glorot_normal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.GlorotNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6d18a3b6e7e063b5b3e172228a93b934a41736bf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.keras.initializers.glorot_uniform"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.GlorotUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dcdb6ddf5f0c6256c7f1160996a08565330aa6c5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.identity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Identity\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'gain\'], varargs=None, keywords=None, defaults=[\'1.0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cc2dd171dfceba916fed1a02bbfb19f26497adca
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.keras.initializers.ones"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Ones\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..855065c1634abe2c794fec705a5dbc004fd3e597
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.orthogonal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Orthogonal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'gain\', \'seed\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt
index 7412cd130588a6a95538607b17b93be26492111a..15a56fbb973ccc729d7b377ef6a20b426687690c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt
@@ -48,6 +48,34 @@ tf_module {
     name: "Zeros"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "constant"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "glorot_normal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "glorot_uniform"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "identity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ones"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "orthogonal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "zeros"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "deserialize"
     argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9b3359d7a9b7deda1c80ffce17b061d87f31235
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.keras.initializers.zeros"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Zeros\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
index 8a0b8eb46f006497472c1e9ce539e91db19bd260..0c6c0a34b9f606398831c2a82e9b049fed96957a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Activation"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Activation\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
index abb3c236948a7f46d64cad92ae922324446f9a99..15bf03977dbc03660971fc7343cb0388d8696326 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ActivityRegularization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.ActivityRegularization\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
index b27db4e7f23499fd27430059f1cb556f341547b3..b265384d5980f4c4982ced19af0208427da56817 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Add\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 50998ac9d63c9492523720d7dcc8041fd9efcab5..3a0882daf1eefaf17f893c3b565784bdd60ac689 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.AlphaDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.AlphaDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index be17aeafb5ae383cba58b854808f6c9bc0e9696d..d2ee310d68e5da9f07f9cb6656165c97fccaa469 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 7f21b444bc8832189b11cd8ff206e034bc89170c..1da079f39e25b020406dabca46966d57dfb4451d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 2ac86f152fad454fc0b09e2cb8814f23ad997c20..d96751ccf916aa9a9522f341a3befbc987f25125 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
index f6b1dd2f7e4244218b7c64868b773142c79695d6..3819e525a99b67337f1d3a36b30f6ee0dfefa03b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Average\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 3da1f43a92a3fb5a146bcf8fd16f26783487f129..47f6b397a70109e9cdb833eae0cbec8a753831f7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index a7be5ac81814b28c93407cd5d1ca7c3f60822f0b..4b8cadca5344acd7fe279d0132184d666a94f448 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index c5c29bead383da6b9c0c7436fb089e27413e72f3..5c66da42e64b3cf57c2e03002bb95bc33eeb3ac2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
index b13f963a6fcaa8b4c2da541654564d620c710c20..9ae69aa163c33359850e5a290613929fb65ad689 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
index 880d18e1aae53512b2f587b5c8914babcd68566f..95eb6f69ecc4fa313da9eca9b6bee185b647be03 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Bidirectional\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
index 1eb0cf1a188b88d55b82297da715624c9e5a58f2..09410135435100779208ff1c3db9198fdc88b178 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Concatenate\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index d9394e60f532465c1852b2cac46ca4cbd9125583..941b6aa3dd3316d15edede430ef25da913bf0ce6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -196,7 +196,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
index a0f6dc8097adfb896a8f3aa3b642c2997e257cf3..4bf8336fb32013258ec17edb810a68e7c0d3cfb3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 037b92f861b14720a1a638884752a4d3e1dbd9f9..221addf20aec476c7533284f9d7437cb0e1a6bad 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
index 6a0d027d47d999f5770e59299fc1206249bf9b43..1c95fcc8b543c3da318119f7c9ed64952ba5b819 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 66b5bd75fc16c37aecaa65ef12fb2311925c252a..994a507bfb20a675ef2b4f8f5a9b2284ee0c21ad 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
index e73133ff0731821407084cc1cd6160b2e9bf3d9c..ae251b529bd6243733263acad98367a1a64e6530 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
index 7af6b2b3c398473398a9d2e227a42ec96451b301..1d73eecc37c30b0ce6ef723f56691cc502a5698e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index baff492dfbd3c9ab6f2c269cb89632768e6b6c92..d37ec0f5603237cb5c397dd8193f51d5e50db0ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
index 63d30a61851cdae8daa8a5dc70fa733fd6b2ff11..bb3c37d573fc2477a7327ea4dfec45a7a66e7c22 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 7a29cbbec35b885792828828354ac8f9a29579b6..fc29f1c913adbfde33827d8095181763a43126d8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
index 87c75c02243cd646502e12e2947555ad7c6913e9..2658fb4e9984969c60257c17361799e2f56b1bb7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
index f69104ddfef17c8b5df36f4bc3e9b0ea3a986295..58567eafa4134b747b4ca7ed9b71aa77d3a4c2e3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
index aa05471933cc97a872480e0ac45213b49a882189..42be76f4189d6fe299acb6fda744e268881de6f0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
index d61f1ddc1d506ae2db992aaacbdc634964d53292..11092f225d1c350ef81653555fb3cec8f6bda85a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
index 3caa3ff4edf352fa1baaf29092dd22340bdd12a4..d6f7f3033ecf8e226b961dabfe59e751639e5b98 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2.DenseFeatures\'>"
   is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2._BaseFeaturesLayer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
index 06e8b6b314183b884e635f3b78e5bd5368e0962f..339c9f52b7ff45ebebce128980b3ed9fc47bfc5b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 9fdf6f66d1160a49da302ffa8eeeade3009de048..c2992de5a8f9757266fe681943e7a6b84039a893 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
index cbe102065071a00596fb4b8f764b410737c638a6..b37f4c845a45418ba8fc47c89718debb97de8f88 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Dot\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
index 0efba09b272c8ffb2220ccfaad830c7fff98568c..d21f577721c3052507969c208870b9d69dd3313d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
index b34c499eb2e603aa8e2a6c9c84ec752a41efd0de..f1e086b5dc50a3e47bbf2e9d3d130cade60b8902 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ELU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ELU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
index 51dd853127f549c8ff370391f11cf7b8021af469..eb3496aa48fa43066f4f394ba2039416b4157f33 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Embedding"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.embeddings.Embedding\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
index dcd18a9cedd53565fdf38d9787335e0afea9ad3d..bbe324c77255fe5ccba650566d4a455196ce49ac 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index f029907ee86943fb8c04eada819e9cbfd6d01009..dd93e32ddcecb70203798c65483638a328019d3a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRUCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
index ac2d8c9aa3b74e5754c2a8014b4c093a610c5198..32e69856b9e569718e96ea1e0054b4229d47707f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -180,7 +180,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 15cbcfe8edffa92ef0514248e9dbc523dc6a49bd..e24862632e73c9030dbce521df445da2521f322d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 865b898c4cc54253d85442f2db2f3f624ecfb817..b1b5759881792810b4b11e279bd699652c256334 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianNoise"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianNoise\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 3e17aca17cc4e636ea3f6235f04cd4b7f468ae28..9c4087a06f1776c3372bcc8524f1ea48af1b7447 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index b160687a2a610714f5dd6c0cc7c7c92408d386df..d56429442c626340b0f38becf452668ff1c4535f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 70e8d51a5a782d5f473e1350d16d942998e58fbb..089145ead9f0aa35f4a6e2d41b5dfb8425c68557 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 809dc8554b38af9486035f1f3b03aa58392812de..2bc02b6f69f2468449c8d551d42e3b12e219964a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 3fbce8cb714355c0898dcbdc6797394410e90253..c2510d129460b8819da832b3354321db49868a29 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index 70e4103ea1abd5bda90811d127230105ae7bb941..845d6b17015a4168426b430add12d303fe14cf5a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 000bf54c4523307d791db76d73e1cbe71cb46e4d..f6fa8659dc4cf316968dd9f48572532fc0b4e5aa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 8ffbf07f9bc32cc9a3a83e2f57f5fe4d78fffd3c..1285e21f714e3cac3288e4d5c8b883c5ec909d7c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 3803d2b0a8765f4832df34fc4876256d4dd2ea86..a1417e46ceb713440bfcb16d7f29d415883a3633 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 28668224e01e04bbc4c14259d84a11ec72c826a8..ff4da8ba542732f48388428f3dcf9452d41a3320 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index b83ed67723afd5544ec19c599437a57909d780c5..7140d5718ba61f508a1d00729a4777745994bee4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index e689d69140e36a94c731f4c3b4578919d31343f5..4edeb9788d88000634ee1aadab9fea69fb2c83c6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
index bb6eddae7168b576bdaca91b6f7951fe7b65ee1f..48609567d5bed7246d0162ddfb8da92090c5640f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.InputLayer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.input_layer.InputLayer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 5fb3f9dd3aa9a3f3761c2bfadbcc19f46fe2303c..7398613812d0b5dc7d3f9bb62fdda0bd08d11b60 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 89dfc2a256da20cc65b7d18601dce240f5580a21..c9b759d792752a372e1ba35b77fc964c9da4eea1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -180,7 +180,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
index 376bec0814880e3fa0091a41cd9a4ba0dcc4ab60..88f1f8b06d1e887385ce44903ada3dfb3c934e21 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Lambda"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Lambda\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
index c5f91a6338cf5c0b8f017f6a6a87d0016ffbe999..c95f9159cdf2549cce9d645d9cb9b4802ea87018 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -105,7 +105,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index bde888735916a018647f681968241a583e0271ef..80d1c32dede3d72a4bb5f150d9c9d51143b7f5ff 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LeakyReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.LeakyReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
index c4726cf82430c800267f24032f2e02fc65e9499c..9bafe013efed1d59e1a6c43600cbe35593b04f97 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -140,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 16945f2c12a7be4eba8a67a9a58587d756888d12..b050302861485258c0043d97bb325860f770fb06 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index f05741ffceb6a855f56731086619dcc621c8d71e..3bb780cdb0ae3b2f2c8b95f1e41c524f12d49162 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
index 7885db4ed291afb8ea627cebe1dbae45723d4b2f..690208be83a58bf107f02fa342812c9d29319183 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Masking"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Masking\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 9380d26cf4c7b1c93a4c0ce2681e792381c42deb..02f3186dc60056844a43821411e4438eb123fb88 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 8eb8218df3f3532c6c108e9f43f6ff5a708c7fbc..f2e9a3b13665024e8102f1db84293b73fa094d42 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 0c96f86ed36d7cf99c396f863de6d9ef8f90adc5..868faa03919a0d656394a691200e622d987f4be1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 0c6b230eb79aac1e719949b3f8331423b621d47b..8e1662630c18d8662145226c6a54a05f49cab24b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index eb7ca52fba97174a1c6869ae003beec8ffc328df..ab96640936137bbcd3e73b1dbb56060dfe4943d8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index e724e9088f82f7ff7152cc4393af4a8f582136c6..4f492f50fbd7b571b5fc853e6523c9cb56d73e82 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
index dafbd09ee28ef6a5ec933cddf3c246cf2f4f4480..702f2e88d0b32776ab2e45bf968843e8df1e5578 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Maximum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
index 3122fbec1c7cad161d71fdf9970995adcedfdad7..fef939090d8d1e4ffc17b290754b16b87a07f38a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Minimum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
index 0527cda1f026e1ff9075e827c2902c45fd22db9d..1e2db3fb135ded60a33f8f4f3ffa92322d38dbd6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Multiply\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 814e5a5d545f0d4b1276ef1639eddb72004b4d1b..1450047d7dd716b97a824c6718bef26373214114 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.PReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.PReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
index aa1731afb82698cb44375407fc717bf32ef634d6..d5d3e0333bfc5ef8bf56a153d04ccc9c9df85dc7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Permute"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Permute\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
index 9d7dd85fe0eef3733a86b9e918396e882f5812d2..f1151f43c9df50850e61b5ab6e9ea7b7bc0184d9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RNN"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -110,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
index e9bba298bb028851e6e9b9a17ff40a671d9132f2..0874240fa6f60bd8c345fc1c2219f9f7bf843831 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
index 3c783eb5129028b3eb5160c75dde2859541cfd32..de9f8fb5939fd11ccdde97e383e1f4a7f1d99e73 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RepeatVector"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.RepeatVector\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
index b8e0882541c51209cca112c54197bbce305bd1b2..a125754f43a31806b94b4775b4339e9c610a8ee0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Reshape"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Reshape\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 310f369ed6c7a9931af56016ae09db5d4bca15d5..01fac3a2595838bdd17b8ae5b0dd93a7ed70d495 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index df19d781c21e403b51d451d772cfba66a7383be7..80628d76d8224af71ebc12a1d735481ea6c0c654 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index bf909509bd4b25507291839fff1ee0eaccee630f..41e96fde4b2a06c67fd2007f0b597104b4cf5b93 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 5d66bc6fb6334d99242009f59ccbe0e7aa2d4e89..f48b0b3517f04dcba9ace267b4ffcfc5abb70c23 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 88e9300de912f3b12712bbc311ac156803ef35c4..5e799329c032a1e72a61378623e2844fa7e89401 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.SimpleRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNNCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 9d81c6d4bc3139952e9f41113d05547b215cf571..60893bbf1aa5a508cb4fedabc8cd90927b488f71 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -167,7 +167,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
index 712eb0c6ec3b706a9e396a532f58916140a2c606..c96405a434c9ba3cb770b3dcaf6cf551ac4f200c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Softmax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.Softmax\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index dfc4ca27052f919ea3866a489e525ae1202795f1..153b7bc876738ef180901080bef3f8fdf2dd80f9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 5e4f727f71d8be2496bee1abdb87c7050f1ca02e..44e08117c3130df83572dd8409e5af273ac5b290 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 9d893cb30a066c4732ccab9d1520f5047a4d3a01..6e9f624999444a54b5d7dd2e372f9d5d470a6011 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index a2ed954e4c0ce1d474b8c71b41ce1d585d42d665..57da4c0ba9d62aad702a6204b3ce41b48bf6d161 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.StackedRNNCells"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.StackedRNNCells\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
index 8a0818e78ac766a624bdcce85591fe13e1d4ceac..27eb794485a8c1c46cf918782b97724e9774eef4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Subtract\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index b5591b48265d3d09459bf9bb114a4a3149984eb9..733070e50bc283729ecbd91b1b79af9521eba678 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ThresholdedReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ThresholdedReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
index 210e4fd4e6f0b2e8ba75d22e83134e2267fbece5..009ecca9a7f490f4792c3c0649e3a9003d0e247d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.TimeDistributed\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -111,7 +111,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index da2213a84fe2f6bc683630e5f8760acdf3239b19..f465aa64b91f644d5d2f292f0a67a3300ed3f488 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index e2c303d506e0f8f99d8ca89f29979a5999382378..049da3deddc0fa56d4ae2bfd6552452473d0b7d4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 396e774c8a4a10c4996c56c208fc4f4d432e3135..1d50c891154dd86e4ebe5b481a137e4c7ce248a0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
index 8b6418d514e61536f314da88f1586cea4f29cfc5..6604ac05d910abd89e3c04f6a8194adac582963e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Wrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -110,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index e8fda4c71ada65aecef59eb8012120488b0f17c7..2c8d52765d55357fb2f02f5268b089b27fa20429 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index 50c52d270b684bcea5105e4c9813cc62103403f9..bf9f43c1dbfbddb0ddd19b26d554df86ba493d25 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 84c6b78a2b4405fe0e2a1fbb3bba0635db06a21d..a78cfa826bfdf5f49947cd9ae2f9879bf36328e2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
index 476c597fe60ced2338a38aca4c5ddd5f21b0ccf2..9b5598eed07c9f04feb0d90820381abc12dbb456 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.experimental.LayerNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.normalization.LayerNormalization\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-similarity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa14c44fa3628236033e952b69f3a160c49a36fc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-similarity.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.CosineSimilarity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CosineSimilarity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'cosine_similarity\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
index f20ed26e2ea2819554159a9bcecb4141601e4a19..e93be80f1f702eacda20e4eefbec12dad724edaa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.losses.Reduction"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.ReductionV2\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.losses_utils.ReductionV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "NONE"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
index af9d3c12e306907ea543eaad2b982c41cab39eac..4ec0e887fc07ce66c554e2b4e23a06fc68a3e794 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
@@ -13,7 +13,7 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member {
-    name: "CosineProximity"
+    name: "CosineSimilarity"
     mtype: "<type \'type\'>"
   }
   member {
@@ -105,11 +105,7 @@ tf_module {
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "cosine_proximity"
+    name: "cosine_similarity"
     argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt
index beaa6f0596f5c41ccd278cef1e59c876d4728983..6e00a3a355269a0ccc5d69b3fcea106c4908e115 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.AUC\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
index 5f0079762fce727000e4fdeeda3dfb012840f3ed..18cde2f32aa10100c63d81470cb6e0fd2e61d4f0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
index a0a3ae890036161d5ec8da0af6b81495c99f9146..c5d2fc9c5397ae20dbd0c7f8f7ce7801c63c3997 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
index b6ce9e042027eb5e1186de4ff7c01230fb0e038f..a8662459c9ce52da3a42e9c5e47c52b6deb6ab06 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
index 587ffddac96220dad543be2e63a0fc202d60cb10..998c4cbb1fad2352cfb9a510ba6e9b153860fcf8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
index 8cd173c7eb9f77d2731b790b068ce3a768f1c586..04f1794aba61aae085a7580806e524eea8b2a791 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
index 99414dd54ec1fd383f2f1a672f311a3b99213a00..19442b5028dda68548c19c74e0828abf4fd54534 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt
similarity index 95%
rename from tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-proximity.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt
index b059e039deaaf699414c779584eff50b5b760537..678c7b0681fe4281893fba70b4652233a91e2a0c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-proximity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt
@@ -1,12 +1,12 @@
-path: "tensorflow.keras.metrics.CosineProximity"
+path: "tensorflow.keras.metrics.CosineSimilarity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.CosineSimilarity\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -90,7 +90,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_proximity\', \'None\', \'-1\'], "
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_similarity\', \'None\', \'-1\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
index 5432f7f4006b165fefb9aa028bf7d36d8cbc38f1..5a94569660fdc31f1889b5ca64f1483970cb5235 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
index 75541bf285d8989f867aabc7c7025e56cce1d05d..9033d9e655b2f2b80836153c23d9927315360de6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
index f45a57df41e56a52f1a4b784a8c8d140f6670b25..dedc64f1375b66b90f655f280c1a56ba165cfa17 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt
index 25bcf8591d45bf2237845a914122f5f9ca9c0ad7..af8366b60876cb31f840c5f5007e67980be8dc3a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt
index fb2df7f459753b08426449f026129d0960d21eaf..a7e072e21cc94492ed27186f44b92863cd791d62 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
index 11538c6c09dc4f0ed7de48fbf95dcc253adb1e24..75173ad17a9c1fa02451287adad10870a60d653b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
index adec3feffa1f07d4d65b63281f1a87c7469eb4cb..7be81b63bbe01b8534bd64d163e735d735ff88f3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt
index 47b550a9e73bdd33c5d592ff86f237d821866e09..21e44ed988494119662e5e1a5101edbe4d7a35fd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanIoU\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt
index a1ec7cb51e545077737051ff3f06509882e4d32a..8ef17fc34566e8ab6c5cc73781b40cb0f7396067 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
index af960fa54329d7d80d23f24df798509b2d12bb85..363f532ba410f1ebae5f105769a0e69c2e2d0166 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
index 417f92f2734329e2382d6d6bc1ff7c399c990704..712f10cd3051fe3de82472cb0eef2ec5fb53b6dd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt
index 2a5218fed4cadf155f29781ff7341675b64f22fb..fff91d2b44c6e1e7c1fa0339c737c4a44b9566b6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanTensor\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
index b089109baa5ea49b588e6e9a54b56256d4e4e3a2..cffb444835c58c28953f85c61a8f2d98f2e74716 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt
index 2216043cec13086660c370d835209ff39ddaa324..ce746ab350bfa0534bf7f9ac7d6e8255c7749894 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.metrics.Metric"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt
index 8e91098f95bffa13fdc97406590d1ae71edd6887..570b77408cbaa2b7a0089f9de8a528e604799abe 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
index 9aeaa5627a9805579d6a6c4e09336a4d7994d1c5..83535d56cfc37932be785684825bed0e29a4fa5e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
index 748cec08668c461fcf80df6a50fd5192f99073b9..9ec2bbc4a3c8709f162dc0407408b2fe29b695a8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
index 4dd60bb772aa93b637fca287baf3f17104a2e16d..e2bdbd54e22756b823716c149cf0f24661acc812 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
index 97aeb680be1c5c412a16e2a73e3f1bc2ca8ce6c0..172c40eb2777d5504968de225718c270a0ce4e99 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
index 5a7bef4714d5aa37dcd1c11048ffd370e8841f0b..8a24088257a423c18f347eb256915bda10459e1f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
index c9687681680c13480d92105a5eb10ef0fe5bf89e..0cadc9dcd99c03d81907ad5b1c03fd3cba25f833 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
index ea0f2a760ab7e34eb5f5b8c2ba63eea96ce63777..c135b8f680061a1e79fedd9d705d0fb54344823b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
index 85f80b062efe3d2d91104b211c8d9d75127c8c0e..4f6818797e1e55362a35d37e70a05bbb3b8d65b2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
index 15895daf42bbfbbeda419d767844fd840ab4178e..90bbb087fafcdcde5dee048c45adbc45e3be2e55 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt
index 5ca1c6c8396e3d79a8f6250f34e7137870a23ac6..30ef19e02cfc99d117e6a396beeaf6422a105013 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
index 9cf1bf21cb5acd9d81c581eecfc2e64b78bd9e70..e59476a2410f859dff7171162a2cab123d5e853d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
index 4bc9383f6ffc90972416fa031d5515a149e70425..6627a460c7522358a6f44d415a2ad8ce59b97427 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
index 2eae4df0ae344656bb637bc27e806876304a86f1..8c3c2cb03a8b28db6212e29e0cb9b7b61fca7174 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
index 92dddacf2cc55ad0aec7ce012c7f6f5ef91578e2..71e89765cb8a660e4843362f912d7d011ca4ec14 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
@@ -29,7 +29,7 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member {
-    name: "CosineProximity"
+    name: "CosineSimilarity"
     mtype: "<type \'type\'>"
   }
   member {
@@ -180,14 +180,6 @@ tf_module {
     name: "categorical_crossentropy"
     argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
-  member_method {
-    name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "cosine_proximity"
-    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
   member_method {
     name: "deserialize"
     argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
index eb1ab1d9dd61b36ed8662e25700f12f82aadb502..63100a2176869e0a4ff30355c3df7ceaefaef65e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -135,7 +135,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
index c69cf281742360d9ed4d1f7cbd35219cf04b1149..5c9ba04296ee39e1fdf014396d166d91ef5f714c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -140,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
index 0a56293e804f583a949ecb413da0ba613e0bc876..8471803624634eb2d3bacd79e236e51d4488a764 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adadelta"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adadelta.Adadelta\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
index 14d0894e5622021c4961228d431d01516b752055..0466ea65fa3ccaab1459841def55d6f907c7b14c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adagrad"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adagrad.Adagrad\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
index fdb1ea838c04f296c3d0ee7596d73052b6f4b6d9..9762fad5d0fb7690e041b853eba65bee1583ad14 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adam"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
index ece63ec168dac58f58286dbd9fd8a8151d0dc2dc..f477a60d237f5801b8ac8713c150cd83f2b3d768 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adamax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adamax.Adamax\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad42c6b75b32947635ec1098a3d639e011ec3765
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.keras.optimizers.Ftrl"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.ftrl.Ftrl\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'name\', \'l2_shrinkage_regularization_strength\'], varargs=None, keywords=kwargs, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'Ftrl\', \'0.0\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
index f952f88b6d203488ea0ec4f1794d7de79a25853a..3ffb4bb8b4dea5840013e830efc7eec6699f71ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Nadam"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.nadam.Nadam\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
index 27bae902b0cb7f1f4e09737a83fadd95a83cc163..9639c71ce415f5a942485fdc0d40f32c24f16b7d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.optimizers.Optimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
index e523443a0099b57942c73cafcd8a919503e8db38..2a7603d69b4f55d23e03e6e3d4fa5e60aeaac4c6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.RMSprop"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
index d2721f8e92088c216ab748cae45e415553b9d4c1..41635553347f5f1c04c221574ce7e5c6ac05275d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.SGD"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.pbtxt
index 97f7c4b8c9c35926484b9dc9d7ec818d8360b4ab..7a333834c267e59f7a09c4936b8ed59776be7ee5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.pbtxt
@@ -16,6 +16,10 @@ tf_module {
     name: "Adamax"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Ftrl"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Nadam"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-adjoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-adjoint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37344f70311bd225856ce72c52dfd0ac1fb09075
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-adjoint.pbtxt
@@ -0,0 +1,150 @@
+path: "tensorflow.linalg.LinearOperatorAdjoint"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_adjoint.LinearOperatorAdjoint\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "operator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'operator\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index c7a50969b54e5efc4d338caa79dea76d86bffe8a..ddef774a75157401354d29b75c7a00fbedfd9ec5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_block_diag.LinearOperatorBlockDiag\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
index 3900c752c8527f68af2496f99083d80fc9d18106..97a6b1a475b9555f6f5ded273050b5751625b78b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index 7b876099af6a28d9fca2e5c55aeae5e4610f82a6..e2bfe7e7d852e825f826f9f785c40b9550f706be 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index 5bddba8e798618f5b1d0cdc61ddff9725a495fe0..8885526669065e5a5506bfe1bf93076f4584f9d9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
index 62ba8bb59e8af14447fe570ba28c5d0eba7f6af8..2a017fcb8987d46d8e24d2d21b43ae9962ad2075 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_composition.LinearOperatorComposition\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
index 0803feeabd12acb7988459fe6da2748e19b70a5f..31dcf7b0a6b4699e7009746fe62ec5551ee3e11b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_diag.LinearOperatorDiag\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index 6def32864b9cc660b94d628ccd53dc48a566ea81..0ad39b4ba6006a1efa6b16e650ef3140516775f6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_full_matrix.LinearOperatorFullMatrix\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
index dbf1ac82d33b81c63e5c356ac736f63262797ff0..f66a5a833a42c06f32696abc0a3114aa89f73a7e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_identity.BaseLinearOperatorIdentity\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -60,6 +64,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt
index 6a3fe4dd66397bbff4b0b9ca6d195adeb64e3337..a7eb144d83aaeb2997d44b703b46de9a01c3a478 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_inversion.LinearOperatorInversion\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index 85d902b977ceddd405abb1154a086d7bd29e7848..c983f8c6e6aa53716d1c2d07f219baccda99bd04 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_kronecker.LinearOperatorKronecker\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 638d82a599248e547bcae86ebd6d8d8dc3f6aa4b..813aec2a137ccaaea9718b7d0254ed0d60500247 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_low_rank_update.LinearOperatorLowRankUpdate\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "base_operator"
     mtype: "<type \'property\'>"
@@ -83,6 +87,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index ab1b04bd3cb1b215b848019b6c578ce091f8f828..0bb7a15e1342aeb4be94e9a40e1e6b1828e397b8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_lower_triangular.LinearOperatorLowerTriangular\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index 961969aac58b78e4edd53b47f2932f71f2d21fd5..7747c985404e54f93d012aba86a39503a855c76d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_identity.BaseLinearOperatorIdentity\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -64,6 +68,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
index e76738a9648123414159fdc9666a99b0577aa46e..590782bbc1d57ed4efb1cfb68b145b49d64c4545 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_zeros.LinearOperatorZeros\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
index b35cd69da474a9665652f04f12b34a8d9f33fa8a..ed6bfdff288220fc0bcdf9fb6c4c78abfe5e43b2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
@@ -2,6 +2,10 @@ path: "tensorflow.linalg.LinearOperator"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -58,6 +62,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
index f9119cdd5f728f3b35d83248daff17547a497aa2..e46cb44ba56c1df7b4c004b35c21f80326f963a7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "LinearOperator"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearOperatorAdjoint"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearOperatorBlockDiag"
     mtype: "<type \'type\'>"
@@ -200,4 +204,8 @@ tf_module {
     name: "triangular_solve"
     argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "tridiagonal_solve"
+    argspec: "args=[\'diagonals\', \'rhs\', \'diagonals_format\', \'transpose_rhs\', \'conjugate_rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'compact\', \'False\', \'False\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-optimize.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-optimize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fedb5ee9fa4a31f25133bef55b980c18ed74fb79
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-optimize.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.lite.Optimize"
+tf_class {
+  is_instance: "<enum \'Optimize\'>"
+  member {
+    name: "OPTIMIZE_FOR_LATENCY"
+    mtype: "<enum \'Optimize\'>"
+  }
+  member {
+    name: "OPTIMIZE_FOR_SIZE"
+    mtype: "<enum \'Optimize\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-representative-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-representative-dataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d14b69531d183faa35d19f379d6b20c29b02e6e8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-representative-dataset.pbtxt
@@ -0,0 +1,9 @@
+path: "tensorflow.lite.RepresentativeDataset"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.lite.RepresentativeDataset\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'input_gen\', \'output_gen\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-toco-converter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-toco-converter.pbtxt
deleted file mode 100644
index 3ef90b8bc4646a2adfcbeca2258ff5aa7cbf8894..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.-toco-converter.pbtxt
+++ /dev/null
@@ -1,24 +0,0 @@
-path: "tensorflow.lite.TocoConverter"
-tf_class {
-  is_instance: "<class \'tensorflow.lite.python.lite.TocoConverter\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "from_frozen_graph"
-    argspec: "args=[\'cls\', \'graph_def_file\', \'input_arrays\', \'output_arrays\', \'input_shapes\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "from_keras_model_file"
-    argspec: "args=[\'cls\', \'model_file\', \'input_arrays\', \'input_shapes\', \'output_arrays\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "from_saved_model"
-    argspec: "args=[\'cls\', \'saved_model_dir\', \'input_arrays\', \'input_shapes\', \'output_arrays\', \'tag_set\', \'signature_key\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "from_session"
-    argspec: "args=[\'cls\', \'sess\', \'input_tensors\', \'output_tensors\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34b4133d0ca6edb929c780babd0652187d41c76b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
@@ -0,0 +1,210 @@
+path: "tensorflow.lite.experimental.nn.TFLiteLSTMCell"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.experimental.examples.lstm.rnn_cell.TFLiteLSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "zero_state"
+    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2fff2b8606c98ddffabbd3d27e7a7848d1fce86a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
@@ -0,0 +1,210 @@
+path: "tensorflow.lite.experimental.nn.TfLiteRNNCell"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.experimental.examples.lstm.rnn_cell.TfLiteRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "zero_state"
+    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ce858c46ba304b7cc3ce6b257518cdcc9aac646
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.pbtxt
@@ -0,0 +1,15 @@
+path: "tensorflow.lite.experimental.nn"
+tf_module {
+  member {
+    name: "TFLiteLSTMCell"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TfLiteRNNCell"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "dynamic_rnn"
+    argspec: "args=[\'cell\', \'inputs\', \'sequence_length\', \'initial_state\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d585ea6643b731fa4767301f13654fc699d6e23
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.lite.experimental"
+tf_module {
+  member {
+    name: "nn"
+    mtype: "<type \'module\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
index 154dd00821794ef4a5118e98d67e32beca38bebf..5fccb608abd6dc03b91707fcfcb403a9f0ebceca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
@@ -13,19 +13,23 @@ tf_module {
     mtype: "<class \'enum.EnumMeta\'>"
   }
   member {
-    name: "TFLiteConverter"
+    name: "Optimize"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "RepresentativeDataset"
     mtype: "<type \'type\'>"
   }
   member {
-    name: "TocoConverter"
+    name: "TFLiteConverter"
     mtype: "<type \'type\'>"
   }
   member {
     name: "constants"
     mtype: "<type \'module\'>"
   }
-  member_method {
-    name: "toco_convert"
-    argspec: "args=[\'input_data\', \'input_tensors\', \'output_tensors\'], varargs=args, keywords=kwargs, defaults=None"
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d180a9fbad492089f37bc98de50904c8bfa4d38
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-binary-crossentropy.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'binary_crossentropy\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3937dfa153a08843c12f1098c05e49e1b1a01c79
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-crossentropy.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'categorical_crossentropy\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2a064dd649535336ddfeda4f24b2594771bbbd5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'categorical_hinge\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-cosine-similarity.pbtxt
similarity index 74%
rename from tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-proximity.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.losses.-cosine-similarity.pbtxt
index 4952a76291c00bfdd73eed5412e7421887d1bab2..7829f0f327be676b1fe81775bf3a8a368e88c7db 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-proximity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-cosine-similarity.pbtxt
@@ -1,11 +1,12 @@
-path: "tensorflow.keras.losses.CosineProximity"
+path: "tensorflow.losses.CosineSimilarity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.losses.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.CosineSimilarity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'cosine_similarity\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..155154c312a8b9bf034d5600bd2d859036934ce1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-huber.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-huber.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5052c19a049c0defa2f5b64f3d16626dc95374cc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-huber.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.Huber"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Huber\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'sum_over_batch_size\', \'huber_loss\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b739c057b238b6a3dfa14e325deac3c4f4b46fc1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-k-l-divergence.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'kullback_leibler_divergence\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-cosh.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-cosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..557cc210450c2553668f914c84218bb762202668
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-cosh.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.LogCosh"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogCosh\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'logcosh\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-loss.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..65b1feaefddc93bc5b6da966525ace33fb047e07
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-loss.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.LogLoss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogLoss\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'logloss\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-loss.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2bcc6f8854368f919cf9b4254caa6fc2e071dc6d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-loss.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.losses.Loss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a33db29d421ccaab3bb829e3924cc3307039ad1b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-error.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_error\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4c79a5e8b8a9fc3779f93fa8e8fd29f28df9ec52
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_percentage_error\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b99e19413ed7a87509b1ff2296046e718c5dcc82
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-error.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_error\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e01827391a796614bd0b69f36f0ae268ffb31a7b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_logarithmic_error\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6603cb407265b59ad17c2e428306e6b025aa9a1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-poisson.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'poisson\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
index 6a44e4ce66c9dfcb9912c96d0106e4f4fd9fdcff..e4ae87ea29365b13b0c49e3ea2329550b0e420ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.losses.Reduction"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.ReductionV2\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.losses_utils.ReductionV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "NONE"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e3ce6fc728d594a3bc0c7a0ffe83f078e81fac8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b5e3757143871b04a63c258d49f8ef30f52304fe
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-squared-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'squared_hinge\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
index 36007d3ca6f63ccf06ef613aad584c8d1c63d627..f688a32db7ae7212f4356de3fbc0e54729edd71d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
@@ -1,7 +1,183 @@
 path: "tensorflow.losses"
 tf_module {
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineSimilarity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Huber"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCosh"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogLoss"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Loss"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Reduction"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "KLD"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAPE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSLE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "binary_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
+  }
+  member_method {
+    name: "categorical_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
+  }
+  member_method {
+    name: "categorical_hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "cosine_similarity"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'name\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get"
+    argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "kld"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "kullback_leibler_divergence"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "logcosh"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mae"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mape"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_absolute_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_absolute_percentage_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_squared_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_squared_logarithmic_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mse"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "msle"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "poisson"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'loss\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "sparse_categorical_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], "
+  }
+  member_method {
+    name: "squared_hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
index a0b8e9e4013d4f69cc933f6f495bdcbbe478641e..c2c5bb50b59815a154c52407c82648cc28d4b31b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
@@ -120,6 +120,10 @@ tf_module {
     name: "divide"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "divide_no_nan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -260,6 +264,10 @@ tf_module {
     name: "multiply"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "multiply_no_nan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "negative"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -300,6 +308,10 @@ tf_module {
     name: "reduce_any"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "reduce_euclidean_norm"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "reduce_logsumexp"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9aa59370a3050d67e35324831d688f23ab444303
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt
@@ -0,0 +1,200 @@
+path: "tensorflow.metrics.AUC"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.AUC\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_thresholds\', \'curve\', \'summation_method\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'ROC\', \'interpolation\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interpolate_pr_auc"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f4cfade42ece20e113bfd41744f05a451bbba34c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.Accuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..84198f3cf3d841627aa88d690673b5e8fb1838a1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.BinaryAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'threshold\'], varargs=None, keywords=None, defaults=[\'binary_accuracy\', \'None\', \'0.5\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d51c6a798bd20f25b523dc142bc6cb4734b5b6a0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'binary_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67e14faf3f950ed4d52c45111fa8c4a7023f7019
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.CategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..33cd4c574931e8de692bc69a7dc85d98ca432fcb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'categorical_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5c90fab3fdbd901235bd7b5b10259fe2a67e071
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..326df8fcc2aa4beeabe11c4566d5d77b6ed13981
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.CosineSimilarity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CosineSimilarity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_similarity\', \'None\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e9e32ad53c790a6753a1764c959575ff3eee7631
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.FalseNegatives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.FalseNegatives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..45a2c48acba5be6c53ba5666cf9e308f374f6372
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.FalsePositives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.FalsePositives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2132fda36f44150e7154323c5d5f1e317173777
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bbd0db90fa517982b85f4fe071c16c7cef4f2f70
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'kullback_leibler_divergence\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b66eda8523d026935d7edc380d4c9a00e41e4a6b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.LogCoshError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.LogCoshError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'logcosh\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..90a6b0664165e37a61bccd7468dab19c7105d3d2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8710a45cccd402e6e298044a53c957bd6797342f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_percentage_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dcb48837860114f5073206abe447151fe130f5d4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.metrics.MeanIoU"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanIoU\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..207f262851adfcd6d751d3f33c3d528f44e8daae
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.metrics.MeanRelativeError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanRelativeError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'normalizer\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ce4c959abc303ff77bd7b50738bc8dd30750cd7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ad4089c2cb7d1eeeb796903e75731b675abdd5e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_logarithmic_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31dc0cc9d7a88b317faa0ba5df1d67a70e0955d8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt
@@ -0,0 +1,204 @@
+path: "tensorflow.metrics.MeanTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanTensor\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "count"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "total"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_tensor\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..794665197bfa453fd73405d1ca8b1986c1758257
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.Mean"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1826983476253122caf753c821a3ce331801856
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt
@@ -0,0 +1,195 @@
+path: "tensorflow.metrics.Metric"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..202739712fcf5471911158b8944203c36b80546b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'poisson\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ebe442e8626a18729a1930ccad7a531c12553be
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.metrics.Precision"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'top_k\', \'class_id\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..74489f6446e533f89bc705a7657b0016abb4ab35
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.metrics.Recall"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'top_k\', \'class_id\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f408836b66fc3c0ccff64a625a58879b44b7498f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.metrics.RootMeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.RootMeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'root_mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a6233b63a7161e52f4f40b306dd114e2aff5a2a7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.SensitivityAtSpecificity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivityAtSpecificity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'specificity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7dfbf3f62e32b0a1ca6d8d698d0f563484927d53
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.SparseCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0cb0007b1bf7ce959c25413aa6097f5026b0267c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_crossentropy\', \'None\', \'False\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d7e4344e43907e876020cf6fe58c0a997180a76a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.SparseTopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseTopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'sparse_top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a9dbf70f8f66debf98967b29690e087a473e57c6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.SpecificityAtSensitivity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SpecificityAtSensitivity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'sensitivity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76ecc8c41c68ddf84c185d12c446f7e4f83529c9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'squared_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1642e1e62f2cbef1ffe2f3a01962e94c16030ca2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.Sum"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Sum\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sum\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d47520fe113d2858d58bc1325a0fbbf74cb087b6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.TopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8b55f8a2eec45220435ad1fa0298e4684d00ad1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.TrueNegatives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TrueNegatives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9e7274d7e6171d3766555d8c6910de8599e568b1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.TruePositives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TruePositives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ae997c6bd4ec0c98631c439a01b085f7cfae8e4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
@@ -0,0 +1,263 @@
+path: "tensorflow.metrics"
+tf_module {
+  member {
+    name: "AUC"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Accuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BinaryAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineSimilarity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FalseNegatives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FalsePositives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCoshError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Mean"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanIoU"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanRelativeError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanTensor"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Metric"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Precision"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Recall"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RootMeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SensitivityAtSpecificity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseTopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SpecificityAtSensitivity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Sum"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TrueNegatives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TruePositives"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "KLD"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAPE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSLE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "binary_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.5\'], "
+  }
+  member_method {
+    name: "binary_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
+  }
+  member_method {
+    name: "categorical_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "categorical_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get"
+    argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "kld"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "kullback_leibler_divergence"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mae"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mape"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_absolute_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_absolute_percentage_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_squared_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_squared_logarithmic_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mse"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "msle"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "poisson"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'metric\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "sparse_categorical_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "sparse_categorical_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], "
+  }
+  member_method {
+    name: "sparse_top_k_categorical_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'k\'], varargs=None, keywords=None, defaults=[\'5\'], "
+  }
+  member_method {
+    name: "squared_hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "top_k_categorical_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'k\'], varargs=None, keywords=None, defaults=[\'5\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nest.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nest.pbtxt
index ad5360e33589ff2d9229c2ad6da83f6bea101669..70bb6d760bc7a6c55bbdfd5c05cde4c08769786d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nest.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nest.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "flatten"
     argspec: "args=[\'structure\', \'expand_composites\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "is_nested"
+    argspec: "args=[\'seq\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "map_structure"
     argspec: "args=[\'func\'], varargs=structure, keywords=kwargs, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
index 865ce461c2bdbc8da482395b38d0efbadea68027..bd52203c1f0b0a13453ba99618efdb76dd00a06c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
@@ -22,7 +22,15 @@ tf_module {
   }
   member_method {
     name: "avg_pool"
-    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "avg_pool1d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\'], "
+  }
+  member_method {
+    name: "avg_pool2d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
   }
   member_method {
     name: "avg_pool3d"
@@ -50,35 +58,31 @@ tf_module {
   }
   member_method {
     name: "conv1d"
-    argspec: "args=[\'input\', \'filters\', \'stride\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'stride\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv2d"
-    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
+    name: "conv1d_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv2d_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "conv2d_backprop_input"
-    argspec: "args=[\'input_sizes\', \'filters\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
+    name: "conv2d"
+    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_transpose"
-    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv3d"
     argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv3d_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+    name: "conv3d_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv3d_transpose"
-    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\'], "
+    name: "conv_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "convolution"
@@ -194,12 +198,16 @@ tf_module {
   }
   member_method {
     name: "max_pool"
-    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "max_pool1d"
     argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\'], "
   }
+  member_method {
+    name: "max_pool2d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
   member_method {
     name: "max_pool3d"
     argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 3205c6a4dcaaa00591cd957021a463b77835343e..58d004b3d5d10332065216b4a816febb673a4853 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index 8ba92fcc8dc89958b8395aa986c358a03fedd66d..a9f7e85b1488dc49a52c3ac4d5a7ed55bf605ab5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2b476fafa9a6e26c29d91e28ac2ee66b6e74f637
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adadelta.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Adadelta"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adadelta.Adadelta\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'Adadelta\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..be2fedfe81f5e01bf86cdcfaccf19dbd1f367543
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adagrad.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Adagrad"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adagrad.Adagrad\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.1\', \'1e-07\', \'Adagrad\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..919c433648ff5950d4ab0c0f2ff2295d33d1085c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Adam"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'Adam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adamax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67fce4f5c63c0dcd364a124929c9232de1887ae9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adamax.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Adamax"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adamax.Adamax\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Adamax\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-ftrl.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-ftrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee4c9ad25fc678652e33fd5423f228dd86b89816
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-ftrl.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Ftrl"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.ftrl.Ftrl\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'name\', \'l2_shrinkage_regularization_strength\'], varargs=None, keywords=kwargs, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'Ftrl\', \'0.0\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-nadam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..06363234ea68f192105295a1eec2b8487c4eb121
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-nadam.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Nadam"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.nadam.Nadam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Nadam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..041922bdfd121b5f161f2d1dea443fc32c592743
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-optimizer.pbtxt
@@ -0,0 +1,70 @@
+path: "tensorflow.optimizers.Optimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-r-m-sprop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5deef618248d608bf571a9548ea72f6837984a09
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-r-m-sprop.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.RMSprop"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'momentum\', \'epsilon\', \'centered\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.0\', \'1e-07\', \'False\', \'RMSprop\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..381f72767b807bb495aeaa98c012e7fd97608c22
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.SGD"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.0\', \'False\', \'SGD\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e03b42ac7f8b8400c6c09061e6d9d09f0ac3d9e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.pbtxt
@@ -0,0 +1,55 @@
+path: "tensorflow.optimizers"
+tf_module {
+  member {
+    name: "Adadelta"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Adagrad"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Adam"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Adamax"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Ftrl"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Nadam"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Optimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RMSprop"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SGD"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "schedules"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get"
+    argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'optimizer\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-exponential-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-exponential-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f174d6cfc3ce20d1bf6d36e1cfff4d63d799a3d1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-exponential-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.optimizers.schedules.ExponentialDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.ExponentialDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-inverse-time-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-inverse-time-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..851d21c5cc063c97a49e8dbe2611b2f903d216a4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-inverse-time-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.optimizers.schedules.InverseTimeDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.InverseTimeDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-learning-rate-schedule.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-learning-rate-schedule.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..36db36e4f432e8c0b87e306d3a35d3a0e5bdde0b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-learning-rate-schedule.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.optimizers.schedules.LearningRateSchedule"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-piecewise-constant-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-piecewise-constant-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6613bedef5f638e4b7f3211827031d04b35cc5ad
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-piecewise-constant-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.optimizers.schedules.PiecewiseConstantDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.PiecewiseConstantDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-polynomial-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-polynomial-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fbd5bcef8f75e78c3bcaa87d82d45b2094bfed80
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-polynomial-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.optimizers.schedules.PolynomialDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.PolynomialDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e1e61b76628a5ef1638e5134a4687402d0e47c8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.optimizers.schedules"
+tf_module {
+  member {
+    name: "ExponentialDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InverseTimeDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LearningRateSchedule"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PiecewiseConstantDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PolynomialDecay"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'learning_rate_schedule\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index bb7ea2e13e828281de768661bbacaaa5bb99eeef..66489e68914341303eb1988a3ae9083f258cf3f2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "AggregationMethod"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CriticalSection"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DType"
     mtype: "<type \'type\'>"
@@ -20,6 +24,10 @@ tf_module {
     name: "IndexedSlices"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Module"
+    mtype: "<class \'tensorflow.python.module.module.ModuleMetaclass\'>"
+  }
   member {
     name: "Operation"
     mtype: "<type \'type\'>"
@@ -208,6 +216,10 @@ tf_module {
     name: "math"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "name_scope"
     mtype: "<type \'type\'>"
@@ -228,6 +240,10 @@ tf_module {
     name: "ones_initializer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "optimizers"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "qint16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -524,10 +540,6 @@ tf_module {
     name: "device"
     argspec: "args=[\'device_name\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "div_no_nan"
-    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "divide"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -668,6 +680,10 @@ tf_module {
     name: "init_scope"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "is_tensor"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -760,6 +776,10 @@ tf_module {
     name: "not_equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "numpy_function"
+    argspec: "args=[\'func\', \'inp\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "one_hot"
     argspec: "args=[\'indices\', \'depth\', \'on_value\', \'off_value\', \'axis\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt
index ada33181df7222f7e15c31c9df5f41bcb55026b7..2433c8846719bf5049c7c93e005012f9061ba648 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt
@@ -532,6 +532,10 @@ tf_module {
     name: "CacheDataset"
     argspec: "args=[\'input_dataset\', \'filename\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "Case"
+    argspec: "args=[\'branch_index\', \'input\', \'Tout\', \'branches\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "Cast"
     argspec: "args=[\'x\', \'DstT\', \'Truncate\'], varargs=None, keywords=None, defaults=None"
@@ -568,6 +572,10 @@ tf_module {
     name: "CollectiveBcastSend"
     argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "CollectiveGather"
+    argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "CollectiveReduce"
     argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'merge_op\', \'final_op\', \'subdiv_offsets\', \'wait_for\'], varargs=None, keywords=None, defaults=None"
@@ -972,6 +980,10 @@ tf_module {
     name: "Erfc"
     argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "EuclideanNorm"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "Exit"
     argspec: "args=[\'data\'], varargs=None, keywords=None, defaults=None"
@@ -1912,6 +1924,10 @@ tf_module {
     name: "Mul"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "MulNoNan"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "MultiDeviceIterator"
     argspec: "args=[\'devices\', \'shared_name\', \'container\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
@@ -2644,10 +2660,18 @@ tf_module {
     name: "RequantizationRange"
     argspec: "args=[\'input\', \'input_min\', \'input_max\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "RequantizationRangePerChannel"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'clip_value_max\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "Requantize"
     argspec: "args=[\'input\', \'input_min\', \'input_max\', \'requested_output_min\', \'requested_output_max\', \'out_type\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "RequantizePerChannel"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'requested_output_min\', \'requested_output_max\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "Reshape"
     argspec: "args=[\'tensor\', \'shape\'], varargs=None, keywords=None, defaults=None"
@@ -3056,6 +3080,10 @@ tf_module {
     name: "ShapeN"
     argspec: "args=[\'input\', \'out_type\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "ShardDataset"
+    argspec: "args=[\'input_dataset\', \'num_shards\', \'index\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "ShardedFilename"
     argspec: "args=[\'basename\', \'shard\', \'num_shards\'], varargs=None, keywords=None, defaults=None"
@@ -3768,6 +3796,10 @@ tf_module {
     name: "TensorListScatter"
     argspec: "args=[\'tensor\', \'indices\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "TensorListScatterIntoExistingList"
+    argspec: "args=[\'input_handle\', \'tensor\', \'indices\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "TensorListScatterV2"
     argspec: "args=[\'tensor\', \'indices\', \'element_shape\', \'num_elements\'], varargs=None, keywords=None, defaults=None"
@@ -3848,6 +3880,10 @@ tf_module {
     name: "Transpose"
     argspec: "args=[\'x\', \'perm\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "TridiagonalSolve"
+    argspec: "args=[\'diagonals\', \'rhs\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "TruncateDiv"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.rnn.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.rnn.-dropout-wrapper.pbtxt
index 9f6ce042250617fa74c4c1c6941f3e7ea033019c..53b61f03f96a1d4b4ef81032d716f3e913f0fd44 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.rnn.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.rnn.-dropout-wrapper.pbtxt
@@ -8,7 +8,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.rnn.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.rnn.-residual-wrapper.pbtxt
index 51dc8c1517e98b9ce16c38007fcb07a8c5088e77..9a303dfd6fde05e724d4012d1f9c09d69cc658a4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.rnn.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.rnn.-residual-wrapper.pbtxt
@@ -8,7 +8,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
index f6ab7ac0adcc594ef2a1c31b72882f5c72d3c06e..17275d4d8d7f4cebdca43ed0f2fed6a9841a7ea2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
@@ -88,29 +88,13 @@ tf_module {
     name: "VARIABLES_FILENAME"
     mtype: "<type \'str\'>"
   }
-  member_method {
-    name: "build_signature_def"
-    argspec: "args=[\'inputs\', \'outputs\', \'method_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "classification_signature_def"
-    argspec: "args=[\'examples\', \'classes\', \'scores\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "contains_saved_model"
     argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "is_valid_signature"
-    argspec: "args=[\'signature_def\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "predict_signature_def"
-    argspec: "args=[\'inputs\', \'outputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "regression_signature_def"
-    argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None"
+    name: "load"
+    argspec: "args=[\'export_dir\', \'tags\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "save"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
index c59f1b8474302b5529895b8aa9216a2e197d958f..85edef9d7e4c62a0a4f0c77f19e17d8c45535fc6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "SummaryWriter"
     mtype: "<type \'type\'>"
   }
+  member_method {
+    name: "audio"
+    argspec: "args=[\'name\', \'data\', \'sample_rate\', \'step\', \'max_outputs\', \'encoding\', \'description\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], "
+  }
   member_method {
     name: "create_file_writer"
     argspec: "args=[\'logdir\', \'max_queue\', \'flush_millis\', \'filename_suffix\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
@@ -12,14 +16,34 @@ tf_module {
     name: "flush"
     argspec: "args=[\'writer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "histogram"
+    argspec: "args=[\'name\', \'data\', \'step\', \'buckets\', \'description\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "image"
+    argspec: "args=[\'name\', \'data\', \'step\', \'max_outputs\', \'description\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], "
+  }
   member_method {
     name: "import_event"
     argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "record_if"
+    argspec: "args=[\'condition\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scalar"
+    argspec: "args=[\'name\', \'data\', \'step\', \'description\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "summary_scope"
     argspec: "args=[\'name\', \'default_name\', \'values\'], varargs=None, keywords=None, defaults=[\'summary\', \'None\'], "
   }
+  member_method {
+    name: "text"
+    argspec: "args=[\'name\', \'data\', \'step\', \'description\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "write"
     argspec: "args=[\'tag\', \'tensor\', \'step\', \'metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
index 3c77788cb03dea3c93ce3bfbc32f35613c927c37..ac9dd8f7189799cbf9b061677cd88058cb9d799e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
@@ -16,6 +16,10 @@ tf_module {
     name: "benchmark_config"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_gradient"
+    argspec: "args=[\'f\', \'x\', \'delta\'], varargs=None, keywords=None, defaults=[\'0.001\'], "
+  }
   member_method {
     name: "create_local_cluster"
     argspec: "args=[\'num_workers\', \'num_ps\', \'protocol\', \'worker_config\', \'ps_config\'], varargs=None, keywords=None, defaults=[\'grpc\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
index 42dcdac9e77a8efac875e4985f6a8f744e838ddb..629bc13612ab567006436bd95fee49c4e3acdefe 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.train.Checkpoint"
 tf_class {
-  is_instance: "<class \'tensorflow.python.training.checkpointable.util.Checkpoint\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.AutoCheckpointable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.util.Checkpoint\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "save_counter"
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index 3cbea41dcab4f8453ef2598fbfd4f8bee65c9b65..b81ce692bbdb28f7c039c32af7f803423d7e86c1 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -15,11 +15,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
 py_test(
     name = "api_compatibility_test",
-    srcs = [
-        "api_compatibility_test.py",
-        "//tensorflow:tf_python_api_gen_v2",
-    ],
-    args = ["--only_test_core_api=true"],
+    srcs = ["api_compatibility_test.py"],
     data = [
         "//tensorflow/tools/api/golden:api_golden_v1",
         "//tensorflow/tools/api/golden:api_golden_v2",
@@ -27,6 +23,7 @@ py_test(
         "//tensorflow/tools/api/tests:README.txt",
     ],
     srcs_version = "PY2AND3",
+    tags = ["no_rocm"],
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
@@ -35,6 +32,7 @@ py_test(
         "//tensorflow/tools/api/lib:python_object_to_proto_visitor",
         "//tensorflow/tools/common:public_api",
         "//tensorflow/tools/common:traverse",
+        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index 9350ebba5789109c055c014105f98a14c268856d..fe52a2bde7df16c4ae2ad911fa89bf82890c42a6 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -32,8 +32,8 @@ import os
 import re
 import sys
 
+import six
 import tensorflow as tf
-from tensorflow._api.v2 import v2 as tf_v2
 
 from google.protobuf import message
 from google.protobuf import text_format
@@ -78,6 +78,13 @@ _UPDATE_WARNING_FILE = 'tensorflow/tools/api/tests/API_UPDATE_WARNING.txt'
 _NON_CORE_PACKAGES = ['estimator']
 
 
+# TODO(annarev): remove this once we test with newer version of
+# estimator that actually has compat v1 version.
+if not hasattr(tf.compat.v1, 'estimator'):
+  tf.compat.v1.estimator = tf.estimator
+  tf.compat.v2.estimator = tf.estimator
+
+
 def _KeyToFilePath(key, api_version):
   """From a given key, construct a filepath.
 
@@ -135,6 +142,29 @@ def _FilterNonCoreGoldenFiles(golden_file_list):
   return filtered_file_list
 
 
+def _FilterGoldenProtoDict(golden_proto_dict, omit_golden_symbols_map):
+  """Filter out golden proto dict symbols that should be omitted."""
+  if not omit_golden_symbols_map:
+    return golden_proto_dict
+  filtered_proto_dict = dict(golden_proto_dict)
+  for key, symbol_list in six.iteritems(omit_golden_symbols_map):
+    api_object = api_objects_pb2.TFAPIObject()
+    api_object.CopyFrom(filtered_proto_dict[key])
+    filtered_proto_dict[key] = api_object
+    module_or_class = None
+    if api_object.HasField('tf_module'):
+      module_or_class = api_object.tf_module
+    elif api_object.HasField('tf_class'):
+      module_or_class = api_object.tf_class
+    if module_or_class is not None:
+      for members in (module_or_class.member, module_or_class.member_method):
+        filtered_members = [m for m in members if m.name not in symbol_list]
+        # Two steps because protobuf repeated fields disallow slice assignment.
+        del members[:]
+        members.extend(filtered_members)
+  return filtered_proto_dict
+
+
 class ApiCompatibilityTest(test.TestCase):
 
   def __init__(self, *args, **kwargs):
@@ -256,7 +286,7 @@ class ApiCompatibilityTest(test.TestCase):
     visitor.do_not_descend_map['tf'].append('contrib')
     if FLAGS.only_test_core_api:
       visitor.do_not_descend_map['tf'].extend(_NON_CORE_PACKAGES)
-    traverse.traverse(tf_v2.compat.v1, visitor)
+    traverse.traverse(tf.compat.v1, visitor)
 
   def testNoSubclassOfMessageV2(self):
     if not hasattr(tf.compat, 'v2'):
@@ -271,7 +301,8 @@ class ApiCompatibilityTest(test.TestCase):
                                    root,
                                    golden_file_pattern,
                                    api_version,
-                                   additional_private_map=None):
+                                   additional_private_map=None,
+                                   omit_golden_symbols_map=None):
     # Extract all API stuff.
     visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
 
@@ -304,6 +335,8 @@ class ApiCompatibilityTest(test.TestCase):
         _FileNameToKey(filename): _ReadFileToProto(filename)
         for filename in golden_file_list
     }
+    golden_proto_dict = _FilterGoldenProtoDict(golden_proto_dict,
+                                               omit_golden_symbols_map)
 
     # Diff them. Do not fail if called with update.
     # If the test is run to update goldens, only report diffs but do not fail.
@@ -316,7 +349,7 @@ class ApiCompatibilityTest(test.TestCase):
 
   @test_util.run_v1_only('b/120545219')
   def testAPIBackwardsCompatibility(self):
-    api_version = 1
+    api_version = 2 if '_api.v2' in tf.__name__ else 1
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
         _KeyToFilePath('*', api_version))
@@ -339,7 +372,7 @@ class ApiCompatibilityTest(test.TestCase):
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
         _KeyToFilePath('*', api_version))
-    self._checkBackwardsCompatibility(tf_v2.compat.v1, golden_file_pattern,
+    self._checkBackwardsCompatibility(tf.compat.v1, golden_file_pattern,
                                       api_version)
 
   def testAPIBackwardsCompatibilityV2(self):
@@ -347,11 +380,17 @@ class ApiCompatibilityTest(test.TestCase):
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
         _KeyToFilePath('*', api_version))
+    omit_golden_symbols_map = {}
+    if FLAGS.only_test_core_api:
+      # In TF 2.0 these summary symbols are imported from TensorBoard.
+      omit_golden_symbols_map['tensorflow.summary'] = [
+          'audio', 'histogram', 'image', 'scalar', 'text']
     self._checkBackwardsCompatibility(
-        tf_v2,
+        tf.compat.v2,
         golden_file_pattern,
         api_version,
-        additional_private_map={'tf.compat': ['v1', 'v2']})
+        additional_private_map={'tf.compat': ['v1', 'v2']},
+        omit_golden_symbols_map=omit_golden_symbols_map)
 
 
 if __name__ == '__main__':
@@ -363,7 +402,7 @@ if __name__ == '__main__':
   parser.add_argument(
       '--only_test_core_api',
       type=bool,
-      default=False,
+      default=True,  # only_test_core_api default value
       help=_ONLY_TEST_CORE_API_HELP)
   parser.add_argument(
       '--verbose_diffs', type=bool, default=True, help=_VERBOSE_DIFFS_HELP)
diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index de93b12b97081feea5be96edf3b6e6dfbe5599b4..e5187ab8727b2af1853972417c2fb1b890b59a4a 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -254,6 +254,7 @@ Status InitializeSession(int num_threads, const string& graph,
   tensorflow::ConfigProto& config = options.config;
   if (num_threads > 0) {
     config.set_intra_op_parallelism_threads(num_threads);
+    config.set_inter_op_parallelism_threads(num_threads);
   }
   LOG(INFO) << "Got config, " << config.device_count_size() << " devices";
 
diff --git a/tensorflow/tools/ci_build/Dockerfile.rocm b/tensorflow/tools/ci_build/Dockerfile.rocm
index aadaa8bac11cc80d1af11905d88116c8df677c2f..f190199643a14d42bb16113568d02ef2e6c85127 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rocm
+++ b/tensorflow/tools/ci_build/Dockerfile.rocm
@@ -44,9 +44,12 @@ RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteracti
   libboost-filesystem-dev \
   rpm \
   libnuma-dev \
+  pciutils \
   virtualenv \
   python-pip \
   python3-pip \
+  libxml2 \
+  libxml2-dev \
   wget && \
   apt-get clean && \
   rm -rf /var/lib/apt/lists/*
@@ -54,15 +57,12 @@ RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteracti
 # Install rocm pkgs
 RUN apt-get update --allow-insecure-repositories && \
     DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
-    rocm-dev rocm-libs rocm-utils \
+    rocm-dev rocm-libs rocm-utils rocm-cmake \
     rocfft miopen-hip miopengemm rocblas hipblas rocrand \
     rocm-profiler cxlactivitylogger && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN cd ~ && git clone https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP.git
-RUN cd ~/HIP && mkdir -p build && cd build && cmake .. && make package -j && dpkg -i *.deb
-
 ENV HCC_HOME=$ROCM_PATH/hcc
 ENV HIP_PATH=$ROCM_PATH/hip
 ENV OPENCL_ROOT=$ROCM_PATH/opencl
@@ -71,12 +71,7 @@ ENV PATH="$ROCM_PATH/bin:${PATH}"
 ENV PATH="$OPENCL_ROOT/bin:${PATH}"
 
 # Add target file to help determine which device(s) to build for
-RUN echo -e "gfx803\ngfx900" >> /opt/rocm/bin/target.lst
-
-# Setup environment variables, and add those environment variables at the end of ~/.bashrc 
-ARG HCC_HOME=/opt/rocm/hcc
-ARG HIP_PATH=/opt/rocm/hip
-ARG PATH=$HCC_HOME/bin:$HIP_PATH/bin:$PATH
+RUN bash -c 'echo -e "gfx803\ngfx900\ngfx906" >> /opt/rocm/bin/target.lst'
 
 # Copy and run the install scripts.
 COPY install/*.sh /install/
diff --git a/tensorflow/tools/ci_build/builds/pip_new.sh b/tensorflow/tools/ci_build/builds/pip_new.sh
new file mode 100755
index 0000000000000000000000000000000000000000..cff1fe66579dae380bc7023d2010b3a53d50ada3
--- /dev/null
+++ b/tensorflow/tools/ci_build/builds/pip_new.sh
@@ -0,0 +1,673 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Build the Python PIP installation package for TensorFlow and install
+# the package.
+#
+# Usage:
+#   pip_new.sh
+#
+# Required step(s):
+#   Run configure.py prior to running this script.
+#
+# Required environment variable(s):
+#   CONTAINER_TYPE:      (CPU | GPU)
+#   OS_TYPE:             (UBUNTU | MACOS)
+#   TF_PYTHON_VERSION:   (python2 | python2.7 | python3.5 | python3.7)
+#
+# Optional environment variables. If provided, overwrites any default values.
+#   TF_BUILD_FLAGS:      Bazel build flags.
+#                          e.g. TF_BUILD_FLAGS="--config=opt"
+#   TF_TEST_FLAGS:       Bazel test flags.
+#                          e.g. TF_TEST_FLAGS="--verbose_failures=true \
+#                               --build_tests_only --test_output=errors"
+#   TF_TEST_FILTER_TAGS: Filtering tags for bazel tests. More specifically,
+#                        input tags for `--test_filter_tags` flag.
+#                          e.g. TF_TEST_FILTER_TAGS="no_pip,-nomac,no_oss"
+#   TF_TEST_TARGETS:     Bazel test targets.
+#                          e.g. TF_TEST_TARGETS="//tensorflow/contrib/... \
+#                               //tensorflow/... \
+#                               //tensorflow/python/..."
+#   TF_PIP_TESTS:        PIP tests to run. If NOT specified, skips all tests.
+#                          e.g. TF_PIP_TESTS="test_pip_virtualenv_clean \
+#                               test_pip_virtualenv_clean \
+#                               test_pip_virtualenv_oss_serial"
+#   IS_NIGHTLY:          Nightly run flag.
+#                          e.g. IS_NIGHTLY=1  # nightly runs
+#                          e.g. IS_NIGHTLY=0  # non-nightly runs
+#   TF_PROJECT_NAME:     Name of the project. This string will be pass onto
+#                        the wheel file name. For nightly builds, it will be
+#                        overwritten to 'tf_nightly'. For gpu builds, '_gpu'
+#                        will be appended.
+#                          e.g. TF_PROJECT_NAME="tensorflow"
+#                          e.g. TF_PROJECT_NAME="tf_nightly_gpu"
+#   TF_PIP_TEST_ROOT:    Root directory for building and testing pip pkgs.
+#                          e.g. TF_PIP_TEST_ROOT="pip_test"
+#
+# To-be-deprecated variable(s).
+#   GIT_TAG_OVERRIDE:    Values for `--git_tag_override`. This flag gets passed
+#                        in as `--action_env` for bazel build and tests.
+#   TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES
+#                        Additonal pip packages to be installed.
+#                        Caveat: pip version needs to be checked prior.
+
+# set bash options
+set -e
+set -x
+
+###########################################################################
+# General helper function(s)
+###########################################################################
+
+# Strip leading and trailing whitespaces
+str_strip () {
+  echo -e "$1" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
+}
+
+# Convert string to all lower case
+lowercase() {
+  if [[ -z "${1}" ]]; then
+    die "Nothing to convert to lowercase. No argument given."
+  fi
+  echo "${1}" | tr '[:upper:]' '[:lower:]'
+}
+
+check_global_vars() {
+  # Check container type
+  if ! [[ ${CONTAINER_TYPE} == "cpu" ]] && \
+     ! [[ ${CONTAINER_TYPE} == "rocm" ]] && \
+     ! [[ ${CONTAINER_TYPE} == "gpu" ]]; then
+    die "Error: Provided CONTAINER_TYPE \"${CONTAINER_TYPE}\" "\
+        "is not supported."
+  fi
+  # Check OS type
+  if ! [[ ${OS_TYPE} == "ubuntu" ]] && \
+     ! [[ ${OS_TYPE} == "macos" ]]; then
+    die"Error: Provided OS_TYPE \"${OS_TYPE}\" is not supported."
+  fi
+}
+
+add_test_filter_tag() {
+  EMPTY=""
+  while true; do
+    FILTER="${1:$EMPTY}"
+    if ! [[ $BAZEL_TEST_FILTER_TAGS == *"${FILTER}"* ]]; then
+      BAZEL_TEST_FILTER_TAGS="${FILTER},${BAZEL_TEST_FILTER_TAGS}"
+    fi
+    shift
+    if [[ -z "${1}" ]]; then
+      break
+    fi
+  done
+}
+
+remove_test_filter_tag() {
+  EMPTY=""
+  while true; do
+    FILTER="${1:$EMPTY}"
+    BAZEL_TEST_FILTER_TAGS="$(echo ${BAZEL_TEST_FILTER_TAGS} | sed -e 's/^'${FILTER}',//g' -e 's/,'${FILTER}'//g')"
+    shift
+    if [[ -z "${1}" ]]; then
+      break
+    fi
+  done
+}
+
+# Clean up bazel build & test flags with proper configuration.
+update_bazel_flags() {
+  # Add git tag override flag if necessary.
+  GIT_TAG_STR=" --action_env=GIT_TAG_OVERRIDE"
+  if [[ -z "${GIT_TAG_OVERRIDE}" ]] && \
+    ! [[ ${BAZEL_BUILD_FLAGS} = *${GIT_TAG_STR}* ]]; then
+    BAZEL_BUILD_FLAGS+="${GIT_TAG_STR}"
+  fi
+  # Clean up whitespaces
+  BAZEL_BUILD_FLAGS=$(str_strip "${BAZEL_BUILD_FLAGS}")
+  BAZEL_TEST_FLAGS=$(str_strp "${BAZEL_TEST_FLAGS}")
+  # Cleaned bazel flags
+  echo "Bazel build flags (cleaned):\n" "${BAZEL_BUILD_FLAGS}"
+  echo "Bazel test flags (cleaned):\n" "${BAZEL_TEST_FLAGS}"
+}
+
+update_test_filter_tags() {
+  # Add test filter tags
+  # This script is for validating built PIP packages. Add pip tags.
+  add_test_filter_tag -no_pip -nopip
+  # MacOS filter tags
+  if [[ ${OS_TYPE} == "macos" ]]; then
+    remove_test_filter_tag nomac no_mac
+    add_test_filter_tag -nomac -no_mac
+  fi
+  echo "Final test filter tags: ${BAZEL_TEST_FILTER_TAGS}"
+}
+
+# Check currently running python and pip version
+check_python_pip_version() {
+  # Check if only the major version of python is provided by the user.
+  MAJOR_VER_ONLY=0
+  if [[ ${#PYTHON_VER} -lt 9 ]]; then
+    # User only provided major version (e.g. 'python2' instead of 'python2.7')
+    MAJOR_VER_ONLY=1
+  fi
+
+  # Retrieve only the version number of the user requested python.
+  PYTHON_VER_REQUESTED=${PYTHON_VER:6:3}
+  echo "PYTHON_VER_REQUESTED: ${PYTHON_VER_REQUESTED}"
+
+  # Retrieve only the version numbers of the python & pip in use currently.
+  PYTHON_VER_IN_USE=$(python --version 2>&1)
+  PYTHON_VER_IN_USE=${PYTHON_VER_IN_USE:7:3}
+  PIP_VER_IN_USE=$(pip --version)
+  PIP_VER_IN_USE=${PIP_VER_IN_USE:${#PIP_VER_IN_USE}-4:3}
+
+  # If only major versions are applied, drop minor versions.
+  if [[ $MAJOR_VER_ONLY == 1 ]]; then
+    PYTHON_VER_IN_USE=${PYTHON_VER_IN_USE:0:1}
+    PIP_VER_IN_USE=${PIP_VER_IN_USE:0:1}
+  fi
+
+  # Check if all versions match.
+  echo -e "User requested python version: '${PYTHON_VER_REQUESTED}'\n" \
+    "Detected python version in use: '${PYTHON_VER_IN_USE}'\n"\
+    "Detected pip version in use: '${PIP_VER_IN_USE}'"
+  if ! [[ $PYTHON_VER_REQUESTED == $PYTHON_VER_IN_USE ]]; then
+    die "Error: Mismatch in python versions detected."
+  else:
+    echo "Python and PIP versions in use match the requested."
+  fi
+}
+
+###########################################################################
+# Setup: directories, local/global variables
+###########################################################################
+
+# Script directory and source necessary files.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/builds_common.sh"
+
+# Required global variables
+# Checks on values for these vars are done in "Build TF PIP Package" section.
+CONTAINER_TYPE=$(lowercase "${CONTAINER_TYPE}")
+OS_TYPE=$(lowercase "${OS_TYPE}")
+PYTHON_VER=$(lowercase "${TF_PYTHON_VERSION}")
+
+# Python bin path
+if [[ -z "$PYTHON_BIN_PATH" ]]; then
+  die "Error: PYTHON_BIN_PATH was not provided. Did you run configure?"
+fi
+# Get python version for configuring pip later in installation.
+PYTHON_VER_CFG=$(${PYTHON_BIN_PATH} -V 2>&1 | awk '{print $NF}' | cut -d. -f-2)
+echo "PYTHON_BIN_PATH: ${PYTHON_BIN_PATH} (version: ${PYTHON_VER_CFG})"
+
+# Default values for optional global variables in case they are not user
+# defined.
+DEFAULT_BAZEL_BUILD_FLAGS='--config=opt'
+DEFAULT_BAZEL_TEST_FLAGS='--test_output=errors --verbose_failures=true'
+DEFAULT_BAZEL_TEST_FILTERS='-no_oss,-oss_serial'
+DEFAULT_BAZEL_TEST_TARGETS='//tensorflow/python/... -//tensorflow/core/... -//tensorflow/compiler/... '
+DEFAULT_PIP_TESTS="" # Do not run any tests by default
+DEFAULT_IS_NIGHTLY=0 # Not nightly by default
+DEFAULT_PROJECT_NAME="tensorflow"
+DEFAULT_PIP_TEST_ROOT="pip_test"
+
+# Take in optional global variables
+BAZEL_BUILD_FLAGS=${TF_BUILD_FLAGS:-$DEFAULT_BAZEL_BUILD_FLAGS}
+BAZEL_TEST_FLAGS=${TF_TEST_FLAGS:-$DEFAULT_BAZEL_TEST_FLAGS}
+BAZEL_TEST_TARGETS=${TF_TEST_TARGETS:-$DEFAULT_BAZEL_TEST_TARGETS}
+BAZEL_TEST_FILTER_TAGS=${TF_TEST_FILTER_TAGS:-$DEFAULT_BAZEL_TEST_FILTERS}
+PIP_TESTS=${TF_PIP_TESTS:-$DEFAULT_PIP_TESTS}
+IS_NIGHTLY=${IS_NIGHTLY:-$DEFAULT_IS_NIGHTLY}
+PROJECT_NAME=${TF_PROJECT_NAME:-$DEFAULT_PROJECT_NAME}
+PIP_TEST_ROOT=${TF_PIP_TEST_ROOT:-$DEFAULT_PIP_TEST_ROOT}
+
+# Local variables
+PIP_WHL_DIR="${KOKORO_ARTIFACTS_DIR}/tensorflow/${PIP_TEST_ROOT}/whl"
+mkdir -p "${PIP_WHL_DIR}"
+PIP_WHL_DIR=$(realpath "${PIP_WHL_DIR}") # Get absolute path
+WHL_PATH=""
+# Determine the major.minor versions of python being used (e.g., 2.7).
+# Useful for determining the directory of the local pip installation.
+PY_MAJOR_MINOR_VER=$(${PYTHON_BIN_PATH} -V 2>&1 | awk '{print $NF}' | cut -d. -f-2)
+if [[ -z "${PY_MAJOR_MINOR_VER}" ]]; then
+  die "ERROR: Unable to determine the major.minor version of Python."
+fi
+echo "Python binary path to be used in PIP install: ${PYTHON_BIN_PATH} "\
+"(Major.Minor version: ${PY_MAJOR_MINOR_VER})"
+PYTHON_BIN_PATH_INIT=${PYTHON_BIN_PATH}
+PIP_BIN_PATH="$(which pip${PYTHON_VER_CFG})"
+PIP_BIN_PATH_INIT=${PIP_BIN_PATH}
+
+# PIP packages
+INSTALL_EXTRA_PIP_PACKAGES=${TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES}
+
+###########################################################################
+# Build TF PIP Package
+###########################################################################
+
+# First remove any already existing binaries for a clean start and test.
+if [[ -d ${PIP_TEST_ROOT} ]]; then
+  echo "Test root directory ${PIP_TEST_ROOT} already exists. Deleting it."
+  sudo rm -rf ${PIP_TEST_ROOT}
+fi
+
+# Check that global variables are properly set.
+check_global_vars
+
+# Check if in a virtualenv and exit if yes.
+IN_VENV=$(python -c 'import sys; print("1" if hasattr(sys, "real_prefix") else "0")')
+if [[ "$IN_VENV" == "1" ]]; then
+  echo "It appears that we are already in a virtualenv. Deactivating..."
+  deactivate || source deactivate || die "FAILED: Unable to deactivate from existing virtualenv."
+fi
+
+# Configure python. Obtain the path to python binary.
+source tools/python_bin_path.sh
+# Assume PYTHON_BIN_PATH is exported by the script above.
+if [[ -z "$PYTHON_BIN_PATH" ]]; then
+  die "PYTHON_BIN_PATH was not provided. Did you run configure?"
+fi
+
+# Bazel build the file.
+PIP_BUILD_TARGET="//tensorflow/tools/pip_package:build_pip_package"
+# Clean bazel cache.
+bazel clean
+# Clean up and update bazel flags
+update_bazel_flags
+# Build. This outputs the file `build_pip_package`.
+bazel build ${BAZEL_BUILD_FLAGS} ${PIP_BUILD_TARGET} || \
+  die "Error: Bazel build failed for target: '${PIP_BUILD_TARGET}'"
+
+###########################################################################
+# Test function(s)
+###########################################################################
+
+test_pip_virtualenv_clean() {
+  # Create a clean directory.
+  CLEAN_VENV_DIR="${PIP_TEST_ROOT}/venv_clean"
+
+  # activate virtual environment and install tensorflow with PIP.
+  create_activate_virtualenv --clean "${CLEAN_VENV_DIR}"
+  # Install TF with pip
+  install_tensorflow_pip "${WHL_PATH}"
+
+  # cd to a temporary directory to avoid picking up Python files in the source
+  # tree.
+  TMP_DIR=$(mktemp -d)
+  pushd "${TMP_DIR}"
+
+  # Run a quick check on tensorflow installation.
+  RET_VAL=$(python -c "import tensorflow as tf; t1=tf.constant([1,2,3,4]); t2=tf.constant([5,6,7,8]); print(tf.add(t1,t2))")
+
+  # Deactivate virtualenv.
+  deactivate || source deactivate || die "FAILED: Unable to deactivate from existing virtualenv."
+
+  # Return to original directory. Remove temp dirs.
+  popd
+  sudo rm -rf "${TMP_DIR}" "${CLEAN_VENV_DIR}"
+
+  # Check result to see if tensorflow is properly installed.
+  if [[ ${RET_VAL} == *'Tensor("Add:0", shape=(4,), dtype=int32)'* ]]; then
+    echo "PIP test on clean virtualenv PASSED."
+    return 0
+  else
+    echo "PIP test on clean virtualenv FAILED."
+    return 1
+  fi
+}
+
+test_pip_virtualenv_non_clean() {
+  # Create virtualenv directory for install test
+  VENV_DIR="${PIP_TEST_ROOT}/venv"
+
+  # Activate virtualenv
+  create_activate_virtualenv "${VENV_DIR}"
+  # Install TF with pip
+  install_tensorflow_pip "${WHL_PATH}"
+
+  # cd to a temporary directory to avoid picking up Python files in the source
+  # tree.
+  TMP_DIR=$(mktemp -d)
+  pushd "${TMP_DIR}"
+
+  # Run a quick check on tensorflow installation.
+  RET_VAL=$(python -c "import tensorflow as tf; t1=tf.constant([1,2,3,4]); t2=tf.constant([5,6,7,8]); print(tf.add(t1,t2))")
+
+  # Return to original directory. Remove temp dirs.
+  popd
+  sudo rm -rf "${TMP_DIR}"
+
+  # Check result to see if tensorflow is properly installed.
+  if ! [[ ${RET_VAL} == *'Tensor("Add:0", shape=(4,), dtype=int32)'* ]]; then
+    echo "PIP test on virtualenv (non-clean) FAILED"
+    return 1
+  fi
+
+  # Install extra pip packages, if specified.
+  for PACKAGE in ${INSTALL_EXTRA_PIP_PACKAGES}; do
+    echo "Installing extra pip package required by test-on-install: ${PACKAGE}"
+
+    ${PIP_BIN_PATH} install ${PACKAGE}
+    if [[ $? != 0 ]]; then
+      echo "${PIP_BIN_PATH} install ${PACKAGE} FAILED."
+      deactivate || source deactivate || die "FAILED: Unable to deactivate from existing virtualenv."
+      return 1
+    fi
+  done
+
+  # Run bazel test.
+  run_test_with_bazel
+  RESULT=$?
+
+  # Deactivate from virtualenv.
+  deactivate || source deactivate || die "FAILED: Unable to deactivate from existing virtualenv."
+  sudo rm -rf "${VENV_DIR}"
+
+  if [[ $RESULT -ne 0 ]]; then
+    echo "PIP test on virtualenv (non-clean) FAILED."
+    return 1
+  else
+    echo "PIP test on virtualenv (non-clean) PASSED."
+    return 0
+  fi
+}
+
+test_pip_virtualenv_oss_serial() {
+  # Create virtualenv directory
+  VENV_DIR="${PIP_TEST_ROOT}/venv"
+
+  create_activate_virtualenv "${VENV_DIR}"
+  run_test_with_bazel --oss_serial
+  RESULT=$?
+
+  # deactivate virtualenv
+  deactivate || source deactivate || die "FAILED: Unable to deactivate from existing virtualenv."
+
+  if [[ ${RESULT} -ne 0 ]]; then
+    echo "PIP test on virtualenv (oss-serial) FAILED."
+    return 1
+  else
+    echo "PIP test on virtualenv (oss-serial) PASSED."
+    return 0
+  fi
+}
+
+###########################################################################
+# Test helper function(s)
+###########################################################################
+
+create_activate_virtualenv() {
+  VIRTUALENV_FLAGS="--system-site-packages"
+  if [[ "${1}" == "--clean" ]]; then
+    shift
+  fi
+
+  VIRTUALENV_DIR="${1}"
+  if [[ -d "${VIRTUALENV_DIR}" ]]; then
+    if sudo rm -rf "${VIRTUALENV_DIR}"
+    then
+      echo "Removed existing virtualenv directory: ${VIRTUALENV_DIR}"
+    else
+      die "Failed to remove existing virtualenv directory: ${VIRTUALENV_DIR}"
+    fi
+  fi
+
+  if mkdir -p "${VIRTUALENV_DIR}"
+  then
+    echo "Created virtualenv directory: ${VIRTUALENV_DIR}"
+  else
+    die "FAILED to create virtualenv directory: ${VIRTUALENV_DIR}"
+  fi
+
+  # Use the virtualenv from the default python version (i.e., python-virtualenv)
+  # to create the virtualenv directory for testing. Use the -p flag to specify
+  # the python version inside the to-be-created virtualenv directory.
+  ${PYTHON_BIN_PATH_INIT} -m virtualenv -p ${PYTHON_BIN_PATH_INIT} ${VIRTUALENV_FLAGS} ${VIRTUALENV_DIR} || \
+    die "FAILED: Unable to create virtualenv"
+
+  source "${VIRTUALENV_DIR}/bin/activate" || \
+    die "FAILED: Unable to activate virtualenv in ${VIRTUALENV_DIR}"
+
+  # Update .tf_configure.bazelrc with venv python path for bazel test.
+  PYTHON_BIN_PATH="$(which python)"
+  yes "" | ./configure
+}
+
+install_tensorflow_pip() {
+  if [[ -z "${1}" ]]; then
+    die "Please provide a proper wheel file path."
+  fi
+
+  # Set path to pip.
+  PIP_BIN_PATH="$(which pip${PYTHON_VER_CFG})"
+
+  # Print python and pip bin paths
+  echo "PYTHON_BIN_PATH to be used to install the .whl: ${PYTHON_BIN_PATH}"
+  echo "PIP_BIN_PATH to be used to install the .whl: ${PIP_BIN_PATH}"
+
+  # Upgrade pip so it supports tags such as cp27mu, manylinux1 etc.
+  echo "Upgrade pip in virtualenv"
+
+  # NOTE: pip install --upgrade pip leads to a documented TLS issue for
+  # some versions in python
+  curl https://bootstrap.pypa.io/get-pip.py | ${PYTHON_BIN_PATH} || \
+    die "Error: pip install (get-pip.py) FAILED"
+
+  # Check that requested python version matches configured one.
+  check_python_pip_version
+
+  # Force upgrade of setuptools. This must happen before the pip install of the
+  # WHL_PATH, which pulls in absl-py, which uses install_requires notation
+  # introduced in setuptools >=20.5. The default version of setuptools is 5.5.1,
+  # which is too old for absl-py.
+  ${PIP_BIN_PATH} install --upgrade setuptools==39.1.0 || \
+    die "Error: setuptools install, upgrade FAILED"
+
+  # Force tensorflow reinstallation. Otherwise it may not get installed from
+  # last build if it had the same version number as previous build.
+  PIP_FLAGS="--upgrade --force-reinstall"
+  ${PIP_BIN_PATH} install -v ${PIP_FLAGS} ${WHL_PATH} || \
+    die "pip install (forcing to reinstall tensorflow) FAILED"
+  echo "Successfully installed pip package ${WHL_PATH}"
+
+  # Force downgrade of setuptools. This must happen after the pip install of the
+  # WHL_PATH, which ends up upgrading to the latest version of setuptools.
+  # Versions of setuptools >= 39.1.0 will cause tests to fail like this:
+  #   ImportError: cannot import name py31compat
+  ${PIP_BIN_PATH} install --upgrade setuptools==39.1.0 || \
+    die "Error: setuptools install, upgrade FAILED"
+}
+
+run_test_with_bazel() {
+  IS_OSS_SERIAL=0
+  if [[ "${1}" == "--oss_serial" ]]; then
+    IS_OSS_SERIAL=1
+  fi
+  TF_GPU_COUNT=${TF_GPU_COUNT:-4}
+
+  # PIP tests should have a "different" path. Different than the one we place
+  # virtualenv, because we are deleting and recreating it here.
+  PIP_TEST_PREFIX=bazel_pip
+  TEST_ROOT=$(pwd)/${PIP_TEST_PREFIX}
+  sudo rm -rf $TEST_ROOT
+  mkdir -p $TEST_ROOT
+  ln -s $(pwd)/tensorflow $TEST_ROOT/tensorflow
+
+  if [[ "${IS_OSS_SERIAL}" == "1" ]]; then
+    remove_test_filter_tag -no_oss
+    add_test_filter_tag oss_serial
+  else
+    add_test_filter_tag -oss_serial
+  fi
+
+  # Clean the bazel cache
+  bazel clean
+  # Clean up flags before running bazel commands
+  update_bazel_flags
+  # Clean up and update test filter tags
+  update_test_filter_tags
+
+  # Figure out how many concurrent tests we can run and do run the tests.
+  BAZEL_PARALLEL_TEST_FLAGS=""
+  if [[ $CONTAINER_TYPE == "gpu" ]]; then
+    # Number of test threads is the number of GPU cards available.
+    if [[ $OS_TYPE == "macos" ]]; then
+      BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=1"
+    else
+      BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=${TF_GPU_COUNT} \
+        --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute"
+    fi
+  else
+    # Number of test threads is the number of physical CPUs.
+    if [[ $OS_TYPE == "macos" ]]; then
+      BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=$(sysctl -n hw.ncpu)"
+    else
+      BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=$(grep -c ^processor /proc/cpuinfo)"
+    fi
+  fi
+
+  if [[ ${IS_OSS_SERIAL} == 1 ]]; then
+    BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=1"
+  fi
+
+  # TODO(hyey): Update test target after validation.
+  # Run the test.
+  bazel test --build_tests_only ${BAZEL_TEST_FLAGS} ${BAZEL_PARALLEL_TEST_FLAGS} --test_tag_filters=${BAZEL_TEST_FILTER_TAGS} -k -- //$PIP_TEST_PREFIX/tensorflow/python/...
+
+  unlink ${TEST_ROOT}/tensorflow
+}
+
+run_all_tests() {
+  if [[ -z "${PIP_TESTS}" ]]; then
+    echo "No test was specified to run. Skipping all tests."
+    return 0
+  fi
+  FAIL_COUNTER=0
+  PASS_COUNTER=0
+  for TEST in ${PIP_TESTS[@]}; do
+
+    # Run tests.
+    case "${TEST}" in
+    "test_pip_virtualenv_clean")
+      test_pip_virtualenv_clean
+      ;;
+    "test_pip_virtualenv_non_clean")
+      test_pip_virtualenv_non_clean
+      ;;
+    "test_pip_virtualenv_oss_serial")
+      test_pip_virtualenv_oss_serial
+      ;;
+    *)
+      die "No matching test ${TEST} was found. Stopping test."
+      ;;
+    esac
+
+    # Check and update the results.
+    RETVAL=$?
+
+    # Update results counter
+    if [ ${RETVAL} -eq 0 ]; then
+      echo "Test (${TEST}) PASSED. (PASS COUNTER: ${PASS_COUNTER})"
+      PASS_COUNTER=$(($PASS_COUNTER+1))
+    else
+      echo "Test (${TEST}) FAILED. (FAIL COUNTER: ${FAIL_COUNTER})"
+      FAIL_COUNTER=$(($FAIL_COUNTER+1))
+    fi
+  done
+  printf "${PASS_COUNTER} PASSED | ${FAIL_COUNTER} FAILED"
+  if [[ "${FAIL_COUNTER}" == "0" ]]; then
+    printf "PIP tests ${COLOR_GREEN}PASSED${COLOR_NC}\n"
+    return 0
+  else:
+    printf "PIP tests ${COLOR_RED}FAILED${COLOR_NC}\n"
+    return 1
+  fi
+}
+
+###########################################################################
+# Build TF PIP Wheel file
+###########################################################################
+
+# Update the build flags for building whl.
+# Flags: GPU, OS, tf_nightly, project name
+GPU_FLAG=""
+NIGHTLY_FLAG=""
+
+# TF Nightly flag
+if [[ "$IS_NIGHTLY" == 1 ]]; then
+  # If 'nightly' is not specified in the project name already, then add.
+  if ! [[ $PROJECT_NAME == *"nightly"* ]]; then
+    echo "WARNING: IS_NIGHTLY=${IS_NIGHTLY} but requested project name \
+    (PROJECT_NAME=${PROJECT_NAME}) does not include 'nightly' string. \
+    Renaming it to 'tf_nightly'."
+    PROJECT_NAME="tf_nightly"
+  fi
+  NIGHTLY_FLAG="--nightly_flag"
+fi
+
+# CPU / GPU flag
+if [[ ${CONTAINER_TYPE} == "gpu" ]]; then
+  GPU_FLAG="--gpu"
+  if ! [[ $PROJECT_NAME == *"gpu"* ]]; then
+    echo "WARNING: GPU is specified but requested project name (PROJECT_NAME=${PROJECT_NAME}) \
+    does not include 'gpu'. Appending '_gpu' to the project name."
+    PROJECT_NAME="${PROJECT_NAME}_gpu"
+  fi
+fi
+
+./bazel-bin/tensorflow/tools/pip_package/build_pip_package ${PIP_WHL_DIR} ${GPU_FLAG} ${NIGHTLY_FLAG} "--project_name" ${PROJECT_NAME} || die "build_pip_package FAILED"
+
+PY_MAJOR_MINOR_VER=$(echo $PY_MAJOR_MINOR_VER | tr -d '.')
+if [[ $PY_MAJOR_MINOR_VER == "2" ]]; then
+  PY_MAJOR_MINOR_VER="27"
+fi
+
+# Set wheel path and verify that there is only one .whl file in the path.
+WHL_PATH=$(ls "${PIP_WHL_DIR}"/"${PROJECT_NAME}"-*"${PY_MAJOR_MINOR_VER}"*"${PY_MAJOR_MINOR_VER}"*.whl)
+if [[ $(echo "${WHL_PATH}" | wc -w) -ne 1 ]]; then
+  echo "ERROR: Failed to find exactly one built TensorFlow .whl file in "\
+  "directory: ${PIP_WHL_DIR}"
+fi
+
+WHL_DIR=$(dirname "${WHL_PATH}")
+WHL_BASE_NAME=$(basename "${WHL_PATH}")
+AUDITED_WHL_NAME="${WHL_DIR}"/$(echo "${WHL_BASE_NAME//linux/manylinux1}")
+
+# Print the size of the wheel file.
+echo "Size of the PIP wheel file built: $(ls -l ${WHL_PATH} | awk '{print $5}')"
+
+# Run tests (if any is specified).
+run_all_tests
+
+for WHL_PATH in $(ls ${PIP_WHL_DIR}/${PROJECT_NAME}*.whl); do
+  if [[ "${TF_NEED_CUDA}" -eq "1" ]]; then
+    # Copy and rename for gpu manylinux as we do not want auditwheel to package in libcudart.so
+    WHL_PATH=${AUDITED_WHL_NAME}
+    cp "${WHL_DIR}"/"${WHL_BASE_NAME}" "${WHL_PATH}"
+    echo "Copied manylinux1 wheel file at ${WHL_PATH}"
+  else
+    # Repair the wheels for cpu manylinux1
+    echo "auditwheel repairing ${WHL_PATH}"
+    auditwheel repair -w "${WHL_DIR}" "${WHL_PATH}"
+
+    if [[ -f ${AUDITED_WHL_NAME} ]]; then
+      WHL_PATH=${AUDITED_WHL_NAME}
+      echo "Repaired manylinux1 wheel file at: ${WHL_PATH}"
+    else
+      die "WARNING: Cannot find repaired wheel."
+    fi
+  fi
+done
+
+echo "EOF: Successfully ran pip_new.sh"
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 2c348a0e3390af05cffff5d9a73d0bd57caa92b4..afb282715bc9197eb2250cb823af2a0f36a0d233 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -540,9 +540,12 @@ do_check_file_name_test() {
   python file_name_test.py
 }
 
+do_libtensorflow_framework_not_depend_on_cuda_check() {
+  bazel build --action_env=TF_NEED_CUDA=1 --define framework_shared_object=true --config=cuda --nobuild_tests_only tensorflow/core/platform/default/build_config:libtensorflow_cuda_check_deps
+}
 # Supply all sanity step commands and descriptions
-SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_check_file_name_test")
-SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Check file names for cases")
+SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_check_file_name_test" "do_libtensorflow_framework_not_depend_on_cuda_check")
+SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Check file names for cases" "Check gpu libtensorflow_framework.so does not depend on cuda shared libraries.")
 
 INCREMENTAL_FLAG=""
 DEFAULT_BAZEL_CONFIGS=""
diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py
index 40a744374564d3ad3e663de8453d4085202c4e0c..856d64eb82f48525d4851b1f2167e44c849eaa27 100755
--- a/tensorflow/tools/ci_build/copy_binary.py
+++ b/tensorflow/tools/ci_build/copy_binary.py
@@ -32,7 +32,7 @@ import shutil
 import tempfile
 import zipfile
 
-TF_NIGHTLY_REGEX = (r"(.+)tf_nightly(|_gpu)-(\d\.[\d]{1,2}"
+TF_NIGHTLY_REGEX = (r"(.+)(tf_nightly.*)-(\d\.[\d]{1,2}"
                     r"\.\d.dev[\d]{0,8})-(.+)\.whl")
 BINARY_STRING_TEMPLATE = "%s-%s-%s.whl"
 
@@ -43,7 +43,7 @@ def check_existence(filename):
     raise RuntimeError("%s not found." % filename)
 
 
-def copy_binary(directory, origin_tag, new_tag, version, gpu=False):
+def copy_binary(directory, origin_tag, new_tag, version, package):
   """Rename and copy binaries for different python versions.
 
   Arguments:
@@ -51,14 +51,10 @@ def copy_binary(directory, origin_tag, new_tag, version, gpu=False):
     origin_tag: str of the old python version tag
     new_tag: str of the new tag
     version: the version of the package
-    gpu: bool if its a gpu build or not
+    package: str, name of the package
 
   """
   print("Rename and copy binaries with %s to %s." % (origin_tag, new_tag))
-  if gpu:
-    package = "tf_nightly_gpu"
-  else:
-    package = "tf_nightly"
   origin_binary = BINARY_STRING_TEMPLATE % (package, version, origin_tag)
   new_binary = BINARY_STRING_TEMPLATE % (package, version, new_tag)
   zip_ref = zipfile.ZipFile(os.path.join(directory, origin_binary), "r")
@@ -120,7 +116,7 @@ def main():
   check_existence(args.filename)
   regex_groups = re.search(TF_NIGHTLY_REGEX, args.filename)
   directory = regex_groups.group(1)
-  gpu = regex_groups.group(2)
+  package = regex_groups.group(2)
   version = regex_groups.group(3)
   origin_tag = regex_groups.group(4)
   old_py_ver = re.search(r"(cp\d\d)", origin_tag).group(1)
@@ -129,7 +125,7 @@ def main():
   new_tag = origin_tag.replace(old_py_ver, "cp" + args.new_py_ver)
 
   # Copy the binary with the info we have
-  copy_binary(directory, origin_tag, new_tag, version, gpu)
+  copy_binary(directory, origin_tag, new_tag, version, package)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 3878452b695069c7fc6881c1825c06a74fe038e3..131950dc0d3e2499ffe77ea424db83c68e713130 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -40,8 +40,8 @@ pip2 install virtualenv
 pip3 install virtualenv
 
 # Install six.
-pip2 install --upgrade six==1.10.0
-pip3 install --upgrade six==1.10.0
+pip2 install --upgrade six==1.12.0
+pip3 install --upgrade six==1.12.0
 
 # Install absl-py.
 pip2 install --upgrade absl-py
@@ -97,9 +97,9 @@ pip3 install py-cpuinfo
 pip2 install pylint==1.6.4
 pip3 install pylint==1.6.4
 
-# pep8 tests require the following:
-pip2 install pep8
-pip3 install pep8
+# pycodestyle tests require the following:
+pip2 install pycodestyle
+pip3 install pycodestyle
 
 # tf.mock require the following for python2:
 pip2 install mock
diff --git a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
index 1d0b838c1b5c101e202fcb3cb64c3ce4a9bd34d6..1944183c0e8124c2ed4e572ac4a63f1f82f5c380 100755
--- a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
@@ -19,9 +19,10 @@ set -e
 set -x
 
 N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+N_GPUS=$(lspci|grep 'VGA'|grep 'AMD/ATI'|wc -l)
 
 echo ""
-echo "Bazel will use ${N_JOBS} concurrent job(s)."
+echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
 echo ""
 
 # Run configure.
@@ -29,11 +30,14 @@ export PYTHON_BIN_PATH=`which python3`
 export CC_OPT_FLAGS='-mavx'
 
 export TF_NEED_ROCM=1
+export TF_GPU_COUNT=${N_GPUS}
 
 yes "" | $PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
-bazel test --config=rocm --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
-    --test_lang_filters=py --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
-    --build_tests_only --test_output=errors --local_test_jobs=1 --config=opt \
+bazel test --config=rocm --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test -k \
+    --test_lang_filters=py --jobs=${N_JOBS} --test_timeout 600,900,2400,7200 \
+    --build_tests_only --test_output=errors --local_test_jobs=${TF_GPU_COUNT} --config=opt \
+    --test_sharding_strategy=disabled \
+    --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 9741fba1c8016b89fa17384faee0fc8738c72307..a938928baab9e010a5f2d7b8c209146fdc424932 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -60,6 +60,7 @@ RELEASE_BUILD=0
 TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
 PROJECT_NAME=""
 EXTRA_BUILD_FLAGS=""
+EXTRA_TEST_FLAGS=""
 
 # --skip_test            Skip running tests
 # --enable_remote_cache  Add options to enable remote cache for build and test
@@ -89,6 +90,13 @@ while [[ $# -gt 0 ]]; do
       fi
       PROJECT_NAME="$1"
       ;;
+    --extra_test_flags)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      EXTRA_TEST_FLAGS="$1"
+      ;;
     *)
   esac
   shift
@@ -156,6 +164,7 @@ N_JOBS="${NUMBER_OF_PROCESSORS}"
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
 bazel test --announce_rc --config=opt -k --test_output=errors \
+  ${EXTRA_TEST_FLAGS} \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss,-gpu \
   --build_tag_filters=-no_pip,-no_windows,-no_oss,-gpu --build_tests_only \
diff --git a/tensorflow/tools/compatibility/ast_edits.py b/tensorflow/tools/compatibility/ast_edits.py
index ed388b83eb6e6ae4979e480e6a5bfe4a2f4fcf28..3d421f6704ffd26e1c5c34787aa0c29213e53cac 100644
--- a/tensorflow/tools/compatibility/ast_edits.py
+++ b/tensorflow/tools/compatibility/ast_edits.py
@@ -286,7 +286,7 @@ class _PastaEditVisitor(ast.NodeVisitor):
     arg_warnings = self._get_applicable_dict("function_arg_warnings",
                                              full_name, name)
 
-    for (kwarg, arg), (level, warning) in arg_warnings.items():
+    for (kwarg, arg), (level, warning) in sorted(arg_warnings.items()):
       present, _ = get_arg_value(node, kwarg, arg)
       if present:
         warned = True
@@ -525,7 +525,7 @@ class ASTCodeUpgrader(object):
     text += "Processing file %r\n outputting to %r\n" % (in_filename,
                                                          out_filename)
     text += "-" * 80 + "\n\n"
-    text += "\n".join(log)
+    text += "\n".join(log) + "\n"
     text += "-" * 80 + "\n\n"
     return text
 
@@ -568,7 +568,7 @@ class ASTCodeUpgrader(object):
       in_place: Allow the conversion of an entire directory in place.
 
     Returns:
-      A tuple of files processed, the report string ofr all files, and a dict
+      A tuple of files processed, the report string for all files, and a dict
         mapping filenames to errors encountered in that file.
     """
 
diff --git a/tensorflow/tools/compatibility/renames_v2.py b/tensorflow/tools/compatibility/renames_v2.py
index 3e147d547850861135da7059a7905fca7c1d5832..67425bba6c463338527ac99907ffd12927b8ac70 100644
--- a/tensorflow/tools/compatibility/renames_v2.py
+++ b/tensorflow/tools/compatibility/renames_v2.py
@@ -150,6 +150,7 @@ renames = {
     'tf.disable_v2_batch_normalization': 'tf.compat.v1.disable_v2_batch_normalization',
     'tf.disable_v2_behavior': 'tf.compat.v1.disable_v2_behavior',
     'tf.disable_v2_tensorshape': 'tf.compat.v1.disable_v2_tensorshape',
+    'tf.distribute.get_loss_reduction': 'tf.compat.v1.distribute.get_loss_reduction',
     'tf.distributions.Bernoulli': 'tf.compat.v1.distributions.Bernoulli',
     'tf.distributions.Beta': 'tf.compat.v1.distributions.Beta',
     'tf.distributions.Categorical': 'tf.compat.v1.distributions.Categorical',
@@ -169,6 +170,7 @@ renames = {
     'tf.distributions.Uniform': 'tf.compat.v1.distributions.Uniform',
     'tf.distributions.kl_divergence': 'tf.compat.v1.distributions.kl_divergence',
     'tf.div': 'tf.compat.v1.div',
+    'tf.div_no_nan': 'tf.math.divide_no_nan',
     'tf.dtypes.as_string': 'tf.strings.as_string',
     'tf.enable_eager_execution': 'tf.compat.v1.enable_eager_execution',
     'tf.enable_resource_variables': 'tf.compat.v1.enable_resource_variables',
@@ -204,8 +206,6 @@ renames = {
     'tf.get_variable': 'tf.compat.v1.get_variable',
     'tf.get_variable_scope': 'tf.compat.v1.get_variable_scope',
     'tf.gfile.FastGFile': 'tf.compat.v1.gfile.FastGFile',
-    'tf.gfile.GFile': 'tf.io.gfile.GFile',
-    'tf.gfile.Open': 'tf.io.gfile.GFile',
     'tf.global_norm': 'tf.linalg.global_norm',
     'tf.global_variables': 'tf.compat.v1.global_variables',
     'tf.global_variables_initializer': 'tf.compat.v1.global_variables_initializer',
@@ -268,7 +268,6 @@ renames = {
     'tf.keras.initializers.Orthogonal': 'tf.compat.v1.keras.initializers.Orthogonal',
     'tf.keras.initializers.TruncatedNormal': 'tf.compat.v1.keras.initializers.TruncatedNormal',
     'tf.keras.initializers.VarianceScaling': 'tf.compat.v1.keras.initializers.VarianceScaling',
-    'tf.keras.initializers.constant': 'tf.compat.v1.keras.initializers.constant',
     'tf.keras.initializers.glorot_normal': 'tf.compat.v1.keras.initializers.glorot_normal',
     'tf.keras.initializers.glorot_uniform': 'tf.compat.v1.keras.initializers.glorot_uniform',
     'tf.keras.initializers.he_normal': 'tf.compat.v1.keras.initializers.he_normal',
@@ -277,13 +276,11 @@ renames = {
     'tf.keras.initializers.lecun_normal': 'tf.compat.v1.keras.initializers.lecun_normal',
     'tf.keras.initializers.lecun_uniform': 'tf.compat.v1.keras.initializers.lecun_uniform',
     'tf.keras.initializers.normal': 'tf.compat.v1.keras.initializers.normal',
-    'tf.keras.initializers.ones': 'tf.compat.v1.keras.initializers.ones',
     'tf.keras.initializers.orthogonal': 'tf.compat.v1.keras.initializers.orthogonal',
     'tf.keras.initializers.random_normal': 'tf.compat.v1.keras.initializers.random_normal',
     'tf.keras.initializers.random_uniform': 'tf.compat.v1.keras.initializers.random_uniform',
     'tf.keras.initializers.truncated_normal': 'tf.compat.v1.keras.initializers.truncated_normal',
     'tf.keras.initializers.uniform': 'tf.compat.v1.keras.initializers.uniform',
-    'tf.keras.initializers.zeros': 'tf.compat.v1.keras.initializers.zeros',
     'tf.layers.AveragePooling1D': 'tf.compat.v1.layers.AveragePooling1D',
     'tf.layers.AveragePooling2D': 'tf.compat.v1.layers.AveragePooling2D',
     'tf.layers.AveragePooling3D': 'tf.compat.v1.layers.AveragePooling3D',
@@ -325,6 +322,8 @@ renames = {
     'tf.lbeta': 'tf.math.lbeta',
     'tf.lgamma': 'tf.math.lgamma',
     'tf.lin_space': 'tf.linspace',
+    'tf.lite.TocoConverter': 'tf.compat.v1.lite.TocoConverter',
+    'tf.lite.toco_convert': 'tf.compat.v1.lite.toco_convert',
     'tf.local_variables': 'tf.compat.v1.local_variables',
     'tf.local_variables_initializer': 'tf.compat.v1.local_variables_initializer',
     'tf.log': 'tf.math.log',
@@ -351,6 +350,7 @@ renames = {
     'tf.logging.warn': 'tf.compat.v1.logging.warn',
     'tf.logging.warning': 'tf.compat.v1.logging.warning',
     'tf.logical_xor': 'tf.math.logical_xor',
+    'tf.losses.Reduction': 'tf.compat.v1.losses.Reduction',
     'tf.losses.absolute_difference': 'tf.compat.v1.losses.absolute_difference',
     'tf.losses.add_loss': 'tf.compat.v1.losses.add_loss',
     'tf.losses.compute_weighted_loss': 'tf.compat.v1.losses.compute_weighted_loss',
@@ -423,8 +423,11 @@ renames = {
     'tf.min_max_variable_partitioner': 'tf.compat.v1.min_max_variable_partitioner',
     'tf.model_variables': 'tf.compat.v1.model_variables',
     'tf.moving_average_variables': 'tf.compat.v1.moving_average_variables',
+    'tf.nn.avg_pool_v2': 'tf.nn.avg_pool',
     'tf.nn.bidirectional_dynamic_rnn': 'tf.compat.v1.nn.bidirectional_dynamic_rnn',
-    'tf.nn.conv3d_backprop_filter_v2': 'tf.nn.conv3d_backprop_filter',
+    'tf.nn.conv2d_backprop_filter': 'tf.compat.v1.nn.conv2d_backprop_filter',
+    'tf.nn.conv3d_backprop_filter': 'tf.compat.v1.nn.conv3d_backprop_filter',
+    'tf.nn.conv3d_backprop_filter_v2': 'tf.compat.v1.nn.conv3d_backprop_filter_v2',
     'tf.nn.ctc_beam_search_decoder_v2': 'tf.nn.ctc_beam_search_decoder',
     'tf.nn.ctc_loss_v2': 'tf.nn.ctc_loss',
     'tf.nn.depthwise_conv2d_native': 'tf.compat.v1.nn.depthwise_conv2d_native',
@@ -432,6 +435,7 @@ renames = {
     'tf.nn.depthwise_conv2d_native_backprop_input': 'tf.nn.depthwise_conv2d_backprop_input',
     'tf.nn.dynamic_rnn': 'tf.compat.v1.nn.dynamic_rnn',
     'tf.nn.log_uniform_candidate_sampler': 'tf.random.log_uniform_candidate_sampler',
+    'tf.nn.max_pool_v2': 'tf.nn.max_pool',
     'tf.nn.quantized_avg_pool': 'tf.compat.v1.nn.quantized_avg_pool',
     'tf.nn.quantized_conv2d': 'tf.compat.v1.nn.quantized_conv2d',
     'tf.nn.quantized_max_pool': 'tf.compat.v1.nn.quantized_max_pool',
@@ -499,8 +503,10 @@ renames = {
     'tf.saved_model.Builder': 'tf.compat.v1.saved_model.Builder',
     'tf.saved_model.LEGACY_INIT_OP_KEY': 'tf.compat.v1.saved_model.LEGACY_INIT_OP_KEY',
     'tf.saved_model.MAIN_OP_KEY': 'tf.compat.v1.saved_model.MAIN_OP_KEY',
+    'tf.saved_model.build_signature_def': 'tf.compat.v1.saved_model.build_signature_def',
     'tf.saved_model.build_tensor_info': 'tf.compat.v1.saved_model.build_tensor_info',
     'tf.saved_model.builder.SavedModelBuilder': 'tf.compat.v1.saved_model.builder.SavedModelBuilder',
+    'tf.saved_model.classification_signature_def': 'tf.compat.v1.saved_model.classification_signature_def',
     'tf.saved_model.constants.ASSETS_DIRECTORY': 'tf.saved_model.ASSETS_DIRECTORY',
     'tf.saved_model.constants.ASSETS_KEY': 'tf.saved_model.ASSETS_KEY',
     'tf.saved_model.constants.LEGACY_INIT_OP_KEY': 'tf.compat.v1.saved_model.constants.LEGACY_INIT_OP_KEY',
@@ -512,6 +518,7 @@ renames = {
     'tf.saved_model.constants.VARIABLES_FILENAME': 'tf.saved_model.VARIABLES_FILENAME',
     'tf.saved_model.experimental.save': 'tf.saved_model.save',
     'tf.saved_model.get_tensor_from_tensor_info': 'tf.compat.v1.saved_model.get_tensor_from_tensor_info',
+    'tf.saved_model.is_valid_signature': 'tf.compat.v1.saved_model.is_valid_signature',
     'tf.saved_model.load': 'tf.compat.v1.saved_model.load',
     'tf.saved_model.loader.load': 'tf.compat.v1.saved_model.loader.load',
     'tf.saved_model.loader.maybe_saved_model_directory': 'tf.compat.v1.saved_model.loader.maybe_saved_model_directory',
@@ -519,6 +526,8 @@ renames = {
     'tf.saved_model.main_op.main_op_with_restore': 'tf.compat.v1.saved_model.main_op.main_op_with_restore',
     'tf.saved_model.main_op_with_restore': 'tf.compat.v1.saved_model.main_op_with_restore',
     'tf.saved_model.maybe_saved_model_directory': 'tf.compat.v1.saved_model.maybe_saved_model_directory',
+    'tf.saved_model.predict_signature_def': 'tf.compat.v1.saved_model.predict_signature_def',
+    'tf.saved_model.regression_signature_def': 'tf.compat.v1.saved_model.regression_signature_def',
     'tf.saved_model.signature_constants.CLASSIFY_INPUTS': 'tf.saved_model.CLASSIFY_INPUTS',
     'tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME': 'tf.saved_model.CLASSIFY_METHOD_NAME',
     'tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES': 'tf.saved_model.CLASSIFY_OUTPUT_CLASSES',
@@ -530,11 +539,11 @@ renames = {
     'tf.saved_model.signature_constants.REGRESS_INPUTS': 'tf.saved_model.REGRESS_INPUTS',
     'tf.saved_model.signature_constants.REGRESS_METHOD_NAME': 'tf.saved_model.REGRESS_METHOD_NAME',
     'tf.saved_model.signature_constants.REGRESS_OUTPUTS': 'tf.saved_model.REGRESS_OUTPUTS',
-    'tf.saved_model.signature_def_utils.build_signature_def': 'tf.saved_model.build_signature_def',
-    'tf.saved_model.signature_def_utils.classification_signature_def': 'tf.saved_model.classification_signature_def',
-    'tf.saved_model.signature_def_utils.is_valid_signature': 'tf.saved_model.is_valid_signature',
-    'tf.saved_model.signature_def_utils.predict_signature_def': 'tf.saved_model.predict_signature_def',
-    'tf.saved_model.signature_def_utils.regression_signature_def': 'tf.saved_model.regression_signature_def',
+    'tf.saved_model.signature_def_utils.build_signature_def': 'tf.compat.v1.saved_model.signature_def_utils.build_signature_def',
+    'tf.saved_model.signature_def_utils.classification_signature_def': 'tf.compat.v1.saved_model.signature_def_utils.classification_signature_def',
+    'tf.saved_model.signature_def_utils.is_valid_signature': 'tf.compat.v1.saved_model.signature_def_utils.is_valid_signature',
+    'tf.saved_model.signature_def_utils.predict_signature_def': 'tf.compat.v1.saved_model.signature_def_utils.predict_signature_def',
+    'tf.saved_model.signature_def_utils.regression_signature_def': 'tf.compat.v1.saved_model.signature_def_utils.regression_signature_def',
     'tf.saved_model.simple_save': 'tf.compat.v1.saved_model.simple_save',
     'tf.saved_model.tag_constants.GPU': 'tf.saved_model.GPU',
     'tf.saved_model.tag_constants.SERVING': 'tf.saved_model.SERVING',
@@ -622,6 +631,7 @@ renames = {
     'tf.summary.get_summary_description': 'tf.compat.v1.summary.get_summary_description',
     'tf.summary.histogram': 'tf.compat.v1.summary.histogram',
     'tf.summary.image': 'tf.compat.v1.summary.image',
+    'tf.summary.initialize': 'tf.compat.v1.summary.initialize',
     'tf.summary.merge': 'tf.compat.v1.summary.merge',
     'tf.summary.merge_all': 'tf.compat.v1.summary.merge_all',
     'tf.summary.scalar': 'tf.compat.v1.summary.scalar',
diff --git a/tensorflow/tools/compatibility/reorders_v2.py b/tensorflow/tools/compatibility/reorders_v2.py
index ac890e247418d13adc3e3147670682e5ec488147..14c2d0c09f953c73000c1595012423176fa48999 100644
--- a/tensorflow/tools/compatibility/reorders_v2.py
+++ b/tensorflow/tools/compatibility/reorders_v2.py
@@ -58,9 +58,9 @@ reorders = {
     'tf.math.reduce_prod': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
     'tf.math.reduce_sum': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
     'tf.multinomial': ['logits', 'num_samples', 'seed', 'name', 'output_dtype'],
-    'tf.nn.conv1d': ['value', 'filters', 'stride', 'padding', 'use_cudnn_on_gpu', 'data_format', 'name', 'input'],
+    'tf.nn.avg_pool': ['value', 'ksize', 'strides', 'padding', 'data_format', 'name', 'input'],
+    'tf.nn.conv1d': ['value', 'filters', 'stride', 'padding', 'use_cudnn_on_gpu', 'data_format', 'name', 'input', 'dilations'],
     'tf.nn.conv2d': ['input', 'filter', 'strides', 'padding', 'use_cudnn_on_gpu', 'data_format', 'dilations', 'name', 'filters'],
-    'tf.nn.conv2d_backprop_filter': ['input', 'filter_sizes', 'out_backprop', 'strides', 'padding', 'use_cudnn_on_gpu', 'data_format', 'dilations', 'name'],
     'tf.nn.conv2d_backprop_input': ['input_sizes', 'filter', 'out_backprop', 'strides', 'padding', 'use_cudnn_on_gpu', 'data_format', 'dilations', 'name', 'filters'],
     'tf.nn.convolution': ['input', 'filter', 'padding', 'strides', 'dilation_rate', 'name', 'data_format', 'filters', 'dilations'],
     'tf.nn.crelu': ['features', 'name', 'axis'],
@@ -72,6 +72,7 @@ reorders = {
     'tf.nn.fractional_avg_pool': ['value', 'pooling_ratio', 'pseudo_random', 'overlapping', 'deterministic', 'seed', 'seed2', 'name'],
     'tf.nn.fractional_max_pool': ['value', 'pooling_ratio', 'pseudo_random', 'overlapping', 'deterministic', 'seed', 'seed2', 'name'],
     'tf.nn.in_top_k': ['predictions', 'targets', 'k', 'name'],
+    'tf.nn.max_pool': ['value', 'ksize', 'strides', 'padding', 'data_format', 'name', 'input'],
     'tf.nn.moments': ['x', 'axes', 'shift', 'name', 'keep_dims', 'keepdims'],
     'tf.nn.pool': ['input', 'window_shape', 'pooling_type', 'padding', 'dilation_rate', 'strides', 'name', 'data_format', 'dilations'],
     'tf.nn.separable_conv2d': ['input', 'depthwise_filter', 'pointwise_filter', 'strides', 'padding', 'rate', 'name', 'data_format', 'dilations'],
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py
index b774acd6d2d3786a8f1d830bbca608f2dd3b4611..c3ca68be6043530ab7034c148986335e1a52fb92 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py
@@ -23,7 +23,6 @@ import functools
 import sys
 
 import pasta
-import six
 
 from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import renames_v2
@@ -223,6 +222,17 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.nn.max_pool_with_argmax": {
             "Targmax": "output_dtype",
         },
+        "tf.nn.max_pool": {
+            "value": "input"
+        },
+
+        "tf.nn.avg_pool": {
+            "value": "input"
+        },
+
+        "tf.nn.avg_pool2d": {
+            "value": "input"
+        },
         "tf.multinomial": {
             "output_dtype": "dtype",
         },
@@ -402,12 +412,29 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "filter": "filters",
             "use_cudnn_on_gpu": None,
         },
-        "tf.nn.conv2d_backprop_filter": {
-            "use_cudnn_on_gpu": None,
-        },
         "tf.nn.conv2d_backprop_input": {
-            "filter": "filters",
             "use_cudnn_on_gpu": None,
+            "input_sizes": "output_shape",
+            "out_backprop": "input",
+            "filter": "filters",
+        },
+        "tf.contrib.summary.audio": {
+            "tensor": "data",
+            "family": None,
+        },
+        "tf.contrib.summary.histogram": {
+            "tensor": "data",
+            "family": None,
+        },
+        "tf.contrib.summary.image": {
+            "tensor": "data",
+            "bad_color": None,
+            "max_images": "max_outputs",
+            "family": None,
+        },
+        "tf.contrib.summary.scalar": {
+            "tensor": "data",
+            "family": None,
         },
     }
 
@@ -420,7 +447,7 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.batch_to_space_nd":
             "tf.batch_to_space",
         "tf.batch_gather":
-            "tf.gather",
+            "tf.compat.v1.batch_gather",
         "tf.space_to_batch_nd":
             "tf.space_to_batch",
         "tf.nn.space_to_batch":
@@ -437,6 +464,8 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.io.gfile.exists",
         "tf.gfile.Glob":
             "tf.io.gfile.glob",
+        "tf.gfile.GFile":
+            "tf.io.gfile.GFile",
         "tf.gfile.IsDirectory":
             "tf.io.gfile.isdir",
         "tf.gfile.ListDirectory":
@@ -445,6 +474,8 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.io.gfile.makedirs",
         "tf.gfile.MkDir":
             "tf.io.gfile.mkdir",
+        "tf.gfile.Open":
+            "tf.io.gfile.GFile",
         "tf.gfile.Remove":
             "tf.io.gfile.remove",
         "tf.gfile.Rename":
@@ -537,28 +568,64 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.data.experimental.unbatch",
         "tf.contrib.data.unique":
             "tf.data.experimental.unique",
+        "tf.contrib.estimator.make_early_stopping_hook":
+            "tf.estimator.experimental.make_early_stopping_hook",
+        "tf.contrib.estimator.stop_if_higher_hook":
+            "tf.estimator.experimental.stop_if_higher_hook",
+        "tf.contrib.estimator.stop_if_lower_hook":
+            "tf.estimator.experimental.stop_if_lower_hook",
+        "tf.contrib.estimator.stop_if_no_decrease_hook":
+            "tf.estimator.experimental.stop_if_no_decrease_hook",
+        "tf.contrib.estimator.stop_if_no_increase_hook":
+            "tf.estimator.experimental.stop_if_no_increase_hook",
+        "tf.contrib.framework.CriticalSection":
+            "tf.CriticalSection",
+        "tf.contrib.framework.is_tensor":
+            "tf.is_tensor",
         "tf.contrib.framework.nest.assert_same_structure":
             "tf.nest.assert_same_structure",
         "tf.contrib.framework.nest.flatten":
             "tf.nest.flatten",
+        "tf.contrib.framework.nest.is_sequence":
+            "tf.nest.is_nested",
         "tf.contrib.framework.nest.map_structure":
             "tf.nest.map_structure",
         "tf.contrib.framework.nest.pack_sequence_as":
             "tf.nest.pack_sequence_as",
-        "tf.contrib.framework.constant_value":
+        "tf.contrib.util.constant_value":
             "tf.get_static_value",
         "tf.contrib.saved_model.load_keras_model":
             "tf.keras.experimental.load_from_saved_model",
         "tf.contrib.saved_model.save_keras_model":
-            "tf.keras.experimental.export",
+            "tf.keras.experimental.export_saved_model",
         "tf.contrib.rnn.RNNCell":
             "tf.nn.rnn_cell.RNNCell",
         "tf.contrib.rnn.LSTMStateTuple":
             "tf.nn.rnn_cell.LSTMStateTuple",
+        "tf.contrib.rnn.BasicLSTMCell":
+            "tf.compat.v1.nn.rnn_cell.BasicLSTMCell",
+        "tf.contrib.rnn.BasicRNNCell":
+            "tf.compat.v1.nn.rnn_cell.BasicRNNCell",
+        "tf.contrib.rnn.GRUCell":
+            "tf.compat.v1.nn.rnn_cell.GRUCell",
+        "tf.contrib.rnn.LSTMCell":
+            "tf.compat.v1.nn.rnn_cell.LSTMCell",
+        "tf.contrib.rnn.MultiRNNCell":
+            "tf.compat.v1.nn.rnn_cell.MultiRNNCell",
         "tf.contrib.framework.sort":
             "tf.sort",
         "tf.contrib.framework.argsort":
             "tf.argsort",
+        "tf.contrib.summary.audio":
+            "tf.compat.v2.summary.audio",
+        "tf.contrib.summary.histogram":
+            "tf.compat.v2.summary.histogram",
+        "tf.contrib.summary.image":
+            "tf.compat.v2.summary.image",
+        "tf.contrib.summary.initialize":
+            "tf.compat.v1.summary.initialize",
+        "tf.contrib.summary.scalar":
+            "tf.compat.v2.summary.scalar",
         "tf.count_nonzero":
             "tf.math.count_nonzero",
         "tf.manip.batch_to_space_nd":
@@ -637,6 +704,12 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         # changed significantly.
         "tf.nn.ctc_loss":
             "tf.compat.v1.nn.ctc_loss",
+        # tf.saved_model.load in 1.x has no equivalent in 2.x, but there is a
+        # symbol with the same name.
+        "tf.saved_model.load":
+            "tf.compat.v1.saved_model.load",
+        "tf.saved_model.load_v2":
+            "tf.compat.v2.saved_model.load",
         "tf.zeros_initializer":
             "tf.compat.v1.initializers.zeros",
         "tf.ones_initializer":
@@ -695,8 +768,20 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.compat.v1.debugging.assert_rank_in",
         "tf.assert_rank":
             "tf.compat.v1.assert_rank",
-        "tf.contrib.framework.argsort":
-            "tf.argsort",
+        "tf.nn.max_pool":
+            "tf.nn.max_pool2d",
+        "tf.nn.avg_pool":
+            "tf.nn.avg_pool2d",
+        "tf.keras.initializers.zeros":
+            "tf.compat.v1.keras.initializers.zeros",
+        "tf.keras.initializers.ones":
+            "tf.compat.v1.keras.initializers.ones",
+        "tf.keras.initializers.constant":
+            "tf.compat.v1.keras.initializers.constant",
+        "tf.data.experimental.map_and_batch_with_legacy_function":
+            "tf.compat.v1.data.experimental.map_and_batch_with_legacy_function",
+        "tf.nn.conv2d_backprop_input":
+            "tf.nn.conv2d_transpose"
     }
     # pylint: enable=line-too-long
 
@@ -730,7 +815,6 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.convert_to_tensor",
         "tf.nn.conv1d",
         "tf.nn.conv2d",
-        "tf.nn.conv2d_backprop_filter",
         "tf.nn.conv2d_backprop_input",
         "tf.nn.ctc_beam_search_decoder",
         "tf.nn.moments",
@@ -809,12 +893,28 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.image.sample_distorted_bounding_box",
         "tf.gradients",
         "tf.hessians",
+        "tf.nn.max_pool",
+        "tf.nn.avg_pool",
     }
 
+    # Manual mapping of function names to be reordered to their list of argument
+    # names, in order. Only use this if argument names cannot be autodetected,
+    # e.g. if the functions are in contrib.
+    self.manual_function_reorders = {
+        "tf.contrib.summary.audio": [
+            "name", "tensor", "sample_rate", "max_outputs", "family", "step"],
+        "tf.contrib.summary.histogram": [
+            "name", "tensor", "family", "step"],
+        "tf.contrib.summary.image": [
+            "name", "tensor", "bad_color", "max_images", "family", "step"],
+        "tf.contrib.summary.scalar": [
+            "name", "tensor", "family", "step"],
+    }
     # Functions that were reordered should be changed to the new keyword args
     # for safety, if positional arguments are used. If you have reversed the
     # positional arguments yourself, this could do the wrong thing.
-    self.function_reorders = reorders_v2.reorders
+    self.function_reorders = dict(reorders_v2.reorders)
+    self.function_reorders.update(self.manual_function_reorders)
 
     contrib_warning = (
         ast_edits.ERROR,
@@ -903,6 +1003,14 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "only effects core estimator. If you are using "
         "tf.contrib.learn.Estimator, please switch to using core estimator.")
 
+    # TODO(b/124529441): if possible eliminate need for manual checking.
+    contrib_summary_comment = (
+        ast_edits.WARNING,
+        "(Manual check required) tf.contrib.summary.* functions have been "
+        "migrated best-effort to tf.compat.v2.summary.* equivalents where "
+        "possible, but the resulting code may not always work. Please check "
+        "manually; you can report migration failures on b/124529441.")
+
     # Function warnings. <function name> placeholder inside warnings will be
     # replaced by function name.
     # You can use *. to add items which do not check the FQN, and apply to e.g.,
@@ -944,6 +1052,14 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             assert_rank_comment,
         "tf.assert_rank_in":
             assert_rank_comment,
+        "tf.contrib.summary.audio":
+            contrib_summary_comment,
+        "tf.contrib.summary.histogram":
+            contrib_summary_comment,
+        "tf.contrib.summary.image":
+            contrib_summary_comment,
+        "tf.contrib.summary.scalar":
+            contrib_summary_comment,
         "tf.debugging.assert_equal":
             assert_return_type_comment,
         "tf.debugging.assert_greater":
@@ -1243,11 +1359,45 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
                 "tf.cond no longer takes 'strict' argument, it behaves as "
                 "if was set to True.")
         },
+        "tf.contrib.summary.audio": {
+            ("family", 4): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.* functions no longer take the 'family' "
+                "argument; instead name scoping should be used. This call site "
+                "specifies a family argument so it cannot be converted safely.")
+        },
+        "tf.contrib.summary.histogram": {
+            ("family", 2): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.* functions no longer take the 'family' "
+                "argument; instead name scoping should be used. This call site "
+                "specifies a family argument so it cannot be converted safely.")
+        },
+        "tf.contrib.summary.image": {
+            ("bad_color", 2): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.image no longer takes the 'bad_color' "
+                "argument; caller must now preprocess if needed. This call "
+                "site specifies a bad_color argument so it cannot be converted "
+                "safely."),
+            ("family", 4): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.* functions no longer take the 'family' "
+                "argument; instead name scoping should be used. This call site "
+                "specifies a family argument so it cannot be converted safely.")
+        },
+        "tf.contrib.summary.scalar": {
+            ("family", 2): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.* functions no longer take the 'family' "
+                "argument; instead name scoping should be used. This call site "
+                "specifies a family argument so it cannot be converted safely.")
+        },
     }
 
     # Specially handled functions
     # Each transformer is a callable which will be called with the arguments
-    #   transformer(parent, node, full_name, name, logs, errors)
+    #   transformer(parent, node, full_name, name, logs)
     # Where logs is a list to which (level, line, col, msg) tuples can be
     # appended, full_name is the FQN of the function called (or None if that is
     # unknown), name is the name of the function called (or None is that is
@@ -1318,6 +1468,10 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             _add_argument_transformer,
             arg_name="data_format",
             arg_value_ast=ast.Str("NHWC")),
+        "tf.contrib.summary.audio": _add_summary_step_transformer,
+        "tf.contrib.summary.histogram": _add_summary_step_transformer,
+        "tf.contrib.summary.image": _add_summary_step_transformer,
+        "tf.contrib.summary.scalar": _add_summary_step_transformer,
     }
 
     self.module_deprecations = {
@@ -1429,7 +1583,7 @@ def _add_argument_transformer(parent, node, full_name, name, logs,
                               arg_name, arg_value_ast):
   """Adds an argument (as a final kwarg arg_name=arg_value_ast)."""
   node.keywords.append(ast.keyword(arg=arg_name, value=arg_value_ast))
-  logs.add((
+  logs.append((
       ast_edits.INFO, node.lineno, node.col_offset,
       "Adding argument '%s' to call to %s." % (pasta.dump(node.keywords[-1],
                                                           full_name or name))
@@ -1548,11 +1702,18 @@ def _softmax_cross_entropy_with_logits_transformer(
   """Wrap labels argument with stop_gradients."""
   def _wrap_label(parent, old_value):
     """Wrap labels with tf.stop_gradient."""
-    if six.PY3:
+    already_stop_grad = (isinstance(old_value, ast.Call) and
+                         isinstance(old_value.func, ast.Attribute) and
+                         old_value.func.attr == "stop_gradient" and
+                         isinstance(old_value.func.value, ast.Name) and
+                         old_value.func.value.id == "tf")
+    if already_stop_grad:
+      return False
+    try:
       new_value = ast.Call(
           ast.Name(id="tf.stop_gradient", ctx=ast.Load()),
           [old_value], [])
-    else:
+    except TypeError:
       new_value = ast.Call(
           ast.Name(id="tf.stop_gradient", ctx=ast.Load()),
           [old_value], [], None, None)
@@ -1560,16 +1721,17 @@ def _softmax_cross_entropy_with_logits_transformer(
     # This copies the prefix and suffix on old_value to new_value.
     pasta.ast_utils.replace_child(parent, old_value, new_value)
     ast.copy_location(new_value, old_value)
+    return True
 
   # Check if we have a labels keyword arg
   for karg in node.keywords:
     if karg.arg == "labels":
-      logs.append((ast_edits.INFO, node.lineno, node.col_offset,
-                   "Changing labels arg of "
-                   "tf.nn.softmax_cross_entropy_with_logits to "
-                   "tf.stop_gradient(labels). Please check this "
-                   "transformation.\n"))
-      _wrap_label(karg, karg.value)
+      if _wrap_label(karg, karg.value):
+        logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                     "Changing labels arg of "
+                     "tf.nn.softmax_cross_entropy_with_logits to "
+                     "tf.stop_gradient(labels). Please check this "
+                     "transformation.\n"))
       return node
   return node
 
@@ -1703,3 +1865,22 @@ def _extract_glimpse_transformer(parent, node, full_name, name, logs):
                  "Changing uniform_noise arg of tf.image.extract_glimpse to "
                  "noise, and recomputing value.\n"))
     return node
+
+
+def _add_summary_step_transformer(parent, node, full_name, name, logs):
+  """Adds a step argument to the summary API call if not specified.
+
+  The inserted argument value is tf.compat.v1.train.get_or_create_global_step().
+  """
+  for keyword_arg in node.keywords:
+    if keyword_arg.arg == "step":
+      return node
+  default_value = "tf.compat.v1.train.get_or_create_global_step()"
+  # Parse with pasta instead of ast to avoid emitting a spurious trailing \n.
+  ast_value = pasta.parse(default_value)
+  node.keywords.append(ast.keyword(arg="step", value=ast_value))
+  logs.append((
+      ast_edits.WARNING, node.lineno, node.col_offset,
+      "Summary API writing function %s now requires a 'step' argument; "
+      "inserting default of %s." % (full_name or name, default_value)))
+  return node
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_main.py b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
index aecd74e71c805074fb88fe715917b8d91c6f707d..c34d659eebf0c90361dbd3ae3da0d6b1b83ead09 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
@@ -27,7 +27,7 @@ from tensorflow.tools.compatibility import tf_upgrade_v2
 def main():
   parser = argparse.ArgumentParser(
       formatter_class=argparse.RawDescriptionHelpFormatter,
-      description="""Convert a TensorFlow Python file to 2.0
+      description="""Convert a TensorFlow Python file from 1.* to 2.0
 
 Simple usage:
   tf_upgrade_v2.py --infile foo.py --outfile bar.py
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
index 7668ba2260e123de500d9505e4f7681da92d0607..57d938f749b34cb14dd2b9d299a0be7646805eea 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
@@ -157,6 +157,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           _, _, _, text = self._upgrade("tf." + name)
           if (text and
               not text.startswith("tf.compat.v1") and
+              not text.startswith("tf.compat.v2") and
               text not in self.v2_symbols and
               # Builds currently install old version of estimator that doesn't
               # have some 2.0 symbols.
@@ -174,11 +175,6 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     collect = True
     v1_symbols = set([])
 
-    # Symbols which may be generated by the conversion script which do not exist
-    # in TF 1.x. This should be a very short list of symbols which are
-    # experimental in 1.x but stable for 2.x.
-    whitelisted_v2_only_symbols = set(["tf.saved_model.save"])
-
     # Converts all symbols in the v1 namespace to the v2 namespace, raising
     # an error if the target of the conversion is not in the v1 namespace.
     def conversion_visitor(unused_path, unused_parent, children):
@@ -192,9 +188,9 @@ class TestUpgrade(test_util.TensorFlowTestCase):
             _, _, _, text = self._upgrade("tf." + name)
             if (text and
                 not text.startswith("tf.compat.v1") and
+                not text.startswith("tf.compat.v2") and
                 not text.startswith("tf.estimator") and
-                text not in v1_symbols and
-                text not in whitelisted_v2_only_symbols):
+                text not in v1_symbols):
               self.assertFalse(
                   True, "Symbol %s generated from %s not in v1 API" % (
                       text, name))
@@ -293,6 +289,10 @@ class TestUpgrade(test_util.TensorFlowTestCase):
                   "Function '%s' is not in 2.0 when converting\n%s\nto\n%s" %
                   (new_function_name, text_input, text))
             continue
+          if new_function_name.startswith("tf.compat.v2"):
+            self.assertIn(new_function_name.replace("tf.compat.v2.", "tf."),
+                          self.v2_symbols)
+            continue
           # 3. Verify V2 function and arguments.
           args_v2 = get_args(self.v2_symbols[new_function_name])
           args_v2.extend(v2_arg_exceptions)
@@ -343,6 +343,8 @@ class TestUpgrade(test_util.TensorFlowTestCase):
         tf_upgrade_v2.TFAPIChangeSpec().reordered_function_names)
     function_reorders = (
         tf_upgrade_v2.TFAPIChangeSpec().function_reorders)
+    manual_function_reorders = (
+        tf_upgrade_v2.TFAPIChangeSpec().manual_function_reorders)
 
     added_names_message = """Some function names in
 self.reordered_function_names are not in reorders_v2.py.
@@ -362,6 +364,8 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     # function_reorders should contain reordered_function_names
     # and their TensorFlow V1 aliases.
     for name in function_reorders:
+      if name in manual_function_reorders:
+        continue
       # get other names for this function
       attr = get_symbol_for_name(tf.compat.v1, name)
       _, attr = tf_decorator.unwrap(attr)
@@ -775,7 +779,7 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
         "tf.contrib.saved_model.save_keras_model(model, './saved_models')\n"
         "tf.contrib.saved_model.load_keras_model(saved_model_path)\n")
     expected_text = (
-        "tf.keras.experimental.export(model, './saved_models')\n"
+        "tf.keras.experimental.export_saved_model(model, './saved_models')\n"
         "tf.keras.experimental.load_from_saved_model(saved_model_path)\n")
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
@@ -818,6 +822,36 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(expected_text, new_text)
 
+  def testSoftMaxCrossEntropyWithLogitsDoesntNest(self):
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=tf.stop_gradient(labels), logits=logits, dim=2)")
+    expected_text = (
+        "tf.nn.softmax_cross_entropy_with_logits("
+        "labels=tf.stop_gradient(labels), logits=logits, axis=2)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=tf.stop_gradient(foo(bar)))")
+    expected_text = ("tf.nn.softmax_cross_entropy_with_logits("
+                     "labels=tf.stop_gradient(foo(bar)))")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=foo())")
+    expected_text = ("tf.nn.softmax_cross_entropy_with_logits("
+                     "labels=tf.stop_gradient(foo()))")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=foo().zz())")
+    expected_text = ("tf.nn.softmax_cross_entropy_with_logits("
+                     "labels=tf.stop_gradient(foo().zz()))")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
   def testSparseMatmul(self):
     text = ("tf.sparse_matmul(a, b, c, d, e, f, g)\n")
     expected_text = ("tf.linalg.matmul(a=a, b=b, transpose_a=c, transpose_b=d, "
@@ -879,9 +913,8 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
         "tf.nn.conv2d_backprop_filter(input, filter_sizes, out_backprop, "
         "strides, padding, use_cudnn_on_gpu, data_format)")
     expected_text = (
-        "tf.nn.conv2d_backprop_filter(input=input, filter_sizes=filter_sizes, "
-        "out_backprop=out_backprop, strides=strides, padding=padding, "
-        "data_format=data_format)")
+        "tf.compat.v1.nn.conv2d_backprop_filter(input, filter_sizes, "
+        "out_backprop, strides, padding, use_cudnn_on_gpu, data_format)")
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
@@ -890,8 +923,8 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
         "tf.nn.conv2d_backprop_input(input_sizes, filter, out_backprop, "
         "strides, padding, use_cudnn_on_gpu, data_format)")
     expected_text = (
-        "tf.nn.conv2d_backprop_input(input_sizes=input_sizes, filters=filter, "
-        "out_backprop=out_backprop, strides=strides, padding=padding, "
+        "tf.nn.conv2d_transpose(output_shape=input_sizes, filters=filter, "
+        "input=out_backprop, strides=strides, padding=padding, "
         "data_format=data_format)")
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
@@ -1037,6 +1070,13 @@ tf.print('abc')
       _, unused_report, unused_errors, actual = self._upgrade(text)
       self.assertEqual(actual, expected)
 
+  def testMapAndBatch(self):
+    suffix = ".data.experimental.map_and_batch_with_legacy_function(args)"
+    text = "tf" + suffix
+    expected = "tf.compat.v1" + suffix
+    _, unused_report, unused_errors, actual = self._upgrade(text)
+    self.assertEqual(actual, expected)
+
   def testCast(self):
     for (name, dtype) in [("int32", "int32"),
                           ("int64", "int64"),
@@ -1144,6 +1184,18 @@ def _log_prob(self, x):
     _, _, _, new_text = self._upgrade(text)
     self.assertEqual(expected, new_text)
 
+  def test_is_tensor_upgrade(self):
+    text = "tf.contrib.framework.is_tensor(x)"
+    expected = "tf.is_tensor(x)"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_CriticalSection_upgrade(self):
+    text = "tf.contrib.framework.CriticalSection(shared_name='blah')"
+    expected = "tf.CriticalSection(shared_name='blah')"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
   def test_sample_distorted_bounding_box(self):
     # pylint: disable=line-too-long
     text = "tf.image.sample_distorted_bounding_box(a, b, c, d, e, f, g, h, i, j)"
@@ -1152,6 +1204,12 @@ def _log_prob(self, x):
     _, _, _, new_text = self._upgrade(text)
     self.assertEqual(expected, new_text)
 
+  def test_contrib_initialize(self):
+    text = "tf.contrib.summary.initialize"
+    expected = "tf.compat.v1.summary.initialize"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
   def test_contrib_framework_argsort(self):
     text = "tf.contrib.framework.argsort"
     expected = "tf.argsort"
@@ -1167,6 +1225,124 @@ def _log_prob(self, x):
     _, _, errors, _ = self._upgrade("tf.flags.FLAGS")
     self.assertIn("tf.flags has been removed", errors[0])
 
+  def test_max_pool_2d(self):
+    text = "tf.nn.max_pool(value=4)"
+    expected_text = "tf.nn.max_pool2d(input=4)"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+  def test_contrib_estimator_early_stopping(self):
+    api_symbols = [
+        "make_early_stopping_hook", "stop_if_higher_hook", "stop_if_lower_hook",
+        "stop_if_no_decrease_hook", "stop_if_no_increase_hook"
+    ]
+    for symbol in api_symbols:
+      text = "tf.contrib.estimator." + symbol
+      expected_text = "tf.estimator.experimental." + symbol
+      _, _, _, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def test_contrib_rnn(self):
+    api_symbols = ["BasicLSTMCell", "BasicRNNCell", "GRUCell", "LSTMCell",
+                   "MultiRNNCell"]
+    for symbol in api_symbols:
+      text = "tf.contrib.rnn." + symbol
+      expected_text = "tf.compat.v1.nn.rnn_cell." + symbol
+      _, _, _, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def test_contrib_summary_audio(self):
+    text = "tf.contrib.summary.audio('foo', myval, 44100, 3, 'fam', 42)"
+    expected = ("tf.compat.v2.summary.audio(name='foo', data=myval, "
+                "sample_rate=44100, max_outputs=3, step=42)")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'family' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_histogram(self):
+    text = "tf.contrib.summary.histogram('foo', myval, 'fam', 42)"
+    expected = ("tf.compat.v2.summary.histogram(name='foo', data=myval, "
+                "step=42)")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'family' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_image(self):
+    text = "tf.contrib.summary.image('foo', myval, red, 3, 'fam', 42)"
+    expected = ("tf.compat.v2.summary.image(name='foo', data=myval, "
+                "max_outputs=3, step=42)")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'bad_color' argument", errors[0])
+    self.assertIn("'family' argument", errors[1])
+    self.assertIn("Manual check required", errors[2])
+
+  def test_contrib_summary_scalar(self):
+    text = "tf.contrib.summary.scalar('foo', myval, 'fam', 42)"
+    expected = ("tf.compat.v2.summary.scalar(name='foo', data=myval, "
+                "step=42)")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'family' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_audio_nostep(self):
+    text = "tf.contrib.summary.audio('foo', myval, 44100)"
+    expected = ("tf.compat.v2.summary.audio(name='foo', data=myval, "
+                "sample_rate=44100, "
+                "step=tf.compat.v1.train.get_or_create_global_step())")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'step' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_histogram_nostep(self):
+    text = "tf.contrib.summary.histogram('foo', myval)"
+    expected = ("tf.compat.v2.summary.histogram(name='foo', data=myval, "
+                "step=tf.compat.v1.train.get_or_create_global_step())")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'step' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_image_nostep(self):
+    text = "tf.contrib.summary.image('foo', myval)"
+    expected = ("tf.compat.v2.summary.image(name='foo', data=myval, "
+                "step=tf.compat.v1.train.get_or_create_global_step())")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'step' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_scalar_nostep(self):
+    text = "tf.contrib.summary.scalar('foo', myval)"
+    expected = ("tf.compat.v2.summary.scalar(name='foo', data=myval, "
+                "step=tf.compat.v1.train.get_or_create_global_step())")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'step' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_avg_pool_2d(self):
+    text = "tf.nn.avg_pool(value=4)"
+    expected_text = "tf.nn.avg_pool2d(input=4)"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+  def test_saved_model_load(self):
+    text = "tf.saved_model.load(sess, ['foo_graph'])"
+    expected = "tf.compat.v1.saved_model.load(sess, ['foo_graph'])"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_saved_model_load_v2(self):
+    text = "tf.saved_model.load_v2('/tmp/blah')"
+    expected = "tf.compat.v2.saved_model.load('/tmp/blah')"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
 
 class TestUpgradeFiles(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py
index 09537b7314491819d06d3bfda2f2446c5af93067..83b72cb5bb8d9686efe37f11357fc610902ebcb9 100644
--- a/tensorflow/tools/dockerfiles/assembler.py
+++ b/tensorflow/tools/dockerfiles/assembler.py
@@ -34,6 +34,7 @@ import errno
 import itertools
 import multiprocessing
 import os
+import platform
 import re
 import shutil
 import sys
@@ -552,6 +553,13 @@ def main(argv):
       if not FLAGS.build_images:
         continue
 
+      # Only build images for host architecture
+      proc_arch = platform.processor()
+      is_x86 = proc_arch.startswith('x86')
+      if (is_x86 and any([arch in tag for arch in ['ppc64le']]) or
+          not is_x86 and proc_arch not in tag):
+        continue
+
       # Generate a temporary Dockerfile to use to build, since docker-py
       # needs a filepath relative to the build context (i.e. the current
       # directory)
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
index c09a95858bae7026d5206f92281e65b5b4ee8160..c806fa4eacd38d6676333153e45631ed5f96ff42 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
@@ -47,8 +47,11 @@ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
index 857b5e20471a82bd162e55b146854d0a5c165db8..a82577b53be4045171b685bbe4076ffdad4d3824 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
@@ -47,8 +47,11 @@ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
index 099c2dc62eab3947f009786a54270a28b2b2c000..dc5b5d49b90e57e8a6acec1e8cd883d62a40dfba 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
@@ -30,7 +30,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -43,12 +42,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
- 
+
 ENV CI_BUILD_PYTHON python
 
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
 # Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
 
 ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
index a4017634fe1ab5674fef1dde00c1e6281cbebdaa..da813970a83e289ec7f9237c9b050fd37d7c1e55 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
@@ -30,7 +30,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -43,12 +42,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
- 
+
 ENV CI_BUILD_PYTHON python
 
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
 # Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
 
 ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
index 44481eb16a797fdef2e6e52c0c81c73dbc794fc7..24309e3ba06a417c7dfc1bf6ce4802576fb83115 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
@@ -21,23 +21,33 @@
 
 ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
-
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-10-0 \
-        cuda-cublas-dev-10-0 \
-        cuda-cudart-dev-10-0 \
-        cuda-cufft-dev-10-0 \
-        cuda-curand-dev-10-0 \
-        cuda-cusolver-dev-10-0 \
-        cuda-cusparse-dev-10-0 \
-        libcudnn7=7.4.1.5-1+cuda10.0 \
-        libcudnn7-dev=7.4.1.5-1+cuda10.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -48,27 +58,29 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         wget \
         git \
         && \
-    find /usr/local/cuda-10.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
         && apt-get update \
-        && apt-get install -y --no-install-recommends libnvinfer-dev=5.0.2-1+cuda10.0 \
-        && rm -rf /var/lib/apt/lists/*
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 ENV TF_NEED_CUDA 1
-ENV TF_NEED_TENSORRT 1
+ENV TF_NEED_TENSORRT 0
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
-ENV TF_CUDA_VERSION=10.0
-ENV TF_CUDNN_VERSION=7
-
-# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
 
 ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
index 160abc876395cf048aa850301de701c950cba149..6bc4e32efb1681c4287758b5e2c70849321971b1 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
@@ -21,23 +21,33 @@
 
 ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
 
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-10-0 \
-        cuda-cublas-dev-10-0 \
-        cuda-cudart-dev-10-0 \
-        cuda-cufft-dev-10-0 \
-        cuda-curand-dev-10-0 \
-        cuda-cusolver-dev-10-0 \
-        cuda-cusparse-dev-10-0 \
-        libcudnn7=7.4.1.5-1+cuda10.0 \
-        libcudnn7-dev=7.4.1.5-1+cuda10.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -48,27 +58,29 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         wget \
         git \
         && \
-    find /usr/local/cuda-10.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
         && apt-get update \
-        && apt-get install -y --no-install-recommends libnvinfer-dev=5.0.2-1+cuda10.0 \
-        && rm -rf /var/lib/apt/lists/*
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 ENV TF_NEED_CUDA 1
-ENV TF_NEED_TENSORRT 1
+ENV TF_NEED_TENSORRT 0
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
-ENV TF_CUDA_VERSION=10.0
-ENV TF_CUDNN_VERSION=7
-
-# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
 
 ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
index 088310755ea292e61ea3cbdc10d83a673e950cb2..85a32fae1b10038956699db276e0c41973a77996 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
@@ -21,32 +21,41 @@
 
 ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
-
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 # Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-10-0 \
-        cuda-cublas-10-0 \
-        cuda-cufft-10-0 \
-        cuda-curand-10-0 \
-        cuda-cusolver-10-0 \
-        cuda-cusparse-10-0 \
-        libcudnn7=7.4.1.5-1+cuda10.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
+        curl \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         software-properties-common \
         unzip
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
         && apt-get update \
-        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda10.0 \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
         && apt-get clean \
-        && rm -rf /var/lib/apt/lists/*
+        && rm -rf /var/lib/apt/lists/*)
 
 # For CUDA profiling, TensorFlow requires CUPTI.
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
@@ -75,8 +84,11 @@ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
index 80e427f824a186b64031b5325042ba374c9b0021..c661341eaeeb5ec331055943397fa5245aea76fb 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
@@ -21,32 +21,41 @@
 
 ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
 
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 # Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-10-0 \
-        cuda-cublas-10-0 \
-        cuda-cufft-10-0 \
-        cuda-curand-10-0 \
-        cuda-cusolver-10-0 \
-        cuda-cusparse-10-0 \
-        libcudnn7=7.4.1.5-1+cuda10.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
+        curl \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         software-properties-common \
         unzip
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
         && apt-get update \
-        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda10.0 \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
         && apt-get clean \
-        && rm -rf /var/lib/apt/lists/*
+        && rm -rf /var/lib/apt/lists/*)
 
 # For CUDA profiling, TensorFlow requires CUPTI.
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
@@ -75,8 +84,11 @@ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..63bf205a7ae0392a3ab30e142d9e6b100608fa86
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le-jupyter.Dockerfile
@@ -0,0 +1,94 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM ubuntu:${UBUNTU_VERSION} as base
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..083d61bf9a2adc69ac821841f628096c91af3524
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le.Dockerfile
@@ -0,0 +1,75 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM ubuntu:${UBUNTU_VERSION} as base
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..c8384f7af2020e142ec0a2c28418252e6028b083
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile
@@ -0,0 +1,127 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+ENV CI_BUILD_PYTHON python
+
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..08f880e7b530cf6dd0e38dd0ff1e46eecbda17b1
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile
@@ -0,0 +1,108 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+ENV CI_BUILD_PYTHON python
+
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..c508a0f73f2dabb21c937b00c9653b33fbc108fe
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile
@@ -0,0 +1,158 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        git \
+        && \
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
+
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
+
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 0
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..f910cb2e91b7f20d9b399d4c032b63d3b991fd72
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile
@@ -0,0 +1,139 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        git \
+        && \
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
+
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
+
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 0
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..1e82ca282208373f57923fda2619b51a43a0d52c
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le-jupyter.Dockerfile
@@ -0,0 +1,131 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
+        curl \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        software-properties-common \
+        unzip
+
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*)
+
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..e9572756e53d6c400948b2f82e792826c19c7543
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le.Dockerfile
@@ -0,0 +1,112 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
+        curl \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        software-properties-common \
+        unzip
+
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*)
+
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/partials/tensorflow-ppc64le.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/tensorflow-ppc64le.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..1e79574a34de7e15bccc68136269962d375459a0
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/tensorflow-ppc64le.partial.Dockerfile
@@ -0,0 +1,28 @@
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
diff --git a/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
index 76758bd147ef9d52b3db072bd0091190e132667c..2ae840687df4fa2419f92b73adc11dca5b3a9f7b 100644
--- a/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
@@ -3,5 +3,8 @@
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..0397ab5fa8569dc0274f9550cf4ecae65489c248
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile
@@ -0,0 +1,33 @@
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
index 0652ac4151d907b660557c991b082541de587a42..a1fd901b343bd80bde2061e29ee7f3abbf7e762d 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
@@ -7,7 +7,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -20,9 +19,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
- 
+
 ENV CI_BUILD_PYTHON python
 
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
 # Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
index 2b4494ac5955f828b519bb2a61db72f91dace6ef..cf3e38b8c8b1072c5c1633003d4f1669192ce191 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
@@ -1,20 +1,30 @@
-FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
 
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-10-0 \
-        cuda-cublas-dev-10-0 \
-        cuda-cudart-dev-10-0 \
-        cuda-cufft-dev-10-0 \
-        cuda-curand-dev-10-0 \
-        cuda-cusolver-dev-10-0 \
-        cuda-cusparse-dev-10-0 \
-        libcudnn7=7.4.1.5-1+cuda10.0 \
-        libcudnn7-dev=7.4.1.5-1+cuda10.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -25,24 +35,26 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         wget \
         git \
         && \
-    find /usr/local/cuda-10.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
         && apt-get update \
-        && apt-get install -y --no-install-recommends libnvinfer-dev=5.0.2-1+cuda10.0 \
-        && rm -rf /var/lib/apt/lists/*
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 ENV TF_NEED_CUDA 1
-ENV TF_NEED_TENSORRT 1
+ENV TF_NEED_TENSORRT 0
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
-ENV TF_CUDA_VERSION=10.0
-ENV TF_CUDNN_VERSION=7
-
-# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
index a6393a3280c6eb7cf2d356b02734865be8eb5a04..041ee87839938b80489f750530c47c8519ab6171 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
@@ -1,29 +1,38 @@
-FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
 
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 # Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-10-0 \
-        cuda-cublas-10-0 \
-        cuda-cufft-10-0 \
-        cuda-curand-10-0 \
-        cuda-cusolver-10-0 \
-        cuda-cusparse-10-0 \
-        libcudnn7=7.4.1.5-1+cuda10.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
+        curl \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         software-properties-common \
         unzip
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
         && apt-get update \
-        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda10.0 \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
         && apt-get clean \
-        && rm -rf /var/lib/apt/lists/*
+        && rm -rf /var/lib/apt/lists/*)
 
 # For CUDA profiling, TensorFlow requires CUPTI.
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
diff --git a/tensorflow/tools/dockerfiles/spec.yml b/tensorflow/tools/dockerfiles/spec.yml
index 19d96e7a3df4468ff82f2029a1945a02b1e58932..6fddfe000c60dadd05ff172d4cb036e648377deb 100644
--- a/tensorflow/tools/dockerfiles/spec.yml
+++ b/tensorflow/tools/dockerfiles/spec.yml
@@ -55,6 +55,8 @@ releases:
         tag_specs:
             - "{ubuntu}{jupyter}"
             - "{ubuntu-devel}{jupyter}"
+            - "{ubuntu-ppc64le}{jupyter}"
+            - "{ubuntu-devel-ppc64le}{jupyter}"
 
 slice_sets:
 
@@ -122,6 +124,70 @@ slice_sets:
           args:
               - CHECKOUT_TF_SRC=1
 
+    ubuntu-ppc64le:
+        - add_to_name: "-ppc64le"
+          dockerfile_exclusive_name: "cpu-ppc64le"
+          dockerfile_subdirectory: "ppc64le"
+          args:
+              - UBUNTU_VERSION=18.04
+          partials:
+              - ubuntu/version
+              - ubuntu/cpu
+              - ubuntu/python
+              - tensorflow-ppc64le
+              - shell
+        - add_to_name: "-gpu-ppc64le"
+          dockerfile_exclusive_name: "gpu-ppc64le"
+          dockerfile_subdirectory: "ppc64le"
+          args:
+              - UBUNTU_VERSION=18.04
+              - ARCH=ppc64le
+              - CUDA=10.0
+              - TF_PACKAGE=tensorflow-gpu
+          partials:
+              - ubuntu/version
+              - ubuntu/nvidia
+              - ubuntu/python
+              - tensorflow-ppc64le
+              - shell
+          tests:
+              - import-gpu.sh
+          test_runtime: nvidia
+
+    ubuntu-devel-ppc64le:
+        - add_to_name: "devel-ppc64le"
+          dockerfile_exclusive_name: "devel-cpu-ppc64le"
+          dockerfile_subdirectory: "ppc64le"
+          partials:
+              - ubuntu/version
+              - ubuntu/devel-cpu
+              - ubuntu/python
+              - ubuntu/bazelbuild
+              - shell
+          tests:
+              - build-cpu.sh
+          args:
+              - UBUNTU_VERSION=18.04
+              - CHECKOUT_TF_SRC=1
+        - add_to_name: "devel-gpu-ppc64le"
+          dockerfile_exclusive_name: "devel-gpu-ppc64le"
+          dockerfile_subdirectory: "ppc64le"
+          args:
+              - UBUNTU_VERSION=18.04
+              - ARCH=ppc64le
+              - CUDA=10.0
+              - LIB_DIR_PREFIX=powerpc64le
+              - CHECKOUT_TF_SRC=1
+          partials:
+              - ubuntu/version
+              - ubuntu/devel-nvidia
+              - ubuntu/python
+              - ubuntu/bazelbuild
+              - shell
+          tests:
+              - build-gpu.sh
+          test_runtime: nvidia
+
     nightly:
         - add_to_name: "nightly"
           partials:
diff --git a/tensorflow/tools/dockerfiles/tools.Dockerfile b/tensorflow/tools/dockerfiles/tools.Dockerfile
index e8929295a5ee397acbe46ebf96894174ca01fca2..a96b2578cba7579c605d25ee6068d2cde278e1f4 100644
--- a/tensorflow/tools/dockerfiles/tools.Dockerfile
+++ b/tensorflow/tools/dockerfiles/tools.Dockerfile
@@ -17,7 +17,7 @@
 #
 # You can use this image to quickly develop changes to the Dockerfile assembler
 # or set of TF Docker partials. See README.md for usage instructions.
-FROM debian:stretch
+FROM ubuntu:16.04
 LABEL maintainer="Austin Anderson <angerson@google.com>"
 
 RUN apt-get update && apt-get install -y python3 python3-pip bash curl
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index 83b4bf812881f423195f65cc98dc8f3189af3931..ff32d089514c0b514377e064b5af74555eb273af 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -1681,7 +1681,7 @@ def _get_defined_in(py_object, parser_config):
     path = path[:-1]
 
   # Never include links outside this code base.
-  if path.startswith('..'):
+  if path.startswith('..') or re.search(r'\b_api\b', path):
     return None
 
   if re.match(r'.*/gen_[^/]*\.py$', path):
diff --git a/tensorflow/tools/graph_transforms/fold_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_batch_norms.cc
index 16a0f7d58df66be06224d58de623ee7e2dc41880..f59a7abbea93d7b9c838938689009d4d90c68095 100644
--- a/tensorflow/tools/graph_transforms/fold_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_batch_norms.cc
@@ -37,7 +37,7 @@ Status FoldBatchNorms(const GraphDef& input_graph_def,
       input_graph_def,  // clang-format off
       {"Mul",                // mul_node
         {
-          {"Conv2D|MatMul",  // conv_node
+          {"Conv2D|MatMul|DepthwiseConv2dNative",  // conv_node
             {
               {"*"},         // input_node
               {"Const"},     // weights_node
@@ -72,8 +72,15 @@ Status FoldBatchNorms(const GraphDef& input_graph_def,
 
         // Make sure all the inputs really are vectors, with as many entries as
         // there are columns in the weights.
-        const int weights_cols_index = conv_node.op() == "Conv2D" ? 3 : 1;
-        const int64 weights_cols = weights.shape().dim_size(weights_cols_index);
+        int64 weights_cols;
+        if (conv_node.op() == "Conv2D") {
+          weights_cols = weights.shape().dim_size(3);
+        } else if (conv_node.op() == "DepthwiseConv2dNative") {
+          weights_cols =
+              weights.shape().dim_size(2) * weights.shape().dim_size(3);
+        } else {
+          weights_cols = weights.shape().dim_size(1);
+        }
         if ((mul_values.shape().dims() != 1) ||
             (mul_values.shape().dim_size(0) != weights_cols)) {
           return errors::InvalidArgument(
@@ -82,14 +89,13 @@ Status FoldBatchNorms(const GraphDef& input_graph_def,
         }
 
         // Multiply the original weights by the scale vector.
-        auto weights_matrix = weights.flat_inner_dims<float>();
+        auto weights_vector = weights.flat<float>();
         Tensor scaled_weights(DT_FLOAT, weights.shape());
-        auto scaled_weights_matrix = scaled_weights.flat_inner_dims<float>();
-        for (int64 row = 0; row < weights_matrix.dimension(0); ++row) {
-          for (int64 col = 0; col < weights_cols; ++col) {
-            scaled_weights_matrix(row, col) =
-                weights_matrix(row, col) * mul_values.flat<float>()(col);
-          }
+        auto scaled_weights_vector = scaled_weights.flat<float>();
+        for (int64 row = 0; row < weights_vector.dimension(0); ++row) {
+          scaled_weights_vector(row) =
+              weights_vector(row) *
+              mul_values.flat<float>()(row % weights_cols);
         }
 
         // Construct the new nodes.
diff --git a/tensorflow/tools/graph_transforms/fold_batch_norms_test.cc b/tensorflow/tools/graph_transforms/fold_batch_norms_test.cc
index a5d541feb6f4dbfd5a0f61b171fd05160a6d67c8..885fbd59b7797c35639d0a33dbb895d8589b6b4d 100644
--- a/tensorflow/tools/graph_transforms/fold_batch_norms_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_batch_norms_test.cc
@@ -87,6 +87,57 @@ class FoldBatchNormsTest : public ::testing::Test {
     }
   }
 
+  void TestFoldBatchNormsDepthwiseConv2dNative() {
+    auto root = tensorflow::Scope::NewRootScope();
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+    Tensor input_data(DT_FLOAT, TensorShape({1, 1, 6, 2}));
+    test::FillValues<float>(
+        &input_data, {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f, -1.0f, -4.0f, -2.0f,
+                      -5.0f, -3.0f, -6.0f});
+    Output input_op =
+        Const(root.WithOpName("input_op"), Input::Initializer(input_data));
+
+    Tensor weights_data(DT_FLOAT, TensorShape({1, 2, 2, 2}));
+    test::FillValues<float>(&weights_data,
+                            {1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f});
+    Output weights_op =
+        Const(root.WithOpName("weights_op"), Input::Initializer(weights_data));
+
+    Output conv_op = DepthwiseConv2dNative(root.WithOpName("conv_op"), input_op,
+                                           weights_op, {1, 1, 1, 1}, "VALID");
+
+    Tensor mul_values_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&mul_values_data, {2.0f, 3.0f, 4.0f, 5.0f});
+    Output mul_values_op = Const(root.WithOpName("mul_values"),
+                                 Input::Initializer(mul_values_data));
+
+    Output mul_op = Mul(root.WithOpName("output"), conv_op, mul_values_op);
+
+    GraphDef original_graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&original_graph_def));
+
+    std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(original_session->Create(original_graph_def));
+    std::vector<Tensor> original_outputs;
+    TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
+
+    GraphDef fused_graph_def;
+    TF_ASSERT_OK(
+        FoldBatchNorms(original_graph_def, {{}, {"output"}}, &fused_graph_def));
+
+    std::unique_ptr<Session> fused_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(fused_session->Create(fused_graph_def));
+    std::vector<Tensor> fused_outputs;
+    TF_ASSERT_OK(fused_session->Run({}, {"output"}, {}, &fused_outputs));
+
+    test::ExpectTensorNear<float>(original_outputs[0], fused_outputs[0], 1e-5);
+
+    for (const NodeDef& node : fused_graph_def.node()) {
+      EXPECT_NE("Mul", node.op());
+    }
+  }
+
   void TestFoldBatchNormsConv2DShared() {
     auto root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
@@ -202,6 +253,9 @@ TEST_F(FoldBatchNormsTest, TestFoldBatchNormsConv2D) {
 TEST_F(FoldBatchNormsTest, TestFoldBatchNormsMatMul) {
   TestFoldBatchNormsMatMul();
 }
+TEST_F(FoldBatchNormsTest, TestFoldBatchNormsDepthwiseConv2dNative) {
+  TestFoldBatchNormsDepthwiseConv2dNative();
+}
 
 }  // namespace graph_transforms
 }  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index fd546f812c0dafc5d2e71c94710c3c3f5b75250e..532b4600973cbc2ef2826be1bf551984a1f1f8d6 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -109,24 +109,29 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
                                     const string& conv_output_name,
                                     std::vector<NodeDef>* new_nodes) {
   const NodeDef& conv_node = conv_node_match.node;
-  CHECK_EQ("Conv2D", conv_node.op());
+  // CHECK_EQ("Conv2D", conv_node.op());
   const NodeDef& input_node = conv_node_match.inputs[0].node;
   const NodeDef& weights_node = conv_node_match.inputs[1].node;
   CHECK_EQ("Const", weights_node.op());
 
   Tensor weights = GetNodeTensorAttr(weights_node, "value");
-  const int64 weights_cols = weights.shape().dim_size(3);
+  int64 weights_cols;
+  if (conv_node.op() == "Conv2D") {
+    weights_cols = weights.shape().dim_size(3);
+  } else if (conv_node.op() == "DepthwiseConv2dNative") {
+    weights_cols = weights.shape().dim_size(2) * weights.shape().dim_size(3);
+  } else {
+    weights_cols = weights.shape().dim_size(1);
+  }
   CHECK_EQ(weights_cols, scale_values.size());
 
   // Multiply the original weights by the scale vector.
-  auto weights_matrix = weights.flat_inner_dims<float>();
+  auto weights_vector = weights.flat<float>();
   Tensor scaled_weights(DT_FLOAT, weights.shape());
-  auto scaled_weights_matrix = scaled_weights.flat_inner_dims<float>();
-  for (int64 row = 0; row < weights_matrix.dimension(0); ++row) {
-    for (int64 col = 0; col < weights_cols; ++col) {
-      scaled_weights_matrix(row, col) =
-          weights_matrix(row, col) * scale_values[col];
-    }
+  auto scaled_weights_vector = scaled_weights.flat<float>();
+  for (int64 row = 0; row < weights_vector.dimension(0); ++row) {
+    scaled_weights_vector(row) =
+        weights_vector(row) * scale_values[row % weights_cols];
   }
   // Figure out the remaining bias to add on.
   Tensor bias_offset(DT_FLOAT, {weights_cols});
@@ -158,7 +163,7 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
   NodeDef bias_add_node;
   bias_add_node.set_op("BiasAdd");
   bias_add_node.set_name(conv_output_name);
-  if (conv_node.attr().count("data_format") > 0) {
+  if (!conv_node.attr().count("data_format")) {
     CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node);
   }
   CopyNodeAttr(conv_node, "T", "T", &bias_add_node);
@@ -185,7 +190,7 @@ Status FuseBatchNormWithConv(const NodeMatch& match,
 }
 
 Status FuseBatchNormWithBatchToSpace(const NodeMatch& match,
-                             std::vector<NodeDef>* new_nodes) {
+                                     std::vector<NodeDef>* new_nodes) {
   // Calculate the scale and offset values to apply.
   std::vector<float> scale_values;
   std::vector<float> offset_values;
@@ -200,9 +205,8 @@ Status FuseBatchNormWithBatchToSpace(const NodeMatch& match,
   const NodeDef& conv_node = conv_node_match.node;
 
   string biasadd_name = conv_node.name() + "/biasadd";
-  TF_RETURN_IF_ERROR(
-      FuseScaleOffsetToConvWeights(scale_values, offset_values, conv_node_match,
-                                   biasadd_name , new_nodes));
+  TF_RETURN_IF_ERROR(FuseScaleOffsetToConvWeights(
+      scale_values, offset_values, conv_node_match, biasadd_name, new_nodes));
 
   NodeDef new_batch_to_space_node = batch_to_space_node;
   // reuse batch_norm node name
@@ -292,7 +296,7 @@ Status FoldOldBatchNorms(const GraphDef& input_graph_def,
         current_graph_def,  // clang-format off
       {"BatchNormWithGlobalNormalization|FusedBatchNorm",    // batch_norm_node
         {
-          {"Conv2D",                          // conv_node
+          {"Conv2D|DepthwiseConv2dNative",                          // conv_node
             {
               {"*"},                          // input_node
               {"Const"},                      // weights_node
@@ -325,7 +329,7 @@ Status FoldOldBatchNorms(const GraphDef& input_graph_def,
          {
              {"BatchToSpaceND",                  // batch_to_space_node
               {
-                  {"Conv2D",                     // conv_node
+                  {"Conv2D|DepthwiseConv2dNative",                     // conv_node
                    {
                        {"*"},                    // input_node
                        {"Const"},                // weights_node
@@ -363,13 +367,13 @@ Status FoldOldBatchNorms(const GraphDef& input_graph_def,
         {
           {"ConcatV2|Concat",                     // concat two conv2d.
             {
-              {"Conv2D",                          // conv_node
+              {"Conv2D|DepthwiseConv2dNative",                          // conv_node
                 {
                   {"*"},                          // input_node
                   {"Const"},                      // weights_node
                 }
               },
-              {"Conv2D",                          // conv_node
+              {"Conv2D|DepthwiseConv2dNative",                          // conv_node
                 {
                   {"*"},                          // input_node
                   {"Const"},                      // weights_node
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
index 6c7174926d06460556ce673a5fe738901134543d..c5fa9b16b0c91e6c069462f0663908737bb6f835 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
@@ -121,6 +121,84 @@ class FoldOldBatchNormsTest : public ::testing::Test {
     }
   }
 
+  void TestFoldOldBatchNormsAfterDepthwiseConv2dNative() {
+    auto root = tensorflow::Scope::NewRootScope();
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+    Tensor input_data(DT_FLOAT, TensorShape({1, 1, 6, 2}));
+    test::FillValues<float>(
+        &input_data, {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f, -1.0f, -4.0f, -2.0f,
+                      -5.0f, -3.0f, -6.0f});
+    Output input_op =
+        Const(root.WithOpName("input_op"), Input::Initializer(input_data));
+
+    Tensor weights_data(DT_FLOAT, TensorShape({1, 2, 2, 2}));
+    test::FillValues<float>(&weights_data,
+                            {1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f});
+    Output weights_op =
+        Const(root.WithOpName("weights_op"), Input::Initializer(weights_data));
+
+    Output conv_op = DepthwiseConv2dNative(root.WithOpName("conv_op"), input_op,
+                                           weights_op, {1, 1, 1, 1}, "VALID");
+
+    Tensor mean_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&mean_data, {10.0f, 20.0f, 30.0f, 40.0f});
+    Output mean_op =
+        Const(root.WithOpName("mean_op"), Input::Initializer(mean_data));
+
+    Tensor variance_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&variance_data, {0.25f, 0.5f, 0.75f, 1.0f});
+    Output variance_op = Const(root.WithOpName("variance_op"),
+                               Input::Initializer(variance_data));
+
+    Tensor beta_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&beta_data, {0.1f, 0.6f, 1.1f, 1.6f});
+    Output beta_op =
+        Const(root.WithOpName("beta_op"), Input::Initializer(beta_data));
+
+    Tensor gamma_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&gamma_data, {1.0f, 2.0f, 3.0f, 4.0f});
+    Output gamma_op =
+        Const(root.WithOpName("gamma_op"), Input::Initializer(gamma_data));
+
+    GraphDef original_graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&original_graph_def));
+
+    NodeDef batch_norm_node;
+    batch_norm_node.set_op("BatchNormWithGlobalNormalization");
+    batch_norm_node.set_name("output");
+    AddNodeInput("conv_op", &batch_norm_node);
+    AddNodeInput("mean_op", &batch_norm_node);
+    AddNodeInput("variance_op", &batch_norm_node);
+    AddNodeInput("beta_op", &batch_norm_node);
+    AddNodeInput("gamma_op", &batch_norm_node);
+    SetNodeAttr("T", DT_FLOAT, &batch_norm_node);
+    SetNodeAttr("variance_epsilon", 0.00001f, &batch_norm_node);
+    SetNodeAttr("scale_after_normalization", false, &batch_norm_node);
+    *(original_graph_def.mutable_node()->Add()) = batch_norm_node;
+    original_graph_def.mutable_versions()->set_producer(8);
+
+    std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(original_session->Create(original_graph_def));
+    std::vector<Tensor> original_outputs;
+    TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
+
+    GraphDef fused_graph_def;
+    TF_ASSERT_OK(FoldOldBatchNorms(original_graph_def, {{}, {"output"}},
+                                   &fused_graph_def));
+
+    std::unique_ptr<Session> fused_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(fused_session->Create(fused_graph_def));
+    std::vector<Tensor> fused_outputs;
+    TF_ASSERT_OK(fused_session->Run({}, {"output"}, {}, &fused_outputs));
+
+    test::ExpectTensorNear<float>(original_outputs[0], fused_outputs[0], 1e-5);
+
+    for (const NodeDef& node : fused_graph_def.node()) {
+      EXPECT_NE("BatchNormWithGlobalNormalization", node.op());
+    }
+  }
+
   void TestFoldFusedBatchNorms() {
     auto root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
@@ -198,6 +276,83 @@ class FoldOldBatchNormsTest : public ::testing::Test {
     }
   }
 
+  void TestFoldFusedBatchNormsAfterDepthwiseConv2dNative() {
+    auto root = tensorflow::Scope::NewRootScope();
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+    Tensor input_data(DT_FLOAT, TensorShape({1, 1, 6, 2}));
+    test::FillValues<float>(
+        &input_data, {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f, -1.0f, -4.0f, -2.0f,
+                      -5.0f, -3.0f, -6.0f});
+    Output input_op =
+        Const(root.WithOpName("input_op"), Input::Initializer(input_data));
+
+    Tensor weights_data(DT_FLOAT, TensorShape({1, 2, 2, 2}));
+    test::FillValues<float>(&weights_data,
+                            {1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f});
+    Output weights_op =
+        Const(root.WithOpName("weights_op"), Input::Initializer(weights_data));
+
+    Output conv_op = DepthwiseConv2dNative(root.WithOpName("conv_op"), input_op,
+                                           weights_op, {1, 1, 1, 1}, "VALID");
+
+    Tensor mean_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&mean_data, {10.0f, 20.0f, 30.0f, 40.0f});
+    Output mean_op =
+        Const(root.WithOpName("mean_op"), Input::Initializer(mean_data));
+
+    Tensor variance_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&variance_data, {0.25f, 0.5f, 0.75f, 1.0f});
+    Output variance_op = Const(root.WithOpName("variance_op"),
+                               Input::Initializer(variance_data));
+
+    Tensor beta_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&beta_data, {0.1f, 0.6f, 1.1f, 1.6f});
+    Output beta_op =
+        Const(root.WithOpName("beta_op"), Input::Initializer(beta_data));
+
+    Tensor gamma_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&gamma_data, {1.0f, 2.0f, 3.0f, 4.0f});
+    Output gamma_op =
+        Const(root.WithOpName("gamma_op"), Input::Initializer(gamma_data));
+
+    GraphDef original_graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&original_graph_def));
+
+    NodeDef batch_norm_node;
+    batch_norm_node.set_op("FusedBatchNorm");
+    batch_norm_node.set_name("output");
+    AddNodeInput("conv_op", &batch_norm_node);
+    AddNodeInput("gamma_op", &batch_norm_node);
+    AddNodeInput("beta_op", &batch_norm_node);
+    AddNodeInput("mean_op", &batch_norm_node);
+    AddNodeInput("variance_op", &batch_norm_node);
+    SetNodeAttr("T", DT_FLOAT, &batch_norm_node);
+    SetNodeAttr("epsilon", 0.00001f, &batch_norm_node);
+    SetNodeAttr("is_training", false, &batch_norm_node);
+    *(original_graph_def.mutable_node()->Add()) = batch_norm_node;
+
+    std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(original_session->Create(original_graph_def));
+    std::vector<Tensor> original_outputs;
+    TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
+
+    GraphDef fused_graph_def;
+    TF_ASSERT_OK(FoldOldBatchNorms(original_graph_def, {{}, {"output"}},
+                                   &fused_graph_def));
+
+    std::unique_ptr<Session> fused_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(fused_session->Create(fused_graph_def));
+    std::vector<Tensor> fused_outputs;
+    TF_ASSERT_OK(fused_session->Run({}, {"output"}, {}, &fused_outputs));
+
+    test::ExpectTensorNear<float>(original_outputs[0], fused_outputs[0], 2e-5);
+
+    for (const NodeDef& node : fused_graph_def.node()) {
+      EXPECT_NE("FusedBatchNorm", node.op());
+    }
+  }
+
   void TestFoldFusedBatchNormsWithConcat(const bool split) {
     auto root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
@@ -321,16 +476,17 @@ void TestFoldFusedBatchNormsWithBatchToSpace() {
 
   Tensor block_shape_data(DT_INT32, TensorShape({2}));
   test::FillValues<int32>(&block_shape_data, {1, 2});
-  Output block_shape_op =
-      Const(root.WithOpName("block_shape_op"), Input::Initializer(block_shape_data));
+  Output block_shape_op = Const(root.WithOpName("block_shape_op"),
+                                Input::Initializer(block_shape_data));
 
   Tensor crops_data(DT_INT32, TensorShape({2, 2}));
   test::FillValues<int32>(&crops_data, {0, 0, 0, 1});
   Output crops_op =
       Const(root.WithOpName("crops_op"), Input::Initializer(crops_data));
 
-  Output batch_to_space_op = BatchToSpaceND(root.WithOpName("batch_to_space_op"),
-                                            conv_op, block_shape_op, crops_data);
+  Output batch_to_space_op =
+      BatchToSpaceND(root.WithOpName("batch_to_space_op"), conv_op,
+                     block_shape_op, crops_data);
 
   Tensor mean_data(DT_FLOAT, TensorShape({2}));
   test::FillValues<float>(&mean_data, {10.0f, 20.0f});
@@ -339,8 +495,8 @@ void TestFoldFusedBatchNormsWithBatchToSpace() {
 
   Tensor variance_data(DT_FLOAT, TensorShape({2}));
   test::FillValues<float>(&variance_data, {0.25f, 0.5f});
-  Output variance_op = Const(root.WithOpName("variance_op"),
-                             Input::Initializer(variance_data));
+  Output variance_op =
+      Const(root.WithOpName("variance_op"), Input::Initializer(variance_data));
 
   Tensor beta_data(DT_FLOAT, TensorShape({2}));
   test::FillValues<float>(&beta_data, {0.1f, 0.6f});
@@ -410,5 +566,14 @@ TEST_F(FoldOldBatchNormsTest, TestFoldFusedBatchNormsWithBatchToSpace) {
   TestFoldFusedBatchNormsWithBatchToSpace();
 }
 
+TEST_F(FoldOldBatchNormsTest, TestFoldOldBatchNormsAfterDepthwiseConv2dNative) {
+  TestFoldOldBatchNormsAfterDepthwiseConv2dNative();
+}
+
+TEST_F(FoldOldBatchNormsTest,
+       TestFoldFusedBatchNormsAfterDepthwiseConv2dNative) {
+  TestFoldFusedBatchNormsAfterDepthwiseConv2dNative();
+}
+
 }  // namespace graph_transforms
 }  // namespace tensorflow
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 30a31a35c57257f8aa75c6b32ea4a9a139af21b5..90dfca2b444fd48a340716a43176c1308810ce26 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -77,8 +77,10 @@ COMMON_PIP_DEPS = [
     "//tensorflow/python:meta_graph_testdata",
     "//tensorflow/python:spectral_ops_test_util",
     "//tensorflow/python:util_example_parser_configuration",
+    "//tensorflow/python/data/benchmarks:benchmark_base",
     "//tensorflow/python/data/experimental/kernel_tests/serialization:dataset_serialization_test_base",
     "//tensorflow/python/data/experimental/kernel_tests:stats_dataset_test_base",
+    "//tensorflow/python/data/kernel_tests:filter_test_base",
     "//tensorflow/python/data/kernel_tests:test_base",
     "//tensorflow/python/debug:debug_pip",
     "//tensorflow/python/eager:eager_pip",
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index 952c71c61580fba72dbf1a4b2e1bd836816b1420..3bcc4fc81bdbefcc3c1e5481d5a1c18ee8f15768 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -34,6 +34,7 @@ PIP_PACKAGE_QUERY_EXPRESSION = (
 # pip smoke test.
 BUILD_BLACKLIST = [
     "tensorflow/lite/examples/android",
+    "tensorflow/lite/experimental/objc",
     "tensorflow/lite/experimental/swift",
 ]
 
@@ -88,8 +89,8 @@ DEPENDENCY_BLACKLIST = [
     "//tensorflow/python/feature_column:vocabulary_testdata",
     "//tensorflow/python:framework/test_file_system.so",
     # lite
-    "//tensorflow/lite/experimental/examples/lstm:tflite_lstm",
-    "//tensorflow/lite/experimental/examples/lstm:tflite_lstm.py",
+    "//tensorflow/lite/experimental/examples/lstm:rnn_cell",
+    "//tensorflow/lite/experimental/examples/lstm:rnn_cell.py",
     "//tensorflow/lite/experimental/examples/lstm:unidirectional_sequence_lstm_test",  # pylint:disable=line-too-long
     "//tensorflow/lite/experimental/examples/lstm:unidirectional_sequence_lstm_test.py",  # pylint:disable=line-too-long
     "//tensorflow/lite/python:interpreter",
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index bbe3050513d50c48fbf93676743aeff082cbce30..83081a1ff3868795a8e2be4b33e654056b025f0e 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -51,7 +51,7 @@ REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
-    'google_pasta >= 0.1.1',
+    'google_pasta >= 0.1.2',
     'keras_applications >= 1.0.6',
     'keras_preprocessing >= 1.0.5',
     'numpy >= 1.14.5, < 2.0',
@@ -284,6 +284,7 @@ setup(
         'Programming Language :: Python :: 3.4',
         'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
         'Topic :: Scientific/Engineering',
         'Topic :: Scientific/Engineering :: Mathematics',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
diff --git a/tensorflow/tools/test/BUILD b/tensorflow/tools/test/BUILD
index 4b2026b9472b651f8e0571155dab8952d20aa8b2..ef12226ec001cc2ddcb09980fcf38a0aeb794742 100644
--- a/tensorflow/tools/test/BUILD
+++ b/tensorflow/tools/test/BUILD
@@ -57,6 +57,14 @@ py_binary(
     srcs = ["run_and_gather_logs.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
+    deps = [":run_and_gather_logs_main_lib"],
+)
+
+py_library(
+    name = "run_and_gather_logs_main_lib",
+    srcs = ["run_and_gather_logs.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
         ":run_and_gather_logs_lib",
         "//tensorflow/core:protos_all_py",
diff --git a/tensorflow/tools/test/performance.bzl b/tensorflow/tools/test/performance.bzl
index 3486871080c78dc7a1cc201ea2a4d45ebc342758..97861110346b62659ac97da95727250abaf3b928 100644
--- a/tensorflow/tools/test/performance.bzl
+++ b/tensorflow/tools/test/performance.bzl
@@ -4,60 +4,66 @@ load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 # Create a benchmark test target of a TensorFlow C++ test (tf_cc_*_test)
 def tf_cc_logged_benchmark(
-    name=None,
-    target=None,
-    benchmarks="..",
-    tags=[],
-    test_log_output_prefix="",
-    benchmark_type="cpp_microbenchmark"):
-  if not name:
-    fail("Must provide a name")
-  if not target:
-    fail("Must provide a target")
-  if (not ":" in target
-      or not target.startswith("//")
-      or target.endswith(":all")
-      or target.endswith(".")):
-    fail(" ".join(("Target must be a single well-defined test, e.g.,",
-                   "//path/to:test. Received: %s" % target)))
+        name = None,
+        target = None,
+        benchmarks = "..",
+        tags = [],
+        test_log_output_prefix = "",
+        benchmark_type = "cpp_microbenchmark"):
+    if not name:
+        fail("Must provide a name")
+    if not target:
+        fail("Must provide a target")
+    if (not ":" in target or
+        not target.startswith("//") or
+        target.endswith(":all") or
+        target.endswith(".")):
+        fail(" ".join((
+            "Target must be a single well-defined test, e.g.,",
+            "//path/to:test. Received: %s" % target,
+        )))
 
-  all_tags = (
-    depset(tags) + depset(
-      ["benchmark-test", "local", "manual", "regression-test"])).to_list()
+    all_tags = (
+        depset(tags) + depset(
+            ["benchmark-test", "local", "manual", "regression-test"],
+        )
+    ).to_list()
 
-  tf_py_test(
-      name = name,
-      tags = all_tags,
-      size = "large",
-      srcs = ["//tensorflow/tools/test:run_and_gather_logs"],
-      args = [
-          "--name=//%s:%s" % (native.package_name(), name),
-          "--test_name=" + target,
-          "--test_args=--benchmarks=%s" % benchmarks,
-          "--benchmark_type=%s" % benchmark_type,
-      ],
-      data = [
-        target,
-      ],
-      main = "run_and_gather_logs.py",
-      additional_deps = [
-          "//tensorflow/tools/test:run_and_gather_logs"
-      ])
+    tf_py_test(
+        name = name,
+        tags = all_tags,
+        size = "large",
+        srcs = ["//tensorflow/tools/test:run_and_gather_logs"],
+        args = [
+            "--name=//%s:%s" % (native.package_name(), name),
+            "--test_name=" + target,
+            "--test_args=--benchmarks=%s" % benchmarks,
+            "--benchmark_type=%s" % benchmark_type,
+        ],
+        data = [
+            target,
+        ],
+        main = "run_and_gather_logs.py",
+        additional_deps = [
+            "//tensorflow/tools/test:run_and_gather_logs",
+        ],
+    )
 
 # Create a benchmark test target of a TensorFlow python test (*py_tests)
 def tf_py_logged_benchmark(
-    name=None,
-    target=None,
-    benchmarks="..",
-    tags=[],
-    test_log_output_prefix=""):
-  # For now generating a py benchmark is the same as generating a C++
-  # benchmark target. In the future this may change, so we have
-  # two macros just in case
-  tf_cc_logged_benchmark(
-    name=name,
-    target=target,
-    benchmarks=benchmarks,
-    tags=tags,
-    test_log_output_prefix=test_log_output_prefix,
-    benchmark_type="python_benchmark")
+        name = None,
+        target = None,
+        benchmarks = "..",
+        tags = [],
+        test_log_output_prefix = ""):
+    # For now generating a py benchmark is the same as generating a C++
+    # benchmark target. In the future this may change, so we have
+    # two macros just in case
+    tf_cc_logged_benchmark(
+        name = name,
+        target = target,
+        benchmarks = benchmarks,
+        tags = tags,
+        test_log_output_prefix = test_log_output_prefix,
+        benchmark_type = "python_benchmark",
+    )
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index f6689ba4a3d1091799c691445c6b9aecedd91c1d..937962fa22d781de5b69310c99f91ff78a5d00c1 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -7,7 +7,6 @@ load("//third_party:nccl/nccl_configure.bzl", "nccl_configure")
 load("//third_party/mkl:build_defs.bzl", "mkl_repository")
 load("//third_party/git:git_configure.bzl", "git_configure")
 load("//third_party/py:python_configure.bzl", "python_configure")
-
 load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
 load("//third_party/systemlibs:syslibs_configure.bzl", "syslibs_configure")
 load("//third_party/toolchains/remote:configure.bzl", "remote_execution_configure")
@@ -129,23 +128,23 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "com_google_absl",
         build_file = clean_dep("//third_party:com_google_absl.BUILD"),
-        sha256 = "4c2e4194bbddcb5162933e45fe574d2c4e77a2ef00818b8dac0392459707bfff",
-        strip_prefix = "abseil-cpp-d78310fe5a82f2e0e6e16509ef8079c8d7e4674e",
+        sha256 = "583e5801372a0bb12eb561858532e3bb9a3528f15f65cfc87b2c0f4c1ab1a0ca",
+        strip_prefix = "abseil-cpp-111ca7060a6ff50115ca85b59f6b5d8c8c5e9105",
         urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/d78310fe5a82f2e0e6e16509ef8079c8d7e4674e.tar.gz",
-            "https://github.com/abseil/abseil-cpp/archive/d78310fe5a82f2e0e6e16509ef8079c8d7e4674e.tar.gz",
+            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/111ca7060a6ff50115ca85b59f6b5d8c8c5e9105.tar.gz",
+            "https://github.com/abseil/abseil-cpp/archive/111ca7060a6ff50115ca85b59f6b5d8c8c5e9105.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "eigen_archive",
         build_file = clean_dep("//third_party:eigen.BUILD"),
-        patch_file = clean_dep("//third_party/eigen3:gebp_neon.patch"),
-        sha256 = "48678550a32665331d729be87076e576f2502fff325f5b6c2c78ebf7b1b22c7b",
-        strip_prefix = "eigen-eigen-bcc817c0ba98",
+        patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"),
+        sha256 = "61f0017318a24cf940db14e57febecc524b24a9faa8ff4fa7f9f91630c4cd09d",
+        strip_prefix = "eigen-eigen-5a4931dafc1c",
         urls = [
-            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/bcc817c0ba98.tar.gz",
-            "https://bitbucket.org/eigen/eigen/get/bcc817c0ba98.tar.gz",
+            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/5a4931dafc1c.tar.gz",
+            "https://bitbucket.org/eigen/eigen/get/5a4931dafc1c.tar.gz",
         ],
     )
 
@@ -186,15 +185,15 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "com_github_googlecloudplatform_google_cloud_cpp",
-        sha256 = "886bcba3616d5f362838a2d86ae0198dd3670a84a84c82291cda6c30e14779fc",
-        strip_prefix = "google-cloud-cpp-0.5.0",
+        sha256 = "8e3a302d37f232dec041bf3f3916ca3fa5689216d42112898a4e36581f2f4ce5",
+        strip_prefix = "google-cloud-cpp-0.6.1",
         system_build_file = clean_dep("//third_party/systemlibs:google_cloud_cpp.BUILD"),
         system_link_files = {
             "//third_party/systemlibs:google_cloud_cpp.google.cloud.bigtable.BUILD": "google/cloud/bigtable/BUILD",
         },
         urls = [
-            "https://mirror.bazel.build/github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.5.0.tar.gz",
-            "https://github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.5.0.tar.gz",
+            "https://mirror.bazel.build/github.com/googleapis/google-cloud-cpp/archive/v0.6.1.tar.gz",
+            "https://github.com/googleapis/google-cloud-cpp/archive/v0.6.1.tar.gz",
         ],
     )
 
@@ -478,12 +477,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     # WARNING: make sure ncteisen@ and vpai@ are cc-ed on any CL to change the below rule
     tf_http_archive(
         name = "grpc",
-        sha256 = "1aa84387232dda273ea8fdfe722622084f72c16f7b84bfc519ac7759b71cdc91",
-        strip_prefix = "grpc-69b6c047bc767b4d80e7af4d00ccb7c45b683dae",
+        sha256 = "e1e3a9edbfbe4230bee174d4aa45a15c1ec2b203cedb02d20df3e6345d8fa63e",
+        strip_prefix = "grpc-62688b6a05cc85b47fb77dd408611734253e47e2",
         system_build_file = clean_dep("//third_party/systemlibs:grpc.BUILD"),
         urls = [
-            "https://mirror.bazel.build/github.com/grpc/grpc/archive/69b6c047bc767b4d80e7af4d00ccb7c45b683dae.tar.gz",
-            "https://github.com/grpc/grpc/archive/69b6c047bc767b4d80e7af4d00ccb7c45b683dae.tar.gz",
+            "https://mirror.bazel.build/github.com/grpc/grpc/archive/62688b6a05cc85b47fb77dd408611734253e47e2.tar.gz",
+            "https://github.com/grpc/grpc/archive/62688b6a05cc85b47fb77dd408611734253e47e2.tar.gz",
         ],
     )
 
@@ -514,11 +513,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "llvm",
         build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
-        sha256 = "ec524f6c8e4d7514b1753131e844c73df4cef1a22d8773b0e43ccd9f813bbd48",
-        strip_prefix = "llvm-ba3936b0ea89935ab8ea943c1b372d69923691d9",
+        sha256 = "8399a5e0111f52c5d8d3df3638cb997b3f01e1bc7288ee95d77e97b62a47ba18",
+        strip_prefix = "llvm-f2cd363b4b915b54ba0f25066f876c16d59fede6",
         urls = [
-            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/ba3936b0ea89935ab8ea943c1b372d69923691d9.tar.gz",
-            "https://github.com/llvm-mirror/llvm/archive/ba3936b0ea89935ab8ea943c1b372d69923691d9.tar.gz",
+            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/f2cd363b4b915b54ba0f25066f876c16d59fede6.tar.gz",
+            "https://github.com/llvm-mirror/llvm/archive/f2cd363b4b915b54ba0f25066f876c16d59fede6.tar.gz",
         ],
     )
 
@@ -718,16 +717,6 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         ],
     )
 
-    tf_http_archive(
-        name = "bazel_toolchains",
-        sha256 = "07dfbe80638eb1fe681f7c07e61b34b579c6710c691e49ee90ccdc6e9e75ebbb",
-        strip_prefix = "bazel-toolchains-9a111bd82161c1fbe8ed17a593ca1023fd941c70",
-        urls = [
-            "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/9a111bd82161c1fbe8ed17a593ca1023fd941c70.tar.gz",
-            "https://github.com/bazelbuild/bazel-toolchains/archive/9a111bd82161c1fbe8ed17a593ca1023fd941c70.tar.gz",
-        ],
-    )
-
     tf_http_archive(
         name = "arm_neon_2_x86_sse",
         build_file = clean_dep("//third_party:arm_neon_2_x86_sse.BUILD"),
diff --git a/third_party/android/android.bzl.tpl b/third_party/android/android.bzl.tpl
index e6ed4994f3ba6d721d717a04b0bd22f54dbb1d79..c498f585abfa5212464cdb91e839165643d9b005 100644
--- a/third_party/android/android.bzl.tpl
+++ b/third_party/android/android.bzl.tpl
@@ -1,9 +1,9 @@
 """Set up configurable Android SDK and NDK dependencies."""
 
 def android_workspace():
-  # String for replacement in Bazel template.
-  # These will either be replaced by android_sdk_repository if various ENV
-  # variables are set when `local_config_android` repo_rule is run, or they
-  # will be replaced by noops otherwise.
-  MAYBE_ANDROID_SDK_REPOSITORY
-  MAYBE_ANDROID_NDK_REPOSITORY
+    # String for replacement in Bazel template.
+    # These will either be replaced by android_sdk_repository if various ENV
+    # variables are set when `local_config_android` repo_rule is run, or they
+    # will be replaced by noops otherwise.
+    MAYBE_ANDROID_SDK_REPOSITORY
+    MAYBE_ANDROID_NDK_REPOSITORY
diff --git a/third_party/android/android_configure.bzl b/third_party/android/android_configure.bzl
index da09bdf39eed90b648ca8f47c79d16e3ec3804bb..646ed732a1c7c765a67e1a2d3daa1183cd6aede0 100644
--- a/third_party/android/android_configure.bzl
+++ b/third_party/android/android_configure.bzl
@@ -36,33 +36,39 @@ _ANDROID_NDK_REPO_TEMPLATE = """
 """
 
 def _android_autoconf_impl(repository_ctx):
-  """Implementation of the android_autoconf repository rule."""
-  sdk_home = repository_ctx.os.environ.get(_ANDROID_SDK_HOME)
-  sdk_api_level = repository_ctx.os.environ.get(_ANDROID_SDK_API_VERSION)
-  build_tools_version = repository_ctx.os.environ.get(
-      _ANDROID_BUILD_TOOLS_VERSION)
-  ndk_home = repository_ctx.os.environ.get(_ANDROID_NDK_HOME)
-  ndk_api_level = repository_ctx.os.environ.get(_ANDROID_NDK_API_VERSION)
+    """Implementation of the android_autoconf repository rule."""
+    sdk_home = repository_ctx.os.environ.get(_ANDROID_SDK_HOME)
+    sdk_api_level = repository_ctx.os.environ.get(_ANDROID_SDK_API_VERSION)
+    build_tools_version = repository_ctx.os.environ.get(
+        _ANDROID_BUILD_TOOLS_VERSION,
+    )
+    ndk_home = repository_ctx.os.environ.get(_ANDROID_NDK_HOME)
+    ndk_api_level = repository_ctx.os.environ.get(_ANDROID_NDK_API_VERSION)
 
-  sdk_rule = "pass"
-  if all([sdk_home, sdk_api_level, build_tools_version]):
-    sdk_rule = _ANDROID_SDK_REPO_TEMPLATE % (
-        sdk_home, sdk_api_level, build_tools_version)
+    sdk_rule = "pass"
+    if all([sdk_home, sdk_api_level, build_tools_version]):
+        sdk_rule = _ANDROID_SDK_REPO_TEMPLATE % (
+            sdk_home,
+            sdk_api_level,
+            build_tools_version,
+        )
 
-  ndk_rule = "pass"
-  if all([ndk_home, ndk_api_level]):
-    ndk_rule = _ANDROID_NDK_REPO_TEMPLATE % (ndk_home, ndk_api_level)
+    ndk_rule = "pass"
+    if all([ndk_home, ndk_api_level]):
+        ndk_rule = _ANDROID_NDK_REPO_TEMPLATE % (ndk_home, ndk_api_level)
 
-  repository_ctx.template(
-      "BUILD",
-      Label("//third_party/android:android_configure.BUILD.tpl"))
-  repository_ctx.template(
-      "android.bzl",
-      Label("//third_party/android:android.bzl.tpl"),
-      substitutions={
-          "MAYBE_ANDROID_SDK_REPOSITORY": sdk_rule,
-          "MAYBE_ANDROID_NDK_REPOSITORY": ndk_rule,
-      })
+    repository_ctx.template(
+        "BUILD",
+        Label("//third_party/android:android_configure.BUILD.tpl"),
+    )
+    repository_ctx.template(
+        "android.bzl",
+        Label("//third_party/android:android.bzl.tpl"),
+        substitutions = {
+            "MAYBE_ANDROID_SDK_REPOSITORY": sdk_rule,
+            "MAYBE_ANDROID_NDK_REPOSITORY": ndk_rule,
+        },
+    )
 
 android_configure = repository_rule(
     implementation = _android_autoconf_impl,
diff --git a/third_party/common.bzl b/third_party/common.bzl
index db981a5e314c08f8612578161fcc1f63b4662a69..8134bf3d2562f5405ece9f50537b88d7fb27865f 100644
--- a/third_party/common.bzl
+++ b/third_party/common.bzl
@@ -21,11 +21,11 @@
 #   substitutions: A dictionary mapping strings to their substitutions
 
 def template_rule_impl(ctx):
-  ctx.template_action(
-      template = ctx.file.src,
-      output = ctx.outputs.out,
-      substitutions = ctx.attr.substitutions,
-  )
+    ctx.template_action(
+        template = ctx.file.src,
+        output = ctx.outputs.out,
+        substitutions = ctx.attr.substitutions,
+    )
 
 template_rule = rule(
     attrs = {
diff --git a/third_party/eigen3/gebp_neon.patch b/third_party/eigen3/gebp_neon.patch
deleted file mode 100644
index d0022e94ccd40c8efe1423926b418531b0fb3eba..0000000000000000000000000000000000000000
--- a/third_party/eigen3/gebp_neon.patch
+++ /dev/null
@@ -1,11 +0,0 @@
---- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h	2019-01-22 20:46:51.000000000 -0800
-+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h	2019-01-25 13:48:49.000000000 -0800
-@@ -1031,7 +1031,7 @@
- 
-   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-   {
--    c += a * b;
-+    c = vfmaq_n_f32(c, a, b);
-   }
- 
-   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
diff --git a/third_party/eigen3/gpu_packet_math.patch b/third_party/eigen3/gpu_packet_math.patch
new file mode 100644
index 0000000000000000000000000000000000000000..06347538e179383bbe991a49506b1656a5d21ab3
--- /dev/null
+++ b/third_party/eigen3/gpu_packet_math.patch
@@ -0,0 +1,18 @@
+--- a/Eigen/src/Core/arch/GPU/PacketMath.h
++++ b/Eigen/src/Core/arch/GPU/PacketMath.h
+@@ -100,6 +100,7 @@
+   return make_double2(from, from);
+ }
+ 
++#if defined(EIGEN_CUDA_ARCH)
+ namespace {
+ 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
+@@ -211,6 +212,7 @@
+ pcmp_eq<double2>(const double2& a, const double2& b) {
+   return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
+ }
++#endif  // EIGEN_CUDA_ARCH
+ 
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
+   return make_float4(a, a+1, a+2, a+3);
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
index 223ea4d58bf4c40b2790e2f5d73e2a4fc1a79eec..8df6782551779acded8b74fab30dc78f74c51948 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
@@ -178,25 +178,37 @@ template <>
 struct unpacket_traits<Packet32q8i> {
   typedef QInt8 type;
   typedef Packet16q8i half;
-  enum { size = 32, alignment = Aligned32 };
+  enum { size = 32, alignment = Aligned32, vectorizable = true };
+};
+template <>
+struct unpacket_traits<Packet16q8i> {
+  typedef QInt8 type;
+  typedef Packet16q8i half;
+  enum { size = 16, alignment = Aligned32, vectorizable = true };
 };
 template <>
 struct unpacket_traits<Packet16q16i> {
   typedef QInt16 type;
   typedef Packet8q16i half;
-  enum { size = 16, alignment = Aligned32 };
+  enum { size = 16, alignment = Aligned32, vectorizable = true };
+};
+template <>
+struct unpacket_traits<Packet8q16i> {
+  typedef QInt16 type;
+  typedef Packet8q16i half;
+  enum { size = 8, alignment = Aligned32, vectorizable = true };
 };
 template <>
 struct unpacket_traits<Packet32q8u> {
   typedef QUInt8 type;
   typedef Packet16q8u half;
-  enum { size = 32, alignment = Aligned32 };
+  enum { size = 32, alignment = Aligned32, vectorizable = true };
 };
 template <>
 struct unpacket_traits<Packet8q32i> {
   typedef QInt32 type;
   typedef Packet4q32i half;
-  enum { size = 8, alignment = Aligned32 };
+  enum { size = 8, alignment = Aligned32, vectorizable = true };
 };
 
 // Unaligned load
@@ -206,6 +218,11 @@ EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) {
       reinterpret_cast<const __m256i*>(from));
 }
 template <>
+EIGEN_STRONG_INLINE Packet16q8i ploadu<Packet16q8i>(const QInt8* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
+      reinterpret_cast<const __m128i*>(from));
+}
+template <>
 EIGEN_STRONG_INLINE Packet32q8u ploadu<Packet32q8u>(const QUInt8* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
       reinterpret_cast<const __m256i*>(from));
@@ -215,6 +232,11 @@ EIGEN_STRONG_INLINE Packet16q16i ploadu<Packet16q16i>(const QInt16* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
       reinterpret_cast<const __m256i*>(from));
 }
+template<>
+EIGEN_STRONG_INLINE Packet8q16i ploadu<Packet8q16i>(const QInt16* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
+      reinterpret_cast<const __m128i*>(from));
+}
 template <>
 EIGEN_STRONG_INLINE Packet8q32i ploadu<Packet8q32i>(const QInt32* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
@@ -228,6 +250,11 @@ EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) {
       reinterpret_cast<const __m256i*>(from));
 }
 template <>
+EIGEN_STRONG_INLINE Packet16q8i pload<Packet16q8i>(const QInt8* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
+      reinterpret_cast<const __m128i*>(from));
+}
+template <>
 EIGEN_STRONG_INLINE Packet32q8u pload<Packet32q8u>(const QUInt8* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
       reinterpret_cast<const __m256i*>(from));
@@ -238,6 +265,11 @@ EIGEN_STRONG_INLINE Packet16q16i pload<Packet16q16i>(const QInt16* from) {
       reinterpret_cast<const __m256i*>(from));
 }
 template <>
+EIGEN_STRONG_INLINE Packet8q16i pload<Packet8q16i>(const QInt16* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
+      reinterpret_cast<const __m128i*>(from));
+}
+template <>
 EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
       reinterpret_cast<const __m256i*>(from));
@@ -250,6 +282,11 @@ EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) {
       reinterpret_cast<__m256i*>(to), from.val);
 }
 template <>
+EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(
+      reinterpret_cast<__m128i*>(to), from.val);
+}
+template <>
 EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) {
   EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
       reinterpret_cast<__m256i*>(to), from.val);
@@ -260,6 +297,11 @@ EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet16q16i& from) {
       reinterpret_cast<__m256i*>(to), from.val);
 }
 template <>
+EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet8q16i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(
+      reinterpret_cast<__m128i*>(to), from.val);
+}
+template <>
 EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) {
   EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
       reinterpret_cast<__m256i*>(to), from.val);
@@ -277,6 +319,11 @@ EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet16q16i& from) {
                                                from.val);
 }
 template <>
+EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet8q16i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
+                                            from.val);
+}
+template <>
 EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) {
   EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
                                                from.val);
@@ -286,6 +333,11 @@ EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) {
   EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
                                                from.val);
 }
+template <>
+EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
+                                            from.val);
+}
 
 // Extract first element.
 template <>
diff --git a/third_party/git/git_configure.bzl b/third_party/git/git_configure.bzl
index 8e2839bdc254acb42cf551cf561c40f8402c311a..fc18fdb98837e7ebf9ca812412a95dac8a2a7984 100644
--- a/third_party/git/git_configure.bzl
+++ b/third_party/git/git_configure.bzl
@@ -8,49 +8,57 @@
 _PYTHON_BIN_PATH = "PYTHON_BIN_PATH"
 
 def _fail(msg):
-  """Output failure message when auto configuration fails."""
-  red = "\033[0;31m"
-  no_color = "\033[0m"
-  fail("%sGit Configuration Error:%s %s\n" % (red, no_color, msg))
+    """Output failure message when auto configuration fails."""
+    red = "\033[0;31m"
+    no_color = "\033[0m"
+    fail("%sGit Configuration Error:%s %s\n" % (red, no_color, msg))
 
 def _get_python_bin(repository_ctx):
-  """Gets the python bin path."""
-  python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
-  if python_bin != None:
-    return python_bin
-  python_bin_path = repository_ctx.which("python")
-  if python_bin_path != None:
-    return str(python_bin_path)
-  _fail("Cannot find python in PATH, please make sure " +
-        "python is installed and add its directory in PATH, or --define " +
-        "%s='/something/else'.\nPATH=%s" % (
-            _PYTHON_BIN_PATH, repository_ctx.os.environ.get("PATH", "")))
-
+    """Gets the python bin path."""
+    python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
+    if python_bin != None:
+        return python_bin
+    python_bin_path = repository_ctx.which("python")
+    if python_bin_path != None:
+        return str(python_bin_path)
+    _fail("Cannot find python in PATH, please make sure " +
+          "python is installed and add its directory in PATH, or --define " +
+          "%s='/something/else'.\nPATH=%s" % (
+              _PYTHON_BIN_PATH,
+              repository_ctx.os.environ.get("PATH", ""),
+          ))
 
 def _git_conf_impl(repository_ctx):
-  repository_ctx.template(
-      "BUILD",
-      Label("//third_party/git:BUILD.tpl"))
-
-  tensorflow_root_path = str(repository_ctx.path(
-      Label("@org_tensorflow//:BUILD")))[:-len("BUILD")]
-  python_script_path = repository_ctx.path(
-      Label("@org_tensorflow//tensorflow/tools/git:gen_git_source.py"))
-  generated_files_path = repository_ctx.path("gen")
+    repository_ctx.template(
+        "BUILD",
+        Label("//third_party/git:BUILD.tpl"),
+    )
 
-  r = repository_ctx.execute(
-      ["test", "-f", "%s/.git/logs/HEAD" % tensorflow_root_path])
-  if r.return_code == 0:
-    unused_var = repository_ctx.path(Label("//:.git/HEAD")) # pylint: disable=unused-variable
+    tensorflow_root_path = str(repository_ctx.path(
+        Label("@org_tensorflow//:BUILD"),
+    ))[:-len("BUILD")]
+    python_script_path = repository_ctx.path(
+        Label("@org_tensorflow//tensorflow/tools/git:gen_git_source.py"),
+    )
+    generated_files_path = repository_ctx.path("gen")
 
-  result = repository_ctx.execute([
-      _get_python_bin(repository_ctx),
-      python_script_path, "--configure", tensorflow_root_path,
-      "--gen_root_path", generated_files_path], quiet=False)
+    r = repository_ctx.execute(
+        ["test", "-f", "%s/.git/logs/HEAD" % tensorflow_root_path],
+    )
+    if r.return_code == 0:
+        unused_var = repository_ctx.path(Label("//:.git/HEAD"))  # pylint: disable=unused-variable
 
-  if not result.return_code == 0:
-    _fail(result.stderr)
+    result = repository_ctx.execute([
+        _get_python_bin(repository_ctx),
+        python_script_path,
+        "--configure",
+        tensorflow_root_path,
+        "--gen_root_path",
+        generated_files_path,
+    ], quiet = False)
 
+    if not result.return_code == 0:
+        _fail(result.stderr)
 
 git_configure = repository_rule(
     implementation = _git_conf_impl,
diff --git a/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl b/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl
index 0e175b3ef66b95c963cdface9417b64ede728bd2..221ccb2fe404f2b254d4f2c6c35b1b8d6f81e9b3 100644
--- a/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl
+++ b/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl
@@ -152,7 +152,11 @@ toolchain {
     # Removal of unused code and data at link time (can this increase binary size in some cases?).
     compiler_flag: "-ffunction-sections"
     compiler_flag: "-fdata-sections"
-    linker_flag: "-Wl,--gc-sections"
+
+    # With hipcc -fno-gpu-rdc, objects and libraries would contain a .kernel
+    # section which would be removed by ld. Therefore --gc-sections shall be
+    # abolished here.
+    #linker_flag: "-Wl,--gc-sections"
   }
   linking_mode_flags { mode: DYNAMIC }
 }
diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
index 57c7b635c42df033bd9493b13eb076900cf57a72..c159531263270d21d78a9faac579b00596889fc6 100755
--- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
+++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
@@ -30,6 +30,12 @@ GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}')
 
 HIPCC_PATH = '%{hipcc_path}'
 PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
+HIPCC_ENV = '%{hipcc_env}'
+HIP_RUNTIME_PATH = '%{hip_runtime_path}'
+HIP_RUNTIME_LIBRARY = '%{hip_runtime_library}'
+HCC_RUNTIME_PATH = '%{hcc_runtime_path}'
+HCC_RUNTIME_LIBRARY = '%{hcc_runtime_library}'
+VERBOSE = '%{crosstool_verbose}'=='1'
 
 def Log(s):
   print('gpus/crosstool: {0}'.format(s))
@@ -165,6 +171,11 @@ def InvokeHipcc(argv, log=False):
 
   hipccopts = ' '
   hipccopts += ' ' + hipcc_compiler_options
+  # Use -fno-gpu-rdc by default for early GPU kernel finalization
+  # This flag would trigger GPU kernels be generated at compile time, instead
+  # of link time. This allows the default host compiler (gcc) be used as the
+  # linker for TensorFlow on ROCm platform.
+  hipccopts += ' -fno-gpu-rdc '
   hipccopts += undefines
   hipccopts += defines
   hipccopts += std_options
@@ -205,37 +216,46 @@ def main():
   args, leftover = parser.parse_known_args(sys.argv[1:])
 
   if args.x and args.x[0] == 'rocm':
-    # XXX use hipcc to link
-    if args.pass_exit_codes:
-      gpu_compiler_flags = [flag for flag in sys.argv[1:]
-                                 if not flag.startswith(('-pass-exit-codes'))]
-
-      # special handling for $ORIGIN
-      # - guard every argument with ''
-      modified_gpu_compiler_flags = []
-      for flag in gpu_compiler_flags:
-        modified_gpu_compiler_flags.append("'" + flag + "'")
-
-      if args.rocm_log: Log('Link with hipcc: %s' % (' '.join([HIPCC_PATH] + modified_gpu_compiler_flags)))
-      return subprocess.call([HIPCC_PATH] + modified_gpu_compiler_flags)
-
+    # compilation for GPU objects
     if args.rocm_log: Log('-x rocm')
     leftover = [pipes.quote(s) for s in leftover]
     if args.rocm_log: Log('using hipcc')
     return InvokeHipcc(leftover, log=args.rocm_log)
 
-  # Strip our flags before passing through to the CPU compiler for files which
-  # are not -x rocm. We can't just pass 'leftover' because it also strips -x.
-  # We not only want to pass -x to the CPU compiler, but also keep it in its
-  # relative location in the argv list (the compiler is actually sensitive to
-  # this).
-  cpu_compiler_flags = [flag for flag in sys.argv[1:]
-                             if not flag.startswith(('--rocm_log'))]
+  elif args.pass_exit_codes:
+    # link
+    # with hipcc compiler invoked with -fno-gpu-rdc by default now, it's ok to 
+    # use host compiler as linker, but we have to link with HCC/HIP runtime.
+    # Such restriction would be revised further as the bazel script get
+    # improved to fine tune dependencies to ROCm libraries.
+    gpu_linker_flags = [flag for flag in sys.argv[1:]
+                               if not flag.startswith(('--rocm_log'))]
+
+    gpu_linker_flags.append('-L' + HCC_RUNTIME_PATH)
+    gpu_linker_flags.append('-Wl,-rpath=' + HCC_RUNTIME_PATH)
+    gpu_linker_flags.append('-l' + HCC_RUNTIME_LIBRARY)
+    gpu_linker_flags.append('-L' + HIP_RUNTIME_PATH)
+    gpu_linker_flags.append('-Wl,-rpath=' + HIP_RUNTIME_PATH)
+    gpu_linker_flags.append('-l' + HIP_RUNTIME_LIBRARY)
+
+    if VERBOSE: print(' '.join([CPU_COMPILER] + gpu_linker_flags))
+    return subprocess.call([CPU_COMPILER] + gpu_linker_flags)
 
-  # XXX: SE codes need to be built with gcc, but need this macro defined
-  cpu_compiler_flags.append("-D__HIP_PLATFORM_HCC__")
-
-  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+  else:
+    # compilation for host objects
+
+    # Strip our flags before passing through to the CPU compiler for files which
+    # are not -x rocm. We can't just pass 'leftover' because it also strips -x.
+    # We not only want to pass -x to the CPU compiler, but also keep it in its
+    # relative location in the argv list (the compiler is actually sensitive to
+    # this).
+    cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                               if not flag.startswith(('--rocm_log'))]
+
+    # XXX: SE codes need to be built with gcc, but need this macro defined
+    cpu_compiler_flags.append("-D__HIP_PLATFORM_HCC__")
+    if VERBOSE: print(' '.join([CPU_COMPILER] + cpu_compiler_flags))
+    return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
 
 if __name__ == '__main__':
   sys.exit(main())
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 40c5e373ede149494918643509f424d4f352ebc5..f13aacb93d59a4a61c8a1734385962b94e8a59a1 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -126,141 +126,142 @@ load(
 )
 
 def _get_python_bin(repository_ctx):
-  """Gets the python bin path."""
-  python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
-  if python_bin != None:
-    return python_bin
-  python_bin_name = "python.exe" if _is_windows(repository_ctx) else "python"
-  python_bin_path = repository_ctx.which(python_bin_name)
-  if python_bin_path != None:
-    return str(python_bin_path)
-  auto_configure_fail(
-      "Cannot find python in PATH, please make sure " +
-      "python is installed and add its directory in PATH, or --define " +
-      "%s='/something/else'.\nPATH=%s" % (
-          _PYTHON_BIN_PATH,
-          repository_ctx.os.environ.get("PATH", ""),
-      ))
-
+    """Gets the python bin path."""
+    python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
+    if python_bin != None:
+        return python_bin
+    python_bin_name = "python.exe" if _is_windows(repository_ctx) else "python"
+    python_bin_path = repository_ctx.which(python_bin_name)
+    if python_bin_path != None:
+        return str(python_bin_path)
+    auto_configure_fail(
+        "Cannot find python in PATH, please make sure " +
+        "python is installed and add its directory in PATH, or --define " +
+        "%s='/something/else'.\nPATH=%s" % (
+            _PYTHON_BIN_PATH,
+            repository_ctx.os.environ.get("PATH", ""),
+        ),
+    )
 
 def _get_nvcc_tmp_dir_for_windows(repository_ctx):
-  """Return the tmp directory for nvcc to generate intermediate source files."""
-  escaped_tmp_dir = escape_string(
-      get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace(
-          "\\", "\\\\"),)
-  return escaped_tmp_dir + "\\\\nvcc_inter_files_tmp_dir"
-
+    """Return the tmp directory for nvcc to generate intermediate source files."""
+    escaped_tmp_dir = escape_string(
+        get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace(
+            "\\",
+            "\\\\",
+        ),
+    )
+    return escaped_tmp_dir + "\\\\nvcc_inter_files_tmp_dir"
 
 def _get_msvc_compiler(repository_ctx):
-  vc_path = find_vc_path(repository_ctx)
-  return find_msvc_tool(repository_ctx, vc_path, "cl.exe").replace("\\", "/")
-
+    vc_path = find_vc_path(repository_ctx)
+    return find_msvc_tool(repository_ctx, vc_path, "cl.exe").replace("\\", "/")
 
 def _get_win_cuda_defines(repository_ctx):
-  """Return CROSSTOOL defines for Windows"""
+    """Return CROSSTOOL defines for Windows"""
+
+    # If we are not on Windows, return empty vaules for Windows specific fields.
+    # This ensures the CROSSTOOL file parser is happy.
+    if not _is_windows(repository_ctx):
+        return {
+            "%{msvc_env_tmp}": "",
+            "%{msvc_env_path}": "",
+            "%{msvc_env_include}": "",
+            "%{msvc_env_lib}": "",
+            "%{msvc_cl_path}": "",
+            "%{msvc_ml_path}": "",
+            "%{msvc_link_path}": "",
+            "%{msvc_lib_path}": "",
+            "%{cxx_builtin_include_directory}": "",
+        }
+
+    vc_path = find_vc_path(repository_ctx)
+    if not vc_path:
+        auto_configure_fail(
+            "Visual C++ build tools not found on your machine." +
+            "Please check your installation following https://docs.bazel.build/versions/master/windows.html#using",
+        )
+        return {}
+
+    env = setup_vc_env_vars(repository_ctx, vc_path)
+    escaped_paths = escape_string(env["PATH"])
+    escaped_include_paths = escape_string(env["INCLUDE"])
+    escaped_lib_paths = escape_string(env["LIB"])
+    escaped_tmp_dir = escape_string(
+        get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace(
+            "\\",
+            "\\\\",
+        ),
+    )
+
+    msvc_cl_path = _get_python_bin(repository_ctx)
+    msvc_ml_path = find_msvc_tool(repository_ctx, vc_path, "ml64.exe").replace(
+        "\\",
+        "/",
+    )
+    msvc_link_path = find_msvc_tool(repository_ctx, vc_path, "link.exe").replace(
+        "\\",
+        "/",
+    )
+    msvc_lib_path = find_msvc_tool(repository_ctx, vc_path, "lib.exe").replace(
+        "\\",
+        "/",
+    )
+
+    # nvcc will generate some temporary source files under %{nvcc_tmp_dir}
+    # The generated files are guranteed to have unique name, so they can share the same tmp directory
+    escaped_cxx_include_directories = [
+        "cxx_builtin_include_directory: \"%s\"" %
+        _get_nvcc_tmp_dir_for_windows(repository_ctx),
+    ]
+    for path in escaped_include_paths.split(";"):
+        if path:
+            escaped_cxx_include_directories.append(
+                "cxx_builtin_include_directory: \"%s\"" % path,
+            )
 
-  # If we are not on Windows, return empty vaules for Windows specific fields.
-  # This ensures the CROSSTOOL file parser is happy.
-  if not _is_windows(repository_ctx):
     return {
-        "%{msvc_env_tmp}": "",
-        "%{msvc_env_path}": "",
-        "%{msvc_env_include}": "",
-        "%{msvc_env_lib}": "",
-        "%{msvc_cl_path}": "",
-        "%{msvc_ml_path}": "",
-        "%{msvc_link_path}": "",
-        "%{msvc_lib_path}": "",
-        "%{cxx_builtin_include_directory}": "",
+        "%{msvc_env_tmp}": escaped_tmp_dir,
+        "%{msvc_env_path}": escaped_paths,
+        "%{msvc_env_include}": escaped_include_paths,
+        "%{msvc_env_lib}": escaped_lib_paths,
+        "%{msvc_cl_path}": msvc_cl_path,
+        "%{msvc_ml_path}": msvc_ml_path,
+        "%{msvc_link_path}": msvc_link_path,
+        "%{msvc_lib_path}": msvc_lib_path,
+        "%{cxx_builtin_include_directory}": "\n".join(escaped_cxx_include_directories),
     }
 
-  vc_path = find_vc_path(repository_ctx)
-  if not vc_path:
-    auto_configure_fail(
-        "Visual C++ build tools not found on your machine." +
-        "Please check your installation following https://docs.bazel.build/versions/master/windows.html#using"
-    )
-    return {}
-
-  env = setup_vc_env_vars(repository_ctx, vc_path)
-  escaped_paths = escape_string(env["PATH"])
-  escaped_include_paths = escape_string(env["INCLUDE"])
-  escaped_lib_paths = escape_string(env["LIB"])
-  escaped_tmp_dir = escape_string(
-      get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace(
-          "\\", "\\\\"),)
-
-  msvc_cl_path = _get_python_bin(repository_ctx)
-  msvc_ml_path = find_msvc_tool(repository_ctx, vc_path, "ml64.exe").replace(
-      "\\", "/")
-  msvc_link_path = find_msvc_tool(repository_ctx, vc_path, "link.exe").replace(
-      "\\", "/")
-  msvc_lib_path = find_msvc_tool(repository_ctx, vc_path, "lib.exe").replace(
-      "\\", "/")
-
-  # nvcc will generate some temporary source files under %{nvcc_tmp_dir}
-  # The generated files are guranteed to have unique name, so they can share the same tmp directory
-  escaped_cxx_include_directories = [
-      "cxx_builtin_include_directory: \"%s\"" %
-      _get_nvcc_tmp_dir_for_windows(repository_ctx)
-  ]
-  for path in escaped_include_paths.split(";"):
-    if path:
-      escaped_cxx_include_directories.append(
-          "cxx_builtin_include_directory: \"%s\"" % path)
-
-  return {
-      "%{msvc_env_tmp}":
-          escaped_tmp_dir,
-      "%{msvc_env_path}":
-          escaped_paths,
-      "%{msvc_env_include}":
-          escaped_include_paths,
-      "%{msvc_env_lib}":
-          escaped_lib_paths,
-      "%{msvc_cl_path}":
-          msvc_cl_path,
-      "%{msvc_ml_path}":
-          msvc_ml_path,
-      "%{msvc_link_path}":
-          msvc_link_path,
-      "%{msvc_lib_path}":
-          msvc_lib_path,
-      "%{cxx_builtin_include_directory}":
-          "\n".join(escaped_cxx_include_directories),
-  }
-
 # TODO(dzc): Once these functions have been factored out of Bazel's
 # cc_configure.bzl, load them from @bazel_tools instead.
 # BEGIN cc_configure common functions.
 def find_cc(repository_ctx):
-  """Find the C++ compiler."""
-  if _is_windows(repository_ctx):
-    return _get_msvc_compiler(repository_ctx)
-
-  if _use_cuda_clang(repository_ctx):
-    target_cc_name = "clang"
-    cc_path_envvar = _CLANG_CUDA_COMPILER_PATH
-    if _flag_enabled(repository_ctx, _TF_DOWNLOAD_CLANG):
-      return "extra_tools/bin/clang"
-  else:
-    target_cc_name = "gcc"
-    cc_path_envvar = _GCC_HOST_COMPILER_PATH
-  cc_name = target_cc_name
-
-  if cc_path_envvar in repository_ctx.os.environ:
-    cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip()
-    if cc_name_from_env:
-      cc_name = cc_name_from_env
-  if cc_name.startswith("/"):
-    # Absolute path, maybe we should make this supported by our which function.
-    return cc_name
-  cc = repository_ctx.which(cc_name)
-  if cc == None:
-    fail(("Cannot find {}, either correct your path or set the {}" +
-          " environment variable").format(target_cc_name, cc_path_envvar))
-  return cc
-
+    """Find the C++ compiler."""
+    if _is_windows(repository_ctx):
+        return _get_msvc_compiler(repository_ctx)
+
+    if _use_cuda_clang(repository_ctx):
+        target_cc_name = "clang"
+        cc_path_envvar = _CLANG_CUDA_COMPILER_PATH
+        if _flag_enabled(repository_ctx, _TF_DOWNLOAD_CLANG):
+            return "extra_tools/bin/clang"
+    else:
+        target_cc_name = "gcc"
+        cc_path_envvar = _GCC_HOST_COMPILER_PATH
+    cc_name = target_cc_name
+
+    if cc_path_envvar in repository_ctx.os.environ:
+        cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip()
+        if cc_name_from_env:
+            cc_name = cc_name_from_env
+    if cc_name.startswith("/"):
+        # Absolute path, maybe we should make this supported by our which function.
+        return cc_name
+    cc = repository_ctx.which(cc_name)
+    if cc == None:
+        fail(("Cannot find {}, either correct your path or set the {}" +
+              " environment variable").format(target_cc_name, cc_path_envvar))
+    return cc
 
 _INC_DIR_MARKER_BEGIN = "#include <...>"
 
@@ -269,480 +270,484 @@ _OSX_FRAMEWORK_SUFFIX = " (framework directory)"
 _OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX)
 
 def _cxx_inc_convert(path):
-  """Convert path returned by cc -E xc++ in a complete path."""
-  path = path.strip()
-  if path.endswith(_OSX_FRAMEWORK_SUFFIX):
-    path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip()
-  return path
-
+    """Convert path returned by cc -E xc++ in a complete path."""
+    path = path.strip()
+    if path.endswith(_OSX_FRAMEWORK_SUFFIX):
+        path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip()
+    return path
 
 def _normalize_include_path(repository_ctx, path):
-  """Normalizes include paths before writing them to the crosstool.
-
-    If path points inside the 'crosstool' folder of the repository, a relative
-    path is returned.
-    If path points outside the 'crosstool' folder, an absolute path is returned.
-    """
-  path = str(repository_ctx.path(path))
-  crosstool_folder = str(repository_ctx.path(".").get_child("crosstool"))
-
-  if path.startswith(crosstool_folder):
-    # We drop the path to "$REPO/crosstool" and a trailing path separator.
-    return path[len(crosstool_folder) + 1:]
-  return path
-
+    """Normalizes include paths before writing them to the crosstool.
+
+      If path points inside the 'crosstool' folder of the repository, a relative
+      path is returned.
+      If path points outside the 'crosstool' folder, an absolute path is returned.
+      """
+    path = str(repository_ctx.path(path))
+    crosstool_folder = str(repository_ctx.path(".").get_child("crosstool"))
+
+    if path.startswith(crosstool_folder):
+        # We drop the path to "$REPO/crosstool" and a trailing path separator.
+        return path[len(crosstool_folder) + 1:]
+    return path
 
 def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp):
-  """Compute the list of default C or C++ include directories."""
-  if lang_is_cpp:
-    lang = "c++"
-  else:
-    lang = "c"
-  result = repository_ctx.execute([cc, "-E", "-x" + lang, "-", "-v"])
-  index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN)
-  if index1 == -1:
-    return []
-  index1 = result.stderr.find("\n", index1)
-  if index1 == -1:
-    return []
-  index2 = result.stderr.rfind("\n ")
-  if index2 == -1 or index2 < index1:
-    return []
-  index2 = result.stderr.find("\n", index2 + 1)
-  if index2 == -1:
-    inc_dirs = result.stderr[index1 + 1:]
-  else:
-    inc_dirs = result.stderr[index1 + 1:index2].strip()
-
-  return [
-      _normalize_include_path(repository_ctx, _cxx_inc_convert(p))
-      for p in inc_dirs.split("\n")
-  ]
-
+    """Compute the list of default C or C++ include directories."""
+    if lang_is_cpp:
+        lang = "c++"
+    else:
+        lang = "c"
+    result = repository_ctx.execute([cc, "-E", "-x" + lang, "-", "-v"])
+    index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN)
+    if index1 == -1:
+        return []
+    index1 = result.stderr.find("\n", index1)
+    if index1 == -1:
+        return []
+    index2 = result.stderr.rfind("\n ")
+    if index2 == -1 or index2 < index1:
+        return []
+    index2 = result.stderr.find("\n", index2 + 1)
+    if index2 == -1:
+        inc_dirs = result.stderr[index1 + 1:]
+    else:
+        inc_dirs = result.stderr[index1 + 1:index2].strip()
+
+    return [
+        _normalize_include_path(repository_ctx, _cxx_inc_convert(p))
+        for p in inc_dirs.split("\n")
+    ]
 
 def get_cxx_inc_directories(repository_ctx, cc):
-  """Compute the list of default C and C++ include directories."""
-
-  # For some reason `clang -xc` sometimes returns include paths that are
-  # different from the ones from `clang -xc++`. (Symlink and a dir)
-  # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists
-  includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True)
-  includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False)
-
-  includes_cpp_set = depset(includes_cpp)
-  return includes_cpp + [
-      inc for inc in includes_c if inc not in includes_cpp_set
-  ]
-
+    """Compute the list of default C and C++ include directories."""
+
+    # For some reason `clang -xc` sometimes returns include paths that are
+    # different from the ones from `clang -xc++`. (Symlink and a dir)
+    # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists
+    includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True)
+    includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False)
+
+    includes_cpp_set = depset(includes_cpp)
+    return includes_cpp + [
+        inc
+        for inc in includes_c
+        if inc not in includes_cpp_set
+    ]
 
 def auto_configure_fail(msg):
-  """Output failure message when cuda configuration fails."""
-  red = "\033[0;31m"
-  no_color = "\033[0m"
-  fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg))
+    """Output failure message when cuda configuration fails."""
+    red = "\033[0;31m"
+    no_color = "\033[0m"
+    fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg))
 
 # END cc_configure common functions (see TODO above).
 
 def _host_compiler_includes(repository_ctx, cc):
-  """Generates the cxx_builtin_include_directory entries for gcc inc dirs.
-
-    Args:
-      repository_ctx: The repository context.
-      cc: The path to the gcc host compiler.
-
-    Returns:
-      A string containing the cxx_builtin_include_directory for each of the gcc
-      host compiler include directories, which can be added to the CROSSTOOL
-      file.
-    """
-  inc_dirs = get_cxx_inc_directories(repository_ctx, cc)
-  inc_entries = []
-  for inc_dir in inc_dirs:
-    inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % inc_dir)
-  return "\n".join(inc_entries)
-
+    """Generates the cxx_builtin_include_directory entries for gcc inc dirs.
+
+      Args:
+        repository_ctx: The repository context.
+        cc: The path to the gcc host compiler.
+
+      Returns:
+        A string containing the cxx_builtin_include_directory for each of the gcc
+        host compiler include directories, which can be added to the CROSSTOOL
+        file.
+      """
+    inc_dirs = get_cxx_inc_directories(repository_ctx, cc)
+    inc_entries = []
+    for inc_dir in inc_dirs:
+        inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % inc_dir)
+    return "\n".join(inc_entries)
 
 def _cuda_include_path(repository_ctx, cuda_config):
-  """Generates the cxx_builtin_include_directory entries for cuda inc dirs.
-
-    Args:
-      repository_ctx: The repository context.
-      cc: The path to the gcc host compiler.
-
-    Returns:
-      A string containing the cxx_builtin_include_directory for each of the gcc
-      host compiler include directories, which can be added to the CROSSTOOL
-      file.
-    """
-  nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % (
-      cuda_config.cuda_toolkit_path,
-      ".exe" if cuda_config.cpu_value == "Windows" else "",
-  ))
-  result = repository_ctx.execute([
-      nvcc_path,
-      "-v",
-      "/dev/null",
-      "-o",
-      "/dev/null",
-  ])
-  target_dir = ""
-  for one_line in result.stderr.splitlines():
-    if one_line.startswith("#$ _TARGET_DIR_="):
-      target_dir = (
-          cuda_config.cuda_toolkit_path + "/" + one_line.replace(
-              "#$ _TARGET_DIR_=", "") + "/include")
-  inc_entries = []
-  if target_dir != "":
-    inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % target_dir)
-  default_include = cuda_config.cuda_toolkit_path + "/include"
-  inc_entries.append(
-      "  cxx_builtin_include_directory: \"%s\"" % default_include)
-  return "\n".join(inc_entries)
-
+    """Generates the cxx_builtin_include_directory entries for cuda inc dirs.
+
+      Args:
+        repository_ctx: The repository context.
+        cc: The path to the gcc host compiler.
+
+      Returns:
+        A string containing the cxx_builtin_include_directory for each of the gcc
+        host compiler include directories, which can be added to the CROSSTOOL
+        file.
+      """
+    nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % (
+        cuda_config.cuda_toolkit_path,
+        ".exe" if cuda_config.cpu_value == "Windows" else "",
+    ))
+    result = repository_ctx.execute([
+        nvcc_path,
+        "-v",
+        "/dev/null",
+        "-o",
+        "/dev/null",
+    ])
+    target_dir = ""
+    for one_line in result.stderr.splitlines():
+        if one_line.startswith("#$ _TARGET_DIR_="):
+            target_dir = (
+                cuda_config.cuda_toolkit_path + "/" + one_line.replace(
+                    "#$ _TARGET_DIR_=",
+                    "",
+                ) + "/include"
+            )
+    inc_entries = []
+    if target_dir != "":
+        inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % target_dir)
+    default_include = cuda_config.cuda_toolkit_path + "/include"
+    inc_entries.append(
+        "  cxx_builtin_include_directory: \"%s\"" % default_include,
+    )
+    return "\n".join(inc_entries)
 
 def enable_cuda(repository_ctx):
-  if "TF_NEED_CUDA" in repository_ctx.os.environ:
-    enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip()
-    return enable_cuda == "1"
-  return False
-
+    if "TF_NEED_CUDA" in repository_ctx.os.environ:
+        enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip()
+        return enable_cuda == "1"
+    return False
 
 def cuda_toolkit_path(repository_ctx):
-  """Finds the cuda toolkit directory.
-
-    Args:
-      repository_ctx: The repository context.
+    """Finds the cuda toolkit directory.
 
-    Returns:
-      A speculative real path of the cuda toolkit install directory.
-    """
-  cuda_toolkit_path = _DEFAULT_CUDA_TOOLKIT_PATH
-  if _CUDA_TOOLKIT_PATH in repository_ctx.os.environ:
-    cuda_toolkit_path = repository_ctx.os.environ[_CUDA_TOOLKIT_PATH].strip()
-  if not repository_ctx.path(cuda_toolkit_path).exists:
-    auto_configure_fail("Cannot find cuda toolkit path.")
-  return str(repository_ctx.path(cuda_toolkit_path).realpath)
+      Args:
+        repository_ctx: The repository context.
 
+      Returns:
+        A speculative real path of the cuda toolkit install directory.
+      """
+    cuda_toolkit_path = _DEFAULT_CUDA_TOOLKIT_PATH
+    if _CUDA_TOOLKIT_PATH in repository_ctx.os.environ:
+        cuda_toolkit_path = repository_ctx.os.environ[_CUDA_TOOLKIT_PATH].strip()
+    if not repository_ctx.path(cuda_toolkit_path).exists:
+        auto_configure_fail("Cannot find cuda toolkit path.")
+    return str(repository_ctx.path(cuda_toolkit_path).realpath)
 
 def _cudnn_install_basedir(repository_ctx):
-  """Finds the cudnn install directory."""
-  cudnn_install_path = _DEFAULT_CUDNN_INSTALL_PATH
-  if _CUDNN_INSTALL_PATH in repository_ctx.os.environ:
-    cudnn_install_path = repository_ctx.os.environ[_CUDNN_INSTALL_PATH].strip()
-  if not repository_ctx.path(cudnn_install_path).exists:
-    auto_configure_fail("Cannot find cudnn install path.")
-  return cudnn_install_path
-
+    """Finds the cudnn install directory."""
+    cudnn_install_path = _DEFAULT_CUDNN_INSTALL_PATH
+    if _CUDNN_INSTALL_PATH in repository_ctx.os.environ:
+        cudnn_install_path = repository_ctx.os.environ[_CUDNN_INSTALL_PATH].strip()
+    if not repository_ctx.path(cudnn_install_path).exists:
+        auto_configure_fail("Cannot find cudnn install path.")
+    return cudnn_install_path
 
 def matches_version(environ_version, detected_version):
-  """Checks whether the user-specified version matches the detected version.
-
-    This function performs a weak matching so that if the user specifies only
-    the
-    major or major and minor versions, the versions are still considered
-    matching
-    if the version parts match. To illustrate:
-
-        environ_version  detected_version  result
-        -----------------------------------------
-        5.1.3            5.1.3             True
-        5.1              5.1.3             True
-        5                5.1               True
-        5.1.3            5.1               False
-        5.2.3            5.1.3             False
-
-    Args:
-      environ_version: The version specified by the user via environment
-        variables.
-      detected_version: The version autodetected from the CUDA installation on
-        the system.
-    Returns: True if user-specified version matches detected version and False
-      otherwise.
-  """
-  environ_version_parts = environ_version.split(".")
-  detected_version_parts = detected_version.split(".")
-  if len(detected_version_parts) < len(environ_version_parts):
-    return False
-  for i, part in enumerate(detected_version_parts):
-    if i >= len(environ_version_parts):
-      break
-    if part != environ_version_parts[i]:
-      return False
-  return True
-
+    """Checks whether the user-specified version matches the detected version.
+
+      This function performs a weak matching so that if the user specifies only
+      the
+      major or major and minor versions, the versions are still considered
+      matching
+      if the version parts match. To illustrate:
+
+          environ_version  detected_version  result
+          -----------------------------------------
+          5.1.3            5.1.3             True
+          5.1              5.1.3             True
+          5                5.1               True
+          5.1.3            5.1               False
+          5.2.3            5.1.3             False
+
+      Args:
+        environ_version: The version specified by the user via environment
+          variables.
+        detected_version: The version autodetected from the CUDA installation on
+          the system.
+      Returns: True if user-specified version matches detected version and False
+        otherwise.
+    """
+    environ_version_parts = environ_version.split(".")
+    detected_version_parts = detected_version.split(".")
+    if len(detected_version_parts) < len(environ_version_parts):
+        return False
+    for i, part in enumerate(detected_version_parts):
+        if i >= len(environ_version_parts):
+            break
+        if part != environ_version_parts[i]:
+            return False
+    return True
 
 _NVCC_VERSION_PREFIX = "Cuda compilation tools, release "
 
 def _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value):
-  """Detects the version of CUDA installed on the system.
-
-    Args:
-      repository_ctx: The repository context.
-      cuda_toolkit_path: The CUDA install directory.
-
-    Returns:
-      String containing the version of CUDA.
-    """
+    """Detects the version of CUDA installed on the system.
 
-  # Run nvcc --version and find the line containing the CUDA version.
-  nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % (
-      cuda_toolkit_path,
-      ".exe" if cpu_value == "Windows" else "",
-  ))
-  if not nvcc_path.exists:
-    auto_configure_fail("Cannot find nvcc at %s" % str(nvcc_path))
-  result = repository_ctx.execute([str(nvcc_path), "--version"])
-  if result.stderr:
-    auto_configure_fail("Error running nvcc --version: %s" % result.stderr)
-  lines = result.stdout.splitlines()
-  version_line = lines[len(lines) - 1]
-  if version_line.find(_NVCC_VERSION_PREFIX) == -1:
-    auto_configure_fail(
-        "Could not parse CUDA version from nvcc --version. Got: %s" %
-        result.stdout,)
-
-  # Parse the CUDA version from the line containing the CUDA version.
-  prefix_removed = version_line.replace(_NVCC_VERSION_PREFIX, "")
-  parts = prefix_removed.split(",")
-  if len(parts) != 2 or len(parts[0]) < 2:
-    auto_configure_fail(
-        "Could not parse CUDA version from nvcc --version. Got: %s" %
-        result.stdout,)
-  full_version = parts[1].strip()
-  if full_version.startswith("V"):
-    full_version = full_version[1:]
-
-  # Check whether TF_CUDA_VERSION was set by the user and fail if it does not
-  # match the detected version.
-  environ_version = ""
-  if _TF_CUDA_VERSION in repository_ctx.os.environ:
-    environ_version = repository_ctx.os.environ[_TF_CUDA_VERSION].strip()
-  if environ_version and not matches_version(environ_version, full_version):
-    auto_configure_fail(
-        ("CUDA version detected from nvcc (%s) does not match " +
-         "TF_CUDA_VERSION (%s)") % (full_version, environ_version),)
+      Args:
+        repository_ctx: The repository context.
+        cuda_toolkit_path: The CUDA install directory.
 
-  # We only use the version consisting of the major and minor version numbers.
-  version_parts = full_version.split(".")
-  if len(version_parts) < 2:
-    auto_configure_fail("CUDA version detected from nvcc (%s) is incomplete.")
-  if cpu_value == "Windows":
-    version = "64_%s%s" % (version_parts[0], version_parts[1])
-  else:
-    version = "%s.%s" % (version_parts[0], version_parts[1])
-  return version
+      Returns:
+        String containing the version of CUDA.
+      """
 
+    # Run nvcc --version and find the line containing the CUDA version.
+    nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % (
+        cuda_toolkit_path,
+        ".exe" if cpu_value == "Windows" else "",
+    ))
+    if not nvcc_path.exists:
+        auto_configure_fail("Cannot find nvcc at %s" % str(nvcc_path))
+    result = repository_ctx.execute([str(nvcc_path), "--version"])
+    if result.stderr:
+        auto_configure_fail("Error running nvcc --version: %s" % result.stderr)
+    lines = result.stdout.splitlines()
+    version_line = lines[len(lines) - 1]
+    if version_line.find(_NVCC_VERSION_PREFIX) == -1:
+        auto_configure_fail(
+            "Could not parse CUDA version from nvcc --version. Got: %s" %
+            result.stdout,
+        )
+
+    # Parse the CUDA version from the line containing the CUDA version.
+    prefix_removed = version_line.replace(_NVCC_VERSION_PREFIX, "")
+    parts = prefix_removed.split(",")
+    if len(parts) != 2 or len(parts[0]) < 2:
+        auto_configure_fail(
+            "Could not parse CUDA version from nvcc --version. Got: %s" %
+            result.stdout,
+        )
+    full_version = parts[1].strip()
+    if full_version.startswith("V"):
+        full_version = full_version[1:]
+
+    # Check whether TF_CUDA_VERSION was set by the user and fail if it does not
+    # match the detected version.
+    environ_version = ""
+    if _TF_CUDA_VERSION in repository_ctx.os.environ:
+        environ_version = repository_ctx.os.environ[_TF_CUDA_VERSION].strip()
+    if environ_version and not matches_version(environ_version, full_version):
+        auto_configure_fail(
+            ("CUDA version detected from nvcc (%s) does not match " +
+             "TF_CUDA_VERSION (%s)") % (full_version, environ_version),
+        )
+
+    # We only use the version consisting of the major and minor version numbers.
+    version_parts = full_version.split(".")
+    if len(version_parts) < 2:
+        auto_configure_fail("CUDA version detected from nvcc (%s) is incomplete.")
+    if cpu_value == "Windows":
+        version = "64_%s%s" % (version_parts[0], version_parts[1])
+    else:
+        version = "%s.%s" % (version_parts[0], version_parts[1])
+    return version
 
 _DEFINE_CUDNN_MAJOR = "#define CUDNN_MAJOR"
 _DEFINE_CUDNN_MINOR = "#define CUDNN_MINOR"
 _DEFINE_CUDNN_PATCHLEVEL = "#define CUDNN_PATCHLEVEL"
 
 def find_cuda_define(repository_ctx, header_dir, header_file, define):
-  """Returns the value of a #define in a header file.
-
-    Greps through a header file and returns the value of the specified #define.
-    If the #define is not found, then raise an error.
-
-    Args:
-      repository_ctx: The repository context.
-      header_dir: The directory containing the header file.
-      header_file: The header file name.
-      define: The #define to search for.
-
-    Returns:
-      The value of the #define found in the header.
-    """
-
-  # Confirm location of the header and grep for the line defining the macro.
-  h_path = repository_ctx.path("%s/%s" % (header_dir, header_file))
-  if not h_path.exists:
-    auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path)))
-  result = repository_ctx.execute(
-      # Grep one more lines as some #defines are splitted into two lines.
-      ["grep", "--color=never", "-A1", "-E", define,
-       str(h_path)],)
-  if result.stderr:
-    auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr))
-
-  # Parse the version from the line defining the macro.
-  if result.stdout.find(define) == -1:
-    auto_configure_fail(
-        "Cannot find line containing '%s' in %s" % (define, h_path))
-
-  # Split results to lines
-  lines = result.stdout.split("\n")
-  num_lines = len(lines)
-  for l in range(num_lines):
-    line = lines[l]
-    if define in line:  # Find the line with define
-      version = line
-      if l != num_lines - 1 and line[-1] == "\\":  # Add next line, if multiline
-        version = version[:-1] + lines[l + 1]
-      break
-
-  # Remove any comments
-  version = version.split("//")[0]
-
-  # Remove define name
-  version = version.replace(define, "").strip()
-
-  # Remove the code after the version number.
-  version_end = version.find(" ")
-  if version_end != -1:
-    if version_end == 0:
-      auto_configure_fail(
-          "Cannot extract the version from line containing '%s' in %s" %
-          (define, str(h_path)),)
-    version = version[:version_end].strip()
-  return version
-
+    """Returns the value of a #define in a header file.
+
+      Greps through a header file and returns the value of the specified #define.
+      If the #define is not found, then raise an error.
+
+      Args:
+        repository_ctx: The repository context.
+        header_dir: The directory containing the header file.
+        header_file: The header file name.
+        define: The #define to search for.
+
+      Returns:
+        The value of the #define found in the header.
+      """
+
+    # Confirm location of the header and grep for the line defining the macro.
+    h_path = repository_ctx.path("%s/%s" % (header_dir, header_file))
+    if not h_path.exists:
+        auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path)))
+    result = repository_ctx.execute(
+        # Grep one more lines as some #defines are splitted into two lines.
+        [
+            "grep",
+            "--color=never",
+            "-A1",
+            "-E",
+            define,
+            str(h_path),
+        ],
+    )
+    if result.stderr:
+        auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr))
+
+    # Parse the version from the line defining the macro.
+    if result.stdout.find(define) == -1:
+        auto_configure_fail(
+            "Cannot find line containing '%s' in %s" % (define, h_path),
+        )
+
+    # Split results to lines
+    lines = result.stdout.split("\n")
+    num_lines = len(lines)
+    for l in range(num_lines):
+        line = lines[l]
+        if define in line:  # Find the line with define
+            version = line
+            if l != num_lines - 1 and line[-1] == "\\":  # Add next line, if multiline
+                version = version[:-1] + lines[l + 1]
+            break
+
+    # Remove any comments
+    version = version.split("//")[0]
+
+    # Remove define name
+    version = version.replace(define, "").strip()
+
+    # Remove the code after the version number.
+    version_end = version.find(" ")
+    if version_end != -1:
+        if version_end == 0:
+            auto_configure_fail(
+                "Cannot extract the version from line containing '%s' in %s" %
+                (define, str(h_path)),
+            )
+        version = version[:version_end].strip()
+    return version
 
 def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
-  """Detects the version of cuDNN installed on the system.
+    """Detects the version of cuDNN installed on the system.
 
-    Args:
-      repository_ctx: The repository context.
-      cpu_value: The name of the host operating system.
-      cudnn_install_basedir: The cuDNN install directory.
-
-    Returns:
-      A string containing the version of cuDNN.
-    """
-  cudnn_header_dir = _find_cudnn_header_dir(
-      repository_ctx,
-      cudnn_install_basedir,
-  )
-  major_version = find_cuda_define(
-      repository_ctx,
-      cudnn_header_dir,
-      "cudnn.h",
-      _DEFINE_CUDNN_MAJOR,
-  )
-  minor_version = find_cuda_define(
-      repository_ctx,
-      cudnn_header_dir,
-      "cudnn.h",
-      _DEFINE_CUDNN_MINOR,
-  )
-  patch_version = find_cuda_define(
-      repository_ctx,
-      cudnn_header_dir,
-      "cudnn.h",
-      _DEFINE_CUDNN_PATCHLEVEL,
-  )
-  full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
-
-  # Check whether TF_CUDNN_VERSION was set by the user and fail if it does not
-  # match the detected version.
-  environ_version = ""
-  if _TF_CUDNN_VERSION in repository_ctx.os.environ:
-    environ_version = repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
-  if environ_version and not matches_version(environ_version, full_version):
-    cudnn_h_path = repository_ctx.path(
-        "%s/include/cudnn.h" % cudnn_install_basedir)
-    auto_configure_fail(("cuDNN version detected from %s (%s) does not match " +
-                         "TF_CUDNN_VERSION (%s)") %
-                        (str(cudnn_h_path), full_version, environ_version),)
-  # Only use the major version to match the SONAME of the library.
-  version = major_version
-  if cpu_value == "Windows":
-    version = "64_" + version
-  return version
+      Args:
+        repository_ctx: The repository context.
+        cpu_value: The name of the host operating system.
+        cudnn_install_basedir: The cuDNN install directory.
 
+      Returns:
+        A string containing the version of cuDNN.
+      """
+    cudnn_header_dir = _find_cudnn_header_dir(
+        repository_ctx,
+        cudnn_install_basedir,
+    )
+    major_version = find_cuda_define(
+        repository_ctx,
+        cudnn_header_dir,
+        "cudnn.h",
+        _DEFINE_CUDNN_MAJOR,
+    )
+    minor_version = find_cuda_define(
+        repository_ctx,
+        cudnn_header_dir,
+        "cudnn.h",
+        _DEFINE_CUDNN_MINOR,
+    )
+    patch_version = find_cuda_define(
+        repository_ctx,
+        cudnn_header_dir,
+        "cudnn.h",
+        _DEFINE_CUDNN_PATCHLEVEL,
+    )
+    full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
+
+    # Check whether TF_CUDNN_VERSION was set by the user and fail if it does not
+    # match the detected version.
+    environ_version = ""
+    if _TF_CUDNN_VERSION in repository_ctx.os.environ:
+        environ_version = repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
+    if environ_version and not matches_version(environ_version, full_version):
+        cudnn_h_path = repository_ctx.path(
+            "%s/include/cudnn.h" % cudnn_install_basedir,
+        )
+        auto_configure_fail(("cuDNN version detected from %s (%s) does not match " +
+                             "TF_CUDNN_VERSION (%s)") %
+                            (str(cudnn_h_path), full_version, environ_version))
+
+    # Only use the major version to match the SONAME of the library.
+    version = major_version
+    if cpu_value == "Windows":
+        version = "64_" + version
+    return version
 
 def compute_capabilities(repository_ctx):
-  """Returns a list of strings representing cuda compute capabilities."""
-  if _TF_CUDA_COMPUTE_CAPABILITIES not in repository_ctx.os.environ:
-    return _DEFAULT_CUDA_COMPUTE_CAPABILITIES
-  capabilities_str = repository_ctx.os.environ[_TF_CUDA_COMPUTE_CAPABILITIES]
-  capabilities = capabilities_str.split(",")
-  for capability in capabilities:
-    # Workaround for Skylark's lack of support for regex. This check should
-    # be equivalent to checking:
-    #     if re.match("[0-9]+.[0-9]+", capability) == None:
-    parts = capability.split(".")
-    if len(parts) != 2 or not parts[0].isdigit() or not parts[1].isdigit():
-      auto_configure_fail("Invalid compute capability: %s" % capability)
-  return capabilities
-
+    """Returns a list of strings representing cuda compute capabilities."""
+    if _TF_CUDA_COMPUTE_CAPABILITIES not in repository_ctx.os.environ:
+        return _DEFAULT_CUDA_COMPUTE_CAPABILITIES
+    capabilities_str = repository_ctx.os.environ[_TF_CUDA_COMPUTE_CAPABILITIES]
+    capabilities = capabilities_str.split(",")
+    for capability in capabilities:
+        # Workaround for Skylark's lack of support for regex. This check should
+        # be equivalent to checking:
+        #     if re.match("[0-9]+.[0-9]+", capability) == None:
+        parts = capability.split(".")
+        if len(parts) != 2 or not parts[0].isdigit() or not parts[1].isdigit():
+            auto_configure_fail("Invalid compute capability: %s" % capability)
+    return capabilities
 
 def get_cpu_value(repository_ctx):
-  """Returns the name of the host operating system.
-
-    Args:
-      repository_ctx: The repository context.
-
-    Returns:
-      A string containing the name of the host operating system.
-    """
-  os_name = repository_ctx.os.name.lower()
-  if os_name.startswith("mac os"):
-    return "Darwin"
-  if os_name.find("windows") != -1:
-    return "Windows"
-  result = repository_ctx.execute(["uname", "-s"])
-  return result.stdout.strip()
-
+    """Returns the name of the host operating system.
+
+      Args:
+        repository_ctx: The repository context.
+
+      Returns:
+        A string containing the name of the host operating system.
+      """
+    os_name = repository_ctx.os.name.lower()
+    if os_name.startswith("mac os"):
+        return "Darwin"
+    if os_name.find("windows") != -1:
+        return "Windows"
+    result = repository_ctx.execute(["uname", "-s"])
+    return result.stdout.strip()
 
 def _is_windows(repository_ctx):
-  """Returns true if the host operating system is windows."""
-  return get_cpu_value(repository_ctx) == "Windows"
-
+    """Returns true if the host operating system is windows."""
+    return get_cpu_value(repository_ctx) == "Windows"
 
 def lib_name(base_name, cpu_value, version = None, static = False):
-  """Constructs the platform-specific name of a library.
+    """Constructs the platform-specific name of a library.
 
-    Args:
-      base_name: The name of the library, such as "cudart"
-      cpu_value: The name of the host operating system.
-      version: The version of the library.
-      static: True the library is static or False if it is a shared object.
+      Args:
+        base_name: The name of the library, such as "cudart"
+        cpu_value: The name of the host operating system.
+        version: The version of the library.
+        static: True the library is static or False if it is a shared object.
+
+      Returns:
+        The platform-specific name of the library.
+      """
+    version = "" if not version else "." + version
+    if cpu_value in ("Linux", "FreeBSD"):
+        if static:
+            return "lib%s.a" % base_name
+        return "lib%s.so%s" % (base_name, version)
+    elif cpu_value == "Windows":
+        return "%s.lib" % base_name
+    elif cpu_value == "Darwin":
+        if static:
+            return "lib%s.a" % base_name
+        return "lib%s%s.dylib" % (base_name, version)
+    else:
+        auto_configure_fail("Invalid cpu_value: %s" % cpu_value)
 
-    Returns:
-      The platform-specific name of the library.
+def find_lib(repository_ctx, paths, check_soname = True):
     """
-  version = "" if not version else "." + version
-  if cpu_value in ("Linux", "FreeBSD"):
-    if static:
-      return "lib%s.a" % base_name
-    return "lib%s.so%s" % (base_name, version)
-  elif cpu_value == "Windows":
-    return "%s.lib" % base_name
-  elif cpu_value == "Darwin":
-    if static:
-      return "lib%s.a" % base_name
-    return "lib%s%s.dylib" % (base_name, version)
-  else:
-    auto_configure_fail("Invalid cpu_value: %s" % cpu_value)
+      Finds a library among a list of potential paths.
 
-def find_lib(repository_ctx, paths, check_soname = True):
-  """
-    Finds a library among a list of potential paths.
-
-    Args:
-      paths: List of paths to inspect.
-
-    Returns:
-      Returns the first path in paths that exist.
-  """
-  objdump = repository_ctx.which("objdump")
-  mismatches = []
-  for path in [repository_ctx.path(path) for path in paths]:
-    if not path.exists:
-      continue
-    if check_soname and objdump != None:
-      output = repository_ctx.execute([objdump, "-p", str(path)]).stdout
-      output = [line for line in output.splitlines() if "SONAME" in line]
-      sonames = [line.strip().split(" ")[-1] for line in output]
-      if not any([soname == path.basename for soname in sonames]):
-        mismatches.append(str(path))
-        continue
-    return path
-  if mismatches:
-    auto_configure_fail(
-        "None of the libraries match their SONAME: " + ", ".join(mismatches))
-  auto_configure_fail("No library found under: " + ", ".join(paths))
+      Args:
+        paths: List of paths to inspect.
 
+      Returns:
+        Returns the first path in paths that exist.
+    """
+    objdump = repository_ctx.which("objdump")
+    mismatches = []
+    for path in [repository_ctx.path(path) for path in paths]:
+        if not path.exists:
+            continue
+        if check_soname and objdump != None:
+            output = repository_ctx.execute([objdump, "-p", str(path)]).stdout
+            output = [line for line in output.splitlines() if "SONAME" in line]
+            sonames = [line.strip().split(" ")[-1] for line in output]
+            if not any([soname == path.basename for soname in sonames]):
+                mismatches.append(str(path))
+                continue
+        return path
+    if mismatches:
+        auto_configure_fail(
+            "None of the libraries match their SONAME: " + ", ".join(mismatches),
+        )
+    auto_configure_fail("No library found under: " + ", ".join(paths))
 
 def _find_cuda_lib(
         lib,
@@ -751,263 +756,257 @@ def _find_cuda_lib(
         basedir,
         version,
         static = False):
-  """Finds the given CUDA or cuDNN library on the system.
-
-    Args:
-      lib: The name of the library, such as "cudart"
-      repository_ctx: The repository context.
-      cpu_value: The name of the host operating system.
-      basedir: The install directory of CUDA or cuDNN.
-      version: The version of the library.
-      static: True if static library, False if shared object.
-
-    Returns:
-      Returns the path to the library.
-    """
-  file_name = lib_name(lib, cpu_value, version, static)
-  return find_lib(repository_ctx, [
-      "%s/%s%s" % (basedir, path, file_name) for path in CUDA_LIB_PATHS
-  ], check_soname = version and not static)
+    """Finds the given CUDA or cuDNN library on the system.
 
+      Args:
+        lib: The name of the library, such as "cudart"
+        repository_ctx: The repository context.
+        cpu_value: The name of the host operating system.
+        basedir: The install directory of CUDA or cuDNN.
+        version: The version of the library.
+        static: True if static library, False if shared object.
+
+      Returns:
+        Returns the path to the library.
+      """
+    file_name = lib_name(lib, cpu_value, version, static)
+    return find_lib(repository_ctx, [
+        "%s/%s%s" % (basedir, path, file_name)
+        for path in CUDA_LIB_PATHS
+    ], check_soname = version and not static)
 
 def _find_cupti_header_dir(repository_ctx, cuda_config):
-  """Returns the path to the directory containing cupti.h
-
-    On most systems, the cupti library is not installed in the same directory as
-    the other CUDA libraries but rather in a special extras/CUPTI directory.
-
-    Args:
-      repository_ctx: The repository context.
-      cuda_config: The CUDA config as returned by _get_cuda_config
-
-    Returns:
-      The path of the directory containing the cupti header.
-    """
-  cuda_toolkit_path = cuda_config.cuda_toolkit_path
-  for relative_path in CUPTI_HEADER_PATHS:
-    if repository_ctx.path(
-        "%s/%scupti.h" % (cuda_toolkit_path, relative_path)).exists:
-      return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
-  auto_configure_fail("Cannot find cupti.h under %s" % ", ".join(
-      [cuda_toolkit_path + "/" + s for s in CUPTI_HEADER_PATHS]))
-
+    """Returns the path to the directory containing cupti.h
+
+      On most systems, the cupti library is not installed in the same directory as
+      the other CUDA libraries but rather in a special extras/CUPTI directory.
+
+      Args:
+        repository_ctx: The repository context.
+        cuda_config: The CUDA config as returned by _get_cuda_config
+
+      Returns:
+        The path of the directory containing the cupti header.
+      """
+    cuda_toolkit_path = cuda_config.cuda_toolkit_path
+    for relative_path in CUPTI_HEADER_PATHS:
+        if repository_ctx.path(
+            "%s/%scupti.h" % (cuda_toolkit_path, relative_path),
+        ).exists:
+            return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
+    auto_configure_fail("Cannot find cupti.h under %s" % ", ".join(
+        [cuda_toolkit_path + "/" + s for s in CUPTI_HEADER_PATHS],
+    ))
 
 def _find_cupti_lib(repository_ctx, cuda_config):
-  """Finds the cupti library on the system.
-
-    On most systems, the cupti library is not installed in the same directory as
-    the other CUDA libraries but rather in a special extras/CUPTI directory.
-
-    Args:
-      repository_ctx: The repository context.
-      cuda_config: The cuda configuration as returned by _get_cuda_config.
-
-    Returns:
-      Returns the path to the library.
-    """
-  file_name = lib_name(
-      "cupti",
-      cuda_config.cpu_value,
-      cuda_config.cuda_version,
-  )
-  basedir = cuda_config.cuda_toolkit_path
-  return find_lib(repository_ctx, [
-      "%s/%s%s" % (basedir, path, file_name) for path in CUPTI_LIB_PATHS
-  ])
-
+    """Finds the cupti library on the system.
+
+      On most systems, the cupti library is not installed in the same directory as
+      the other CUDA libraries but rather in a special extras/CUPTI directory.
+
+      Args:
+        repository_ctx: The repository context.
+        cuda_config: The cuda configuration as returned by _get_cuda_config.
+
+      Returns:
+        Returns the path to the library.
+      """
+    file_name = lib_name(
+        "cupti",
+        cuda_config.cpu_value,
+        cuda_config.cuda_version,
+    )
+    basedir = cuda_config.cuda_toolkit_path
+    return find_lib(repository_ctx, [
+        "%s/%s%s" % (basedir, path, file_name)
+        for path in CUPTI_LIB_PATHS
+    ])
 
 def _find_libs(repository_ctx, cuda_config):
-  """Returns the CUDA and cuDNN libraries on the system.
-
-    Args:
-      repository_ctx: The repository context.
-      cuda_config: The CUDA config as returned by _get_cuda_config
+    """Returns the CUDA and cuDNN libraries on the system.
 
-    Returns:
-      Map of library names to structs of filename and path.
-    """
-  cpu_value = cuda_config.cpu_value
-  return {
-      "cuda":
-          _find_cuda_lib(
-              "cuda",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cuda_toolkit_path,
-              None),
-      "cudart":
-          _find_cuda_lib(
-              "cudart",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cuda_toolkit_path,
-              cuda_config.cuda_version,
-          ),
-      "cudart_static":
-          _find_cuda_lib(
-              "cudart_static",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cuda_toolkit_path,
-              cuda_config.cuda_version,
-              static=True,
-          ),
-      "cublas":
-          _find_cuda_lib(
-              "cublas",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cuda_toolkit_path,
-              cuda_config.cuda_version,
-          ),
-      "cusolver":
-          _find_cuda_lib(
-              "cusolver",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cuda_toolkit_path,
-              cuda_config.cuda_version,
-          ),
-      "curand":
-          _find_cuda_lib(
-              "curand",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cuda_toolkit_path,
-              cuda_config.cuda_version,
-          ),
-      "cufft":
-          _find_cuda_lib(
-              "cufft",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cuda_toolkit_path,
-              cuda_config.cuda_version,
-          ),
-      "cudnn":
-          _find_cuda_lib(
-              "cudnn",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cudnn_install_basedir,
-              cuda_config.cudnn_version,
-          ),
-      "cupti":
-          _find_cupti_lib(repository_ctx, cuda_config),
-  }
+      Args:
+        repository_ctx: The repository context.
+        cuda_config: The CUDA config as returned by _get_cuda_config
 
+      Returns:
+        Map of library names to structs of filename and path.
+      """
+    cpu_value = cuda_config.cpu_value
+    return {
+        "cuda": _find_cuda_lib(
+            "cuda",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cuda_toolkit_path,
+            None,
+        ),
+        "cudart": _find_cuda_lib(
+            "cudart",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cuda_toolkit_path,
+            cuda_config.cuda_version,
+        ),
+        "cudart_static": _find_cuda_lib(
+            "cudart_static",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cuda_toolkit_path,
+            cuda_config.cuda_version,
+            static = True,
+        ),
+        "cublas": _find_cuda_lib(
+            "cublas",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cuda_toolkit_path,
+            cuda_config.cuda_version,
+        ),
+        "cusolver": _find_cuda_lib(
+            "cusolver",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cuda_toolkit_path,
+            cuda_config.cuda_version,
+        ),
+        "curand": _find_cuda_lib(
+            "curand",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cuda_toolkit_path,
+            cuda_config.cuda_version,
+        ),
+        "cufft": _find_cuda_lib(
+            "cufft",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cuda_toolkit_path,
+            cuda_config.cuda_version,
+        ),
+        "cudnn": _find_cuda_lib(
+            "cudnn",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cudnn_install_basedir,
+            cuda_config.cudnn_version,
+        ),
+        "cupti": _find_cupti_lib(repository_ctx, cuda_config),
+    }
 
 def _find_cuda_include_path(repository_ctx, cuda_config):
-  """Returns the path to the directory containing cuda.h
-
-    Args:
-      repository_ctx: The repository context.
-      cuda_config: The CUDA config as returned by _get_cuda_config
-
-    Returns:
-      The path of the directory containing the CUDA headers.
-    """
-  cuda_toolkit_path = cuda_config.cuda_toolkit_path
-  for relative_path in CUDA_INCLUDE_PATHS:
-    if repository_ctx.path(
-        "%s/%scuda.h" % (cuda_toolkit_path, relative_path)).exists:
-      return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
-  auto_configure_fail("Cannot find cuda.h under %s" % cuda_toolkit_path)
-
+    """Returns the path to the directory containing cuda.h
+
+      Args:
+        repository_ctx: The repository context.
+        cuda_config: The CUDA config as returned by _get_cuda_config
+
+      Returns:
+        The path of the directory containing the CUDA headers.
+      """
+    cuda_toolkit_path = cuda_config.cuda_toolkit_path
+    for relative_path in CUDA_INCLUDE_PATHS:
+        if repository_ctx.path(
+            "%s/%scuda.h" % (cuda_toolkit_path, relative_path),
+        ).exists:
+            return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
+    auto_configure_fail("Cannot find cuda.h under %s" % cuda_toolkit_path)
 
 def _find_cudnn_header_dir(repository_ctx, cudnn_install_basedir):
-  """Returns the path to the directory containing cudnn.h
-
-    Args:
-      repository_ctx: The repository context.
-      cudnn_install_basedir: The cudnn install directory as returned by
-        _cudnn_install_basedir.
-
-    Returns:
-      The path of the directory containing the cudnn header.
-    """
-  for relative_path in CUDA_INCLUDE_PATHS:
-    if repository_ctx.path(
-        "%s/%scudnn.h" % (cudnn_install_basedir, relative_path)).exists:
-      return ("%s/%s" % (cudnn_install_basedir, relative_path))[:-1]
-  if repository_ctx.path("/usr/include/cudnn.h").exists:
-    return "/usr/include"
-  auto_configure_fail("Cannot find cudnn.h under %s" % cudnn_install_basedir)
-
+    """Returns the path to the directory containing cudnn.h
+
+      Args:
+        repository_ctx: The repository context.
+        cudnn_install_basedir: The cudnn install directory as returned by
+          _cudnn_install_basedir.
+
+      Returns:
+        The path of the directory containing the cudnn header.
+      """
+    for relative_path in CUDA_INCLUDE_PATHS:
+        if repository_ctx.path(
+            "%s/%scudnn.h" % (cudnn_install_basedir, relative_path),
+        ).exists:
+            return ("%s/%s" % (cudnn_install_basedir, relative_path))[:-1]
+    if repository_ctx.path("/usr/include/cudnn.h").exists:
+        return "/usr/include"
+    auto_configure_fail("Cannot find cudnn.h under %s" % cudnn_install_basedir)
 
 def _find_nvvm_libdevice_dir(repository_ctx, cuda_config):
-  """Returns the path to the directory containing libdevice in bitcode format.
-
-    Args:
-      repository_ctx: The repository context.
-      cuda_config: The CUDA config as returned by _get_cuda_config
-
-    Returns:
-      The path of the directory containing the CUDA headers.
-    """
-  cuda_toolkit_path = cuda_config.cuda_toolkit_path
-  for libdevice_file in NVVM_LIBDEVICE_FILES:
-    for relative_path in NVVM_LIBDEVICE_PATHS:
-      if repository_ctx.path("%s/%s%s" % (cuda_toolkit_path, relative_path,
-                                          libdevice_file)).exists:
-        return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
-  auto_configure_fail(
-      "Cannot find libdevice*.bc files under %s" % cuda_toolkit_path)
-
+    """Returns the path to the directory containing libdevice in bitcode format.
+
+      Args:
+        repository_ctx: The repository context.
+        cuda_config: The CUDA config as returned by _get_cuda_config
+
+      Returns:
+        The path of the directory containing the CUDA headers.
+      """
+    cuda_toolkit_path = cuda_config.cuda_toolkit_path
+    for libdevice_file in NVVM_LIBDEVICE_FILES:
+        for relative_path in NVVM_LIBDEVICE_PATHS:
+            if repository_ctx.path("%s/%s%s" % (
+                cuda_toolkit_path,
+                relative_path,
+                libdevice_file,
+            )).exists:
+                return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
+    auto_configure_fail(
+        "Cannot find libdevice*.bc files under %s" % cuda_toolkit_path,
+    )
 
 def _cudart_static_linkopt(cpu_value):
-  """Returns additional platform-specific linkopts for cudart."""
-  return "" if cpu_value == "Darwin" else "\"-lrt\","
-
+    """Returns additional platform-specific linkopts for cudart."""
+    return "" if cpu_value == "Darwin" else "\"-lrt\","
 
 def _get_cuda_config(repository_ctx):
-  """Detects and returns information about the CUDA installation on the system.
-
-    Args:
-      repository_ctx: The repository context.
-
-    Returns:
-      A struct containing the following fields:
-        cuda_toolkit_path: The CUDA toolkit installation directory.
-        cudnn_install_basedir: The cuDNN installation directory.
-        cuda_version: The version of CUDA on the system.
-        cudnn_version: The version of cuDNN on the system.
-        compute_capabilities: A list of the system's CUDA compute capabilities.
-        cpu_value: The name of the host operating system.
-    """
-  cpu_value = get_cpu_value(repository_ctx)
-  toolkit_path = cuda_toolkit_path(repository_ctx)
-  cuda_version = _cuda_version(repository_ctx, toolkit_path, cpu_value)
-  cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
-  cudnn_version = _cudnn_version(repository_ctx, cudnn_install_basedir,
-                                 cpu_value)
-  return struct(
-      cuda_toolkit_path=toolkit_path,
-      cudnn_install_basedir=cudnn_install_basedir,
-      cuda_version=cuda_version,
-      cudnn_version=cudnn_version,
-      compute_capabilities=compute_capabilities(repository_ctx),
-      cpu_value=cpu_value,
-  )
-
+    """Detects and returns information about the CUDA installation on the system.
+
+      Args:
+        repository_ctx: The repository context.
+
+      Returns:
+        A struct containing the following fields:
+          cuda_toolkit_path: The CUDA toolkit installation directory.
+          cudnn_install_basedir: The cuDNN installation directory.
+          cuda_version: The version of CUDA on the system.
+          cudnn_version: The version of cuDNN on the system.
+          compute_capabilities: A list of the system's CUDA compute capabilities.
+          cpu_value: The name of the host operating system.
+      """
+    cpu_value = get_cpu_value(repository_ctx)
+    toolkit_path = cuda_toolkit_path(repository_ctx)
+    cuda_version = _cuda_version(repository_ctx, toolkit_path, cpu_value)
+    cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
+    cudnn_version = _cudnn_version(
+        repository_ctx,
+        cudnn_install_basedir,
+        cpu_value,
+    )
+    return struct(
+        cuda_toolkit_path = toolkit_path,
+        cudnn_install_basedir = cudnn_install_basedir,
+        cuda_version = cuda_version,
+        cudnn_version = cudnn_version,
+        compute_capabilities = compute_capabilities(repository_ctx),
+        cpu_value = cpu_value,
+    )
 
 def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
-  if not out:
-    out = tpl.replace(":", "/")
-  repository_ctx.template(
-      out,
-      Label("//third_party/gpus/%s.tpl" % tpl),
-      substitutions,
-  )
-
+    if not out:
+        out = tpl.replace(":", "/")
+    repository_ctx.template(
+        out,
+        Label("//third_party/gpus/%s.tpl" % tpl),
+        substitutions,
+    )
 
 def _file(repository_ctx, label):
-  repository_ctx.template(
-      label.replace(":", "/"),
-      Label("//third_party/gpus/%s.tpl" % label),
-      {},
-  )
-
+    repository_ctx.template(
+        label.replace(":", "/"),
+        Label("//third_party/gpus/%s.tpl" % label),
+        {},
+    )
 
 _DUMMY_CROSSTOOL_BZL_FILE = """
 def error_gpu_disabled():
@@ -1035,99 +1034,83 @@ error_gpu_disabled()
 """
 
 def _create_dummy_repository(repository_ctx):
-  cpu_value = get_cpu_value(repository_ctx)
-
-  # Set up BUILD file for cuda/.
-  _tpl(
-      repository_ctx,
-      "cuda:build_defs.bzl",
-      {
-          "%{cuda_is_configured}": "False",
-          "%{cuda_extra_copts}": "[]",
-      },
-  )
-  _tpl(
-      repository_ctx,
-      "cuda:BUILD",
-      {
-          "%{cuda_driver_lib}":
-              lib_name("cuda", cpu_value),
-          "%{cudart_static_lib}":
-              lib_name(
-                  "cudart_static",
-                  cpu_value,
-                  static=True,
-              ),
-          "%{cudart_static_linkopt}":
-              _cudart_static_linkopt(cpu_value),
-          "%{cudart_lib}":
-              lib_name("cudart", cpu_value),
-          "%{cublas_lib}":
-              lib_name("cublas", cpu_value),
-          "%{cusolver_lib}":
-              lib_name("cusolver", cpu_value),
-          "%{cudnn_lib}":
-              lib_name("cudnn", cpu_value),
-          "%{cufft_lib}":
-              lib_name("cufft", cpu_value),
-          "%{curand_lib}":
-              lib_name("curand", cpu_value),
-          "%{cupti_lib}":
-              lib_name("cupti", cpu_value),
-          "%{copy_rules}":
-              "",
-          "%{cuda_headers}":
-              "",
-      },
-  )
+    cpu_value = get_cpu_value(repository_ctx)
 
-  # Create dummy files for the CUDA toolkit since they are still required by
-  # tensorflow/core/platform/default/build_config:cuda.
-  repository_ctx.file("cuda/cuda/include/cuda.h")
-  repository_ctx.file("cuda/cuda/include/cublas.h")
-  repository_ctx.file("cuda/cuda/include/cudnn.h")
-  repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h")
-  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cuda", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudart", cpu_value))
-  repository_ctx.file(
-      "cuda/cuda/lib/%s" % lib_name("cudart_static", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cublas", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cusolver", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudnn", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("curand", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cufft", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cupti", cpu_value))
-
-  # Set up cuda_config.h, which is used by
-  # tensorflow/stream_executor/dso_loader.cc.
-  _tpl(
-      repository_ctx,
-      "cuda:cuda_config.h",
-      {
-          "%{cuda_version}":
-              _DEFAULT_CUDA_VERSION,
-          "%{cudnn_version}":
-              _DEFAULT_CUDNN_VERSION,
-          "%{cuda_compute_capabilities}":
-              ",".join([
-                  "CudaVersion(\"%s\")" % c
-                  for c in _DEFAULT_CUDA_COMPUTE_CAPABILITIES
-              ]),
-          "%{cuda_toolkit_path}":
-              _DEFAULT_CUDA_TOOLKIT_PATH,
-      },
-      "cuda/cuda/cuda_config.h",
-  )
+    # Set up BUILD file for cuda/.
+    _tpl(
+        repository_ctx,
+        "cuda:build_defs.bzl",
+        {
+            "%{cuda_is_configured}": "False",
+            "%{cuda_extra_copts}": "[]",
+        },
+    )
+    _tpl(
+        repository_ctx,
+        "cuda:BUILD",
+        {
+            "%{cuda_driver_lib}": lib_name("cuda", cpu_value),
+            "%{cudart_static_lib}": lib_name(
+                "cudart_static",
+                cpu_value,
+                static = True,
+            ),
+            "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value),
+            "%{cudart_lib}": lib_name("cudart", cpu_value),
+            "%{cublas_lib}": lib_name("cublas", cpu_value),
+            "%{cusolver_lib}": lib_name("cusolver", cpu_value),
+            "%{cudnn_lib}": lib_name("cudnn", cpu_value),
+            "%{cufft_lib}": lib_name("cufft", cpu_value),
+            "%{curand_lib}": lib_name("curand", cpu_value),
+            "%{cupti_lib}": lib_name("cupti", cpu_value),
+            "%{copy_rules}": "",
+            "%{cuda_headers}": "",
+        },
+    )
 
-  # If cuda_configure is not configured to build with GPU support, and the user
-  # attempts to build with --config=cuda, add a dummy build rule to intercept
-  # this and fail with an actionable error message.
-  repository_ctx.file(
-      "crosstool/error_gpu_disabled.bzl",
-      _DUMMY_CROSSTOOL_BZL_FILE,
-  )
-  repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
+    # Create dummy files for the CUDA toolkit since they are still required by
+    # tensorflow/core/platform/default/build_config:cuda.
+    repository_ctx.file("cuda/cuda/include/cuda.h")
+    repository_ctx.file("cuda/cuda/include/cublas.h")
+    repository_ctx.file("cuda/cuda/include/cudnn.h")
+    repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h")
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cuda", cpu_value))
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudart", cpu_value))
+    repository_ctx.file(
+        "cuda/cuda/lib/%s" % lib_name("cudart_static", cpu_value),
+    )
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cublas", cpu_value))
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cusolver", cpu_value))
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudnn", cpu_value))
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("curand", cpu_value))
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cufft", cpu_value))
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cupti", cpu_value))
+
+    # Set up cuda_config.h, which is used by
+    # tensorflow/stream_executor/dso_loader.cc.
+    _tpl(
+        repository_ctx,
+        "cuda:cuda_config.h",
+        {
+            "%{cuda_version}": _DEFAULT_CUDA_VERSION,
+            "%{cudnn_version}": _DEFAULT_CUDNN_VERSION,
+            "%{cuda_compute_capabilities}": ",".join([
+                "CudaVersion(\"%s\")" % c
+                for c in _DEFAULT_CUDA_COMPUTE_CAPABILITIES
+            ]),
+            "%{cuda_toolkit_path}": _DEFAULT_CUDA_TOOLKIT_PATH,
+        },
+        "cuda/cuda/cuda_config.h",
+    )
 
+    # If cuda_configure is not configured to build with GPU support, and the user
+    # attempts to build with --config=cuda, add a dummy build rule to intercept
+    # this and fail with an actionable error message.
+    repository_ctx.file(
+        "crosstool/error_gpu_disabled.bzl",
+        _DUMMY_CROSSTOOL_BZL_FILE,
+    )
+    repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
 
 def _execute(
         repository_ctx,
@@ -1135,43 +1118,44 @@ def _execute(
         error_msg = None,
         error_details = None,
         empty_stdout_fine = False):
-  """Executes an arbitrary shell command.
-
-    Args:
-      repository_ctx: the repository_ctx object
-      cmdline: list of strings, the command to execute
-      error_msg: string, a summary of the error if the command fails
-      error_details: string, details about the error or steps to fix it
-      empty_stdout_fine: bool, if True, an empty stdout result is fine,
-        otherwise it's an error
-    Return: the result of repository_ctx.execute(cmdline)
-  """
-  result = repository_ctx.execute(cmdline)
-  if result.stderr or not (empty_stdout_fine or result.stdout):
-    auto_configure_fail(
-        "\n".join([
-            error_msg.strip() if error_msg else "Repository command failed",
-            result.stderr.strip(),
-            error_details if error_details else "",
-        ]),)
-  return result
-
+    """Executes an arbitrary shell command.
+
+      Args:
+        repository_ctx: the repository_ctx object
+        cmdline: list of strings, the command to execute
+        error_msg: string, a summary of the error if the command fails
+        error_details: string, details about the error or steps to fix it
+        empty_stdout_fine: bool, if True, an empty stdout result is fine,
+          otherwise it's an error
+      Return: the result of repository_ctx.execute(cmdline)
+    """
+    result = repository_ctx.execute(cmdline)
+    if result.stderr or not (empty_stdout_fine or result.stdout):
+        auto_configure_fail(
+            "\n".join([
+                error_msg.strip() if error_msg else "Repository command failed",
+                result.stderr.strip(),
+                error_details if error_details else "",
+            ]),
+        )
+    return result
 
 def _norm_path(path):
-  """Returns a path with '/' and remove the trailing slash."""
-  path = path.replace("\\", "/")
-  if path[-1] == "/":
-    path = path[:-1]
-  return path
+    """Returns a path with '/' and remove the trailing slash."""
+    path = path.replace("\\", "/")
+    if path[-1] == "/":
+        path = path[:-1]
+    return path
 
 def make_copy_files_rule(repository_ctx, name, srcs, outs):
-  """Returns a rule to copy a set of files."""
-  cmds = []
-  # Copy files.
-  for src, out in zip(srcs, outs):
-    cmds.append('cp -f "%s" $(location %s)' % (src, out))
-  outs = [('        "%s",' % out) for out in outs]
-  return """genrule(
+    """Returns a rule to copy a set of files."""
+    cmds = []
+
+    # Copy files.
+    for src, out in zip(srcs, outs):
+        cmds.append('cp -f "%s" $(location %s)' % (src, out))
+    outs = [('        "%s",' % out) for out in outs]
+    return """genrule(
     name = "%s",
     outs = [
 %s
@@ -1180,15 +1164,16 @@ def make_copy_files_rule(repository_ctx, name, srcs, outs):
 )""" % (name, "\n".join(outs), " && ".join(cmds))
 
 def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir):
-  """Returns a rule to recursively copy a directory."""
-  src_dir = _norm_path(src_dir)
-  out_dir = _norm_path(out_dir)
-  outs = _read_dir(repository_ctx, src_dir)
-  outs = [('        "%s",' % out.replace(src_dir, out_dir)) for out in outs]
-  # '@D' already contains the relative path for a single file, see
-  # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables
-  out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)"
-  return """genrule(
+    """Returns a rule to recursively copy a directory."""
+    src_dir = _norm_path(src_dir)
+    out_dir = _norm_path(out_dir)
+    outs = _read_dir(repository_ctx, src_dir)
+    outs = [('        "%s",' % out.replace(src_dir, out_dir)) for out in outs]
+
+    # '@D' already contains the relative path for a single file, see
+    # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables
+    out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)"
+    return """genrule(
     name = "%s",
     outs = [
 %s
@@ -1197,346 +1182,323 @@ def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir):
 )""" % (name, "\n".join(outs), src_dir, out_dir)
 
 def _read_dir(repository_ctx, src_dir):
-  """Returns a string with all files in a directory.
-
-    Finds all files inside a directory, traversing subfolders and following
-    symlinks. The returned string contains the full path of all files
-    separated by line breaks.
-    """
-  if _is_windows(repository_ctx):
-    src_dir = src_dir.replace("/", "\\")
-    find_result = _execute(
-        repository_ctx,
-        ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"],
-        empty_stdout_fine=True,
-    )
-
-    # src_files will be used in genrule.outs where the paths must
-    # use forward slashes.
-    result = find_result.stdout.replace("\\", "/")
-  else:
-    find_result = _execute(
-        repository_ctx,
-        ["find", src_dir, "-follow", "-type", "f"],
-        empty_stdout_fine=True,
-    )
-    result = find_result.stdout
-  return sorted(result.splitlines())
-
+    """Returns a string with all files in a directory.
+
+      Finds all files inside a directory, traversing subfolders and following
+      symlinks. The returned string contains the full path of all files
+      separated by line breaks.
+      """
+    if _is_windows(repository_ctx):
+        src_dir = src_dir.replace("/", "\\")
+        find_result = _execute(
+            repository_ctx,
+            ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"],
+            empty_stdout_fine = True,
+        )
+
+        # src_files will be used in genrule.outs where the paths must
+        # use forward slashes.
+        result = find_result.stdout.replace("\\", "/")
+    else:
+        find_result = _execute(
+            repository_ctx,
+            ["find", src_dir, "-follow", "-type", "f"],
+            empty_stdout_fine = True,
+        )
+        result = find_result.stdout
+    return sorted(result.splitlines())
 
 def _flag_enabled(repository_ctx, flag_name):
-  if flag_name in repository_ctx.os.environ:
-    value = repository_ctx.os.environ[flag_name].strip()
-    return value == "1"
-  return False
-
+    if flag_name in repository_ctx.os.environ:
+        value = repository_ctx.os.environ[flag_name].strip()
+        return value == "1"
+    return False
 
 def _use_cuda_clang(repository_ctx):
-  return _flag_enabled(repository_ctx, "TF_CUDA_CLANG")
-
+    return _flag_enabled(repository_ctx, "TF_CUDA_CLANG")
 
 def _compute_cuda_extra_copts(repository_ctx, compute_capabilities):
-  if _use_cuda_clang(repository_ctx):
-    capability_flags = [
-        "--cuda-gpu-arch=sm_" + cap.replace(".", "")
-        for cap in compute_capabilities
-    ]
-  else:
-    # Capabilities are handled in the "crosstool_wrapper_driver_is_not_gcc" for nvcc
-    # TODO(csigg): Make this consistent with cuda clang and pass to crosstool.
-    capability_flags = []
-  return str(capability_flags)
-
+    if _use_cuda_clang(repository_ctx):
+        capability_flags = [
+            "--cuda-gpu-arch=sm_" + cap.replace(".", "")
+            for cap in compute_capabilities
+        ]
+    else:
+        # Capabilities are handled in the "crosstool_wrapper_driver_is_not_gcc" for nvcc
+        # TODO(csigg): Make this consistent with cuda clang and pass to crosstool.
+        capability_flags = []
+    return str(capability_flags)
 
 def _create_local_cuda_repository(repository_ctx):
-  """Creates the repository containing files set up to build with CUDA."""
-  cuda_config = _get_cuda_config(repository_ctx)
+    """Creates the repository containing files set up to build with CUDA."""
+    cuda_config = _get_cuda_config(repository_ctx)
 
-  cuda_include_path = _find_cuda_include_path(repository_ctx, cuda_config)
-  cudnn_header_dir = _find_cudnn_header_dir(
-      repository_ctx,
-      cuda_config.cudnn_install_basedir,
-  )
-  cupti_header_dir = _find_cupti_header_dir(repository_ctx, cuda_config)
-  nvvm_libdevice_dir = _find_nvvm_libdevice_dir(repository_ctx, cuda_config)
-
-  # Create genrule to copy files from the installed CUDA toolkit into execroot.
-  copy_rules = [
-      make_copy_dir_rule(
-          repository_ctx,
-          name = "cuda-include",
-          src_dir = cuda_include_path,
-          out_dir = "cuda/include",
-      ),
-      make_copy_dir_rule(
-          repository_ctx,
-          name = "cuda-nvvm",
-          src_dir = nvvm_libdevice_dir,
-          out_dir = "cuda/nvvm/libdevice",
-      ),
-      make_copy_dir_rule(
-          repository_ctx,
-          name = "cuda-extras",
-          src_dir = cupti_header_dir,
-          out_dir = "cuda/extras/CUPTI/include",
-      ),
-  ]
-
-  cuda_libs = _find_libs(repository_ctx, cuda_config)
-  cuda_lib_srcs = []
-  cuda_lib_outs = []
-  for path in cuda_libs.values():
-    cuda_lib_srcs.append(str(path))
-    cuda_lib_outs.append("cuda/lib/" + path.basename)
-  copy_rules.append(make_copy_files_rule(
-      repository_ctx,
-      name = "cuda-lib",
-      srcs = cuda_lib_srcs,
-      outs = cuda_lib_outs,
-  ))
-
-  copy_rules.append(make_copy_dir_rule(
-      repository_ctx,
-      name = "cuda-bin",
-      src_dir = cuda_config.cuda_toolkit_path + "/bin",
-      out_dir = "cuda/bin"
-  ))
-
-  # Copy cudnn.h if cuDNN was not installed to CUDA_TOOLKIT_PATH.
-  included_files = _read_dir(repository_ctx, cuda_include_path)
-  if not any([file.endswith("cudnn.h") for file in included_files]):
+    cuda_include_path = _find_cuda_include_path(repository_ctx, cuda_config)
+    cudnn_header_dir = _find_cudnn_header_dir(
+        repository_ctx,
+        cuda_config.cudnn_install_basedir,
+    )
+    cupti_header_dir = _find_cupti_header_dir(repository_ctx, cuda_config)
+    nvvm_libdevice_dir = _find_nvvm_libdevice_dir(repository_ctx, cuda_config)
+
+    # Create genrule to copy files from the installed CUDA toolkit into execroot.
+    copy_rules = [
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "cuda-include",
+            src_dir = cuda_include_path,
+            out_dir = "cuda/include",
+        ),
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "cuda-nvvm",
+            src_dir = nvvm_libdevice_dir,
+            out_dir = "cuda/nvvm/libdevice",
+        ),
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "cuda-extras",
+            src_dir = cupti_header_dir,
+            out_dir = "cuda/extras/CUPTI/include",
+        ),
+    ]
+
+    cuda_libs = _find_libs(repository_ctx, cuda_config)
+    cuda_lib_srcs = []
+    cuda_lib_outs = []
+    for path in cuda_libs.values():
+        cuda_lib_srcs.append(str(path))
+        cuda_lib_outs.append("cuda/lib/" + path.basename)
     copy_rules.append(make_copy_files_rule(
         repository_ctx,
-        name = "cudnn-include",
-        srcs = [cudnn_header_dir + "/cudnn.h"],
-        outs = ["cuda/include/cudnn.h"],
+        name = "cuda-lib",
+        srcs = cuda_lib_srcs,
+        outs = cuda_lib_outs,
     ))
-  else:
-    copy_rules.append("filegroup(name = 'cudnn-include')\n")
-
-  # Set up BUILD file for cuda/
-  _tpl(
-      repository_ctx,
-      "cuda:build_defs.bzl",
-      {
-          "%{cuda_is_configured}":
-              "True",
-          "%{cuda_extra_copts}":
-              _compute_cuda_extra_copts(
-                  repository_ctx,
-                  cuda_config.compute_capabilities,
-              ),
-      },
-  )
-  _tpl(
-      repository_ctx,
-      "cuda:BUILD.windows" if _is_windows(repository_ctx) else "cuda:BUILD",
-      {
-          "%{cuda_driver_lib}":
-              cuda_libs["cuda"].basename,
-          "%{cudart_static_lib}":
-              cuda_libs["cudart_static"].basename,
-          "%{cudart_static_linkopt}":
-              _cudart_static_linkopt(cuda_config.cpu_value,),
-          "%{cudart_lib}":
-              cuda_libs["cudart"].basename,
-          "%{cublas_lib}":
-              cuda_libs["cublas"].basename,
-          "%{cusolver_lib}":
-              cuda_libs["cusolver"].basename,
-          "%{cudnn_lib}":
-              cuda_libs["cudnn"].basename,
-          "%{cufft_lib}":
-              cuda_libs["cufft"].basename,
-          "%{curand_lib}":
-              cuda_libs["curand"].basename,
-          "%{cupti_lib}":
-              cuda_libs["cupti"].basename,
-          "%{copy_rules}":
-              "\n".join(copy_rules),
-          "%{cuda_headers}": ('":cuda-include",\n' + '        ":cudnn-include",'
-                             ),
-      },
-      "cuda/BUILD",
-  )
 
-  is_cuda_clang = _use_cuda_clang(repository_ctx)
+    copy_rules.append(make_copy_dir_rule(
+        repository_ctx,
+        name = "cuda-bin",
+        src_dir = cuda_config.cuda_toolkit_path + "/bin",
+        out_dir = "cuda/bin",
+    ))
 
-  should_download_clang = is_cuda_clang and _flag_enabled(
-      repository_ctx,
-      _TF_DOWNLOAD_CLANG,
-  )
-  if should_download_clang:
-    download_clang(repository_ctx, "crosstool/extra_tools")
-
-  # Set up crosstool/
-  cc = find_cc(repository_ctx)
-  cc_fullpath = cc if not should_download_clang else "crosstool/" + cc
-
-  host_compiler_includes = _host_compiler_includes(repository_ctx, cc_fullpath)
-  cuda_defines = {}
-  # Bazel sets '-B/usr/bin' flag to workaround build errors on RHEL (see
-  # https://github.com/bazelbuild/bazel/issues/760).
-  # However, this stops our custom clang toolchain from picking the provided
-  # LLD linker, so we're only adding '-B/usr/bin' when using non-downloaded
-  # toolchain.
-  # TODO: when bazel stops adding '-B/usr/bin' by default, remove this
-  #       flag from the CROSSTOOL completely (see
-  #       https://github.com/bazelbuild/bazel/issues/5634)
-  if should_download_clang:
-    cuda_defines["%{linker_bin_path_flag}"] = ""
-  else:
-    cuda_defines["%{linker_bin_path_flag}"] = 'flag: "-B/usr/bin"'
-
-  if is_cuda_clang:
-    cuda_defines["%{host_compiler_path}"] = str(cc)
-    cuda_defines["%{host_compiler_warnings}"] = """
-        # Some parts of the codebase set -Werror and hit this warning, so
-        # switch it off for now.
-        flag: "-Wno-invalid-partial-specialization"
-    """
-    cuda_defines["%{host_compiler_includes}"] = host_compiler_includes
-    cuda_defines["%{extra_no_canonical_prefixes_flags}"] = ""
-    _tpl(repository_ctx, "crosstool:BUILD", {
-        "%{linker_files}": ":empty",
-        "%{win_linker_files}": ":empty"
-    })
-    repository_ctx.file(
-        "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", "")
-    repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.py", "")
-  else:
-    cuda_defines[
-        "%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc"
-    cuda_defines["%{host_compiler_warnings}"] = ""
-
-    # nvcc has the system include paths built in and will automatically
-    # search them; we cannot work around that, so we add the relevant cuda
-    # system paths to the allowed compiler specific include paths.
-    cuda_defines["%{host_compiler_includes}"] = (
-        host_compiler_includes + "\n" + _cuda_include_path(
-            repository_ctx, cuda_config) +
-        "\n  cxx_builtin_include_directory: \"%s\"" % cupti_header_dir +
-        "\n  cxx_builtin_include_directory: \"%s\"" % cudnn_header_dir)
-
-    # For gcc, do not canonicalize system header paths; some versions of gcc
-    # pick the shortest possible path for system includes when creating the
-    # .d file - given that includes that are prefixed with "../" multiple
-    # time quickly grow longer than the root of the tree, this can lead to
-    # bazel's header check failing.
-    cuda_defines["%{extra_no_canonical_prefixes_flags}"] = (
-        "flag: \"-fno-canonical-system-headers\"")
-    nvcc_path = str(
-        repository_ctx.path("%s/bin/nvcc%s" % (
-            cuda_config.cuda_toolkit_path,
-            ".exe" if _is_windows(repository_ctx) else "",
-        )))
+    # Copy cudnn.h if cuDNN was not installed to CUDA_TOOLKIT_PATH.
+    included_files = _read_dir(repository_ctx, cuda_include_path)
+    if not any([file.endswith("cudnn.h") for file in included_files]):
+        copy_rules.append(make_copy_files_rule(
+            repository_ctx,
+            name = "cudnn-include",
+            srcs = [cudnn_header_dir + "/cudnn.h"],
+            outs = ["cuda/include/cudnn.h"],
+        ))
+    else:
+        copy_rules.append("filegroup(name = 'cudnn-include')\n")
+
+    # Set up BUILD file for cuda/
     _tpl(
         repository_ctx,
-        "crosstool:BUILD",
+        "cuda:build_defs.bzl",
         {
-            "%{linker_files}": ":crosstool_wrapper_driver_is_not_gcc",
-            "%{win_linker_files}": ":windows_msvc_wrapper_files",
+            "%{cuda_is_configured}": "True",
+            "%{cuda_extra_copts}": _compute_cuda_extra_copts(
+                repository_ctx,
+                cuda_config.compute_capabilities,
+            ),
         },
     )
-    wrapper_defines = {
-        "%{cpu_compiler}":
-            str(cc),
-        "%{cuda_version}":
-            cuda_config.cuda_version,
-        "%{nvcc_path}":
-            nvcc_path,
-        "%{gcc_host_compiler_path}":
-            str(cc),
-        "%{cuda_compute_capabilities}":
-            ", ".join(
-                ["\"%s\"" % c for c in cuda_config.compute_capabilities],),
-        "%{nvcc_tmp_dir}":
-            _get_nvcc_tmp_dir_for_windows(repository_ctx),
-    }
     _tpl(
         repository_ctx,
-        "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",
-        wrapper_defines,
+        "cuda:BUILD.windows" if _is_windows(repository_ctx) else "cuda:BUILD",
+        {
+            "%{cuda_driver_lib}": cuda_libs["cuda"].basename,
+            "%{cudart_static_lib}": cuda_libs["cudart_static"].basename,
+            "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value),
+            "%{cudart_lib}": cuda_libs["cudart"].basename,
+            "%{cublas_lib}": cuda_libs["cublas"].basename,
+            "%{cusolver_lib}": cuda_libs["cusolver"].basename,
+            "%{cudnn_lib}": cuda_libs["cudnn"].basename,
+            "%{cufft_lib}": cuda_libs["cufft"].basename,
+            "%{curand_lib}": cuda_libs["curand"].basename,
+            "%{cupti_lib}": cuda_libs["cupti"].basename,
+            "%{copy_rules}": "\n".join(copy_rules),
+            "%{cuda_headers}": (
+                '":cuda-include",\n' + '        ":cudnn-include",'
+            ),
+        },
+        "cuda/BUILD",
     )
-    _tpl(
+
+    is_cuda_clang = _use_cuda_clang(repository_ctx)
+
+    should_download_clang = is_cuda_clang and _flag_enabled(
         repository_ctx,
-        "crosstool:windows/msvc_wrapper_for_nvcc.py",
-        wrapper_defines,
+        _TF_DOWNLOAD_CLANG,
     )
+    if should_download_clang:
+        download_clang(repository_ctx, "crosstool/extra_tools")
+
+    # Set up crosstool/
+    cc = find_cc(repository_ctx)
+    cc_fullpath = cc if not should_download_clang else "crosstool/" + cc
+
+    host_compiler_includes = _host_compiler_includes(repository_ctx, cc_fullpath)
+    cuda_defines = {}
+
+    # Bazel sets '-B/usr/bin' flag to workaround build errors on RHEL (see
+    # https://github.com/bazelbuild/bazel/issues/760).
+    # However, this stops our custom clang toolchain from picking the provided
+    # LLD linker, so we're only adding '-B/usr/bin' when using non-downloaded
+    # toolchain.
+    # TODO: when bazel stops adding '-B/usr/bin' by default, remove this
+    #       flag from the CROSSTOOL completely (see
+    #       https://github.com/bazelbuild/bazel/issues/5634)
+    if should_download_clang:
+        cuda_defines["%{linker_bin_path_flag}"] = ""
+    else:
+        cuda_defines["%{linker_bin_path_flag}"] = 'flag: "-B/usr/bin"'
+
+    if is_cuda_clang:
+        cuda_defines["%{host_compiler_path}"] = str(cc)
+        cuda_defines["%{host_compiler_warnings}"] = """
+        # Some parts of the codebase set -Werror and hit this warning, so
+        # switch it off for now.
+        flag: "-Wno-invalid-partial-specialization"
+    """
+        cuda_defines["%{host_compiler_includes}"] = host_compiler_includes
+        cuda_defines["%{extra_no_canonical_prefixes_flags}"] = ""
+        _tpl(repository_ctx, "crosstool:BUILD", {
+            "%{linker_files}": ":empty",
+            "%{win_linker_files}": ":empty",
+        })
+        repository_ctx.file(
+            "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc",
+            "",
+        )
+        repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.py", "")
+    else:
+        cuda_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc"
+        cuda_defines["%{host_compiler_warnings}"] = ""
+
+        # nvcc has the system include paths built in and will automatically
+        # search them; we cannot work around that, so we add the relevant cuda
+        # system paths to the allowed compiler specific include paths.
+        cuda_defines["%{host_compiler_includes}"] = (
+            host_compiler_includes + "\n" + _cuda_include_path(
+                repository_ctx,
+                cuda_config,
+            ) +
+            "\n  cxx_builtin_include_directory: \"%s\"" % cupti_header_dir +
+            "\n  cxx_builtin_include_directory: \"%s\"" % cudnn_header_dir
+        )
+
+        # For gcc, do not canonicalize system header paths; some versions of gcc
+        # pick the shortest possible path for system includes when creating the
+        # .d file - given that includes that are prefixed with "../" multiple
+        # time quickly grow longer than the root of the tree, this can lead to
+        # bazel's header check failing.
+        cuda_defines["%{extra_no_canonical_prefixes_flags}"] = (
+            "flag: \"-fno-canonical-system-headers\""
+        )
+        nvcc_path = str(
+            repository_ctx.path("%s/bin/nvcc%s" % (
+                cuda_config.cuda_toolkit_path,
+                ".exe" if _is_windows(repository_ctx) else "",
+            )),
+        )
+        _tpl(
+            repository_ctx,
+            "crosstool:BUILD",
+            {
+                "%{linker_files}": ":crosstool_wrapper_driver_is_not_gcc",
+                "%{win_linker_files}": ":windows_msvc_wrapper_files",
+            },
+        )
+        wrapper_defines = {
+            "%{cpu_compiler}": str(cc),
+            "%{cuda_version}": cuda_config.cuda_version,
+            "%{nvcc_path}": nvcc_path,
+            "%{gcc_host_compiler_path}": str(cc),
+            "%{cuda_compute_capabilities}": ", ".join(
+                ["\"%s\"" % c for c in cuda_config.compute_capabilities],
+            ),
+            "%{nvcc_tmp_dir}": _get_nvcc_tmp_dir_for_windows(repository_ctx),
+        }
+        _tpl(
+            repository_ctx,
+            "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",
+            wrapper_defines,
+        )
+        _tpl(
+            repository_ctx,
+            "crosstool:windows/msvc_wrapper_for_nvcc.py",
+            wrapper_defines,
+        )
 
-  _tpl(
-      repository_ctx,
-      "crosstool:CROSSTOOL",
-      cuda_defines + _get_win_cuda_defines(repository_ctx),
-      out="crosstool/CROSSTOOL",
-  )
-
-  # Set up cuda_config.h, which is used by
-  # tensorflow/stream_executor/dso_loader.cc.
-  _tpl(
-      repository_ctx,
-      "cuda:cuda_config.h",
-      {
-          "%{cuda_version}":
-              cuda_config.cuda_version,
-          "%{cudnn_version}":
-              cuda_config.cudnn_version,
-          "%{cuda_compute_capabilities}":
-              ",".join([
-                  "CudaVersion(\"%s\")" % c
-                  for c in cuda_config.compute_capabilities
-              ],),
-          "%{cuda_toolkit_path}":
-              cuda_config.cuda_toolkit_path,
-      },
-      "cuda/cuda/cuda_config.h",
-  )
+    _tpl(
+        repository_ctx,
+        "crosstool:CROSSTOOL",
+        cuda_defines + _get_win_cuda_defines(repository_ctx),
+        out = "crosstool/CROSSTOOL",
+    )
 
+    # Set up cuda_config.h, which is used by
+    # tensorflow/stream_executor/dso_loader.cc.
+    _tpl(
+        repository_ctx,
+        "cuda:cuda_config.h",
+        {
+            "%{cuda_version}": cuda_config.cuda_version,
+            "%{cudnn_version}": cuda_config.cudnn_version,
+            "%{cuda_compute_capabilities}": ",".join([
+                "CudaVersion(\"%s\")" % c
+                for c in cuda_config.compute_capabilities
+            ]),
+            "%{cuda_toolkit_path}": cuda_config.cuda_toolkit_path,
+        },
+        "cuda/cuda/cuda_config.h",
+    )
 
 def _create_remote_cuda_repository(repository_ctx, remote_config_repo):
-  """Creates pointers to a remotely configured repo set up to build with CUDA."""
-  _tpl(
-      repository_ctx,
-      "cuda:build_defs.bzl",
-      {
-          "%{cuda_is_configured}":
-              "True",
-          "%{cuda_extra_copts}":
-              _compute_cuda_extra_copts(
-                  repository_ctx,
-                  compute_capabilities(repository_ctx),
-              ),
-      },
-  )
-  repository_ctx.template(
-      "cuda/BUILD",
-      Label(remote_config_repo + "/cuda:BUILD"),
-      {},
-  )
-  repository_ctx.template(
-      "cuda/build_defs.bzl",
-      Label(remote_config_repo + "/cuda:build_defs.bzl"),
-      {},
-  )
-  repository_ctx.template(
-      "cuda/cuda/cuda_config.h",
-      Label(remote_config_repo + "/cuda:cuda/cuda_config.h"),
-      {},
-  )
-
-
-def _cuda_autoconf_impl(repository_ctx):
-  """Implementation of the cuda_autoconf repository rule."""
-  if not enable_cuda(repository_ctx):
-    _create_dummy_repository(repository_ctx)
-  elif _TF_CUDA_CONFIG_REPO in repository_ctx.os.environ:
-    _create_remote_cuda_repository(
+    """Creates pointers to a remotely configured repo set up to build with CUDA."""
+    _tpl(
         repository_ctx,
-        repository_ctx.os.environ[_TF_CUDA_CONFIG_REPO],
+        "cuda:build_defs.bzl",
+        {
+            "%{cuda_is_configured}": "True",
+            "%{cuda_extra_copts}": _compute_cuda_extra_copts(
+                repository_ctx,
+                compute_capabilities(repository_ctx),
+            ),
+        },
+    )
+    repository_ctx.template(
+        "cuda/BUILD",
+        Label(remote_config_repo + "/cuda:BUILD"),
+        {},
+    )
+    repository_ctx.template(
+        "cuda/build_defs.bzl",
+        Label(remote_config_repo + "/cuda:build_defs.bzl"),
+        {},
+    )
+    repository_ctx.template(
+        "cuda/cuda/cuda_config.h",
+        Label(remote_config_repo + "/cuda:cuda/cuda_config.h"),
+        {},
     )
-  else:
-    _create_local_cuda_repository(repository_ctx)
 
+def _cuda_autoconf_impl(repository_ctx):
+    """Implementation of the cuda_autoconf repository rule."""
+    if not enable_cuda(repository_ctx):
+        _create_dummy_repository(repository_ctx)
+    elif _TF_CUDA_CONFIG_REPO in repository_ctx.os.environ:
+        _create_remote_cuda_repository(
+            repository_ctx,
+            repository_ctx.os.environ[_TF_CUDA_CONFIG_REPO],
+        )
+    else:
+        _create_local_cuda_repository(repository_ctx)
 
 cuda_configure = repository_rule(
     implementation = _cuda_autoconf_impl,
diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
index f67fb8ae38bce23b55149a5a134f88ff879946f5..63b8cd47c7122980845d91aec8b3f7b8c831657c 100644
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@@ -232,6 +232,42 @@ def _amdgpu_targets(repository_ctx):
             auto_configure_fail("Invalid AMDGPU target: %s" % amdgpu_target)
     return amdgpu_targets
 
+def _hipcc_env(repository_ctx):
+    """Returns the environment variable string for hipcc.
+
+    Args:
+        repository_ctx: The repository context.
+
+    Returns:
+        A string containing environment variables for hipcc.
+    """
+    hipcc_env = ""
+    for name in [
+        "HIP_CLANG_PATH",
+        "DEVICE_LIB_PATH",
+        "HIP_VDI_HOME",
+        "HIPCC_VERBOSE",
+        "HIPCC_COMPILE_FLAGS_APPEND",
+    ]:
+        if name in repository_ctx.os.environ:
+            hipcc_env = (hipcc_env + " " + name + "=\"" +
+                         repository_ctx.os.environ[name].strip() + "\";")
+    return hipcc_env.strip()
+
+def _crosstool_verbose(repository_ctx):
+    """Returns the environment variable value CROSSTOOL_VERBOSE.
+
+    Args:
+        repository_ctx: The repository context.
+
+    Returns:
+        A string containing value of environment variable CROSSTOOL_VERBOSE.
+    """
+    name = "CROSSTOOL_VERBOSE"
+    if name in repository_ctx.os.environ:
+        return repository_ctx.os.environ[name].strip()
+    return "0"
+
 def _cpu_value(repository_ctx):
     """Returns the name of the host operating system.
 
@@ -605,7 +641,6 @@ def _create_local_rocm_repository(repository_ctx):
         outs = rocm_lib_outs,
     ))
 
-
     # Set up BUILD file for rocm/
     _tpl(
         repository_ctx,
@@ -656,6 +691,12 @@ def _create_local_rocm_repository(repository_ctx):
         {
             "%{cpu_compiler}": str(cc),
             "%{hipcc_path}": "/opt/rocm/bin/hipcc",
+            "%{hipcc_env}": _hipcc_env(repository_ctx),
+            "%{hip_runtime_path}": "/opt/rocm/hip/lib",
+            "%{hip_runtime_library}": "hip_hcc",
+            "%{hcc_runtime_path}": "/opt/rocm/hcc/lib",
+            "%{hcc_runtime_library}": "mcwamp",
+            "%{crosstool_verbose}": _crosstool_verbose(repository_ctx),
             "%{gcc_host_compiler_path}": str(cc),
             "%{rocm_amdgpu_targets}": ",".join(
                 ["\"%s\"" % c for c in rocm_config.amdgpu_targets],
diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
index 3a836fadc336cc100d1e9f62696bd8980efabc71..07e4ad7beac7928e6326b04c9d3e2545a07f6fdd 100644
--- a/third_party/nccl/nccl_configure.bzl
+++ b/third_party/nccl/nccl_configure.bzl
@@ -118,8 +118,8 @@ def _check_nccl_version(repository_ctx, nccl_install_path, nccl_hdr_path, nccl_v
 
 def _nccl_configure_impl(repository_ctx):
     """Implementation of the nccl_configure repository rule."""
-    if not enable_cuda(repository_ctx) or \
-       get_cpu_value(repository_ctx) not in ("Linux", "FreeBSD"):
+    if (not enable_cuda(repository_ctx) or
+        get_cpu_value(repository_ctx) not in ("Linux", "FreeBSD")):
         # Add a dummy build file to make bazel query happy.
         repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT)
         return
diff --git a/third_party/nccl/system.BUILD.tpl b/third_party/nccl/system.BUILD.tpl
index a07f54955fc5aa51aba4b6c079de9489b8e4cba1..970dddb117846004c22b9d0233c344d99c6aa4f0 100644
--- a/third_party/nccl/system.BUILD.tpl
+++ b/third_party/nccl/system.BUILD.tpl
@@ -1,26 +1,25 @@
 filegroup(
-  name = "LICENSE",
-  visibility = ["//visibility:public"],
+    name = "LICENSE",
+    visibility = ["//visibility:public"],
 )
 
 cc_library(
-  name = "nccl",
-  srcs = ["libnccl.so.%{version}"],
-  hdrs = ["nccl.h"],
-  include_prefix = "third_party/nccl",
-  deps = [
-      "@local_config_cuda//cuda:cuda_headers",
-  ],
-  visibility = ["//visibility:public"],
+    name = "nccl",
+    srcs = ["libnccl.so.%{version}"],
+    hdrs = ["nccl.h"],
+    include_prefix = "third_party/nccl",
+    visibility = ["//visibility:public"],
+    deps = [
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
 )
 
 genrule(
-  name = "nccl-files",
-  outs = [
-    "libnccl.so.%{version}",
-    "nccl.h",
-  ],
-  cmd = """cp "%{hdr_path}/nccl.h" "$(@D)/nccl.h" &&
+    name = "nccl-files",
+    outs = [
+        "libnccl.so.%{version}",
+        "nccl.h",
+    ],
+    cmd = """cp "%{hdr_path}/nccl.h" "$(@D)/nccl.h" &&
            cp "%{install_path}/libnccl.so.%{version}" "$(@D)/libnccl.so.%{version}" """,
 )
-
diff --git a/third_party/pasta/workspace.bzl b/third_party/pasta/workspace.bzl
index e46cc4a45e42cc8e9da0e8c8401f05673286686d..9961835328ef4ee51e984976fa6a01cab8e5fe87 100644
--- a/third_party/pasta/workspace.bzl
+++ b/third_party/pasta/workspace.bzl
@@ -6,11 +6,11 @@ def repo():
     third_party_http_archive(
         name = "pasta",
         urls = [
-            "https://mirror.bazel.build/github.com/google/pasta/archive/c3d72cdee6fc806251949e912510444d58d7413c.tar.gz",
-            "https://github.com/google/pasta/archive/c3d72cdee6fc806251949e912510444d58d7413c.tar.gz",
+            "https://mirror.bazel.build/github.com/google/pasta/archive/v0.1.2.tar.gz",
+            "https://github.com/google/pasta/archive/v0.1.2.tar.gz",
         ],
-        strip_prefix = "pasta-c3d72cdee6fc806251949e912510444d58d7413c",
-        sha256 = "b5905f9cecc4b28363c563f3c4cb0545288bd35f7cc72c55066e97e53befc084",
+        strip_prefix = "pasta-0.1.2",
+        sha256 = "53e4c009a5eac38e942deb48bfc2d3cfca62cd457255fa86ffedb7e40f726a0c",
         build_file = "//third_party/pasta:BUILD.bazel",
         system_build_file = "//third_party/pasta:BUILD.system",
     )
diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl
index 9a7581c246d9d4468f5264cb8975c7260012e2a1..d1b1e5949211cc7556c7020ca3b2a114358b276b 100644
--- a/third_party/py/python_configure.bzl
+++ b/third_party/py/python_configure.bzl
@@ -11,300 +11,337 @@ _PYTHON_BIN_PATH = "PYTHON_BIN_PATH"
 _PYTHON_LIB_PATH = "PYTHON_LIB_PATH"
 _TF_PYTHON_CONFIG_REPO = "TF_PYTHON_CONFIG_REPO"
 
-
-def _tpl(repository_ctx, tpl, substitutions={}, out=None):
-  if not out:
-    out = tpl
-  repository_ctx.template(
-      out,
-      Label("//third_party/py:%s.tpl" % tpl),
-      substitutions)
-
+def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
+    if not out:
+        out = tpl
+    repository_ctx.template(
+        out,
+        Label("//third_party/py:%s.tpl" % tpl),
+        substitutions,
+    )
 
 def _fail(msg):
-  """Output failure message when auto configuration fails."""
-  red = "\033[0;31m"
-  no_color = "\033[0m"
-  fail("%sPython Configuration Error:%s %s\n" % (red, no_color, msg))
-
+    """Output failure message when auto configuration fails."""
+    red = "\033[0;31m"
+    no_color = "\033[0m"
+    fail("%sPython Configuration Error:%s %s\n" % (red, no_color, msg))
 
 def _is_windows(repository_ctx):
-  """Returns true if the host operating system is windows."""
-  os_name = repository_ctx.os.name.lower()
-  if os_name.find("windows") != -1:
-    return True
-  return False
-
-
-def _execute(repository_ctx, cmdline, error_msg=None, error_details=None,
-             empty_stdout_fine=False):
-  """Executes an arbitrary shell command.
-
-  Args:
-    repository_ctx: the repository_ctx object
-    cmdline: list of strings, the command to execute
-    error_msg: string, a summary of the error if the command fails
-    error_details: string, details about the error or steps to fix it
-    empty_stdout_fine: bool, if True, an empty stdout result is fine, otherwise
-      it's an error
-  Return:
-    the result of repository_ctx.execute(cmdline)
-  """
-  result = repository_ctx.execute(cmdline)
-  if result.stderr or not (empty_stdout_fine or result.stdout):
-    _fail("\n".join([
-        error_msg.strip() if error_msg else "Repository command failed",
-        result.stderr.strip(),
-        error_details if error_details else ""]))
-  return result
-
+    """Returns true if the host operating system is windows."""
+    os_name = repository_ctx.os.name.lower()
+    if os_name.find("windows") != -1:
+        return True
+    return False
+
+def _execute(
+        repository_ctx,
+        cmdline,
+        error_msg = None,
+        error_details = None,
+        empty_stdout_fine = False):
+    """Executes an arbitrary shell command.
+
+    Args:
+      repository_ctx: the repository_ctx object
+      cmdline: list of strings, the command to execute
+      error_msg: string, a summary of the error if the command fails
+      error_details: string, details about the error or steps to fix it
+      empty_stdout_fine: bool, if True, an empty stdout result is fine, otherwise
+        it's an error
+    Return:
+      the result of repository_ctx.execute(cmdline)
+    """
+    result = repository_ctx.execute(cmdline)
+    if result.stderr or not (empty_stdout_fine or result.stdout):
+        _fail("\n".join([
+            error_msg.strip() if error_msg else "Repository command failed",
+            result.stderr.strip(),
+            error_details if error_details else "",
+        ]))
+    return result
 
 def _read_dir(repository_ctx, src_dir):
-  """Returns a string with all files in a directory.
-
-  Finds all files inside a directory, traversing subfolders and following
-  symlinks. The returned string contains the full path of all files
-  separated by line breaks.
-  """
-  if _is_windows(repository_ctx):
-    src_dir = src_dir.replace("/", "\\")
-    find_result = _execute(
-        repository_ctx, ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"],
-        empty_stdout_fine=True)
-    # src_files will be used in genrule.outs where the paths must
-    # use forward slashes.
-    result = find_result.stdout.replace("\\", "/")
-  else:
-    find_result = _execute(
-        repository_ctx, ["find", src_dir, "-follow", "-type", "f"],
-        empty_stdout_fine=True)
-    result = find_result.stdout
-  return result
-
+    """Returns a string with all files in a directory.
+
+    Finds all files inside a directory, traversing subfolders and following
+    symlinks. The returned string contains the full path of all files
+    separated by line breaks.
+    """
+    if _is_windows(repository_ctx):
+        src_dir = src_dir.replace("/", "\\")
+        find_result = _execute(
+            repository_ctx,
+            ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"],
+            empty_stdout_fine = True,
+        )
+
+        # src_files will be used in genrule.outs where the paths must
+        # use forward slashes.
+        result = find_result.stdout.replace("\\", "/")
+    else:
+        find_result = _execute(
+            repository_ctx,
+            ["find", src_dir, "-follow", "-type", "f"],
+            empty_stdout_fine = True,
+        )
+        result = find_result.stdout
+    return result
 
 def _genrule(src_dir, genrule_name, command, outs):
-  """Returns a string with a genrule.
-
-  Genrule executes the given command and produces the given outputs.
-  """
-  return (
-      'genrule(\n' +
-      '    name = "' +
-      genrule_name + '",\n' +
-      '    outs = [\n' +
-      outs +
-      '\n    ],\n' +
-      '    cmd = """\n' +
-      command +
-      '\n   """,\n' +
-      ')\n'
-  )
-
+    """Returns a string with a genrule.
+
+    Genrule executes the given command and produces the given outputs.
+    """
+    return (
+        "genrule(\n" +
+        '    name = "' +
+        genrule_name + '",\n' +
+        "    outs = [\n" +
+        outs +
+        "\n    ],\n" +
+        '    cmd = """\n' +
+        command +
+        '\n   """,\n' +
+        ")\n"
+    )
 
 def _norm_path(path):
-  """Returns a path with '/' and remove the trailing slash."""
-  path = path.replace("\\", "/")
-  if path[-1] == "/":
-    path = path[:-1]
-  return path
-
-
-def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
-    src_files = [], dest_files = []):
-  """Returns a genrule to symlink(or copy if on Windows) a set of files.
-
-  If src_dir is passed, files will be read from the given directory; otherwise
-  we assume files are in src_files and dest_files
-  """
-  if src_dir != None:
-    src_dir = _norm_path(src_dir)
-    dest_dir = _norm_path(dest_dir)
-    files = '\n'.join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
-    # Create a list with the src_dir stripped to use for outputs.
-    dest_files = files.replace(src_dir, '').splitlines()
-    src_files = files.splitlines()
-  command = []
-  outs = []
-  for i in range(len(dest_files)):
-    if dest_files[i] != "":
-      # If we have only one file to link we do not want to use the dest_dir, as
-      # $(@D) will include the full path to the file.
-      dest = '$(@D)/' + dest_dir + dest_files[i] if len(dest_files) != 1 else '$(@D)/' + dest_files[i]
-      # Copy the headers to create a sandboxable setup.
-      cmd = 'cp -f'
-      command.append(cmd + ' "%s" "%s"' % (src_files[i] , dest))
-      outs.append('        "' + dest_dir + dest_files[i] + '",')
-  genrule = _genrule(src_dir, genrule_name, " && ".join(command),
-                     "\n".join(outs))
-  return genrule
-
+    """Returns a path with '/' and remove the trailing slash."""
+    path = path.replace("\\", "/")
+    if path[-1] == "/":
+        path = path[:-1]
+    return path
+
+def _symlink_genrule_for_dir(
+        repository_ctx,
+        src_dir,
+        dest_dir,
+        genrule_name,
+        src_files = [],
+        dest_files = []):
+    """Returns a genrule to symlink(or copy if on Windows) a set of files.
+
+    If src_dir is passed, files will be read from the given directory; otherwise
+    we assume files are in src_files and dest_files
+    """
+    if src_dir != None:
+        src_dir = _norm_path(src_dir)
+        dest_dir = _norm_path(dest_dir)
+        files = "\n".join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
+
+        # Create a list with the src_dir stripped to use for outputs.
+        dest_files = files.replace(src_dir, "").splitlines()
+        src_files = files.splitlines()
+    command = []
+    outs = []
+    for i in range(len(dest_files)):
+        if dest_files[i] != "":
+            # If we have only one file to link we do not want to use the dest_dir, as
+            # $(@D) will include the full path to the file.
+            dest = "$(@D)/" + dest_dir + dest_files[i] if len(dest_files) != 1 else "$(@D)/" + dest_files[i]
+
+            # Copy the headers to create a sandboxable setup.
+            cmd = "cp -f"
+            command.append(cmd + ' "%s" "%s"' % (src_files[i], dest))
+            outs.append('        "' + dest_dir + dest_files[i] + '",')
+    genrule = _genrule(
+        src_dir,
+        genrule_name,
+        " && ".join(command),
+        "\n".join(outs),
+    )
+    return genrule
 
 def _get_python_bin(repository_ctx):
-  """Gets the python bin path."""
-  python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
-  if python_bin != None:
-    return python_bin
-  python_bin_path = repository_ctx.which("python")
-  if python_bin_path != None:
-    return str(python_bin_path)
-  _fail("Cannot find python in PATH, please make sure " +
-        "python is installed and add its directory in PATH, or --define " +
-        "%s='/something/else'.\nPATH=%s" % (
-            _PYTHON_BIN_PATH, repository_ctx.os.environ.get("PATH", "")))
-
+    """Gets the python bin path."""
+    python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
+    if python_bin != None:
+        return python_bin
+    python_bin_path = repository_ctx.which("python")
+    if python_bin_path != None:
+        return str(python_bin_path)
+    _fail("Cannot find python in PATH, please make sure " +
+          "python is installed and add its directory in PATH, or --define " +
+          "%s='/something/else'.\nPATH=%s" % (
+              _PYTHON_BIN_PATH,
+              repository_ctx.os.environ.get("PATH", ""),
+          ))
 
 def _get_bash_bin(repository_ctx):
-  """Gets the bash bin path."""
-  bash_bin = repository_ctx.os.environ.get(_BAZEL_SH)
-  if bash_bin != None:
-    return bash_bin
-  else:
-    bash_bin_path = repository_ctx.which("bash")
-    if bash_bin_path != None:
-      return str(bash_bin_path)
+    """Gets the bash bin path."""
+    bash_bin = repository_ctx.os.environ.get(_BAZEL_SH)
+    if bash_bin != None:
+        return bash_bin
     else:
-      _fail("Cannot find bash in PATH, please make sure " +
-            "bash is installed and add its directory in PATH, or --define " +
-            "%s='/path/to/bash'.\nPATH=%s" % (
-                _BAZEL_SH, repository_ctx.os.environ.get("PATH", "")))
-
+        bash_bin_path = repository_ctx.which("bash")
+        if bash_bin_path != None:
+            return str(bash_bin_path)
+        else:
+            _fail("Cannot find bash in PATH, please make sure " +
+                  "bash is installed and add its directory in PATH, or --define " +
+                  "%s='/path/to/bash'.\nPATH=%s" % (
+                      _BAZEL_SH,
+                      repository_ctx.os.environ.get("PATH", ""),
+                  ))
 
 def _get_python_lib(repository_ctx, python_bin):
-  """Gets the python lib path."""
-  python_lib = repository_ctx.os.environ.get(_PYTHON_LIB_PATH)
-  if python_lib != None:
-    return python_lib
-  print_lib = ("<<END\n" +
-      "from __future__ import print_function\n" +
-      "import site\n" +
-      "import os\n" +
-      "\n" +
-      "try:\n" +
-      "  input = raw_input\n" +
-      "except NameError:\n" +
-      "  pass\n" +
-      "\n" +
-      "python_paths = []\n" +
-      "if os.getenv('PYTHONPATH') is not None:\n" +
-      "  python_paths = os.getenv('PYTHONPATH').split(':')\n" +
-      "try:\n" +
-      "  library_paths = site.getsitepackages()\n" +
-      "except AttributeError:\n" +
-      " from distutils.sysconfig import get_python_lib\n" +
-      " library_paths = [get_python_lib()]\n" +
-      "all_paths = set(python_paths + library_paths)\n" +
-      "paths = []\n" +
-      "for path in all_paths:\n" +
-      "  if os.path.isdir(path):\n" +
-      "    paths.append(path)\n" +
-      "if len(paths) >=1:\n" +
-      "  print(paths[0])\n" +
-      "END")
-  cmd = '%s - %s' % (python_bin, print_lib)
-  result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd])
-  return result.stdout.strip('\n')
-
+    """Gets the python lib path."""
+    python_lib = repository_ctx.os.environ.get(_PYTHON_LIB_PATH)
+    if python_lib != None:
+        return python_lib
+    print_lib = ("<<END\n" +
+                 "from __future__ import print_function\n" +
+                 "import site\n" +
+                 "import os\n" +
+                 "\n" +
+                 "try:\n" +
+                 "  input = raw_input\n" +
+                 "except NameError:\n" +
+                 "  pass\n" +
+                 "\n" +
+                 "python_paths = []\n" +
+                 "if os.getenv('PYTHONPATH') is not None:\n" +
+                 "  python_paths = os.getenv('PYTHONPATH').split(':')\n" +
+                 "try:\n" +
+                 "  library_paths = site.getsitepackages()\n" +
+                 "except AttributeError:\n" +
+                 " from distutils.sysconfig import get_python_lib\n" +
+                 " library_paths = [get_python_lib()]\n" +
+                 "all_paths = set(python_paths + library_paths)\n" +
+                 "paths = []\n" +
+                 "for path in all_paths:\n" +
+                 "  if os.path.isdir(path):\n" +
+                 "    paths.append(path)\n" +
+                 "if len(paths) >=1:\n" +
+                 "  print(paths[0])\n" +
+                 "END")
+    cmd = "%s - %s" % (python_bin, print_lib)
+    result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd])
+    return result.stdout.strip("\n")
 
 def _check_python_lib(repository_ctx, python_lib):
-  """Checks the python lib path."""
-  cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib)
-  result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd])
-  if result.return_code == 1:
-    _fail("Invalid python library path: %s" % python_lib)
-
+    """Checks the python lib path."""
+    cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib)
+    result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd])
+    if result.return_code == 1:
+        _fail("Invalid python library path: %s" % python_lib)
 
 def _check_python_bin(repository_ctx, python_bin):
-  """Checks the python bin path."""
-  cmd =  '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin)
-  result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd])
-  if result.return_code == 1:
-    _fail("--define %s='%s' is not executable. Is it the python binary?" % (
-        _PYTHON_BIN_PATH, python_bin))
-
+    """Checks the python bin path."""
+    cmd = '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin)
+    result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd])
+    if result.return_code == 1:
+        _fail("--define %s='%s' is not executable. Is it the python binary?" % (
+            _PYTHON_BIN_PATH,
+            python_bin,
+        ))
 
 def _get_python_include(repository_ctx, python_bin):
-  """Gets the python include path."""
-  result = _execute(
-      repository_ctx,
-      [python_bin, "-c",
-       'from __future__ import print_function;' +
-       'from distutils import sysconfig;' +
-       'print(sysconfig.get_python_inc())'],
-      error_msg="Problem getting python include path.",
-      error_details=("Is the Python binary path set up right? " +
-                     "(See ./configure or " + _PYTHON_BIN_PATH + ".) " +
-                     "Is distutils installed?"))
-  return result.stdout.splitlines()[0]
-
+    """Gets the python include path."""
+    result = _execute(
+        repository_ctx,
+        [
+            python_bin,
+            "-c",
+            "from __future__ import print_function;" +
+            "from distutils import sysconfig;" +
+            "print(sysconfig.get_python_inc())",
+        ],
+        error_msg = "Problem getting python include path.",
+        error_details = ("Is the Python binary path set up right? " +
+                         "(See ./configure or " + _PYTHON_BIN_PATH + ".) " +
+                         "Is distutils installed?"),
+    )
+    return result.stdout.splitlines()[0]
 
 def _get_python_import_lib_name(repository_ctx, python_bin):
-  """Get Python import library name (pythonXY.lib) on Windows."""
-  result = _execute(
-      repository_ctx,
-      [python_bin, "-c",
-       'import sys;' +
-       'print("python" + str(sys.version_info[0]) + ' +
-       '      str(sys.version_info[1]) + ".lib")'],
-      error_msg="Problem getting python import library.",
-      error_details=("Is the Python binary path set up right? " +
-                     "(See ./configure or " + _PYTHON_BIN_PATH + ".) "))
-  return result.stdout.splitlines()[0]
-
+    """Get Python import library name (pythonXY.lib) on Windows."""
+    result = _execute(
+        repository_ctx,
+        [
+            python_bin,
+            "-c",
+            "import sys;" +
+            'print("python" + str(sys.version_info[0]) + ' +
+            '      str(sys.version_info[1]) + ".lib")',
+        ],
+        error_msg = "Problem getting python import library.",
+        error_details = ("Is the Python binary path set up right? " +
+                         "(See ./configure or " + _PYTHON_BIN_PATH + ".) "),
+    )
+    return result.stdout.splitlines()[0]
 
 def _get_numpy_include(repository_ctx, python_bin):
-  """Gets the numpy include path."""
-  return _execute(repository_ctx,
-                  [python_bin, "-c",
-                   'from __future__ import print_function;' +
-                   'import numpy;' +
-                   ' print(numpy.get_include());'],
-                  error_msg="Problem getting numpy include path.",
-                  error_details="Is numpy installed?").stdout.splitlines()[0]
-
+    """Gets the numpy include path."""
+    return _execute(
+        repository_ctx,
+        [
+            python_bin,
+            "-c",
+            "from __future__ import print_function;" +
+            "import numpy;" +
+            " print(numpy.get_include());",
+        ],
+        error_msg = "Problem getting numpy include path.",
+        error_details = "Is numpy installed?",
+    ).stdout.splitlines()[0]
 
 def _create_local_python_repository(repository_ctx):
-  """Creates the repository containing files set up to build with Python."""
-  python_bin = _get_python_bin(repository_ctx)
-  _check_python_bin(repository_ctx, python_bin)
-  python_lib = _get_python_lib(repository_ctx, python_bin)
-  _check_python_lib(repository_ctx, python_lib)
-  python_include = _get_python_include(repository_ctx, python_bin)
-  numpy_include = _get_numpy_include(repository_ctx, python_bin) + '/numpy'
-  python_include_rule = _symlink_genrule_for_dir(
-      repository_ctx, python_include, 'python_include', 'python_include')
-  python_import_lib_genrule = ""
-  # To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
-  # See https://docs.python.org/3/extending/windows.html
-  if _is_windows(repository_ctx):
-    python_include = _norm_path(python_include)
-    python_import_lib_name = _get_python_import_lib_name(repository_ctx, python_bin)
-    python_import_lib_src = python_include.rsplit('/', 1)[0] + "/libs/" + python_import_lib_name
-    python_import_lib_genrule = _symlink_genrule_for_dir(
-      repository_ctx, None, '', 'python_import_lib',
-      [python_import_lib_src], [python_import_lib_name])
-  numpy_include_rule = _symlink_genrule_for_dir(
-      repository_ctx, numpy_include, 'numpy_include/numpy', 'numpy_include')
-  _tpl(repository_ctx, "BUILD", {
-      "%{PYTHON_INCLUDE_GENRULE}": python_include_rule,
-      "%{PYTHON_IMPORT_LIB_GENRULE}": python_import_lib_genrule,
-      "%{NUMPY_INCLUDE_GENRULE}": numpy_include_rule,
-  })
-
+    """Creates the repository containing files set up to build with Python."""
+    python_bin = _get_python_bin(repository_ctx)
+    _check_python_bin(repository_ctx, python_bin)
+    python_lib = _get_python_lib(repository_ctx, python_bin)
+    _check_python_lib(repository_ctx, python_lib)
+    python_include = _get_python_include(repository_ctx, python_bin)
+    numpy_include = _get_numpy_include(repository_ctx, python_bin) + "/numpy"
+    python_include_rule = _symlink_genrule_for_dir(
+        repository_ctx,
+        python_include,
+        "python_include",
+        "python_include",
+    )
+    python_import_lib_genrule = ""
+
+    # To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
+    # See https://docs.python.org/3/extending/windows.html
+    if _is_windows(repository_ctx):
+        python_include = _norm_path(python_include)
+        python_import_lib_name = _get_python_import_lib_name(repository_ctx, python_bin)
+        python_import_lib_src = python_include.rsplit("/", 1)[0] + "/libs/" + python_import_lib_name
+        python_import_lib_genrule = _symlink_genrule_for_dir(
+            repository_ctx,
+            None,
+            "",
+            "python_import_lib",
+            [python_import_lib_src],
+            [python_import_lib_name],
+        )
+    numpy_include_rule = _symlink_genrule_for_dir(
+        repository_ctx,
+        numpy_include,
+        "numpy_include/numpy",
+        "numpy_include",
+    )
+    _tpl(repository_ctx, "BUILD", {
+        "%{PYTHON_INCLUDE_GENRULE}": python_include_rule,
+        "%{PYTHON_IMPORT_LIB_GENRULE}": python_import_lib_genrule,
+        "%{NUMPY_INCLUDE_GENRULE}": numpy_include_rule,
+    })
 
 def _create_remote_python_repository(repository_ctx, remote_config_repo):
-  """Creates pointers to a remotely configured repo set up to build with Python.
-  """
-  repository_ctx.template("BUILD", Label(remote_config_repo + ":BUILD"), {})
-
+    """Creates pointers to a remotely configured repo set up to build with Python.
+    """
+    repository_ctx.template("BUILD", Label(remote_config_repo + ":BUILD"), {})
 
 def _python_autoconf_impl(repository_ctx):
-  """Implementation of the python_autoconf repository rule."""
-  if _TF_PYTHON_CONFIG_REPO in repository_ctx.os.environ:
-      _create_remote_python_repository(repository_ctx,
-          repository_ctx.os.environ[_TF_PYTHON_CONFIG_REPO])
-  else:
-    _create_local_python_repository(repository_ctx)
-
+    """Implementation of the python_autoconf repository rule."""
+    if _TF_PYTHON_CONFIG_REPO in repository_ctx.os.environ:
+        _create_remote_python_repository(
+            repository_ctx,
+            repository_ctx.os.environ[_TF_PYTHON_CONFIG_REPO],
+        )
+    else:
+        _create_local_python_repository(repository_ctx)
 
 python_configure = repository_rule(
     implementation = _python_autoconf_impl,
diff --git a/third_party/sycl/sycl/build_defs.bzl.tpl b/third_party/sycl/sycl/build_defs.bzl.tpl
index 33386f8957c821ef579a2bc1dcfb71b94ceb0aa1..a726c8d953cb56c67281e2572edb4e64df05d064 100755
--- a/third_party/sycl/sycl/build_defs.bzl.tpl
+++ b/third_party/sycl/sycl/build_defs.bzl.tpl
@@ -11,7 +11,7 @@ def if_sycl(if_true, if_false = []):
     return select({
         "@local_config_sycl//sycl:using_sycl_ccpp": if_true,
         "@local_config_sycl//sycl:using_sycl_trisycl": if_true[0:1],
-        "//conditions:default": if_false
+        "//conditions:default": if_false,
     })
 
 def if_ccpp(if_true, if_false = []):
@@ -24,5 +24,5 @@ def if_ccpp(if_true, if_false = []):
     return select({
         "@local_config_sycl//sycl:using_sycl_ccpp": if_true,
         "@local_config_sycl//sycl:using_sycl_trisycl": if_false,
-        "//conditions:default": if_false
+        "//conditions:default": if_false,
     })
diff --git a/third_party/sycl/sycl_configure.bzl b/third_party/sycl/sycl_configure.bzl
index 5b9d0eb383d1b069c2107c2c22a59c3790cb721e..deba6c4116e763c80398c17ff3331cfa0ceb8a9b 100644
--- a/third_party/sycl/sycl_configure.bzl
+++ b/third_party/sycl/sycl_configure.bzl
@@ -11,122 +11,124 @@
 """
 
 _HOST_CXX_COMPILER = "HOST_CXX_COMPILER"
-_HOST_C_COMPILER= "HOST_C_COMPILER"
+_HOST_C_COMPILER = "HOST_C_COMPILER"
 _COMPUTECPP_TOOLKIT_PATH = "COMPUTECPP_TOOLKIT_PATH"
 _TRISYCL_INCLUDE_DIR = "TRISYCL_INCLUDE_DIR"
 _PYTHON_LIB_PATH = "PYTHON_LIB_PATH"
 
 def _enable_sycl(repository_ctx):
-  if "TF_NEED_OPENCL_SYCL" in repository_ctx.os.environ:
-    enable_sycl = repository_ctx.os.environ["TF_NEED_OPENCL_SYCL"].strip()
-    return enable_sycl == "1"
-  return False
+    if "TF_NEED_OPENCL_SYCL" in repository_ctx.os.environ:
+        enable_sycl = repository_ctx.os.environ["TF_NEED_OPENCL_SYCL"].strip()
+        return enable_sycl == "1"
+    return False
 
 def _enable_compute_cpp(repository_ctx):
-  return _COMPUTECPP_TOOLKIT_PATH in repository_ctx.os.environ
+    return _COMPUTECPP_TOOLKIT_PATH in repository_ctx.os.environ
 
 def auto_configure_fail(msg):
-  """Output failure message when auto configuration fails."""
-  red = "\033[0;31m"
-  no_color = "\033[0m"
-  fail("\n%sAuto-Configuration Error:%s %s\n" % (red, no_color, msg))
+    """Output failure message when auto configuration fails."""
+    red = "\033[0;31m"
+    no_color = "\033[0m"
+    fail("\n%sAuto-Configuration Error:%s %s\n" % (red, no_color, msg))
+
 # END cc_configure common functions (see TODO above).
 
 def find_c(repository_ctx):
-  """Find host C compiler."""
-  c_name = "gcc"
-  if _HOST_C_COMPILER in repository_ctx.os.environ:
-    c_name = repository_ctx.os.environ[_HOST_C_COMPILER].strip()
-  if c_name.startswith("/"):
-    return c_name
-  c = repository_ctx.which(c_name)
-  if c == None:
-    fail("Cannot find C compiler, please correct your path.")
-  return c
+    """Find host C compiler."""
+    c_name = "gcc"
+    if _HOST_C_COMPILER in repository_ctx.os.environ:
+        c_name = repository_ctx.os.environ[_HOST_C_COMPILER].strip()
+    if c_name.startswith("/"):
+        return c_name
+    c = repository_ctx.which(c_name)
+    if c == None:
+        fail("Cannot find C compiler, please correct your path.")
+    return c
 
 def find_cc(repository_ctx):
-  """Find host C++ compiler."""
-  cc_name = "g++"
-  if _HOST_CXX_COMPILER in repository_ctx.os.environ:
-    cc_name = repository_ctx.os.environ[_HOST_CXX_COMPILER].strip()
-  if cc_name.startswith("/"):
-    return cc_name
-  cc = repository_ctx.which(cc_name)
-  if cc == None:
-    fail("Cannot find C++ compiler, please correct your path.")
-  return cc
+    """Find host C++ compiler."""
+    cc_name = "g++"
+    if _HOST_CXX_COMPILER in repository_ctx.os.environ:
+        cc_name = repository_ctx.os.environ[_HOST_CXX_COMPILER].strip()
+    if cc_name.startswith("/"):
+        return cc_name
+    cc = repository_ctx.which(cc_name)
+    if cc == None:
+        fail("Cannot find C++ compiler, please correct your path.")
+    return cc
 
 def find_computecpp_root(repository_ctx):
-  """Find ComputeCpp compiler."""
-  sycl_name = ""
-  if _COMPUTECPP_TOOLKIT_PATH in repository_ctx.os.environ:
-    sycl_name = repository_ctx.os.environ[_COMPUTECPP_TOOLKIT_PATH].strip()
-  if sycl_name.startswith("/"):
-    return sycl_name
-  fail("Cannot find SYCL compiler, please correct your path")
+    """Find ComputeCpp compiler."""
+    sycl_name = ""
+    if _COMPUTECPP_TOOLKIT_PATH in repository_ctx.os.environ:
+        sycl_name = repository_ctx.os.environ[_COMPUTECPP_TOOLKIT_PATH].strip()
+    if sycl_name.startswith("/"):
+        return sycl_name
+    fail("Cannot find SYCL compiler, please correct your path")
 
 def find_trisycl_include_dir(repository_ctx):
-  """Find triSYCL include directory. """
-  if _TRISYCL_INCLUDE_DIR in repository_ctx.os.environ:
-    sycl_name = repository_ctx.os.environ[_TRISYCL_INCLUDE_DIR].strip()
-    if sycl_name.startswith("/"):
-      return sycl_name
-  fail( "Cannot find triSYCL include directory, please correct your path")
+    """Find triSYCL include directory. """
+    if _TRISYCL_INCLUDE_DIR in repository_ctx.os.environ:
+        sycl_name = repository_ctx.os.environ[_TRISYCL_INCLUDE_DIR].strip()
+        if sycl_name.startswith("/"):
+            return sycl_name
+    fail("Cannot find triSYCL include directory, please correct your path")
 
 def find_python_lib(repository_ctx):
-  """Returns python path."""
-  if _PYTHON_LIB_PATH in repository_ctx.os.environ:
-    return repository_ctx.os.environ[_PYTHON_LIB_PATH].strip()
-  fail("Environment variable PYTHON_LIB_PATH was not specified re-run ./configure")
-
+    """Returns python path."""
+    if _PYTHON_LIB_PATH in repository_ctx.os.environ:
+        return repository_ctx.os.environ[_PYTHON_LIB_PATH].strip()
+    fail("Environment variable PYTHON_LIB_PATH was not specified re-run ./configure")
 
 def _check_lib(repository_ctx, toolkit_path, lib):
-  """Checks if lib exists under sycl_toolkit_path or fail if it doesn't.
+    """Checks if lib exists under sycl_toolkit_path or fail if it doesn't.
 
-  Args:
-    repository_ctx: The repository context.
-    toolkit_path: The toolkit directory containing the libraries.
-    ib: The library to look for under toolkit_path.
-  """
-  lib_path = toolkit_path + "/" + lib
-  if not repository_ctx.path(lib_path).exists:
-    auto_configure_fail("Cannot find %s" % lib_path)
+    Args:
+      repository_ctx: The repository context.
+      toolkit_path: The toolkit directory containing the libraries.
+      ib: The library to look for under toolkit_path.
+    """
+    lib_path = toolkit_path + "/" + lib
+    if not repository_ctx.path(lib_path).exists:
+        auto_configure_fail("Cannot find %s" % lib_path)
 
 def _check_dir(repository_ctx, directory):
-  """Checks whether the directory exists and fail if it does not.
+    """Checks whether the directory exists and fail if it does not.
 
-  Args:
-    repository_ctx: The repository context.
-    directory: The directory to check the existence of.
-  """
-  if not repository_ctx.path(directory).exists:
-    auto_configure_fail("Cannot find dir: %s" % directory)
+    Args:
+      repository_ctx: The repository context.
+      directory: The directory to check the existence of.
+    """
+    if not repository_ctx.path(directory).exists:
+        auto_configure_fail("Cannot find dir: %s" % directory)
 
 def _symlink_dir(repository_ctx, src_dir, dest_dir):
-  """Symlinks all the files in a directory.
-
-  Args:
-    repository_ctx: The repository context.
-    src_dir: The source directory.
-    dest_dir: The destination directory to create the symlinks in.
-  """
-  files = repository_ctx.path(src_dir).readdir()
-  for src_file in files:
-    repository_ctx.symlink(src_file, dest_dir + "/" + src_file.basename)
-
-def _tpl(repository_ctx, tpl, substitutions={}, out=None):
-  if not out:
-    out = tpl.replace(":", "/")
-  repository_ctx.template(
-      out,
-      Label("//third_party/sycl/%s.tpl" % tpl),
-      substitutions)
+    """Symlinks all the files in a directory.
+
+    Args:
+      repository_ctx: The repository context.
+      src_dir: The source directory.
+      dest_dir: The destination directory to create the symlinks in.
+    """
+    files = repository_ctx.path(src_dir).readdir()
+    for src_file in files:
+        repository_ctx.symlink(src_file, dest_dir + "/" + src_file.basename)
+
+def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
+    if not out:
+        out = tpl.replace(":", "/")
+    repository_ctx.template(
+        out,
+        Label("//third_party/sycl/%s.tpl" % tpl),
+        substitutions,
+    )
 
 def _file(repository_ctx, label):
-  repository_ctx.template(
-      label.replace(":", "/"),
-      Label("//third_party/sycl/%s" % label),
-      {})
+    repository_ctx.template(
+        label.replace(":", "/"),
+        Label("//third_party/sycl/%s" % label),
+        {},
+    )
 
 _DUMMY_CROSSTOOL_BZL_FILE = """
 def error_sycl_disabled():
@@ -147,7 +149,6 @@ def error_sycl_disabled():
   )
 """
 
-
 _DUMMY_CROSSTOOL_BUILD_FILE = """
 load("//crosstool:error_sycl_disabled.bzl", "error_sycl_disabled")
 
@@ -155,87 +156,97 @@ error_sycl_disabled()
 """
 
 def _create_dummy_repository(repository_ctx):
-  # Set up BUILD file for sycl/.
-  _tpl(repository_ctx, "sycl:build_defs.bzl")
-  _tpl(repository_ctx, "sycl:BUILD")
-  _file(repository_ctx, "sycl:LICENSE.text")
-  _tpl(repository_ctx, "sycl:platform.bzl")
-
-  # Create dummy files for the SYCL toolkit since they are still required by
-  # tensorflow/sycl/platform/default/build_config:sycl.
-  repository_ctx.file("sycl/include/sycl.hpp", "")
-  repository_ctx.file("sycl/lib/libComputeCpp.so", "")
-
-  # If sycl_configure is not configured to build with SYCL support, and the user
-  # attempts to build with --config=sycl, add a dummy build rule to intercept
-  # this and fail with an actionable error message.
-  repository_ctx.file("crosstool/error_sycl_disabled.bzl",
-                      _DUMMY_CROSSTOOL_BZL_FILE)
-  repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
-
-
-def _sycl_autoconf_imp(repository_ctx):
-  """Implementation of the sycl_autoconf rule."""
-  if not _enable_sycl(repository_ctx):
-    _create_dummy_repository(repository_ctx)
-  else:
-    # copy template files
+    # Set up BUILD file for sycl/.
     _tpl(repository_ctx, "sycl:build_defs.bzl")
     _tpl(repository_ctx, "sycl:BUILD")
-    _tpl(repository_ctx, "sycl:platform.bzl")
-    _tpl(repository_ctx, "crosstool:BUILD")
     _file(repository_ctx, "sycl:LICENSE.text")
+    _tpl(repository_ctx, "sycl:platform.bzl")
 
-    if _enable_compute_cpp(repository_ctx):
-      _tpl(repository_ctx, "crosstool:computecpp",
-      {
-        "%{host_cxx_compiler}" : find_cc(repository_ctx),
-        "%{host_c_compiler}" : find_c(repository_ctx)
-      })
-
-      computecpp_root = find_computecpp_root(repository_ctx);
-      _check_dir(repository_ctx, computecpp_root)
-
-      _tpl(repository_ctx, "crosstool:CROSSTOOL",
-      {
-        "%{sycl_include_dir}" : computecpp_root,
-        "%{sycl_impl}" : "computecpp",
-        "%{c++_std}" : "-std=c++11",
-        "%{python_lib_path}" : find_python_lib(repository_ctx),
-      })
-
-      # symlink libraries
-      _check_lib(repository_ctx, computecpp_root+"/lib", "libComputeCpp.so" )
-      _symlink_dir(repository_ctx, computecpp_root + "/lib", "sycl/lib")
-      _symlink_dir(repository_ctx, computecpp_root + "/include", "sycl/include")
-      _symlink_dir(repository_ctx, computecpp_root + "/bin", "sycl/bin")
-    else:
-
-      trisycl_include_dir = find_trisycl_include_dir(repository_ctx);
-      _check_dir(repository_ctx, trisycl_include_dir)
-
-      _tpl(repository_ctx, "crosstool:trisycl",
-      {
-        "%{host_cxx_compiler}" : find_cc(repository_ctx),
-        "%{host_c_compiler}" : find_c(repository_ctx),
-        "%{trisycl_include_dir}" : trisycl_include_dir
-      })
-
-
-      _tpl(repository_ctx, "crosstool:CROSSTOOL",
-      {
-        "%{sycl_include_dir}" : trisycl_include_dir,
-        "%{sycl_impl}" : "trisycl",
-        "%{c++_std}" : "-std=c++1y",
-        "%{python_lib_path}" : find_python_lib(repository_ctx),
-      })
+    # Create dummy files for the SYCL toolkit since they are still required by
+    # tensorflow/sycl/platform/default/build_config:sycl.
+    repository_ctx.file("sycl/include/sycl.hpp", "")
+    repository_ctx.file("sycl/lib/libComputeCpp.so", "")
 
-      _symlink_dir(repository_ctx, trisycl_include_dir, "sycl/include")
+    # If sycl_configure is not configured to build with SYCL support, and the user
+    # attempts to build with --config=sycl, add a dummy build rule to intercept
+    # this and fail with an actionable error message.
+    repository_ctx.file(
+        "crosstool/error_sycl_disabled.bzl",
+        _DUMMY_CROSSTOOL_BZL_FILE,
+    )
+    repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
 
+def _sycl_autoconf_imp(repository_ctx):
+    """Implementation of the sycl_autoconf rule."""
+    if not _enable_sycl(repository_ctx):
+        _create_dummy_repository(repository_ctx)
+    else:
+        # copy template files
+        _tpl(repository_ctx, "sycl:build_defs.bzl")
+        _tpl(repository_ctx, "sycl:BUILD")
+        _tpl(repository_ctx, "sycl:platform.bzl")
+        _tpl(repository_ctx, "crosstool:BUILD")
+        _file(repository_ctx, "sycl:LICENSE.text")
+
+        if _enable_compute_cpp(repository_ctx):
+            _tpl(
+                repository_ctx,
+                "crosstool:computecpp",
+                {
+                    "%{host_cxx_compiler}": find_cc(repository_ctx),
+                    "%{host_c_compiler}": find_c(repository_ctx),
+                },
+            )
+
+            computecpp_root = find_computecpp_root(repository_ctx)
+            _check_dir(repository_ctx, computecpp_root)
+
+            _tpl(
+                repository_ctx,
+                "crosstool:CROSSTOOL",
+                {
+                    "%{sycl_include_dir}": computecpp_root,
+                    "%{sycl_impl}": "computecpp",
+                    "%{c++_std}": "-std=c++11",
+                    "%{python_lib_path}": find_python_lib(repository_ctx),
+                },
+            )
+
+            # symlink libraries
+            _check_lib(repository_ctx, computecpp_root + "/lib", "libComputeCpp.so")
+            _symlink_dir(repository_ctx, computecpp_root + "/lib", "sycl/lib")
+            _symlink_dir(repository_ctx, computecpp_root + "/include", "sycl/include")
+            _symlink_dir(repository_ctx, computecpp_root + "/bin", "sycl/bin")
+        else:
+            trisycl_include_dir = find_trisycl_include_dir(repository_ctx)
+            _check_dir(repository_ctx, trisycl_include_dir)
+
+            _tpl(
+                repository_ctx,
+                "crosstool:trisycl",
+                {
+                    "%{host_cxx_compiler}": find_cc(repository_ctx),
+                    "%{host_c_compiler}": find_c(repository_ctx),
+                    "%{trisycl_include_dir}": trisycl_include_dir,
+                },
+            )
+
+            _tpl(
+                repository_ctx,
+                "crosstool:CROSSTOOL",
+                {
+                    "%{sycl_include_dir}": trisycl_include_dir,
+                    "%{sycl_impl}": "trisycl",
+                    "%{c++_std}": "-std=c++1y",
+                    "%{python_lib_path}": find_python_lib(repository_ctx),
+                },
+            )
+
+            _symlink_dir(repository_ctx, trisycl_include_dir, "sycl/include")
 
 sycl_configure = repository_rule(
-  implementation = _sycl_autoconf_imp,
-  local = True,
+    implementation = _sycl_autoconf_imp,
+    local = True,
 )
 """Detects and configures the SYCL toolchain.
 
diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl
index c6de25b33e3fab545d845986231c6880632babeb..1d019a1b4f531d077747203c38296b4aa0bd8d82 100644
--- a/third_party/tensorrt/tensorrt_configure.bzl
+++ b/third_party/tensorrt/tensorrt_configure.bzl
@@ -10,13 +10,13 @@
 load(
     "//third_party/gpus:cuda_configure.bzl",
     "auto_configure_fail",
-    "get_cpu_value",
     "find_cuda_define",
     "find_lib",
+    "get_cpu_value",
     "lib_name",
-    "matches_version",
     "make_copy_dir_rule",
     "make_copy_files_rule",
+    "matches_version",
 )
 
 _TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH"
@@ -30,185 +30,200 @@ _DEFINE_TENSORRT_SONAME_MAJOR = "#define NV_TENSORRT_SONAME_MAJOR"
 _DEFINE_TENSORRT_SONAME_MINOR = "#define NV_TENSORRT_SONAME_MINOR"
 _DEFINE_TENSORRT_SONAME_PATCH = "#define NV_TENSORRT_SONAME_PATCH"
 
-
 def _headers_exist(repository_ctx, path):
-  """Returns whether all TensorRT header files could be found in 'path'.
-
-  Args:
-    repository_ctx: The repository context.
-    path: The TensorRT include path to check.
+    """Returns whether all TensorRT header files could be found in 'path'.
 
-  Returns:
-    True if all TensorRT header files can be found in the path.
-  """
-  for h in _TF_TENSORRT_HEADERS:
-    if not repository_ctx.path("%s/%s" % (path, h)).exists:
-      return False
-  return True
+    Args:
+      repository_ctx: The repository context.
+      path: The TensorRT include path to check.
 
+    Returns:
+      True if all TensorRT header files can be found in the path.
+    """
+    for h in _TF_TENSORRT_HEADERS:
+        if not repository_ctx.path("%s/%s" % (path, h)).exists:
+            return False
+    return True
 
 def _find_trt_header_dir(repository_ctx, trt_install_path):
-  """Returns the path to the directory containing headers of TensorRT.
-
-  Args:
-    repository_ctx: The repository context.
-    trt_install_path: The TensorRT library install directory.
-
-  Returns:
-    The path of the directory containing the TensorRT header.
-  """
-  if trt_install_path == "/usr/lib/x86_64-linux-gnu":
-    path = "/usr/include/x86_64-linux-gnu"
-    if _headers_exist(repository_ctx, path):
-      return path
-  if trt_install_path == "/usr/lib/aarch64-linux-gnu":
-    path = "/usr/include/aarch64-linux-gnu"
+    """Returns the path to the directory containing headers of TensorRT.
+
+    Args:
+      repository_ctx: The repository context.
+      trt_install_path: The TensorRT library install directory.
+
+    Returns:
+      The path of the directory containing the TensorRT header.
+    """
+    if trt_install_path == "/usr/lib/x86_64-linux-gnu":
+        path = "/usr/include/x86_64-linux-gnu"
+        if _headers_exist(repository_ctx, path):
+            return path
+    if trt_install_path == "/usr/lib/aarch64-linux-gnu":
+        path = "/usr/include/aarch64-linux-gnu"
+        if _headers_exist(repository_ctx, path):
+            return path
+    path = str(repository_ctx.path("%s/../include" % trt_install_path).realpath)
     if _headers_exist(repository_ctx, path):
-      return path
-  path = str(repository_ctx.path("%s/../include" % trt_install_path).realpath)
-  if _headers_exist(repository_ctx, path):
-    return path
-  auto_configure_fail(
-      "Cannot find NvInfer.h with TensorRT install path %s" % trt_install_path)
-
-
-def _trt_lib_version(repository_ctx, trt_install_path):
-  """Detects the library (e.g. libnvinfer) version of TensorRT.
-
-  Args:
-    repository_ctx: The repository context.
-    trt_install_path: The TensorRT library install directory.
-
-  Returns:
-    A string containing the library version of TensorRT.
-  """
-  trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
-  major_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
-                                   _DEFINE_TENSORRT_SONAME_MAJOR)
-  minor_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
-                                   _DEFINE_TENSORRT_SONAME_MINOR)
-  patch_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
-                                   _DEFINE_TENSORRT_SONAME_PATCH)
-  full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
-  environ_version = repository_ctx.os.environ[_TF_TENSORRT_VERSION].strip()
-  if not matches_version(environ_version, full_version):
+        return path
     auto_configure_fail(
-        ("TensorRT library version detected from %s/%s (%s) does not match " +
-         "TF_TENSORRT_VERSION (%s). To fix this rerun configure again.") %
-        (trt_header_dir, "NvInfer.h", full_version, environ_version))
-  # Only use the major version to match the SONAME of the library.
-  return major_version
+        "Cannot find NvInfer.h with TensorRT install path %s" % trt_install_path,
+    )
 
+def _trt_lib_version(repository_ctx, trt_install_path):
+    """Detects the library (e.g. libnvinfer) version of TensorRT.
+
+    Args:
+      repository_ctx: The repository context.
+      trt_install_path: The TensorRT library install directory.
+
+    Returns:
+      A string containing the library version of TensorRT.
+    """
+    trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
+    major_version = find_cuda_define(
+        repository_ctx,
+        trt_header_dir,
+        "NvInfer.h",
+        _DEFINE_TENSORRT_SONAME_MAJOR,
+    )
+    minor_version = find_cuda_define(
+        repository_ctx,
+        trt_header_dir,
+        "NvInfer.h",
+        _DEFINE_TENSORRT_SONAME_MINOR,
+    )
+    patch_version = find_cuda_define(
+        repository_ctx,
+        trt_header_dir,
+        "NvInfer.h",
+        _DEFINE_TENSORRT_SONAME_PATCH,
+    )
+    full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
+    environ_version = repository_ctx.os.environ[_TF_TENSORRT_VERSION].strip()
+    if not matches_version(environ_version, full_version):
+        auto_configure_fail(
+            ("TensorRT library version detected from %s/%s (%s) does not match " +
+             "TF_TENSORRT_VERSION (%s). To fix this rerun configure again.") %
+            (trt_header_dir, "NvInfer.h", full_version, environ_version),
+        )
+
+    # Only use the major version to match the SONAME of the library.
+    return major_version
 
 def _find_trt_libs(repository_ctx, cpu_value, trt_install_path, trt_lib_version):
-  """Finds the given TensorRT library on the system.
-
-  Adapted from code contributed by Sami Kama (https://github.com/samikama).
-
-  Args:
-    repository_ctx: The repository context.
-    trt_install_path: The TensorRT library installation directory.
-    trt_lib_version: The version of TensorRT library files as returned
-      by _trt_lib_version.
-
-  Returns:
-    The path to the library.
-  """
-  result = {}
-  for lib in _TF_TENSORRT_LIBS:
-    file_name = lib_name("nvinfer", cpu_value, trt_lib_version)
-    path = find_lib(repository_ctx, ["%s/%s" % (trt_install_path, file_name)])
-    result[file_name] = path
-  return result
-
+    """Finds the given TensorRT library on the system.
+
+    Adapted from code contributed by Sami Kama (https://github.com/samikama).
+
+    Args:
+      repository_ctx: The repository context.
+      trt_install_path: The TensorRT library installation directory.
+      trt_lib_version: The version of TensorRT library files as returned
+        by _trt_lib_version.
+
+    Returns:
+      The path to the library.
+    """
+    result = {}
+    for lib in _TF_TENSORRT_LIBS:
+        file_name = lib_name("nvinfer", cpu_value, trt_lib_version)
+        path = find_lib(repository_ctx, ["%s/%s" % (trt_install_path, file_name)])
+        result[file_name] = path
+    return result
 
 def _tpl(repository_ctx, tpl, substitutions):
-  repository_ctx.template(tpl, Label("//third_party/tensorrt:%s.tpl" % tpl),
-                          substitutions)
-
+    repository_ctx.template(
+        tpl,
+        Label("//third_party/tensorrt:%s.tpl" % tpl),
+        substitutions,
+    )
 
 def _create_dummy_repository(repository_ctx):
-  """Create a dummy TensorRT repository."""
-  _tpl(repository_ctx, "build_defs.bzl", {"%{if_tensorrt}": "if_false"})
+    """Create a dummy TensorRT repository."""
+    _tpl(repository_ctx, "build_defs.bzl", {"%{if_tensorrt}": "if_false"})
 
-  _tpl(repository_ctx, "BUILD", {
-      "%{tensorrt_genrules}": "",
-      "%{tensorrt_headers}": "[]",
-      "%{tensorrt_libs}": "[]"
-  })
+    _tpl(repository_ctx, "BUILD", {
+        "%{tensorrt_genrules}": "",
+        "%{tensorrt_headers}": "[]",
+        "%{tensorrt_libs}": "[]",
+    })
 
 def _tensorrt_configure_impl(repository_ctx):
-  """Implementation of the tensorrt_configure repository rule."""
-  if _TF_TENSORRT_CONFIG_REPO in repository_ctx.os.environ:
-    # Forward to the pre-configured remote repository.
-    remote_config_repo = repository_ctx.os.environ[_TF_TENSORRT_CONFIG_REPO]
-    repository_ctx.template("BUILD", Label(remote_config_repo + ":BUILD"), {})
-    repository_ctx.template(
-        "build_defs.bzl",
-        Label(remote_config_repo + ":build_defs.bzl"),
-        {},
+    """Implementation of the tensorrt_configure repository rule."""
+    if _TF_TENSORRT_CONFIG_REPO in repository_ctx.os.environ:
+        # Forward to the pre-configured remote repository.
+        remote_config_repo = repository_ctx.os.environ[_TF_TENSORRT_CONFIG_REPO]
+        repository_ctx.template("BUILD", Label(remote_config_repo + ":BUILD"), {})
+        repository_ctx.template(
+            "build_defs.bzl",
+            Label(remote_config_repo + ":build_defs.bzl"),
+            {},
+        )
+        return
+
+    if _TENSORRT_INSTALL_PATH not in repository_ctx.os.environ:
+        _create_dummy_repository(repository_ctx)
+        return
+
+    cpu_value = get_cpu_value(repository_ctx)
+    if (cpu_value != "Linux"):
+        auto_configure_fail("TensorRT is supported only on Linux.")
+    if _TF_TENSORRT_VERSION not in repository_ctx.os.environ:
+        auto_configure_fail("TensorRT library (libnvinfer) version is not set.")
+    trt_install_path = repository_ctx.os.environ[_TENSORRT_INSTALL_PATH].strip()
+    if not repository_ctx.path(trt_install_path).exists:
+        auto_configure_fail(
+            "Cannot find TensorRT install path %s." % trt_install_path,
+        )
+
+    # Copy the library files.
+    trt_lib_version = _trt_lib_version(repository_ctx, trt_install_path)
+    trt_libs = _find_trt_libs(repository_ctx, cpu_value, trt_install_path, trt_lib_version)
+    trt_lib_srcs = []
+    trt_lib_outs = []
+    for path in trt_libs.values():
+        trt_lib_srcs.append(str(path))
+        trt_lib_outs.append("tensorrt/lib/" + path.basename)
+    copy_rules = [make_copy_files_rule(
+        repository_ctx,
+        name = "tensorrt_lib",
+        srcs = trt_lib_srcs,
+        outs = trt_lib_outs,
+    )]
+
+    # Copy the header files header files.
+    trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
+    trt_header_srcs = [
+        "%s/%s" % (trt_header_dir, header)
+        for header in _TF_TENSORRT_HEADERS
+    ]
+    trt_header_outs = [
+        "tensorrt/include/" + header
+        for header in _TF_TENSORRT_HEADERS
+    ]
+    copy_rules.append(
+        make_copy_files_rule(
+            repository_ctx,
+            name = "tensorrt_include",
+            srcs = trt_header_srcs,
+            outs = trt_header_outs,
+        ),
     )
-    return
-
-  if _TENSORRT_INSTALL_PATH not in repository_ctx.os.environ:
-    _create_dummy_repository(repository_ctx)
-    return
-
-  cpu_value = get_cpu_value(repository_ctx)
-  if (cpu_value != "Linux"):
-    auto_configure_fail("TensorRT is supported only on Linux.")
-  if _TF_TENSORRT_VERSION not in repository_ctx.os.environ:
-    auto_configure_fail("TensorRT library (libnvinfer) version is not set.")
-  trt_install_path = repository_ctx.os.environ[_TENSORRT_INSTALL_PATH].strip()
-  if not repository_ctx.path(trt_install_path).exists:
-    auto_configure_fail(
-        "Cannot find TensorRT install path %s." % trt_install_path)
-
-  # Copy the library files.
-  trt_lib_version = _trt_lib_version(repository_ctx, trt_install_path)
-  trt_libs = _find_trt_libs(repository_ctx, cpu_value, trt_install_path, trt_lib_version)
-  trt_lib_srcs = []
-  trt_lib_outs = []
-  for path in trt_libs.values():
-    trt_lib_srcs.append(str(path))
-    trt_lib_outs.append("tensorrt/lib/" + path.basename)
-  copy_rules = [make_copy_files_rule(
-      repository_ctx,
-      name = "tensorrt_lib",
-      srcs = trt_lib_srcs,
-      outs = trt_lib_outs,
-  )]
-
-  # Copy the header files header files.
-  trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
-  trt_header_srcs = [
-      "%s/%s" % (trt_header_dir, header) for header in _TF_TENSORRT_HEADERS
-  ]
-  trt_header_outs = [
-      "tensorrt/include/" + header for header in _TF_TENSORRT_HEADERS
-  ]
-  copy_rules.append(
-      make_copy_files_rule(
-          repository_ctx,
-          name = "tensorrt_include",
-          srcs = trt_header_srcs,
-          outs = trt_header_outs,
-  ))
-
-  # Set up config file.
-  _tpl(repository_ctx, "build_defs.bzl", {"%{if_tensorrt}": "if_true"})
-
-  # Set up BUILD file.
-  _tpl(repository_ctx, "BUILD", {
-      "%{copy_rules}": "\n".join(copy_rules),
-      "%{tensorrt_headers}": '":tensorrt_include"',
-      "%{tensorrt_libs}": str(trt_lib_outs),
-  })
 
+    # Set up config file.
+    _tpl(repository_ctx, "build_defs.bzl", {"%{if_tensorrt}": "if_true"})
+
+    # Set up BUILD file.
+    _tpl(repository_ctx, "BUILD", {
+        "%{copy_rules}": "\n".join(copy_rules),
+        "%{tensorrt_headers}": '":tensorrt_include"',
+        "%{tensorrt_libs}": str(trt_lib_outs),
+    })
 
 tensorrt_configure = repository_rule(
-    implementation=_tensorrt_configure_impl,
-    environ=[
+    implementation = _tensorrt_configure_impl,
+    environ = [
         _TENSORRT_INSTALL_PATH,
         _TF_TENSORRT_VERSION,
     ],
diff --git a/third_party/toolchains/clang6/repo.bzl b/third_party/toolchains/clang6/repo.bzl
index b81f44506f382a6f2e4bfc005249c1a56cf0ae2f..e4b6422c96d749f5b84211ee9911069bc7e62e9b 100644
--- a/third_party/toolchains/clang6/repo.bzl
+++ b/third_party/toolchains/clang6/repo.bzl
@@ -1,30 +1,37 @@
 """Repository rule for Debian 8 Jessie Clang-6.0 portable Linux builds."""
 
 def _clang6_configure(ctx):
-  # TODO(jart): It'd probably be better to use Bazel's struct.to_proto()
-  #             method to generate a gigantic CROSSTOOL file that allows
-  #             Clang to support everything.
-  ctx.symlink(
-      ctx.os.environ.get('TF_LLVM_PATH',
-                         '/usr/lib/llvm-6.0'),
-      'clang6/llvm')
-  ctx.symlink(
-      ctx.os.environ.get('STRIP', '/usr/bin/strip'),
-      'clang6/sbin/strip')
-  ctx.symlink(
-      ctx.os.environ.get('OBJDUMP', '/usr/bin/objdump'),
-      'clang6/sbin/objdump')
-  ctx.symlink(ctx.attr._build, 'clang6/BUILD')
-  ctx.template('clang6/CROSSTOOL', ctx.attr._crosstool, {
-      '%package(@local_config_clang6//clang6)%': str(ctx.path('clang6')),
-  })
+    # TODO(jart): It'd probably be better to use Bazel's struct.to_proto()
+    #             method to generate a gigantic CROSSTOOL file that allows
+    #             Clang to support everything.
+    ctx.symlink(
+        ctx.os.environ.get(
+            "TF_LLVM_PATH",
+            "/usr/lib/llvm-6.0",
+        ),
+        "clang6/llvm",
+    )
+    ctx.symlink(
+        ctx.os.environ.get("STRIP", "/usr/bin/strip"),
+        "clang6/sbin/strip",
+    )
+    ctx.symlink(
+        ctx.os.environ.get("OBJDUMP", "/usr/bin/objdump"),
+        "clang6/sbin/objdump",
+    )
+    ctx.symlink(ctx.attr._build, "clang6/BUILD")
+    ctx.template("clang6/CROSSTOOL", ctx.attr._crosstool, {
+        "%package(@local_config_clang6//clang6)%": str(ctx.path("clang6")),
+    })
 
 clang6_configure = repository_rule(
     implementation = _clang6_configure,
     attrs = {
-        '_build': attr.label(
-            default=str(Label('//third_party/toolchains/clang6:clang.BUILD'))),
-        '_crosstool': attr.label(
-            default=str(Label('//third_party/toolchains/clang6:CROSSTOOL.tpl'))),
+        "_build": attr.label(
+            default = str(Label("//third_party/toolchains/clang6:clang.BUILD")),
+        ),
+        "_crosstool": attr.label(
+            default = str(Label("//third_party/toolchains/clang6:CROSSTOOL.tpl")),
+        ),
     },
 )
diff --git a/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl b/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl
index ab6eac115ce5523a66dd3db66a89dd7c24281ac1..d675e95f70fd31cc73c6c882e9decff3e8991df8 100644
--- a/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl
+++ b/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl
@@ -1,38 +1,38 @@
 # -*- Python -*-
 """Repository rule for arm compiler autoconfiguration."""
 
-def _tpl(repository_ctx, tpl, substitutions={}, out=None):
-  if not out:
-    out = tpl
-  repository_ctx.template(
-      out,
-      Label("//third_party/toolchains/cpus/arm:%s.tpl" % tpl),
-      substitutions)
-
+def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
+    if not out:
+        out = tpl
+    repository_ctx.template(
+        out,
+        Label("//third_party/toolchains/cpus/arm:%s.tpl" % tpl),
+        substitutions,
+    )
 
 def _arm_compiler_configure_impl(repository_ctx):
-  # We need to find a cross-compilation include directory for Python, so look
-  # for an environment variable. Be warned, this crosstool template is only
-  # regenerated on the first run of Bazel, so if you change the variable after
-  # it may not be reflected in later builds. Doing a shutdown and clean of Bazel
-  # doesn't fix this, you'll need to delete the generated file at something like:
-  # external/local_config_arm_compiler/CROSSTOOL in your Bazel install.
-  if "CROSSTOOL_PYTHON_INCLUDE_PATH" in repository_ctx.os.environ:
-    python_include_path = repository_ctx.os.environ["CROSSTOOL_PYTHON_INCLUDE_PATH"]
-  else:
-    python_include_path = "/usr/include/python2.7"
-  _tpl(repository_ctx, "CROSSTOOL", {
-      "%{ARM_COMPILER_PATH}%": str(repository_ctx.path(
-          repository_ctx.attr.remote_config_repo)),
-      "%{PYTHON_INCLUDE_PATH}%": python_include_path,
-  })
-  repository_ctx.symlink(repository_ctx.attr.build_file, "BUILD")
-
+    # We need to find a cross-compilation include directory for Python, so look
+    # for an environment variable. Be warned, this crosstool template is only
+    # regenerated on the first run of Bazel, so if you change the variable after
+    # it may not be reflected in later builds. Doing a shutdown and clean of Bazel
+    # doesn't fix this, you'll need to delete the generated file at something like:
+    # external/local_config_arm_compiler/CROSSTOOL in your Bazel install.
+    if "CROSSTOOL_PYTHON_INCLUDE_PATH" in repository_ctx.os.environ:
+        python_include_path = repository_ctx.os.environ["CROSSTOOL_PYTHON_INCLUDE_PATH"]
+    else:
+        python_include_path = "/usr/include/python2.7"
+    _tpl(repository_ctx, "CROSSTOOL", {
+        "%{ARM_COMPILER_PATH}%": str(repository_ctx.path(
+            repository_ctx.attr.remote_config_repo,
+        )),
+        "%{PYTHON_INCLUDE_PATH}%": python_include_path,
+    })
+    repository_ctx.symlink(repository_ctx.attr.build_file, "BUILD")
 
 arm_compiler_configure = repository_rule(
     implementation = _arm_compiler_configure_impl,
     attrs = {
-        "remote_config_repo": attr.string(mandatory = False, default =""),
+        "remote_config_repo": attr.string(mandatory = False, default = ""),
         "build_file": attr.label(),
     },
 )
diff --git a/third_party/toolchains/preconfig/generate/archives.bzl b/third_party/toolchains/preconfig/generate/archives.bzl
index bafc7d49434f48dfaad5b421e2ead472fd7ebc5f..a26d2c623c9b05a74787db2b9f5e218e2102ce8c 100644
--- a/third_party/toolchains/preconfig/generate/archives.bzl
+++ b/third_party/toolchains/preconfig/generate/archives.bzl
@@ -1,13 +1,12 @@
-load("//tensorflow:version_check.bzl", "parse_bazel_version")
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 
 def bazel_toolchains_archive():
     http_archive(
-      name = "bazel_toolchains",
-      sha256 = "ee854b5de299138c1f4a2edb5573d22b21d975acfc7aa938f36d30b49ef97498",
-      strip_prefix = "bazel-toolchains-37419a124bdb9af2fec5b99a973d359b6b899b61",
-      urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/37419a124bdb9af2fec5b99a973d359b6b899b61.tar.gz",
-        "https://github.com/bazelbuild/bazel-toolchains/archive/37419a124bdb9af2fec5b99a973d359b6b899b61.tar.gz",
-      ],
+        name = "bazel_toolchains",
+        sha256 = "109a99384f9d08f9e75136d218ebaebc68cc810c56897aea2224c57932052d30",
+        strip_prefix = "bazel-toolchains-94d31935a2c94fe7e7c7379a0f3393e181928ff7",
+        urls = [
+            "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/94d31935a2c94fe7e7c7379a0f3393e181928ff7.tar.gz",
+            "https://github.com/bazelbuild/bazel-toolchains/archive/94d31935a2c94fe7e7c7379a0f3393e181928ff7.tar.gz",
+        ],
     )
diff --git a/third_party/toolchains/preconfig/generate/generate.bzl b/third_party/toolchains/preconfig/generate/generate.bzl
index 40e0957cf2e2e36ee26e05bcda8fb0c873a7a40e..7458c4dc46223332d643c37aeebbb418bd49b195 100644
--- a/third_party/toolchains/preconfig/generate/generate.bzl
+++ b/third_party/toolchains/preconfig/generate/generate.bzl
@@ -27,6 +27,7 @@ def _tensorflow_rbe_config(name, compiler, python_version, cuda_version = None,
 
     if cuda_version != None:
         base = "@cuda%s-cudnn%s-ubuntu14.04//image" % (cuda_version, cudnn_version)
+
         # The cuda toolchain currently contains its own C++ toolchain definition,
         # so we do not fetch local_config_cc.
         config_repos = [
@@ -37,12 +38,12 @@ def _tensorflow_rbe_config(name, compiler, python_version, cuda_version = None,
         env.update({
             "TF_NEED_CUDA": "1",
             "TF_CUDA_CLANG": "1" if compiler == "clang" else "0",
-            "TF_CUDA_COMPUTE_CAPABILITIES": "3.0",
+            "TF_CUDA_COMPUTE_CAPABILITIES": "3.0,6.0",
             "TF_ENABLE_XLA": "1",
             "TF_CUDNN_VERSION": cudnn_version,
             "TF_CUDA_VERSION": cuda_version,
             "CUDNN_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
-            "TF_NEED_TENSORRT" : "1",
+            "TF_NEED_TENSORRT": "1",
             "TF_TENSORRT_VERSION": tensorrt_version,
             "TENSORRT_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
             "GCC_HOST_COMPILER_PATH": compiler if compiler != "clang" else "",
diff --git a/third_party/toolchains/preconfig/generate/generate.sh b/third_party/toolchains/preconfig/generate/generate.sh
index e39daa6a30baf1c91a5b00f4b7491620fc4cf5dd..754ec2057e04ee527f06bacfcf5b44e4ab823344 100755
--- a/third_party/toolchains/preconfig/generate/generate.sh
+++ b/third_party/toolchains/preconfig/generate/generate.sh
@@ -99,6 +99,7 @@ else
 fi
 
 # Cleanup for copybara.
+find "${OS}" -name '*.h' |xargs clang-format -i
 find "${OS}" -name 'BUILD' -o -name '*.bzl' |xargs buildifier
 find "${OS}" -name 'BUILD' -o -name '*.bzl' |xargs -I {} mv {} {}.oss
 
diff --git a/third_party/toolchains/preconfig/generate/workspace.bzl b/third_party/toolchains/preconfig/generate/workspace.bzl
index 0495173786328367b1a74d00653da58f759d963c..bce2d5bdbf44ceacb8bf9ad7bee7c6110c1edba7 100644
--- a/third_party/toolchains/preconfig/generate/workspace.bzl
+++ b/third_party/toolchains/preconfig/generate/workspace.bzl
@@ -1,7 +1,10 @@
+load(
+    "@io_bazel_rules_docker//repositories:repositories.bzl",
+    container_repositories = "repositories",
+)
 load(
     "@io_bazel_rules_docker//container:container.bzl",
     "container_pull",
-    container_repositories = "repositories",
 )
 load(":containers.bzl", "container_digests")
 
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
index 13429d7b88be691eeddad77e9de1aede2819816f..a75170ed7239c48b9a9901c74f30825f3babafad 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
@@ -1192,7 +1192,7 @@ genrule(
         "cuda/include/vector_functions.hpp",
         "cuda/include/vector_types.h",
     ],
-    cmd = """cp -rf "/usr/local/cuda-10.0/include/." "$(@D)/cuda/include/" """,
+    cmd = """cp -rLf "/usr/local/cuda-10.0/include/." "$(@D)/cuda/include/" """,
 )
 
 genrule(
@@ -1200,7 +1200,7 @@ genrule(
     outs = [
         "cuda/nvvm/libdevice/libdevice.10.bc",
     ],
-    cmd = """cp -rf "/usr/local/cuda-10.0/nvvm/libdevice/." "$(@D)/" """,
+    cmd = """cp -rLf "/usr/local/cuda-10.0/nvvm/libdevice/." "$(@D)/" """,
 )
 
 genrule(
@@ -1237,7 +1237,7 @@ genrule(
         "cuda/extras/CUPTI/include/openmp/cupti_openmp.h",
         "cuda/extras/CUPTI/include/openmp/ompt.h",
     ],
-    cmd = """cp -rf "/usr/local/cuda-10.0/extras/CUPTI/include/." "$(@D)/cuda/extras/CUPTI/include/" """,
+    cmd = """cp -rLf "/usr/local/cuda-10.0/extras/CUPTI/include/." "$(@D)/cuda/extras/CUPTI/include/" """,
 )
 
 genrule(
@@ -1277,7 +1277,7 @@ genrule(
         "cuda/bin/nvprune",
         "cuda/bin/ptxas",
     ],
-    cmd = """cp -rf "/usr/local/cuda-10.0/bin/." "$(@D)/cuda/bin/" """,
+    cmd = """cp -rLf "/usr/local/cuda-10.0/bin/." "$(@D)/cuda/bin/" """,
 )
 
 genrule(
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h
old mode 100755
new mode 100644
index 0934618e0b538ab0db2a969870c85aa9c4053130..783d678417644be42150954116b03790813e3fea
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef CUDA_CUDA_CONFIG_H_
 #define CUDA_CUDA_CONFIG_H_
 
-#define TF_CUDA_CAPABILITIES CudaVersion("3.0")
+#define TF_CUDA_CAPABILITIES CudaVersion("3.0"), CudaVersion("6.0")
 
 #define TF_CUDA_VERSION "10.0"
 #define TF_CUDNN_VERSION "7"
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
index 6442e7628a416e3298cfd2579cee275459780145..2b84b761ff9fb0c8a803a8cae8d1f9b89c210008 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
@@ -22,6 +22,12 @@ cc_toolchain_suite(
         "local|compiler": ":cc-compiler-local",
         "darwin|compiler": ":cc-compiler-darwin",
         "x64_windows|msvc-cl": ":cc-compiler-windows",
+        "x64_windows": ":cc-compiler-windows",
+        "arm": ":cc-compiler-local",
+        "k8": ":cc-compiler-local",
+        "piii": ":cc-compiler-local",
+        "ppc": ":cc-compiler-local",
+        "darwin": ":cc-compiler-darwin",
     },
 )
 
@@ -41,6 +47,7 @@ cc_toolchain(
     # last on the command line and contain all shared libraries to link, so all
     # regular options will be left of them.
     supports_param_files = 1,
+    toolchain_identifier = "local_linux",
 )
 
 cc_toolchain(
@@ -55,6 +62,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 0,
+    toolchain_identifier = "local_darwin",
 )
 
 cc_toolchain(
@@ -69,6 +77,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 1,
+    toolchain_identifier = "local_windows",
 )
 
 filegroup(
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/CROSSTOOL b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/CROSSTOOL
index 1c2e8bcae63ebc9b1ee22b5d677c185589b547f8..b6b87e87d2bed20d2d6def051e7b7c32374189e9 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/CROSSTOOL
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/CROSSTOOL
@@ -2,31 +2,6 @@ major_version: "local"
 minor_version: ""
 default_target_cpu: "same_as_host"
 
-default_toolchain {
-  cpu: "k8"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "piii"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "arm"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "darwin"
-  toolchain_identifier: "local_darwin"
-}
-default_toolchain {
-  cpu: "ppc"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "x64_windows"
-  toolchain_identifier: "local_windows"
-}
-
 toolchain {
   abi_version: "local"
   abi_libc_version: "local"
@@ -664,6 +639,31 @@ toolchain {
     name: "no_legacy_features"
   }
 
+  # TODO(klimek): Previously we were using a .bat file to start python to run
+  # the python script that can redirect to nvcc - unfortunately .bat files
+  # have a rather short maximum length for command lines (8k). Instead, we
+  # now use the python binary as the compiler and pass the python script to
+  # it at the start of the command line. Investigate different possibilities
+  # to run the nvcc wrapper, either using pyinstaller --onefile, or writing
+  # a small C++ wrapper to redirect.
+  feature {
+    name: "redirector"
+    enabled: true
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      flag_group {
+        flag: "-B"
+        flag: "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py"
+      }
+    }
+  }
+
   # Suppress startup banner.
   feature {
     name: "nologo"
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
index 7ae59e9967adf9b1a980a8085e203459ba8a7c7b..c49b20f2eb9d9c8f9c66ca5096aad771377bf437 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
@@ -199,7 +199,7 @@ def InvokeNvcc(argv, log=False):
   srcs = ' '.join(src_files)
   out = ' -o ' + out_file[0]
 
-  supported_cuda_compute_capabilities = [ "3.0" ]
+  supported_cuda_compute_capabilities = [ "3.0", "6.0" ]
   nvccopts = '-D_FORCE_INLINES '
   for capability in supported_cuda_compute_capabilities:
     capability = capability.replace('.', '')
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
index 426b9ca86746c3ef92299435d7de4e6191e4b664..510ba52fd5eaa328bd7514c86c1e5e2d7ddd7893 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
@@ -36,7 +36,7 @@ GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
 NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc'
 NVCC_VERSION = '10.0'
 NVCC_TEMP_DIR = "C:\\Windows\\Temp\\nvcc_inter_files_tmp_dir"
-supported_cuda_compute_capabilities = [ "3.0" ]
+supported_cuda_compute_capabilities = [ "3.0", "6.0" ]
 
 def Log(s):
   print('gpus/crosstool: {0}'.format(s))
@@ -103,9 +103,8 @@ def InvokeNvcc(argv, log=False):
     The return value of calling os.system('nvcc ' + args)
   """
 
-  src_files = [
-      f for f in argv if re.search(r'\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)
-  ]
+  src_files = [f for f in argv if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
   if len(src_files) == 0:
     raise Error('No source files found for cuda compilation.')